diff --git a/sft_pretrain/Full_xmoe/added_tokens.json b/sft_pretrain/Full_xmoe/added_tokens.json new file mode 100644 index 0000000000000000000000000000000000000000..c9d3d3a1b74d87e381e471f7b33784015d2dc0ea --- /dev/null +++ b/sft_pretrain/Full_xmoe/added_tokens.json @@ -0,0 +1,13 @@ +{ + "<|assistant|>": 32001, + "<|endoftext|>": 32000, + "<|end|>": 32007, + "<|placeholder1|>": 32002, + "<|placeholder2|>": 32003, + "<|placeholder3|>": 32004, + "<|placeholder4|>": 32005, + "<|placeholder5|>": 32008, + "<|placeholder6|>": 32009, + "<|system|>": 32006, + "<|user|>": 32010 +} diff --git a/sft_pretrain/Full_xmoe/checkpoint-1040/added_tokens.json b/sft_pretrain/Full_xmoe/checkpoint-1040/added_tokens.json new file mode 100644 index 0000000000000000000000000000000000000000..c9d3d3a1b74d87e381e471f7b33784015d2dc0ea --- /dev/null +++ b/sft_pretrain/Full_xmoe/checkpoint-1040/added_tokens.json @@ -0,0 +1,13 @@ +{ + "<|assistant|>": 32001, + "<|endoftext|>": 32000, + "<|end|>": 32007, + "<|placeholder1|>": 32002, + "<|placeholder2|>": 32003, + "<|placeholder3|>": 32004, + "<|placeholder4|>": 32005, + "<|placeholder5|>": 32008, + "<|placeholder6|>": 32009, + "<|system|>": 32006, + "<|user|>": 32010 +} diff --git a/sft_pretrain/Full_xmoe/checkpoint-1040/config.json b/sft_pretrain/Full_xmoe/checkpoint-1040/config.json new file mode 100644 index 0000000000000000000000000000000000000000..5ed860286ec8c9b3f17e5234326d2ed728ca6a65 --- /dev/null +++ b/sft_pretrain/Full_xmoe/checkpoint-1040/config.json @@ -0,0 +1,200 @@ +{ + "_name_or_path": "/cm/archive/namnv78/checkpoints/phi35-siglip224/pft", + "architectures": [ + "LlavaPhiForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "auto_map": { + "AutoConfig": "configuration_phi3.Phi3Config", + "AutoModelForCausalLM": "modeling_phi3.Phi3ForCausalLM" + }, + "bal_comp_loss_coef": 0.01, + "balance_loss_coef": 0.01, + "bos_token_id": 1, + "clip_smoe": false, + "diversity_loss_coef": 0.01, + "dropout": false, + "e_loss_coef": 0.001, + "embd_pdrop": 0.0, + "entropy_advance_loss": false, + "eos_token_id": 32000, + "freeze_backbone": false, + "freeze_mm_mlp_adapter": false, + "hidden_act": "silu", + "hidden_size": 3072, + "hybrid": false, + "image_aspect_ratio": "pad", + "init_weight": true, + "initializer_range": 0.02, + "intermediate_size": 8192, + "is_cosine": false, + "is_norm_weight": false, + "local_rank": 0, + "loss1": "balanceloss", + "loss2": "zloss", + "luna": false, + "max_compete_in_iter": 3, + "max_position_embeddings": 131072, + "mlp_smoe": true, + "mm_hidden_size": 1152, + "mm_patch_merge_type": "flat", + "mm_projector_lr": null, + "mm_projector_type": "moe", + "mm_use_im_patch_token": false, + "mm_use_im_start_end": false, + "mm_vision_select_feature": "patch", + "mm_vision_select_layer": -2, + "mm_vision_tower": "google/siglip-so400m-patch14-224", + "model_name_or_path": "/cm/archive/namnv78/checkpoints/phi35-siglip224/pft", + "model_type": "llava_phi", + "moe_name": "xmoe", + "norm_softmax": false, + "normalization": false, + "num_attention_heads": 32, + "num_experts": 8, + "num_hidden_layers": 32, + "num_key_value_heads": 32, + "num_layers": 3, + "num_selected": 4, + "number_of_previous_tokens": 2, + "original_max_position_embeddings": 4096, + "pad_token_id": 32000, + "pretrain_mm_mlp_adapter": null, + "rate_compete": 0.2, + "rate_flip": 0.05, + "resid_pdrop": 0.0, + "rms_norm_eps": 1e-05, + "rope_scaling": { + "long_factor": [ + 1.0800000429153442, + 1.1100000143051147, + 1.1399999856948853, + 1.340000033378601, + 1.5899999141693115, + 1.600000023841858, + 1.6200000047683716, + 2.620000123977661, + 3.2300000190734863, + 3.2300000190734863, + 4.789999961853027, + 7.400000095367432, + 7.700000286102295, + 9.09000015258789, + 12.199999809265137, + 17.670000076293945, + 24.46000099182129, + 28.57000160217285, + 30.420001983642578, + 30.840002059936523, + 32.590003967285156, + 32.93000411987305, + 42.320003509521484, + 44.96000289916992, + 50.340003967285156, + 50.45000457763672, + 57.55000305175781, + 57.93000411987305, + 58.21000289916992, + 60.1400032043457, + 62.61000442504883, + 62.62000274658203, + 62.71000289916992, + 63.1400032043457, + 63.1400032043457, + 63.77000427246094, + 63.93000411987305, + 63.96000289916992, + 63.970001220703125, + 64.02999877929688, + 64.06999969482422, + 64.08000183105469, + 64.12000274658203, + 64.41000366210938, + 64.4800033569336, + 64.51000213623047, + 64.52999877929688, + 64.83999633789062 + ], + "short_factor": [ + 1.0, + 1.0199999809265137, + 1.0299999713897705, + 1.0299999713897705, + 1.0499999523162842, + 1.0499999523162842, + 1.0499999523162842, + 1.0499999523162842, + 1.0499999523162842, + 1.0699999332427979, + 1.0999999046325684, + 1.1099998950958252, + 1.1599998474121094, + 1.1599998474121094, + 1.1699998378753662, + 1.2899998426437378, + 1.339999794960022, + 1.679999828338623, + 1.7899998426437378, + 1.8199998140335083, + 1.8499997854232788, + 1.8799997568130493, + 1.9099997282028198, + 1.9399996995925903, + 1.9899996519088745, + 2.0199997425079346, + 2.0199997425079346, + 2.0199997425079346, + 2.0199997425079346, + 2.0199997425079346, + 2.0199997425079346, + 2.0299997329711914, + 2.0299997329711914, + 2.0299997329711914, + 2.0299997329711914, + 2.0299997329711914, + 2.0299997329711914, + 2.0299997329711914, + 2.0299997329711914, + 2.0299997329711914, + 2.0799996852874756, + 2.0899996757507324, + 2.189999580383301, + 2.2199995517730713, + 2.5899994373321533, + 2.729999542236328, + 2.749999523162842, + 2.8399994373321533 + ], + "type": "longrope" + }, + "rope_theta": 10000.0, + "router_loss_coef": 0.01, + "router_theta": 0.1, + "router_z_loss_coef": 0.001, + "scales": [ + 1, + 3 + ], + "sliding_window": 262144, + "sparse_upcycling": false, + "strategy_train": "base", + "tie_word_embeddings": false, + "tokenizer_model_max_length": 2048, + "tokenizer_padding_side": "right", + "topk_max": 2, + "topk_min": 1, + "torch_dtype": "bfloat16", + "training": true, + "transformers_version": "4.43.0", + "tune_mm_mlp_adapter": false, + "unit_test": true, + "use_cache": false, + "use_mm_proj": true, + "use_old": false, + "version": "phi35", + "vision_tower": "google/siglip-so400m-patch14-224", + "vision_tower_dir": "/cm/archive/namnv78/checkpoints/phi35-siglip224/pft/clip.bin", + "vocab_size": 32064, + "warm_up": 0.05 +} diff --git a/sft_pretrain/Full_xmoe/checkpoint-1040/generation_config.json b/sft_pretrain/Full_xmoe/checkpoint-1040/generation_config.json new file mode 100644 index 0000000000000000000000000000000000000000..dad5c4578f0dc5969b38755d095fc30c368bb54a --- /dev/null +++ b/sft_pretrain/Full_xmoe/checkpoint-1040/generation_config.json @@ -0,0 +1,12 @@ +{ + "_from_model_config": true, + "bos_token_id": 1, + "do_sample": true, + "eos_token_id": [ + 32007, + 32001, + 32000 + ], + "pad_token_id": 32000, + "transformers_version": "4.43.0" +} diff --git a/sft_pretrain/Full_xmoe/checkpoint-1040/global_step1040/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt b/sft_pretrain/Full_xmoe/checkpoint-1040/global_step1040/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..184a5d7a5b94905baea160ff6893def3d479eac5 --- /dev/null +++ b/sft_pretrain/Full_xmoe/checkpoint-1040/global_step1040/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5c6c9c7746d0e86ace8af4950fc155bc535fbbf47ab90ea9ddae6f03e09ca7a3 +size 396609872 diff --git a/sft_pretrain/Full_xmoe/checkpoint-1040/global_step1040/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt b/sft_pretrain/Full_xmoe/checkpoint-1040/global_step1040/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..7d24cd51e011accdb9c5f10cdbbe8dc9eb107e77 --- /dev/null +++ b/sft_pretrain/Full_xmoe/checkpoint-1040/global_step1040/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2350a432202679cbbbbf5f244bad8e2891faece9747be7c77e59c388f53830a4 +size 396609872 diff --git a/sft_pretrain/Full_xmoe/checkpoint-1040/global_step1040/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt b/sft_pretrain/Full_xmoe/checkpoint-1040/global_step1040/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..2afdc6e6dc11c2b6e5714debd2792545cd5f416d --- /dev/null +++ b/sft_pretrain/Full_xmoe/checkpoint-1040/global_step1040/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:17fd649614177e1112536ac3bdea5de1776f7404f86c04675783a71176c1ebbf +size 396609872 diff --git a/sft_pretrain/Full_xmoe/checkpoint-1040/global_step1040/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt b/sft_pretrain/Full_xmoe/checkpoint-1040/global_step1040/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..719a7c610b96083b57fe935ac7c9fc6f274d5711 --- /dev/null +++ b/sft_pretrain/Full_xmoe/checkpoint-1040/global_step1040/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fabab2fa42f9c4c8e196771edc07d3d7f9a2eafb8aa530b3cc7ffdd0fcf5ea11 +size 396609872 diff --git a/sft_pretrain/Full_xmoe/checkpoint-1040/global_step1040/zero_pp_rank_0_mp_rank_00_model_states.pt b/sft_pretrain/Full_xmoe/checkpoint-1040/global_step1040/zero_pp_rank_0_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..479c6ceabc79c1e2de97c74f4b19f2f628c7536c --- /dev/null +++ b/sft_pretrain/Full_xmoe/checkpoint-1040/global_step1040/zero_pp_rank_0_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:36b86452fbcae03789985902b77f993e965a6ca0679f9a3aec668ffeded3859c +size 2117322914 diff --git a/sft_pretrain/Full_xmoe/checkpoint-1040/global_step1040/zero_pp_rank_1_mp_rank_00_model_states.pt b/sft_pretrain/Full_xmoe/checkpoint-1040/global_step1040/zero_pp_rank_1_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..2dda9aed1e141fe3c18ff41e089fb4db0dd8bdb8 --- /dev/null +++ b/sft_pretrain/Full_xmoe/checkpoint-1040/global_step1040/zero_pp_rank_1_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:af902bab12c8a75b404efda3aa15c04a81dbcb3dfa1c2a40d8b542203f47597f +size 2117322914 diff --git a/sft_pretrain/Full_xmoe/checkpoint-1040/global_step1040/zero_pp_rank_2_mp_rank_00_model_states.pt b/sft_pretrain/Full_xmoe/checkpoint-1040/global_step1040/zero_pp_rank_2_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..6bd855aab1966e0bf2ba9f4563e54a551a623f50 --- /dev/null +++ b/sft_pretrain/Full_xmoe/checkpoint-1040/global_step1040/zero_pp_rank_2_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d5ce32f0ba5e3479f216e7aaf3996f5db453e5a1201d5a2bce9efad4a7050a77 +size 2117322914 diff --git a/sft_pretrain/Full_xmoe/checkpoint-1040/global_step1040/zero_pp_rank_3_mp_rank_00_model_states.pt b/sft_pretrain/Full_xmoe/checkpoint-1040/global_step1040/zero_pp_rank_3_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..a857f9aaaf10fffe7ebc2574fb3edaba2968b12c --- /dev/null +++ b/sft_pretrain/Full_xmoe/checkpoint-1040/global_step1040/zero_pp_rank_3_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e03e86540f9e955c5e026a5165f04267b3c1df8bc87dc468c8db8cf45e763ba1 +size 2117322914 diff --git a/sft_pretrain/Full_xmoe/checkpoint-1040/latest b/sft_pretrain/Full_xmoe/checkpoint-1040/latest new file mode 100644 index 0000000000000000000000000000000000000000..f37da78e3c7eee26ebe5f06b54d6621716edb6b9 --- /dev/null +++ b/sft_pretrain/Full_xmoe/checkpoint-1040/latest @@ -0,0 +1 @@ +global_step1040 \ No newline at end of file diff --git a/sft_pretrain/Full_xmoe/checkpoint-1040/model-00001-of-00002.safetensors b/sft_pretrain/Full_xmoe/checkpoint-1040/model-00001-of-00002.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..29d76f5d80605301aab2bba59b53a5e2582094c4 --- /dev/null +++ b/sft_pretrain/Full_xmoe/checkpoint-1040/model-00001-of-00002.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fe6c4f6ef38e8993629091331e0bbf23484cc88bdfd038f0dd17b6ec2800d855 +size 4972489328 diff --git a/sft_pretrain/Full_xmoe/checkpoint-1040/model-00002-of-00002.safetensors b/sft_pretrain/Full_xmoe/checkpoint-1040/model-00002-of-00002.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..07a5ac70ebcd44726e16e6b428a39a112258e155 --- /dev/null +++ b/sft_pretrain/Full_xmoe/checkpoint-1040/model-00002-of-00002.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:986b770dcfd83575469d523277545ae237df6d46f5e01c763786ca180e88e6fa +size 3759044016 diff --git a/sft_pretrain/Full_xmoe/checkpoint-1040/model.safetensors.index.json b/sft_pretrain/Full_xmoe/checkpoint-1040/model.safetensors.index.json new file mode 100644 index 0000000000000000000000000000000000000000..507806fb086ee2ffdb4c1df263574fc5a7cfa513 --- /dev/null +++ b/sft_pretrain/Full_xmoe/checkpoint-1040/model.safetensors.index.json @@ -0,0 +1,675 @@ +{ + "metadata": { + "total_size": 8731443248 + }, + "weight_map": { + "lm_head.weight": "model-00002-of-00002.safetensors", + "model.embed_tokens.weight": "model-00001-of-00002.safetensors", + "model.layers.0.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.0.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.0.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.0.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.0.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.0.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.1.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.1.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.1.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.1.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.1.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.1.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.10.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.10.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.10.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.10.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.10.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.10.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.11.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.11.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.11.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.11.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.11.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.11.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.12.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.12.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.12.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.12.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.12.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.12.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.13.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.13.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.13.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.13.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.13.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.13.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.14.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.14.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.14.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.14.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.14.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.14.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.15.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.15.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.15.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.15.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.15.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.15.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.16.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.16.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.16.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.16.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.16.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.16.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.17.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.17.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.17.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.17.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.17.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.17.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.18.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.18.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.18.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.18.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.18.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.18.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.19.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.19.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.19.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.19.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.19.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.19.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.2.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.2.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.2.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.2.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.2.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.2.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.20.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.20.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.20.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.20.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.20.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.20.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.21.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.21.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.21.mlp.gate_up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.21.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.21.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.21.self_attn.qkv_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.22.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.22.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.22.mlp.gate_up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.22.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.22.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.22.self_attn.qkv_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.23.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.23.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.23.mlp.gate_up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.23.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.23.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.23.self_attn.qkv_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.24.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.24.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.24.mlp.gate_up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.24.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.24.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.24.self_attn.qkv_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.25.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.25.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.25.mlp.gate_up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.25.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.25.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.25.self_attn.qkv_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.26.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.26.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.26.mlp.gate_up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.26.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.26.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.26.self_attn.qkv_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.27.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.27.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.27.mlp.gate_up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.27.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.27.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.27.self_attn.qkv_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.28.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.28.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.28.mlp.gate_up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.28.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.28.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.28.self_attn.qkv_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.29.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.29.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.29.mlp.gate_up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.29.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.29.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.29.self_attn.qkv_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.3.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.3.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.3.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.3.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.3.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.3.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.30.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.30.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.30.mlp.gate_up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.30.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.30.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.30.self_attn.qkv_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.31.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.31.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.31.mlp.gate_up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.31.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.31.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.31.self_attn.qkv_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.4.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.4.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.4.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.4.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.4.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.4.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.5.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.5.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.5.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.5.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.5.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.5.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.6.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.6.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.6.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.6.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.6.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.6.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.7.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.7.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.7.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.7.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.7.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.7.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.8.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.8.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.8.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.8.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.8.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.8.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.9.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.9.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.9.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.9.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.9.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.9.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.mm_projector.layer_norm.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.layer_norm.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.expert_embeddings": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.0.0.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.0.0.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.0.2.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.0.2.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.1.0.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.1.0.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.1.2.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.1.2.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.2.0.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.2.0.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.2.2.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.2.2.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.3.0.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.3.0.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.3.2.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.3.2.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.4.0.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.4.0.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.4.2.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.4.2.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.5.0.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.5.0.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.5.2.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.5.2.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.6.0.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.6.0.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.6.2.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.6.2.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.7.0.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.7.0.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.7.2.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.7.2.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.gate.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.gate.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.inp_reduction.weight": "model-00002-of-00002.safetensors", + "model.norm.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.embeddings.patch_embedding.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.embeddings.patch_embedding.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.embeddings.position_embedding.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.self_attn.v_proj.weight": "model-00002-of-00002.safetensors" + } +} diff --git a/sft_pretrain/Full_xmoe/checkpoint-1040/rng_state_0.pth b/sft_pretrain/Full_xmoe/checkpoint-1040/rng_state_0.pth new file mode 100644 index 0000000000000000000000000000000000000000..ef4849062bcdc8ffd2246c07673ba196a8d61a6d --- /dev/null +++ b/sft_pretrain/Full_xmoe/checkpoint-1040/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fae2114fffe9b1eea30e28bbdb4ce59046b0079ea5b8dc4682079f609d49d787 +size 14960 diff --git a/sft_pretrain/Full_xmoe/checkpoint-1040/rng_state_1.pth b/sft_pretrain/Full_xmoe/checkpoint-1040/rng_state_1.pth new file mode 100644 index 0000000000000000000000000000000000000000..2fcb2b640bc236c26aa841680d34a91240247970 --- /dev/null +++ b/sft_pretrain/Full_xmoe/checkpoint-1040/rng_state_1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d4ff5f3a53530ac868291e2667c8f824bfa1f4fa1ce880df8223a7165ef38e11 +size 14960 diff --git a/sft_pretrain/Full_xmoe/checkpoint-1040/rng_state_2.pth b/sft_pretrain/Full_xmoe/checkpoint-1040/rng_state_2.pth new file mode 100644 index 0000000000000000000000000000000000000000..00c3f989de00e6d58ca7345ae6f65fee0afcbdcd --- /dev/null +++ b/sft_pretrain/Full_xmoe/checkpoint-1040/rng_state_2.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:91f80a7779b0034e70106ba6cb0e3e686052334c20ce54453ee3977cc0219d15 +size 14960 diff --git a/sft_pretrain/Full_xmoe/checkpoint-1040/rng_state_3.pth b/sft_pretrain/Full_xmoe/checkpoint-1040/rng_state_3.pth new file mode 100644 index 0000000000000000000000000000000000000000..f289913854ee3fa52a86e282421da07d85b8a4c4 --- /dev/null +++ b/sft_pretrain/Full_xmoe/checkpoint-1040/rng_state_3.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ece3bc0d0e16c43ef245cc787cbd0d63d08d460f489c4cd52adf6501b9281a18 +size 14960 diff --git a/sft_pretrain/Full_xmoe/checkpoint-1040/special_tokens_map.json b/sft_pretrain/Full_xmoe/checkpoint-1040/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..3e4d5a5bc1cb51753cc9ae0305ece0da60052b10 --- /dev/null +++ b/sft_pretrain/Full_xmoe/checkpoint-1040/special_tokens_map.json @@ -0,0 +1,24 @@ +{ + "bos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "<|endoftext|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": "", + "unk_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } +} diff --git a/sft_pretrain/Full_xmoe/checkpoint-1040/tokenizer.model b/sft_pretrain/Full_xmoe/checkpoint-1040/tokenizer.model new file mode 100644 index 0000000000000000000000000000000000000000..6c00c742ce03c627d6cd5b795984876fa49fa899 --- /dev/null +++ b/sft_pretrain/Full_xmoe/checkpoint-1040/tokenizer.model @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9e556afd44213b6bd1be2b850ebbbd98f5481437a8021afaf58ee7fb1818d347 +size 499723 diff --git a/sft_pretrain/Full_xmoe/checkpoint-1040/tokenizer_config.json b/sft_pretrain/Full_xmoe/checkpoint-1040/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..d579bb0b91b24b214ea3c2e487e27a65017cdc4a --- /dev/null +++ b/sft_pretrain/Full_xmoe/checkpoint-1040/tokenizer_config.json @@ -0,0 +1,132 @@ +{ + "add_bos_token": false, + "add_eos_token": false, + "add_prefix_space": true, + "added_tokens_decoder": { + "0": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "1": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "2": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": false + }, + "32000": { + "content": "<|endoftext|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "32001": { + "content": "<|assistant|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "32002": { + "content": "<|placeholder1|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "32003": { + "content": "<|placeholder2|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "32004": { + "content": "<|placeholder3|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "32005": { + "content": "<|placeholder4|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "32006": { + "content": "<|system|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "32007": { + "content": "<|end|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "32008": { + "content": "<|placeholder5|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "32009": { + "content": "<|placeholder6|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "32010": { + "content": "<|user|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + } + }, + "bos_token": "", + "chat_template": "{% for message in messages %}{% if message['role'] == 'system' and message['content'] %}{{'<|system|>\n' + message['content'] + '<|end|>\n'}}{% elif message['role'] == 'user' %}{{'<|user|>\n' + message['content'] + '<|end|>\n'}}{% elif message['role'] == 'assistant' %}{{'<|assistant|>\n' + message['content'] + '<|end|>\n'}}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ '<|assistant|>\n' }}{% else %}{{ eos_token }}{% endif %}", + "clean_up_tokenization_spaces": false, + "eos_token": "<|endoftext|>", + "legacy": false, + "model_max_length": 2048, + "pad_token": "", + "padding_side": "right", + "sp_model_kwargs": {}, + "spaces_between_special_tokens": false, + "tokenizer_class": "LlamaTokenizer", + "unk_token": "", + "use_default_system_prompt": false +} diff --git a/sft_pretrain/Full_xmoe/checkpoint-1040/trainer_state.json b/sft_pretrain/Full_xmoe/checkpoint-1040/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..bd00e47da341f41271112aa03904c7c4c4ded579 --- /dev/null +++ b/sft_pretrain/Full_xmoe/checkpoint-1040/trainer_state.json @@ -0,0 +1,15633 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.2000769526741054, + "eval_steps": 500, + "global_step": 1040, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0334678, + "balance_loss_mlp": 2.48847342, + "epoch": 0.00019238168526356292, + "flos": 471022563072.0, + "grad_norm": 15.010934477254423, + "language_loss": 2.91277003, + "learning_rate": 0.0, + "loss": 1.95375419, + "num_input_tokens_seen": 67104, + "router_z_loss_mlp": 8.6015625, + "step": 1, + "time_per_iteration": 23.313215732574463 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.03608113, + "balance_loss_mlp": 3.00043201, + "epoch": 0.00038476337052712584, + "flos": 505538830848.0, + "grad_norm": 25.821694542927546, + "language_loss": 10.7459116, + "learning_rate": 0.00013726078121135892, + "loss": 10.78199196, + "num_input_tokens_seen": 134080, + "router_z_loss_mlp": 6.06640625, + "step": 2, + "time_per_iteration": 2.6342098712921143 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.03648002, + "balance_loss_mlp": 3.03803182, + "epoch": 0.0005771450557906887, + "flos": 600334166016.0, + "grad_norm": 27.537763142134942, + "language_loss": 10.88985825, + "learning_rate": 0.00021755319103969496, + "loss": 10.9263401, + "num_input_tokens_seen": 205152, + "router_z_loss_mlp": 6.08984375, + "step": 3, + "time_per_iteration": 2.9129159450531006 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.03639085, + "balance_loss_mlp": 3.03521824, + "epoch": 0.0007695267410542517, + "flos": 581497386240.0, + "grad_norm": 10.719163482624658, + "language_loss": 8.79598808, + "learning_rate": 0.00027452156242271784, + "loss": 8.83237934, + "num_input_tokens_seen": 269664, + "router_z_loss_mlp": 6.02734375, + "step": 4, + "time_per_iteration": 2.72357439994812 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.03604871, + "balance_loss_mlp": 3.01435566, + "epoch": 0.0009619084263178145, + "flos": 487154061312.0, + "grad_norm": 22.68157363884245, + "language_loss": 9.41989708, + "learning_rate": 0.0003187096642208417, + "loss": 9.45594501, + "num_input_tokens_seen": 338560, + "router_z_loss_mlp": 5.8984375, + "step": 5, + "time_per_iteration": 2.6791844367980957 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.03472164, + "balance_loss_mlp": 2.9011035, + "epoch": 0.0011542901115813775, + "flos": 561167503872.0, + "grad_norm": 7.113488232519407, + "language_loss": 9.41725159, + "learning_rate": 0.0003548139722510539, + "loss": 9.45197296, + "num_input_tokens_seen": 410112, + "router_z_loss_mlp": 5.72265625, + "step": 6, + "time_per_iteration": 2.7308623790740967 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.03266853, + "balance_loss_mlp": 2.70799947, + "epoch": 0.0013466717968449403, + "flos": 534951738624.0, + "grad_norm": 3.189932925125429, + "language_loss": 8.01036549, + "learning_rate": 0.00038533972973918044, + "loss": 8.0430336, + "num_input_tokens_seen": 477552, + "router_z_loss_mlp": 5.59765625, + "step": 7, + "time_per_iteration": 2.6907436847686768 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02962571, + "balance_loss_mlp": 2.41211033, + "epoch": 0.0015390534821085034, + "flos": 493334485248.0, + "grad_norm": 5.13822781788523, + "language_loss": 7.84486008, + "learning_rate": 0.0004117823436340768, + "loss": 7.87448597, + "num_input_tokens_seen": 549184, + "router_z_loss_mlp": 5.51171875, + "step": 8, + "time_per_iteration": 2.6274044513702393 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02550478, + "balance_loss_mlp": 2.0114615, + "epoch": 0.0017314351673720662, + "flos": 565776090624.0, + "grad_norm": 3.8232757327488405, + "language_loss": 7.62468719, + "learning_rate": 0.00043510638207938993, + "loss": 7.65019178, + "num_input_tokens_seen": 622880, + "router_z_loss_mlp": 5.39453125, + "step": 9, + "time_per_iteration": 2.7688682079315186 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02337757, + "balance_loss_mlp": 1.81705093, + "epoch": 0.001923816852635629, + "flos": 594509521152.0, + "grad_norm": 3.0012265425900817, + "language_loss": 6.96830463, + "learning_rate": 0.00045597044543220066, + "loss": 6.99168253, + "num_input_tokens_seen": 693584, + "router_z_loss_mlp": 5.20703125, + "step": 10, + "time_per_iteration": 2.736985921859741 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02262083, + "balance_loss_mlp": 1.74290299, + "epoch": 0.002116198537899192, + "flos": 610895709696.0, + "grad_norm": 2.2728267884834983, + "language_loss": 6.92078686, + "learning_rate": 0.00047484428652143135, + "loss": 6.94340801, + "num_input_tokens_seen": 774432, + "router_z_loss_mlp": 5.19140625, + "step": 11, + "time_per_iteration": 2.8857340812683105 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02308547, + "balance_loss_mlp": 1.78135598, + "epoch": 0.002308580223162755, + "flos": 546175262976.0, + "grad_norm": 4.334726148282724, + "language_loss": 6.71077013, + "learning_rate": 0.0004920747534624128, + "loss": 6.73385572, + "num_input_tokens_seen": 844304, + "router_z_loss_mlp": 5.2734375, + "step": 12, + "time_per_iteration": 2.635601282119751 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02317905, + "balance_loss_mlp": 1.79147708, + "epoch": 0.002500961908426318, + "flos": 645924270336.0, + "grad_norm": 3.1568536142119923, + "language_loss": 6.53248501, + "learning_rate": 0.0005079252465375872, + "loss": 6.55566406, + "num_input_tokens_seen": 915104, + "router_z_loss_mlp": 5.265625, + "step": 13, + "time_per_iteration": 2.8112540245056152 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02242807, + "balance_loss_mlp": 1.72019386, + "epoch": 0.0026933435936898806, + "flos": 488849352960.0, + "grad_norm": 7.572425831928954, + "language_loss": 6.47189951, + "learning_rate": 0.0005226005109505393, + "loss": 6.49432755, + "num_input_tokens_seen": 982720, + "router_z_loss_mlp": 5.2265625, + "step": 14, + "time_per_iteration": 2.590078353881836 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02247915, + "balance_loss_mlp": 1.72415757, + "epoch": 0.0028857252789534437, + "flos": 435526429440.0, + "grad_norm": 2.3229781853457747, + "language_loss": 6.01724243, + "learning_rate": 0.0005362628552605367, + "loss": 6.03972149, + "num_input_tokens_seen": 1050528, + "router_z_loss_mlp": 5.23828125, + "step": 15, + "time_per_iteration": 2.636983871459961 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02135688, + "balance_loss_mlp": 1.62108541, + "epoch": 0.0030781069642170067, + "flos": 597841778688.0, + "grad_norm": 4.36506198708269, + "language_loss": 5.46747923, + "learning_rate": 0.0005490431248454357, + "loss": 5.48883629, + "num_input_tokens_seen": 1116512, + "router_z_loss_mlp": 5.14453125, + "step": 16, + "time_per_iteration": 2.6904103755950928 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02173305, + "balance_loss_mlp": 1.67586899, + "epoch": 0.0032704886494805694, + "flos": 1541513154048.0, + "grad_norm": 0.3693165783384919, + "language_loss": 0.75705111, + "learning_rate": 0.0005610483427624225, + "loss": 0.77878416, + "num_input_tokens_seen": 1351216, + "router_z_loss_mlp": 4.96875, + "step": 17, + "time_per_iteration": 6.815098285675049 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01958957, + "balance_loss_mlp": 1.45846832, + "epoch": 0.0034628703347441324, + "flos": 474971102976.0, + "grad_norm": 7.376330921510473, + "language_loss": 3.16160107, + "learning_rate": 0.0005723671632907488, + "loss": 3.18119049, + "num_input_tokens_seen": 1420512, + "router_z_loss_mlp": 5.0, + "step": 18, + "time_per_iteration": 2.7730185985565186 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01974299, + "balance_loss_mlp": 1.48144007, + "epoch": 0.0036552520200076955, + "flos": 449478556416.0, + "grad_norm": 2.0435067055151803, + "language_loss": 1.8205657, + "learning_rate": 0.0005830738490244919, + "loss": 1.84030867, + "num_input_tokens_seen": 1484976, + "router_z_loss_mlp": 4.921875, + "step": 19, + "time_per_iteration": 2.5196421146392822 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02215561, + "balance_loss_mlp": 1.73147547, + "epoch": 0.003847633705271258, + "flos": 637351580928.0, + "grad_norm": 2.199322832792736, + "language_loss": 1.81859815, + "learning_rate": 0.0005932312266435596, + "loss": 1.84075379, + "num_input_tokens_seen": 1557392, + "router_z_loss_mlp": 4.83203125, + "step": 20, + "time_per_iteration": 2.7772061824798584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02397049, + "balance_loss_mlp": 1.91639686, + "epoch": 0.004040015390534821, + "flos": 590591105280.0, + "grad_norm": 2.068137361611091, + "language_loss": 1.81285238, + "learning_rate": 0.0006028929207788754, + "loss": 1.83682299, + "num_input_tokens_seen": 1626064, + "router_z_loss_mlp": 4.796875, + "step": 21, + "time_per_iteration": 2.7197327613830566 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02949394, + "balance_loss_mlp": 2.47560835, + "epoch": 0.004232397075798384, + "flos": 757866929664.0, + "grad_norm": 0.9893066861855494, + "language_loss": 1.43565178, + "learning_rate": 0.0006121050677327902, + "loss": 1.46514571, + "num_input_tokens_seen": 1696528, + "router_z_loss_mlp": 4.7265625, + "step": 22, + "time_per_iteration": 2.8821635246276855 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.04240368, + "balance_loss_mlp": 3.77421188, + "epoch": 0.004424778761061947, + "flos": 527727310080.0, + "grad_norm": 1.6702760591351544, + "language_loss": 1.36044598, + "learning_rate": 0.0006209076479463684, + "loss": 1.40284979, + "num_input_tokens_seen": 1765936, + "router_z_loss_mlp": 4.6484375, + "step": 23, + "time_per_iteration": 2.6194069385528564 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.04405254, + "balance_loss_mlp": 3.93871665, + "epoch": 0.00461716044632551, + "flos": 549218815488.0, + "grad_norm": 1.6356367296774819, + "language_loss": 1.46302319, + "learning_rate": 0.0006293355346737718, + "loss": 1.50707567, + "num_input_tokens_seen": 1841632, + "router_z_loss_mlp": 4.65234375, + "step": 24, + "time_per_iteration": 2.741433620452881 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.03977472, + "balance_loss_mlp": 3.50483179, + "epoch": 0.004809542131589073, + "flos": 568752569088.0, + "grad_norm": 1.079559317914091, + "language_loss": 1.33177948, + "learning_rate": 0.0006374193284416834, + "loss": 1.37155437, + "num_input_tokens_seen": 1920256, + "router_z_loss_mlp": 4.71484375, + "step": 25, + "time_per_iteration": 2.902089834213257 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.03127712, + "balance_loss_mlp": 2.642483, + "epoch": 0.005001923816852636, + "flos": 471584410368.0, + "grad_norm": 0.4847890845471295, + "language_loss": 1.26058078, + "learning_rate": 0.0006451860277489461, + "loss": 1.29185796, + "num_input_tokens_seen": 1986528, + "router_z_loss_mlp": 4.84375, + "step": 26, + "time_per_iteration": 2.6045680046081543 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02733563, + "balance_loss_mlp": 2.23154879, + "epoch": 0.005194305502116198, + "flos": 416381502720.0, + "grad_norm": 0.2845036760864029, + "language_loss": 1.33193052, + "learning_rate": 0.0006526595731190848, + "loss": 1.35926616, + "num_input_tokens_seen": 2048016, + "router_z_loss_mlp": 5.015625, + "step": 27, + "time_per_iteration": 2.4412264823913574 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02759137, + "balance_loss_mlp": 2.2411015, + "epoch": 0.005386687187379761, + "flos": 629996894976.0, + "grad_norm": 0.34713687972437796, + "language_loss": 1.22031224, + "learning_rate": 0.0006598612921618983, + "loss": 1.24790359, + "num_input_tokens_seen": 2127664, + "router_z_loss_mlp": 5.1796875, + "step": 28, + "time_per_iteration": 2.80483078956604 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02575766, + "balance_loss_mlp": 2.05010033, + "epoch": 0.005579068872643324, + "flos": 888021326592.0, + "grad_norm": 0.3062478898066755, + "language_loss": 1.16221631, + "learning_rate": 0.0006668102665011454, + "loss": 1.18797398, + "num_input_tokens_seen": 2213952, + "router_z_loss_mlp": 5.2578125, + "step": 29, + "time_per_iteration": 3.243164300918579 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02507804, + "balance_loss_mlp": 1.97527242, + "epoch": 0.005771450557906887, + "flos": 548658902016.0, + "grad_norm": 0.22276861521731073, + "language_loss": 1.24634933, + "learning_rate": 0.0006735236364718957, + "loss": 1.27142727, + "num_input_tokens_seen": 2284736, + "router_z_loss_mlp": 5.328125, + "step": 30, + "time_per_iteration": 2.7701382637023926 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02465182, + "balance_loss_mlp": 1.93226886, + "epoch": 0.00596383224317045, + "flos": 533069809152.0, + "grad_norm": 0.21102664747409663, + "language_loss": 1.23222375, + "learning_rate": 0.0006800168558381346, + "loss": 1.25687563, + "num_input_tokens_seen": 2354384, + "router_z_loss_mlp": 5.33203125, + "step": 31, + "time_per_iteration": 2.635246515274048 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02445382, + "balance_loss_mlp": 1.91552007, + "epoch": 0.0061562139284340135, + "flos": 590163394560.0, + "grad_norm": 0.21886797396213825, + "language_loss": 1.26610851, + "learning_rate": 0.0006863039060567947, + "loss": 1.29056239, + "num_input_tokens_seen": 2419440, + "router_z_loss_mlp": 5.30078125, + "step": 32, + "time_per_iteration": 2.7791683673858643 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02338603, + "balance_loss_mlp": 1.80950415, + "epoch": 0.006348595613697576, + "flos": 619442154240.0, + "grad_norm": 0.18971916612404452, + "language_loss": 1.17543316, + "learning_rate": 0.0006923974775611263, + "loss": 1.19881916, + "num_input_tokens_seen": 2496368, + "router_z_loss_mlp": 5.29296875, + "step": 33, + "time_per_iteration": 2.836601495742798 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02160521, + "balance_loss_mlp": 1.64134097, + "epoch": 0.006540977298961139, + "flos": 779300109312.0, + "grad_norm": 0.13369632510289112, + "language_loss": 1.13907146, + "learning_rate": 0.0006983091239737814, + "loss": 1.16067672, + "num_input_tokens_seen": 2573280, + "router_z_loss_mlp": 5.19140625, + "step": 34, + "time_per_iteration": 3.021479606628418 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0221033, + "balance_loss_mlp": 1.69649041, + "epoch": 0.006733358984224702, + "flos": 668373264384.0, + "grad_norm": 0.11522706717853448, + "language_loss": 1.11973858, + "learning_rate": 0.0007040493939600222, + "loss": 1.14184177, + "num_input_tokens_seen": 2647248, + "router_z_loss_mlp": 5.13671875, + "step": 35, + "time_per_iteration": 2.9400346279144287 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0227657, + "balance_loss_mlp": 1.76997864, + "epoch": 0.006925740669488265, + "flos": 565496133888.0, + "grad_norm": 0.11143421895921844, + "language_loss": 1.12295914, + "learning_rate": 0.0007096279445021078, + "loss": 1.14572477, + "num_input_tokens_seen": 2720736, + "router_z_loss_mlp": 5.0625, + "step": 36, + "time_per_iteration": 2.698153495788574 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02284885, + "balance_loss_mlp": 1.78668559, + "epoch": 0.007118122354751828, + "flos": 551112405504.0, + "grad_norm": 0.11733654674395574, + "language_loss": 1.1734066, + "learning_rate": 0.0007150536386503726, + "loss": 1.19625545, + "num_input_tokens_seen": 2800336, + "router_z_loss_mlp": 4.9765625, + "step": 37, + "time_per_iteration": 2.8579084873199463 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02268399, + "balance_loss_mlp": 1.77782845, + "epoch": 0.007310504040015391, + "flos": 703814951424.0, + "grad_norm": 0.14208952684155102, + "language_loss": 1.10088778, + "learning_rate": 0.0007203346302358509, + "loss": 1.12357187, + "num_input_tokens_seen": 2883184, + "router_z_loss_mlp": 4.8984375, + "step": 38, + "time_per_iteration": 2.928835391998291 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02220017, + "balance_loss_mlp": 1.73555112, + "epoch": 0.007502885725278953, + "flos": 600501361920.0, + "grad_norm": 0.142042154575746, + "language_loss": 1.15486813, + "learning_rate": 0.000725478437577282, + "loss": 1.17706823, + "num_input_tokens_seen": 2960736, + "router_z_loss_mlp": 4.8359375, + "step": 39, + "time_per_iteration": 2.8706436157226562 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0209897, + "balance_loss_mlp": 1.62251425, + "epoch": 0.007695267410542516, + "flos": 561428018688.0, + "grad_norm": 0.13255726845543458, + "language_loss": 1.10233212, + "learning_rate": 0.0007304920078549186, + "loss": 1.12332189, + "num_input_tokens_seen": 3033472, + "router_z_loss_mlp": 4.75390625, + "step": 40, + "time_per_iteration": 2.6895179748535156 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01939831, + "balance_loss_mlp": 1.46986008, + "epoch": 0.007887649095806078, + "flos": 509231725056.0, + "grad_norm": 0.11166218824526469, + "language_loss": 1.12161303, + "learning_rate": 0.0007353817735343603, + "loss": 1.14101124, + "num_input_tokens_seen": 3107824, + "router_z_loss_mlp": 4.6875, + "step": 41, + "time_per_iteration": 2.709167957305908 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0184399, + "balance_loss_mlp": 1.3778342, + "epoch": 0.008080030781069641, + "flos": 504905040384.0, + "grad_norm": 0.06254207778511488, + "language_loss": 1.07663667, + "learning_rate": 0.0007401537019902344, + "loss": 1.09507656, + "num_input_tokens_seen": 3176528, + "router_z_loss_mlp": 4.6484375, + "step": 42, + "time_per_iteration": 2.5947837829589844 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01789021, + "balance_loss_mlp": 1.32896876, + "epoch": 0.008272412466333205, + "flos": 519106988544.0, + "grad_norm": 0.07012531219711775, + "language_loss": 1.09992051, + "learning_rate": 0.0007448133392900729, + "loss": 1.11781073, + "num_input_tokens_seen": 3254256, + "router_z_loss_mlp": 4.5859375, + "step": 43, + "time_per_iteration": 2.6997878551483154 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01787217, + "balance_loss_mlp": 1.32983518, + "epoch": 0.008464794151596768, + "flos": 609184866816.0, + "grad_norm": 0.09276066699658307, + "language_loss": 1.05755496, + "learning_rate": 0.0007493658489441491, + "loss": 1.07542706, + "num_input_tokens_seen": 3340224, + "router_z_loss_mlp": 4.56640625, + "step": 44, + "time_per_iteration": 2.8852477073669434 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0177156, + "balance_loss_mlp": 1.31913674, + "epoch": 0.00865717583686033, + "flos": 539007214848.0, + "grad_norm": 0.11478380715178954, + "language_loss": 1.09959674, + "learning_rate": 0.0007538160463002316, + "loss": 1.11731243, + "num_input_tokens_seen": 3409216, + "router_z_loss_mlp": 4.53125, + "step": 45, + "time_per_iteration": 2.685568332672119 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01802016, + "balance_loss_mlp": 1.35378933, + "epoch": 0.008849557522123894, + "flos": 509010094080.0, + "grad_norm": 0.14537339285711792, + "language_loss": 1.13533509, + "learning_rate": 0.0007581684291577274, + "loss": 1.15335524, + "num_input_tokens_seen": 3478352, + "router_z_loss_mlp": 4.49609375, + "step": 46, + "time_per_iteration": 2.5798568725585938 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01764716, + "balance_loss_mlp": 1.31915987, + "epoch": 0.009041939207387457, + "flos": 626508135168.0, + "grad_norm": 0.13285081251714825, + "language_loss": 1.15270185, + "learning_rate": 0.0007624272050891776, + "loss": 1.17034888, + "num_input_tokens_seen": 3555616, + "router_z_loss_mlp": 4.46875, + "step": 47, + "time_per_iteration": 2.822632312774658 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0175788, + "balance_loss_mlp": 1.31461263, + "epoch": 0.00923432089265102, + "flos": 550610817792.0, + "grad_norm": 0.11934546954286276, + "language_loss": 1.04916859, + "learning_rate": 0.0007665963158851307, + "loss": 1.06674731, + "num_input_tokens_seen": 3634512, + "router_z_loss_mlp": 4.4453125, + "step": 48, + "time_per_iteration": 2.7924864292144775 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01741735, + "balance_loss_mlp": 1.29846764, + "epoch": 0.009426702577914583, + "flos": 563679333120.0, + "grad_norm": 0.08548395668661983, + "language_loss": 1.13647461, + "learning_rate": 0.0007706794594783609, + "loss": 1.15389204, + "num_input_tokens_seen": 3708480, + "router_z_loss_mlp": 4.4453125, + "step": 49, + "time_per_iteration": 2.734813928604126 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01727457, + "balance_loss_mlp": 1.28838515, + "epoch": 0.009619084263178146, + "flos": 617926697472.0, + "grad_norm": 0.06892583067190382, + "language_loss": 1.12110853, + "learning_rate": 0.0007746801096530423, + "loss": 1.13838315, + "num_input_tokens_seen": 3783472, + "router_z_loss_mlp": 4.40234375, + "step": 50, + "time_per_iteration": 2.7447421550750732 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01719129, + "balance_loss_mlp": 1.28043914, + "epoch": 0.009811465948441709, + "flos": 542489171712.0, + "grad_norm": 0.04778558244894799, + "language_loss": 1.16797209, + "learning_rate": 0.0007786015338021173, + "loss": 1.1851635, + "num_input_tokens_seen": 3851360, + "router_z_loss_mlp": 4.3984375, + "step": 51, + "time_per_iteration": 2.65645694732666 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01722789, + "balance_loss_mlp": 1.28562462, + "epoch": 0.010003847633705272, + "flos": 536977531392.0, + "grad_norm": 0.06217135289779639, + "language_loss": 1.09074998, + "learning_rate": 0.0007824468089603051, + "loss": 1.10797799, + "num_input_tokens_seen": 3923056, + "router_z_loss_mlp": 4.3828125, + "step": 52, + "time_per_iteration": 2.7218713760375977 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01697539, + "balance_loss_mlp": 1.26380801, + "epoch": 0.010196229318968833, + "flos": 910806657792.0, + "grad_norm": 0.04206474108062499, + "language_loss": 1.08130515, + "learning_rate": 0.0007862188363098669, + "loss": 1.09828055, + "num_input_tokens_seen": 4004528, + "router_z_loss_mlp": 4.34765625, + "step": 53, + "time_per_iteration": 3.149973154067993 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01668333, + "balance_loss_mlp": 1.23765349, + "epoch": 0.010388611004232396, + "flos": 586970142720.0, + "grad_norm": 0.050634309517598654, + "language_loss": 1.08688021, + "learning_rate": 0.0007899203543304438, + "loss": 1.10356343, + "num_input_tokens_seen": 4078704, + "router_z_loss_mlp": 4.31640625, + "step": 54, + "time_per_iteration": 2.7033088207244873 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01691162, + "balance_loss_mlp": 1.26315343, + "epoch": 0.01058099268949596, + "flos": 503472208896.0, + "grad_norm": 0.06464656169002964, + "language_loss": 1.22991037, + "learning_rate": 0.0007935539507422731, + "loss": 1.246822, + "num_input_tokens_seen": 4143600, + "router_z_loss_mlp": 4.2890625, + "step": 55, + "time_per_iteration": 2.601745843887329 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.017059, + "balance_loss_mlp": 1.28017938, + "epoch": 0.010773374374759523, + "flos": 545558969088.0, + "grad_norm": 0.06403483907250343, + "language_loss": 1.12561536, + "learning_rate": 0.0007971220733732573, + "loss": 1.14267421, + "num_input_tokens_seen": 4217904, + "router_z_loss_mlp": 4.265625, + "step": 56, + "time_per_iteration": 2.677314281463623 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0169453, + "balance_loss_mlp": 1.27262425, + "epoch": 0.010965756060023086, + "flos": 527286960384.0, + "grad_norm": 0.061369678053330295, + "language_loss": 1.07931721, + "learning_rate": 0.0008006270400641869, + "loss": 1.09626245, + "num_input_tokens_seen": 4293920, + "router_z_loss_mlp": 4.2265625, + "step": 57, + "time_per_iteration": 2.7162468433380127 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01699229, + "balance_loss_mlp": 1.27846837, + "epoch": 0.011158137745286649, + "flos": 578098054656.0, + "grad_norm": 0.06126094216688289, + "language_loss": 1.08923888, + "learning_rate": 0.0008040710477125043, + "loss": 1.10623109, + "num_input_tokens_seen": 4370080, + "router_z_loss_mlp": 4.21484375, + "step": 58, + "time_per_iteration": 2.724116563796997 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01648065, + "balance_loss_mlp": 1.23150039, + "epoch": 0.011350519430550212, + "flos": 530314961664.0, + "grad_norm": 0.059594432794803906, + "language_loss": 1.09501219, + "learning_rate": 0.0008074561805429771, + "loss": 1.11149275, + "num_input_tokens_seen": 4439792, + "router_z_loss_mlp": 4.171875, + "step": 59, + "time_per_iteration": 2.613821268081665 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01628698, + "balance_loss_mlp": 1.21594822, + "epoch": 0.011542901115813775, + "flos": 556971076608.0, + "grad_norm": 0.046387810099464834, + "language_loss": 1.0703913, + "learning_rate": 0.0008107844176832545, + "loss": 1.08667827, + "num_input_tokens_seen": 4510800, + "router_z_loss_mlp": 4.1328125, + "step": 60, + "time_per_iteration": 2.6809566020965576 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01602811, + "balance_loss_mlp": 1.19349384, + "epoch": 0.011735282801077338, + "flos": 573176463360.0, + "grad_norm": 0.036957475185327084, + "language_loss": 1.08104563, + "learning_rate": 0.0008140576401132568, + "loss": 1.09707379, + "num_input_tokens_seen": 4581136, + "router_z_loss_mlp": 4.09765625, + "step": 61, + "time_per_iteration": 2.644085645675659 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01596506, + "balance_loss_mlp": 1.19024038, + "epoch": 0.0119276644863409, + "flos": 616717442304.0, + "grad_norm": 0.034032461682055544, + "language_loss": 1.09685671, + "learning_rate": 0.0008172776370494935, + "loss": 1.11282182, + "num_input_tokens_seen": 4650352, + "router_z_loss_mlp": 4.06640625, + "step": 62, + "time_per_iteration": 2.7589328289031982 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01605764, + "balance_loss_mlp": 1.20255029, + "epoch": 0.012120046171604464, + "flos": 502085064192.0, + "grad_norm": 0.035968497482949544, + "language_loss": 1.17104983, + "learning_rate": 0.0008204461118185703, + "loss": 1.18710756, + "num_input_tokens_seen": 4716336, + "router_z_loss_mlp": 4.03515625, + "step": 63, + "time_per_iteration": 2.594369411468506 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01603311, + "balance_loss_mlp": 1.20353031, + "epoch": 0.012312427856868027, + "flos": 474302319360.0, + "grad_norm": 0.04911792883083492, + "language_loss": 1.06295228, + "learning_rate": 0.0008235646872681536, + "loss": 1.07898545, + "num_input_tokens_seen": 4781648, + "router_z_loss_mlp": 3.99609375, + "step": 64, + "time_per_iteration": 2.5651702880859375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01599528, + "balance_loss_mlp": 1.20279896, + "epoch": 0.012504809542131588, + "flos": 539471864064.0, + "grad_norm": 0.049725750424410776, + "language_loss": 1.06296277, + "learning_rate": 0.0008266349107584288, + "loss": 1.07895803, + "num_input_tokens_seen": 4852320, + "router_z_loss_mlp": 3.95898438, + "step": 65, + "time_per_iteration": 2.6876485347747803 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01596697, + "balance_loss_mlp": 1.20492756, + "epoch": 0.012697191227395151, + "flos": 609857541120.0, + "grad_norm": 0.056540756097456804, + "language_loss": 1.08585978, + "learning_rate": 0.0008296582587724851, + "loss": 1.10182667, + "num_input_tokens_seen": 4922016, + "router_z_loss_mlp": 3.91210938, + "step": 66, + "time_per_iteration": 2.71223783493042 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01587883, + "balance_loss_mlp": 1.19821179, + "epoch": 0.012889572912658714, + "flos": 769398600960.0, + "grad_norm": 0.04465917834699911, + "language_loss": 1.0627861, + "learning_rate": 0.0008326361411800136, + "loss": 1.07866502, + "num_input_tokens_seen": 5000128, + "router_z_loss_mlp": 3.89648438, + "step": 67, + "time_per_iteration": 2.9413115978240967 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01577237, + "balance_loss_mlp": 1.19099891, + "epoch": 0.013081954597922277, + "flos": 535021724928.0, + "grad_norm": 0.05343660826588632, + "language_loss": 1.06744349, + "learning_rate": 0.0008355699051851403, + "loss": 1.08321595, + "num_input_tokens_seen": 5074512, + "router_z_loss_mlp": 3.86132812, + "step": 68, + "time_per_iteration": 2.726212501525879 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0157129, + "balance_loss_mlp": 1.18829489, + "epoch": 0.01327433628318584, + "flos": 574181584128.0, + "grad_norm": 0.041490887209285586, + "language_loss": 1.14052749, + "learning_rate": 0.0008384608389860635, + "loss": 1.15624034, + "num_input_tokens_seen": 5141856, + "router_z_loss_mlp": 3.828125, + "step": 69, + "time_per_iteration": 2.6679208278656006 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0156381, + "balance_loss_mlp": 1.18386579, + "epoch": 0.013466717968449404, + "flos": 498259967232.0, + "grad_norm": 0.03618836919088814, + "language_loss": 1.04182374, + "learning_rate": 0.000841310175171381, + "loss": 1.05746174, + "num_input_tokens_seen": 5209280, + "router_z_loss_mlp": 3.796875, + "step": 70, + "time_per_iteration": 2.6277127265930176 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01563963, + "balance_loss_mlp": 1.18592632, + "epoch": 0.013659099653712967, + "flos": 566622763776.0, + "grad_norm": 0.04320101591589407, + "language_loss": 1.02295327, + "learning_rate": 0.000844119093875517, + "loss": 1.03859293, + "num_input_tokens_seen": 5285424, + "router_z_loss_mlp": 3.77734375, + "step": 71, + "time_per_iteration": 2.7236883640289307 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01558639, + "balance_loss_mlp": 1.18403625, + "epoch": 0.01385148133897653, + "flos": 574943686656.0, + "grad_norm": 0.03416580025853519, + "language_loss": 1.06855714, + "learning_rate": 0.0008468887257134666, + "loss": 1.08414352, + "num_input_tokens_seen": 5358624, + "router_z_loss_mlp": 3.7421875, + "step": 72, + "time_per_iteration": 2.6696412563323975 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01558456, + "balance_loss_mlp": 1.18499684, + "epoch": 0.014043863024240093, + "flos": 577959048960.0, + "grad_norm": 0.037886537215891476, + "language_loss": 1.09368944, + "learning_rate": 0.0008496201545131264, + "loss": 1.10927403, + "num_input_tokens_seen": 5429792, + "router_z_loss_mlp": 3.73046875, + "step": 73, + "time_per_iteration": 2.701594591140747 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01545785, + "balance_loss_mlp": 1.17575896, + "epoch": 0.014236244709503656, + "flos": 940265252352.0, + "grad_norm": 0.04766211184506119, + "language_loss": 1.07240248, + "learning_rate": 0.0008523144198617317, + "loss": 1.08786011, + "num_input_tokens_seen": 5518608, + "router_z_loss_mlp": 3.6953125, + "step": 74, + "time_per_iteration": 3.1882145404815674 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01551426, + "balance_loss_mlp": 1.18387985, + "epoch": 0.014428626394767219, + "flos": 529496478720.0, + "grad_norm": 0.031986864242930464, + "language_loss": 1.06216824, + "learning_rate": 0.0008549725194813783, + "loss": 1.0776825, + "num_input_tokens_seen": 5590576, + "router_z_loss_mlp": 3.66992188, + "step": 75, + "time_per_iteration": 2.666274309158325 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01546662, + "balance_loss_mlp": 1.18102288, + "epoch": 0.014621008080030782, + "flos": 805283549952.0, + "grad_norm": 0.03321604497436844, + "language_loss": 1.05779314, + "learning_rate": 0.0008575954114472099, + "loss": 1.07325983, + "num_input_tokens_seen": 5674224, + "router_z_loss_mlp": 3.65039062, + "step": 76, + "time_per_iteration": 3.1192731857299805 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01547179, + "balance_loss_mlp": 1.18478322, + "epoch": 0.014813389765294343, + "flos": 698357746176.0, + "grad_norm": 0.03477979781895141, + "language_loss": 1.02737951, + "learning_rate": 0.0008601840162606118, + "loss": 1.04285145, + "num_input_tokens_seen": 5757648, + "router_z_loss_mlp": 3.6171875, + "step": 77, + "time_per_iteration": 3.0015783309936523 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01547226, + "balance_loss_mlp": 1.18788171, + "epoch": 0.015005771450557906, + "flos": 598165476864.0, + "grad_norm": 0.032631512960834254, + "language_loss": 1.09477437, + "learning_rate": 0.000862739218788641, + "loss": 1.11024666, + "num_input_tokens_seen": 5837600, + "router_z_loss_mlp": 3.58984375, + "step": 78, + "time_per_iteration": 2.790245771408081 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01536731, + "balance_loss_mlp": 1.18177319, + "epoch": 0.01519815313582147, + "flos": 550493199360.0, + "grad_norm": 0.0308447873241268, + "language_loss": 1.07131243, + "learning_rate": 0.0008652618700799138, + "loss": 1.0866797, + "num_input_tokens_seen": 5907248, + "router_z_loss_mlp": 3.55664062, + "step": 79, + "time_per_iteration": 2.6302430629730225 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01532812, + "balance_loss_mlp": 1.18033433, + "epoch": 0.015390534821085032, + "flos": 431440817664.0, + "grad_norm": 0.04595099678969376, + "language_loss": 1.06556606, + "learning_rate": 0.0008677527890662774, + "loss": 1.08089423, + "num_input_tokens_seen": 5970864, + "router_z_loss_mlp": 3.53125, + "step": 80, + "time_per_iteration": 2.4970459938049316 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01520539, + "balance_loss_mlp": 1.17130363, + "epoch": 0.015582916506348595, + "flos": 525185345280.0, + "grad_norm": 0.030530536654869142, + "language_loss": 1.07461143, + "learning_rate": 0.0008702127641587799, + "loss": 1.08981681, + "num_input_tokens_seen": 6040800, + "router_z_loss_mlp": 3.49804688, + "step": 81, + "time_per_iteration": 2.6258630752563477 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01512144, + "balance_loss_mlp": 1.16500628, + "epoch": 0.015775298191612157, + "flos": 576617591040.0, + "grad_norm": 0.026948447424875538, + "language_loss": 1.02672768, + "learning_rate": 0.0008726425547457192, + "loss": 1.04184914, + "num_input_tokens_seen": 6111840, + "router_z_loss_mlp": 3.4765625, + "step": 82, + "time_per_iteration": 2.7344956398010254 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01517079, + "balance_loss_mlp": 1.17375636, + "epoch": 0.01596767987687572, + "flos": 611440071936.0, + "grad_norm": 0.03479426421062965, + "language_loss": 1.02940345, + "learning_rate": 0.0008750428925998964, + "loss": 1.04457426, + "num_input_tokens_seen": 6183872, + "router_z_loss_mlp": 3.4375, + "step": 83, + "time_per_iteration": 2.738685369491577 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01509349, + "balance_loss_mlp": 1.16850555, + "epoch": 0.016160061562139283, + "flos": 568233484800.0, + "grad_norm": 0.05178756375238081, + "language_loss": 1.08039558, + "learning_rate": 0.0008774144832015932, + "loss": 1.09548914, + "num_input_tokens_seen": 6255760, + "router_z_loss_mlp": 3.41210938, + "step": 84, + "time_per_iteration": 2.6948299407958984 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02575775, + "balance_loss_mlp": 2.26144409, + "epoch": 0.016352443247402846, + "flos": 1414502431488.0, + "grad_norm": 0.37456313977874084, + "language_loss": 0.74774313, + "learning_rate": 0.0008797580069832641, + "loss": 0.7735008, + "num_input_tokens_seen": 6472960, + "router_z_loss_mlp": 3.140625, + "step": 85, + "time_per_iteration": 4.596364974975586 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01517697, + "balance_loss_mlp": 1.17895198, + "epoch": 0.01654482493266641, + "flos": 731786279424.0, + "grad_norm": 0.04138572693056026, + "language_loss": 1.03059626, + "learning_rate": 0.0008820741205014318, + "loss": 1.04577315, + "num_input_tokens_seen": 6548912, + "router_z_loss_mlp": 3.390625, + "step": 86, + "time_per_iteration": 2.901047706604004 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01566516, + "balance_loss_mlp": 1.22757995, + "epoch": 0.016737206617929972, + "flos": 537405242112.0, + "grad_norm": 0.0588613682629828, + "language_loss": 1.04849172, + "learning_rate": 0.0008843634575408404, + "loss": 1.06415701, + "num_input_tokens_seen": 6621520, + "router_z_loss_mlp": 3.39257812, + "step": 87, + "time_per_iteration": 2.6739823818206787 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01583525, + "balance_loss_mlp": 1.24497032, + "epoch": 0.016929588303193535, + "flos": 538130406144.0, + "grad_norm": 0.09131872689500015, + "language_loss": 1.06101418, + "learning_rate": 0.0008866266301555082, + "loss": 1.07684946, + "num_input_tokens_seen": 6698432, + "router_z_loss_mlp": 3.38867188, + "step": 88, + "time_per_iteration": 2.741093635559082 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0156783, + "balance_loss_mlp": 1.23118281, + "epoch": 0.017121969988457098, + "flos": 527792438784.0, + "grad_norm": 0.07103005743700296, + "language_loss": 1.07027078, + "learning_rate": 0.0008888642296509615, + "loss": 1.08594918, + "num_input_tokens_seen": 6764336, + "router_z_loss_mlp": 3.36914062, + "step": 89, + "time_per_iteration": 2.622267007827759 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01554346, + "balance_loss_mlp": 1.2196058, + "epoch": 0.01731435167372066, + "flos": 626768649984.0, + "grad_norm": 0.057543283798364535, + "language_loss": 1.11941445, + "learning_rate": 0.0008910768275115906, + "loss": 1.13495779, + "num_input_tokens_seen": 6839392, + "router_z_loss_mlp": 3.34960938, + "step": 90, + "time_per_iteration": 2.778939962387085 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01536545, + "balance_loss_mlp": 1.20409441, + "epoch": 0.017506733358984224, + "flos": 497385103872.0, + "grad_norm": 0.06951140803051024, + "language_loss": 1.07318401, + "learning_rate": 0.0008932649762767675, + "loss": 1.08854938, + "num_input_tokens_seen": 6907344, + "router_z_loss_mlp": 3.32617188, + "step": 91, + "time_per_iteration": 2.5841660499572754 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01529864, + "balance_loss_mlp": 1.20122755, + "epoch": 0.017699115044247787, + "flos": 747218870016.0, + "grad_norm": 0.037985069994816135, + "language_loss": 1.10022223, + "learning_rate": 0.0008954292103690864, + "loss": 1.11552095, + "num_input_tokens_seen": 6982464, + "router_z_loss_mlp": 3.28710938, + "step": 92, + "time_per_iteration": 2.976200580596924 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01525091, + "balance_loss_mlp": 1.19893408, + "epoch": 0.01789149672951135, + "flos": 516521282304.0, + "grad_norm": 0.05507041657686672, + "language_loss": 1.1172272, + "learning_rate": 0.0008975700468778296, + "loss": 1.13247812, + "num_input_tokens_seen": 7049712, + "router_z_loss_mlp": 3.26171875, + "step": 93, + "time_per_iteration": 2.5778274536132812 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01518338, + "balance_loss_mlp": 1.19427943, + "epoch": 0.018083878414774913, + "flos": 587230657536.0, + "grad_norm": 0.047907590915393955, + "language_loss": 1.05762661, + "learning_rate": 0.0008996879863005366, + "loss": 1.07280993, + "num_input_tokens_seen": 7120288, + "router_z_loss_mlp": 3.24023438, + "step": 94, + "time_per_iteration": 2.6827101707458496 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01506508, + "balance_loss_mlp": 1.18664575, + "epoch": 0.018276260100038477, + "flos": 498370782720.0, + "grad_norm": 0.03950158468897577, + "language_loss": 1.05640411, + "learning_rate": 0.0009017835132453337, + "loss": 1.07146931, + "num_input_tokens_seen": 7188896, + "router_z_loss_mlp": 3.19726562, + "step": 95, + "time_per_iteration": 2.5879104137420654 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01488471, + "balance_loss_mlp": 1.17223215, + "epoch": 0.01846864178530204, + "flos": 641233058304.0, + "grad_norm": 0.042611409633865054, + "language_loss": 1.05607677, + "learning_rate": 0.0009038570970964896, + "loss": 1.07096148, + "num_input_tokens_seen": 7259536, + "router_z_loss_mlp": 3.16015625, + "step": 96, + "time_per_iteration": 2.761634349822998 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01487316, + "balance_loss_mlp": 1.17374837, + "epoch": 0.018661023470565603, + "flos": 512667995136.0, + "grad_norm": 0.026597294022958493, + "language_loss": 1.02809072, + "learning_rate": 0.0009059091926454854, + "loss": 1.04296374, + "num_input_tokens_seen": 7326752, + "router_z_loss_mlp": 3.1328125, + "step": 97, + "time_per_iteration": 2.602036952972412 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01487556, + "balance_loss_mlp": 1.17742097, + "epoch": 0.018853405155829166, + "flos": 932697683712.0, + "grad_norm": 0.04097414840704221, + "language_loss": 1.01764143, + "learning_rate": 0.0009079402406897198, + "loss": 1.03251696, + "num_input_tokens_seen": 7417488, + "router_z_loss_mlp": 3.09765625, + "step": 98, + "time_per_iteration": 3.2514705657958984 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01483888, + "balance_loss_mlp": 1.17642295, + "epoch": 0.01904578684109273, + "flos": 577587718656.0, + "grad_norm": 0.027217181555243938, + "language_loss": 1.03385735, + "learning_rate": 0.0009099506686008212, + "loss": 1.04869628, + "num_input_tokens_seen": 7493136, + "router_z_loss_mlp": 3.0703125, + "step": 99, + "time_per_iteration": 2.7867672443389893 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01473245, + "balance_loss_mlp": 1.16883183, + "epoch": 0.019238168526356292, + "flos": 559521789696.0, + "grad_norm": 0.02943095981266107, + "language_loss": 1.06245995, + "learning_rate": 0.0009119408908644013, + "loss": 1.07719231, + "num_input_tokens_seen": 7560896, + "router_z_loss_mlp": 3.0390625, + "step": 100, + "time_per_iteration": 2.718982219696045 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01466855, + "balance_loss_mlp": 1.164922, + "epoch": 0.019430550211619855, + "flos": 725104267776.0, + "grad_norm": 0.035830377247789626, + "language_loss": 1.12020779, + "learning_rate": 0.0009139113095929519, + "loss": 1.13487625, + "num_input_tokens_seen": 7629040, + "router_z_loss_mlp": 3.01367188, + "step": 101, + "time_per_iteration": 2.9023444652557373 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0146708, + "balance_loss_mlp": 1.16781712, + "epoch": 0.019622931896883418, + "flos": 500456846592.0, + "grad_norm": 0.031534744220975436, + "language_loss": 1.0658195, + "learning_rate": 0.0009158623150134762, + "loss": 1.08049035, + "num_input_tokens_seen": 7694256, + "router_z_loss_mlp": 2.98632812, + "step": 102, + "time_per_iteration": 2.5731325149536133 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01479653, + "balance_loss_mlp": 1.1828692, + "epoch": 0.01981531358214698, + "flos": 510282532608.0, + "grad_norm": 0.0334583858191085, + "language_loss": 1.05968487, + "learning_rate": 0.000917794285931332, + "loss": 1.07448149, + "num_input_tokens_seen": 7762256, + "router_z_loss_mlp": 2.9609375, + "step": 103, + "time_per_iteration": 2.656132221221924 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01477107, + "balance_loss_mlp": 1.18184972, + "epoch": 0.020007695267410544, + "flos": 522393559296.0, + "grad_norm": 0.033386157220771755, + "language_loss": 0.97816026, + "learning_rate": 0.0009197075901716639, + "loss": 0.99293131, + "num_input_tokens_seen": 7834400, + "router_z_loss_mlp": 2.9453125, + "step": 104, + "time_per_iteration": 2.7207133769989014 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01472947, + "balance_loss_mlp": 1.1811223, + "epoch": 0.020200076952674107, + "flos": 534444314880.0, + "grad_norm": 0.03432724584635873, + "language_loss": 1.08410704, + "learning_rate": 0.0009216025849997171, + "loss": 1.09883642, + "num_input_tokens_seen": 7911184, + "router_z_loss_mlp": 2.92382812, + "step": 105, + "time_per_iteration": 2.783440113067627 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01461838, + "balance_loss_mlp": 1.17115784, + "epoch": 0.020392458637937667, + "flos": 686083414272.0, + "grad_norm": 0.04360543496830388, + "language_loss": 1.02907205, + "learning_rate": 0.0009234796175212258, + "loss": 1.04369044, + "num_input_tokens_seen": 7985280, + "router_z_loss_mlp": 2.9140625, + "step": 106, + "time_per_iteration": 2.914760112762451 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01450941, + "balance_loss_mlp": 1.1615957, + "epoch": 0.02058484032320123, + "flos": 703415430912.0, + "grad_norm": 0.03266429542390293, + "language_loss": 1.06572628, + "learning_rate": 0.000925339025064007, + "loss": 1.08023572, + "num_input_tokens_seen": 8068320, + "router_z_loss_mlp": 2.90039062, + "step": 107, + "time_per_iteration": 2.951838254928589 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01453976, + "balance_loss_mlp": 1.16558492, + "epoch": 0.020777222008464793, + "flos": 640328059392.0, + "grad_norm": 0.03192051704400644, + "language_loss": 0.99516582, + "learning_rate": 0.0009271811355418027, + "loss": 1.00970554, + "num_input_tokens_seen": 8148144, + "router_z_loss_mlp": 2.890625, + "step": 108, + "time_per_iteration": 2.897881507873535 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01449031, + "balance_loss_mlp": 1.16159379, + "epoch": 0.020969603693728356, + "flos": 683321763840.0, + "grad_norm": 0.04466737388011785, + "language_loss": 1.06219566, + "learning_rate": 0.0009290062678013548, + "loss": 1.07668602, + "num_input_tokens_seen": 8222256, + "router_z_loss_mlp": 2.88085938, + "step": 109, + "time_per_iteration": 2.8423218727111816 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01430047, + "balance_loss_mlp": 1.14413536, + "epoch": 0.02116198537899192, + "flos": 534420015360.0, + "grad_norm": 0.034258615277409615, + "language_loss": 1.04797208, + "learning_rate": 0.0009308147319536321, + "loss": 1.06227255, + "num_input_tokens_seen": 8292432, + "router_z_loss_mlp": 2.86523438, + "step": 110, + "time_per_iteration": 2.6316323280334473 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01425495, + "balance_loss_mlp": 1.14053667, + "epoch": 0.021354367064255482, + "flos": 718728457728.0, + "grad_norm": 0.048864006828935096, + "language_loss": 1.11352324, + "learning_rate": 0.0009326068296900676, + "loss": 1.12777817, + "num_input_tokens_seen": 8365024, + "router_z_loss_mlp": 2.85546875, + "step": 111, + "time_per_iteration": 2.8313205242156982 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01416698, + "balance_loss_mlp": 1.13269377, + "epoch": 0.021546748749519045, + "flos": 520624390656.0, + "grad_norm": 0.040751650479700946, + "language_loss": 1.01643181, + "learning_rate": 0.0009343828545846161, + "loss": 1.03059864, + "num_input_tokens_seen": 8442448, + "router_z_loss_mlp": 2.84570312, + "step": 112, + "time_per_iteration": 2.7729175090789795 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01401674, + "balance_loss_mlp": 1.11805177, + "epoch": 0.021739130434782608, + "flos": 506161927680.0, + "grad_norm": 0.042106341000359294, + "language_loss": 1.06266427, + "learning_rate": 0.0009361430923823841, + "loss": 1.07668102, + "num_input_tokens_seen": 8508992, + "router_z_loss_mlp": 2.84179688, + "step": 113, + "time_per_iteration": 2.5920841693878174 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01394311, + "balance_loss_mlp": 1.11126053, + "epoch": 0.02193151212004617, + "flos": 464427055872.0, + "grad_norm": 0.07156510336232694, + "language_loss": 1.09574234, + "learning_rate": 0.0009378878212755459, + "loss": 1.10968542, + "num_input_tokens_seen": 8574048, + "router_z_loss_mlp": 2.8359375, + "step": 114, + "time_per_iteration": 2.5213706493377686 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01376382, + "balance_loss_mlp": 1.09371293, + "epoch": 0.022123893805309734, + "flos": 553332617472.0, + "grad_norm": 0.03568103744776456, + "language_loss": 0.9948864, + "learning_rate": 0.0009396173121672103, + "loss": 1.0086503, + "num_input_tokens_seen": 8647808, + "router_z_loss_mlp": 2.83203125, + "step": 115, + "time_per_iteration": 2.654648780822754 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01351182, + "balance_loss_mlp": 1.0677501, + "epoch": 0.022316275490573297, + "flos": 637379771136.0, + "grad_norm": 0.04471438423319615, + "language_loss": 1.05214882, + "learning_rate": 0.0009413318289238633, + "loss": 1.06566072, + "num_input_tokens_seen": 8719760, + "router_z_loss_mlp": 2.83984375, + "step": 116, + "time_per_iteration": 2.7842695713043213 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01311882, + "balance_loss_mlp": 1.0282588, + "epoch": 0.02250865717583686, + "flos": 800316271872.0, + "grad_norm": 0.046340717018109684, + "language_loss": 0.97282118, + "learning_rate": 0.0009430316286169771, + "loss": 0.98593992, + "num_input_tokens_seen": 8798752, + "router_z_loss_mlp": 2.84179688, + "step": 117, + "time_per_iteration": 3.015839099884033 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01377985, + "balance_loss_mlp": 1.09283674, + "epoch": 0.022701038861100423, + "flos": 457063621632.0, + "grad_norm": 0.07808854544893538, + "language_loss": 1.02862036, + "learning_rate": 0.0009447169617543361, + "loss": 1.04240024, + "num_input_tokens_seen": 8866848, + "router_z_loss_mlp": 2.85742188, + "step": 118, + "time_per_iteration": 2.582919120788574 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01371579, + "balance_loss_mlp": 1.08871901, + "epoch": 0.022893420546363986, + "flos": 584187105024.0, + "grad_norm": 0.08661397198668377, + "language_loss": 1.09685123, + "learning_rate": 0.0009463880725016029, + "loss": 1.11056697, + "num_input_tokens_seen": 8935488, + "router_z_loss_mlp": 2.83398438, + "step": 119, + "time_per_iteration": 2.6932969093322754 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01312523, + "balance_loss_mlp": 1.03252411, + "epoch": 0.02308580223162755, + "flos": 562478826240.0, + "grad_norm": 0.04303328442288268, + "language_loss": 1.04977584, + "learning_rate": 0.0009480451988946134, + "loss": 1.06290102, + "num_input_tokens_seen": 9015344, + "router_z_loss_mlp": 2.8046875, + "step": 120, + "time_per_iteration": 2.8070547580718994 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01299284, + "balance_loss_mlp": 1.02252805, + "epoch": 0.023278183916891113, + "flos": 772646287872.0, + "grad_norm": 0.03799067846502037, + "language_loss": 1.05637264, + "learning_rate": 0.0009496885730428627, + "loss": 1.0693655, + "num_input_tokens_seen": 9094672, + "router_z_loss_mlp": 2.77148438, + "step": 121, + "time_per_iteration": 3.014753580093384 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0130842, + "balance_loss_mlp": 1.03376198, + "epoch": 0.023470565602154676, + "flos": 554431057152.0, + "grad_norm": 0.04194740398285866, + "language_loss": 1.04016769, + "learning_rate": 0.0009513184213246156, + "loss": 1.05325174, + "num_input_tokens_seen": 9160608, + "router_z_loss_mlp": 2.75, + "step": 122, + "time_per_iteration": 2.633074998855591 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01316034, + "balance_loss_mlp": 1.04442739, + "epoch": 0.02366294728741824, + "flos": 561167503872.0, + "grad_norm": 0.038872106950025416, + "language_loss": 1.07101583, + "learning_rate": 0.0009529349645740552, + "loss": 1.08417618, + "num_input_tokens_seen": 9228704, + "router_z_loss_mlp": 2.71875, + "step": 123, + "time_per_iteration": 2.6846470832824707 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01320226, + "balance_loss_mlp": 1.05014575, + "epoch": 0.0238553289726818, + "flos": 469517788416.0, + "grad_norm": 0.03403697644067516, + "language_loss": 1.05937934, + "learning_rate": 0.0009545384182608524, + "loss": 1.07258177, + "num_input_tokens_seen": 9294288, + "router_z_loss_mlp": 2.703125, + "step": 124, + "time_per_iteration": 2.5332376956939697 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01326404, + "balance_loss_mlp": 1.05880272, + "epoch": 0.024047710657945365, + "flos": 561104320512.0, + "grad_norm": 0.042208642163400256, + "language_loss": 1.03444421, + "learning_rate": 0.0009561289926625252, + "loss": 1.04770815, + "num_input_tokens_seen": 9368048, + "router_z_loss_mlp": 2.67773438, + "step": 125, + "time_per_iteration": 2.68180251121521 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01324487, + "balance_loss_mlp": 1.05841172, + "epoch": 0.024240092343208928, + "flos": 505771155456.0, + "grad_norm": 0.03944680997458598, + "language_loss": 1.08491933, + "learning_rate": 0.0009577068930299292, + "loss": 1.0981642, + "num_input_tokens_seen": 9434848, + "router_z_loss_mlp": 2.66210938, + "step": 126, + "time_per_iteration": 2.602088689804077 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01323529, + "balance_loss_mlp": 1.05936122, + "epoch": 0.02443247402847249, + "flos": 436753181184.0, + "grad_norm": 0.04017271590188075, + "language_loss": 1.04077768, + "learning_rate": 0.0009592723197462087, + "loss": 1.05401289, + "num_input_tokens_seen": 9504112, + "router_z_loss_mlp": 2.64257812, + "step": 127, + "time_per_iteration": 2.643617630004883 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01318089, + "balance_loss_mlp": 1.05563784, + "epoch": 0.024624855713736054, + "flos": 685069545216.0, + "grad_norm": 0.03549644551725154, + "language_loss": 1.0056293, + "learning_rate": 0.0009608254684795125, + "loss": 1.01881027, + "num_input_tokens_seen": 9590032, + "router_z_loss_mlp": 2.625, + "step": 128, + "time_per_iteration": 2.949061632156372 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01309242, + "balance_loss_mlp": 1.04831672, + "epoch": 0.024817237398999614, + "flos": 526114643712.0, + "grad_norm": 0.03183934804306691, + "language_loss": 1.03377914, + "learning_rate": 0.0009623665303297678, + "loss": 1.04687166, + "num_input_tokens_seen": 9663040, + "router_z_loss_mlp": 2.609375, + "step": 129, + "time_per_iteration": 2.7315783500671387 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0130104, + "balance_loss_mlp": 1.04106867, + "epoch": 0.025009619084263177, + "flos": 656887279872.0, + "grad_norm": 0.038944166016075116, + "language_loss": 1.07603359, + "learning_rate": 0.0009638956919697878, + "loss": 1.08904397, + "num_input_tokens_seen": 9736544, + "router_z_loss_mlp": 2.59960938, + "step": 130, + "time_per_iteration": 2.9588887691497803 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01293161, + "balance_loss_mlp": 1.03395224, + "epoch": 0.02520200076952674, + "flos": 455370275328.0, + "grad_norm": 0.03345888261117193, + "language_loss": 0.99743778, + "learning_rate": 0.0009654131357809714, + "loss": 1.0103693, + "num_input_tokens_seen": 9804656, + "router_z_loss_mlp": 2.59179688, + "step": 131, + "time_per_iteration": 2.5802786350250244 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01296775, + "balance_loss_mlp": 1.03966463, + "epoch": 0.025394382454790303, + "flos": 841269599232.0, + "grad_norm": 0.04496153180844387, + "language_loss": 1.08517051, + "learning_rate": 0.0009669190399838441, + "loss": 1.09813821, + "num_input_tokens_seen": 9888864, + "router_z_loss_mlp": 2.5703125, + "step": 132, + "time_per_iteration": 3.1034374237060547 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01297684, + "balance_loss_mlp": 1.04190826, + "epoch": 0.025586764140053866, + "flos": 582229353216.0, + "grad_norm": 0.044253016077327914, + "language_loss": 1.0183959, + "learning_rate": 0.0009684135787636724, + "loss": 1.03137255, + "num_input_tokens_seen": 9968208, + "router_z_loss_mlp": 2.55664062, + "step": 133, + "time_per_iteration": 2.8056888580322266 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01284726, + "balance_loss_mlp": 1.03066742, + "epoch": 0.02577914582531743, + "flos": 791678453760.0, + "grad_norm": 0.04023348500073193, + "language_loss": 1.06134284, + "learning_rate": 0.0009698969223913726, + "loss": 1.07419014, + "num_input_tokens_seen": 10049664, + "router_z_loss_mlp": 2.5390625, + "step": 134, + "time_per_iteration": 3.0520598888397217 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01279327, + "balance_loss_mlp": 1.02717578, + "epoch": 0.025971527510580992, + "flos": 596063861760.0, + "grad_norm": 0.02965492003563146, + "language_loss": 1.08660483, + "learning_rate": 0.0009713692373399265, + "loss": 1.09939814, + "num_input_tokens_seen": 10120096, + "router_z_loss_mlp": 2.51953125, + "step": 135, + "time_per_iteration": 2.679379463195801 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01931427, + "balance_loss_mlp": 1.66744995, + "epoch": 0.026163909195844555, + "flos": 1581077391360.0, + "grad_norm": 0.18396358569787127, + "language_loss": 0.79456228, + "learning_rate": 0.0009728306863964993, + "loss": 0.81387651, + "num_input_tokens_seen": 10348976, + "router_z_loss_mlp": 2.640625, + "step": 136, + "time_per_iteration": 5.69318151473999 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01580238, + "balance_loss_mlp": 1.32083893, + "epoch": 0.026356290881108118, + "flos": 1505163555840.0, + "grad_norm": 0.11058621392355464, + "language_loss": 0.77811038, + "learning_rate": 0.0009742814287704512, + "loss": 0.79391277, + "num_input_tokens_seen": 10576512, + "router_z_loss_mlp": 2.59375, + "step": 137, + "time_per_iteration": 4.930646896362305 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01336039, + "balance_loss_mlp": 1.08846498, + "epoch": 0.02654867256637168, + "flos": 598341421056.0, + "grad_norm": 0.05793494017899448, + "language_loss": 1.01254559, + "learning_rate": 0.0009757216201974225, + "loss": 1.02590609, + "num_input_tokens_seen": 10659168, + "router_z_loss_mlp": 2.47265625, + "step": 138, + "time_per_iteration": 2.8532111644744873 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01376264, + "balance_loss_mlp": 1.13059723, + "epoch": 0.026741054251635244, + "flos": 546136379136.0, + "grad_norm": 0.07027637242601113, + "language_loss": 1.06507492, + "learning_rate": 0.0009771514130396581, + "loss": 1.07883763, + "num_input_tokens_seen": 10731584, + "router_z_loss_mlp": 2.453125, + "step": 139, + "time_per_iteration": 2.742065668106079 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01373402, + "balance_loss_mlp": 1.12792611, + "epoch": 0.026933435936898807, + "flos": 507846525696.0, + "grad_norm": 0.06681977417406691, + "language_loss": 1.06790614, + "learning_rate": 0.00097857095638274, + "loss": 1.08164012, + "num_input_tokens_seen": 10799456, + "router_z_loss_mlp": 2.45117188, + "step": 140, + "time_per_iteration": 2.689812660217285 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01350241, + "balance_loss_mlp": 1.10533786, + "epoch": 0.02712581762216237, + "flos": 742254504192.0, + "grad_norm": 0.04346752833457442, + "language_loss": 0.97943556, + "learning_rate": 0.0009799803961288726, + "loss": 0.99293798, + "num_input_tokens_seen": 10886416, + "router_z_loss_mlp": 2.4453125, + "step": 141, + "time_per_iteration": 3.064852714538574 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01340988, + "balance_loss_mlp": 1.09684777, + "epoch": 0.027318199307425933, + "flos": 849779105280.0, + "grad_norm": 0.04419232462487818, + "language_loss": 1.04253626, + "learning_rate": 0.000981379875086876, + "loss": 1.05594611, + "num_input_tokens_seen": 10966064, + "router_z_loss_mlp": 2.4375, + "step": 142, + "time_per_iteration": 3.049978494644165 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01342845, + "balance_loss_mlp": 1.09870481, + "epoch": 0.027510580992689496, + "flos": 576638978304.0, + "grad_norm": 0.03936283820829166, + "language_loss": 0.99339008, + "learning_rate": 0.0009827695330590185, + "loss": 1.00681853, + "num_input_tokens_seen": 11039712, + "router_z_loss_mlp": 2.4375, + "step": 143, + "time_per_iteration": 2.677050828933716 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01360296, + "balance_loss_mlp": 1.11729932, + "epoch": 0.02770296267795306, + "flos": 773790414336.0, + "grad_norm": 0.036415015399305896, + "language_loss": 0.98794824, + "learning_rate": 0.0009841495069248256, + "loss": 1.00155115, + "num_input_tokens_seen": 11123984, + "router_z_loss_mlp": 2.42578125, + "step": 144, + "time_per_iteration": 2.9983932971954346 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01369915, + "balance_loss_mlp": 1.12768197, + "epoch": 0.027895344363216622, + "flos": 570449806080.0, + "grad_norm": 0.04357781303470995, + "language_loss": 0.98341697, + "learning_rate": 0.0009855199307219871, + "loss": 0.99711609, + "num_input_tokens_seen": 11192864, + "router_z_loss_mlp": 2.41796875, + "step": 145, + "time_per_iteration": 2.6622605323791504 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0136275, + "balance_loss_mlp": 1.12261522, + "epoch": 0.028087726048480186, + "flos": 548409080832.0, + "grad_norm": 0.032618269384273584, + "language_loss": 1.00131154, + "learning_rate": 0.0009868809357244854, + "loss": 1.01493907, + "num_input_tokens_seen": 11261760, + "router_z_loss_mlp": 2.39648438, + "step": 146, + "time_per_iteration": 2.7002813816070557 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01347166, + "balance_loss_mlp": 1.10836601, + "epoch": 0.02828010773374375, + "flos": 525873570816.0, + "grad_norm": 0.032542426789695725, + "language_loss": 1.04416764, + "learning_rate": 0.0009882326505180556, + "loss": 1.05763924, + "num_input_tokens_seen": 11334736, + "router_z_loss_mlp": 2.3828125, + "step": 147, + "time_per_iteration": 2.710149049758911 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01334853, + "balance_loss_mlp": 1.09815085, + "epoch": 0.02847248941900731, + "flos": 773772917760.0, + "grad_norm": 0.045451062042893155, + "language_loss": 1.02790403, + "learning_rate": 0.0009895752010730906, + "loss": 1.04125249, + "num_input_tokens_seen": 11409872, + "router_z_loss_mlp": 2.36132812, + "step": 148, + "time_per_iteration": 2.965888261795044 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01328294, + "balance_loss_mlp": 1.0936898, + "epoch": 0.028664871104270875, + "flos": 535470822912.0, + "grad_norm": 0.03549847888949514, + "language_loss": 1.08720016, + "learning_rate": 0.0009909087108150867, + "loss": 1.10048318, + "num_input_tokens_seen": 11481024, + "router_z_loss_mlp": 2.33984375, + "step": 149, + "time_per_iteration": 2.759585380554199 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01328431, + "balance_loss_mlp": 1.09649718, + "epoch": 0.028857252789534438, + "flos": 368605212672.0, + "grad_norm": 0.04584721914032896, + "language_loss": 1.09262538, + "learning_rate": 0.0009922333006927371, + "loss": 1.10590982, + "num_input_tokens_seen": 11544240, + "router_z_loss_mlp": 2.3125, + "step": 150, + "time_per_iteration": 2.5677716732025146 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0132956, + "balance_loss_mlp": 1.09896171, + "epoch": 0.029049634474798, + "flos": 516484343808.0, + "grad_norm": 0.054837011337671125, + "language_loss": 1.02855873, + "learning_rate": 0.0009935490892437632, + "loss": 1.04185438, + "num_input_tokens_seen": 11610416, + "router_z_loss_mlp": 2.29882812, + "step": 151, + "time_per_iteration": 2.5842795372009277 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01323589, + "balance_loss_mlp": 1.09623301, + "epoch": 0.029242016160061564, + "flos": 589349769216.0, + "grad_norm": 0.041624099188269474, + "language_loss": 1.01284385, + "learning_rate": 0.0009948561926585687, + "loss": 1.02607965, + "num_input_tokens_seen": 11687488, + "router_z_loss_mlp": 2.2734375, + "step": 152, + "time_per_iteration": 2.7717602252960205 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01309484, + "balance_loss_mlp": 1.08422625, + "epoch": 0.029434397845325123, + "flos": 553137231360.0, + "grad_norm": 0.04242067063834005, + "language_loss": 1.0541966, + "learning_rate": 0.0009961547248418122, + "loss": 1.0672915, + "num_input_tokens_seen": 11754576, + "router_z_loss_mlp": 2.25976562, + "step": 153, + "time_per_iteration": 2.6492583751678467 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01303287, + "balance_loss_mlp": 1.07898307, + "epoch": 0.029626779530588686, + "flos": 604608360960.0, + "grad_norm": 0.03242941124289258, + "language_loss": 1.02145946, + "learning_rate": 0.0009974447974719707, + "loss": 1.03449237, + "num_input_tokens_seen": 11831360, + "router_z_loss_mlp": 2.25, + "step": 154, + "time_per_iteration": 2.7111871242523193 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01303637, + "balance_loss_mlp": 1.08181214, + "epoch": 0.02981916121585225, + "flos": 622218388992.0, + "grad_norm": 0.03743420896054, + "language_loss": 1.03581393, + "learning_rate": 0.0009987265200589763, + "loss": 1.0488503, + "num_input_tokens_seen": 11902192, + "router_z_loss_mlp": 2.22460938, + "step": 155, + "time_per_iteration": 2.7590832710266113 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01281243, + "balance_loss_mlp": 1.06151628, + "epoch": 0.030011542901115813, + "flos": 662881065984.0, + "grad_norm": 0.03665146617631418, + "language_loss": 1.03448439, + "learning_rate": 0.001, + "loss": 1.04729688, + "num_input_tokens_seen": 11979088, + "router_z_loss_mlp": 2.203125, + "step": 156, + "time_per_iteration": 2.868732452392578 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01262046, + "balance_loss_mlp": 1.04441714, + "epoch": 0.030203924586379376, + "flos": 652819164672.0, + "grad_norm": 0.048414208125286275, + "language_loss": 1.0101347, + "learning_rate": 0.0009999999029413921, + "loss": 1.02275515, + "num_input_tokens_seen": 12059200, + "router_z_loss_mlp": 2.18164062, + "step": 157, + "time_per_iteration": 2.8458704948425293 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01249467, + "balance_loss_mlp": 1.03393674, + "epoch": 0.03039630627164294, + "flos": 532444766976.0, + "grad_norm": 0.038165698108555156, + "language_loss": 1.02398324, + "learning_rate": 0.0009999996117656068, + "loss": 1.03647804, + "num_input_tokens_seen": 12134944, + "router_z_loss_mlp": 2.16015625, + "step": 158, + "time_per_iteration": 2.7255747318267822 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01250196, + "balance_loss_mlp": 1.03657281, + "epoch": 0.030588687956906502, + "flos": 587295786240.0, + "grad_norm": 0.04636715302465643, + "language_loss": 0.95869231, + "learning_rate": 0.0009999991264727564, + "loss": 0.97119427, + "num_input_tokens_seen": 12207936, + "router_z_loss_mlp": 2.140625, + "step": 159, + "time_per_iteration": 2.7805936336517334 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0126418, + "balance_loss_mlp": 1.05284619, + "epoch": 0.030781069642170065, + "flos": 514287464448.0, + "grad_norm": 0.055354258548617474, + "language_loss": 1.07316554, + "learning_rate": 0.0009999984470630296, + "loss": 1.08580732, + "num_input_tokens_seen": 12273200, + "router_z_loss_mlp": 2.1171875, + "step": 160, + "time_per_iteration": 2.6011087894439697 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01284107, + "balance_loss_mlp": 1.07372677, + "epoch": 0.030973451327433628, + "flos": 719560546560.0, + "grad_norm": 0.03499871632601644, + "language_loss": 0.95530587, + "learning_rate": 0.0009999975735366902, + "loss": 0.96814692, + "num_input_tokens_seen": 12359600, + "router_z_loss_mlp": 2.10742188, + "step": 161, + "time_per_iteration": 3.083415985107422 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01283442, + "balance_loss_mlp": 1.07439709, + "epoch": 0.03116583301269719, + "flos": 1111615994880.0, + "grad_norm": 0.03722431710536786, + "language_loss": 0.96960843, + "learning_rate": 0.0009999965058940775, + "loss": 0.9824428, + "num_input_tokens_seen": 12443936, + "router_z_loss_mlp": 2.09375, + "step": 162, + "time_per_iteration": 3.5389657020568848 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01264072, + "balance_loss_mlp": 1.05655301, + "epoch": 0.031358214697960754, + "flos": 451833883392.0, + "grad_norm": 0.04231417263227255, + "language_loss": 1.04135799, + "learning_rate": 0.0009999952441356057, + "loss": 1.05399871, + "num_input_tokens_seen": 12507488, + "router_z_loss_mlp": 2.078125, + "step": 163, + "time_per_iteration": 2.5445146560668945 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01239952, + "balance_loss_mlp": 1.03357697, + "epoch": 0.031550596383224314, + "flos": 1257087309312.0, + "grad_norm": 0.03293922474511325, + "language_loss": 1.04807603, + "learning_rate": 0.000999993788261765, + "loss": 1.06047547, + "num_input_tokens_seen": 12594096, + "router_z_loss_mlp": 2.06640625, + "step": 164, + "time_per_iteration": 3.603273391723633 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01233685, + "balance_loss_mlp": 1.02769136, + "epoch": 0.03174297806848788, + "flos": 669323950080.0, + "grad_norm": 0.03785089383184646, + "language_loss": 1.05591631, + "learning_rate": 0.00099999213827312, + "loss": 1.06825328, + "num_input_tokens_seen": 12669424, + "router_z_loss_mlp": 2.0625, + "step": 165, + "time_per_iteration": 2.822242498397827 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01237294, + "balance_loss_mlp": 1.03206336, + "epoch": 0.03193535975375144, + "flos": 552364435200.0, + "grad_norm": 0.03413051380570177, + "language_loss": 1.00392842, + "learning_rate": 0.000999990294170312, + "loss": 1.01630139, + "num_input_tokens_seen": 12740080, + "router_z_loss_mlp": 2.0546875, + "step": 166, + "time_per_iteration": 2.6473989486694336 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0124218, + "balance_loss_mlp": 1.03790259, + "epoch": 0.032127741439015006, + "flos": 544740486144.0, + "grad_norm": 0.02951320831702663, + "language_loss": 1.04371905, + "learning_rate": 0.0009999882559540566, + "loss": 1.0561409, + "num_input_tokens_seen": 12810576, + "router_z_loss_mlp": 2.04492188, + "step": 167, + "time_per_iteration": 2.654994487762451 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01249753, + "balance_loss_mlp": 1.04661989, + "epoch": 0.032320123124278566, + "flos": 549514323456.0, + "grad_norm": 0.03217165834370848, + "language_loss": 1.01348543, + "learning_rate": 0.000999986023625145, + "loss": 1.02598298, + "num_input_tokens_seen": 12887904, + "router_z_loss_mlp": 2.03320312, + "step": 168, + "time_per_iteration": 2.759324550628662 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01736656, + "balance_loss_mlp": 1.53829193, + "epoch": 0.03251250480954213, + "flos": 1308817963776.0, + "grad_norm": 0.15145695156494207, + "language_loss": 0.78924417, + "learning_rate": 0.0009999835971844441, + "loss": 0.8066107, + "num_input_tokens_seen": 13107344, + "router_z_loss_mlp": 1.9765625, + "step": 169, + "time_per_iteration": 4.9954283237457275 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0125768, + "balance_loss_mlp": 1.05588245, + "epoch": 0.03270488649480569, + "flos": 562202760192.0, + "grad_norm": 0.04037677915440104, + "language_loss": 1.01481748, + "learning_rate": 0.0009999809766328958, + "loss": 1.02739429, + "num_input_tokens_seen": 13175552, + "router_z_loss_mlp": 2.01953125, + "step": 170, + "time_per_iteration": 2.6656970977783203 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01250876, + "balance_loss_mlp": 1.0494597, + "epoch": 0.03289726818006926, + "flos": 483339657984.0, + "grad_norm": 0.04232720535630845, + "language_loss": 1.03883123, + "learning_rate": 0.0009999781619715177, + "loss": 1.0513401, + "num_input_tokens_seen": 13242384, + "router_z_loss_mlp": 2.015625, + "step": 171, + "time_per_iteration": 2.5408902168273926 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01238141, + "balance_loss_mlp": 1.03786898, + "epoch": 0.03308964986533282, + "flos": 675821269248.0, + "grad_norm": 0.04278552863969592, + "language_loss": 1.04043615, + "learning_rate": 0.000999975153201402, + "loss": 1.05281758, + "num_input_tokens_seen": 13316160, + "router_z_loss_mlp": 2.00390625, + "step": 172, + "time_per_iteration": 2.85229754447937 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01233367, + "balance_loss_mlp": 1.03385854, + "epoch": 0.033282031550596385, + "flos": 610341632256.0, + "grad_norm": 0.04144744195910536, + "language_loss": 1.01965618, + "learning_rate": 0.0009999719503237174, + "loss": 1.03198993, + "num_input_tokens_seen": 13387664, + "router_z_loss_mlp": 1.9921875, + "step": 173, + "time_per_iteration": 2.7612979412078857 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01234993, + "balance_loss_mlp": 1.03739214, + "epoch": 0.033474413235859944, + "flos": 468996758784.0, + "grad_norm": 0.06741318195929925, + "language_loss": 1.10547054, + "learning_rate": 0.0009999685533397073, + "loss": 1.1178205, + "num_input_tokens_seen": 13454528, + "router_z_loss_mlp": 1.97265625, + "step": 174, + "time_per_iteration": 2.5750949382781982 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01246896, + "balance_loss_mlp": 1.05101097, + "epoch": 0.03366679492112351, + "flos": 580715841792.0, + "grad_norm": 0.0354258140398677, + "language_loss": 1.02665091, + "learning_rate": 0.00099996496225069, + "loss": 1.03911996, + "num_input_tokens_seen": 13522528, + "router_z_loss_mlp": 1.95605469, + "step": 175, + "time_per_iteration": 2.6886191368103027 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0124614, + "balance_loss_mlp": 1.05168545, + "epoch": 0.03385917660638707, + "flos": 638886479616.0, + "grad_norm": 0.036851717024697625, + "language_loss": 1.04551578, + "learning_rate": 0.0009999611770580604, + "loss": 1.0579772, + "num_input_tokens_seen": 13601120, + "router_z_loss_mlp": 1.94433594, + "step": 176, + "time_per_iteration": 2.8528547286987305 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01227252, + "balance_loss_mlp": 1.03422809, + "epoch": 0.03405155829165064, + "flos": 442740164352.0, + "grad_norm": 0.05003520598604069, + "language_loss": 1.03819132, + "learning_rate": 0.0009999571977632876, + "loss": 1.0504638, + "num_input_tokens_seen": 13666384, + "router_z_loss_mlp": 1.9296875, + "step": 177, + "time_per_iteration": 2.6220269203186035 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01224145, + "balance_loss_mlp": 1.03188384, + "epoch": 0.034243939976914196, + "flos": 467275222272.0, + "grad_norm": 0.0554689754659714, + "language_loss": 1.0658946, + "learning_rate": 0.0009999530243679166, + "loss": 1.07813609, + "num_input_tokens_seen": 13733968, + "router_z_loss_mlp": 1.921875, + "step": 178, + "time_per_iteration": 2.5593671798706055 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01235914, + "balance_loss_mlp": 1.04479802, + "epoch": 0.03443632166217776, + "flos": 780713498880.0, + "grad_norm": 0.03675993055709111, + "language_loss": 1.01102996, + "learning_rate": 0.0009999486568735675, + "loss": 1.02338898, + "num_input_tokens_seen": 13818960, + "router_z_loss_mlp": 1.91015625, + "step": 179, + "time_per_iteration": 3.083312749862671 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01235549, + "balance_loss_mlp": 1.04548192, + "epoch": 0.03462870334744132, + "flos": 1265760120576.0, + "grad_norm": 0.04656515886260978, + "language_loss": 1.01660061, + "learning_rate": 0.0009999440952819362, + "loss": 1.02895617, + "num_input_tokens_seen": 13912448, + "router_z_loss_mlp": 1.89941406, + "step": 180, + "time_per_iteration": 3.691354513168335 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01231777, + "balance_loss_mlp": 1.04390287, + "epoch": 0.03482108503270489, + "flos": 608303200512.0, + "grad_norm": 0.04339398829325753, + "language_loss": 1.02140999, + "learning_rate": 0.0009999393395947935, + "loss": 1.03372765, + "num_input_tokens_seen": 13990752, + "router_z_loss_mlp": 1.87695312, + "step": 181, + "time_per_iteration": 2.8826780319213867 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01222143, + "balance_loss_mlp": 1.03617644, + "epoch": 0.03501346671796845, + "flos": 539315361792.0, + "grad_norm": 0.033650569268787865, + "language_loss": 1.05363226, + "learning_rate": 0.0009999343898139858, + "loss": 1.06585371, + "num_input_tokens_seen": 14058608, + "router_z_loss_mlp": 1.85742188, + "step": 182, + "time_per_iteration": 2.6785037517547607 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01217643, + "balance_loss_mlp": 1.03329813, + "epoch": 0.035205848403232015, + "flos": 519499706112.0, + "grad_norm": 0.04889617812287003, + "language_loss": 1.03914642, + "learning_rate": 0.0009999292459414348, + "loss": 1.05132294, + "num_input_tokens_seen": 14126656, + "router_z_loss_mlp": 1.84082031, + "step": 183, + "time_per_iteration": 2.648263931274414 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01223183, + "balance_loss_mlp": 1.04103076, + "epoch": 0.035398230088495575, + "flos": 473334137088.0, + "grad_norm": 0.03546540132303448, + "language_loss": 1.08284354, + "learning_rate": 0.0009999239079791374, + "loss": 1.09507537, + "num_input_tokens_seen": 14195840, + "router_z_loss_mlp": 1.81835938, + "step": 184, + "time_per_iteration": 2.6003947257995605 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01229231, + "balance_loss_mlp": 1.04908144, + "epoch": 0.03559061177375914, + "flos": 513095705856.0, + "grad_norm": 0.03580873522044792, + "language_loss": 1.00877666, + "learning_rate": 0.0009999183759291659, + "loss": 1.02106905, + "num_input_tokens_seen": 14269936, + "router_z_loss_mlp": 1.79785156, + "step": 185, + "time_per_iteration": 2.7518959045410156 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01229953, + "balance_loss_mlp": 1.05161583, + "epoch": 0.0357829934590227, + "flos": 478350992640.0, + "grad_norm": 0.05401643684385997, + "language_loss": 1.03586912, + "learning_rate": 0.0009999126497936682, + "loss": 1.04816866, + "num_input_tokens_seen": 14334848, + "router_z_loss_mlp": 1.78710938, + "step": 186, + "time_per_iteration": 2.565373659133911 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01218003, + "balance_loss_mlp": 1.04052448, + "epoch": 0.03597537514428627, + "flos": 645885386496.0, + "grad_norm": 0.027605248849540943, + "language_loss": 1.06344712, + "learning_rate": 0.0009999067295748676, + "loss": 1.07562721, + "num_input_tokens_seen": 14407888, + "router_z_loss_mlp": 1.77832031, + "step": 187, + "time_per_iteration": 2.862023115158081 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01208675, + "balance_loss_mlp": 1.03167319, + "epoch": 0.03616775682954983, + "flos": 582270182400.0, + "grad_norm": 0.041753828035088196, + "language_loss": 1.04174721, + "learning_rate": 0.000999900615275062, + "loss": 1.05383396, + "num_input_tokens_seen": 14479072, + "router_z_loss_mlp": 1.7734375, + "step": 188, + "time_per_iteration": 2.7248780727386475 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01206757, + "balance_loss_mlp": 1.02994609, + "epoch": 0.03636013851481339, + "flos": 383265007104.0, + "grad_norm": 0.05119808239604003, + "language_loss": 1.10189009, + "learning_rate": 0.0009998943068966256, + "loss": 1.11395764, + "num_input_tokens_seen": 14540944, + "router_z_loss_mlp": 1.77148438, + "step": 189, + "time_per_iteration": 2.487445592880249 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01216253, + "balance_loss_mlp": 1.04010975, + "epoch": 0.03655252020007695, + "flos": 584308614144.0, + "grad_norm": 0.029643950017142998, + "language_loss": 1.04644084, + "learning_rate": 0.0009998878044420072, + "loss": 1.05860329, + "num_input_tokens_seen": 14611392, + "router_z_loss_mlp": 1.76464844, + "step": 190, + "time_per_iteration": 2.736809015274048 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.012186, + "balance_loss_mlp": 1.04321897, + "epoch": 0.03674490188534051, + "flos": 472598279424.0, + "grad_norm": 0.03987592529636011, + "language_loss": 1.00565469, + "learning_rate": 0.0009998811079137318, + "loss": 1.01784062, + "num_input_tokens_seen": 14679776, + "router_z_loss_mlp": 1.75683594, + "step": 191, + "time_per_iteration": 2.6006946563720703 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01214791, + "balance_loss_mlp": 1.04017353, + "epoch": 0.03693728357060408, + "flos": 529411908096.0, + "grad_norm": 0.03601320862003297, + "language_loss": 1.01597381, + "learning_rate": 0.0009998742173143987, + "loss": 1.02812171, + "num_input_tokens_seen": 14749712, + "router_z_loss_mlp": 1.74902344, + "step": 192, + "time_per_iteration": 2.6246893405914307 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01200861, + "balance_loss_mlp": 1.02719736, + "epoch": 0.03712966525586764, + "flos": 800346407424.0, + "grad_norm": 0.02962706666311765, + "language_loss": 1.0204885, + "learning_rate": 0.0009998671326466833, + "loss": 1.03249693, + "num_input_tokens_seen": 14827136, + "router_z_loss_mlp": 1.73925781, + "step": 193, + "time_per_iteration": 2.9852418899536133 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01194118, + "balance_loss_mlp": 1.02121651, + "epoch": 0.037322046941131205, + "flos": 831359342592.0, + "grad_norm": 0.049736474928026, + "language_loss": 1.0340569, + "learning_rate": 0.0009998598539133362, + "loss": 1.04599798, + "num_input_tokens_seen": 14902880, + "router_z_loss_mlp": 1.73144531, + "step": 194, + "time_per_iteration": 3.0510568618774414 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01194861, + "balance_loss_mlp": 1.02339077, + "epoch": 0.037514428626394765, + "flos": 438589423872.0, + "grad_norm": 0.030819097200883293, + "language_loss": 1.03682184, + "learning_rate": 0.0009998523811171828, + "loss": 1.04877055, + "num_input_tokens_seen": 14967264, + "router_z_loss_mlp": 1.71679688, + "step": 195, + "time_per_iteration": 2.5203936100006104 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01197718, + "balance_loss_mlp": 1.0269146, + "epoch": 0.03770681031165833, + "flos": 512639804928.0, + "grad_norm": 0.031890398221933944, + "language_loss": 1.04342675, + "learning_rate": 0.0009998447142611248, + "loss": 1.05540395, + "num_input_tokens_seen": 15039104, + "router_z_loss_mlp": 1.70996094, + "step": 196, + "time_per_iteration": 2.659193754196167 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01193737, + "balance_loss_mlp": 1.02341044, + "epoch": 0.03789919199692189, + "flos": 808843274496.0, + "grad_norm": 0.030368823498634023, + "language_loss": 0.97672093, + "learning_rate": 0.0009998368533481387, + "loss": 0.98865831, + "num_input_tokens_seen": 15124864, + "router_z_loss_mlp": 1.70507812, + "step": 197, + "time_per_iteration": 3.031437397003174 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01185957, + "balance_loss_mlp": 1.01677489, + "epoch": 0.03809157368218546, + "flos": 691792386048.0, + "grad_norm": 0.027429804092446938, + "language_loss": 1.00742936, + "learning_rate": 0.0009998287983812762, + "loss": 1.01928902, + "num_input_tokens_seen": 15199680, + "router_z_loss_mlp": 1.69335938, + "step": 198, + "time_per_iteration": 2.8533172607421875 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01186443, + "balance_loss_mlp": 1.01764262, + "epoch": 0.03828395536744902, + "flos": 519004921344.0, + "grad_norm": 0.029672573654994608, + "language_loss": 1.06761527, + "learning_rate": 0.0009998205493636646, + "loss": 1.07947969, + "num_input_tokens_seen": 15270176, + "router_z_loss_mlp": 1.68945312, + "step": 199, + "time_per_iteration": 2.6512415409088135 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01190294, + "balance_loss_mlp": 1.02197027, + "epoch": 0.038476337052712584, + "flos": 582763021824.0, + "grad_norm": 0.03300049351517658, + "language_loss": 0.99112457, + "learning_rate": 0.0009998121062985063, + "loss": 1.00302756, + "num_input_tokens_seen": 15343168, + "router_z_loss_mlp": 1.68457031, + "step": 200, + "time_per_iteration": 2.6979846954345703 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01187054, + "balance_loss_mlp": 1.01996994, + "epoch": 0.03866871873797614, + "flos": 578273998848.0, + "grad_norm": 0.03164459486115397, + "language_loss": 1.0110172, + "learning_rate": 0.0009998034691890794, + "loss": 1.02288771, + "num_input_tokens_seen": 15417328, + "router_z_loss_mlp": 1.671875, + "step": 201, + "time_per_iteration": 2.80670166015625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01183327, + "balance_loss_mlp": 1.01672018, + "epoch": 0.03886110042323971, + "flos": 541772755968.0, + "grad_norm": 0.032663388617215364, + "language_loss": 1.05587053, + "learning_rate": 0.0009997946380387369, + "loss": 1.06770372, + "num_input_tokens_seen": 15489488, + "router_z_loss_mlp": 1.66699219, + "step": 202, + "time_per_iteration": 2.6591310501098633 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01179406, + "balance_loss_mlp": 1.01394379, + "epoch": 0.03905348210850327, + "flos": 719240739072.0, + "grad_norm": 0.030305493428663434, + "language_loss": 1.08528447, + "learning_rate": 0.0009997856128509076, + "loss": 1.09707844, + "num_input_tokens_seen": 15558944, + "router_z_loss_mlp": 1.65527344, + "step": 203, + "time_per_iteration": 2.9006340503692627 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01181527, + "balance_loss_mlp": 1.01720893, + "epoch": 0.039245863793766836, + "flos": 428397265152.0, + "grad_norm": 0.03189317300504765, + "language_loss": 1.03375864, + "learning_rate": 0.0009997763936290952, + "loss": 1.04557395, + "num_input_tokens_seen": 15625024, + "router_z_loss_mlp": 1.64355469, + "step": 204, + "time_per_iteration": 2.5836358070373535 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01178747, + "balance_loss_mlp": 1.01538289, + "epoch": 0.039438245479030395, + "flos": 664270156032.0, + "grad_norm": 0.033629424624266296, + "language_loss": 1.0866276, + "learning_rate": 0.0009997669803768789, + "loss": 1.09841514, + "num_input_tokens_seen": 15697120, + "router_z_loss_mlp": 1.63378906, + "step": 205, + "time_per_iteration": 2.7809464931488037 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01180514, + "balance_loss_mlp": 1.01791251, + "epoch": 0.03963062716429396, + "flos": 636496159488.0, + "grad_norm": 0.025840840316256445, + "language_loss": 1.03755617, + "learning_rate": 0.0009997573730979134, + "loss": 1.04936123, + "num_input_tokens_seen": 15768752, + "router_z_loss_mlp": 1.62597656, + "step": 206, + "time_per_iteration": 2.7759904861450195 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01207138, + "balance_loss_mlp": 1.04272461, + "epoch": 0.03982300884955752, + "flos": 1421589799680.0, + "grad_norm": 0.03078548913711826, + "language_loss": 0.79193199, + "learning_rate": 0.0009997475717959284, + "loss": 0.80400336, + "num_input_tokens_seen": 15980624, + "router_z_loss_mlp": 1.64453125, + "step": 207, + "time_per_iteration": 4.6622114181518555 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01177297, + "balance_loss_mlp": 1.0162214, + "epoch": 0.04001539053482109, + "flos": 690520914432.0, + "grad_norm": 0.03233621027438014, + "language_loss": 1.02104092, + "learning_rate": 0.0009997375764747294, + "loss": 1.03281379, + "num_input_tokens_seen": 16067232, + "router_z_loss_mlp": 1.61035156, + "step": 208, + "time_per_iteration": 2.9808952808380127 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01181785, + "balance_loss_mlp": 1.02156758, + "epoch": 0.04020777222008465, + "flos": 534752461824.0, + "grad_norm": 0.037334696417832054, + "language_loss": 0.99876916, + "learning_rate": 0.0009997273871381967, + "loss": 1.01058698, + "num_input_tokens_seen": 16139808, + "router_z_loss_mlp": 1.6015625, + "step": 209, + "time_per_iteration": 2.6938650608062744 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01183132, + "balance_loss_mlp": 1.02396429, + "epoch": 0.040400153905348214, + "flos": 568997532672.0, + "grad_norm": 0.03228633343407045, + "language_loss": 1.04497194, + "learning_rate": 0.0009997170037902862, + "loss": 1.05680323, + "num_input_tokens_seen": 16210848, + "router_z_loss_mlp": 1.59082031, + "step": 210, + "time_per_iteration": 2.722900629043579 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01189763, + "balance_loss_mlp": 1.03145349, + "epoch": 0.040592535590611774, + "flos": 714679784448.0, + "grad_norm": 0.026587079094436805, + "language_loss": 1.0723207, + "learning_rate": 0.0009997064264350292, + "loss": 1.08421838, + "num_input_tokens_seen": 16283984, + "router_z_loss_mlp": 1.58203125, + "step": 211, + "time_per_iteration": 2.8636813163757324 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01186902, + "balance_loss_mlp": 1.02954614, + "epoch": 0.04078491727587533, + "flos": 579207187968.0, + "grad_norm": 0.028855359605628288, + "language_loss": 1.01311755, + "learning_rate": 0.0009996956550765317, + "loss": 1.02498662, + "num_input_tokens_seen": 16353904, + "router_z_loss_mlp": 1.57226562, + "step": 212, + "time_per_iteration": 2.6752002239227295 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01183355, + "balance_loss_mlp": 1.0270474, + "epoch": 0.0409772989611389, + "flos": 553369555968.0, + "grad_norm": 0.03615073574048419, + "language_loss": 0.96463609, + "learning_rate": 0.0009996846897189762, + "loss": 0.97646964, + "num_input_tokens_seen": 16425488, + "router_z_loss_mlp": 1.56152344, + "step": 213, + "time_per_iteration": 2.618417501449585 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01180441, + "balance_loss_mlp": 1.02470577, + "epoch": 0.04116968064640246, + "flos": 556764996864.0, + "grad_norm": 0.04473264124517712, + "language_loss": 1.02233624, + "learning_rate": 0.0009996735303666193, + "loss": 1.03414059, + "num_input_tokens_seen": 16498016, + "router_z_loss_mlp": 1.55566406, + "step": 214, + "time_per_iteration": 2.7398550510406494 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0118203, + "balance_loss_mlp": 1.026963, + "epoch": 0.041362062331666026, + "flos": 579652395264.0, + "grad_norm": 0.027182691243245845, + "language_loss": 1.04435229, + "learning_rate": 0.0009996621770237937, + "loss": 1.05617261, + "num_input_tokens_seen": 16573744, + "router_z_loss_mlp": 1.54882812, + "step": 215, + "time_per_iteration": 2.7773804664611816 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01182535, + "balance_loss_mlp": 1.02775347, + "epoch": 0.041554444016929586, + "flos": 612701816832.0, + "grad_norm": 0.028683660550217302, + "language_loss": 1.00582075, + "learning_rate": 0.0009996506296949073, + "loss": 1.01764607, + "num_input_tokens_seen": 16655344, + "router_z_loss_mlp": 1.54589844, + "step": 216, + "time_per_iteration": 2.877587080001831 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01180569, + "balance_loss_mlp": 1.02607429, + "epoch": 0.04174682570219315, + "flos": 529151393280.0, + "grad_norm": 0.031901868987761664, + "language_loss": 1.00452459, + "learning_rate": 0.0009996388883844428, + "loss": 1.01633024, + "num_input_tokens_seen": 16726480, + "router_z_loss_mlp": 1.54296875, + "step": 217, + "time_per_iteration": 2.6346311569213867 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01173664, + "balance_loss_mlp": 1.02002692, + "epoch": 0.04193920738745671, + "flos": 512500799232.0, + "grad_norm": 0.02715845750356807, + "language_loss": 1.03465486, + "learning_rate": 0.0009996269530969588, + "loss": 1.04639161, + "num_input_tokens_seen": 16792112, + "router_z_loss_mlp": 1.53417969, + "step": 218, + "time_per_iteration": 2.6205921173095703 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01170474, + "balance_loss_mlp": 1.0176959, + "epoch": 0.04213158907272028, + "flos": 572553366528.0, + "grad_norm": 0.03606301207395498, + "language_loss": 1.04169452, + "learning_rate": 0.0009996148238370888, + "loss": 1.05339921, + "num_input_tokens_seen": 16862960, + "router_z_loss_mlp": 1.52539062, + "step": 219, + "time_per_iteration": 2.8047173023223877 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01169557, + "balance_loss_mlp": 1.01725543, + "epoch": 0.04232397075798384, + "flos": 965905552896.0, + "grad_norm": 0.026524392964530758, + "language_loss": 0.99111861, + "learning_rate": 0.0009996025006095421, + "loss": 1.00281417, + "num_input_tokens_seen": 16950416, + "router_z_loss_mlp": 1.52050781, + "step": 220, + "time_per_iteration": 3.315859317779541 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147995, + "balance_loss_mlp": 0.99693298, + "epoch": 0.042516352443247404, + "flos": 1472733340416.0, + "grad_norm": 0.01509407607306266, + "language_loss": 0.77783144, + "learning_rate": 0.0009995899834191028, + "loss": 0.78931135, + "num_input_tokens_seen": 17180944, + "router_z_loss_mlp": 1.5078125, + "step": 221, + "time_per_iteration": 5.540910243988037 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01166927, + "balance_loss_mlp": 1.0164367, + "epoch": 0.042708734128510964, + "flos": 655892852736.0, + "grad_norm": 0.029367950869880366, + "language_loss": 0.99126619, + "learning_rate": 0.0009995772722706307, + "loss": 1.00293541, + "num_input_tokens_seen": 17257792, + "router_z_loss_mlp": 1.50195312, + "step": 222, + "time_per_iteration": 2.901489019393921 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01167445, + "balance_loss_mlp": 1.01705015, + "epoch": 0.04290111581377453, + "flos": 432734643456.0, + "grad_norm": 0.04040999725558835, + "language_loss": 1.13508129, + "learning_rate": 0.0009995643671690604, + "loss": 1.1467557, + "num_input_tokens_seen": 17320288, + "router_z_loss_mlp": 1.50097656, + "step": 223, + "time_per_iteration": 2.5576720237731934 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01168153, + "balance_loss_mlp": 1.01823533, + "epoch": 0.04309349749903809, + "flos": 645867889920.0, + "grad_norm": 0.02824445481068148, + "language_loss": 1.00763512, + "learning_rate": 0.0009995512681194023, + "loss": 1.01931667, + "num_input_tokens_seen": 17396672, + "router_z_loss_mlp": 1.49609375, + "step": 224, + "time_per_iteration": 2.9571568965911865 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01167559, + "balance_loss_mlp": 1.01840472, + "epoch": 0.04328587918430166, + "flos": 832897153536.0, + "grad_norm": 0.025764365733734692, + "language_loss": 0.98235118, + "learning_rate": 0.0009995379751267417, + "loss": 0.99402678, + "num_input_tokens_seen": 17488096, + "router_z_loss_mlp": 1.48828125, + "step": 225, + "time_per_iteration": 3.2627484798431396 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01166832, + "balance_loss_mlp": 1.01824963, + "epoch": 0.043478260869565216, + "flos": 526116589056.0, + "grad_norm": 0.03531387708455554, + "language_loss": 1.00006318, + "learning_rate": 0.0009995244881962398, + "loss": 1.01173151, + "num_input_tokens_seen": 17557632, + "router_z_loss_mlp": 1.48242188, + "step": 226, + "time_per_iteration": 2.624209403991699 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01170136, + "balance_loss_mlp": 1.02212548, + "epoch": 0.04367064255482878, + "flos": 440413027584.0, + "grad_norm": 0.039279482080902435, + "language_loss": 1.01293874, + "learning_rate": 0.0009995108073331323, + "loss": 1.02464008, + "num_input_tokens_seen": 17626672, + "router_z_loss_mlp": 1.4765625, + "step": 227, + "time_per_iteration": 2.6042520999908447 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01164096, + "balance_loss_mlp": 1.01742136, + "epoch": 0.04386302424009234, + "flos": 508467677184.0, + "grad_norm": 0.03801127181345805, + "language_loss": 1.03535032, + "learning_rate": 0.0009994969325427309, + "loss": 1.04699123, + "num_input_tokens_seen": 17698624, + "router_z_loss_mlp": 1.46582031, + "step": 228, + "time_per_iteration": 2.6691603660583496 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01163607, + "balance_loss_mlp": 1.01769507, + "epoch": 0.04405540592535591, + "flos": 541744565760.0, + "grad_norm": 0.03512041362752814, + "language_loss": 1.00143218, + "learning_rate": 0.0009994828638304218, + "loss": 1.0130682, + "num_input_tokens_seen": 17767760, + "router_z_loss_mlp": 1.46191406, + "step": 229, + "time_per_iteration": 2.627833366394043 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01164617, + "balance_loss_mlp": 1.01927722, + "epoch": 0.04424778761061947, + "flos": 447309867264.0, + "grad_norm": 0.03576658395893793, + "language_loss": 1.06260157, + "learning_rate": 0.0009994686012016675, + "loss": 1.07424784, + "num_input_tokens_seen": 17833664, + "router_z_loss_mlp": 1.45703125, + "step": 230, + "time_per_iteration": 2.515491247177124 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01159156, + "balance_loss_mlp": 1.01448417, + "epoch": 0.044440169295883035, + "flos": 701982599424.0, + "grad_norm": 0.03592315304636455, + "language_loss": 1.05298328, + "learning_rate": 0.000999454144662005, + "loss": 1.06457496, + "num_input_tokens_seen": 17908880, + "router_z_loss_mlp": 1.45019531, + "step": 231, + "time_per_iteration": 2.918896436691284 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01156937, + "balance_loss_mlp": 1.01274192, + "epoch": 0.044632550981146595, + "flos": 589427536896.0, + "grad_norm": 0.032106980286660924, + "language_loss": 0.996499, + "learning_rate": 0.0009994394942170468, + "loss": 1.00806844, + "num_input_tokens_seen": 17978208, + "router_z_loss_mlp": 1.4453125, + "step": 232, + "time_per_iteration": 2.700378179550171 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01169343, + "balance_loss_mlp": 1.02524316, + "epoch": 0.04482493266641016, + "flos": 555855140352.0, + "grad_norm": 0.03061962333593277, + "language_loss": 0.97402102, + "learning_rate": 0.0009994246498724808, + "loss": 0.9857145, + "num_input_tokens_seen": 18049296, + "router_z_loss_mlp": 1.44433594, + "step": 233, + "time_per_iteration": 2.692657232284546 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01171534, + "balance_loss_mlp": 1.02848291, + "epoch": 0.04501731435167372, + "flos": 724070956800.0, + "grad_norm": 0.03598428268947968, + "language_loss": 1.00358808, + "learning_rate": 0.00099940961163407, + "loss": 1.01530337, + "num_input_tokens_seen": 18123296, + "router_z_loss_mlp": 1.43359375, + "step": 234, + "time_per_iteration": 2.8496198654174805 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01167121, + "balance_loss_mlp": 1.02473748, + "epoch": 0.04520969603693728, + "flos": 512798252544.0, + "grad_norm": 0.03236637347420306, + "language_loss": 1.0231185, + "learning_rate": 0.0009993943795076528, + "loss": 1.03478956, + "num_input_tokens_seen": 18192784, + "router_z_loss_mlp": 1.42675781, + "step": 235, + "time_per_iteration": 2.6304001808166504 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01157951, + "balance_loss_mlp": 1.01623452, + "epoch": 0.04540207772220085, + "flos": 365878555392.0, + "grad_norm": 0.04557463461025321, + "language_loss": 1.04854226, + "learning_rate": 0.0009993789534991427, + "loss": 1.06012177, + "num_input_tokens_seen": 18254064, + "router_z_loss_mlp": 1.41992188, + "step": 236, + "time_per_iteration": 2.500347852706909 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01156422, + "balance_loss_mlp": 1.01613641, + "epoch": 0.045594459407464406, + "flos": 523724323584.0, + "grad_norm": 0.028810086143122388, + "language_loss": 0.99360317, + "learning_rate": 0.0009993633336145287, + "loss": 1.00516737, + "num_input_tokens_seen": 18325728, + "router_z_loss_mlp": 1.40527344, + "step": 237, + "time_per_iteration": 2.6991968154907227 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01156358, + "balance_loss_mlp": 1.01664495, + "epoch": 0.04578684109272797, + "flos": 673116966144.0, + "grad_norm": 0.036851747197037266, + "language_loss": 1.03695393, + "learning_rate": 0.0009993475198598752, + "loss": 1.04851758, + "num_input_tokens_seen": 18408608, + "router_z_loss_mlp": 1.39941406, + "step": 238, + "time_per_iteration": 3.0150160789489746 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01160083, + "balance_loss_mlp": 1.02084696, + "epoch": 0.04597922277799153, + "flos": 542621374464.0, + "grad_norm": 0.03967898438127139, + "language_loss": 1.00323462, + "learning_rate": 0.0009993315122413212, + "loss": 1.01483548, + "num_input_tokens_seen": 18471920, + "router_z_loss_mlp": 1.39453125, + "step": 239, + "time_per_iteration": 2.6226179599761963 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0115528, + "balance_loss_mlp": 1.01690221, + "epoch": 0.0461716044632551, + "flos": 459994413312.0, + "grad_norm": 0.029756199222484733, + "language_loss": 1.00536144, + "learning_rate": 0.0009993153107650818, + "loss": 1.01691425, + "num_input_tokens_seen": 18540496, + "router_z_loss_mlp": 1.38574219, + "step": 240, + "time_per_iteration": 2.635673999786377 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01154071, + "balance_loss_mlp": 1.01607406, + "epoch": 0.04636398614851866, + "flos": 456171261696.0, + "grad_norm": 0.03103837756937707, + "language_loss": 0.99882519, + "learning_rate": 0.0009992989154374468, + "loss": 1.01036584, + "num_input_tokens_seen": 18606944, + "router_z_loss_mlp": 1.38183594, + "step": 241, + "time_per_iteration": 2.5449135303497314 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0115588, + "balance_loss_mlp": 1.01836014, + "epoch": 0.046556367833782225, + "flos": 557902320384.0, + "grad_norm": 0.06487144756994469, + "language_loss": 1.0686537, + "learning_rate": 0.0009992823262647817, + "loss": 1.08021247, + "num_input_tokens_seen": 18679520, + "router_z_loss_mlp": 1.37695312, + "step": 242, + "time_per_iteration": 2.705120325088501 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011561, + "balance_loss_mlp": 1.01905739, + "epoch": 0.046748749519045785, + "flos": 594088613376.0, + "grad_norm": 0.03633512017688626, + "language_loss": 1.00915635, + "learning_rate": 0.0009992655432535264, + "loss": 1.02071738, + "num_input_tokens_seen": 18756656, + "router_z_loss_mlp": 1.37207031, + "step": 243, + "time_per_iteration": 2.8158721923828125 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01160044, + "balance_loss_mlp": 1.02347767, + "epoch": 0.04694113120430935, + "flos": 570942645504.0, + "grad_norm": 0.036353271768507285, + "language_loss": 1.01172018, + "learning_rate": 0.0009992485664101973, + "loss": 1.02332067, + "num_input_tokens_seen": 18829792, + "router_z_loss_mlp": 1.3671875, + "step": 244, + "time_per_iteration": 2.723409414291382 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01156505, + "balance_loss_mlp": 1.0207969, + "epoch": 0.04713351288957291, + "flos": 865246689024.0, + "grad_norm": 0.05316255083066814, + "language_loss": 1.03417325, + "learning_rate": 0.000999231395741385, + "loss": 1.04573822, + "num_input_tokens_seen": 18906864, + "router_z_loss_mlp": 1.35839844, + "step": 245, + "time_per_iteration": 3.1441562175750732 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01155254, + "balance_loss_mlp": 1.02011812, + "epoch": 0.04732589457483648, + "flos": 538236364032.0, + "grad_norm": 0.039550829703112036, + "language_loss": 1.01375949, + "learning_rate": 0.0009992140312537557, + "loss": 1.02531195, + "num_input_tokens_seen": 18973632, + "router_z_loss_mlp": 1.35253906, + "step": 246, + "time_per_iteration": 2.6407320499420166 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01158298, + "balance_loss_mlp": 1.02402055, + "epoch": 0.04751827626010004, + "flos": 763272612096.0, + "grad_norm": 0.029332271702031103, + "language_loss": 0.96132767, + "learning_rate": 0.000999196472954051, + "loss": 0.97291064, + "num_input_tokens_seen": 19052944, + "router_z_loss_mlp": 1.34375, + "step": 247, + "time_per_iteration": 2.9791386127471924 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0115741, + "balance_loss_mlp": 1.02313232, + "epoch": 0.0477106579453636, + "flos": 1583128462080.0, + "grad_norm": 0.019406803026512872, + "language_loss": 0.79424852, + "learning_rate": 0.0009991787208490878, + "loss": 0.80582267, + "num_input_tokens_seen": 19286288, + "router_z_loss_mlp": 1.34375, + "step": 248, + "time_per_iteration": 5.547277927398682 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0115733, + "balance_loss_mlp": 1.02457833, + "epoch": 0.04790303963062716, + "flos": 458693784576.0, + "grad_norm": 0.04949407998464004, + "language_loss": 1.04053593, + "learning_rate": 0.0009991607749457578, + "loss": 1.05210924, + "num_input_tokens_seen": 19349296, + "router_z_loss_mlp": 1.328125, + "step": 249, + "time_per_iteration": 2.610372304916382 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01158188, + "balance_loss_mlp": 1.02629459, + "epoch": 0.04809542131589073, + "flos": 783787186944.0, + "grad_norm": 0.03428496832179458, + "language_loss": 1.01565814, + "learning_rate": 0.0009991426352510286, + "loss": 1.02723992, + "num_input_tokens_seen": 19428416, + "router_z_loss_mlp": 1.31933594, + "step": 250, + "time_per_iteration": 2.9723451137542725 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01158477, + "balance_loss_mlp": 1.0272516, + "epoch": 0.04828780300115429, + "flos": 560322776064.0, + "grad_norm": 0.03370153589925739, + "language_loss": 1.02967048, + "learning_rate": 0.0009991243017719422, + "loss": 1.04125512, + "num_input_tokens_seen": 19498688, + "router_z_loss_mlp": 1.3125, + "step": 251, + "time_per_iteration": 2.691317319869995 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0115263, + "balance_loss_mlp": 1.02149975, + "epoch": 0.048480184686417856, + "flos": 502922989056.0, + "grad_norm": 0.033537523086657674, + "language_loss": 0.98110956, + "learning_rate": 0.0009991057745156165, + "loss": 0.99263585, + "num_input_tokens_seen": 19567568, + "router_z_loss_mlp": 1.31152344, + "step": 252, + "time_per_iteration": 2.615726947784424 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01126877, + "balance_loss_mlp": 0.99641418, + "epoch": 0.048672566371681415, + "flos": 1539471810048.0, + "grad_norm": 0.00943295316075806, + "language_loss": 0.81910986, + "learning_rate": 0.0009990870534892446, + "loss": 0.83037865, + "num_input_tokens_seen": 19796368, + "router_z_loss_mlp": 1.3046875, + "step": 253, + "time_per_iteration": 5.119662523269653 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01155145, + "balance_loss_mlp": 1.02439594, + "epoch": 0.04886494805694498, + "flos": 538952779776.0, + "grad_norm": 0.04101934284448647, + "language_loss": 1.06555986, + "learning_rate": 0.0009990681387000943, + "loss": 1.07711136, + "num_input_tokens_seen": 19870480, + "router_z_loss_mlp": 1.30761719, + "step": 254, + "time_per_iteration": 2.7494144439697266 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153346, + "balance_loss_mlp": 1.02316916, + "epoch": 0.04905732974220854, + "flos": 681485521152.0, + "grad_norm": 0.029284228955777224, + "language_loss": 1.01195645, + "learning_rate": 0.0009990490301555093, + "loss": 1.02348995, + "num_input_tokens_seen": 19956288, + "router_z_loss_mlp": 1.30175781, + "step": 255, + "time_per_iteration": 2.9595844745635986 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113356, + "balance_loss_mlp": 1.00462341, + "epoch": 0.04924971142747211, + "flos": 1424277573120.0, + "grad_norm": 0.011666997955433429, + "language_loss": 0.79215157, + "learning_rate": 0.0009990297278629078, + "loss": 0.80348712, + "num_input_tokens_seen": 20180080, + "router_z_loss_mlp": 1.2890625, + "step": 256, + "time_per_iteration": 4.918023347854614 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01126785, + "balance_loss_mlp": 0.99822998, + "epoch": 0.04944209311273567, + "flos": 1561239381504.0, + "grad_norm": 0.006197531934497474, + "language_loss": 0.79242742, + "learning_rate": 0.000999010231829784, + "loss": 0.80369532, + "num_input_tokens_seen": 20413456, + "router_z_loss_mlp": 1.28515625, + "step": 257, + "time_per_iteration": 4.996341228485107 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01127556, + "balance_loss_mlp": 0.99976349, + "epoch": 0.04963447479799923, + "flos": 1574173748736.0, + "grad_norm": 0.01126324229515774, + "language_loss": 0.69975883, + "learning_rate": 0.0009989905420637066, + "loss": 0.71103442, + "num_input_tokens_seen": 20644736, + "router_z_loss_mlp": 1.27734375, + "step": 258, + "time_per_iteration": 4.951507329940796 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01167552, + "balance_loss_mlp": 1.03966403, + "epoch": 0.049826856483262794, + "flos": 626499386880.0, + "grad_norm": 0.07394024090910019, + "language_loss": 0.96613419, + "learning_rate": 0.0009989706585723202, + "loss": 0.97780967, + "num_input_tokens_seen": 20719040, + "router_z_loss_mlp": 1.27832031, + "step": 259, + "time_per_iteration": 2.819796085357666 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01158437, + "balance_loss_mlp": 1.03073978, + "epoch": 0.05001923816852635, + "flos": 505156806912.0, + "grad_norm": 0.042054435700702504, + "language_loss": 1.02184892, + "learning_rate": 0.0009989505813633442, + "loss": 1.0334332, + "num_input_tokens_seen": 20789376, + "router_z_loss_mlp": 1.27636719, + "step": 260, + "time_per_iteration": 2.671597719192505 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149384, + "balance_loss_mlp": 1.02206886, + "epoch": 0.05021161985378992, + "flos": 588468102912.0, + "grad_norm": 0.05343186989039486, + "language_loss": 1.02308297, + "learning_rate": 0.000998930310444573, + "loss": 1.03457689, + "num_input_tokens_seen": 20857856, + "router_z_loss_mlp": 1.27246094, + "step": 261, + "time_per_iteration": 2.7573728561401367 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145576, + "balance_loss_mlp": 1.01883233, + "epoch": 0.05040400153905348, + "flos": 634403292672.0, + "grad_norm": 0.052960623500171895, + "language_loss": 1.00806391, + "learning_rate": 0.0009989098458238765, + "loss": 1.01951981, + "num_input_tokens_seen": 20931232, + "router_z_loss_mlp": 1.26660156, + "step": 262, + "time_per_iteration": 2.7937912940979004 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146699, + "balance_loss_mlp": 1.02033675, + "epoch": 0.050596383224317046, + "flos": 554809190400.0, + "grad_norm": 0.04531187332347281, + "language_loss": 0.99888676, + "learning_rate": 0.0009988891875091998, + "loss": 1.0103538, + "num_input_tokens_seen": 21012672, + "router_z_loss_mlp": 1.26269531, + "step": 263, + "time_per_iteration": 2.811218500137329 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145189, + "balance_loss_mlp": 1.01939976, + "epoch": 0.050788764909580605, + "flos": 550762462464.0, + "grad_norm": 0.03965392167411722, + "language_loss": 0.94696999, + "learning_rate": 0.0009988683355085636, + "loss": 0.95842183, + "num_input_tokens_seen": 21088592, + "router_z_loss_mlp": 1.25683594, + "step": 264, + "time_per_iteration": 2.7378242015838623 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141586, + "balance_loss_mlp": 1.01617777, + "epoch": 0.05098114659484417, + "flos": 606345448704.0, + "grad_norm": 0.024717188615823983, + "language_loss": 1.02827787, + "learning_rate": 0.000998847289830063, + "loss": 1.03969371, + "num_input_tokens_seen": 21169840, + "router_z_loss_mlp": 1.25292969, + "step": 265, + "time_per_iteration": 2.8625917434692383 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142152, + "balance_loss_mlp": 1.01693416, + "epoch": 0.05117352828010773, + "flos": 439473035520.0, + "grad_norm": 0.036783183293041616, + "language_loss": 0.96527213, + "learning_rate": 0.0009988260504818682, + "loss": 0.97669363, + "num_input_tokens_seen": 21236144, + "router_z_loss_mlp": 1.25097656, + "step": 266, + "time_per_iteration": 2.5658230781555176 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138539, + "balance_loss_mlp": 1.0135119, + "epoch": 0.0513659099653713, + "flos": 506031670272.0, + "grad_norm": 0.04116504124695153, + "language_loss": 1.03285778, + "learning_rate": 0.000998804617472226, + "loss": 1.0442431, + "num_input_tokens_seen": 21304864, + "router_z_loss_mlp": 1.24902344, + "step": 267, + "time_per_iteration": 2.63395094871521 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138256, + "balance_loss_mlp": 1.01418352, + "epoch": 0.05155829165063486, + "flos": 696715922688.0, + "grad_norm": 0.034853618125567455, + "language_loss": 0.98327756, + "learning_rate": 0.0009987829908094568, + "loss": 0.9946602, + "num_input_tokens_seen": 21377504, + "router_z_loss_mlp": 1.23925781, + "step": 268, + "time_per_iteration": 2.8239262104034424 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136912, + "balance_loss_mlp": 1.01331627, + "epoch": 0.051750673335898424, + "flos": 1350302059008.0, + "grad_norm": 0.042488112993129025, + "language_loss": 1.04893267, + "learning_rate": 0.0009987611705019569, + "loss": 1.0603019, + "num_input_tokens_seen": 21463840, + "router_z_loss_mlp": 1.234375, + "step": 269, + "time_per_iteration": 4.33854079246521 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137181, + "balance_loss_mlp": 1.01387095, + "epoch": 0.051943055021161984, + "flos": 490590331392.0, + "grad_norm": 0.037116049987967636, + "language_loss": 1.03026497, + "learning_rate": 0.0009987391565581978, + "loss": 1.04163671, + "num_input_tokens_seen": 21531184, + "router_z_loss_mlp": 1.23144531, + "step": 270, + "time_per_iteration": 2.609722852706909 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136969, + "balance_loss_mlp": 1.01365864, + "epoch": 0.05213543670642555, + "flos": 546880985088.0, + "grad_norm": 0.03927026934880779, + "language_loss": 0.95517516, + "learning_rate": 0.000998716948986726, + "loss": 0.96654487, + "num_input_tokens_seen": 21612224, + "router_z_loss_mlp": 1.23144531, + "step": 271, + "time_per_iteration": 2.797673225402832 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137765, + "balance_loss_mlp": 1.01512277, + "epoch": 0.05232781839168911, + "flos": 604673489664.0, + "grad_norm": 0.04118655717732696, + "language_loss": 0.97937191, + "learning_rate": 0.0009986945477961633, + "loss": 0.9907496, + "num_input_tokens_seen": 21681024, + "router_z_loss_mlp": 1.22460938, + "step": 272, + "time_per_iteration": 2.6988775730133057 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135438, + "balance_loss_mlp": 1.01336777, + "epoch": 0.052520200076952676, + "flos": 539656556544.0, + "grad_norm": 0.027940819886650203, + "language_loss": 1.02222085, + "learning_rate": 0.0009986719529952066, + "loss": 1.0335753, + "num_input_tokens_seen": 21761616, + "router_z_loss_mlp": 1.21875, + "step": 273, + "time_per_iteration": 2.9503016471862793 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01133251, + "balance_loss_mlp": 1.01175284, + "epoch": 0.052712581762216236, + "flos": 464333736960.0, + "grad_norm": 0.036678205813438995, + "language_loss": 1.02377117, + "learning_rate": 0.000998649164592628, + "loss": 1.0351038, + "num_input_tokens_seen": 21828416, + "router_z_loss_mlp": 1.21289062, + "step": 274, + "time_per_iteration": 2.575183868408203 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134193, + "balance_loss_mlp": 1.01279056, + "epoch": 0.0529049634474798, + "flos": 549106054656.0, + "grad_norm": 0.029580362230619023, + "language_loss": 1.00386071, + "learning_rate": 0.0009986261825972748, + "loss": 1.01520276, + "num_input_tokens_seen": 21901600, + "router_z_loss_mlp": 1.21191406, + "step": 275, + "time_per_iteration": 2.781388521194458 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136428, + "balance_loss_mlp": 1.01578796, + "epoch": 0.05309734513274336, + "flos": 619201081344.0, + "grad_norm": 0.028327187192750843, + "language_loss": 1.01742268, + "learning_rate": 0.000998603007018069, + "loss": 1.0287869, + "num_input_tokens_seen": 21979312, + "router_z_loss_mlp": 1.20410156, + "step": 276, + "time_per_iteration": 2.8231008052825928 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137197, + "balance_loss_mlp": 1.01665294, + "epoch": 0.05328972681800693, + "flos": 606618602496.0, + "grad_norm": 0.02408735734832513, + "language_loss": 1.00149679, + "learning_rate": 0.0009985796378640089, + "loss": 1.01286888, + "num_input_tokens_seen": 22053776, + "router_z_loss_mlp": 1.203125, + "step": 277, + "time_per_iteration": 2.721719264984131 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136169, + "balance_loss_mlp": 1.01610124, + "epoch": 0.05348210850327049, + "flos": 605731100160.0, + "grad_norm": 0.0319931943489141, + "language_loss": 0.99697894, + "learning_rate": 0.0009985560751441665, + "loss": 1.0083406, + "num_input_tokens_seen": 22134304, + "router_z_loss_mlp": 1.19824219, + "step": 278, + "time_per_iteration": 2.835160255432129 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01133809, + "balance_loss_mlp": 1.01412332, + "epoch": 0.053674490188534055, + "flos": 631998388224.0, + "grad_norm": 0.030840524384760076, + "language_loss": 1.0228467, + "learning_rate": 0.00099853231886769, + "loss": 1.03418469, + "num_input_tokens_seen": 22212896, + "router_z_loss_mlp": 1.19433594, + "step": 279, + "time_per_iteration": 2.8541102409362793 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01131641, + "balance_loss_mlp": 1.01243138, + "epoch": 0.053866871873797614, + "flos": 480174596352.0, + "grad_norm": 0.030057370429500904, + "language_loss": 1.01521945, + "learning_rate": 0.0009985083690438024, + "loss": 1.02653599, + "num_input_tokens_seen": 22287216, + "router_z_loss_mlp": 1.18945312, + "step": 280, + "time_per_iteration": 2.778996706008911 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01133594, + "balance_loss_mlp": 1.01514757, + "epoch": 0.054059253559061174, + "flos": 789490322688.0, + "grad_norm": 0.030570218765999514, + "language_loss": 0.92515564, + "learning_rate": 0.0009984842256818016, + "loss": 0.93649161, + "num_input_tokens_seen": 22370864, + "router_z_loss_mlp": 1.18164062, + "step": 281, + "time_per_iteration": 3.113694429397583 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137735, + "balance_loss_mlp": 1.01928854, + "epoch": 0.05425163524432474, + "flos": 629506000896.0, + "grad_norm": 0.043548376252248826, + "language_loss": 1.03102541, + "learning_rate": 0.0009984598887910613, + "loss": 1.04240274, + "num_input_tokens_seen": 22440080, + "router_z_loss_mlp": 1.18164062, + "step": 282, + "time_per_iteration": 2.8303444385528564 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01132534, + "balance_loss_mlp": 1.01504183, + "epoch": 0.0544440169295883, + "flos": 616993508352.0, + "grad_norm": 0.05077708884656826, + "language_loss": 0.98823464, + "learning_rate": 0.0009984353583810297, + "loss": 0.99956, + "num_input_tokens_seen": 22517936, + "router_z_loss_mlp": 1.171875, + "step": 283, + "time_per_iteration": 2.835850954055786 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01129981, + "balance_loss_mlp": 1.01315546, + "epoch": 0.05463639861485187, + "flos": 648930884352.0, + "grad_norm": 0.03524270200319673, + "language_loss": 1.0117259, + "learning_rate": 0.0009984106344612302, + "loss": 1.02302563, + "num_input_tokens_seen": 22590480, + "router_z_loss_mlp": 1.16503906, + "step": 284, + "time_per_iteration": 2.760528564453125 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01129453, + "balance_loss_mlp": 1.01319993, + "epoch": 0.054828780300115426, + "flos": 798585987072.0, + "grad_norm": 0.03078454247465455, + "language_loss": 0.96210134, + "learning_rate": 0.0009983857170412615, + "loss": 0.97339588, + "num_input_tokens_seen": 22668144, + "router_z_loss_mlp": 1.15917969, + "step": 285, + "time_per_iteration": 2.9911587238311768 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01131741, + "balance_loss_mlp": 1.01567924, + "epoch": 0.05502116198537899, + "flos": 550799400960.0, + "grad_norm": 0.028192528419898312, + "language_loss": 0.95645988, + "learning_rate": 0.000998360606130798, + "loss": 0.96777725, + "num_input_tokens_seen": 22749648, + "router_z_loss_mlp": 1.15722656, + "step": 286, + "time_per_iteration": 2.8603405952453613 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01119957, + "balance_loss_mlp": 1.00475311, + "epoch": 0.05521354367064255, + "flos": 1410909659136.0, + "grad_norm": 0.016802553847575376, + "language_loss": 0.69073117, + "learning_rate": 0.0009983353017395877, + "loss": 0.70193076, + "num_input_tokens_seen": 22982752, + "router_z_loss_mlp": 1.1484375, + "step": 287, + "time_per_iteration": 4.872994899749756 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139216, + "balance_loss_mlp": 1.02372622, + "epoch": 0.05540592535590612, + "flos": 646612495872.0, + "grad_norm": 0.03160477576624613, + "language_loss": 1.01500821, + "learning_rate": 0.0009983098038774552, + "loss": 1.02640033, + "num_input_tokens_seen": 23053584, + "router_z_loss_mlp": 1.15136719, + "step": 288, + "time_per_iteration": 2.7645044326782227 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01119652, + "balance_loss_mlp": 1.00521088, + "epoch": 0.05559830704116968, + "flos": 1514318512896.0, + "grad_norm": 0.011772143096286682, + "language_loss": 0.78170228, + "learning_rate": 0.0009982841125542993, + "loss": 0.79289877, + "num_input_tokens_seen": 23280256, + "router_z_loss_mlp": 1.140625, + "step": 289, + "time_per_iteration": 4.783201456069946 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150059, + "balance_loss_mlp": 1.03542745, + "epoch": 0.055790688726433245, + "flos": 509335737600.0, + "grad_norm": 0.037615798403722346, + "language_loss": 1.00063777, + "learning_rate": 0.0009982582277800948, + "loss": 1.01213825, + "num_input_tokens_seen": 23345760, + "router_z_loss_mlp": 1.14257812, + "step": 290, + "time_per_iteration": 2.5825588703155518 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142418, + "balance_loss_mlp": 1.02873969, + "epoch": 0.055983070411696804, + "flos": 659075410944.0, + "grad_norm": 0.03490310528255379, + "language_loss": 1.06654799, + "learning_rate": 0.0009982321495648908, + "loss": 1.07797217, + "num_input_tokens_seen": 23420720, + "router_z_loss_mlp": 1.13671875, + "step": 291, + "time_per_iteration": 2.8099231719970703 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137522, + "balance_loss_mlp": 1.02470279, + "epoch": 0.05617545209696037, + "flos": 588476851200.0, + "grad_norm": 0.035465642673631545, + "language_loss": 0.97683877, + "learning_rate": 0.0009982058779188115, + "loss": 0.98821402, + "num_input_tokens_seen": 23492576, + "router_z_loss_mlp": 1.13183594, + "step": 292, + "time_per_iteration": 2.7125580310821533 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136096, + "balance_loss_mlp": 1.02384841, + "epoch": 0.05636783378222393, + "flos": 612788332800.0, + "grad_norm": 0.032210362870472055, + "language_loss": 1.05647731, + "learning_rate": 0.0009981794128520567, + "loss": 1.06783831, + "num_input_tokens_seen": 23569824, + "router_z_loss_mlp": 1.12597656, + "step": 293, + "time_per_iteration": 2.7916390895843506 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135514, + "balance_loss_mlp": 1.0241251, + "epoch": 0.0565602154674875, + "flos": 669424071936.0, + "grad_norm": 0.03595229916115603, + "language_loss": 1.02550793, + "learning_rate": 0.000998152754374901, + "loss": 1.03686309, + "num_input_tokens_seen": 23649984, + "router_z_loss_mlp": 1.1171875, + "step": 294, + "time_per_iteration": 2.8770558834075928 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134115, + "balance_loss_mlp": 1.0227263, + "epoch": 0.05675259715275106, + "flos": 618365101824.0, + "grad_norm": 0.028486588423889302, + "language_loss": 0.98274708, + "learning_rate": 0.0009981259024976943, + "loss": 0.99408829, + "num_input_tokens_seen": 23722032, + "router_z_loss_mlp": 1.1171875, + "step": 295, + "time_per_iteration": 2.729853630065918 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01133246, + "balance_loss_mlp": 1.02204788, + "epoch": 0.05694497883801462, + "flos": 753154330368.0, + "grad_norm": 0.04188437456637708, + "language_loss": 0.968624, + "learning_rate": 0.0009980988572308612, + "loss": 0.97995651, + "num_input_tokens_seen": 23797376, + "router_z_loss_mlp": 1.11523438, + "step": 296, + "time_per_iteration": 3.0135345458984375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01132496, + "balance_loss_mlp": 1.02187026, + "epoch": 0.05713736052327818, + "flos": 713382067968.0, + "grad_norm": 0.0305883196599643, + "language_loss": 0.9903996, + "learning_rate": 0.0009980716185849015, + "loss": 1.0017246, + "num_input_tokens_seen": 23880496, + "router_z_loss_mlp": 1.109375, + "step": 297, + "time_per_iteration": 2.9962668418884277 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01129278, + "balance_loss_mlp": 1.01865172, + "epoch": 0.05732974220854175, + "flos": 469936750848.0, + "grad_norm": 0.029025981508343963, + "language_loss": 0.95620793, + "learning_rate": 0.0009980441865703904, + "loss": 0.96750069, + "num_input_tokens_seen": 23950016, + "router_z_loss_mlp": 1.109375, + "step": 298, + "time_per_iteration": 2.67486572265625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01126421, + "balance_loss_mlp": 1.0163666, + "epoch": 0.05752212389380531, + "flos": 602541739008.0, + "grad_norm": 0.028406065642448373, + "language_loss": 1.04190016, + "learning_rate": 0.000998016561197978, + "loss": 1.05316436, + "num_input_tokens_seen": 24020064, + "router_z_loss_mlp": 1.10351562, + "step": 299, + "time_per_iteration": 2.7435965538024902 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01127499, + "balance_loss_mlp": 1.01773107, + "epoch": 0.057714505579068875, + "flos": 679950622464.0, + "grad_norm": 0.02999406165417261, + "language_loss": 0.957955, + "learning_rate": 0.0009979887424783895, + "loss": 0.96922994, + "num_input_tokens_seen": 24095360, + "router_z_loss_mlp": 1.10058594, + "step": 300, + "time_per_iteration": 2.868412494659424 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01127678, + "balance_loss_mlp": 1.01800561, + "epoch": 0.057906887264332435, + "flos": 597012602112.0, + "grad_norm": 0.033381964405594114, + "language_loss": 0.95279002, + "learning_rate": 0.0009979607304224248, + "loss": 0.96406674, + "num_input_tokens_seen": 24164608, + "router_z_loss_mlp": 1.09960938, + "step": 301, + "time_per_iteration": 2.7196099758148193 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01127179, + "balance_loss_mlp": 1.01760185, + "epoch": 0.058099268949596, + "flos": 553165421568.0, + "grad_norm": 0.029428698202492602, + "language_loss": 1.02305853, + "learning_rate": 0.000997932525040959, + "loss": 1.03433037, + "num_input_tokens_seen": 24233840, + "router_z_loss_mlp": 1.09863281, + "step": 302, + "time_per_iteration": 2.645131826400757 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01126073, + "balance_loss_mlp": 1.0166868, + "epoch": 0.05829165063485956, + "flos": 509231725056.0, + "grad_norm": 0.033454482596205204, + "language_loss": 1.04832363, + "learning_rate": 0.000997904126344943, + "loss": 1.05958426, + "num_input_tokens_seen": 24302928, + "router_z_loss_mlp": 1.09667969, + "step": 303, + "time_per_iteration": 2.60955810546875 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125584, + "balance_loss_mlp": 1.0157212, + "epoch": 0.05848403232012313, + "flos": 616363608576.0, + "grad_norm": 0.0319979050325151, + "language_loss": 1.00779867, + "learning_rate": 0.0009978755343454018, + "loss": 1.01905453, + "num_input_tokens_seen": 24377024, + "router_z_loss_mlp": 1.1015625, + "step": 304, + "time_per_iteration": 2.733825206756592 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124254, + "balance_loss_mlp": 1.01467645, + "epoch": 0.05867641400538669, + "flos": 501079943424.0, + "grad_norm": 0.03385536533959698, + "language_loss": 1.01509869, + "learning_rate": 0.0009978467490534355, + "loss": 1.0263412, + "num_input_tokens_seen": 24442736, + "router_z_loss_mlp": 1.09863281, + "step": 305, + "time_per_iteration": 2.6263206005096436 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121932, + "balance_loss_mlp": 1.01292717, + "epoch": 0.05886879569065025, + "flos": 532379638272.0, + "grad_norm": 0.03088897761094542, + "language_loss": 0.98605353, + "learning_rate": 0.00099781777048022, + "loss": 0.99727285, + "num_input_tokens_seen": 24514800, + "router_z_loss_mlp": 1.09277344, + "step": 306, + "time_per_iteration": 2.7351841926574707 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122107, + "balance_loss_mlp": 1.01329267, + "epoch": 0.05906117737591381, + "flos": 490041111552.0, + "grad_norm": 0.034758856969872284, + "language_loss": 0.99957371, + "learning_rate": 0.0009977885986370057, + "loss": 1.01079476, + "num_input_tokens_seen": 24581648, + "router_z_loss_mlp": 1.09082031, + "step": 307, + "time_per_iteration": 2.566316843032837 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01120423, + "balance_loss_mlp": 1.01199007, + "epoch": 0.05925355906117737, + "flos": 592710216960.0, + "grad_norm": 0.0408216139096099, + "language_loss": 0.95604599, + "learning_rate": 0.000997759233535118, + "loss": 0.96725023, + "num_input_tokens_seen": 24658864, + "router_z_loss_mlp": 1.08691406, + "step": 308, + "time_per_iteration": 2.781667470932007 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01119623, + "balance_loss_mlp": 1.01147592, + "epoch": 0.05944594074644094, + "flos": 564788466432.0, + "grad_norm": 0.03543125546238922, + "language_loss": 1.01945186, + "learning_rate": 0.0009977296751859576, + "loss": 1.03064811, + "num_input_tokens_seen": 24735808, + "router_z_loss_mlp": 1.08398438, + "step": 309, + "time_per_iteration": 2.778700828552246 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121487, + "balance_loss_mlp": 1.0137223, + "epoch": 0.0596383224317045, + "flos": 539808201216.0, + "grad_norm": 0.03208598270087784, + "language_loss": 1.03591859, + "learning_rate": 0.0009976999236009998, + "loss": 1.04713345, + "num_input_tokens_seen": 24807744, + "router_z_loss_mlp": 1.08007812, + "step": 310, + "time_per_iteration": 2.790116786956787 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121449, + "balance_loss_mlp": 1.01387453, + "epoch": 0.059830704116968066, + "flos": 562053060864.0, + "grad_norm": 0.03260901983169028, + "language_loss": 1.05564129, + "learning_rate": 0.0009976699787917955, + "loss": 1.06685579, + "num_input_tokens_seen": 24876640, + "router_z_loss_mlp": 1.078125, + "step": 311, + "time_per_iteration": 2.6586148738861084 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108932, + "balance_loss_mlp": 1.00326538, + "epoch": 0.060023085802231625, + "flos": 1574050294272.0, + "grad_norm": 0.018314702584398344, + "language_loss": 0.73442996, + "learning_rate": 0.00099763984076997, + "loss": 0.74551928, + "num_input_tokens_seen": 25110864, + "router_z_loss_mlp": 1.05859375, + "step": 312, + "time_per_iteration": 4.943182945251465 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01128012, + "balance_loss_mlp": 1.02101004, + "epoch": 0.06021546748749519, + "flos": 483628363008.0, + "grad_norm": 0.04396023920554742, + "language_loss": 0.97026515, + "learning_rate": 0.0009976095095472243, + "loss": 0.98154521, + "num_input_tokens_seen": 25179328, + "router_z_loss_mlp": 1.07226562, + "step": 313, + "time_per_iteration": 2.619016408920288 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01131165, + "balance_loss_mlp": 1.02425838, + "epoch": 0.06040784917275875, + "flos": 621424205568.0, + "grad_norm": 0.03687701456451143, + "language_loss": 0.97965562, + "learning_rate": 0.0009975789851353334, + "loss": 0.99096727, + "num_input_tokens_seen": 25254128, + "router_z_loss_mlp": 1.07128906, + "step": 314, + "time_per_iteration": 2.8331894874572754 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125758, + "balance_loss_mlp": 1.01980519, + "epoch": 0.06060023085802232, + "flos": 484603348224.0, + "grad_norm": 0.029408756794299912, + "language_loss": 1.00726843, + "learning_rate": 0.0009975482675461487, + "loss": 1.01852608, + "num_input_tokens_seen": 25324624, + "router_z_loss_mlp": 1.06152344, + "step": 315, + "time_per_iteration": 2.659079074859619 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125971, + "balance_loss_mlp": 1.02001762, + "epoch": 0.06079261254328588, + "flos": 582986598144.0, + "grad_norm": 0.027344501346145803, + "language_loss": 0.98408186, + "learning_rate": 0.0009975173567915952, + "loss": 0.99534154, + "num_input_tokens_seen": 25393648, + "router_z_loss_mlp": 1.06152344, + "step": 316, + "time_per_iteration": 2.6947872638702393 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01123736, + "balance_loss_mlp": 1.01873684, + "epoch": 0.060984994228549444, + "flos": 689009348352.0, + "grad_norm": 0.03553374767777348, + "language_loss": 0.92618632, + "learning_rate": 0.000997486252883674, + "loss": 0.93742371, + "num_input_tokens_seen": 25469152, + "router_z_loss_mlp": 1.05175781, + "step": 317, + "time_per_iteration": 2.8523428440093994 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01123139, + "balance_loss_mlp": 1.01861632, + "epoch": 0.061177375913813004, + "flos": 1316749104384.0, + "grad_norm": 0.03506621320439297, + "language_loss": 0.97693729, + "learning_rate": 0.0009974549558344602, + "loss": 0.98816866, + "num_input_tokens_seen": 25560944, + "router_z_loss_mlp": 1.046875, + "step": 318, + "time_per_iteration": 3.705524206161499 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121913, + "balance_loss_mlp": 1.01805806, + "epoch": 0.06136975759907657, + "flos": 575401532928.0, + "grad_norm": 0.03493031867187039, + "language_loss": 1.07333064, + "learning_rate": 0.000997423465656105, + "loss": 1.08454978, + "num_input_tokens_seen": 25631424, + "router_z_loss_mlp": 1.04003906, + "step": 319, + "time_per_iteration": 2.75838565826416 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01119126, + "balance_loss_mlp": 1.01546133, + "epoch": 0.06156213928434013, + "flos": 528565234944.0, + "grad_norm": 0.037170039701900144, + "language_loss": 1.04350638, + "learning_rate": 0.0009973917823608335, + "loss": 1.05469775, + "num_input_tokens_seen": 25698176, + "router_z_loss_mlp": 1.03808594, + "step": 320, + "time_per_iteration": 2.6494460105895996 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117676, + "balance_loss_mlp": 1.01458335, + "epoch": 0.061754520969603696, + "flos": 496590920448.0, + "grad_norm": 0.030464742512101767, + "language_loss": 0.98981547, + "learning_rate": 0.0009973599059609462, + "loss": 1.00099218, + "num_input_tokens_seen": 25773472, + "router_z_loss_mlp": 1.03222656, + "step": 321, + "time_per_iteration": 2.7119081020355225 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116635, + "balance_loss_mlp": 1.01344728, + "epoch": 0.061946902654867256, + "flos": 441044872704.0, + "grad_norm": 0.031106795532346753, + "language_loss": 0.97035432, + "learning_rate": 0.000997327836468819, + "loss": 0.98152065, + "num_input_tokens_seen": 25841088, + "router_z_loss_mlp": 1.03320312, + "step": 322, + "time_per_iteration": 2.641977071762085 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121262, + "balance_loss_mlp": 1.01836073, + "epoch": 0.06213928434013082, + "flos": 600043515648.0, + "grad_norm": 0.031546338171402045, + "language_loss": 1.00120687, + "learning_rate": 0.000997295573896902, + "loss": 1.01241946, + "num_input_tokens_seen": 25919424, + "router_z_loss_mlp": 1.03027344, + "step": 323, + "time_per_iteration": 2.825425624847412 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113502, + "balance_loss_mlp": 1.01126862, + "epoch": 0.06233166602539438, + "flos": 1453116961536.0, + "grad_norm": 0.009515746361157745, + "language_loss": 0.8119604, + "learning_rate": 0.000997263118257721, + "loss": 0.82309544, + "num_input_tokens_seen": 26135504, + "router_z_loss_mlp": 1.0234375, + "step": 324, + "time_per_iteration": 4.7325074672698975 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108894, + "balance_loss_mlp": 1.0074234, + "epoch": 0.06252404771065795, + "flos": 1466631651072.0, + "grad_norm": 0.010337204897298672, + "language_loss": 0.78571939, + "learning_rate": 0.0009972304695638763, + "loss": 0.79680836, + "num_input_tokens_seen": 26358880, + "router_z_loss_mlp": 1.015625, + "step": 325, + "time_per_iteration": 4.845058917999268 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01131262, + "balance_loss_mlp": 1.02950513, + "epoch": 0.06271642939592151, + "flos": 465236790528.0, + "grad_norm": 0.04479189972062717, + "language_loss": 0.94122899, + "learning_rate": 0.000997197627828043, + "loss": 0.95254159, + "num_input_tokens_seen": 26425888, + "router_z_loss_mlp": 1.01855469, + "step": 326, + "time_per_iteration": 2.531477689743042 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136139, + "balance_loss_mlp": 1.03466833, + "epoch": 0.06290881108118507, + "flos": 533432391168.0, + "grad_norm": 0.03210871152906133, + "language_loss": 0.89633012, + "learning_rate": 0.0009971645930629716, + "loss": 0.9076916, + "num_input_tokens_seen": 26500656, + "router_z_loss_mlp": 1.015625, + "step": 327, + "time_per_iteration": 2.766155481338501 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01131438, + "balance_loss_mlp": 1.0305388, + "epoch": 0.06310119276644863, + "flos": 674768516352.0, + "grad_norm": 0.03217671154768682, + "language_loss": 1.03418863, + "learning_rate": 0.0009971313652814872, + "loss": 1.0455029, + "num_input_tokens_seen": 26577408, + "router_z_loss_mlp": 1.00976562, + "step": 328, + "time_per_iteration": 2.818718433380127 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125209, + "balance_loss_mlp": 1.02440596, + "epoch": 0.0632935744517122, + "flos": 772051381248.0, + "grad_norm": 0.03902843256426295, + "language_loss": 1.00692391, + "learning_rate": 0.0009970979444964903, + "loss": 1.01817608, + "num_input_tokens_seen": 26652048, + "router_z_loss_mlp": 1.00878906, + "step": 329, + "time_per_iteration": 2.9847218990325928 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01119216, + "balance_loss_mlp": 1.01869905, + "epoch": 0.06348595613697576, + "flos": 562975556352.0, + "grad_norm": 0.040034835413812295, + "language_loss": 1.01797342, + "learning_rate": 0.0009970643307209556, + "loss": 1.02916563, + "num_input_tokens_seen": 26728192, + "router_z_loss_mlp": 1.00585938, + "step": 330, + "time_per_iteration": 2.817711353302002 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112644, + "balance_loss_mlp": 1.01250839, + "epoch": 0.06367833782223932, + "flos": 677384358144.0, + "grad_norm": 0.031424074947949916, + "language_loss": 0.98358697, + "learning_rate": 0.0009970305239679334, + "loss": 0.99471337, + "num_input_tokens_seen": 26798016, + "router_z_loss_mlp": 1.00195312, + "step": 331, + "time_per_iteration": 2.8216280937194824 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011128, + "balance_loss_mlp": 1.01247358, + "epoch": 0.06387071950750288, + "flos": 496349847552.0, + "grad_norm": 0.04016029313197435, + "language_loss": 1.03082633, + "learning_rate": 0.0009969965242505483, + "loss": 1.04195428, + "num_input_tokens_seen": 26867536, + "router_z_loss_mlp": 1.00390625, + "step": 332, + "time_per_iteration": 2.631326675415039 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113411, + "balance_loss_mlp": 1.01317954, + "epoch": 0.06406310119276645, + "flos": 534557075712.0, + "grad_norm": 0.03761595064373852, + "language_loss": 0.99054992, + "learning_rate": 0.0009969623315820007, + "loss": 1.00168395, + "num_input_tokens_seen": 26941216, + "router_z_loss_mlp": 1.00292969, + "step": 333, + "time_per_iteration": 2.6700048446655273 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113642, + "balance_loss_mlp": 1.01369655, + "epoch": 0.06425548287803001, + "flos": 457165688832.0, + "grad_norm": 0.0356255093132357, + "language_loss": 0.99075055, + "learning_rate": 0.000996927945975565, + "loss": 1.00188696, + "num_input_tokens_seen": 27006560, + "router_z_loss_mlp": 0.99951172, + "step": 334, + "time_per_iteration": 2.567225933074951 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112774, + "balance_loss_mlp": 1.01282871, + "epoch": 0.06444786456329357, + "flos": 561123762432.0, + "grad_norm": 0.034265188200332725, + "language_loss": 0.96451521, + "learning_rate": 0.0009968933674445906, + "loss": 0.97564298, + "num_input_tokens_seen": 27076400, + "router_z_loss_mlp": 0.99951172, + "step": 335, + "time_per_iteration": 2.6834452152252197 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110954, + "balance_loss_mlp": 1.01100898, + "epoch": 0.06464024624855713, + "flos": 667357449984.0, + "grad_norm": 0.026754476738251005, + "language_loss": 0.980811, + "learning_rate": 0.0009968585960025028, + "loss": 0.99192053, + "num_input_tokens_seen": 27158672, + "router_z_loss_mlp": 0.99853516, + "step": 336, + "time_per_iteration": 2.9675402641296387 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112488, + "balance_loss_mlp": 1.01368713, + "epoch": 0.0648326279338207, + "flos": 1524558303744.0, + "grad_norm": 0.027483244216433014, + "language_loss": 0.77653188, + "learning_rate": 0.0009968236316628006, + "loss": 0.78765678, + "num_input_tokens_seen": 27380592, + "router_z_loss_mlp": 0.98632812, + "step": 337, + "time_per_iteration": 4.80242133140564 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115066, + "balance_loss_mlp": 1.01540756, + "epoch": 0.06502500961908426, + "flos": 1145216581632.0, + "grad_norm": 0.03509421691107687, + "language_loss": 0.96500707, + "learning_rate": 0.0009967884744390583, + "loss": 0.97615772, + "num_input_tokens_seen": 27469984, + "router_z_loss_mlp": 0.99414062, + "step": 338, + "time_per_iteration": 3.517488479614258 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118827, + "balance_loss_mlp": 1.01945412, + "epoch": 0.06521739130434782, + "flos": 583694265600.0, + "grad_norm": 0.03507378265000135, + "language_loss": 0.97375119, + "learning_rate": 0.0009967531243449256, + "loss": 0.98493946, + "num_input_tokens_seen": 27543904, + "router_z_loss_mlp": 0.9921875, + "step": 339, + "time_per_iteration": 2.713430404663086 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01119012, + "balance_loss_mlp": 1.02002037, + "epoch": 0.06540977298961138, + "flos": 498659487744.0, + "grad_norm": 0.03215705196534619, + "language_loss": 1.04762673, + "learning_rate": 0.000996717581394126, + "loss": 1.05881691, + "num_input_tokens_seen": 27609888, + "router_z_loss_mlp": 0.98876953, + "step": 340, + "time_per_iteration": 2.5391135215759277 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116775, + "balance_loss_mlp": 1.01787901, + "epoch": 0.06560215467487496, + "flos": 543904506624.0, + "grad_norm": 0.030763143460584817, + "language_loss": 1.05044627, + "learning_rate": 0.000996681845600459, + "loss": 1.06161404, + "num_input_tokens_seen": 27683936, + "router_z_loss_mlp": 0.98632812, + "step": 341, + "time_per_iteration": 2.670804262161255 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118249, + "balance_loss_mlp": 1.01963949, + "epoch": 0.06579453636013852, + "flos": 414351819264.0, + "grad_norm": 0.040583240554979534, + "language_loss": 0.9744029, + "learning_rate": 0.0009966459169777982, + "loss": 0.98558539, + "num_input_tokens_seen": 27747840, + "router_z_loss_mlp": 0.98388672, + "step": 342, + "time_per_iteration": 2.5040364265441895 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115105, + "balance_loss_mlp": 1.01706719, + "epoch": 0.06598691804540208, + "flos": 561681730560.0, + "grad_norm": 0.04164342519277061, + "language_loss": 1.05655766, + "learning_rate": 0.0009966097955400924, + "loss": 1.06770873, + "num_input_tokens_seen": 27819728, + "router_z_loss_mlp": 0.97949219, + "step": 343, + "time_per_iteration": 2.666548728942871 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112413, + "balance_loss_mlp": 1.01532912, + "epoch": 0.06617929973066564, + "flos": 573302830080.0, + "grad_norm": 0.03386977599556249, + "language_loss": 0.99970496, + "learning_rate": 0.0009965734813013652, + "loss": 1.01082909, + "num_input_tokens_seen": 27893536, + "router_z_loss_mlp": 0.97070312, + "step": 344, + "time_per_iteration": 2.8448328971862793 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109743, + "balance_loss_mlp": 1.01261127, + "epoch": 0.06637168141592921, + "flos": 491465194752.0, + "grad_norm": 0.03376822413453626, + "language_loss": 1.02026749, + "learning_rate": 0.0009965369742757151, + "loss": 1.03136492, + "num_input_tokens_seen": 27960976, + "router_z_loss_mlp": 0.97119141, + "step": 345, + "time_per_iteration": 2.568521738052368 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108276, + "balance_loss_mlp": 1.01176453, + "epoch": 0.06656406310119277, + "flos": 1081039518720.0, + "grad_norm": 0.03449730062562062, + "language_loss": 0.98245382, + "learning_rate": 0.0009965002744773152, + "loss": 0.99353665, + "num_input_tokens_seen": 28050864, + "router_z_loss_mlp": 0.96484375, + "step": 346, + "time_per_iteration": 3.501471519470215 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109602, + "balance_loss_mlp": 1.01347148, + "epoch": 0.06675644478645633, + "flos": 514723923456.0, + "grad_norm": 0.029121068034632647, + "language_loss": 0.95998263, + "learning_rate": 0.0009964633819204139, + "loss": 0.97107863, + "num_input_tokens_seen": 28122448, + "router_z_loss_mlp": 0.9609375, + "step": 347, + "time_per_iteration": 2.6675100326538086 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093636, + "balance_loss_mlp": 0.9986496, + "epoch": 0.06694882647171989, + "flos": 1450537079808.0, + "grad_norm": 0.008592618933675954, + "language_loss": 0.81801116, + "learning_rate": 0.0009964262966193338, + "loss": 0.82894754, + "num_input_tokens_seen": 28350352, + "router_z_loss_mlp": 0.94921875, + "step": 348, + "time_per_iteration": 4.92915415763855 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093124, + "balance_loss_mlp": 0.99832916, + "epoch": 0.06714120815698346, + "flos": 1555400152320.0, + "grad_norm": 0.006174818833869298, + "language_loss": 0.75153887, + "learning_rate": 0.000996389018588473, + "loss": 0.76247013, + "num_input_tokens_seen": 28585584, + "router_z_loss_mlp": 0.94726562, + "step": 349, + "time_per_iteration": 4.8783159255981445 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112042, + "balance_loss_mlp": 1.01719952, + "epoch": 0.06733358984224702, + "flos": 881617326336.0, + "grad_norm": 0.039044792628629706, + "language_loss": 0.95966816, + "learning_rate": 0.000996351547842304, + "loss": 0.97078854, + "num_input_tokens_seen": 28672512, + "router_z_loss_mlp": 0.94775391, + "step": 350, + "time_per_iteration": 3.151158094406128 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106972, + "balance_loss_mlp": 1.01222503, + "epoch": 0.06752597152751058, + "flos": 519918668544.0, + "grad_norm": 0.04011951728876299, + "language_loss": 0.94198334, + "learning_rate": 0.0009963138843953744, + "loss": 0.953053, + "num_input_tokens_seen": 28741520, + "router_z_loss_mlp": 0.94677734, + "step": 351, + "time_per_iteration": 2.6077194213867188 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111271, + "balance_loss_mlp": 1.01661849, + "epoch": 0.06771835321277414, + "flos": 540883308288.0, + "grad_norm": 0.02897454745239974, + "language_loss": 0.98297268, + "learning_rate": 0.000996276028262306, + "loss": 0.99408543, + "num_input_tokens_seen": 28814912, + "router_z_loss_mlp": 0.94580078, + "step": 352, + "time_per_iteration": 2.8440346717834473 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115128, + "balance_loss_mlp": 1.02052331, + "epoch": 0.0679107348980377, + "flos": 461615827968.0, + "grad_norm": 0.03358261828070724, + "language_loss": 1.05270672, + "learning_rate": 0.0009962379794577964, + "loss": 1.06385791, + "num_input_tokens_seen": 28882192, + "router_z_loss_mlp": 0.9453125, + "step": 353, + "time_per_iteration": 2.6153147220611572 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115897, + "balance_loss_mlp": 1.02129257, + "epoch": 0.06810311658330127, + "flos": 637208684544.0, + "grad_norm": 0.03193767698980152, + "language_loss": 0.94629884, + "learning_rate": 0.000996199737996617, + "loss": 0.95745778, + "num_input_tokens_seen": 28968576, + "router_z_loss_mlp": 0.9453125, + "step": 354, + "time_per_iteration": 2.9557363986968994 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01114833, + "balance_loss_mlp": 1.0208956, + "epoch": 0.06829549826856483, + "flos": 465627562752.0, + "grad_norm": 0.034421374529713736, + "language_loss": 1.03816652, + "learning_rate": 0.0009961613038936149, + "loss": 1.04931474, + "num_input_tokens_seen": 29036160, + "router_z_loss_mlp": 0.93847656, + "step": 355, + "time_per_iteration": 2.583648204803467 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112591, + "balance_loss_mlp": 1.01879704, + "epoch": 0.06848787995382839, + "flos": 635897362176.0, + "grad_norm": 0.027271592740405557, + "language_loss": 0.95725697, + "learning_rate": 0.000996122677163711, + "loss": 0.96838284, + "num_input_tokens_seen": 29112048, + "router_z_loss_mlp": 0.93701172, + "step": 356, + "time_per_iteration": 2.7997536659240723 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113642, + "balance_loss_mlp": 1.02022934, + "epoch": 0.06868026163909195, + "flos": 807781773312.0, + "grad_norm": 0.036098266403844226, + "language_loss": 1.02058005, + "learning_rate": 0.000996083857821902, + "loss": 1.03171647, + "num_input_tokens_seen": 29190960, + "router_z_loss_mlp": 0.93310547, + "step": 357, + "time_per_iteration": 3.0117554664611816 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113245, + "balance_loss_mlp": 1.01978505, + "epoch": 0.06887264332435553, + "flos": 440152512768.0, + "grad_norm": 0.03587140172627376, + "language_loss": 1.00045025, + "learning_rate": 0.0009960448458832588, + "loss": 1.01158273, + "num_input_tokens_seen": 29262832, + "router_z_loss_mlp": 0.93359375, + "step": 358, + "time_per_iteration": 2.6948373317718506 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110172, + "balance_loss_mlp": 1.01714087, + "epoch": 0.06906502500961909, + "flos": 485786358528.0, + "grad_norm": 0.028895953236024122, + "language_loss": 0.99980301, + "learning_rate": 0.000996005641362927, + "loss": 1.01090467, + "num_input_tokens_seen": 29329552, + "router_z_loss_mlp": 0.92919922, + "step": 359, + "time_per_iteration": 2.600889205932617 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110333, + "balance_loss_mlp": 1.01715922, + "epoch": 0.06925740669488265, + "flos": 734886212352.0, + "grad_norm": 0.03093408458560108, + "language_loss": 1.02453041, + "learning_rate": 0.0009959662442761274, + "loss": 1.0356338, + "num_input_tokens_seen": 29410784, + "router_z_loss_mlp": 0.93066406, + "step": 360, + "time_per_iteration": 2.9324746131896973 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107676, + "balance_loss_mlp": 1.01445436, + "epoch": 0.0694497883801462, + "flos": 553571745024.0, + "grad_norm": 0.03028505188811882, + "language_loss": 0.95860314, + "learning_rate": 0.000995926654638155, + "loss": 0.96967983, + "num_input_tokens_seen": 29486992, + "router_z_loss_mlp": 0.93115234, + "step": 361, + "time_per_iteration": 2.8280868530273438 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104746, + "balance_loss_mlp": 1.01157248, + "epoch": 0.06964217006540978, + "flos": 679244900352.0, + "grad_norm": 0.03450824772288923, + "language_loss": 0.98644811, + "learning_rate": 0.00099588687246438, + "loss": 0.99749553, + "num_input_tokens_seen": 29557232, + "router_z_loss_mlp": 0.93066406, + "step": 362, + "time_per_iteration": 2.8108932971954346 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108438, + "balance_loss_mlp": 1.01535928, + "epoch": 0.06983455175067334, + "flos": 525261167616.0, + "grad_norm": 0.03621302361184023, + "language_loss": 1.06105995, + "learning_rate": 0.0009958468977702471, + "loss": 1.07214439, + "num_input_tokens_seen": 29625344, + "router_z_loss_mlp": 0.9296875, + "step": 363, + "time_per_iteration": 2.6087372303009033 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135422, + "balance_loss_mlp": 1.04272461, + "epoch": 0.0700269334359369, + "flos": 1580176283136.0, + "grad_norm": 0.03651647631774479, + "language_loss": 0.79734707, + "learning_rate": 0.0009958067305712761, + "loss": 0.80870128, + "num_input_tokens_seen": 29843664, + "router_z_loss_mlp": 0.92578125, + "step": 364, + "time_per_iteration": 4.806072235107422 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104861, + "balance_loss_mlp": 1.01254511, + "epoch": 0.07021931512120046, + "flos": 1014858050304.0, + "grad_norm": 0.04058448706036458, + "language_loss": 0.94071019, + "learning_rate": 0.0009957663708830612, + "loss": 0.9517588, + "num_input_tokens_seen": 29927152, + "router_z_loss_mlp": 0.921875, + "step": 365, + "time_per_iteration": 3.30859637260437 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110656, + "balance_loss_mlp": 1.01862633, + "epoch": 0.07041169680646403, + "flos": 824432367360.0, + "grad_norm": 0.04186203278400794, + "language_loss": 0.98041129, + "learning_rate": 0.0009957258187212714, + "loss": 0.9915179, + "num_input_tokens_seen": 30004928, + "router_z_loss_mlp": 0.91894531, + "step": 366, + "time_per_iteration": 3.00058913230896 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097015, + "balance_loss_mlp": 1.00565338, + "epoch": 0.07060407849172759, + "flos": 1417293250560.0, + "grad_norm": 0.011820269564466843, + "language_loss": 0.79194862, + "learning_rate": 0.0009956850741016502, + "loss": 0.80291873, + "num_input_tokens_seen": 30230256, + "router_z_loss_mlp": 0.91210938, + "step": 367, + "time_per_iteration": 4.794500827789307 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113703, + "balance_loss_mlp": 1.02186394, + "epoch": 0.07079646017699115, + "flos": 513942379008.0, + "grad_norm": 0.041641563183133855, + "language_loss": 0.94691038, + "learning_rate": 0.0009956441370400167, + "loss": 0.95804739, + "num_input_tokens_seen": 30301200, + "router_z_loss_mlp": 0.91699219, + "step": 368, + "time_per_iteration": 2.63948917388916 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0111577, + "balance_loss_mlp": 1.02436066, + "epoch": 0.07098884186225471, + "flos": 541549179648.0, + "grad_norm": 0.03426405251061256, + "language_loss": 1.00885093, + "learning_rate": 0.0009956030075522636, + "loss": 1.02000868, + "num_input_tokens_seen": 30377024, + "router_z_loss_mlp": 0.91259766, + "step": 369, + "time_per_iteration": 2.74157452583313 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107449, + "balance_loss_mlp": 1.01613438, + "epoch": 0.07118122354751828, + "flos": 549739845120.0, + "grad_norm": 0.030296400642036637, + "language_loss": 1.0031743, + "learning_rate": 0.0009955616856543587, + "loss": 1.01424885, + "num_input_tokens_seen": 30448896, + "router_z_loss_mlp": 0.91162109, + "step": 370, + "time_per_iteration": 2.6210479736328125 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105786, + "balance_loss_mlp": 1.01475775, + "epoch": 0.07137360523278184, + "flos": 622077437952.0, + "grad_norm": 0.029509682347833893, + "language_loss": 0.92550498, + "learning_rate": 0.0009955201713623448, + "loss": 0.93656284, + "num_input_tokens_seen": 30523584, + "router_z_loss_mlp": 0.90869141, + "step": 371, + "time_per_iteration": 2.757277011871338 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092491, + "balance_loss_mlp": 1.00284576, + "epoch": 0.0715659869180454, + "flos": 1505976202752.0, + "grad_norm": 0.005566886599578838, + "language_loss": 0.76672721, + "learning_rate": 0.000995478464692339, + "loss": 0.77765214, + "num_input_tokens_seen": 30757920, + "router_z_loss_mlp": 0.89648438, + "step": 372, + "time_per_iteration": 4.947838306427002 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01126764, + "balance_loss_mlp": 1.0361172, + "epoch": 0.07175836860330896, + "flos": 496482050304.0, + "grad_norm": 0.040308561934975694, + "language_loss": 1.05629396, + "learning_rate": 0.0009954365656605333, + "loss": 1.06756163, + "num_input_tokens_seen": 30824960, + "router_z_loss_mlp": 0.90478516, + "step": 373, + "time_per_iteration": 2.5537302494049072 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124141, + "balance_loss_mlp": 1.03416181, + "epoch": 0.07195075028857253, + "flos": 787082505984.0, + "grad_norm": 0.034789914575730614, + "language_loss": 0.98912442, + "learning_rate": 0.0009953944742831947, + "loss": 1.00036585, + "num_input_tokens_seen": 30902224, + "router_z_loss_mlp": 0.89892578, + "step": 374, + "time_per_iteration": 2.976074695587158 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106044, + "balance_loss_mlp": 1.01678061, + "epoch": 0.0721431319738361, + "flos": 594347182848.0, + "grad_norm": 0.029628456658550576, + "language_loss": 1.02558136, + "learning_rate": 0.0009953521905766642, + "loss": 1.03664172, + "num_input_tokens_seen": 30984784, + "router_z_loss_mlp": 0.89404297, + "step": 375, + "time_per_iteration": 2.9556005001068115 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101275, + "balance_loss_mlp": 1.01234496, + "epoch": 0.07233551365909965, + "flos": 549329630976.0, + "grad_norm": 0.034208323574026145, + "language_loss": 1.01073325, + "learning_rate": 0.0009953097145573577, + "loss": 1.02174592, + "num_input_tokens_seen": 31055376, + "router_z_loss_mlp": 0.89111328, + "step": 376, + "time_per_iteration": 2.6449482440948486 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106433, + "balance_loss_mlp": 1.01759815, + "epoch": 0.07252789534436321, + "flos": 959169106176.0, + "grad_norm": 0.031040198427254525, + "language_loss": 0.98588479, + "learning_rate": 0.000995267046241766, + "loss": 0.99694908, + "num_input_tokens_seen": 31144944, + "router_z_loss_mlp": 0.89013672, + "step": 377, + "time_per_iteration": 3.2564361095428467 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106989, + "balance_loss_mlp": 1.01877415, + "epoch": 0.07272027702962677, + "flos": 508656260352.0, + "grad_norm": 0.029229214223645432, + "language_loss": 0.98238575, + "learning_rate": 0.0009952241856464547, + "loss": 0.99345565, + "num_input_tokens_seen": 31213392, + "router_z_loss_mlp": 0.88378906, + "step": 378, + "time_per_iteration": 2.5843191146850586 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108111, + "balance_loss_mlp": 1.02013505, + "epoch": 0.07291265871489035, + "flos": 613552380672.0, + "grad_norm": 0.03194005050639913, + "language_loss": 1.05557346, + "learning_rate": 0.0009951811327880632, + "loss": 1.06665444, + "num_input_tokens_seen": 31289840, + "router_z_loss_mlp": 0.88134766, + "step": 379, + "time_per_iteration": 2.727449655532837 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107323, + "balance_loss_mlp": 1.01934636, + "epoch": 0.0731050404001539, + "flos": 496742565120.0, + "grad_norm": 0.03092115392183015, + "language_loss": 0.98400533, + "learning_rate": 0.0009951378876833063, + "loss": 0.99507862, + "num_input_tokens_seen": 31357600, + "router_z_loss_mlp": 0.88134766, + "step": 380, + "time_per_iteration": 2.5320205688476562 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101258, + "balance_loss_mlp": 1.01332915, + "epoch": 0.07329742208541747, + "flos": 641130991104.0, + "grad_norm": 0.032065094183830696, + "language_loss": 1.04703462, + "learning_rate": 0.0009950944503489736, + "loss": 1.05804706, + "num_input_tokens_seen": 31428896, + "router_z_loss_mlp": 0.88085938, + "step": 381, + "time_per_iteration": 2.7422876358032227 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102134, + "balance_loss_mlp": 1.01453876, + "epoch": 0.07348980377068103, + "flos": 817741607424.0, + "grad_norm": 0.030510114485064205, + "language_loss": 0.99112171, + "learning_rate": 0.0009950508208019285, + "loss": 1.00214303, + "num_input_tokens_seen": 31507424, + "router_z_loss_mlp": 0.87744141, + "step": 382, + "time_per_iteration": 3.046475410461426 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101212, + "balance_loss_mlp": 1.01323569, + "epoch": 0.0736821854559446, + "flos": 509670129408.0, + "grad_norm": 0.035756321159612754, + "language_loss": 1.03789318, + "learning_rate": 0.0009950069990591096, + "loss": 1.04890537, + "num_input_tokens_seen": 31576768, + "router_z_loss_mlp": 0.88134766, + "step": 383, + "time_per_iteration": 2.620088577270508 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113144, + "balance_loss_mlp": 1.02674103, + "epoch": 0.07387456714120816, + "flos": 1558050987264.0, + "grad_norm": 0.043940663043905655, + "language_loss": 0.76401371, + "learning_rate": 0.0009949629851375302, + "loss": 0.77514511, + "num_input_tokens_seen": 31797312, + "router_z_loss_mlp": 0.86523438, + "step": 384, + "time_per_iteration": 4.87653374671936 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121594, + "balance_loss_mlp": 1.03299809, + "epoch": 0.07406694882647172, + "flos": 526644421632.0, + "grad_norm": 0.039102279996233, + "language_loss": 0.96614265, + "learning_rate": 0.0009949187790542777, + "loss": 0.97735858, + "num_input_tokens_seen": 31869568, + "router_z_loss_mlp": 0.88769531, + "step": 385, + "time_per_iteration": 2.734100580215454 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112471, + "balance_loss_mlp": 1.03625691, + "epoch": 0.07425933051173528, + "flos": 498824738304.0, + "grad_norm": 0.03701278047407747, + "language_loss": 0.92462552, + "learning_rate": 0.0009948743808265148, + "loss": 0.93587261, + "num_input_tokens_seen": 31941712, + "router_z_loss_mlp": 0.88623047, + "step": 386, + "time_per_iteration": 2.7154581546783447 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125455, + "balance_loss_mlp": 1.03704965, + "epoch": 0.07445171219699885, + "flos": 506057915136.0, + "grad_norm": 0.06663512882119103, + "language_loss": 1.02268195, + "learning_rate": 0.0009948297904714782, + "loss": 1.0339365, + "num_input_tokens_seen": 32015232, + "router_z_loss_mlp": 0.88574219, + "step": 387, + "time_per_iteration": 2.68532133102417 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112575, + "balance_loss_mlp": 1.03777313, + "epoch": 0.07464409388226241, + "flos": 555117337344.0, + "grad_norm": 0.036483324457394946, + "language_loss": 0.94151849, + "learning_rate": 0.0009947850080064796, + "loss": 0.95277596, + "num_input_tokens_seen": 32094640, + "router_z_loss_mlp": 0.88134766, + "step": 388, + "time_per_iteration": 2.789128303527832 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121204, + "balance_loss_mlp": 1.03370392, + "epoch": 0.07483647556752597, + "flos": 778275546624.0, + "grad_norm": 0.0421926900222792, + "language_loss": 0.99476451, + "learning_rate": 0.0009947400334489047, + "loss": 1.00597644, + "num_input_tokens_seen": 32176640, + "router_z_loss_mlp": 0.87646484, + "step": 389, + "time_per_iteration": 2.9937496185302734 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011085, + "balance_loss_mlp": 1.02133441, + "epoch": 0.07502885725278953, + "flos": 613682638080.0, + "grad_norm": 0.0417493031738284, + "language_loss": 0.90741575, + "learning_rate": 0.0009946948668162145, + "loss": 0.91850078, + "num_input_tokens_seen": 32246704, + "router_z_loss_mlp": 0.87304688, + "step": 390, + "time_per_iteration": 2.7264010906219482 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101473, + "balance_loss_mlp": 1.01502275, + "epoch": 0.0752212389380531, + "flos": 689856021504.0, + "grad_norm": 0.03330838563423677, + "language_loss": 0.95001, + "learning_rate": 0.0009946495081259441, + "loss": 0.9610247, + "num_input_tokens_seen": 32320032, + "router_z_loss_mlp": 0.86572266, + "step": 391, + "time_per_iteration": 2.832472085952759 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097898, + "balance_loss_mlp": 1.01182938, + "epoch": 0.07541362062331666, + "flos": 767052022272.0, + "grad_norm": 0.03859494705227578, + "language_loss": 0.99014449, + "learning_rate": 0.0009946039573957035, + "loss": 1.00112355, + "num_input_tokens_seen": 32398144, + "router_z_loss_mlp": 0.86181641, + "step": 392, + "time_per_iteration": 2.925933361053467 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101692, + "balance_loss_mlp": 1.01576602, + "epoch": 0.07560600230858022, + "flos": 589909682688.0, + "grad_norm": 0.039112379024015986, + "language_loss": 0.95485294, + "learning_rate": 0.000994558214643177, + "loss": 0.9658699, + "num_input_tokens_seen": 32471984, + "router_z_loss_mlp": 0.86035156, + "step": 393, + "time_per_iteration": 2.763448476791382 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095538, + "balance_loss_mlp": 1.00961244, + "epoch": 0.07579838399384378, + "flos": 751146034176.0, + "grad_norm": 0.03818992224284351, + "language_loss": 0.96862066, + "learning_rate": 0.000994512279886123, + "loss": 0.97957599, + "num_input_tokens_seen": 32550176, + "router_z_loss_mlp": 0.86035156, + "step": 394, + "time_per_iteration": 3.143615245819092 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101397, + "balance_loss_mlp": 1.01561391, + "epoch": 0.07599076567910736, + "flos": 524551554816.0, + "grad_norm": 0.030240351127206026, + "language_loss": 0.96659988, + "learning_rate": 0.0009944661531423758, + "loss": 0.97761387, + "num_input_tokens_seen": 32620768, + "router_z_loss_mlp": 0.85888672, + "step": 395, + "time_per_iteration": 2.6748764514923096 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107513, + "balance_loss_mlp": 1.02206361, + "epoch": 0.07618314736437092, + "flos": 552186545664.0, + "grad_norm": 0.03358451790414236, + "language_loss": 0.95614338, + "learning_rate": 0.000994419834429843, + "loss": 0.96721858, + "num_input_tokens_seen": 32693472, + "router_z_loss_mlp": 0.85546875, + "step": 396, + "time_per_iteration": 2.6525089740753174 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105372, + "balance_loss_mlp": 1.01987493, + "epoch": 0.07637552904963447, + "flos": 699433831680.0, + "grad_norm": 0.04315212632526892, + "language_loss": 1.00552011, + "learning_rate": 0.0009943733237665069, + "loss": 1.01657379, + "num_input_tokens_seen": 32764976, + "router_z_loss_mlp": 0.85595703, + "step": 397, + "time_per_iteration": 2.8678157329559326 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097353, + "balance_loss_mlp": 1.01218963, + "epoch": 0.07656791073489803, + "flos": 580636128768.0, + "grad_norm": 0.029538416941692198, + "language_loss": 0.99224108, + "learning_rate": 0.0009943266211704248, + "loss": 1.0032146, + "num_input_tokens_seen": 32853104, + "router_z_loss_mlp": 0.85253906, + "step": 398, + "time_per_iteration": 3.0023248195648193 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099387, + "balance_loss_mlp": 1.01460528, + "epoch": 0.0767602924201616, + "flos": 418037910528.0, + "grad_norm": 0.03167845871290285, + "language_loss": 1.01143491, + "learning_rate": 0.000994279726659728, + "loss": 1.02242875, + "num_input_tokens_seen": 32919376, + "router_z_loss_mlp": 0.84863281, + "step": 399, + "time_per_iteration": 2.527693271636963 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107007, + "balance_loss_mlp": 1.02246368, + "epoch": 0.07695267410542517, + "flos": 483888877824.0, + "grad_norm": 0.03414294034973106, + "language_loss": 0.9968133, + "learning_rate": 0.0009942326402526231, + "loss": 1.00788331, + "num_input_tokens_seen": 32988064, + "router_z_loss_mlp": 0.84619141, + "step": 400, + "time_per_iteration": 2.5610573291778564 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112837, + "balance_loss_mlp": 1.02848434, + "epoch": 0.07714505579068873, + "flos": 532027749888.0, + "grad_norm": 0.030264499227930883, + "language_loss": 0.97403878, + "learning_rate": 0.0009941853619673902, + "loss": 0.98516715, + "num_input_tokens_seen": 33059024, + "router_z_loss_mlp": 0.84423828, + "step": 401, + "time_per_iteration": 2.680175542831421 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107236, + "balance_loss_mlp": 1.02302694, + "epoch": 0.07733743747595229, + "flos": 806440315392.0, + "grad_norm": 0.03979329481069023, + "language_loss": 1.01160502, + "learning_rate": 0.0009941378918223844, + "loss": 1.02267742, + "num_input_tokens_seen": 33137712, + "router_z_loss_mlp": 0.84277344, + "step": 402, + "time_per_iteration": 3.0908427238464355 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098686, + "balance_loss_mlp": 1.01447606, + "epoch": 0.07752981916121585, + "flos": 623614281984.0, + "grad_norm": 0.03310929598543939, + "language_loss": 0.93567806, + "learning_rate": 0.0009940902298360354, + "loss": 0.94666493, + "num_input_tokens_seen": 33211296, + "router_z_loss_mlp": 0.84277344, + "step": 403, + "time_per_iteration": 2.7569308280944824 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094976, + "balance_loss_mlp": 1.01048076, + "epoch": 0.07772220084647942, + "flos": 729543713280.0, + "grad_norm": 0.03955766616265138, + "language_loss": 1.03173304, + "learning_rate": 0.0009940423760268473, + "loss": 1.04268289, + "num_input_tokens_seen": 33283632, + "router_z_loss_mlp": 0.84570312, + "step": 404, + "time_per_iteration": 2.8456103801727295 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098252, + "balance_loss_mlp": 1.01375628, + "epoch": 0.07791458253174298, + "flos": 556469488896.0, + "grad_norm": 0.042207617679060144, + "language_loss": 0.96929657, + "learning_rate": 0.0009939943304133982, + "loss": 0.98027909, + "num_input_tokens_seen": 33350704, + "router_z_loss_mlp": 0.84570312, + "step": 405, + "time_per_iteration": 2.615145444869995 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104796, + "balance_loss_mlp": 1.02044404, + "epoch": 0.07810696421700654, + "flos": 554235671040.0, + "grad_norm": 0.04104566792755741, + "language_loss": 1.03659868, + "learning_rate": 0.0009939460930143416, + "loss": 1.04764676, + "num_input_tokens_seen": 33416272, + "router_z_loss_mlp": 0.84423828, + "step": 406, + "time_per_iteration": 2.6304614543914795 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110157, + "balance_loss_mlp": 1.01745594, + "epoch": 0.0782993459022701, + "flos": 651879172608.0, + "grad_norm": 0.0317151282671847, + "language_loss": 0.97752666, + "learning_rate": 0.0009938976638484043, + "loss": 0.98854232, + "num_input_tokens_seen": 33501824, + "router_z_loss_mlp": 0.84179688, + "step": 407, + "time_per_iteration": 2.9032115936279297 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109564, + "balance_loss_mlp": 1.01205039, + "epoch": 0.07849172758753367, + "flos": 497161527552.0, + "grad_norm": 0.04013855375776475, + "language_loss": 0.97246277, + "learning_rate": 0.0009938490429343887, + "loss": 0.98341918, + "num_input_tokens_seen": 33571456, + "router_z_loss_mlp": 0.83642578, + "step": 408, + "time_per_iteration": 2.5688796043395996 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095618, + "balance_loss_mlp": 1.01236188, + "epoch": 0.07868410927279723, + "flos": 579076930560.0, + "grad_norm": 0.0397915036848884, + "language_loss": 0.97571141, + "learning_rate": 0.0009938002302911709, + "loss": 0.98666751, + "num_input_tokens_seen": 33646320, + "router_z_loss_mlp": 0.83300781, + "step": 409, + "time_per_iteration": 2.75036883354187 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096533, + "balance_loss_mlp": 1.01365864, + "epoch": 0.07887649095806079, + "flos": 524067463680.0, + "grad_norm": 0.03678821175613874, + "language_loss": 1.00230122, + "learning_rate": 0.0009937512259377015, + "loss": 1.01326644, + "num_input_tokens_seen": 33717664, + "router_z_loss_mlp": 0.82910156, + "step": 410, + "time_per_iteration": 2.6584975719451904 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110197, + "balance_loss_mlp": 1.01938236, + "epoch": 0.07906887264332435, + "flos": 558438901248.0, + "grad_norm": 0.04956969404692801, + "language_loss": 0.989124, + "learning_rate": 0.000993702029893006, + "loss": 1.00014377, + "num_input_tokens_seen": 33794720, + "router_z_loss_mlp": 0.82617188, + "step": 411, + "time_per_iteration": 2.7666263580322266 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102247, + "balance_loss_mlp": 1.0196116, + "epoch": 0.07926125432858792, + "flos": 823364063232.0, + "grad_norm": 0.03322797228086769, + "language_loss": 0.99091381, + "learning_rate": 0.0009936526421761838, + "loss": 1.00193632, + "num_input_tokens_seen": 33868304, + "router_z_loss_mlp": 0.82666016, + "step": 412, + "time_per_iteration": 3.0222113132476807 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102099, + "balance_loss_mlp": 1.01955855, + "epoch": 0.07945363601385148, + "flos": 563394518784.0, + "grad_norm": 0.04210923401756456, + "language_loss": 1.01423764, + "learning_rate": 0.000993603062806409, + "loss": 1.02525866, + "num_input_tokens_seen": 33937424, + "router_z_loss_mlp": 0.82568359, + "step": 413, + "time_per_iteration": 2.713226079940796 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100317, + "balance_loss_mlp": 1.0176332, + "epoch": 0.07964601769911504, + "flos": 518885357568.0, + "grad_norm": 0.041362228888401006, + "language_loss": 1.04903626, + "learning_rate": 0.0009935532918029298, + "loss": 1.06003952, + "num_input_tokens_seen": 34003984, + "router_z_loss_mlp": 0.82714844, + "step": 414, + "time_per_iteration": 2.59602689743042 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095709, + "balance_loss_mlp": 1.01326394, + "epoch": 0.0798383993843786, + "flos": 540301040640.0, + "grad_norm": 0.030384950019726516, + "language_loss": 0.97377884, + "learning_rate": 0.0009935033291850694, + "loss": 0.98473597, + "num_input_tokens_seen": 34072400, + "router_z_loss_mlp": 0.82470703, + "step": 415, + "time_per_iteration": 2.6417808532714844 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094851, + "balance_loss_mlp": 1.013026, + "epoch": 0.08003078106964218, + "flos": 486122695680.0, + "grad_norm": 0.03579523867672845, + "language_loss": 1.00004411, + "learning_rate": 0.0009934531749722247, + "loss": 1.01099253, + "num_input_tokens_seen": 34142448, + "router_z_loss_mlp": 0.81835938, + "step": 416, + "time_per_iteration": 2.593029737472534 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095566, + "balance_loss_mlp": 1.01383638, + "epoch": 0.08022316275490574, + "flos": 519276129792.0, + "grad_norm": 0.0354518245662521, + "language_loss": 0.98370755, + "learning_rate": 0.0009934028291838672, + "loss": 0.99466318, + "num_input_tokens_seen": 34214080, + "router_z_loss_mlp": 0.81738281, + "step": 417, + "time_per_iteration": 2.7351250648498535 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096643, + "balance_loss_mlp": 1.01496112, + "epoch": 0.0804155444401693, + "flos": 495047273472.0, + "grad_norm": 0.032920982329526526, + "language_loss": 0.93668723, + "learning_rate": 0.0009933522918395433, + "loss": 0.94765365, + "num_input_tokens_seen": 34288448, + "router_z_loss_mlp": 0.81689453, + "step": 418, + "time_per_iteration": 2.6427221298217773 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01114799, + "balance_loss_mlp": 1.03316498, + "epoch": 0.08060792612543285, + "flos": 1584856801536.0, + "grad_norm": 0.029973653623271358, + "language_loss": 0.782511, + "learning_rate": 0.0009933015629588731, + "loss": 0.79365897, + "num_input_tokens_seen": 34521632, + "router_z_loss_mlp": 0.81640625, + "step": 419, + "time_per_iteration": 4.8632917404174805 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096521, + "balance_loss_mlp": 1.01569724, + "epoch": 0.08080030781069643, + "flos": 526359607296.0, + "grad_norm": 0.04163447523548115, + "language_loss": 1.12134457, + "learning_rate": 0.000993250642561551, + "loss": 1.13230991, + "num_input_tokens_seen": 34590080, + "router_z_loss_mlp": 0.80810547, + "step": 420, + "time_per_iteration": 2.608396053314209 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109578, + "balance_loss_mlp": 1.01505113, + "epoch": 0.08099268949595999, + "flos": 547757793792.0, + "grad_norm": 0.04746808509414602, + "language_loss": 0.97398257, + "learning_rate": 0.0009931995306673466, + "loss": 0.98494035, + "num_input_tokens_seen": 34660512, + "router_z_loss_mlp": 0.80712891, + "step": 421, + "time_per_iteration": 2.7215850353240967 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097341, + "balance_loss_mlp": 1.01670778, + "epoch": 0.08118507118122355, + "flos": 511374169344.0, + "grad_norm": 0.04020038552675014, + "language_loss": 1.02514148, + "learning_rate": 0.000993148227296103, + "loss": 1.03611493, + "num_input_tokens_seen": 34732016, + "router_z_loss_mlp": 0.80615234, + "step": 422, + "time_per_iteration": 2.625366449356079 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010968, + "balance_loss_mlp": 1.01607168, + "epoch": 0.08137745286648711, + "flos": 722002389504.0, + "grad_norm": 0.03556088777041087, + "language_loss": 0.90137196, + "learning_rate": 0.000993096732467738, + "loss": 0.91233999, + "num_input_tokens_seen": 34810416, + "router_z_loss_mlp": 0.80712891, + "step": 423, + "time_per_iteration": 2.9795689582824707 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092343, + "balance_loss_mlp": 1.0118531, + "epoch": 0.08156983455175067, + "flos": 680818682880.0, + "grad_norm": 0.04422604915428747, + "language_loss": 0.99073571, + "learning_rate": 0.0009930450462022435, + "loss": 1.00165915, + "num_input_tokens_seen": 34879504, + "router_z_loss_mlp": 0.8046875, + "step": 424, + "time_per_iteration": 2.879889726638794 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087181, + "balance_loss_mlp": 1.00783539, + "epoch": 0.08176221623701424, + "flos": 1456591137024.0, + "grad_norm": 0.006453860192715822, + "language_loss": 0.79189807, + "learning_rate": 0.0009929931685196862, + "loss": 0.8027699, + "num_input_tokens_seen": 35111584, + "router_z_loss_mlp": 0.79296875, + "step": 425, + "time_per_iteration": 4.908784627914429 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095957, + "balance_loss_mlp": 1.01541877, + "epoch": 0.0819545979222778, + "flos": 1558885044480.0, + "grad_norm": 0.04271462185638088, + "language_loss": 0.96659774, + "learning_rate": 0.0009929410994402065, + "loss": 0.9775573, + "num_input_tokens_seen": 35205664, + "router_z_loss_mlp": 0.80517578, + "step": 426, + "time_per_iteration": 3.7266876697540283 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100573, + "balance_loss_mlp": 1.02013052, + "epoch": 0.08214697960754136, + "flos": 513801427968.0, + "grad_norm": 0.040597463537132866, + "language_loss": 1.00489211, + "learning_rate": 0.0009928888389840196, + "loss": 1.01589799, + "num_input_tokens_seen": 35280144, + "router_z_loss_mlp": 0.80419922, + "step": 427, + "time_per_iteration": 2.695010185241699 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098577, + "balance_loss_mlp": 1.01822996, + "epoch": 0.08233936129280492, + "flos": 596222309376.0, + "grad_norm": 0.03622779747664415, + "language_loss": 1.02622843, + "learning_rate": 0.0009928363871714147, + "loss": 1.03721428, + "num_input_tokens_seen": 35344768, + "router_z_loss_mlp": 0.80322266, + "step": 428, + "time_per_iteration": 2.66733455657959 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097324, + "balance_loss_mlp": 1.01721525, + "epoch": 0.08253174297806849, + "flos": 573165769728.0, + "grad_norm": 0.028981657602537042, + "language_loss": 0.97141832, + "learning_rate": 0.0009927837440227556, + "loss": 0.98239154, + "num_input_tokens_seen": 35425536, + "router_z_loss_mlp": 0.80078125, + "step": 429, + "time_per_iteration": 2.8499114513397217 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093938, + "balance_loss_mlp": 1.01392436, + "epoch": 0.08272412466333205, + "flos": 624643702272.0, + "grad_norm": 0.031878488957356683, + "language_loss": 0.91184896, + "learning_rate": 0.0009927309095584798, + "loss": 0.92278832, + "num_input_tokens_seen": 35515440, + "router_z_loss_mlp": 0.79980469, + "step": 430, + "time_per_iteration": 3.020768165588379 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097624, + "balance_loss_mlp": 1.01756275, + "epoch": 0.08291650634859561, + "flos": 514995131904.0, + "grad_norm": 0.040558959270141796, + "language_loss": 1.03523278, + "learning_rate": 0.0009926778837991, + "loss": 1.0462091, + "num_input_tokens_seen": 35580192, + "router_z_loss_mlp": 0.80029297, + "step": 431, + "time_per_iteration": 2.609189033508301 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101176, + "balance_loss_mlp": 1.02125835, + "epoch": 0.08310888803385917, + "flos": 668542405632.0, + "grad_norm": 0.035092839201242565, + "language_loss": 1.01323938, + "learning_rate": 0.000992624666765202, + "loss": 1.0242511, + "num_input_tokens_seen": 35649472, + "router_z_loss_mlp": 0.79882812, + "step": 432, + "time_per_iteration": 2.817399501800537 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101699, + "balance_loss_mlp": 1.02154219, + "epoch": 0.08330126971912274, + "flos": 584491361280.0, + "grad_norm": 0.0354530922421884, + "language_loss": 0.98992586, + "learning_rate": 0.000992571258477447, + "loss": 1.00094295, + "num_input_tokens_seen": 35722848, + "router_z_loss_mlp": 0.80126953, + "step": 433, + "time_per_iteration": 2.777506113052368 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010961, + "balance_loss_mlp": 1.0161345, + "epoch": 0.0834936514043863, + "flos": 562498268160.0, + "grad_norm": 0.03167346665720251, + "language_loss": 0.92772877, + "learning_rate": 0.0009925176589565695, + "loss": 0.93868983, + "num_input_tokens_seen": 35800944, + "router_z_loss_mlp": 0.79931641, + "step": 434, + "time_per_iteration": 2.801501512527466 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093857, + "balance_loss_mlp": 1.01398647, + "epoch": 0.08368603308964986, + "flos": 495513868032.0, + "grad_norm": 0.03411426988917409, + "language_loss": 1.03318536, + "learning_rate": 0.0009924638682233791, + "loss": 1.04412401, + "num_input_tokens_seen": 35866288, + "router_z_loss_mlp": 0.79833984, + "step": 435, + "time_per_iteration": 2.573282241821289 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092567, + "balance_loss_mlp": 1.01512909, + "epoch": 0.08387841477491342, + "flos": 1391811397632.0, + "grad_norm": 0.030642245427906535, + "language_loss": 0.79564589, + "learning_rate": 0.0009924098862987589, + "loss": 0.8065716, + "num_input_tokens_seen": 36083040, + "router_z_loss_mlp": 0.7734375, + "step": 436, + "time_per_iteration": 4.596274375915527 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099407, + "balance_loss_mlp": 1.02006125, + "epoch": 0.084070796460177, + "flos": 800355155712.0, + "grad_norm": 0.040681894877429646, + "language_loss": 0.92768085, + "learning_rate": 0.0009923557132036668, + "loss": 0.93867493, + "num_input_tokens_seen": 36158816, + "router_z_loss_mlp": 0.79296875, + "step": 437, + "time_per_iteration": 3.0366878509521484 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110232, + "balance_loss_mlp": 1.02364242, + "epoch": 0.08426317814544056, + "flos": 560097254400.0, + "grad_norm": 0.034275916488964116, + "language_loss": 0.96774155, + "learning_rate": 0.0009923013489591345, + "loss": 0.97876477, + "num_input_tokens_seen": 36236432, + "router_z_loss_mlp": 0.78613281, + "step": 438, + "time_per_iteration": 2.8060851097106934 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100668, + "balance_loss_mlp": 1.0219903, + "epoch": 0.08445555983070412, + "flos": 811884881664.0, + "grad_norm": 0.035250716051411925, + "language_loss": 0.95655745, + "learning_rate": 0.0009922467935862681, + "loss": 0.96756417, + "num_input_tokens_seen": 36327952, + "router_z_loss_mlp": 0.78613281, + "step": 439, + "time_per_iteration": 3.116757869720459 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098598, + "balance_loss_mlp": 1.0204916, + "epoch": 0.08464794151596768, + "flos": 511170034944.0, + "grad_norm": 0.03561138790794706, + "language_loss": 0.98418635, + "learning_rate": 0.0009921920471062478, + "loss": 0.99517238, + "num_input_tokens_seen": 36394896, + "router_z_loss_mlp": 0.78027344, + "step": 440, + "time_per_iteration": 2.6008944511413574 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093389, + "balance_loss_mlp": 1.01561701, + "epoch": 0.08484032320123125, + "flos": 557474609664.0, + "grad_norm": 0.02914226137027636, + "language_loss": 0.96590662, + "learning_rate": 0.0009921371095403281, + "loss": 0.97684056, + "num_input_tokens_seen": 36464656, + "router_z_loss_mlp": 0.77685547, + "step": 441, + "time_per_iteration": 2.638679265975952 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094838, + "balance_loss_mlp": 1.01697087, + "epoch": 0.08503270488649481, + "flos": 528361100544.0, + "grad_norm": 0.02987504029564206, + "language_loss": 0.99685514, + "learning_rate": 0.0009920819809098379, + "loss": 1.00780344, + "num_input_tokens_seen": 36532208, + "router_z_loss_mlp": 0.77783203, + "step": 442, + "time_per_iteration": 2.5915398597717285 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089542, + "balance_loss_mlp": 1.01172209, + "epoch": 0.08522508657175837, + "flos": 615386678016.0, + "grad_norm": 0.03983619354546574, + "language_loss": 0.95535469, + "learning_rate": 0.0009920266612361798, + "loss": 0.96625006, + "num_input_tokens_seen": 36607360, + "router_z_loss_mlp": 0.77734375, + "step": 443, + "time_per_iteration": 2.724025249481201 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091681, + "balance_loss_mlp": 1.01371801, + "epoch": 0.08541746825702193, + "flos": 620987746560.0, + "grad_norm": 0.032808156584867194, + "language_loss": 0.9504559, + "learning_rate": 0.0009919711505408308, + "loss": 0.96137273, + "num_input_tokens_seen": 36680688, + "router_z_loss_mlp": 0.77880859, + "step": 444, + "time_per_iteration": 2.780973434448242 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087177, + "balance_loss_mlp": 1.00926137, + "epoch": 0.08560984994228549, + "flos": 483888877824.0, + "grad_norm": 0.03232110076143325, + "language_loss": 0.92813373, + "learning_rate": 0.000991915448845342, + "loss": 0.93900549, + "num_input_tokens_seen": 36746288, + "router_z_loss_mlp": 0.77832031, + "step": 445, + "time_per_iteration": 2.6011459827423096 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090069, + "balance_loss_mlp": 1.01243973, + "epoch": 0.08580223162754906, + "flos": 518177690112.0, + "grad_norm": 0.03377956208163177, + "language_loss": 1.02285504, + "learning_rate": 0.000991859556171339, + "loss": 1.03375578, + "num_input_tokens_seen": 36812528, + "router_z_loss_mlp": 0.77539062, + "step": 446, + "time_per_iteration": 2.606220006942749 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088539, + "balance_loss_mlp": 1.01086187, + "epoch": 0.08599461331281262, + "flos": 532520589312.0, + "grad_norm": 0.037753212584348855, + "language_loss": 1.04541254, + "learning_rate": 0.000991803472540521, + "loss": 1.0562979, + "num_input_tokens_seen": 36879248, + "router_z_loss_mlp": 0.77587891, + "step": 447, + "time_per_iteration": 2.625401735305786 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088712, + "balance_loss_mlp": 1.01113105, + "epoch": 0.08618699499807618, + "flos": 791634712320.0, + "grad_norm": 0.030920782852134367, + "language_loss": 0.98781657, + "learning_rate": 0.0009917471979746615, + "loss": 0.99870372, + "num_input_tokens_seen": 36951376, + "router_z_loss_mlp": 0.77490234, + "step": 448, + "time_per_iteration": 3.0066978931427 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089961, + "balance_loss_mlp": 1.01195049, + "epoch": 0.08637937668333974, + "flos": 567115603200.0, + "grad_norm": 0.03238149886931097, + "language_loss": 0.98317528, + "learning_rate": 0.0009916907324956086, + "loss": 0.99407488, + "num_input_tokens_seen": 37025936, + "router_z_loss_mlp": 0.77929688, + "step": 449, + "time_per_iteration": 2.7561135292053223 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091057, + "balance_loss_mlp": 1.01333201, + "epoch": 0.08657175836860331, + "flos": 446118108672.0, + "grad_norm": 0.029046506526173844, + "language_loss": 0.94927382, + "learning_rate": 0.0009916340761252837, + "loss": 0.96018445, + "num_input_tokens_seen": 37095872, + "router_z_loss_mlp": 0.77636719, + "step": 450, + "time_per_iteration": 2.6452889442443848 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089525, + "balance_loss_mlp": 1.01222932, + "epoch": 0.08676414005386687, + "flos": 845589480960.0, + "grad_norm": 0.032144406787761336, + "language_loss": 0.91630232, + "learning_rate": 0.0009915772288856832, + "loss": 0.92719758, + "num_input_tokens_seen": 37179072, + "router_z_loss_mlp": 0.77197266, + "step": 451, + "time_per_iteration": 3.0991322994232178 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108797, + "balance_loss_mlp": 1.01086605, + "epoch": 0.08695652173913043, + "flos": 604484906496.0, + "grad_norm": 0.025568476728402203, + "language_loss": 0.93134868, + "learning_rate": 0.000991520190798877, + "loss": 0.94222844, + "num_input_tokens_seen": 37260288, + "router_z_loss_mlp": 0.77001953, + "step": 452, + "time_per_iteration": 2.833534002304077 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093662, + "balance_loss_mlp": 1.01660514, + "epoch": 0.08714890342439399, + "flos": 732001107456.0, + "grad_norm": 0.03795734255344977, + "language_loss": 1.02428043, + "learning_rate": 0.0009914629618870089, + "loss": 1.03521705, + "num_input_tokens_seen": 37331136, + "router_z_loss_mlp": 0.76953125, + "step": 453, + "time_per_iteration": 2.9043643474578857 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098724, + "balance_loss_mlp": 1.02319336, + "epoch": 0.08734128510965757, + "flos": 1485456770304.0, + "grad_norm": 0.019964198948139205, + "language_loss": 0.78675872, + "learning_rate": 0.0009914055421722976, + "loss": 0.79774594, + "num_input_tokens_seen": 37559040, + "router_z_loss_mlp": 0.75390625, + "step": 454, + "time_per_iteration": 2.093019723892212 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087936, + "balance_loss_mlp": 1.01278687, + "epoch": 0.08753366679492113, + "flos": 1526269146624.0, + "grad_norm": 0.012226751630218, + "language_loss": 0.81427962, + "learning_rate": 0.0009913479316770353, + "loss": 0.82515901, + "num_input_tokens_seen": 37785136, + "router_z_loss_mlp": 0.75, + "step": 455, + "time_per_iteration": 4.905871391296387 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091043, + "balance_loss_mlp": 1.01379561, + "epoch": 0.08772604848018468, + "flos": 722525364480.0, + "grad_norm": 0.044152825797527884, + "language_loss": 0.95217329, + "learning_rate": 0.0009912901304235883, + "loss": 0.96308374, + "num_input_tokens_seen": 37858832, + "router_z_loss_mlp": 0.77148438, + "step": 456, + "time_per_iteration": 2.850330352783203 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090876, + "balance_loss_mlp": 1.01396191, + "epoch": 0.08791843016544824, + "flos": 709467542784.0, + "grad_norm": 0.038854584599924205, + "language_loss": 0.92178857, + "learning_rate": 0.000991232138434397, + "loss": 0.9326973, + "num_input_tokens_seen": 37931856, + "router_z_loss_mlp": 0.76806641, + "step": 457, + "time_per_iteration": 2.868957757949829 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091399, + "balance_loss_mlp": 1.01477098, + "epoch": 0.08811081185071182, + "flos": 474022362624.0, + "grad_norm": 0.04035146689108268, + "language_loss": 0.99321103, + "learning_rate": 0.000991173955731976, + "loss": 1.00412512, + "num_input_tokens_seen": 38002432, + "router_z_loss_mlp": 0.76513672, + "step": 458, + "time_per_iteration": 2.6747970581054688 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089272, + "balance_loss_mlp": 1.01288271, + "epoch": 0.08830319353597538, + "flos": 686315738880.0, + "grad_norm": 0.033089720334054364, + "language_loss": 1.03213239, + "learning_rate": 0.0009911155823389137, + "loss": 1.04302514, + "num_input_tokens_seen": 38081648, + "router_z_loss_mlp": 0.76269531, + "step": 459, + "time_per_iteration": 2.9462268352508545 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085442, + "balance_loss_mlp": 1.00881398, + "epoch": 0.08849557522123894, + "flos": 574609294848.0, + "grad_norm": 0.035557366742091014, + "language_loss": 0.99025905, + "learning_rate": 0.000991057018277873, + "loss": 1.00111353, + "num_input_tokens_seen": 38153424, + "router_z_loss_mlp": 0.76513672, + "step": 460, + "time_per_iteration": 2.6903369426727295 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086551, + "balance_loss_mlp": 1.00968456, + "epoch": 0.0886879569065025, + "flos": 565628336640.0, + "grad_norm": 0.039664118418905284, + "language_loss": 1.00002789, + "learning_rate": 0.0009909982635715898, + "loss": 1.01089334, + "num_input_tokens_seen": 38223008, + "router_z_loss_mlp": 0.76757812, + "step": 461, + "time_per_iteration": 2.620046615600586 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010895, + "balance_loss_mlp": 1.0128243, + "epoch": 0.08888033859176607, + "flos": 564957607680.0, + "grad_norm": 0.03231802322071402, + "language_loss": 0.98670942, + "learning_rate": 0.0009909393182428751, + "loss": 0.99760437, + "num_input_tokens_seen": 38294592, + "router_z_loss_mlp": 0.765625, + "step": 462, + "time_per_iteration": 2.6466307640075684 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090991, + "balance_loss_mlp": 1.01412475, + "epoch": 0.08907272027702963, + "flos": 466743499008.0, + "grad_norm": 0.03344290639259395, + "language_loss": 0.93214953, + "learning_rate": 0.000990880182314614, + "loss": 0.94305944, + "num_input_tokens_seen": 38365792, + "router_z_loss_mlp": 0.76757812, + "step": 463, + "time_per_iteration": 2.6666839122772217 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086555, + "balance_loss_mlp": 1.0100224, + "epoch": 0.08926510196229319, + "flos": 682844475648.0, + "grad_norm": 0.03261982194681884, + "language_loss": 0.93093467, + "learning_rate": 0.0009908208558097643, + "loss": 0.94180012, + "num_input_tokens_seen": 38447776, + "router_z_loss_mlp": 0.76416016, + "step": 464, + "time_per_iteration": 2.9068925380706787 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089482, + "balance_loss_mlp": 1.01323605, + "epoch": 0.08945748364755675, + "flos": 597822336768.0, + "grad_norm": 0.03309433671244878, + "language_loss": 0.95414662, + "learning_rate": 0.000990761338751359, + "loss": 0.9650414, + "num_input_tokens_seen": 38521632, + "router_z_loss_mlp": 0.76123047, + "step": 465, + "time_per_iteration": 2.774606227874756 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079613, + "balance_loss_mlp": 1.00732422, + "epoch": 0.08964986533282032, + "flos": 1589343879168.0, + "grad_norm": 0.03434681355524106, + "language_loss": 0.73659623, + "learning_rate": 0.0009907016311625045, + "loss": 0.74739242, + "num_input_tokens_seen": 38760528, + "router_z_loss_mlp": 0.72460938, + "step": 466, + "time_per_iteration": 4.996358394622803 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092523, + "balance_loss_mlp": 1.01646745, + "epoch": 0.08984224701808388, + "flos": 534550272768.0, + "grad_norm": 0.03379784984504044, + "language_loss": 0.98391378, + "learning_rate": 0.0009906417330663815, + "loss": 0.99483901, + "num_input_tokens_seen": 38827200, + "router_z_loss_mlp": 0.75927734, + "step": 467, + "time_per_iteration": 2.6774964332580566 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092653, + "balance_loss_mlp": 1.01678836, + "epoch": 0.09003462870334744, + "flos": 479850898176.0, + "grad_norm": 0.04271038491910547, + "language_loss": 0.94838965, + "learning_rate": 0.0009905816444862442, + "loss": 0.95931625, + "num_input_tokens_seen": 38891984, + "router_z_loss_mlp": 0.75732422, + "step": 468, + "time_per_iteration": 2.6558451652526855 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092919, + "balance_loss_mlp": 1.01691103, + "epoch": 0.090227010388611, + "flos": 654903283200.0, + "grad_norm": 0.031716132767048565, + "language_loss": 0.92225289, + "learning_rate": 0.0009905213654454216, + "loss": 0.933182, + "num_input_tokens_seen": 38977136, + "router_z_loss_mlp": 0.75878906, + "step": 469, + "time_per_iteration": 2.9322757720947266 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093686, + "balance_loss_mlp": 1.01796389, + "epoch": 0.09041939207387456, + "flos": 619359528960.0, + "grad_norm": 0.03474651138537023, + "language_loss": 1.00819349, + "learning_rate": 0.0009904608959673158, + "loss": 1.01913023, + "num_input_tokens_seen": 39052224, + "router_z_loss_mlp": 0.75585938, + "step": 470, + "time_per_iteration": 2.7938003540039062 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091787, + "balance_loss_mlp": 1.01620793, + "epoch": 0.09061177375913813, + "flos": 455296398336.0, + "grad_norm": 0.04023106246537731, + "language_loss": 1.00852847, + "learning_rate": 0.000990400236075403, + "loss": 1.01944637, + "num_input_tokens_seen": 39116832, + "router_z_loss_mlp": 0.75439453, + "step": 471, + "time_per_iteration": 2.5231049060821533 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085542, + "balance_loss_mlp": 1.01024961, + "epoch": 0.0908041554444017, + "flos": 545309147904.0, + "grad_norm": 0.036372029021066864, + "language_loss": 0.97571105, + "learning_rate": 0.0009903393857932338, + "loss": 0.98656648, + "num_input_tokens_seen": 39190528, + "router_z_loss_mlp": 0.75146484, + "step": 472, + "time_per_iteration": 2.700449228286743 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082684, + "balance_loss_mlp": 1.00786841, + "epoch": 0.09099653712966525, + "flos": 565467943680.0, + "grad_norm": 0.03263919317425628, + "language_loss": 0.95124531, + "learning_rate": 0.0009902783451444317, + "loss": 0.96207213, + "num_input_tokens_seen": 39263168, + "router_z_loss_mlp": 0.74658203, + "step": 473, + "time_per_iteration": 2.7006537914276123 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081251, + "balance_loss_mlp": 1.00667381, + "epoch": 0.09118891881492881, + "flos": 475502826240.0, + "grad_norm": 0.036465550100162274, + "language_loss": 0.98778975, + "learning_rate": 0.0009902171141526956, + "loss": 0.99860233, + "num_input_tokens_seen": 39330784, + "router_z_loss_mlp": 0.74414062, + "step": 474, + "time_per_iteration": 2.565852403640747 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081522, + "balance_loss_mlp": 1.00732613, + "epoch": 0.09138130050019239, + "flos": 546991800576.0, + "grad_norm": 0.03189281102051162, + "language_loss": 0.86324012, + "learning_rate": 0.000990155692841797, + "loss": 0.87405533, + "num_input_tokens_seen": 39417472, + "router_z_loss_mlp": 0.74023438, + "step": 475, + "time_per_iteration": 2.9694621562957764 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081909, + "balance_loss_mlp": 1.0079515, + "epoch": 0.09157368218545595, + "flos": 733974410496.0, + "grad_norm": 0.03574286330183218, + "language_loss": 0.98287529, + "learning_rate": 0.0009900940812355818, + "loss": 0.99369442, + "num_input_tokens_seen": 39488656, + "router_z_loss_mlp": 0.73779297, + "step": 476, + "time_per_iteration": 2.8549702167510986 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082166, + "balance_loss_mlp": 1.00835192, + "epoch": 0.0917660638707195, + "flos": 612073862400.0, + "grad_norm": 0.03800316101532587, + "language_loss": 0.95275486, + "learning_rate": 0.00099003227935797, + "loss": 0.96357656, + "num_input_tokens_seen": 39558224, + "router_z_loss_mlp": 0.73632812, + "step": 477, + "time_per_iteration": 2.709808349609375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084092, + "balance_loss_mlp": 1.01051593, + "epoch": 0.09195844555598306, + "flos": 657019482624.0, + "grad_norm": 0.03875864993538346, + "language_loss": 0.99037415, + "learning_rate": 0.000989970287232955, + "loss": 1.0012151, + "num_input_tokens_seen": 39629856, + "router_z_loss_mlp": 0.73486328, + "step": 478, + "time_per_iteration": 2.7670538425445557 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085912, + "balance_loss_mlp": 1.01252699, + "epoch": 0.09215082724124664, + "flos": 477541257984.0, + "grad_norm": 0.03367109557456403, + "language_loss": 0.95731258, + "learning_rate": 0.0009899081048846043, + "loss": 0.96817166, + "num_input_tokens_seen": 39695984, + "router_z_loss_mlp": 0.73339844, + "step": 479, + "time_per_iteration": 2.588352918624878 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085141, + "balance_loss_mlp": 1.01208997, + "epoch": 0.0923432089265102, + "flos": 525326296320.0, + "grad_norm": 0.0462740033589213, + "language_loss": 1.00606585, + "learning_rate": 0.0009898457323370593, + "loss": 1.01691723, + "num_input_tokens_seen": 39760256, + "router_z_loss_mlp": 0.73046875, + "step": 480, + "time_per_iteration": 2.5808160305023193 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082132, + "balance_loss_mlp": 1.00936687, + "epoch": 0.09253559061177376, + "flos": 546639912192.0, + "grad_norm": 0.03676160983227949, + "language_loss": 0.9798522, + "learning_rate": 0.000989783169614535, + "loss": 0.99067354, + "num_input_tokens_seen": 39827984, + "router_z_loss_mlp": 0.72900391, + "step": 481, + "time_per_iteration": 2.624483108520508 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097145, + "balance_loss_mlp": 1.02485657, + "epoch": 0.09272797229703732, + "flos": 1541337209856.0, + "grad_norm": 0.023489610904585654, + "language_loss": 0.78752756, + "learning_rate": 0.0009897204167413206, + "loss": 0.79849905, + "num_input_tokens_seen": 40056688, + "router_z_loss_mlp": 0.72460938, + "step": 482, + "time_per_iteration": 4.897305965423584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085543, + "balance_loss_mlp": 1.01330173, + "epoch": 0.09292035398230089, + "flos": 691065276672.0, + "grad_norm": 0.04252493421314706, + "language_loss": 0.95552129, + "learning_rate": 0.000989657473741779, + "loss": 0.96637678, + "num_input_tokens_seen": 40133120, + "router_z_loss_mlp": 0.72412109, + "step": 483, + "time_per_iteration": 2.8165738582611084 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084133, + "balance_loss_mlp": 1.01184416, + "epoch": 0.09311273566756445, + "flos": 510823004160.0, + "grad_norm": 0.03895509426778844, + "language_loss": 0.97422099, + "learning_rate": 0.0009895943406403465, + "loss": 0.98506236, + "num_input_tokens_seen": 40206464, + "router_z_loss_mlp": 0.72460938, + "step": 484, + "time_per_iteration": 2.7523326873779297 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086134, + "balance_loss_mlp": 1.01384509, + "epoch": 0.09330511735282801, + "flos": 660584064768.0, + "grad_norm": 0.04754513437429821, + "language_loss": 0.90526009, + "learning_rate": 0.0009895310174615338, + "loss": 0.91612148, + "num_input_tokens_seen": 40277744, + "router_z_loss_mlp": 0.72460938, + "step": 485, + "time_per_iteration": 2.843790292739868 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070969, + "balance_loss_mlp": 0.99982452, + "epoch": 0.09349749903809157, + "flos": 1456024420608.0, + "grad_norm": 0.007982392205281765, + "language_loss": 0.75718516, + "learning_rate": 0.0009894675042299251, + "loss": 0.76789486, + "num_input_tokens_seen": 40503664, + "router_z_loss_mlp": 0.71289062, + "step": 486, + "time_per_iteration": 4.649716138839722 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080781, + "balance_loss_mlp": 1.00877845, + "epoch": 0.09368988072335514, + "flos": 521900719872.0, + "grad_norm": 0.0379904908867083, + "language_loss": 0.94096279, + "learning_rate": 0.0009894038009701782, + "loss": 0.95177054, + "num_input_tokens_seen": 40571376, + "router_z_loss_mlp": 0.72167969, + "step": 487, + "time_per_iteration": 2.615767002105713 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085039, + "balance_loss_mlp": 1.012941, + "epoch": 0.0938822624086187, + "flos": 498752806656.0, + "grad_norm": 0.041516659048387576, + "language_loss": 0.97017074, + "learning_rate": 0.0009893399077070253, + "loss": 0.98102111, + "num_input_tokens_seen": 40638096, + "router_z_loss_mlp": 0.72265625, + "step": 488, + "time_per_iteration": 2.592867612838745 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090191, + "balance_loss_mlp": 1.01828361, + "epoch": 0.09407464409388226, + "flos": 534224629248.0, + "grad_norm": 0.031087819309936707, + "language_loss": 0.91152203, + "learning_rate": 0.0009892758244652718, + "loss": 0.92242396, + "num_input_tokens_seen": 40710992, + "router_z_loss_mlp": 0.72070312, + "step": 489, + "time_per_iteration": 2.702681541442871 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080571, + "balance_loss_mlp": 1.00852132, + "epoch": 0.09426702577914582, + "flos": 587091651840.0, + "grad_norm": 0.037758062155454256, + "language_loss": 0.98290044, + "learning_rate": 0.0009892115512697968, + "loss": 0.99370617, + "num_input_tokens_seen": 40778896, + "router_z_loss_mlp": 0.72216797, + "step": 490, + "time_per_iteration": 2.7222015857696533 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088543, + "balance_loss_mlp": 1.01649261, + "epoch": 0.0944594074644094, + "flos": 504464690688.0, + "grad_norm": 0.03400132145466818, + "language_loss": 0.98617911, + "learning_rate": 0.0009891470881455537, + "loss": 0.99706453, + "num_input_tokens_seen": 40853376, + "router_z_loss_mlp": 0.72216797, + "step": 491, + "time_per_iteration": 2.6978650093078613 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087839, + "balance_loss_mlp": 1.01626599, + "epoch": 0.09465178914967295, + "flos": 572114962176.0, + "grad_norm": 0.03537229102294209, + "language_loss": 0.97051454, + "learning_rate": 0.0009890824351175692, + "loss": 0.98139298, + "num_input_tokens_seen": 40923776, + "router_z_loss_mlp": 0.71728516, + "step": 492, + "time_per_iteration": 2.7183802127838135 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087578, + "balance_loss_mlp": 1.01590919, + "epoch": 0.09484417083493651, + "flos": 550419322368.0, + "grad_norm": 0.028677449722299516, + "language_loss": 1.00688422, + "learning_rate": 0.0009890175922109435, + "loss": 1.01776004, + "num_input_tokens_seen": 40996848, + "router_z_loss_mlp": 0.71826172, + "step": 493, + "time_per_iteration": 2.680469512939453 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082456, + "balance_loss_mlp": 1.01088285, + "epoch": 0.09503655252020007, + "flos": 825272237568.0, + "grad_norm": 0.03488638846892438, + "language_loss": 0.98808897, + "learning_rate": 0.0009889525594508513, + "loss": 0.99891359, + "num_input_tokens_seen": 41071280, + "router_z_loss_mlp": 0.71728516, + "step": 494, + "time_per_iteration": 2.983400344848633 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083066, + "balance_loss_mlp": 1.01154041, + "epoch": 0.09522893420546363, + "flos": 405518615040.0, + "grad_norm": 0.028649644857800794, + "language_loss": 0.9245472, + "learning_rate": 0.0009888873368625404, + "loss": 0.93537784, + "num_input_tokens_seen": 41136304, + "router_z_loss_mlp": 0.71679688, + "step": 495, + "time_per_iteration": 2.497526168823242 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108369, + "balance_loss_mlp": 1.01206875, + "epoch": 0.0954213158907272, + "flos": 692257035264.0, + "grad_norm": 0.03396045626839725, + "language_loss": 0.96602595, + "learning_rate": 0.0009888219244713326, + "loss": 0.97686291, + "num_input_tokens_seen": 41212384, + "router_z_loss_mlp": 0.71777344, + "step": 496, + "time_per_iteration": 2.8588504791259766 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108171, + "balance_loss_mlp": 1.01018417, + "epoch": 0.09561369757599077, + "flos": 520075170816.0, + "grad_norm": 0.039869543083186736, + "language_loss": 0.97707164, + "learning_rate": 0.0009887563223026229, + "loss": 0.98788875, + "num_input_tokens_seen": 41282528, + "router_z_loss_mlp": 0.71679688, + "step": 497, + "time_per_iteration": 2.6856894493103027 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075874, + "balance_loss_mlp": 1.00644684, + "epoch": 0.09580607926125433, + "flos": 1388784363264.0, + "grad_norm": 0.01625235818526382, + "language_loss": 0.7906816, + "learning_rate": 0.0009886905303818805, + "loss": 0.80144036, + "num_input_tokens_seen": 41512256, + "router_z_loss_mlp": 0.6953125, + "step": 498, + "time_per_iteration": 4.882593393325806 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086748, + "balance_loss_mlp": 1.0150795, + "epoch": 0.09599846094651789, + "flos": 718826634240.0, + "grad_norm": 0.03326061844711544, + "language_loss": 0.95632416, + "learning_rate": 0.0009886245487346482, + "loss": 0.9671917, + "num_input_tokens_seen": 41596816, + "router_z_loss_mlp": 0.71826172, + "step": 499, + "time_per_iteration": 3.0426785945892334 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087163, + "balance_loss_mlp": 1.01568544, + "epoch": 0.09619084263178146, + "flos": 386894717952.0, + "grad_norm": 0.04298067648683731, + "language_loss": 0.98954022, + "learning_rate": 0.0009885583773865422, + "loss": 1.00041187, + "num_input_tokens_seen": 41658544, + "router_z_loss_mlp": 0.71630859, + "step": 500, + "time_per_iteration": 2.452941417694092 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086205, + "balance_loss_mlp": 1.01467967, + "epoch": 0.09638322431704502, + "flos": 535173369600.0, + "grad_norm": 0.04172266818012015, + "language_loss": 0.95971203, + "learning_rate": 0.0009884920163632524, + "loss": 0.97057414, + "num_input_tokens_seen": 41730736, + "router_z_loss_mlp": 0.71679688, + "step": 501, + "time_per_iteration": 2.657940626144409 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080406, + "balance_loss_mlp": 1.00911927, + "epoch": 0.09657560600230858, + "flos": 501657353472.0, + "grad_norm": 0.041437287127294276, + "language_loss": 0.9960922, + "learning_rate": 0.000988425465690543, + "loss": 1.00689626, + "num_input_tokens_seen": 41797824, + "router_z_loss_mlp": 0.71435547, + "step": 502, + "time_per_iteration": 2.5540428161621094 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077905, + "balance_loss_mlp": 1.00642741, + "epoch": 0.09676798768757214, + "flos": 530332458240.0, + "grad_norm": 0.03187665411612151, + "language_loss": 0.96807587, + "learning_rate": 0.0009883587253942505, + "loss": 0.97885495, + "num_input_tokens_seen": 41875520, + "router_z_loss_mlp": 0.71630859, + "step": 503, + "time_per_iteration": 2.7744338512420654 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086901, + "balance_loss_mlp": 1.01542282, + "epoch": 0.09696036937283571, + "flos": 464557313280.0, + "grad_norm": 0.038653015311582224, + "language_loss": 1.0234406, + "learning_rate": 0.0009882917955002862, + "loss": 1.03430974, + "num_input_tokens_seen": 41942224, + "router_z_loss_mlp": 0.71630859, + "step": 504, + "time_per_iteration": 2.500669479370117 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081799, + "balance_loss_mlp": 1.01074982, + "epoch": 0.09715275105809927, + "flos": 536011294464.0, + "grad_norm": 0.035792041916504785, + "language_loss": 0.94188601, + "learning_rate": 0.0009882246760346343, + "loss": 0.95270395, + "num_input_tokens_seen": 42007552, + "router_z_loss_mlp": 0.71191406, + "step": 505, + "time_per_iteration": 2.6442148685455322 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077575, + "balance_loss_mlp": 1.00652647, + "epoch": 0.09734513274336283, + "flos": 455882556672.0, + "grad_norm": 0.04461237962136338, + "language_loss": 1.00418711, + "learning_rate": 0.0009881573670233533, + "loss": 1.01496279, + "num_input_tokens_seen": 42071760, + "router_z_loss_mlp": 0.71191406, + "step": 506, + "time_per_iteration": 2.5102410316467285 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075502, + "balance_loss_mlp": 1.00450063, + "epoch": 0.09753751442862639, + "flos": 509828577024.0, + "grad_norm": 0.03506590591484262, + "language_loss": 0.93374205, + "learning_rate": 0.0009880898684925747, + "loss": 0.94449711, + "num_input_tokens_seen": 42140688, + "router_z_loss_mlp": 0.71142578, + "step": 507, + "time_per_iteration": 2.652381658554077 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077009, + "balance_loss_mlp": 1.00624609, + "epoch": 0.09772989611388996, + "flos": 485247832320.0, + "grad_norm": 0.03501422949918711, + "language_loss": 0.92606336, + "learning_rate": 0.0009880221804685037, + "loss": 0.9368335, + "num_input_tokens_seen": 42208544, + "router_z_loss_mlp": 0.70898438, + "step": 508, + "time_per_iteration": 2.5481274127960205 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073608, + "balance_loss_mlp": 1.00456238, + "epoch": 0.09792227779915352, + "flos": 1569319231488.0, + "grad_norm": 0.011873284077886747, + "language_loss": 0.79344422, + "learning_rate": 0.000987954302977419, + "loss": 0.80418032, + "num_input_tokens_seen": 42426624, + "router_z_loss_mlp": 0.69140625, + "step": 509, + "time_per_iteration": 4.725191354751587 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076044, + "balance_loss_mlp": 1.00590122, + "epoch": 0.09811465948441708, + "flos": 588915255552.0, + "grad_norm": 0.04172960474096109, + "language_loss": 0.98818666, + "learning_rate": 0.0009878862360456733, + "loss": 0.99894708, + "num_input_tokens_seen": 42494592, + "router_z_loss_mlp": 0.70263672, + "step": 510, + "time_per_iteration": 2.7094569206237793 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078332, + "balance_loss_mlp": 1.00828481, + "epoch": 0.09830704116968064, + "flos": 614129790720.0, + "grad_norm": 0.037035801977756785, + "language_loss": 0.90851068, + "learning_rate": 0.0009878179796996922, + "loss": 0.919294, + "num_input_tokens_seen": 42564944, + "router_z_loss_mlp": 0.70166016, + "step": 511, + "time_per_iteration": 2.6973366737365723 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079637, + "balance_loss_mlp": 1.00973296, + "epoch": 0.09849942285494422, + "flos": 539936513280.0, + "grad_norm": 0.0318668020933778, + "language_loss": 0.94484478, + "learning_rate": 0.0009877495339659754, + "loss": 0.95564115, + "num_input_tokens_seen": 42645616, + "router_z_loss_mlp": 0.70019531, + "step": 512, + "time_per_iteration": 2.7476089000701904 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083598, + "balance_loss_mlp": 1.0137887, + "epoch": 0.09869180454020778, + "flos": 621604040448.0, + "grad_norm": 0.03763698097825182, + "language_loss": 0.89467418, + "learning_rate": 0.000987680898871096, + "loss": 0.90551007, + "num_input_tokens_seen": 42713632, + "router_z_loss_mlp": 0.69921875, + "step": 513, + "time_per_iteration": 2.7254321575164795 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083583, + "balance_loss_mlp": 1.01382184, + "epoch": 0.09888418622547133, + "flos": 813061089024.0, + "grad_norm": 0.049179676158016515, + "language_loss": 0.91816097, + "learning_rate": 0.0009876120744417, + "loss": 0.9289968, + "num_input_tokens_seen": 42789088, + "router_z_loss_mlp": 0.69873047, + "step": 514, + "time_per_iteration": 2.9596974849700928 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083293, + "balance_loss_mlp": 1.01357901, + "epoch": 0.0990765679107349, + "flos": 536857967616.0, + "grad_norm": 0.03966041946019195, + "language_loss": 0.99294269, + "learning_rate": 0.0009875430607045078, + "loss": 1.0037756, + "num_input_tokens_seen": 42861168, + "router_z_loss_mlp": 0.69824219, + "step": 515, + "time_per_iteration": 2.7065181732177734 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083779, + "balance_loss_mlp": 1.01439941, + "epoch": 0.09926894959599845, + "flos": 588971635968.0, + "grad_norm": 0.037836000479060286, + "language_loss": 0.94664383, + "learning_rate": 0.000987473857686313, + "loss": 0.95748156, + "num_input_tokens_seen": 42934112, + "router_z_loss_mlp": 0.69482422, + "step": 516, + "time_per_iteration": 2.712947130203247 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085113, + "balance_loss_mlp": 1.01582849, + "epoch": 0.09946133128126203, + "flos": 642387878400.0, + "grad_norm": 0.04191957443387863, + "language_loss": 0.98466003, + "learning_rate": 0.0009874044654139824, + "loss": 0.99551111, + "num_input_tokens_seen": 43005248, + "router_z_loss_mlp": 0.69384766, + "step": 517, + "time_per_iteration": 2.7391469478607178 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081227, + "balance_loss_mlp": 1.01194227, + "epoch": 0.09965371296652559, + "flos": 466726002432.0, + "grad_norm": 0.049265237591549625, + "language_loss": 0.97911566, + "learning_rate": 0.0009873348839144563, + "loss": 0.98992795, + "num_input_tokens_seen": 43070576, + "router_z_loss_mlp": 0.69384766, + "step": 518, + "time_per_iteration": 2.5496554374694824 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081078, + "balance_loss_mlp": 1.01198411, + "epoch": 0.09984609465178915, + "flos": 484559606784.0, + "grad_norm": 0.04039588305244337, + "language_loss": 0.99084902, + "learning_rate": 0.000987265113214749, + "loss": 1.00165975, + "num_input_tokens_seen": 43138048, + "router_z_loss_mlp": 0.69189453, + "step": 519, + "time_per_iteration": 2.592350721359253 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081099, + "balance_loss_mlp": 1.01200545, + "epoch": 0.1000384763370527, + "flos": 570095972352.0, + "grad_norm": 0.04690738730083641, + "language_loss": 1.01784182, + "learning_rate": 0.0009871951533419476, + "loss": 1.02865279, + "num_input_tokens_seen": 43207600, + "router_z_loss_mlp": 0.69189453, + "step": 520, + "time_per_iteration": 2.699725866317749 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077691, + "balance_loss_mlp": 1.00854921, + "epoch": 0.10023085802231628, + "flos": 546926671872.0, + "grad_norm": 0.03422053119670882, + "language_loss": 0.91227025, + "learning_rate": 0.0009871250043232132, + "loss": 0.92304718, + "num_input_tokens_seen": 43285104, + "router_z_loss_mlp": 0.69238281, + "step": 521, + "time_per_iteration": 2.74124813079834 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078273, + "balance_loss_mlp": 1.00913203, + "epoch": 0.10042323970757984, + "flos": 504440391168.0, + "grad_norm": 0.0407416967929008, + "language_loss": 0.91114902, + "learning_rate": 0.0009870546661857797, + "loss": 0.92193174, + "num_input_tokens_seen": 43353312, + "router_z_loss_mlp": 0.69238281, + "step": 522, + "time_per_iteration": 2.6524126529693604 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080712, + "balance_loss_mlp": 1.01199949, + "epoch": 0.1006156213928434, + "flos": 771725737728.0, + "grad_norm": 0.04764395650012834, + "language_loss": 1.0071038, + "learning_rate": 0.0009869841389569553, + "loss": 1.01791096, + "num_input_tokens_seen": 43427680, + "router_z_loss_mlp": 0.68798828, + "step": 523, + "time_per_iteration": 2.9797816276550293 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081234, + "balance_loss_mlp": 1.01237857, + "epoch": 0.10080800307810696, + "flos": 491009293824.0, + "grad_norm": 0.04526617857315469, + "language_loss": 0.93126583, + "learning_rate": 0.0009869134226641206, + "loss": 0.94207817, + "num_input_tokens_seen": 43495200, + "router_z_loss_mlp": 0.68945312, + "step": 524, + "time_per_iteration": 2.624396562576294 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079355, + "balance_loss_mlp": 1.01064241, + "epoch": 0.10100038476337053, + "flos": 455713415424.0, + "grad_norm": 0.04976961118682096, + "language_loss": 0.93662071, + "learning_rate": 0.0009868425173347303, + "loss": 0.94741422, + "num_input_tokens_seen": 43566256, + "router_z_loss_mlp": 0.68798828, + "step": 525, + "time_per_iteration": 2.659106731414795 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077138, + "balance_loss_mlp": 1.00809169, + "epoch": 0.10119276644863409, + "flos": 557574731520.0, + "grad_norm": 0.04197638521891018, + "language_loss": 0.9924143, + "learning_rate": 0.0009867714229963125, + "loss": 1.00318575, + "num_input_tokens_seen": 43639696, + "router_z_loss_mlp": 0.69140625, + "step": 526, + "time_per_iteration": 2.7414495944976807 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080772, + "balance_loss_mlp": 1.01201165, + "epoch": 0.10138514813389765, + "flos": 517220201472.0, + "grad_norm": 0.044929109849797505, + "language_loss": 0.96641302, + "learning_rate": 0.000986700139676468, + "loss": 0.97722065, + "num_input_tokens_seen": 43703872, + "router_z_loss_mlp": 0.68847656, + "step": 527, + "time_per_iteration": 2.620313882827759 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083057, + "balance_loss_mlp": 1.01405847, + "epoch": 0.10157752981916121, + "flos": 501564034560.0, + "grad_norm": 0.03558874762709202, + "language_loss": 0.9424324, + "learning_rate": 0.0009866286674028717, + "loss": 0.95326293, + "num_input_tokens_seen": 43774416, + "router_z_loss_mlp": 0.69091797, + "step": 528, + "time_per_iteration": 2.632835865020752 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082511, + "balance_loss_mlp": 1.01379848, + "epoch": 0.10176991150442478, + "flos": 658094589696.0, + "grad_norm": 0.042026744727430246, + "language_loss": 0.91470444, + "learning_rate": 0.0009865570062032717, + "loss": 0.9255296, + "num_input_tokens_seen": 43853376, + "router_z_loss_mlp": 0.68798828, + "step": 529, + "time_per_iteration": 2.9185874462127686 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084193, + "balance_loss_mlp": 1.01519477, + "epoch": 0.10196229318968834, + "flos": 574403215104.0, + "grad_norm": 0.031693910674612406, + "language_loss": 0.95307148, + "learning_rate": 0.0009864851561054893, + "loss": 0.96391344, + "num_input_tokens_seen": 43929632, + "router_z_loss_mlp": 0.69091797, + "step": 530, + "time_per_iteration": 2.7826597690582275 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086656, + "balance_loss_mlp": 1.01765728, + "epoch": 0.1021546748749519, + "flos": 519256687872.0, + "grad_norm": 0.0418084670656813, + "language_loss": 0.94574928, + "learning_rate": 0.0009864131171374191, + "loss": 0.95661592, + "num_input_tokens_seen": 44002144, + "router_z_loss_mlp": 0.69091797, + "step": 531, + "time_per_iteration": 2.67000150680542 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088238, + "balance_loss_mlp": 1.01919198, + "epoch": 0.10234705656021546, + "flos": 610954035456.0, + "grad_norm": 0.03906444640078033, + "language_loss": 0.94287467, + "learning_rate": 0.0009863408893270292, + "loss": 0.95375705, + "num_input_tokens_seen": 44078272, + "router_z_loss_mlp": 0.69140625, + "step": 532, + "time_per_iteration": 2.7893166542053223 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089057, + "balance_loss_mlp": 1.02029741, + "epoch": 0.10253943824547904, + "flos": 602913069312.0, + "grad_norm": 0.046708965243717, + "language_loss": 0.90346718, + "learning_rate": 0.0009862684727023605, + "loss": 0.91435778, + "num_input_tokens_seen": 44152304, + "router_z_loss_mlp": 0.68847656, + "step": 533, + "time_per_iteration": 2.7212483882904053 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079068, + "balance_loss_mlp": 1.0105468, + "epoch": 0.1027318199307426, + "flos": 664157395200.0, + "grad_norm": 0.04923575085492922, + "language_loss": 0.9286049, + "learning_rate": 0.0009861958672915283, + "loss": 0.93939555, + "num_input_tokens_seen": 44226720, + "router_z_loss_mlp": 0.68603516, + "step": 534, + "time_per_iteration": 2.8216443061828613 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080998, + "balance_loss_mlp": 1.01271474, + "epoch": 0.10292420161600616, + "flos": 684531019008.0, + "grad_norm": 0.03566434899904423, + "language_loss": 0.91122925, + "learning_rate": 0.0009861230731227201, + "loss": 0.92203927, + "num_input_tokens_seen": 44303600, + "router_z_loss_mlp": 0.68359375, + "step": 535, + "time_per_iteration": 2.8432843685150146 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082908, + "balance_loss_mlp": 1.01514912, + "epoch": 0.10311658330126972, + "flos": 491269808640.0, + "grad_norm": 0.04656876258351904, + "language_loss": 0.9494285, + "learning_rate": 0.0009860500902241973, + "loss": 0.96025753, + "num_input_tokens_seen": 44370960, + "router_z_loss_mlp": 0.67822266, + "step": 536, + "time_per_iteration": 2.601234197616577 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085787, + "balance_loss_mlp": 1.01831496, + "epoch": 0.10330896498653329, + "flos": 432687011328.0, + "grad_norm": 0.046264109011482965, + "language_loss": 0.99409795, + "learning_rate": 0.0009859769186242942, + "loss": 1.00495577, + "num_input_tokens_seen": 44435584, + "router_z_loss_mlp": 0.67529297, + "step": 537, + "time_per_iteration": 2.527156114578247 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079891, + "balance_loss_mlp": 1.01265681, + "epoch": 0.10350134667179685, + "flos": 550642898688.0, + "grad_norm": 0.04274411195548745, + "language_loss": 0.92667055, + "learning_rate": 0.0009859035583514187, + "loss": 0.93746948, + "num_input_tokens_seen": 44505456, + "router_z_loss_mlp": 0.67285156, + "step": 538, + "time_per_iteration": 2.6489107608795166 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082236, + "balance_loss_mlp": 1.01505005, + "epoch": 0.10369372835706041, + "flos": 641827964928.0, + "grad_norm": 0.04978782417937993, + "language_loss": 0.95941103, + "learning_rate": 0.0009858300094340517, + "loss": 0.97023344, + "num_input_tokens_seen": 44580208, + "router_z_loss_mlp": 0.67236328, + "step": 539, + "time_per_iteration": 2.8078534603118896 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107826, + "balance_loss_mlp": 1.01102614, + "epoch": 0.10388611004232397, + "flos": 522766834944.0, + "grad_norm": 0.04233995967203171, + "language_loss": 0.8846426, + "learning_rate": 0.0009857562719007473, + "loss": 0.8954252, + "num_input_tokens_seen": 44646576, + "router_z_loss_mlp": 0.67285156, + "step": 540, + "time_per_iteration": 2.605253219604492 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108211, + "balance_loss_mlp": 1.01487637, + "epoch": 0.10407849172758753, + "flos": 703741074432.0, + "grad_norm": 0.04489314852578161, + "language_loss": 0.9024663, + "learning_rate": 0.0009856823457801331, + "loss": 0.91328734, + "num_input_tokens_seen": 44726752, + "router_z_loss_mlp": 0.67285156, + "step": 541, + "time_per_iteration": 2.8836264610290527 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074756, + "balance_loss_mlp": 1.00737894, + "epoch": 0.1042708734128511, + "flos": 503945606400.0, + "grad_norm": 0.04545070943505171, + "language_loss": 0.97841358, + "learning_rate": 0.00098560823110091, + "loss": 0.98916113, + "num_input_tokens_seen": 44795824, + "router_z_loss_mlp": 0.67431641, + "step": 542, + "time_per_iteration": 2.629241466522217 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078174, + "balance_loss_mlp": 1.01084471, + "epoch": 0.10446325509811466, + "flos": 486641779968.0, + "grad_norm": 0.04151430298304091, + "language_loss": 0.974545, + "learning_rate": 0.000985533927891851, + "loss": 0.98532677, + "num_input_tokens_seen": 44868496, + "router_z_loss_mlp": 0.67382812, + "step": 543, + "time_per_iteration": 2.712714195251465 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078049, + "balance_loss_mlp": 1.01110125, + "epoch": 0.10465563678337822, + "flos": 569713948416.0, + "grad_norm": 0.043537531534841835, + "language_loss": 0.9559319, + "learning_rate": 0.0009854594361818044, + "loss": 0.96671236, + "num_input_tokens_seen": 44939888, + "router_z_loss_mlp": 0.66992188, + "step": 544, + "time_per_iteration": 2.66324520111084 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075343, + "balance_loss_mlp": 1.00806153, + "epoch": 0.10484801846864178, + "flos": 627243992832.0, + "grad_norm": 0.042858245855360314, + "language_loss": 0.94459403, + "learning_rate": 0.0009853847559996897, + "loss": 0.95534742, + "num_input_tokens_seen": 45012720, + "router_z_loss_mlp": 0.67333984, + "step": 545, + "time_per_iteration": 2.749379873275757 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074542, + "balance_loss_mlp": 1.00697374, + "epoch": 0.10504040015390535, + "flos": 744813965568.0, + "grad_norm": 0.04113973833070077, + "language_loss": 0.93940508, + "learning_rate": 0.0009853098873745, + "loss": 0.95015049, + "num_input_tokens_seen": 45093744, + "router_z_loss_mlp": 0.67626953, + "step": 546, + "time_per_iteration": 3.0356035232543945 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082094, + "balance_loss_mlp": 1.01457405, + "epoch": 0.10523278183916891, + "flos": 587843060736.0, + "grad_norm": 0.04039468180414331, + "language_loss": 0.92498314, + "learning_rate": 0.0009852348303353027, + "loss": 0.93580401, + "num_input_tokens_seen": 45172784, + "router_z_loss_mlp": 0.67578125, + "step": 547, + "time_per_iteration": 2.787853479385376 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080495, + "balance_loss_mlp": 1.01283157, + "epoch": 0.10542516352443247, + "flos": 871147156224.0, + "grad_norm": 0.04319215205461418, + "language_loss": 0.86143011, + "learning_rate": 0.000985159584911237, + "loss": 0.872235, + "num_input_tokens_seen": 45255600, + "router_z_loss_mlp": 0.67724609, + "step": 548, + "time_per_iteration": 3.103173017501831 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077016, + "balance_loss_mlp": 1.00949633, + "epoch": 0.10561754520969603, + "flos": 506413694208.0, + "grad_norm": 0.04405333210851084, + "language_loss": 0.94064271, + "learning_rate": 0.0009850841511315162, + "loss": 0.95141286, + "num_input_tokens_seen": 45325072, + "router_z_loss_mlp": 0.67578125, + "step": 549, + "time_per_iteration": 2.647629737854004 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107948, + "balance_loss_mlp": 1.01176953, + "epoch": 0.1058099268949596, + "flos": 561148061952.0, + "grad_norm": 0.03728506713954383, + "language_loss": 0.9326818, + "learning_rate": 0.0009850085290254256, + "loss": 0.94347662, + "num_input_tokens_seen": 45401440, + "router_z_loss_mlp": 0.67773438, + "step": 550, + "time_per_iteration": 2.7680838108062744 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081585, + "balance_loss_mlp": 1.01411295, + "epoch": 0.10600230858022316, + "flos": 563160248832.0, + "grad_norm": 0.031635589688873186, + "language_loss": 0.90350562, + "learning_rate": 0.0009849327186223246, + "loss": 0.91432148, + "num_input_tokens_seen": 45479264, + "router_z_loss_mlp": 0.67529297, + "step": 551, + "time_per_iteration": 2.7540531158447266 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077249, + "balance_loss_mlp": 1.01001453, + "epoch": 0.10619469026548672, + "flos": 495318481920.0, + "grad_norm": 0.03875875468173829, + "language_loss": 0.97612774, + "learning_rate": 0.000984856719951646, + "loss": 0.98690015, + "num_input_tokens_seen": 45547328, + "router_z_loss_mlp": 0.67285156, + "step": 552, + "time_per_iteration": 2.5471906661987305 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080098, + "balance_loss_mlp": 1.01300704, + "epoch": 0.10638707195075028, + "flos": 677465038080.0, + "grad_norm": 0.04041077275123314, + "language_loss": 0.94560456, + "learning_rate": 0.0009847805330428943, + "loss": 0.95640558, + "num_input_tokens_seen": 45631152, + "router_z_loss_mlp": 0.67138672, + "step": 553, + "time_per_iteration": 2.879901647567749 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081173, + "balance_loss_mlp": 1.01398706, + "epoch": 0.10657945363601386, + "flos": 489035990784.0, + "grad_norm": 0.051524237529684984, + "language_loss": 0.97161597, + "learning_rate": 0.0009847041579256481, + "loss": 0.98242772, + "num_input_tokens_seen": 45698208, + "router_z_loss_mlp": 0.67236328, + "step": 554, + "time_per_iteration": 2.5838425159454346 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076637, + "balance_loss_mlp": 1.00997543, + "epoch": 0.10677183532127742, + "flos": 483971503104.0, + "grad_norm": 0.03890900728724459, + "language_loss": 0.96058643, + "learning_rate": 0.0009846275946295592, + "loss": 0.97135282, + "num_input_tokens_seen": 45766640, + "router_z_loss_mlp": 0.66699219, + "step": 555, + "time_per_iteration": 2.619490623474121 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074794, + "balance_loss_mlp": 1.00813222, + "epoch": 0.10696421700654098, + "flos": 657582308352.0, + "grad_norm": 0.03350037319549477, + "language_loss": 0.89189553, + "learning_rate": 0.0009845508431843518, + "loss": 0.9026435, + "num_input_tokens_seen": 45851408, + "router_z_loss_mlp": 0.66699219, + "step": 556, + "time_per_iteration": 3.0074055194854736 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075663, + "balance_loss_mlp": 1.00895333, + "epoch": 0.10715659869180454, + "flos": 568793398272.0, + "grad_norm": 0.03867425342149035, + "language_loss": 0.90383601, + "learning_rate": 0.0009844739036198233, + "loss": 0.91459262, + "num_input_tokens_seen": 45919824, + "router_z_loss_mlp": 0.66748047, + "step": 557, + "time_per_iteration": 2.719309091567993 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073849, + "balance_loss_mlp": 1.00756896, + "epoch": 0.10734898037706811, + "flos": 541744565760.0, + "grad_norm": 0.03845092177051005, + "language_loss": 0.97656357, + "learning_rate": 0.0009843967759658448, + "loss": 0.98730206, + "num_input_tokens_seen": 45991024, + "router_z_loss_mlp": 0.66308594, + "step": 558, + "time_per_iteration": 2.679964065551758 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077583, + "balance_loss_mlp": 1.01311493, + "epoch": 0.10754136206233167, + "flos": 1479734192640.0, + "grad_norm": 0.013283033162601723, + "language_loss": 0.72767758, + "learning_rate": 0.0009843194602523592, + "loss": 0.73845339, + "num_input_tokens_seen": 46212736, + "router_z_loss_mlp": 0.64453125, + "step": 559, + "time_per_iteration": 4.837440729141235 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107582, + "balance_loss_mlp": 1.00977802, + "epoch": 0.10773374374759523, + "flos": 513412601088.0, + "grad_norm": 0.03702065367467253, + "language_loss": 0.97501957, + "learning_rate": 0.000984241956509384, + "loss": 0.98577774, + "num_input_tokens_seen": 46283920, + "router_z_loss_mlp": 0.66064453, + "step": 560, + "time_per_iteration": 2.6579978466033936 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079218, + "balance_loss_mlp": 1.01312864, + "epoch": 0.10792612543285879, + "flos": 497478422784.0, + "grad_norm": 0.05173888564395698, + "language_loss": 0.9404971, + "learning_rate": 0.0009841642647670078, + "loss": 0.9512893, + "num_input_tokens_seen": 46349664, + "router_z_loss_mlp": 0.66113281, + "step": 561, + "time_per_iteration": 2.557605743408203 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080143, + "balance_loss_mlp": 1.01429176, + "epoch": 0.10811850711812235, + "flos": 736838128128.0, + "grad_norm": 0.0493873548723288, + "language_loss": 0.88547891, + "learning_rate": 0.0009840863850553944, + "loss": 0.89628035, + "num_input_tokens_seen": 46432688, + "router_z_loss_mlp": 0.65869141, + "step": 562, + "time_per_iteration": 2.949580669403076 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077668, + "balance_loss_mlp": 1.0115304, + "epoch": 0.10831088880338592, + "flos": 612677517312.0, + "grad_norm": 0.04173462884607535, + "language_loss": 0.94150907, + "learning_rate": 0.0009840083174047782, + "loss": 0.95228577, + "num_input_tokens_seen": 46507216, + "router_z_loss_mlp": 0.66162109, + "step": 563, + "time_per_iteration": 2.733344078063965 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081559, + "balance_loss_mlp": 1.01561248, + "epoch": 0.10850327048864948, + "flos": 557498909184.0, + "grad_norm": 0.034100755270258146, + "language_loss": 0.88515103, + "learning_rate": 0.0009839300618454685, + "loss": 0.89596659, + "num_input_tokens_seen": 46590464, + "router_z_loss_mlp": 0.65966797, + "step": 564, + "time_per_iteration": 2.8846256732940674 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080528, + "balance_loss_mlp": 1.0148201, + "epoch": 0.10869565217391304, + "flos": 604437274368.0, + "grad_norm": 0.036735298053950545, + "language_loss": 0.93941957, + "learning_rate": 0.0009838516184078466, + "loss": 0.95022488, + "num_input_tokens_seen": 46666240, + "router_z_loss_mlp": 0.65722656, + "step": 565, + "time_per_iteration": 2.813284158706665 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078727, + "balance_loss_mlp": 1.01297164, + "epoch": 0.1088880338591766, + "flos": 527206280448.0, + "grad_norm": 0.040314305725270186, + "language_loss": 0.91096556, + "learning_rate": 0.0009837729871223669, + "loss": 0.92175281, + "num_input_tokens_seen": 46734288, + "router_z_loss_mlp": 0.65771484, + "step": 566, + "time_per_iteration": 2.651611089706421 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078801, + "balance_loss_mlp": 1.01318836, + "epoch": 0.10908041554444017, + "flos": 621417402624.0, + "grad_norm": 0.042325065837349046, + "language_loss": 0.91458869, + "learning_rate": 0.0009836941680195568, + "loss": 0.92537665, + "num_input_tokens_seen": 46809920, + "router_z_loss_mlp": 0.65625, + "step": 567, + "time_per_iteration": 2.8296427726745605 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081032, + "balance_loss_mlp": 1.01508534, + "epoch": 0.10927279722970373, + "flos": 899674507008.0, + "grad_norm": 0.04990856516123606, + "language_loss": 0.87414277, + "learning_rate": 0.0009836151611300166, + "loss": 0.88495302, + "num_input_tokens_seen": 46889984, + "router_z_loss_mlp": 0.65966797, + "step": 568, + "time_per_iteration": 3.2401816844940186 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107825, + "balance_loss_mlp": 1.01206517, + "epoch": 0.10946517891496729, + "flos": 529700613120.0, + "grad_norm": 0.0427731854110213, + "language_loss": 0.96863574, + "learning_rate": 0.0009835359664844194, + "loss": 0.97941828, + "num_input_tokens_seen": 46959536, + "router_z_loss_mlp": 0.66210938, + "step": 569, + "time_per_iteration": 2.6190173625946045 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064438, + "balance_loss_mlp": 1.00092316, + "epoch": 0.10965756060023085, + "flos": 1563994228992.0, + "grad_norm": 0.005811935039235345, + "language_loss": 0.81036806, + "learning_rate": 0.0009834565841135114, + "loss": 0.8210125, + "num_input_tokens_seen": 47196960, + "router_z_loss_mlp": 0.63476562, + "step": 570, + "time_per_iteration": 4.957117795944214 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080699, + "balance_loss_mlp": 1.0151341, + "epoch": 0.10984994228549443, + "flos": 514100826624.0, + "grad_norm": 0.04369440603786518, + "language_loss": 0.94858396, + "learning_rate": 0.0009833770140481118, + "loss": 0.95939088, + "num_input_tokens_seen": 47266560, + "router_z_loss_mlp": 0.65576172, + "step": 571, + "time_per_iteration": 2.6529860496520996 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086256, + "balance_loss_mlp": 1.02059519, + "epoch": 0.11004232397075799, + "flos": 956275252992.0, + "grad_norm": 0.04378732511153692, + "language_loss": 0.85010409, + "learning_rate": 0.000983297256319112, + "loss": 0.86096668, + "num_input_tokens_seen": 47348512, + "router_z_loss_mlp": 0.65673828, + "step": 572, + "time_per_iteration": 3.2036497592926025 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080603, + "balance_loss_mlp": 1.01499045, + "epoch": 0.11023470565602154, + "flos": 489229431552.0, + "grad_norm": 0.043497603291787354, + "language_loss": 0.89141667, + "learning_rate": 0.000983217310957477, + "loss": 0.90222269, + "num_input_tokens_seen": 47425392, + "router_z_loss_mlp": 0.65625, + "step": 573, + "time_per_iteration": 2.7763278484344482 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078757, + "balance_loss_mlp": 1.01333535, + "epoch": 0.1104270873412851, + "flos": 656991292416.0, + "grad_norm": 0.04901418812727031, + "language_loss": 0.9269613, + "learning_rate": 0.000983137177994244, + "loss": 0.93774891, + "num_input_tokens_seen": 47502336, + "router_z_loss_mlp": 0.65429688, + "step": 574, + "time_per_iteration": 2.8529646396636963 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080019, + "balance_loss_mlp": 1.01474011, + "epoch": 0.11061946902654868, + "flos": 724748488704.0, + "grad_norm": 0.03457948694206611, + "language_loss": 0.87449324, + "learning_rate": 0.0009830568574605235, + "loss": 0.88529336, + "num_input_tokens_seen": 47583552, + "router_z_loss_mlp": 0.65283203, + "step": 575, + "time_per_iteration": 2.94710373878479 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010791, + "balance_loss_mlp": 1.01367807, + "epoch": 0.11081185071181224, + "flos": 836869037568.0, + "grad_norm": 0.04085001299476677, + "language_loss": 0.90086508, + "learning_rate": 0.0009829763493874992, + "loss": 0.91165602, + "num_input_tokens_seen": 47663440, + "router_z_loss_mlp": 0.65429688, + "step": 576, + "time_per_iteration": 3.0296730995178223 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107807, + "balance_loss_mlp": 1.01283884, + "epoch": 0.1110042323970758, + "flos": 610283306496.0, + "grad_norm": 0.03775485835018356, + "language_loss": 0.95256275, + "learning_rate": 0.0009828956538064264, + "loss": 0.9633435, + "num_input_tokens_seen": 47741920, + "router_z_loss_mlp": 0.65234375, + "step": 577, + "time_per_iteration": 2.7944416999816895 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073929, + "balance_loss_mlp": 1.00893569, + "epoch": 0.11119661408233936, + "flos": 597040792320.0, + "grad_norm": 0.04378674390965236, + "language_loss": 0.93033826, + "learning_rate": 0.0009828147707486344, + "loss": 0.94107759, + "num_input_tokens_seen": 47815136, + "router_z_loss_mlp": 0.64990234, + "step": 578, + "time_per_iteration": 2.7034592628479004 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075842, + "balance_loss_mlp": 1.01099229, + "epoch": 0.11138899576760293, + "flos": 556888451328.0, + "grad_norm": 0.05042820660432219, + "language_loss": 0.89312434, + "learning_rate": 0.0009827337002455245, + "loss": 0.90388274, + "num_input_tokens_seen": 47881360, + "router_z_loss_mlp": 0.6484375, + "step": 579, + "time_per_iteration": 2.6187195777893066 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074948, + "balance_loss_mlp": 1.01057482, + "epoch": 0.11158137745286649, + "flos": 691063331328.0, + "grad_norm": 0.03501309245374513, + "language_loss": 0.89977694, + "learning_rate": 0.0009826524423285712, + "loss": 0.91052639, + "num_input_tokens_seen": 47962720, + "router_z_loss_mlp": 0.64355469, + "step": 580, + "time_per_iteration": 2.9009909629821777 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079601, + "balance_loss_mlp": 1.0148946, + "epoch": 0.11177375913813005, + "flos": 764307868416.0, + "grad_norm": 0.04023884017549449, + "language_loss": 0.91280103, + "learning_rate": 0.0009825709970293218, + "loss": 0.92359698, + "num_input_tokens_seen": 48035472, + "router_z_loss_mlp": 0.64697266, + "step": 581, + "time_per_iteration": 2.9111618995666504 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074116, + "balance_loss_mlp": 1.0095998, + "epoch": 0.11196614082339361, + "flos": 808031594496.0, + "grad_norm": 0.038028140255108665, + "language_loss": 0.97163212, + "learning_rate": 0.0009824893643793956, + "loss": 0.98237336, + "num_input_tokens_seen": 48116944, + "router_z_loss_mlp": 0.64501953, + "step": 582, + "time_per_iteration": 3.0907368659973145 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072862, + "balance_loss_mlp": 1.00796497, + "epoch": 0.11215852250865718, + "flos": 559725924096.0, + "grad_norm": 0.04580369165919148, + "language_loss": 0.90464842, + "learning_rate": 0.0009824075444104857, + "loss": 0.91537702, + "num_input_tokens_seen": 48187808, + "router_z_loss_mlp": 0.64892578, + "step": 583, + "time_per_iteration": 2.7276525497436523 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107742, + "balance_loss_mlp": 1.01285601, + "epoch": 0.11235090419392074, + "flos": 514576169472.0, + "grad_norm": 0.03926612419770205, + "language_loss": 0.95381963, + "learning_rate": 0.000982325537154357, + "loss": 0.96459383, + "num_input_tokens_seen": 48254464, + "router_z_loss_mlp": 0.64550781, + "step": 584, + "time_per_iteration": 2.6261777877807617 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074375, + "balance_loss_mlp": 1.0100019, + "epoch": 0.1125432858791843, + "flos": 492433377024.0, + "grad_norm": 0.043221505898455144, + "language_loss": 0.96143711, + "learning_rate": 0.0009822433426428484, + "loss": 0.97218084, + "num_input_tokens_seen": 48318784, + "router_z_loss_mlp": 0.64355469, + "step": 585, + "time_per_iteration": 2.5630125999450684 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075075, + "balance_loss_mlp": 1.01079714, + "epoch": 0.11273566756444786, + "flos": 511728003072.0, + "grad_norm": 0.04466131563000304, + "language_loss": 0.88984096, + "learning_rate": 0.0009821609609078697, + "loss": 0.90059173, + "num_input_tokens_seen": 48389248, + "router_z_loss_mlp": 0.64257812, + "step": 586, + "time_per_iteration": 2.649122953414917 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075103, + "balance_loss_mlp": 1.01077783, + "epoch": 0.11292804924971142, + "flos": 623640526848.0, + "grad_norm": 0.03579172726266892, + "language_loss": 0.91595018, + "learning_rate": 0.0009820783919814045, + "loss": 0.92670119, + "num_input_tokens_seen": 48463312, + "router_z_loss_mlp": 0.64306641, + "step": 587, + "time_per_iteration": 2.7977845668792725 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072627, + "balance_loss_mlp": 1.00830126, + "epoch": 0.113120430934975, + "flos": 479039218176.0, + "grad_norm": 0.04738669495581529, + "language_loss": 0.85574889, + "learning_rate": 0.0009819956358955095, + "loss": 0.86647511, + "num_input_tokens_seen": 48531856, + "router_z_loss_mlp": 0.64306641, + "step": 588, + "time_per_iteration": 2.59133243560791 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076686, + "balance_loss_mlp": 1.01245642, + "epoch": 0.11331281262023855, + "flos": 467991638016.0, + "grad_norm": 0.048752038127388646, + "language_loss": 0.86982751, + "learning_rate": 0.0009819126926823127, + "loss": 0.88059437, + "num_input_tokens_seen": 48596640, + "router_z_loss_mlp": 0.64208984, + "step": 589, + "time_per_iteration": 2.511939764022827 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075971, + "balance_loss_mlp": 1.01174104, + "epoch": 0.11350519430550211, + "flos": 651611854848.0, + "grad_norm": 0.04204370934342767, + "language_loss": 0.89311969, + "learning_rate": 0.000981829562374016, + "loss": 0.9038794, + "num_input_tokens_seen": 48669648, + "router_z_loss_mlp": 0.64208984, + "step": 590, + "time_per_iteration": 2.798734426498413 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107506, + "balance_loss_mlp": 1.01111591, + "epoch": 0.11369757599076567, + "flos": 558861754368.0, + "grad_norm": 0.04723710161718091, + "language_loss": 0.99783856, + "learning_rate": 0.0009817462450028933, + "loss": 1.00858927, + "num_input_tokens_seen": 48737392, + "router_z_loss_mlp": 0.63916016, + "step": 591, + "time_per_iteration": 2.717622756958008 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076867, + "balance_loss_mlp": 1.01316178, + "epoch": 0.11388995767602925, + "flos": 572306457600.0, + "grad_norm": 0.041300229846526024, + "language_loss": 0.87103492, + "learning_rate": 0.0009816627406012916, + "loss": 0.88180363, + "num_input_tokens_seen": 48817136, + "router_z_loss_mlp": 0.63671875, + "step": 592, + "time_per_iteration": 2.783677339553833 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077614, + "balance_loss_mlp": 1.01376593, + "epoch": 0.1140823393612928, + "flos": 741744168192.0, + "grad_norm": 0.04574882804976793, + "language_loss": 0.87044728, + "learning_rate": 0.0009815790492016295, + "loss": 0.88122344, + "num_input_tokens_seen": 48895808, + "router_z_loss_mlp": 0.63818359, + "step": 593, + "time_per_iteration": 2.920262336730957 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079358, + "balance_loss_mlp": 1.01560438, + "epoch": 0.11427472104655637, + "flos": 700252314624.0, + "grad_norm": 0.042792726491020304, + "language_loss": 0.89086539, + "learning_rate": 0.0009814951708363993, + "loss": 0.90165901, + "num_input_tokens_seen": 48967456, + "router_z_loss_mlp": 0.63720703, + "step": 594, + "time_per_iteration": 2.8244025707244873 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069794, + "balance_loss_mlp": 1.00799561, + "epoch": 0.11446710273181993, + "flos": 1480355344128.0, + "grad_norm": 0.0135056408383676, + "language_loss": 0.77990985, + "learning_rate": 0.0009814111055381654, + "loss": 0.79060781, + "num_input_tokens_seen": 49193152, + "router_z_loss_mlp": 0.6171875, + "step": 595, + "time_per_iteration": 4.779642105102539 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075224, + "balance_loss_mlp": 1.01189995, + "epoch": 0.1146594844170835, + "flos": 495913388544.0, + "grad_norm": 0.038757735955663945, + "language_loss": 0.90035105, + "learning_rate": 0.0009813268533395648, + "loss": 0.91110331, + "num_input_tokens_seen": 49260960, + "router_z_loss_mlp": 0.6328125, + "step": 596, + "time_per_iteration": 2.5933825969696045 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082133, + "balance_loss_mlp": 1.01895213, + "epoch": 0.11485186610234706, + "flos": 475791531264.0, + "grad_norm": 0.0538004660752225, + "language_loss": 0.90474582, + "learning_rate": 0.0009812424142733073, + "loss": 0.9155671, + "num_input_tokens_seen": 49327616, + "router_z_loss_mlp": 0.63134766, + "step": 597, + "time_per_iteration": 2.528027296066284 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073976, + "balance_loss_mlp": 1.01089013, + "epoch": 0.11504424778761062, + "flos": 732620313600.0, + "grad_norm": 0.03283482462688361, + "language_loss": 0.87953097, + "learning_rate": 0.000981157788372175, + "loss": 0.89027071, + "num_input_tokens_seen": 49412864, + "router_z_loss_mlp": 0.63037109, + "step": 598, + "time_per_iteration": 3.008469343185425 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074784, + "balance_loss_mlp": 1.01160276, + "epoch": 0.11523662947287418, + "flos": 546963610368.0, + "grad_norm": 0.037424804687157906, + "language_loss": 0.91041148, + "learning_rate": 0.0009810729756690223, + "loss": 0.92115927, + "num_input_tokens_seen": 49483584, + "router_z_loss_mlp": 0.63134766, + "step": 599, + "time_per_iteration": 2.75840163230896 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077312, + "balance_loss_mlp": 1.01408339, + "epoch": 0.11542901115813775, + "flos": 776388759552.0, + "grad_norm": 0.04126969924944996, + "language_loss": 0.9391377, + "learning_rate": 0.0009809879761967766, + "loss": 0.94991082, + "num_input_tokens_seen": 49563568, + "router_z_loss_mlp": 0.63183594, + "step": 600, + "time_per_iteration": 2.9511778354644775 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081843, + "balance_loss_mlp": 1.01828074, + "epoch": 0.11562139284340131, + "flos": 732213990144.0, + "grad_norm": 0.05544181306164312, + "language_loss": 0.88981479, + "learning_rate": 0.0009809027899884378, + "loss": 0.90063322, + "num_input_tokens_seen": 49640800, + "router_z_loss_mlp": 0.63525391, + "step": 601, + "time_per_iteration": 2.888591766357422 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076176, + "balance_loss_mlp": 1.01256609, + "epoch": 0.11581377452866487, + "flos": 537040714752.0, + "grad_norm": 0.03483284203155477, + "language_loss": 0.90335476, + "learning_rate": 0.0009808174170770779, + "loss": 0.9141165, + "num_input_tokens_seen": 49721872, + "router_z_loss_mlp": 0.63574219, + "step": 602, + "time_per_iteration": 2.7933802604675293 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073479, + "balance_loss_mlp": 1.01263428, + "epoch": 0.11600615621392843, + "flos": 1559214555648.0, + "grad_norm": 0.012041981792172347, + "language_loss": 0.84898245, + "learning_rate": 0.0009807318574958418, + "loss": 0.85971725, + "num_input_tokens_seen": 49951472, + "router_z_loss_mlp": 0.60742188, + "step": 603, + "time_per_iteration": 4.875667572021484 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079811, + "balance_loss_mlp": 1.01658237, + "epoch": 0.116198537899192, + "flos": 538468688640.0, + "grad_norm": 0.046063141341509364, + "language_loss": 0.95944118, + "learning_rate": 0.0009806461112779462, + "loss": 0.97023928, + "num_input_tokens_seen": 50021136, + "router_z_loss_mlp": 0.63183594, + "step": 604, + "time_per_iteration": 2.708552360534668 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077772, + "balance_loss_mlp": 1.01444781, + "epoch": 0.11639091958445556, + "flos": 455137950720.0, + "grad_norm": 0.05737724930332189, + "language_loss": 0.90764457, + "learning_rate": 0.0009805601784566814, + "loss": 0.91842222, + "num_input_tokens_seen": 50083888, + "router_z_loss_mlp": 0.6328125, + "step": 605, + "time_per_iteration": 2.545696496963501 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076475, + "balance_loss_mlp": 1.01329422, + "epoch": 0.11658330126971912, + "flos": 556152593664.0, + "grad_norm": 0.04016687987230144, + "language_loss": 0.97276044, + "learning_rate": 0.0009804740590654089, + "loss": 0.98352522, + "num_input_tokens_seen": 50151744, + "router_z_loss_mlp": 0.63134766, + "step": 606, + "time_per_iteration": 2.6464574337005615 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077132, + "balance_loss_mlp": 1.01399851, + "epoch": 0.11677568295498268, + "flos": 717601827840.0, + "grad_norm": 0.0453344941203476, + "language_loss": 0.91881627, + "learning_rate": 0.0009803877531375635, + "loss": 0.9295876, + "num_input_tokens_seen": 50221248, + "router_z_loss_mlp": 0.63085938, + "step": 607, + "time_per_iteration": 2.8467392921447754 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074228, + "balance_loss_mlp": 1.0111903, + "epoch": 0.11696806464024626, + "flos": 610899600384.0, + "grad_norm": 0.04469679718872237, + "language_loss": 0.92976171, + "learning_rate": 0.0009803012607066523, + "loss": 0.94050401, + "num_input_tokens_seen": 50293792, + "router_z_loss_mlp": 0.62988281, + "step": 608, + "time_per_iteration": 2.7587811946868896 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073203, + "balance_loss_mlp": 1.01030838, + "epoch": 0.11716044632550981, + "flos": 521416628736.0, + "grad_norm": 0.04044307397502579, + "language_loss": 0.91207683, + "learning_rate": 0.0009802145818062543, + "loss": 0.92280889, + "num_input_tokens_seen": 50367760, + "router_z_loss_mlp": 0.62841797, + "step": 609, + "time_per_iteration": 2.7623538970947266 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107199, + "balance_loss_mlp": 1.00919068, + "epoch": 0.11735282801077337, + "flos": 508489064448.0, + "grad_norm": 0.04251091083777229, + "language_loss": 0.93763256, + "learning_rate": 0.0009801277164700212, + "loss": 0.9483524, + "num_input_tokens_seen": 50435664, + "router_z_loss_mlp": 0.62744141, + "step": 610, + "time_per_iteration": 2.6250369548797607 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079805, + "balance_loss_mlp": 1.0171963, + "epoch": 0.11754520969603693, + "flos": 687837031680.0, + "grad_norm": 0.044835447829723894, + "language_loss": 0.91796255, + "learning_rate": 0.0009800406647316776, + "loss": 0.92876053, + "num_input_tokens_seen": 50514144, + "router_z_loss_mlp": 0.62548828, + "step": 611, + "time_per_iteration": 2.81438946723938 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01058731, + "balance_loss_mlp": 0.99807739, + "epoch": 0.1177375913813005, + "flos": 1545759158784.0, + "grad_norm": 0.00493114536612535, + "language_loss": 0.76914459, + "learning_rate": 0.0009799534266250196, + "loss": 0.77973187, + "num_input_tokens_seen": 50738448, + "router_z_loss_mlp": 0.60546875, + "step": 612, + "time_per_iteration": 4.795796871185303 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073024, + "balance_loss_mlp": 1.01008153, + "epoch": 0.11792997306656407, + "flos": 521538137856.0, + "grad_norm": 0.049162221556570344, + "language_loss": 0.91035461, + "learning_rate": 0.000979866002183916, + "loss": 0.92108488, + "num_input_tokens_seen": 50809328, + "router_z_loss_mlp": 0.62890625, + "step": 613, + "time_per_iteration": 2.6470768451690674 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071385, + "balance_loss_mlp": 1.00820458, + "epoch": 0.11812235475182763, + "flos": 667489652736.0, + "grad_norm": 0.0453482214384289, + "language_loss": 0.92239928, + "learning_rate": 0.0009797783914423082, + "loss": 0.93311322, + "num_input_tokens_seen": 50887728, + "router_z_loss_mlp": 0.63134766, + "step": 614, + "time_per_iteration": 2.8020856380462646 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107539, + "balance_loss_mlp": 1.01220894, + "epoch": 0.11831473643709119, + "flos": 622505148672.0, + "grad_norm": 0.04034391423157231, + "language_loss": 0.86097217, + "learning_rate": 0.0009796905944342094, + "loss": 0.87172604, + "num_input_tokens_seen": 50966160, + "router_z_loss_mlp": 0.63134766, + "step": 615, + "time_per_iteration": 2.839617967605591 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079046, + "balance_loss_mlp": 1.0160079, + "epoch": 0.11850711812235475, + "flos": 457695466752.0, + "grad_norm": 0.03330066749319758, + "language_loss": 0.89949274, + "learning_rate": 0.0009796026111937057, + "loss": 0.91028321, + "num_input_tokens_seen": 51035712, + "router_z_loss_mlp": 0.62988281, + "step": 616, + "time_per_iteration": 2.6211540699005127 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077601, + "balance_loss_mlp": 1.0150882, + "epoch": 0.11869949980761832, + "flos": 514928057856.0, + "grad_norm": 0.034464018290856886, + "language_loss": 0.90251315, + "learning_rate": 0.0009795144417549552, + "loss": 0.91328913, + "num_input_tokens_seen": 51108656, + "router_z_loss_mlp": 0.62451172, + "step": 617, + "time_per_iteration": 2.6946897506713867 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080332, + "balance_loss_mlp": 1.01815259, + "epoch": 0.11889188149288188, + "flos": 536157103104.0, + "grad_norm": 0.035314864293198016, + "language_loss": 0.91583192, + "learning_rate": 0.0009794260861521883, + "loss": 0.92663527, + "num_input_tokens_seen": 51185552, + "router_z_loss_mlp": 0.62109375, + "step": 618, + "time_per_iteration": 2.77822208404541 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081689, + "balance_loss_mlp": 1.01979554, + "epoch": 0.11908426317814544, + "flos": 499645166592.0, + "grad_norm": 0.042334404758790994, + "language_loss": 0.88659471, + "learning_rate": 0.0009793375444197075, + "loss": 0.89741158, + "num_input_tokens_seen": 51255808, + "router_z_loss_mlp": 0.61816406, + "step": 619, + "time_per_iteration": 2.6199400424957275 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086152, + "balance_loss_mlp": 1.02416277, + "epoch": 0.119276644863409, + "flos": 661068155904.0, + "grad_norm": 0.043937618111938345, + "language_loss": 0.86906028, + "learning_rate": 0.000979248816591888, + "loss": 0.87992179, + "num_input_tokens_seen": 51329408, + "router_z_loss_mlp": 0.61914062, + "step": 620, + "time_per_iteration": 2.789858341217041 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081653, + "balance_loss_mlp": 1.01947308, + "epoch": 0.11946902654867257, + "flos": 760153237248.0, + "grad_norm": 0.04701199265522289, + "language_loss": 0.87992656, + "learning_rate": 0.0009791599027031766, + "loss": 0.89074314, + "num_input_tokens_seen": 51408784, + "router_z_loss_mlp": 0.62109375, + "step": 621, + "time_per_iteration": 3.026487350463867 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074317, + "balance_loss_mlp": 1.01223314, + "epoch": 0.11966140823393613, + "flos": 682214575872.0, + "grad_norm": 0.0506686420393155, + "language_loss": 0.88143325, + "learning_rate": 0.0009790708027880932, + "loss": 0.89217639, + "num_input_tokens_seen": 51482592, + "router_z_loss_mlp": 0.62011719, + "step": 622, + "time_per_iteration": 2.8321774005889893 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081512, + "balance_loss_mlp": 1.02143097, + "epoch": 0.11985378991919969, + "flos": 1454300938752.0, + "grad_norm": 0.023212611497014573, + "language_loss": 0.77427292, + "learning_rate": 0.0009789815168812293, + "loss": 0.78508806, + "num_input_tokens_seen": 51712240, + "router_z_loss_mlp": 0.59960938, + "step": 623, + "time_per_iteration": 4.862462759017944 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071261, + "balance_loss_mlp": 1.00936747, + "epoch": 0.12004617160446325, + "flos": 528899626752.0, + "grad_norm": 0.04437858339694968, + "language_loss": 0.95209736, + "learning_rate": 0.0009788920450172487, + "loss": 0.96280998, + "num_input_tokens_seen": 51781440, + "router_z_loss_mlp": 0.61816406, + "step": 624, + "time_per_iteration": 2.630764961242676 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078232, + "balance_loss_mlp": 1.01619518, + "epoch": 0.12023855328972682, + "flos": 475177182720.0, + "grad_norm": 0.048047229360432486, + "language_loss": 0.92430472, + "learning_rate": 0.0009788023872308875, + "loss": 0.93508708, + "num_input_tokens_seen": 51845424, + "router_z_loss_mlp": 0.61962891, + "step": 625, + "time_per_iteration": 2.5534780025482178 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076523, + "balance_loss_mlp": 1.01682281, + "epoch": 0.12043093497499038, + "flos": 1535054718720.0, + "grad_norm": 0.022021305117703366, + "language_loss": 0.75428998, + "learning_rate": 0.0009787125435569539, + "loss": 0.7650553, + "num_input_tokens_seen": 52076496, + "router_z_loss_mlp": 0.59570312, + "step": 626, + "time_per_iteration": 4.738527536392212 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108475, + "balance_loss_mlp": 1.023, + "epoch": 0.12062331666025394, + "flos": 540915389184.0, + "grad_norm": 0.04663901515177362, + "language_loss": 0.9603011, + "learning_rate": 0.0009786225140303285, + "loss": 0.97114861, + "num_input_tokens_seen": 52143072, + "router_z_loss_mlp": 0.61669922, + "step": 627, + "time_per_iteration": 2.634160280227661 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085928, + "balance_loss_mlp": 1.02427304, + "epoch": 0.1208156983455175, + "flos": 513000441600.0, + "grad_norm": 0.042540459475059536, + "language_loss": 0.94019556, + "learning_rate": 0.0009785322986859634, + "loss": 0.95105481, + "num_input_tokens_seen": 52211888, + "router_z_loss_mlp": 0.61572266, + "step": 628, + "time_per_iteration": 2.681070327758789 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078772, + "balance_loss_mlp": 1.01725972, + "epoch": 0.12100808003078108, + "flos": 597590012160.0, + "grad_norm": 0.03866803919075334, + "language_loss": 0.94614279, + "learning_rate": 0.0009784418975588838, + "loss": 0.95693052, + "num_input_tokens_seen": 52283696, + "router_z_loss_mlp": 0.61425781, + "step": 629, + "time_per_iteration": 2.7337839603424072 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073027, + "balance_loss_mlp": 1.01132393, + "epoch": 0.12120046171604464, + "flos": 524067463680.0, + "grad_norm": 0.03279843121618067, + "language_loss": 0.94581258, + "learning_rate": 0.0009783513106841862, + "loss": 0.95654285, + "num_input_tokens_seen": 52358624, + "router_z_loss_mlp": 0.61621094, + "step": 630, + "time_per_iteration": 2.702615737915039 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080086, + "balance_loss_mlp": 1.01981354, + "epoch": 0.1213928434013082, + "flos": 1557910036224.0, + "grad_norm": 0.01502333088768157, + "language_loss": 0.76732707, + "learning_rate": 0.00097826053809704, + "loss": 0.77812791, + "num_input_tokens_seen": 52591248, + "router_z_loss_mlp": 0.6015625, + "step": 631, + "time_per_iteration": 4.998409032821655 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080339, + "balance_loss_mlp": 1.01835024, + "epoch": 0.12158522508657175, + "flos": 496388731392.0, + "grad_norm": 0.04174070683076465, + "language_loss": 0.89320499, + "learning_rate": 0.0009781695798326854, + "loss": 0.90400839, + "num_input_tokens_seen": 52659920, + "router_z_loss_mlp": 0.61914062, + "step": 632, + "time_per_iteration": 2.5908379554748535 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079195, + "balance_loss_mlp": 1.01744485, + "epoch": 0.12177760677183531, + "flos": 476590572288.0, + "grad_norm": 0.04165368210868703, + "language_loss": 0.89744723, + "learning_rate": 0.0009780784359264365, + "loss": 0.90823919, + "num_input_tokens_seen": 52728832, + "router_z_loss_mlp": 0.61669922, + "step": 633, + "time_per_iteration": 2.689202070236206 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073334, + "balance_loss_mlp": 1.01382446, + "epoch": 0.12196998845709889, + "flos": 1471787512320.0, + "grad_norm": 0.011333314510513573, + "language_loss": 0.74188697, + "learning_rate": 0.0009779871064136778, + "loss": 0.75262028, + "num_input_tokens_seen": 52949776, + "router_z_loss_mlp": 0.59375, + "step": 634, + "time_per_iteration": 4.762145757675171 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073337, + "balance_loss_mlp": 1.01187229, + "epoch": 0.12216237014236245, + "flos": 587749741824.0, + "grad_norm": 0.03178889939160208, + "language_loss": 0.88649213, + "learning_rate": 0.000977895591329867, + "loss": 0.8972255, + "num_input_tokens_seen": 53027184, + "router_z_loss_mlp": 0.61376953, + "step": 635, + "time_per_iteration": 2.7996504306793213 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075051, + "balance_loss_mlp": 1.01372933, + "epoch": 0.12235475182762601, + "flos": 599107414272.0, + "grad_norm": 0.038321985001081305, + "language_loss": 0.88459468, + "learning_rate": 0.000977803890710533, + "loss": 0.89534515, + "num_input_tokens_seen": 53101072, + "router_z_loss_mlp": 0.61230469, + "step": 636, + "time_per_iteration": 2.7200405597686768 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072884, + "balance_loss_mlp": 1.0117538, + "epoch": 0.12254713351288957, + "flos": 498761554944.0, + "grad_norm": 0.03313527469264444, + "language_loss": 0.94808865, + "learning_rate": 0.0009777120045912774, + "loss": 0.95881748, + "num_input_tokens_seen": 53172992, + "router_z_loss_mlp": 0.61035156, + "step": 637, + "time_per_iteration": 2.6253507137298584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072019, + "balance_loss_mlp": 1.01084125, + "epoch": 0.12273951519815314, + "flos": 606981184512.0, + "grad_norm": 0.04065251745031248, + "language_loss": 0.91558111, + "learning_rate": 0.0009776199330077736, + "loss": 0.92630136, + "num_input_tokens_seen": 53248256, + "router_z_loss_mlp": 0.61083984, + "step": 638, + "time_per_iteration": 2.724416732788086 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069779, + "balance_loss_mlp": 1.0086484, + "epoch": 0.1229318968834167, + "flos": 598985905152.0, + "grad_norm": 0.04427923240085457, + "language_loss": 0.94062102, + "learning_rate": 0.0009775276759957667, + "loss": 0.9513188, + "num_input_tokens_seen": 53318960, + "router_z_loss_mlp": 0.61035156, + "step": 639, + "time_per_iteration": 2.756307601928711 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070032, + "balance_loss_mlp": 1.00851989, + "epoch": 0.12312427856868026, + "flos": 679589985792.0, + "grad_norm": 0.04435656949952303, + "language_loss": 0.91938198, + "learning_rate": 0.0009774352335910745, + "loss": 0.93008226, + "num_input_tokens_seen": 53389120, + "router_z_loss_mlp": 0.61425781, + "step": 640, + "time_per_iteration": 2.8135974407196045 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072128, + "balance_loss_mlp": 1.01095021, + "epoch": 0.12331666025394382, + "flos": 610044178944.0, + "grad_norm": 0.03352322480141845, + "language_loss": 0.95842457, + "learning_rate": 0.000977342605829586, + "loss": 0.96914589, + "num_input_tokens_seen": 53459056, + "router_z_loss_mlp": 0.61083984, + "step": 641, + "time_per_iteration": 2.734373092651367 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107341, + "balance_loss_mlp": 1.01208854, + "epoch": 0.12350904193920739, + "flos": 763841273856.0, + "grad_norm": 0.04166007448412618, + "language_loss": 0.87458932, + "learning_rate": 0.0009772497927472623, + "loss": 0.88532341, + "num_input_tokens_seen": 53541552, + "router_z_loss_mlp": 0.61230469, + "step": 642, + "time_per_iteration": 3.069495677947998 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107346, + "balance_loss_mlp": 1.01199543, + "epoch": 0.12370142362447095, + "flos": 542050767360.0, + "grad_norm": 0.04189965725350253, + "language_loss": 0.86664522, + "learning_rate": 0.0009771567943801368, + "loss": 0.87737978, + "num_input_tokens_seen": 53611520, + "router_z_loss_mlp": 0.61376953, + "step": 643, + "time_per_iteration": 2.6783955097198486 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071185, + "balance_loss_mlp": 1.01000655, + "epoch": 0.12389380530973451, + "flos": 549253808640.0, + "grad_norm": 0.03907898995026106, + "language_loss": 0.90534973, + "learning_rate": 0.0009770636107643152, + "loss": 0.91606158, + "num_input_tokens_seen": 53683888, + "router_z_loss_mlp": 0.61083984, + "step": 644, + "time_per_iteration": 2.7792532444000244 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107442, + "balance_loss_mlp": 1.01343274, + "epoch": 0.12408618699499807, + "flos": 541353793536.0, + "grad_norm": 0.03775088580197231, + "language_loss": 0.89077818, + "learning_rate": 0.0009769702419359738, + "loss": 0.9015224, + "num_input_tokens_seen": 53751888, + "router_z_loss_mlp": 0.60888672, + "step": 645, + "time_per_iteration": 2.6660075187683105 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071725, + "balance_loss_mlp": 1.01083338, + "epoch": 0.12427856868026164, + "flos": 747160544256.0, + "grad_norm": 0.03491310842571494, + "language_loss": 0.90435565, + "learning_rate": 0.000976876687931362, + "loss": 0.91507292, + "num_input_tokens_seen": 53827648, + "router_z_loss_mlp": 0.60791016, + "step": 646, + "time_per_iteration": 3.028578758239746 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074215, + "balance_loss_mlp": 1.01332271, + "epoch": 0.1244709503655252, + "flos": 534745658880.0, + "grad_norm": 0.04739554944994068, + "language_loss": 0.86433625, + "learning_rate": 0.0009767829487868005, + "loss": 0.87507832, + "num_input_tokens_seen": 53896400, + "router_z_loss_mlp": 0.60791016, + "step": 647, + "time_per_iteration": 2.6323471069335938 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075713, + "balance_loss_mlp": 1.01472592, + "epoch": 0.12466333205078876, + "flos": 509112161280.0, + "grad_norm": 0.0390766896094967, + "language_loss": 0.89632404, + "learning_rate": 0.000976689024538682, + "loss": 0.90708113, + "num_input_tokens_seen": 53965904, + "router_z_loss_mlp": 0.60888672, + "step": 648, + "time_per_iteration": 2.6233997344970703 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069043, + "balance_loss_mlp": 1.00819838, + "epoch": 0.12485571373605232, + "flos": 682640341248.0, + "grad_norm": 0.04106035596266842, + "language_loss": 0.87981439, + "learning_rate": 0.0009765949152234716, + "loss": 0.89050484, + "num_input_tokens_seen": 54049792, + "router_z_loss_mlp": 0.60742188, + "step": 649, + "time_per_iteration": 2.9135711193084717 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064964, + "balance_loss_mlp": 1.00659943, + "epoch": 0.1250480954213159, + "flos": 1333201377024.0, + "grad_norm": 0.013063081234142807, + "language_loss": 0.78686082, + "learning_rate": 0.0009765006208777055, + "loss": 0.79751045, + "num_input_tokens_seen": 54262432, + "router_z_loss_mlp": 0.58203125, + "step": 650, + "time_per_iteration": 4.696362495422363 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069932, + "balance_loss_mlp": 1.0093261, + "epoch": 0.12524047710657946, + "flos": 940198178304.0, + "grad_norm": 0.03723688894295025, + "language_loss": 0.82869852, + "learning_rate": 0.0009764061415379919, + "loss": 0.83939779, + "num_input_tokens_seen": 54351568, + "router_z_loss_mlp": 0.60498047, + "step": 651, + "time_per_iteration": 3.287029504776001 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071729, + "balance_loss_mlp": 1.01078951, + "epoch": 0.12543285879184302, + "flos": 514901812992.0, + "grad_norm": 0.03842788822410913, + "language_loss": 0.90123397, + "learning_rate": 0.0009763114772410109, + "loss": 0.91195124, + "num_input_tokens_seen": 54418944, + "router_z_loss_mlp": 0.60839844, + "step": 652, + "time_per_iteration": 2.5726470947265625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071215, + "balance_loss_mlp": 1.01075244, + "epoch": 0.12562524047710658, + "flos": 719684001024.0, + "grad_norm": 0.03790395950388449, + "language_loss": 0.88320071, + "learning_rate": 0.0009762166280235146, + "loss": 0.89391285, + "num_input_tokens_seen": 54495312, + "router_z_loss_mlp": 0.60351562, + "step": 653, + "time_per_iteration": 2.9728682041168213 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073992, + "balance_loss_mlp": 1.01372027, + "epoch": 0.12581762216237014, + "flos": 564799160064.0, + "grad_norm": 0.039966468352906216, + "language_loss": 0.88308495, + "learning_rate": 0.0009761215939223267, + "loss": 0.89382488, + "num_input_tokens_seen": 54566832, + "router_z_loss_mlp": 0.6015625, + "step": 654, + "time_per_iteration": 2.7552366256713867 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071616, + "balance_loss_mlp": 1.01100981, + "epoch": 0.1260100038476337, + "flos": 482901253632.0, + "grad_norm": 0.045851790315233704, + "language_loss": 0.87049586, + "learning_rate": 0.0009760263749743428, + "loss": 0.88121206, + "num_input_tokens_seen": 54632128, + "router_z_loss_mlp": 0.60498047, + "step": 655, + "time_per_iteration": 2.5859339237213135 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073482, + "balance_loss_mlp": 1.01301908, + "epoch": 0.12620238553289725, + "flos": 576702161664.0, + "grad_norm": 0.03680601760412016, + "language_loss": 0.91127861, + "learning_rate": 0.0009759309712165299, + "loss": 0.9220134, + "num_input_tokens_seen": 54707600, + "router_z_loss_mlp": 0.60351562, + "step": 656, + "time_per_iteration": 2.7411043643951416 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069023, + "balance_loss_mlp": 1.00841653, + "epoch": 0.12639476721816084, + "flos": 532186197504.0, + "grad_norm": 0.050748048847022796, + "language_loss": 0.94208288, + "learning_rate": 0.0009758353826859272, + "loss": 0.95277309, + "num_input_tokens_seen": 54776704, + "router_z_loss_mlp": 0.60498047, + "step": 657, + "time_per_iteration": 2.5851681232452393 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071924, + "balance_loss_mlp": 1.01117456, + "epoch": 0.1265871489034244, + "flos": 691232472576.0, + "grad_norm": 0.04052834214006204, + "language_loss": 0.90056133, + "learning_rate": 0.0009757396094196456, + "loss": 0.91128063, + "num_input_tokens_seen": 54851744, + "router_z_loss_mlp": 0.60644531, + "step": 658, + "time_per_iteration": 2.9119739532470703 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071395, + "balance_loss_mlp": 1.01083672, + "epoch": 0.12677953058868796, + "flos": 538243166976.0, + "grad_norm": 0.03305987481805703, + "language_loss": 0.85138786, + "learning_rate": 0.0009756436514548673, + "loss": 0.86210179, + "num_input_tokens_seen": 54932576, + "router_z_loss_mlp": 0.60449219, + "step": 659, + "time_per_iteration": 2.8146860599517822 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070631, + "balance_loss_mlp": 1.01021552, + "epoch": 0.12697191227395152, + "flos": 520120857600.0, + "grad_norm": 0.03322369158928612, + "language_loss": 0.89052176, + "learning_rate": 0.0009755475088288466, + "loss": 0.90122807, + "num_input_tokens_seen": 55007296, + "router_z_loss_mlp": 0.60302734, + "step": 660, + "time_per_iteration": 2.7092652320861816 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070351, + "balance_loss_mlp": 1.01007843, + "epoch": 0.12716429395921508, + "flos": 567666768384.0, + "grad_norm": 0.0427017471912124, + "language_loss": 0.91535795, + "learning_rate": 0.0009754511815789095, + "loss": 0.92606151, + "num_input_tokens_seen": 55079312, + "router_z_loss_mlp": 0.6015625, + "step": 661, + "time_per_iteration": 2.790198564529419 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068315, + "balance_loss_mlp": 1.00809085, + "epoch": 0.12735667564447864, + "flos": 515142885888.0, + "grad_norm": 0.0409493229321676, + "language_loss": 0.8685838, + "learning_rate": 0.0009753546697424533, + "loss": 0.87926698, + "num_input_tokens_seen": 55151824, + "router_z_loss_mlp": 0.60107422, + "step": 662, + "time_per_iteration": 2.6784565448760986 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070378, + "balance_loss_mlp": 1.01020074, + "epoch": 0.1275490573297422, + "flos": 542321975808.0, + "grad_norm": 0.039351291895580044, + "language_loss": 0.91270494, + "learning_rate": 0.0009752579733569475, + "loss": 0.92340875, + "num_input_tokens_seen": 55224368, + "router_z_loss_mlp": 0.60058594, + "step": 663, + "time_per_iteration": 2.679379940032959 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071762, + "balance_loss_mlp": 1.01358795, + "epoch": 0.12774143901500576, + "flos": 1562027728896.0, + "grad_norm": 0.016936801864205438, + "language_loss": 0.74881387, + "learning_rate": 0.0009751610924599328, + "loss": 0.7595315, + "num_input_tokens_seen": 55453584, + "router_z_loss_mlp": 0.58007812, + "step": 664, + "time_per_iteration": 4.936127424240112 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070961, + "balance_loss_mlp": 1.01092672, + "epoch": 0.12793382070026935, + "flos": 614874396672.0, + "grad_norm": 0.047422479810277696, + "language_loss": 0.90634137, + "learning_rate": 0.0009750640270890217, + "loss": 0.91705096, + "num_input_tokens_seen": 55528000, + "router_z_loss_mlp": 0.59912109, + "step": 665, + "time_per_iteration": 2.712202548980713 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073503, + "balance_loss_mlp": 1.01361179, + "epoch": 0.1281262023855329, + "flos": 709118566656.0, + "grad_norm": 0.04721256261198653, + "language_loss": 0.97348696, + "learning_rate": 0.0009749667772818983, + "loss": 0.98422199, + "num_input_tokens_seen": 55612416, + "router_z_loss_mlp": 0.59765625, + "step": 666, + "time_per_iteration": 2.959563732147217 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065521, + "balance_loss_mlp": 1.00791931, + "epoch": 0.12831858407079647, + "flos": 1428185295360.0, + "grad_norm": 0.00958948420866419, + "language_loss": 0.76935941, + "learning_rate": 0.0009748693430763185, + "loss": 0.78001463, + "num_input_tokens_seen": 55843664, + "router_z_loss_mlp": 0.57421875, + "step": 667, + "time_per_iteration": 4.823887825012207 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071916, + "balance_loss_mlp": 1.01259768, + "epoch": 0.12851096575606002, + "flos": 450019027968.0, + "grad_norm": 0.04331482152431362, + "language_loss": 0.96237415, + "learning_rate": 0.0009747717245101093, + "loss": 0.97309327, + "num_input_tokens_seen": 55909072, + "router_z_loss_mlp": 0.59179688, + "step": 668, + "time_per_iteration": 2.5234646797180176 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071255, + "balance_loss_mlp": 1.01193655, + "epoch": 0.12870334744132358, + "flos": 480910454016.0, + "grad_norm": 0.040015395826151615, + "language_loss": 0.86231172, + "learning_rate": 0.00097467392162117, + "loss": 0.87302423, + "num_input_tokens_seen": 55978544, + "router_z_loss_mlp": 0.59179688, + "step": 669, + "time_per_iteration": 2.620121717453003 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073025, + "balance_loss_mlp": 1.01342034, + "epoch": 0.12889572912658714, + "flos": 640152115200.0, + "grad_norm": 0.03307407171369126, + "language_loss": 0.91950834, + "learning_rate": 0.0009745759344474708, + "loss": 0.9302386, + "num_input_tokens_seen": 56054144, + "router_z_loss_mlp": 0.59472656, + "step": 670, + "time_per_iteration": 2.834406852722168 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070894, + "balance_loss_mlp": 1.01114607, + "epoch": 0.1290881108118507, + "flos": 510955206912.0, + "grad_norm": 0.03904079329345599, + "language_loss": 0.90752548, + "learning_rate": 0.0009744777630270536, + "loss": 0.91823441, + "num_input_tokens_seen": 56120960, + "router_z_loss_mlp": 0.59619141, + "step": 671, + "time_per_iteration": 2.5841259956359863 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069609, + "balance_loss_mlp": 1.00967062, + "epoch": 0.12928049249711426, + "flos": 672291680256.0, + "grad_norm": 0.0427916369984872, + "language_loss": 0.94394779, + "learning_rate": 0.000974379407398032, + "loss": 0.95464385, + "num_input_tokens_seen": 56202560, + "router_z_loss_mlp": 0.59814453, + "step": 672, + "time_per_iteration": 2.8698208332061768 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072071, + "balance_loss_mlp": 1.0120368, + "epoch": 0.12947287418237785, + "flos": 795000017664.0, + "grad_norm": 0.03399258645873994, + "language_loss": 0.83039552, + "learning_rate": 0.0009742808675985913, + "loss": 0.84111625, + "num_input_tokens_seen": 56289456, + "router_z_loss_mlp": 0.59912109, + "step": 673, + "time_per_iteration": 3.1018688678741455 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067279, + "balance_loss_mlp": 1.00729334, + "epoch": 0.1296652558676414, + "flos": 486448339200.0, + "grad_norm": 0.039807509100232605, + "language_loss": 0.91899526, + "learning_rate": 0.0009741821436669876, + "loss": 0.92966807, + "num_input_tokens_seen": 56354480, + "router_z_loss_mlp": 0.59863281, + "step": 674, + "time_per_iteration": 2.6348536014556885 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068278, + "balance_loss_mlp": 1.00853038, + "epoch": 0.12985763755290497, + "flos": 454393344768.0, + "grad_norm": 0.044170807310258554, + "language_loss": 0.93403888, + "learning_rate": 0.0009740832356415492, + "loss": 0.9447217, + "num_input_tokens_seen": 56418944, + "router_z_loss_mlp": 0.59619141, + "step": 675, + "time_per_iteration": 2.483262538909912 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072639, + "balance_loss_mlp": 1.01265311, + "epoch": 0.13005001923816853, + "flos": 826435805952.0, + "grad_norm": 0.043859966784303914, + "language_loss": 0.89693773, + "learning_rate": 0.0009739841435606756, + "loss": 0.90766412, + "num_input_tokens_seen": 56492368, + "router_z_loss_mlp": 0.59863281, + "step": 676, + "time_per_iteration": 2.992385149002075 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066961, + "balance_loss_mlp": 1.00726056, + "epoch": 0.1302424009234321, + "flos": 532481705472.0, + "grad_norm": 0.03559705023164985, + "language_loss": 0.91210669, + "learning_rate": 0.0009738848674628377, + "loss": 0.92277622, + "num_input_tokens_seen": 56568128, + "router_z_loss_mlp": 0.59570312, + "step": 677, + "time_per_iteration": 2.766364574432373 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106721, + "balance_loss_mlp": 1.00765288, + "epoch": 0.13043478260869565, + "flos": 526917575424.0, + "grad_norm": 0.03838556287658105, + "language_loss": 0.90382779, + "learning_rate": 0.000973785407386578, + "loss": 0.91449988, + "num_input_tokens_seen": 56646448, + "router_z_loss_mlp": 0.59423828, + "step": 678, + "time_per_iteration": 2.772854804992676 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070658, + "balance_loss_mlp": 1.01076782, + "epoch": 0.1306271642939592, + "flos": 627417991680.0, + "grad_norm": 0.03509098765963207, + "language_loss": 0.88142246, + "learning_rate": 0.0009736857633705103, + "loss": 0.89212906, + "num_input_tokens_seen": 56732080, + "router_z_loss_mlp": 0.59765625, + "step": 679, + "time_per_iteration": 2.851567268371582 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075866, + "balance_loss_mlp": 1.01602292, + "epoch": 0.13081954597922277, + "flos": 551841460224.0, + "grad_norm": 0.03859467755451503, + "language_loss": 0.94306064, + "learning_rate": 0.0009735859354533196, + "loss": 0.95381933, + "num_input_tokens_seen": 56804432, + "router_z_loss_mlp": 0.59716797, + "step": 680, + "time_per_iteration": 2.6908183097839355 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070228, + "balance_loss_mlp": 1.01038456, + "epoch": 0.13101192766448633, + "flos": 537956407296.0, + "grad_norm": 0.04695623305024525, + "language_loss": 0.92768431, + "learning_rate": 0.0009734859236737628, + "loss": 0.93838656, + "num_input_tokens_seen": 56872512, + "router_z_loss_mlp": 0.59716797, + "step": 681, + "time_per_iteration": 2.618556261062622 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065875, + "balance_loss_mlp": 1.00631785, + "epoch": 0.13120430934974991, + "flos": 504514268160.0, + "grad_norm": 0.03771498494962771, + "language_loss": 0.94425803, + "learning_rate": 0.0009733857280706678, + "loss": 0.95491678, + "num_input_tokens_seen": 56940928, + "router_z_loss_mlp": 0.59423828, + "step": 682, + "time_per_iteration": 2.607445240020752 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068204, + "balance_loss_mlp": 1.00883758, + "epoch": 0.13139669103501347, + "flos": 615423616512.0, + "grad_norm": 0.040497909024236244, + "language_loss": 0.85748106, + "learning_rate": 0.000973285348682934, + "loss": 0.86816311, + "num_input_tokens_seen": 57012736, + "router_z_loss_mlp": 0.59228516, + "step": 683, + "time_per_iteration": 2.749258518218994 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064545, + "balance_loss_mlp": 1.00846863, + "epoch": 0.13158907272027703, + "flos": 1488218420736.0, + "grad_norm": 0.017735586482065788, + "language_loss": 0.77898371, + "learning_rate": 0.0009731847855495323, + "loss": 0.78962922, + "num_input_tokens_seen": 57243136, + "router_z_loss_mlp": 0.5625, + "step": 684, + "time_per_iteration": 4.792337894439697 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069614, + "balance_loss_mlp": 1.01053405, + "epoch": 0.1317814544055406, + "flos": 987119046912.0, + "grad_norm": 0.04121230716493085, + "language_loss": 0.86815995, + "learning_rate": 0.0009730840387095046, + "loss": 0.87885606, + "num_input_tokens_seen": 57336160, + "router_z_loss_mlp": 0.58935547, + "step": 685, + "time_per_iteration": 3.324737071990967 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068902, + "balance_loss_mlp": 1.00972676, + "epoch": 0.13197383609080415, + "flos": 612629885184.0, + "grad_norm": 0.03769323902360627, + "language_loss": 0.91733027, + "learning_rate": 0.0009729831082019642, + "loss": 0.92801929, + "num_input_tokens_seen": 57418976, + "router_z_loss_mlp": 0.59033203, + "step": 686, + "time_per_iteration": 2.883368968963623 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069054, + "balance_loss_mlp": 1.0096879, + "epoch": 0.1321662177760677, + "flos": 495555664128.0, + "grad_norm": 0.03344682577786829, + "language_loss": 0.90060174, + "learning_rate": 0.0009728819940660958, + "loss": 0.91129231, + "num_input_tokens_seen": 57490288, + "router_z_loss_mlp": 0.59228516, + "step": 687, + "time_per_iteration": 2.7771294116973877 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069131, + "balance_loss_mlp": 1.00971675, + "epoch": 0.13235859946133127, + "flos": 496844632320.0, + "grad_norm": 0.041743180753116546, + "language_loss": 0.8673048, + "learning_rate": 0.0009727806963411557, + "loss": 0.87799615, + "num_input_tokens_seen": 57556064, + "router_z_loss_mlp": 0.59277344, + "step": 688, + "time_per_iteration": 2.5879924297332764 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069414, + "balance_loss_mlp": 1.00971425, + "epoch": 0.13255098114659483, + "flos": 512768116992.0, + "grad_norm": 0.035278095584539565, + "language_loss": 0.88457793, + "learning_rate": 0.000972679215066471, + "loss": 0.89527214, + "num_input_tokens_seen": 57627248, + "router_z_loss_mlp": 0.59570312, + "step": 689, + "time_per_iteration": 2.6660075187683105 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067632, + "balance_loss_mlp": 1.00826621, + "epoch": 0.13274336283185842, + "flos": 548400332544.0, + "grad_norm": 0.043703661342582356, + "language_loss": 1.0036962, + "learning_rate": 0.0009725775502814401, + "loss": 1.01437247, + "num_input_tokens_seen": 57694832, + "router_z_loss_mlp": 0.59228516, + "step": 690, + "time_per_iteration": 2.580975294113159 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072547, + "balance_loss_mlp": 1.01313293, + "epoch": 0.13293574451712198, + "flos": 642003909120.0, + "grad_norm": 0.041755939912029, + "language_loss": 0.86554468, + "learning_rate": 0.0009724757020255327, + "loss": 0.87627012, + "num_input_tokens_seen": 57771776, + "router_z_loss_mlp": 0.59277344, + "step": 691, + "time_per_iteration": 2.895805835723877 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074323, + "balance_loss_mlp": 1.01533794, + "epoch": 0.13312812620238554, + "flos": 492470315520.0, + "grad_norm": 0.04584738151589033, + "language_loss": 0.8907311, + "learning_rate": 0.0009723736703382902, + "loss": 0.90147436, + "num_input_tokens_seen": 57836272, + "router_z_loss_mlp": 0.58837891, + "step": 692, + "time_per_iteration": 2.593621253967285 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073259, + "balance_loss_mlp": 1.01427472, + "epoch": 0.1333205078876491, + "flos": 509950086144.0, + "grad_norm": 0.042207641511909956, + "language_loss": 0.84734881, + "learning_rate": 0.0009722714552593244, + "loss": 0.85808134, + "num_input_tokens_seen": 57907232, + "router_z_loss_mlp": 0.58837891, + "step": 693, + "time_per_iteration": 2.6628286838531494 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069094, + "balance_loss_mlp": 1.01010931, + "epoch": 0.13351288957291266, + "flos": 419592251136.0, + "grad_norm": 0.04342856140262568, + "language_loss": 0.95545483, + "learning_rate": 0.000972169056828319, + "loss": 0.96614575, + "num_input_tokens_seen": 57969808, + "router_z_loss_mlp": 0.58837891, + "step": 694, + "time_per_iteration": 2.491511821746826 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068144, + "balance_loss_mlp": 1.00896847, + "epoch": 0.13370527125817622, + "flos": 617051834112.0, + "grad_norm": 0.03328111889388194, + "language_loss": 0.87929142, + "learning_rate": 0.0009720664750850283, + "loss": 0.88997287, + "num_input_tokens_seen": 58042944, + "router_z_loss_mlp": 0.59033203, + "step": 695, + "time_per_iteration": 2.802238941192627 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066112, + "balance_loss_mlp": 1.00693631, + "epoch": 0.13389765294343978, + "flos": 627170115840.0, + "grad_norm": 0.04111883948503256, + "language_loss": 0.94899035, + "learning_rate": 0.0009719637100692784, + "loss": 0.95965147, + "num_input_tokens_seen": 58116080, + "router_z_loss_mlp": 0.59033203, + "step": 696, + "time_per_iteration": 2.752716541290283 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066724, + "balance_loss_mlp": 1.00764382, + "epoch": 0.13409003462870334, + "flos": 610897655040.0, + "grad_norm": 0.03903466400724949, + "language_loss": 0.84625083, + "learning_rate": 0.0009718607618209661, + "loss": 0.85691804, + "num_input_tokens_seen": 58197616, + "router_z_loss_mlp": 0.58935547, + "step": 697, + "time_per_iteration": 2.8612687587738037 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067901, + "balance_loss_mlp": 1.00915492, + "epoch": 0.13428241631396692, + "flos": 685088987136.0, + "grad_norm": 0.03548160791415639, + "language_loss": 0.8885181, + "learning_rate": 0.0009717576303800595, + "loss": 0.89919716, + "num_input_tokens_seen": 58280480, + "router_z_loss_mlp": 0.5859375, + "step": 698, + "time_per_iteration": 3.046081304550171 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067451, + "balance_loss_mlp": 1.00870502, + "epoch": 0.13447479799923048, + "flos": 509819828736.0, + "grad_norm": 0.04099621387271608, + "language_loss": 0.8689754, + "learning_rate": 0.0009716543157865975, + "loss": 0.87964994, + "num_input_tokens_seen": 58352464, + "router_z_loss_mlp": 0.5859375, + "step": 699, + "time_per_iteration": 2.7116739749908447 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067445, + "balance_loss_mlp": 1.00893724, + "epoch": 0.13466717968449404, + "flos": 899060158464.0, + "grad_norm": 0.03800712734159662, + "language_loss": 0.8517018, + "learning_rate": 0.0009715508180806907, + "loss": 0.86237621, + "num_input_tokens_seen": 58437216, + "router_z_loss_mlp": 0.58349609, + "step": 700, + "time_per_iteration": 3.184324026107788 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066328, + "balance_loss_mlp": 1.00777256, + "epoch": 0.1348595613697576, + "flos": 991695552768.0, + "grad_norm": 0.036541360765650906, + "language_loss": 0.91219282, + "learning_rate": 0.0009714471373025202, + "loss": 0.92285609, + "num_input_tokens_seen": 58533152, + "router_z_loss_mlp": 0.58398438, + "step": 701, + "time_per_iteration": 3.4654104709625244 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064657, + "balance_loss_mlp": 1.0059582, + "epoch": 0.13505194305502116, + "flos": 488812414464.0, + "grad_norm": 0.038284394577449095, + "language_loss": 0.90020943, + "learning_rate": 0.0009713432734923386, + "loss": 0.91085601, + "num_input_tokens_seen": 58601376, + "router_z_loss_mlp": 0.58544922, + "step": 702, + "time_per_iteration": 2.6416144371032715 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067313, + "balance_loss_mlp": 1.00842357, + "epoch": 0.13524432474028472, + "flos": 614520562944.0, + "grad_norm": 0.03635122731697363, + "language_loss": 0.87970936, + "learning_rate": 0.0009712392266904696, + "loss": 0.89038247, + "num_input_tokens_seen": 58676608, + "router_z_loss_mlp": 0.58740234, + "step": 703, + "time_per_iteration": 2.73490309715271 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066615, + "balance_loss_mlp": 1.00782144, + "epoch": 0.13543670642554828, + "flos": 906275838720.0, + "grad_norm": 0.040994558071305906, + "language_loss": 0.86788869, + "learning_rate": 0.0009711349969373076, + "loss": 0.87855482, + "num_input_tokens_seen": 58759264, + "router_z_loss_mlp": 0.58642578, + "step": 704, + "time_per_iteration": 3.1667368412017822 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066356, + "balance_loss_mlp": 1.00765777, + "epoch": 0.13562908811081184, + "flos": 551748141312.0, + "grad_norm": 0.040707128775991024, + "language_loss": 0.81448901, + "learning_rate": 0.0009710305842733178, + "loss": 0.82515258, + "num_input_tokens_seen": 58834800, + "router_z_loss_mlp": 0.58544922, + "step": 705, + "time_per_iteration": 2.7456798553466797 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064755, + "balance_loss_mlp": 1.00648558, + "epoch": 0.1358214697960754, + "flos": 509038284288.0, + "grad_norm": 0.04235852839756889, + "language_loss": 0.91048527, + "learning_rate": 0.0009709259887390373, + "loss": 0.9211328, + "num_input_tokens_seen": 58901712, + "router_z_loss_mlp": 0.58105469, + "step": 706, + "time_per_iteration": 2.614645481109619 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067613, + "balance_loss_mlp": 1.0098201, + "epoch": 0.136013851481339, + "flos": 529924189440.0, + "grad_norm": 0.045207837368539144, + "language_loss": 0.92539275, + "learning_rate": 0.0009708212103750737, + "loss": 0.93606889, + "num_input_tokens_seen": 58967824, + "router_z_loss_mlp": 0.57617188, + "step": 707, + "time_per_iteration": 2.5839250087738037 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073243, + "balance_loss_mlp": 1.01525927, + "epoch": 0.13620623316660255, + "flos": 660321604608.0, + "grad_norm": 0.04139663244511697, + "language_loss": 0.88690269, + "learning_rate": 0.0009707162492221051, + "loss": 0.8976351, + "num_input_tokens_seen": 59045040, + "router_z_loss_mlp": 0.578125, + "step": 708, + "time_per_iteration": 2.8753738403320312 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106855, + "balance_loss_mlp": 1.01051939, + "epoch": 0.1363986148518661, + "flos": 673083918336.0, + "grad_norm": 0.04870142688483653, + "language_loss": 0.89226341, + "learning_rate": 0.0009706111053208815, + "loss": 0.90294898, + "num_input_tokens_seen": 59117216, + "router_z_loss_mlp": 0.57861328, + "step": 709, + "time_per_iteration": 2.792555570602417 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065256, + "balance_loss_mlp": 1.0069865, + "epoch": 0.13659099653712967, + "flos": 474004866048.0, + "grad_norm": 0.041589756065930725, + "language_loss": 0.87875092, + "learning_rate": 0.0009705057787122232, + "loss": 0.88940346, + "num_input_tokens_seen": 59183056, + "router_z_loss_mlp": 0.58105469, + "step": 710, + "time_per_iteration": 2.5474488735198975 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106841, + "balance_loss_mlp": 1.00980711, + "epoch": 0.13678337822239323, + "flos": 453648738816.0, + "grad_norm": 0.03947638411835938, + "language_loss": 0.92397159, + "learning_rate": 0.0009704002694370216, + "loss": 0.93465567, + "num_input_tokens_seen": 59247312, + "router_z_loss_mlp": 0.58447266, + "step": 711, + "time_per_iteration": 2.5812153816223145 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107162, + "balance_loss_mlp": 1.01306474, + "epoch": 0.13697575990765679, + "flos": 520626336000.0, + "grad_norm": 0.04103000756090051, + "language_loss": 0.88202429, + "learning_rate": 0.0009702945775362388, + "loss": 0.89274049, + "num_input_tokens_seen": 59317968, + "router_z_loss_mlp": 0.58398438, + "step": 712, + "time_per_iteration": 2.6084940433502197 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067699, + "balance_loss_mlp": 1.00914371, + "epoch": 0.13716814159292035, + "flos": 481366354944.0, + "grad_norm": 0.04017855754763819, + "language_loss": 0.88458985, + "learning_rate": 0.0009701887030509086, + "loss": 0.89526689, + "num_input_tokens_seen": 59387936, + "router_z_loss_mlp": 0.58398438, + "step": 713, + "time_per_iteration": 2.6361663341522217 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072477, + "balance_loss_mlp": 1.01425505, + "epoch": 0.1373605232781839, + "flos": 546750727680.0, + "grad_norm": 0.04169009137316196, + "language_loss": 0.92536753, + "learning_rate": 0.0009700826460221346, + "loss": 0.93609238, + "num_input_tokens_seen": 59460624, + "router_z_loss_mlp": 0.58056641, + "step": 714, + "time_per_iteration": 2.6997907161712646 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068882, + "balance_loss_mlp": 1.01080275, + "epoch": 0.1375529049634475, + "flos": 710071197696.0, + "grad_norm": 0.042053375460334, + "language_loss": 0.94210052, + "learning_rate": 0.0009699764064910921, + "loss": 0.95278937, + "num_input_tokens_seen": 59536752, + "router_z_loss_mlp": 0.57910156, + "step": 715, + "time_per_iteration": 2.870835542678833 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069166, + "balance_loss_mlp": 1.01099169, + "epoch": 0.13774528664871105, + "flos": 487677036288.0, + "grad_norm": 0.04018028408764831, + "language_loss": 0.88572168, + "learning_rate": 0.0009698699844990268, + "loss": 0.89641333, + "num_input_tokens_seen": 59608128, + "router_z_loss_mlp": 0.58007812, + "step": 716, + "time_per_iteration": 2.6557233333587646 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106644, + "balance_loss_mlp": 1.00817037, + "epoch": 0.1379376683339746, + "flos": 681459276288.0, + "grad_norm": 0.03631196674856893, + "language_loss": 0.89737439, + "learning_rate": 0.0009697633800872555, + "loss": 0.90803885, + "num_input_tokens_seen": 59685120, + "router_z_loss_mlp": 0.58105469, + "step": 717, + "time_per_iteration": 2.9236202239990234 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068394, + "balance_loss_mlp": 1.00998127, + "epoch": 0.13813005001923817, + "flos": 612226473984.0, + "grad_norm": 0.040527486313319094, + "language_loss": 0.9214747, + "learning_rate": 0.0009696565932971655, + "loss": 0.93215865, + "num_input_tokens_seen": 59763376, + "router_z_loss_mlp": 0.58251953, + "step": 718, + "time_per_iteration": 2.8931636810302734 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072264, + "balance_loss_mlp": 1.01394677, + "epoch": 0.13832243170450173, + "flos": 589927179264.0, + "grad_norm": 0.042228364331249636, + "language_loss": 0.91184157, + "learning_rate": 0.0009695496241702153, + "loss": 0.92256421, + "num_input_tokens_seen": 59836800, + "router_z_loss_mlp": 0.58154297, + "step": 719, + "time_per_iteration": 2.8006720542907715 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010673, + "balance_loss_mlp": 1.00917327, + "epoch": 0.1385148133897653, + "flos": 701320618752.0, + "grad_norm": 0.04012183054192491, + "language_loss": 0.87174737, + "learning_rate": 0.0009694424727479339, + "loss": 0.88242036, + "num_input_tokens_seen": 59914720, + "router_z_loss_mlp": 0.57958984, + "step": 720, + "time_per_iteration": 2.9363977909088135 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066459, + "balance_loss_mlp": 1.0081414, + "epoch": 0.13870719507502885, + "flos": 599367929088.0, + "grad_norm": 0.04032336097495746, + "language_loss": 0.90803999, + "learning_rate": 0.0009693351390719213, + "loss": 0.91870457, + "num_input_tokens_seen": 59984544, + "router_z_loss_mlp": 0.58154297, + "step": 721, + "time_per_iteration": 2.7786271572113037 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070816, + "balance_loss_mlp": 1.01273775, + "epoch": 0.1388995767602924, + "flos": 587749741824.0, + "grad_norm": 0.04179929290372652, + "language_loss": 0.92465305, + "learning_rate": 0.000969227623183848, + "loss": 0.93536115, + "num_input_tokens_seen": 60057056, + "router_z_loss_mlp": 0.57910156, + "step": 722, + "time_per_iteration": 2.777453660964966 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066594, + "balance_loss_mlp": 1.00870621, + "epoch": 0.139091958445556, + "flos": 652363263744.0, + "grad_norm": 0.041578114374578125, + "language_loss": 0.92603219, + "learning_rate": 0.0009691199251254554, + "loss": 0.9366982, + "num_input_tokens_seen": 60133232, + "router_z_loss_mlp": 0.57714844, + "step": 723, + "time_per_iteration": 2.813610553741455 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063653, + "balance_loss_mlp": 1.00586045, + "epoch": 0.13928434013081956, + "flos": 576906296064.0, + "grad_norm": 0.03663552971403626, + "language_loss": 0.88541949, + "learning_rate": 0.0009690120449385555, + "loss": 0.89605606, + "num_input_tokens_seen": 60207104, + "router_z_loss_mlp": 0.57617188, + "step": 724, + "time_per_iteration": 2.7604424953460693 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063903, + "balance_loss_mlp": 1.00582433, + "epoch": 0.13947672181608312, + "flos": 564315068928.0, + "grad_norm": 0.034271197388489986, + "language_loss": 0.93926299, + "learning_rate": 0.0009689039826650312, + "loss": 0.94990206, + "num_input_tokens_seen": 60277920, + "router_z_loss_mlp": 0.57910156, + "step": 725, + "time_per_iteration": 2.7856695652008057 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095871, + "balance_loss_mlp": 1.03941345, + "epoch": 0.13966910350134668, + "flos": 1524951988224.0, + "grad_norm": 0.03128450212810151, + "language_loss": 0.76523066, + "learning_rate": 0.000968795738346836, + "loss": 0.77618933, + "num_input_tokens_seen": 60494224, + "router_z_loss_mlp": 0.56640625, + "step": 726, + "time_per_iteration": 4.903306245803833 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067061, + "balance_loss_mlp": 1.00902975, + "epoch": 0.13986148518661023, + "flos": 500856367104.0, + "grad_norm": 0.052764167671210026, + "language_loss": 0.89172196, + "learning_rate": 0.0009686873120259941, + "loss": 0.90239263, + "num_input_tokens_seen": 60562176, + "router_z_loss_mlp": 0.57861328, + "step": 727, + "time_per_iteration": 2.6450552940368652 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072692, + "balance_loss_mlp": 1.01518559, + "epoch": 0.1400538668718738, + "flos": 599850074880.0, + "grad_norm": 0.036488800736072635, + "language_loss": 0.88047451, + "learning_rate": 0.0009685787037446004, + "loss": 0.89120144, + "num_input_tokens_seen": 60631472, + "router_z_loss_mlp": 0.57324219, + "step": 728, + "time_per_iteration": 2.763434648513794 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072512, + "balance_loss_mlp": 1.01481462, + "epoch": 0.14024624855713735, + "flos": 595169556480.0, + "grad_norm": 0.047561697925478, + "language_loss": 0.88858587, + "learning_rate": 0.0009684699135448201, + "loss": 0.89931101, + "num_input_tokens_seen": 60703488, + "router_z_loss_mlp": 0.57519531, + "step": 729, + "time_per_iteration": 2.745037078857422 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067277, + "balance_loss_mlp": 1.00962722, + "epoch": 0.1404386302424009, + "flos": 507586010880.0, + "grad_norm": 0.03094406590189725, + "language_loss": 0.9291476, + "learning_rate": 0.0009683609414688895, + "loss": 0.93982029, + "num_input_tokens_seen": 60773936, + "router_z_loss_mlp": 0.57470703, + "step": 730, + "time_per_iteration": 2.7384650707244873 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068471, + "balance_loss_mlp": 1.01039195, + "epoch": 0.14063101192766447, + "flos": 574515975936.0, + "grad_norm": 0.037780385553924656, + "language_loss": 0.87345785, + "learning_rate": 0.0009682517875591154, + "loss": 0.88414258, + "num_input_tokens_seen": 60851120, + "router_z_loss_mlp": 0.57910156, + "step": 731, + "time_per_iteration": 2.752572536468506 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071345, + "balance_loss_mlp": 1.0129801, + "epoch": 0.14082339361292806, + "flos": 565765396992.0, + "grad_norm": 0.03832964150159033, + "language_loss": 0.87666118, + "learning_rate": 0.0009681424518578749, + "loss": 0.88737464, + "num_input_tokens_seen": 60924896, + "router_z_loss_mlp": 0.58203125, + "step": 732, + "time_per_iteration": 2.7323830127716064 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068462, + "balance_loss_mlp": 1.01028764, + "epoch": 0.14101577529819162, + "flos": 464583558144.0, + "grad_norm": 0.035957988569031644, + "language_loss": 0.88670099, + "learning_rate": 0.000968032934407616, + "loss": 0.8973856, + "num_input_tokens_seen": 60996016, + "router_z_loss_mlp": 0.58007812, + "step": 733, + "time_per_iteration": 2.6479005813598633 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064525, + "balance_loss_mlp": 1.00644577, + "epoch": 0.14120815698345518, + "flos": 597262423296.0, + "grad_norm": 0.039547782577588224, + "language_loss": 0.82413781, + "learning_rate": 0.0009679232352508571, + "loss": 0.83478296, + "num_input_tokens_seen": 61072016, + "router_z_loss_mlp": 0.57910156, + "step": 734, + "time_per_iteration": 2.7924795150756836 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063696, + "balance_loss_mlp": 1.00599897, + "epoch": 0.14140053866871874, + "flos": 536232925440.0, + "grad_norm": 0.03854566850595878, + "language_loss": 0.82520735, + "learning_rate": 0.0009678133544301871, + "loss": 0.83584428, + "num_input_tokens_seen": 61144528, + "router_z_loss_mlp": 0.57519531, + "step": 735, + "time_per_iteration": 2.658731698989868 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062396, + "balance_loss_mlp": 1.00498438, + "epoch": 0.1415929203539823, + "flos": 521277623040.0, + "grad_norm": 0.0297517777524564, + "language_loss": 0.92917788, + "learning_rate": 0.0009677032919882658, + "loss": 0.93980187, + "num_input_tokens_seen": 61216960, + "router_z_loss_mlp": 0.57226562, + "step": 736, + "time_per_iteration": 2.661276340484619 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068415, + "balance_loss_mlp": 1.0113374, + "epoch": 0.14178530203924586, + "flos": 483302719488.0, + "grad_norm": 0.041037110936195734, + "language_loss": 0.92867804, + "learning_rate": 0.000967593047967823, + "loss": 0.93936217, + "num_input_tokens_seen": 61281312, + "router_z_loss_mlp": 0.56982422, + "step": 737, + "time_per_iteration": 2.52840256690979 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068115, + "balance_loss_mlp": 1.01056099, + "epoch": 0.14197768372450942, + "flos": 677840259072.0, + "grad_norm": 0.04254557939420697, + "language_loss": 0.88126308, + "learning_rate": 0.0009674826224116593, + "loss": 0.89194429, + "num_input_tokens_seen": 61355888, + "router_z_loss_mlp": 0.57373047, + "step": 738, + "time_per_iteration": 2.858147144317627 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074304, + "balance_loss_mlp": 1.0167979, + "epoch": 0.14217006540977298, + "flos": 446992972032.0, + "grad_norm": 0.045930563119643074, + "language_loss": 0.87994051, + "learning_rate": 0.0009673720153626455, + "loss": 0.89068353, + "num_input_tokens_seen": 61424288, + "router_z_loss_mlp": 0.57324219, + "step": 739, + "time_per_iteration": 2.664236545562744 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069407, + "balance_loss_mlp": 1.01199603, + "epoch": 0.14236244709503657, + "flos": 497478422784.0, + "grad_norm": 0.040566684483093814, + "language_loss": 0.88105047, + "learning_rate": 0.0009672612268637235, + "loss": 0.89174449, + "num_input_tokens_seen": 61493344, + "router_z_loss_mlp": 0.57226562, + "step": 740, + "time_per_iteration": 2.634126901626587 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069933, + "balance_loss_mlp": 1.01304626, + "epoch": 0.14255482878030012, + "flos": 649480104192.0, + "grad_norm": 0.05086050125917657, + "language_loss": 0.85906518, + "learning_rate": 0.0009671502569579048, + "loss": 0.86976457, + "num_input_tokens_seen": 61565216, + "router_z_loss_mlp": 0.56884766, + "step": 741, + "time_per_iteration": 2.7642107009887695 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071964, + "balance_loss_mlp": 1.01564944, + "epoch": 0.14274721046556368, + "flos": 537274984704.0, + "grad_norm": 0.037356444744632025, + "language_loss": 0.90824854, + "learning_rate": 0.0009670391056882719, + "loss": 0.91896814, + "num_input_tokens_seen": 61640928, + "router_z_loss_mlp": 0.56445312, + "step": 742, + "time_per_iteration": 2.7307372093200684 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069292, + "balance_loss_mlp": 1.01288199, + "epoch": 0.14293959215082724, + "flos": 958584893184.0, + "grad_norm": 0.03744948002603285, + "language_loss": 0.89976203, + "learning_rate": 0.0009669277730979776, + "loss": 0.91045499, + "num_input_tokens_seen": 61717552, + "router_z_loss_mlp": 0.56494141, + "step": 743, + "time_per_iteration": 3.2251601219177246 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068321, + "balance_loss_mlp": 1.01162553, + "epoch": 0.1431319738360908, + "flos": 694386840576.0, + "grad_norm": 0.037398516399228816, + "language_loss": 0.86562485, + "learning_rate": 0.0009668162592302449, + "loss": 0.87630802, + "num_input_tokens_seen": 61800016, + "router_z_loss_mlp": 0.56738281, + "step": 744, + "time_per_iteration": 2.924435615539551 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067141, + "balance_loss_mlp": 1.01015854, + "epoch": 0.14332435552135436, + "flos": 566503200000.0, + "grad_norm": 0.037819132294000864, + "language_loss": 0.86981773, + "learning_rate": 0.0009667045641283676, + "loss": 0.88048917, + "num_input_tokens_seen": 61865904, + "router_z_loss_mlp": 0.56933594, + "step": 745, + "time_per_iteration": 2.6744887828826904 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071811, + "balance_loss_mlp": 1.01540148, + "epoch": 0.14351673720661792, + "flos": 739696988160.0, + "grad_norm": 0.042480690817339954, + "language_loss": 0.96115947, + "learning_rate": 0.0009665926878357092, + "loss": 0.97187757, + "num_input_tokens_seen": 61945728, + "router_z_loss_mlp": 0.56591797, + "step": 746, + "time_per_iteration": 2.9137520790100098 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069908, + "balance_loss_mlp": 1.0134027, + "epoch": 0.14370911889188148, + "flos": 550352248320.0, + "grad_norm": 0.037361960218361134, + "language_loss": 0.92219329, + "learning_rate": 0.0009664806303957043, + "loss": 0.93289238, + "num_input_tokens_seen": 62016288, + "router_z_loss_mlp": 0.56542969, + "step": 747, + "time_per_iteration": 2.7734382152557373 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010709, + "balance_loss_mlp": 1.01453757, + "epoch": 0.14390150057714507, + "flos": 591590390016.0, + "grad_norm": 0.040803275102161134, + "language_loss": 0.88578373, + "learning_rate": 0.0009663683918518571, + "loss": 0.89649272, + "num_input_tokens_seen": 62097904, + "router_z_loss_mlp": 0.56542969, + "step": 748, + "time_per_iteration": 2.93782114982605 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106688, + "balance_loss_mlp": 1.0103749, + "epoch": 0.14409388226240863, + "flos": 592145445888.0, + "grad_norm": 0.040391516566669984, + "language_loss": 0.87085271, + "learning_rate": 0.0009662559722477428, + "loss": 0.88152146, + "num_input_tokens_seen": 62166736, + "router_z_loss_mlp": 0.56640625, + "step": 749, + "time_per_iteration": 2.696570873260498 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140549, + "balance_loss_mlp": 1.08542633, + "epoch": 0.1442862639476722, + "flos": 1514657762304.0, + "grad_norm": 0.043557664449290004, + "language_loss": 0.7616297, + "learning_rate": 0.0009661433716270062, + "loss": 0.77303517, + "num_input_tokens_seen": 62402512, + "router_z_loss_mlp": 0.55273438, + "step": 750, + "time_per_iteration": 5.024984836578369 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106928, + "balance_loss_mlp": 1.01263177, + "epoch": 0.14447864563293575, + "flos": 497856556032.0, + "grad_norm": 0.03544029116038115, + "language_loss": 0.90697813, + "learning_rate": 0.0009660305900333632, + "loss": 0.91767091, + "num_input_tokens_seen": 62473408, + "router_z_loss_mlp": 0.56738281, + "step": 751, + "time_per_iteration": 2.678037166595459 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078207, + "balance_loss_mlp": 1.02165437, + "epoch": 0.1446710273181993, + "flos": 590795239680.0, + "grad_norm": 0.04141635113788076, + "language_loss": 0.83649188, + "learning_rate": 0.0009659176275105992, + "loss": 0.84727395, + "num_input_tokens_seen": 62547440, + "router_z_loss_mlp": 0.56640625, + "step": 752, + "time_per_iteration": 2.714871883392334 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076963, + "balance_loss_mlp": 1.02074409, + "epoch": 0.14486340900346287, + "flos": 587013884160.0, + "grad_norm": 0.03637909883196532, + "language_loss": 0.87195009, + "learning_rate": 0.0009658044841025701, + "loss": 0.88271976, + "num_input_tokens_seen": 62620224, + "router_z_loss_mlp": 0.56396484, + "step": 753, + "time_per_iteration": 2.7753467559814453 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075522, + "balance_loss_mlp": 1.01901722, + "epoch": 0.14505579068872643, + "flos": 505741019904.0, + "grad_norm": 0.041255413340114844, + "language_loss": 0.82866222, + "learning_rate": 0.0009656911598532021, + "loss": 0.83941746, + "num_input_tokens_seen": 62690464, + "router_z_loss_mlp": 0.56591797, + "step": 754, + "time_per_iteration": 2.657831907272339 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077419, + "balance_loss_mlp": 1.02119958, + "epoch": 0.14524817237399, + "flos": 487816041984.0, + "grad_norm": 0.03637506550278126, + "language_loss": 0.9138847, + "learning_rate": 0.0009655776548064917, + "loss": 0.92465889, + "num_input_tokens_seen": 62762240, + "router_z_loss_mlp": 0.56347656, + "step": 755, + "time_per_iteration": 2.6499805450439453 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070806, + "balance_loss_mlp": 1.01477778, + "epoch": 0.14544055405925355, + "flos": 729450394368.0, + "grad_norm": 0.037726189244012505, + "language_loss": 0.89799821, + "learning_rate": 0.0009654639690065054, + "loss": 0.90870631, + "num_input_tokens_seen": 62839760, + "router_z_loss_mlp": 0.56201172, + "step": 756, + "time_per_iteration": 2.913638114929199 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070686, + "balance_loss_mlp": 1.01461017, + "epoch": 0.14563293574451713, + "flos": 594787532544.0, + "grad_norm": 0.03772784195488967, + "language_loss": 0.8914414, + "learning_rate": 0.00096535010249738, + "loss": 0.90214825, + "num_input_tokens_seen": 62910336, + "router_z_loss_mlp": 0.5625, + "step": 757, + "time_per_iteration": 2.721640110015869 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067675, + "balance_loss_mlp": 1.01121712, + "epoch": 0.1458253174297807, + "flos": 561623404800.0, + "grad_norm": 0.04410713855467511, + "language_loss": 0.84106696, + "learning_rate": 0.0009652360553233224, + "loss": 0.8517437, + "num_input_tokens_seen": 62988160, + "router_z_loss_mlp": 0.56591797, + "step": 758, + "time_per_iteration": 2.771986484527588 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080421, + "balance_loss_mlp": 1.02625275, + "epoch": 0.14601769911504425, + "flos": 1561189804032.0, + "grad_norm": 0.021986445825835567, + "language_loss": 0.73773748, + "learning_rate": 0.0009651218275286093, + "loss": 0.74854165, + "num_input_tokens_seen": 63224704, + "router_z_loss_mlp": 0.54296875, + "step": 759, + "time_per_iteration": 4.951657056808472 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064246, + "balance_loss_mlp": 1.00712132, + "epoch": 0.1462100808003078, + "flos": 867823646976.0, + "grad_norm": 0.03532102179266325, + "language_loss": 0.82350075, + "learning_rate": 0.0009650074191575883, + "loss": 0.83414322, + "num_input_tokens_seen": 63312400, + "router_z_loss_mlp": 0.56982422, + "step": 760, + "time_per_iteration": 3.2275402545928955 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078155, + "balance_loss_mlp": 1.02083874, + "epoch": 0.14640246248557137, + "flos": 524030525184.0, + "grad_norm": 0.0394901057776484, + "language_loss": 0.87295806, + "learning_rate": 0.0009648928302546766, + "loss": 0.88373965, + "num_input_tokens_seen": 63387792, + "router_z_loss_mlp": 0.57177734, + "step": 761, + "time_per_iteration": 2.6739044189453125 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108061, + "balance_loss_mlp": 1.02319896, + "epoch": 0.14659484417083493, + "flos": 1032242556672.0, + "grad_norm": 0.0381114836464334, + "language_loss": 0.86423808, + "learning_rate": 0.0009647780608643613, + "loss": 0.87504417, + "num_input_tokens_seen": 63475632, + "router_z_loss_mlp": 0.57226562, + "step": 762, + "time_per_iteration": 3.355055332183838 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084881, + "balance_loss_mlp": 1.02742219, + "epoch": 0.1467872258560985, + "flos": 501657353472.0, + "grad_norm": 0.04884269069306727, + "language_loss": 0.89483184, + "learning_rate": 0.0009646631110312001, + "loss": 0.90568066, + "num_input_tokens_seen": 63546080, + "router_z_loss_mlp": 0.57275391, + "step": 763, + "time_per_iteration": 2.638404607772827 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074108, + "balance_loss_mlp": 1.01683939, + "epoch": 0.14697960754136205, + "flos": 548936913408.0, + "grad_norm": 0.030517371118051684, + "language_loss": 0.89587164, + "learning_rate": 0.0009645479807998203, + "loss": 0.90661263, + "num_input_tokens_seen": 63622464, + "router_z_loss_mlp": 0.57128906, + "step": 764, + "time_per_iteration": 2.7784340381622314 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066881, + "balance_loss_mlp": 1.0099467, + "epoch": 0.14717198922662564, + "flos": 518902854144.0, + "grad_norm": 0.03321738346858149, + "language_loss": 0.93693149, + "learning_rate": 0.0009644326702149196, + "loss": 0.94760031, + "num_input_tokens_seen": 63694736, + "router_z_loss_mlp": 0.56884766, + "step": 765, + "time_per_iteration": 2.712148904800415 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066407, + "balance_loss_mlp": 1.009902, + "epoch": 0.1473643709118892, + "flos": 733484483328.0, + "grad_norm": 0.042813367444357694, + "language_loss": 0.86227441, + "learning_rate": 0.0009643171793212653, + "loss": 0.87293845, + "num_input_tokens_seen": 63779072, + "router_z_loss_mlp": 0.56591797, + "step": 766, + "time_per_iteration": 3.0350003242492676 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069694, + "balance_loss_mlp": 1.01357007, + "epoch": 0.14755675259715276, + "flos": 621669169152.0, + "grad_norm": 0.04397904632105779, + "language_loss": 0.90884185, + "learning_rate": 0.0009642015081636952, + "loss": 0.91953874, + "num_input_tokens_seen": 63847472, + "router_z_loss_mlp": 0.56298828, + "step": 767, + "time_per_iteration": 2.6967811584472656 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067551, + "balance_loss_mlp": 1.01185656, + "epoch": 0.14774913428241632, + "flos": 453173395968.0, + "grad_norm": 0.040409537343205924, + "language_loss": 0.89756525, + "learning_rate": 0.0009640856567871166, + "loss": 0.90824074, + "num_input_tokens_seen": 63912496, + "router_z_loss_mlp": 0.55859375, + "step": 768, + "time_per_iteration": 2.5016207695007324 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063728, + "balance_loss_mlp": 1.00803363, + "epoch": 0.14794151596767988, + "flos": 838655702784.0, + "grad_norm": 0.03518214363191685, + "language_loss": 0.90024096, + "learning_rate": 0.0009639696252365072, + "loss": 0.91087824, + "num_input_tokens_seen": 63990832, + "router_z_loss_mlp": 0.55859375, + "step": 769, + "time_per_iteration": 3.0535316467285156 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064247, + "balance_loss_mlp": 1.00874364, + "epoch": 0.14813389765294344, + "flos": 687405430272.0, + "grad_norm": 0.03578436651039587, + "language_loss": 0.83073497, + "learning_rate": 0.0009638534135569144, + "loss": 0.8413775, + "num_input_tokens_seen": 64067552, + "router_z_loss_mlp": 0.55664062, + "step": 770, + "time_per_iteration": 2.8983683586120605 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065854, + "balance_loss_mlp": 1.01039767, + "epoch": 0.148326279338207, + "flos": 510944513280.0, + "grad_norm": 0.03931230706380594, + "language_loss": 0.91550887, + "learning_rate": 0.0009637370217934554, + "loss": 0.92616743, + "num_input_tokens_seen": 64140336, + "router_z_loss_mlp": 0.55615234, + "step": 771, + "time_per_iteration": 2.6311967372894287 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061556, + "balance_loss_mlp": 1.00590932, + "epoch": 0.14851866102347056, + "flos": 589332272640.0, + "grad_norm": 0.03214719611667013, + "language_loss": 0.8436957, + "learning_rate": 0.0009636204499913175, + "loss": 0.85431123, + "num_input_tokens_seen": 64223472, + "router_z_loss_mlp": 0.55810547, + "step": 772, + "time_per_iteration": 2.8748695850372314 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066056, + "balance_loss_mlp": 1.01069546, + "epoch": 0.14871104270873411, + "flos": 692248286976.0, + "grad_norm": 0.034034874980260935, + "language_loss": 0.89455193, + "learning_rate": 0.0009635036981957581, + "loss": 0.9052124, + "num_input_tokens_seen": 64299872, + "router_z_loss_mlp": 0.55517578, + "step": 773, + "time_per_iteration": 2.8526012897491455 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063541, + "balance_loss_mlp": 1.00789392, + "epoch": 0.1489034243939977, + "flos": 656283624960.0, + "grad_norm": 0.03841304714783139, + "language_loss": 0.91971016, + "learning_rate": 0.0009633867664521043, + "loss": 0.93034559, + "num_input_tokens_seen": 64377152, + "router_z_loss_mlp": 0.55810547, + "step": 774, + "time_per_iteration": 2.823320150375366 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063492, + "balance_loss_mlp": 1.00736797, + "epoch": 0.14909580607926126, + "flos": 476796652032.0, + "grad_norm": 0.0404919947218097, + "language_loss": 0.88328946, + "learning_rate": 0.0009632696548057527, + "loss": 0.89392436, + "num_input_tokens_seen": 64443008, + "router_z_loss_mlp": 0.56298828, + "step": 775, + "time_per_iteration": 2.5567190647125244 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072265, + "balance_loss_mlp": 1.01609385, + "epoch": 0.14928818776452482, + "flos": 612284799744.0, + "grad_norm": 0.03821441574416946, + "language_loss": 0.86270714, + "learning_rate": 0.0009631523633021704, + "loss": 0.87342978, + "num_input_tokens_seen": 64519776, + "router_z_loss_mlp": 0.56347656, + "step": 776, + "time_per_iteration": 2.783348321914673 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068042, + "balance_loss_mlp": 1.01187015, + "epoch": 0.14948056944978838, + "flos": 562917230592.0, + "grad_norm": 0.039790220133906304, + "language_loss": 0.90072912, + "learning_rate": 0.0009630348919868936, + "loss": 0.9114095, + "num_input_tokens_seen": 64593712, + "router_z_loss_mlp": 0.56347656, + "step": 777, + "time_per_iteration": 2.7115018367767334 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073438, + "balance_loss_mlp": 1.01736236, + "epoch": 0.14967295113505194, + "flos": 450112346880.0, + "grad_norm": 0.044777999480791836, + "language_loss": 0.82363755, + "learning_rate": 0.0009629172409055293, + "loss": 0.83437192, + "num_input_tokens_seen": 64658448, + "router_z_loss_mlp": 0.5625, + "step": 778, + "time_per_iteration": 2.578178882598877 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079079, + "balance_loss_mlp": 1.02319324, + "epoch": 0.1498653328203155, + "flos": 572429912064.0, + "grad_norm": 0.03699200582710457, + "language_loss": 0.8876617, + "learning_rate": 0.0009627994101037531, + "loss": 0.89845246, + "num_input_tokens_seen": 64734144, + "router_z_loss_mlp": 0.56054688, + "step": 779, + "time_per_iteration": 2.7733986377716064 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107155, + "balance_loss_mlp": 1.01556909, + "epoch": 0.15005771450557906, + "flos": 632408602368.0, + "grad_norm": 0.04036301028093645, + "language_loss": 0.90477651, + "learning_rate": 0.0009626813996273114, + "loss": 0.91549194, + "num_input_tokens_seen": 64813456, + "router_z_loss_mlp": 0.56152344, + "step": 780, + "time_per_iteration": 2.8476834297180176 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064638, + "balance_loss_mlp": 1.00884771, + "epoch": 0.15025009619084262, + "flos": 579166358784.0, + "grad_norm": 0.036574622666600026, + "language_loss": 0.89819682, + "learning_rate": 0.0009625632095220198, + "loss": 0.90884316, + "num_input_tokens_seen": 64896816, + "router_z_loss_mlp": 0.55957031, + "step": 781, + "time_per_iteration": 2.8279531002044678 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065406, + "balance_loss_mlp": 1.00961614, + "epoch": 0.1504424778761062, + "flos": 484857060096.0, + "grad_norm": 0.04416373966784989, + "language_loss": 0.8858574, + "learning_rate": 0.0009624448398337637, + "loss": 0.89651144, + "num_input_tokens_seen": 64964176, + "router_z_loss_mlp": 0.55957031, + "step": 782, + "time_per_iteration": 2.512742280960083 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062842, + "balance_loss_mlp": 1.0075767, + "epoch": 0.15063485956136977, + "flos": 763895708928.0, + "grad_norm": 0.03630111779859241, + "language_loss": 0.90811443, + "learning_rate": 0.0009623262906084984, + "loss": 0.9187429, + "num_input_tokens_seen": 65042592, + "router_z_loss_mlp": 0.55419922, + "step": 783, + "time_per_iteration": 3.0409936904907227 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066453, + "balance_loss_mlp": 1.01156867, + "epoch": 0.15082724124663333, + "flos": 498676984320.0, + "grad_norm": 0.03758683048429116, + "language_loss": 0.91324949, + "learning_rate": 0.0009622075618922486, + "loss": 0.92391407, + "num_input_tokens_seen": 65114576, + "router_z_loss_mlp": 0.55029297, + "step": 784, + "time_per_iteration": 2.716580629348755 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066623, + "balance_loss_mlp": 1.01188219, + "epoch": 0.15101962293189689, + "flos": 510722882304.0, + "grad_norm": 0.0361748672236624, + "language_loss": 0.88713133, + "learning_rate": 0.0009620886537311091, + "loss": 0.89779752, + "num_input_tokens_seen": 65186640, + "router_z_loss_mlp": 0.54882812, + "step": 785, + "time_per_iteration": 2.7197515964508057 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065435, + "balance_loss_mlp": 1.01069367, + "epoch": 0.15121200461716044, + "flos": 458702532864.0, + "grad_norm": 0.0476660620131034, + "language_loss": 0.86751854, + "learning_rate": 0.000961969566171244, + "loss": 0.87817287, + "num_input_tokens_seen": 65252112, + "router_z_loss_mlp": 0.54882812, + "step": 786, + "time_per_iteration": 2.519826650619507 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063397, + "balance_loss_mlp": 1.00865602, + "epoch": 0.151404386302424, + "flos": 539017908480.0, + "grad_norm": 0.0401982478312821, + "language_loss": 0.91594857, + "learning_rate": 0.0009618502992588873, + "loss": 0.92658257, + "num_input_tokens_seen": 65318912, + "router_z_loss_mlp": 0.54882812, + "step": 787, + "time_per_iteration": 2.6427645683288574 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076455, + "balance_loss_mlp": 1.02133262, + "epoch": 0.15159676798768756, + "flos": 689617860864.0, + "grad_norm": 0.04258050045209434, + "language_loss": 0.8916502, + "learning_rate": 0.0009617308530403424, + "loss": 0.9024148, + "num_input_tokens_seen": 65395424, + "router_z_loss_mlp": 0.55273438, + "step": 788, + "time_per_iteration": 3.0662577152252197 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106461, + "balance_loss_mlp": 1.00958323, + "epoch": 0.15178914967295112, + "flos": 546433832448.0, + "grad_norm": 0.03354297731817266, + "language_loss": 0.88695067, + "learning_rate": 0.0009616112275619825, + "loss": 0.89759684, + "num_input_tokens_seen": 65470480, + "router_z_loss_mlp": 0.55175781, + "step": 789, + "time_per_iteration": 2.7230606079101562 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065845, + "balance_loss_mlp": 1.01081765, + "epoch": 0.1519815313582147, + "flos": 512815749120.0, + "grad_norm": 0.03087624340708216, + "language_loss": 0.85391772, + "learning_rate": 0.0009614914228702503, + "loss": 0.86457616, + "num_input_tokens_seen": 65544720, + "router_z_loss_mlp": 0.55175781, + "step": 790, + "time_per_iteration": 2.6690316200256348 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075082, + "balance_loss_mlp": 1.02024603, + "epoch": 0.15217391304347827, + "flos": 685458372096.0, + "grad_norm": 0.03877155611381102, + "language_loss": 0.90952718, + "learning_rate": 0.0009613714390116581, + "loss": 0.92027801, + "num_input_tokens_seen": 65627872, + "router_z_loss_mlp": 0.54980469, + "step": 791, + "time_per_iteration": 3.006898880004883 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069036, + "balance_loss_mlp": 1.01396108, + "epoch": 0.15236629472874183, + "flos": 645446982144.0, + "grad_norm": 0.03750254169389994, + "language_loss": 0.87660968, + "learning_rate": 0.0009612512760327879, + "loss": 0.88730001, + "num_input_tokens_seen": 65705264, + "router_z_loss_mlp": 0.55224609, + "step": 792, + "time_per_iteration": 2.858262062072754 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068429, + "balance_loss_mlp": 1.01297235, + "epoch": 0.1525586764140054, + "flos": 413765660928.0, + "grad_norm": 0.044925092089749936, + "language_loss": 0.86468709, + "learning_rate": 0.0009611309339802909, + "loss": 0.87537134, + "num_input_tokens_seen": 65768592, + "router_z_loss_mlp": 0.55615234, + "step": 793, + "time_per_iteration": 2.498229742050171 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070738, + "balance_loss_mlp": 1.01485312, + "epoch": 0.15275105809926895, + "flos": 804234687744.0, + "grad_norm": 0.03634630877191588, + "language_loss": 0.85518378, + "learning_rate": 0.0009610104129008881, + "loss": 0.8658911, + "num_input_tokens_seen": 65852432, + "router_z_loss_mlp": 0.56054688, + "step": 794, + "time_per_iteration": 3.119896173477173 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064568, + "balance_loss_mlp": 1.0088737, + "epoch": 0.1529434397845325, + "flos": 613543632384.0, + "grad_norm": 0.039196324818253456, + "language_loss": 0.89691782, + "learning_rate": 0.0009608897128413701, + "loss": 0.90756351, + "num_input_tokens_seen": 65927904, + "router_z_loss_mlp": 0.55859375, + "step": 795, + "time_per_iteration": 2.7244484424591064 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065096, + "balance_loss_mlp": 1.00949657, + "epoch": 0.15313582146979607, + "flos": 616472478720.0, + "grad_norm": 0.031652256183926086, + "language_loss": 0.86697376, + "learning_rate": 0.0009607688338485965, + "loss": 0.87762469, + "num_input_tokens_seen": 66006800, + "router_z_loss_mlp": 0.55761719, + "step": 796, + "time_per_iteration": 2.859959363937378 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106534, + "balance_loss_mlp": 1.00997913, + "epoch": 0.15332820315505963, + "flos": 794993214720.0, + "grad_norm": 0.036135713167076366, + "language_loss": 0.91464871, + "learning_rate": 0.0009606477759694969, + "loss": 0.92530215, + "num_input_tokens_seen": 66088608, + "router_z_loss_mlp": 0.55517578, + "step": 797, + "time_per_iteration": 3.0383169651031494 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063804, + "balance_loss_mlp": 1.00806129, + "epoch": 0.1535205848403232, + "flos": 551257247232.0, + "grad_norm": 0.04267360012583918, + "language_loss": 0.89290035, + "learning_rate": 0.0009605265392510703, + "loss": 0.90353841, + "num_input_tokens_seen": 66153616, + "router_z_loss_mlp": 0.55908203, + "step": 798, + "time_per_iteration": 2.642423152923584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063071, + "balance_loss_mlp": 1.00732899, + "epoch": 0.15371296652558677, + "flos": 536979476736.0, + "grad_norm": 0.03662373873498648, + "language_loss": 0.93232477, + "learning_rate": 0.0009604051237403846, + "loss": 0.94295549, + "num_input_tokens_seen": 66219472, + "router_z_loss_mlp": 0.55908203, + "step": 799, + "time_per_iteration": 2.6661648750305176 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062966, + "balance_loss_mlp": 1.00693774, + "epoch": 0.15390534821085033, + "flos": 396090504192.0, + "grad_norm": 0.042222005302764924, + "language_loss": 0.87381375, + "learning_rate": 0.0009602835294845776, + "loss": 0.8844434, + "num_input_tokens_seen": 66281456, + "router_z_loss_mlp": 0.56201172, + "step": 800, + "time_per_iteration": 2.4529898166656494 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060851, + "balance_loss_mlp": 1.00520432, + "epoch": 0.1540977298961139, + "flos": 536886157824.0, + "grad_norm": 0.03888031973735598, + "language_loss": 0.91938102, + "learning_rate": 0.0009601617565308565, + "loss": 0.92998952, + "num_input_tokens_seen": 66348160, + "router_z_loss_mlp": 0.55810547, + "step": 801, + "time_per_iteration": 2.6380698680877686 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064031, + "balance_loss_mlp": 1.0085746, + "epoch": 0.15429011158137745, + "flos": 725091628800.0, + "grad_norm": 0.03523983772327724, + "language_loss": 0.87975162, + "learning_rate": 0.0009600398049264977, + "loss": 0.89039195, + "num_input_tokens_seen": 66430576, + "router_z_loss_mlp": 0.55615234, + "step": 802, + "time_per_iteration": 2.9610986709594727 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064973, + "balance_loss_mlp": 1.00970769, + "epoch": 0.154482493266641, + "flos": 621749849088.0, + "grad_norm": 0.04424510077845192, + "language_loss": 0.93353879, + "learning_rate": 0.0009599176747188469, + "loss": 0.94418848, + "num_input_tokens_seen": 66506480, + "router_z_loss_mlp": 0.55419922, + "step": 803, + "time_per_iteration": 2.883296251296997 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065036, + "balance_loss_mlp": 1.00981843, + "epoch": 0.15467487495190457, + "flos": 526720243968.0, + "grad_norm": 0.03833070581853241, + "language_loss": 0.84471631, + "learning_rate": 0.0009597953659553196, + "loss": 0.85536671, + "num_input_tokens_seen": 66577680, + "router_z_loss_mlp": 0.55371094, + "step": 804, + "time_per_iteration": 2.7128705978393555 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062393, + "balance_loss_mlp": 1.00712788, + "epoch": 0.15486725663716813, + "flos": 528760621056.0, + "grad_norm": 0.03896986919959599, + "language_loss": 0.90159577, + "learning_rate": 0.0009596728786833997, + "loss": 0.9122197, + "num_input_tokens_seen": 66648496, + "router_z_loss_mlp": 0.55419922, + "step": 805, + "time_per_iteration": 2.605398178100586 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062158, + "balance_loss_mlp": 1.00684452, + "epoch": 0.1550596383224317, + "flos": 1050280295424.0, + "grad_norm": 0.039312204875199507, + "language_loss": 0.90827858, + "learning_rate": 0.0009595502129506415, + "loss": 0.91890013, + "num_input_tokens_seen": 66735216, + "router_z_loss_mlp": 0.5546875, + "step": 806, + "time_per_iteration": 3.355556011199951 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062439, + "balance_loss_mlp": 1.00736439, + "epoch": 0.15525202000769528, + "flos": 614837458176.0, + "grad_norm": 0.03934214137038287, + "language_loss": 0.83726299, + "learning_rate": 0.0009594273688046678, + "loss": 0.8478874, + "num_input_tokens_seen": 66810672, + "router_z_loss_mlp": 0.55224609, + "step": 807, + "time_per_iteration": 2.765700101852417 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062118, + "balance_loss_mlp": 1.00728118, + "epoch": 0.15544440169295884, + "flos": 534103120128.0, + "grad_norm": 0.042258492962953934, + "language_loss": 0.86714661, + "learning_rate": 0.000959304346293171, + "loss": 0.8777678, + "num_input_tokens_seen": 66879824, + "router_z_loss_mlp": 0.54980469, + "step": 808, + "time_per_iteration": 2.6490986347198486 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064119, + "balance_loss_mlp": 1.00928247, + "epoch": 0.1556367833782224, + "flos": 645887331840.0, + "grad_norm": 0.047675746935091516, + "language_loss": 0.89139616, + "learning_rate": 0.0009591811454639125, + "loss": 0.90203738, + "num_input_tokens_seen": 66949424, + "router_z_loss_mlp": 0.54980469, + "step": 809, + "time_per_iteration": 2.7880568504333496 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059631, + "balance_loss_mlp": 1.00469911, + "epoch": 0.15582916506348596, + "flos": 544953368832.0, + "grad_norm": 0.05205155355433054, + "language_loss": 0.89500809, + "learning_rate": 0.0009590577663647234, + "loss": 0.90560436, + "num_input_tokens_seen": 67024000, + "router_z_loss_mlp": 0.55078125, + "step": 810, + "time_per_iteration": 2.743067741394043 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061907, + "balance_loss_mlp": 1.0068804, + "epoch": 0.15602154674874952, + "flos": 581215484160.0, + "grad_norm": 0.039153260843753375, + "language_loss": 0.87186325, + "learning_rate": 0.0009589342090435036, + "loss": 0.88248235, + "num_input_tokens_seen": 67100672, + "router_z_loss_mlp": 0.55175781, + "step": 811, + "time_per_iteration": 2.806425094604492 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106086, + "balance_loss_mlp": 1.00607169, + "epoch": 0.15621392843401308, + "flos": 536317496064.0, + "grad_norm": 0.04937652455074429, + "language_loss": 0.88453877, + "learning_rate": 0.0009588104735482223, + "loss": 0.89514732, + "num_input_tokens_seen": 67171584, + "router_z_loss_mlp": 0.54931641, + "step": 812, + "time_per_iteration": 2.647728204727173 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060077, + "balance_loss_mlp": 1.00538397, + "epoch": 0.15640631011927664, + "flos": 551982411264.0, + "grad_norm": 0.04402679292728805, + "language_loss": 0.85281312, + "learning_rate": 0.0009586865599269177, + "loss": 0.86341381, + "num_input_tokens_seen": 67240640, + "router_z_loss_mlp": 0.54833984, + "step": 813, + "time_per_iteration": 2.642218828201294 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061354, + "balance_loss_mlp": 1.0069474, + "epoch": 0.1565986918045402, + "flos": 638636658432.0, + "grad_norm": 0.0415768255708782, + "language_loss": 0.89702487, + "learning_rate": 0.0009585624682276977, + "loss": 0.90763843, + "num_input_tokens_seen": 67312976, + "router_z_loss_mlp": 0.54541016, + "step": 814, + "time_per_iteration": 2.7770931720733643 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01058453, + "balance_loss_mlp": 1.00366414, + "epoch": 0.15679107348980378, + "flos": 491782089984.0, + "grad_norm": 0.039213144049943555, + "language_loss": 0.88436091, + "learning_rate": 0.0009584381984987386, + "loss": 0.89494538, + "num_input_tokens_seen": 67378528, + "router_z_loss_mlp": 0.54931641, + "step": 815, + "time_per_iteration": 2.617560386657715 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061581, + "balance_loss_mlp": 1.00655353, + "epoch": 0.15698345517506734, + "flos": 531003187200.0, + "grad_norm": 0.030486806446719653, + "language_loss": 0.91117728, + "learning_rate": 0.0009583137507882864, + "loss": 0.92179304, + "num_input_tokens_seen": 67449728, + "router_z_loss_mlp": 0.55175781, + "step": 816, + "time_per_iteration": 2.6757051944732666 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060617, + "balance_loss_mlp": 1.00568497, + "epoch": 0.1571758368603309, + "flos": 547078316544.0, + "grad_norm": 0.03910336486934304, + "language_loss": 0.82217371, + "learning_rate": 0.000958189125144656, + "loss": 0.83277988, + "num_input_tokens_seen": 67520512, + "router_z_loss_mlp": 0.55078125, + "step": 817, + "time_per_iteration": 2.7065701484680176 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061392, + "balance_loss_mlp": 1.00655591, + "epoch": 0.15736821854559446, + "flos": 566744272896.0, + "grad_norm": 0.03730967846547413, + "language_loss": 0.89150202, + "learning_rate": 0.0009580643216162313, + "loss": 0.90211594, + "num_input_tokens_seen": 67592464, + "router_z_loss_mlp": 0.54980469, + "step": 818, + "time_per_iteration": 2.6849937438964844 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106252, + "balance_loss_mlp": 1.00792253, + "epoch": 0.15756060023085802, + "flos": 501954806784.0, + "grad_norm": 0.041127076818974775, + "language_loss": 0.80838168, + "learning_rate": 0.0009579393402514652, + "loss": 0.81900686, + "num_input_tokens_seen": 67658928, + "router_z_loss_mlp": 0.54736328, + "step": 819, + "time_per_iteration": 2.615342378616333 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060056, + "balance_loss_mlp": 1.00560164, + "epoch": 0.15775298191612158, + "flos": 520272502272.0, + "grad_norm": 0.037825026421493144, + "language_loss": 0.91941106, + "learning_rate": 0.0009578141810988801, + "loss": 0.93001157, + "num_input_tokens_seen": 67727936, + "router_z_loss_mlp": 0.54589844, + "step": 820, + "time_per_iteration": 2.6530544757843018 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061362, + "balance_loss_mlp": 1.00666904, + "epoch": 0.15794536360138514, + "flos": 467088584448.0, + "grad_norm": 0.039348813654249644, + "language_loss": 0.92238629, + "learning_rate": 0.0009576888442070668, + "loss": 0.93299985, + "num_input_tokens_seen": 67795488, + "router_z_loss_mlp": 0.54833984, + "step": 821, + "time_per_iteration": 2.5978658199310303 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062641, + "balance_loss_mlp": 1.00809062, + "epoch": 0.1581377452866487, + "flos": 518168941824.0, + "grad_norm": 0.03790806580601569, + "language_loss": 0.93657464, + "learning_rate": 0.0009575633296246854, + "loss": 0.94720107, + "num_input_tokens_seen": 67858896, + "router_z_loss_mlp": 0.546875, + "step": 822, + "time_per_iteration": 2.582139492034912 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061663, + "balance_loss_mlp": 1.00711334, + "epoch": 0.15833012697191226, + "flos": 550838284800.0, + "grad_norm": 0.03604802690546967, + "language_loss": 0.84146446, + "learning_rate": 0.0009574376374004652, + "loss": 0.85208106, + "num_input_tokens_seen": 67924864, + "router_z_loss_mlp": 0.546875, + "step": 823, + "time_per_iteration": 2.6182329654693604 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061584, + "balance_loss_mlp": 1.00703347, + "epoch": 0.15852250865717585, + "flos": 488467329024.0, + "grad_norm": 0.0382059884648543, + "language_loss": 0.82121176, + "learning_rate": 0.000957311767583204, + "loss": 0.83182758, + "num_input_tokens_seen": 67992912, + "router_z_loss_mlp": 0.546875, + "step": 824, + "time_per_iteration": 2.584266185760498 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01057571, + "balance_loss_mlp": 1.00531006, + "epoch": 0.1587148903424394, + "flos": 1312699441152.0, + "grad_norm": 0.00659207066158758, + "language_loss": 0.8207159, + "learning_rate": 0.0009571857202217691, + "loss": 0.83129162, + "num_input_tokens_seen": 68207408, + "router_z_loss_mlp": 0.5234375, + "step": 825, + "time_per_iteration": 4.734830856323242 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064159, + "balance_loss_mlp": 1.00965643, + "epoch": 0.15890727202770297, + "flos": 467833190400.0, + "grad_norm": 0.04624650490850591, + "language_loss": 0.92764026, + "learning_rate": 0.0009570594953650961, + "loss": 0.93828189, + "num_input_tokens_seen": 68270864, + "router_z_loss_mlp": 0.54638672, + "step": 826, + "time_per_iteration": 2.5117454528808594 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106388, + "balance_loss_mlp": 1.00937772, + "epoch": 0.15909965371296653, + "flos": 778607993088.0, + "grad_norm": 0.03976637787958364, + "language_loss": 0.81327987, + "learning_rate": 0.00095693309306219, + "loss": 0.8239187, + "num_input_tokens_seen": 68355408, + "router_z_loss_mlp": 0.54638672, + "step": 827, + "time_per_iteration": 3.1954681873321533 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060739, + "balance_loss_mlp": 1.00599849, + "epoch": 0.1592920353982301, + "flos": 1079964411648.0, + "grad_norm": 0.038150784713437476, + "language_loss": 0.89750922, + "learning_rate": 0.0009568065133621244, + "loss": 0.90811658, + "num_input_tokens_seen": 68437072, + "router_z_loss_mlp": 0.54882812, + "step": 828, + "time_per_iteration": 3.3355016708374023 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060884, + "balance_loss_mlp": 1.00642896, + "epoch": 0.15948441708349365, + "flos": 726890932992.0, + "grad_norm": 0.03986186218144037, + "language_loss": 0.85834098, + "learning_rate": 0.0009566797563140422, + "loss": 0.86894989, + "num_input_tokens_seen": 68511696, + "router_z_loss_mlp": 0.54589844, + "step": 829, + "time_per_iteration": 2.873845100402832 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059272, + "balance_loss_mlp": 1.00519884, + "epoch": 0.1596767987687572, + "flos": 580076215296.0, + "grad_norm": 0.03433333328837374, + "language_loss": 0.89395094, + "learning_rate": 0.0009565528219671547, + "loss": 0.90454364, + "num_input_tokens_seen": 68587488, + "router_z_loss_mlp": 0.54199219, + "step": 830, + "time_per_iteration": 2.9566032886505127 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063846, + "balance_loss_mlp": 1.00991619, + "epoch": 0.15986918045402077, + "flos": 530026256640.0, + "grad_norm": 0.037800776955081314, + "language_loss": 0.86586118, + "learning_rate": 0.0009564257103707418, + "loss": 0.87649965, + "num_input_tokens_seen": 68655760, + "router_z_loss_mlp": 0.54052734, + "step": 831, + "time_per_iteration": 2.6305205821990967 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062726, + "balance_loss_mlp": 1.00870061, + "epoch": 0.16006156213928435, + "flos": 575670796032.0, + "grad_norm": 0.04196239075383403, + "language_loss": 0.92502224, + "learning_rate": 0.0009562984215741533, + "loss": 0.93564951, + "num_input_tokens_seen": 68724560, + "router_z_loss_mlp": 0.54150391, + "step": 832, + "time_per_iteration": 2.6781210899353027 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061607, + "balance_loss_mlp": 1.00743783, + "epoch": 0.1602539438245479, + "flos": 516675839232.0, + "grad_norm": 0.039654673227061156, + "language_loss": 0.83729708, + "learning_rate": 0.0009561709556268065, + "loss": 0.84791321, + "num_input_tokens_seen": 68795440, + "router_z_loss_mlp": 0.54296875, + "step": 833, + "time_per_iteration": 2.732191801071167 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064816, + "balance_loss_mlp": 1.01021826, + "epoch": 0.16044632550981147, + "flos": 622162008576.0, + "grad_norm": 0.03600956841171521, + "language_loss": 0.95349514, + "learning_rate": 0.0009560433125781884, + "loss": 0.96414334, + "num_input_tokens_seen": 68868176, + "router_z_loss_mlp": 0.54736328, + "step": 834, + "time_per_iteration": 4.227160215377808 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063475, + "balance_loss_mlp": 1.008973, + "epoch": 0.16063870719507503, + "flos": 562128883200.0, + "grad_norm": 0.03652136008848007, + "language_loss": 0.94107795, + "learning_rate": 0.0009559154924778544, + "loss": 0.95171273, + "num_input_tokens_seen": 68939616, + "router_z_loss_mlp": 0.54638672, + "step": 835, + "time_per_iteration": 2.7238283157348633 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066589, + "balance_loss_mlp": 1.01251614, + "epoch": 0.1608310888803386, + "flos": 806561824512.0, + "grad_norm": 0.044196177378580975, + "language_loss": 0.86185992, + "learning_rate": 0.0009557874953754284, + "loss": 0.87252581, + "num_input_tokens_seen": 69016192, + "router_z_loss_mlp": 0.54199219, + "step": 836, + "time_per_iteration": 3.03965425491333 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063553, + "balance_loss_mlp": 1.00943184, + "epoch": 0.16102347056560215, + "flos": 601695065856.0, + "grad_norm": 0.04086380423696876, + "language_loss": 0.84961462, + "learning_rate": 0.0009556593213206038, + "loss": 0.86025023, + "num_input_tokens_seen": 69089360, + "router_z_loss_mlp": 0.54248047, + "step": 837, + "time_per_iteration": 2.714165687561035 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063546, + "balance_loss_mlp": 1.0095681, + "epoch": 0.1612158522508657, + "flos": 554615749632.0, + "grad_norm": 0.03942211179170501, + "language_loss": 0.88284755, + "learning_rate": 0.0009555309703631414, + "loss": 0.89348304, + "num_input_tokens_seen": 69161952, + "router_z_loss_mlp": 0.54101562, + "step": 838, + "time_per_iteration": 2.6616575717926025 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061318, + "balance_loss_mlp": 1.00729215, + "epoch": 0.16140823393612927, + "flos": 557018708736.0, + "grad_norm": 0.03970121061853926, + "language_loss": 0.88476837, + "learning_rate": 0.0009554024425528722, + "loss": 0.89538157, + "num_input_tokens_seen": 69232432, + "router_z_loss_mlp": 0.54150391, + "step": 839, + "time_per_iteration": 2.6778693199157715 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061761, + "balance_loss_mlp": 1.00792611, + "epoch": 0.16160061562139286, + "flos": 544909627392.0, + "grad_norm": 0.03616953348933095, + "language_loss": 0.90216744, + "learning_rate": 0.0009552737379396948, + "loss": 0.91278505, + "num_input_tokens_seen": 69297696, + "router_z_loss_mlp": 0.53955078, + "step": 840, + "time_per_iteration": 2.6190080642700195 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060215, + "balance_loss_mlp": 1.00638056, + "epoch": 0.16179299730665642, + "flos": 605007881472.0, + "grad_norm": 0.03485432207779616, + "language_loss": 0.88917094, + "learning_rate": 0.0009551448565735767, + "loss": 0.89977312, + "num_input_tokens_seen": 69373888, + "router_z_loss_mlp": 0.53955078, + "step": 841, + "time_per_iteration": 2.771730422973633 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059839, + "balance_loss_mlp": 1.00624251, + "epoch": 0.16198537899191998, + "flos": 788552275968.0, + "grad_norm": 0.040424272174261144, + "language_loss": 0.855564, + "learning_rate": 0.0009550157985045543, + "loss": 0.86616236, + "num_input_tokens_seen": 69449984, + "router_z_loss_mlp": 0.53710938, + "step": 842, + "time_per_iteration": 3.014448642730713 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063416, + "balance_loss_mlp": 1.00986671, + "epoch": 0.16217776067718354, + "flos": 520830470400.0, + "grad_norm": 0.03210449059239548, + "language_loss": 0.9010545, + "learning_rate": 0.0009548865637827321, + "loss": 0.91168869, + "num_input_tokens_seen": 69522736, + "router_z_loss_mlp": 0.53662109, + "step": 843, + "time_per_iteration": 2.663733959197998 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060664, + "balance_loss_mlp": 1.00725794, + "epoch": 0.1623701423624471, + "flos": 506255246592.0, + "grad_norm": 0.04236042945807781, + "language_loss": 0.91279781, + "learning_rate": 0.0009547571524582838, + "loss": 0.92340446, + "num_input_tokens_seen": 69587184, + "router_z_loss_mlp": 0.53515625, + "step": 844, + "time_per_iteration": 2.5841143131256104 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061607, + "balance_loss_mlp": 1.00848722, + "epoch": 0.16256252404771065, + "flos": 498157900032.0, + "grad_norm": 0.043042899099755685, + "language_loss": 0.93573415, + "learning_rate": 0.0009546275645814512, + "loss": 0.94635028, + "num_input_tokens_seen": 69656560, + "router_z_loss_mlp": 0.53222656, + "step": 845, + "time_per_iteration": 2.601743221282959 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064884, + "balance_loss_mlp": 1.01152599, + "epoch": 0.16275490573297421, + "flos": 503287516416.0, + "grad_norm": 0.046422900850994125, + "language_loss": 0.90658545, + "learning_rate": 0.0009544978002025446, + "loss": 0.9172343, + "num_input_tokens_seen": 69723872, + "router_z_loss_mlp": 0.53466797, + "step": 846, + "time_per_iteration": 2.582463502883911 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062884, + "balance_loss_mlp": 1.00957346, + "epoch": 0.16294728741823777, + "flos": 508354916352.0, + "grad_norm": 0.03474620131823351, + "language_loss": 0.88017273, + "learning_rate": 0.0009543678593719434, + "loss": 0.89080155, + "num_input_tokens_seen": 69795504, + "router_z_loss_mlp": 0.53417969, + "step": 847, + "time_per_iteration": 2.7039546966552734 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067088, + "balance_loss_mlp": 1.01334834, + "epoch": 0.16313966910350133, + "flos": 510757875456.0, + "grad_norm": 0.031134263506057067, + "language_loss": 0.88570058, + "learning_rate": 0.0009542377421400945, + "loss": 0.89637142, + "num_input_tokens_seen": 69873408, + "router_z_loss_mlp": 0.53857422, + "step": 848, + "time_per_iteration": 2.79311203956604 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061983, + "balance_loss_mlp": 1.00810015, + "epoch": 0.16333205078876492, + "flos": 545057381376.0, + "grad_norm": 0.03805815068737175, + "language_loss": 0.84448338, + "learning_rate": 0.0009541074485575145, + "loss": 0.85510319, + "num_input_tokens_seen": 69944112, + "router_z_loss_mlp": 0.54003906, + "step": 849, + "time_per_iteration": 2.714644193649292 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106829, + "balance_loss_mlp": 1.01450312, + "epoch": 0.16352443247402848, + "flos": 508712640768.0, + "grad_norm": 0.03447226436126556, + "language_loss": 0.93184924, + "learning_rate": 0.0009539769786747874, + "loss": 0.94253218, + "num_input_tokens_seen": 70012288, + "router_z_loss_mlp": 0.5390625, + "step": 850, + "time_per_iteration": 2.5857110023498535 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070929, + "balance_loss_mlp": 1.01709449, + "epoch": 0.16371681415929204, + "flos": 543223084032.0, + "grad_norm": 0.036141614394747515, + "language_loss": 0.82550752, + "learning_rate": 0.0009538463325425665, + "loss": 0.83621687, + "num_input_tokens_seen": 70086560, + "router_z_loss_mlp": 0.53955078, + "step": 851, + "time_per_iteration": 2.7186405658721924 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066598, + "balance_loss_mlp": 1.01242912, + "epoch": 0.1639091958445556, + "flos": 521761714176.0, + "grad_norm": 0.03784697093976771, + "language_loss": 0.87203169, + "learning_rate": 0.0009537155102115728, + "loss": 0.8826977, + "num_input_tokens_seen": 70153968, + "router_z_loss_mlp": 0.54296875, + "step": 852, + "time_per_iteration": 2.5761775970458984 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061784, + "balance_loss_mlp": 1.00771022, + "epoch": 0.16410157752981916, + "flos": 548482957824.0, + "grad_norm": 0.03731294741121226, + "language_loss": 0.85278255, + "learning_rate": 0.0009535845117325961, + "loss": 0.8634004, + "num_input_tokens_seen": 70222496, + "router_z_loss_mlp": 0.54199219, + "step": 853, + "time_per_iteration": 2.6968846321105957 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065026, + "balance_loss_mlp": 1.01085758, + "epoch": 0.16429395921508272, + "flos": 584026712064.0, + "grad_norm": 0.031860977478103375, + "language_loss": 0.9423098, + "learning_rate": 0.0009534533371564946, + "loss": 0.95296007, + "num_input_tokens_seen": 70301680, + "router_z_loss_mlp": 0.54296875, + "step": 854, + "time_per_iteration": 2.7640349864959717 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106098, + "balance_loss_mlp": 1.00709713, + "epoch": 0.16448634090034628, + "flos": 531962621184.0, + "grad_norm": 0.03950290113288642, + "language_loss": 0.89868152, + "learning_rate": 0.0009533219865341949, + "loss": 0.90929133, + "num_input_tokens_seen": 70371152, + "router_z_loss_mlp": 0.54003906, + "step": 855, + "time_per_iteration": 2.6025009155273438 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060489, + "balance_loss_mlp": 1.00693989, + "epoch": 0.16467872258560984, + "flos": 492961209600.0, + "grad_norm": 0.03645156199748424, + "language_loss": 0.87602645, + "learning_rate": 0.0009531904599166916, + "loss": 0.88663131, + "num_input_tokens_seen": 70440832, + "router_z_loss_mlp": 0.53662109, + "step": 856, + "time_per_iteration": 2.656604290008545 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060758, + "balance_loss_mlp": 1.00730467, + "epoch": 0.16487110427087343, + "flos": 507260367360.0, + "grad_norm": 0.04426557796634758, + "language_loss": 0.86560714, + "learning_rate": 0.0009530587573550478, + "loss": 0.87621474, + "num_input_tokens_seen": 70507424, + "router_z_loss_mlp": 0.53564453, + "step": 857, + "time_per_iteration": 2.610445261001587 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01056503, + "balance_loss_mlp": 1.00538635, + "epoch": 0.16506348595613698, + "flos": 1436111555328.0, + "grad_norm": 0.010874217326465607, + "language_loss": 0.74319386, + "learning_rate": 0.0009529268789003953, + "loss": 0.75375891, + "num_input_tokens_seen": 70742320, + "router_z_loss_mlp": 0.51171875, + "step": 858, + "time_per_iteration": 4.991516590118408 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060779, + "balance_loss_mlp": 1.00718212, + "epoch": 0.16525586764140054, + "flos": 478090477824.0, + "grad_norm": 0.04454190836652637, + "language_loss": 0.91544032, + "learning_rate": 0.0009527948246039337, + "loss": 0.9260481, + "num_input_tokens_seen": 70808400, + "router_z_loss_mlp": 0.53710938, + "step": 859, + "time_per_iteration": 2.538290500640869 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01058142, + "balance_loss_mlp": 1.00425971, + "epoch": 0.1654482493266641, + "flos": 882541767168.0, + "grad_norm": 0.03991834039284953, + "language_loss": 0.88867122, + "learning_rate": 0.000952662594516931, + "loss": 0.89925265, + "num_input_tokens_seen": 70886192, + "router_z_loss_mlp": 0.54003906, + "step": 860, + "time_per_iteration": 3.083786964416504 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065202, + "balance_loss_mlp": 1.01122451, + "epoch": 0.16564063101192766, + "flos": 628106217216.0, + "grad_norm": 0.03630731527649873, + "language_loss": 0.87934124, + "learning_rate": 0.0009525301886907234, + "loss": 0.88999331, + "num_input_tokens_seen": 70964816, + "router_z_loss_mlp": 0.54101562, + "step": 861, + "time_per_iteration": 2.8606412410736084 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062775, + "balance_loss_mlp": 1.00884438, + "epoch": 0.16583301269719122, + "flos": 562593532416.0, + "grad_norm": 0.03632506699489255, + "language_loss": 0.8885988, + "learning_rate": 0.0009523976071767155, + "loss": 0.89922649, + "num_input_tokens_seen": 71037456, + "router_z_loss_mlp": 0.54052734, + "step": 862, + "time_per_iteration": 2.651202440261841 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062989, + "balance_loss_mlp": 1.0094403, + "epoch": 0.16602539438245478, + "flos": 568984893696.0, + "grad_norm": 0.03883194498572106, + "language_loss": 0.88789731, + "learning_rate": 0.00095226485002638, + "loss": 0.8985272, + "num_input_tokens_seen": 71111872, + "router_z_loss_mlp": 0.53662109, + "step": 863, + "time_per_iteration": 2.798125982284546 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063787, + "balance_loss_mlp": 1.01019073, + "epoch": 0.16621777606771834, + "flos": 576022684416.0, + "grad_norm": 0.03638934937563812, + "language_loss": 0.89892161, + "learning_rate": 0.0009521319172912576, + "loss": 0.90955949, + "num_input_tokens_seen": 71187808, + "router_z_loss_mlp": 0.53710938, + "step": 864, + "time_per_iteration": 4.098716974258423 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105997, + "balance_loss_mlp": 1.00632548, + "epoch": 0.16641015775298193, + "flos": 515598786816.0, + "grad_norm": 0.037169751839881825, + "language_loss": 0.96108532, + "learning_rate": 0.0009519988090229579, + "loss": 0.97168505, + "num_input_tokens_seen": 71261728, + "router_z_loss_mlp": 0.53759766, + "step": 865, + "time_per_iteration": 2.659381628036499 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068447, + "balance_loss_mlp": 1.01489806, + "epoch": 0.1666025394382455, + "flos": 622850234112.0, + "grad_norm": 0.04388029559541895, + "language_loss": 0.88811028, + "learning_rate": 0.0009518655252731576, + "loss": 0.89879477, + "num_input_tokens_seen": 71338352, + "router_z_loss_mlp": 0.53662109, + "step": 866, + "time_per_iteration": 2.738511323928833 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061925, + "balance_loss_mlp": 1.00880551, + "epoch": 0.16679492112350905, + "flos": 549933285888.0, + "grad_norm": 0.03352631932153436, + "language_loss": 0.91113746, + "learning_rate": 0.0009517320660936022, + "loss": 0.92175674, + "num_input_tokens_seen": 71416544, + "router_z_loss_mlp": 0.53222656, + "step": 867, + "time_per_iteration": 2.7755699157714844 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066316, + "balance_loss_mlp": 1.01343453, + "epoch": 0.1669873028087726, + "flos": 666866555904.0, + "grad_norm": 0.04051359913494383, + "language_loss": 0.84396493, + "learning_rate": 0.0009515984315361051, + "loss": 0.85462809, + "num_input_tokens_seen": 71494080, + "router_z_loss_mlp": 0.52978516, + "step": 868, + "time_per_iteration": 2.8502533435821533 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062608, + "balance_loss_mlp": 1.00944042, + "epoch": 0.16717968449403617, + "flos": 539604066816.0, + "grad_norm": 0.03969494402961726, + "language_loss": 0.88029611, + "learning_rate": 0.000951464621652548, + "loss": 0.89092225, + "num_input_tokens_seen": 71562672, + "router_z_loss_mlp": 0.53271484, + "step": 869, + "time_per_iteration": 2.6079800128936768 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065881, + "balance_loss_mlp": 1.01233244, + "epoch": 0.16737206617929973, + "flos": 531279253248.0, + "grad_norm": 0.03349656106003216, + "language_loss": 0.7990135, + "learning_rate": 0.0009513306364948804, + "loss": 0.80967236, + "num_input_tokens_seen": 71641904, + "router_z_loss_mlp": 0.53662109, + "step": 870, + "time_per_iteration": 2.824232578277588 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106371, + "balance_loss_mlp": 1.00987494, + "epoch": 0.1675644478645633, + "flos": 481757127168.0, + "grad_norm": 0.04264569815750397, + "language_loss": 0.90229708, + "learning_rate": 0.0009511964761151197, + "loss": 0.91293418, + "num_input_tokens_seen": 71709616, + "router_z_loss_mlp": 0.53955078, + "step": 871, + "time_per_iteration": 2.6326816082000732 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106642, + "balance_loss_mlp": 1.01344323, + "epoch": 0.16775682954982685, + "flos": 495542058240.0, + "grad_norm": 0.04000245460937008, + "language_loss": 0.91825569, + "learning_rate": 0.0009510621405653521, + "loss": 0.92891991, + "num_input_tokens_seen": 71776592, + "router_z_loss_mlp": 0.53076172, + "step": 872, + "time_per_iteration": 2.5802783966064453 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074346, + "balance_loss_mlp": 1.02151191, + "epoch": 0.1679492112350904, + "flos": 753406096896.0, + "grad_norm": 0.04130745072346603, + "language_loss": 0.85908926, + "learning_rate": 0.0009509276298977309, + "loss": 0.86983275, + "num_input_tokens_seen": 71856352, + "router_z_loss_mlp": 0.52929688, + "step": 873, + "time_per_iteration": 2.9676413536071777 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069963, + "balance_loss_mlp": 1.01689136, + "epoch": 0.168141592920354, + "flos": 1137733583616.0, + "grad_norm": 0.036676349776393134, + "language_loss": 0.82925022, + "learning_rate": 0.0009507929441644778, + "loss": 0.83994985, + "num_input_tokens_seen": 71948480, + "router_z_loss_mlp": 0.53173828, + "step": 874, + "time_per_iteration": 3.5441927909851074 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062646, + "balance_loss_mlp": 1.00924039, + "epoch": 0.16833397460561755, + "flos": 633554674176.0, + "grad_norm": 0.03715311549034911, + "language_loss": 0.86810201, + "learning_rate": 0.0009506580834178826, + "loss": 0.87872851, + "num_input_tokens_seen": 72019200, + "router_z_loss_mlp": 0.53515625, + "step": 875, + "time_per_iteration": 2.767840623855591 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106879, + "balance_loss_mlp": 1.01524162, + "epoch": 0.1685263562908811, + "flos": 542543606784.0, + "grad_norm": 0.041322978640758234, + "language_loss": 0.92533737, + "learning_rate": 0.0009505230477103028, + "loss": 0.93602526, + "num_input_tokens_seen": 72088672, + "router_z_loss_mlp": 0.53662109, + "step": 876, + "time_per_iteration": 2.68626070022583 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064685, + "balance_loss_mlp": 1.01151776, + "epoch": 0.16871873797614467, + "flos": 620486158848.0, + "grad_norm": 0.04979097271806245, + "language_loss": 0.82312369, + "learning_rate": 0.0009503878370941641, + "loss": 0.83377057, + "num_input_tokens_seen": 72159952, + "router_z_loss_mlp": 0.53271484, + "step": 877, + "time_per_iteration": 2.738828182220459 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067432, + "balance_loss_mlp": 1.01455081, + "epoch": 0.16891111966140823, + "flos": 607456527360.0, + "grad_norm": 0.048240798926105125, + "language_loss": 0.90597415, + "learning_rate": 0.0009502524516219595, + "loss": 0.91664839, + "num_input_tokens_seen": 72231648, + "router_z_loss_mlp": 0.52978516, + "step": 878, + "time_per_iteration": 2.7533464431762695 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065414, + "balance_loss_mlp": 1.01234174, + "epoch": 0.1691035013466718, + "flos": 553406494464.0, + "grad_norm": 0.04285435284136928, + "language_loss": 0.91275579, + "learning_rate": 0.0009501168913462506, + "loss": 0.92340994, + "num_input_tokens_seen": 72298608, + "router_z_loss_mlp": 0.53173828, + "step": 879, + "time_per_iteration": 2.6498849391937256 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106115, + "balance_loss_mlp": 1.00946045, + "epoch": 0.16929588303193535, + "flos": 1479308427264.0, + "grad_norm": 0.010969186313753012, + "language_loss": 0.79121923, + "learning_rate": 0.0009499811563196665, + "loss": 0.80183077, + "num_input_tokens_seen": 72525312, + "router_z_loss_mlp": 0.51757812, + "step": 880, + "time_per_iteration": 4.8127734661102295 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065784, + "balance_loss_mlp": 1.01228285, + "epoch": 0.1694882647171989, + "flos": 927848024064.0, + "grad_norm": 0.04254449001590413, + "language_loss": 0.86211771, + "learning_rate": 0.0009498452465949042, + "loss": 0.87277561, + "num_input_tokens_seen": 72612976, + "router_z_loss_mlp": 0.53613281, + "step": 881, + "time_per_iteration": 3.242352247238159 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059657, + "balance_loss_mlp": 1.00668061, + "epoch": 0.1696806464024625, + "flos": 547152193536.0, + "grad_norm": 0.03842920637304405, + "language_loss": 0.92758489, + "learning_rate": 0.0009497091622247285, + "loss": 0.93818152, + "num_input_tokens_seen": 72686800, + "router_z_loss_mlp": 0.53076172, + "step": 882, + "time_per_iteration": 2.7538321018218994 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066328, + "balance_loss_mlp": 1.01363766, + "epoch": 0.16987302808772606, + "flos": 530295519744.0, + "grad_norm": 0.04346709327253658, + "language_loss": 0.94739175, + "learning_rate": 0.0009495729032619723, + "loss": 0.95805502, + "num_input_tokens_seen": 72759360, + "router_z_loss_mlp": 0.52783203, + "step": 883, + "time_per_iteration": 2.681851863861084 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061141, + "balance_loss_mlp": 1.00830746, + "epoch": 0.17006540977298962, + "flos": 756479784960.0, + "grad_norm": 0.03707996109728333, + "language_loss": 0.85065424, + "learning_rate": 0.0009494364697595354, + "loss": 0.86126566, + "num_input_tokens_seen": 72831424, + "router_z_loss_mlp": 0.52929688, + "step": 884, + "time_per_iteration": 2.886613607406616 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01058078, + "balance_loss_mlp": 1.00495851, + "epoch": 0.17025779145825318, + "flos": 559875623424.0, + "grad_norm": 0.04262534374301406, + "language_loss": 0.90753883, + "learning_rate": 0.0009492998617703867, + "loss": 0.91811961, + "num_input_tokens_seen": 72901536, + "router_z_loss_mlp": 0.53222656, + "step": 885, + "time_per_iteration": 2.7197954654693604 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069962, + "balance_loss_mlp": 1.01684284, + "epoch": 0.17045017314351674, + "flos": 513217214976.0, + "grad_norm": 0.04472607646913617, + "language_loss": 0.89151132, + "learning_rate": 0.0009491630793475619, + "loss": 0.90221095, + "num_input_tokens_seen": 72970480, + "router_z_loss_mlp": 0.53222656, + "step": 886, + "time_per_iteration": 2.6023643016815186 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059759, + "balance_loss_mlp": 1.00706899, + "epoch": 0.1706425548287803, + "flos": 510013269504.0, + "grad_norm": 0.03690999998020265, + "language_loss": 0.86250949, + "learning_rate": 0.0009490261225441643, + "loss": 0.87310708, + "num_input_tokens_seen": 73053376, + "router_z_loss_mlp": 0.52783203, + "step": 887, + "time_per_iteration": 2.8811516761779785 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070816, + "balance_loss_mlp": 1.01845872, + "epoch": 0.17083493651404386, + "flos": 718715818752.0, + "grad_norm": 0.037520519160069404, + "language_loss": 0.91723603, + "learning_rate": 0.0009488889914133656, + "loss": 0.92794418, + "num_input_tokens_seen": 73136032, + "router_z_loss_mlp": 0.52441406, + "step": 888, + "time_per_iteration": 2.983920097351074 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067559, + "balance_loss_mlp": 1.01515496, + "epoch": 0.17102731819930742, + "flos": 560201266944.0, + "grad_norm": 0.034570155262309, + "language_loss": 0.90050644, + "learning_rate": 0.0009487516860084047, + "loss": 0.91118205, + "num_input_tokens_seen": 73208544, + "router_z_loss_mlp": 0.52490234, + "step": 889, + "time_per_iteration": 2.739945888519287 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061028, + "balance_loss_mlp": 1.0078603, + "epoch": 0.17121969988457098, + "flos": 495765634560.0, + "grad_norm": 0.04354558177795279, + "language_loss": 0.9033885, + "learning_rate": 0.0009486142063825884, + "loss": 0.91399872, + "num_input_tokens_seen": 73274336, + "router_z_loss_mlp": 0.53271484, + "step": 890, + "time_per_iteration": 2.541325569152832 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107373, + "balance_loss_mlp": 1.02223206, + "epoch": 0.17141208156983456, + "flos": 1552108723968.0, + "grad_norm": 0.01766408052426257, + "language_loss": 0.72426212, + "learning_rate": 0.0009484765525892909, + "loss": 0.73499948, + "num_input_tokens_seen": 73506320, + "router_z_loss_mlp": 0.515625, + "step": 891, + "time_per_iteration": 4.968579053878784 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01058616, + "balance_loss_mlp": 1.00568736, + "epoch": 0.17160446325509812, + "flos": 620700986880.0, + "grad_norm": 0.037544702591063864, + "language_loss": 0.91210532, + "learning_rate": 0.0009483387246819542, + "loss": 0.92269152, + "num_input_tokens_seen": 73578048, + "router_z_loss_mlp": 0.53027344, + "step": 892, + "time_per_iteration": 2.7970938682556152 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071655, + "balance_loss_mlp": 1.0209198, + "epoch": 0.17179684494036168, + "flos": 1384695839232.0, + "grad_norm": 0.01601076320839161, + "language_loss": 0.82285583, + "learning_rate": 0.0009482007227140877, + "loss": 0.83357239, + "num_input_tokens_seen": 73798640, + "router_z_loss_mlp": 0.5078125, + "step": 893, + "time_per_iteration": 4.629605054855347 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066274, + "balance_loss_mlp": 1.01386988, + "epoch": 0.17198922662562524, + "flos": 493642632192.0, + "grad_norm": 0.03763004911158334, + "language_loss": 0.90241146, + "learning_rate": 0.0009480625467392688, + "loss": 0.91307414, + "num_input_tokens_seen": 73867328, + "router_z_loss_mlp": 0.52490234, + "step": 894, + "time_per_iteration": 2.6142358779907227 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068771, + "balance_loss_mlp": 1.01822662, + "epoch": 0.1721816083108888, + "flos": 1461488428800.0, + "grad_norm": 0.016749035753296605, + "language_loss": 0.77994668, + "learning_rate": 0.0009479241968111421, + "loss": 0.79063439, + "num_input_tokens_seen": 74093376, + "router_z_loss_mlp": 0.50585938, + "step": 895, + "time_per_iteration": 4.811494827270508 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112065, + "balance_loss_mlp": 1.06719661, + "epoch": 0.17237398999615236, + "flos": 529205828352.0, + "grad_norm": 0.05241044192650153, + "language_loss": 0.88738441, + "learning_rate": 0.0009477856729834196, + "loss": 0.89859092, + "num_input_tokens_seen": 74169136, + "router_z_loss_mlp": 0.53564453, + "step": 896, + "time_per_iteration": 2.7389612197875977 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066584, + "balance_loss_mlp": 1.01446557, + "epoch": 0.17256637168141592, + "flos": 605027323392.0, + "grad_norm": 0.03860455021635393, + "language_loss": 0.90989411, + "learning_rate": 0.0009476469753098809, + "loss": 0.92055988, + "num_input_tokens_seen": 74236912, + "router_z_loss_mlp": 0.52197266, + "step": 897, + "time_per_iteration": 2.7175238132476807 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077196, + "balance_loss_mlp": 1.02507758, + "epoch": 0.17275875336667948, + "flos": 510694692096.0, + "grad_norm": 0.040412661310783936, + "language_loss": 0.88453948, + "learning_rate": 0.0009475081038443738, + "loss": 0.89531147, + "num_input_tokens_seen": 74305968, + "router_z_loss_mlp": 0.52197266, + "step": 898, + "time_per_iteration": 2.6398110389709473 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079459, + "balance_loss_mlp": 1.02753115, + "epoch": 0.17295113505194307, + "flos": 666502028544.0, + "grad_norm": 0.045107808798334564, + "language_loss": 0.87902451, + "learning_rate": 0.0009473690586408124, + "loss": 0.88981915, + "num_input_tokens_seen": 74384144, + "router_z_loss_mlp": 0.52001953, + "step": 899, + "time_per_iteration": 2.817730665206909 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071678, + "balance_loss_mlp": 1.01965487, + "epoch": 0.17314351673720663, + "flos": 556432550400.0, + "grad_norm": 0.03870851432877784, + "language_loss": 0.87576568, + "learning_rate": 0.0009472298397531792, + "loss": 0.88648236, + "num_input_tokens_seen": 74455040, + "router_z_loss_mlp": 0.52099609, + "step": 900, + "time_per_iteration": 2.6932764053344727 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061802, + "balance_loss_mlp": 1.00892079, + "epoch": 0.17333589842247019, + "flos": 504607587072.0, + "grad_norm": 0.03631909976073519, + "language_loss": 0.87174571, + "learning_rate": 0.0009470904472355235, + "loss": 0.88236374, + "num_input_tokens_seen": 74525248, + "router_z_loss_mlp": 0.52978516, + "step": 901, + "time_per_iteration": 2.669405460357666 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099242, + "balance_loss_mlp": 1.04593205, + "epoch": 0.17352828010773375, + "flos": 557351155200.0, + "grad_norm": 0.04839261993488341, + "language_loss": 0.80976391, + "learning_rate": 0.0009469508811419626, + "loss": 0.82075632, + "num_input_tokens_seen": 74597328, + "router_z_loss_mlp": 0.53417969, + "step": 902, + "time_per_iteration": 2.7412211894989014 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083992, + "balance_loss_mlp": 1.033638, + "epoch": 0.1737206617929973, + "flos": 1557794363136.0, + "grad_norm": 0.02136399149953286, + "language_loss": 0.7161383, + "learning_rate": 0.0009468111415266806, + "loss": 0.72697818, + "num_input_tokens_seen": 74819664, + "router_z_loss_mlp": 0.50390625, + "step": 903, + "time_per_iteration": 4.800720930099487 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075494, + "balance_loss_mlp": 1.02318478, + "epoch": 0.17391304347826086, + "flos": 517756782336.0, + "grad_norm": 0.04178806719411302, + "language_loss": 0.85797513, + "learning_rate": 0.0009466712284439292, + "loss": 0.86873007, + "num_input_tokens_seen": 74896224, + "router_z_loss_mlp": 0.52392578, + "step": 904, + "time_per_iteration": 2.7409780025482178 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076244, + "balance_loss_mlp": 1.02360141, + "epoch": 0.17410542516352442, + "flos": 542161582848.0, + "grad_norm": 0.043268311729831165, + "language_loss": 0.90273786, + "learning_rate": 0.0009465311419480276, + "loss": 0.91350031, + "num_input_tokens_seen": 74966560, + "router_z_loss_mlp": 0.52734375, + "step": 905, + "time_per_iteration": 2.7310986518859863 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068245, + "balance_loss_mlp": 1.01526833, + "epoch": 0.17429780684878798, + "flos": 625082106624.0, + "grad_norm": 0.0375699532684124, + "language_loss": 0.89484948, + "learning_rate": 0.0009463908820933622, + "loss": 0.905532, + "num_input_tokens_seen": 75045248, + "router_z_loss_mlp": 0.53076172, + "step": 906, + "time_per_iteration": 2.8575551509857178 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086696, + "balance_loss_mlp": 1.03281319, + "epoch": 0.17449018853405157, + "flos": 576849915648.0, + "grad_norm": 0.04286783530345041, + "language_loss": 0.83513701, + "learning_rate": 0.0009462504489343868, + "loss": 0.84600401, + "num_input_tokens_seen": 75123952, + "router_z_loss_mlp": 0.54003906, + "step": 907, + "time_per_iteration": 2.83085036277771 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066078, + "balance_loss_mlp": 1.0128628, + "epoch": 0.17468257021931513, + "flos": 534773849088.0, + "grad_norm": 0.0408315501053547, + "language_loss": 0.90177906, + "learning_rate": 0.0009461098425256222, + "loss": 0.91243982, + "num_input_tokens_seen": 75191728, + "router_z_loss_mlp": 0.53320312, + "step": 908, + "time_per_iteration": 2.6000654697418213 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075411, + "balance_loss_mlp": 1.02257717, + "epoch": 0.1748749519045787, + "flos": 541809694464.0, + "grad_norm": 0.0381088809784924, + "language_loss": 0.87053907, + "learning_rate": 0.0009459690629216567, + "loss": 0.88129318, + "num_input_tokens_seen": 75262224, + "router_z_loss_mlp": 0.52929688, + "step": 909, + "time_per_iteration": 2.622178316116333 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080252, + "balance_loss_mlp": 1.02770495, + "epoch": 0.17506733358984225, + "flos": 499627670016.0, + "grad_norm": 0.039096197570908604, + "language_loss": 0.88898331, + "learning_rate": 0.0009458281101771457, + "loss": 0.89978582, + "num_input_tokens_seen": 75329760, + "router_z_loss_mlp": 0.52636719, + "step": 910, + "time_per_iteration": 2.5964770317077637 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064818, + "balance_loss_mlp": 1.01217556, + "epoch": 0.1752597152751058, + "flos": 624133366272.0, + "grad_norm": 0.035444142957055544, + "language_loss": 0.83730716, + "learning_rate": 0.0009456869843468122, + "loss": 0.84795535, + "num_input_tokens_seen": 75407920, + "router_z_loss_mlp": 0.52734375, + "step": 911, + "time_per_iteration": 2.834584951400757 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059336, + "balance_loss_mlp": 1.00650251, + "epoch": 0.17545209696036937, + "flos": 521994038784.0, + "grad_norm": 0.04587594362499167, + "language_loss": 0.79429859, + "learning_rate": 0.0009455456854854459, + "loss": 0.80489194, + "num_input_tokens_seen": 75476752, + "router_z_loss_mlp": 0.52929688, + "step": 912, + "time_per_iteration": 2.627058744430542 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107498, + "balance_loss_mlp": 1.0219084, + "epoch": 0.17564447864563293, + "flos": 462946592256.0, + "grad_norm": 0.044462507375804226, + "language_loss": 0.85522115, + "learning_rate": 0.0009454042136479039, + "loss": 0.86597091, + "num_input_tokens_seen": 75542944, + "router_z_loss_mlp": 0.53173828, + "step": 913, + "time_per_iteration": 2.562453031539917 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106477, + "balance_loss_mlp": 1.01250815, + "epoch": 0.1758368603308965, + "flos": 481618121472.0, + "grad_norm": 0.03599423435064716, + "language_loss": 0.84144086, + "learning_rate": 0.0009452625688891103, + "loss": 0.85208857, + "num_input_tokens_seen": 75609840, + "router_z_loss_mlp": 0.5234375, + "step": 914, + "time_per_iteration": 2.6025402545928955 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063728, + "balance_loss_mlp": 1.0137558, + "epoch": 0.17602924201616005, + "flos": 1482087574272.0, + "grad_norm": 0.013260252544834742, + "language_loss": 0.78734738, + "learning_rate": 0.0009451207512640567, + "loss": 0.79798466, + "num_input_tokens_seen": 75819312, + "router_z_loss_mlp": 0.49902344, + "step": 915, + "time_per_iteration": 4.572151184082031 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107428, + "balance_loss_mlp": 1.0219233, + "epoch": 0.17622162370142364, + "flos": 603471037440.0, + "grad_norm": 0.044830704586910027, + "language_loss": 0.94022703, + "learning_rate": 0.0009449787608278015, + "loss": 0.95096982, + "num_input_tokens_seen": 75893984, + "router_z_loss_mlp": 0.52441406, + "step": 916, + "time_per_iteration": 2.731264114379883 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062837, + "balance_loss_mlp": 1.0104804, + "epoch": 0.1764140053866872, + "flos": 443606279424.0, + "grad_norm": 0.0370205772569368, + "language_loss": 0.92972034, + "learning_rate": 0.0009448365976354704, + "loss": 0.94034874, + "num_input_tokens_seen": 75958944, + "router_z_loss_mlp": 0.52441406, + "step": 917, + "time_per_iteration": 2.478041648864746 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073342, + "balance_loss_mlp": 1.0204134, + "epoch": 0.17660638707195075, + "flos": 501592224768.0, + "grad_norm": 0.047363321454448416, + "language_loss": 0.907022, + "learning_rate": 0.0009446942617422558, + "loss": 0.91775542, + "num_input_tokens_seen": 76024240, + "router_z_loss_mlp": 0.53027344, + "step": 918, + "time_per_iteration": 2.5698564052581787 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060825, + "balance_loss_mlp": 1.00789583, + "epoch": 0.17679876875721431, + "flos": 539984145408.0, + "grad_norm": 0.03732253291641402, + "language_loss": 0.86447889, + "learning_rate": 0.0009445517532034176, + "loss": 0.87508708, + "num_input_tokens_seen": 76095264, + "router_z_loss_mlp": 0.53027344, + "step": 919, + "time_per_iteration": 2.6916563510894775 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062144, + "balance_loss_mlp": 1.00926292, + "epoch": 0.17699115044247787, + "flos": 498715868160.0, + "grad_norm": 0.04444616550081301, + "language_loss": 0.8994987, + "learning_rate": 0.0009444090720742824, + "loss": 0.91012013, + "num_input_tokens_seen": 76163520, + "router_z_loss_mlp": 0.52978516, + "step": 920, + "time_per_iteration": 2.5798380374908447 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069798, + "balance_loss_mlp": 1.01706016, + "epoch": 0.17718353212774143, + "flos": 663916322304.0, + "grad_norm": 0.04662040468857239, + "language_loss": 0.89399016, + "learning_rate": 0.0009442662184102439, + "loss": 0.90468818, + "num_input_tokens_seen": 76233760, + "router_z_loss_mlp": 0.52832031, + "step": 921, + "time_per_iteration": 2.755929708480835 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064095, + "balance_loss_mlp": 1.01164341, + "epoch": 0.177375913813005, + "flos": 583848822528.0, + "grad_norm": 0.03479566109485236, + "language_loss": 0.88455689, + "learning_rate": 0.000944123192266763, + "loss": 0.89519787, + "num_input_tokens_seen": 76310704, + "router_z_loss_mlp": 0.52539062, + "step": 922, + "time_per_iteration": 2.8776824474334717 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062973, + "balance_loss_mlp": 1.00980616, + "epoch": 0.17756829549826855, + "flos": 553684505856.0, + "grad_norm": 0.036018663808135676, + "language_loss": 0.84559548, + "learning_rate": 0.0009439799936993671, + "loss": 0.85622525, + "num_input_tokens_seen": 76386992, + "router_z_loss_mlp": 0.53271484, + "step": 923, + "time_per_iteration": 2.708897113800049 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063296, + "balance_loss_mlp": 1.01041508, + "epoch": 0.17776067718353214, + "flos": 557372542464.0, + "grad_norm": 0.06706828820902193, + "language_loss": 0.89721078, + "learning_rate": 0.0009438366227636511, + "loss": 0.90784371, + "num_input_tokens_seen": 76453328, + "router_z_loss_mlp": 0.52978516, + "step": 924, + "time_per_iteration": 2.6524295806884766 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062711, + "balance_loss_mlp": 1.01035416, + "epoch": 0.1779530588687957, + "flos": 659652820992.0, + "grad_norm": 0.03503923634288643, + "language_loss": 0.87549317, + "learning_rate": 0.0009436930795152763, + "loss": 0.8861202, + "num_input_tokens_seen": 76529040, + "router_z_loss_mlp": 0.52441406, + "step": 925, + "time_per_iteration": 2.8627374172210693 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070161, + "balance_loss_mlp": 1.01823378, + "epoch": 0.17814544055405926, + "flos": 645672503808.0, + "grad_norm": 0.03989967380061369, + "language_loss": 0.87815237, + "learning_rate": 0.0009435493640099713, + "loss": 0.88885403, + "num_input_tokens_seen": 76604080, + "router_z_loss_mlp": 0.52001953, + "step": 926, + "time_per_iteration": 2.7886180877685547 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065048, + "balance_loss_mlp": 1.01283479, + "epoch": 0.17833782223932282, + "flos": 461885091072.0, + "grad_norm": 0.040977111340993126, + "language_loss": 0.85709256, + "learning_rate": 0.0009434054763035314, + "loss": 0.86774307, + "num_input_tokens_seen": 76674096, + "router_z_loss_mlp": 0.52294922, + "step": 927, + "time_per_iteration": 2.635576009750366 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010578, + "balance_loss_mlp": 1.00520515, + "epoch": 0.17853020392458638, + "flos": 760854101760.0, + "grad_norm": 0.029435711646972902, + "language_loss": 0.86359227, + "learning_rate": 0.0009432614164518185, + "loss": 0.8741703, + "num_input_tokens_seen": 76752144, + "router_z_loss_mlp": 0.52685547, + "step": 928, + "time_per_iteration": 2.945253849029541 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074963, + "balance_loss_mlp": 1.02203369, + "epoch": 0.17872258560984994, + "flos": 784056450048.0, + "grad_norm": 0.039066121455708196, + "language_loss": 0.84876156, + "learning_rate": 0.000943117184510762, + "loss": 0.85951114, + "num_input_tokens_seen": 76830240, + "router_z_loss_mlp": 0.53027344, + "step": 929, + "time_per_iteration": 3.0016870498657227 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092369, + "balance_loss_mlp": 1.04201508, + "epoch": 0.1789149672951135, + "flos": 1463034021120.0, + "grad_norm": 0.03241390760866092, + "language_loss": 0.78789961, + "learning_rate": 0.0009429727805363575, + "loss": 0.79882336, + "num_input_tokens_seen": 77062464, + "router_z_loss_mlp": 0.50390625, + "step": 930, + "time_per_iteration": 5.0408923625946045 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091695, + "balance_loss_mlp": 1.04005396, + "epoch": 0.17910734898037706, + "flos": 504931285248.0, + "grad_norm": 0.037670754636037675, + "language_loss": 0.90276599, + "learning_rate": 0.0009428282045846674, + "loss": 0.91368294, + "num_input_tokens_seen": 77136672, + "router_z_loss_mlp": 0.51708984, + "step": 931, + "time_per_iteration": 2.699357509613037 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093914, + "balance_loss_mlp": 1.04260671, + "epoch": 0.17929973066564064, + "flos": 747670880256.0, + "grad_norm": 0.03557447538434831, + "language_loss": 0.91468316, + "learning_rate": 0.0009426834567118214, + "loss": 0.92562228, + "num_input_tokens_seen": 77227040, + "router_z_loss_mlp": 0.51367188, + "step": 932, + "time_per_iteration": 3.0888116359710693 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095199, + "balance_loss_mlp": 1.04370034, + "epoch": 0.1794921123509042, + "flos": 714573826560.0, + "grad_norm": 0.03713873812168088, + "language_loss": 0.82311261, + "learning_rate": 0.0009425385369740155, + "loss": 0.8340646, + "num_input_tokens_seen": 77319392, + "router_z_loss_mlp": 0.515625, + "step": 933, + "time_per_iteration": 3.0156304836273193 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109606, + "balance_loss_mlp": 1.04465711, + "epoch": 0.17968449403616776, + "flos": 634362463488.0, + "grad_norm": 0.04581160448205157, + "language_loss": 0.89044029, + "learning_rate": 0.0009423934454275125, + "loss": 0.90140092, + "num_input_tokens_seen": 77394688, + "router_z_loss_mlp": 0.51464844, + "step": 934, + "time_per_iteration": 2.8524558544158936 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095874, + "balance_loss_mlp": 1.04428041, + "epoch": 0.17987687572143132, + "flos": 537378997248.0, + "grad_norm": 0.045982575553228676, + "language_loss": 0.93734717, + "learning_rate": 0.0009422481821286418, + "loss": 0.94830596, + "num_input_tokens_seen": 77468288, + "router_z_loss_mlp": 0.51660156, + "step": 935, + "time_per_iteration": 2.7354249954223633 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096416, + "balance_loss_mlp": 1.0448221, + "epoch": 0.18006925740669488, + "flos": 539119975680.0, + "grad_norm": 0.04748543050697339, + "language_loss": 0.89948702, + "learning_rate": 0.0009421027471337998, + "loss": 0.91045117, + "num_input_tokens_seen": 77535840, + "router_z_loss_mlp": 0.51660156, + "step": 936, + "time_per_iteration": 2.660287380218506 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095184, + "balance_loss_mlp": 1.04363835, + "epoch": 0.18026163909195844, + "flos": 540535310592.0, + "grad_norm": 0.04911488628490749, + "language_loss": 0.84066534, + "learning_rate": 0.0009419571404994493, + "loss": 0.8516171, + "num_input_tokens_seen": 77604000, + "router_z_loss_mlp": 0.51611328, + "step": 937, + "time_per_iteration": 2.624769687652588 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090343, + "balance_loss_mlp": 1.03865409, + "epoch": 0.180454020777222, + "flos": 501683598336.0, + "grad_norm": 0.0468107226861285, + "language_loss": 0.92304778, + "learning_rate": 0.00094181136228212, + "loss": 0.9339512, + "num_input_tokens_seen": 77671488, + "router_z_loss_mlp": 0.51757812, + "step": 938, + "time_per_iteration": 2.6784133911132812 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092284, + "balance_loss_mlp": 1.04069054, + "epoch": 0.18064640246248556, + "flos": 500007748608.0, + "grad_norm": 0.039466745711782485, + "language_loss": 0.87082231, + "learning_rate": 0.0009416654125384077, + "loss": 0.8817451, + "num_input_tokens_seen": 77746240, + "router_z_loss_mlp": 0.51660156, + "step": 939, + "time_per_iteration": 2.7231576442718506 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081085, + "balance_loss_mlp": 1.03034973, + "epoch": 0.18083878414774912, + "flos": 1522293383424.0, + "grad_norm": 0.016406546431804496, + "language_loss": 0.79772377, + "learning_rate": 0.0009415192913249752, + "loss": 0.80853462, + "num_input_tokens_seen": 77966080, + "router_z_loss_mlp": 0.5078125, + "step": 940, + "time_per_iteration": 4.919930934906006 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01329403, + "balance_loss_mlp": 1.27490067, + "epoch": 0.1810311658330127, + "flos": 728666904576.0, + "grad_norm": 0.12503564718566265, + "language_loss": 0.85519916, + "learning_rate": 0.000941372998698552, + "loss": 0.8684932, + "num_input_tokens_seen": 78049200, + "router_z_loss_mlp": 0.54638672, + "step": 941, + "time_per_iteration": 2.9731380939483643 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093385, + "balance_loss_mlp": 1.04121876, + "epoch": 0.18122354751827627, + "flos": 566045353728.0, + "grad_norm": 0.05253753965114479, + "language_loss": 0.83319217, + "learning_rate": 0.0009412265347159336, + "loss": 0.84412599, + "num_input_tokens_seen": 78122752, + "router_z_loss_mlp": 0.52246094, + "step": 942, + "time_per_iteration": 2.7150988578796387 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103842, + "balance_loss_mlp": 1.05162799, + "epoch": 0.18141592920353983, + "flos": 520318189056.0, + "grad_norm": 0.046885904923641086, + "language_loss": 0.86687338, + "learning_rate": 0.0009410798994339829, + "loss": 0.87791175, + "num_input_tokens_seen": 78194064, + "router_z_loss_mlp": 0.52294922, + "step": 943, + "time_per_iteration": 2.598576545715332 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111341, + "balance_loss_mlp": 1.05831623, + "epoch": 0.1816083108888034, + "flos": 513477729792.0, + "grad_norm": 0.04639702407841738, + "language_loss": 0.8991158, + "learning_rate": 0.000940933092909628, + "loss": 0.91022921, + "num_input_tokens_seen": 78262048, + "router_z_loss_mlp": 0.53125, + "step": 944, + "time_per_iteration": 2.611694574356079 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104212, + "balance_loss_mlp": 1.05109203, + "epoch": 0.18180069257406695, + "flos": 493373369088.0, + "grad_norm": 0.04493061679832577, + "language_loss": 0.85416293, + "learning_rate": 0.0009407861151998649, + "loss": 0.86520505, + "num_input_tokens_seen": 78330624, + "router_z_loss_mlp": 0.53222656, + "step": 945, + "time_per_iteration": 2.5710983276367188 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110023, + "balance_loss_mlp": 1.04692006, + "epoch": 0.1819930742593305, + "flos": 571231350528.0, + "grad_norm": 0.04259629183686275, + "language_loss": 0.87787771, + "learning_rate": 0.0009406389663617552, + "loss": 0.88888001, + "num_input_tokens_seen": 78400672, + "router_z_loss_mlp": 0.53417969, + "step": 946, + "time_per_iteration": 2.6741456985473633 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100116, + "balance_loss_mlp": 1.04661465, + "epoch": 0.18218545594459407, + "flos": 607111441920.0, + "grad_norm": 0.04866460503106345, + "language_loss": 0.87927794, + "learning_rate": 0.000940491646452427, + "loss": 0.89027911, + "num_input_tokens_seen": 78467952, + "router_z_loss_mlp": 0.53613281, + "step": 947, + "time_per_iteration": 2.718358278274536 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101327, + "balance_loss_mlp": 1.04753995, + "epoch": 0.18237783762985763, + "flos": 549739845120.0, + "grad_norm": 0.042994543525894185, + "language_loss": 0.92601323, + "learning_rate": 0.000940344155529075, + "loss": 0.93702656, + "num_input_tokens_seen": 78538928, + "router_z_loss_mlp": 0.5390625, + "step": 948, + "time_per_iteration": 2.624303102493286 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097087, + "balance_loss_mlp": 1.04325247, + "epoch": 0.1825702193151212, + "flos": 451675435776.0, + "grad_norm": 0.046415524987670945, + "language_loss": 0.89178842, + "learning_rate": 0.0009401964936489605, + "loss": 0.90275931, + "num_input_tokens_seen": 78602144, + "router_z_loss_mlp": 0.53955078, + "step": 949, + "time_per_iteration": 2.5104119777679443 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088983, + "balance_loss_mlp": 1.03524303, + "epoch": 0.18276260100038477, + "flos": 590385025536.0, + "grad_norm": 0.0430347708706334, + "language_loss": 0.86972219, + "learning_rate": 0.0009400486608694108, + "loss": 0.88061202, + "num_input_tokens_seen": 78673152, + "router_z_loss_mlp": 0.53857422, + "step": 950, + "time_per_iteration": 2.744044065475464 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085663, + "balance_loss_mlp": 1.03154159, + "epoch": 0.18295498268564833, + "flos": 788710723584.0, + "grad_norm": 0.040810758702646055, + "language_loss": 0.88588369, + "learning_rate": 0.0009399006572478195, + "loss": 0.89674032, + "num_input_tokens_seen": 78753872, + "router_z_loss_mlp": 0.54248047, + "step": 951, + "time_per_iteration": 3.0828475952148438 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079439, + "balance_loss_mlp": 1.02493632, + "epoch": 0.1831473643709119, + "flos": 579226629888.0, + "grad_norm": 0.03747434947067488, + "language_loss": 0.92113942, + "learning_rate": 0.0009397524828416468, + "loss": 0.93193376, + "num_input_tokens_seen": 78822640, + "router_z_loss_mlp": 0.54638672, + "step": 952, + "time_per_iteration": 2.6881086826324463 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089843, + "balance_loss_mlp": 1.03405273, + "epoch": 0.18333974605617545, + "flos": 567964221696.0, + "grad_norm": 0.0419825959367211, + "language_loss": 0.97306633, + "learning_rate": 0.0009396041377084192, + "loss": 0.9839648, + "num_input_tokens_seen": 78893792, + "router_z_loss_mlp": 0.55957031, + "step": 953, + "time_per_iteration": 2.673654556274414 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097804, + "balance_loss_mlp": 1.04191864, + "epoch": 0.183532127741439, + "flos": 528070450176.0, + "grad_norm": 0.04203850234568462, + "language_loss": 0.89016271, + "learning_rate": 0.0009394556219057295, + "loss": 0.90114069, + "num_input_tokens_seen": 78964752, + "router_z_loss_mlp": 0.56054688, + "step": 954, + "time_per_iteration": 2.7255043983459473 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107099, + "balance_loss_mlp": 1.01610565, + "epoch": 0.18372450942670257, + "flos": 595644899328.0, + "grad_norm": 0.03789415730727427, + "language_loss": 0.84751296, + "learning_rate": 0.0009393069354912362, + "loss": 0.85822284, + "num_input_tokens_seen": 79034400, + "router_z_loss_mlp": 0.55029297, + "step": 955, + "time_per_iteration": 2.7474210262298584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084139, + "balance_loss_mlp": 1.02963698, + "epoch": 0.18391689111196613, + "flos": 646284907008.0, + "grad_norm": 0.04389714766773939, + "language_loss": 0.83882308, + "learning_rate": 0.0009391580785226649, + "loss": 0.84966445, + "num_input_tokens_seen": 79109488, + "router_z_loss_mlp": 0.54638672, + "step": 956, + "time_per_iteration": 2.844409465789795 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081024, + "balance_loss_mlp": 1.02990723, + "epoch": 0.18410927279722972, + "flos": 1460394846720.0, + "grad_norm": 0.013082177800516761, + "language_loss": 0.79340446, + "learning_rate": 0.0009390090510578067, + "loss": 0.80421472, + "num_input_tokens_seen": 79327712, + "router_z_loss_mlp": 0.51171875, + "step": 957, + "time_per_iteration": 4.792405843734741 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084483, + "balance_loss_mlp": 1.030267, + "epoch": 0.18430165448249328, + "flos": 660004709376.0, + "grad_norm": 0.04089111102732722, + "language_loss": 0.88231802, + "learning_rate": 0.0009388598531545196, + "loss": 0.89316285, + "num_input_tokens_seen": 79401504, + "router_z_loss_mlp": 0.54345703, + "step": 958, + "time_per_iteration": 2.900062084197998 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084489, + "balance_loss_mlp": 1.03017747, + "epoch": 0.18449403616775684, + "flos": 518950486272.0, + "grad_norm": 0.045948437313162956, + "language_loss": 0.87467843, + "learning_rate": 0.000938710484870727, + "loss": 0.88552332, + "num_input_tokens_seen": 79466688, + "router_z_loss_mlp": 0.54443359, + "step": 959, + "time_per_iteration": 2.5785140991210938 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085738, + "balance_loss_mlp": 1.031569, + "epoch": 0.1846864178530204, + "flos": 553825456896.0, + "grad_norm": 0.04362127254920589, + "language_loss": 0.87369549, + "learning_rate": 0.0009385609462644189, + "loss": 0.88455284, + "num_input_tokens_seen": 79540288, + "router_z_loss_mlp": 0.54296875, + "step": 960, + "time_per_iteration": 2.686221122741699 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082294, + "balance_loss_mlp": 1.02774417, + "epoch": 0.18487879953828396, + "flos": 467116774656.0, + "grad_norm": 0.04468558895083242, + "language_loss": 0.86931455, + "learning_rate": 0.0009384112373936514, + "loss": 0.88013744, + "num_input_tokens_seen": 79611872, + "router_z_loss_mlp": 0.546875, + "step": 961, + "time_per_iteration": 2.633582830429077 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064427, + "balance_loss_mlp": 1.00935197, + "epoch": 0.18507118122354752, + "flos": 649684238592.0, + "grad_norm": 0.03687654302408078, + "language_loss": 0.9259429, + "learning_rate": 0.0009382613583165467, + "loss": 0.93658715, + "num_input_tokens_seen": 79689504, + "router_z_loss_mlp": 0.55224609, + "step": 962, + "time_per_iteration": 2.7910635471343994 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01458915, + "balance_loss_mlp": 1.40078855, + "epoch": 0.18526356290881107, + "flos": 627923470080.0, + "grad_norm": 0.09306974449566385, + "language_loss": 0.90611041, + "learning_rate": 0.0009381113090912928, + "loss": 0.92069954, + "num_input_tokens_seen": 79759264, + "router_z_loss_mlp": 0.57958984, + "step": 963, + "time_per_iteration": 2.7445125579833984 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078714, + "balance_loss_mlp": 1.02464056, + "epoch": 0.18545594459407463, + "flos": 433646445312.0, + "grad_norm": 0.04076594680163087, + "language_loss": 0.91471934, + "learning_rate": 0.000937961089776144, + "loss": 0.92550647, + "num_input_tokens_seen": 79824464, + "router_z_loss_mlp": 0.54199219, + "step": 964, + "time_per_iteration": 2.5835955142974854 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089429, + "balance_loss_mlp": 1.03607059, + "epoch": 0.1856483262793382, + "flos": 750427673088.0, + "grad_norm": 0.041116434601540804, + "language_loss": 0.8449949, + "learning_rate": 0.0009378107004294208, + "loss": 0.8558892, + "num_input_tokens_seen": 79907152, + "router_z_loss_mlp": 0.53466797, + "step": 965, + "time_per_iteration": 2.9773664474487305 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090833, + "balance_loss_mlp": 1.03790379, + "epoch": 0.18584070796460178, + "flos": 531402707712.0, + "grad_norm": 0.04029010126422192, + "language_loss": 0.93043375, + "learning_rate": 0.0009376601411095096, + "loss": 0.94134206, + "num_input_tokens_seen": 79976944, + "router_z_loss_mlp": 0.53027344, + "step": 966, + "time_per_iteration": 2.6703643798828125 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088702, + "balance_loss_mlp": 1.03639269, + "epoch": 0.18603308964986534, + "flos": 484084263936.0, + "grad_norm": 0.03934020689435504, + "language_loss": 0.87718618, + "learning_rate": 0.0009375094118748622, + "loss": 0.88807321, + "num_input_tokens_seen": 80042112, + "router_z_loss_mlp": 0.52392578, + "step": 967, + "time_per_iteration": 2.5719969272613525 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091813, + "balance_loss_mlp": 1.03974187, + "epoch": 0.1862254713351289, + "flos": 802682292480.0, + "grad_norm": 0.042176858736630414, + "language_loss": 0.92643285, + "learning_rate": 0.0009373585127839976, + "loss": 0.93735105, + "num_input_tokens_seen": 80118896, + "router_z_loss_mlp": 0.52148438, + "step": 968, + "time_per_iteration": 2.956153392791748 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096332, + "balance_loss_mlp": 1.04483318, + "epoch": 0.18641785302039246, + "flos": 479290984704.0, + "grad_norm": 0.04307464179422831, + "language_loss": 0.92206955, + "learning_rate": 0.0009372074438954994, + "loss": 0.93303293, + "num_input_tokens_seen": 80183360, + "router_z_loss_mlp": 0.515625, + "step": 969, + "time_per_iteration": 2.512662410736084 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092255, + "balance_loss_mlp": 1.04085171, + "epoch": 0.18661023470565602, + "flos": 389779822848.0, + "grad_norm": 0.044792080488554424, + "language_loss": 0.93312657, + "learning_rate": 0.0009370562052680181, + "loss": 0.94404912, + "num_input_tokens_seen": 80247024, + "router_z_loss_mlp": 0.51464844, + "step": 970, + "time_per_iteration": 2.4642274379730225 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109029, + "balance_loss_mlp": 1.03926873, + "epoch": 0.18680261639091958, + "flos": 565776090624.0, + "grad_norm": 0.03666794569701081, + "language_loss": 0.90593827, + "learning_rate": 0.0009369047969602695, + "loss": 0.91684115, + "num_input_tokens_seen": 80318256, + "router_z_loss_mlp": 0.51074219, + "step": 971, + "time_per_iteration": 2.6925313472747803 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090519, + "balance_loss_mlp": 1.03968859, + "epoch": 0.18699499807618314, + "flos": 480230976768.0, + "grad_norm": 0.04959033368050126, + "language_loss": 0.88274431, + "learning_rate": 0.0009367532190310357, + "loss": 0.89364946, + "num_input_tokens_seen": 80384848, + "router_z_loss_mlp": 0.50878906, + "step": 972, + "time_per_iteration": 2.5632824897766113 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095286, + "balance_loss_mlp": 1.04464579, + "epoch": 0.1871873797614467, + "flos": 554328989952.0, + "grad_norm": 0.047101191533600484, + "language_loss": 0.90956879, + "learning_rate": 0.0009366014715391644, + "loss": 0.92052168, + "num_input_tokens_seen": 80453088, + "router_z_loss_mlp": 0.50683594, + "step": 973, + "time_per_iteration": 2.6131792068481445 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087331, + "balance_loss_mlp": 1.03669059, + "epoch": 0.18737976144671029, + "flos": 553953768960.0, + "grad_norm": 0.03277863870695053, + "language_loss": 0.85193431, + "learning_rate": 0.0009364495545435693, + "loss": 0.86280763, + "num_input_tokens_seen": 80528608, + "router_z_loss_mlp": 0.50683594, + "step": 974, + "time_per_iteration": 2.768160820007324 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077828, + "balance_loss_mlp": 1.02647221, + "epoch": 0.18757214313197385, + "flos": 503248632576.0, + "grad_norm": 0.03709252074476072, + "language_loss": 0.90046728, + "learning_rate": 0.0009362974681032297, + "loss": 0.91124547, + "num_input_tokens_seen": 80599600, + "router_z_loss_mlp": 0.51416016, + "step": 975, + "time_per_iteration": 2.596752405166626 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01358762, + "balance_loss_mlp": 1.30464137, + "epoch": 0.1877645248172374, + "flos": 676292721408.0, + "grad_norm": 0.11355211768831018, + "language_loss": 0.89691889, + "learning_rate": 0.0009361452122771907, + "loss": 0.91050649, + "num_input_tokens_seen": 80677264, + "router_z_loss_mlp": 0.54248047, + "step": 976, + "time_per_iteration": 2.841670036315918 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087707, + "balance_loss_mlp": 1.03649426, + "epoch": 0.18795690650250096, + "flos": 405863700480.0, + "grad_norm": 0.05182073733860081, + "language_loss": 0.85757113, + "learning_rate": 0.0009359927871245635, + "loss": 0.86844826, + "num_input_tokens_seen": 80739776, + "router_z_loss_mlp": 0.51269531, + "step": 977, + "time_per_iteration": 2.4593758583068848 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110124, + "balance_loss_mlp": 1.04988456, + "epoch": 0.18814928818776452, + "flos": 639064369152.0, + "grad_norm": 0.04599902588150218, + "language_loss": 0.8843354, + "learning_rate": 0.0009358401927045246, + "loss": 0.89534783, + "num_input_tokens_seen": 80815200, + "router_z_loss_mlp": 0.51416016, + "step": 978, + "time_per_iteration": 2.8043553829193115 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103516, + "balance_loss_mlp": 1.05197036, + "epoch": 0.18834166987302808, + "flos": 1140117100800.0, + "grad_norm": 0.05109113713971293, + "language_loss": 0.89583617, + "learning_rate": 0.0009356874290763166, + "loss": 0.90687132, + "num_input_tokens_seen": 80905024, + "router_z_loss_mlp": 0.51611328, + "step": 979, + "time_per_iteration": 3.4783685207366943 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105346, + "balance_loss_mlp": 1.0536567, + "epoch": 0.18853405155829164, + "flos": 505816842240.0, + "grad_norm": 0.03906189308485337, + "language_loss": 0.90395761, + "learning_rate": 0.0009355344962992474, + "loss": 0.91501105, + "num_input_tokens_seen": 80976704, + "router_z_loss_mlp": 0.51757812, + "step": 980, + "time_per_iteration": 2.6457359790802 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103049, + "balance_loss_mlp": 1.05116904, + "epoch": 0.1887264332435552, + "flos": 609371504640.0, + "grad_norm": 0.038270487176229884, + "language_loss": 0.89782834, + "learning_rate": 0.0009353813944326908, + "loss": 0.9088589, + "num_input_tokens_seen": 81057152, + "router_z_loss_mlp": 0.51953125, + "step": 981, + "time_per_iteration": 2.923243761062622 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102538, + "balance_loss_mlp": 1.05070543, + "epoch": 0.1889188149288188, + "flos": 553593132288.0, + "grad_norm": 0.04212053297292714, + "language_loss": 0.84181225, + "learning_rate": 0.0009352281235360863, + "loss": 0.85283768, + "num_input_tokens_seen": 81131520, + "router_z_loss_mlp": 0.51904297, + "step": 982, + "time_per_iteration": 2.674790620803833 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103043, + "balance_loss_mlp": 1.05135345, + "epoch": 0.18911119661408235, + "flos": 419470742016.0, + "grad_norm": 0.03892833341753514, + "language_loss": 0.86323905, + "learning_rate": 0.0009350746836689389, + "loss": 0.87426949, + "num_input_tokens_seen": 81195952, + "router_z_loss_mlp": 0.51757812, + "step": 983, + "time_per_iteration": 2.5294649600982666 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103523, + "balance_loss_mlp": 1.05335999, + "epoch": 0.1893035782993459, + "flos": 1485320676864.0, + "grad_norm": 0.016207020064155576, + "language_loss": 0.81439221, + "learning_rate": 0.0009349210748908193, + "loss": 0.82542741, + "num_input_tokens_seen": 81427312, + "router_z_loss_mlp": 0.50195312, + "step": 984, + "time_per_iteration": 5.031845569610596 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094201, + "balance_loss_mlp": 1.04227316, + "epoch": 0.18949595998460947, + "flos": 509457246720.0, + "grad_norm": 0.045438139941342374, + "language_loss": 0.84563899, + "learning_rate": 0.0009347672972613634, + "loss": 0.85658097, + "num_input_tokens_seen": 81494256, + "router_z_loss_mlp": 0.52001953, + "step": 985, + "time_per_iteration": 2.6333274841308594 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090244, + "balance_loss_mlp": 1.0384593, + "epoch": 0.18968834166987303, + "flos": 532193000448.0, + "grad_norm": 0.03993027053802703, + "language_loss": 0.8704083, + "learning_rate": 0.0009346133508402735, + "loss": 0.8813107, + "num_input_tokens_seen": 81569312, + "router_z_loss_mlp": 0.51855469, + "step": 986, + "time_per_iteration": 2.751340389251709 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089761, + "balance_loss_mlp": 1.03797686, + "epoch": 0.1898807233551366, + "flos": 500754299904.0, + "grad_norm": 0.04595906606263721, + "language_loss": 0.85852754, + "learning_rate": 0.0009344592356873166, + "loss": 0.86942512, + "num_input_tokens_seen": 81637024, + "router_z_loss_mlp": 0.51855469, + "step": 987, + "time_per_iteration": 2.6785645484924316 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084349, + "balance_loss_mlp": 1.03223073, + "epoch": 0.19007310504040015, + "flos": 603360221952.0, + "grad_norm": 0.042275439246703725, + "language_loss": 0.79788595, + "learning_rate": 0.0009343049518623255, + "loss": 0.80872947, + "num_input_tokens_seen": 81709488, + "router_z_loss_mlp": 0.52197266, + "step": 988, + "time_per_iteration": 2.709439516067505 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01365061, + "balance_loss_mlp": 1.30979574, + "epoch": 0.1902654867256637, + "flos": 602765315328.0, + "grad_norm": 0.1049262798815586, + "language_loss": 0.8386007, + "learning_rate": 0.0009341504994251985, + "loss": 0.85225129, + "num_input_tokens_seen": 81787152, + "router_z_loss_mlp": 0.55419922, + "step": 989, + "time_per_iteration": 2.925954818725586 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089684, + "balance_loss_mlp": 1.03952026, + "epoch": 0.19045786841092727, + "flos": 1579234345728.0, + "grad_norm": 0.01847097645999908, + "language_loss": 0.73520499, + "learning_rate": 0.0009339958784358994, + "loss": 0.74610186, + "num_input_tokens_seen": 82030608, + "router_z_loss_mlp": 0.50195312, + "step": 990, + "time_per_iteration": 5.025054216384888 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101957, + "balance_loss_mlp": 1.04845631, + "epoch": 0.19065025009619085, + "flos": 683055412992.0, + "grad_norm": 0.039739471389523856, + "language_loss": 0.8281374, + "learning_rate": 0.0009338410889544574, + "loss": 0.83915699, + "num_input_tokens_seen": 82119872, + "router_z_loss_mlp": 0.53613281, + "step": 991, + "time_per_iteration": 3.0653748512268066 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112616, + "balance_loss_mlp": 1.05868626, + "epoch": 0.1908426317814544, + "flos": 603442847232.0, + "grad_norm": 0.04383499470371995, + "language_loss": 0.89543211, + "learning_rate": 0.000933686131040967, + "loss": 0.90655828, + "num_input_tokens_seen": 82195552, + "router_z_loss_mlp": 0.54052734, + "step": 992, + "time_per_iteration": 2.7901530265808105 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106006, + "balance_loss_mlp": 1.0517416, + "epoch": 0.19103501346671797, + "flos": 587434791936.0, + "grad_norm": 0.04122735235002176, + "language_loss": 0.92173266, + "learning_rate": 0.0009335310047555883, + "loss": 0.93279278, + "num_input_tokens_seen": 82267040, + "router_z_loss_mlp": 0.54394531, + "step": 993, + "time_per_iteration": 2.7153608798980713 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097163, + "balance_loss_mlp": 1.04285157, + "epoch": 0.19122739515198153, + "flos": 546835298304.0, + "grad_norm": 0.04052898350535971, + "language_loss": 0.89637405, + "learning_rate": 0.0009333757101585467, + "loss": 0.90734565, + "num_input_tokens_seen": 82337680, + "router_z_loss_mlp": 0.54443359, + "step": 994, + "time_per_iteration": 2.6286795139312744 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091826, + "balance_loss_mlp": 1.03732359, + "epoch": 0.1914197768372451, + "flos": 522550061568.0, + "grad_norm": 0.03850908176124289, + "language_loss": 0.94694555, + "learning_rate": 0.0009332202473101329, + "loss": 0.95786381, + "num_input_tokens_seen": 82409600, + "router_z_loss_mlp": 0.54638672, + "step": 995, + "time_per_iteration": 2.649850368499756 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072176, + "balance_loss_mlp": 1.01714945, + "epoch": 0.19161215852250865, + "flos": 612388812288.0, + "grad_norm": 0.03654296504823072, + "language_loss": 0.83743644, + "learning_rate": 0.0009330646162707028, + "loss": 0.84815824, + "num_input_tokens_seen": 82480288, + "router_z_loss_mlp": 0.55175781, + "step": 996, + "time_per_iteration": 2.7329981327056885 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059087, + "balance_loss_mlp": 1.0033443, + "epoch": 0.1918045402077722, + "flos": 848183935488.0, + "grad_norm": 0.03315860340701524, + "language_loss": 0.85236025, + "learning_rate": 0.0009329088171006779, + "loss": 0.8629511, + "num_input_tokens_seen": 82568960, + "router_z_loss_mlp": 0.55908203, + "step": 997, + "time_per_iteration": 3.135049343109131 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01290698, + "balance_loss_mlp": 1.2330482, + "epoch": 0.19199692189303577, + "flos": 466893198336.0, + "grad_norm": 0.06463762674453556, + "language_loss": 0.86239529, + "learning_rate": 0.0009327528498605446, + "loss": 0.87530231, + "num_input_tokens_seen": 82634128, + "router_z_loss_mlp": 0.57470703, + "step": 998, + "time_per_iteration": 2.5807580947875977 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072398, + "balance_loss_mlp": 1.01727533, + "epoch": 0.19218930357829936, + "flos": 532613908224.0, + "grad_norm": 0.04280698068802137, + "language_loss": 0.90856296, + "learning_rate": 0.0009325967146108548, + "loss": 0.91928697, + "num_input_tokens_seen": 82707472, + "router_z_loss_mlp": 0.55273438, + "step": 999, + "time_per_iteration": 2.637840986251831 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086346, + "balance_loss_mlp": 1.03217781, + "epoch": 0.19238168526356292, + "flos": 602728376832.0, + "grad_norm": 0.04847652630230049, + "language_loss": 0.88902158, + "learning_rate": 0.0009324404114122258, + "loss": 0.89988506, + "num_input_tokens_seen": 82775232, + "router_z_loss_mlp": 0.54296875, + "step": 1000, + "time_per_iteration": 4.1391942501068115 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090902, + "balance_loss_mlp": 1.03701913, + "epoch": 0.19257406694882648, + "flos": 573155076096.0, + "grad_norm": 0.04193719314851312, + "language_loss": 0.88362414, + "learning_rate": 0.0009322839403253397, + "loss": 0.89453316, + "num_input_tokens_seen": 82850032, + "router_z_loss_mlp": 0.54003906, + "step": 1001, + "time_per_iteration": 2.8266265392303467 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087834, + "balance_loss_mlp": 1.03395164, + "epoch": 0.19276644863409004, + "flos": 803157635328.0, + "grad_norm": 0.04353601683576214, + "language_loss": 0.85235333, + "learning_rate": 0.0009321273014109439, + "loss": 0.86323166, + "num_input_tokens_seen": 82926080, + "router_z_loss_mlp": 0.54003906, + "step": 1002, + "time_per_iteration": 2.9539175033569336 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094525, + "balance_loss_mlp": 1.04068995, + "epoch": 0.1929588303193536, + "flos": 564480319488.0, + "grad_norm": 0.03718563884895513, + "language_loss": 0.86078906, + "learning_rate": 0.0009319704947298513, + "loss": 0.87173432, + "num_input_tokens_seen": 83005200, + "router_z_loss_mlp": 0.53955078, + "step": 1003, + "time_per_iteration": 2.8760387897491455 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091693, + "balance_loss_mlp": 1.0380007, + "epoch": 0.19315121200461716, + "flos": 627988598784.0, + "grad_norm": 0.03744955738150477, + "language_loss": 0.89579475, + "learning_rate": 0.0009318135203429393, + "loss": 0.9067117, + "num_input_tokens_seen": 83077280, + "router_z_loss_mlp": 0.53808594, + "step": 1004, + "time_per_iteration": 2.7069175243377686 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094654, + "balance_loss_mlp": 1.04058087, + "epoch": 0.19334359368988072, + "flos": 518584013568.0, + "grad_norm": 0.03742742378220975, + "language_loss": 0.89228511, + "learning_rate": 0.0009316563783111511, + "loss": 0.90323162, + "num_input_tokens_seen": 83145456, + "router_z_loss_mlp": 0.54199219, + "step": 1005, + "time_per_iteration": 2.7024500370025635 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090205, + "balance_loss_mlp": 1.03598833, + "epoch": 0.19353597537514428, + "flos": 695400709632.0, + "grad_norm": 0.036019255491177425, + "language_loss": 0.83731771, + "learning_rate": 0.0009314990686954943, + "loss": 0.84821975, + "num_input_tokens_seen": 83225392, + "router_z_loss_mlp": 0.54345703, + "step": 1006, + "time_per_iteration": 2.901319980621338 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092974, + "balance_loss_mlp": 1.03866184, + "epoch": 0.19372835706040784, + "flos": 1212200981760.0, + "grad_norm": 0.03507497873235563, + "language_loss": 0.82359284, + "learning_rate": 0.000931341591557042, + "loss": 0.8345226, + "num_input_tokens_seen": 83331296, + "router_z_loss_mlp": 0.54443359, + "step": 1007, + "time_per_iteration": 3.70509672164917 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088892, + "balance_loss_mlp": 1.03467596, + "epoch": 0.19392073874567142, + "flos": 521685891840.0, + "grad_norm": 0.04354230775215961, + "language_loss": 0.88703787, + "learning_rate": 0.0009311839469569325, + "loss": 0.89792681, + "num_input_tokens_seen": 83399952, + "router_z_loss_mlp": 0.54345703, + "step": 1008, + "time_per_iteration": 2.632070302963257 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088222, + "balance_loss_mlp": 1.03386211, + "epoch": 0.19411312043093498, + "flos": 589911628032.0, + "grad_norm": 0.044503426382111445, + "language_loss": 0.88821465, + "learning_rate": 0.0009310261349563687, + "loss": 0.89909685, + "num_input_tokens_seen": 83468384, + "router_z_loss_mlp": 0.54492188, + "step": 1009, + "time_per_iteration": 2.7138211727142334 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061835, + "balance_loss_mlp": 1.0067606, + "epoch": 0.19430550211619854, + "flos": 580572945408.0, + "grad_norm": 0.029375689409949213, + "language_loss": 0.86173785, + "learning_rate": 0.0009308681556166186, + "loss": 0.87235624, + "num_input_tokens_seen": 83547952, + "router_z_loss_mlp": 0.55224609, + "step": 1010, + "time_per_iteration": 2.834946870803833 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.05162705, + "balance_loss_mlp": 5.08607721, + "epoch": 0.1944978838014621, + "flos": 622246579200.0, + "grad_norm": 0.2884784307389343, + "language_loss": 0.88793403, + "learning_rate": 0.0009307100089990152, + "loss": 0.93956107, + "num_input_tokens_seen": 83615712, + "router_z_loss_mlp": 0.76513672, + "step": 1011, + "time_per_iteration": 2.705335855484009 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094303, + "balance_loss_mlp": 1.04189909, + "epoch": 0.19469026548672566, + "flos": 599815081728.0, + "grad_norm": 0.04633555371791679, + "language_loss": 0.85740912, + "learning_rate": 0.0009305516951649568, + "loss": 0.86835217, + "num_input_tokens_seen": 83687296, + "router_z_loss_mlp": 0.52490234, + "step": 1012, + "time_per_iteration": 2.7048773765563965 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01164213, + "balance_loss_mlp": 1.11281013, + "epoch": 0.19488264717198922, + "flos": 553248046848.0, + "grad_norm": 0.04991787894778298, + "language_loss": 0.87912452, + "learning_rate": 0.0009303932141759057, + "loss": 0.89076668, + "num_input_tokens_seen": 83763168, + "router_z_loss_mlp": 0.51464844, + "step": 1013, + "time_per_iteration": 2.8072102069854736 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01211245, + "balance_loss_mlp": 1.15984225, + "epoch": 0.19507502885725278, + "flos": 667313708544.0, + "grad_norm": 0.06529111316537192, + "language_loss": 0.85445917, + "learning_rate": 0.0009302345660933902, + "loss": 0.86657166, + "num_input_tokens_seen": 83837312, + "router_z_loss_mlp": 0.51464844, + "step": 1014, + "time_per_iteration": 2.7895615100860596 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01244014, + "balance_loss_mlp": 1.19265878, + "epoch": 0.19526741054251634, + "flos": 672328618752.0, + "grad_norm": 0.06071591874537116, + "language_loss": 0.86587232, + "learning_rate": 0.0009300757509790026, + "loss": 0.87831247, + "num_input_tokens_seen": 83917120, + "router_z_loss_mlp": 0.51416016, + "step": 1015, + "time_per_iteration": 2.8867006301879883 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.012313, + "balance_loss_mlp": 1.18008745, + "epoch": 0.19545979222777993, + "flos": 448147792128.0, + "grad_norm": 0.057262662434688416, + "language_loss": 0.91914976, + "learning_rate": 0.0009299167688944005, + "loss": 0.93146276, + "num_input_tokens_seen": 83982992, + "router_z_loss_mlp": 0.51269531, + "step": 1016, + "time_per_iteration": 2.526421546936035 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01226901, + "balance_loss_mlp": 1.17568827, + "epoch": 0.1956521739130435, + "flos": 570169849344.0, + "grad_norm": 0.05343522997619492, + "language_loss": 0.87454194, + "learning_rate": 0.0009297576199013063, + "loss": 0.8868109, + "num_input_tokens_seen": 84057296, + "router_z_loss_mlp": 0.51269531, + "step": 1017, + "time_per_iteration": 2.7184784412384033 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.012071, + "balance_loss_mlp": 1.15884399, + "epoch": 0.19584455559830705, + "flos": 1458883280640.0, + "grad_norm": 0.03399393552013433, + "language_loss": 0.73002136, + "learning_rate": 0.0009295983040615071, + "loss": 0.74209231, + "num_input_tokens_seen": 84292640, + "router_z_loss_mlp": 0.48242188, + "step": 1018, + "time_per_iteration": 4.916393756866455 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01159874, + "balance_loss_mlp": 1.11199951, + "epoch": 0.1960369372835706, + "flos": 1594484189184.0, + "grad_norm": 0.02523442502037962, + "language_loss": 0.79426301, + "learning_rate": 0.0009294388214368547, + "loss": 0.80586171, + "num_input_tokens_seen": 84524448, + "router_z_loss_mlp": 0.47851562, + "step": 1019, + "time_per_iteration": 5.5991902351379395 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01163202, + "balance_loss_mlp": 1.11241901, + "epoch": 0.19622931896883417, + "flos": 617254023168.0, + "grad_norm": 0.06792637193668423, + "language_loss": 0.88615566, + "learning_rate": 0.0009292791720892659, + "loss": 0.89778763, + "num_input_tokens_seen": 84600208, + "router_z_loss_mlp": 0.50830078, + "step": 1020, + "time_per_iteration": 2.8419806957244873 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01132702, + "balance_loss_mlp": 1.08191884, + "epoch": 0.19642170065409773, + "flos": 467208148224.0, + "grad_norm": 0.044541966790476714, + "language_loss": 0.90245676, + "learning_rate": 0.0009291193560807218, + "loss": 0.91378373, + "num_input_tokens_seen": 84668032, + "router_z_loss_mlp": 0.50830078, + "step": 1021, + "time_per_iteration": 2.60357403755188 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0111942, + "balance_loss_mlp": 1.06858945, + "epoch": 0.19661408233936128, + "flos": 516288957696.0, + "grad_norm": 0.03957164107654416, + "language_loss": 0.88134921, + "learning_rate": 0.0009289593734732688, + "loss": 0.89254344, + "num_input_tokens_seen": 84738176, + "router_z_loss_mlp": 0.50878906, + "step": 1022, + "time_per_iteration": 2.6077988147735596 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115639, + "balance_loss_mlp": 1.06461763, + "epoch": 0.19680646402462484, + "flos": 393494104320.0, + "grad_norm": 0.03618938319364158, + "language_loss": 0.94921708, + "learning_rate": 0.0009287992243290175, + "loss": 0.96037352, + "num_input_tokens_seen": 84799936, + "router_z_loss_mlp": 0.51074219, + "step": 1023, + "time_per_iteration": 2.486910820007324 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104277, + "balance_loss_mlp": 1.05263603, + "epoch": 0.19699884570988843, + "flos": 627624071424.0, + "grad_norm": 0.04088238638674664, + "language_loss": 0.91379654, + "learning_rate": 0.0009286389087101435, + "loss": 0.92483938, + "num_input_tokens_seen": 84877216, + "router_z_loss_mlp": 0.51708984, + "step": 1024, + "time_per_iteration": 2.7762300968170166 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083626, + "balance_loss_mlp": 1.03126919, + "epoch": 0.197191227395152, + "flos": 559074637056.0, + "grad_norm": 0.038177798611856564, + "language_loss": 0.89866579, + "learning_rate": 0.0009284784266788864, + "loss": 0.90950203, + "num_input_tokens_seen": 84952464, + "router_z_loss_mlp": 0.52441406, + "step": 1025, + "time_per_iteration": 2.7595441341400146 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105402, + "balance_loss_mlp": 1.05275905, + "epoch": 0.19738360908041555, + "flos": 666250262016.0, + "grad_norm": 0.08120700653890094, + "language_loss": 0.93505025, + "learning_rate": 0.0009283177782975512, + "loss": 0.94610423, + "num_input_tokens_seen": 85031488, + "router_z_loss_mlp": 0.52734375, + "step": 1026, + "time_per_iteration": 2.9439735412597656 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01158523, + "balance_loss_mlp": 1.10511732, + "epoch": 0.1975759907656791, + "flos": 523511440896.0, + "grad_norm": 0.05175943009769999, + "language_loss": 0.89213437, + "learning_rate": 0.000928156963628507, + "loss": 0.9037196, + "num_input_tokens_seen": 85098384, + "router_z_loss_mlp": 0.53515625, + "step": 1027, + "time_per_iteration": 2.5648727416992188 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124606, + "balance_loss_mlp": 1.0717721, + "epoch": 0.19776837245094267, + "flos": 463485118464.0, + "grad_norm": 0.0380471847687272, + "language_loss": 0.89530945, + "learning_rate": 0.0009279959827341877, + "loss": 0.90655547, + "num_input_tokens_seen": 85172944, + "router_z_loss_mlp": 0.52929688, + "step": 1028, + "time_per_iteration": 2.7482099533081055 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01114747, + "balance_loss_mlp": 1.0622474, + "epoch": 0.19796075413620623, + "flos": 504058367232.0, + "grad_norm": 0.038077776452832945, + "language_loss": 0.88821751, + "learning_rate": 0.0009278348356770915, + "loss": 0.89936495, + "num_input_tokens_seen": 85241632, + "router_z_loss_mlp": 0.52587891, + "step": 1029, + "time_per_iteration": 2.5559866428375244 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125843, + "balance_loss_mlp": 1.07362974, + "epoch": 0.1981531358214698, + "flos": 508571689728.0, + "grad_norm": 0.03906482091144459, + "language_loss": 0.87010926, + "learning_rate": 0.0009276735225197814, + "loss": 0.88136768, + "num_input_tokens_seen": 85308992, + "router_z_loss_mlp": 0.52294922, + "step": 1030, + "time_per_iteration": 2.598353862762451 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116206, + "balance_loss_mlp": 1.06418335, + "epoch": 0.19834551750673335, + "flos": 532640153088.0, + "grad_norm": 0.039761606091750314, + "language_loss": 0.8715511, + "learning_rate": 0.0009275120433248847, + "loss": 0.88271314, + "num_input_tokens_seen": 85381936, + "router_z_loss_mlp": 0.52099609, + "step": 1031, + "time_per_iteration": 2.691051483154297 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105688, + "balance_loss_mlp": 1.05414224, + "epoch": 0.1985378991919969, + "flos": 776971027200.0, + "grad_norm": 0.03650424605094363, + "language_loss": 0.87217546, + "learning_rate": 0.0009273503981550931, + "loss": 0.88323236, + "num_input_tokens_seen": 85474352, + "router_z_loss_mlp": 0.51611328, + "step": 1032, + "time_per_iteration": 3.05829119682312 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094626, + "balance_loss_mlp": 1.04336572, + "epoch": 0.1987302808772605, + "flos": 435192037632.0, + "grad_norm": 0.04492232470085823, + "language_loss": 0.88675368, + "learning_rate": 0.0009271885870731626, + "loss": 0.89769995, + "num_input_tokens_seen": 85538416, + "router_z_loss_mlp": 0.51318359, + "step": 1033, + "time_per_iteration": 2.5097644329071045 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091384, + "balance_loss_mlp": 1.04036272, + "epoch": 0.19892266256252406, + "flos": 554654633472.0, + "grad_norm": 0.041410721104386976, + "language_loss": 0.89478087, + "learning_rate": 0.0009270266101419143, + "loss": 0.90569472, + "num_input_tokens_seen": 85604416, + "router_z_loss_mlp": 0.51074219, + "step": 1034, + "time_per_iteration": 2.6359710693359375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091336, + "balance_loss_mlp": 1.04026711, + "epoch": 0.19911504424778761, + "flos": 550949100288.0, + "grad_norm": 0.034987230226667505, + "language_loss": 0.86329561, + "learning_rate": 0.0009268644674242328, + "loss": 0.87420899, + "num_input_tokens_seen": 85677008, + "router_z_loss_mlp": 0.51123047, + "step": 1035, + "time_per_iteration": 2.679041624069214 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091574, + "balance_loss_mlp": 1.04045713, + "epoch": 0.19930742593305117, + "flos": 519313068288.0, + "grad_norm": 0.035495194235479824, + "language_loss": 0.81977046, + "learning_rate": 0.0009267021589830678, + "loss": 0.83068615, + "num_input_tokens_seen": 85745200, + "router_z_loss_mlp": 0.51171875, + "step": 1036, + "time_per_iteration": 2.6109251976013184 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01330025, + "balance_loss_mlp": 1.27871704, + "epoch": 0.19949980761831473, + "flos": 1512640717824.0, + "grad_norm": 0.0530000786951376, + "language_loss": 0.77627081, + "learning_rate": 0.0009265396848814328, + "loss": 0.78957105, + "num_input_tokens_seen": 85980608, + "router_z_loss_mlp": 0.51367188, + "step": 1037, + "time_per_iteration": 5.041083097457886 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097572, + "balance_loss_mlp": 1.04635978, + "epoch": 0.1996921893035783, + "flos": 699440634624.0, + "grad_norm": 0.03827221066614039, + "language_loss": 0.93735194, + "learning_rate": 0.000926377045182406, + "loss": 0.94832766, + "num_input_tokens_seen": 86055952, + "router_z_loss_mlp": 0.51269531, + "step": 1038, + "time_per_iteration": 2.921194314956665 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106072, + "balance_loss_mlp": 1.05443072, + "epoch": 0.19988457098884185, + "flos": 728395696128.0, + "grad_norm": 0.0388450926907903, + "language_loss": 0.89164472, + "learning_rate": 0.0009262142399491296, + "loss": 0.90270543, + "num_input_tokens_seen": 86145536, + "router_z_loss_mlp": 0.51708984, + "step": 1039, + "time_per_iteration": 3.0543293952941895 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102477, + "balance_loss_mlp": 1.05093122, + "epoch": 0.2000769526741054, + "flos": 561625350144.0, + "grad_norm": 0.04341407711707897, + "language_loss": 0.8911137, + "learning_rate": 0.0009260512692448105, + "loss": 0.90213847, + "num_input_tokens_seen": 86214480, + "router_z_loss_mlp": 0.51611328, + "step": 1040, + "time_per_iteration": 2.6906111240386963 + } + ], + "logging_steps": 1.0, + "max_steps": 5198, + "num_input_tokens_seen": 86214480, + "num_train_epochs": 1, + "save_steps": 1040, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 2348217393741824.0, + "train_batch_size": 16, + "trial_name": null, + "trial_params": null +} diff --git a/sft_pretrain/Full_xmoe/checkpoint-1040/training_args.bin b/sft_pretrain/Full_xmoe/checkpoint-1040/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..e62437ed6fbf4cf3ea22fcfae3749bb9df2d0109 --- /dev/null +++ b/sft_pretrain/Full_xmoe/checkpoint-1040/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4144fbe7f1cf435dbbf0ef9621414cb3e97a5ff4a560571b878000caf2931b07 +size 7992 diff --git a/sft_pretrain/Full_xmoe/checkpoint-1040/zero_to_fp32.py b/sft_pretrain/Full_xmoe/checkpoint-1040/zero_to_fp32.py new file mode 100644 index 0000000000000000000000000000000000000000..24cc342e78d1a006c782b3a4cd68d9ce786d8fd8 --- /dev/null +++ b/sft_pretrain/Full_xmoe/checkpoint-1040/zero_to_fp32.py @@ -0,0 +1,604 @@ +#!/usr/bin/env python + +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +# This script extracts fp32 consolidated weights from a zero 1, 2 and 3 DeepSpeed checkpoints. It gets +# copied into the top level checkpoint dir, so the user can easily do the conversion at any point in +# the future. Once extracted, the weights don't require DeepSpeed and can be used in any +# application. +# +# example: python zero_to_fp32.py . pytorch_model.bin + +import argparse +import torch +import glob +import math +import os +import re +from collections import OrderedDict +from dataclasses import dataclass + +# while this script doesn't use deepspeed to recover data, since the checkpoints are pickled with +# DeepSpeed data structures it has to be available in the current python environment. +from deepspeed.utils import logger +from deepspeed.checkpoint.constants import (DS_VERSION, OPTIMIZER_STATE_DICT, SINGLE_PARTITION_OF_FP32_GROUPS, + FP32_FLAT_GROUPS, ZERO_STAGE, PARTITION_COUNT, PARAM_SHAPES, BUFFER_NAMES, + FROZEN_PARAM_SHAPES, FROZEN_PARAM_FRAGMENTS) + + +@dataclass +class zero_model_state: + buffers: dict() + param_shapes: dict() + shared_params: list + ds_version: int + frozen_param_shapes: dict() + frozen_param_fragments: dict() + + +debug = 0 + +# load to cpu +device = torch.device('cpu') + + +def atoi(text): + return int(text) if text.isdigit() else text + + +def natural_keys(text): + ''' + alist.sort(key=natural_keys) sorts in human order + http://nedbatchelder.com/blog/200712/human_sorting.html + (See Toothy's implementation in the comments) + ''' + return [atoi(c) for c in re.split(r'(\d+)', text)] + + +def get_model_state_file(checkpoint_dir, zero_stage): + if not os.path.isdir(checkpoint_dir): + raise FileNotFoundError(f"Directory '{checkpoint_dir}' doesn't exist") + + # there should be only one file + if zero_stage <= 2: + file = os.path.join(checkpoint_dir, "mp_rank_00_model_states.pt") + elif zero_stage == 3: + file = os.path.join(checkpoint_dir, "zero_pp_rank_0_mp_rank_00_model_states.pt") + + if not os.path.exists(file): + raise FileNotFoundError(f"can't find model states file at '{file}'") + + return file + + +def get_checkpoint_files(checkpoint_dir, glob_pattern): + # XXX: need to test that this simple glob rule works for multi-node setup too + ckpt_files = sorted(glob.glob(os.path.join(checkpoint_dir, glob_pattern)), key=natural_keys) + + if len(ckpt_files) == 0: + raise FileNotFoundError(f"can't find {glob_pattern} files in directory '{checkpoint_dir}'") + + return ckpt_files + + +def get_optim_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_optim_states.pt") + + +def get_model_state_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_model_states.pt") + + +def parse_model_states(files): + zero_model_states = [] + for file in files: + state_dict = torch.load(file, map_location=device) + + if BUFFER_NAMES not in state_dict: + raise ValueError(f"{file} is not a model state checkpoint") + buffer_names = state_dict[BUFFER_NAMES] + if debug: + print("Found buffers:", buffer_names) + + # recover just the buffers while restoring them to fp32 if they were saved in fp16 + buffers = {k: v.float() for k, v in state_dict["module"].items() if k in buffer_names} + param_shapes = state_dict[PARAM_SHAPES] + + # collect parameters that are included in param_shapes + param_names = [] + for s in param_shapes: + for name in s.keys(): + param_names.append(name) + + # update with frozen parameters + frozen_param_shapes = state_dict.get(FROZEN_PARAM_SHAPES, None) + if frozen_param_shapes is not None: + if debug: + print(f"Found frozen_param_shapes: {frozen_param_shapes}") + param_names += list(frozen_param_shapes.keys()) + + # handle shared params + shared_params = [[k, v] for k, v in state_dict["shared_params"].items()] + + ds_version = state_dict.get(DS_VERSION, None) + + frozen_param_fragments = state_dict.get(FROZEN_PARAM_FRAGMENTS, None) + + z_model_state = zero_model_state(buffers=buffers, + param_shapes=param_shapes, + shared_params=shared_params, + ds_version=ds_version, + frozen_param_shapes=frozen_param_shapes, + frozen_param_fragments=frozen_param_fragments) + zero_model_states.append(z_model_state) + + return zero_model_states + + +def parse_optim_states(files, ds_checkpoint_dir): + + total_files = len(files) + state_dicts = [] + for f in files: + state_dict = torch.load(f, map_location=device) + # immediately discard the potentially huge 2 optimizer states as we only care for fp32 master weights + # and also handle the case where it was already removed by another helper script + state_dict["optimizer_state_dict"].pop("optimizer_state_dict", None) + state_dicts.append(state_dict) + + if not ZERO_STAGE in state_dicts[0][OPTIMIZER_STATE_DICT]: + raise ValueError(f"{files[0]} is not a zero checkpoint") + zero_stage = state_dicts[0][OPTIMIZER_STATE_DICT][ZERO_STAGE] + world_size = state_dicts[0][OPTIMIZER_STATE_DICT][PARTITION_COUNT] + + # For ZeRO-2 each param group can have different partition_count as data parallelism for expert + # parameters can be different from data parallelism for non-expert parameters. So we can just + # use the max of the partition_count to get the dp world_size. + + if type(world_size) is list: + world_size = max(world_size) + + if world_size != total_files: + raise ValueError( + f"Expected {world_size} of '*_optim_states.pt' under '{ds_checkpoint_dir}' but found {total_files} files. " + "Possibly due to an overwrite of an old checkpoint, or a checkpoint didn't get saved by one or more processes." + ) + + # the groups are named differently in each stage + if zero_stage <= 2: + fp32_groups_key = SINGLE_PARTITION_OF_FP32_GROUPS + elif zero_stage == 3: + fp32_groups_key = FP32_FLAT_GROUPS + else: + raise ValueError(f"unknown zero stage {zero_stage}") + + if zero_stage <= 2: + fp32_flat_groups = [state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key] for i in range(len(state_dicts))] + elif zero_stage == 3: + # if there is more than one param group, there will be multiple flattened tensors - one + # flattened tensor per group - for simplicity merge them into a single tensor + # + # XXX: could make the script more memory efficient for when there are multiple groups - it + # will require matching the sub-lists of param_shapes for each param group flattened tensor + + fp32_flat_groups = [ + torch.cat(state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key], 0) for i in range(len(state_dicts)) + ] + + return zero_stage, world_size, fp32_flat_groups + + +def _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters): + """ + Returns fp32 state_dict reconstructed from ds checkpoint + + Args: + - ``ds_checkpoint_dir``: path to the deepspeed checkpoint folder (where the optimizer files are) + + """ + print(f"Processing zero checkpoint '{ds_checkpoint_dir}'") + + optim_files = get_optim_files(ds_checkpoint_dir) + zero_stage, world_size, fp32_flat_groups = parse_optim_states(optim_files, ds_checkpoint_dir) + print(f"Detected checkpoint of type zero stage {zero_stage}, world_size: {world_size}") + + model_files = get_model_state_files(ds_checkpoint_dir) + + zero_model_states = parse_model_states(model_files) + print(f'Parsing checkpoint created by deepspeed=={zero_model_states[0].ds_version}') + + if zero_stage <= 2: + return _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + elif zero_stage == 3: + return _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + + +def _zero2_merge_frozen_params(state_dict, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + frozen_param_fragments = zero_model_states[0].frozen_param_fragments + + if debug: + num_elem = sum(s.numel() for s in frozen_param_shapes.values()) + print(f'rank 0: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + wanted_params = len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in frozen_param_fragments.values()]) + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + state_dict[name] = frozen_param_fragments[name] + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +def _has_callable(obj, fn): + attr = getattr(obj, fn, None) + return callable(attr) + + +def _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + + # Reconstruction protocol: + # + # XXX: document this + + if debug: + for i in range(world_size): + for j in range(len(fp32_flat_groups[0])): + print(f"{FP32_FLAT_GROUPS}[{i}][{j}].shape={fp32_flat_groups[i][j].shape}") + + # XXX: memory usage doubles here (zero2) + num_param_groups = len(fp32_flat_groups[0]) + merged_single_partition_of_fp32_groups = [] + for i in range(num_param_groups): + merged_partitions = [sd[i] for sd in fp32_flat_groups] + full_single_fp32_vector = torch.cat(merged_partitions, 0) + merged_single_partition_of_fp32_groups.append(full_single_fp32_vector) + avail_numel = sum( + [full_single_fp32_vector.numel() for full_single_fp32_vector in merged_single_partition_of_fp32_groups]) + + if debug: + wanted_params = sum([len(shapes) for shapes in param_shapes]) + wanted_numel = sum([sum(shape.numel() for shape in shapes.values()) for shapes in param_shapes]) + # not asserting if there is a mismatch due to possible padding + print(f"Have {avail_numel} numels to process.") + print(f"Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + total_numel = 0 + total_params = 0 + for shapes, full_single_fp32_vector in zip(param_shapes, merged_single_partition_of_fp32_groups): + offset = 0 + avail_numel = full_single_fp32_vector.numel() + for name, shape in shapes.items(): + + unpartitioned_numel = shape.numel() if _has_callable(shape, 'numel') else math.prod(shape) + total_numel += unpartitioned_numel + total_params += 1 + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + state_dict[name] = full_single_fp32_vector.narrow(0, offset, unpartitioned_numel).view(shape) + offset += unpartitioned_numel + + # Z2 started to align to 2*world_size to improve nccl performance. Therefore both offset and + # avail_numel can differ by anywhere between 0..2*world_size. Due to two unrelated complex + # paddings performed in the code it's almost impossible to predict the exact numbers w/o the + # live optimizer object, so we are checking that the numbers are within the right range + align_to = 2 * world_size + + def zero2_align(x): + return align_to * math.ceil(x / align_to) + + if debug: + print(f"original offset={offset}, avail_numel={avail_numel}") + + offset = zero2_align(offset) + avail_numel = zero2_align(avail_numel) + + if debug: + print(f"aligned offset={offset}, avail_numel={avail_numel}") + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero2_merge_frozen_params(state_dict, zero_model_states) + + _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def zero3_partitioned_param_info(unpartitioned_numel, world_size): + remainder = unpartitioned_numel % world_size + padding_numel = (world_size - remainder) if remainder else 0 + partitioned_numel = math.ceil(unpartitioned_numel / world_size) + return partitioned_numel, padding_numel + + +def _zero3_merge_frozen_params(state_dict, world_size, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + if debug: + for i in range(world_size): + num_elem = sum(s.numel() for s in zero_model_states[i].frozen_param_fragments.values()) + print(f'rank {i}: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + wanted_params = len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in zero_model_states[0].frozen_param_fragments.values()]) * world_size + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in zero_model_states[0].frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + param_frags = tuple(model_state.frozen_param_fragments[name] for model_state in zero_model_states) + state_dict[name] = torch.cat(param_frags, 0).narrow(0, 0, unpartitioned_numel).view(shape) + + partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Frozen params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +def _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + avail_numel = fp32_flat_groups[0].numel() * world_size + # Reconstruction protocol: For zero3 we need to zip the partitions together at boundary of each + # param, re-consolidating each param, while dealing with padding if any + + # merge list of dicts, preserving order + param_shapes = {k: v for d in param_shapes for k, v in d.items()} + + if debug: + for i in range(world_size): + print(f"{FP32_FLAT_GROUPS}[{i}].shape={fp32_flat_groups[i].shape}") + + wanted_params = len(param_shapes) + wanted_numel = sum(shape.numel() for shape in param_shapes.values()) + # not asserting if there is a mismatch due to possible padding + avail_numel = fp32_flat_groups[0].numel() * world_size + print(f"Trainable params: Have {avail_numel} numels to process.") + print(f"Trainable params: Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + offset = 0 + total_numel = 0 + total_params = 0 + for name, shape in param_shapes.items(): + + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + total_params += 1 + + partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Trainable params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + # XXX: memory usage doubles here + state_dict[name] = torch.cat( + tuple(fp32_flat_groups[i].narrow(0, offset, partitioned_numel) for i in range(world_size)), + 0).narrow(0, 0, unpartitioned_numel).view(shape) + offset += partitioned_numel + + offset *= world_size + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed Trainable fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero3_merge_frozen_params(state_dict, world_size, zero_model_states) + + _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag=None, exclude_frozen_parameters=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated state_dict that can be loaded with + ``load_state_dict()`` and used for training without DeepSpeed or shared with others, for example + via a model hub. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in 'latest' file. e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + + Returns: + - pytorch ``state_dict`` + + Note: this approach may not work if your application doesn't have sufficient free CPU memory and + you may need to use the offline approach using the ``zero_to_fp32.py`` script that is saved with + the checkpoint. + + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint + # do the training and checkpoint saving + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir) # already on cpu + model = model.cpu() # move to cpu + model.load_state_dict(state_dict) + # submit to model hub or save the model to share with others + + In this example the ``model`` will no longer be usable in the deepspeed context of the same + application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + If you want it all done for you, use ``load_state_dict_from_zero_checkpoint`` instead. + + """ + if tag is None: + latest_path = os.path.join(checkpoint_dir, 'latest') + if os.path.isfile(latest_path): + with open(latest_path, 'r') as fd: + tag = fd.read().strip() + else: + raise ValueError(f"Unable to find 'latest' file at {latest_path}") + + ds_checkpoint_dir = os.path.join(checkpoint_dir, tag) + + if not os.path.isdir(ds_checkpoint_dir): + raise FileNotFoundError(f"Directory '{ds_checkpoint_dir}' doesn't exist") + + return _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters) + + +def convert_zero_checkpoint_to_fp32_state_dict(checkpoint_dir, output_file, tag=None, exclude_frozen_parameters=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` file that can be + loaded with ``torch.load(file)`` + ``load_state_dict()`` and used for training without DeepSpeed. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``output_file``: path to the pytorch fp32 state_dict output file (e.g. path/pytorch_model.bin) + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + """ + + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag, exclude_frozen_parameters) + print(f"Saving fp32 state dict to {output_file}") + torch.save(state_dict, output_file) + + +def load_state_dict_from_zero_checkpoint(model, checkpoint_dir, tag=None): + """ + 1. Put the provided model to cpu + 2. Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` + 3. Load it into the provided model + + Args: + - ``model``: the model object to update + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + + Returns: + - ``model`: modified model + + Make sure you have plenty of CPU memory available before you call this function. If you don't + have enough use the ``zero_to_fp32.py`` utility to do the conversion. You will find it + conveniently placed for you in the checkpoint folder. + + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import load_state_dict_from_zero_checkpoint + model = load_state_dict_from_zero_checkpoint(trainer.model, checkpoint_dir) + # submit to model hub or save the model to share with others + + Note, that once this was run, the ``model`` will no longer be usable in the deepspeed context + of the same application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + """ + logger.info(f"Extracting fp32 weights") + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag) + + logger.info(f"Overwriting model with fp32 weights") + model = model.cpu() + model.load_state_dict(state_dict, strict=False) + + return model + + +if __name__ == "__main__": + + parser = argparse.ArgumentParser() + parser.add_argument("checkpoint_dir", + type=str, + help="path to the desired checkpoint folder, e.g., path/checkpoint-12") + parser.add_argument( + "output_file", + type=str, + help="path to the pytorch fp32 state_dict output file (e.g. path/checkpoint-12/pytorch_model.bin)") + parser.add_argument("-t", + "--tag", + type=str, + default=None, + help="checkpoint tag used as a unique identifier for checkpoint. e.g., global_step1") + parser.add_argument("--exclude_frozen_parameters", action='store_true', help="exclude frozen parameters") + parser.add_argument("-d", "--debug", action='store_true', help="enable debug") + args = parser.parse_args() + + debug = args.debug + + convert_zero_checkpoint_to_fp32_state_dict(args.checkpoint_dir, + args.output_file, + tag=args.tag, + exclude_frozen_parameters=args.exclude_frozen_parameters) diff --git a/sft_pretrain/Full_xmoe/checkpoint-2080/added_tokens.json b/sft_pretrain/Full_xmoe/checkpoint-2080/added_tokens.json new file mode 100644 index 0000000000000000000000000000000000000000..c9d3d3a1b74d87e381e471f7b33784015d2dc0ea --- /dev/null +++ b/sft_pretrain/Full_xmoe/checkpoint-2080/added_tokens.json @@ -0,0 +1,13 @@ +{ + "<|assistant|>": 32001, + "<|endoftext|>": 32000, + "<|end|>": 32007, + "<|placeholder1|>": 32002, + "<|placeholder2|>": 32003, + "<|placeholder3|>": 32004, + "<|placeholder4|>": 32005, + "<|placeholder5|>": 32008, + "<|placeholder6|>": 32009, + "<|system|>": 32006, + "<|user|>": 32010 +} diff --git a/sft_pretrain/Full_xmoe/checkpoint-2080/config.json b/sft_pretrain/Full_xmoe/checkpoint-2080/config.json new file mode 100644 index 0000000000000000000000000000000000000000..5ed860286ec8c9b3f17e5234326d2ed728ca6a65 --- /dev/null +++ b/sft_pretrain/Full_xmoe/checkpoint-2080/config.json @@ -0,0 +1,200 @@ +{ + "_name_or_path": "/cm/archive/namnv78/checkpoints/phi35-siglip224/pft", + "architectures": [ + "LlavaPhiForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "auto_map": { + "AutoConfig": "configuration_phi3.Phi3Config", + "AutoModelForCausalLM": "modeling_phi3.Phi3ForCausalLM" + }, + "bal_comp_loss_coef": 0.01, + "balance_loss_coef": 0.01, + "bos_token_id": 1, + "clip_smoe": false, + "diversity_loss_coef": 0.01, + "dropout": false, + "e_loss_coef": 0.001, + "embd_pdrop": 0.0, + "entropy_advance_loss": false, + "eos_token_id": 32000, + "freeze_backbone": false, + "freeze_mm_mlp_adapter": false, + "hidden_act": "silu", + "hidden_size": 3072, + "hybrid": false, + "image_aspect_ratio": "pad", + "init_weight": true, + "initializer_range": 0.02, + "intermediate_size": 8192, + "is_cosine": false, + "is_norm_weight": false, + "local_rank": 0, + "loss1": "balanceloss", + "loss2": "zloss", + "luna": false, + "max_compete_in_iter": 3, + "max_position_embeddings": 131072, + "mlp_smoe": true, + "mm_hidden_size": 1152, + "mm_patch_merge_type": "flat", + "mm_projector_lr": null, + "mm_projector_type": "moe", + "mm_use_im_patch_token": false, + "mm_use_im_start_end": false, + "mm_vision_select_feature": "patch", + "mm_vision_select_layer": -2, + "mm_vision_tower": "google/siglip-so400m-patch14-224", + "model_name_or_path": "/cm/archive/namnv78/checkpoints/phi35-siglip224/pft", + "model_type": "llava_phi", + "moe_name": "xmoe", + "norm_softmax": false, + "normalization": false, + "num_attention_heads": 32, + "num_experts": 8, + "num_hidden_layers": 32, + "num_key_value_heads": 32, + "num_layers": 3, + "num_selected": 4, + "number_of_previous_tokens": 2, + "original_max_position_embeddings": 4096, + "pad_token_id": 32000, + "pretrain_mm_mlp_adapter": null, + "rate_compete": 0.2, + "rate_flip": 0.05, + "resid_pdrop": 0.0, + "rms_norm_eps": 1e-05, + "rope_scaling": { + "long_factor": [ + 1.0800000429153442, + 1.1100000143051147, + 1.1399999856948853, + 1.340000033378601, + 1.5899999141693115, + 1.600000023841858, + 1.6200000047683716, + 2.620000123977661, + 3.2300000190734863, + 3.2300000190734863, + 4.789999961853027, + 7.400000095367432, + 7.700000286102295, + 9.09000015258789, + 12.199999809265137, + 17.670000076293945, + 24.46000099182129, + 28.57000160217285, + 30.420001983642578, + 30.840002059936523, + 32.590003967285156, + 32.93000411987305, + 42.320003509521484, + 44.96000289916992, + 50.340003967285156, + 50.45000457763672, + 57.55000305175781, + 57.93000411987305, + 58.21000289916992, + 60.1400032043457, + 62.61000442504883, + 62.62000274658203, + 62.71000289916992, + 63.1400032043457, + 63.1400032043457, + 63.77000427246094, + 63.93000411987305, + 63.96000289916992, + 63.970001220703125, + 64.02999877929688, + 64.06999969482422, + 64.08000183105469, + 64.12000274658203, + 64.41000366210938, + 64.4800033569336, + 64.51000213623047, + 64.52999877929688, + 64.83999633789062 + ], + "short_factor": [ + 1.0, + 1.0199999809265137, + 1.0299999713897705, + 1.0299999713897705, + 1.0499999523162842, + 1.0499999523162842, + 1.0499999523162842, + 1.0499999523162842, + 1.0499999523162842, + 1.0699999332427979, + 1.0999999046325684, + 1.1099998950958252, + 1.1599998474121094, + 1.1599998474121094, + 1.1699998378753662, + 1.2899998426437378, + 1.339999794960022, + 1.679999828338623, + 1.7899998426437378, + 1.8199998140335083, + 1.8499997854232788, + 1.8799997568130493, + 1.9099997282028198, + 1.9399996995925903, + 1.9899996519088745, + 2.0199997425079346, + 2.0199997425079346, + 2.0199997425079346, + 2.0199997425079346, + 2.0199997425079346, + 2.0199997425079346, + 2.0299997329711914, + 2.0299997329711914, + 2.0299997329711914, + 2.0299997329711914, + 2.0299997329711914, + 2.0299997329711914, + 2.0299997329711914, + 2.0299997329711914, + 2.0299997329711914, + 2.0799996852874756, + 2.0899996757507324, + 2.189999580383301, + 2.2199995517730713, + 2.5899994373321533, + 2.729999542236328, + 2.749999523162842, + 2.8399994373321533 + ], + "type": "longrope" + }, + "rope_theta": 10000.0, + "router_loss_coef": 0.01, + "router_theta": 0.1, + "router_z_loss_coef": 0.001, + "scales": [ + 1, + 3 + ], + "sliding_window": 262144, + "sparse_upcycling": false, + "strategy_train": "base", + "tie_word_embeddings": false, + "tokenizer_model_max_length": 2048, + "tokenizer_padding_side": "right", + "topk_max": 2, + "topk_min": 1, + "torch_dtype": "bfloat16", + "training": true, + "transformers_version": "4.43.0", + "tune_mm_mlp_adapter": false, + "unit_test": true, + "use_cache": false, + "use_mm_proj": true, + "use_old": false, + "version": "phi35", + "vision_tower": "google/siglip-so400m-patch14-224", + "vision_tower_dir": "/cm/archive/namnv78/checkpoints/phi35-siglip224/pft/clip.bin", + "vocab_size": 32064, + "warm_up": 0.05 +} diff --git a/sft_pretrain/Full_xmoe/checkpoint-2080/generation_config.json b/sft_pretrain/Full_xmoe/checkpoint-2080/generation_config.json new file mode 100644 index 0000000000000000000000000000000000000000..dad5c4578f0dc5969b38755d095fc30c368bb54a --- /dev/null +++ b/sft_pretrain/Full_xmoe/checkpoint-2080/generation_config.json @@ -0,0 +1,12 @@ +{ + "_from_model_config": true, + "bos_token_id": 1, + "do_sample": true, + "eos_token_id": [ + 32007, + 32001, + 32000 + ], + "pad_token_id": 32000, + "transformers_version": "4.43.0" +} diff --git a/sft_pretrain/Full_xmoe/checkpoint-2080/global_step2080/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt b/sft_pretrain/Full_xmoe/checkpoint-2080/global_step2080/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..acd8990253b57f1db6bb52784f147176c8f34002 --- /dev/null +++ b/sft_pretrain/Full_xmoe/checkpoint-2080/global_step2080/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:74bb1384b368db9146031632fdb25c8829284f43ae580d68268985ef7071a7ed +size 396609872 diff --git a/sft_pretrain/Full_xmoe/checkpoint-2080/global_step2080/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt b/sft_pretrain/Full_xmoe/checkpoint-2080/global_step2080/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..0c58fa21ccc68852d87fecf2ccb67a62715c7e97 --- /dev/null +++ b/sft_pretrain/Full_xmoe/checkpoint-2080/global_step2080/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e8bddd2ef7e5ac219bb84cadf1f7f9b3d67968f927b30c95280f6d5343144265 +size 396609872 diff --git a/sft_pretrain/Full_xmoe/checkpoint-2080/global_step2080/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt b/sft_pretrain/Full_xmoe/checkpoint-2080/global_step2080/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..0f48fa23dee3a8cafed7222eda5e29380cd4579b --- /dev/null +++ b/sft_pretrain/Full_xmoe/checkpoint-2080/global_step2080/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0e72ead37031898c7b34ff55875c374c39acc894a7eb8789167ed7867ae332a0 +size 396609872 diff --git a/sft_pretrain/Full_xmoe/checkpoint-2080/global_step2080/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt b/sft_pretrain/Full_xmoe/checkpoint-2080/global_step2080/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..6a65c3f8b786cbf3741d2e6f8c5ad32589f5ceaf --- /dev/null +++ b/sft_pretrain/Full_xmoe/checkpoint-2080/global_step2080/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:20f5006fa68314971a1bc16c6f9e30c6b04428c7881189d7372a4c08358a7af2 +size 396609872 diff --git a/sft_pretrain/Full_xmoe/checkpoint-2080/global_step2080/zero_pp_rank_0_mp_rank_00_model_states.pt b/sft_pretrain/Full_xmoe/checkpoint-2080/global_step2080/zero_pp_rank_0_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..7c86239d69bd3c9a95e6c9b78a2aeaa7a2ef5ca4 --- /dev/null +++ b/sft_pretrain/Full_xmoe/checkpoint-2080/global_step2080/zero_pp_rank_0_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:59adda3b4d6f472811c6654d2f053684e04e84a70ccbf330de452c6703f4fd1a +size 2117322914 diff --git a/sft_pretrain/Full_xmoe/checkpoint-2080/global_step2080/zero_pp_rank_1_mp_rank_00_model_states.pt b/sft_pretrain/Full_xmoe/checkpoint-2080/global_step2080/zero_pp_rank_1_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..2f377bbb60628f298d9fee043ce8dce0c4c6a767 --- /dev/null +++ b/sft_pretrain/Full_xmoe/checkpoint-2080/global_step2080/zero_pp_rank_1_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7747759edd94ce71d43499041f52c7fdfe86ae996d441f725b25c3966305c3d1 +size 2117322914 diff --git a/sft_pretrain/Full_xmoe/checkpoint-2080/global_step2080/zero_pp_rank_2_mp_rank_00_model_states.pt b/sft_pretrain/Full_xmoe/checkpoint-2080/global_step2080/zero_pp_rank_2_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..082723162f16b4fd7af5aa11498ab7fad5e4f8ac --- /dev/null +++ b/sft_pretrain/Full_xmoe/checkpoint-2080/global_step2080/zero_pp_rank_2_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:76d8200a04707d5cb751026da169f03e88ef48169d0674ae8b00864e65f77697 +size 2117322914 diff --git a/sft_pretrain/Full_xmoe/checkpoint-2080/global_step2080/zero_pp_rank_3_mp_rank_00_model_states.pt b/sft_pretrain/Full_xmoe/checkpoint-2080/global_step2080/zero_pp_rank_3_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..860d593365e6f0ada9b5cb0860e031f95c2ddd7f --- /dev/null +++ b/sft_pretrain/Full_xmoe/checkpoint-2080/global_step2080/zero_pp_rank_3_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fad4ab406c8f9b8394bfe594776935a2df0cf6538e327b74d033c0692e6a13d2 +size 2117322914 diff --git a/sft_pretrain/Full_xmoe/checkpoint-2080/latest b/sft_pretrain/Full_xmoe/checkpoint-2080/latest new file mode 100644 index 0000000000000000000000000000000000000000..306b989cc55bbad3d1661dff0bcd6923a752cb0a --- /dev/null +++ b/sft_pretrain/Full_xmoe/checkpoint-2080/latest @@ -0,0 +1 @@ +global_step2080 \ No newline at end of file diff --git a/sft_pretrain/Full_xmoe/checkpoint-2080/model-00001-of-00002.safetensors b/sft_pretrain/Full_xmoe/checkpoint-2080/model-00001-of-00002.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..29d76f5d80605301aab2bba59b53a5e2582094c4 --- /dev/null +++ b/sft_pretrain/Full_xmoe/checkpoint-2080/model-00001-of-00002.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fe6c4f6ef38e8993629091331e0bbf23484cc88bdfd038f0dd17b6ec2800d855 +size 4972489328 diff --git a/sft_pretrain/Full_xmoe/checkpoint-2080/model-00002-of-00002.safetensors b/sft_pretrain/Full_xmoe/checkpoint-2080/model-00002-of-00002.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..d7db2e23479ef81aea81bf871601b07fd39c1afe --- /dev/null +++ b/sft_pretrain/Full_xmoe/checkpoint-2080/model-00002-of-00002.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4a74517d7b804572e9cb39ad0f94bc8b9f7c9dba23090e6f23f572b2044efb98 +size 3759044016 diff --git a/sft_pretrain/Full_xmoe/checkpoint-2080/model.safetensors.index.json b/sft_pretrain/Full_xmoe/checkpoint-2080/model.safetensors.index.json new file mode 100644 index 0000000000000000000000000000000000000000..507806fb086ee2ffdb4c1df263574fc5a7cfa513 --- /dev/null +++ b/sft_pretrain/Full_xmoe/checkpoint-2080/model.safetensors.index.json @@ -0,0 +1,675 @@ +{ + "metadata": { + "total_size": 8731443248 + }, + "weight_map": { + "lm_head.weight": "model-00002-of-00002.safetensors", + "model.embed_tokens.weight": "model-00001-of-00002.safetensors", + "model.layers.0.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.0.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.0.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.0.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.0.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.0.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.1.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.1.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.1.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.1.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.1.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.1.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.10.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.10.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.10.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.10.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.10.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.10.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.11.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.11.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.11.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.11.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.11.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.11.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.12.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.12.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.12.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.12.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.12.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.12.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.13.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.13.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.13.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.13.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.13.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.13.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.14.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.14.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.14.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.14.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.14.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.14.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.15.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.15.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.15.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.15.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.15.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.15.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.16.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.16.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.16.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.16.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.16.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.16.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.17.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.17.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.17.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.17.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.17.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.17.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.18.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.18.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.18.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.18.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.18.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.18.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.19.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.19.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.19.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.19.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.19.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.19.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.2.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.2.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.2.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.2.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.2.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.2.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.20.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.20.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.20.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.20.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.20.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.20.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.21.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.21.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.21.mlp.gate_up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.21.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.21.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.21.self_attn.qkv_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.22.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.22.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.22.mlp.gate_up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.22.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.22.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.22.self_attn.qkv_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.23.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.23.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.23.mlp.gate_up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.23.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.23.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.23.self_attn.qkv_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.24.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.24.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.24.mlp.gate_up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.24.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.24.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.24.self_attn.qkv_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.25.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.25.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.25.mlp.gate_up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.25.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.25.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.25.self_attn.qkv_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.26.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.26.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.26.mlp.gate_up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.26.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.26.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.26.self_attn.qkv_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.27.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.27.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.27.mlp.gate_up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.27.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.27.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.27.self_attn.qkv_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.28.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.28.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.28.mlp.gate_up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.28.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.28.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.28.self_attn.qkv_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.29.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.29.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.29.mlp.gate_up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.29.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.29.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.29.self_attn.qkv_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.3.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.3.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.3.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.3.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.3.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.3.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.30.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.30.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.30.mlp.gate_up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.30.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.30.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.30.self_attn.qkv_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.31.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.31.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.31.mlp.gate_up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.31.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.31.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.31.self_attn.qkv_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.4.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.4.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.4.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.4.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.4.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.4.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.5.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.5.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.5.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.5.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.5.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.5.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.6.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.6.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.6.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.6.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.6.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.6.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.7.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.7.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.7.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.7.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.7.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.7.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.8.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.8.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.8.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.8.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.8.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.8.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.9.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.9.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.9.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.9.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.9.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.9.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.mm_projector.layer_norm.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.layer_norm.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.expert_embeddings": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.0.0.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.0.0.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.0.2.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.0.2.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.1.0.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.1.0.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.1.2.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.1.2.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.2.0.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.2.0.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.2.2.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.2.2.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.3.0.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.3.0.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.3.2.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.3.2.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.4.0.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.4.0.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.4.2.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.4.2.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.5.0.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.5.0.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.5.2.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.5.2.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.6.0.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.6.0.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.6.2.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.6.2.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.7.0.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.7.0.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.7.2.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.7.2.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.gate.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.gate.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.inp_reduction.weight": "model-00002-of-00002.safetensors", + "model.norm.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.embeddings.patch_embedding.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.embeddings.patch_embedding.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.embeddings.position_embedding.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.self_attn.v_proj.weight": "model-00002-of-00002.safetensors" + } +} diff --git a/sft_pretrain/Full_xmoe/checkpoint-2080/rng_state_0.pth b/sft_pretrain/Full_xmoe/checkpoint-2080/rng_state_0.pth new file mode 100644 index 0000000000000000000000000000000000000000..ef4849062bcdc8ffd2246c07673ba196a8d61a6d --- /dev/null +++ b/sft_pretrain/Full_xmoe/checkpoint-2080/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fae2114fffe9b1eea30e28bbdb4ce59046b0079ea5b8dc4682079f609d49d787 +size 14960 diff --git a/sft_pretrain/Full_xmoe/checkpoint-2080/rng_state_1.pth b/sft_pretrain/Full_xmoe/checkpoint-2080/rng_state_1.pth new file mode 100644 index 0000000000000000000000000000000000000000..2fcb2b640bc236c26aa841680d34a91240247970 --- /dev/null +++ b/sft_pretrain/Full_xmoe/checkpoint-2080/rng_state_1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d4ff5f3a53530ac868291e2667c8f824bfa1f4fa1ce880df8223a7165ef38e11 +size 14960 diff --git a/sft_pretrain/Full_xmoe/checkpoint-2080/rng_state_2.pth b/sft_pretrain/Full_xmoe/checkpoint-2080/rng_state_2.pth new file mode 100644 index 0000000000000000000000000000000000000000..00c3f989de00e6d58ca7345ae6f65fee0afcbdcd --- /dev/null +++ b/sft_pretrain/Full_xmoe/checkpoint-2080/rng_state_2.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:91f80a7779b0034e70106ba6cb0e3e686052334c20ce54453ee3977cc0219d15 +size 14960 diff --git a/sft_pretrain/Full_xmoe/checkpoint-2080/rng_state_3.pth b/sft_pretrain/Full_xmoe/checkpoint-2080/rng_state_3.pth new file mode 100644 index 0000000000000000000000000000000000000000..f289913854ee3fa52a86e282421da07d85b8a4c4 --- /dev/null +++ b/sft_pretrain/Full_xmoe/checkpoint-2080/rng_state_3.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ece3bc0d0e16c43ef245cc787cbd0d63d08d460f489c4cd52adf6501b9281a18 +size 14960 diff --git a/sft_pretrain/Full_xmoe/checkpoint-2080/special_tokens_map.json b/sft_pretrain/Full_xmoe/checkpoint-2080/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..3e4d5a5bc1cb51753cc9ae0305ece0da60052b10 --- /dev/null +++ b/sft_pretrain/Full_xmoe/checkpoint-2080/special_tokens_map.json @@ -0,0 +1,24 @@ +{ + "bos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "<|endoftext|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": "", + "unk_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } +} diff --git a/sft_pretrain/Full_xmoe/checkpoint-2080/tokenizer.model b/sft_pretrain/Full_xmoe/checkpoint-2080/tokenizer.model new file mode 100644 index 0000000000000000000000000000000000000000..6c00c742ce03c627d6cd5b795984876fa49fa899 --- /dev/null +++ b/sft_pretrain/Full_xmoe/checkpoint-2080/tokenizer.model @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9e556afd44213b6bd1be2b850ebbbd98f5481437a8021afaf58ee7fb1818d347 +size 499723 diff --git a/sft_pretrain/Full_xmoe/checkpoint-2080/tokenizer_config.json b/sft_pretrain/Full_xmoe/checkpoint-2080/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..d579bb0b91b24b214ea3c2e487e27a65017cdc4a --- /dev/null +++ b/sft_pretrain/Full_xmoe/checkpoint-2080/tokenizer_config.json @@ -0,0 +1,132 @@ +{ + "add_bos_token": false, + "add_eos_token": false, + "add_prefix_space": true, + "added_tokens_decoder": { + "0": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "1": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "2": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": false + }, + "32000": { + "content": "<|endoftext|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "32001": { + "content": "<|assistant|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "32002": { + "content": "<|placeholder1|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "32003": { + "content": "<|placeholder2|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "32004": { + "content": "<|placeholder3|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "32005": { + "content": "<|placeholder4|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "32006": { + "content": "<|system|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "32007": { + "content": "<|end|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "32008": { + "content": "<|placeholder5|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "32009": { + "content": "<|placeholder6|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "32010": { + "content": "<|user|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + } + }, + "bos_token": "", + "chat_template": "{% for message in messages %}{% if message['role'] == 'system' and message['content'] %}{{'<|system|>\n' + message['content'] + '<|end|>\n'}}{% elif message['role'] == 'user' %}{{'<|user|>\n' + message['content'] + '<|end|>\n'}}{% elif message['role'] == 'assistant' %}{{'<|assistant|>\n' + message['content'] + '<|end|>\n'}}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ '<|assistant|>\n' }}{% else %}{{ eos_token }}{% endif %}", + "clean_up_tokenization_spaces": false, + "eos_token": "<|endoftext|>", + "legacy": false, + "model_max_length": 2048, + "pad_token": "", + "padding_side": "right", + "sp_model_kwargs": {}, + "spaces_between_special_tokens": false, + "tokenizer_class": "LlamaTokenizer", + "unk_token": "", + "use_default_system_prompt": false +} diff --git a/sft_pretrain/Full_xmoe/checkpoint-2080/trainer_state.json b/sft_pretrain/Full_xmoe/checkpoint-2080/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..64a091f78b6cbb675ba2c10a621ae8dc06545e1e --- /dev/null +++ b/sft_pretrain/Full_xmoe/checkpoint-2080/trainer_state.json @@ -0,0 +1,31233 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.4001539053482108, + "eval_steps": 500, + "global_step": 2080, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0334678, + "balance_loss_mlp": 2.48847342, + "epoch": 0.00019238168526356292, + "flos": 471022563072.0, + "grad_norm": 15.010934477254423, + "language_loss": 2.91277003, + "learning_rate": 0.0, + "loss": 1.95375419, + "num_input_tokens_seen": 67104, + "router_z_loss_mlp": 8.6015625, + "step": 1, + "time_per_iteration": 23.313215732574463 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.03608113, + "balance_loss_mlp": 3.00043201, + "epoch": 0.00038476337052712584, + "flos": 505538830848.0, + "grad_norm": 25.821694542927546, + "language_loss": 10.7459116, + "learning_rate": 0.00013726078121135892, + "loss": 10.78199196, + "num_input_tokens_seen": 134080, + "router_z_loss_mlp": 6.06640625, + "step": 2, + "time_per_iteration": 2.6342098712921143 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.03648002, + "balance_loss_mlp": 3.03803182, + "epoch": 0.0005771450557906887, + "flos": 600334166016.0, + "grad_norm": 27.537763142134942, + "language_loss": 10.88985825, + "learning_rate": 0.00021755319103969496, + "loss": 10.9263401, + "num_input_tokens_seen": 205152, + "router_z_loss_mlp": 6.08984375, + "step": 3, + "time_per_iteration": 2.9129159450531006 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.03639085, + "balance_loss_mlp": 3.03521824, + "epoch": 0.0007695267410542517, + "flos": 581497386240.0, + "grad_norm": 10.719163482624658, + "language_loss": 8.79598808, + "learning_rate": 0.00027452156242271784, + "loss": 8.83237934, + "num_input_tokens_seen": 269664, + "router_z_loss_mlp": 6.02734375, + "step": 4, + "time_per_iteration": 2.72357439994812 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.03604871, + "balance_loss_mlp": 3.01435566, + "epoch": 0.0009619084263178145, + "flos": 487154061312.0, + "grad_norm": 22.68157363884245, + "language_loss": 9.41989708, + "learning_rate": 0.0003187096642208417, + "loss": 9.45594501, + "num_input_tokens_seen": 338560, + "router_z_loss_mlp": 5.8984375, + "step": 5, + "time_per_iteration": 2.6791844367980957 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.03472164, + "balance_loss_mlp": 2.9011035, + "epoch": 0.0011542901115813775, + "flos": 561167503872.0, + "grad_norm": 7.113488232519407, + "language_loss": 9.41725159, + "learning_rate": 0.0003548139722510539, + "loss": 9.45197296, + "num_input_tokens_seen": 410112, + "router_z_loss_mlp": 5.72265625, + "step": 6, + "time_per_iteration": 2.7308623790740967 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.03266853, + "balance_loss_mlp": 2.70799947, + "epoch": 0.0013466717968449403, + "flos": 534951738624.0, + "grad_norm": 3.189932925125429, + "language_loss": 8.01036549, + "learning_rate": 0.00038533972973918044, + "loss": 8.0430336, + "num_input_tokens_seen": 477552, + "router_z_loss_mlp": 5.59765625, + "step": 7, + "time_per_iteration": 2.6907436847686768 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02962571, + "balance_loss_mlp": 2.41211033, + "epoch": 0.0015390534821085034, + "flos": 493334485248.0, + "grad_norm": 5.13822781788523, + "language_loss": 7.84486008, + "learning_rate": 0.0004117823436340768, + "loss": 7.87448597, + "num_input_tokens_seen": 549184, + "router_z_loss_mlp": 5.51171875, + "step": 8, + "time_per_iteration": 2.6274044513702393 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02550478, + "balance_loss_mlp": 2.0114615, + "epoch": 0.0017314351673720662, + "flos": 565776090624.0, + "grad_norm": 3.8232757327488405, + "language_loss": 7.62468719, + "learning_rate": 0.00043510638207938993, + "loss": 7.65019178, + "num_input_tokens_seen": 622880, + "router_z_loss_mlp": 5.39453125, + "step": 9, + "time_per_iteration": 2.7688682079315186 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02337757, + "balance_loss_mlp": 1.81705093, + "epoch": 0.001923816852635629, + "flos": 594509521152.0, + "grad_norm": 3.0012265425900817, + "language_loss": 6.96830463, + "learning_rate": 0.00045597044543220066, + "loss": 6.99168253, + "num_input_tokens_seen": 693584, + "router_z_loss_mlp": 5.20703125, + "step": 10, + "time_per_iteration": 2.736985921859741 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02262083, + "balance_loss_mlp": 1.74290299, + "epoch": 0.002116198537899192, + "flos": 610895709696.0, + "grad_norm": 2.2728267884834983, + "language_loss": 6.92078686, + "learning_rate": 0.00047484428652143135, + "loss": 6.94340801, + "num_input_tokens_seen": 774432, + "router_z_loss_mlp": 5.19140625, + "step": 11, + "time_per_iteration": 2.8857340812683105 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02308547, + "balance_loss_mlp": 1.78135598, + "epoch": 0.002308580223162755, + "flos": 546175262976.0, + "grad_norm": 4.334726148282724, + "language_loss": 6.71077013, + "learning_rate": 0.0004920747534624128, + "loss": 6.73385572, + "num_input_tokens_seen": 844304, + "router_z_loss_mlp": 5.2734375, + "step": 12, + "time_per_iteration": 2.635601282119751 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02317905, + "balance_loss_mlp": 1.79147708, + "epoch": 0.002500961908426318, + "flos": 645924270336.0, + "grad_norm": 3.1568536142119923, + "language_loss": 6.53248501, + "learning_rate": 0.0005079252465375872, + "loss": 6.55566406, + "num_input_tokens_seen": 915104, + "router_z_loss_mlp": 5.265625, + "step": 13, + "time_per_iteration": 2.8112540245056152 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02242807, + "balance_loss_mlp": 1.72019386, + "epoch": 0.0026933435936898806, + "flos": 488849352960.0, + "grad_norm": 7.572425831928954, + "language_loss": 6.47189951, + "learning_rate": 0.0005226005109505393, + "loss": 6.49432755, + "num_input_tokens_seen": 982720, + "router_z_loss_mlp": 5.2265625, + "step": 14, + "time_per_iteration": 2.590078353881836 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02247915, + "balance_loss_mlp": 1.72415757, + "epoch": 0.0028857252789534437, + "flos": 435526429440.0, + "grad_norm": 2.3229781853457747, + "language_loss": 6.01724243, + "learning_rate": 0.0005362628552605367, + "loss": 6.03972149, + "num_input_tokens_seen": 1050528, + "router_z_loss_mlp": 5.23828125, + "step": 15, + "time_per_iteration": 2.636983871459961 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02135688, + "balance_loss_mlp": 1.62108541, + "epoch": 0.0030781069642170067, + "flos": 597841778688.0, + "grad_norm": 4.36506198708269, + "language_loss": 5.46747923, + "learning_rate": 0.0005490431248454357, + "loss": 5.48883629, + "num_input_tokens_seen": 1116512, + "router_z_loss_mlp": 5.14453125, + "step": 16, + "time_per_iteration": 2.6904103755950928 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02173305, + "balance_loss_mlp": 1.67586899, + "epoch": 0.0032704886494805694, + "flos": 1541513154048.0, + "grad_norm": 0.3693165783384919, + "language_loss": 0.75705111, + "learning_rate": 0.0005610483427624225, + "loss": 0.77878416, + "num_input_tokens_seen": 1351216, + "router_z_loss_mlp": 4.96875, + "step": 17, + "time_per_iteration": 6.815098285675049 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01958957, + "balance_loss_mlp": 1.45846832, + "epoch": 0.0034628703347441324, + "flos": 474971102976.0, + "grad_norm": 7.376330921510473, + "language_loss": 3.16160107, + "learning_rate": 0.0005723671632907488, + "loss": 3.18119049, + "num_input_tokens_seen": 1420512, + "router_z_loss_mlp": 5.0, + "step": 18, + "time_per_iteration": 2.7730185985565186 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01974299, + "balance_loss_mlp": 1.48144007, + "epoch": 0.0036552520200076955, + "flos": 449478556416.0, + "grad_norm": 2.0435067055151803, + "language_loss": 1.8205657, + "learning_rate": 0.0005830738490244919, + "loss": 1.84030867, + "num_input_tokens_seen": 1484976, + "router_z_loss_mlp": 4.921875, + "step": 19, + "time_per_iteration": 2.5196421146392822 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02215561, + "balance_loss_mlp": 1.73147547, + "epoch": 0.003847633705271258, + "flos": 637351580928.0, + "grad_norm": 2.199322832792736, + "language_loss": 1.81859815, + "learning_rate": 0.0005932312266435596, + "loss": 1.84075379, + "num_input_tokens_seen": 1557392, + "router_z_loss_mlp": 4.83203125, + "step": 20, + "time_per_iteration": 2.7772061824798584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02397049, + "balance_loss_mlp": 1.91639686, + "epoch": 0.004040015390534821, + "flos": 590591105280.0, + "grad_norm": 2.068137361611091, + "language_loss": 1.81285238, + "learning_rate": 0.0006028929207788754, + "loss": 1.83682299, + "num_input_tokens_seen": 1626064, + "router_z_loss_mlp": 4.796875, + "step": 21, + "time_per_iteration": 2.7197327613830566 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02949394, + "balance_loss_mlp": 2.47560835, + "epoch": 0.004232397075798384, + "flos": 757866929664.0, + "grad_norm": 0.9893066861855494, + "language_loss": 1.43565178, + "learning_rate": 0.0006121050677327902, + "loss": 1.46514571, + "num_input_tokens_seen": 1696528, + "router_z_loss_mlp": 4.7265625, + "step": 22, + "time_per_iteration": 2.8821635246276855 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.04240368, + "balance_loss_mlp": 3.77421188, + "epoch": 0.004424778761061947, + "flos": 527727310080.0, + "grad_norm": 1.6702760591351544, + "language_loss": 1.36044598, + "learning_rate": 0.0006209076479463684, + "loss": 1.40284979, + "num_input_tokens_seen": 1765936, + "router_z_loss_mlp": 4.6484375, + "step": 23, + "time_per_iteration": 2.6194069385528564 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.04405254, + "balance_loss_mlp": 3.93871665, + "epoch": 0.00461716044632551, + "flos": 549218815488.0, + "grad_norm": 1.6356367296774819, + "language_loss": 1.46302319, + "learning_rate": 0.0006293355346737718, + "loss": 1.50707567, + "num_input_tokens_seen": 1841632, + "router_z_loss_mlp": 4.65234375, + "step": 24, + "time_per_iteration": 2.741433620452881 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.03977472, + "balance_loss_mlp": 3.50483179, + "epoch": 0.004809542131589073, + "flos": 568752569088.0, + "grad_norm": 1.079559317914091, + "language_loss": 1.33177948, + "learning_rate": 0.0006374193284416834, + "loss": 1.37155437, + "num_input_tokens_seen": 1920256, + "router_z_loss_mlp": 4.71484375, + "step": 25, + "time_per_iteration": 2.902089834213257 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.03127712, + "balance_loss_mlp": 2.642483, + "epoch": 0.005001923816852636, + "flos": 471584410368.0, + "grad_norm": 0.4847890845471295, + "language_loss": 1.26058078, + "learning_rate": 0.0006451860277489461, + "loss": 1.29185796, + "num_input_tokens_seen": 1986528, + "router_z_loss_mlp": 4.84375, + "step": 26, + "time_per_iteration": 2.6045680046081543 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02733563, + "balance_loss_mlp": 2.23154879, + "epoch": 0.005194305502116198, + "flos": 416381502720.0, + "grad_norm": 0.2845036760864029, + "language_loss": 1.33193052, + "learning_rate": 0.0006526595731190848, + "loss": 1.35926616, + "num_input_tokens_seen": 2048016, + "router_z_loss_mlp": 5.015625, + "step": 27, + "time_per_iteration": 2.4412264823913574 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02759137, + "balance_loss_mlp": 2.2411015, + "epoch": 0.005386687187379761, + "flos": 629996894976.0, + "grad_norm": 0.34713687972437796, + "language_loss": 1.22031224, + "learning_rate": 0.0006598612921618983, + "loss": 1.24790359, + "num_input_tokens_seen": 2127664, + "router_z_loss_mlp": 5.1796875, + "step": 28, + "time_per_iteration": 2.80483078956604 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02575766, + "balance_loss_mlp": 2.05010033, + "epoch": 0.005579068872643324, + "flos": 888021326592.0, + "grad_norm": 0.3062478898066755, + "language_loss": 1.16221631, + "learning_rate": 0.0006668102665011454, + "loss": 1.18797398, + "num_input_tokens_seen": 2213952, + "router_z_loss_mlp": 5.2578125, + "step": 29, + "time_per_iteration": 3.243164300918579 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02507804, + "balance_loss_mlp": 1.97527242, + "epoch": 0.005771450557906887, + "flos": 548658902016.0, + "grad_norm": 0.22276861521731073, + "language_loss": 1.24634933, + "learning_rate": 0.0006735236364718957, + "loss": 1.27142727, + "num_input_tokens_seen": 2284736, + "router_z_loss_mlp": 5.328125, + "step": 30, + "time_per_iteration": 2.7701382637023926 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02465182, + "balance_loss_mlp": 1.93226886, + "epoch": 0.00596383224317045, + "flos": 533069809152.0, + "grad_norm": 0.21102664747409663, + "language_loss": 1.23222375, + "learning_rate": 0.0006800168558381346, + "loss": 1.25687563, + "num_input_tokens_seen": 2354384, + "router_z_loss_mlp": 5.33203125, + "step": 31, + "time_per_iteration": 2.635246515274048 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02445382, + "balance_loss_mlp": 1.91552007, + "epoch": 0.0061562139284340135, + "flos": 590163394560.0, + "grad_norm": 0.21886797396213825, + "language_loss": 1.26610851, + "learning_rate": 0.0006863039060567947, + "loss": 1.29056239, + "num_input_tokens_seen": 2419440, + "router_z_loss_mlp": 5.30078125, + "step": 32, + "time_per_iteration": 2.7791683673858643 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02338603, + "balance_loss_mlp": 1.80950415, + "epoch": 0.006348595613697576, + "flos": 619442154240.0, + "grad_norm": 0.18971916612404452, + "language_loss": 1.17543316, + "learning_rate": 0.0006923974775611263, + "loss": 1.19881916, + "num_input_tokens_seen": 2496368, + "router_z_loss_mlp": 5.29296875, + "step": 33, + "time_per_iteration": 2.836601495742798 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02160521, + "balance_loss_mlp": 1.64134097, + "epoch": 0.006540977298961139, + "flos": 779300109312.0, + "grad_norm": 0.13369632510289112, + "language_loss": 1.13907146, + "learning_rate": 0.0006983091239737814, + "loss": 1.16067672, + "num_input_tokens_seen": 2573280, + "router_z_loss_mlp": 5.19140625, + "step": 34, + "time_per_iteration": 3.021479606628418 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0221033, + "balance_loss_mlp": 1.69649041, + "epoch": 0.006733358984224702, + "flos": 668373264384.0, + "grad_norm": 0.11522706717853448, + "language_loss": 1.11973858, + "learning_rate": 0.0007040493939600222, + "loss": 1.14184177, + "num_input_tokens_seen": 2647248, + "router_z_loss_mlp": 5.13671875, + "step": 35, + "time_per_iteration": 2.9400346279144287 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0227657, + "balance_loss_mlp": 1.76997864, + "epoch": 0.006925740669488265, + "flos": 565496133888.0, + "grad_norm": 0.11143421895921844, + "language_loss": 1.12295914, + "learning_rate": 0.0007096279445021078, + "loss": 1.14572477, + "num_input_tokens_seen": 2720736, + "router_z_loss_mlp": 5.0625, + "step": 36, + "time_per_iteration": 2.698153495788574 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02284885, + "balance_loss_mlp": 1.78668559, + "epoch": 0.007118122354751828, + "flos": 551112405504.0, + "grad_norm": 0.11733654674395574, + "language_loss": 1.1734066, + "learning_rate": 0.0007150536386503726, + "loss": 1.19625545, + "num_input_tokens_seen": 2800336, + "router_z_loss_mlp": 4.9765625, + "step": 37, + "time_per_iteration": 2.8579084873199463 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02268399, + "balance_loss_mlp": 1.77782845, + "epoch": 0.007310504040015391, + "flos": 703814951424.0, + "grad_norm": 0.14208952684155102, + "language_loss": 1.10088778, + "learning_rate": 0.0007203346302358509, + "loss": 1.12357187, + "num_input_tokens_seen": 2883184, + "router_z_loss_mlp": 4.8984375, + "step": 38, + "time_per_iteration": 2.928835391998291 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02220017, + "balance_loss_mlp": 1.73555112, + "epoch": 0.007502885725278953, + "flos": 600501361920.0, + "grad_norm": 0.142042154575746, + "language_loss": 1.15486813, + "learning_rate": 0.000725478437577282, + "loss": 1.17706823, + "num_input_tokens_seen": 2960736, + "router_z_loss_mlp": 4.8359375, + "step": 39, + "time_per_iteration": 2.8706436157226562 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0209897, + "balance_loss_mlp": 1.62251425, + "epoch": 0.007695267410542516, + "flos": 561428018688.0, + "grad_norm": 0.13255726845543458, + "language_loss": 1.10233212, + "learning_rate": 0.0007304920078549186, + "loss": 1.12332189, + "num_input_tokens_seen": 3033472, + "router_z_loss_mlp": 4.75390625, + "step": 40, + "time_per_iteration": 2.6895179748535156 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01939831, + "balance_loss_mlp": 1.46986008, + "epoch": 0.007887649095806078, + "flos": 509231725056.0, + "grad_norm": 0.11166218824526469, + "language_loss": 1.12161303, + "learning_rate": 0.0007353817735343603, + "loss": 1.14101124, + "num_input_tokens_seen": 3107824, + "router_z_loss_mlp": 4.6875, + "step": 41, + "time_per_iteration": 2.709167957305908 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0184399, + "balance_loss_mlp": 1.3778342, + "epoch": 0.008080030781069641, + "flos": 504905040384.0, + "grad_norm": 0.06254207778511488, + "language_loss": 1.07663667, + "learning_rate": 0.0007401537019902344, + "loss": 1.09507656, + "num_input_tokens_seen": 3176528, + "router_z_loss_mlp": 4.6484375, + "step": 42, + "time_per_iteration": 2.5947837829589844 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01789021, + "balance_loss_mlp": 1.32896876, + "epoch": 0.008272412466333205, + "flos": 519106988544.0, + "grad_norm": 0.07012531219711775, + "language_loss": 1.09992051, + "learning_rate": 0.0007448133392900729, + "loss": 1.11781073, + "num_input_tokens_seen": 3254256, + "router_z_loss_mlp": 4.5859375, + "step": 43, + "time_per_iteration": 2.6997878551483154 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01787217, + "balance_loss_mlp": 1.32983518, + "epoch": 0.008464794151596768, + "flos": 609184866816.0, + "grad_norm": 0.09276066699658307, + "language_loss": 1.05755496, + "learning_rate": 0.0007493658489441491, + "loss": 1.07542706, + "num_input_tokens_seen": 3340224, + "router_z_loss_mlp": 4.56640625, + "step": 44, + "time_per_iteration": 2.8852477073669434 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0177156, + "balance_loss_mlp": 1.31913674, + "epoch": 0.00865717583686033, + "flos": 539007214848.0, + "grad_norm": 0.11478380715178954, + "language_loss": 1.09959674, + "learning_rate": 0.0007538160463002316, + "loss": 1.11731243, + "num_input_tokens_seen": 3409216, + "router_z_loss_mlp": 4.53125, + "step": 45, + "time_per_iteration": 2.685568332672119 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01802016, + "balance_loss_mlp": 1.35378933, + "epoch": 0.008849557522123894, + "flos": 509010094080.0, + "grad_norm": 0.14537339285711792, + "language_loss": 1.13533509, + "learning_rate": 0.0007581684291577274, + "loss": 1.15335524, + "num_input_tokens_seen": 3478352, + "router_z_loss_mlp": 4.49609375, + "step": 46, + "time_per_iteration": 2.5798568725585938 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01764716, + "balance_loss_mlp": 1.31915987, + "epoch": 0.009041939207387457, + "flos": 626508135168.0, + "grad_norm": 0.13285081251714825, + "language_loss": 1.15270185, + "learning_rate": 0.0007624272050891776, + "loss": 1.17034888, + "num_input_tokens_seen": 3555616, + "router_z_loss_mlp": 4.46875, + "step": 47, + "time_per_iteration": 2.822632312774658 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0175788, + "balance_loss_mlp": 1.31461263, + "epoch": 0.00923432089265102, + "flos": 550610817792.0, + "grad_norm": 0.11934546954286276, + "language_loss": 1.04916859, + "learning_rate": 0.0007665963158851307, + "loss": 1.06674731, + "num_input_tokens_seen": 3634512, + "router_z_loss_mlp": 4.4453125, + "step": 48, + "time_per_iteration": 2.7924864292144775 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01741735, + "balance_loss_mlp": 1.29846764, + "epoch": 0.009426702577914583, + "flos": 563679333120.0, + "grad_norm": 0.08548395668661983, + "language_loss": 1.13647461, + "learning_rate": 0.0007706794594783609, + "loss": 1.15389204, + "num_input_tokens_seen": 3708480, + "router_z_loss_mlp": 4.4453125, + "step": 49, + "time_per_iteration": 2.734813928604126 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01727457, + "balance_loss_mlp": 1.28838515, + "epoch": 0.009619084263178146, + "flos": 617926697472.0, + "grad_norm": 0.06892583067190382, + "language_loss": 1.12110853, + "learning_rate": 0.0007746801096530423, + "loss": 1.13838315, + "num_input_tokens_seen": 3783472, + "router_z_loss_mlp": 4.40234375, + "step": 50, + "time_per_iteration": 2.7447421550750732 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01719129, + "balance_loss_mlp": 1.28043914, + "epoch": 0.009811465948441709, + "flos": 542489171712.0, + "grad_norm": 0.04778558244894799, + "language_loss": 1.16797209, + "learning_rate": 0.0007786015338021173, + "loss": 1.1851635, + "num_input_tokens_seen": 3851360, + "router_z_loss_mlp": 4.3984375, + "step": 51, + "time_per_iteration": 2.65645694732666 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01722789, + "balance_loss_mlp": 1.28562462, + "epoch": 0.010003847633705272, + "flos": 536977531392.0, + "grad_norm": 0.06217135289779639, + "language_loss": 1.09074998, + "learning_rate": 0.0007824468089603051, + "loss": 1.10797799, + "num_input_tokens_seen": 3923056, + "router_z_loss_mlp": 4.3828125, + "step": 52, + "time_per_iteration": 2.7218713760375977 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01697539, + "balance_loss_mlp": 1.26380801, + "epoch": 0.010196229318968833, + "flos": 910806657792.0, + "grad_norm": 0.04206474108062499, + "language_loss": 1.08130515, + "learning_rate": 0.0007862188363098669, + "loss": 1.09828055, + "num_input_tokens_seen": 4004528, + "router_z_loss_mlp": 4.34765625, + "step": 53, + "time_per_iteration": 3.149973154067993 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01668333, + "balance_loss_mlp": 1.23765349, + "epoch": 0.010388611004232396, + "flos": 586970142720.0, + "grad_norm": 0.050634309517598654, + "language_loss": 1.08688021, + "learning_rate": 0.0007899203543304438, + "loss": 1.10356343, + "num_input_tokens_seen": 4078704, + "router_z_loss_mlp": 4.31640625, + "step": 54, + "time_per_iteration": 2.7033088207244873 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01691162, + "balance_loss_mlp": 1.26315343, + "epoch": 0.01058099268949596, + "flos": 503472208896.0, + "grad_norm": 0.06464656169002964, + "language_loss": 1.22991037, + "learning_rate": 0.0007935539507422731, + "loss": 1.246822, + "num_input_tokens_seen": 4143600, + "router_z_loss_mlp": 4.2890625, + "step": 55, + "time_per_iteration": 2.601745843887329 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.017059, + "balance_loss_mlp": 1.28017938, + "epoch": 0.010773374374759523, + "flos": 545558969088.0, + "grad_norm": 0.06403483907250343, + "language_loss": 1.12561536, + "learning_rate": 0.0007971220733732573, + "loss": 1.14267421, + "num_input_tokens_seen": 4217904, + "router_z_loss_mlp": 4.265625, + "step": 56, + "time_per_iteration": 2.677314281463623 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0169453, + "balance_loss_mlp": 1.27262425, + "epoch": 0.010965756060023086, + "flos": 527286960384.0, + "grad_norm": 0.061369678053330295, + "language_loss": 1.07931721, + "learning_rate": 0.0008006270400641869, + "loss": 1.09626245, + "num_input_tokens_seen": 4293920, + "router_z_loss_mlp": 4.2265625, + "step": 57, + "time_per_iteration": 2.7162468433380127 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01699229, + "balance_loss_mlp": 1.27846837, + "epoch": 0.011158137745286649, + "flos": 578098054656.0, + "grad_norm": 0.06126094216688289, + "language_loss": 1.08923888, + "learning_rate": 0.0008040710477125043, + "loss": 1.10623109, + "num_input_tokens_seen": 4370080, + "router_z_loss_mlp": 4.21484375, + "step": 58, + "time_per_iteration": 2.724116563796997 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01648065, + "balance_loss_mlp": 1.23150039, + "epoch": 0.011350519430550212, + "flos": 530314961664.0, + "grad_norm": 0.059594432794803906, + "language_loss": 1.09501219, + "learning_rate": 0.0008074561805429771, + "loss": 1.11149275, + "num_input_tokens_seen": 4439792, + "router_z_loss_mlp": 4.171875, + "step": 59, + "time_per_iteration": 2.613821268081665 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01628698, + "balance_loss_mlp": 1.21594822, + "epoch": 0.011542901115813775, + "flos": 556971076608.0, + "grad_norm": 0.046387810099464834, + "language_loss": 1.0703913, + "learning_rate": 0.0008107844176832545, + "loss": 1.08667827, + "num_input_tokens_seen": 4510800, + "router_z_loss_mlp": 4.1328125, + "step": 60, + "time_per_iteration": 2.6809566020965576 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01602811, + "balance_loss_mlp": 1.19349384, + "epoch": 0.011735282801077338, + "flos": 573176463360.0, + "grad_norm": 0.036957475185327084, + "language_loss": 1.08104563, + "learning_rate": 0.0008140576401132568, + "loss": 1.09707379, + "num_input_tokens_seen": 4581136, + "router_z_loss_mlp": 4.09765625, + "step": 61, + "time_per_iteration": 2.644085645675659 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01596506, + "balance_loss_mlp": 1.19024038, + "epoch": 0.0119276644863409, + "flos": 616717442304.0, + "grad_norm": 0.034032461682055544, + "language_loss": 1.09685671, + "learning_rate": 0.0008172776370494935, + "loss": 1.11282182, + "num_input_tokens_seen": 4650352, + "router_z_loss_mlp": 4.06640625, + "step": 62, + "time_per_iteration": 2.7589328289031982 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01605764, + "balance_loss_mlp": 1.20255029, + "epoch": 0.012120046171604464, + "flos": 502085064192.0, + "grad_norm": 0.035968497482949544, + "language_loss": 1.17104983, + "learning_rate": 0.0008204461118185703, + "loss": 1.18710756, + "num_input_tokens_seen": 4716336, + "router_z_loss_mlp": 4.03515625, + "step": 63, + "time_per_iteration": 2.594369411468506 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01603311, + "balance_loss_mlp": 1.20353031, + "epoch": 0.012312427856868027, + "flos": 474302319360.0, + "grad_norm": 0.04911792883083492, + "language_loss": 1.06295228, + "learning_rate": 0.0008235646872681536, + "loss": 1.07898545, + "num_input_tokens_seen": 4781648, + "router_z_loss_mlp": 3.99609375, + "step": 64, + "time_per_iteration": 2.5651702880859375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01599528, + "balance_loss_mlp": 1.20279896, + "epoch": 0.012504809542131588, + "flos": 539471864064.0, + "grad_norm": 0.049725750424410776, + "language_loss": 1.06296277, + "learning_rate": 0.0008266349107584288, + "loss": 1.07895803, + "num_input_tokens_seen": 4852320, + "router_z_loss_mlp": 3.95898438, + "step": 65, + "time_per_iteration": 2.6876485347747803 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01596697, + "balance_loss_mlp": 1.20492756, + "epoch": 0.012697191227395151, + "flos": 609857541120.0, + "grad_norm": 0.056540756097456804, + "language_loss": 1.08585978, + "learning_rate": 0.0008296582587724851, + "loss": 1.10182667, + "num_input_tokens_seen": 4922016, + "router_z_loss_mlp": 3.91210938, + "step": 66, + "time_per_iteration": 2.71223783493042 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01587883, + "balance_loss_mlp": 1.19821179, + "epoch": 0.012889572912658714, + "flos": 769398600960.0, + "grad_norm": 0.04465917834699911, + "language_loss": 1.0627861, + "learning_rate": 0.0008326361411800136, + "loss": 1.07866502, + "num_input_tokens_seen": 5000128, + "router_z_loss_mlp": 3.89648438, + "step": 67, + "time_per_iteration": 2.9413115978240967 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01577237, + "balance_loss_mlp": 1.19099891, + "epoch": 0.013081954597922277, + "flos": 535021724928.0, + "grad_norm": 0.05343660826588632, + "language_loss": 1.06744349, + "learning_rate": 0.0008355699051851403, + "loss": 1.08321595, + "num_input_tokens_seen": 5074512, + "router_z_loss_mlp": 3.86132812, + "step": 68, + "time_per_iteration": 2.726212501525879 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0157129, + "balance_loss_mlp": 1.18829489, + "epoch": 0.01327433628318584, + "flos": 574181584128.0, + "grad_norm": 0.041490887209285586, + "language_loss": 1.14052749, + "learning_rate": 0.0008384608389860635, + "loss": 1.15624034, + "num_input_tokens_seen": 5141856, + "router_z_loss_mlp": 3.828125, + "step": 69, + "time_per_iteration": 2.6679208278656006 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0156381, + "balance_loss_mlp": 1.18386579, + "epoch": 0.013466717968449404, + "flos": 498259967232.0, + "grad_norm": 0.03618836919088814, + "language_loss": 1.04182374, + "learning_rate": 0.000841310175171381, + "loss": 1.05746174, + "num_input_tokens_seen": 5209280, + "router_z_loss_mlp": 3.796875, + "step": 70, + "time_per_iteration": 2.6277127265930176 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01563963, + "balance_loss_mlp": 1.18592632, + "epoch": 0.013659099653712967, + "flos": 566622763776.0, + "grad_norm": 0.04320101591589407, + "language_loss": 1.02295327, + "learning_rate": 0.000844119093875517, + "loss": 1.03859293, + "num_input_tokens_seen": 5285424, + "router_z_loss_mlp": 3.77734375, + "step": 71, + "time_per_iteration": 2.7236883640289307 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01558639, + "balance_loss_mlp": 1.18403625, + "epoch": 0.01385148133897653, + "flos": 574943686656.0, + "grad_norm": 0.03416580025853519, + "language_loss": 1.06855714, + "learning_rate": 0.0008468887257134666, + "loss": 1.08414352, + "num_input_tokens_seen": 5358624, + "router_z_loss_mlp": 3.7421875, + "step": 72, + "time_per_iteration": 2.6696412563323975 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01558456, + "balance_loss_mlp": 1.18499684, + "epoch": 0.014043863024240093, + "flos": 577959048960.0, + "grad_norm": 0.037886537215891476, + "language_loss": 1.09368944, + "learning_rate": 0.0008496201545131264, + "loss": 1.10927403, + "num_input_tokens_seen": 5429792, + "router_z_loss_mlp": 3.73046875, + "step": 73, + "time_per_iteration": 2.701594591140747 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01545785, + "balance_loss_mlp": 1.17575896, + "epoch": 0.014236244709503656, + "flos": 940265252352.0, + "grad_norm": 0.04766211184506119, + "language_loss": 1.07240248, + "learning_rate": 0.0008523144198617317, + "loss": 1.08786011, + "num_input_tokens_seen": 5518608, + "router_z_loss_mlp": 3.6953125, + "step": 74, + "time_per_iteration": 3.1882145404815674 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01551426, + "balance_loss_mlp": 1.18387985, + "epoch": 0.014428626394767219, + "flos": 529496478720.0, + "grad_norm": 0.031986864242930464, + "language_loss": 1.06216824, + "learning_rate": 0.0008549725194813783, + "loss": 1.0776825, + "num_input_tokens_seen": 5590576, + "router_z_loss_mlp": 3.66992188, + "step": 75, + "time_per_iteration": 2.666274309158325 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01546662, + "balance_loss_mlp": 1.18102288, + "epoch": 0.014621008080030782, + "flos": 805283549952.0, + "grad_norm": 0.03321604497436844, + "language_loss": 1.05779314, + "learning_rate": 0.0008575954114472099, + "loss": 1.07325983, + "num_input_tokens_seen": 5674224, + "router_z_loss_mlp": 3.65039062, + "step": 76, + "time_per_iteration": 3.1192731857299805 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01547179, + "balance_loss_mlp": 1.18478322, + "epoch": 0.014813389765294343, + "flos": 698357746176.0, + "grad_norm": 0.03477979781895141, + "language_loss": 1.02737951, + "learning_rate": 0.0008601840162606118, + "loss": 1.04285145, + "num_input_tokens_seen": 5757648, + "router_z_loss_mlp": 3.6171875, + "step": 77, + "time_per_iteration": 3.0015783309936523 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01547226, + "balance_loss_mlp": 1.18788171, + "epoch": 0.015005771450557906, + "flos": 598165476864.0, + "grad_norm": 0.032631512960834254, + "language_loss": 1.09477437, + "learning_rate": 0.000862739218788641, + "loss": 1.11024666, + "num_input_tokens_seen": 5837600, + "router_z_loss_mlp": 3.58984375, + "step": 78, + "time_per_iteration": 2.790245771408081 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01536731, + "balance_loss_mlp": 1.18177319, + "epoch": 0.01519815313582147, + "flos": 550493199360.0, + "grad_norm": 0.0308447873241268, + "language_loss": 1.07131243, + "learning_rate": 0.0008652618700799138, + "loss": 1.0866797, + "num_input_tokens_seen": 5907248, + "router_z_loss_mlp": 3.55664062, + "step": 79, + "time_per_iteration": 2.6302430629730225 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01532812, + "balance_loss_mlp": 1.18033433, + "epoch": 0.015390534821085032, + "flos": 431440817664.0, + "grad_norm": 0.04595099678969376, + "language_loss": 1.06556606, + "learning_rate": 0.0008677527890662774, + "loss": 1.08089423, + "num_input_tokens_seen": 5970864, + "router_z_loss_mlp": 3.53125, + "step": 80, + "time_per_iteration": 2.4970459938049316 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01520539, + "balance_loss_mlp": 1.17130363, + "epoch": 0.015582916506348595, + "flos": 525185345280.0, + "grad_norm": 0.030530536654869142, + "language_loss": 1.07461143, + "learning_rate": 0.0008702127641587799, + "loss": 1.08981681, + "num_input_tokens_seen": 6040800, + "router_z_loss_mlp": 3.49804688, + "step": 81, + "time_per_iteration": 2.6258630752563477 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01512144, + "balance_loss_mlp": 1.16500628, + "epoch": 0.015775298191612157, + "flos": 576617591040.0, + "grad_norm": 0.026948447424875538, + "language_loss": 1.02672768, + "learning_rate": 0.0008726425547457192, + "loss": 1.04184914, + "num_input_tokens_seen": 6111840, + "router_z_loss_mlp": 3.4765625, + "step": 82, + "time_per_iteration": 2.7344956398010254 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01517079, + "balance_loss_mlp": 1.17375636, + "epoch": 0.01596767987687572, + "flos": 611440071936.0, + "grad_norm": 0.03479426421062965, + "language_loss": 1.02940345, + "learning_rate": 0.0008750428925998964, + "loss": 1.04457426, + "num_input_tokens_seen": 6183872, + "router_z_loss_mlp": 3.4375, + "step": 83, + "time_per_iteration": 2.738685369491577 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01509349, + "balance_loss_mlp": 1.16850555, + "epoch": 0.016160061562139283, + "flos": 568233484800.0, + "grad_norm": 0.05178756375238081, + "language_loss": 1.08039558, + "learning_rate": 0.0008774144832015932, + "loss": 1.09548914, + "num_input_tokens_seen": 6255760, + "router_z_loss_mlp": 3.41210938, + "step": 84, + "time_per_iteration": 2.6948299407958984 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02575775, + "balance_loss_mlp": 2.26144409, + "epoch": 0.016352443247402846, + "flos": 1414502431488.0, + "grad_norm": 0.37456313977874084, + "language_loss": 0.74774313, + "learning_rate": 0.0008797580069832641, + "loss": 0.7735008, + "num_input_tokens_seen": 6472960, + "router_z_loss_mlp": 3.140625, + "step": 85, + "time_per_iteration": 4.596364974975586 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01517697, + "balance_loss_mlp": 1.17895198, + "epoch": 0.01654482493266641, + "flos": 731786279424.0, + "grad_norm": 0.04138572693056026, + "language_loss": 1.03059626, + "learning_rate": 0.0008820741205014318, + "loss": 1.04577315, + "num_input_tokens_seen": 6548912, + "router_z_loss_mlp": 3.390625, + "step": 86, + "time_per_iteration": 2.901047706604004 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01566516, + "balance_loss_mlp": 1.22757995, + "epoch": 0.016737206617929972, + "flos": 537405242112.0, + "grad_norm": 0.0588613682629828, + "language_loss": 1.04849172, + "learning_rate": 0.0008843634575408404, + "loss": 1.06415701, + "num_input_tokens_seen": 6621520, + "router_z_loss_mlp": 3.39257812, + "step": 87, + "time_per_iteration": 2.6739823818206787 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01583525, + "balance_loss_mlp": 1.24497032, + "epoch": 0.016929588303193535, + "flos": 538130406144.0, + "grad_norm": 0.09131872689500015, + "language_loss": 1.06101418, + "learning_rate": 0.0008866266301555082, + "loss": 1.07684946, + "num_input_tokens_seen": 6698432, + "router_z_loss_mlp": 3.38867188, + "step": 88, + "time_per_iteration": 2.741093635559082 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0156783, + "balance_loss_mlp": 1.23118281, + "epoch": 0.017121969988457098, + "flos": 527792438784.0, + "grad_norm": 0.07103005743700296, + "language_loss": 1.07027078, + "learning_rate": 0.0008888642296509615, + "loss": 1.08594918, + "num_input_tokens_seen": 6764336, + "router_z_loss_mlp": 3.36914062, + "step": 89, + "time_per_iteration": 2.622267007827759 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01554346, + "balance_loss_mlp": 1.2196058, + "epoch": 0.01731435167372066, + "flos": 626768649984.0, + "grad_norm": 0.057543283798364535, + "language_loss": 1.11941445, + "learning_rate": 0.0008910768275115906, + "loss": 1.13495779, + "num_input_tokens_seen": 6839392, + "router_z_loss_mlp": 3.34960938, + "step": 90, + "time_per_iteration": 2.778939962387085 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01536545, + "balance_loss_mlp": 1.20409441, + "epoch": 0.017506733358984224, + "flos": 497385103872.0, + "grad_norm": 0.06951140803051024, + "language_loss": 1.07318401, + "learning_rate": 0.0008932649762767675, + "loss": 1.08854938, + "num_input_tokens_seen": 6907344, + "router_z_loss_mlp": 3.32617188, + "step": 91, + "time_per_iteration": 2.5841660499572754 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01529864, + "balance_loss_mlp": 1.20122755, + "epoch": 0.017699115044247787, + "flos": 747218870016.0, + "grad_norm": 0.037985069994816135, + "language_loss": 1.10022223, + "learning_rate": 0.0008954292103690864, + "loss": 1.11552095, + "num_input_tokens_seen": 6982464, + "router_z_loss_mlp": 3.28710938, + "step": 92, + "time_per_iteration": 2.976200580596924 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01525091, + "balance_loss_mlp": 1.19893408, + "epoch": 0.01789149672951135, + "flos": 516521282304.0, + "grad_norm": 0.05507041657686672, + "language_loss": 1.1172272, + "learning_rate": 0.0008975700468778296, + "loss": 1.13247812, + "num_input_tokens_seen": 7049712, + "router_z_loss_mlp": 3.26171875, + "step": 93, + "time_per_iteration": 2.5778274536132812 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01518338, + "balance_loss_mlp": 1.19427943, + "epoch": 0.018083878414774913, + "flos": 587230657536.0, + "grad_norm": 0.047907590915393955, + "language_loss": 1.05762661, + "learning_rate": 0.0008996879863005366, + "loss": 1.07280993, + "num_input_tokens_seen": 7120288, + "router_z_loss_mlp": 3.24023438, + "step": 94, + "time_per_iteration": 2.6827101707458496 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01506508, + "balance_loss_mlp": 1.18664575, + "epoch": 0.018276260100038477, + "flos": 498370782720.0, + "grad_norm": 0.03950158468897577, + "language_loss": 1.05640411, + "learning_rate": 0.0009017835132453337, + "loss": 1.07146931, + "num_input_tokens_seen": 7188896, + "router_z_loss_mlp": 3.19726562, + "step": 95, + "time_per_iteration": 2.5879104137420654 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01488471, + "balance_loss_mlp": 1.17223215, + "epoch": 0.01846864178530204, + "flos": 641233058304.0, + "grad_norm": 0.042611409633865054, + "language_loss": 1.05607677, + "learning_rate": 0.0009038570970964896, + "loss": 1.07096148, + "num_input_tokens_seen": 7259536, + "router_z_loss_mlp": 3.16015625, + "step": 96, + "time_per_iteration": 2.761634349822998 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01487316, + "balance_loss_mlp": 1.17374837, + "epoch": 0.018661023470565603, + "flos": 512667995136.0, + "grad_norm": 0.026597294022958493, + "language_loss": 1.02809072, + "learning_rate": 0.0009059091926454854, + "loss": 1.04296374, + "num_input_tokens_seen": 7326752, + "router_z_loss_mlp": 3.1328125, + "step": 97, + "time_per_iteration": 2.602036952972412 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01487556, + "balance_loss_mlp": 1.17742097, + "epoch": 0.018853405155829166, + "flos": 932697683712.0, + "grad_norm": 0.04097414840704221, + "language_loss": 1.01764143, + "learning_rate": 0.0009079402406897198, + "loss": 1.03251696, + "num_input_tokens_seen": 7417488, + "router_z_loss_mlp": 3.09765625, + "step": 98, + "time_per_iteration": 3.2514705657958984 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01483888, + "balance_loss_mlp": 1.17642295, + "epoch": 0.01904578684109273, + "flos": 577587718656.0, + "grad_norm": 0.027217181555243938, + "language_loss": 1.03385735, + "learning_rate": 0.0009099506686008212, + "loss": 1.04869628, + "num_input_tokens_seen": 7493136, + "router_z_loss_mlp": 3.0703125, + "step": 99, + "time_per_iteration": 2.7867672443389893 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01473245, + "balance_loss_mlp": 1.16883183, + "epoch": 0.019238168526356292, + "flos": 559521789696.0, + "grad_norm": 0.02943095981266107, + "language_loss": 1.06245995, + "learning_rate": 0.0009119408908644013, + "loss": 1.07719231, + "num_input_tokens_seen": 7560896, + "router_z_loss_mlp": 3.0390625, + "step": 100, + "time_per_iteration": 2.718982219696045 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01466855, + "balance_loss_mlp": 1.164922, + "epoch": 0.019430550211619855, + "flos": 725104267776.0, + "grad_norm": 0.035830377247789626, + "language_loss": 1.12020779, + "learning_rate": 0.0009139113095929519, + "loss": 1.13487625, + "num_input_tokens_seen": 7629040, + "router_z_loss_mlp": 3.01367188, + "step": 101, + "time_per_iteration": 2.9023444652557373 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0146708, + "balance_loss_mlp": 1.16781712, + "epoch": 0.019622931896883418, + "flos": 500456846592.0, + "grad_norm": 0.031534744220975436, + "language_loss": 1.0658195, + "learning_rate": 0.0009158623150134762, + "loss": 1.08049035, + "num_input_tokens_seen": 7694256, + "router_z_loss_mlp": 2.98632812, + "step": 102, + "time_per_iteration": 2.5731325149536133 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01479653, + "balance_loss_mlp": 1.1828692, + "epoch": 0.01981531358214698, + "flos": 510282532608.0, + "grad_norm": 0.0334583858191085, + "language_loss": 1.05968487, + "learning_rate": 0.000917794285931332, + "loss": 1.07448149, + "num_input_tokens_seen": 7762256, + "router_z_loss_mlp": 2.9609375, + "step": 103, + "time_per_iteration": 2.656132221221924 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01477107, + "balance_loss_mlp": 1.18184972, + "epoch": 0.020007695267410544, + "flos": 522393559296.0, + "grad_norm": 0.033386157220771755, + "language_loss": 0.97816026, + "learning_rate": 0.0009197075901716639, + "loss": 0.99293131, + "num_input_tokens_seen": 7834400, + "router_z_loss_mlp": 2.9453125, + "step": 104, + "time_per_iteration": 2.7207133769989014 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01472947, + "balance_loss_mlp": 1.1811223, + "epoch": 0.020200076952674107, + "flos": 534444314880.0, + "grad_norm": 0.03432724584635873, + "language_loss": 1.08410704, + "learning_rate": 0.0009216025849997171, + "loss": 1.09883642, + "num_input_tokens_seen": 7911184, + "router_z_loss_mlp": 2.92382812, + "step": 105, + "time_per_iteration": 2.783440113067627 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01461838, + "balance_loss_mlp": 1.17115784, + "epoch": 0.020392458637937667, + "flos": 686083414272.0, + "grad_norm": 0.04360543496830388, + "language_loss": 1.02907205, + "learning_rate": 0.0009234796175212258, + "loss": 1.04369044, + "num_input_tokens_seen": 7985280, + "router_z_loss_mlp": 2.9140625, + "step": 106, + "time_per_iteration": 2.914760112762451 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01450941, + "balance_loss_mlp": 1.1615957, + "epoch": 0.02058484032320123, + "flos": 703415430912.0, + "grad_norm": 0.03266429542390293, + "language_loss": 1.06572628, + "learning_rate": 0.000925339025064007, + "loss": 1.08023572, + "num_input_tokens_seen": 8068320, + "router_z_loss_mlp": 2.90039062, + "step": 107, + "time_per_iteration": 2.951838254928589 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01453976, + "balance_loss_mlp": 1.16558492, + "epoch": 0.020777222008464793, + "flos": 640328059392.0, + "grad_norm": 0.03192051704400644, + "language_loss": 0.99516582, + "learning_rate": 0.0009271811355418027, + "loss": 1.00970554, + "num_input_tokens_seen": 8148144, + "router_z_loss_mlp": 2.890625, + "step": 108, + "time_per_iteration": 2.897881507873535 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01449031, + "balance_loss_mlp": 1.16159379, + "epoch": 0.020969603693728356, + "flos": 683321763840.0, + "grad_norm": 0.04466737388011785, + "language_loss": 1.06219566, + "learning_rate": 0.0009290062678013548, + "loss": 1.07668602, + "num_input_tokens_seen": 8222256, + "router_z_loss_mlp": 2.88085938, + "step": 109, + "time_per_iteration": 2.8423218727111816 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01430047, + "balance_loss_mlp": 1.14413536, + "epoch": 0.02116198537899192, + "flos": 534420015360.0, + "grad_norm": 0.034258615277409615, + "language_loss": 1.04797208, + "learning_rate": 0.0009308147319536321, + "loss": 1.06227255, + "num_input_tokens_seen": 8292432, + "router_z_loss_mlp": 2.86523438, + "step": 110, + "time_per_iteration": 2.6316323280334473 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01425495, + "balance_loss_mlp": 1.14053667, + "epoch": 0.021354367064255482, + "flos": 718728457728.0, + "grad_norm": 0.048864006828935096, + "language_loss": 1.11352324, + "learning_rate": 0.0009326068296900676, + "loss": 1.12777817, + "num_input_tokens_seen": 8365024, + "router_z_loss_mlp": 2.85546875, + "step": 111, + "time_per_iteration": 2.8313205242156982 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01416698, + "balance_loss_mlp": 1.13269377, + "epoch": 0.021546748749519045, + "flos": 520624390656.0, + "grad_norm": 0.040751650479700946, + "language_loss": 1.01643181, + "learning_rate": 0.0009343828545846161, + "loss": 1.03059864, + "num_input_tokens_seen": 8442448, + "router_z_loss_mlp": 2.84570312, + "step": 112, + "time_per_iteration": 2.7729175090789795 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01401674, + "balance_loss_mlp": 1.11805177, + "epoch": 0.021739130434782608, + "flos": 506161927680.0, + "grad_norm": 0.042106341000359294, + "language_loss": 1.06266427, + "learning_rate": 0.0009361430923823841, + "loss": 1.07668102, + "num_input_tokens_seen": 8508992, + "router_z_loss_mlp": 2.84179688, + "step": 113, + "time_per_iteration": 2.5920841693878174 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01394311, + "balance_loss_mlp": 1.11126053, + "epoch": 0.02193151212004617, + "flos": 464427055872.0, + "grad_norm": 0.07156510336232694, + "language_loss": 1.09574234, + "learning_rate": 0.0009378878212755459, + "loss": 1.10968542, + "num_input_tokens_seen": 8574048, + "router_z_loss_mlp": 2.8359375, + "step": 114, + "time_per_iteration": 2.5213706493377686 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01376382, + "balance_loss_mlp": 1.09371293, + "epoch": 0.022123893805309734, + "flos": 553332617472.0, + "grad_norm": 0.03568103744776456, + "language_loss": 0.9948864, + "learning_rate": 0.0009396173121672103, + "loss": 1.0086503, + "num_input_tokens_seen": 8647808, + "router_z_loss_mlp": 2.83203125, + "step": 115, + "time_per_iteration": 2.654648780822754 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01351182, + "balance_loss_mlp": 1.0677501, + "epoch": 0.022316275490573297, + "flos": 637379771136.0, + "grad_norm": 0.04471438423319615, + "language_loss": 1.05214882, + "learning_rate": 0.0009413318289238633, + "loss": 1.06566072, + "num_input_tokens_seen": 8719760, + "router_z_loss_mlp": 2.83984375, + "step": 116, + "time_per_iteration": 2.7842695713043213 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01311882, + "balance_loss_mlp": 1.0282588, + "epoch": 0.02250865717583686, + "flos": 800316271872.0, + "grad_norm": 0.046340717018109684, + "language_loss": 0.97282118, + "learning_rate": 0.0009430316286169771, + "loss": 0.98593992, + "num_input_tokens_seen": 8798752, + "router_z_loss_mlp": 2.84179688, + "step": 117, + "time_per_iteration": 3.015839099884033 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01377985, + "balance_loss_mlp": 1.09283674, + "epoch": 0.022701038861100423, + "flos": 457063621632.0, + "grad_norm": 0.07808854544893538, + "language_loss": 1.02862036, + "learning_rate": 0.0009447169617543361, + "loss": 1.04240024, + "num_input_tokens_seen": 8866848, + "router_z_loss_mlp": 2.85742188, + "step": 118, + "time_per_iteration": 2.582919120788574 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01371579, + "balance_loss_mlp": 1.08871901, + "epoch": 0.022893420546363986, + "flos": 584187105024.0, + "grad_norm": 0.08661397198668377, + "language_loss": 1.09685123, + "learning_rate": 0.0009463880725016029, + "loss": 1.11056697, + "num_input_tokens_seen": 8935488, + "router_z_loss_mlp": 2.83398438, + "step": 119, + "time_per_iteration": 2.6932969093322754 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01312523, + "balance_loss_mlp": 1.03252411, + "epoch": 0.02308580223162755, + "flos": 562478826240.0, + "grad_norm": 0.04303328442288268, + "language_loss": 1.04977584, + "learning_rate": 0.0009480451988946134, + "loss": 1.06290102, + "num_input_tokens_seen": 9015344, + "router_z_loss_mlp": 2.8046875, + "step": 120, + "time_per_iteration": 2.8070547580718994 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01299284, + "balance_loss_mlp": 1.02252805, + "epoch": 0.023278183916891113, + "flos": 772646287872.0, + "grad_norm": 0.03799067846502037, + "language_loss": 1.05637264, + "learning_rate": 0.0009496885730428627, + "loss": 1.0693655, + "num_input_tokens_seen": 9094672, + "router_z_loss_mlp": 2.77148438, + "step": 121, + "time_per_iteration": 3.014753580093384 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0130842, + "balance_loss_mlp": 1.03376198, + "epoch": 0.023470565602154676, + "flos": 554431057152.0, + "grad_norm": 0.04194740398285866, + "language_loss": 1.04016769, + "learning_rate": 0.0009513184213246156, + "loss": 1.05325174, + "num_input_tokens_seen": 9160608, + "router_z_loss_mlp": 2.75, + "step": 122, + "time_per_iteration": 2.633074998855591 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01316034, + "balance_loss_mlp": 1.04442739, + "epoch": 0.02366294728741824, + "flos": 561167503872.0, + "grad_norm": 0.038872106950025416, + "language_loss": 1.07101583, + "learning_rate": 0.0009529349645740552, + "loss": 1.08417618, + "num_input_tokens_seen": 9228704, + "router_z_loss_mlp": 2.71875, + "step": 123, + "time_per_iteration": 2.6846470832824707 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01320226, + "balance_loss_mlp": 1.05014575, + "epoch": 0.0238553289726818, + "flos": 469517788416.0, + "grad_norm": 0.03403697644067516, + "language_loss": 1.05937934, + "learning_rate": 0.0009545384182608524, + "loss": 1.07258177, + "num_input_tokens_seen": 9294288, + "router_z_loss_mlp": 2.703125, + "step": 124, + "time_per_iteration": 2.5332376956939697 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01326404, + "balance_loss_mlp": 1.05880272, + "epoch": 0.024047710657945365, + "flos": 561104320512.0, + "grad_norm": 0.042208642163400256, + "language_loss": 1.03444421, + "learning_rate": 0.0009561289926625252, + "loss": 1.04770815, + "num_input_tokens_seen": 9368048, + "router_z_loss_mlp": 2.67773438, + "step": 125, + "time_per_iteration": 2.68180251121521 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01324487, + "balance_loss_mlp": 1.05841172, + "epoch": 0.024240092343208928, + "flos": 505771155456.0, + "grad_norm": 0.03944680997458598, + "language_loss": 1.08491933, + "learning_rate": 0.0009577068930299292, + "loss": 1.0981642, + "num_input_tokens_seen": 9434848, + "router_z_loss_mlp": 2.66210938, + "step": 126, + "time_per_iteration": 2.602088689804077 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01323529, + "balance_loss_mlp": 1.05936122, + "epoch": 0.02443247402847249, + "flos": 436753181184.0, + "grad_norm": 0.04017271590188075, + "language_loss": 1.04077768, + "learning_rate": 0.0009592723197462087, + "loss": 1.05401289, + "num_input_tokens_seen": 9504112, + "router_z_loss_mlp": 2.64257812, + "step": 127, + "time_per_iteration": 2.643617630004883 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01318089, + "balance_loss_mlp": 1.05563784, + "epoch": 0.024624855713736054, + "flos": 685069545216.0, + "grad_norm": 0.03549644551725154, + "language_loss": 1.0056293, + "learning_rate": 0.0009608254684795125, + "loss": 1.01881027, + "num_input_tokens_seen": 9590032, + "router_z_loss_mlp": 2.625, + "step": 128, + "time_per_iteration": 2.949061632156372 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01309242, + "balance_loss_mlp": 1.04831672, + "epoch": 0.024817237398999614, + "flos": 526114643712.0, + "grad_norm": 0.03183934804306691, + "language_loss": 1.03377914, + "learning_rate": 0.0009623665303297678, + "loss": 1.04687166, + "num_input_tokens_seen": 9663040, + "router_z_loss_mlp": 2.609375, + "step": 129, + "time_per_iteration": 2.7315783500671387 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0130104, + "balance_loss_mlp": 1.04106867, + "epoch": 0.025009619084263177, + "flos": 656887279872.0, + "grad_norm": 0.038944166016075116, + "language_loss": 1.07603359, + "learning_rate": 0.0009638956919697878, + "loss": 1.08904397, + "num_input_tokens_seen": 9736544, + "router_z_loss_mlp": 2.59960938, + "step": 130, + "time_per_iteration": 2.9588887691497803 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01293161, + "balance_loss_mlp": 1.03395224, + "epoch": 0.02520200076952674, + "flos": 455370275328.0, + "grad_norm": 0.03345888261117193, + "language_loss": 0.99743778, + "learning_rate": 0.0009654131357809714, + "loss": 1.0103693, + "num_input_tokens_seen": 9804656, + "router_z_loss_mlp": 2.59179688, + "step": 131, + "time_per_iteration": 2.5802786350250244 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01296775, + "balance_loss_mlp": 1.03966463, + "epoch": 0.025394382454790303, + "flos": 841269599232.0, + "grad_norm": 0.04496153180844387, + "language_loss": 1.08517051, + "learning_rate": 0.0009669190399838441, + "loss": 1.09813821, + "num_input_tokens_seen": 9888864, + "router_z_loss_mlp": 2.5703125, + "step": 132, + "time_per_iteration": 3.1034374237060547 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01297684, + "balance_loss_mlp": 1.04190826, + "epoch": 0.025586764140053866, + "flos": 582229353216.0, + "grad_norm": 0.044253016077327914, + "language_loss": 1.0183959, + "learning_rate": 0.0009684135787636724, + "loss": 1.03137255, + "num_input_tokens_seen": 9968208, + "router_z_loss_mlp": 2.55664062, + "step": 133, + "time_per_iteration": 2.8056888580322266 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01284726, + "balance_loss_mlp": 1.03066742, + "epoch": 0.02577914582531743, + "flos": 791678453760.0, + "grad_norm": 0.04023348500073193, + "language_loss": 1.06134284, + "learning_rate": 0.0009698969223913726, + "loss": 1.07419014, + "num_input_tokens_seen": 10049664, + "router_z_loss_mlp": 2.5390625, + "step": 134, + "time_per_iteration": 3.0520598888397217 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01279327, + "balance_loss_mlp": 1.02717578, + "epoch": 0.025971527510580992, + "flos": 596063861760.0, + "grad_norm": 0.02965492003563146, + "language_loss": 1.08660483, + "learning_rate": 0.0009713692373399265, + "loss": 1.09939814, + "num_input_tokens_seen": 10120096, + "router_z_loss_mlp": 2.51953125, + "step": 135, + "time_per_iteration": 2.679379463195801 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01931427, + "balance_loss_mlp": 1.66744995, + "epoch": 0.026163909195844555, + "flos": 1581077391360.0, + "grad_norm": 0.18396358569787127, + "language_loss": 0.79456228, + "learning_rate": 0.0009728306863964993, + "loss": 0.81387651, + "num_input_tokens_seen": 10348976, + "router_z_loss_mlp": 2.640625, + "step": 136, + "time_per_iteration": 5.69318151473999 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01580238, + "balance_loss_mlp": 1.32083893, + "epoch": 0.026356290881108118, + "flos": 1505163555840.0, + "grad_norm": 0.11058621392355464, + "language_loss": 0.77811038, + "learning_rate": 0.0009742814287704512, + "loss": 0.79391277, + "num_input_tokens_seen": 10576512, + "router_z_loss_mlp": 2.59375, + "step": 137, + "time_per_iteration": 4.930646896362305 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01336039, + "balance_loss_mlp": 1.08846498, + "epoch": 0.02654867256637168, + "flos": 598341421056.0, + "grad_norm": 0.05793494017899448, + "language_loss": 1.01254559, + "learning_rate": 0.0009757216201974225, + "loss": 1.02590609, + "num_input_tokens_seen": 10659168, + "router_z_loss_mlp": 2.47265625, + "step": 138, + "time_per_iteration": 2.8532111644744873 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01376264, + "balance_loss_mlp": 1.13059723, + "epoch": 0.026741054251635244, + "flos": 546136379136.0, + "grad_norm": 0.07027637242601113, + "language_loss": 1.06507492, + "learning_rate": 0.0009771514130396581, + "loss": 1.07883763, + "num_input_tokens_seen": 10731584, + "router_z_loss_mlp": 2.453125, + "step": 139, + "time_per_iteration": 2.742065668106079 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01373402, + "balance_loss_mlp": 1.12792611, + "epoch": 0.026933435936898807, + "flos": 507846525696.0, + "grad_norm": 0.06681977417406691, + "language_loss": 1.06790614, + "learning_rate": 0.00097857095638274, + "loss": 1.08164012, + "num_input_tokens_seen": 10799456, + "router_z_loss_mlp": 2.45117188, + "step": 140, + "time_per_iteration": 2.689812660217285 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01350241, + "balance_loss_mlp": 1.10533786, + "epoch": 0.02712581762216237, + "flos": 742254504192.0, + "grad_norm": 0.04346752833457442, + "language_loss": 0.97943556, + "learning_rate": 0.0009799803961288726, + "loss": 0.99293798, + "num_input_tokens_seen": 10886416, + "router_z_loss_mlp": 2.4453125, + "step": 141, + "time_per_iteration": 3.064852714538574 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01340988, + "balance_loss_mlp": 1.09684777, + "epoch": 0.027318199307425933, + "flos": 849779105280.0, + "grad_norm": 0.04419232462487818, + "language_loss": 1.04253626, + "learning_rate": 0.000981379875086876, + "loss": 1.05594611, + "num_input_tokens_seen": 10966064, + "router_z_loss_mlp": 2.4375, + "step": 142, + "time_per_iteration": 3.049978494644165 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01342845, + "balance_loss_mlp": 1.09870481, + "epoch": 0.027510580992689496, + "flos": 576638978304.0, + "grad_norm": 0.03936283820829166, + "language_loss": 0.99339008, + "learning_rate": 0.0009827695330590185, + "loss": 1.00681853, + "num_input_tokens_seen": 11039712, + "router_z_loss_mlp": 2.4375, + "step": 143, + "time_per_iteration": 2.677050828933716 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01360296, + "balance_loss_mlp": 1.11729932, + "epoch": 0.02770296267795306, + "flos": 773790414336.0, + "grad_norm": 0.036415015399305896, + "language_loss": 0.98794824, + "learning_rate": 0.0009841495069248256, + "loss": 1.00155115, + "num_input_tokens_seen": 11123984, + "router_z_loss_mlp": 2.42578125, + "step": 144, + "time_per_iteration": 2.9983932971954346 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01369915, + "balance_loss_mlp": 1.12768197, + "epoch": 0.027895344363216622, + "flos": 570449806080.0, + "grad_norm": 0.04357781303470995, + "language_loss": 0.98341697, + "learning_rate": 0.0009855199307219871, + "loss": 0.99711609, + "num_input_tokens_seen": 11192864, + "router_z_loss_mlp": 2.41796875, + "step": 145, + "time_per_iteration": 2.6622605323791504 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0136275, + "balance_loss_mlp": 1.12261522, + "epoch": 0.028087726048480186, + "flos": 548409080832.0, + "grad_norm": 0.032618269384273584, + "language_loss": 1.00131154, + "learning_rate": 0.0009868809357244854, + "loss": 1.01493907, + "num_input_tokens_seen": 11261760, + "router_z_loss_mlp": 2.39648438, + "step": 146, + "time_per_iteration": 2.7002813816070557 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01347166, + "balance_loss_mlp": 1.10836601, + "epoch": 0.02828010773374375, + "flos": 525873570816.0, + "grad_norm": 0.032542426789695725, + "language_loss": 1.04416764, + "learning_rate": 0.0009882326505180556, + "loss": 1.05763924, + "num_input_tokens_seen": 11334736, + "router_z_loss_mlp": 2.3828125, + "step": 147, + "time_per_iteration": 2.710149049758911 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01334853, + "balance_loss_mlp": 1.09815085, + "epoch": 0.02847248941900731, + "flos": 773772917760.0, + "grad_norm": 0.045451062042893155, + "language_loss": 1.02790403, + "learning_rate": 0.0009895752010730906, + "loss": 1.04125249, + "num_input_tokens_seen": 11409872, + "router_z_loss_mlp": 2.36132812, + "step": 148, + "time_per_iteration": 2.965888261795044 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01328294, + "balance_loss_mlp": 1.0936898, + "epoch": 0.028664871104270875, + "flos": 535470822912.0, + "grad_norm": 0.03549847888949514, + "language_loss": 1.08720016, + "learning_rate": 0.0009909087108150867, + "loss": 1.10048318, + "num_input_tokens_seen": 11481024, + "router_z_loss_mlp": 2.33984375, + "step": 149, + "time_per_iteration": 2.759585380554199 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01328431, + "balance_loss_mlp": 1.09649718, + "epoch": 0.028857252789534438, + "flos": 368605212672.0, + "grad_norm": 0.04584721914032896, + "language_loss": 1.09262538, + "learning_rate": 0.0009922333006927371, + "loss": 1.10590982, + "num_input_tokens_seen": 11544240, + "router_z_loss_mlp": 2.3125, + "step": 150, + "time_per_iteration": 2.5677716732025146 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0132956, + "balance_loss_mlp": 1.09896171, + "epoch": 0.029049634474798, + "flos": 516484343808.0, + "grad_norm": 0.054837011337671125, + "language_loss": 1.02855873, + "learning_rate": 0.0009935490892437632, + "loss": 1.04185438, + "num_input_tokens_seen": 11610416, + "router_z_loss_mlp": 2.29882812, + "step": 151, + "time_per_iteration": 2.5842795372009277 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01323589, + "balance_loss_mlp": 1.09623301, + "epoch": 0.029242016160061564, + "flos": 589349769216.0, + "grad_norm": 0.041624099188269474, + "language_loss": 1.01284385, + "learning_rate": 0.0009948561926585687, + "loss": 1.02607965, + "num_input_tokens_seen": 11687488, + "router_z_loss_mlp": 2.2734375, + "step": 152, + "time_per_iteration": 2.7717602252960205 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01309484, + "balance_loss_mlp": 1.08422625, + "epoch": 0.029434397845325123, + "flos": 553137231360.0, + "grad_norm": 0.04242067063834005, + "language_loss": 1.0541966, + "learning_rate": 0.0009961547248418122, + "loss": 1.0672915, + "num_input_tokens_seen": 11754576, + "router_z_loss_mlp": 2.25976562, + "step": 153, + "time_per_iteration": 2.6492583751678467 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01303287, + "balance_loss_mlp": 1.07898307, + "epoch": 0.029626779530588686, + "flos": 604608360960.0, + "grad_norm": 0.03242941124289258, + "language_loss": 1.02145946, + "learning_rate": 0.0009974447974719707, + "loss": 1.03449237, + "num_input_tokens_seen": 11831360, + "router_z_loss_mlp": 2.25, + "step": 154, + "time_per_iteration": 2.7111871242523193 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01303637, + "balance_loss_mlp": 1.08181214, + "epoch": 0.02981916121585225, + "flos": 622218388992.0, + "grad_norm": 0.03743420896054, + "language_loss": 1.03581393, + "learning_rate": 0.0009987265200589763, + "loss": 1.0488503, + "num_input_tokens_seen": 11902192, + "router_z_loss_mlp": 2.22460938, + "step": 155, + "time_per_iteration": 2.7590832710266113 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01281243, + "balance_loss_mlp": 1.06151628, + "epoch": 0.030011542901115813, + "flos": 662881065984.0, + "grad_norm": 0.03665146617631418, + "language_loss": 1.03448439, + "learning_rate": 0.001, + "loss": 1.04729688, + "num_input_tokens_seen": 11979088, + "router_z_loss_mlp": 2.203125, + "step": 156, + "time_per_iteration": 2.868732452392578 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01262046, + "balance_loss_mlp": 1.04441714, + "epoch": 0.030203924586379376, + "flos": 652819164672.0, + "grad_norm": 0.048414208125286275, + "language_loss": 1.0101347, + "learning_rate": 0.0009999999029413921, + "loss": 1.02275515, + "num_input_tokens_seen": 12059200, + "router_z_loss_mlp": 2.18164062, + "step": 157, + "time_per_iteration": 2.8458704948425293 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01249467, + "balance_loss_mlp": 1.03393674, + "epoch": 0.03039630627164294, + "flos": 532444766976.0, + "grad_norm": 0.038165698108555156, + "language_loss": 1.02398324, + "learning_rate": 0.0009999996117656068, + "loss": 1.03647804, + "num_input_tokens_seen": 12134944, + "router_z_loss_mlp": 2.16015625, + "step": 158, + "time_per_iteration": 2.7255747318267822 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01250196, + "balance_loss_mlp": 1.03657281, + "epoch": 0.030588687956906502, + "flos": 587295786240.0, + "grad_norm": 0.04636715302465643, + "language_loss": 0.95869231, + "learning_rate": 0.0009999991264727564, + "loss": 0.97119427, + "num_input_tokens_seen": 12207936, + "router_z_loss_mlp": 2.140625, + "step": 159, + "time_per_iteration": 2.7805936336517334 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0126418, + "balance_loss_mlp": 1.05284619, + "epoch": 0.030781069642170065, + "flos": 514287464448.0, + "grad_norm": 0.055354258548617474, + "language_loss": 1.07316554, + "learning_rate": 0.0009999984470630296, + "loss": 1.08580732, + "num_input_tokens_seen": 12273200, + "router_z_loss_mlp": 2.1171875, + "step": 160, + "time_per_iteration": 2.6011087894439697 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01284107, + "balance_loss_mlp": 1.07372677, + "epoch": 0.030973451327433628, + "flos": 719560546560.0, + "grad_norm": 0.03499871632601644, + "language_loss": 0.95530587, + "learning_rate": 0.0009999975735366902, + "loss": 0.96814692, + "num_input_tokens_seen": 12359600, + "router_z_loss_mlp": 2.10742188, + "step": 161, + "time_per_iteration": 3.083415985107422 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01283442, + "balance_loss_mlp": 1.07439709, + "epoch": 0.03116583301269719, + "flos": 1111615994880.0, + "grad_norm": 0.03722431710536786, + "language_loss": 0.96960843, + "learning_rate": 0.0009999965058940775, + "loss": 0.9824428, + "num_input_tokens_seen": 12443936, + "router_z_loss_mlp": 2.09375, + "step": 162, + "time_per_iteration": 3.5389657020568848 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01264072, + "balance_loss_mlp": 1.05655301, + "epoch": 0.031358214697960754, + "flos": 451833883392.0, + "grad_norm": 0.04231417263227255, + "language_loss": 1.04135799, + "learning_rate": 0.0009999952441356057, + "loss": 1.05399871, + "num_input_tokens_seen": 12507488, + "router_z_loss_mlp": 2.078125, + "step": 163, + "time_per_iteration": 2.5445146560668945 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01239952, + "balance_loss_mlp": 1.03357697, + "epoch": 0.031550596383224314, + "flos": 1257087309312.0, + "grad_norm": 0.03293922474511325, + "language_loss": 1.04807603, + "learning_rate": 0.000999993788261765, + "loss": 1.06047547, + "num_input_tokens_seen": 12594096, + "router_z_loss_mlp": 2.06640625, + "step": 164, + "time_per_iteration": 3.603273391723633 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01233685, + "balance_loss_mlp": 1.02769136, + "epoch": 0.03174297806848788, + "flos": 669323950080.0, + "grad_norm": 0.03785089383184646, + "language_loss": 1.05591631, + "learning_rate": 0.00099999213827312, + "loss": 1.06825328, + "num_input_tokens_seen": 12669424, + "router_z_loss_mlp": 2.0625, + "step": 165, + "time_per_iteration": 2.822242498397827 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01237294, + "balance_loss_mlp": 1.03206336, + "epoch": 0.03193535975375144, + "flos": 552364435200.0, + "grad_norm": 0.03413051380570177, + "language_loss": 1.00392842, + "learning_rate": 0.000999990294170312, + "loss": 1.01630139, + "num_input_tokens_seen": 12740080, + "router_z_loss_mlp": 2.0546875, + "step": 166, + "time_per_iteration": 2.6473989486694336 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0124218, + "balance_loss_mlp": 1.03790259, + "epoch": 0.032127741439015006, + "flos": 544740486144.0, + "grad_norm": 0.02951320831702663, + "language_loss": 1.04371905, + "learning_rate": 0.0009999882559540566, + "loss": 1.0561409, + "num_input_tokens_seen": 12810576, + "router_z_loss_mlp": 2.04492188, + "step": 167, + "time_per_iteration": 2.654994487762451 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01249753, + "balance_loss_mlp": 1.04661989, + "epoch": 0.032320123124278566, + "flos": 549514323456.0, + "grad_norm": 0.03217165834370848, + "language_loss": 1.01348543, + "learning_rate": 0.000999986023625145, + "loss": 1.02598298, + "num_input_tokens_seen": 12887904, + "router_z_loss_mlp": 2.03320312, + "step": 168, + "time_per_iteration": 2.759324550628662 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01736656, + "balance_loss_mlp": 1.53829193, + "epoch": 0.03251250480954213, + "flos": 1308817963776.0, + "grad_norm": 0.15145695156494207, + "language_loss": 0.78924417, + "learning_rate": 0.0009999835971844441, + "loss": 0.8066107, + "num_input_tokens_seen": 13107344, + "router_z_loss_mlp": 1.9765625, + "step": 169, + "time_per_iteration": 4.9954283237457275 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0125768, + "balance_loss_mlp": 1.05588245, + "epoch": 0.03270488649480569, + "flos": 562202760192.0, + "grad_norm": 0.04037677915440104, + "language_loss": 1.01481748, + "learning_rate": 0.0009999809766328958, + "loss": 1.02739429, + "num_input_tokens_seen": 13175552, + "router_z_loss_mlp": 2.01953125, + "step": 170, + "time_per_iteration": 2.6656970977783203 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01250876, + "balance_loss_mlp": 1.0494597, + "epoch": 0.03289726818006926, + "flos": 483339657984.0, + "grad_norm": 0.04232720535630845, + "language_loss": 1.03883123, + "learning_rate": 0.0009999781619715177, + "loss": 1.0513401, + "num_input_tokens_seen": 13242384, + "router_z_loss_mlp": 2.015625, + "step": 171, + "time_per_iteration": 2.5408902168273926 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01238141, + "balance_loss_mlp": 1.03786898, + "epoch": 0.03308964986533282, + "flos": 675821269248.0, + "grad_norm": 0.04278552863969592, + "language_loss": 1.04043615, + "learning_rate": 0.000999975153201402, + "loss": 1.05281758, + "num_input_tokens_seen": 13316160, + "router_z_loss_mlp": 2.00390625, + "step": 172, + "time_per_iteration": 2.85229754447937 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01233367, + "balance_loss_mlp": 1.03385854, + "epoch": 0.033282031550596385, + "flos": 610341632256.0, + "grad_norm": 0.04144744195910536, + "language_loss": 1.01965618, + "learning_rate": 0.0009999719503237174, + "loss": 1.03198993, + "num_input_tokens_seen": 13387664, + "router_z_loss_mlp": 1.9921875, + "step": 173, + "time_per_iteration": 2.7612979412078857 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01234993, + "balance_loss_mlp": 1.03739214, + "epoch": 0.033474413235859944, + "flos": 468996758784.0, + "grad_norm": 0.06741318195929925, + "language_loss": 1.10547054, + "learning_rate": 0.0009999685533397073, + "loss": 1.1178205, + "num_input_tokens_seen": 13454528, + "router_z_loss_mlp": 1.97265625, + "step": 174, + "time_per_iteration": 2.5750949382781982 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01246896, + "balance_loss_mlp": 1.05101097, + "epoch": 0.03366679492112351, + "flos": 580715841792.0, + "grad_norm": 0.0354258140398677, + "language_loss": 1.02665091, + "learning_rate": 0.00099996496225069, + "loss": 1.03911996, + "num_input_tokens_seen": 13522528, + "router_z_loss_mlp": 1.95605469, + "step": 175, + "time_per_iteration": 2.6886191368103027 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0124614, + "balance_loss_mlp": 1.05168545, + "epoch": 0.03385917660638707, + "flos": 638886479616.0, + "grad_norm": 0.036851717024697625, + "language_loss": 1.04551578, + "learning_rate": 0.0009999611770580604, + "loss": 1.0579772, + "num_input_tokens_seen": 13601120, + "router_z_loss_mlp": 1.94433594, + "step": 176, + "time_per_iteration": 2.8528547286987305 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01227252, + "balance_loss_mlp": 1.03422809, + "epoch": 0.03405155829165064, + "flos": 442740164352.0, + "grad_norm": 0.05003520598604069, + "language_loss": 1.03819132, + "learning_rate": 0.0009999571977632876, + "loss": 1.0504638, + "num_input_tokens_seen": 13666384, + "router_z_loss_mlp": 1.9296875, + "step": 177, + "time_per_iteration": 2.6220269203186035 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01224145, + "balance_loss_mlp": 1.03188384, + "epoch": 0.034243939976914196, + "flos": 467275222272.0, + "grad_norm": 0.0554689754659714, + "language_loss": 1.0658946, + "learning_rate": 0.0009999530243679166, + "loss": 1.07813609, + "num_input_tokens_seen": 13733968, + "router_z_loss_mlp": 1.921875, + "step": 178, + "time_per_iteration": 2.5593671798706055 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01235914, + "balance_loss_mlp": 1.04479802, + "epoch": 0.03443632166217776, + "flos": 780713498880.0, + "grad_norm": 0.03675993055709111, + "language_loss": 1.01102996, + "learning_rate": 0.0009999486568735675, + "loss": 1.02338898, + "num_input_tokens_seen": 13818960, + "router_z_loss_mlp": 1.91015625, + "step": 179, + "time_per_iteration": 3.083312749862671 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01235549, + "balance_loss_mlp": 1.04548192, + "epoch": 0.03462870334744132, + "flos": 1265760120576.0, + "grad_norm": 0.04656515886260978, + "language_loss": 1.01660061, + "learning_rate": 0.0009999440952819362, + "loss": 1.02895617, + "num_input_tokens_seen": 13912448, + "router_z_loss_mlp": 1.89941406, + "step": 180, + "time_per_iteration": 3.691354513168335 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01231777, + "balance_loss_mlp": 1.04390287, + "epoch": 0.03482108503270489, + "flos": 608303200512.0, + "grad_norm": 0.04339398829325753, + "language_loss": 1.02140999, + "learning_rate": 0.0009999393395947935, + "loss": 1.03372765, + "num_input_tokens_seen": 13990752, + "router_z_loss_mlp": 1.87695312, + "step": 181, + "time_per_iteration": 2.8826780319213867 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01222143, + "balance_loss_mlp": 1.03617644, + "epoch": 0.03501346671796845, + "flos": 539315361792.0, + "grad_norm": 0.033650569268787865, + "language_loss": 1.05363226, + "learning_rate": 0.0009999343898139858, + "loss": 1.06585371, + "num_input_tokens_seen": 14058608, + "router_z_loss_mlp": 1.85742188, + "step": 182, + "time_per_iteration": 2.6785037517547607 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01217643, + "balance_loss_mlp": 1.03329813, + "epoch": 0.035205848403232015, + "flos": 519499706112.0, + "grad_norm": 0.04889617812287003, + "language_loss": 1.03914642, + "learning_rate": 0.0009999292459414348, + "loss": 1.05132294, + "num_input_tokens_seen": 14126656, + "router_z_loss_mlp": 1.84082031, + "step": 183, + "time_per_iteration": 2.648263931274414 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01223183, + "balance_loss_mlp": 1.04103076, + "epoch": 0.035398230088495575, + "flos": 473334137088.0, + "grad_norm": 0.03546540132303448, + "language_loss": 1.08284354, + "learning_rate": 0.0009999239079791374, + "loss": 1.09507537, + "num_input_tokens_seen": 14195840, + "router_z_loss_mlp": 1.81835938, + "step": 184, + "time_per_iteration": 2.6003947257995605 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01229231, + "balance_loss_mlp": 1.04908144, + "epoch": 0.03559061177375914, + "flos": 513095705856.0, + "grad_norm": 0.03580873522044792, + "language_loss": 1.00877666, + "learning_rate": 0.0009999183759291659, + "loss": 1.02106905, + "num_input_tokens_seen": 14269936, + "router_z_loss_mlp": 1.79785156, + "step": 185, + "time_per_iteration": 2.7518959045410156 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01229953, + "balance_loss_mlp": 1.05161583, + "epoch": 0.0357829934590227, + "flos": 478350992640.0, + "grad_norm": 0.05401643684385997, + "language_loss": 1.03586912, + "learning_rate": 0.0009999126497936682, + "loss": 1.04816866, + "num_input_tokens_seen": 14334848, + "router_z_loss_mlp": 1.78710938, + "step": 186, + "time_per_iteration": 2.565373659133911 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01218003, + "balance_loss_mlp": 1.04052448, + "epoch": 0.03597537514428627, + "flos": 645885386496.0, + "grad_norm": 0.027605248849540943, + "language_loss": 1.06344712, + "learning_rate": 0.0009999067295748676, + "loss": 1.07562721, + "num_input_tokens_seen": 14407888, + "router_z_loss_mlp": 1.77832031, + "step": 187, + "time_per_iteration": 2.862023115158081 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01208675, + "balance_loss_mlp": 1.03167319, + "epoch": 0.03616775682954983, + "flos": 582270182400.0, + "grad_norm": 0.041753828035088196, + "language_loss": 1.04174721, + "learning_rate": 0.000999900615275062, + "loss": 1.05383396, + "num_input_tokens_seen": 14479072, + "router_z_loss_mlp": 1.7734375, + "step": 188, + "time_per_iteration": 2.7248780727386475 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01206757, + "balance_loss_mlp": 1.02994609, + "epoch": 0.03636013851481339, + "flos": 383265007104.0, + "grad_norm": 0.05119808239604003, + "language_loss": 1.10189009, + "learning_rate": 0.0009998943068966256, + "loss": 1.11395764, + "num_input_tokens_seen": 14540944, + "router_z_loss_mlp": 1.77148438, + "step": 189, + "time_per_iteration": 2.487445592880249 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01216253, + "balance_loss_mlp": 1.04010975, + "epoch": 0.03655252020007695, + "flos": 584308614144.0, + "grad_norm": 0.029643950017142998, + "language_loss": 1.04644084, + "learning_rate": 0.0009998878044420072, + "loss": 1.05860329, + "num_input_tokens_seen": 14611392, + "router_z_loss_mlp": 1.76464844, + "step": 190, + "time_per_iteration": 2.736809015274048 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.012186, + "balance_loss_mlp": 1.04321897, + "epoch": 0.03674490188534051, + "flos": 472598279424.0, + "grad_norm": 0.03987592529636011, + "language_loss": 1.00565469, + "learning_rate": 0.0009998811079137318, + "loss": 1.01784062, + "num_input_tokens_seen": 14679776, + "router_z_loss_mlp": 1.75683594, + "step": 191, + "time_per_iteration": 2.6006946563720703 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01214791, + "balance_loss_mlp": 1.04017353, + "epoch": 0.03693728357060408, + "flos": 529411908096.0, + "grad_norm": 0.03601320862003297, + "language_loss": 1.01597381, + "learning_rate": 0.0009998742173143987, + "loss": 1.02812171, + "num_input_tokens_seen": 14749712, + "router_z_loss_mlp": 1.74902344, + "step": 192, + "time_per_iteration": 2.6246893405914307 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01200861, + "balance_loss_mlp": 1.02719736, + "epoch": 0.03712966525586764, + "flos": 800346407424.0, + "grad_norm": 0.02962706666311765, + "language_loss": 1.0204885, + "learning_rate": 0.0009998671326466833, + "loss": 1.03249693, + "num_input_tokens_seen": 14827136, + "router_z_loss_mlp": 1.73925781, + "step": 193, + "time_per_iteration": 2.9852418899536133 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01194118, + "balance_loss_mlp": 1.02121651, + "epoch": 0.037322046941131205, + "flos": 831359342592.0, + "grad_norm": 0.049736474928026, + "language_loss": 1.0340569, + "learning_rate": 0.0009998598539133362, + "loss": 1.04599798, + "num_input_tokens_seen": 14902880, + "router_z_loss_mlp": 1.73144531, + "step": 194, + "time_per_iteration": 3.0510568618774414 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01194861, + "balance_loss_mlp": 1.02339077, + "epoch": 0.037514428626394765, + "flos": 438589423872.0, + "grad_norm": 0.030819097200883293, + "language_loss": 1.03682184, + "learning_rate": 0.0009998523811171828, + "loss": 1.04877055, + "num_input_tokens_seen": 14967264, + "router_z_loss_mlp": 1.71679688, + "step": 195, + "time_per_iteration": 2.5203936100006104 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01197718, + "balance_loss_mlp": 1.0269146, + "epoch": 0.03770681031165833, + "flos": 512639804928.0, + "grad_norm": 0.031890398221933944, + "language_loss": 1.04342675, + "learning_rate": 0.0009998447142611248, + "loss": 1.05540395, + "num_input_tokens_seen": 15039104, + "router_z_loss_mlp": 1.70996094, + "step": 196, + "time_per_iteration": 2.659193754196167 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01193737, + "balance_loss_mlp": 1.02341044, + "epoch": 0.03789919199692189, + "flos": 808843274496.0, + "grad_norm": 0.030368823498634023, + "language_loss": 0.97672093, + "learning_rate": 0.0009998368533481387, + "loss": 0.98865831, + "num_input_tokens_seen": 15124864, + "router_z_loss_mlp": 1.70507812, + "step": 197, + "time_per_iteration": 3.031437397003174 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01185957, + "balance_loss_mlp": 1.01677489, + "epoch": 0.03809157368218546, + "flos": 691792386048.0, + "grad_norm": 0.027429804092446938, + "language_loss": 1.00742936, + "learning_rate": 0.0009998287983812762, + "loss": 1.01928902, + "num_input_tokens_seen": 15199680, + "router_z_loss_mlp": 1.69335938, + "step": 198, + "time_per_iteration": 2.8533172607421875 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01186443, + "balance_loss_mlp": 1.01764262, + "epoch": 0.03828395536744902, + "flos": 519004921344.0, + "grad_norm": 0.029672573654994608, + "language_loss": 1.06761527, + "learning_rate": 0.0009998205493636646, + "loss": 1.07947969, + "num_input_tokens_seen": 15270176, + "router_z_loss_mlp": 1.68945312, + "step": 199, + "time_per_iteration": 2.6512415409088135 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01190294, + "balance_loss_mlp": 1.02197027, + "epoch": 0.038476337052712584, + "flos": 582763021824.0, + "grad_norm": 0.03300049351517658, + "language_loss": 0.99112457, + "learning_rate": 0.0009998121062985063, + "loss": 1.00302756, + "num_input_tokens_seen": 15343168, + "router_z_loss_mlp": 1.68457031, + "step": 200, + "time_per_iteration": 2.6979846954345703 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01187054, + "balance_loss_mlp": 1.01996994, + "epoch": 0.03866871873797614, + "flos": 578273998848.0, + "grad_norm": 0.03164459486115397, + "language_loss": 1.0110172, + "learning_rate": 0.0009998034691890794, + "loss": 1.02288771, + "num_input_tokens_seen": 15417328, + "router_z_loss_mlp": 1.671875, + "step": 201, + "time_per_iteration": 2.80670166015625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01183327, + "balance_loss_mlp": 1.01672018, + "epoch": 0.03886110042323971, + "flos": 541772755968.0, + "grad_norm": 0.032663388617215364, + "language_loss": 1.05587053, + "learning_rate": 0.0009997946380387369, + "loss": 1.06770372, + "num_input_tokens_seen": 15489488, + "router_z_loss_mlp": 1.66699219, + "step": 202, + "time_per_iteration": 2.6591310501098633 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01179406, + "balance_loss_mlp": 1.01394379, + "epoch": 0.03905348210850327, + "flos": 719240739072.0, + "grad_norm": 0.030305493428663434, + "language_loss": 1.08528447, + "learning_rate": 0.0009997856128509076, + "loss": 1.09707844, + "num_input_tokens_seen": 15558944, + "router_z_loss_mlp": 1.65527344, + "step": 203, + "time_per_iteration": 2.9006340503692627 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01181527, + "balance_loss_mlp": 1.01720893, + "epoch": 0.039245863793766836, + "flos": 428397265152.0, + "grad_norm": 0.03189317300504765, + "language_loss": 1.03375864, + "learning_rate": 0.0009997763936290952, + "loss": 1.04557395, + "num_input_tokens_seen": 15625024, + "router_z_loss_mlp": 1.64355469, + "step": 204, + "time_per_iteration": 2.5836358070373535 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01178747, + "balance_loss_mlp": 1.01538289, + "epoch": 0.039438245479030395, + "flos": 664270156032.0, + "grad_norm": 0.033629424624266296, + "language_loss": 1.0866276, + "learning_rate": 0.0009997669803768789, + "loss": 1.09841514, + "num_input_tokens_seen": 15697120, + "router_z_loss_mlp": 1.63378906, + "step": 205, + "time_per_iteration": 2.7809464931488037 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01180514, + "balance_loss_mlp": 1.01791251, + "epoch": 0.03963062716429396, + "flos": 636496159488.0, + "grad_norm": 0.025840840316256445, + "language_loss": 1.03755617, + "learning_rate": 0.0009997573730979134, + "loss": 1.04936123, + "num_input_tokens_seen": 15768752, + "router_z_loss_mlp": 1.62597656, + "step": 206, + "time_per_iteration": 2.7759904861450195 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01207138, + "balance_loss_mlp": 1.04272461, + "epoch": 0.03982300884955752, + "flos": 1421589799680.0, + "grad_norm": 0.03078548913711826, + "language_loss": 0.79193199, + "learning_rate": 0.0009997475717959284, + "loss": 0.80400336, + "num_input_tokens_seen": 15980624, + "router_z_loss_mlp": 1.64453125, + "step": 207, + "time_per_iteration": 4.6622114181518555 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01177297, + "balance_loss_mlp": 1.0162214, + "epoch": 0.04001539053482109, + "flos": 690520914432.0, + "grad_norm": 0.03233621027438014, + "language_loss": 1.02104092, + "learning_rate": 0.0009997375764747294, + "loss": 1.03281379, + "num_input_tokens_seen": 16067232, + "router_z_loss_mlp": 1.61035156, + "step": 208, + "time_per_iteration": 2.9808952808380127 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01181785, + "balance_loss_mlp": 1.02156758, + "epoch": 0.04020777222008465, + "flos": 534752461824.0, + "grad_norm": 0.037334696417832054, + "language_loss": 0.99876916, + "learning_rate": 0.0009997273871381967, + "loss": 1.01058698, + "num_input_tokens_seen": 16139808, + "router_z_loss_mlp": 1.6015625, + "step": 209, + "time_per_iteration": 2.6938650608062744 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01183132, + "balance_loss_mlp": 1.02396429, + "epoch": 0.040400153905348214, + "flos": 568997532672.0, + "grad_norm": 0.03228633343407045, + "language_loss": 1.04497194, + "learning_rate": 0.0009997170037902862, + "loss": 1.05680323, + "num_input_tokens_seen": 16210848, + "router_z_loss_mlp": 1.59082031, + "step": 210, + "time_per_iteration": 2.722900629043579 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01189763, + "balance_loss_mlp": 1.03145349, + "epoch": 0.040592535590611774, + "flos": 714679784448.0, + "grad_norm": 0.026587079094436805, + "language_loss": 1.0723207, + "learning_rate": 0.0009997064264350292, + "loss": 1.08421838, + "num_input_tokens_seen": 16283984, + "router_z_loss_mlp": 1.58203125, + "step": 211, + "time_per_iteration": 2.8636813163757324 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01186902, + "balance_loss_mlp": 1.02954614, + "epoch": 0.04078491727587533, + "flos": 579207187968.0, + "grad_norm": 0.028855359605628288, + "language_loss": 1.01311755, + "learning_rate": 0.0009996956550765317, + "loss": 1.02498662, + "num_input_tokens_seen": 16353904, + "router_z_loss_mlp": 1.57226562, + "step": 212, + "time_per_iteration": 2.6752002239227295 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01183355, + "balance_loss_mlp": 1.0270474, + "epoch": 0.0409772989611389, + "flos": 553369555968.0, + "grad_norm": 0.03615073574048419, + "language_loss": 0.96463609, + "learning_rate": 0.0009996846897189762, + "loss": 0.97646964, + "num_input_tokens_seen": 16425488, + "router_z_loss_mlp": 1.56152344, + "step": 213, + "time_per_iteration": 2.618417501449585 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01180441, + "balance_loss_mlp": 1.02470577, + "epoch": 0.04116968064640246, + "flos": 556764996864.0, + "grad_norm": 0.04473264124517712, + "language_loss": 1.02233624, + "learning_rate": 0.0009996735303666193, + "loss": 1.03414059, + "num_input_tokens_seen": 16498016, + "router_z_loss_mlp": 1.55566406, + "step": 214, + "time_per_iteration": 2.7398550510406494 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0118203, + "balance_loss_mlp": 1.026963, + "epoch": 0.041362062331666026, + "flos": 579652395264.0, + "grad_norm": 0.027182691243245845, + "language_loss": 1.04435229, + "learning_rate": 0.0009996621770237937, + "loss": 1.05617261, + "num_input_tokens_seen": 16573744, + "router_z_loss_mlp": 1.54882812, + "step": 215, + "time_per_iteration": 2.7773804664611816 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01182535, + "balance_loss_mlp": 1.02775347, + "epoch": 0.041554444016929586, + "flos": 612701816832.0, + "grad_norm": 0.028683660550217302, + "language_loss": 1.00582075, + "learning_rate": 0.0009996506296949073, + "loss": 1.01764607, + "num_input_tokens_seen": 16655344, + "router_z_loss_mlp": 1.54589844, + "step": 216, + "time_per_iteration": 2.877587080001831 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01180569, + "balance_loss_mlp": 1.02607429, + "epoch": 0.04174682570219315, + "flos": 529151393280.0, + "grad_norm": 0.031901868987761664, + "language_loss": 1.00452459, + "learning_rate": 0.0009996388883844428, + "loss": 1.01633024, + "num_input_tokens_seen": 16726480, + "router_z_loss_mlp": 1.54296875, + "step": 217, + "time_per_iteration": 2.6346311569213867 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01173664, + "balance_loss_mlp": 1.02002692, + "epoch": 0.04193920738745671, + "flos": 512500799232.0, + "grad_norm": 0.02715845750356807, + "language_loss": 1.03465486, + "learning_rate": 0.0009996269530969588, + "loss": 1.04639161, + "num_input_tokens_seen": 16792112, + "router_z_loss_mlp": 1.53417969, + "step": 218, + "time_per_iteration": 2.6205921173095703 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01170474, + "balance_loss_mlp": 1.0176959, + "epoch": 0.04213158907272028, + "flos": 572553366528.0, + "grad_norm": 0.03606301207395498, + "language_loss": 1.04169452, + "learning_rate": 0.0009996148238370888, + "loss": 1.05339921, + "num_input_tokens_seen": 16862960, + "router_z_loss_mlp": 1.52539062, + "step": 219, + "time_per_iteration": 2.8047173023223877 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01169557, + "balance_loss_mlp": 1.01725543, + "epoch": 0.04232397075798384, + "flos": 965905552896.0, + "grad_norm": 0.026524392964530758, + "language_loss": 0.99111861, + "learning_rate": 0.0009996025006095421, + "loss": 1.00281417, + "num_input_tokens_seen": 16950416, + "router_z_loss_mlp": 1.52050781, + "step": 220, + "time_per_iteration": 3.315859317779541 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147995, + "balance_loss_mlp": 0.99693298, + "epoch": 0.042516352443247404, + "flos": 1472733340416.0, + "grad_norm": 0.01509407607306266, + "language_loss": 0.77783144, + "learning_rate": 0.0009995899834191028, + "loss": 0.78931135, + "num_input_tokens_seen": 17180944, + "router_z_loss_mlp": 1.5078125, + "step": 221, + "time_per_iteration": 5.540910243988037 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01166927, + "balance_loss_mlp": 1.0164367, + "epoch": 0.042708734128510964, + "flos": 655892852736.0, + "grad_norm": 0.029367950869880366, + "language_loss": 0.99126619, + "learning_rate": 0.0009995772722706307, + "loss": 1.00293541, + "num_input_tokens_seen": 17257792, + "router_z_loss_mlp": 1.50195312, + "step": 222, + "time_per_iteration": 2.901489019393921 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01167445, + "balance_loss_mlp": 1.01705015, + "epoch": 0.04290111581377453, + "flos": 432734643456.0, + "grad_norm": 0.04040999725558835, + "language_loss": 1.13508129, + "learning_rate": 0.0009995643671690604, + "loss": 1.1467557, + "num_input_tokens_seen": 17320288, + "router_z_loss_mlp": 1.50097656, + "step": 223, + "time_per_iteration": 2.5576720237731934 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01168153, + "balance_loss_mlp": 1.01823533, + "epoch": 0.04309349749903809, + "flos": 645867889920.0, + "grad_norm": 0.02824445481068148, + "language_loss": 1.00763512, + "learning_rate": 0.0009995512681194023, + "loss": 1.01931667, + "num_input_tokens_seen": 17396672, + "router_z_loss_mlp": 1.49609375, + "step": 224, + "time_per_iteration": 2.9571568965911865 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01167559, + "balance_loss_mlp": 1.01840472, + "epoch": 0.04328587918430166, + "flos": 832897153536.0, + "grad_norm": 0.025764365733734692, + "language_loss": 0.98235118, + "learning_rate": 0.0009995379751267417, + "loss": 0.99402678, + "num_input_tokens_seen": 17488096, + "router_z_loss_mlp": 1.48828125, + "step": 225, + "time_per_iteration": 3.2627484798431396 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01166832, + "balance_loss_mlp": 1.01824963, + "epoch": 0.043478260869565216, + "flos": 526116589056.0, + "grad_norm": 0.03531387708455554, + "language_loss": 1.00006318, + "learning_rate": 0.0009995244881962398, + "loss": 1.01173151, + "num_input_tokens_seen": 17557632, + "router_z_loss_mlp": 1.48242188, + "step": 226, + "time_per_iteration": 2.624209403991699 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01170136, + "balance_loss_mlp": 1.02212548, + "epoch": 0.04367064255482878, + "flos": 440413027584.0, + "grad_norm": 0.039279482080902435, + "language_loss": 1.01293874, + "learning_rate": 0.0009995108073331323, + "loss": 1.02464008, + "num_input_tokens_seen": 17626672, + "router_z_loss_mlp": 1.4765625, + "step": 227, + "time_per_iteration": 2.6042520999908447 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01164096, + "balance_loss_mlp": 1.01742136, + "epoch": 0.04386302424009234, + "flos": 508467677184.0, + "grad_norm": 0.03801127181345805, + "language_loss": 1.03535032, + "learning_rate": 0.0009994969325427309, + "loss": 1.04699123, + "num_input_tokens_seen": 17698624, + "router_z_loss_mlp": 1.46582031, + "step": 228, + "time_per_iteration": 2.6691603660583496 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01163607, + "balance_loss_mlp": 1.01769507, + "epoch": 0.04405540592535591, + "flos": 541744565760.0, + "grad_norm": 0.03512041362752814, + "language_loss": 1.00143218, + "learning_rate": 0.0009994828638304218, + "loss": 1.0130682, + "num_input_tokens_seen": 17767760, + "router_z_loss_mlp": 1.46191406, + "step": 229, + "time_per_iteration": 2.627833366394043 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01164617, + "balance_loss_mlp": 1.01927722, + "epoch": 0.04424778761061947, + "flos": 447309867264.0, + "grad_norm": 0.03576658395893793, + "language_loss": 1.06260157, + "learning_rate": 0.0009994686012016675, + "loss": 1.07424784, + "num_input_tokens_seen": 17833664, + "router_z_loss_mlp": 1.45703125, + "step": 230, + "time_per_iteration": 2.515491247177124 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01159156, + "balance_loss_mlp": 1.01448417, + "epoch": 0.044440169295883035, + "flos": 701982599424.0, + "grad_norm": 0.03592315304636455, + "language_loss": 1.05298328, + "learning_rate": 0.000999454144662005, + "loss": 1.06457496, + "num_input_tokens_seen": 17908880, + "router_z_loss_mlp": 1.45019531, + "step": 231, + "time_per_iteration": 2.918896436691284 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01156937, + "balance_loss_mlp": 1.01274192, + "epoch": 0.044632550981146595, + "flos": 589427536896.0, + "grad_norm": 0.032106980286660924, + "language_loss": 0.996499, + "learning_rate": 0.0009994394942170468, + "loss": 1.00806844, + "num_input_tokens_seen": 17978208, + "router_z_loss_mlp": 1.4453125, + "step": 232, + "time_per_iteration": 2.700378179550171 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01169343, + "balance_loss_mlp": 1.02524316, + "epoch": 0.04482493266641016, + "flos": 555855140352.0, + "grad_norm": 0.03061962333593277, + "language_loss": 0.97402102, + "learning_rate": 0.0009994246498724808, + "loss": 0.9857145, + "num_input_tokens_seen": 18049296, + "router_z_loss_mlp": 1.44433594, + "step": 233, + "time_per_iteration": 2.692657232284546 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01171534, + "balance_loss_mlp": 1.02848291, + "epoch": 0.04501731435167372, + "flos": 724070956800.0, + "grad_norm": 0.03598428268947968, + "language_loss": 1.00358808, + "learning_rate": 0.00099940961163407, + "loss": 1.01530337, + "num_input_tokens_seen": 18123296, + "router_z_loss_mlp": 1.43359375, + "step": 234, + "time_per_iteration": 2.8496198654174805 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01167121, + "balance_loss_mlp": 1.02473748, + "epoch": 0.04520969603693728, + "flos": 512798252544.0, + "grad_norm": 0.03236637347420306, + "language_loss": 1.0231185, + "learning_rate": 0.0009993943795076528, + "loss": 1.03478956, + "num_input_tokens_seen": 18192784, + "router_z_loss_mlp": 1.42675781, + "step": 235, + "time_per_iteration": 2.6304001808166504 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01157951, + "balance_loss_mlp": 1.01623452, + "epoch": 0.04540207772220085, + "flos": 365878555392.0, + "grad_norm": 0.04557463461025321, + "language_loss": 1.04854226, + "learning_rate": 0.0009993789534991427, + "loss": 1.06012177, + "num_input_tokens_seen": 18254064, + "router_z_loss_mlp": 1.41992188, + "step": 236, + "time_per_iteration": 2.500347852706909 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01156422, + "balance_loss_mlp": 1.01613641, + "epoch": 0.045594459407464406, + "flos": 523724323584.0, + "grad_norm": 0.028810086143122388, + "language_loss": 0.99360317, + "learning_rate": 0.0009993633336145287, + "loss": 1.00516737, + "num_input_tokens_seen": 18325728, + "router_z_loss_mlp": 1.40527344, + "step": 237, + "time_per_iteration": 2.6991968154907227 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01156358, + "balance_loss_mlp": 1.01664495, + "epoch": 0.04578684109272797, + "flos": 673116966144.0, + "grad_norm": 0.036851747197037266, + "language_loss": 1.03695393, + "learning_rate": 0.0009993475198598752, + "loss": 1.04851758, + "num_input_tokens_seen": 18408608, + "router_z_loss_mlp": 1.39941406, + "step": 238, + "time_per_iteration": 3.0150160789489746 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01160083, + "balance_loss_mlp": 1.02084696, + "epoch": 0.04597922277799153, + "flos": 542621374464.0, + "grad_norm": 0.03967898438127139, + "language_loss": 1.00323462, + "learning_rate": 0.0009993315122413212, + "loss": 1.01483548, + "num_input_tokens_seen": 18471920, + "router_z_loss_mlp": 1.39453125, + "step": 239, + "time_per_iteration": 2.6226179599761963 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0115528, + "balance_loss_mlp": 1.01690221, + "epoch": 0.0461716044632551, + "flos": 459994413312.0, + "grad_norm": 0.029756199222484733, + "language_loss": 1.00536144, + "learning_rate": 0.0009993153107650818, + "loss": 1.01691425, + "num_input_tokens_seen": 18540496, + "router_z_loss_mlp": 1.38574219, + "step": 240, + "time_per_iteration": 2.635673999786377 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01154071, + "balance_loss_mlp": 1.01607406, + "epoch": 0.04636398614851866, + "flos": 456171261696.0, + "grad_norm": 0.03103837756937707, + "language_loss": 0.99882519, + "learning_rate": 0.0009992989154374468, + "loss": 1.01036584, + "num_input_tokens_seen": 18606944, + "router_z_loss_mlp": 1.38183594, + "step": 241, + "time_per_iteration": 2.5449135303497314 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0115588, + "balance_loss_mlp": 1.01836014, + "epoch": 0.046556367833782225, + "flos": 557902320384.0, + "grad_norm": 0.06487144756994469, + "language_loss": 1.0686537, + "learning_rate": 0.0009992823262647817, + "loss": 1.08021247, + "num_input_tokens_seen": 18679520, + "router_z_loss_mlp": 1.37695312, + "step": 242, + "time_per_iteration": 2.705120325088501 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011561, + "balance_loss_mlp": 1.01905739, + "epoch": 0.046748749519045785, + "flos": 594088613376.0, + "grad_norm": 0.03633512017688626, + "language_loss": 1.00915635, + "learning_rate": 0.0009992655432535264, + "loss": 1.02071738, + "num_input_tokens_seen": 18756656, + "router_z_loss_mlp": 1.37207031, + "step": 243, + "time_per_iteration": 2.8158721923828125 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01160044, + "balance_loss_mlp": 1.02347767, + "epoch": 0.04694113120430935, + "flos": 570942645504.0, + "grad_norm": 0.036353271768507285, + "language_loss": 1.01172018, + "learning_rate": 0.0009992485664101973, + "loss": 1.02332067, + "num_input_tokens_seen": 18829792, + "router_z_loss_mlp": 1.3671875, + "step": 244, + "time_per_iteration": 2.723409414291382 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01156505, + "balance_loss_mlp": 1.0207969, + "epoch": 0.04713351288957291, + "flos": 865246689024.0, + "grad_norm": 0.05316255083066814, + "language_loss": 1.03417325, + "learning_rate": 0.000999231395741385, + "loss": 1.04573822, + "num_input_tokens_seen": 18906864, + "router_z_loss_mlp": 1.35839844, + "step": 245, + "time_per_iteration": 3.1441562175750732 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01155254, + "balance_loss_mlp": 1.02011812, + "epoch": 0.04732589457483648, + "flos": 538236364032.0, + "grad_norm": 0.039550829703112036, + "language_loss": 1.01375949, + "learning_rate": 0.0009992140312537557, + "loss": 1.02531195, + "num_input_tokens_seen": 18973632, + "router_z_loss_mlp": 1.35253906, + "step": 246, + "time_per_iteration": 2.6407320499420166 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01158298, + "balance_loss_mlp": 1.02402055, + "epoch": 0.04751827626010004, + "flos": 763272612096.0, + "grad_norm": 0.029332271702031103, + "language_loss": 0.96132767, + "learning_rate": 0.000999196472954051, + "loss": 0.97291064, + "num_input_tokens_seen": 19052944, + "router_z_loss_mlp": 1.34375, + "step": 247, + "time_per_iteration": 2.9791386127471924 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0115741, + "balance_loss_mlp": 1.02313232, + "epoch": 0.0477106579453636, + "flos": 1583128462080.0, + "grad_norm": 0.019406803026512872, + "language_loss": 0.79424852, + "learning_rate": 0.0009991787208490878, + "loss": 0.80582267, + "num_input_tokens_seen": 19286288, + "router_z_loss_mlp": 1.34375, + "step": 248, + "time_per_iteration": 5.547277927398682 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0115733, + "balance_loss_mlp": 1.02457833, + "epoch": 0.04790303963062716, + "flos": 458693784576.0, + "grad_norm": 0.04949407998464004, + "language_loss": 1.04053593, + "learning_rate": 0.0009991607749457578, + "loss": 1.05210924, + "num_input_tokens_seen": 19349296, + "router_z_loss_mlp": 1.328125, + "step": 249, + "time_per_iteration": 2.610372304916382 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01158188, + "balance_loss_mlp": 1.02629459, + "epoch": 0.04809542131589073, + "flos": 783787186944.0, + "grad_norm": 0.03428496832179458, + "language_loss": 1.01565814, + "learning_rate": 0.0009991426352510286, + "loss": 1.02723992, + "num_input_tokens_seen": 19428416, + "router_z_loss_mlp": 1.31933594, + "step": 250, + "time_per_iteration": 2.9723451137542725 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01158477, + "balance_loss_mlp": 1.0272516, + "epoch": 0.04828780300115429, + "flos": 560322776064.0, + "grad_norm": 0.03370153589925739, + "language_loss": 1.02967048, + "learning_rate": 0.0009991243017719422, + "loss": 1.04125512, + "num_input_tokens_seen": 19498688, + "router_z_loss_mlp": 1.3125, + "step": 251, + "time_per_iteration": 2.691317319869995 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0115263, + "balance_loss_mlp": 1.02149975, + "epoch": 0.048480184686417856, + "flos": 502922989056.0, + "grad_norm": 0.033537523086657674, + "language_loss": 0.98110956, + "learning_rate": 0.0009991057745156165, + "loss": 0.99263585, + "num_input_tokens_seen": 19567568, + "router_z_loss_mlp": 1.31152344, + "step": 252, + "time_per_iteration": 2.615726947784424 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01126877, + "balance_loss_mlp": 0.99641418, + "epoch": 0.048672566371681415, + "flos": 1539471810048.0, + "grad_norm": 0.00943295316075806, + "language_loss": 0.81910986, + "learning_rate": 0.0009990870534892446, + "loss": 0.83037865, + "num_input_tokens_seen": 19796368, + "router_z_loss_mlp": 1.3046875, + "step": 253, + "time_per_iteration": 5.119662523269653 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01155145, + "balance_loss_mlp": 1.02439594, + "epoch": 0.04886494805694498, + "flos": 538952779776.0, + "grad_norm": 0.04101934284448647, + "language_loss": 1.06555986, + "learning_rate": 0.0009990681387000943, + "loss": 1.07711136, + "num_input_tokens_seen": 19870480, + "router_z_loss_mlp": 1.30761719, + "step": 254, + "time_per_iteration": 2.7494144439697266 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153346, + "balance_loss_mlp": 1.02316916, + "epoch": 0.04905732974220854, + "flos": 681485521152.0, + "grad_norm": 0.029284228955777224, + "language_loss": 1.01195645, + "learning_rate": 0.0009990490301555093, + "loss": 1.02348995, + "num_input_tokens_seen": 19956288, + "router_z_loss_mlp": 1.30175781, + "step": 255, + "time_per_iteration": 2.9595844745635986 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113356, + "balance_loss_mlp": 1.00462341, + "epoch": 0.04924971142747211, + "flos": 1424277573120.0, + "grad_norm": 0.011666997955433429, + "language_loss": 0.79215157, + "learning_rate": 0.0009990297278629078, + "loss": 0.80348712, + "num_input_tokens_seen": 20180080, + "router_z_loss_mlp": 1.2890625, + "step": 256, + "time_per_iteration": 4.918023347854614 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01126785, + "balance_loss_mlp": 0.99822998, + "epoch": 0.04944209311273567, + "flos": 1561239381504.0, + "grad_norm": 0.006197531934497474, + "language_loss": 0.79242742, + "learning_rate": 0.000999010231829784, + "loss": 0.80369532, + "num_input_tokens_seen": 20413456, + "router_z_loss_mlp": 1.28515625, + "step": 257, + "time_per_iteration": 4.996341228485107 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01127556, + "balance_loss_mlp": 0.99976349, + "epoch": 0.04963447479799923, + "flos": 1574173748736.0, + "grad_norm": 0.01126324229515774, + "language_loss": 0.69975883, + "learning_rate": 0.0009989905420637066, + "loss": 0.71103442, + "num_input_tokens_seen": 20644736, + "router_z_loss_mlp": 1.27734375, + "step": 258, + "time_per_iteration": 4.951507329940796 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01167552, + "balance_loss_mlp": 1.03966403, + "epoch": 0.049826856483262794, + "flos": 626499386880.0, + "grad_norm": 0.07394024090910019, + "language_loss": 0.96613419, + "learning_rate": 0.0009989706585723202, + "loss": 0.97780967, + "num_input_tokens_seen": 20719040, + "router_z_loss_mlp": 1.27832031, + "step": 259, + "time_per_iteration": 2.819796085357666 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01158437, + "balance_loss_mlp": 1.03073978, + "epoch": 0.05001923816852635, + "flos": 505156806912.0, + "grad_norm": 0.042054435700702504, + "language_loss": 1.02184892, + "learning_rate": 0.0009989505813633442, + "loss": 1.0334332, + "num_input_tokens_seen": 20789376, + "router_z_loss_mlp": 1.27636719, + "step": 260, + "time_per_iteration": 2.671597719192505 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149384, + "balance_loss_mlp": 1.02206886, + "epoch": 0.05021161985378992, + "flos": 588468102912.0, + "grad_norm": 0.05343186989039486, + "language_loss": 1.02308297, + "learning_rate": 0.000998930310444573, + "loss": 1.03457689, + "num_input_tokens_seen": 20857856, + "router_z_loss_mlp": 1.27246094, + "step": 261, + "time_per_iteration": 2.7573728561401367 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145576, + "balance_loss_mlp": 1.01883233, + "epoch": 0.05040400153905348, + "flos": 634403292672.0, + "grad_norm": 0.052960623500171895, + "language_loss": 1.00806391, + "learning_rate": 0.0009989098458238765, + "loss": 1.01951981, + "num_input_tokens_seen": 20931232, + "router_z_loss_mlp": 1.26660156, + "step": 262, + "time_per_iteration": 2.7937912940979004 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146699, + "balance_loss_mlp": 1.02033675, + "epoch": 0.050596383224317046, + "flos": 554809190400.0, + "grad_norm": 0.04531187332347281, + "language_loss": 0.99888676, + "learning_rate": 0.0009988891875091998, + "loss": 1.0103538, + "num_input_tokens_seen": 21012672, + "router_z_loss_mlp": 1.26269531, + "step": 263, + "time_per_iteration": 2.811218500137329 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145189, + "balance_loss_mlp": 1.01939976, + "epoch": 0.050788764909580605, + "flos": 550762462464.0, + "grad_norm": 0.03965392167411722, + "language_loss": 0.94696999, + "learning_rate": 0.0009988683355085636, + "loss": 0.95842183, + "num_input_tokens_seen": 21088592, + "router_z_loss_mlp": 1.25683594, + "step": 264, + "time_per_iteration": 2.7378242015838623 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141586, + "balance_loss_mlp": 1.01617777, + "epoch": 0.05098114659484417, + "flos": 606345448704.0, + "grad_norm": 0.024717188615823983, + "language_loss": 1.02827787, + "learning_rate": 0.000998847289830063, + "loss": 1.03969371, + "num_input_tokens_seen": 21169840, + "router_z_loss_mlp": 1.25292969, + "step": 265, + "time_per_iteration": 2.8625917434692383 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142152, + "balance_loss_mlp": 1.01693416, + "epoch": 0.05117352828010773, + "flos": 439473035520.0, + "grad_norm": 0.036783183293041616, + "language_loss": 0.96527213, + "learning_rate": 0.0009988260504818682, + "loss": 0.97669363, + "num_input_tokens_seen": 21236144, + "router_z_loss_mlp": 1.25097656, + "step": 266, + "time_per_iteration": 2.5658230781555176 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138539, + "balance_loss_mlp": 1.0135119, + "epoch": 0.0513659099653713, + "flos": 506031670272.0, + "grad_norm": 0.04116504124695153, + "language_loss": 1.03285778, + "learning_rate": 0.000998804617472226, + "loss": 1.0442431, + "num_input_tokens_seen": 21304864, + "router_z_loss_mlp": 1.24902344, + "step": 267, + "time_per_iteration": 2.63395094871521 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138256, + "balance_loss_mlp": 1.01418352, + "epoch": 0.05155829165063486, + "flos": 696715922688.0, + "grad_norm": 0.034853618125567455, + "language_loss": 0.98327756, + "learning_rate": 0.0009987829908094568, + "loss": 0.9946602, + "num_input_tokens_seen": 21377504, + "router_z_loss_mlp": 1.23925781, + "step": 268, + "time_per_iteration": 2.8239262104034424 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136912, + "balance_loss_mlp": 1.01331627, + "epoch": 0.051750673335898424, + "flos": 1350302059008.0, + "grad_norm": 0.042488112993129025, + "language_loss": 1.04893267, + "learning_rate": 0.0009987611705019569, + "loss": 1.0603019, + "num_input_tokens_seen": 21463840, + "router_z_loss_mlp": 1.234375, + "step": 269, + "time_per_iteration": 4.33854079246521 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137181, + "balance_loss_mlp": 1.01387095, + "epoch": 0.051943055021161984, + "flos": 490590331392.0, + "grad_norm": 0.037116049987967636, + "language_loss": 1.03026497, + "learning_rate": 0.0009987391565581978, + "loss": 1.04163671, + "num_input_tokens_seen": 21531184, + "router_z_loss_mlp": 1.23144531, + "step": 270, + "time_per_iteration": 2.609722852706909 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136969, + "balance_loss_mlp": 1.01365864, + "epoch": 0.05213543670642555, + "flos": 546880985088.0, + "grad_norm": 0.03927026934880779, + "language_loss": 0.95517516, + "learning_rate": 0.000998716948986726, + "loss": 0.96654487, + "num_input_tokens_seen": 21612224, + "router_z_loss_mlp": 1.23144531, + "step": 271, + "time_per_iteration": 2.797673225402832 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137765, + "balance_loss_mlp": 1.01512277, + "epoch": 0.05232781839168911, + "flos": 604673489664.0, + "grad_norm": 0.04118655717732696, + "language_loss": 0.97937191, + "learning_rate": 0.0009986945477961633, + "loss": 0.9907496, + "num_input_tokens_seen": 21681024, + "router_z_loss_mlp": 1.22460938, + "step": 272, + "time_per_iteration": 2.6988775730133057 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135438, + "balance_loss_mlp": 1.01336777, + "epoch": 0.052520200076952676, + "flos": 539656556544.0, + "grad_norm": 0.027940819886650203, + "language_loss": 1.02222085, + "learning_rate": 0.0009986719529952066, + "loss": 1.0335753, + "num_input_tokens_seen": 21761616, + "router_z_loss_mlp": 1.21875, + "step": 273, + "time_per_iteration": 2.9503016471862793 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01133251, + "balance_loss_mlp": 1.01175284, + "epoch": 0.052712581762216236, + "flos": 464333736960.0, + "grad_norm": 0.036678205813438995, + "language_loss": 1.02377117, + "learning_rate": 0.000998649164592628, + "loss": 1.0351038, + "num_input_tokens_seen": 21828416, + "router_z_loss_mlp": 1.21289062, + "step": 274, + "time_per_iteration": 2.575183868408203 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134193, + "balance_loss_mlp": 1.01279056, + "epoch": 0.0529049634474798, + "flos": 549106054656.0, + "grad_norm": 0.029580362230619023, + "language_loss": 1.00386071, + "learning_rate": 0.0009986261825972748, + "loss": 1.01520276, + "num_input_tokens_seen": 21901600, + "router_z_loss_mlp": 1.21191406, + "step": 275, + "time_per_iteration": 2.781388521194458 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136428, + "balance_loss_mlp": 1.01578796, + "epoch": 0.05309734513274336, + "flos": 619201081344.0, + "grad_norm": 0.028327187192750843, + "language_loss": 1.01742268, + "learning_rate": 0.000998603007018069, + "loss": 1.0287869, + "num_input_tokens_seen": 21979312, + "router_z_loss_mlp": 1.20410156, + "step": 276, + "time_per_iteration": 2.8231008052825928 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137197, + "balance_loss_mlp": 1.01665294, + "epoch": 0.05328972681800693, + "flos": 606618602496.0, + "grad_norm": 0.02408735734832513, + "language_loss": 1.00149679, + "learning_rate": 0.0009985796378640089, + "loss": 1.01286888, + "num_input_tokens_seen": 22053776, + "router_z_loss_mlp": 1.203125, + "step": 277, + "time_per_iteration": 2.721719264984131 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136169, + "balance_loss_mlp": 1.01610124, + "epoch": 0.05348210850327049, + "flos": 605731100160.0, + "grad_norm": 0.0319931943489141, + "language_loss": 0.99697894, + "learning_rate": 0.0009985560751441665, + "loss": 1.0083406, + "num_input_tokens_seen": 22134304, + "router_z_loss_mlp": 1.19824219, + "step": 278, + "time_per_iteration": 2.835160255432129 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01133809, + "balance_loss_mlp": 1.01412332, + "epoch": 0.053674490188534055, + "flos": 631998388224.0, + "grad_norm": 0.030840524384760076, + "language_loss": 1.0228467, + "learning_rate": 0.00099853231886769, + "loss": 1.03418469, + "num_input_tokens_seen": 22212896, + "router_z_loss_mlp": 1.19433594, + "step": 279, + "time_per_iteration": 2.8541102409362793 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01131641, + "balance_loss_mlp": 1.01243138, + "epoch": 0.053866871873797614, + "flos": 480174596352.0, + "grad_norm": 0.030057370429500904, + "language_loss": 1.01521945, + "learning_rate": 0.0009985083690438024, + "loss": 1.02653599, + "num_input_tokens_seen": 22287216, + "router_z_loss_mlp": 1.18945312, + "step": 280, + "time_per_iteration": 2.778996706008911 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01133594, + "balance_loss_mlp": 1.01514757, + "epoch": 0.054059253559061174, + "flos": 789490322688.0, + "grad_norm": 0.030570218765999514, + "language_loss": 0.92515564, + "learning_rate": 0.0009984842256818016, + "loss": 0.93649161, + "num_input_tokens_seen": 22370864, + "router_z_loss_mlp": 1.18164062, + "step": 281, + "time_per_iteration": 3.113694429397583 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137735, + "balance_loss_mlp": 1.01928854, + "epoch": 0.05425163524432474, + "flos": 629506000896.0, + "grad_norm": 0.043548376252248826, + "language_loss": 1.03102541, + "learning_rate": 0.0009984598887910613, + "loss": 1.04240274, + "num_input_tokens_seen": 22440080, + "router_z_loss_mlp": 1.18164062, + "step": 282, + "time_per_iteration": 2.8303444385528564 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01132534, + "balance_loss_mlp": 1.01504183, + "epoch": 0.0544440169295883, + "flos": 616993508352.0, + "grad_norm": 0.05077708884656826, + "language_loss": 0.98823464, + "learning_rate": 0.0009984353583810297, + "loss": 0.99956, + "num_input_tokens_seen": 22517936, + "router_z_loss_mlp": 1.171875, + "step": 283, + "time_per_iteration": 2.835850954055786 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01129981, + "balance_loss_mlp": 1.01315546, + "epoch": 0.05463639861485187, + "flos": 648930884352.0, + "grad_norm": 0.03524270200319673, + "language_loss": 1.0117259, + "learning_rate": 0.0009984106344612302, + "loss": 1.02302563, + "num_input_tokens_seen": 22590480, + "router_z_loss_mlp": 1.16503906, + "step": 284, + "time_per_iteration": 2.760528564453125 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01129453, + "balance_loss_mlp": 1.01319993, + "epoch": 0.054828780300115426, + "flos": 798585987072.0, + "grad_norm": 0.03078454247465455, + "language_loss": 0.96210134, + "learning_rate": 0.0009983857170412615, + "loss": 0.97339588, + "num_input_tokens_seen": 22668144, + "router_z_loss_mlp": 1.15917969, + "step": 285, + "time_per_iteration": 2.9911587238311768 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01131741, + "balance_loss_mlp": 1.01567924, + "epoch": 0.05502116198537899, + "flos": 550799400960.0, + "grad_norm": 0.028192528419898312, + "language_loss": 0.95645988, + "learning_rate": 0.000998360606130798, + "loss": 0.96777725, + "num_input_tokens_seen": 22749648, + "router_z_loss_mlp": 1.15722656, + "step": 286, + "time_per_iteration": 2.8603405952453613 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01119957, + "balance_loss_mlp": 1.00475311, + "epoch": 0.05521354367064255, + "flos": 1410909659136.0, + "grad_norm": 0.016802553847575376, + "language_loss": 0.69073117, + "learning_rate": 0.0009983353017395877, + "loss": 0.70193076, + "num_input_tokens_seen": 22982752, + "router_z_loss_mlp": 1.1484375, + "step": 287, + "time_per_iteration": 4.872994899749756 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139216, + "balance_loss_mlp": 1.02372622, + "epoch": 0.05540592535590612, + "flos": 646612495872.0, + "grad_norm": 0.03160477576624613, + "language_loss": 1.01500821, + "learning_rate": 0.0009983098038774552, + "loss": 1.02640033, + "num_input_tokens_seen": 23053584, + "router_z_loss_mlp": 1.15136719, + "step": 288, + "time_per_iteration": 2.7645044326782227 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01119652, + "balance_loss_mlp": 1.00521088, + "epoch": 0.05559830704116968, + "flos": 1514318512896.0, + "grad_norm": 0.011772143096286682, + "language_loss": 0.78170228, + "learning_rate": 0.0009982841125542993, + "loss": 0.79289877, + "num_input_tokens_seen": 23280256, + "router_z_loss_mlp": 1.140625, + "step": 289, + "time_per_iteration": 4.783201456069946 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150059, + "balance_loss_mlp": 1.03542745, + "epoch": 0.055790688726433245, + "flos": 509335737600.0, + "grad_norm": 0.037615798403722346, + "language_loss": 1.00063777, + "learning_rate": 0.0009982582277800948, + "loss": 1.01213825, + "num_input_tokens_seen": 23345760, + "router_z_loss_mlp": 1.14257812, + "step": 290, + "time_per_iteration": 2.5825588703155518 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142418, + "balance_loss_mlp": 1.02873969, + "epoch": 0.055983070411696804, + "flos": 659075410944.0, + "grad_norm": 0.03490310528255379, + "language_loss": 1.06654799, + "learning_rate": 0.0009982321495648908, + "loss": 1.07797217, + "num_input_tokens_seen": 23420720, + "router_z_loss_mlp": 1.13671875, + "step": 291, + "time_per_iteration": 2.8099231719970703 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137522, + "balance_loss_mlp": 1.02470279, + "epoch": 0.05617545209696037, + "flos": 588476851200.0, + "grad_norm": 0.035465642673631545, + "language_loss": 0.97683877, + "learning_rate": 0.0009982058779188115, + "loss": 0.98821402, + "num_input_tokens_seen": 23492576, + "router_z_loss_mlp": 1.13183594, + "step": 292, + "time_per_iteration": 2.7125580310821533 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136096, + "balance_loss_mlp": 1.02384841, + "epoch": 0.05636783378222393, + "flos": 612788332800.0, + "grad_norm": 0.032210362870472055, + "language_loss": 1.05647731, + "learning_rate": 0.0009981794128520567, + "loss": 1.06783831, + "num_input_tokens_seen": 23569824, + "router_z_loss_mlp": 1.12597656, + "step": 293, + "time_per_iteration": 2.7916390895843506 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135514, + "balance_loss_mlp": 1.0241251, + "epoch": 0.0565602154674875, + "flos": 669424071936.0, + "grad_norm": 0.03595229916115603, + "language_loss": 1.02550793, + "learning_rate": 0.000998152754374901, + "loss": 1.03686309, + "num_input_tokens_seen": 23649984, + "router_z_loss_mlp": 1.1171875, + "step": 294, + "time_per_iteration": 2.8770558834075928 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134115, + "balance_loss_mlp": 1.0227263, + "epoch": 0.05675259715275106, + "flos": 618365101824.0, + "grad_norm": 0.028486588423889302, + "language_loss": 0.98274708, + "learning_rate": 0.0009981259024976943, + "loss": 0.99408829, + "num_input_tokens_seen": 23722032, + "router_z_loss_mlp": 1.1171875, + "step": 295, + "time_per_iteration": 2.729853630065918 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01133246, + "balance_loss_mlp": 1.02204788, + "epoch": 0.05694497883801462, + "flos": 753154330368.0, + "grad_norm": 0.04188437456637708, + "language_loss": 0.968624, + "learning_rate": 0.0009980988572308612, + "loss": 0.97995651, + "num_input_tokens_seen": 23797376, + "router_z_loss_mlp": 1.11523438, + "step": 296, + "time_per_iteration": 3.0135345458984375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01132496, + "balance_loss_mlp": 1.02187026, + "epoch": 0.05713736052327818, + "flos": 713382067968.0, + "grad_norm": 0.0305883196599643, + "language_loss": 0.9903996, + "learning_rate": 0.0009980716185849015, + "loss": 1.0017246, + "num_input_tokens_seen": 23880496, + "router_z_loss_mlp": 1.109375, + "step": 297, + "time_per_iteration": 2.9962668418884277 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01129278, + "balance_loss_mlp": 1.01865172, + "epoch": 0.05732974220854175, + "flos": 469936750848.0, + "grad_norm": 0.029025981508343963, + "language_loss": 0.95620793, + "learning_rate": 0.0009980441865703904, + "loss": 0.96750069, + "num_input_tokens_seen": 23950016, + "router_z_loss_mlp": 1.109375, + "step": 298, + "time_per_iteration": 2.67486572265625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01126421, + "balance_loss_mlp": 1.0163666, + "epoch": 0.05752212389380531, + "flos": 602541739008.0, + "grad_norm": 0.028406065642448373, + "language_loss": 1.04190016, + "learning_rate": 0.000998016561197978, + "loss": 1.05316436, + "num_input_tokens_seen": 24020064, + "router_z_loss_mlp": 1.10351562, + "step": 299, + "time_per_iteration": 2.7435965538024902 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01127499, + "balance_loss_mlp": 1.01773107, + "epoch": 0.057714505579068875, + "flos": 679950622464.0, + "grad_norm": 0.02999406165417261, + "language_loss": 0.957955, + "learning_rate": 0.0009979887424783895, + "loss": 0.96922994, + "num_input_tokens_seen": 24095360, + "router_z_loss_mlp": 1.10058594, + "step": 300, + "time_per_iteration": 2.868412494659424 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01127678, + "balance_loss_mlp": 1.01800561, + "epoch": 0.057906887264332435, + "flos": 597012602112.0, + "grad_norm": 0.033381964405594114, + "language_loss": 0.95279002, + "learning_rate": 0.0009979607304224248, + "loss": 0.96406674, + "num_input_tokens_seen": 24164608, + "router_z_loss_mlp": 1.09960938, + "step": 301, + "time_per_iteration": 2.7196099758148193 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01127179, + "balance_loss_mlp": 1.01760185, + "epoch": 0.058099268949596, + "flos": 553165421568.0, + "grad_norm": 0.029428698202492602, + "language_loss": 1.02305853, + "learning_rate": 0.000997932525040959, + "loss": 1.03433037, + "num_input_tokens_seen": 24233840, + "router_z_loss_mlp": 1.09863281, + "step": 302, + "time_per_iteration": 2.645131826400757 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01126073, + "balance_loss_mlp": 1.0166868, + "epoch": 0.05829165063485956, + "flos": 509231725056.0, + "grad_norm": 0.033454482596205204, + "language_loss": 1.04832363, + "learning_rate": 0.000997904126344943, + "loss": 1.05958426, + "num_input_tokens_seen": 24302928, + "router_z_loss_mlp": 1.09667969, + "step": 303, + "time_per_iteration": 2.60955810546875 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125584, + "balance_loss_mlp": 1.0157212, + "epoch": 0.05848403232012313, + "flos": 616363608576.0, + "grad_norm": 0.0319979050325151, + "language_loss": 1.00779867, + "learning_rate": 0.0009978755343454018, + "loss": 1.01905453, + "num_input_tokens_seen": 24377024, + "router_z_loss_mlp": 1.1015625, + "step": 304, + "time_per_iteration": 2.733825206756592 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124254, + "balance_loss_mlp": 1.01467645, + "epoch": 0.05867641400538669, + "flos": 501079943424.0, + "grad_norm": 0.03385536533959698, + "language_loss": 1.01509869, + "learning_rate": 0.0009978467490534355, + "loss": 1.0263412, + "num_input_tokens_seen": 24442736, + "router_z_loss_mlp": 1.09863281, + "step": 305, + "time_per_iteration": 2.6263206005096436 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121932, + "balance_loss_mlp": 1.01292717, + "epoch": 0.05886879569065025, + "flos": 532379638272.0, + "grad_norm": 0.03088897761094542, + "language_loss": 0.98605353, + "learning_rate": 0.00099781777048022, + "loss": 0.99727285, + "num_input_tokens_seen": 24514800, + "router_z_loss_mlp": 1.09277344, + "step": 306, + "time_per_iteration": 2.7351841926574707 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122107, + "balance_loss_mlp": 1.01329267, + "epoch": 0.05906117737591381, + "flos": 490041111552.0, + "grad_norm": 0.034758856969872284, + "language_loss": 0.99957371, + "learning_rate": 0.0009977885986370057, + "loss": 1.01079476, + "num_input_tokens_seen": 24581648, + "router_z_loss_mlp": 1.09082031, + "step": 307, + "time_per_iteration": 2.566316843032837 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01120423, + "balance_loss_mlp": 1.01199007, + "epoch": 0.05925355906117737, + "flos": 592710216960.0, + "grad_norm": 0.0408216139096099, + "language_loss": 0.95604599, + "learning_rate": 0.000997759233535118, + "loss": 0.96725023, + "num_input_tokens_seen": 24658864, + "router_z_loss_mlp": 1.08691406, + "step": 308, + "time_per_iteration": 2.781667470932007 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01119623, + "balance_loss_mlp": 1.01147592, + "epoch": 0.05944594074644094, + "flos": 564788466432.0, + "grad_norm": 0.03543125546238922, + "language_loss": 1.01945186, + "learning_rate": 0.0009977296751859576, + "loss": 1.03064811, + "num_input_tokens_seen": 24735808, + "router_z_loss_mlp": 1.08398438, + "step": 309, + "time_per_iteration": 2.778700828552246 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121487, + "balance_loss_mlp": 1.0137223, + "epoch": 0.0596383224317045, + "flos": 539808201216.0, + "grad_norm": 0.03208598270087784, + "language_loss": 1.03591859, + "learning_rate": 0.0009976999236009998, + "loss": 1.04713345, + "num_input_tokens_seen": 24807744, + "router_z_loss_mlp": 1.08007812, + "step": 310, + "time_per_iteration": 2.790116786956787 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121449, + "balance_loss_mlp": 1.01387453, + "epoch": 0.059830704116968066, + "flos": 562053060864.0, + "grad_norm": 0.03260901983169028, + "language_loss": 1.05564129, + "learning_rate": 0.0009976699787917955, + "loss": 1.06685579, + "num_input_tokens_seen": 24876640, + "router_z_loss_mlp": 1.078125, + "step": 311, + "time_per_iteration": 2.6586148738861084 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108932, + "balance_loss_mlp": 1.00326538, + "epoch": 0.060023085802231625, + "flos": 1574050294272.0, + "grad_norm": 0.018314702584398344, + "language_loss": 0.73442996, + "learning_rate": 0.00099763984076997, + "loss": 0.74551928, + "num_input_tokens_seen": 25110864, + "router_z_loss_mlp": 1.05859375, + "step": 312, + "time_per_iteration": 4.943182945251465 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01128012, + "balance_loss_mlp": 1.02101004, + "epoch": 0.06021546748749519, + "flos": 483628363008.0, + "grad_norm": 0.04396023920554742, + "language_loss": 0.97026515, + "learning_rate": 0.0009976095095472243, + "loss": 0.98154521, + "num_input_tokens_seen": 25179328, + "router_z_loss_mlp": 1.07226562, + "step": 313, + "time_per_iteration": 2.619016408920288 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01131165, + "balance_loss_mlp": 1.02425838, + "epoch": 0.06040784917275875, + "flos": 621424205568.0, + "grad_norm": 0.03687701456451143, + "language_loss": 0.97965562, + "learning_rate": 0.0009975789851353334, + "loss": 0.99096727, + "num_input_tokens_seen": 25254128, + "router_z_loss_mlp": 1.07128906, + "step": 314, + "time_per_iteration": 2.8331894874572754 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125758, + "balance_loss_mlp": 1.01980519, + "epoch": 0.06060023085802232, + "flos": 484603348224.0, + "grad_norm": 0.029408756794299912, + "language_loss": 1.00726843, + "learning_rate": 0.0009975482675461487, + "loss": 1.01852608, + "num_input_tokens_seen": 25324624, + "router_z_loss_mlp": 1.06152344, + "step": 315, + "time_per_iteration": 2.659079074859619 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125971, + "balance_loss_mlp": 1.02001762, + "epoch": 0.06079261254328588, + "flos": 582986598144.0, + "grad_norm": 0.027344501346145803, + "language_loss": 0.98408186, + "learning_rate": 0.0009975173567915952, + "loss": 0.99534154, + "num_input_tokens_seen": 25393648, + "router_z_loss_mlp": 1.06152344, + "step": 316, + "time_per_iteration": 2.6947872638702393 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01123736, + "balance_loss_mlp": 1.01873684, + "epoch": 0.060984994228549444, + "flos": 689009348352.0, + "grad_norm": 0.03553374767777348, + "language_loss": 0.92618632, + "learning_rate": 0.000997486252883674, + "loss": 0.93742371, + "num_input_tokens_seen": 25469152, + "router_z_loss_mlp": 1.05175781, + "step": 317, + "time_per_iteration": 2.8523428440093994 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01123139, + "balance_loss_mlp": 1.01861632, + "epoch": 0.061177375913813004, + "flos": 1316749104384.0, + "grad_norm": 0.03506621320439297, + "language_loss": 0.97693729, + "learning_rate": 0.0009974549558344602, + "loss": 0.98816866, + "num_input_tokens_seen": 25560944, + "router_z_loss_mlp": 1.046875, + "step": 318, + "time_per_iteration": 3.705524206161499 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121913, + "balance_loss_mlp": 1.01805806, + "epoch": 0.06136975759907657, + "flos": 575401532928.0, + "grad_norm": 0.03493031867187039, + "language_loss": 1.07333064, + "learning_rate": 0.000997423465656105, + "loss": 1.08454978, + "num_input_tokens_seen": 25631424, + "router_z_loss_mlp": 1.04003906, + "step": 319, + "time_per_iteration": 2.75838565826416 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01119126, + "balance_loss_mlp": 1.01546133, + "epoch": 0.06156213928434013, + "flos": 528565234944.0, + "grad_norm": 0.037170039701900144, + "language_loss": 1.04350638, + "learning_rate": 0.0009973917823608335, + "loss": 1.05469775, + "num_input_tokens_seen": 25698176, + "router_z_loss_mlp": 1.03808594, + "step": 320, + "time_per_iteration": 2.6494460105895996 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117676, + "balance_loss_mlp": 1.01458335, + "epoch": 0.061754520969603696, + "flos": 496590920448.0, + "grad_norm": 0.030464742512101767, + "language_loss": 0.98981547, + "learning_rate": 0.0009973599059609462, + "loss": 1.00099218, + "num_input_tokens_seen": 25773472, + "router_z_loss_mlp": 1.03222656, + "step": 321, + "time_per_iteration": 2.7119081020355225 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116635, + "balance_loss_mlp": 1.01344728, + "epoch": 0.061946902654867256, + "flos": 441044872704.0, + "grad_norm": 0.031106795532346753, + "language_loss": 0.97035432, + "learning_rate": 0.000997327836468819, + "loss": 0.98152065, + "num_input_tokens_seen": 25841088, + "router_z_loss_mlp": 1.03320312, + "step": 322, + "time_per_iteration": 2.641977071762085 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121262, + "balance_loss_mlp": 1.01836073, + "epoch": 0.06213928434013082, + "flos": 600043515648.0, + "grad_norm": 0.031546338171402045, + "language_loss": 1.00120687, + "learning_rate": 0.000997295573896902, + "loss": 1.01241946, + "num_input_tokens_seen": 25919424, + "router_z_loss_mlp": 1.03027344, + "step": 323, + "time_per_iteration": 2.825425624847412 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113502, + "balance_loss_mlp": 1.01126862, + "epoch": 0.06233166602539438, + "flos": 1453116961536.0, + "grad_norm": 0.009515746361157745, + "language_loss": 0.8119604, + "learning_rate": 0.000997263118257721, + "loss": 0.82309544, + "num_input_tokens_seen": 26135504, + "router_z_loss_mlp": 1.0234375, + "step": 324, + "time_per_iteration": 4.7325074672698975 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108894, + "balance_loss_mlp": 1.0074234, + "epoch": 0.06252404771065795, + "flos": 1466631651072.0, + "grad_norm": 0.010337204897298672, + "language_loss": 0.78571939, + "learning_rate": 0.0009972304695638763, + "loss": 0.79680836, + "num_input_tokens_seen": 26358880, + "router_z_loss_mlp": 1.015625, + "step": 325, + "time_per_iteration": 4.845058917999268 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01131262, + "balance_loss_mlp": 1.02950513, + "epoch": 0.06271642939592151, + "flos": 465236790528.0, + "grad_norm": 0.04479189972062717, + "language_loss": 0.94122899, + "learning_rate": 0.000997197627828043, + "loss": 0.95254159, + "num_input_tokens_seen": 26425888, + "router_z_loss_mlp": 1.01855469, + "step": 326, + "time_per_iteration": 2.531477689743042 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136139, + "balance_loss_mlp": 1.03466833, + "epoch": 0.06290881108118507, + "flos": 533432391168.0, + "grad_norm": 0.03210871152906133, + "language_loss": 0.89633012, + "learning_rate": 0.0009971645930629716, + "loss": 0.9076916, + "num_input_tokens_seen": 26500656, + "router_z_loss_mlp": 1.015625, + "step": 327, + "time_per_iteration": 2.766155481338501 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01131438, + "balance_loss_mlp": 1.0305388, + "epoch": 0.06310119276644863, + "flos": 674768516352.0, + "grad_norm": 0.03217671154768682, + "language_loss": 1.03418863, + "learning_rate": 0.0009971313652814872, + "loss": 1.0455029, + "num_input_tokens_seen": 26577408, + "router_z_loss_mlp": 1.00976562, + "step": 328, + "time_per_iteration": 2.818718433380127 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125209, + "balance_loss_mlp": 1.02440596, + "epoch": 0.0632935744517122, + "flos": 772051381248.0, + "grad_norm": 0.03902843256426295, + "language_loss": 1.00692391, + "learning_rate": 0.0009970979444964903, + "loss": 1.01817608, + "num_input_tokens_seen": 26652048, + "router_z_loss_mlp": 1.00878906, + "step": 329, + "time_per_iteration": 2.9847218990325928 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01119216, + "balance_loss_mlp": 1.01869905, + "epoch": 0.06348595613697576, + "flos": 562975556352.0, + "grad_norm": 0.040034835413812295, + "language_loss": 1.01797342, + "learning_rate": 0.0009970643307209556, + "loss": 1.02916563, + "num_input_tokens_seen": 26728192, + "router_z_loss_mlp": 1.00585938, + "step": 330, + "time_per_iteration": 2.817711353302002 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112644, + "balance_loss_mlp": 1.01250839, + "epoch": 0.06367833782223932, + "flos": 677384358144.0, + "grad_norm": 0.031424074947949916, + "language_loss": 0.98358697, + "learning_rate": 0.0009970305239679334, + "loss": 0.99471337, + "num_input_tokens_seen": 26798016, + "router_z_loss_mlp": 1.00195312, + "step": 331, + "time_per_iteration": 2.8216280937194824 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011128, + "balance_loss_mlp": 1.01247358, + "epoch": 0.06387071950750288, + "flos": 496349847552.0, + "grad_norm": 0.04016029313197435, + "language_loss": 1.03082633, + "learning_rate": 0.0009969965242505483, + "loss": 1.04195428, + "num_input_tokens_seen": 26867536, + "router_z_loss_mlp": 1.00390625, + "step": 332, + "time_per_iteration": 2.631326675415039 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113411, + "balance_loss_mlp": 1.01317954, + "epoch": 0.06406310119276645, + "flos": 534557075712.0, + "grad_norm": 0.03761595064373852, + "language_loss": 0.99054992, + "learning_rate": 0.0009969623315820007, + "loss": 1.00168395, + "num_input_tokens_seen": 26941216, + "router_z_loss_mlp": 1.00292969, + "step": 333, + "time_per_iteration": 2.6700048446655273 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113642, + "balance_loss_mlp": 1.01369655, + "epoch": 0.06425548287803001, + "flos": 457165688832.0, + "grad_norm": 0.0356255093132357, + "language_loss": 0.99075055, + "learning_rate": 0.000996927945975565, + "loss": 1.00188696, + "num_input_tokens_seen": 27006560, + "router_z_loss_mlp": 0.99951172, + "step": 334, + "time_per_iteration": 2.567225933074951 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112774, + "balance_loss_mlp": 1.01282871, + "epoch": 0.06444786456329357, + "flos": 561123762432.0, + "grad_norm": 0.034265188200332725, + "language_loss": 0.96451521, + "learning_rate": 0.0009968933674445906, + "loss": 0.97564298, + "num_input_tokens_seen": 27076400, + "router_z_loss_mlp": 0.99951172, + "step": 335, + "time_per_iteration": 2.6834452152252197 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110954, + "balance_loss_mlp": 1.01100898, + "epoch": 0.06464024624855713, + "flos": 667357449984.0, + "grad_norm": 0.026754476738251005, + "language_loss": 0.980811, + "learning_rate": 0.0009968585960025028, + "loss": 0.99192053, + "num_input_tokens_seen": 27158672, + "router_z_loss_mlp": 0.99853516, + "step": 336, + "time_per_iteration": 2.9675402641296387 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112488, + "balance_loss_mlp": 1.01368713, + "epoch": 0.0648326279338207, + "flos": 1524558303744.0, + "grad_norm": 0.027483244216433014, + "language_loss": 0.77653188, + "learning_rate": 0.0009968236316628006, + "loss": 0.78765678, + "num_input_tokens_seen": 27380592, + "router_z_loss_mlp": 0.98632812, + "step": 337, + "time_per_iteration": 4.80242133140564 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115066, + "balance_loss_mlp": 1.01540756, + "epoch": 0.06502500961908426, + "flos": 1145216581632.0, + "grad_norm": 0.03509421691107687, + "language_loss": 0.96500707, + "learning_rate": 0.0009967884744390583, + "loss": 0.97615772, + "num_input_tokens_seen": 27469984, + "router_z_loss_mlp": 0.99414062, + "step": 338, + "time_per_iteration": 3.517488479614258 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118827, + "balance_loss_mlp": 1.01945412, + "epoch": 0.06521739130434782, + "flos": 583694265600.0, + "grad_norm": 0.03507378265000135, + "language_loss": 0.97375119, + "learning_rate": 0.0009967531243449256, + "loss": 0.98493946, + "num_input_tokens_seen": 27543904, + "router_z_loss_mlp": 0.9921875, + "step": 339, + "time_per_iteration": 2.713430404663086 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01119012, + "balance_loss_mlp": 1.02002037, + "epoch": 0.06540977298961138, + "flos": 498659487744.0, + "grad_norm": 0.03215705196534619, + "language_loss": 1.04762673, + "learning_rate": 0.000996717581394126, + "loss": 1.05881691, + "num_input_tokens_seen": 27609888, + "router_z_loss_mlp": 0.98876953, + "step": 340, + "time_per_iteration": 2.5391135215759277 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116775, + "balance_loss_mlp": 1.01787901, + "epoch": 0.06560215467487496, + "flos": 543904506624.0, + "grad_norm": 0.030763143460584817, + "language_loss": 1.05044627, + "learning_rate": 0.000996681845600459, + "loss": 1.06161404, + "num_input_tokens_seen": 27683936, + "router_z_loss_mlp": 0.98632812, + "step": 341, + "time_per_iteration": 2.670804262161255 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118249, + "balance_loss_mlp": 1.01963949, + "epoch": 0.06579453636013852, + "flos": 414351819264.0, + "grad_norm": 0.040583240554979534, + "language_loss": 0.9744029, + "learning_rate": 0.0009966459169777982, + "loss": 0.98558539, + "num_input_tokens_seen": 27747840, + "router_z_loss_mlp": 0.98388672, + "step": 342, + "time_per_iteration": 2.5040364265441895 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115105, + "balance_loss_mlp": 1.01706719, + "epoch": 0.06598691804540208, + "flos": 561681730560.0, + "grad_norm": 0.04164342519277061, + "language_loss": 1.05655766, + "learning_rate": 0.0009966097955400924, + "loss": 1.06770873, + "num_input_tokens_seen": 27819728, + "router_z_loss_mlp": 0.97949219, + "step": 343, + "time_per_iteration": 2.666548728942871 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112413, + "balance_loss_mlp": 1.01532912, + "epoch": 0.06617929973066564, + "flos": 573302830080.0, + "grad_norm": 0.03386977599556249, + "language_loss": 0.99970496, + "learning_rate": 0.0009965734813013652, + "loss": 1.01082909, + "num_input_tokens_seen": 27893536, + "router_z_loss_mlp": 0.97070312, + "step": 344, + "time_per_iteration": 2.8448328971862793 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109743, + "balance_loss_mlp": 1.01261127, + "epoch": 0.06637168141592921, + "flos": 491465194752.0, + "grad_norm": 0.03376822413453626, + "language_loss": 1.02026749, + "learning_rate": 0.0009965369742757151, + "loss": 1.03136492, + "num_input_tokens_seen": 27960976, + "router_z_loss_mlp": 0.97119141, + "step": 345, + "time_per_iteration": 2.568521738052368 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108276, + "balance_loss_mlp": 1.01176453, + "epoch": 0.06656406310119277, + "flos": 1081039518720.0, + "grad_norm": 0.03449730062562062, + "language_loss": 0.98245382, + "learning_rate": 0.0009965002744773152, + "loss": 0.99353665, + "num_input_tokens_seen": 28050864, + "router_z_loss_mlp": 0.96484375, + "step": 346, + "time_per_iteration": 3.501471519470215 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109602, + "balance_loss_mlp": 1.01347148, + "epoch": 0.06675644478645633, + "flos": 514723923456.0, + "grad_norm": 0.029121068034632647, + "language_loss": 0.95998263, + "learning_rate": 0.0009964633819204139, + "loss": 0.97107863, + "num_input_tokens_seen": 28122448, + "router_z_loss_mlp": 0.9609375, + "step": 347, + "time_per_iteration": 2.6675100326538086 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093636, + "balance_loss_mlp": 0.9986496, + "epoch": 0.06694882647171989, + "flos": 1450537079808.0, + "grad_norm": 0.008592618933675954, + "language_loss": 0.81801116, + "learning_rate": 0.0009964262966193338, + "loss": 0.82894754, + "num_input_tokens_seen": 28350352, + "router_z_loss_mlp": 0.94921875, + "step": 348, + "time_per_iteration": 4.92915415763855 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093124, + "balance_loss_mlp": 0.99832916, + "epoch": 0.06714120815698346, + "flos": 1555400152320.0, + "grad_norm": 0.006174818833869298, + "language_loss": 0.75153887, + "learning_rate": 0.000996389018588473, + "loss": 0.76247013, + "num_input_tokens_seen": 28585584, + "router_z_loss_mlp": 0.94726562, + "step": 349, + "time_per_iteration": 4.8783159255981445 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112042, + "balance_loss_mlp": 1.01719952, + "epoch": 0.06733358984224702, + "flos": 881617326336.0, + "grad_norm": 0.039044792628629706, + "language_loss": 0.95966816, + "learning_rate": 0.000996351547842304, + "loss": 0.97078854, + "num_input_tokens_seen": 28672512, + "router_z_loss_mlp": 0.94775391, + "step": 350, + "time_per_iteration": 3.151158094406128 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106972, + "balance_loss_mlp": 1.01222503, + "epoch": 0.06752597152751058, + "flos": 519918668544.0, + "grad_norm": 0.04011951728876299, + "language_loss": 0.94198334, + "learning_rate": 0.0009963138843953744, + "loss": 0.953053, + "num_input_tokens_seen": 28741520, + "router_z_loss_mlp": 0.94677734, + "step": 351, + "time_per_iteration": 2.6077194213867188 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111271, + "balance_loss_mlp": 1.01661849, + "epoch": 0.06771835321277414, + "flos": 540883308288.0, + "grad_norm": 0.02897454745239974, + "language_loss": 0.98297268, + "learning_rate": 0.000996276028262306, + "loss": 0.99408543, + "num_input_tokens_seen": 28814912, + "router_z_loss_mlp": 0.94580078, + "step": 352, + "time_per_iteration": 2.8440346717834473 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115128, + "balance_loss_mlp": 1.02052331, + "epoch": 0.0679107348980377, + "flos": 461615827968.0, + "grad_norm": 0.03358261828070724, + "language_loss": 1.05270672, + "learning_rate": 0.0009962379794577964, + "loss": 1.06385791, + "num_input_tokens_seen": 28882192, + "router_z_loss_mlp": 0.9453125, + "step": 353, + "time_per_iteration": 2.6153147220611572 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115897, + "balance_loss_mlp": 1.02129257, + "epoch": 0.06810311658330127, + "flos": 637208684544.0, + "grad_norm": 0.03193767698980152, + "language_loss": 0.94629884, + "learning_rate": 0.000996199737996617, + "loss": 0.95745778, + "num_input_tokens_seen": 28968576, + "router_z_loss_mlp": 0.9453125, + "step": 354, + "time_per_iteration": 2.9557363986968994 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01114833, + "balance_loss_mlp": 1.0208956, + "epoch": 0.06829549826856483, + "flos": 465627562752.0, + "grad_norm": 0.034421374529713736, + "language_loss": 1.03816652, + "learning_rate": 0.0009961613038936149, + "loss": 1.04931474, + "num_input_tokens_seen": 29036160, + "router_z_loss_mlp": 0.93847656, + "step": 355, + "time_per_iteration": 2.583648204803467 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112591, + "balance_loss_mlp": 1.01879704, + "epoch": 0.06848787995382839, + "flos": 635897362176.0, + "grad_norm": 0.027271592740405557, + "language_loss": 0.95725697, + "learning_rate": 0.000996122677163711, + "loss": 0.96838284, + "num_input_tokens_seen": 29112048, + "router_z_loss_mlp": 0.93701172, + "step": 356, + "time_per_iteration": 2.7997536659240723 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113642, + "balance_loss_mlp": 1.02022934, + "epoch": 0.06868026163909195, + "flos": 807781773312.0, + "grad_norm": 0.036098266403844226, + "language_loss": 1.02058005, + "learning_rate": 0.000996083857821902, + "loss": 1.03171647, + "num_input_tokens_seen": 29190960, + "router_z_loss_mlp": 0.93310547, + "step": 357, + "time_per_iteration": 3.0117554664611816 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113245, + "balance_loss_mlp": 1.01978505, + "epoch": 0.06887264332435553, + "flos": 440152512768.0, + "grad_norm": 0.03587140172627376, + "language_loss": 1.00045025, + "learning_rate": 0.0009960448458832588, + "loss": 1.01158273, + "num_input_tokens_seen": 29262832, + "router_z_loss_mlp": 0.93359375, + "step": 358, + "time_per_iteration": 2.6948373317718506 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110172, + "balance_loss_mlp": 1.01714087, + "epoch": 0.06906502500961909, + "flos": 485786358528.0, + "grad_norm": 0.028895953236024122, + "language_loss": 0.99980301, + "learning_rate": 0.000996005641362927, + "loss": 1.01090467, + "num_input_tokens_seen": 29329552, + "router_z_loss_mlp": 0.92919922, + "step": 359, + "time_per_iteration": 2.600889205932617 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110333, + "balance_loss_mlp": 1.01715922, + "epoch": 0.06925740669488265, + "flos": 734886212352.0, + "grad_norm": 0.03093408458560108, + "language_loss": 1.02453041, + "learning_rate": 0.0009959662442761274, + "loss": 1.0356338, + "num_input_tokens_seen": 29410784, + "router_z_loss_mlp": 0.93066406, + "step": 360, + "time_per_iteration": 2.9324746131896973 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107676, + "balance_loss_mlp": 1.01445436, + "epoch": 0.0694497883801462, + "flos": 553571745024.0, + "grad_norm": 0.03028505188811882, + "language_loss": 0.95860314, + "learning_rate": 0.000995926654638155, + "loss": 0.96967983, + "num_input_tokens_seen": 29486992, + "router_z_loss_mlp": 0.93115234, + "step": 361, + "time_per_iteration": 2.8280868530273438 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104746, + "balance_loss_mlp": 1.01157248, + "epoch": 0.06964217006540978, + "flos": 679244900352.0, + "grad_norm": 0.03450824772288923, + "language_loss": 0.98644811, + "learning_rate": 0.00099588687246438, + "loss": 0.99749553, + "num_input_tokens_seen": 29557232, + "router_z_loss_mlp": 0.93066406, + "step": 362, + "time_per_iteration": 2.8108932971954346 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108438, + "balance_loss_mlp": 1.01535928, + "epoch": 0.06983455175067334, + "flos": 525261167616.0, + "grad_norm": 0.03621302361184023, + "language_loss": 1.06105995, + "learning_rate": 0.0009958468977702471, + "loss": 1.07214439, + "num_input_tokens_seen": 29625344, + "router_z_loss_mlp": 0.9296875, + "step": 363, + "time_per_iteration": 2.6087372303009033 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135422, + "balance_loss_mlp": 1.04272461, + "epoch": 0.0700269334359369, + "flos": 1580176283136.0, + "grad_norm": 0.03651647631774479, + "language_loss": 0.79734707, + "learning_rate": 0.0009958067305712761, + "loss": 0.80870128, + "num_input_tokens_seen": 29843664, + "router_z_loss_mlp": 0.92578125, + "step": 364, + "time_per_iteration": 4.806072235107422 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104861, + "balance_loss_mlp": 1.01254511, + "epoch": 0.07021931512120046, + "flos": 1014858050304.0, + "grad_norm": 0.04058448706036458, + "language_loss": 0.94071019, + "learning_rate": 0.0009957663708830612, + "loss": 0.9517588, + "num_input_tokens_seen": 29927152, + "router_z_loss_mlp": 0.921875, + "step": 365, + "time_per_iteration": 3.30859637260437 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110656, + "balance_loss_mlp": 1.01862633, + "epoch": 0.07041169680646403, + "flos": 824432367360.0, + "grad_norm": 0.04186203278400794, + "language_loss": 0.98041129, + "learning_rate": 0.0009957258187212714, + "loss": 0.9915179, + "num_input_tokens_seen": 30004928, + "router_z_loss_mlp": 0.91894531, + "step": 366, + "time_per_iteration": 3.00058913230896 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097015, + "balance_loss_mlp": 1.00565338, + "epoch": 0.07060407849172759, + "flos": 1417293250560.0, + "grad_norm": 0.011820269564466843, + "language_loss": 0.79194862, + "learning_rate": 0.0009956850741016502, + "loss": 0.80291873, + "num_input_tokens_seen": 30230256, + "router_z_loss_mlp": 0.91210938, + "step": 367, + "time_per_iteration": 4.794500827789307 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113703, + "balance_loss_mlp": 1.02186394, + "epoch": 0.07079646017699115, + "flos": 513942379008.0, + "grad_norm": 0.041641563183133855, + "language_loss": 0.94691038, + "learning_rate": 0.0009956441370400167, + "loss": 0.95804739, + "num_input_tokens_seen": 30301200, + "router_z_loss_mlp": 0.91699219, + "step": 368, + "time_per_iteration": 2.63948917388916 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0111577, + "balance_loss_mlp": 1.02436066, + "epoch": 0.07098884186225471, + "flos": 541549179648.0, + "grad_norm": 0.03426405251061256, + "language_loss": 1.00885093, + "learning_rate": 0.0009956030075522636, + "loss": 1.02000868, + "num_input_tokens_seen": 30377024, + "router_z_loss_mlp": 0.91259766, + "step": 369, + "time_per_iteration": 2.74157452583313 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107449, + "balance_loss_mlp": 1.01613438, + "epoch": 0.07118122354751828, + "flos": 549739845120.0, + "grad_norm": 0.030296400642036637, + "language_loss": 1.0031743, + "learning_rate": 0.0009955616856543587, + "loss": 1.01424885, + "num_input_tokens_seen": 30448896, + "router_z_loss_mlp": 0.91162109, + "step": 370, + "time_per_iteration": 2.6210479736328125 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105786, + "balance_loss_mlp": 1.01475775, + "epoch": 0.07137360523278184, + "flos": 622077437952.0, + "grad_norm": 0.029509682347833893, + "language_loss": 0.92550498, + "learning_rate": 0.0009955201713623448, + "loss": 0.93656284, + "num_input_tokens_seen": 30523584, + "router_z_loss_mlp": 0.90869141, + "step": 371, + "time_per_iteration": 2.757277011871338 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092491, + "balance_loss_mlp": 1.00284576, + "epoch": 0.0715659869180454, + "flos": 1505976202752.0, + "grad_norm": 0.005566886599578838, + "language_loss": 0.76672721, + "learning_rate": 0.000995478464692339, + "loss": 0.77765214, + "num_input_tokens_seen": 30757920, + "router_z_loss_mlp": 0.89648438, + "step": 372, + "time_per_iteration": 4.947838306427002 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01126764, + "balance_loss_mlp": 1.0361172, + "epoch": 0.07175836860330896, + "flos": 496482050304.0, + "grad_norm": 0.040308561934975694, + "language_loss": 1.05629396, + "learning_rate": 0.0009954365656605333, + "loss": 1.06756163, + "num_input_tokens_seen": 30824960, + "router_z_loss_mlp": 0.90478516, + "step": 373, + "time_per_iteration": 2.5537302494049072 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124141, + "balance_loss_mlp": 1.03416181, + "epoch": 0.07195075028857253, + "flos": 787082505984.0, + "grad_norm": 0.034789914575730614, + "language_loss": 0.98912442, + "learning_rate": 0.0009953944742831947, + "loss": 1.00036585, + "num_input_tokens_seen": 30902224, + "router_z_loss_mlp": 0.89892578, + "step": 374, + "time_per_iteration": 2.976074695587158 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106044, + "balance_loss_mlp": 1.01678061, + "epoch": 0.0721431319738361, + "flos": 594347182848.0, + "grad_norm": 0.029628456658550576, + "language_loss": 1.02558136, + "learning_rate": 0.0009953521905766642, + "loss": 1.03664172, + "num_input_tokens_seen": 30984784, + "router_z_loss_mlp": 0.89404297, + "step": 375, + "time_per_iteration": 2.9556005001068115 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101275, + "balance_loss_mlp": 1.01234496, + "epoch": 0.07233551365909965, + "flos": 549329630976.0, + "grad_norm": 0.034208323574026145, + "language_loss": 1.01073325, + "learning_rate": 0.0009953097145573577, + "loss": 1.02174592, + "num_input_tokens_seen": 31055376, + "router_z_loss_mlp": 0.89111328, + "step": 376, + "time_per_iteration": 2.6449482440948486 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106433, + "balance_loss_mlp": 1.01759815, + "epoch": 0.07252789534436321, + "flos": 959169106176.0, + "grad_norm": 0.031040198427254525, + "language_loss": 0.98588479, + "learning_rate": 0.000995267046241766, + "loss": 0.99694908, + "num_input_tokens_seen": 31144944, + "router_z_loss_mlp": 0.89013672, + "step": 377, + "time_per_iteration": 3.2564361095428467 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106989, + "balance_loss_mlp": 1.01877415, + "epoch": 0.07272027702962677, + "flos": 508656260352.0, + "grad_norm": 0.029229214223645432, + "language_loss": 0.98238575, + "learning_rate": 0.0009952241856464547, + "loss": 0.99345565, + "num_input_tokens_seen": 31213392, + "router_z_loss_mlp": 0.88378906, + "step": 378, + "time_per_iteration": 2.5843191146850586 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108111, + "balance_loss_mlp": 1.02013505, + "epoch": 0.07291265871489035, + "flos": 613552380672.0, + "grad_norm": 0.03194005050639913, + "language_loss": 1.05557346, + "learning_rate": 0.0009951811327880632, + "loss": 1.06665444, + "num_input_tokens_seen": 31289840, + "router_z_loss_mlp": 0.88134766, + "step": 379, + "time_per_iteration": 2.727449655532837 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107323, + "balance_loss_mlp": 1.01934636, + "epoch": 0.0731050404001539, + "flos": 496742565120.0, + "grad_norm": 0.03092115392183015, + "language_loss": 0.98400533, + "learning_rate": 0.0009951378876833063, + "loss": 0.99507862, + "num_input_tokens_seen": 31357600, + "router_z_loss_mlp": 0.88134766, + "step": 380, + "time_per_iteration": 2.5320205688476562 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101258, + "balance_loss_mlp": 1.01332915, + "epoch": 0.07329742208541747, + "flos": 641130991104.0, + "grad_norm": 0.032065094183830696, + "language_loss": 1.04703462, + "learning_rate": 0.0009950944503489736, + "loss": 1.05804706, + "num_input_tokens_seen": 31428896, + "router_z_loss_mlp": 0.88085938, + "step": 381, + "time_per_iteration": 2.7422876358032227 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102134, + "balance_loss_mlp": 1.01453876, + "epoch": 0.07348980377068103, + "flos": 817741607424.0, + "grad_norm": 0.030510114485064205, + "language_loss": 0.99112171, + "learning_rate": 0.0009950508208019285, + "loss": 1.00214303, + "num_input_tokens_seen": 31507424, + "router_z_loss_mlp": 0.87744141, + "step": 382, + "time_per_iteration": 3.046475410461426 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101212, + "balance_loss_mlp": 1.01323569, + "epoch": 0.0736821854559446, + "flos": 509670129408.0, + "grad_norm": 0.035756321159612754, + "language_loss": 1.03789318, + "learning_rate": 0.0009950069990591096, + "loss": 1.04890537, + "num_input_tokens_seen": 31576768, + "router_z_loss_mlp": 0.88134766, + "step": 383, + "time_per_iteration": 2.620088577270508 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113144, + "balance_loss_mlp": 1.02674103, + "epoch": 0.07387456714120816, + "flos": 1558050987264.0, + "grad_norm": 0.043940663043905655, + "language_loss": 0.76401371, + "learning_rate": 0.0009949629851375302, + "loss": 0.77514511, + "num_input_tokens_seen": 31797312, + "router_z_loss_mlp": 0.86523438, + "step": 384, + "time_per_iteration": 4.87653374671936 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121594, + "balance_loss_mlp": 1.03299809, + "epoch": 0.07406694882647172, + "flos": 526644421632.0, + "grad_norm": 0.039102279996233, + "language_loss": 0.96614265, + "learning_rate": 0.0009949187790542777, + "loss": 0.97735858, + "num_input_tokens_seen": 31869568, + "router_z_loss_mlp": 0.88769531, + "step": 385, + "time_per_iteration": 2.734100580215454 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112471, + "balance_loss_mlp": 1.03625691, + "epoch": 0.07425933051173528, + "flos": 498824738304.0, + "grad_norm": 0.03701278047407747, + "language_loss": 0.92462552, + "learning_rate": 0.0009948743808265148, + "loss": 0.93587261, + "num_input_tokens_seen": 31941712, + "router_z_loss_mlp": 0.88623047, + "step": 386, + "time_per_iteration": 2.7154581546783447 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125455, + "balance_loss_mlp": 1.03704965, + "epoch": 0.07445171219699885, + "flos": 506057915136.0, + "grad_norm": 0.06663512882119103, + "language_loss": 1.02268195, + "learning_rate": 0.0009948297904714782, + "loss": 1.0339365, + "num_input_tokens_seen": 32015232, + "router_z_loss_mlp": 0.88574219, + "step": 387, + "time_per_iteration": 2.68532133102417 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112575, + "balance_loss_mlp": 1.03777313, + "epoch": 0.07464409388226241, + "flos": 555117337344.0, + "grad_norm": 0.036483324457394946, + "language_loss": 0.94151849, + "learning_rate": 0.0009947850080064796, + "loss": 0.95277596, + "num_input_tokens_seen": 32094640, + "router_z_loss_mlp": 0.88134766, + "step": 388, + "time_per_iteration": 2.789128303527832 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121204, + "balance_loss_mlp": 1.03370392, + "epoch": 0.07483647556752597, + "flos": 778275546624.0, + "grad_norm": 0.0421926900222792, + "language_loss": 0.99476451, + "learning_rate": 0.0009947400334489047, + "loss": 1.00597644, + "num_input_tokens_seen": 32176640, + "router_z_loss_mlp": 0.87646484, + "step": 389, + "time_per_iteration": 2.9937496185302734 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011085, + "balance_loss_mlp": 1.02133441, + "epoch": 0.07502885725278953, + "flos": 613682638080.0, + "grad_norm": 0.0417493031738284, + "language_loss": 0.90741575, + "learning_rate": 0.0009946948668162145, + "loss": 0.91850078, + "num_input_tokens_seen": 32246704, + "router_z_loss_mlp": 0.87304688, + "step": 390, + "time_per_iteration": 2.7264010906219482 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101473, + "balance_loss_mlp": 1.01502275, + "epoch": 0.0752212389380531, + "flos": 689856021504.0, + "grad_norm": 0.03330838563423677, + "language_loss": 0.95001, + "learning_rate": 0.0009946495081259441, + "loss": 0.9610247, + "num_input_tokens_seen": 32320032, + "router_z_loss_mlp": 0.86572266, + "step": 391, + "time_per_iteration": 2.832472085952759 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097898, + "balance_loss_mlp": 1.01182938, + "epoch": 0.07541362062331666, + "flos": 767052022272.0, + "grad_norm": 0.03859494705227578, + "language_loss": 0.99014449, + "learning_rate": 0.0009946039573957035, + "loss": 1.00112355, + "num_input_tokens_seen": 32398144, + "router_z_loss_mlp": 0.86181641, + "step": 392, + "time_per_iteration": 2.925933361053467 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101692, + "balance_loss_mlp": 1.01576602, + "epoch": 0.07560600230858022, + "flos": 589909682688.0, + "grad_norm": 0.039112379024015986, + "language_loss": 0.95485294, + "learning_rate": 0.000994558214643177, + "loss": 0.9658699, + "num_input_tokens_seen": 32471984, + "router_z_loss_mlp": 0.86035156, + "step": 393, + "time_per_iteration": 2.763448476791382 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095538, + "balance_loss_mlp": 1.00961244, + "epoch": 0.07579838399384378, + "flos": 751146034176.0, + "grad_norm": 0.03818992224284351, + "language_loss": 0.96862066, + "learning_rate": 0.000994512279886123, + "loss": 0.97957599, + "num_input_tokens_seen": 32550176, + "router_z_loss_mlp": 0.86035156, + "step": 394, + "time_per_iteration": 3.143615245819092 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101397, + "balance_loss_mlp": 1.01561391, + "epoch": 0.07599076567910736, + "flos": 524551554816.0, + "grad_norm": 0.030240351127206026, + "language_loss": 0.96659988, + "learning_rate": 0.0009944661531423758, + "loss": 0.97761387, + "num_input_tokens_seen": 32620768, + "router_z_loss_mlp": 0.85888672, + "step": 395, + "time_per_iteration": 2.6748764514923096 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107513, + "balance_loss_mlp": 1.02206361, + "epoch": 0.07618314736437092, + "flos": 552186545664.0, + "grad_norm": 0.03358451790414236, + "language_loss": 0.95614338, + "learning_rate": 0.000994419834429843, + "loss": 0.96721858, + "num_input_tokens_seen": 32693472, + "router_z_loss_mlp": 0.85546875, + "step": 396, + "time_per_iteration": 2.6525089740753174 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105372, + "balance_loss_mlp": 1.01987493, + "epoch": 0.07637552904963447, + "flos": 699433831680.0, + "grad_norm": 0.04315212632526892, + "language_loss": 1.00552011, + "learning_rate": 0.0009943733237665069, + "loss": 1.01657379, + "num_input_tokens_seen": 32764976, + "router_z_loss_mlp": 0.85595703, + "step": 397, + "time_per_iteration": 2.8678157329559326 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097353, + "balance_loss_mlp": 1.01218963, + "epoch": 0.07656791073489803, + "flos": 580636128768.0, + "grad_norm": 0.029538416941692198, + "language_loss": 0.99224108, + "learning_rate": 0.0009943266211704248, + "loss": 1.0032146, + "num_input_tokens_seen": 32853104, + "router_z_loss_mlp": 0.85253906, + "step": 398, + "time_per_iteration": 3.0023248195648193 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099387, + "balance_loss_mlp": 1.01460528, + "epoch": 0.0767602924201616, + "flos": 418037910528.0, + "grad_norm": 0.03167845871290285, + "language_loss": 1.01143491, + "learning_rate": 0.000994279726659728, + "loss": 1.02242875, + "num_input_tokens_seen": 32919376, + "router_z_loss_mlp": 0.84863281, + "step": 399, + "time_per_iteration": 2.527693271636963 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107007, + "balance_loss_mlp": 1.02246368, + "epoch": 0.07695267410542517, + "flos": 483888877824.0, + "grad_norm": 0.03414294034973106, + "language_loss": 0.9968133, + "learning_rate": 0.0009942326402526231, + "loss": 1.00788331, + "num_input_tokens_seen": 32988064, + "router_z_loss_mlp": 0.84619141, + "step": 400, + "time_per_iteration": 2.5610573291778564 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112837, + "balance_loss_mlp": 1.02848434, + "epoch": 0.07714505579068873, + "flos": 532027749888.0, + "grad_norm": 0.030264499227930883, + "language_loss": 0.97403878, + "learning_rate": 0.0009941853619673902, + "loss": 0.98516715, + "num_input_tokens_seen": 33059024, + "router_z_loss_mlp": 0.84423828, + "step": 401, + "time_per_iteration": 2.680175542831421 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107236, + "balance_loss_mlp": 1.02302694, + "epoch": 0.07733743747595229, + "flos": 806440315392.0, + "grad_norm": 0.03979329481069023, + "language_loss": 1.01160502, + "learning_rate": 0.0009941378918223844, + "loss": 1.02267742, + "num_input_tokens_seen": 33137712, + "router_z_loss_mlp": 0.84277344, + "step": 402, + "time_per_iteration": 3.0908427238464355 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098686, + "balance_loss_mlp": 1.01447606, + "epoch": 0.07752981916121585, + "flos": 623614281984.0, + "grad_norm": 0.03310929598543939, + "language_loss": 0.93567806, + "learning_rate": 0.0009940902298360354, + "loss": 0.94666493, + "num_input_tokens_seen": 33211296, + "router_z_loss_mlp": 0.84277344, + "step": 403, + "time_per_iteration": 2.7569308280944824 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094976, + "balance_loss_mlp": 1.01048076, + "epoch": 0.07772220084647942, + "flos": 729543713280.0, + "grad_norm": 0.03955766616265138, + "language_loss": 1.03173304, + "learning_rate": 0.0009940423760268473, + "loss": 1.04268289, + "num_input_tokens_seen": 33283632, + "router_z_loss_mlp": 0.84570312, + "step": 404, + "time_per_iteration": 2.8456103801727295 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098252, + "balance_loss_mlp": 1.01375628, + "epoch": 0.07791458253174298, + "flos": 556469488896.0, + "grad_norm": 0.042207617679060144, + "language_loss": 0.96929657, + "learning_rate": 0.0009939943304133982, + "loss": 0.98027909, + "num_input_tokens_seen": 33350704, + "router_z_loss_mlp": 0.84570312, + "step": 405, + "time_per_iteration": 2.615145444869995 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104796, + "balance_loss_mlp": 1.02044404, + "epoch": 0.07810696421700654, + "flos": 554235671040.0, + "grad_norm": 0.04104566792755741, + "language_loss": 1.03659868, + "learning_rate": 0.0009939460930143416, + "loss": 1.04764676, + "num_input_tokens_seen": 33416272, + "router_z_loss_mlp": 0.84423828, + "step": 406, + "time_per_iteration": 2.6304614543914795 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110157, + "balance_loss_mlp": 1.01745594, + "epoch": 0.0782993459022701, + "flos": 651879172608.0, + "grad_norm": 0.0317151282671847, + "language_loss": 0.97752666, + "learning_rate": 0.0009938976638484043, + "loss": 0.98854232, + "num_input_tokens_seen": 33501824, + "router_z_loss_mlp": 0.84179688, + "step": 407, + "time_per_iteration": 2.9032115936279297 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109564, + "balance_loss_mlp": 1.01205039, + "epoch": 0.07849172758753367, + "flos": 497161527552.0, + "grad_norm": 0.04013855375776475, + "language_loss": 0.97246277, + "learning_rate": 0.0009938490429343887, + "loss": 0.98341918, + "num_input_tokens_seen": 33571456, + "router_z_loss_mlp": 0.83642578, + "step": 408, + "time_per_iteration": 2.5688796043395996 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095618, + "balance_loss_mlp": 1.01236188, + "epoch": 0.07868410927279723, + "flos": 579076930560.0, + "grad_norm": 0.0397915036848884, + "language_loss": 0.97571141, + "learning_rate": 0.0009938002302911709, + "loss": 0.98666751, + "num_input_tokens_seen": 33646320, + "router_z_loss_mlp": 0.83300781, + "step": 409, + "time_per_iteration": 2.75036883354187 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096533, + "balance_loss_mlp": 1.01365864, + "epoch": 0.07887649095806079, + "flos": 524067463680.0, + "grad_norm": 0.03678821175613874, + "language_loss": 1.00230122, + "learning_rate": 0.0009937512259377015, + "loss": 1.01326644, + "num_input_tokens_seen": 33717664, + "router_z_loss_mlp": 0.82910156, + "step": 410, + "time_per_iteration": 2.6584975719451904 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110197, + "balance_loss_mlp": 1.01938236, + "epoch": 0.07906887264332435, + "flos": 558438901248.0, + "grad_norm": 0.04956969404692801, + "language_loss": 0.989124, + "learning_rate": 0.000993702029893006, + "loss": 1.00014377, + "num_input_tokens_seen": 33794720, + "router_z_loss_mlp": 0.82617188, + "step": 411, + "time_per_iteration": 2.7666263580322266 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102247, + "balance_loss_mlp": 1.0196116, + "epoch": 0.07926125432858792, + "flos": 823364063232.0, + "grad_norm": 0.03322797228086769, + "language_loss": 0.99091381, + "learning_rate": 0.0009936526421761838, + "loss": 1.00193632, + "num_input_tokens_seen": 33868304, + "router_z_loss_mlp": 0.82666016, + "step": 412, + "time_per_iteration": 3.0222113132476807 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102099, + "balance_loss_mlp": 1.01955855, + "epoch": 0.07945363601385148, + "flos": 563394518784.0, + "grad_norm": 0.04210923401756456, + "language_loss": 1.01423764, + "learning_rate": 0.000993603062806409, + "loss": 1.02525866, + "num_input_tokens_seen": 33937424, + "router_z_loss_mlp": 0.82568359, + "step": 413, + "time_per_iteration": 2.713226079940796 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100317, + "balance_loss_mlp": 1.0176332, + "epoch": 0.07964601769911504, + "flos": 518885357568.0, + "grad_norm": 0.041362228888401006, + "language_loss": 1.04903626, + "learning_rate": 0.0009935532918029298, + "loss": 1.06003952, + "num_input_tokens_seen": 34003984, + "router_z_loss_mlp": 0.82714844, + "step": 414, + "time_per_iteration": 2.59602689743042 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095709, + "balance_loss_mlp": 1.01326394, + "epoch": 0.0798383993843786, + "flos": 540301040640.0, + "grad_norm": 0.030384950019726516, + "language_loss": 0.97377884, + "learning_rate": 0.0009935033291850694, + "loss": 0.98473597, + "num_input_tokens_seen": 34072400, + "router_z_loss_mlp": 0.82470703, + "step": 415, + "time_per_iteration": 2.6417808532714844 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094851, + "balance_loss_mlp": 1.013026, + "epoch": 0.08003078106964218, + "flos": 486122695680.0, + "grad_norm": 0.03579523867672845, + "language_loss": 1.00004411, + "learning_rate": 0.0009934531749722247, + "loss": 1.01099253, + "num_input_tokens_seen": 34142448, + "router_z_loss_mlp": 0.81835938, + "step": 416, + "time_per_iteration": 2.593029737472534 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095566, + "balance_loss_mlp": 1.01383638, + "epoch": 0.08022316275490574, + "flos": 519276129792.0, + "grad_norm": 0.0354518245662521, + "language_loss": 0.98370755, + "learning_rate": 0.0009934028291838672, + "loss": 0.99466318, + "num_input_tokens_seen": 34214080, + "router_z_loss_mlp": 0.81738281, + "step": 417, + "time_per_iteration": 2.7351250648498535 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096643, + "balance_loss_mlp": 1.01496112, + "epoch": 0.0804155444401693, + "flos": 495047273472.0, + "grad_norm": 0.032920982329526526, + "language_loss": 0.93668723, + "learning_rate": 0.0009933522918395433, + "loss": 0.94765365, + "num_input_tokens_seen": 34288448, + "router_z_loss_mlp": 0.81689453, + "step": 418, + "time_per_iteration": 2.6427221298217773 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01114799, + "balance_loss_mlp": 1.03316498, + "epoch": 0.08060792612543285, + "flos": 1584856801536.0, + "grad_norm": 0.029973653623271358, + "language_loss": 0.782511, + "learning_rate": 0.0009933015629588731, + "loss": 0.79365897, + "num_input_tokens_seen": 34521632, + "router_z_loss_mlp": 0.81640625, + "step": 419, + "time_per_iteration": 4.8632917404174805 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096521, + "balance_loss_mlp": 1.01569724, + "epoch": 0.08080030781069643, + "flos": 526359607296.0, + "grad_norm": 0.04163447523548115, + "language_loss": 1.12134457, + "learning_rate": 0.000993250642561551, + "loss": 1.13230991, + "num_input_tokens_seen": 34590080, + "router_z_loss_mlp": 0.80810547, + "step": 420, + "time_per_iteration": 2.608396053314209 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109578, + "balance_loss_mlp": 1.01505113, + "epoch": 0.08099268949595999, + "flos": 547757793792.0, + "grad_norm": 0.04746808509414602, + "language_loss": 0.97398257, + "learning_rate": 0.0009931995306673466, + "loss": 0.98494035, + "num_input_tokens_seen": 34660512, + "router_z_loss_mlp": 0.80712891, + "step": 421, + "time_per_iteration": 2.7215850353240967 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097341, + "balance_loss_mlp": 1.01670778, + "epoch": 0.08118507118122355, + "flos": 511374169344.0, + "grad_norm": 0.04020038552675014, + "language_loss": 1.02514148, + "learning_rate": 0.000993148227296103, + "loss": 1.03611493, + "num_input_tokens_seen": 34732016, + "router_z_loss_mlp": 0.80615234, + "step": 422, + "time_per_iteration": 2.625366449356079 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010968, + "balance_loss_mlp": 1.01607168, + "epoch": 0.08137745286648711, + "flos": 722002389504.0, + "grad_norm": 0.03556088777041087, + "language_loss": 0.90137196, + "learning_rate": 0.000993096732467738, + "loss": 0.91233999, + "num_input_tokens_seen": 34810416, + "router_z_loss_mlp": 0.80712891, + "step": 423, + "time_per_iteration": 2.9795689582824707 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092343, + "balance_loss_mlp": 1.0118531, + "epoch": 0.08156983455175067, + "flos": 680818682880.0, + "grad_norm": 0.04422604915428747, + "language_loss": 0.99073571, + "learning_rate": 0.0009930450462022435, + "loss": 1.00165915, + "num_input_tokens_seen": 34879504, + "router_z_loss_mlp": 0.8046875, + "step": 424, + "time_per_iteration": 2.879889726638794 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087181, + "balance_loss_mlp": 1.00783539, + "epoch": 0.08176221623701424, + "flos": 1456591137024.0, + "grad_norm": 0.006453860192715822, + "language_loss": 0.79189807, + "learning_rate": 0.0009929931685196862, + "loss": 0.8027699, + "num_input_tokens_seen": 35111584, + "router_z_loss_mlp": 0.79296875, + "step": 425, + "time_per_iteration": 4.908784627914429 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095957, + "balance_loss_mlp": 1.01541877, + "epoch": 0.0819545979222778, + "flos": 1558885044480.0, + "grad_norm": 0.04271462185638088, + "language_loss": 0.96659774, + "learning_rate": 0.0009929410994402065, + "loss": 0.9775573, + "num_input_tokens_seen": 35205664, + "router_z_loss_mlp": 0.80517578, + "step": 426, + "time_per_iteration": 3.7266876697540283 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100573, + "balance_loss_mlp": 1.02013052, + "epoch": 0.08214697960754136, + "flos": 513801427968.0, + "grad_norm": 0.040597463537132866, + "language_loss": 1.00489211, + "learning_rate": 0.0009928888389840196, + "loss": 1.01589799, + "num_input_tokens_seen": 35280144, + "router_z_loss_mlp": 0.80419922, + "step": 427, + "time_per_iteration": 2.695010185241699 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098577, + "balance_loss_mlp": 1.01822996, + "epoch": 0.08233936129280492, + "flos": 596222309376.0, + "grad_norm": 0.03622779747664415, + "language_loss": 1.02622843, + "learning_rate": 0.0009928363871714147, + "loss": 1.03721428, + "num_input_tokens_seen": 35344768, + "router_z_loss_mlp": 0.80322266, + "step": 428, + "time_per_iteration": 2.66733455657959 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097324, + "balance_loss_mlp": 1.01721525, + "epoch": 0.08253174297806849, + "flos": 573165769728.0, + "grad_norm": 0.028981657602537042, + "language_loss": 0.97141832, + "learning_rate": 0.0009927837440227556, + "loss": 0.98239154, + "num_input_tokens_seen": 35425536, + "router_z_loss_mlp": 0.80078125, + "step": 429, + "time_per_iteration": 2.8499114513397217 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093938, + "balance_loss_mlp": 1.01392436, + "epoch": 0.08272412466333205, + "flos": 624643702272.0, + "grad_norm": 0.031878488957356683, + "language_loss": 0.91184896, + "learning_rate": 0.0009927309095584798, + "loss": 0.92278832, + "num_input_tokens_seen": 35515440, + "router_z_loss_mlp": 0.79980469, + "step": 430, + "time_per_iteration": 3.020768165588379 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097624, + "balance_loss_mlp": 1.01756275, + "epoch": 0.08291650634859561, + "flos": 514995131904.0, + "grad_norm": 0.040558959270141796, + "language_loss": 1.03523278, + "learning_rate": 0.0009926778837991, + "loss": 1.0462091, + "num_input_tokens_seen": 35580192, + "router_z_loss_mlp": 0.80029297, + "step": 431, + "time_per_iteration": 2.609189033508301 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101176, + "balance_loss_mlp": 1.02125835, + "epoch": 0.08310888803385917, + "flos": 668542405632.0, + "grad_norm": 0.035092839201242565, + "language_loss": 1.01323938, + "learning_rate": 0.000992624666765202, + "loss": 1.0242511, + "num_input_tokens_seen": 35649472, + "router_z_loss_mlp": 0.79882812, + "step": 432, + "time_per_iteration": 2.817399501800537 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101699, + "balance_loss_mlp": 1.02154219, + "epoch": 0.08330126971912274, + "flos": 584491361280.0, + "grad_norm": 0.0354530922421884, + "language_loss": 0.98992586, + "learning_rate": 0.000992571258477447, + "loss": 1.00094295, + "num_input_tokens_seen": 35722848, + "router_z_loss_mlp": 0.80126953, + "step": 433, + "time_per_iteration": 2.777506113052368 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010961, + "balance_loss_mlp": 1.0161345, + "epoch": 0.0834936514043863, + "flos": 562498268160.0, + "grad_norm": 0.03167346665720251, + "language_loss": 0.92772877, + "learning_rate": 0.0009925176589565695, + "loss": 0.93868983, + "num_input_tokens_seen": 35800944, + "router_z_loss_mlp": 0.79931641, + "step": 434, + "time_per_iteration": 2.801501512527466 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093857, + "balance_loss_mlp": 1.01398647, + "epoch": 0.08368603308964986, + "flos": 495513868032.0, + "grad_norm": 0.03411426988917409, + "language_loss": 1.03318536, + "learning_rate": 0.0009924638682233791, + "loss": 1.04412401, + "num_input_tokens_seen": 35866288, + "router_z_loss_mlp": 0.79833984, + "step": 435, + "time_per_iteration": 2.573282241821289 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092567, + "balance_loss_mlp": 1.01512909, + "epoch": 0.08387841477491342, + "flos": 1391811397632.0, + "grad_norm": 0.030642245427906535, + "language_loss": 0.79564589, + "learning_rate": 0.0009924098862987589, + "loss": 0.8065716, + "num_input_tokens_seen": 36083040, + "router_z_loss_mlp": 0.7734375, + "step": 436, + "time_per_iteration": 4.596274375915527 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099407, + "balance_loss_mlp": 1.02006125, + "epoch": 0.084070796460177, + "flos": 800355155712.0, + "grad_norm": 0.040681894877429646, + "language_loss": 0.92768085, + "learning_rate": 0.0009923557132036668, + "loss": 0.93867493, + "num_input_tokens_seen": 36158816, + "router_z_loss_mlp": 0.79296875, + "step": 437, + "time_per_iteration": 3.0366878509521484 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110232, + "balance_loss_mlp": 1.02364242, + "epoch": 0.08426317814544056, + "flos": 560097254400.0, + "grad_norm": 0.034275916488964116, + "language_loss": 0.96774155, + "learning_rate": 0.0009923013489591345, + "loss": 0.97876477, + "num_input_tokens_seen": 36236432, + "router_z_loss_mlp": 0.78613281, + "step": 438, + "time_per_iteration": 2.8060851097106934 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100668, + "balance_loss_mlp": 1.0219903, + "epoch": 0.08445555983070412, + "flos": 811884881664.0, + "grad_norm": 0.035250716051411925, + "language_loss": 0.95655745, + "learning_rate": 0.0009922467935862681, + "loss": 0.96756417, + "num_input_tokens_seen": 36327952, + "router_z_loss_mlp": 0.78613281, + "step": 439, + "time_per_iteration": 3.116757869720459 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098598, + "balance_loss_mlp": 1.0204916, + "epoch": 0.08464794151596768, + "flos": 511170034944.0, + "grad_norm": 0.03561138790794706, + "language_loss": 0.98418635, + "learning_rate": 0.0009921920471062478, + "loss": 0.99517238, + "num_input_tokens_seen": 36394896, + "router_z_loss_mlp": 0.78027344, + "step": 440, + "time_per_iteration": 2.6008944511413574 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093389, + "balance_loss_mlp": 1.01561701, + "epoch": 0.08484032320123125, + "flos": 557474609664.0, + "grad_norm": 0.02914226137027636, + "language_loss": 0.96590662, + "learning_rate": 0.0009921371095403281, + "loss": 0.97684056, + "num_input_tokens_seen": 36464656, + "router_z_loss_mlp": 0.77685547, + "step": 441, + "time_per_iteration": 2.638679265975952 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094838, + "balance_loss_mlp": 1.01697087, + "epoch": 0.08503270488649481, + "flos": 528361100544.0, + "grad_norm": 0.02987504029564206, + "language_loss": 0.99685514, + "learning_rate": 0.0009920819809098379, + "loss": 1.00780344, + "num_input_tokens_seen": 36532208, + "router_z_loss_mlp": 0.77783203, + "step": 442, + "time_per_iteration": 2.5915398597717285 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089542, + "balance_loss_mlp": 1.01172209, + "epoch": 0.08522508657175837, + "flos": 615386678016.0, + "grad_norm": 0.03983619354546574, + "language_loss": 0.95535469, + "learning_rate": 0.0009920266612361798, + "loss": 0.96625006, + "num_input_tokens_seen": 36607360, + "router_z_loss_mlp": 0.77734375, + "step": 443, + "time_per_iteration": 2.724025249481201 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091681, + "balance_loss_mlp": 1.01371801, + "epoch": 0.08541746825702193, + "flos": 620987746560.0, + "grad_norm": 0.032808156584867194, + "language_loss": 0.9504559, + "learning_rate": 0.0009919711505408308, + "loss": 0.96137273, + "num_input_tokens_seen": 36680688, + "router_z_loss_mlp": 0.77880859, + "step": 444, + "time_per_iteration": 2.780973434448242 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087177, + "balance_loss_mlp": 1.00926137, + "epoch": 0.08560984994228549, + "flos": 483888877824.0, + "grad_norm": 0.03232110076143325, + "language_loss": 0.92813373, + "learning_rate": 0.000991915448845342, + "loss": 0.93900549, + "num_input_tokens_seen": 36746288, + "router_z_loss_mlp": 0.77832031, + "step": 445, + "time_per_iteration": 2.6011459827423096 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090069, + "balance_loss_mlp": 1.01243973, + "epoch": 0.08580223162754906, + "flos": 518177690112.0, + "grad_norm": 0.03377956208163177, + "language_loss": 1.02285504, + "learning_rate": 0.000991859556171339, + "loss": 1.03375578, + "num_input_tokens_seen": 36812528, + "router_z_loss_mlp": 0.77539062, + "step": 446, + "time_per_iteration": 2.606220006942749 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088539, + "balance_loss_mlp": 1.01086187, + "epoch": 0.08599461331281262, + "flos": 532520589312.0, + "grad_norm": 0.037753212584348855, + "language_loss": 1.04541254, + "learning_rate": 0.000991803472540521, + "loss": 1.0562979, + "num_input_tokens_seen": 36879248, + "router_z_loss_mlp": 0.77587891, + "step": 447, + "time_per_iteration": 2.625401735305786 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088712, + "balance_loss_mlp": 1.01113105, + "epoch": 0.08618699499807618, + "flos": 791634712320.0, + "grad_norm": 0.030920782852134367, + "language_loss": 0.98781657, + "learning_rate": 0.0009917471979746615, + "loss": 0.99870372, + "num_input_tokens_seen": 36951376, + "router_z_loss_mlp": 0.77490234, + "step": 448, + "time_per_iteration": 3.0066978931427 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089961, + "balance_loss_mlp": 1.01195049, + "epoch": 0.08637937668333974, + "flos": 567115603200.0, + "grad_norm": 0.03238149886931097, + "language_loss": 0.98317528, + "learning_rate": 0.0009916907324956086, + "loss": 0.99407488, + "num_input_tokens_seen": 37025936, + "router_z_loss_mlp": 0.77929688, + "step": 449, + "time_per_iteration": 2.7561135292053223 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091057, + "balance_loss_mlp": 1.01333201, + "epoch": 0.08657175836860331, + "flos": 446118108672.0, + "grad_norm": 0.029046506526173844, + "language_loss": 0.94927382, + "learning_rate": 0.0009916340761252837, + "loss": 0.96018445, + "num_input_tokens_seen": 37095872, + "router_z_loss_mlp": 0.77636719, + "step": 450, + "time_per_iteration": 2.6452889442443848 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089525, + "balance_loss_mlp": 1.01222932, + "epoch": 0.08676414005386687, + "flos": 845589480960.0, + "grad_norm": 0.032144406787761336, + "language_loss": 0.91630232, + "learning_rate": 0.0009915772288856832, + "loss": 0.92719758, + "num_input_tokens_seen": 37179072, + "router_z_loss_mlp": 0.77197266, + "step": 451, + "time_per_iteration": 3.0991322994232178 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108797, + "balance_loss_mlp": 1.01086605, + "epoch": 0.08695652173913043, + "flos": 604484906496.0, + "grad_norm": 0.025568476728402203, + "language_loss": 0.93134868, + "learning_rate": 0.000991520190798877, + "loss": 0.94222844, + "num_input_tokens_seen": 37260288, + "router_z_loss_mlp": 0.77001953, + "step": 452, + "time_per_iteration": 2.833534002304077 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093662, + "balance_loss_mlp": 1.01660514, + "epoch": 0.08714890342439399, + "flos": 732001107456.0, + "grad_norm": 0.03795734255344977, + "language_loss": 1.02428043, + "learning_rate": 0.0009914629618870089, + "loss": 1.03521705, + "num_input_tokens_seen": 37331136, + "router_z_loss_mlp": 0.76953125, + "step": 453, + "time_per_iteration": 2.9043643474578857 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098724, + "balance_loss_mlp": 1.02319336, + "epoch": 0.08734128510965757, + "flos": 1485456770304.0, + "grad_norm": 0.019964198948139205, + "language_loss": 0.78675872, + "learning_rate": 0.0009914055421722976, + "loss": 0.79774594, + "num_input_tokens_seen": 37559040, + "router_z_loss_mlp": 0.75390625, + "step": 454, + "time_per_iteration": 2.093019723892212 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087936, + "balance_loss_mlp": 1.01278687, + "epoch": 0.08753366679492113, + "flos": 1526269146624.0, + "grad_norm": 0.012226751630218, + "language_loss": 0.81427962, + "learning_rate": 0.0009913479316770353, + "loss": 0.82515901, + "num_input_tokens_seen": 37785136, + "router_z_loss_mlp": 0.75, + "step": 455, + "time_per_iteration": 4.905871391296387 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091043, + "balance_loss_mlp": 1.01379561, + "epoch": 0.08772604848018468, + "flos": 722525364480.0, + "grad_norm": 0.044152825797527884, + "language_loss": 0.95217329, + "learning_rate": 0.0009912901304235883, + "loss": 0.96308374, + "num_input_tokens_seen": 37858832, + "router_z_loss_mlp": 0.77148438, + "step": 456, + "time_per_iteration": 2.850330352783203 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090876, + "balance_loss_mlp": 1.01396191, + "epoch": 0.08791843016544824, + "flos": 709467542784.0, + "grad_norm": 0.038854584599924205, + "language_loss": 0.92178857, + "learning_rate": 0.000991232138434397, + "loss": 0.9326973, + "num_input_tokens_seen": 37931856, + "router_z_loss_mlp": 0.76806641, + "step": 457, + "time_per_iteration": 2.868957757949829 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091399, + "balance_loss_mlp": 1.01477098, + "epoch": 0.08811081185071182, + "flos": 474022362624.0, + "grad_norm": 0.04035146689108268, + "language_loss": 0.99321103, + "learning_rate": 0.000991173955731976, + "loss": 1.00412512, + "num_input_tokens_seen": 38002432, + "router_z_loss_mlp": 0.76513672, + "step": 458, + "time_per_iteration": 2.6747970581054688 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089272, + "balance_loss_mlp": 1.01288271, + "epoch": 0.08830319353597538, + "flos": 686315738880.0, + "grad_norm": 0.033089720334054364, + "language_loss": 1.03213239, + "learning_rate": 0.0009911155823389137, + "loss": 1.04302514, + "num_input_tokens_seen": 38081648, + "router_z_loss_mlp": 0.76269531, + "step": 459, + "time_per_iteration": 2.9462268352508545 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085442, + "balance_loss_mlp": 1.00881398, + "epoch": 0.08849557522123894, + "flos": 574609294848.0, + "grad_norm": 0.035557366742091014, + "language_loss": 0.99025905, + "learning_rate": 0.000991057018277873, + "loss": 1.00111353, + "num_input_tokens_seen": 38153424, + "router_z_loss_mlp": 0.76513672, + "step": 460, + "time_per_iteration": 2.6903369426727295 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086551, + "balance_loss_mlp": 1.00968456, + "epoch": 0.0886879569065025, + "flos": 565628336640.0, + "grad_norm": 0.039664118418905284, + "language_loss": 1.00002789, + "learning_rate": 0.0009909982635715898, + "loss": 1.01089334, + "num_input_tokens_seen": 38223008, + "router_z_loss_mlp": 0.76757812, + "step": 461, + "time_per_iteration": 2.620046615600586 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010895, + "balance_loss_mlp": 1.0128243, + "epoch": 0.08888033859176607, + "flos": 564957607680.0, + "grad_norm": 0.03231802322071402, + "language_loss": 0.98670942, + "learning_rate": 0.0009909393182428751, + "loss": 0.99760437, + "num_input_tokens_seen": 38294592, + "router_z_loss_mlp": 0.765625, + "step": 462, + "time_per_iteration": 2.6466307640075684 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090991, + "balance_loss_mlp": 1.01412475, + "epoch": 0.08907272027702963, + "flos": 466743499008.0, + "grad_norm": 0.03344290639259395, + "language_loss": 0.93214953, + "learning_rate": 0.000990880182314614, + "loss": 0.94305944, + "num_input_tokens_seen": 38365792, + "router_z_loss_mlp": 0.76757812, + "step": 463, + "time_per_iteration": 2.6666839122772217 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086555, + "balance_loss_mlp": 1.0100224, + "epoch": 0.08926510196229319, + "flos": 682844475648.0, + "grad_norm": 0.03261982194681884, + "language_loss": 0.93093467, + "learning_rate": 0.0009908208558097643, + "loss": 0.94180012, + "num_input_tokens_seen": 38447776, + "router_z_loss_mlp": 0.76416016, + "step": 464, + "time_per_iteration": 2.9068925380706787 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089482, + "balance_loss_mlp": 1.01323605, + "epoch": 0.08945748364755675, + "flos": 597822336768.0, + "grad_norm": 0.03309433671244878, + "language_loss": 0.95414662, + "learning_rate": 0.000990761338751359, + "loss": 0.9650414, + "num_input_tokens_seen": 38521632, + "router_z_loss_mlp": 0.76123047, + "step": 465, + "time_per_iteration": 2.774606227874756 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079613, + "balance_loss_mlp": 1.00732422, + "epoch": 0.08964986533282032, + "flos": 1589343879168.0, + "grad_norm": 0.03434681355524106, + "language_loss": 0.73659623, + "learning_rate": 0.0009907016311625045, + "loss": 0.74739242, + "num_input_tokens_seen": 38760528, + "router_z_loss_mlp": 0.72460938, + "step": 466, + "time_per_iteration": 4.996358394622803 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092523, + "balance_loss_mlp": 1.01646745, + "epoch": 0.08984224701808388, + "flos": 534550272768.0, + "grad_norm": 0.03379784984504044, + "language_loss": 0.98391378, + "learning_rate": 0.0009906417330663815, + "loss": 0.99483901, + "num_input_tokens_seen": 38827200, + "router_z_loss_mlp": 0.75927734, + "step": 467, + "time_per_iteration": 2.6774964332580566 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092653, + "balance_loss_mlp": 1.01678836, + "epoch": 0.09003462870334744, + "flos": 479850898176.0, + "grad_norm": 0.04271038491910547, + "language_loss": 0.94838965, + "learning_rate": 0.0009905816444862442, + "loss": 0.95931625, + "num_input_tokens_seen": 38891984, + "router_z_loss_mlp": 0.75732422, + "step": 468, + "time_per_iteration": 2.6558451652526855 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092919, + "balance_loss_mlp": 1.01691103, + "epoch": 0.090227010388611, + "flos": 654903283200.0, + "grad_norm": 0.031716132767048565, + "language_loss": 0.92225289, + "learning_rate": 0.0009905213654454216, + "loss": 0.933182, + "num_input_tokens_seen": 38977136, + "router_z_loss_mlp": 0.75878906, + "step": 469, + "time_per_iteration": 2.9322757720947266 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093686, + "balance_loss_mlp": 1.01796389, + "epoch": 0.09041939207387456, + "flos": 619359528960.0, + "grad_norm": 0.03474651138537023, + "language_loss": 1.00819349, + "learning_rate": 0.0009904608959673158, + "loss": 1.01913023, + "num_input_tokens_seen": 39052224, + "router_z_loss_mlp": 0.75585938, + "step": 470, + "time_per_iteration": 2.7938003540039062 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091787, + "balance_loss_mlp": 1.01620793, + "epoch": 0.09061177375913813, + "flos": 455296398336.0, + "grad_norm": 0.04023106246537731, + "language_loss": 1.00852847, + "learning_rate": 0.000990400236075403, + "loss": 1.01944637, + "num_input_tokens_seen": 39116832, + "router_z_loss_mlp": 0.75439453, + "step": 471, + "time_per_iteration": 2.5231049060821533 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085542, + "balance_loss_mlp": 1.01024961, + "epoch": 0.0908041554444017, + "flos": 545309147904.0, + "grad_norm": 0.036372029021066864, + "language_loss": 0.97571105, + "learning_rate": 0.0009903393857932338, + "loss": 0.98656648, + "num_input_tokens_seen": 39190528, + "router_z_loss_mlp": 0.75146484, + "step": 472, + "time_per_iteration": 2.700449228286743 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082684, + "balance_loss_mlp": 1.00786841, + "epoch": 0.09099653712966525, + "flos": 565467943680.0, + "grad_norm": 0.03263919317425628, + "language_loss": 0.95124531, + "learning_rate": 0.0009902783451444317, + "loss": 0.96207213, + "num_input_tokens_seen": 39263168, + "router_z_loss_mlp": 0.74658203, + "step": 473, + "time_per_iteration": 2.7006537914276123 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081251, + "balance_loss_mlp": 1.00667381, + "epoch": 0.09118891881492881, + "flos": 475502826240.0, + "grad_norm": 0.036465550100162274, + "language_loss": 0.98778975, + "learning_rate": 0.0009902171141526956, + "loss": 0.99860233, + "num_input_tokens_seen": 39330784, + "router_z_loss_mlp": 0.74414062, + "step": 474, + "time_per_iteration": 2.565852403640747 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081522, + "balance_loss_mlp": 1.00732613, + "epoch": 0.09138130050019239, + "flos": 546991800576.0, + "grad_norm": 0.03189281102051162, + "language_loss": 0.86324012, + "learning_rate": 0.000990155692841797, + "loss": 0.87405533, + "num_input_tokens_seen": 39417472, + "router_z_loss_mlp": 0.74023438, + "step": 475, + "time_per_iteration": 2.9694621562957764 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081909, + "balance_loss_mlp": 1.0079515, + "epoch": 0.09157368218545595, + "flos": 733974410496.0, + "grad_norm": 0.03574286330183218, + "language_loss": 0.98287529, + "learning_rate": 0.0009900940812355818, + "loss": 0.99369442, + "num_input_tokens_seen": 39488656, + "router_z_loss_mlp": 0.73779297, + "step": 476, + "time_per_iteration": 2.8549702167510986 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082166, + "balance_loss_mlp": 1.00835192, + "epoch": 0.0917660638707195, + "flos": 612073862400.0, + "grad_norm": 0.03800316101532587, + "language_loss": 0.95275486, + "learning_rate": 0.00099003227935797, + "loss": 0.96357656, + "num_input_tokens_seen": 39558224, + "router_z_loss_mlp": 0.73632812, + "step": 477, + "time_per_iteration": 2.709808349609375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084092, + "balance_loss_mlp": 1.01051593, + "epoch": 0.09195844555598306, + "flos": 657019482624.0, + "grad_norm": 0.03875864993538346, + "language_loss": 0.99037415, + "learning_rate": 0.000989970287232955, + "loss": 1.0012151, + "num_input_tokens_seen": 39629856, + "router_z_loss_mlp": 0.73486328, + "step": 478, + "time_per_iteration": 2.7670538425445557 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085912, + "balance_loss_mlp": 1.01252699, + "epoch": 0.09215082724124664, + "flos": 477541257984.0, + "grad_norm": 0.03367109557456403, + "language_loss": 0.95731258, + "learning_rate": 0.0009899081048846043, + "loss": 0.96817166, + "num_input_tokens_seen": 39695984, + "router_z_loss_mlp": 0.73339844, + "step": 479, + "time_per_iteration": 2.588352918624878 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085141, + "balance_loss_mlp": 1.01208997, + "epoch": 0.0923432089265102, + "flos": 525326296320.0, + "grad_norm": 0.0462740033589213, + "language_loss": 1.00606585, + "learning_rate": 0.0009898457323370593, + "loss": 1.01691723, + "num_input_tokens_seen": 39760256, + "router_z_loss_mlp": 0.73046875, + "step": 480, + "time_per_iteration": 2.5808160305023193 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082132, + "balance_loss_mlp": 1.00936687, + "epoch": 0.09253559061177376, + "flos": 546639912192.0, + "grad_norm": 0.03676160983227949, + "language_loss": 0.9798522, + "learning_rate": 0.000989783169614535, + "loss": 0.99067354, + "num_input_tokens_seen": 39827984, + "router_z_loss_mlp": 0.72900391, + "step": 481, + "time_per_iteration": 2.624483108520508 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097145, + "balance_loss_mlp": 1.02485657, + "epoch": 0.09272797229703732, + "flos": 1541337209856.0, + "grad_norm": 0.023489610904585654, + "language_loss": 0.78752756, + "learning_rate": 0.0009897204167413206, + "loss": 0.79849905, + "num_input_tokens_seen": 40056688, + "router_z_loss_mlp": 0.72460938, + "step": 482, + "time_per_iteration": 4.897305965423584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085543, + "balance_loss_mlp": 1.01330173, + "epoch": 0.09292035398230089, + "flos": 691065276672.0, + "grad_norm": 0.04252493421314706, + "language_loss": 0.95552129, + "learning_rate": 0.000989657473741779, + "loss": 0.96637678, + "num_input_tokens_seen": 40133120, + "router_z_loss_mlp": 0.72412109, + "step": 483, + "time_per_iteration": 2.8165738582611084 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084133, + "balance_loss_mlp": 1.01184416, + "epoch": 0.09311273566756445, + "flos": 510823004160.0, + "grad_norm": 0.03895509426778844, + "language_loss": 0.97422099, + "learning_rate": 0.0009895943406403465, + "loss": 0.98506236, + "num_input_tokens_seen": 40206464, + "router_z_loss_mlp": 0.72460938, + "step": 484, + "time_per_iteration": 2.7523326873779297 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086134, + "balance_loss_mlp": 1.01384509, + "epoch": 0.09330511735282801, + "flos": 660584064768.0, + "grad_norm": 0.04754513437429821, + "language_loss": 0.90526009, + "learning_rate": 0.0009895310174615338, + "loss": 0.91612148, + "num_input_tokens_seen": 40277744, + "router_z_loss_mlp": 0.72460938, + "step": 485, + "time_per_iteration": 2.843790292739868 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070969, + "balance_loss_mlp": 0.99982452, + "epoch": 0.09349749903809157, + "flos": 1456024420608.0, + "grad_norm": 0.007982392205281765, + "language_loss": 0.75718516, + "learning_rate": 0.0009894675042299251, + "loss": 0.76789486, + "num_input_tokens_seen": 40503664, + "router_z_loss_mlp": 0.71289062, + "step": 486, + "time_per_iteration": 4.649716138839722 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080781, + "balance_loss_mlp": 1.00877845, + "epoch": 0.09368988072335514, + "flos": 521900719872.0, + "grad_norm": 0.0379904908867083, + "language_loss": 0.94096279, + "learning_rate": 0.0009894038009701782, + "loss": 0.95177054, + "num_input_tokens_seen": 40571376, + "router_z_loss_mlp": 0.72167969, + "step": 487, + "time_per_iteration": 2.615767002105713 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085039, + "balance_loss_mlp": 1.012941, + "epoch": 0.0938822624086187, + "flos": 498752806656.0, + "grad_norm": 0.041516659048387576, + "language_loss": 0.97017074, + "learning_rate": 0.0009893399077070253, + "loss": 0.98102111, + "num_input_tokens_seen": 40638096, + "router_z_loss_mlp": 0.72265625, + "step": 488, + "time_per_iteration": 2.592867612838745 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090191, + "balance_loss_mlp": 1.01828361, + "epoch": 0.09407464409388226, + "flos": 534224629248.0, + "grad_norm": 0.031087819309936707, + "language_loss": 0.91152203, + "learning_rate": 0.0009892758244652718, + "loss": 0.92242396, + "num_input_tokens_seen": 40710992, + "router_z_loss_mlp": 0.72070312, + "step": 489, + "time_per_iteration": 2.702681541442871 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080571, + "balance_loss_mlp": 1.00852132, + "epoch": 0.09426702577914582, + "flos": 587091651840.0, + "grad_norm": 0.037758062155454256, + "language_loss": 0.98290044, + "learning_rate": 0.0009892115512697968, + "loss": 0.99370617, + "num_input_tokens_seen": 40778896, + "router_z_loss_mlp": 0.72216797, + "step": 490, + "time_per_iteration": 2.7222015857696533 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088543, + "balance_loss_mlp": 1.01649261, + "epoch": 0.0944594074644094, + "flos": 504464690688.0, + "grad_norm": 0.03400132145466818, + "language_loss": 0.98617911, + "learning_rate": 0.0009891470881455537, + "loss": 0.99706453, + "num_input_tokens_seen": 40853376, + "router_z_loss_mlp": 0.72216797, + "step": 491, + "time_per_iteration": 2.6978650093078613 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087839, + "balance_loss_mlp": 1.01626599, + "epoch": 0.09465178914967295, + "flos": 572114962176.0, + "grad_norm": 0.03537229102294209, + "language_loss": 0.97051454, + "learning_rate": 0.0009890824351175692, + "loss": 0.98139298, + "num_input_tokens_seen": 40923776, + "router_z_loss_mlp": 0.71728516, + "step": 492, + "time_per_iteration": 2.7183802127838135 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087578, + "balance_loss_mlp": 1.01590919, + "epoch": 0.09484417083493651, + "flos": 550419322368.0, + "grad_norm": 0.028677449722299516, + "language_loss": 1.00688422, + "learning_rate": 0.0009890175922109435, + "loss": 1.01776004, + "num_input_tokens_seen": 40996848, + "router_z_loss_mlp": 0.71826172, + "step": 493, + "time_per_iteration": 2.680469512939453 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082456, + "balance_loss_mlp": 1.01088285, + "epoch": 0.09503655252020007, + "flos": 825272237568.0, + "grad_norm": 0.03488638846892438, + "language_loss": 0.98808897, + "learning_rate": 0.0009889525594508513, + "loss": 0.99891359, + "num_input_tokens_seen": 41071280, + "router_z_loss_mlp": 0.71728516, + "step": 494, + "time_per_iteration": 2.983400344848633 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083066, + "balance_loss_mlp": 1.01154041, + "epoch": 0.09522893420546363, + "flos": 405518615040.0, + "grad_norm": 0.028649644857800794, + "language_loss": 0.9245472, + "learning_rate": 0.0009888873368625404, + "loss": 0.93537784, + "num_input_tokens_seen": 41136304, + "router_z_loss_mlp": 0.71679688, + "step": 495, + "time_per_iteration": 2.497526168823242 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108369, + "balance_loss_mlp": 1.01206875, + "epoch": 0.0954213158907272, + "flos": 692257035264.0, + "grad_norm": 0.03396045626839725, + "language_loss": 0.96602595, + "learning_rate": 0.0009888219244713326, + "loss": 0.97686291, + "num_input_tokens_seen": 41212384, + "router_z_loss_mlp": 0.71777344, + "step": 496, + "time_per_iteration": 2.8588504791259766 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108171, + "balance_loss_mlp": 1.01018417, + "epoch": 0.09561369757599077, + "flos": 520075170816.0, + "grad_norm": 0.039869543083186736, + "language_loss": 0.97707164, + "learning_rate": 0.0009887563223026229, + "loss": 0.98788875, + "num_input_tokens_seen": 41282528, + "router_z_loss_mlp": 0.71679688, + "step": 497, + "time_per_iteration": 2.6856894493103027 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075874, + "balance_loss_mlp": 1.00644684, + "epoch": 0.09580607926125433, + "flos": 1388784363264.0, + "grad_norm": 0.01625235818526382, + "language_loss": 0.7906816, + "learning_rate": 0.0009886905303818805, + "loss": 0.80144036, + "num_input_tokens_seen": 41512256, + "router_z_loss_mlp": 0.6953125, + "step": 498, + "time_per_iteration": 4.882593393325806 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086748, + "balance_loss_mlp": 1.0150795, + "epoch": 0.09599846094651789, + "flos": 718826634240.0, + "grad_norm": 0.03326061844711544, + "language_loss": 0.95632416, + "learning_rate": 0.0009886245487346482, + "loss": 0.9671917, + "num_input_tokens_seen": 41596816, + "router_z_loss_mlp": 0.71826172, + "step": 499, + "time_per_iteration": 3.0426785945892334 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087163, + "balance_loss_mlp": 1.01568544, + "epoch": 0.09619084263178146, + "flos": 386894717952.0, + "grad_norm": 0.04298067648683731, + "language_loss": 0.98954022, + "learning_rate": 0.0009885583773865422, + "loss": 1.00041187, + "num_input_tokens_seen": 41658544, + "router_z_loss_mlp": 0.71630859, + "step": 500, + "time_per_iteration": 2.452941417694092 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086205, + "balance_loss_mlp": 1.01467967, + "epoch": 0.09638322431704502, + "flos": 535173369600.0, + "grad_norm": 0.04172266818012015, + "language_loss": 0.95971203, + "learning_rate": 0.0009884920163632524, + "loss": 0.97057414, + "num_input_tokens_seen": 41730736, + "router_z_loss_mlp": 0.71679688, + "step": 501, + "time_per_iteration": 2.657940626144409 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080406, + "balance_loss_mlp": 1.00911927, + "epoch": 0.09657560600230858, + "flos": 501657353472.0, + "grad_norm": 0.041437287127294276, + "language_loss": 0.9960922, + "learning_rate": 0.000988425465690543, + "loss": 1.00689626, + "num_input_tokens_seen": 41797824, + "router_z_loss_mlp": 0.71435547, + "step": 502, + "time_per_iteration": 2.5540428161621094 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077905, + "balance_loss_mlp": 1.00642741, + "epoch": 0.09676798768757214, + "flos": 530332458240.0, + "grad_norm": 0.03187665411612151, + "language_loss": 0.96807587, + "learning_rate": 0.0009883587253942505, + "loss": 0.97885495, + "num_input_tokens_seen": 41875520, + "router_z_loss_mlp": 0.71630859, + "step": 503, + "time_per_iteration": 2.7744338512420654 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086901, + "balance_loss_mlp": 1.01542282, + "epoch": 0.09696036937283571, + "flos": 464557313280.0, + "grad_norm": 0.038653015311582224, + "language_loss": 1.0234406, + "learning_rate": 0.0009882917955002862, + "loss": 1.03430974, + "num_input_tokens_seen": 41942224, + "router_z_loss_mlp": 0.71630859, + "step": 504, + "time_per_iteration": 2.500669479370117 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081799, + "balance_loss_mlp": 1.01074982, + "epoch": 0.09715275105809927, + "flos": 536011294464.0, + "grad_norm": 0.035792041916504785, + "language_loss": 0.94188601, + "learning_rate": 0.0009882246760346343, + "loss": 0.95270395, + "num_input_tokens_seen": 42007552, + "router_z_loss_mlp": 0.71191406, + "step": 505, + "time_per_iteration": 2.6442148685455322 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077575, + "balance_loss_mlp": 1.00652647, + "epoch": 0.09734513274336283, + "flos": 455882556672.0, + "grad_norm": 0.04461237962136338, + "language_loss": 1.00418711, + "learning_rate": 0.0009881573670233533, + "loss": 1.01496279, + "num_input_tokens_seen": 42071760, + "router_z_loss_mlp": 0.71191406, + "step": 506, + "time_per_iteration": 2.5102410316467285 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075502, + "balance_loss_mlp": 1.00450063, + "epoch": 0.09753751442862639, + "flos": 509828577024.0, + "grad_norm": 0.03506590591484262, + "language_loss": 0.93374205, + "learning_rate": 0.0009880898684925747, + "loss": 0.94449711, + "num_input_tokens_seen": 42140688, + "router_z_loss_mlp": 0.71142578, + "step": 507, + "time_per_iteration": 2.652381658554077 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077009, + "balance_loss_mlp": 1.00624609, + "epoch": 0.09772989611388996, + "flos": 485247832320.0, + "grad_norm": 0.03501422949918711, + "language_loss": 0.92606336, + "learning_rate": 0.0009880221804685037, + "loss": 0.9368335, + "num_input_tokens_seen": 42208544, + "router_z_loss_mlp": 0.70898438, + "step": 508, + "time_per_iteration": 2.5481274127960205 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073608, + "balance_loss_mlp": 1.00456238, + "epoch": 0.09792227779915352, + "flos": 1569319231488.0, + "grad_norm": 0.011873284077886747, + "language_loss": 0.79344422, + "learning_rate": 0.000987954302977419, + "loss": 0.80418032, + "num_input_tokens_seen": 42426624, + "router_z_loss_mlp": 0.69140625, + "step": 509, + "time_per_iteration": 4.725191354751587 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076044, + "balance_loss_mlp": 1.00590122, + "epoch": 0.09811465948441708, + "flos": 588915255552.0, + "grad_norm": 0.04172960474096109, + "language_loss": 0.98818666, + "learning_rate": 0.0009878862360456733, + "loss": 0.99894708, + "num_input_tokens_seen": 42494592, + "router_z_loss_mlp": 0.70263672, + "step": 510, + "time_per_iteration": 2.7094569206237793 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078332, + "balance_loss_mlp": 1.00828481, + "epoch": 0.09830704116968064, + "flos": 614129790720.0, + "grad_norm": 0.037035801977756785, + "language_loss": 0.90851068, + "learning_rate": 0.0009878179796996922, + "loss": 0.919294, + "num_input_tokens_seen": 42564944, + "router_z_loss_mlp": 0.70166016, + "step": 511, + "time_per_iteration": 2.6973366737365723 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079637, + "balance_loss_mlp": 1.00973296, + "epoch": 0.09849942285494422, + "flos": 539936513280.0, + "grad_norm": 0.0318668020933778, + "language_loss": 0.94484478, + "learning_rate": 0.0009877495339659754, + "loss": 0.95564115, + "num_input_tokens_seen": 42645616, + "router_z_loss_mlp": 0.70019531, + "step": 512, + "time_per_iteration": 2.7476089000701904 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083598, + "balance_loss_mlp": 1.0137887, + "epoch": 0.09869180454020778, + "flos": 621604040448.0, + "grad_norm": 0.03763698097825182, + "language_loss": 0.89467418, + "learning_rate": 0.000987680898871096, + "loss": 0.90551007, + "num_input_tokens_seen": 42713632, + "router_z_loss_mlp": 0.69921875, + "step": 513, + "time_per_iteration": 2.7254321575164795 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083583, + "balance_loss_mlp": 1.01382184, + "epoch": 0.09888418622547133, + "flos": 813061089024.0, + "grad_norm": 0.049179676158016515, + "language_loss": 0.91816097, + "learning_rate": 0.0009876120744417, + "loss": 0.9289968, + "num_input_tokens_seen": 42789088, + "router_z_loss_mlp": 0.69873047, + "step": 514, + "time_per_iteration": 2.9596974849700928 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083293, + "balance_loss_mlp": 1.01357901, + "epoch": 0.0990765679107349, + "flos": 536857967616.0, + "grad_norm": 0.03966041946019195, + "language_loss": 0.99294269, + "learning_rate": 0.0009875430607045078, + "loss": 1.0037756, + "num_input_tokens_seen": 42861168, + "router_z_loss_mlp": 0.69824219, + "step": 515, + "time_per_iteration": 2.7065181732177734 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083779, + "balance_loss_mlp": 1.01439941, + "epoch": 0.09926894959599845, + "flos": 588971635968.0, + "grad_norm": 0.037836000479060286, + "language_loss": 0.94664383, + "learning_rate": 0.000987473857686313, + "loss": 0.95748156, + "num_input_tokens_seen": 42934112, + "router_z_loss_mlp": 0.69482422, + "step": 516, + "time_per_iteration": 2.712947130203247 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085113, + "balance_loss_mlp": 1.01582849, + "epoch": 0.09946133128126203, + "flos": 642387878400.0, + "grad_norm": 0.04191957443387863, + "language_loss": 0.98466003, + "learning_rate": 0.0009874044654139824, + "loss": 0.99551111, + "num_input_tokens_seen": 43005248, + "router_z_loss_mlp": 0.69384766, + "step": 517, + "time_per_iteration": 2.7391469478607178 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081227, + "balance_loss_mlp": 1.01194227, + "epoch": 0.09965371296652559, + "flos": 466726002432.0, + "grad_norm": 0.049265237591549625, + "language_loss": 0.97911566, + "learning_rate": 0.0009873348839144563, + "loss": 0.98992795, + "num_input_tokens_seen": 43070576, + "router_z_loss_mlp": 0.69384766, + "step": 518, + "time_per_iteration": 2.5496554374694824 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081078, + "balance_loss_mlp": 1.01198411, + "epoch": 0.09984609465178915, + "flos": 484559606784.0, + "grad_norm": 0.04039588305244337, + "language_loss": 0.99084902, + "learning_rate": 0.000987265113214749, + "loss": 1.00165975, + "num_input_tokens_seen": 43138048, + "router_z_loss_mlp": 0.69189453, + "step": 519, + "time_per_iteration": 2.592350721359253 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081099, + "balance_loss_mlp": 1.01200545, + "epoch": 0.1000384763370527, + "flos": 570095972352.0, + "grad_norm": 0.04690738730083641, + "language_loss": 1.01784182, + "learning_rate": 0.0009871951533419476, + "loss": 1.02865279, + "num_input_tokens_seen": 43207600, + "router_z_loss_mlp": 0.69189453, + "step": 520, + "time_per_iteration": 2.699725866317749 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077691, + "balance_loss_mlp": 1.00854921, + "epoch": 0.10023085802231628, + "flos": 546926671872.0, + "grad_norm": 0.03422053119670882, + "language_loss": 0.91227025, + "learning_rate": 0.0009871250043232132, + "loss": 0.92304718, + "num_input_tokens_seen": 43285104, + "router_z_loss_mlp": 0.69238281, + "step": 521, + "time_per_iteration": 2.74124813079834 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078273, + "balance_loss_mlp": 1.00913203, + "epoch": 0.10042323970757984, + "flos": 504440391168.0, + "grad_norm": 0.0407416967929008, + "language_loss": 0.91114902, + "learning_rate": 0.0009870546661857797, + "loss": 0.92193174, + "num_input_tokens_seen": 43353312, + "router_z_loss_mlp": 0.69238281, + "step": 522, + "time_per_iteration": 2.6524126529693604 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080712, + "balance_loss_mlp": 1.01199949, + "epoch": 0.1006156213928434, + "flos": 771725737728.0, + "grad_norm": 0.04764395650012834, + "language_loss": 1.0071038, + "learning_rate": 0.0009869841389569553, + "loss": 1.01791096, + "num_input_tokens_seen": 43427680, + "router_z_loss_mlp": 0.68798828, + "step": 523, + "time_per_iteration": 2.9797816276550293 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081234, + "balance_loss_mlp": 1.01237857, + "epoch": 0.10080800307810696, + "flos": 491009293824.0, + "grad_norm": 0.04526617857315469, + "language_loss": 0.93126583, + "learning_rate": 0.0009869134226641206, + "loss": 0.94207817, + "num_input_tokens_seen": 43495200, + "router_z_loss_mlp": 0.68945312, + "step": 524, + "time_per_iteration": 2.624396562576294 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079355, + "balance_loss_mlp": 1.01064241, + "epoch": 0.10100038476337053, + "flos": 455713415424.0, + "grad_norm": 0.04976961118682096, + "language_loss": 0.93662071, + "learning_rate": 0.0009868425173347303, + "loss": 0.94741422, + "num_input_tokens_seen": 43566256, + "router_z_loss_mlp": 0.68798828, + "step": 525, + "time_per_iteration": 2.659106731414795 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077138, + "balance_loss_mlp": 1.00809169, + "epoch": 0.10119276644863409, + "flos": 557574731520.0, + "grad_norm": 0.04197638521891018, + "language_loss": 0.9924143, + "learning_rate": 0.0009867714229963125, + "loss": 1.00318575, + "num_input_tokens_seen": 43639696, + "router_z_loss_mlp": 0.69140625, + "step": 526, + "time_per_iteration": 2.7414495944976807 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080772, + "balance_loss_mlp": 1.01201165, + "epoch": 0.10138514813389765, + "flos": 517220201472.0, + "grad_norm": 0.044929109849797505, + "language_loss": 0.96641302, + "learning_rate": 0.000986700139676468, + "loss": 0.97722065, + "num_input_tokens_seen": 43703872, + "router_z_loss_mlp": 0.68847656, + "step": 527, + "time_per_iteration": 2.620313882827759 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083057, + "balance_loss_mlp": 1.01405847, + "epoch": 0.10157752981916121, + "flos": 501564034560.0, + "grad_norm": 0.03558874762709202, + "language_loss": 0.9424324, + "learning_rate": 0.0009866286674028717, + "loss": 0.95326293, + "num_input_tokens_seen": 43774416, + "router_z_loss_mlp": 0.69091797, + "step": 528, + "time_per_iteration": 2.632835865020752 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082511, + "balance_loss_mlp": 1.01379848, + "epoch": 0.10176991150442478, + "flos": 658094589696.0, + "grad_norm": 0.042026744727430246, + "language_loss": 0.91470444, + "learning_rate": 0.0009865570062032717, + "loss": 0.9255296, + "num_input_tokens_seen": 43853376, + "router_z_loss_mlp": 0.68798828, + "step": 529, + "time_per_iteration": 2.9185874462127686 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084193, + "balance_loss_mlp": 1.01519477, + "epoch": 0.10196229318968834, + "flos": 574403215104.0, + "grad_norm": 0.031693910674612406, + "language_loss": 0.95307148, + "learning_rate": 0.0009864851561054893, + "loss": 0.96391344, + "num_input_tokens_seen": 43929632, + "router_z_loss_mlp": 0.69091797, + "step": 530, + "time_per_iteration": 2.7826597690582275 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086656, + "balance_loss_mlp": 1.01765728, + "epoch": 0.1021546748749519, + "flos": 519256687872.0, + "grad_norm": 0.0418084670656813, + "language_loss": 0.94574928, + "learning_rate": 0.0009864131171374191, + "loss": 0.95661592, + "num_input_tokens_seen": 44002144, + "router_z_loss_mlp": 0.69091797, + "step": 531, + "time_per_iteration": 2.67000150680542 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088238, + "balance_loss_mlp": 1.01919198, + "epoch": 0.10234705656021546, + "flos": 610954035456.0, + "grad_norm": 0.03906444640078033, + "language_loss": 0.94287467, + "learning_rate": 0.0009863408893270292, + "loss": 0.95375705, + "num_input_tokens_seen": 44078272, + "router_z_loss_mlp": 0.69140625, + "step": 532, + "time_per_iteration": 2.7893166542053223 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089057, + "balance_loss_mlp": 1.02029741, + "epoch": 0.10253943824547904, + "flos": 602913069312.0, + "grad_norm": 0.046708965243717, + "language_loss": 0.90346718, + "learning_rate": 0.0009862684727023605, + "loss": 0.91435778, + "num_input_tokens_seen": 44152304, + "router_z_loss_mlp": 0.68847656, + "step": 533, + "time_per_iteration": 2.7212483882904053 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079068, + "balance_loss_mlp": 1.0105468, + "epoch": 0.1027318199307426, + "flos": 664157395200.0, + "grad_norm": 0.04923575085492922, + "language_loss": 0.9286049, + "learning_rate": 0.0009861958672915283, + "loss": 0.93939555, + "num_input_tokens_seen": 44226720, + "router_z_loss_mlp": 0.68603516, + "step": 534, + "time_per_iteration": 2.8216443061828613 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080998, + "balance_loss_mlp": 1.01271474, + "epoch": 0.10292420161600616, + "flos": 684531019008.0, + "grad_norm": 0.03566434899904423, + "language_loss": 0.91122925, + "learning_rate": 0.0009861230731227201, + "loss": 0.92203927, + "num_input_tokens_seen": 44303600, + "router_z_loss_mlp": 0.68359375, + "step": 535, + "time_per_iteration": 2.8432843685150146 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082908, + "balance_loss_mlp": 1.01514912, + "epoch": 0.10311658330126972, + "flos": 491269808640.0, + "grad_norm": 0.04656876258351904, + "language_loss": 0.9494285, + "learning_rate": 0.0009860500902241973, + "loss": 0.96025753, + "num_input_tokens_seen": 44370960, + "router_z_loss_mlp": 0.67822266, + "step": 536, + "time_per_iteration": 2.601234197616577 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085787, + "balance_loss_mlp": 1.01831496, + "epoch": 0.10330896498653329, + "flos": 432687011328.0, + "grad_norm": 0.046264109011482965, + "language_loss": 0.99409795, + "learning_rate": 0.0009859769186242942, + "loss": 1.00495577, + "num_input_tokens_seen": 44435584, + "router_z_loss_mlp": 0.67529297, + "step": 537, + "time_per_iteration": 2.527156114578247 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079891, + "balance_loss_mlp": 1.01265681, + "epoch": 0.10350134667179685, + "flos": 550642898688.0, + "grad_norm": 0.04274411195548745, + "language_loss": 0.92667055, + "learning_rate": 0.0009859035583514187, + "loss": 0.93746948, + "num_input_tokens_seen": 44505456, + "router_z_loss_mlp": 0.67285156, + "step": 538, + "time_per_iteration": 2.6489107608795166 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082236, + "balance_loss_mlp": 1.01505005, + "epoch": 0.10369372835706041, + "flos": 641827964928.0, + "grad_norm": 0.04978782417937993, + "language_loss": 0.95941103, + "learning_rate": 0.0009858300094340517, + "loss": 0.97023344, + "num_input_tokens_seen": 44580208, + "router_z_loss_mlp": 0.67236328, + "step": 539, + "time_per_iteration": 2.8078534603118896 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107826, + "balance_loss_mlp": 1.01102614, + "epoch": 0.10388611004232397, + "flos": 522766834944.0, + "grad_norm": 0.04233995967203171, + "language_loss": 0.8846426, + "learning_rate": 0.0009857562719007473, + "loss": 0.8954252, + "num_input_tokens_seen": 44646576, + "router_z_loss_mlp": 0.67285156, + "step": 540, + "time_per_iteration": 2.605253219604492 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108211, + "balance_loss_mlp": 1.01487637, + "epoch": 0.10407849172758753, + "flos": 703741074432.0, + "grad_norm": 0.04489314852578161, + "language_loss": 0.9024663, + "learning_rate": 0.0009856823457801331, + "loss": 0.91328734, + "num_input_tokens_seen": 44726752, + "router_z_loss_mlp": 0.67285156, + "step": 541, + "time_per_iteration": 2.8836264610290527 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074756, + "balance_loss_mlp": 1.00737894, + "epoch": 0.1042708734128511, + "flos": 503945606400.0, + "grad_norm": 0.04545070943505171, + "language_loss": 0.97841358, + "learning_rate": 0.00098560823110091, + "loss": 0.98916113, + "num_input_tokens_seen": 44795824, + "router_z_loss_mlp": 0.67431641, + "step": 542, + "time_per_iteration": 2.629241466522217 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078174, + "balance_loss_mlp": 1.01084471, + "epoch": 0.10446325509811466, + "flos": 486641779968.0, + "grad_norm": 0.04151430298304091, + "language_loss": 0.974545, + "learning_rate": 0.000985533927891851, + "loss": 0.98532677, + "num_input_tokens_seen": 44868496, + "router_z_loss_mlp": 0.67382812, + "step": 543, + "time_per_iteration": 2.712714195251465 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078049, + "balance_loss_mlp": 1.01110125, + "epoch": 0.10465563678337822, + "flos": 569713948416.0, + "grad_norm": 0.043537531534841835, + "language_loss": 0.9559319, + "learning_rate": 0.0009854594361818044, + "loss": 0.96671236, + "num_input_tokens_seen": 44939888, + "router_z_loss_mlp": 0.66992188, + "step": 544, + "time_per_iteration": 2.66324520111084 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075343, + "balance_loss_mlp": 1.00806153, + "epoch": 0.10484801846864178, + "flos": 627243992832.0, + "grad_norm": 0.042858245855360314, + "language_loss": 0.94459403, + "learning_rate": 0.0009853847559996897, + "loss": 0.95534742, + "num_input_tokens_seen": 45012720, + "router_z_loss_mlp": 0.67333984, + "step": 545, + "time_per_iteration": 2.749379873275757 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074542, + "balance_loss_mlp": 1.00697374, + "epoch": 0.10504040015390535, + "flos": 744813965568.0, + "grad_norm": 0.04113973833070077, + "language_loss": 0.93940508, + "learning_rate": 0.0009853098873745, + "loss": 0.95015049, + "num_input_tokens_seen": 45093744, + "router_z_loss_mlp": 0.67626953, + "step": 546, + "time_per_iteration": 3.0356035232543945 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082094, + "balance_loss_mlp": 1.01457405, + "epoch": 0.10523278183916891, + "flos": 587843060736.0, + "grad_norm": 0.04039468180414331, + "language_loss": 0.92498314, + "learning_rate": 0.0009852348303353027, + "loss": 0.93580401, + "num_input_tokens_seen": 45172784, + "router_z_loss_mlp": 0.67578125, + "step": 547, + "time_per_iteration": 2.787853479385376 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080495, + "balance_loss_mlp": 1.01283157, + "epoch": 0.10542516352443247, + "flos": 871147156224.0, + "grad_norm": 0.04319215205461418, + "language_loss": 0.86143011, + "learning_rate": 0.000985159584911237, + "loss": 0.872235, + "num_input_tokens_seen": 45255600, + "router_z_loss_mlp": 0.67724609, + "step": 548, + "time_per_iteration": 3.103173017501831 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077016, + "balance_loss_mlp": 1.00949633, + "epoch": 0.10561754520969603, + "flos": 506413694208.0, + "grad_norm": 0.04405333210851084, + "language_loss": 0.94064271, + "learning_rate": 0.0009850841511315162, + "loss": 0.95141286, + "num_input_tokens_seen": 45325072, + "router_z_loss_mlp": 0.67578125, + "step": 549, + "time_per_iteration": 2.647629737854004 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107948, + "balance_loss_mlp": 1.01176953, + "epoch": 0.1058099268949596, + "flos": 561148061952.0, + "grad_norm": 0.03728506713954383, + "language_loss": 0.9326818, + "learning_rate": 0.0009850085290254256, + "loss": 0.94347662, + "num_input_tokens_seen": 45401440, + "router_z_loss_mlp": 0.67773438, + "step": 550, + "time_per_iteration": 2.7680838108062744 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081585, + "balance_loss_mlp": 1.01411295, + "epoch": 0.10600230858022316, + "flos": 563160248832.0, + "grad_norm": 0.031635589688873186, + "language_loss": 0.90350562, + "learning_rate": 0.0009849327186223246, + "loss": 0.91432148, + "num_input_tokens_seen": 45479264, + "router_z_loss_mlp": 0.67529297, + "step": 551, + "time_per_iteration": 2.7540531158447266 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077249, + "balance_loss_mlp": 1.01001453, + "epoch": 0.10619469026548672, + "flos": 495318481920.0, + "grad_norm": 0.03875875468173829, + "language_loss": 0.97612774, + "learning_rate": 0.000984856719951646, + "loss": 0.98690015, + "num_input_tokens_seen": 45547328, + "router_z_loss_mlp": 0.67285156, + "step": 552, + "time_per_iteration": 2.5471906661987305 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080098, + "balance_loss_mlp": 1.01300704, + "epoch": 0.10638707195075028, + "flos": 677465038080.0, + "grad_norm": 0.04041077275123314, + "language_loss": 0.94560456, + "learning_rate": 0.0009847805330428943, + "loss": 0.95640558, + "num_input_tokens_seen": 45631152, + "router_z_loss_mlp": 0.67138672, + "step": 553, + "time_per_iteration": 2.879901647567749 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081173, + "balance_loss_mlp": 1.01398706, + "epoch": 0.10657945363601386, + "flos": 489035990784.0, + "grad_norm": 0.051524237529684984, + "language_loss": 0.97161597, + "learning_rate": 0.0009847041579256481, + "loss": 0.98242772, + "num_input_tokens_seen": 45698208, + "router_z_loss_mlp": 0.67236328, + "step": 554, + "time_per_iteration": 2.5838425159454346 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076637, + "balance_loss_mlp": 1.00997543, + "epoch": 0.10677183532127742, + "flos": 483971503104.0, + "grad_norm": 0.03890900728724459, + "language_loss": 0.96058643, + "learning_rate": 0.0009846275946295592, + "loss": 0.97135282, + "num_input_tokens_seen": 45766640, + "router_z_loss_mlp": 0.66699219, + "step": 555, + "time_per_iteration": 2.619490623474121 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074794, + "balance_loss_mlp": 1.00813222, + "epoch": 0.10696421700654098, + "flos": 657582308352.0, + "grad_norm": 0.03350037319549477, + "language_loss": 0.89189553, + "learning_rate": 0.0009845508431843518, + "loss": 0.9026435, + "num_input_tokens_seen": 45851408, + "router_z_loss_mlp": 0.66699219, + "step": 556, + "time_per_iteration": 3.0074055194854736 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075663, + "balance_loss_mlp": 1.00895333, + "epoch": 0.10715659869180454, + "flos": 568793398272.0, + "grad_norm": 0.03867425342149035, + "language_loss": 0.90383601, + "learning_rate": 0.0009844739036198233, + "loss": 0.91459262, + "num_input_tokens_seen": 45919824, + "router_z_loss_mlp": 0.66748047, + "step": 557, + "time_per_iteration": 2.719309091567993 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073849, + "balance_loss_mlp": 1.00756896, + "epoch": 0.10734898037706811, + "flos": 541744565760.0, + "grad_norm": 0.03845092177051005, + "language_loss": 0.97656357, + "learning_rate": 0.0009843967759658448, + "loss": 0.98730206, + "num_input_tokens_seen": 45991024, + "router_z_loss_mlp": 0.66308594, + "step": 558, + "time_per_iteration": 2.679964065551758 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077583, + "balance_loss_mlp": 1.01311493, + "epoch": 0.10754136206233167, + "flos": 1479734192640.0, + "grad_norm": 0.013283033162601723, + "language_loss": 0.72767758, + "learning_rate": 0.0009843194602523592, + "loss": 0.73845339, + "num_input_tokens_seen": 46212736, + "router_z_loss_mlp": 0.64453125, + "step": 559, + "time_per_iteration": 4.837440729141235 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107582, + "balance_loss_mlp": 1.00977802, + "epoch": 0.10773374374759523, + "flos": 513412601088.0, + "grad_norm": 0.03702065367467253, + "language_loss": 0.97501957, + "learning_rate": 0.000984241956509384, + "loss": 0.98577774, + "num_input_tokens_seen": 46283920, + "router_z_loss_mlp": 0.66064453, + "step": 560, + "time_per_iteration": 2.6579978466033936 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079218, + "balance_loss_mlp": 1.01312864, + "epoch": 0.10792612543285879, + "flos": 497478422784.0, + "grad_norm": 0.05173888564395698, + "language_loss": 0.9404971, + "learning_rate": 0.0009841642647670078, + "loss": 0.9512893, + "num_input_tokens_seen": 46349664, + "router_z_loss_mlp": 0.66113281, + "step": 561, + "time_per_iteration": 2.557605743408203 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080143, + "balance_loss_mlp": 1.01429176, + "epoch": 0.10811850711812235, + "flos": 736838128128.0, + "grad_norm": 0.0493873548723288, + "language_loss": 0.88547891, + "learning_rate": 0.0009840863850553944, + "loss": 0.89628035, + "num_input_tokens_seen": 46432688, + "router_z_loss_mlp": 0.65869141, + "step": 562, + "time_per_iteration": 2.949580669403076 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077668, + "balance_loss_mlp": 1.0115304, + "epoch": 0.10831088880338592, + "flos": 612677517312.0, + "grad_norm": 0.04173462884607535, + "language_loss": 0.94150907, + "learning_rate": 0.0009840083174047782, + "loss": 0.95228577, + "num_input_tokens_seen": 46507216, + "router_z_loss_mlp": 0.66162109, + "step": 563, + "time_per_iteration": 2.733344078063965 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081559, + "balance_loss_mlp": 1.01561248, + "epoch": 0.10850327048864948, + "flos": 557498909184.0, + "grad_norm": 0.034100755270258146, + "language_loss": 0.88515103, + "learning_rate": 0.0009839300618454685, + "loss": 0.89596659, + "num_input_tokens_seen": 46590464, + "router_z_loss_mlp": 0.65966797, + "step": 564, + "time_per_iteration": 2.8846256732940674 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080528, + "balance_loss_mlp": 1.0148201, + "epoch": 0.10869565217391304, + "flos": 604437274368.0, + "grad_norm": 0.036735298053950545, + "language_loss": 0.93941957, + "learning_rate": 0.0009838516184078466, + "loss": 0.95022488, + "num_input_tokens_seen": 46666240, + "router_z_loss_mlp": 0.65722656, + "step": 565, + "time_per_iteration": 2.813284158706665 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078727, + "balance_loss_mlp": 1.01297164, + "epoch": 0.1088880338591766, + "flos": 527206280448.0, + "grad_norm": 0.040314305725270186, + "language_loss": 0.91096556, + "learning_rate": 0.0009837729871223669, + "loss": 0.92175281, + "num_input_tokens_seen": 46734288, + "router_z_loss_mlp": 0.65771484, + "step": 566, + "time_per_iteration": 2.651611089706421 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078801, + "balance_loss_mlp": 1.01318836, + "epoch": 0.10908041554444017, + "flos": 621417402624.0, + "grad_norm": 0.042325065837349046, + "language_loss": 0.91458869, + "learning_rate": 0.0009836941680195568, + "loss": 0.92537665, + "num_input_tokens_seen": 46809920, + "router_z_loss_mlp": 0.65625, + "step": 567, + "time_per_iteration": 2.8296427726745605 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081032, + "balance_loss_mlp": 1.01508534, + "epoch": 0.10927279722970373, + "flos": 899674507008.0, + "grad_norm": 0.04990856516123606, + "language_loss": 0.87414277, + "learning_rate": 0.0009836151611300166, + "loss": 0.88495302, + "num_input_tokens_seen": 46889984, + "router_z_loss_mlp": 0.65966797, + "step": 568, + "time_per_iteration": 3.2401816844940186 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107825, + "balance_loss_mlp": 1.01206517, + "epoch": 0.10946517891496729, + "flos": 529700613120.0, + "grad_norm": 0.0427731854110213, + "language_loss": 0.96863574, + "learning_rate": 0.0009835359664844194, + "loss": 0.97941828, + "num_input_tokens_seen": 46959536, + "router_z_loss_mlp": 0.66210938, + "step": 569, + "time_per_iteration": 2.6190173625946045 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064438, + "balance_loss_mlp": 1.00092316, + "epoch": 0.10965756060023085, + "flos": 1563994228992.0, + "grad_norm": 0.005811935039235345, + "language_loss": 0.81036806, + "learning_rate": 0.0009834565841135114, + "loss": 0.8210125, + "num_input_tokens_seen": 47196960, + "router_z_loss_mlp": 0.63476562, + "step": 570, + "time_per_iteration": 4.957117795944214 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080699, + "balance_loss_mlp": 1.0151341, + "epoch": 0.10984994228549443, + "flos": 514100826624.0, + "grad_norm": 0.04369440603786518, + "language_loss": 0.94858396, + "learning_rate": 0.0009833770140481118, + "loss": 0.95939088, + "num_input_tokens_seen": 47266560, + "router_z_loss_mlp": 0.65576172, + "step": 571, + "time_per_iteration": 2.6529860496520996 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086256, + "balance_loss_mlp": 1.02059519, + "epoch": 0.11004232397075799, + "flos": 956275252992.0, + "grad_norm": 0.04378732511153692, + "language_loss": 0.85010409, + "learning_rate": 0.000983297256319112, + "loss": 0.86096668, + "num_input_tokens_seen": 47348512, + "router_z_loss_mlp": 0.65673828, + "step": 572, + "time_per_iteration": 3.2036497592926025 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080603, + "balance_loss_mlp": 1.01499045, + "epoch": 0.11023470565602154, + "flos": 489229431552.0, + "grad_norm": 0.043497603291787354, + "language_loss": 0.89141667, + "learning_rate": 0.000983217310957477, + "loss": 0.90222269, + "num_input_tokens_seen": 47425392, + "router_z_loss_mlp": 0.65625, + "step": 573, + "time_per_iteration": 2.7763278484344482 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078757, + "balance_loss_mlp": 1.01333535, + "epoch": 0.1104270873412851, + "flos": 656991292416.0, + "grad_norm": 0.04901418812727031, + "language_loss": 0.9269613, + "learning_rate": 0.000983137177994244, + "loss": 0.93774891, + "num_input_tokens_seen": 47502336, + "router_z_loss_mlp": 0.65429688, + "step": 574, + "time_per_iteration": 2.8529646396636963 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080019, + "balance_loss_mlp": 1.01474011, + "epoch": 0.11061946902654868, + "flos": 724748488704.0, + "grad_norm": 0.03457948694206611, + "language_loss": 0.87449324, + "learning_rate": 0.0009830568574605235, + "loss": 0.88529336, + "num_input_tokens_seen": 47583552, + "router_z_loss_mlp": 0.65283203, + "step": 575, + "time_per_iteration": 2.94710373878479 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010791, + "balance_loss_mlp": 1.01367807, + "epoch": 0.11081185071181224, + "flos": 836869037568.0, + "grad_norm": 0.04085001299476677, + "language_loss": 0.90086508, + "learning_rate": 0.0009829763493874992, + "loss": 0.91165602, + "num_input_tokens_seen": 47663440, + "router_z_loss_mlp": 0.65429688, + "step": 576, + "time_per_iteration": 3.0296730995178223 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107807, + "balance_loss_mlp": 1.01283884, + "epoch": 0.1110042323970758, + "flos": 610283306496.0, + "grad_norm": 0.03775485835018356, + "language_loss": 0.95256275, + "learning_rate": 0.0009828956538064264, + "loss": 0.9633435, + "num_input_tokens_seen": 47741920, + "router_z_loss_mlp": 0.65234375, + "step": 577, + "time_per_iteration": 2.7944416999816895 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073929, + "balance_loss_mlp": 1.00893569, + "epoch": 0.11119661408233936, + "flos": 597040792320.0, + "grad_norm": 0.04378674390965236, + "language_loss": 0.93033826, + "learning_rate": 0.0009828147707486344, + "loss": 0.94107759, + "num_input_tokens_seen": 47815136, + "router_z_loss_mlp": 0.64990234, + "step": 578, + "time_per_iteration": 2.7034592628479004 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075842, + "balance_loss_mlp": 1.01099229, + "epoch": 0.11138899576760293, + "flos": 556888451328.0, + "grad_norm": 0.05042820660432219, + "language_loss": 0.89312434, + "learning_rate": 0.0009827337002455245, + "loss": 0.90388274, + "num_input_tokens_seen": 47881360, + "router_z_loss_mlp": 0.6484375, + "step": 579, + "time_per_iteration": 2.6187195777893066 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074948, + "balance_loss_mlp": 1.01057482, + "epoch": 0.11158137745286649, + "flos": 691063331328.0, + "grad_norm": 0.03501309245374513, + "language_loss": 0.89977694, + "learning_rate": 0.0009826524423285712, + "loss": 0.91052639, + "num_input_tokens_seen": 47962720, + "router_z_loss_mlp": 0.64355469, + "step": 580, + "time_per_iteration": 2.9009909629821777 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079601, + "balance_loss_mlp": 1.0148946, + "epoch": 0.11177375913813005, + "flos": 764307868416.0, + "grad_norm": 0.04023884017549449, + "language_loss": 0.91280103, + "learning_rate": 0.0009825709970293218, + "loss": 0.92359698, + "num_input_tokens_seen": 48035472, + "router_z_loss_mlp": 0.64697266, + "step": 581, + "time_per_iteration": 2.9111618995666504 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074116, + "balance_loss_mlp": 1.0095998, + "epoch": 0.11196614082339361, + "flos": 808031594496.0, + "grad_norm": 0.038028140255108665, + "language_loss": 0.97163212, + "learning_rate": 0.0009824893643793956, + "loss": 0.98237336, + "num_input_tokens_seen": 48116944, + "router_z_loss_mlp": 0.64501953, + "step": 582, + "time_per_iteration": 3.0907368659973145 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072862, + "balance_loss_mlp": 1.00796497, + "epoch": 0.11215852250865718, + "flos": 559725924096.0, + "grad_norm": 0.04580369165919148, + "language_loss": 0.90464842, + "learning_rate": 0.0009824075444104857, + "loss": 0.91537702, + "num_input_tokens_seen": 48187808, + "router_z_loss_mlp": 0.64892578, + "step": 583, + "time_per_iteration": 2.7276525497436523 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107742, + "balance_loss_mlp": 1.01285601, + "epoch": 0.11235090419392074, + "flos": 514576169472.0, + "grad_norm": 0.03926612419770205, + "language_loss": 0.95381963, + "learning_rate": 0.000982325537154357, + "loss": 0.96459383, + "num_input_tokens_seen": 48254464, + "router_z_loss_mlp": 0.64550781, + "step": 584, + "time_per_iteration": 2.6261777877807617 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074375, + "balance_loss_mlp": 1.0100019, + "epoch": 0.1125432858791843, + "flos": 492433377024.0, + "grad_norm": 0.043221505898455144, + "language_loss": 0.96143711, + "learning_rate": 0.0009822433426428484, + "loss": 0.97218084, + "num_input_tokens_seen": 48318784, + "router_z_loss_mlp": 0.64355469, + "step": 585, + "time_per_iteration": 2.5630125999450684 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075075, + "balance_loss_mlp": 1.01079714, + "epoch": 0.11273566756444786, + "flos": 511728003072.0, + "grad_norm": 0.04466131563000304, + "language_loss": 0.88984096, + "learning_rate": 0.0009821609609078697, + "loss": 0.90059173, + "num_input_tokens_seen": 48389248, + "router_z_loss_mlp": 0.64257812, + "step": 586, + "time_per_iteration": 2.649122953414917 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075103, + "balance_loss_mlp": 1.01077783, + "epoch": 0.11292804924971142, + "flos": 623640526848.0, + "grad_norm": 0.03579172726266892, + "language_loss": 0.91595018, + "learning_rate": 0.0009820783919814045, + "loss": 0.92670119, + "num_input_tokens_seen": 48463312, + "router_z_loss_mlp": 0.64306641, + "step": 587, + "time_per_iteration": 2.7977845668792725 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072627, + "balance_loss_mlp": 1.00830126, + "epoch": 0.113120430934975, + "flos": 479039218176.0, + "grad_norm": 0.04738669495581529, + "language_loss": 0.85574889, + "learning_rate": 0.0009819956358955095, + "loss": 0.86647511, + "num_input_tokens_seen": 48531856, + "router_z_loss_mlp": 0.64306641, + "step": 588, + "time_per_iteration": 2.59133243560791 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076686, + "balance_loss_mlp": 1.01245642, + "epoch": 0.11331281262023855, + "flos": 467991638016.0, + "grad_norm": 0.048752038127388646, + "language_loss": 0.86982751, + "learning_rate": 0.0009819126926823127, + "loss": 0.88059437, + "num_input_tokens_seen": 48596640, + "router_z_loss_mlp": 0.64208984, + "step": 589, + "time_per_iteration": 2.511939764022827 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075971, + "balance_loss_mlp": 1.01174104, + "epoch": 0.11350519430550211, + "flos": 651611854848.0, + "grad_norm": 0.04204370934342767, + "language_loss": 0.89311969, + "learning_rate": 0.000981829562374016, + "loss": 0.9038794, + "num_input_tokens_seen": 48669648, + "router_z_loss_mlp": 0.64208984, + "step": 590, + "time_per_iteration": 2.798734426498413 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107506, + "balance_loss_mlp": 1.01111591, + "epoch": 0.11369757599076567, + "flos": 558861754368.0, + "grad_norm": 0.04723710161718091, + "language_loss": 0.99783856, + "learning_rate": 0.0009817462450028933, + "loss": 1.00858927, + "num_input_tokens_seen": 48737392, + "router_z_loss_mlp": 0.63916016, + "step": 591, + "time_per_iteration": 2.717622756958008 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076867, + "balance_loss_mlp": 1.01316178, + "epoch": 0.11388995767602925, + "flos": 572306457600.0, + "grad_norm": 0.041300229846526024, + "language_loss": 0.87103492, + "learning_rate": 0.0009816627406012916, + "loss": 0.88180363, + "num_input_tokens_seen": 48817136, + "router_z_loss_mlp": 0.63671875, + "step": 592, + "time_per_iteration": 2.783677339553833 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077614, + "balance_loss_mlp": 1.01376593, + "epoch": 0.1140823393612928, + "flos": 741744168192.0, + "grad_norm": 0.04574882804976793, + "language_loss": 0.87044728, + "learning_rate": 0.0009815790492016295, + "loss": 0.88122344, + "num_input_tokens_seen": 48895808, + "router_z_loss_mlp": 0.63818359, + "step": 593, + "time_per_iteration": 2.920262336730957 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079358, + "balance_loss_mlp": 1.01560438, + "epoch": 0.11427472104655637, + "flos": 700252314624.0, + "grad_norm": 0.042792726491020304, + "language_loss": 0.89086539, + "learning_rate": 0.0009814951708363993, + "loss": 0.90165901, + "num_input_tokens_seen": 48967456, + "router_z_loss_mlp": 0.63720703, + "step": 594, + "time_per_iteration": 2.8244025707244873 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069794, + "balance_loss_mlp": 1.00799561, + "epoch": 0.11446710273181993, + "flos": 1480355344128.0, + "grad_norm": 0.0135056408383676, + "language_loss": 0.77990985, + "learning_rate": 0.0009814111055381654, + "loss": 0.79060781, + "num_input_tokens_seen": 49193152, + "router_z_loss_mlp": 0.6171875, + "step": 595, + "time_per_iteration": 4.779642105102539 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075224, + "balance_loss_mlp": 1.01189995, + "epoch": 0.1146594844170835, + "flos": 495913388544.0, + "grad_norm": 0.038757735955663945, + "language_loss": 0.90035105, + "learning_rate": 0.0009813268533395648, + "loss": 0.91110331, + "num_input_tokens_seen": 49260960, + "router_z_loss_mlp": 0.6328125, + "step": 596, + "time_per_iteration": 2.5933825969696045 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082133, + "balance_loss_mlp": 1.01895213, + "epoch": 0.11485186610234706, + "flos": 475791531264.0, + "grad_norm": 0.0538004660752225, + "language_loss": 0.90474582, + "learning_rate": 0.0009812424142733073, + "loss": 0.9155671, + "num_input_tokens_seen": 49327616, + "router_z_loss_mlp": 0.63134766, + "step": 597, + "time_per_iteration": 2.528027296066284 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073976, + "balance_loss_mlp": 1.01089013, + "epoch": 0.11504424778761062, + "flos": 732620313600.0, + "grad_norm": 0.03283482462688361, + "language_loss": 0.87953097, + "learning_rate": 0.000981157788372175, + "loss": 0.89027071, + "num_input_tokens_seen": 49412864, + "router_z_loss_mlp": 0.63037109, + "step": 598, + "time_per_iteration": 3.008469343185425 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074784, + "balance_loss_mlp": 1.01160276, + "epoch": 0.11523662947287418, + "flos": 546963610368.0, + "grad_norm": 0.037424804687157906, + "language_loss": 0.91041148, + "learning_rate": 0.0009810729756690223, + "loss": 0.92115927, + "num_input_tokens_seen": 49483584, + "router_z_loss_mlp": 0.63134766, + "step": 599, + "time_per_iteration": 2.75840163230896 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077312, + "balance_loss_mlp": 1.01408339, + "epoch": 0.11542901115813775, + "flos": 776388759552.0, + "grad_norm": 0.04126969924944996, + "language_loss": 0.9391377, + "learning_rate": 0.0009809879761967766, + "loss": 0.94991082, + "num_input_tokens_seen": 49563568, + "router_z_loss_mlp": 0.63183594, + "step": 600, + "time_per_iteration": 2.9511778354644775 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081843, + "balance_loss_mlp": 1.01828074, + "epoch": 0.11562139284340131, + "flos": 732213990144.0, + "grad_norm": 0.05544181306164312, + "language_loss": 0.88981479, + "learning_rate": 0.0009809027899884378, + "loss": 0.90063322, + "num_input_tokens_seen": 49640800, + "router_z_loss_mlp": 0.63525391, + "step": 601, + "time_per_iteration": 2.888591766357422 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076176, + "balance_loss_mlp": 1.01256609, + "epoch": 0.11581377452866487, + "flos": 537040714752.0, + "grad_norm": 0.03483284203155477, + "language_loss": 0.90335476, + "learning_rate": 0.0009808174170770779, + "loss": 0.9141165, + "num_input_tokens_seen": 49721872, + "router_z_loss_mlp": 0.63574219, + "step": 602, + "time_per_iteration": 2.7933802604675293 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073479, + "balance_loss_mlp": 1.01263428, + "epoch": 0.11600615621392843, + "flos": 1559214555648.0, + "grad_norm": 0.012041981792172347, + "language_loss": 0.84898245, + "learning_rate": 0.0009807318574958418, + "loss": 0.85971725, + "num_input_tokens_seen": 49951472, + "router_z_loss_mlp": 0.60742188, + "step": 603, + "time_per_iteration": 4.875667572021484 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079811, + "balance_loss_mlp": 1.01658237, + "epoch": 0.116198537899192, + "flos": 538468688640.0, + "grad_norm": 0.046063141341509364, + "language_loss": 0.95944118, + "learning_rate": 0.0009806461112779462, + "loss": 0.97023928, + "num_input_tokens_seen": 50021136, + "router_z_loss_mlp": 0.63183594, + "step": 604, + "time_per_iteration": 2.708552360534668 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077772, + "balance_loss_mlp": 1.01444781, + "epoch": 0.11639091958445556, + "flos": 455137950720.0, + "grad_norm": 0.05737724930332189, + "language_loss": 0.90764457, + "learning_rate": 0.0009805601784566814, + "loss": 0.91842222, + "num_input_tokens_seen": 50083888, + "router_z_loss_mlp": 0.6328125, + "step": 605, + "time_per_iteration": 2.545696496963501 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076475, + "balance_loss_mlp": 1.01329422, + "epoch": 0.11658330126971912, + "flos": 556152593664.0, + "grad_norm": 0.04016687987230144, + "language_loss": 0.97276044, + "learning_rate": 0.0009804740590654089, + "loss": 0.98352522, + "num_input_tokens_seen": 50151744, + "router_z_loss_mlp": 0.63134766, + "step": 606, + "time_per_iteration": 2.6464574337005615 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077132, + "balance_loss_mlp": 1.01399851, + "epoch": 0.11677568295498268, + "flos": 717601827840.0, + "grad_norm": 0.0453344941203476, + "language_loss": 0.91881627, + "learning_rate": 0.0009803877531375635, + "loss": 0.9295876, + "num_input_tokens_seen": 50221248, + "router_z_loss_mlp": 0.63085938, + "step": 607, + "time_per_iteration": 2.8467392921447754 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074228, + "balance_loss_mlp": 1.0111903, + "epoch": 0.11696806464024626, + "flos": 610899600384.0, + "grad_norm": 0.04469679718872237, + "language_loss": 0.92976171, + "learning_rate": 0.0009803012607066523, + "loss": 0.94050401, + "num_input_tokens_seen": 50293792, + "router_z_loss_mlp": 0.62988281, + "step": 608, + "time_per_iteration": 2.7587811946868896 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073203, + "balance_loss_mlp": 1.01030838, + "epoch": 0.11716044632550981, + "flos": 521416628736.0, + "grad_norm": 0.04044307397502579, + "language_loss": 0.91207683, + "learning_rate": 0.0009802145818062543, + "loss": 0.92280889, + "num_input_tokens_seen": 50367760, + "router_z_loss_mlp": 0.62841797, + "step": 609, + "time_per_iteration": 2.7623538970947266 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107199, + "balance_loss_mlp": 1.00919068, + "epoch": 0.11735282801077337, + "flos": 508489064448.0, + "grad_norm": 0.04251091083777229, + "language_loss": 0.93763256, + "learning_rate": 0.0009801277164700212, + "loss": 0.9483524, + "num_input_tokens_seen": 50435664, + "router_z_loss_mlp": 0.62744141, + "step": 610, + "time_per_iteration": 2.6250369548797607 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079805, + "balance_loss_mlp": 1.0171963, + "epoch": 0.11754520969603693, + "flos": 687837031680.0, + "grad_norm": 0.044835447829723894, + "language_loss": 0.91796255, + "learning_rate": 0.0009800406647316776, + "loss": 0.92876053, + "num_input_tokens_seen": 50514144, + "router_z_loss_mlp": 0.62548828, + "step": 611, + "time_per_iteration": 2.81438946723938 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01058731, + "balance_loss_mlp": 0.99807739, + "epoch": 0.1177375913813005, + "flos": 1545759158784.0, + "grad_norm": 0.00493114536612535, + "language_loss": 0.76914459, + "learning_rate": 0.0009799534266250196, + "loss": 0.77973187, + "num_input_tokens_seen": 50738448, + "router_z_loss_mlp": 0.60546875, + "step": 612, + "time_per_iteration": 4.795796871185303 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073024, + "balance_loss_mlp": 1.01008153, + "epoch": 0.11792997306656407, + "flos": 521538137856.0, + "grad_norm": 0.049162221556570344, + "language_loss": 0.91035461, + "learning_rate": 0.000979866002183916, + "loss": 0.92108488, + "num_input_tokens_seen": 50809328, + "router_z_loss_mlp": 0.62890625, + "step": 613, + "time_per_iteration": 2.6470768451690674 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071385, + "balance_loss_mlp": 1.00820458, + "epoch": 0.11812235475182763, + "flos": 667489652736.0, + "grad_norm": 0.0453482214384289, + "language_loss": 0.92239928, + "learning_rate": 0.0009797783914423082, + "loss": 0.93311322, + "num_input_tokens_seen": 50887728, + "router_z_loss_mlp": 0.63134766, + "step": 614, + "time_per_iteration": 2.8020856380462646 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107539, + "balance_loss_mlp": 1.01220894, + "epoch": 0.11831473643709119, + "flos": 622505148672.0, + "grad_norm": 0.04034391423157231, + "language_loss": 0.86097217, + "learning_rate": 0.0009796905944342094, + "loss": 0.87172604, + "num_input_tokens_seen": 50966160, + "router_z_loss_mlp": 0.63134766, + "step": 615, + "time_per_iteration": 2.839617967605591 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079046, + "balance_loss_mlp": 1.0160079, + "epoch": 0.11850711812235475, + "flos": 457695466752.0, + "grad_norm": 0.03330066749319758, + "language_loss": 0.89949274, + "learning_rate": 0.0009796026111937057, + "loss": 0.91028321, + "num_input_tokens_seen": 51035712, + "router_z_loss_mlp": 0.62988281, + "step": 616, + "time_per_iteration": 2.6211540699005127 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077601, + "balance_loss_mlp": 1.0150882, + "epoch": 0.11869949980761832, + "flos": 514928057856.0, + "grad_norm": 0.034464018290856886, + "language_loss": 0.90251315, + "learning_rate": 0.0009795144417549552, + "loss": 0.91328913, + "num_input_tokens_seen": 51108656, + "router_z_loss_mlp": 0.62451172, + "step": 617, + "time_per_iteration": 2.6946897506713867 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080332, + "balance_loss_mlp": 1.01815259, + "epoch": 0.11889188149288188, + "flos": 536157103104.0, + "grad_norm": 0.035314864293198016, + "language_loss": 0.91583192, + "learning_rate": 0.0009794260861521883, + "loss": 0.92663527, + "num_input_tokens_seen": 51185552, + "router_z_loss_mlp": 0.62109375, + "step": 618, + "time_per_iteration": 2.77822208404541 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081689, + "balance_loss_mlp": 1.01979554, + "epoch": 0.11908426317814544, + "flos": 499645166592.0, + "grad_norm": 0.042334404758790994, + "language_loss": 0.88659471, + "learning_rate": 0.0009793375444197075, + "loss": 0.89741158, + "num_input_tokens_seen": 51255808, + "router_z_loss_mlp": 0.61816406, + "step": 619, + "time_per_iteration": 2.6199400424957275 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086152, + "balance_loss_mlp": 1.02416277, + "epoch": 0.119276644863409, + "flos": 661068155904.0, + "grad_norm": 0.043937618111938345, + "language_loss": 0.86906028, + "learning_rate": 0.000979248816591888, + "loss": 0.87992179, + "num_input_tokens_seen": 51329408, + "router_z_loss_mlp": 0.61914062, + "step": 620, + "time_per_iteration": 2.789858341217041 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081653, + "balance_loss_mlp": 1.01947308, + "epoch": 0.11946902654867257, + "flos": 760153237248.0, + "grad_norm": 0.04701199265522289, + "language_loss": 0.87992656, + "learning_rate": 0.0009791599027031766, + "loss": 0.89074314, + "num_input_tokens_seen": 51408784, + "router_z_loss_mlp": 0.62109375, + "step": 621, + "time_per_iteration": 3.026487350463867 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074317, + "balance_loss_mlp": 1.01223314, + "epoch": 0.11966140823393613, + "flos": 682214575872.0, + "grad_norm": 0.0506686420393155, + "language_loss": 0.88143325, + "learning_rate": 0.0009790708027880932, + "loss": 0.89217639, + "num_input_tokens_seen": 51482592, + "router_z_loss_mlp": 0.62011719, + "step": 622, + "time_per_iteration": 2.8321774005889893 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081512, + "balance_loss_mlp": 1.02143097, + "epoch": 0.11985378991919969, + "flos": 1454300938752.0, + "grad_norm": 0.023212611497014573, + "language_loss": 0.77427292, + "learning_rate": 0.0009789815168812293, + "loss": 0.78508806, + "num_input_tokens_seen": 51712240, + "router_z_loss_mlp": 0.59960938, + "step": 623, + "time_per_iteration": 4.862462759017944 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071261, + "balance_loss_mlp": 1.00936747, + "epoch": 0.12004617160446325, + "flos": 528899626752.0, + "grad_norm": 0.04437858339694968, + "language_loss": 0.95209736, + "learning_rate": 0.0009788920450172487, + "loss": 0.96280998, + "num_input_tokens_seen": 51781440, + "router_z_loss_mlp": 0.61816406, + "step": 624, + "time_per_iteration": 2.630764961242676 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078232, + "balance_loss_mlp": 1.01619518, + "epoch": 0.12023855328972682, + "flos": 475177182720.0, + "grad_norm": 0.048047229360432486, + "language_loss": 0.92430472, + "learning_rate": 0.0009788023872308875, + "loss": 0.93508708, + "num_input_tokens_seen": 51845424, + "router_z_loss_mlp": 0.61962891, + "step": 625, + "time_per_iteration": 2.5534780025482178 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076523, + "balance_loss_mlp": 1.01682281, + "epoch": 0.12043093497499038, + "flos": 1535054718720.0, + "grad_norm": 0.022021305117703366, + "language_loss": 0.75428998, + "learning_rate": 0.0009787125435569539, + "loss": 0.7650553, + "num_input_tokens_seen": 52076496, + "router_z_loss_mlp": 0.59570312, + "step": 626, + "time_per_iteration": 4.738527536392212 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108475, + "balance_loss_mlp": 1.023, + "epoch": 0.12062331666025394, + "flos": 540915389184.0, + "grad_norm": 0.04663901515177362, + "language_loss": 0.9603011, + "learning_rate": 0.0009786225140303285, + "loss": 0.97114861, + "num_input_tokens_seen": 52143072, + "router_z_loss_mlp": 0.61669922, + "step": 627, + "time_per_iteration": 2.634160280227661 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085928, + "balance_loss_mlp": 1.02427304, + "epoch": 0.1208156983455175, + "flos": 513000441600.0, + "grad_norm": 0.042540459475059536, + "language_loss": 0.94019556, + "learning_rate": 0.0009785322986859634, + "loss": 0.95105481, + "num_input_tokens_seen": 52211888, + "router_z_loss_mlp": 0.61572266, + "step": 628, + "time_per_iteration": 2.681070327758789 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078772, + "balance_loss_mlp": 1.01725972, + "epoch": 0.12100808003078108, + "flos": 597590012160.0, + "grad_norm": 0.03866803919075334, + "language_loss": 0.94614279, + "learning_rate": 0.0009784418975588838, + "loss": 0.95693052, + "num_input_tokens_seen": 52283696, + "router_z_loss_mlp": 0.61425781, + "step": 629, + "time_per_iteration": 2.7337839603424072 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073027, + "balance_loss_mlp": 1.01132393, + "epoch": 0.12120046171604464, + "flos": 524067463680.0, + "grad_norm": 0.03279843121618067, + "language_loss": 0.94581258, + "learning_rate": 0.0009783513106841862, + "loss": 0.95654285, + "num_input_tokens_seen": 52358624, + "router_z_loss_mlp": 0.61621094, + "step": 630, + "time_per_iteration": 2.702615737915039 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080086, + "balance_loss_mlp": 1.01981354, + "epoch": 0.1213928434013082, + "flos": 1557910036224.0, + "grad_norm": 0.01502333088768157, + "language_loss": 0.76732707, + "learning_rate": 0.00097826053809704, + "loss": 0.77812791, + "num_input_tokens_seen": 52591248, + "router_z_loss_mlp": 0.6015625, + "step": 631, + "time_per_iteration": 4.998409032821655 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080339, + "balance_loss_mlp": 1.01835024, + "epoch": 0.12158522508657175, + "flos": 496388731392.0, + "grad_norm": 0.04174070683076465, + "language_loss": 0.89320499, + "learning_rate": 0.0009781695798326854, + "loss": 0.90400839, + "num_input_tokens_seen": 52659920, + "router_z_loss_mlp": 0.61914062, + "step": 632, + "time_per_iteration": 2.5908379554748535 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079195, + "balance_loss_mlp": 1.01744485, + "epoch": 0.12177760677183531, + "flos": 476590572288.0, + "grad_norm": 0.04165368210868703, + "language_loss": 0.89744723, + "learning_rate": 0.0009780784359264365, + "loss": 0.90823919, + "num_input_tokens_seen": 52728832, + "router_z_loss_mlp": 0.61669922, + "step": 633, + "time_per_iteration": 2.689202070236206 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073334, + "balance_loss_mlp": 1.01382446, + "epoch": 0.12196998845709889, + "flos": 1471787512320.0, + "grad_norm": 0.011333314510513573, + "language_loss": 0.74188697, + "learning_rate": 0.0009779871064136778, + "loss": 0.75262028, + "num_input_tokens_seen": 52949776, + "router_z_loss_mlp": 0.59375, + "step": 634, + "time_per_iteration": 4.762145757675171 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073337, + "balance_loss_mlp": 1.01187229, + "epoch": 0.12216237014236245, + "flos": 587749741824.0, + "grad_norm": 0.03178889939160208, + "language_loss": 0.88649213, + "learning_rate": 0.000977895591329867, + "loss": 0.8972255, + "num_input_tokens_seen": 53027184, + "router_z_loss_mlp": 0.61376953, + "step": 635, + "time_per_iteration": 2.7996504306793213 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075051, + "balance_loss_mlp": 1.01372933, + "epoch": 0.12235475182762601, + "flos": 599107414272.0, + "grad_norm": 0.038321985001081305, + "language_loss": 0.88459468, + "learning_rate": 0.000977803890710533, + "loss": 0.89534515, + "num_input_tokens_seen": 53101072, + "router_z_loss_mlp": 0.61230469, + "step": 636, + "time_per_iteration": 2.7200405597686768 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072884, + "balance_loss_mlp": 1.0117538, + "epoch": 0.12254713351288957, + "flos": 498761554944.0, + "grad_norm": 0.03313527469264444, + "language_loss": 0.94808865, + "learning_rate": 0.0009777120045912774, + "loss": 0.95881748, + "num_input_tokens_seen": 53172992, + "router_z_loss_mlp": 0.61035156, + "step": 637, + "time_per_iteration": 2.6253507137298584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072019, + "balance_loss_mlp": 1.01084125, + "epoch": 0.12273951519815314, + "flos": 606981184512.0, + "grad_norm": 0.04065251745031248, + "language_loss": 0.91558111, + "learning_rate": 0.0009776199330077736, + "loss": 0.92630136, + "num_input_tokens_seen": 53248256, + "router_z_loss_mlp": 0.61083984, + "step": 638, + "time_per_iteration": 2.724416732788086 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069779, + "balance_loss_mlp": 1.0086484, + "epoch": 0.1229318968834167, + "flos": 598985905152.0, + "grad_norm": 0.04427923240085457, + "language_loss": 0.94062102, + "learning_rate": 0.0009775276759957667, + "loss": 0.9513188, + "num_input_tokens_seen": 53318960, + "router_z_loss_mlp": 0.61035156, + "step": 639, + "time_per_iteration": 2.756307601928711 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070032, + "balance_loss_mlp": 1.00851989, + "epoch": 0.12312427856868026, + "flos": 679589985792.0, + "grad_norm": 0.04435656949952303, + "language_loss": 0.91938198, + "learning_rate": 0.0009774352335910745, + "loss": 0.93008226, + "num_input_tokens_seen": 53389120, + "router_z_loss_mlp": 0.61425781, + "step": 640, + "time_per_iteration": 2.8135974407196045 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072128, + "balance_loss_mlp": 1.01095021, + "epoch": 0.12331666025394382, + "flos": 610044178944.0, + "grad_norm": 0.03352322480141845, + "language_loss": 0.95842457, + "learning_rate": 0.000977342605829586, + "loss": 0.96914589, + "num_input_tokens_seen": 53459056, + "router_z_loss_mlp": 0.61083984, + "step": 641, + "time_per_iteration": 2.734373092651367 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107341, + "balance_loss_mlp": 1.01208854, + "epoch": 0.12350904193920739, + "flos": 763841273856.0, + "grad_norm": 0.04166007448412618, + "language_loss": 0.87458932, + "learning_rate": 0.0009772497927472623, + "loss": 0.88532341, + "num_input_tokens_seen": 53541552, + "router_z_loss_mlp": 0.61230469, + "step": 642, + "time_per_iteration": 3.069495677947998 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107346, + "balance_loss_mlp": 1.01199543, + "epoch": 0.12370142362447095, + "flos": 542050767360.0, + "grad_norm": 0.04189965725350253, + "language_loss": 0.86664522, + "learning_rate": 0.0009771567943801368, + "loss": 0.87737978, + "num_input_tokens_seen": 53611520, + "router_z_loss_mlp": 0.61376953, + "step": 643, + "time_per_iteration": 2.6783955097198486 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071185, + "balance_loss_mlp": 1.01000655, + "epoch": 0.12389380530973451, + "flos": 549253808640.0, + "grad_norm": 0.03907898995026106, + "language_loss": 0.90534973, + "learning_rate": 0.0009770636107643152, + "loss": 0.91606158, + "num_input_tokens_seen": 53683888, + "router_z_loss_mlp": 0.61083984, + "step": 644, + "time_per_iteration": 2.7792532444000244 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107442, + "balance_loss_mlp": 1.01343274, + "epoch": 0.12408618699499807, + "flos": 541353793536.0, + "grad_norm": 0.03775088580197231, + "language_loss": 0.89077818, + "learning_rate": 0.0009769702419359738, + "loss": 0.9015224, + "num_input_tokens_seen": 53751888, + "router_z_loss_mlp": 0.60888672, + "step": 645, + "time_per_iteration": 2.6660075187683105 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071725, + "balance_loss_mlp": 1.01083338, + "epoch": 0.12427856868026164, + "flos": 747160544256.0, + "grad_norm": 0.03491310842571494, + "language_loss": 0.90435565, + "learning_rate": 0.000976876687931362, + "loss": 0.91507292, + "num_input_tokens_seen": 53827648, + "router_z_loss_mlp": 0.60791016, + "step": 646, + "time_per_iteration": 3.028578758239746 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074215, + "balance_loss_mlp": 1.01332271, + "epoch": 0.1244709503655252, + "flos": 534745658880.0, + "grad_norm": 0.04739554944994068, + "language_loss": 0.86433625, + "learning_rate": 0.0009767829487868005, + "loss": 0.87507832, + "num_input_tokens_seen": 53896400, + "router_z_loss_mlp": 0.60791016, + "step": 647, + "time_per_iteration": 2.6323471069335938 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075713, + "balance_loss_mlp": 1.01472592, + "epoch": 0.12466333205078876, + "flos": 509112161280.0, + "grad_norm": 0.0390766896094967, + "language_loss": 0.89632404, + "learning_rate": 0.000976689024538682, + "loss": 0.90708113, + "num_input_tokens_seen": 53965904, + "router_z_loss_mlp": 0.60888672, + "step": 648, + "time_per_iteration": 2.6233997344970703 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069043, + "balance_loss_mlp": 1.00819838, + "epoch": 0.12485571373605232, + "flos": 682640341248.0, + "grad_norm": 0.04106035596266842, + "language_loss": 0.87981439, + "learning_rate": 0.0009765949152234716, + "loss": 0.89050484, + "num_input_tokens_seen": 54049792, + "router_z_loss_mlp": 0.60742188, + "step": 649, + "time_per_iteration": 2.9135711193084717 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064964, + "balance_loss_mlp": 1.00659943, + "epoch": 0.1250480954213159, + "flos": 1333201377024.0, + "grad_norm": 0.013063081234142807, + "language_loss": 0.78686082, + "learning_rate": 0.0009765006208777055, + "loss": 0.79751045, + "num_input_tokens_seen": 54262432, + "router_z_loss_mlp": 0.58203125, + "step": 650, + "time_per_iteration": 4.696362495422363 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069932, + "balance_loss_mlp": 1.0093261, + "epoch": 0.12524047710657946, + "flos": 940198178304.0, + "grad_norm": 0.03723688894295025, + "language_loss": 0.82869852, + "learning_rate": 0.0009764061415379919, + "loss": 0.83939779, + "num_input_tokens_seen": 54351568, + "router_z_loss_mlp": 0.60498047, + "step": 651, + "time_per_iteration": 3.287029504776001 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071729, + "balance_loss_mlp": 1.01078951, + "epoch": 0.12543285879184302, + "flos": 514901812992.0, + "grad_norm": 0.03842788822410913, + "language_loss": 0.90123397, + "learning_rate": 0.0009763114772410109, + "loss": 0.91195124, + "num_input_tokens_seen": 54418944, + "router_z_loss_mlp": 0.60839844, + "step": 652, + "time_per_iteration": 2.5726470947265625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071215, + "balance_loss_mlp": 1.01075244, + "epoch": 0.12562524047710658, + "flos": 719684001024.0, + "grad_norm": 0.03790395950388449, + "language_loss": 0.88320071, + "learning_rate": 0.0009762166280235146, + "loss": 0.89391285, + "num_input_tokens_seen": 54495312, + "router_z_loss_mlp": 0.60351562, + "step": 653, + "time_per_iteration": 2.9728682041168213 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073992, + "balance_loss_mlp": 1.01372027, + "epoch": 0.12581762216237014, + "flos": 564799160064.0, + "grad_norm": 0.039966468352906216, + "language_loss": 0.88308495, + "learning_rate": 0.0009761215939223267, + "loss": 0.89382488, + "num_input_tokens_seen": 54566832, + "router_z_loss_mlp": 0.6015625, + "step": 654, + "time_per_iteration": 2.7552366256713867 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071616, + "balance_loss_mlp": 1.01100981, + "epoch": 0.1260100038476337, + "flos": 482901253632.0, + "grad_norm": 0.045851790315233704, + "language_loss": 0.87049586, + "learning_rate": 0.0009760263749743428, + "loss": 0.88121206, + "num_input_tokens_seen": 54632128, + "router_z_loss_mlp": 0.60498047, + "step": 655, + "time_per_iteration": 2.5859339237213135 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073482, + "balance_loss_mlp": 1.01301908, + "epoch": 0.12620238553289725, + "flos": 576702161664.0, + "grad_norm": 0.03680601760412016, + "language_loss": 0.91127861, + "learning_rate": 0.0009759309712165299, + "loss": 0.9220134, + "num_input_tokens_seen": 54707600, + "router_z_loss_mlp": 0.60351562, + "step": 656, + "time_per_iteration": 2.7411043643951416 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069023, + "balance_loss_mlp": 1.00841653, + "epoch": 0.12639476721816084, + "flos": 532186197504.0, + "grad_norm": 0.050748048847022796, + "language_loss": 0.94208288, + "learning_rate": 0.0009758353826859272, + "loss": 0.95277309, + "num_input_tokens_seen": 54776704, + "router_z_loss_mlp": 0.60498047, + "step": 657, + "time_per_iteration": 2.5851681232452393 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071924, + "balance_loss_mlp": 1.01117456, + "epoch": 0.1265871489034244, + "flos": 691232472576.0, + "grad_norm": 0.04052834214006204, + "language_loss": 0.90056133, + "learning_rate": 0.0009757396094196456, + "loss": 0.91128063, + "num_input_tokens_seen": 54851744, + "router_z_loss_mlp": 0.60644531, + "step": 658, + "time_per_iteration": 2.9119739532470703 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071395, + "balance_loss_mlp": 1.01083672, + "epoch": 0.12677953058868796, + "flos": 538243166976.0, + "grad_norm": 0.03305987481805703, + "language_loss": 0.85138786, + "learning_rate": 0.0009756436514548673, + "loss": 0.86210179, + "num_input_tokens_seen": 54932576, + "router_z_loss_mlp": 0.60449219, + "step": 659, + "time_per_iteration": 2.8146860599517822 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070631, + "balance_loss_mlp": 1.01021552, + "epoch": 0.12697191227395152, + "flos": 520120857600.0, + "grad_norm": 0.03322369158928612, + "language_loss": 0.89052176, + "learning_rate": 0.0009755475088288466, + "loss": 0.90122807, + "num_input_tokens_seen": 55007296, + "router_z_loss_mlp": 0.60302734, + "step": 660, + "time_per_iteration": 2.7092652320861816 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070351, + "balance_loss_mlp": 1.01007843, + "epoch": 0.12716429395921508, + "flos": 567666768384.0, + "grad_norm": 0.0427017471912124, + "language_loss": 0.91535795, + "learning_rate": 0.0009754511815789095, + "loss": 0.92606151, + "num_input_tokens_seen": 55079312, + "router_z_loss_mlp": 0.6015625, + "step": 661, + "time_per_iteration": 2.790198564529419 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068315, + "balance_loss_mlp": 1.00809085, + "epoch": 0.12735667564447864, + "flos": 515142885888.0, + "grad_norm": 0.0409493229321676, + "language_loss": 0.8685838, + "learning_rate": 0.0009753546697424533, + "loss": 0.87926698, + "num_input_tokens_seen": 55151824, + "router_z_loss_mlp": 0.60107422, + "step": 662, + "time_per_iteration": 2.6784565448760986 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070378, + "balance_loss_mlp": 1.01020074, + "epoch": 0.1275490573297422, + "flos": 542321975808.0, + "grad_norm": 0.039351291895580044, + "language_loss": 0.91270494, + "learning_rate": 0.0009752579733569475, + "loss": 0.92340875, + "num_input_tokens_seen": 55224368, + "router_z_loss_mlp": 0.60058594, + "step": 663, + "time_per_iteration": 2.679379940032959 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071762, + "balance_loss_mlp": 1.01358795, + "epoch": 0.12774143901500576, + "flos": 1562027728896.0, + "grad_norm": 0.016936801864205438, + "language_loss": 0.74881387, + "learning_rate": 0.0009751610924599328, + "loss": 0.7595315, + "num_input_tokens_seen": 55453584, + "router_z_loss_mlp": 0.58007812, + "step": 664, + "time_per_iteration": 4.936127424240112 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070961, + "balance_loss_mlp": 1.01092672, + "epoch": 0.12793382070026935, + "flos": 614874396672.0, + "grad_norm": 0.047422479810277696, + "language_loss": 0.90634137, + "learning_rate": 0.0009750640270890217, + "loss": 0.91705096, + "num_input_tokens_seen": 55528000, + "router_z_loss_mlp": 0.59912109, + "step": 665, + "time_per_iteration": 2.712202548980713 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073503, + "balance_loss_mlp": 1.01361179, + "epoch": 0.1281262023855329, + "flos": 709118566656.0, + "grad_norm": 0.04721256261198653, + "language_loss": 0.97348696, + "learning_rate": 0.0009749667772818983, + "loss": 0.98422199, + "num_input_tokens_seen": 55612416, + "router_z_loss_mlp": 0.59765625, + "step": 666, + "time_per_iteration": 2.959563732147217 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065521, + "balance_loss_mlp": 1.00791931, + "epoch": 0.12831858407079647, + "flos": 1428185295360.0, + "grad_norm": 0.00958948420866419, + "language_loss": 0.76935941, + "learning_rate": 0.0009748693430763185, + "loss": 0.78001463, + "num_input_tokens_seen": 55843664, + "router_z_loss_mlp": 0.57421875, + "step": 667, + "time_per_iteration": 4.823887825012207 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071916, + "balance_loss_mlp": 1.01259768, + "epoch": 0.12851096575606002, + "flos": 450019027968.0, + "grad_norm": 0.04331482152431362, + "language_loss": 0.96237415, + "learning_rate": 0.0009747717245101093, + "loss": 0.97309327, + "num_input_tokens_seen": 55909072, + "router_z_loss_mlp": 0.59179688, + "step": 668, + "time_per_iteration": 2.5234646797180176 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071255, + "balance_loss_mlp": 1.01193655, + "epoch": 0.12870334744132358, + "flos": 480910454016.0, + "grad_norm": 0.040015395826151615, + "language_loss": 0.86231172, + "learning_rate": 0.00097467392162117, + "loss": 0.87302423, + "num_input_tokens_seen": 55978544, + "router_z_loss_mlp": 0.59179688, + "step": 669, + "time_per_iteration": 2.620121717453003 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073025, + "balance_loss_mlp": 1.01342034, + "epoch": 0.12889572912658714, + "flos": 640152115200.0, + "grad_norm": 0.03307407171369126, + "language_loss": 0.91950834, + "learning_rate": 0.0009745759344474708, + "loss": 0.9302386, + "num_input_tokens_seen": 56054144, + "router_z_loss_mlp": 0.59472656, + "step": 670, + "time_per_iteration": 2.834406852722168 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070894, + "balance_loss_mlp": 1.01114607, + "epoch": 0.1290881108118507, + "flos": 510955206912.0, + "grad_norm": 0.03904079329345599, + "language_loss": 0.90752548, + "learning_rate": 0.0009744777630270536, + "loss": 0.91823441, + "num_input_tokens_seen": 56120960, + "router_z_loss_mlp": 0.59619141, + "step": 671, + "time_per_iteration": 2.5841259956359863 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069609, + "balance_loss_mlp": 1.00967062, + "epoch": 0.12928049249711426, + "flos": 672291680256.0, + "grad_norm": 0.0427916369984872, + "language_loss": 0.94394779, + "learning_rate": 0.000974379407398032, + "loss": 0.95464385, + "num_input_tokens_seen": 56202560, + "router_z_loss_mlp": 0.59814453, + "step": 672, + "time_per_iteration": 2.8698208332061768 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072071, + "balance_loss_mlp": 1.0120368, + "epoch": 0.12947287418237785, + "flos": 795000017664.0, + "grad_norm": 0.03399258645873994, + "language_loss": 0.83039552, + "learning_rate": 0.0009742808675985913, + "loss": 0.84111625, + "num_input_tokens_seen": 56289456, + "router_z_loss_mlp": 0.59912109, + "step": 673, + "time_per_iteration": 3.1018688678741455 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067279, + "balance_loss_mlp": 1.00729334, + "epoch": 0.1296652558676414, + "flos": 486448339200.0, + "grad_norm": 0.039807509100232605, + "language_loss": 0.91899526, + "learning_rate": 0.0009741821436669876, + "loss": 0.92966807, + "num_input_tokens_seen": 56354480, + "router_z_loss_mlp": 0.59863281, + "step": 674, + "time_per_iteration": 2.6348536014556885 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068278, + "balance_loss_mlp": 1.00853038, + "epoch": 0.12985763755290497, + "flos": 454393344768.0, + "grad_norm": 0.044170807310258554, + "language_loss": 0.93403888, + "learning_rate": 0.0009740832356415492, + "loss": 0.9447217, + "num_input_tokens_seen": 56418944, + "router_z_loss_mlp": 0.59619141, + "step": 675, + "time_per_iteration": 2.483262538909912 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072639, + "balance_loss_mlp": 1.01265311, + "epoch": 0.13005001923816853, + "flos": 826435805952.0, + "grad_norm": 0.043859966784303914, + "language_loss": 0.89693773, + "learning_rate": 0.0009739841435606756, + "loss": 0.90766412, + "num_input_tokens_seen": 56492368, + "router_z_loss_mlp": 0.59863281, + "step": 676, + "time_per_iteration": 2.992385149002075 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066961, + "balance_loss_mlp": 1.00726056, + "epoch": 0.1302424009234321, + "flos": 532481705472.0, + "grad_norm": 0.03559705023164985, + "language_loss": 0.91210669, + "learning_rate": 0.0009738848674628377, + "loss": 0.92277622, + "num_input_tokens_seen": 56568128, + "router_z_loss_mlp": 0.59570312, + "step": 677, + "time_per_iteration": 2.766364574432373 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106721, + "balance_loss_mlp": 1.00765288, + "epoch": 0.13043478260869565, + "flos": 526917575424.0, + "grad_norm": 0.03838556287658105, + "language_loss": 0.90382779, + "learning_rate": 0.000973785407386578, + "loss": 0.91449988, + "num_input_tokens_seen": 56646448, + "router_z_loss_mlp": 0.59423828, + "step": 678, + "time_per_iteration": 2.772854804992676 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070658, + "balance_loss_mlp": 1.01076782, + "epoch": 0.1306271642939592, + "flos": 627417991680.0, + "grad_norm": 0.03509098765963207, + "language_loss": 0.88142246, + "learning_rate": 0.0009736857633705103, + "loss": 0.89212906, + "num_input_tokens_seen": 56732080, + "router_z_loss_mlp": 0.59765625, + "step": 679, + "time_per_iteration": 2.851567268371582 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075866, + "balance_loss_mlp": 1.01602292, + "epoch": 0.13081954597922277, + "flos": 551841460224.0, + "grad_norm": 0.03859467755451503, + "language_loss": 0.94306064, + "learning_rate": 0.0009735859354533196, + "loss": 0.95381933, + "num_input_tokens_seen": 56804432, + "router_z_loss_mlp": 0.59716797, + "step": 680, + "time_per_iteration": 2.6908183097839355 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070228, + "balance_loss_mlp": 1.01038456, + "epoch": 0.13101192766448633, + "flos": 537956407296.0, + "grad_norm": 0.04695623305024525, + "language_loss": 0.92768431, + "learning_rate": 0.0009734859236737628, + "loss": 0.93838656, + "num_input_tokens_seen": 56872512, + "router_z_loss_mlp": 0.59716797, + "step": 681, + "time_per_iteration": 2.618556261062622 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065875, + "balance_loss_mlp": 1.00631785, + "epoch": 0.13120430934974991, + "flos": 504514268160.0, + "grad_norm": 0.03771498494962771, + "language_loss": 0.94425803, + "learning_rate": 0.0009733857280706678, + "loss": 0.95491678, + "num_input_tokens_seen": 56940928, + "router_z_loss_mlp": 0.59423828, + "step": 682, + "time_per_iteration": 2.607445240020752 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068204, + "balance_loss_mlp": 1.00883758, + "epoch": 0.13139669103501347, + "flos": 615423616512.0, + "grad_norm": 0.040497909024236244, + "language_loss": 0.85748106, + "learning_rate": 0.000973285348682934, + "loss": 0.86816311, + "num_input_tokens_seen": 57012736, + "router_z_loss_mlp": 0.59228516, + "step": 683, + "time_per_iteration": 2.749258518218994 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064545, + "balance_loss_mlp": 1.00846863, + "epoch": 0.13158907272027703, + "flos": 1488218420736.0, + "grad_norm": 0.017735586482065788, + "language_loss": 0.77898371, + "learning_rate": 0.0009731847855495323, + "loss": 0.78962922, + "num_input_tokens_seen": 57243136, + "router_z_loss_mlp": 0.5625, + "step": 684, + "time_per_iteration": 4.792337894439697 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069614, + "balance_loss_mlp": 1.01053405, + "epoch": 0.1317814544055406, + "flos": 987119046912.0, + "grad_norm": 0.04121230716493085, + "language_loss": 0.86815995, + "learning_rate": 0.0009730840387095046, + "loss": 0.87885606, + "num_input_tokens_seen": 57336160, + "router_z_loss_mlp": 0.58935547, + "step": 685, + "time_per_iteration": 3.324737071990967 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068902, + "balance_loss_mlp": 1.00972676, + "epoch": 0.13197383609080415, + "flos": 612629885184.0, + "grad_norm": 0.03769323902360627, + "language_loss": 0.91733027, + "learning_rate": 0.0009729831082019642, + "loss": 0.92801929, + "num_input_tokens_seen": 57418976, + "router_z_loss_mlp": 0.59033203, + "step": 686, + "time_per_iteration": 2.883368968963623 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069054, + "balance_loss_mlp": 1.0096879, + "epoch": 0.1321662177760677, + "flos": 495555664128.0, + "grad_norm": 0.03344682577786829, + "language_loss": 0.90060174, + "learning_rate": 0.0009728819940660958, + "loss": 0.91129231, + "num_input_tokens_seen": 57490288, + "router_z_loss_mlp": 0.59228516, + "step": 687, + "time_per_iteration": 2.7771294116973877 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069131, + "balance_loss_mlp": 1.00971675, + "epoch": 0.13235859946133127, + "flos": 496844632320.0, + "grad_norm": 0.041743180753116546, + "language_loss": 0.8673048, + "learning_rate": 0.0009727806963411557, + "loss": 0.87799615, + "num_input_tokens_seen": 57556064, + "router_z_loss_mlp": 0.59277344, + "step": 688, + "time_per_iteration": 2.5879924297332764 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069414, + "balance_loss_mlp": 1.00971425, + "epoch": 0.13255098114659483, + "flos": 512768116992.0, + "grad_norm": 0.035278095584539565, + "language_loss": 0.88457793, + "learning_rate": 0.000972679215066471, + "loss": 0.89527214, + "num_input_tokens_seen": 57627248, + "router_z_loss_mlp": 0.59570312, + "step": 689, + "time_per_iteration": 2.6660075187683105 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067632, + "balance_loss_mlp": 1.00826621, + "epoch": 0.13274336283185842, + "flos": 548400332544.0, + "grad_norm": 0.043703661342582356, + "language_loss": 1.0036962, + "learning_rate": 0.0009725775502814401, + "loss": 1.01437247, + "num_input_tokens_seen": 57694832, + "router_z_loss_mlp": 0.59228516, + "step": 690, + "time_per_iteration": 2.580975294113159 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072547, + "balance_loss_mlp": 1.01313293, + "epoch": 0.13293574451712198, + "flos": 642003909120.0, + "grad_norm": 0.041755939912029, + "language_loss": 0.86554468, + "learning_rate": 0.0009724757020255327, + "loss": 0.87627012, + "num_input_tokens_seen": 57771776, + "router_z_loss_mlp": 0.59277344, + "step": 691, + "time_per_iteration": 2.895805835723877 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074323, + "balance_loss_mlp": 1.01533794, + "epoch": 0.13312812620238554, + "flos": 492470315520.0, + "grad_norm": 0.04584738151589033, + "language_loss": 0.8907311, + "learning_rate": 0.0009723736703382902, + "loss": 0.90147436, + "num_input_tokens_seen": 57836272, + "router_z_loss_mlp": 0.58837891, + "step": 692, + "time_per_iteration": 2.593621253967285 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073259, + "balance_loss_mlp": 1.01427472, + "epoch": 0.1333205078876491, + "flos": 509950086144.0, + "grad_norm": 0.042207641511909956, + "language_loss": 0.84734881, + "learning_rate": 0.0009722714552593244, + "loss": 0.85808134, + "num_input_tokens_seen": 57907232, + "router_z_loss_mlp": 0.58837891, + "step": 693, + "time_per_iteration": 2.6628286838531494 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069094, + "balance_loss_mlp": 1.01010931, + "epoch": 0.13351288957291266, + "flos": 419592251136.0, + "grad_norm": 0.04342856140262568, + "language_loss": 0.95545483, + "learning_rate": 0.000972169056828319, + "loss": 0.96614575, + "num_input_tokens_seen": 57969808, + "router_z_loss_mlp": 0.58837891, + "step": 694, + "time_per_iteration": 2.491511821746826 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068144, + "balance_loss_mlp": 1.00896847, + "epoch": 0.13370527125817622, + "flos": 617051834112.0, + "grad_norm": 0.03328111889388194, + "language_loss": 0.87929142, + "learning_rate": 0.0009720664750850283, + "loss": 0.88997287, + "num_input_tokens_seen": 58042944, + "router_z_loss_mlp": 0.59033203, + "step": 695, + "time_per_iteration": 2.802238941192627 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066112, + "balance_loss_mlp": 1.00693631, + "epoch": 0.13389765294343978, + "flos": 627170115840.0, + "grad_norm": 0.04111883948503256, + "language_loss": 0.94899035, + "learning_rate": 0.0009719637100692784, + "loss": 0.95965147, + "num_input_tokens_seen": 58116080, + "router_z_loss_mlp": 0.59033203, + "step": 696, + "time_per_iteration": 2.752716541290283 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066724, + "balance_loss_mlp": 1.00764382, + "epoch": 0.13409003462870334, + "flos": 610897655040.0, + "grad_norm": 0.03903466400724949, + "language_loss": 0.84625083, + "learning_rate": 0.0009718607618209661, + "loss": 0.85691804, + "num_input_tokens_seen": 58197616, + "router_z_loss_mlp": 0.58935547, + "step": 697, + "time_per_iteration": 2.8612687587738037 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067901, + "balance_loss_mlp": 1.00915492, + "epoch": 0.13428241631396692, + "flos": 685088987136.0, + "grad_norm": 0.03548160791415639, + "language_loss": 0.8885181, + "learning_rate": 0.0009717576303800595, + "loss": 0.89919716, + "num_input_tokens_seen": 58280480, + "router_z_loss_mlp": 0.5859375, + "step": 698, + "time_per_iteration": 3.046081304550171 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067451, + "balance_loss_mlp": 1.00870502, + "epoch": 0.13447479799923048, + "flos": 509819828736.0, + "grad_norm": 0.04099621387271608, + "language_loss": 0.8689754, + "learning_rate": 0.0009716543157865975, + "loss": 0.87964994, + "num_input_tokens_seen": 58352464, + "router_z_loss_mlp": 0.5859375, + "step": 699, + "time_per_iteration": 2.7116739749908447 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067445, + "balance_loss_mlp": 1.00893724, + "epoch": 0.13466717968449404, + "flos": 899060158464.0, + "grad_norm": 0.03800712734159662, + "language_loss": 0.8517018, + "learning_rate": 0.0009715508180806907, + "loss": 0.86237621, + "num_input_tokens_seen": 58437216, + "router_z_loss_mlp": 0.58349609, + "step": 700, + "time_per_iteration": 3.184324026107788 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066328, + "balance_loss_mlp": 1.00777256, + "epoch": 0.1348595613697576, + "flos": 991695552768.0, + "grad_norm": 0.036541360765650906, + "language_loss": 0.91219282, + "learning_rate": 0.0009714471373025202, + "loss": 0.92285609, + "num_input_tokens_seen": 58533152, + "router_z_loss_mlp": 0.58398438, + "step": 701, + "time_per_iteration": 3.4654104709625244 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064657, + "balance_loss_mlp": 1.0059582, + "epoch": 0.13505194305502116, + "flos": 488812414464.0, + "grad_norm": 0.038284394577449095, + "language_loss": 0.90020943, + "learning_rate": 0.0009713432734923386, + "loss": 0.91085601, + "num_input_tokens_seen": 58601376, + "router_z_loss_mlp": 0.58544922, + "step": 702, + "time_per_iteration": 2.6416144371032715 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067313, + "balance_loss_mlp": 1.00842357, + "epoch": 0.13524432474028472, + "flos": 614520562944.0, + "grad_norm": 0.03635122731697363, + "language_loss": 0.87970936, + "learning_rate": 0.0009712392266904696, + "loss": 0.89038247, + "num_input_tokens_seen": 58676608, + "router_z_loss_mlp": 0.58740234, + "step": 703, + "time_per_iteration": 2.73490309715271 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066615, + "balance_loss_mlp": 1.00782144, + "epoch": 0.13543670642554828, + "flos": 906275838720.0, + "grad_norm": 0.040994558071305906, + "language_loss": 0.86788869, + "learning_rate": 0.0009711349969373076, + "loss": 0.87855482, + "num_input_tokens_seen": 58759264, + "router_z_loss_mlp": 0.58642578, + "step": 704, + "time_per_iteration": 3.1667368412017822 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066356, + "balance_loss_mlp": 1.00765777, + "epoch": 0.13562908811081184, + "flos": 551748141312.0, + "grad_norm": 0.040707128775991024, + "language_loss": 0.81448901, + "learning_rate": 0.0009710305842733178, + "loss": 0.82515258, + "num_input_tokens_seen": 58834800, + "router_z_loss_mlp": 0.58544922, + "step": 705, + "time_per_iteration": 2.7456798553466797 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064755, + "balance_loss_mlp": 1.00648558, + "epoch": 0.1358214697960754, + "flos": 509038284288.0, + "grad_norm": 0.04235852839756889, + "language_loss": 0.91048527, + "learning_rate": 0.0009709259887390373, + "loss": 0.9211328, + "num_input_tokens_seen": 58901712, + "router_z_loss_mlp": 0.58105469, + "step": 706, + "time_per_iteration": 2.614645481109619 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067613, + "balance_loss_mlp": 1.0098201, + "epoch": 0.136013851481339, + "flos": 529924189440.0, + "grad_norm": 0.045207837368539144, + "language_loss": 0.92539275, + "learning_rate": 0.0009708212103750737, + "loss": 0.93606889, + "num_input_tokens_seen": 58967824, + "router_z_loss_mlp": 0.57617188, + "step": 707, + "time_per_iteration": 2.5839250087738037 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073243, + "balance_loss_mlp": 1.01525927, + "epoch": 0.13620623316660255, + "flos": 660321604608.0, + "grad_norm": 0.04139663244511697, + "language_loss": 0.88690269, + "learning_rate": 0.0009707162492221051, + "loss": 0.8976351, + "num_input_tokens_seen": 59045040, + "router_z_loss_mlp": 0.578125, + "step": 708, + "time_per_iteration": 2.8753738403320312 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106855, + "balance_loss_mlp": 1.01051939, + "epoch": 0.1363986148518661, + "flos": 673083918336.0, + "grad_norm": 0.04870142688483653, + "language_loss": 0.89226341, + "learning_rate": 0.0009706111053208815, + "loss": 0.90294898, + "num_input_tokens_seen": 59117216, + "router_z_loss_mlp": 0.57861328, + "step": 709, + "time_per_iteration": 2.792555570602417 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065256, + "balance_loss_mlp": 1.0069865, + "epoch": 0.13659099653712967, + "flos": 474004866048.0, + "grad_norm": 0.041589756065930725, + "language_loss": 0.87875092, + "learning_rate": 0.0009705057787122232, + "loss": 0.88940346, + "num_input_tokens_seen": 59183056, + "router_z_loss_mlp": 0.58105469, + "step": 710, + "time_per_iteration": 2.5474488735198975 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106841, + "balance_loss_mlp": 1.00980711, + "epoch": 0.13678337822239323, + "flos": 453648738816.0, + "grad_norm": 0.03947638411835938, + "language_loss": 0.92397159, + "learning_rate": 0.0009704002694370216, + "loss": 0.93465567, + "num_input_tokens_seen": 59247312, + "router_z_loss_mlp": 0.58447266, + "step": 711, + "time_per_iteration": 2.5812153816223145 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107162, + "balance_loss_mlp": 1.01306474, + "epoch": 0.13697575990765679, + "flos": 520626336000.0, + "grad_norm": 0.04103000756090051, + "language_loss": 0.88202429, + "learning_rate": 0.0009702945775362388, + "loss": 0.89274049, + "num_input_tokens_seen": 59317968, + "router_z_loss_mlp": 0.58398438, + "step": 712, + "time_per_iteration": 2.6084940433502197 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067699, + "balance_loss_mlp": 1.00914371, + "epoch": 0.13716814159292035, + "flos": 481366354944.0, + "grad_norm": 0.04017855754763819, + "language_loss": 0.88458985, + "learning_rate": 0.0009701887030509086, + "loss": 0.89526689, + "num_input_tokens_seen": 59387936, + "router_z_loss_mlp": 0.58398438, + "step": 713, + "time_per_iteration": 2.6361663341522217 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072477, + "balance_loss_mlp": 1.01425505, + "epoch": 0.1373605232781839, + "flos": 546750727680.0, + "grad_norm": 0.04169009137316196, + "language_loss": 0.92536753, + "learning_rate": 0.0009700826460221346, + "loss": 0.93609238, + "num_input_tokens_seen": 59460624, + "router_z_loss_mlp": 0.58056641, + "step": 714, + "time_per_iteration": 2.6997907161712646 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068882, + "balance_loss_mlp": 1.01080275, + "epoch": 0.1375529049634475, + "flos": 710071197696.0, + "grad_norm": 0.042053375460334, + "language_loss": 0.94210052, + "learning_rate": 0.0009699764064910921, + "loss": 0.95278937, + "num_input_tokens_seen": 59536752, + "router_z_loss_mlp": 0.57910156, + "step": 715, + "time_per_iteration": 2.870835542678833 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069166, + "balance_loss_mlp": 1.01099169, + "epoch": 0.13774528664871105, + "flos": 487677036288.0, + "grad_norm": 0.04018028408764831, + "language_loss": 0.88572168, + "learning_rate": 0.0009698699844990268, + "loss": 0.89641333, + "num_input_tokens_seen": 59608128, + "router_z_loss_mlp": 0.58007812, + "step": 716, + "time_per_iteration": 2.6557233333587646 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106644, + "balance_loss_mlp": 1.00817037, + "epoch": 0.1379376683339746, + "flos": 681459276288.0, + "grad_norm": 0.03631196674856893, + "language_loss": 0.89737439, + "learning_rate": 0.0009697633800872555, + "loss": 0.90803885, + "num_input_tokens_seen": 59685120, + "router_z_loss_mlp": 0.58105469, + "step": 717, + "time_per_iteration": 2.9236202239990234 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068394, + "balance_loss_mlp": 1.00998127, + "epoch": 0.13813005001923817, + "flos": 612226473984.0, + "grad_norm": 0.040527486313319094, + "language_loss": 0.9214747, + "learning_rate": 0.0009696565932971655, + "loss": 0.93215865, + "num_input_tokens_seen": 59763376, + "router_z_loss_mlp": 0.58251953, + "step": 718, + "time_per_iteration": 2.8931636810302734 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072264, + "balance_loss_mlp": 1.01394677, + "epoch": 0.13832243170450173, + "flos": 589927179264.0, + "grad_norm": 0.042228364331249636, + "language_loss": 0.91184157, + "learning_rate": 0.0009695496241702153, + "loss": 0.92256421, + "num_input_tokens_seen": 59836800, + "router_z_loss_mlp": 0.58154297, + "step": 719, + "time_per_iteration": 2.8006720542907715 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010673, + "balance_loss_mlp": 1.00917327, + "epoch": 0.1385148133897653, + "flos": 701320618752.0, + "grad_norm": 0.04012183054192491, + "language_loss": 0.87174737, + "learning_rate": 0.0009694424727479339, + "loss": 0.88242036, + "num_input_tokens_seen": 59914720, + "router_z_loss_mlp": 0.57958984, + "step": 720, + "time_per_iteration": 2.9363977909088135 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066459, + "balance_loss_mlp": 1.0081414, + "epoch": 0.13870719507502885, + "flos": 599367929088.0, + "grad_norm": 0.04032336097495746, + "language_loss": 0.90803999, + "learning_rate": 0.0009693351390719213, + "loss": 0.91870457, + "num_input_tokens_seen": 59984544, + "router_z_loss_mlp": 0.58154297, + "step": 721, + "time_per_iteration": 2.7786271572113037 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070816, + "balance_loss_mlp": 1.01273775, + "epoch": 0.1388995767602924, + "flos": 587749741824.0, + "grad_norm": 0.04179929290372652, + "language_loss": 0.92465305, + "learning_rate": 0.000969227623183848, + "loss": 0.93536115, + "num_input_tokens_seen": 60057056, + "router_z_loss_mlp": 0.57910156, + "step": 722, + "time_per_iteration": 2.777453660964966 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066594, + "balance_loss_mlp": 1.00870621, + "epoch": 0.139091958445556, + "flos": 652363263744.0, + "grad_norm": 0.041578114374578125, + "language_loss": 0.92603219, + "learning_rate": 0.0009691199251254554, + "loss": 0.9366982, + "num_input_tokens_seen": 60133232, + "router_z_loss_mlp": 0.57714844, + "step": 723, + "time_per_iteration": 2.813610553741455 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063653, + "balance_loss_mlp": 1.00586045, + "epoch": 0.13928434013081956, + "flos": 576906296064.0, + "grad_norm": 0.03663552971403626, + "language_loss": 0.88541949, + "learning_rate": 0.0009690120449385555, + "loss": 0.89605606, + "num_input_tokens_seen": 60207104, + "router_z_loss_mlp": 0.57617188, + "step": 724, + "time_per_iteration": 2.7604424953460693 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063903, + "balance_loss_mlp": 1.00582433, + "epoch": 0.13947672181608312, + "flos": 564315068928.0, + "grad_norm": 0.034271197388489986, + "language_loss": 0.93926299, + "learning_rate": 0.0009689039826650312, + "loss": 0.94990206, + "num_input_tokens_seen": 60277920, + "router_z_loss_mlp": 0.57910156, + "step": 725, + "time_per_iteration": 2.7856695652008057 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095871, + "balance_loss_mlp": 1.03941345, + "epoch": 0.13966910350134668, + "flos": 1524951988224.0, + "grad_norm": 0.03128450212810151, + "language_loss": 0.76523066, + "learning_rate": 0.000968795738346836, + "loss": 0.77618933, + "num_input_tokens_seen": 60494224, + "router_z_loss_mlp": 0.56640625, + "step": 726, + "time_per_iteration": 4.903306245803833 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067061, + "balance_loss_mlp": 1.00902975, + "epoch": 0.13986148518661023, + "flos": 500856367104.0, + "grad_norm": 0.052764167671210026, + "language_loss": 0.89172196, + "learning_rate": 0.0009686873120259941, + "loss": 0.90239263, + "num_input_tokens_seen": 60562176, + "router_z_loss_mlp": 0.57861328, + "step": 727, + "time_per_iteration": 2.6450552940368652 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072692, + "balance_loss_mlp": 1.01518559, + "epoch": 0.1400538668718738, + "flos": 599850074880.0, + "grad_norm": 0.036488800736072635, + "language_loss": 0.88047451, + "learning_rate": 0.0009685787037446004, + "loss": 0.89120144, + "num_input_tokens_seen": 60631472, + "router_z_loss_mlp": 0.57324219, + "step": 728, + "time_per_iteration": 2.763434648513794 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072512, + "balance_loss_mlp": 1.01481462, + "epoch": 0.14024624855713735, + "flos": 595169556480.0, + "grad_norm": 0.047561697925478, + "language_loss": 0.88858587, + "learning_rate": 0.0009684699135448201, + "loss": 0.89931101, + "num_input_tokens_seen": 60703488, + "router_z_loss_mlp": 0.57519531, + "step": 729, + "time_per_iteration": 2.745037078857422 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067277, + "balance_loss_mlp": 1.00962722, + "epoch": 0.1404386302424009, + "flos": 507586010880.0, + "grad_norm": 0.03094406590189725, + "language_loss": 0.9291476, + "learning_rate": 0.0009683609414688895, + "loss": 0.93982029, + "num_input_tokens_seen": 60773936, + "router_z_loss_mlp": 0.57470703, + "step": 730, + "time_per_iteration": 2.7384650707244873 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068471, + "balance_loss_mlp": 1.01039195, + "epoch": 0.14063101192766447, + "flos": 574515975936.0, + "grad_norm": 0.037780385553924656, + "language_loss": 0.87345785, + "learning_rate": 0.0009682517875591154, + "loss": 0.88414258, + "num_input_tokens_seen": 60851120, + "router_z_loss_mlp": 0.57910156, + "step": 731, + "time_per_iteration": 2.752572536468506 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071345, + "balance_loss_mlp": 1.0129801, + "epoch": 0.14082339361292806, + "flos": 565765396992.0, + "grad_norm": 0.03832964150159033, + "language_loss": 0.87666118, + "learning_rate": 0.0009681424518578749, + "loss": 0.88737464, + "num_input_tokens_seen": 60924896, + "router_z_loss_mlp": 0.58203125, + "step": 732, + "time_per_iteration": 2.7323830127716064 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068462, + "balance_loss_mlp": 1.01028764, + "epoch": 0.14101577529819162, + "flos": 464583558144.0, + "grad_norm": 0.035957988569031644, + "language_loss": 0.88670099, + "learning_rate": 0.000968032934407616, + "loss": 0.8973856, + "num_input_tokens_seen": 60996016, + "router_z_loss_mlp": 0.58007812, + "step": 733, + "time_per_iteration": 2.6479005813598633 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064525, + "balance_loss_mlp": 1.00644577, + "epoch": 0.14120815698345518, + "flos": 597262423296.0, + "grad_norm": 0.039547782577588224, + "language_loss": 0.82413781, + "learning_rate": 0.0009679232352508571, + "loss": 0.83478296, + "num_input_tokens_seen": 61072016, + "router_z_loss_mlp": 0.57910156, + "step": 734, + "time_per_iteration": 2.7924795150756836 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063696, + "balance_loss_mlp": 1.00599897, + "epoch": 0.14140053866871874, + "flos": 536232925440.0, + "grad_norm": 0.03854566850595878, + "language_loss": 0.82520735, + "learning_rate": 0.0009678133544301871, + "loss": 0.83584428, + "num_input_tokens_seen": 61144528, + "router_z_loss_mlp": 0.57519531, + "step": 735, + "time_per_iteration": 2.658731698989868 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062396, + "balance_loss_mlp": 1.00498438, + "epoch": 0.1415929203539823, + "flos": 521277623040.0, + "grad_norm": 0.0297517777524564, + "language_loss": 0.92917788, + "learning_rate": 0.0009677032919882658, + "loss": 0.93980187, + "num_input_tokens_seen": 61216960, + "router_z_loss_mlp": 0.57226562, + "step": 736, + "time_per_iteration": 2.661276340484619 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068415, + "balance_loss_mlp": 1.0113374, + "epoch": 0.14178530203924586, + "flos": 483302719488.0, + "grad_norm": 0.041037110936195734, + "language_loss": 0.92867804, + "learning_rate": 0.000967593047967823, + "loss": 0.93936217, + "num_input_tokens_seen": 61281312, + "router_z_loss_mlp": 0.56982422, + "step": 737, + "time_per_iteration": 2.52840256690979 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068115, + "balance_loss_mlp": 1.01056099, + "epoch": 0.14197768372450942, + "flos": 677840259072.0, + "grad_norm": 0.04254557939420697, + "language_loss": 0.88126308, + "learning_rate": 0.0009674826224116593, + "loss": 0.89194429, + "num_input_tokens_seen": 61355888, + "router_z_loss_mlp": 0.57373047, + "step": 738, + "time_per_iteration": 2.858147144317627 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074304, + "balance_loss_mlp": 1.0167979, + "epoch": 0.14217006540977298, + "flos": 446992972032.0, + "grad_norm": 0.045930563119643074, + "language_loss": 0.87994051, + "learning_rate": 0.0009673720153626455, + "loss": 0.89068353, + "num_input_tokens_seen": 61424288, + "router_z_loss_mlp": 0.57324219, + "step": 739, + "time_per_iteration": 2.664236545562744 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069407, + "balance_loss_mlp": 1.01199603, + "epoch": 0.14236244709503657, + "flos": 497478422784.0, + "grad_norm": 0.040566684483093814, + "language_loss": 0.88105047, + "learning_rate": 0.0009672612268637235, + "loss": 0.89174449, + "num_input_tokens_seen": 61493344, + "router_z_loss_mlp": 0.57226562, + "step": 740, + "time_per_iteration": 2.634126901626587 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069933, + "balance_loss_mlp": 1.01304626, + "epoch": 0.14255482878030012, + "flos": 649480104192.0, + "grad_norm": 0.05086050125917657, + "language_loss": 0.85906518, + "learning_rate": 0.0009671502569579048, + "loss": 0.86976457, + "num_input_tokens_seen": 61565216, + "router_z_loss_mlp": 0.56884766, + "step": 741, + "time_per_iteration": 2.7642107009887695 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071964, + "balance_loss_mlp": 1.01564944, + "epoch": 0.14274721046556368, + "flos": 537274984704.0, + "grad_norm": 0.037356444744632025, + "language_loss": 0.90824854, + "learning_rate": 0.0009670391056882719, + "loss": 0.91896814, + "num_input_tokens_seen": 61640928, + "router_z_loss_mlp": 0.56445312, + "step": 742, + "time_per_iteration": 2.7307372093200684 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069292, + "balance_loss_mlp": 1.01288199, + "epoch": 0.14293959215082724, + "flos": 958584893184.0, + "grad_norm": 0.03744948002603285, + "language_loss": 0.89976203, + "learning_rate": 0.0009669277730979776, + "loss": 0.91045499, + "num_input_tokens_seen": 61717552, + "router_z_loss_mlp": 0.56494141, + "step": 743, + "time_per_iteration": 3.2251601219177246 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068321, + "balance_loss_mlp": 1.01162553, + "epoch": 0.1431319738360908, + "flos": 694386840576.0, + "grad_norm": 0.037398516399228816, + "language_loss": 0.86562485, + "learning_rate": 0.0009668162592302449, + "loss": 0.87630802, + "num_input_tokens_seen": 61800016, + "router_z_loss_mlp": 0.56738281, + "step": 744, + "time_per_iteration": 2.924435615539551 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067141, + "balance_loss_mlp": 1.01015854, + "epoch": 0.14332435552135436, + "flos": 566503200000.0, + "grad_norm": 0.037819132294000864, + "language_loss": 0.86981773, + "learning_rate": 0.0009667045641283676, + "loss": 0.88048917, + "num_input_tokens_seen": 61865904, + "router_z_loss_mlp": 0.56933594, + "step": 745, + "time_per_iteration": 2.6744887828826904 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071811, + "balance_loss_mlp": 1.01540148, + "epoch": 0.14351673720661792, + "flos": 739696988160.0, + "grad_norm": 0.042480690817339954, + "language_loss": 0.96115947, + "learning_rate": 0.0009665926878357092, + "loss": 0.97187757, + "num_input_tokens_seen": 61945728, + "router_z_loss_mlp": 0.56591797, + "step": 746, + "time_per_iteration": 2.9137520790100098 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069908, + "balance_loss_mlp": 1.0134027, + "epoch": 0.14370911889188148, + "flos": 550352248320.0, + "grad_norm": 0.037361960218361134, + "language_loss": 0.92219329, + "learning_rate": 0.0009664806303957043, + "loss": 0.93289238, + "num_input_tokens_seen": 62016288, + "router_z_loss_mlp": 0.56542969, + "step": 747, + "time_per_iteration": 2.7734382152557373 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010709, + "balance_loss_mlp": 1.01453757, + "epoch": 0.14390150057714507, + "flos": 591590390016.0, + "grad_norm": 0.040803275102161134, + "language_loss": 0.88578373, + "learning_rate": 0.0009663683918518571, + "loss": 0.89649272, + "num_input_tokens_seen": 62097904, + "router_z_loss_mlp": 0.56542969, + "step": 748, + "time_per_iteration": 2.93782114982605 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106688, + "balance_loss_mlp": 1.0103749, + "epoch": 0.14409388226240863, + "flos": 592145445888.0, + "grad_norm": 0.040391516566669984, + "language_loss": 0.87085271, + "learning_rate": 0.0009662559722477428, + "loss": 0.88152146, + "num_input_tokens_seen": 62166736, + "router_z_loss_mlp": 0.56640625, + "step": 749, + "time_per_iteration": 2.696570873260498 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140549, + "balance_loss_mlp": 1.08542633, + "epoch": 0.1442862639476722, + "flos": 1514657762304.0, + "grad_norm": 0.043557664449290004, + "language_loss": 0.7616297, + "learning_rate": 0.0009661433716270062, + "loss": 0.77303517, + "num_input_tokens_seen": 62402512, + "router_z_loss_mlp": 0.55273438, + "step": 750, + "time_per_iteration": 5.024984836578369 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106928, + "balance_loss_mlp": 1.01263177, + "epoch": 0.14447864563293575, + "flos": 497856556032.0, + "grad_norm": 0.03544029116038115, + "language_loss": 0.90697813, + "learning_rate": 0.0009660305900333632, + "loss": 0.91767091, + "num_input_tokens_seen": 62473408, + "router_z_loss_mlp": 0.56738281, + "step": 751, + "time_per_iteration": 2.678037166595459 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078207, + "balance_loss_mlp": 1.02165437, + "epoch": 0.1446710273181993, + "flos": 590795239680.0, + "grad_norm": 0.04141635113788076, + "language_loss": 0.83649188, + "learning_rate": 0.0009659176275105992, + "loss": 0.84727395, + "num_input_tokens_seen": 62547440, + "router_z_loss_mlp": 0.56640625, + "step": 752, + "time_per_iteration": 2.714871883392334 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076963, + "balance_loss_mlp": 1.02074409, + "epoch": 0.14486340900346287, + "flos": 587013884160.0, + "grad_norm": 0.03637909883196532, + "language_loss": 0.87195009, + "learning_rate": 0.0009658044841025701, + "loss": 0.88271976, + "num_input_tokens_seen": 62620224, + "router_z_loss_mlp": 0.56396484, + "step": 753, + "time_per_iteration": 2.7753467559814453 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075522, + "balance_loss_mlp": 1.01901722, + "epoch": 0.14505579068872643, + "flos": 505741019904.0, + "grad_norm": 0.041255413340114844, + "language_loss": 0.82866222, + "learning_rate": 0.0009656911598532021, + "loss": 0.83941746, + "num_input_tokens_seen": 62690464, + "router_z_loss_mlp": 0.56591797, + "step": 754, + "time_per_iteration": 2.657831907272339 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077419, + "balance_loss_mlp": 1.02119958, + "epoch": 0.14524817237399, + "flos": 487816041984.0, + "grad_norm": 0.03637506550278126, + "language_loss": 0.9138847, + "learning_rate": 0.0009655776548064917, + "loss": 0.92465889, + "num_input_tokens_seen": 62762240, + "router_z_loss_mlp": 0.56347656, + "step": 755, + "time_per_iteration": 2.6499805450439453 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070806, + "balance_loss_mlp": 1.01477778, + "epoch": 0.14544055405925355, + "flos": 729450394368.0, + "grad_norm": 0.037726189244012505, + "language_loss": 0.89799821, + "learning_rate": 0.0009654639690065054, + "loss": 0.90870631, + "num_input_tokens_seen": 62839760, + "router_z_loss_mlp": 0.56201172, + "step": 756, + "time_per_iteration": 2.913638114929199 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070686, + "balance_loss_mlp": 1.01461017, + "epoch": 0.14563293574451713, + "flos": 594787532544.0, + "grad_norm": 0.03772784195488967, + "language_loss": 0.8914414, + "learning_rate": 0.00096535010249738, + "loss": 0.90214825, + "num_input_tokens_seen": 62910336, + "router_z_loss_mlp": 0.5625, + "step": 757, + "time_per_iteration": 2.721640110015869 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067675, + "balance_loss_mlp": 1.01121712, + "epoch": 0.1458253174297807, + "flos": 561623404800.0, + "grad_norm": 0.04410713855467511, + "language_loss": 0.84106696, + "learning_rate": 0.0009652360553233224, + "loss": 0.8517437, + "num_input_tokens_seen": 62988160, + "router_z_loss_mlp": 0.56591797, + "step": 758, + "time_per_iteration": 2.771986484527588 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080421, + "balance_loss_mlp": 1.02625275, + "epoch": 0.14601769911504425, + "flos": 1561189804032.0, + "grad_norm": 0.021986445825835567, + "language_loss": 0.73773748, + "learning_rate": 0.0009651218275286093, + "loss": 0.74854165, + "num_input_tokens_seen": 63224704, + "router_z_loss_mlp": 0.54296875, + "step": 759, + "time_per_iteration": 4.951657056808472 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064246, + "balance_loss_mlp": 1.00712132, + "epoch": 0.1462100808003078, + "flos": 867823646976.0, + "grad_norm": 0.03532102179266325, + "language_loss": 0.82350075, + "learning_rate": 0.0009650074191575883, + "loss": 0.83414322, + "num_input_tokens_seen": 63312400, + "router_z_loss_mlp": 0.56982422, + "step": 760, + "time_per_iteration": 3.2275402545928955 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078155, + "balance_loss_mlp": 1.02083874, + "epoch": 0.14640246248557137, + "flos": 524030525184.0, + "grad_norm": 0.0394901057776484, + "language_loss": 0.87295806, + "learning_rate": 0.0009648928302546766, + "loss": 0.88373965, + "num_input_tokens_seen": 63387792, + "router_z_loss_mlp": 0.57177734, + "step": 761, + "time_per_iteration": 2.6739044189453125 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108061, + "balance_loss_mlp": 1.02319896, + "epoch": 0.14659484417083493, + "flos": 1032242556672.0, + "grad_norm": 0.0381114836464334, + "language_loss": 0.86423808, + "learning_rate": 0.0009647780608643613, + "loss": 0.87504417, + "num_input_tokens_seen": 63475632, + "router_z_loss_mlp": 0.57226562, + "step": 762, + "time_per_iteration": 3.355055332183838 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084881, + "balance_loss_mlp": 1.02742219, + "epoch": 0.1467872258560985, + "flos": 501657353472.0, + "grad_norm": 0.04884269069306727, + "language_loss": 0.89483184, + "learning_rate": 0.0009646631110312001, + "loss": 0.90568066, + "num_input_tokens_seen": 63546080, + "router_z_loss_mlp": 0.57275391, + "step": 763, + "time_per_iteration": 2.638404607772827 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074108, + "balance_loss_mlp": 1.01683939, + "epoch": 0.14697960754136205, + "flos": 548936913408.0, + "grad_norm": 0.030517371118051684, + "language_loss": 0.89587164, + "learning_rate": 0.0009645479807998203, + "loss": 0.90661263, + "num_input_tokens_seen": 63622464, + "router_z_loss_mlp": 0.57128906, + "step": 764, + "time_per_iteration": 2.7784340381622314 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066881, + "balance_loss_mlp": 1.0099467, + "epoch": 0.14717198922662564, + "flos": 518902854144.0, + "grad_norm": 0.03321738346858149, + "language_loss": 0.93693149, + "learning_rate": 0.0009644326702149196, + "loss": 0.94760031, + "num_input_tokens_seen": 63694736, + "router_z_loss_mlp": 0.56884766, + "step": 765, + "time_per_iteration": 2.712148904800415 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066407, + "balance_loss_mlp": 1.009902, + "epoch": 0.1473643709118892, + "flos": 733484483328.0, + "grad_norm": 0.042813367444357694, + "language_loss": 0.86227441, + "learning_rate": 0.0009643171793212653, + "loss": 0.87293845, + "num_input_tokens_seen": 63779072, + "router_z_loss_mlp": 0.56591797, + "step": 766, + "time_per_iteration": 3.0350003242492676 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069694, + "balance_loss_mlp": 1.01357007, + "epoch": 0.14755675259715276, + "flos": 621669169152.0, + "grad_norm": 0.04397904632105779, + "language_loss": 0.90884185, + "learning_rate": 0.0009642015081636952, + "loss": 0.91953874, + "num_input_tokens_seen": 63847472, + "router_z_loss_mlp": 0.56298828, + "step": 767, + "time_per_iteration": 2.6967811584472656 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067551, + "balance_loss_mlp": 1.01185656, + "epoch": 0.14774913428241632, + "flos": 453173395968.0, + "grad_norm": 0.040409537343205924, + "language_loss": 0.89756525, + "learning_rate": 0.0009640856567871166, + "loss": 0.90824074, + "num_input_tokens_seen": 63912496, + "router_z_loss_mlp": 0.55859375, + "step": 768, + "time_per_iteration": 2.5016207695007324 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063728, + "balance_loss_mlp": 1.00803363, + "epoch": 0.14794151596767988, + "flos": 838655702784.0, + "grad_norm": 0.03518214363191685, + "language_loss": 0.90024096, + "learning_rate": 0.0009639696252365072, + "loss": 0.91087824, + "num_input_tokens_seen": 63990832, + "router_z_loss_mlp": 0.55859375, + "step": 769, + "time_per_iteration": 3.0535316467285156 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064247, + "balance_loss_mlp": 1.00874364, + "epoch": 0.14813389765294344, + "flos": 687405430272.0, + "grad_norm": 0.03578436651039587, + "language_loss": 0.83073497, + "learning_rate": 0.0009638534135569144, + "loss": 0.8413775, + "num_input_tokens_seen": 64067552, + "router_z_loss_mlp": 0.55664062, + "step": 770, + "time_per_iteration": 2.8983683586120605 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065854, + "balance_loss_mlp": 1.01039767, + "epoch": 0.148326279338207, + "flos": 510944513280.0, + "grad_norm": 0.03931230706380594, + "language_loss": 0.91550887, + "learning_rate": 0.0009637370217934554, + "loss": 0.92616743, + "num_input_tokens_seen": 64140336, + "router_z_loss_mlp": 0.55615234, + "step": 771, + "time_per_iteration": 2.6311967372894287 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061556, + "balance_loss_mlp": 1.00590932, + "epoch": 0.14851866102347056, + "flos": 589332272640.0, + "grad_norm": 0.03214719611667013, + "language_loss": 0.8436957, + "learning_rate": 0.0009636204499913175, + "loss": 0.85431123, + "num_input_tokens_seen": 64223472, + "router_z_loss_mlp": 0.55810547, + "step": 772, + "time_per_iteration": 2.8748695850372314 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066056, + "balance_loss_mlp": 1.01069546, + "epoch": 0.14871104270873411, + "flos": 692248286976.0, + "grad_norm": 0.034034874980260935, + "language_loss": 0.89455193, + "learning_rate": 0.0009635036981957581, + "loss": 0.9052124, + "num_input_tokens_seen": 64299872, + "router_z_loss_mlp": 0.55517578, + "step": 773, + "time_per_iteration": 2.8526012897491455 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063541, + "balance_loss_mlp": 1.00789392, + "epoch": 0.1489034243939977, + "flos": 656283624960.0, + "grad_norm": 0.03841304714783139, + "language_loss": 0.91971016, + "learning_rate": 0.0009633867664521043, + "loss": 0.93034559, + "num_input_tokens_seen": 64377152, + "router_z_loss_mlp": 0.55810547, + "step": 774, + "time_per_iteration": 2.823320150375366 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063492, + "balance_loss_mlp": 1.00736797, + "epoch": 0.14909580607926126, + "flos": 476796652032.0, + "grad_norm": 0.0404919947218097, + "language_loss": 0.88328946, + "learning_rate": 0.0009632696548057527, + "loss": 0.89392436, + "num_input_tokens_seen": 64443008, + "router_z_loss_mlp": 0.56298828, + "step": 775, + "time_per_iteration": 2.5567190647125244 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072265, + "balance_loss_mlp": 1.01609385, + "epoch": 0.14928818776452482, + "flos": 612284799744.0, + "grad_norm": 0.03821441574416946, + "language_loss": 0.86270714, + "learning_rate": 0.0009631523633021704, + "loss": 0.87342978, + "num_input_tokens_seen": 64519776, + "router_z_loss_mlp": 0.56347656, + "step": 776, + "time_per_iteration": 2.783348321914673 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068042, + "balance_loss_mlp": 1.01187015, + "epoch": 0.14948056944978838, + "flos": 562917230592.0, + "grad_norm": 0.039790220133906304, + "language_loss": 0.90072912, + "learning_rate": 0.0009630348919868936, + "loss": 0.9114095, + "num_input_tokens_seen": 64593712, + "router_z_loss_mlp": 0.56347656, + "step": 777, + "time_per_iteration": 2.7115018367767334 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073438, + "balance_loss_mlp": 1.01736236, + "epoch": 0.14967295113505194, + "flos": 450112346880.0, + "grad_norm": 0.044777999480791836, + "language_loss": 0.82363755, + "learning_rate": 0.0009629172409055293, + "loss": 0.83437192, + "num_input_tokens_seen": 64658448, + "router_z_loss_mlp": 0.5625, + "step": 778, + "time_per_iteration": 2.578178882598877 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079079, + "balance_loss_mlp": 1.02319324, + "epoch": 0.1498653328203155, + "flos": 572429912064.0, + "grad_norm": 0.03699200582710457, + "language_loss": 0.8876617, + "learning_rate": 0.0009627994101037531, + "loss": 0.89845246, + "num_input_tokens_seen": 64734144, + "router_z_loss_mlp": 0.56054688, + "step": 779, + "time_per_iteration": 2.7733986377716064 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107155, + "balance_loss_mlp": 1.01556909, + "epoch": 0.15005771450557906, + "flos": 632408602368.0, + "grad_norm": 0.04036301028093645, + "language_loss": 0.90477651, + "learning_rate": 0.0009626813996273114, + "loss": 0.91549194, + "num_input_tokens_seen": 64813456, + "router_z_loss_mlp": 0.56152344, + "step": 780, + "time_per_iteration": 2.8476834297180176 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064638, + "balance_loss_mlp": 1.00884771, + "epoch": 0.15025009619084262, + "flos": 579166358784.0, + "grad_norm": 0.036574622666600026, + "language_loss": 0.89819682, + "learning_rate": 0.0009625632095220198, + "loss": 0.90884316, + "num_input_tokens_seen": 64896816, + "router_z_loss_mlp": 0.55957031, + "step": 781, + "time_per_iteration": 2.8279531002044678 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065406, + "balance_loss_mlp": 1.00961614, + "epoch": 0.1504424778761062, + "flos": 484857060096.0, + "grad_norm": 0.04416373966784989, + "language_loss": 0.8858574, + "learning_rate": 0.0009624448398337637, + "loss": 0.89651144, + "num_input_tokens_seen": 64964176, + "router_z_loss_mlp": 0.55957031, + "step": 782, + "time_per_iteration": 2.512742280960083 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062842, + "balance_loss_mlp": 1.0075767, + "epoch": 0.15063485956136977, + "flos": 763895708928.0, + "grad_norm": 0.03630111779859241, + "language_loss": 0.90811443, + "learning_rate": 0.0009623262906084984, + "loss": 0.9187429, + "num_input_tokens_seen": 65042592, + "router_z_loss_mlp": 0.55419922, + "step": 783, + "time_per_iteration": 3.0409936904907227 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066453, + "balance_loss_mlp": 1.01156867, + "epoch": 0.15082724124663333, + "flos": 498676984320.0, + "grad_norm": 0.03758683048429116, + "language_loss": 0.91324949, + "learning_rate": 0.0009622075618922486, + "loss": 0.92391407, + "num_input_tokens_seen": 65114576, + "router_z_loss_mlp": 0.55029297, + "step": 784, + "time_per_iteration": 2.716580629348755 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066623, + "balance_loss_mlp": 1.01188219, + "epoch": 0.15101962293189689, + "flos": 510722882304.0, + "grad_norm": 0.0361748672236624, + "language_loss": 0.88713133, + "learning_rate": 0.0009620886537311091, + "loss": 0.89779752, + "num_input_tokens_seen": 65186640, + "router_z_loss_mlp": 0.54882812, + "step": 785, + "time_per_iteration": 2.7197515964508057 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065435, + "balance_loss_mlp": 1.01069367, + "epoch": 0.15121200461716044, + "flos": 458702532864.0, + "grad_norm": 0.0476660620131034, + "language_loss": 0.86751854, + "learning_rate": 0.000961969566171244, + "loss": 0.87817287, + "num_input_tokens_seen": 65252112, + "router_z_loss_mlp": 0.54882812, + "step": 786, + "time_per_iteration": 2.519826650619507 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063397, + "balance_loss_mlp": 1.00865602, + "epoch": 0.151404386302424, + "flos": 539017908480.0, + "grad_norm": 0.0401982478312821, + "language_loss": 0.91594857, + "learning_rate": 0.0009618502992588873, + "loss": 0.92658257, + "num_input_tokens_seen": 65318912, + "router_z_loss_mlp": 0.54882812, + "step": 787, + "time_per_iteration": 2.6427645683288574 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076455, + "balance_loss_mlp": 1.02133262, + "epoch": 0.15159676798768756, + "flos": 689617860864.0, + "grad_norm": 0.04258050045209434, + "language_loss": 0.8916502, + "learning_rate": 0.0009617308530403424, + "loss": 0.9024148, + "num_input_tokens_seen": 65395424, + "router_z_loss_mlp": 0.55273438, + "step": 788, + "time_per_iteration": 3.0662577152252197 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106461, + "balance_loss_mlp": 1.00958323, + "epoch": 0.15178914967295112, + "flos": 546433832448.0, + "grad_norm": 0.03354297731817266, + "language_loss": 0.88695067, + "learning_rate": 0.0009616112275619825, + "loss": 0.89759684, + "num_input_tokens_seen": 65470480, + "router_z_loss_mlp": 0.55175781, + "step": 789, + "time_per_iteration": 2.7230606079101562 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065845, + "balance_loss_mlp": 1.01081765, + "epoch": 0.1519815313582147, + "flos": 512815749120.0, + "grad_norm": 0.03087624340708216, + "language_loss": 0.85391772, + "learning_rate": 0.0009614914228702503, + "loss": 0.86457616, + "num_input_tokens_seen": 65544720, + "router_z_loss_mlp": 0.55175781, + "step": 790, + "time_per_iteration": 2.6690316200256348 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075082, + "balance_loss_mlp": 1.02024603, + "epoch": 0.15217391304347827, + "flos": 685458372096.0, + "grad_norm": 0.03877155611381102, + "language_loss": 0.90952718, + "learning_rate": 0.0009613714390116581, + "loss": 0.92027801, + "num_input_tokens_seen": 65627872, + "router_z_loss_mlp": 0.54980469, + "step": 791, + "time_per_iteration": 3.006898880004883 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069036, + "balance_loss_mlp": 1.01396108, + "epoch": 0.15236629472874183, + "flos": 645446982144.0, + "grad_norm": 0.03750254169389994, + "language_loss": 0.87660968, + "learning_rate": 0.0009612512760327879, + "loss": 0.88730001, + "num_input_tokens_seen": 65705264, + "router_z_loss_mlp": 0.55224609, + "step": 792, + "time_per_iteration": 2.858262062072754 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068429, + "balance_loss_mlp": 1.01297235, + "epoch": 0.1525586764140054, + "flos": 413765660928.0, + "grad_norm": 0.044925092089749936, + "language_loss": 0.86468709, + "learning_rate": 0.0009611309339802909, + "loss": 0.87537134, + "num_input_tokens_seen": 65768592, + "router_z_loss_mlp": 0.55615234, + "step": 793, + "time_per_iteration": 2.498229742050171 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070738, + "balance_loss_mlp": 1.01485312, + "epoch": 0.15275105809926895, + "flos": 804234687744.0, + "grad_norm": 0.03634630877191588, + "language_loss": 0.85518378, + "learning_rate": 0.0009610104129008881, + "loss": 0.8658911, + "num_input_tokens_seen": 65852432, + "router_z_loss_mlp": 0.56054688, + "step": 794, + "time_per_iteration": 3.119896173477173 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064568, + "balance_loss_mlp": 1.0088737, + "epoch": 0.1529434397845325, + "flos": 613543632384.0, + "grad_norm": 0.039196324818253456, + "language_loss": 0.89691782, + "learning_rate": 0.0009608897128413701, + "loss": 0.90756351, + "num_input_tokens_seen": 65927904, + "router_z_loss_mlp": 0.55859375, + "step": 795, + "time_per_iteration": 2.7244484424591064 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065096, + "balance_loss_mlp": 1.00949657, + "epoch": 0.15313582146979607, + "flos": 616472478720.0, + "grad_norm": 0.031652256183926086, + "language_loss": 0.86697376, + "learning_rate": 0.0009607688338485965, + "loss": 0.87762469, + "num_input_tokens_seen": 66006800, + "router_z_loss_mlp": 0.55761719, + "step": 796, + "time_per_iteration": 2.859959363937378 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106534, + "balance_loss_mlp": 1.00997913, + "epoch": 0.15332820315505963, + "flos": 794993214720.0, + "grad_norm": 0.036135713167076366, + "language_loss": 0.91464871, + "learning_rate": 0.0009606477759694969, + "loss": 0.92530215, + "num_input_tokens_seen": 66088608, + "router_z_loss_mlp": 0.55517578, + "step": 797, + "time_per_iteration": 3.0383169651031494 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063804, + "balance_loss_mlp": 1.00806129, + "epoch": 0.1535205848403232, + "flos": 551257247232.0, + "grad_norm": 0.04267360012583918, + "language_loss": 0.89290035, + "learning_rate": 0.0009605265392510703, + "loss": 0.90353841, + "num_input_tokens_seen": 66153616, + "router_z_loss_mlp": 0.55908203, + "step": 798, + "time_per_iteration": 2.642423152923584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063071, + "balance_loss_mlp": 1.00732899, + "epoch": 0.15371296652558677, + "flos": 536979476736.0, + "grad_norm": 0.03662373873498648, + "language_loss": 0.93232477, + "learning_rate": 0.0009604051237403846, + "loss": 0.94295549, + "num_input_tokens_seen": 66219472, + "router_z_loss_mlp": 0.55908203, + "step": 799, + "time_per_iteration": 2.6661648750305176 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062966, + "balance_loss_mlp": 1.00693774, + "epoch": 0.15390534821085033, + "flos": 396090504192.0, + "grad_norm": 0.042222005302764924, + "language_loss": 0.87381375, + "learning_rate": 0.0009602835294845776, + "loss": 0.8844434, + "num_input_tokens_seen": 66281456, + "router_z_loss_mlp": 0.56201172, + "step": 800, + "time_per_iteration": 2.4529898166656494 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060851, + "balance_loss_mlp": 1.00520432, + "epoch": 0.1540977298961139, + "flos": 536886157824.0, + "grad_norm": 0.03888031973735598, + "language_loss": 0.91938102, + "learning_rate": 0.0009601617565308565, + "loss": 0.92998952, + "num_input_tokens_seen": 66348160, + "router_z_loss_mlp": 0.55810547, + "step": 801, + "time_per_iteration": 2.6380698680877686 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064031, + "balance_loss_mlp": 1.0085746, + "epoch": 0.15429011158137745, + "flos": 725091628800.0, + "grad_norm": 0.03523983772327724, + "language_loss": 0.87975162, + "learning_rate": 0.0009600398049264977, + "loss": 0.89039195, + "num_input_tokens_seen": 66430576, + "router_z_loss_mlp": 0.55615234, + "step": 802, + "time_per_iteration": 2.9610986709594727 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064973, + "balance_loss_mlp": 1.00970769, + "epoch": 0.154482493266641, + "flos": 621749849088.0, + "grad_norm": 0.04424510077845192, + "language_loss": 0.93353879, + "learning_rate": 0.0009599176747188469, + "loss": 0.94418848, + "num_input_tokens_seen": 66506480, + "router_z_loss_mlp": 0.55419922, + "step": 803, + "time_per_iteration": 2.883296251296997 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065036, + "balance_loss_mlp": 1.00981843, + "epoch": 0.15467487495190457, + "flos": 526720243968.0, + "grad_norm": 0.03833070581853241, + "language_loss": 0.84471631, + "learning_rate": 0.0009597953659553196, + "loss": 0.85536671, + "num_input_tokens_seen": 66577680, + "router_z_loss_mlp": 0.55371094, + "step": 804, + "time_per_iteration": 2.7128705978393555 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062393, + "balance_loss_mlp": 1.00712788, + "epoch": 0.15486725663716813, + "flos": 528760621056.0, + "grad_norm": 0.03896986919959599, + "language_loss": 0.90159577, + "learning_rate": 0.0009596728786833997, + "loss": 0.9122197, + "num_input_tokens_seen": 66648496, + "router_z_loss_mlp": 0.55419922, + "step": 805, + "time_per_iteration": 2.605398178100586 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062158, + "balance_loss_mlp": 1.00684452, + "epoch": 0.1550596383224317, + "flos": 1050280295424.0, + "grad_norm": 0.039312204875199507, + "language_loss": 0.90827858, + "learning_rate": 0.0009595502129506415, + "loss": 0.91890013, + "num_input_tokens_seen": 66735216, + "router_z_loss_mlp": 0.5546875, + "step": 806, + "time_per_iteration": 3.355556011199951 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062439, + "balance_loss_mlp": 1.00736439, + "epoch": 0.15525202000769528, + "flos": 614837458176.0, + "grad_norm": 0.03934214137038287, + "language_loss": 0.83726299, + "learning_rate": 0.0009594273688046678, + "loss": 0.8478874, + "num_input_tokens_seen": 66810672, + "router_z_loss_mlp": 0.55224609, + "step": 807, + "time_per_iteration": 2.765700101852417 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062118, + "balance_loss_mlp": 1.00728118, + "epoch": 0.15544440169295884, + "flos": 534103120128.0, + "grad_norm": 0.042258492962953934, + "language_loss": 0.86714661, + "learning_rate": 0.000959304346293171, + "loss": 0.8777678, + "num_input_tokens_seen": 66879824, + "router_z_loss_mlp": 0.54980469, + "step": 808, + "time_per_iteration": 2.6490986347198486 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064119, + "balance_loss_mlp": 1.00928247, + "epoch": 0.1556367833782224, + "flos": 645887331840.0, + "grad_norm": 0.047675746935091516, + "language_loss": 0.89139616, + "learning_rate": 0.0009591811454639125, + "loss": 0.90203738, + "num_input_tokens_seen": 66949424, + "router_z_loss_mlp": 0.54980469, + "step": 809, + "time_per_iteration": 2.7880568504333496 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059631, + "balance_loss_mlp": 1.00469911, + "epoch": 0.15582916506348596, + "flos": 544953368832.0, + "grad_norm": 0.05205155355433054, + "language_loss": 0.89500809, + "learning_rate": 0.0009590577663647234, + "loss": 0.90560436, + "num_input_tokens_seen": 67024000, + "router_z_loss_mlp": 0.55078125, + "step": 810, + "time_per_iteration": 2.743067741394043 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061907, + "balance_loss_mlp": 1.0068804, + "epoch": 0.15602154674874952, + "flos": 581215484160.0, + "grad_norm": 0.039153260843753375, + "language_loss": 0.87186325, + "learning_rate": 0.0009589342090435036, + "loss": 0.88248235, + "num_input_tokens_seen": 67100672, + "router_z_loss_mlp": 0.55175781, + "step": 811, + "time_per_iteration": 2.806425094604492 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106086, + "balance_loss_mlp": 1.00607169, + "epoch": 0.15621392843401308, + "flos": 536317496064.0, + "grad_norm": 0.04937652455074429, + "language_loss": 0.88453877, + "learning_rate": 0.0009588104735482223, + "loss": 0.89514732, + "num_input_tokens_seen": 67171584, + "router_z_loss_mlp": 0.54931641, + "step": 812, + "time_per_iteration": 2.647728204727173 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060077, + "balance_loss_mlp": 1.00538397, + "epoch": 0.15640631011927664, + "flos": 551982411264.0, + "grad_norm": 0.04402679292728805, + "language_loss": 0.85281312, + "learning_rate": 0.0009586865599269177, + "loss": 0.86341381, + "num_input_tokens_seen": 67240640, + "router_z_loss_mlp": 0.54833984, + "step": 813, + "time_per_iteration": 2.642218828201294 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061354, + "balance_loss_mlp": 1.0069474, + "epoch": 0.1565986918045402, + "flos": 638636658432.0, + "grad_norm": 0.0415768255708782, + "language_loss": 0.89702487, + "learning_rate": 0.0009585624682276977, + "loss": 0.90763843, + "num_input_tokens_seen": 67312976, + "router_z_loss_mlp": 0.54541016, + "step": 814, + "time_per_iteration": 2.7770931720733643 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01058453, + "balance_loss_mlp": 1.00366414, + "epoch": 0.15679107348980378, + "flos": 491782089984.0, + "grad_norm": 0.039213144049943555, + "language_loss": 0.88436091, + "learning_rate": 0.0009584381984987386, + "loss": 0.89494538, + "num_input_tokens_seen": 67378528, + "router_z_loss_mlp": 0.54931641, + "step": 815, + "time_per_iteration": 2.617560386657715 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061581, + "balance_loss_mlp": 1.00655353, + "epoch": 0.15698345517506734, + "flos": 531003187200.0, + "grad_norm": 0.030486806446719653, + "language_loss": 0.91117728, + "learning_rate": 0.0009583137507882864, + "loss": 0.92179304, + "num_input_tokens_seen": 67449728, + "router_z_loss_mlp": 0.55175781, + "step": 816, + "time_per_iteration": 2.6757051944732666 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060617, + "balance_loss_mlp": 1.00568497, + "epoch": 0.1571758368603309, + "flos": 547078316544.0, + "grad_norm": 0.03910336486934304, + "language_loss": 0.82217371, + "learning_rate": 0.000958189125144656, + "loss": 0.83277988, + "num_input_tokens_seen": 67520512, + "router_z_loss_mlp": 0.55078125, + "step": 817, + "time_per_iteration": 2.7065701484680176 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061392, + "balance_loss_mlp": 1.00655591, + "epoch": 0.15736821854559446, + "flos": 566744272896.0, + "grad_norm": 0.03730967846547413, + "language_loss": 0.89150202, + "learning_rate": 0.0009580643216162313, + "loss": 0.90211594, + "num_input_tokens_seen": 67592464, + "router_z_loss_mlp": 0.54980469, + "step": 818, + "time_per_iteration": 2.6849937438964844 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106252, + "balance_loss_mlp": 1.00792253, + "epoch": 0.15756060023085802, + "flos": 501954806784.0, + "grad_norm": 0.041127076818974775, + "language_loss": 0.80838168, + "learning_rate": 0.0009579393402514652, + "loss": 0.81900686, + "num_input_tokens_seen": 67658928, + "router_z_loss_mlp": 0.54736328, + "step": 819, + "time_per_iteration": 2.615342378616333 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060056, + "balance_loss_mlp": 1.00560164, + "epoch": 0.15775298191612158, + "flos": 520272502272.0, + "grad_norm": 0.037825026421493144, + "language_loss": 0.91941106, + "learning_rate": 0.0009578141810988801, + "loss": 0.93001157, + "num_input_tokens_seen": 67727936, + "router_z_loss_mlp": 0.54589844, + "step": 820, + "time_per_iteration": 2.6530544757843018 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061362, + "balance_loss_mlp": 1.00666904, + "epoch": 0.15794536360138514, + "flos": 467088584448.0, + "grad_norm": 0.039348813654249644, + "language_loss": 0.92238629, + "learning_rate": 0.0009576888442070668, + "loss": 0.93299985, + "num_input_tokens_seen": 67795488, + "router_z_loss_mlp": 0.54833984, + "step": 821, + "time_per_iteration": 2.5978658199310303 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062641, + "balance_loss_mlp": 1.00809062, + "epoch": 0.1581377452866487, + "flos": 518168941824.0, + "grad_norm": 0.03790806580601569, + "language_loss": 0.93657464, + "learning_rate": 0.0009575633296246854, + "loss": 0.94720107, + "num_input_tokens_seen": 67858896, + "router_z_loss_mlp": 0.546875, + "step": 822, + "time_per_iteration": 2.582139492034912 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061663, + "balance_loss_mlp": 1.00711334, + "epoch": 0.15833012697191226, + "flos": 550838284800.0, + "grad_norm": 0.03604802690546967, + "language_loss": 0.84146446, + "learning_rate": 0.0009574376374004652, + "loss": 0.85208106, + "num_input_tokens_seen": 67924864, + "router_z_loss_mlp": 0.546875, + "step": 823, + "time_per_iteration": 2.6182329654693604 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061584, + "balance_loss_mlp": 1.00703347, + "epoch": 0.15852250865717585, + "flos": 488467329024.0, + "grad_norm": 0.0382059884648543, + "language_loss": 0.82121176, + "learning_rate": 0.000957311767583204, + "loss": 0.83182758, + "num_input_tokens_seen": 67992912, + "router_z_loss_mlp": 0.546875, + "step": 824, + "time_per_iteration": 2.584266185760498 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01057571, + "balance_loss_mlp": 1.00531006, + "epoch": 0.1587148903424394, + "flos": 1312699441152.0, + "grad_norm": 0.00659207066158758, + "language_loss": 0.8207159, + "learning_rate": 0.0009571857202217691, + "loss": 0.83129162, + "num_input_tokens_seen": 68207408, + "router_z_loss_mlp": 0.5234375, + "step": 825, + "time_per_iteration": 4.734830856323242 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064159, + "balance_loss_mlp": 1.00965643, + "epoch": 0.15890727202770297, + "flos": 467833190400.0, + "grad_norm": 0.04624650490850591, + "language_loss": 0.92764026, + "learning_rate": 0.0009570594953650961, + "loss": 0.93828189, + "num_input_tokens_seen": 68270864, + "router_z_loss_mlp": 0.54638672, + "step": 826, + "time_per_iteration": 2.5117454528808594 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106388, + "balance_loss_mlp": 1.00937772, + "epoch": 0.15909965371296653, + "flos": 778607993088.0, + "grad_norm": 0.03976637787958364, + "language_loss": 0.81327987, + "learning_rate": 0.00095693309306219, + "loss": 0.8239187, + "num_input_tokens_seen": 68355408, + "router_z_loss_mlp": 0.54638672, + "step": 827, + "time_per_iteration": 3.1954681873321533 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060739, + "balance_loss_mlp": 1.00599849, + "epoch": 0.1592920353982301, + "flos": 1079964411648.0, + "grad_norm": 0.038150784713437476, + "language_loss": 0.89750922, + "learning_rate": 0.0009568065133621244, + "loss": 0.90811658, + "num_input_tokens_seen": 68437072, + "router_z_loss_mlp": 0.54882812, + "step": 828, + "time_per_iteration": 3.3355016708374023 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060884, + "balance_loss_mlp": 1.00642896, + "epoch": 0.15948441708349365, + "flos": 726890932992.0, + "grad_norm": 0.03986186218144037, + "language_loss": 0.85834098, + "learning_rate": 0.0009566797563140422, + "loss": 0.86894989, + "num_input_tokens_seen": 68511696, + "router_z_loss_mlp": 0.54589844, + "step": 829, + "time_per_iteration": 2.873845100402832 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059272, + "balance_loss_mlp": 1.00519884, + "epoch": 0.1596767987687572, + "flos": 580076215296.0, + "grad_norm": 0.03433333328837374, + "language_loss": 0.89395094, + "learning_rate": 0.0009565528219671547, + "loss": 0.90454364, + "num_input_tokens_seen": 68587488, + "router_z_loss_mlp": 0.54199219, + "step": 830, + "time_per_iteration": 2.9566032886505127 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063846, + "balance_loss_mlp": 1.00991619, + "epoch": 0.15986918045402077, + "flos": 530026256640.0, + "grad_norm": 0.037800776955081314, + "language_loss": 0.86586118, + "learning_rate": 0.0009564257103707418, + "loss": 0.87649965, + "num_input_tokens_seen": 68655760, + "router_z_loss_mlp": 0.54052734, + "step": 831, + "time_per_iteration": 2.6305205821990967 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062726, + "balance_loss_mlp": 1.00870061, + "epoch": 0.16006156213928435, + "flos": 575670796032.0, + "grad_norm": 0.04196239075383403, + "language_loss": 0.92502224, + "learning_rate": 0.0009562984215741533, + "loss": 0.93564951, + "num_input_tokens_seen": 68724560, + "router_z_loss_mlp": 0.54150391, + "step": 832, + "time_per_iteration": 2.6781210899353027 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061607, + "balance_loss_mlp": 1.00743783, + "epoch": 0.1602539438245479, + "flos": 516675839232.0, + "grad_norm": 0.039654673227061156, + "language_loss": 0.83729708, + "learning_rate": 0.0009561709556268065, + "loss": 0.84791321, + "num_input_tokens_seen": 68795440, + "router_z_loss_mlp": 0.54296875, + "step": 833, + "time_per_iteration": 2.732191801071167 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064816, + "balance_loss_mlp": 1.01021826, + "epoch": 0.16044632550981147, + "flos": 622162008576.0, + "grad_norm": 0.03600956841171521, + "language_loss": 0.95349514, + "learning_rate": 0.0009560433125781884, + "loss": 0.96414334, + "num_input_tokens_seen": 68868176, + "router_z_loss_mlp": 0.54736328, + "step": 834, + "time_per_iteration": 4.227160215377808 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063475, + "balance_loss_mlp": 1.008973, + "epoch": 0.16063870719507503, + "flos": 562128883200.0, + "grad_norm": 0.03652136008848007, + "language_loss": 0.94107795, + "learning_rate": 0.0009559154924778544, + "loss": 0.95171273, + "num_input_tokens_seen": 68939616, + "router_z_loss_mlp": 0.54638672, + "step": 835, + "time_per_iteration": 2.7238283157348633 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066589, + "balance_loss_mlp": 1.01251614, + "epoch": 0.1608310888803386, + "flos": 806561824512.0, + "grad_norm": 0.044196177378580975, + "language_loss": 0.86185992, + "learning_rate": 0.0009557874953754284, + "loss": 0.87252581, + "num_input_tokens_seen": 69016192, + "router_z_loss_mlp": 0.54199219, + "step": 836, + "time_per_iteration": 3.03965425491333 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063553, + "balance_loss_mlp": 1.00943184, + "epoch": 0.16102347056560215, + "flos": 601695065856.0, + "grad_norm": 0.04086380423696876, + "language_loss": 0.84961462, + "learning_rate": 0.0009556593213206038, + "loss": 0.86025023, + "num_input_tokens_seen": 69089360, + "router_z_loss_mlp": 0.54248047, + "step": 837, + "time_per_iteration": 2.714165687561035 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063546, + "balance_loss_mlp": 1.0095681, + "epoch": 0.1612158522508657, + "flos": 554615749632.0, + "grad_norm": 0.03942211179170501, + "language_loss": 0.88284755, + "learning_rate": 0.0009555309703631414, + "loss": 0.89348304, + "num_input_tokens_seen": 69161952, + "router_z_loss_mlp": 0.54101562, + "step": 838, + "time_per_iteration": 2.6616575717926025 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061318, + "balance_loss_mlp": 1.00729215, + "epoch": 0.16140823393612927, + "flos": 557018708736.0, + "grad_norm": 0.03970121061853926, + "language_loss": 0.88476837, + "learning_rate": 0.0009554024425528722, + "loss": 0.89538157, + "num_input_tokens_seen": 69232432, + "router_z_loss_mlp": 0.54150391, + "step": 839, + "time_per_iteration": 2.6778693199157715 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061761, + "balance_loss_mlp": 1.00792611, + "epoch": 0.16160061562139286, + "flos": 544909627392.0, + "grad_norm": 0.03616953348933095, + "language_loss": 0.90216744, + "learning_rate": 0.0009552737379396948, + "loss": 0.91278505, + "num_input_tokens_seen": 69297696, + "router_z_loss_mlp": 0.53955078, + "step": 840, + "time_per_iteration": 2.6190080642700195 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060215, + "balance_loss_mlp": 1.00638056, + "epoch": 0.16179299730665642, + "flos": 605007881472.0, + "grad_norm": 0.03485432207779616, + "language_loss": 0.88917094, + "learning_rate": 0.0009551448565735767, + "loss": 0.89977312, + "num_input_tokens_seen": 69373888, + "router_z_loss_mlp": 0.53955078, + "step": 841, + "time_per_iteration": 2.771730422973633 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059839, + "balance_loss_mlp": 1.00624251, + "epoch": 0.16198537899191998, + "flos": 788552275968.0, + "grad_norm": 0.040424272174261144, + "language_loss": 0.855564, + "learning_rate": 0.0009550157985045543, + "loss": 0.86616236, + "num_input_tokens_seen": 69449984, + "router_z_loss_mlp": 0.53710938, + "step": 842, + "time_per_iteration": 3.014448642730713 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063416, + "balance_loss_mlp": 1.00986671, + "epoch": 0.16217776067718354, + "flos": 520830470400.0, + "grad_norm": 0.03210449059239548, + "language_loss": 0.9010545, + "learning_rate": 0.0009548865637827321, + "loss": 0.91168869, + "num_input_tokens_seen": 69522736, + "router_z_loss_mlp": 0.53662109, + "step": 843, + "time_per_iteration": 2.663733959197998 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060664, + "balance_loss_mlp": 1.00725794, + "epoch": 0.1623701423624471, + "flos": 506255246592.0, + "grad_norm": 0.04236042945807781, + "language_loss": 0.91279781, + "learning_rate": 0.0009547571524582838, + "loss": 0.92340446, + "num_input_tokens_seen": 69587184, + "router_z_loss_mlp": 0.53515625, + "step": 844, + "time_per_iteration": 2.5841143131256104 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061607, + "balance_loss_mlp": 1.00848722, + "epoch": 0.16256252404771065, + "flos": 498157900032.0, + "grad_norm": 0.043042899099755685, + "language_loss": 0.93573415, + "learning_rate": 0.0009546275645814512, + "loss": 0.94635028, + "num_input_tokens_seen": 69656560, + "router_z_loss_mlp": 0.53222656, + "step": 845, + "time_per_iteration": 2.601743221282959 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064884, + "balance_loss_mlp": 1.01152599, + "epoch": 0.16275490573297421, + "flos": 503287516416.0, + "grad_norm": 0.046422900850994125, + "language_loss": 0.90658545, + "learning_rate": 0.0009544978002025446, + "loss": 0.9172343, + "num_input_tokens_seen": 69723872, + "router_z_loss_mlp": 0.53466797, + "step": 846, + "time_per_iteration": 2.582463502883911 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062884, + "balance_loss_mlp": 1.00957346, + "epoch": 0.16294728741823777, + "flos": 508354916352.0, + "grad_norm": 0.03474620131823351, + "language_loss": 0.88017273, + "learning_rate": 0.0009543678593719434, + "loss": 0.89080155, + "num_input_tokens_seen": 69795504, + "router_z_loss_mlp": 0.53417969, + "step": 847, + "time_per_iteration": 2.7039546966552734 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067088, + "balance_loss_mlp": 1.01334834, + "epoch": 0.16313966910350133, + "flos": 510757875456.0, + "grad_norm": 0.031134263506057067, + "language_loss": 0.88570058, + "learning_rate": 0.0009542377421400945, + "loss": 0.89637142, + "num_input_tokens_seen": 69873408, + "router_z_loss_mlp": 0.53857422, + "step": 848, + "time_per_iteration": 2.79311203956604 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061983, + "balance_loss_mlp": 1.00810015, + "epoch": 0.16333205078876492, + "flos": 545057381376.0, + "grad_norm": 0.03805815068737175, + "language_loss": 0.84448338, + "learning_rate": 0.0009541074485575145, + "loss": 0.85510319, + "num_input_tokens_seen": 69944112, + "router_z_loss_mlp": 0.54003906, + "step": 849, + "time_per_iteration": 2.714644193649292 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106829, + "balance_loss_mlp": 1.01450312, + "epoch": 0.16352443247402848, + "flos": 508712640768.0, + "grad_norm": 0.03447226436126556, + "language_loss": 0.93184924, + "learning_rate": 0.0009539769786747874, + "loss": 0.94253218, + "num_input_tokens_seen": 70012288, + "router_z_loss_mlp": 0.5390625, + "step": 850, + "time_per_iteration": 2.5857110023498535 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070929, + "balance_loss_mlp": 1.01709449, + "epoch": 0.16371681415929204, + "flos": 543223084032.0, + "grad_norm": 0.036141614394747515, + "language_loss": 0.82550752, + "learning_rate": 0.0009538463325425665, + "loss": 0.83621687, + "num_input_tokens_seen": 70086560, + "router_z_loss_mlp": 0.53955078, + "step": 851, + "time_per_iteration": 2.7186405658721924 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066598, + "balance_loss_mlp": 1.01242912, + "epoch": 0.1639091958445556, + "flos": 521761714176.0, + "grad_norm": 0.03784697093976771, + "language_loss": 0.87203169, + "learning_rate": 0.0009537155102115728, + "loss": 0.8826977, + "num_input_tokens_seen": 70153968, + "router_z_loss_mlp": 0.54296875, + "step": 852, + "time_per_iteration": 2.5761775970458984 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061784, + "balance_loss_mlp": 1.00771022, + "epoch": 0.16410157752981916, + "flos": 548482957824.0, + "grad_norm": 0.03731294741121226, + "language_loss": 0.85278255, + "learning_rate": 0.0009535845117325961, + "loss": 0.8634004, + "num_input_tokens_seen": 70222496, + "router_z_loss_mlp": 0.54199219, + "step": 853, + "time_per_iteration": 2.6968846321105957 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065026, + "balance_loss_mlp": 1.01085758, + "epoch": 0.16429395921508272, + "flos": 584026712064.0, + "grad_norm": 0.031860977478103375, + "language_loss": 0.9423098, + "learning_rate": 0.0009534533371564946, + "loss": 0.95296007, + "num_input_tokens_seen": 70301680, + "router_z_loss_mlp": 0.54296875, + "step": 854, + "time_per_iteration": 2.7640349864959717 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106098, + "balance_loss_mlp": 1.00709713, + "epoch": 0.16448634090034628, + "flos": 531962621184.0, + "grad_norm": 0.03950290113288642, + "language_loss": 0.89868152, + "learning_rate": 0.0009533219865341949, + "loss": 0.90929133, + "num_input_tokens_seen": 70371152, + "router_z_loss_mlp": 0.54003906, + "step": 855, + "time_per_iteration": 2.6025009155273438 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060489, + "balance_loss_mlp": 1.00693989, + "epoch": 0.16467872258560984, + "flos": 492961209600.0, + "grad_norm": 0.03645156199748424, + "language_loss": 0.87602645, + "learning_rate": 0.0009531904599166916, + "loss": 0.88663131, + "num_input_tokens_seen": 70440832, + "router_z_loss_mlp": 0.53662109, + "step": 856, + "time_per_iteration": 2.656604290008545 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060758, + "balance_loss_mlp": 1.00730467, + "epoch": 0.16487110427087343, + "flos": 507260367360.0, + "grad_norm": 0.04426557796634758, + "language_loss": 0.86560714, + "learning_rate": 0.0009530587573550478, + "loss": 0.87621474, + "num_input_tokens_seen": 70507424, + "router_z_loss_mlp": 0.53564453, + "step": 857, + "time_per_iteration": 2.610445261001587 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01056503, + "balance_loss_mlp": 1.00538635, + "epoch": 0.16506348595613698, + "flos": 1436111555328.0, + "grad_norm": 0.010874217326465607, + "language_loss": 0.74319386, + "learning_rate": 0.0009529268789003953, + "loss": 0.75375891, + "num_input_tokens_seen": 70742320, + "router_z_loss_mlp": 0.51171875, + "step": 858, + "time_per_iteration": 4.991516590118408 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060779, + "balance_loss_mlp": 1.00718212, + "epoch": 0.16525586764140054, + "flos": 478090477824.0, + "grad_norm": 0.04454190836652637, + "language_loss": 0.91544032, + "learning_rate": 0.0009527948246039337, + "loss": 0.9260481, + "num_input_tokens_seen": 70808400, + "router_z_loss_mlp": 0.53710938, + "step": 859, + "time_per_iteration": 2.538290500640869 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01058142, + "balance_loss_mlp": 1.00425971, + "epoch": 0.1654482493266641, + "flos": 882541767168.0, + "grad_norm": 0.03991834039284953, + "language_loss": 0.88867122, + "learning_rate": 0.000952662594516931, + "loss": 0.89925265, + "num_input_tokens_seen": 70886192, + "router_z_loss_mlp": 0.54003906, + "step": 860, + "time_per_iteration": 3.083786964416504 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065202, + "balance_loss_mlp": 1.01122451, + "epoch": 0.16564063101192766, + "flos": 628106217216.0, + "grad_norm": 0.03630731527649873, + "language_loss": 0.87934124, + "learning_rate": 0.0009525301886907234, + "loss": 0.88999331, + "num_input_tokens_seen": 70964816, + "router_z_loss_mlp": 0.54101562, + "step": 861, + "time_per_iteration": 2.8606412410736084 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062775, + "balance_loss_mlp": 1.00884438, + "epoch": 0.16583301269719122, + "flos": 562593532416.0, + "grad_norm": 0.03632506699489255, + "language_loss": 0.8885988, + "learning_rate": 0.0009523976071767155, + "loss": 0.89922649, + "num_input_tokens_seen": 71037456, + "router_z_loss_mlp": 0.54052734, + "step": 862, + "time_per_iteration": 2.651202440261841 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062989, + "balance_loss_mlp": 1.0094403, + "epoch": 0.16602539438245478, + "flos": 568984893696.0, + "grad_norm": 0.03883194498572106, + "language_loss": 0.88789731, + "learning_rate": 0.00095226485002638, + "loss": 0.8985272, + "num_input_tokens_seen": 71111872, + "router_z_loss_mlp": 0.53662109, + "step": 863, + "time_per_iteration": 2.798125982284546 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063787, + "balance_loss_mlp": 1.01019073, + "epoch": 0.16621777606771834, + "flos": 576022684416.0, + "grad_norm": 0.03638934937563812, + "language_loss": 0.89892161, + "learning_rate": 0.0009521319172912576, + "loss": 0.90955949, + "num_input_tokens_seen": 71187808, + "router_z_loss_mlp": 0.53710938, + "step": 864, + "time_per_iteration": 4.098716974258423 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105997, + "balance_loss_mlp": 1.00632548, + "epoch": 0.16641015775298193, + "flos": 515598786816.0, + "grad_norm": 0.037169751839881825, + "language_loss": 0.96108532, + "learning_rate": 0.0009519988090229579, + "loss": 0.97168505, + "num_input_tokens_seen": 71261728, + "router_z_loss_mlp": 0.53759766, + "step": 865, + "time_per_iteration": 2.659381628036499 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068447, + "balance_loss_mlp": 1.01489806, + "epoch": 0.1666025394382455, + "flos": 622850234112.0, + "grad_norm": 0.04388029559541895, + "language_loss": 0.88811028, + "learning_rate": 0.0009518655252731576, + "loss": 0.89879477, + "num_input_tokens_seen": 71338352, + "router_z_loss_mlp": 0.53662109, + "step": 866, + "time_per_iteration": 2.738511323928833 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061925, + "balance_loss_mlp": 1.00880551, + "epoch": 0.16679492112350905, + "flos": 549933285888.0, + "grad_norm": 0.03352631932153436, + "language_loss": 0.91113746, + "learning_rate": 0.0009517320660936022, + "loss": 0.92175674, + "num_input_tokens_seen": 71416544, + "router_z_loss_mlp": 0.53222656, + "step": 867, + "time_per_iteration": 2.7755699157714844 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066316, + "balance_loss_mlp": 1.01343453, + "epoch": 0.1669873028087726, + "flos": 666866555904.0, + "grad_norm": 0.04051359913494383, + "language_loss": 0.84396493, + "learning_rate": 0.0009515984315361051, + "loss": 0.85462809, + "num_input_tokens_seen": 71494080, + "router_z_loss_mlp": 0.52978516, + "step": 868, + "time_per_iteration": 2.8502533435821533 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062608, + "balance_loss_mlp": 1.00944042, + "epoch": 0.16717968449403617, + "flos": 539604066816.0, + "grad_norm": 0.03969494402961726, + "language_loss": 0.88029611, + "learning_rate": 0.000951464621652548, + "loss": 0.89092225, + "num_input_tokens_seen": 71562672, + "router_z_loss_mlp": 0.53271484, + "step": 869, + "time_per_iteration": 2.6079800128936768 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065881, + "balance_loss_mlp": 1.01233244, + "epoch": 0.16737206617929973, + "flos": 531279253248.0, + "grad_norm": 0.03349656106003216, + "language_loss": 0.7990135, + "learning_rate": 0.0009513306364948804, + "loss": 0.80967236, + "num_input_tokens_seen": 71641904, + "router_z_loss_mlp": 0.53662109, + "step": 870, + "time_per_iteration": 2.824232578277588 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106371, + "balance_loss_mlp": 1.00987494, + "epoch": 0.1675644478645633, + "flos": 481757127168.0, + "grad_norm": 0.04264569815750397, + "language_loss": 0.90229708, + "learning_rate": 0.0009511964761151197, + "loss": 0.91293418, + "num_input_tokens_seen": 71709616, + "router_z_loss_mlp": 0.53955078, + "step": 871, + "time_per_iteration": 2.6326816082000732 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106642, + "balance_loss_mlp": 1.01344323, + "epoch": 0.16775682954982685, + "flos": 495542058240.0, + "grad_norm": 0.04000245460937008, + "language_loss": 0.91825569, + "learning_rate": 0.0009510621405653521, + "loss": 0.92891991, + "num_input_tokens_seen": 71776592, + "router_z_loss_mlp": 0.53076172, + "step": 872, + "time_per_iteration": 2.5802783966064453 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074346, + "balance_loss_mlp": 1.02151191, + "epoch": 0.1679492112350904, + "flos": 753406096896.0, + "grad_norm": 0.04130745072346603, + "language_loss": 0.85908926, + "learning_rate": 0.0009509276298977309, + "loss": 0.86983275, + "num_input_tokens_seen": 71856352, + "router_z_loss_mlp": 0.52929688, + "step": 873, + "time_per_iteration": 2.9676413536071777 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069963, + "balance_loss_mlp": 1.01689136, + "epoch": 0.168141592920354, + "flos": 1137733583616.0, + "grad_norm": 0.036676349776393134, + "language_loss": 0.82925022, + "learning_rate": 0.0009507929441644778, + "loss": 0.83994985, + "num_input_tokens_seen": 71948480, + "router_z_loss_mlp": 0.53173828, + "step": 874, + "time_per_iteration": 3.5441927909851074 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062646, + "balance_loss_mlp": 1.00924039, + "epoch": 0.16833397460561755, + "flos": 633554674176.0, + "grad_norm": 0.03715311549034911, + "language_loss": 0.86810201, + "learning_rate": 0.0009506580834178826, + "loss": 0.87872851, + "num_input_tokens_seen": 72019200, + "router_z_loss_mlp": 0.53515625, + "step": 875, + "time_per_iteration": 2.767840623855591 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106879, + "balance_loss_mlp": 1.01524162, + "epoch": 0.1685263562908811, + "flos": 542543606784.0, + "grad_norm": 0.041322978640758234, + "language_loss": 0.92533737, + "learning_rate": 0.0009505230477103028, + "loss": 0.93602526, + "num_input_tokens_seen": 72088672, + "router_z_loss_mlp": 0.53662109, + "step": 876, + "time_per_iteration": 2.68626070022583 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064685, + "balance_loss_mlp": 1.01151776, + "epoch": 0.16871873797614467, + "flos": 620486158848.0, + "grad_norm": 0.04979097271806245, + "language_loss": 0.82312369, + "learning_rate": 0.0009503878370941641, + "loss": 0.83377057, + "num_input_tokens_seen": 72159952, + "router_z_loss_mlp": 0.53271484, + "step": 877, + "time_per_iteration": 2.738828182220459 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067432, + "balance_loss_mlp": 1.01455081, + "epoch": 0.16891111966140823, + "flos": 607456527360.0, + "grad_norm": 0.048240798926105125, + "language_loss": 0.90597415, + "learning_rate": 0.0009502524516219595, + "loss": 0.91664839, + "num_input_tokens_seen": 72231648, + "router_z_loss_mlp": 0.52978516, + "step": 878, + "time_per_iteration": 2.7533464431762695 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065414, + "balance_loss_mlp": 1.01234174, + "epoch": 0.1691035013466718, + "flos": 553406494464.0, + "grad_norm": 0.04285435284136928, + "language_loss": 0.91275579, + "learning_rate": 0.0009501168913462506, + "loss": 0.92340994, + "num_input_tokens_seen": 72298608, + "router_z_loss_mlp": 0.53173828, + "step": 879, + "time_per_iteration": 2.6498849391937256 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106115, + "balance_loss_mlp": 1.00946045, + "epoch": 0.16929588303193535, + "flos": 1479308427264.0, + "grad_norm": 0.010969186313753012, + "language_loss": 0.79121923, + "learning_rate": 0.0009499811563196665, + "loss": 0.80183077, + "num_input_tokens_seen": 72525312, + "router_z_loss_mlp": 0.51757812, + "step": 880, + "time_per_iteration": 4.8127734661102295 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065784, + "balance_loss_mlp": 1.01228285, + "epoch": 0.1694882647171989, + "flos": 927848024064.0, + "grad_norm": 0.04254449001590413, + "language_loss": 0.86211771, + "learning_rate": 0.0009498452465949042, + "loss": 0.87277561, + "num_input_tokens_seen": 72612976, + "router_z_loss_mlp": 0.53613281, + "step": 881, + "time_per_iteration": 3.242352247238159 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059657, + "balance_loss_mlp": 1.00668061, + "epoch": 0.1696806464024625, + "flos": 547152193536.0, + "grad_norm": 0.03842920637304405, + "language_loss": 0.92758489, + "learning_rate": 0.0009497091622247285, + "loss": 0.93818152, + "num_input_tokens_seen": 72686800, + "router_z_loss_mlp": 0.53076172, + "step": 882, + "time_per_iteration": 2.7538321018218994 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066328, + "balance_loss_mlp": 1.01363766, + "epoch": 0.16987302808772606, + "flos": 530295519744.0, + "grad_norm": 0.04346709327253658, + "language_loss": 0.94739175, + "learning_rate": 0.0009495729032619723, + "loss": 0.95805502, + "num_input_tokens_seen": 72759360, + "router_z_loss_mlp": 0.52783203, + "step": 883, + "time_per_iteration": 2.681851863861084 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061141, + "balance_loss_mlp": 1.00830746, + "epoch": 0.17006540977298962, + "flos": 756479784960.0, + "grad_norm": 0.03707996109728333, + "language_loss": 0.85065424, + "learning_rate": 0.0009494364697595354, + "loss": 0.86126566, + "num_input_tokens_seen": 72831424, + "router_z_loss_mlp": 0.52929688, + "step": 884, + "time_per_iteration": 2.886613607406616 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01058078, + "balance_loss_mlp": 1.00495851, + "epoch": 0.17025779145825318, + "flos": 559875623424.0, + "grad_norm": 0.04262534374301406, + "language_loss": 0.90753883, + "learning_rate": 0.0009492998617703867, + "loss": 0.91811961, + "num_input_tokens_seen": 72901536, + "router_z_loss_mlp": 0.53222656, + "step": 885, + "time_per_iteration": 2.7197954654693604 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069962, + "balance_loss_mlp": 1.01684284, + "epoch": 0.17045017314351674, + "flos": 513217214976.0, + "grad_norm": 0.04472607646913617, + "language_loss": 0.89151132, + "learning_rate": 0.0009491630793475619, + "loss": 0.90221095, + "num_input_tokens_seen": 72970480, + "router_z_loss_mlp": 0.53222656, + "step": 886, + "time_per_iteration": 2.6023643016815186 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059759, + "balance_loss_mlp": 1.00706899, + "epoch": 0.1706425548287803, + "flos": 510013269504.0, + "grad_norm": 0.03690999998020265, + "language_loss": 0.86250949, + "learning_rate": 0.0009490261225441643, + "loss": 0.87310708, + "num_input_tokens_seen": 73053376, + "router_z_loss_mlp": 0.52783203, + "step": 887, + "time_per_iteration": 2.8811516761779785 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070816, + "balance_loss_mlp": 1.01845872, + "epoch": 0.17083493651404386, + "flos": 718715818752.0, + "grad_norm": 0.037520519160069404, + "language_loss": 0.91723603, + "learning_rate": 0.0009488889914133656, + "loss": 0.92794418, + "num_input_tokens_seen": 73136032, + "router_z_loss_mlp": 0.52441406, + "step": 888, + "time_per_iteration": 2.983920097351074 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067559, + "balance_loss_mlp": 1.01515496, + "epoch": 0.17102731819930742, + "flos": 560201266944.0, + "grad_norm": 0.034570155262309, + "language_loss": 0.90050644, + "learning_rate": 0.0009487516860084047, + "loss": 0.91118205, + "num_input_tokens_seen": 73208544, + "router_z_loss_mlp": 0.52490234, + "step": 889, + "time_per_iteration": 2.739945888519287 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061028, + "balance_loss_mlp": 1.0078603, + "epoch": 0.17121969988457098, + "flos": 495765634560.0, + "grad_norm": 0.04354558177795279, + "language_loss": 0.9033885, + "learning_rate": 0.0009486142063825884, + "loss": 0.91399872, + "num_input_tokens_seen": 73274336, + "router_z_loss_mlp": 0.53271484, + "step": 890, + "time_per_iteration": 2.541325569152832 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107373, + "balance_loss_mlp": 1.02223206, + "epoch": 0.17141208156983456, + "flos": 1552108723968.0, + "grad_norm": 0.01766408052426257, + "language_loss": 0.72426212, + "learning_rate": 0.0009484765525892909, + "loss": 0.73499948, + "num_input_tokens_seen": 73506320, + "router_z_loss_mlp": 0.515625, + "step": 891, + "time_per_iteration": 4.968579053878784 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01058616, + "balance_loss_mlp": 1.00568736, + "epoch": 0.17160446325509812, + "flos": 620700986880.0, + "grad_norm": 0.037544702591063864, + "language_loss": 0.91210532, + "learning_rate": 0.0009483387246819542, + "loss": 0.92269152, + "num_input_tokens_seen": 73578048, + "router_z_loss_mlp": 0.53027344, + "step": 892, + "time_per_iteration": 2.7970938682556152 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071655, + "balance_loss_mlp": 1.0209198, + "epoch": 0.17179684494036168, + "flos": 1384695839232.0, + "grad_norm": 0.01601076320839161, + "language_loss": 0.82285583, + "learning_rate": 0.0009482007227140877, + "loss": 0.83357239, + "num_input_tokens_seen": 73798640, + "router_z_loss_mlp": 0.5078125, + "step": 893, + "time_per_iteration": 4.629605054855347 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066274, + "balance_loss_mlp": 1.01386988, + "epoch": 0.17198922662562524, + "flos": 493642632192.0, + "grad_norm": 0.03763004911158334, + "language_loss": 0.90241146, + "learning_rate": 0.0009480625467392688, + "loss": 0.91307414, + "num_input_tokens_seen": 73867328, + "router_z_loss_mlp": 0.52490234, + "step": 894, + "time_per_iteration": 2.6142358779907227 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068771, + "balance_loss_mlp": 1.01822662, + "epoch": 0.1721816083108888, + "flos": 1461488428800.0, + "grad_norm": 0.016749035753296605, + "language_loss": 0.77994668, + "learning_rate": 0.0009479241968111421, + "loss": 0.79063439, + "num_input_tokens_seen": 74093376, + "router_z_loss_mlp": 0.50585938, + "step": 895, + "time_per_iteration": 4.811494827270508 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112065, + "balance_loss_mlp": 1.06719661, + "epoch": 0.17237398999615236, + "flos": 529205828352.0, + "grad_norm": 0.05241044192650153, + "language_loss": 0.88738441, + "learning_rate": 0.0009477856729834196, + "loss": 0.89859092, + "num_input_tokens_seen": 74169136, + "router_z_loss_mlp": 0.53564453, + "step": 896, + "time_per_iteration": 2.7389612197875977 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066584, + "balance_loss_mlp": 1.01446557, + "epoch": 0.17256637168141592, + "flos": 605027323392.0, + "grad_norm": 0.03860455021635393, + "language_loss": 0.90989411, + "learning_rate": 0.0009476469753098809, + "loss": 0.92055988, + "num_input_tokens_seen": 74236912, + "router_z_loss_mlp": 0.52197266, + "step": 897, + "time_per_iteration": 2.7175238132476807 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077196, + "balance_loss_mlp": 1.02507758, + "epoch": 0.17275875336667948, + "flos": 510694692096.0, + "grad_norm": 0.040412661310783936, + "language_loss": 0.88453948, + "learning_rate": 0.0009475081038443738, + "loss": 0.89531147, + "num_input_tokens_seen": 74305968, + "router_z_loss_mlp": 0.52197266, + "step": 898, + "time_per_iteration": 2.6398110389709473 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079459, + "balance_loss_mlp": 1.02753115, + "epoch": 0.17295113505194307, + "flos": 666502028544.0, + "grad_norm": 0.045107808798334564, + "language_loss": 0.87902451, + "learning_rate": 0.0009473690586408124, + "loss": 0.88981915, + "num_input_tokens_seen": 74384144, + "router_z_loss_mlp": 0.52001953, + "step": 899, + "time_per_iteration": 2.817730665206909 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071678, + "balance_loss_mlp": 1.01965487, + "epoch": 0.17314351673720663, + "flos": 556432550400.0, + "grad_norm": 0.03870851432877784, + "language_loss": 0.87576568, + "learning_rate": 0.0009472298397531792, + "loss": 0.88648236, + "num_input_tokens_seen": 74455040, + "router_z_loss_mlp": 0.52099609, + "step": 900, + "time_per_iteration": 2.6932764053344727 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061802, + "balance_loss_mlp": 1.00892079, + "epoch": 0.17333589842247019, + "flos": 504607587072.0, + "grad_norm": 0.03631909976073519, + "language_loss": 0.87174571, + "learning_rate": 0.0009470904472355235, + "loss": 0.88236374, + "num_input_tokens_seen": 74525248, + "router_z_loss_mlp": 0.52978516, + "step": 901, + "time_per_iteration": 2.669405460357666 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099242, + "balance_loss_mlp": 1.04593205, + "epoch": 0.17352828010773375, + "flos": 557351155200.0, + "grad_norm": 0.04839261993488341, + "language_loss": 0.80976391, + "learning_rate": 0.0009469508811419626, + "loss": 0.82075632, + "num_input_tokens_seen": 74597328, + "router_z_loss_mlp": 0.53417969, + "step": 902, + "time_per_iteration": 2.7412211894989014 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083992, + "balance_loss_mlp": 1.033638, + "epoch": 0.1737206617929973, + "flos": 1557794363136.0, + "grad_norm": 0.02136399149953286, + "language_loss": 0.7161383, + "learning_rate": 0.0009468111415266806, + "loss": 0.72697818, + "num_input_tokens_seen": 74819664, + "router_z_loss_mlp": 0.50390625, + "step": 903, + "time_per_iteration": 4.800720930099487 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075494, + "balance_loss_mlp": 1.02318478, + "epoch": 0.17391304347826086, + "flos": 517756782336.0, + "grad_norm": 0.04178806719411302, + "language_loss": 0.85797513, + "learning_rate": 0.0009466712284439292, + "loss": 0.86873007, + "num_input_tokens_seen": 74896224, + "router_z_loss_mlp": 0.52392578, + "step": 904, + "time_per_iteration": 2.7409780025482178 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076244, + "balance_loss_mlp": 1.02360141, + "epoch": 0.17410542516352442, + "flos": 542161582848.0, + "grad_norm": 0.043268311729831165, + "language_loss": 0.90273786, + "learning_rate": 0.0009465311419480276, + "loss": 0.91350031, + "num_input_tokens_seen": 74966560, + "router_z_loss_mlp": 0.52734375, + "step": 905, + "time_per_iteration": 2.7310986518859863 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068245, + "balance_loss_mlp": 1.01526833, + "epoch": 0.17429780684878798, + "flos": 625082106624.0, + "grad_norm": 0.0375699532684124, + "language_loss": 0.89484948, + "learning_rate": 0.0009463908820933622, + "loss": 0.905532, + "num_input_tokens_seen": 75045248, + "router_z_loss_mlp": 0.53076172, + "step": 906, + "time_per_iteration": 2.8575551509857178 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086696, + "balance_loss_mlp": 1.03281319, + "epoch": 0.17449018853405157, + "flos": 576849915648.0, + "grad_norm": 0.04286783530345041, + "language_loss": 0.83513701, + "learning_rate": 0.0009462504489343868, + "loss": 0.84600401, + "num_input_tokens_seen": 75123952, + "router_z_loss_mlp": 0.54003906, + "step": 907, + "time_per_iteration": 2.83085036277771 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066078, + "balance_loss_mlp": 1.0128628, + "epoch": 0.17468257021931513, + "flos": 534773849088.0, + "grad_norm": 0.0408315501053547, + "language_loss": 0.90177906, + "learning_rate": 0.0009461098425256222, + "loss": 0.91243982, + "num_input_tokens_seen": 75191728, + "router_z_loss_mlp": 0.53320312, + "step": 908, + "time_per_iteration": 2.6000654697418213 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075411, + "balance_loss_mlp": 1.02257717, + "epoch": 0.1748749519045787, + "flos": 541809694464.0, + "grad_norm": 0.0381088809784924, + "language_loss": 0.87053907, + "learning_rate": 0.0009459690629216567, + "loss": 0.88129318, + "num_input_tokens_seen": 75262224, + "router_z_loss_mlp": 0.52929688, + "step": 909, + "time_per_iteration": 2.622178316116333 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080252, + "balance_loss_mlp": 1.02770495, + "epoch": 0.17506733358984225, + "flos": 499627670016.0, + "grad_norm": 0.039096197570908604, + "language_loss": 0.88898331, + "learning_rate": 0.0009458281101771457, + "loss": 0.89978582, + "num_input_tokens_seen": 75329760, + "router_z_loss_mlp": 0.52636719, + "step": 910, + "time_per_iteration": 2.5964770317077637 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064818, + "balance_loss_mlp": 1.01217556, + "epoch": 0.1752597152751058, + "flos": 624133366272.0, + "grad_norm": 0.035444142957055544, + "language_loss": 0.83730716, + "learning_rate": 0.0009456869843468122, + "loss": 0.84795535, + "num_input_tokens_seen": 75407920, + "router_z_loss_mlp": 0.52734375, + "step": 911, + "time_per_iteration": 2.834584951400757 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059336, + "balance_loss_mlp": 1.00650251, + "epoch": 0.17545209696036937, + "flos": 521994038784.0, + "grad_norm": 0.04587594362499167, + "language_loss": 0.79429859, + "learning_rate": 0.0009455456854854459, + "loss": 0.80489194, + "num_input_tokens_seen": 75476752, + "router_z_loss_mlp": 0.52929688, + "step": 912, + "time_per_iteration": 2.627058744430542 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107498, + "balance_loss_mlp": 1.0219084, + "epoch": 0.17564447864563293, + "flos": 462946592256.0, + "grad_norm": 0.044462507375804226, + "language_loss": 0.85522115, + "learning_rate": 0.0009454042136479039, + "loss": 0.86597091, + "num_input_tokens_seen": 75542944, + "router_z_loss_mlp": 0.53173828, + "step": 913, + "time_per_iteration": 2.562453031539917 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106477, + "balance_loss_mlp": 1.01250815, + "epoch": 0.1758368603308965, + "flos": 481618121472.0, + "grad_norm": 0.03599423435064716, + "language_loss": 0.84144086, + "learning_rate": 0.0009452625688891103, + "loss": 0.85208857, + "num_input_tokens_seen": 75609840, + "router_z_loss_mlp": 0.5234375, + "step": 914, + "time_per_iteration": 2.6025402545928955 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063728, + "balance_loss_mlp": 1.0137558, + "epoch": 0.17602924201616005, + "flos": 1482087574272.0, + "grad_norm": 0.013260252544834742, + "language_loss": 0.78734738, + "learning_rate": 0.0009451207512640567, + "loss": 0.79798466, + "num_input_tokens_seen": 75819312, + "router_z_loss_mlp": 0.49902344, + "step": 915, + "time_per_iteration": 4.572151184082031 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107428, + "balance_loss_mlp": 1.0219233, + "epoch": 0.17622162370142364, + "flos": 603471037440.0, + "grad_norm": 0.044830704586910027, + "language_loss": 0.94022703, + "learning_rate": 0.0009449787608278015, + "loss": 0.95096982, + "num_input_tokens_seen": 75893984, + "router_z_loss_mlp": 0.52441406, + "step": 916, + "time_per_iteration": 2.731264114379883 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062837, + "balance_loss_mlp": 1.0104804, + "epoch": 0.1764140053866872, + "flos": 443606279424.0, + "grad_norm": 0.0370205772569368, + "language_loss": 0.92972034, + "learning_rate": 0.0009448365976354704, + "loss": 0.94034874, + "num_input_tokens_seen": 75958944, + "router_z_loss_mlp": 0.52441406, + "step": 917, + "time_per_iteration": 2.478041648864746 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073342, + "balance_loss_mlp": 1.0204134, + "epoch": 0.17660638707195075, + "flos": 501592224768.0, + "grad_norm": 0.047363321454448416, + "language_loss": 0.907022, + "learning_rate": 0.0009446942617422558, + "loss": 0.91775542, + "num_input_tokens_seen": 76024240, + "router_z_loss_mlp": 0.53027344, + "step": 918, + "time_per_iteration": 2.5698564052581787 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060825, + "balance_loss_mlp": 1.00789583, + "epoch": 0.17679876875721431, + "flos": 539984145408.0, + "grad_norm": 0.03732253291641402, + "language_loss": 0.86447889, + "learning_rate": 0.0009445517532034176, + "loss": 0.87508708, + "num_input_tokens_seen": 76095264, + "router_z_loss_mlp": 0.53027344, + "step": 919, + "time_per_iteration": 2.6916563510894775 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062144, + "balance_loss_mlp": 1.00926292, + "epoch": 0.17699115044247787, + "flos": 498715868160.0, + "grad_norm": 0.04444616550081301, + "language_loss": 0.8994987, + "learning_rate": 0.0009444090720742824, + "loss": 0.91012013, + "num_input_tokens_seen": 76163520, + "router_z_loss_mlp": 0.52978516, + "step": 920, + "time_per_iteration": 2.5798380374908447 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069798, + "balance_loss_mlp": 1.01706016, + "epoch": 0.17718353212774143, + "flos": 663916322304.0, + "grad_norm": 0.04662040468857239, + "language_loss": 0.89399016, + "learning_rate": 0.0009442662184102439, + "loss": 0.90468818, + "num_input_tokens_seen": 76233760, + "router_z_loss_mlp": 0.52832031, + "step": 921, + "time_per_iteration": 2.755929708480835 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064095, + "balance_loss_mlp": 1.01164341, + "epoch": 0.177375913813005, + "flos": 583848822528.0, + "grad_norm": 0.03479566109485236, + "language_loss": 0.88455689, + "learning_rate": 0.000944123192266763, + "loss": 0.89519787, + "num_input_tokens_seen": 76310704, + "router_z_loss_mlp": 0.52539062, + "step": 922, + "time_per_iteration": 2.8776824474334717 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062973, + "balance_loss_mlp": 1.00980616, + "epoch": 0.17756829549826855, + "flos": 553684505856.0, + "grad_norm": 0.036018663808135676, + "language_loss": 0.84559548, + "learning_rate": 0.0009439799936993671, + "loss": 0.85622525, + "num_input_tokens_seen": 76386992, + "router_z_loss_mlp": 0.53271484, + "step": 923, + "time_per_iteration": 2.708897113800049 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063296, + "balance_loss_mlp": 1.01041508, + "epoch": 0.17776067718353214, + "flos": 557372542464.0, + "grad_norm": 0.06706828820902193, + "language_loss": 0.89721078, + "learning_rate": 0.0009438366227636511, + "loss": 0.90784371, + "num_input_tokens_seen": 76453328, + "router_z_loss_mlp": 0.52978516, + "step": 924, + "time_per_iteration": 2.6524295806884766 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062711, + "balance_loss_mlp": 1.01035416, + "epoch": 0.1779530588687957, + "flos": 659652820992.0, + "grad_norm": 0.03503923634288643, + "language_loss": 0.87549317, + "learning_rate": 0.0009436930795152763, + "loss": 0.8861202, + "num_input_tokens_seen": 76529040, + "router_z_loss_mlp": 0.52441406, + "step": 925, + "time_per_iteration": 2.8627374172210693 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070161, + "balance_loss_mlp": 1.01823378, + "epoch": 0.17814544055405926, + "flos": 645672503808.0, + "grad_norm": 0.03989967380061369, + "language_loss": 0.87815237, + "learning_rate": 0.0009435493640099713, + "loss": 0.88885403, + "num_input_tokens_seen": 76604080, + "router_z_loss_mlp": 0.52001953, + "step": 926, + "time_per_iteration": 2.7886180877685547 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065048, + "balance_loss_mlp": 1.01283479, + "epoch": 0.17833782223932282, + "flos": 461885091072.0, + "grad_norm": 0.040977111340993126, + "language_loss": 0.85709256, + "learning_rate": 0.0009434054763035314, + "loss": 0.86774307, + "num_input_tokens_seen": 76674096, + "router_z_loss_mlp": 0.52294922, + "step": 927, + "time_per_iteration": 2.635576009750366 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010578, + "balance_loss_mlp": 1.00520515, + "epoch": 0.17853020392458638, + "flos": 760854101760.0, + "grad_norm": 0.029435711646972902, + "language_loss": 0.86359227, + "learning_rate": 0.0009432614164518185, + "loss": 0.8741703, + "num_input_tokens_seen": 76752144, + "router_z_loss_mlp": 0.52685547, + "step": 928, + "time_per_iteration": 2.945253849029541 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074963, + "balance_loss_mlp": 1.02203369, + "epoch": 0.17872258560984994, + "flos": 784056450048.0, + "grad_norm": 0.039066121455708196, + "language_loss": 0.84876156, + "learning_rate": 0.000943117184510762, + "loss": 0.85951114, + "num_input_tokens_seen": 76830240, + "router_z_loss_mlp": 0.53027344, + "step": 929, + "time_per_iteration": 3.0016870498657227 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092369, + "balance_loss_mlp": 1.04201508, + "epoch": 0.1789149672951135, + "flos": 1463034021120.0, + "grad_norm": 0.03241390760866092, + "language_loss": 0.78789961, + "learning_rate": 0.0009429727805363575, + "loss": 0.79882336, + "num_input_tokens_seen": 77062464, + "router_z_loss_mlp": 0.50390625, + "step": 930, + "time_per_iteration": 5.0408923625946045 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091695, + "balance_loss_mlp": 1.04005396, + "epoch": 0.17910734898037706, + "flos": 504931285248.0, + "grad_norm": 0.037670754636037675, + "language_loss": 0.90276599, + "learning_rate": 0.0009428282045846674, + "loss": 0.91368294, + "num_input_tokens_seen": 77136672, + "router_z_loss_mlp": 0.51708984, + "step": 931, + "time_per_iteration": 2.699357509613037 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093914, + "balance_loss_mlp": 1.04260671, + "epoch": 0.17929973066564064, + "flos": 747670880256.0, + "grad_norm": 0.03557447538434831, + "language_loss": 0.91468316, + "learning_rate": 0.0009426834567118214, + "loss": 0.92562228, + "num_input_tokens_seen": 77227040, + "router_z_loss_mlp": 0.51367188, + "step": 932, + "time_per_iteration": 3.0888116359710693 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095199, + "balance_loss_mlp": 1.04370034, + "epoch": 0.1794921123509042, + "flos": 714573826560.0, + "grad_norm": 0.03713873812168088, + "language_loss": 0.82311261, + "learning_rate": 0.0009425385369740155, + "loss": 0.8340646, + "num_input_tokens_seen": 77319392, + "router_z_loss_mlp": 0.515625, + "step": 933, + "time_per_iteration": 3.0156304836273193 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109606, + "balance_loss_mlp": 1.04465711, + "epoch": 0.17968449403616776, + "flos": 634362463488.0, + "grad_norm": 0.04581160448205157, + "language_loss": 0.89044029, + "learning_rate": 0.0009423934454275125, + "loss": 0.90140092, + "num_input_tokens_seen": 77394688, + "router_z_loss_mlp": 0.51464844, + "step": 934, + "time_per_iteration": 2.8524558544158936 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095874, + "balance_loss_mlp": 1.04428041, + "epoch": 0.17987687572143132, + "flos": 537378997248.0, + "grad_norm": 0.045982575553228676, + "language_loss": 0.93734717, + "learning_rate": 0.0009422481821286418, + "loss": 0.94830596, + "num_input_tokens_seen": 77468288, + "router_z_loss_mlp": 0.51660156, + "step": 935, + "time_per_iteration": 2.7354249954223633 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096416, + "balance_loss_mlp": 1.0448221, + "epoch": 0.18006925740669488, + "flos": 539119975680.0, + "grad_norm": 0.04748543050697339, + "language_loss": 0.89948702, + "learning_rate": 0.0009421027471337998, + "loss": 0.91045117, + "num_input_tokens_seen": 77535840, + "router_z_loss_mlp": 0.51660156, + "step": 936, + "time_per_iteration": 2.660287380218506 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095184, + "balance_loss_mlp": 1.04363835, + "epoch": 0.18026163909195844, + "flos": 540535310592.0, + "grad_norm": 0.04911488628490749, + "language_loss": 0.84066534, + "learning_rate": 0.0009419571404994493, + "loss": 0.8516171, + "num_input_tokens_seen": 77604000, + "router_z_loss_mlp": 0.51611328, + "step": 937, + "time_per_iteration": 2.624769687652588 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090343, + "balance_loss_mlp": 1.03865409, + "epoch": 0.180454020777222, + "flos": 501683598336.0, + "grad_norm": 0.0468107226861285, + "language_loss": 0.92304778, + "learning_rate": 0.00094181136228212, + "loss": 0.9339512, + "num_input_tokens_seen": 77671488, + "router_z_loss_mlp": 0.51757812, + "step": 938, + "time_per_iteration": 2.6784133911132812 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092284, + "balance_loss_mlp": 1.04069054, + "epoch": 0.18064640246248556, + "flos": 500007748608.0, + "grad_norm": 0.039466745711782485, + "language_loss": 0.87082231, + "learning_rate": 0.0009416654125384077, + "loss": 0.8817451, + "num_input_tokens_seen": 77746240, + "router_z_loss_mlp": 0.51660156, + "step": 939, + "time_per_iteration": 2.7231576442718506 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081085, + "balance_loss_mlp": 1.03034973, + "epoch": 0.18083878414774912, + "flos": 1522293383424.0, + "grad_norm": 0.016406546431804496, + "language_loss": 0.79772377, + "learning_rate": 0.0009415192913249752, + "loss": 0.80853462, + "num_input_tokens_seen": 77966080, + "router_z_loss_mlp": 0.5078125, + "step": 940, + "time_per_iteration": 4.919930934906006 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01329403, + "balance_loss_mlp": 1.27490067, + "epoch": 0.1810311658330127, + "flos": 728666904576.0, + "grad_norm": 0.12503564718566265, + "language_loss": 0.85519916, + "learning_rate": 0.000941372998698552, + "loss": 0.8684932, + "num_input_tokens_seen": 78049200, + "router_z_loss_mlp": 0.54638672, + "step": 941, + "time_per_iteration": 2.9731380939483643 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093385, + "balance_loss_mlp": 1.04121876, + "epoch": 0.18122354751827627, + "flos": 566045353728.0, + "grad_norm": 0.05253753965114479, + "language_loss": 0.83319217, + "learning_rate": 0.0009412265347159336, + "loss": 0.84412599, + "num_input_tokens_seen": 78122752, + "router_z_loss_mlp": 0.52246094, + "step": 942, + "time_per_iteration": 2.7150988578796387 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103842, + "balance_loss_mlp": 1.05162799, + "epoch": 0.18141592920353983, + "flos": 520318189056.0, + "grad_norm": 0.046885904923641086, + "language_loss": 0.86687338, + "learning_rate": 0.0009410798994339829, + "loss": 0.87791175, + "num_input_tokens_seen": 78194064, + "router_z_loss_mlp": 0.52294922, + "step": 943, + "time_per_iteration": 2.598576545715332 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111341, + "balance_loss_mlp": 1.05831623, + "epoch": 0.1816083108888034, + "flos": 513477729792.0, + "grad_norm": 0.04639702407841738, + "language_loss": 0.8991158, + "learning_rate": 0.000940933092909628, + "loss": 0.91022921, + "num_input_tokens_seen": 78262048, + "router_z_loss_mlp": 0.53125, + "step": 944, + "time_per_iteration": 2.611694574356079 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104212, + "balance_loss_mlp": 1.05109203, + "epoch": 0.18180069257406695, + "flos": 493373369088.0, + "grad_norm": 0.04493061679832577, + "language_loss": 0.85416293, + "learning_rate": 0.0009407861151998649, + "loss": 0.86520505, + "num_input_tokens_seen": 78330624, + "router_z_loss_mlp": 0.53222656, + "step": 945, + "time_per_iteration": 2.5710983276367188 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110023, + "balance_loss_mlp": 1.04692006, + "epoch": 0.1819930742593305, + "flos": 571231350528.0, + "grad_norm": 0.04259629183686275, + "language_loss": 0.87787771, + "learning_rate": 0.0009406389663617552, + "loss": 0.88888001, + "num_input_tokens_seen": 78400672, + "router_z_loss_mlp": 0.53417969, + "step": 946, + "time_per_iteration": 2.6741456985473633 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100116, + "balance_loss_mlp": 1.04661465, + "epoch": 0.18218545594459407, + "flos": 607111441920.0, + "grad_norm": 0.04866460503106345, + "language_loss": 0.87927794, + "learning_rate": 0.000940491646452427, + "loss": 0.89027911, + "num_input_tokens_seen": 78467952, + "router_z_loss_mlp": 0.53613281, + "step": 947, + "time_per_iteration": 2.718358278274536 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101327, + "balance_loss_mlp": 1.04753995, + "epoch": 0.18237783762985763, + "flos": 549739845120.0, + "grad_norm": 0.042994543525894185, + "language_loss": 0.92601323, + "learning_rate": 0.000940344155529075, + "loss": 0.93702656, + "num_input_tokens_seen": 78538928, + "router_z_loss_mlp": 0.5390625, + "step": 948, + "time_per_iteration": 2.624303102493286 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097087, + "balance_loss_mlp": 1.04325247, + "epoch": 0.1825702193151212, + "flos": 451675435776.0, + "grad_norm": 0.046415524987670945, + "language_loss": 0.89178842, + "learning_rate": 0.0009401964936489605, + "loss": 0.90275931, + "num_input_tokens_seen": 78602144, + "router_z_loss_mlp": 0.53955078, + "step": 949, + "time_per_iteration": 2.5104119777679443 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088983, + "balance_loss_mlp": 1.03524303, + "epoch": 0.18276260100038477, + "flos": 590385025536.0, + "grad_norm": 0.0430347708706334, + "language_loss": 0.86972219, + "learning_rate": 0.0009400486608694108, + "loss": 0.88061202, + "num_input_tokens_seen": 78673152, + "router_z_loss_mlp": 0.53857422, + "step": 950, + "time_per_iteration": 2.744044065475464 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085663, + "balance_loss_mlp": 1.03154159, + "epoch": 0.18295498268564833, + "flos": 788710723584.0, + "grad_norm": 0.040810758702646055, + "language_loss": 0.88588369, + "learning_rate": 0.0009399006572478195, + "loss": 0.89674032, + "num_input_tokens_seen": 78753872, + "router_z_loss_mlp": 0.54248047, + "step": 951, + "time_per_iteration": 3.0828475952148438 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079439, + "balance_loss_mlp": 1.02493632, + "epoch": 0.1831473643709119, + "flos": 579226629888.0, + "grad_norm": 0.03747434947067488, + "language_loss": 0.92113942, + "learning_rate": 0.0009397524828416468, + "loss": 0.93193376, + "num_input_tokens_seen": 78822640, + "router_z_loss_mlp": 0.54638672, + "step": 952, + "time_per_iteration": 2.6881086826324463 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089843, + "balance_loss_mlp": 1.03405273, + "epoch": 0.18333974605617545, + "flos": 567964221696.0, + "grad_norm": 0.0419825959367211, + "language_loss": 0.97306633, + "learning_rate": 0.0009396041377084192, + "loss": 0.9839648, + "num_input_tokens_seen": 78893792, + "router_z_loss_mlp": 0.55957031, + "step": 953, + "time_per_iteration": 2.673654556274414 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097804, + "balance_loss_mlp": 1.04191864, + "epoch": 0.183532127741439, + "flos": 528070450176.0, + "grad_norm": 0.04203850234568462, + "language_loss": 0.89016271, + "learning_rate": 0.0009394556219057295, + "loss": 0.90114069, + "num_input_tokens_seen": 78964752, + "router_z_loss_mlp": 0.56054688, + "step": 954, + "time_per_iteration": 2.7255043983459473 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107099, + "balance_loss_mlp": 1.01610565, + "epoch": 0.18372450942670257, + "flos": 595644899328.0, + "grad_norm": 0.03789415730727427, + "language_loss": 0.84751296, + "learning_rate": 0.0009393069354912362, + "loss": 0.85822284, + "num_input_tokens_seen": 79034400, + "router_z_loss_mlp": 0.55029297, + "step": 955, + "time_per_iteration": 2.7474210262298584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084139, + "balance_loss_mlp": 1.02963698, + "epoch": 0.18391689111196613, + "flos": 646284907008.0, + "grad_norm": 0.04389714766773939, + "language_loss": 0.83882308, + "learning_rate": 0.0009391580785226649, + "loss": 0.84966445, + "num_input_tokens_seen": 79109488, + "router_z_loss_mlp": 0.54638672, + "step": 956, + "time_per_iteration": 2.844409465789795 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081024, + "balance_loss_mlp": 1.02990723, + "epoch": 0.18410927279722972, + "flos": 1460394846720.0, + "grad_norm": 0.013082177800516761, + "language_loss": 0.79340446, + "learning_rate": 0.0009390090510578067, + "loss": 0.80421472, + "num_input_tokens_seen": 79327712, + "router_z_loss_mlp": 0.51171875, + "step": 957, + "time_per_iteration": 4.792405843734741 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084483, + "balance_loss_mlp": 1.030267, + "epoch": 0.18430165448249328, + "flos": 660004709376.0, + "grad_norm": 0.04089111102732722, + "language_loss": 0.88231802, + "learning_rate": 0.0009388598531545196, + "loss": 0.89316285, + "num_input_tokens_seen": 79401504, + "router_z_loss_mlp": 0.54345703, + "step": 958, + "time_per_iteration": 2.900062084197998 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084489, + "balance_loss_mlp": 1.03017747, + "epoch": 0.18449403616775684, + "flos": 518950486272.0, + "grad_norm": 0.045948437313162956, + "language_loss": 0.87467843, + "learning_rate": 0.000938710484870727, + "loss": 0.88552332, + "num_input_tokens_seen": 79466688, + "router_z_loss_mlp": 0.54443359, + "step": 959, + "time_per_iteration": 2.5785140991210938 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085738, + "balance_loss_mlp": 1.031569, + "epoch": 0.1846864178530204, + "flos": 553825456896.0, + "grad_norm": 0.04362127254920589, + "language_loss": 0.87369549, + "learning_rate": 0.0009385609462644189, + "loss": 0.88455284, + "num_input_tokens_seen": 79540288, + "router_z_loss_mlp": 0.54296875, + "step": 960, + "time_per_iteration": 2.686221122741699 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082294, + "balance_loss_mlp": 1.02774417, + "epoch": 0.18487879953828396, + "flos": 467116774656.0, + "grad_norm": 0.04468558895083242, + "language_loss": 0.86931455, + "learning_rate": 0.0009384112373936514, + "loss": 0.88013744, + "num_input_tokens_seen": 79611872, + "router_z_loss_mlp": 0.546875, + "step": 961, + "time_per_iteration": 2.633582830429077 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064427, + "balance_loss_mlp": 1.00935197, + "epoch": 0.18507118122354752, + "flos": 649684238592.0, + "grad_norm": 0.03687654302408078, + "language_loss": 0.9259429, + "learning_rate": 0.0009382613583165467, + "loss": 0.93658715, + "num_input_tokens_seen": 79689504, + "router_z_loss_mlp": 0.55224609, + "step": 962, + "time_per_iteration": 2.7910635471343994 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01458915, + "balance_loss_mlp": 1.40078855, + "epoch": 0.18526356290881107, + "flos": 627923470080.0, + "grad_norm": 0.09306974449566385, + "language_loss": 0.90611041, + "learning_rate": 0.0009381113090912928, + "loss": 0.92069954, + "num_input_tokens_seen": 79759264, + "router_z_loss_mlp": 0.57958984, + "step": 963, + "time_per_iteration": 2.7445125579833984 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078714, + "balance_loss_mlp": 1.02464056, + "epoch": 0.18545594459407463, + "flos": 433646445312.0, + "grad_norm": 0.04076594680163087, + "language_loss": 0.91471934, + "learning_rate": 0.000937961089776144, + "loss": 0.92550647, + "num_input_tokens_seen": 79824464, + "router_z_loss_mlp": 0.54199219, + "step": 964, + "time_per_iteration": 2.5835955142974854 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089429, + "balance_loss_mlp": 1.03607059, + "epoch": 0.1856483262793382, + "flos": 750427673088.0, + "grad_norm": 0.041116434601540804, + "language_loss": 0.8449949, + "learning_rate": 0.0009378107004294208, + "loss": 0.8558892, + "num_input_tokens_seen": 79907152, + "router_z_loss_mlp": 0.53466797, + "step": 965, + "time_per_iteration": 2.9773664474487305 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090833, + "balance_loss_mlp": 1.03790379, + "epoch": 0.18584070796460178, + "flos": 531402707712.0, + "grad_norm": 0.04029010126422192, + "language_loss": 0.93043375, + "learning_rate": 0.0009376601411095096, + "loss": 0.94134206, + "num_input_tokens_seen": 79976944, + "router_z_loss_mlp": 0.53027344, + "step": 966, + "time_per_iteration": 2.6703643798828125 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088702, + "balance_loss_mlp": 1.03639269, + "epoch": 0.18603308964986534, + "flos": 484084263936.0, + "grad_norm": 0.03934020689435504, + "language_loss": 0.87718618, + "learning_rate": 0.0009375094118748622, + "loss": 0.88807321, + "num_input_tokens_seen": 80042112, + "router_z_loss_mlp": 0.52392578, + "step": 967, + "time_per_iteration": 2.5719969272613525 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091813, + "balance_loss_mlp": 1.03974187, + "epoch": 0.1862254713351289, + "flos": 802682292480.0, + "grad_norm": 0.042176858736630414, + "language_loss": 0.92643285, + "learning_rate": 0.0009373585127839976, + "loss": 0.93735105, + "num_input_tokens_seen": 80118896, + "router_z_loss_mlp": 0.52148438, + "step": 968, + "time_per_iteration": 2.956153392791748 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096332, + "balance_loss_mlp": 1.04483318, + "epoch": 0.18641785302039246, + "flos": 479290984704.0, + "grad_norm": 0.04307464179422831, + "language_loss": 0.92206955, + "learning_rate": 0.0009372074438954994, + "loss": 0.93303293, + "num_input_tokens_seen": 80183360, + "router_z_loss_mlp": 0.515625, + "step": 969, + "time_per_iteration": 2.512662410736084 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092255, + "balance_loss_mlp": 1.04085171, + "epoch": 0.18661023470565602, + "flos": 389779822848.0, + "grad_norm": 0.044792080488554424, + "language_loss": 0.93312657, + "learning_rate": 0.0009370562052680181, + "loss": 0.94404912, + "num_input_tokens_seen": 80247024, + "router_z_loss_mlp": 0.51464844, + "step": 970, + "time_per_iteration": 2.4642274379730225 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109029, + "balance_loss_mlp": 1.03926873, + "epoch": 0.18680261639091958, + "flos": 565776090624.0, + "grad_norm": 0.03666794569701081, + "language_loss": 0.90593827, + "learning_rate": 0.0009369047969602695, + "loss": 0.91684115, + "num_input_tokens_seen": 80318256, + "router_z_loss_mlp": 0.51074219, + "step": 971, + "time_per_iteration": 2.6925313472747803 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090519, + "balance_loss_mlp": 1.03968859, + "epoch": 0.18699499807618314, + "flos": 480230976768.0, + "grad_norm": 0.04959033368050126, + "language_loss": 0.88274431, + "learning_rate": 0.0009367532190310357, + "loss": 0.89364946, + "num_input_tokens_seen": 80384848, + "router_z_loss_mlp": 0.50878906, + "step": 972, + "time_per_iteration": 2.5632824897766113 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095286, + "balance_loss_mlp": 1.04464579, + "epoch": 0.1871873797614467, + "flos": 554328989952.0, + "grad_norm": 0.047101191533600484, + "language_loss": 0.90956879, + "learning_rate": 0.0009366014715391644, + "loss": 0.92052168, + "num_input_tokens_seen": 80453088, + "router_z_loss_mlp": 0.50683594, + "step": 973, + "time_per_iteration": 2.6131792068481445 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087331, + "balance_loss_mlp": 1.03669059, + "epoch": 0.18737976144671029, + "flos": 553953768960.0, + "grad_norm": 0.03277863870695053, + "language_loss": 0.85193431, + "learning_rate": 0.0009364495545435693, + "loss": 0.86280763, + "num_input_tokens_seen": 80528608, + "router_z_loss_mlp": 0.50683594, + "step": 974, + "time_per_iteration": 2.768160820007324 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077828, + "balance_loss_mlp": 1.02647221, + "epoch": 0.18757214313197385, + "flos": 503248632576.0, + "grad_norm": 0.03709252074476072, + "language_loss": 0.90046728, + "learning_rate": 0.0009362974681032297, + "loss": 0.91124547, + "num_input_tokens_seen": 80599600, + "router_z_loss_mlp": 0.51416016, + "step": 975, + "time_per_iteration": 2.596752405166626 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01358762, + "balance_loss_mlp": 1.30464137, + "epoch": 0.1877645248172374, + "flos": 676292721408.0, + "grad_norm": 0.11355211768831018, + "language_loss": 0.89691889, + "learning_rate": 0.0009361452122771907, + "loss": 0.91050649, + "num_input_tokens_seen": 80677264, + "router_z_loss_mlp": 0.54248047, + "step": 976, + "time_per_iteration": 2.841670036315918 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087707, + "balance_loss_mlp": 1.03649426, + "epoch": 0.18795690650250096, + "flos": 405863700480.0, + "grad_norm": 0.05182073733860081, + "language_loss": 0.85757113, + "learning_rate": 0.0009359927871245635, + "loss": 0.86844826, + "num_input_tokens_seen": 80739776, + "router_z_loss_mlp": 0.51269531, + "step": 977, + "time_per_iteration": 2.4593758583068848 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110124, + "balance_loss_mlp": 1.04988456, + "epoch": 0.18814928818776452, + "flos": 639064369152.0, + "grad_norm": 0.04599902588150218, + "language_loss": 0.8843354, + "learning_rate": 0.0009358401927045246, + "loss": 0.89534783, + "num_input_tokens_seen": 80815200, + "router_z_loss_mlp": 0.51416016, + "step": 978, + "time_per_iteration": 2.8043553829193115 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103516, + "balance_loss_mlp": 1.05197036, + "epoch": 0.18834166987302808, + "flos": 1140117100800.0, + "grad_norm": 0.05109113713971293, + "language_loss": 0.89583617, + "learning_rate": 0.0009356874290763166, + "loss": 0.90687132, + "num_input_tokens_seen": 80905024, + "router_z_loss_mlp": 0.51611328, + "step": 979, + "time_per_iteration": 3.4783685207366943 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105346, + "balance_loss_mlp": 1.0536567, + "epoch": 0.18853405155829164, + "flos": 505816842240.0, + "grad_norm": 0.03906189308485337, + "language_loss": 0.90395761, + "learning_rate": 0.0009355344962992474, + "loss": 0.91501105, + "num_input_tokens_seen": 80976704, + "router_z_loss_mlp": 0.51757812, + "step": 980, + "time_per_iteration": 2.6457359790802 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103049, + "balance_loss_mlp": 1.05116904, + "epoch": 0.1887264332435552, + "flos": 609371504640.0, + "grad_norm": 0.038270487176229884, + "language_loss": 0.89782834, + "learning_rate": 0.0009353813944326908, + "loss": 0.9088589, + "num_input_tokens_seen": 81057152, + "router_z_loss_mlp": 0.51953125, + "step": 981, + "time_per_iteration": 2.923243761062622 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102538, + "balance_loss_mlp": 1.05070543, + "epoch": 0.1889188149288188, + "flos": 553593132288.0, + "grad_norm": 0.04212053297292714, + "language_loss": 0.84181225, + "learning_rate": 0.0009352281235360863, + "loss": 0.85283768, + "num_input_tokens_seen": 81131520, + "router_z_loss_mlp": 0.51904297, + "step": 982, + "time_per_iteration": 2.674790620803833 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103043, + "balance_loss_mlp": 1.05135345, + "epoch": 0.18911119661408235, + "flos": 419470742016.0, + "grad_norm": 0.03892833341753514, + "language_loss": 0.86323905, + "learning_rate": 0.0009350746836689389, + "loss": 0.87426949, + "num_input_tokens_seen": 81195952, + "router_z_loss_mlp": 0.51757812, + "step": 983, + "time_per_iteration": 2.5294649600982666 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103523, + "balance_loss_mlp": 1.05335999, + "epoch": 0.1893035782993459, + "flos": 1485320676864.0, + "grad_norm": 0.016207020064155576, + "language_loss": 0.81439221, + "learning_rate": 0.0009349210748908193, + "loss": 0.82542741, + "num_input_tokens_seen": 81427312, + "router_z_loss_mlp": 0.50195312, + "step": 984, + "time_per_iteration": 5.031845569610596 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094201, + "balance_loss_mlp": 1.04227316, + "epoch": 0.18949595998460947, + "flos": 509457246720.0, + "grad_norm": 0.045438139941342374, + "language_loss": 0.84563899, + "learning_rate": 0.0009347672972613634, + "loss": 0.85658097, + "num_input_tokens_seen": 81494256, + "router_z_loss_mlp": 0.52001953, + "step": 985, + "time_per_iteration": 2.6333274841308594 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090244, + "balance_loss_mlp": 1.0384593, + "epoch": 0.18968834166987303, + "flos": 532193000448.0, + "grad_norm": 0.03993027053802703, + "language_loss": 0.8704083, + "learning_rate": 0.0009346133508402735, + "loss": 0.8813107, + "num_input_tokens_seen": 81569312, + "router_z_loss_mlp": 0.51855469, + "step": 986, + "time_per_iteration": 2.751340389251709 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089761, + "balance_loss_mlp": 1.03797686, + "epoch": 0.1898807233551366, + "flos": 500754299904.0, + "grad_norm": 0.04595906606263721, + "language_loss": 0.85852754, + "learning_rate": 0.0009344592356873166, + "loss": 0.86942512, + "num_input_tokens_seen": 81637024, + "router_z_loss_mlp": 0.51855469, + "step": 987, + "time_per_iteration": 2.6785645484924316 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084349, + "balance_loss_mlp": 1.03223073, + "epoch": 0.19007310504040015, + "flos": 603360221952.0, + "grad_norm": 0.042275439246703725, + "language_loss": 0.79788595, + "learning_rate": 0.0009343049518623255, + "loss": 0.80872947, + "num_input_tokens_seen": 81709488, + "router_z_loss_mlp": 0.52197266, + "step": 988, + "time_per_iteration": 2.709439516067505 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01365061, + "balance_loss_mlp": 1.30979574, + "epoch": 0.1902654867256637, + "flos": 602765315328.0, + "grad_norm": 0.1049262798815586, + "language_loss": 0.8386007, + "learning_rate": 0.0009341504994251985, + "loss": 0.85225129, + "num_input_tokens_seen": 81787152, + "router_z_loss_mlp": 0.55419922, + "step": 989, + "time_per_iteration": 2.925954818725586 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089684, + "balance_loss_mlp": 1.03952026, + "epoch": 0.19045786841092727, + "flos": 1579234345728.0, + "grad_norm": 0.01847097645999908, + "language_loss": 0.73520499, + "learning_rate": 0.0009339958784358994, + "loss": 0.74610186, + "num_input_tokens_seen": 82030608, + "router_z_loss_mlp": 0.50195312, + "step": 990, + "time_per_iteration": 5.025054216384888 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101957, + "balance_loss_mlp": 1.04845631, + "epoch": 0.19065025009619085, + "flos": 683055412992.0, + "grad_norm": 0.039739471389523856, + "language_loss": 0.8281374, + "learning_rate": 0.0009338410889544574, + "loss": 0.83915699, + "num_input_tokens_seen": 82119872, + "router_z_loss_mlp": 0.53613281, + "step": 991, + "time_per_iteration": 3.0653748512268066 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112616, + "balance_loss_mlp": 1.05868626, + "epoch": 0.1908426317814544, + "flos": 603442847232.0, + "grad_norm": 0.04383499470371995, + "language_loss": 0.89543211, + "learning_rate": 0.000933686131040967, + "loss": 0.90655828, + "num_input_tokens_seen": 82195552, + "router_z_loss_mlp": 0.54052734, + "step": 992, + "time_per_iteration": 2.7901530265808105 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106006, + "balance_loss_mlp": 1.0517416, + "epoch": 0.19103501346671797, + "flos": 587434791936.0, + "grad_norm": 0.04122735235002176, + "language_loss": 0.92173266, + "learning_rate": 0.0009335310047555883, + "loss": 0.93279278, + "num_input_tokens_seen": 82267040, + "router_z_loss_mlp": 0.54394531, + "step": 993, + "time_per_iteration": 2.7153608798980713 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097163, + "balance_loss_mlp": 1.04285157, + "epoch": 0.19122739515198153, + "flos": 546835298304.0, + "grad_norm": 0.04052898350535971, + "language_loss": 0.89637405, + "learning_rate": 0.0009333757101585467, + "loss": 0.90734565, + "num_input_tokens_seen": 82337680, + "router_z_loss_mlp": 0.54443359, + "step": 994, + "time_per_iteration": 2.6286795139312744 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091826, + "balance_loss_mlp": 1.03732359, + "epoch": 0.1914197768372451, + "flos": 522550061568.0, + "grad_norm": 0.03850908176124289, + "language_loss": 0.94694555, + "learning_rate": 0.0009332202473101329, + "loss": 0.95786381, + "num_input_tokens_seen": 82409600, + "router_z_loss_mlp": 0.54638672, + "step": 995, + "time_per_iteration": 2.649850368499756 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072176, + "balance_loss_mlp": 1.01714945, + "epoch": 0.19161215852250865, + "flos": 612388812288.0, + "grad_norm": 0.03654296504823072, + "language_loss": 0.83743644, + "learning_rate": 0.0009330646162707028, + "loss": 0.84815824, + "num_input_tokens_seen": 82480288, + "router_z_loss_mlp": 0.55175781, + "step": 996, + "time_per_iteration": 2.7329981327056885 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059087, + "balance_loss_mlp": 1.0033443, + "epoch": 0.1918045402077722, + "flos": 848183935488.0, + "grad_norm": 0.03315860340701524, + "language_loss": 0.85236025, + "learning_rate": 0.0009329088171006779, + "loss": 0.8629511, + "num_input_tokens_seen": 82568960, + "router_z_loss_mlp": 0.55908203, + "step": 997, + "time_per_iteration": 3.135049343109131 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01290698, + "balance_loss_mlp": 1.2330482, + "epoch": 0.19199692189303577, + "flos": 466893198336.0, + "grad_norm": 0.06463762674453556, + "language_loss": 0.86239529, + "learning_rate": 0.0009327528498605446, + "loss": 0.87530231, + "num_input_tokens_seen": 82634128, + "router_z_loss_mlp": 0.57470703, + "step": 998, + "time_per_iteration": 2.5807580947875977 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072398, + "balance_loss_mlp": 1.01727533, + "epoch": 0.19218930357829936, + "flos": 532613908224.0, + "grad_norm": 0.04280698068802137, + "language_loss": 0.90856296, + "learning_rate": 0.0009325967146108548, + "loss": 0.91928697, + "num_input_tokens_seen": 82707472, + "router_z_loss_mlp": 0.55273438, + "step": 999, + "time_per_iteration": 2.637840986251831 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086346, + "balance_loss_mlp": 1.03217781, + "epoch": 0.19238168526356292, + "flos": 602728376832.0, + "grad_norm": 0.04847652630230049, + "language_loss": 0.88902158, + "learning_rate": 0.0009324404114122258, + "loss": 0.89988506, + "num_input_tokens_seen": 82775232, + "router_z_loss_mlp": 0.54296875, + "step": 1000, + "time_per_iteration": 4.1391942501068115 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090902, + "balance_loss_mlp": 1.03701913, + "epoch": 0.19257406694882648, + "flos": 573155076096.0, + "grad_norm": 0.04193719314851312, + "language_loss": 0.88362414, + "learning_rate": 0.0009322839403253397, + "loss": 0.89453316, + "num_input_tokens_seen": 82850032, + "router_z_loss_mlp": 0.54003906, + "step": 1001, + "time_per_iteration": 2.8266265392303467 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087834, + "balance_loss_mlp": 1.03395164, + "epoch": 0.19276644863409004, + "flos": 803157635328.0, + "grad_norm": 0.04353601683576214, + "language_loss": 0.85235333, + "learning_rate": 0.0009321273014109439, + "loss": 0.86323166, + "num_input_tokens_seen": 82926080, + "router_z_loss_mlp": 0.54003906, + "step": 1002, + "time_per_iteration": 2.9539175033569336 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094525, + "balance_loss_mlp": 1.04068995, + "epoch": 0.1929588303193536, + "flos": 564480319488.0, + "grad_norm": 0.03718563884895513, + "language_loss": 0.86078906, + "learning_rate": 0.0009319704947298513, + "loss": 0.87173432, + "num_input_tokens_seen": 83005200, + "router_z_loss_mlp": 0.53955078, + "step": 1003, + "time_per_iteration": 2.8760387897491455 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091693, + "balance_loss_mlp": 1.0380007, + "epoch": 0.19315121200461716, + "flos": 627988598784.0, + "grad_norm": 0.03744955738150477, + "language_loss": 0.89579475, + "learning_rate": 0.0009318135203429393, + "loss": 0.9067117, + "num_input_tokens_seen": 83077280, + "router_z_loss_mlp": 0.53808594, + "step": 1004, + "time_per_iteration": 2.7069175243377686 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094654, + "balance_loss_mlp": 1.04058087, + "epoch": 0.19334359368988072, + "flos": 518584013568.0, + "grad_norm": 0.03742742378220975, + "language_loss": 0.89228511, + "learning_rate": 0.0009316563783111511, + "loss": 0.90323162, + "num_input_tokens_seen": 83145456, + "router_z_loss_mlp": 0.54199219, + "step": 1005, + "time_per_iteration": 2.7024500370025635 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090205, + "balance_loss_mlp": 1.03598833, + "epoch": 0.19353597537514428, + "flos": 695400709632.0, + "grad_norm": 0.036019255491177425, + "language_loss": 0.83731771, + "learning_rate": 0.0009314990686954943, + "loss": 0.84821975, + "num_input_tokens_seen": 83225392, + "router_z_loss_mlp": 0.54345703, + "step": 1006, + "time_per_iteration": 2.901319980621338 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092974, + "balance_loss_mlp": 1.03866184, + "epoch": 0.19372835706040784, + "flos": 1212200981760.0, + "grad_norm": 0.03507497873235563, + "language_loss": 0.82359284, + "learning_rate": 0.000931341591557042, + "loss": 0.8345226, + "num_input_tokens_seen": 83331296, + "router_z_loss_mlp": 0.54443359, + "step": 1007, + "time_per_iteration": 3.70509672164917 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088892, + "balance_loss_mlp": 1.03467596, + "epoch": 0.19392073874567142, + "flos": 521685891840.0, + "grad_norm": 0.04354230775215961, + "language_loss": 0.88703787, + "learning_rate": 0.0009311839469569325, + "loss": 0.89792681, + "num_input_tokens_seen": 83399952, + "router_z_loss_mlp": 0.54345703, + "step": 1008, + "time_per_iteration": 2.632070302963257 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088222, + "balance_loss_mlp": 1.03386211, + "epoch": 0.19411312043093498, + "flos": 589911628032.0, + "grad_norm": 0.044503426382111445, + "language_loss": 0.88821465, + "learning_rate": 0.0009310261349563687, + "loss": 0.89909685, + "num_input_tokens_seen": 83468384, + "router_z_loss_mlp": 0.54492188, + "step": 1009, + "time_per_iteration": 2.7138211727142334 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061835, + "balance_loss_mlp": 1.0067606, + "epoch": 0.19430550211619854, + "flos": 580572945408.0, + "grad_norm": 0.029375689409949213, + "language_loss": 0.86173785, + "learning_rate": 0.0009308681556166186, + "loss": 0.87235624, + "num_input_tokens_seen": 83547952, + "router_z_loss_mlp": 0.55224609, + "step": 1010, + "time_per_iteration": 2.834946870803833 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.05162705, + "balance_loss_mlp": 5.08607721, + "epoch": 0.1944978838014621, + "flos": 622246579200.0, + "grad_norm": 0.2884784307389343, + "language_loss": 0.88793403, + "learning_rate": 0.0009307100089990152, + "loss": 0.93956107, + "num_input_tokens_seen": 83615712, + "router_z_loss_mlp": 0.76513672, + "step": 1011, + "time_per_iteration": 2.705335855484009 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094303, + "balance_loss_mlp": 1.04189909, + "epoch": 0.19469026548672566, + "flos": 599815081728.0, + "grad_norm": 0.04633555371791679, + "language_loss": 0.85740912, + "learning_rate": 0.0009305516951649568, + "loss": 0.86835217, + "num_input_tokens_seen": 83687296, + "router_z_loss_mlp": 0.52490234, + "step": 1012, + "time_per_iteration": 2.7048773765563965 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01164213, + "balance_loss_mlp": 1.11281013, + "epoch": 0.19488264717198922, + "flos": 553248046848.0, + "grad_norm": 0.04991787894778298, + "language_loss": 0.87912452, + "learning_rate": 0.0009303932141759057, + "loss": 0.89076668, + "num_input_tokens_seen": 83763168, + "router_z_loss_mlp": 0.51464844, + "step": 1013, + "time_per_iteration": 2.8072102069854736 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01211245, + "balance_loss_mlp": 1.15984225, + "epoch": 0.19507502885725278, + "flos": 667313708544.0, + "grad_norm": 0.06529111316537192, + "language_loss": 0.85445917, + "learning_rate": 0.0009302345660933902, + "loss": 0.86657166, + "num_input_tokens_seen": 83837312, + "router_z_loss_mlp": 0.51464844, + "step": 1014, + "time_per_iteration": 2.7895615100860596 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01244014, + "balance_loss_mlp": 1.19265878, + "epoch": 0.19526741054251634, + "flos": 672328618752.0, + "grad_norm": 0.06071591874537116, + "language_loss": 0.86587232, + "learning_rate": 0.0009300757509790026, + "loss": 0.87831247, + "num_input_tokens_seen": 83917120, + "router_z_loss_mlp": 0.51416016, + "step": 1015, + "time_per_iteration": 2.8867006301879883 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.012313, + "balance_loss_mlp": 1.18008745, + "epoch": 0.19545979222777993, + "flos": 448147792128.0, + "grad_norm": 0.057262662434688416, + "language_loss": 0.91914976, + "learning_rate": 0.0009299167688944005, + "loss": 0.93146276, + "num_input_tokens_seen": 83982992, + "router_z_loss_mlp": 0.51269531, + "step": 1016, + "time_per_iteration": 2.526421546936035 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01226901, + "balance_loss_mlp": 1.17568827, + "epoch": 0.1956521739130435, + "flos": 570169849344.0, + "grad_norm": 0.05343522997619492, + "language_loss": 0.87454194, + "learning_rate": 0.0009297576199013063, + "loss": 0.8868109, + "num_input_tokens_seen": 84057296, + "router_z_loss_mlp": 0.51269531, + "step": 1017, + "time_per_iteration": 2.7184784412384033 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.012071, + "balance_loss_mlp": 1.15884399, + "epoch": 0.19584455559830705, + "flos": 1458883280640.0, + "grad_norm": 0.03399393552013433, + "language_loss": 0.73002136, + "learning_rate": 0.0009295983040615071, + "loss": 0.74209231, + "num_input_tokens_seen": 84292640, + "router_z_loss_mlp": 0.48242188, + "step": 1018, + "time_per_iteration": 4.916393756866455 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01159874, + "balance_loss_mlp": 1.11199951, + "epoch": 0.1960369372835706, + "flos": 1594484189184.0, + "grad_norm": 0.02523442502037962, + "language_loss": 0.79426301, + "learning_rate": 0.0009294388214368547, + "loss": 0.80586171, + "num_input_tokens_seen": 84524448, + "router_z_loss_mlp": 0.47851562, + "step": 1019, + "time_per_iteration": 5.5991902351379395 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01163202, + "balance_loss_mlp": 1.11241901, + "epoch": 0.19622931896883417, + "flos": 617254023168.0, + "grad_norm": 0.06792637193668423, + "language_loss": 0.88615566, + "learning_rate": 0.0009292791720892659, + "loss": 0.89778763, + "num_input_tokens_seen": 84600208, + "router_z_loss_mlp": 0.50830078, + "step": 1020, + "time_per_iteration": 2.8419806957244873 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01132702, + "balance_loss_mlp": 1.08191884, + "epoch": 0.19642170065409773, + "flos": 467208148224.0, + "grad_norm": 0.044541966790476714, + "language_loss": 0.90245676, + "learning_rate": 0.0009291193560807218, + "loss": 0.91378373, + "num_input_tokens_seen": 84668032, + "router_z_loss_mlp": 0.50830078, + "step": 1021, + "time_per_iteration": 2.60357403755188 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0111942, + "balance_loss_mlp": 1.06858945, + "epoch": 0.19661408233936128, + "flos": 516288957696.0, + "grad_norm": 0.03957164107654416, + "language_loss": 0.88134921, + "learning_rate": 0.0009289593734732688, + "loss": 0.89254344, + "num_input_tokens_seen": 84738176, + "router_z_loss_mlp": 0.50878906, + "step": 1022, + "time_per_iteration": 2.6077988147735596 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115639, + "balance_loss_mlp": 1.06461763, + "epoch": 0.19680646402462484, + "flos": 393494104320.0, + "grad_norm": 0.03618938319364158, + "language_loss": 0.94921708, + "learning_rate": 0.0009287992243290175, + "loss": 0.96037352, + "num_input_tokens_seen": 84799936, + "router_z_loss_mlp": 0.51074219, + "step": 1023, + "time_per_iteration": 2.486910820007324 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104277, + "balance_loss_mlp": 1.05263603, + "epoch": 0.19699884570988843, + "flos": 627624071424.0, + "grad_norm": 0.04088238638674664, + "language_loss": 0.91379654, + "learning_rate": 0.0009286389087101435, + "loss": 0.92483938, + "num_input_tokens_seen": 84877216, + "router_z_loss_mlp": 0.51708984, + "step": 1024, + "time_per_iteration": 2.7762300968170166 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083626, + "balance_loss_mlp": 1.03126919, + "epoch": 0.197191227395152, + "flos": 559074637056.0, + "grad_norm": 0.038177798611856564, + "language_loss": 0.89866579, + "learning_rate": 0.0009284784266788864, + "loss": 0.90950203, + "num_input_tokens_seen": 84952464, + "router_z_loss_mlp": 0.52441406, + "step": 1025, + "time_per_iteration": 2.7595441341400146 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105402, + "balance_loss_mlp": 1.05275905, + "epoch": 0.19738360908041555, + "flos": 666250262016.0, + "grad_norm": 0.08120700653890094, + "language_loss": 0.93505025, + "learning_rate": 0.0009283177782975512, + "loss": 0.94610423, + "num_input_tokens_seen": 85031488, + "router_z_loss_mlp": 0.52734375, + "step": 1026, + "time_per_iteration": 2.9439735412597656 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01158523, + "balance_loss_mlp": 1.10511732, + "epoch": 0.1975759907656791, + "flos": 523511440896.0, + "grad_norm": 0.05175943009769999, + "language_loss": 0.89213437, + "learning_rate": 0.000928156963628507, + "loss": 0.9037196, + "num_input_tokens_seen": 85098384, + "router_z_loss_mlp": 0.53515625, + "step": 1027, + "time_per_iteration": 2.5648727416992188 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124606, + "balance_loss_mlp": 1.0717721, + "epoch": 0.19776837245094267, + "flos": 463485118464.0, + "grad_norm": 0.0380471847687272, + "language_loss": 0.89530945, + "learning_rate": 0.0009279959827341877, + "loss": 0.90655547, + "num_input_tokens_seen": 85172944, + "router_z_loss_mlp": 0.52929688, + "step": 1028, + "time_per_iteration": 2.7482099533081055 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01114747, + "balance_loss_mlp": 1.0622474, + "epoch": 0.19796075413620623, + "flos": 504058367232.0, + "grad_norm": 0.038077776452832945, + "language_loss": 0.88821751, + "learning_rate": 0.0009278348356770915, + "loss": 0.89936495, + "num_input_tokens_seen": 85241632, + "router_z_loss_mlp": 0.52587891, + "step": 1029, + "time_per_iteration": 2.5559866428375244 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125843, + "balance_loss_mlp": 1.07362974, + "epoch": 0.1981531358214698, + "flos": 508571689728.0, + "grad_norm": 0.03906482091144459, + "language_loss": 0.87010926, + "learning_rate": 0.0009276735225197814, + "loss": 0.88136768, + "num_input_tokens_seen": 85308992, + "router_z_loss_mlp": 0.52294922, + "step": 1030, + "time_per_iteration": 2.598353862762451 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116206, + "balance_loss_mlp": 1.06418335, + "epoch": 0.19834551750673335, + "flos": 532640153088.0, + "grad_norm": 0.039761606091750314, + "language_loss": 0.8715511, + "learning_rate": 0.0009275120433248847, + "loss": 0.88271314, + "num_input_tokens_seen": 85381936, + "router_z_loss_mlp": 0.52099609, + "step": 1031, + "time_per_iteration": 2.691051483154297 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105688, + "balance_loss_mlp": 1.05414224, + "epoch": 0.1985378991919969, + "flos": 776971027200.0, + "grad_norm": 0.03650424605094363, + "language_loss": 0.87217546, + "learning_rate": 0.0009273503981550931, + "loss": 0.88323236, + "num_input_tokens_seen": 85474352, + "router_z_loss_mlp": 0.51611328, + "step": 1032, + "time_per_iteration": 3.05829119682312 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094626, + "balance_loss_mlp": 1.04336572, + "epoch": 0.1987302808772605, + "flos": 435192037632.0, + "grad_norm": 0.04492232470085823, + "language_loss": 0.88675368, + "learning_rate": 0.0009271885870731626, + "loss": 0.89769995, + "num_input_tokens_seen": 85538416, + "router_z_loss_mlp": 0.51318359, + "step": 1033, + "time_per_iteration": 2.5097644329071045 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091384, + "balance_loss_mlp": 1.04036272, + "epoch": 0.19892266256252406, + "flos": 554654633472.0, + "grad_norm": 0.041410721104386976, + "language_loss": 0.89478087, + "learning_rate": 0.0009270266101419143, + "loss": 0.90569472, + "num_input_tokens_seen": 85604416, + "router_z_loss_mlp": 0.51074219, + "step": 1034, + "time_per_iteration": 2.6359710693359375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091336, + "balance_loss_mlp": 1.04026711, + "epoch": 0.19911504424778761, + "flos": 550949100288.0, + "grad_norm": 0.034987230226667505, + "language_loss": 0.86329561, + "learning_rate": 0.0009268644674242328, + "loss": 0.87420899, + "num_input_tokens_seen": 85677008, + "router_z_loss_mlp": 0.51123047, + "step": 1035, + "time_per_iteration": 2.679041624069214 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091574, + "balance_loss_mlp": 1.04045713, + "epoch": 0.19930742593305117, + "flos": 519313068288.0, + "grad_norm": 0.035495194235479824, + "language_loss": 0.81977046, + "learning_rate": 0.0009267021589830678, + "loss": 0.83068615, + "num_input_tokens_seen": 85745200, + "router_z_loss_mlp": 0.51171875, + "step": 1036, + "time_per_iteration": 2.6109251976013184 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01330025, + "balance_loss_mlp": 1.27871704, + "epoch": 0.19949980761831473, + "flos": 1512640717824.0, + "grad_norm": 0.0530000786951376, + "language_loss": 0.77627081, + "learning_rate": 0.0009265396848814328, + "loss": 0.78957105, + "num_input_tokens_seen": 85980608, + "router_z_loss_mlp": 0.51367188, + "step": 1037, + "time_per_iteration": 5.041083097457886 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097572, + "balance_loss_mlp": 1.04635978, + "epoch": 0.1996921893035783, + "flos": 699440634624.0, + "grad_norm": 0.03827221066614039, + "language_loss": 0.93735194, + "learning_rate": 0.000926377045182406, + "loss": 0.94832766, + "num_input_tokens_seen": 86055952, + "router_z_loss_mlp": 0.51269531, + "step": 1038, + "time_per_iteration": 2.921194314956665 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106072, + "balance_loss_mlp": 1.05443072, + "epoch": 0.19988457098884185, + "flos": 728395696128.0, + "grad_norm": 0.0388450926907903, + "language_loss": 0.89164472, + "learning_rate": 0.0009262142399491296, + "loss": 0.90270543, + "num_input_tokens_seen": 86145536, + "router_z_loss_mlp": 0.51708984, + "step": 1039, + "time_per_iteration": 3.0543293952941895 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102477, + "balance_loss_mlp": 1.05093122, + "epoch": 0.2000769526741054, + "flos": 561625350144.0, + "grad_norm": 0.04341407711707897, + "language_loss": 0.8911137, + "learning_rate": 0.0009260512692448105, + "loss": 0.90213847, + "num_input_tokens_seen": 86214480, + "router_z_loss_mlp": 0.51611328, + "step": 1040, + "time_per_iteration": 2.6906111240386963 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097091, + "balance_loss_mlp": 1.04549766, + "epoch": 0.200269334359369, + "flos": 573165769728.0, + "grad_norm": 0.03433464693573298, + "language_loss": 0.85109496, + "learning_rate": 0.000925888133132719, + "loss": 0.86206591, + "num_input_tokens_seen": 86289824, + "router_z_loss_mlp": 0.51660156, + "step": 1041, + "time_per_iteration": 2.77327561378479 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112812, + "balance_loss_mlp": 1.06465149, + "epoch": 0.20046171604463256, + "flos": 1489155500544.0, + "grad_norm": 0.023433110981570023, + "language_loss": 0.79610431, + "learning_rate": 0.0009257248316761906, + "loss": 0.8072325, + "num_input_tokens_seen": 86516384, + "router_z_loss_mlp": 0.48144531, + "step": 1042, + "time_per_iteration": 4.926042318344116 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116989, + "balance_loss_mlp": 1.06525254, + "epoch": 0.20065409772989612, + "flos": 497578544640.0, + "grad_norm": 0.04254485219096875, + "language_loss": 0.82304472, + "learning_rate": 0.0009255613649386244, + "loss": 0.83421457, + "num_input_tokens_seen": 86587296, + "router_z_loss_mlp": 0.51806641, + "step": 1043, + "time_per_iteration": 2.6593456268310547 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0111366, + "balance_loss_mlp": 1.06144655, + "epoch": 0.20084647941515968, + "flos": 580464075264.0, + "grad_norm": 0.040062947145422745, + "language_loss": 0.79980814, + "learning_rate": 0.0009253977329834838, + "loss": 0.81094474, + "num_input_tokens_seen": 86662656, + "router_z_loss_mlp": 0.52294922, + "step": 1044, + "time_per_iteration": 2.765777111053467 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110762, + "balance_loss_mlp": 1.0584054, + "epoch": 0.20103886110042324, + "flos": 643288986624.0, + "grad_norm": 0.040441822708095716, + "language_loss": 0.87291706, + "learning_rate": 0.0009252339358742965, + "loss": 0.88402474, + "num_input_tokens_seen": 86734704, + "router_z_loss_mlp": 0.52441406, + "step": 1045, + "time_per_iteration": 2.825388193130493 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105894, + "balance_loss_mlp": 1.05353701, + "epoch": 0.2012312427856868, + "flos": 442970543616.0, + "grad_norm": 0.03567593499019723, + "language_loss": 0.84250462, + "learning_rate": 0.000925069973674654, + "loss": 0.85356355, + "num_input_tokens_seen": 86806512, + "router_z_loss_mlp": 0.52441406, + "step": 1046, + "time_per_iteration": 2.609393358230591 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103182, + "balance_loss_mlp": 1.05082524, + "epoch": 0.20142362447095036, + "flos": 555473116416.0, + "grad_norm": 0.03147198417726023, + "language_loss": 0.89562172, + "learning_rate": 0.000924905846448212, + "loss": 0.90665352, + "num_input_tokens_seen": 86883440, + "router_z_loss_mlp": 0.52441406, + "step": 1047, + "time_per_iteration": 2.7771337032318115 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108941, + "balance_loss_mlp": 1.0364331, + "epoch": 0.20161600615621392, + "flos": 671555822592.0, + "grad_norm": 0.0352448826174341, + "language_loss": 0.86282432, + "learning_rate": 0.0009247415542586906, + "loss": 0.87371844, + "num_input_tokens_seen": 86960208, + "router_z_loss_mlp": 0.53076172, + "step": 1048, + "time_per_iteration": 2.8992083072662354 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089273, + "balance_loss_mlp": 1.03624833, + "epoch": 0.2018083878414775, + "flos": 574307950848.0, + "grad_norm": 0.02930747529675645, + "language_loss": 0.83574796, + "learning_rate": 0.0009245770971698735, + "loss": 0.84664071, + "num_input_tokens_seen": 87044144, + "router_z_loss_mlp": 0.53125, + "step": 1049, + "time_per_iteration": 2.890824317932129 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092759, + "balance_loss_mlp": 1.03992498, + "epoch": 0.20200076952674106, + "flos": 426795292416.0, + "grad_norm": 0.03785140598382088, + "language_loss": 0.89288604, + "learning_rate": 0.0009244124752456087, + "loss": 0.9038136, + "num_input_tokens_seen": 87109136, + "router_z_loss_mlp": 0.52929688, + "step": 1050, + "time_per_iteration": 2.5022785663604736 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078262, + "balance_loss_mlp": 1.02566695, + "epoch": 0.20219315121200462, + "flos": 537685198848.0, + "grad_norm": 0.03140637951028952, + "language_loss": 0.86254251, + "learning_rate": 0.0009242476885498081, + "loss": 0.87332511, + "num_input_tokens_seen": 87184320, + "router_z_loss_mlp": 0.52685547, + "step": 1051, + "time_per_iteration": 2.732915163040161 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080259, + "balance_loss_mlp": 1.02771127, + "epoch": 0.20238553289726818, + "flos": 478835083776.0, + "grad_norm": 0.042472274730814934, + "language_loss": 0.82148528, + "learning_rate": 0.0009240827371464474, + "loss": 0.83228779, + "num_input_tokens_seen": 87248224, + "router_z_loss_mlp": 0.52636719, + "step": 1052, + "time_per_iteration": 2.577660322189331 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076456, + "balance_loss_mlp": 1.02448094, + "epoch": 0.20257791458253174, + "flos": 1153847596800.0, + "grad_norm": 0.038862673250338535, + "language_loss": 0.85609984, + "learning_rate": 0.0009239176210995666, + "loss": 0.86686444, + "num_input_tokens_seen": 87333088, + "router_z_loss_mlp": 0.52050781, + "step": 1053, + "time_per_iteration": 3.517408609390259 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076589, + "balance_loss_mlp": 1.02485228, + "epoch": 0.2027702962677953, + "flos": 668149688064.0, + "grad_norm": 0.03591644261584591, + "language_loss": 0.94691521, + "learning_rate": 0.0009237523404732695, + "loss": 0.95768112, + "num_input_tokens_seen": 87413840, + "router_z_loss_mlp": 0.51806641, + "step": 1054, + "time_per_iteration": 2.9073944091796875 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010707, + "balance_loss_mlp": 1.01934481, + "epoch": 0.20296267795305886, + "flos": 642453007104.0, + "grad_norm": 0.03829830750428097, + "language_loss": 0.85043323, + "learning_rate": 0.0009235868953317235, + "loss": 0.86114025, + "num_input_tokens_seen": 87487168, + "router_z_loss_mlp": 0.51416016, + "step": 1055, + "time_per_iteration": 2.8769731521606445 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063995, + "balance_loss_mlp": 1.01249659, + "epoch": 0.20315505963832242, + "flos": 932130967296.0, + "grad_norm": 0.03371739794492534, + "language_loss": 0.86243355, + "learning_rate": 0.0009234212857391602, + "loss": 0.87307346, + "num_input_tokens_seen": 87573184, + "router_z_loss_mlp": 0.515625, + "step": 1056, + "time_per_iteration": 3.1701345443725586 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062013, + "balance_loss_mlp": 1.01075327, + "epoch": 0.20334744132358598, + "flos": 563288560896.0, + "grad_norm": 0.028023058598955305, + "language_loss": 0.9034453, + "learning_rate": 0.000923255511759875, + "loss": 0.91406548, + "num_input_tokens_seen": 87651968, + "router_z_loss_mlp": 0.51318359, + "step": 1057, + "time_per_iteration": 2.8186585903167725 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105938, + "balance_loss_mlp": 1.00840592, + "epoch": 0.20353982300884957, + "flos": 645429485568.0, + "grad_norm": 0.03599363132321351, + "language_loss": 0.85699975, + "learning_rate": 0.000923089573458227, + "loss": 0.86759359, + "num_input_tokens_seen": 87727792, + "router_z_loss_mlp": 0.51025391, + "step": 1058, + "time_per_iteration": 2.829428195953369 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063312, + "balance_loss_mlp": 1.01248097, + "epoch": 0.20373220469411313, + "flos": 652706403840.0, + "grad_norm": 0.03721325608628497, + "language_loss": 0.84890962, + "learning_rate": 0.0009229234708986392, + "loss": 0.85954273, + "num_input_tokens_seen": 87806048, + "router_z_loss_mlp": 0.50878906, + "step": 1059, + "time_per_iteration": 2.9125583171844482 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01119614, + "balance_loss_mlp": 1.06964111, + "epoch": 0.2039245863793767, + "flos": 1440399367680.0, + "grad_norm": 0.026200157549973457, + "language_loss": 0.81666899, + "learning_rate": 0.0009227572041455982, + "loss": 0.82786512, + "num_input_tokens_seen": 88018160, + "router_z_loss_mlp": 0.49902344, + "step": 1060, + "time_per_iteration": 4.70502233505249 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105542, + "balance_loss_mlp": 1.00468493, + "epoch": 0.20411696806464025, + "flos": 598128538368.0, + "grad_norm": 0.03644056871626998, + "language_loss": 0.85909504, + "learning_rate": 0.0009225907732636548, + "loss": 0.86964923, + "num_input_tokens_seen": 88090864, + "router_z_loss_mlp": 0.5078125, + "step": 1061, + "time_per_iteration": 2.7681198120117188 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01057413, + "balance_loss_mlp": 1.00672543, + "epoch": 0.2043093497499038, + "flos": 574897999872.0, + "grad_norm": 0.03243635340085092, + "language_loss": 0.87862682, + "learning_rate": 0.0009224241783174227, + "loss": 0.88920105, + "num_input_tokens_seen": 88161360, + "router_z_loss_mlp": 0.50732422, + "step": 1062, + "time_per_iteration": 2.682659864425659 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01058744, + "balance_loss_mlp": 1.00819898, + "epoch": 0.20450173143516737, + "flos": 631524990720.0, + "grad_norm": 0.033151959510572516, + "language_loss": 0.86810422, + "learning_rate": 0.0009222574193715802, + "loss": 0.87869167, + "num_input_tokens_seen": 88234960, + "router_z_loss_mlp": 0.50585938, + "step": 1063, + "time_per_iteration": 2.7470076084136963 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01057209, + "balance_loss_mlp": 1.00656855, + "epoch": 0.20469411312043093, + "flos": 575147821056.0, + "grad_norm": 0.03442752078644266, + "language_loss": 0.86910367, + "learning_rate": 0.000922090496490869, + "loss": 0.87967575, + "num_input_tokens_seen": 88308176, + "router_z_loss_mlp": 0.50683594, + "step": 1064, + "time_per_iteration": 2.789161443710327 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055147, + "balance_loss_mlp": 1.00465047, + "epoch": 0.20488649480569449, + "flos": 638280879360.0, + "grad_norm": 0.029149473365885022, + "language_loss": 0.90671569, + "learning_rate": 0.0009219234097400937, + "loss": 0.91726714, + "num_input_tokens_seen": 88386768, + "router_z_loss_mlp": 0.50537109, + "step": 1065, + "time_per_iteration": 2.8469130992889404 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055191, + "balance_loss_mlp": 1.00483656, + "epoch": 0.20507887649095807, + "flos": 977439169536.0, + "grad_norm": 0.03225683406068631, + "language_loss": 0.83590472, + "learning_rate": 0.0009217561591841237, + "loss": 0.84645659, + "num_input_tokens_seen": 88476576, + "router_z_loss_mlp": 0.50390625, + "step": 1066, + "time_per_iteration": 3.331498622894287 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105396, + "balance_loss_mlp": 1.00332034, + "epoch": 0.20527125817622163, + "flos": 487156006656.0, + "grad_norm": 0.037421781664849635, + "language_loss": 0.81758374, + "learning_rate": 0.0009215887448878913, + "loss": 0.82812333, + "num_input_tokens_seen": 88541968, + "router_z_loss_mlp": 0.50683594, + "step": 1067, + "time_per_iteration": 2.5782346725463867 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054496, + "balance_loss_mlp": 1.00414193, + "epoch": 0.2054636398614852, + "flos": 528211401216.0, + "grad_norm": 0.031680985043262715, + "language_loss": 0.86063826, + "learning_rate": 0.0009214211669163922, + "loss": 0.87118322, + "num_input_tokens_seen": 88615296, + "router_z_loss_mlp": 0.50390625, + "step": 1068, + "time_per_iteration": 2.689772129058838 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054583, + "balance_loss_mlp": 1.00403798, + "epoch": 0.20565602154674875, + "flos": 559324458240.0, + "grad_norm": 0.03119808154519671, + "language_loss": 0.94868428, + "learning_rate": 0.0009212534253346862, + "loss": 0.95923012, + "num_input_tokens_seen": 88691584, + "router_z_loss_mlp": 0.50585938, + "step": 1069, + "time_per_iteration": 2.760840654373169 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060819, + "balance_loss_mlp": 1.01027393, + "epoch": 0.2058484032320123, + "flos": 505221935616.0, + "grad_norm": 0.042999288209875815, + "language_loss": 0.85068119, + "learning_rate": 0.0009210855202078964, + "loss": 0.86128938, + "num_input_tokens_seen": 88756592, + "router_z_loss_mlp": 0.50585938, + "step": 1070, + "time_per_iteration": 2.6273016929626465 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01057609, + "balance_loss_mlp": 1.00687337, + "epoch": 0.20604078491727587, + "flos": 434047911168.0, + "grad_norm": 0.03672139626538296, + "language_loss": 0.88035965, + "learning_rate": 0.0009209174516012091, + "loss": 0.89093566, + "num_input_tokens_seen": 88820928, + "router_z_loss_mlp": 0.5078125, + "step": 1071, + "time_per_iteration": 2.5263099670410156 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055827, + "balance_loss_mlp": 1.0049957, + "epoch": 0.20623316660253943, + "flos": 609875037696.0, + "grad_norm": 0.03118890610347894, + "language_loss": 0.89938867, + "learning_rate": 0.0009207492195798747, + "loss": 0.90994692, + "num_input_tokens_seen": 88895440, + "router_z_loss_mlp": 0.50878906, + "step": 1072, + "time_per_iteration": 2.773094654083252 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059091, + "balance_loss_mlp": 1.00816524, + "epoch": 0.206425548287803, + "flos": 481394545152.0, + "grad_norm": 0.034846135669383375, + "language_loss": 0.85408926, + "learning_rate": 0.0009205808242092061, + "loss": 0.86468017, + "num_input_tokens_seen": 88964400, + "router_z_loss_mlp": 0.50976562, + "step": 1073, + "time_per_iteration": 2.6704161167144775 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061083, + "balance_loss_mlp": 1.01025188, + "epoch": 0.20661792997306658, + "flos": 951124249344.0, + "grad_norm": 0.036438983488896924, + "language_loss": 0.83303434, + "learning_rate": 0.0009204122655545808, + "loss": 0.84364516, + "num_input_tokens_seen": 89049600, + "router_z_loss_mlp": 0.50878906, + "step": 1074, + "time_per_iteration": 3.3605480194091797 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059315, + "balance_loss_mlp": 1.00857949, + "epoch": 0.20681031165833014, + "flos": 604617109248.0, + "grad_norm": 0.03238632395719984, + "language_loss": 0.81744164, + "learning_rate": 0.0009202435436814388, + "loss": 0.82803476, + "num_input_tokens_seen": 89119024, + "router_z_loss_mlp": 0.5078125, + "step": 1075, + "time_per_iteration": 2.6966288089752197 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106261, + "balance_loss_mlp": 1.01163661, + "epoch": 0.2070026933435937, + "flos": 710266583808.0, + "grad_norm": 0.03297439165012413, + "language_loss": 0.90137285, + "learning_rate": 0.0009200746586552836, + "loss": 0.91199899, + "num_input_tokens_seen": 89197344, + "router_z_loss_mlp": 0.51025391, + "step": 1076, + "time_per_iteration": 2.919851779937744 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01057537, + "balance_loss_mlp": 1.00675428, + "epoch": 0.20719507502885726, + "flos": 831255330048.0, + "grad_norm": 0.031928056401627374, + "language_loss": 0.84964621, + "learning_rate": 0.0009199056105416825, + "loss": 0.86022151, + "num_input_tokens_seen": 89280464, + "router_z_loss_mlp": 0.50830078, + "step": 1077, + "time_per_iteration": 3.0944886207580566 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059646, + "balance_loss_mlp": 1.00881469, + "epoch": 0.20738745671412082, + "flos": 639500828160.0, + "grad_norm": 0.033227407694906064, + "language_loss": 0.87196565, + "learning_rate": 0.0009197363994062654, + "loss": 0.88256204, + "num_input_tokens_seen": 89353344, + "router_z_loss_mlp": 0.50878906, + "step": 1078, + "time_per_iteration": 2.8505265712738037 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059879, + "balance_loss_mlp": 1.00933433, + "epoch": 0.20757983839938438, + "flos": 686984522496.0, + "grad_norm": 0.03258152966614613, + "language_loss": 0.84972161, + "learning_rate": 0.0009195670253147262, + "loss": 0.86032039, + "num_input_tokens_seen": 89439328, + "router_z_loss_mlp": 0.50585938, + "step": 1079, + "time_per_iteration": 3.0077526569366455 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064209, + "balance_loss_mlp": 1.01375961, + "epoch": 0.20777222008464794, + "flos": 520318189056.0, + "grad_norm": 0.03575722766779635, + "language_loss": 0.83075011, + "learning_rate": 0.0009193974883328216, + "loss": 0.84139216, + "num_input_tokens_seen": 89510160, + "router_z_loss_mlp": 0.50488281, + "step": 1080, + "time_per_iteration": 2.6277496814727783 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062434, + "balance_loss_mlp": 1.01212776, + "epoch": 0.2079646017699115, + "flos": 512470663680.0, + "grad_norm": 0.03316952161345372, + "language_loss": 0.87936002, + "learning_rate": 0.0009192277885263718, + "loss": 0.88998437, + "num_input_tokens_seen": 89582960, + "router_z_loss_mlp": 0.50341797, + "step": 1081, + "time_per_iteration": 2.6486003398895264 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01056126, + "balance_loss_mlp": 1.00596321, + "epoch": 0.20815698345517505, + "flos": 933468534528.0, + "grad_norm": 0.031694408237267754, + "language_loss": 0.87043977, + "learning_rate": 0.0009190579259612602, + "loss": 0.881001, + "num_input_tokens_seen": 89675488, + "router_z_loss_mlp": 0.50195312, + "step": 1082, + "time_per_iteration": 3.280133008956909 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062428, + "balance_loss_mlp": 1.01202655, + "epoch": 0.20834936514043864, + "flos": 633554674176.0, + "grad_norm": 0.03367407497844021, + "language_loss": 0.87446159, + "learning_rate": 0.000918887900703433, + "loss": 0.88508588, + "num_input_tokens_seen": 89747872, + "router_z_loss_mlp": 0.50439453, + "step": 1083, + "time_per_iteration": 2.7914657592773438 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060649, + "balance_loss_mlp": 1.01024699, + "epoch": 0.2085417468257022, + "flos": 395243831040.0, + "grad_norm": 0.03354838448754016, + "language_loss": 0.91036344, + "learning_rate": 0.0009187177128188999, + "loss": 0.92096996, + "num_input_tokens_seen": 89810176, + "router_z_loss_mlp": 0.50439453, + "step": 1084, + "time_per_iteration": 2.4803311824798584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107357, + "balance_loss_mlp": 1.02455139, + "epoch": 0.20873412851096576, + "flos": 1405197775104.0, + "grad_norm": 0.012085868941934568, + "language_loss": 0.77156538, + "learning_rate": 0.0009185473623737339, + "loss": 0.78230107, + "num_input_tokens_seen": 90038432, + "router_z_loss_mlp": 0.48925781, + "step": 1085, + "time_per_iteration": 4.883121728897095 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055927, + "balance_loss_mlp": 1.00562024, + "epoch": 0.20892651019622932, + "flos": 448762140672.0, + "grad_norm": 0.03493036575467998, + "language_loss": 0.8691588, + "learning_rate": 0.000918376849434071, + "loss": 0.87971807, + "num_input_tokens_seen": 90101568, + "router_z_loss_mlp": 0.50317383, + "step": 1086, + "time_per_iteration": 2.537820816040039 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065129, + "balance_loss_mlp": 1.01444149, + "epoch": 0.20911889188149288, + "flos": 494081036544.0, + "grad_norm": 0.040745363066357655, + "language_loss": 0.91673005, + "learning_rate": 0.0009182061740661098, + "loss": 0.9273814, + "num_input_tokens_seen": 90169344, + "router_z_loss_mlp": 0.50732422, + "step": 1087, + "time_per_iteration": 2.5920886993408203 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01056749, + "balance_loss_mlp": 1.00615633, + "epoch": 0.20931127356675644, + "flos": 842750062848.0, + "grad_norm": 0.02822254108426211, + "language_loss": 0.85810733, + "learning_rate": 0.0009180353363361127, + "loss": 0.86867487, + "num_input_tokens_seen": 90252416, + "router_z_loss_mlp": 0.50634766, + "step": 1088, + "time_per_iteration": 3.1376798152923584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060338, + "balance_loss_mlp": 1.00979316, + "epoch": 0.20950365525202, + "flos": 758525019648.0, + "grad_norm": 0.03922038165748564, + "language_loss": 0.83160806, + "learning_rate": 0.0009178643363104044, + "loss": 0.84221143, + "num_input_tokens_seen": 90337952, + "router_z_loss_mlp": 0.50585938, + "step": 1089, + "time_per_iteration": 3.124352216720581 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059844, + "balance_loss_mlp": 1.00939417, + "epoch": 0.20969603693728356, + "flos": 473492584704.0, + "grad_norm": 0.04272734591158297, + "language_loss": 0.920385, + "learning_rate": 0.0009176931740553735, + "loss": 0.93098342, + "num_input_tokens_seen": 90401488, + "router_z_loss_mlp": 0.50488281, + "step": 1090, + "time_per_iteration": 2.556528091430664 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067623, + "balance_loss_mlp": 1.01731646, + "epoch": 0.20988841862254715, + "flos": 978628982784.0, + "grad_norm": 0.03590255199570226, + "language_loss": 0.83530974, + "learning_rate": 0.0009175218496374708, + "loss": 0.84598601, + "num_input_tokens_seen": 90486144, + "router_z_loss_mlp": 0.50341797, + "step": 1091, + "time_per_iteration": 3.328984260559082 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059931, + "balance_loss_mlp": 1.00976801, + "epoch": 0.2100808003078107, + "flos": 1094819592192.0, + "grad_norm": 0.03766723451938342, + "language_loss": 0.86626744, + "learning_rate": 0.0009173503631232103, + "loss": 0.87686676, + "num_input_tokens_seen": 90571504, + "router_z_loss_mlp": 0.50170898, + "step": 1092, + "time_per_iteration": 3.4216480255126953 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01058636, + "balance_loss_mlp": 1.00832939, + "epoch": 0.21027318199307427, + "flos": 1014560596992.0, + "grad_norm": 0.047058286401960234, + "language_loss": 0.82703817, + "learning_rate": 0.0009171787145791691, + "loss": 0.83762449, + "num_input_tokens_seen": 90646016, + "router_z_loss_mlp": 0.50341797, + "step": 1093, + "time_per_iteration": 3.2454655170440674 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059608, + "balance_loss_mlp": 1.00911129, + "epoch": 0.21046556367833782, + "flos": 522413001216.0, + "grad_norm": 0.043211200123957835, + "language_loss": 0.80955076, + "learning_rate": 0.000917006904071987, + "loss": 0.8201468, + "num_input_tokens_seen": 90713440, + "router_z_loss_mlp": 0.50537109, + "step": 1094, + "time_per_iteration": 2.6560592651367188 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061033, + "balance_loss_mlp": 1.01053584, + "epoch": 0.21065794536360138, + "flos": 604840685568.0, + "grad_norm": 0.03488627405352903, + "language_loss": 0.87964189, + "learning_rate": 0.0009168349316683669, + "loss": 0.89025223, + "num_input_tokens_seen": 90788208, + "router_z_loss_mlp": 0.50537109, + "step": 1095, + "time_per_iteration": 2.794358253479004 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106293, + "balance_loss_mlp": 1.01243329, + "epoch": 0.21085032704886494, + "flos": 604558783488.0, + "grad_norm": 0.031199931973452354, + "language_loss": 0.82918072, + "learning_rate": 0.0009166627974350741, + "loss": 0.83981001, + "num_input_tokens_seen": 90873776, + "router_z_loss_mlp": 0.50537109, + "step": 1096, + "time_per_iteration": 2.89837384223938 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062823, + "balance_loss_mlp": 1.01242077, + "epoch": 0.2110427087341285, + "flos": 638832044544.0, + "grad_norm": 0.03623978918327459, + "language_loss": 0.90394479, + "learning_rate": 0.0009164905014389373, + "loss": 0.91457301, + "num_input_tokens_seen": 90945872, + "router_z_loss_mlp": 0.50439453, + "step": 1097, + "time_per_iteration": 2.79203462600708 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055619, + "balance_loss_mlp": 1.00559878, + "epoch": 0.21123509041939206, + "flos": 523930403328.0, + "grad_norm": 0.03351990521185014, + "language_loss": 0.87381279, + "learning_rate": 0.0009163180437468476, + "loss": 0.88436902, + "num_input_tokens_seen": 91016224, + "router_z_loss_mlp": 0.50024414, + "step": 1098, + "time_per_iteration": 2.6110002994537354 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01056208, + "balance_loss_mlp": 1.00647402, + "epoch": 0.21142747210465565, + "flos": 452194520064.0, + "grad_norm": 0.03619268995909484, + "language_loss": 0.86631316, + "learning_rate": 0.000916145424425759, + "loss": 0.87687522, + "num_input_tokens_seen": 91086752, + "router_z_loss_mlp": 0.49658203, + "step": 1099, + "time_per_iteration": 2.67106294631958 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060107, + "balance_loss_mlp": 1.01027727, + "epoch": 0.2116198537899192, + "flos": 877626978816.0, + "grad_norm": 0.042483916895571405, + "language_loss": 0.91832745, + "learning_rate": 0.0009159726435426885, + "loss": 0.92892849, + "num_input_tokens_seen": 91162960, + "router_z_loss_mlp": 0.49780273, + "step": 1100, + "time_per_iteration": 3.095250129699707 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052771, + "balance_loss_mlp": 1.00275087, + "epoch": 0.21181223547518277, + "flos": 524675009280.0, + "grad_norm": 0.035590136232614346, + "language_loss": 0.91126454, + "learning_rate": 0.0009157997011647154, + "loss": 0.92179227, + "num_input_tokens_seen": 91229840, + "router_z_loss_mlp": 0.49926758, + "step": 1101, + "time_per_iteration": 2.61954665184021 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01056268, + "balance_loss_mlp": 1.00615227, + "epoch": 0.21200461716044633, + "flos": 573426284544.0, + "grad_norm": 0.03167271765745466, + "language_loss": 0.86759949, + "learning_rate": 0.0009156265973589817, + "loss": 0.87816215, + "num_input_tokens_seen": 91307936, + "router_z_loss_mlp": 0.50146484, + "step": 1102, + "time_per_iteration": 2.7851946353912354 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053549, + "balance_loss_mlp": 1.00348067, + "epoch": 0.2121969988457099, + "flos": 546175262976.0, + "grad_norm": 0.033324702660241096, + "language_loss": 0.90598941, + "learning_rate": 0.0009154533321926926, + "loss": 0.91652489, + "num_input_tokens_seen": 91372848, + "router_z_loss_mlp": 0.50073242, + "step": 1103, + "time_per_iteration": 2.658358573913574 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01056655, + "balance_loss_mlp": 1.00663483, + "epoch": 0.21238938053097345, + "flos": 845355211008.0, + "grad_norm": 0.03290940631262569, + "language_loss": 0.88234645, + "learning_rate": 0.0009152799057331156, + "loss": 0.89291298, + "num_input_tokens_seen": 91452768, + "router_z_loss_mlp": 0.50024414, + "step": 1104, + "time_per_iteration": 3.1174561977386475 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01056205, + "balance_loss_mlp": 1.00623202, + "epoch": 0.212581762216237, + "flos": 447142671360.0, + "grad_norm": 0.035279899791186564, + "language_loss": 0.91767001, + "learning_rate": 0.0009151063180475805, + "loss": 0.92823207, + "num_input_tokens_seen": 91519888, + "router_z_loss_mlp": 0.5, + "step": 1105, + "time_per_iteration": 2.538922071456909 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054823, + "balance_loss_mlp": 1.00489795, + "epoch": 0.21277414390150057, + "flos": 515385904128.0, + "grad_norm": 0.03737857831356842, + "language_loss": 0.85410213, + "learning_rate": 0.0009149325692034803, + "loss": 0.86465037, + "num_input_tokens_seen": 91585744, + "router_z_loss_mlp": 0.49853516, + "step": 1106, + "time_per_iteration": 2.588087558746338 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055756, + "balance_loss_mlp": 1.00788116, + "epoch": 0.21296652558676413, + "flos": 1488514907136.0, + "grad_norm": 0.005769411809131762, + "language_loss": 0.79203427, + "learning_rate": 0.0009147586592682702, + "loss": 0.80259192, + "num_input_tokens_seen": 91805840, + "router_z_loss_mlp": 0.47851562, + "step": 1107, + "time_per_iteration": 4.901995658874512 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055609, + "balance_loss_mlp": 1.00596976, + "epoch": 0.21315890727202771, + "flos": 847451968512.0, + "grad_norm": 0.03679321288402367, + "language_loss": 0.87994891, + "learning_rate": 0.0009145845883094678, + "loss": 0.89050496, + "num_input_tokens_seen": 91885936, + "router_z_loss_mlp": 0.49584961, + "step": 1108, + "time_per_iteration": 3.034179925918579 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01057835, + "balance_loss_mlp": 1.00833917, + "epoch": 0.21335128895729127, + "flos": 630556808448.0, + "grad_norm": 0.040833312538100186, + "language_loss": 0.86006308, + "learning_rate": 0.000914410356394654, + "loss": 0.87064135, + "num_input_tokens_seen": 91959888, + "router_z_loss_mlp": 0.49438477, + "step": 1109, + "time_per_iteration": 2.793839931488037 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01058379, + "balance_loss_mlp": 1.00878823, + "epoch": 0.21354367064255483, + "flos": 712285573632.0, + "grad_norm": 0.029526159769499145, + "language_loss": 0.85111213, + "learning_rate": 0.0009142359635914709, + "loss": 0.86169595, + "num_input_tokens_seen": 92043728, + "router_z_loss_mlp": 0.49560547, + "step": 1110, + "time_per_iteration": 3.0403430461883545 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063298, + "balance_loss_mlp": 1.01375508, + "epoch": 0.2137360523278184, + "flos": 457211375616.0, + "grad_norm": 0.03547311640481051, + "language_loss": 0.85051197, + "learning_rate": 0.0009140614099676245, + "loss": 0.8611449, + "num_input_tokens_seen": 92114096, + "router_z_loss_mlp": 0.49414062, + "step": 1111, + "time_per_iteration": 2.6027371883392334 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054693, + "balance_loss_mlp": 1.00495887, + "epoch": 0.21392843401308195, + "flos": 667266076416.0, + "grad_norm": 0.03139007596896344, + "language_loss": 0.8342849, + "learning_rate": 0.0009138866955908821, + "loss": 0.84483182, + "num_input_tokens_seen": 92193552, + "router_z_loss_mlp": 0.49658203, + "step": 1112, + "time_per_iteration": 2.924180269241333 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055191, + "balance_loss_mlp": 1.00517082, + "epoch": 0.2141208156983455, + "flos": 750362544384.0, + "grad_norm": 0.03405304612319473, + "language_loss": 0.81477892, + "learning_rate": 0.0009137118205290738, + "loss": 0.82533085, + "num_input_tokens_seen": 92279248, + "router_z_loss_mlp": 0.49951172, + "step": 1113, + "time_per_iteration": 2.956289768218994 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01057131, + "balance_loss_mlp": 1.00711048, + "epoch": 0.21431319738360907, + "flos": 420011213568.0, + "grad_norm": 0.037812047895131755, + "language_loss": 0.90930229, + "learning_rate": 0.0009135367848500924, + "loss": 0.9198736, + "num_input_tokens_seen": 92344064, + "router_z_loss_mlp": 0.49975586, + "step": 1114, + "time_per_iteration": 2.5228912830352783 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106081, + "balance_loss_mlp": 1.01079023, + "epoch": 0.21450557906887263, + "flos": 610239565056.0, + "grad_norm": 0.04455846969282107, + "language_loss": 0.87261575, + "learning_rate": 0.0009133615886218927, + "loss": 0.88322389, + "num_input_tokens_seen": 92410544, + "router_z_loss_mlp": 0.5, + "step": 1115, + "time_per_iteration": 2.7146785259246826 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105379, + "balance_loss_mlp": 1.00367427, + "epoch": 0.21469796075413622, + "flos": 562975556352.0, + "grad_norm": 0.04025415931658291, + "language_loss": 0.88754129, + "learning_rate": 0.0009131862319124917, + "loss": 0.89807916, + "num_input_tokens_seen": 92480272, + "router_z_loss_mlp": 0.50097656, + "step": 1116, + "time_per_iteration": 2.702315092086792 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01058084, + "balance_loss_mlp": 1.0081588, + "epoch": 0.21489034243939978, + "flos": 595738218240.0, + "grad_norm": 0.036347556106983744, + "language_loss": 0.84819156, + "learning_rate": 0.0009130107147899691, + "loss": 0.8587724, + "num_input_tokens_seen": 92555584, + "router_z_loss_mlp": 0.49902344, + "step": 1117, + "time_per_iteration": 2.705153226852417 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055436, + "balance_loss_mlp": 1.00555849, + "epoch": 0.21508272412466334, + "flos": 442850979840.0, + "grad_norm": 0.032390780355026266, + "language_loss": 0.85796201, + "learning_rate": 0.0009128350373224665, + "loss": 0.86851633, + "num_input_tokens_seen": 92623136, + "router_z_loss_mlp": 0.49804688, + "step": 1118, + "time_per_iteration": 2.5689737796783447 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055817, + "balance_loss_mlp": 1.00775146, + "epoch": 0.2152751058099269, + "flos": 1499234898432.0, + "grad_norm": 0.005802610423144338, + "language_loss": 0.81456429, + "learning_rate": 0.0009126591995781883, + "loss": 0.82512248, + "num_input_tokens_seen": 92842608, + "router_z_loss_mlp": 0.48046875, + "step": 1119, + "time_per_iteration": 4.659603834152222 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054629, + "balance_loss_mlp": 1.00475144, + "epoch": 0.21546748749519046, + "flos": 494992838400.0, + "grad_norm": 0.03550503890551413, + "language_loss": 0.86117166, + "learning_rate": 0.0009124832016254005, + "loss": 0.87171793, + "num_input_tokens_seen": 92912960, + "router_z_loss_mlp": 0.4987793, + "step": 1120, + "time_per_iteration": 2.6080243587493896 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054572, + "balance_loss_mlp": 1.00450444, + "epoch": 0.21565986918045402, + "flos": 635695173120.0, + "grad_norm": 0.03761657282592244, + "language_loss": 0.88987935, + "learning_rate": 0.0009123070435324316, + "loss": 0.90042508, + "num_input_tokens_seen": 92982272, + "router_z_loss_mlp": 0.50097656, + "step": 1121, + "time_per_iteration": 2.8451340198516846 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062664, + "balance_loss_mlp": 1.01450348, + "epoch": 0.21585225086571758, + "flos": 1586801914368.0, + "grad_norm": 0.011675507285583616, + "language_loss": 0.77875781, + "learning_rate": 0.0009121307253676722, + "loss": 0.78938448, + "num_input_tokens_seen": 93218752, + "router_z_loss_mlp": 0.48144531, + "step": 1122, + "time_per_iteration": 5.018117666244507 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055198, + "balance_loss_mlp": 1.00541639, + "epoch": 0.21604463255098114, + "flos": 685323257088.0, + "grad_norm": 0.03443856201457266, + "language_loss": 0.87021005, + "learning_rate": 0.0009119542471995752, + "loss": 0.8807621, + "num_input_tokens_seen": 93293968, + "router_z_loss_mlp": 0.49682617, + "step": 1123, + "time_per_iteration": 2.8631908893585205 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01057142, + "balance_loss_mlp": 1.00755107, + "epoch": 0.2162370142362447, + "flos": 782308668672.0, + "grad_norm": 0.034966150945184314, + "language_loss": 0.82536203, + "learning_rate": 0.0009117776090966554, + "loss": 0.83593345, + "num_input_tokens_seen": 93367088, + "router_z_loss_mlp": 0.49511719, + "step": 1124, + "time_per_iteration": 2.9458060264587402 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01058319, + "balance_loss_mlp": 1.00877571, + "epoch": 0.21642939592150828, + "flos": 1003762838016.0, + "grad_norm": 0.03795033166932298, + "language_loss": 0.87775326, + "learning_rate": 0.0009116008111274899, + "loss": 0.88833648, + "num_input_tokens_seen": 93452944, + "router_z_loss_mlp": 0.49511719, + "step": 1125, + "time_per_iteration": 3.2748866081237793 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053055, + "balance_loss_mlp": 1.00556183, + "epoch": 0.21662177760677184, + "flos": 1485764917248.0, + "grad_norm": 0.008195913283110022, + "language_loss": 0.79106927, + "learning_rate": 0.0009114238533607176, + "loss": 0.8015998, + "num_input_tokens_seen": 93677328, + "router_z_loss_mlp": 0.47460938, + "step": 1126, + "time_per_iteration": 4.803825616836548 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105355, + "balance_loss_mlp": 1.00391161, + "epoch": 0.2168141592920354, + "flos": 888861196800.0, + "grad_norm": 0.03626284425770287, + "language_loss": 0.85553163, + "learning_rate": 0.0009112467358650396, + "loss": 0.86606717, + "num_input_tokens_seen": 93756848, + "router_z_loss_mlp": 0.49609375, + "step": 1127, + "time_per_iteration": 3.155856132507324 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01057313, + "balance_loss_mlp": 1.00753081, + "epoch": 0.21700654097729896, + "flos": 547085119488.0, + "grad_norm": 0.03272511127748384, + "language_loss": 0.87140059, + "learning_rate": 0.0009110694587092192, + "loss": 0.88197374, + "num_input_tokens_seen": 93834704, + "router_z_loss_mlp": 0.49682617, + "step": 1128, + "time_per_iteration": 2.7438507080078125 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01057646, + "balance_loss_mlp": 1.00781655, + "epoch": 0.21719892266256252, + "flos": 510536244480.0, + "grad_norm": 0.0385378102776186, + "language_loss": 0.81826651, + "learning_rate": 0.0009108920219620815, + "loss": 0.82884294, + "num_input_tokens_seen": 93904448, + "router_z_loss_mlp": 0.49829102, + "step": 1129, + "time_per_iteration": 2.6256754398345947 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105661, + "balance_loss_mlp": 1.00682795, + "epoch": 0.21739130434782608, + "flos": 544462474752.0, + "grad_norm": 0.03288593298355655, + "language_loss": 0.9021399, + "learning_rate": 0.0009107144256925133, + "loss": 0.91270602, + "num_input_tokens_seen": 93979312, + "router_z_loss_mlp": 0.49707031, + "step": 1130, + "time_per_iteration": 2.665764808654785 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055495, + "balance_loss_mlp": 1.00566518, + "epoch": 0.21758368603308964, + "flos": 617983077888.0, + "grad_norm": 0.04004849400109536, + "language_loss": 0.83221352, + "learning_rate": 0.0009105366699694638, + "loss": 0.84276843, + "num_input_tokens_seen": 94052032, + "router_z_loss_mlp": 0.49755859, + "step": 1131, + "time_per_iteration": 2.7092785835266113 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055334, + "balance_loss_mlp": 1.0055995, + "epoch": 0.2177760677183532, + "flos": 636335766528.0, + "grad_norm": 0.03327692114185805, + "language_loss": 0.82139939, + "learning_rate": 0.0009103587548619439, + "loss": 0.83195269, + "num_input_tokens_seen": 94124944, + "router_z_loss_mlp": 0.49658203, + "step": 1132, + "time_per_iteration": 2.833617925643921 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055546, + "balance_loss_mlp": 1.00585985, + "epoch": 0.2179684494036168, + "flos": 533597641728.0, + "grad_norm": 0.036557340203022134, + "language_loss": 0.8721149, + "learning_rate": 0.0009101806804390261, + "loss": 0.8826704, + "num_input_tokens_seen": 94200384, + "router_z_loss_mlp": 0.49609375, + "step": 1133, + "time_per_iteration": 2.7880306243896484 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054716, + "balance_loss_mlp": 1.0050298, + "epoch": 0.21816083108888035, + "flos": 476182303488.0, + "grad_norm": 0.03701280834454915, + "language_loss": 0.917292, + "learning_rate": 0.0009100024467698453, + "loss": 0.92783916, + "num_input_tokens_seen": 94266992, + "router_z_loss_mlp": 0.49560547, + "step": 1134, + "time_per_iteration": 2.592986822128296 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054821, + "balance_loss_mlp": 1.00513422, + "epoch": 0.2183532127741439, + "flos": 578547152640.0, + "grad_norm": 0.04183992577645213, + "language_loss": 0.83309305, + "learning_rate": 0.0009098240539235981, + "loss": 0.84364122, + "num_input_tokens_seen": 94334304, + "router_z_loss_mlp": 0.49658203, + "step": 1135, + "time_per_iteration": 2.693387269973755 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055318, + "balance_loss_mlp": 1.00558341, + "epoch": 0.21854559445940747, + "flos": 595280371968.0, + "grad_norm": 0.03379290176549673, + "language_loss": 0.88387418, + "learning_rate": 0.0009096455019695423, + "loss": 0.89442736, + "num_input_tokens_seen": 94413296, + "router_z_loss_mlp": 0.49609375, + "step": 1136, + "time_per_iteration": 2.781304359436035 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059818, + "balance_loss_mlp": 1.0098455, + "epoch": 0.21873797614467103, + "flos": 409549791744.0, + "grad_norm": 0.03874067782032871, + "language_loss": 0.90736896, + "learning_rate": 0.000909466790976998, + "loss": 0.91796714, + "num_input_tokens_seen": 94475840, + "router_z_loss_mlp": 0.49951172, + "step": 1137, + "time_per_iteration": 2.4837231636047363 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055988, + "balance_loss_mlp": 1.00620675, + "epoch": 0.21893035782993459, + "flos": 895655969280.0, + "grad_norm": 0.03281311030157744, + "language_loss": 0.83296013, + "learning_rate": 0.0009092879210153473, + "loss": 0.84352005, + "num_input_tokens_seen": 94555184, + "router_z_loss_mlp": 0.49682617, + "step": 1138, + "time_per_iteration": 3.156329870223999 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01058359, + "balance_loss_mlp": 1.00862455, + "epoch": 0.21912273951519814, + "flos": 468569048064.0, + "grad_norm": 0.03332829582894704, + "language_loss": 0.89480728, + "learning_rate": 0.0009091088921540333, + "loss": 0.90539086, + "num_input_tokens_seen": 94622656, + "router_z_loss_mlp": 0.49731445, + "step": 1139, + "time_per_iteration": 2.5444674491882324 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060322, + "balance_loss_mlp": 1.01197052, + "epoch": 0.2193151212004617, + "flos": 1535180118528.0, + "grad_norm": 0.009447727830516332, + "language_loss": 0.75508678, + "learning_rate": 0.0009089297044625615, + "loss": 0.76569003, + "num_input_tokens_seen": 94856496, + "router_z_loss_mlp": 0.48339844, + "step": 1140, + "time_per_iteration": 4.993603944778442 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105127, + "balance_loss_mlp": 1.00158358, + "epoch": 0.2195075028857253, + "flos": 592275703296.0, + "grad_norm": 0.039648398816974934, + "language_loss": 0.85201681, + "learning_rate": 0.0009087503580104985, + "loss": 0.86252946, + "num_input_tokens_seen": 94926880, + "router_z_loss_mlp": 0.49560547, + "step": 1141, + "time_per_iteration": 2.6736245155334473 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053862, + "balance_loss_mlp": 1.00436676, + "epoch": 0.21969988457098885, + "flos": 637518776832.0, + "grad_norm": 0.03678403810630545, + "language_loss": 0.8005864, + "learning_rate": 0.0009085708528674728, + "loss": 0.81112504, + "num_input_tokens_seen": 95000528, + "router_z_loss_mlp": 0.49414062, + "step": 1142, + "time_per_iteration": 2.799607038497925 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053259, + "balance_loss_mlp": 1.00362051, + "epoch": 0.2198922662562524, + "flos": 913860903936.0, + "grad_norm": 0.040969430424554455, + "language_loss": 0.86853033, + "learning_rate": 0.0009083911891031745, + "loss": 0.87906301, + "num_input_tokens_seen": 95081040, + "router_z_loss_mlp": 0.49487305, + "step": 1143, + "time_per_iteration": 3.1043601036071777 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010518, + "balance_loss_mlp": 1.00235164, + "epoch": 0.22008464794151597, + "flos": 824495550720.0, + "grad_norm": 0.03475506353694162, + "language_loss": 0.91937912, + "learning_rate": 0.0009082113667873553, + "loss": 0.92989707, + "num_input_tokens_seen": 95167328, + "router_z_loss_mlp": 0.4934082, + "step": 1144, + "time_per_iteration": 3.114678144454956 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055626, + "balance_loss_mlp": 1.00636888, + "epoch": 0.22027702962677953, + "flos": 460619455488.0, + "grad_norm": 0.047183367988671336, + "language_loss": 0.91319406, + "learning_rate": 0.0009080313859898283, + "loss": 0.92375034, + "num_input_tokens_seen": 95230304, + "router_z_loss_mlp": 0.49145508, + "step": 1145, + "time_per_iteration": 2.529627799987793 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01058128, + "balance_loss_mlp": 1.00877535, + "epoch": 0.2204694113120431, + "flos": 532288264704.0, + "grad_norm": 0.034289556826903954, + "language_loss": 0.91988164, + "learning_rate": 0.0009078512467804684, + "loss": 0.93046296, + "num_input_tokens_seen": 95299520, + "router_z_loss_mlp": 0.49243164, + "step": 1146, + "time_per_iteration": 2.692556381225586 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01056493, + "balance_loss_mlp": 1.00737858, + "epoch": 0.22066179299730665, + "flos": 523687385088.0, + "grad_norm": 0.03628724645244133, + "language_loss": 0.91349947, + "learning_rate": 0.0009076709492292119, + "loss": 0.9240644, + "num_input_tokens_seen": 95368912, + "router_z_loss_mlp": 0.49023438, + "step": 1147, + "time_per_iteration": 2.6262857913970947 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01056943, + "balance_loss_mlp": 1.00799513, + "epoch": 0.2208541746825702, + "flos": 547506027264.0, + "grad_norm": 0.0383258843164557, + "language_loss": 0.89899343, + "learning_rate": 0.0009074904934060562, + "loss": 0.90956283, + "num_input_tokens_seen": 95440800, + "router_z_loss_mlp": 0.48901367, + "step": 1148, + "time_per_iteration": 2.710716962814331 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054195, + "balance_loss_mlp": 1.00498509, + "epoch": 0.22104655636783377, + "flos": 710060504064.0, + "grad_norm": 0.034028934421108444, + "language_loss": 0.85814822, + "learning_rate": 0.0009073098793810607, + "loss": 0.86869013, + "num_input_tokens_seen": 95519904, + "router_z_loss_mlp": 0.4909668, + "step": 1149, + "time_per_iteration": 2.986891269683838 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01056627, + "balance_loss_mlp": 1.00758433, + "epoch": 0.22123893805309736, + "flos": 585965021952.0, + "grad_norm": 0.03641392016248804, + "language_loss": 0.88886124, + "learning_rate": 0.000907129107224346, + "loss": 0.89942753, + "num_input_tokens_seen": 95591568, + "router_z_loss_mlp": 0.48999023, + "step": 1150, + "time_per_iteration": 2.7348337173461914 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055224, + "balance_loss_mlp": 1.00601482, + "epoch": 0.22143131973836092, + "flos": 493251859968.0, + "grad_norm": 0.02984339906163832, + "language_loss": 0.89448893, + "learning_rate": 0.0009069481770060939, + "loss": 0.90504116, + "num_input_tokens_seen": 95664480, + "router_z_loss_mlp": 0.49121094, + "step": 1151, + "time_per_iteration": 2.688180685043335 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055578, + "balance_loss_mlp": 1.00593948, + "epoch": 0.22162370142362448, + "flos": 1081469174784.0, + "grad_norm": 0.034516826316188534, + "language_loss": 0.8487525, + "learning_rate": 0.000906767088796548, + "loss": 0.85930824, + "num_input_tokens_seen": 95754400, + "router_z_loss_mlp": 0.49584961, + "step": 1152, + "time_per_iteration": 3.4747724533081055 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01057959, + "balance_loss_mlp": 1.00841522, + "epoch": 0.22181608310888803, + "flos": 493512374784.0, + "grad_norm": 0.03114695536209251, + "language_loss": 0.87880313, + "learning_rate": 0.0009065858426660127, + "loss": 0.88938272, + "num_input_tokens_seen": 95826944, + "router_z_loss_mlp": 0.49462891, + "step": 1153, + "time_per_iteration": 2.6112635135650635 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060109, + "balance_loss_mlp": 1.0103749, + "epoch": 0.2220084647941516, + "flos": 725325898752.0, + "grad_norm": 0.04119971901255946, + "language_loss": 0.85662532, + "learning_rate": 0.0009064044386848543, + "loss": 0.86722642, + "num_input_tokens_seen": 95902688, + "router_z_loss_mlp": 0.49658203, + "step": 1154, + "time_per_iteration": 2.893120288848877 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105547, + "balance_loss_mlp": 1.00564086, + "epoch": 0.22220084647941515, + "flos": 490245245952.0, + "grad_norm": 0.04012578927121656, + "language_loss": 0.89651787, + "learning_rate": 0.0009062228769234997, + "loss": 0.9070726, + "num_input_tokens_seen": 95969952, + "router_z_loss_mlp": 0.49731445, + "step": 1155, + "time_per_iteration": 2.544904947280884 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053369, + "balance_loss_mlp": 1.00344408, + "epoch": 0.2223932281646787, + "flos": 537296371968.0, + "grad_norm": 0.03814815821860503, + "language_loss": 0.82016486, + "learning_rate": 0.0009060411574524376, + "loss": 0.83069855, + "num_input_tokens_seen": 96037344, + "router_z_loss_mlp": 0.49804688, + "step": 1156, + "time_per_iteration": 2.6412572860717773 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01056148, + "balance_loss_mlp": 1.00660419, + "epoch": 0.22258560984994227, + "flos": 932968892160.0, + "grad_norm": 0.0415511709861084, + "language_loss": 0.88770878, + "learning_rate": 0.0009058592803422178, + "loss": 0.89827025, + "num_input_tokens_seen": 96115616, + "router_z_loss_mlp": 0.49462891, + "step": 1157, + "time_per_iteration": 4.623233079910278 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055229, + "balance_loss_mlp": 1.00792694, + "epoch": 0.22277799153520586, + "flos": 1202397638400.0, + "grad_norm": 0.007067436666665483, + "language_loss": 0.78710288, + "learning_rate": 0.0009056772456634512, + "loss": 0.79765517, + "num_input_tokens_seen": 96333600, + "router_z_loss_mlp": 0.47265625, + "step": 1158, + "time_per_iteration": 4.805820465087891 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053661, + "balance_loss_mlp": 1.00397491, + "epoch": 0.22297037322046942, + "flos": 502317388800.0, + "grad_norm": 0.032485949168455416, + "language_loss": 0.91067338, + "learning_rate": 0.00090549505348681, + "loss": 0.92121005, + "num_input_tokens_seen": 96402544, + "router_z_loss_mlp": 0.49633789, + "step": 1159, + "time_per_iteration": 2.5877561569213867 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054105, + "balance_loss_mlp": 1.00427544, + "epoch": 0.22316275490573298, + "flos": 754113764352.0, + "grad_norm": 0.0354615562345569, + "language_loss": 0.84617937, + "learning_rate": 0.0009053127038830275, + "loss": 0.85672045, + "num_input_tokens_seen": 96487600, + "router_z_loss_mlp": 0.49731445, + "step": 1160, + "time_per_iteration": 3.0164098739624023 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01057554, + "balance_loss_mlp": 1.00777233, + "epoch": 0.22335513659099654, + "flos": 515804866560.0, + "grad_norm": 0.03692799991821936, + "language_loss": 0.87995219, + "learning_rate": 0.000905130196922898, + "loss": 0.89052767, + "num_input_tokens_seen": 96554912, + "router_z_loss_mlp": 0.49682617, + "step": 1161, + "time_per_iteration": 2.603769063949585 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01058076, + "balance_loss_mlp": 1.00848484, + "epoch": 0.2235475182762601, + "flos": 485508347136.0, + "grad_norm": 0.031071089964746976, + "language_loss": 0.8758713, + "learning_rate": 0.0009049475326772769, + "loss": 0.88645208, + "num_input_tokens_seen": 96624192, + "router_z_loss_mlp": 0.49511719, + "step": 1162, + "time_per_iteration": 2.6613070964813232 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052796, + "balance_loss_mlp": 1.00334835, + "epoch": 0.22373989996152366, + "flos": 471068238336.0, + "grad_norm": 0.03308636607962537, + "language_loss": 0.83887613, + "learning_rate": 0.0009047647112170811, + "loss": 0.84940416, + "num_input_tokens_seen": 96701040, + "router_z_loss_mlp": 0.49389648, + "step": 1163, + "time_per_iteration": 2.8056106567382812 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105253, + "balance_loss_mlp": 1.00322485, + "epoch": 0.22393228164678722, + "flos": 1273019542272.0, + "grad_norm": 0.035987441954907426, + "language_loss": 0.88180983, + "learning_rate": 0.0009045817326132876, + "loss": 0.89233518, + "num_input_tokens_seen": 96791200, + "router_z_loss_mlp": 0.49243164, + "step": 1164, + "time_per_iteration": 3.7020320892333984 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055012, + "balance_loss_mlp": 1.00575495, + "epoch": 0.22412466333205078, + "flos": 597468503040.0, + "grad_norm": 0.03371692057767332, + "language_loss": 0.84342653, + "learning_rate": 0.0009043985969369357, + "loss": 0.85397661, + "num_input_tokens_seen": 96869360, + "router_z_loss_mlp": 0.49145508, + "step": 1165, + "time_per_iteration": 2.8581626415252686 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052347, + "balance_loss_mlp": 1.00299454, + "epoch": 0.22431704501731436, + "flos": 609632019456.0, + "grad_norm": 0.03010954873673584, + "language_loss": 0.84869868, + "learning_rate": 0.0009042153042591245, + "loss": 0.85922217, + "num_input_tokens_seen": 96945840, + "router_z_loss_mlp": 0.49243164, + "step": 1166, + "time_per_iteration": 2.810300827026367 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054301, + "balance_loss_mlp": 1.0050199, + "epoch": 0.22450942670257792, + "flos": 908108190720.0, + "grad_norm": 0.030118647676053625, + "language_loss": 0.86120874, + "learning_rate": 0.0009040318546510146, + "loss": 0.87175173, + "num_input_tokens_seen": 97029296, + "router_z_loss_mlp": 0.49169922, + "step": 1167, + "time_per_iteration": 3.129802942276001 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01057032, + "balance_loss_mlp": 1.00791764, + "epoch": 0.22470180838784148, + "flos": 566381690880.0, + "grad_norm": 0.035718478093575166, + "language_loss": 0.85780692, + "learning_rate": 0.0009038482481838275, + "loss": 0.86837721, + "num_input_tokens_seen": 97097776, + "router_z_loss_mlp": 0.49047852, + "step": 1168, + "time_per_iteration": 2.674471855163574 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010578, + "balance_loss_mlp": 1.00880456, + "epoch": 0.22489419007310504, + "flos": 835918351872.0, + "grad_norm": 0.03078757560697398, + "language_loss": 0.88093269, + "learning_rate": 0.0009036644849288455, + "loss": 0.89151073, + "num_input_tokens_seen": 97181424, + "router_z_loss_mlp": 0.48925781, + "step": 1169, + "time_per_iteration": 3.126168727874756 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052146, + "balance_loss_mlp": 1.00331759, + "epoch": 0.2250865717583686, + "flos": 582139924992.0, + "grad_norm": 0.03503818002335677, + "language_loss": 0.86431491, + "learning_rate": 0.0009034805649574118, + "loss": 0.87483639, + "num_input_tokens_seen": 97252128, + "router_z_loss_mlp": 0.48779297, + "step": 1170, + "time_per_iteration": 2.6982839107513428 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01056949, + "balance_loss_mlp": 1.0084312, + "epoch": 0.22527895344363216, + "flos": 601671733248.0, + "grad_norm": 0.031992933731526396, + "language_loss": 0.85811341, + "learning_rate": 0.0009032964883409308, + "loss": 0.86868292, + "num_input_tokens_seen": 97326640, + "router_z_loss_mlp": 0.48510742, + "step": 1171, + "time_per_iteration": 2.9468932151794434 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055088, + "balance_loss_mlp": 1.00826263, + "epoch": 0.22547133512889572, + "flos": 1443734537472.0, + "grad_norm": 0.010800983830845337, + "language_loss": 0.73050535, + "learning_rate": 0.000903112255150867, + "loss": 0.7410562, + "num_input_tokens_seen": 97553952, + "router_z_loss_mlp": 0.46777344, + "step": 1172, + "time_per_iteration": 5.044191360473633 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105371, + "balance_loss_mlp": 1.0051204, + "epoch": 0.22566371681415928, + "flos": 491586703872.0, + "grad_norm": 0.034976527569036825, + "language_loss": 0.88142014, + "learning_rate": 0.0009029278654587462, + "loss": 0.89195722, + "num_input_tokens_seen": 97623584, + "router_z_loss_mlp": 0.48583984, + "step": 1173, + "time_per_iteration": 2.5891120433807373 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105611, + "balance_loss_mlp": 1.00749624, + "epoch": 0.22585609849942284, + "flos": 605752487424.0, + "grad_norm": 0.03629905495680353, + "language_loss": 0.82793885, + "learning_rate": 0.0009027433193361548, + "loss": 0.83850002, + "num_input_tokens_seen": 97695952, + "router_z_loss_mlp": 0.48583984, + "step": 1174, + "time_per_iteration": 2.707061290740967 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105926, + "balance_loss_mlp": 1.01064646, + "epoch": 0.22604848018468643, + "flos": 636728484096.0, + "grad_norm": 0.035409171913978986, + "language_loss": 0.87780964, + "learning_rate": 0.00090255861685474, + "loss": 0.88840234, + "num_input_tokens_seen": 97764544, + "router_z_loss_mlp": 0.48608398, + "step": 1175, + "time_per_iteration": 2.7910189628601074 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01056162, + "balance_loss_mlp": 1.00752461, + "epoch": 0.22624086186995, + "flos": 480845325312.0, + "grad_norm": 0.040136392489239156, + "language_loss": 0.91905487, + "learning_rate": 0.0009023737580862095, + "loss": 0.92961645, + "num_input_tokens_seen": 97830976, + "router_z_loss_mlp": 0.48632812, + "step": 1176, + "time_per_iteration": 2.5489909648895264 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054427, + "balance_loss_mlp": 1.00600469, + "epoch": 0.22643324355521355, + "flos": 496807693824.0, + "grad_norm": 0.032828642541270554, + "language_loss": 0.83966863, + "learning_rate": 0.0009021887431023321, + "loss": 0.85021293, + "num_input_tokens_seen": 97898800, + "router_z_loss_mlp": 0.48413086, + "step": 1177, + "time_per_iteration": 2.679046392440796 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060571, + "balance_loss_mlp": 1.01224387, + "epoch": 0.2266256252404771, + "flos": 562684905984.0, + "grad_norm": 0.03431341234676521, + "language_loss": 0.8836711, + "learning_rate": 0.0009020035719749369, + "loss": 0.89427686, + "num_input_tokens_seen": 97974112, + "router_z_loss_mlp": 0.4831543, + "step": 1178, + "time_per_iteration": 2.777273416519165 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053755, + "balance_loss_mlp": 1.00516534, + "epoch": 0.22681800692574067, + "flos": 581033703936.0, + "grad_norm": 0.0422995660898389, + "language_loss": 0.78512251, + "learning_rate": 0.0009018182447759136, + "loss": 0.79566014, + "num_input_tokens_seen": 98056640, + "router_z_loss_mlp": 0.48583984, + "step": 1179, + "time_per_iteration": 2.9779903888702393 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105363, + "balance_loss_mlp": 1.00508785, + "epoch": 0.22701038861100423, + "flos": 741466156800.0, + "grad_norm": 0.03672617722264385, + "language_loss": 0.80683887, + "learning_rate": 0.0009016327615772126, + "loss": 0.81737518, + "num_input_tokens_seen": 98135952, + "router_z_loss_mlp": 0.48535156, + "step": 1180, + "time_per_iteration": 2.953355312347412 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054935, + "balance_loss_mlp": 1.00636911, + "epoch": 0.2272027702962678, + "flos": 578306079744.0, + "grad_norm": 0.03924605706365315, + "language_loss": 0.88551408, + "learning_rate": 0.0009014471224508451, + "loss": 0.89606345, + "num_input_tokens_seen": 98204288, + "router_z_loss_mlp": 0.4855957, + "step": 1181, + "time_per_iteration": 2.7092630863189697 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01056081, + "balance_loss_mlp": 1.00744355, + "epoch": 0.22739515198153135, + "flos": 545291651328.0, + "grad_norm": 0.04038062834310644, + "language_loss": 0.83949769, + "learning_rate": 0.0009012613274688823, + "loss": 0.85005856, + "num_input_tokens_seen": 98269856, + "router_z_loss_mlp": 0.48632812, + "step": 1182, + "time_per_iteration": 2.642143964767456 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055555, + "balance_loss_mlp": 1.00689363, + "epoch": 0.22758753366679493, + "flos": 441092504832.0, + "grad_norm": 0.03566258536478163, + "language_loss": 0.88506091, + "learning_rate": 0.0009010753767034565, + "loss": 0.89561647, + "num_input_tokens_seen": 98335632, + "router_z_loss_mlp": 0.48632812, + "step": 1183, + "time_per_iteration": 2.599167585372925 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053804, + "balance_loss_mlp": 1.00526154, + "epoch": 0.2277799153520585, + "flos": 730824900096.0, + "grad_norm": 0.03354089847275564, + "language_loss": 0.79992342, + "learning_rate": 0.0009008892702267599, + "loss": 0.81046152, + "num_input_tokens_seen": 98420592, + "router_z_loss_mlp": 0.48535156, + "step": 1184, + "time_per_iteration": 2.9798924922943115 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01057068, + "balance_loss_mlp": 1.00855029, + "epoch": 0.22797229703732205, + "flos": 527913947904.0, + "grad_norm": 0.04184098346005727, + "language_loss": 0.89975739, + "learning_rate": 0.0009007030081110457, + "loss": 0.91032803, + "num_input_tokens_seen": 98488096, + "router_z_loss_mlp": 0.48510742, + "step": 1185, + "time_per_iteration": 2.6349968910217285 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01057531, + "balance_loss_mlp": 1.00910807, + "epoch": 0.2281646787225856, + "flos": 536521630464.0, + "grad_norm": 0.03583751901003141, + "language_loss": 0.85487026, + "learning_rate": 0.000900516590428627, + "loss": 0.86544555, + "num_input_tokens_seen": 98561664, + "router_z_loss_mlp": 0.48413086, + "step": 1186, + "time_per_iteration": 2.669015407562256 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054531, + "balance_loss_mlp": 1.00596476, + "epoch": 0.22835706040784917, + "flos": 542478478080.0, + "grad_norm": 0.03191556588332838, + "language_loss": 0.9033947, + "learning_rate": 0.0009003300172518778, + "loss": 0.91394001, + "num_input_tokens_seen": 98634336, + "router_z_loss_mlp": 0.4855957, + "step": 1187, + "time_per_iteration": 2.7164688110351562 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01056755, + "balance_loss_mlp": 1.00804579, + "epoch": 0.22854944209311273, + "flos": 792006042624.0, + "grad_norm": 0.0322044633529041, + "language_loss": 0.85374159, + "learning_rate": 0.0009001432886532321, + "loss": 0.86430913, + "num_input_tokens_seen": 98709600, + "router_z_loss_mlp": 0.48681641, + "step": 1188, + "time_per_iteration": 2.9621965885162354 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054799, + "balance_loss_mlp": 1.00568485, + "epoch": 0.2287418237783763, + "flos": 470216707584.0, + "grad_norm": 0.03536870053258389, + "language_loss": 0.87358034, + "learning_rate": 0.0008999564047051843, + "loss": 0.88412833, + "num_input_tokens_seen": 98775024, + "router_z_loss_mlp": 0.49047852, + "step": 1189, + "time_per_iteration": 2.5233154296875 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01058932, + "balance_loss_mlp": 1.01003218, + "epoch": 0.22893420546363985, + "flos": 469005507072.0, + "grad_norm": 0.030491923293758834, + "language_loss": 0.8554523, + "learning_rate": 0.0008997693654802894, + "loss": 0.86604154, + "num_input_tokens_seen": 98845248, + "router_z_loss_mlp": 0.48852539, + "step": 1190, + "time_per_iteration": 2.6391589641571045 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01058724, + "balance_loss_mlp": 1.00965738, + "epoch": 0.22912658714890344, + "flos": 627402440448.0, + "grad_norm": 0.0331512035559832, + "language_loss": 0.87166977, + "learning_rate": 0.0008995821710511625, + "loss": 0.88225698, + "num_input_tokens_seen": 98913584, + "router_z_loss_mlp": 0.49023438, + "step": 1191, + "time_per_iteration": 2.7549567222595215 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054803, + "balance_loss_mlp": 1.00599909, + "epoch": 0.229318968834167, + "flos": 504021428736.0, + "grad_norm": 0.030936804790582927, + "language_loss": 0.85688579, + "learning_rate": 0.0008993948214904786, + "loss": 0.86743385, + "num_input_tokens_seen": 98978608, + "router_z_loss_mlp": 0.48779297, + "step": 1192, + "time_per_iteration": 2.596224784851074 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061855, + "balance_loss_mlp": 1.01483917, + "epoch": 0.22951135051943056, + "flos": 1377716374272.0, + "grad_norm": 0.008909469382289665, + "language_loss": 0.78422213, + "learning_rate": 0.0008992073168709733, + "loss": 0.79484069, + "num_input_tokens_seen": 99207424, + "router_z_loss_mlp": 0.46972656, + "step": 1193, + "time_per_iteration": 4.853066921234131 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062271, + "balance_loss_mlp": 1.01356232, + "epoch": 0.22970373220469412, + "flos": 645550994688.0, + "grad_norm": 0.0389743097765726, + "language_loss": 0.78935194, + "learning_rate": 0.0008990196572654427, + "loss": 0.79997468, + "num_input_tokens_seen": 99290592, + "router_z_loss_mlp": 0.48681641, + "step": 1194, + "time_per_iteration": 2.8869853019714355 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01056276, + "balance_loss_mlp": 1.00771046, + "epoch": 0.22989611388995768, + "flos": 501273384192.0, + "grad_norm": 0.02988304738122761, + "language_loss": 0.88486552, + "learning_rate": 0.0008988318427467426, + "loss": 0.8954283, + "num_input_tokens_seen": 99366096, + "router_z_loss_mlp": 0.4855957, + "step": 1195, + "time_per_iteration": 2.6931521892547607 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053709, + "balance_loss_mlp": 1.00514269, + "epoch": 0.23008849557522124, + "flos": 1098334596864.0, + "grad_norm": 0.03694163801075408, + "language_loss": 0.87307864, + "learning_rate": 0.0008986438733877887, + "loss": 0.88361579, + "num_input_tokens_seen": 99456768, + "router_z_loss_mlp": 0.4855957, + "step": 1196, + "time_per_iteration": 3.4505865573883057 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053065, + "balance_loss_mlp": 1.00447488, + "epoch": 0.2302808772604848, + "flos": 684993722880.0, + "grad_norm": 0.030674764969734848, + "language_loss": 0.85086071, + "learning_rate": 0.0008984557492615576, + "loss": 0.86139137, + "num_input_tokens_seen": 99539616, + "router_z_loss_mlp": 0.48583984, + "step": 1197, + "time_per_iteration": 2.936891794204712 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01056985, + "balance_loss_mlp": 1.00837183, + "epoch": 0.23047325894574835, + "flos": 529961127936.0, + "grad_norm": 0.03469763625730159, + "language_loss": 0.90249604, + "learning_rate": 0.0008982674704410854, + "loss": 0.91306591, + "num_input_tokens_seen": 99612064, + "router_z_loss_mlp": 0.48608398, + "step": 1198, + "time_per_iteration": 2.6928677558898926 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055103, + "balance_loss_mlp": 1.00653744, + "epoch": 0.23066564063101191, + "flos": 684127607808.0, + "grad_norm": 0.03582939263118032, + "language_loss": 0.78263444, + "learning_rate": 0.0008980790369994682, + "loss": 0.79318547, + "num_input_tokens_seen": 99691040, + "router_z_loss_mlp": 0.4855957, + "step": 1199, + "time_per_iteration": 2.941063642501831 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105528, + "balance_loss_mlp": 1.00692904, + "epoch": 0.2308580223162755, + "flos": 559632605184.0, + "grad_norm": 0.03400437188822284, + "language_loss": 0.87868834, + "learning_rate": 0.000897890449009863, + "loss": 0.88924116, + "num_input_tokens_seen": 99762016, + "router_z_loss_mlp": 0.48339844, + "step": 1200, + "time_per_iteration": 2.6677346229553223 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01058779, + "balance_loss_mlp": 1.01061893, + "epoch": 0.23105040400153906, + "flos": 556730003712.0, + "grad_norm": 0.030515141355108834, + "language_loss": 0.90571141, + "learning_rate": 0.0008977017065454853, + "loss": 0.91629916, + "num_input_tokens_seen": 99835552, + "router_z_loss_mlp": 0.48144531, + "step": 1201, + "time_per_iteration": 2.7204995155334473 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053158, + "balance_loss_mlp": 1.00506902, + "epoch": 0.23124278568680262, + "flos": 706050714624.0, + "grad_norm": 0.034769733982414605, + "language_loss": 0.81452352, + "learning_rate": 0.0008975128096796121, + "loss": 0.82505512, + "num_input_tokens_seen": 99910784, + "router_z_loss_mlp": 0.48071289, + "step": 1202, + "time_per_iteration": 2.861058473587036 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105345, + "balance_loss_mlp": 1.00517035, + "epoch": 0.23143516737206618, + "flos": 613969397760.0, + "grad_norm": 0.03845725381901349, + "language_loss": 0.86815399, + "learning_rate": 0.0008973237584855794, + "loss": 0.87868845, + "num_input_tokens_seen": 99991120, + "router_z_loss_mlp": 0.48266602, + "step": 1203, + "time_per_iteration": 2.907670021057129 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055623, + "balance_loss_mlp": 1.00715244, + "epoch": 0.23162754905732974, + "flos": 390096718080.0, + "grad_norm": 0.03680581416715809, + "language_loss": 0.82972479, + "learning_rate": 0.0008971345530367832, + "loss": 0.84028101, + "num_input_tokens_seen": 100053888, + "router_z_loss_mlp": 0.48461914, + "step": 1204, + "time_per_iteration": 2.4500131607055664 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050803, + "balance_loss_mlp": 1.00190353, + "epoch": 0.2318199307425933, + "flos": 668970116352.0, + "grad_norm": 0.03636020946200237, + "language_loss": 0.86001658, + "learning_rate": 0.0008969451934066799, + "loss": 0.87052464, + "num_input_tokens_seen": 100124176, + "router_z_loss_mlp": 0.48828125, + "step": 1205, + "time_per_iteration": 2.786860704421997 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054481, + "balance_loss_mlp": 1.00558126, + "epoch": 0.23201231242785686, + "flos": 667628658432.0, + "grad_norm": 0.042825772722853955, + "language_loss": 0.80798173, + "learning_rate": 0.0008967556796687854, + "loss": 0.81852657, + "num_input_tokens_seen": 100205296, + "router_z_loss_mlp": 0.48852539, + "step": 1206, + "time_per_iteration": 2.9043900966644287 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106058, + "balance_loss_mlp": 1.01153755, + "epoch": 0.23220469411312042, + "flos": 750095226624.0, + "grad_norm": 0.036226897286377145, + "language_loss": 0.84918714, + "learning_rate": 0.0008965660118966752, + "loss": 0.85979295, + "num_input_tokens_seen": 100279440, + "router_z_loss_mlp": 0.48974609, + "step": 1207, + "time_per_iteration": 2.8989100456237793 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054609, + "balance_loss_mlp": 1.00597119, + "epoch": 0.232397075798384, + "flos": 668262448896.0, + "grad_norm": 0.03230217319227319, + "language_loss": 0.90859735, + "learning_rate": 0.0008963761901639851, + "loss": 0.91914344, + "num_input_tokens_seen": 100354512, + "router_z_loss_mlp": 0.48632812, + "step": 1208, + "time_per_iteration": 2.801715612411499 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050654, + "balance_loss_mlp": 1.00204051, + "epoch": 0.23258945748364757, + "flos": 611346753024.0, + "grad_norm": 0.038379048380249, + "language_loss": 0.83753544, + "learning_rate": 0.0008961862145444103, + "loss": 0.84804195, + "num_input_tokens_seen": 100426848, + "router_z_loss_mlp": 0.48608398, + "step": 1209, + "time_per_iteration": 2.6739237308502197 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105504, + "balance_loss_mlp": 1.00656986, + "epoch": 0.23278183916891113, + "flos": 490672956672.0, + "grad_norm": 0.04093378826068356, + "language_loss": 0.86382735, + "learning_rate": 0.0008959960851117059, + "loss": 0.87437773, + "num_input_tokens_seen": 100496176, + "router_z_loss_mlp": 0.48461914, + "step": 1210, + "time_per_iteration": 2.635650634765625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01056695, + "balance_loss_mlp": 1.00808144, + "epoch": 0.23297422085417469, + "flos": 512674798080.0, + "grad_norm": 0.0354403494585401, + "language_loss": 0.84509313, + "learning_rate": 0.0008958058019396868, + "loss": 0.85566002, + "num_input_tokens_seen": 100575072, + "router_z_loss_mlp": 0.48608398, + "step": 1211, + "time_per_iteration": 2.788318157196045 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105326, + "balance_loss_mlp": 1.00462246, + "epoch": 0.23316660253943824, + "flos": 547532272128.0, + "grad_norm": 0.03263062148431384, + "language_loss": 0.87462825, + "learning_rate": 0.0008956153651022274, + "loss": 0.8851608, + "num_input_tokens_seen": 100648304, + "router_z_loss_mlp": 0.48608398, + "step": 1212, + "time_per_iteration": 2.725313901901245 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105588, + "balance_loss_mlp": 1.00709951, + "epoch": 0.2333589842247018, + "flos": 511289598720.0, + "grad_norm": 0.03371055024816449, + "language_loss": 0.84886169, + "learning_rate": 0.0008954247746732618, + "loss": 0.85942048, + "num_input_tokens_seen": 100717616, + "router_z_loss_mlp": 0.48754883, + "step": 1213, + "time_per_iteration": 2.592165470123291 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01057864, + "balance_loss_mlp": 1.00894058, + "epoch": 0.23355136590996536, + "flos": 664407216384.0, + "grad_norm": 0.030798488974581865, + "language_loss": 0.9124192, + "learning_rate": 0.0008952340307267837, + "loss": 0.92299783, + "num_input_tokens_seen": 100797056, + "router_z_loss_mlp": 0.48876953, + "step": 1214, + "time_per_iteration": 2.887542724609375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051914, + "balance_loss_mlp": 1.00332439, + "epoch": 0.23374374759522892, + "flos": 509465995008.0, + "grad_norm": 0.038631928770240895, + "language_loss": 0.8442086, + "learning_rate": 0.0008950431333368468, + "loss": 0.85472775, + "num_input_tokens_seen": 100863632, + "router_z_loss_mlp": 0.48583984, + "step": 1215, + "time_per_iteration": 2.5713701248168945 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051571, + "balance_loss_mlp": 1.00283849, + "epoch": 0.2339361292804925, + "flos": 1296429915648.0, + "grad_norm": 0.03446682830311694, + "language_loss": 0.8584398, + "learning_rate": 0.0008948520825775634, + "loss": 0.86895549, + "num_input_tokens_seen": 100950272, + "router_z_loss_mlp": 0.48706055, + "step": 1216, + "time_per_iteration": 3.631596565246582 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054055, + "balance_loss_mlp": 1.00541723, + "epoch": 0.23412851096575607, + "flos": 707177344512.0, + "grad_norm": 0.031791306217448204, + "language_loss": 0.84468639, + "learning_rate": 0.0008946608785231067, + "loss": 0.85522687, + "num_input_tokens_seen": 101031008, + "router_z_loss_mlp": 0.48632812, + "step": 1217, + "time_per_iteration": 2.878099203109741 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053689, + "balance_loss_mlp": 1.00517046, + "epoch": 0.23432089265101963, + "flos": 439175582208.0, + "grad_norm": 0.03486793229645632, + "language_loss": 0.85493773, + "learning_rate": 0.0008944695212477084, + "loss": 0.86547458, + "num_input_tokens_seen": 101094688, + "router_z_loss_mlp": 0.48510742, + "step": 1218, + "time_per_iteration": 2.5141704082489014 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053338, + "balance_loss_mlp": 1.00498641, + "epoch": 0.2345132743362832, + "flos": 481915574784.0, + "grad_norm": 0.03047714423600347, + "language_loss": 0.87145793, + "learning_rate": 0.0008942780108256599, + "loss": 0.88199133, + "num_input_tokens_seen": 101163744, + "router_z_loss_mlp": 0.48339844, + "step": 1219, + "time_per_iteration": 2.6020901203155518 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050373, + "balance_loss_mlp": 1.00180733, + "epoch": 0.23470565602154675, + "flos": 412341577728.0, + "grad_norm": 0.03328064907126118, + "language_loss": 0.87382472, + "learning_rate": 0.0008940863473313121, + "loss": 0.88432848, + "num_input_tokens_seen": 101226480, + "router_z_loss_mlp": 0.4855957, + "step": 1220, + "time_per_iteration": 2.4561610221862793 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053637, + "balance_loss_mlp": 1.00483322, + "epoch": 0.2348980377068103, + "flos": 546500906496.0, + "grad_norm": 0.04239569524538178, + "language_loss": 0.88751769, + "learning_rate": 0.0008938945308390756, + "loss": 0.89805412, + "num_input_tokens_seen": 101291824, + "router_z_loss_mlp": 0.48779297, + "step": 1221, + "time_per_iteration": 2.657763719558716 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01057653, + "balance_loss_mlp": 1.00906336, + "epoch": 0.23509041939207387, + "flos": 576843112704.0, + "grad_norm": 0.04482007629740174, + "language_loss": 0.88039029, + "learning_rate": 0.00089370256142342, + "loss": 0.89096677, + "num_input_tokens_seen": 101367216, + "router_z_loss_mlp": 0.48583984, + "step": 1222, + "time_per_iteration": 2.7348928451538086 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054727, + "balance_loss_mlp": 1.00616074, + "epoch": 0.23528280107733743, + "flos": 589948566528.0, + "grad_norm": 0.030112791330182954, + "language_loss": 0.85687798, + "learning_rate": 0.0008935104391588746, + "loss": 0.86742526, + "num_input_tokens_seen": 101438992, + "router_z_loss_mlp": 0.4855957, + "step": 1223, + "time_per_iteration": 2.7620511054992676 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052122, + "balance_loss_mlp": 1.00350857, + "epoch": 0.235475182762601, + "flos": 824858132736.0, + "grad_norm": 0.028710207733723417, + "language_loss": 0.83630896, + "learning_rate": 0.0008933181641200276, + "loss": 0.84683019, + "num_input_tokens_seen": 101534464, + "router_z_loss_mlp": 0.48608398, + "step": 1224, + "time_per_iteration": 3.1587913036346436 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053568, + "balance_loss_mlp": 1.00531197, + "epoch": 0.23566756444786457, + "flos": 681367902720.0, + "grad_norm": 0.03430983930689064, + "language_loss": 0.86561936, + "learning_rate": 0.0008931257363815271, + "loss": 0.87615514, + "num_input_tokens_seen": 101616496, + "router_z_loss_mlp": 0.48242188, + "step": 1225, + "time_per_iteration": 2.9277396202087402 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01056611, + "balance_loss_mlp": 1.00849795, + "epoch": 0.23585994613312813, + "flos": 703135474176.0, + "grad_norm": 0.029906055234585397, + "language_loss": 0.90256047, + "learning_rate": 0.0008929331560180798, + "loss": 0.91312659, + "num_input_tokens_seen": 101694496, + "router_z_loss_mlp": 0.48095703, + "step": 1226, + "time_per_iteration": 2.911451578140259 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055064, + "balance_loss_mlp": 1.00676012, + "epoch": 0.2360523278183917, + "flos": 525196038912.0, + "grad_norm": 0.030679819106685022, + "language_loss": 0.9186613, + "learning_rate": 0.0008927404231044525, + "loss": 0.92921197, + "num_input_tokens_seen": 101766160, + "router_z_loss_mlp": 0.48291016, + "step": 1227, + "time_per_iteration": 2.6848785877227783 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055842, + "balance_loss_mlp": 1.00756276, + "epoch": 0.23624470950365525, + "flos": 525443914752.0, + "grad_norm": 0.030207709240370546, + "language_loss": 0.82286787, + "learning_rate": 0.0008925475377154703, + "loss": 0.83342624, + "num_input_tokens_seen": 101844160, + "router_z_loss_mlp": 0.48266602, + "step": 1228, + "time_per_iteration": 2.7278709411621094 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01058098, + "balance_loss_mlp": 1.00974643, + "epoch": 0.2364370911889188, + "flos": 597961342464.0, + "grad_norm": 0.04301213480645635, + "language_loss": 0.82405227, + "learning_rate": 0.0008923544999260183, + "loss": 0.83463323, + "num_input_tokens_seen": 101917968, + "router_z_loss_mlp": 0.48339844, + "step": 1229, + "time_per_iteration": 2.7282724380493164 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055873, + "balance_loss_mlp": 1.00766432, + "epoch": 0.23662947287418237, + "flos": 758173131264.0, + "grad_norm": 0.03660169780759576, + "language_loss": 0.92488217, + "learning_rate": 0.00089216130981104, + "loss": 0.93544096, + "num_input_tokens_seen": 101996880, + "router_z_loss_mlp": 0.48193359, + "step": 1230, + "time_per_iteration": 3.0333714485168457 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051461, + "balance_loss_mlp": 1.00337219, + "epoch": 0.23682185455944593, + "flos": 547208573952.0, + "grad_norm": 0.03138155314794734, + "language_loss": 0.83336782, + "learning_rate": 0.000891967967445539, + "loss": 0.8438825, + "num_input_tokens_seen": 102067936, + "router_z_loss_mlp": 0.48071289, + "step": 1231, + "time_per_iteration": 2.7093472480773926 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053937, + "balance_loss_mlp": 1.00587165, + "epoch": 0.2370142362447095, + "flos": 663523604736.0, + "grad_norm": 0.02795314572038805, + "language_loss": 0.89439881, + "learning_rate": 0.0008917744729045772, + "loss": 0.90493822, + "num_input_tokens_seen": 102147552, + "router_z_loss_mlp": 0.48046875, + "step": 1232, + "time_per_iteration": 2.8760838508605957 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01057809, + "balance_loss_mlp": 1.00974393, + "epoch": 0.23720661792997308, + "flos": 684913042944.0, + "grad_norm": 0.03460859048974857, + "language_loss": 0.8446126, + "learning_rate": 0.0008915808262632757, + "loss": 0.85519075, + "num_input_tokens_seen": 102224480, + "router_z_loss_mlp": 0.48046875, + "step": 1233, + "time_per_iteration": 2.889141321182251 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01058483, + "balance_loss_mlp": 1.01058459, + "epoch": 0.23739899961523664, + "flos": 560023377408.0, + "grad_norm": 0.03296017154749467, + "language_loss": 0.94079709, + "learning_rate": 0.0008913870275968148, + "loss": 0.95138192, + "num_input_tokens_seen": 102297392, + "router_z_loss_mlp": 0.47875977, + "step": 1234, + "time_per_iteration": 2.7432892322540283 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054598, + "balance_loss_mlp": 1.00655627, + "epoch": 0.2375913813005002, + "flos": 891165000960.0, + "grad_norm": 0.03128077017401229, + "language_loss": 0.88428569, + "learning_rate": 0.0008911930769804342, + "loss": 0.89483166, + "num_input_tokens_seen": 102386032, + "router_z_loss_mlp": 0.48022461, + "step": 1235, + "time_per_iteration": 3.261483669281006 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105528, + "balance_loss_mlp": 1.00692844, + "epoch": 0.23778376298576376, + "flos": 642366491136.0, + "grad_norm": 0.029107844015886564, + "language_loss": 0.91850013, + "learning_rate": 0.0008909989744894318, + "loss": 0.92905295, + "num_input_tokens_seen": 102463504, + "router_z_loss_mlp": 0.48339844, + "step": 1236, + "time_per_iteration": 2.8673832416534424 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061412, + "balance_loss_mlp": 1.01287031, + "epoch": 0.23797614467102732, + "flos": 617946139392.0, + "grad_norm": 0.034095811880077646, + "language_loss": 0.82566786, + "learning_rate": 0.0008908047201991649, + "loss": 0.83628196, + "num_input_tokens_seen": 102529632, + "router_z_loss_mlp": 0.48535156, + "step": 1237, + "time_per_iteration": 2.7810442447662354 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053588, + "balance_loss_mlp": 1.00511789, + "epoch": 0.23816852635629088, + "flos": 625464130560.0, + "grad_norm": 0.032663011960307756, + "language_loss": 0.87081301, + "learning_rate": 0.0008906103141850502, + "loss": 0.88134885, + "num_input_tokens_seen": 102610192, + "router_z_loss_mlp": 0.48461914, + "step": 1238, + "time_per_iteration": 2.880305528640747 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052354, + "balance_loss_mlp": 1.00416911, + "epoch": 0.23836090804155444, + "flos": 522441191424.0, + "grad_norm": 0.03474425243888252, + "language_loss": 0.88862967, + "learning_rate": 0.0008904157565225621, + "loss": 0.89915323, + "num_input_tokens_seen": 102681216, + "router_z_loss_mlp": 0.48168945, + "step": 1239, + "time_per_iteration": 2.648766040802002 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052394, + "balance_loss_mlp": 1.00423324, + "epoch": 0.238553289726818, + "flos": 1155855892992.0, + "grad_norm": 0.034399895266541865, + "language_loss": 0.82445645, + "learning_rate": 0.000890221047287235, + "loss": 0.83498037, + "num_input_tokens_seen": 102777184, + "router_z_loss_mlp": 0.48144531, + "step": 1240, + "time_per_iteration": 3.5001280307769775 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055, + "balance_loss_mlp": 1.00703037, + "epoch": 0.23874567141208156, + "flos": 500910802176.0, + "grad_norm": 0.03306053891413694, + "language_loss": 0.91726851, + "learning_rate": 0.0008900261865546615, + "loss": 0.92781848, + "num_input_tokens_seen": 102845744, + "router_z_loss_mlp": 0.47949219, + "step": 1241, + "time_per_iteration": 2.6465680599212646 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052437, + "balance_loss_mlp": 1.00418115, + "epoch": 0.23893805309734514, + "flos": 558050074368.0, + "grad_norm": 0.0354259641755878, + "language_loss": 0.85598528, + "learning_rate": 0.0008898311744004936, + "loss": 0.86650962, + "num_input_tokens_seen": 102918064, + "router_z_loss_mlp": 0.48242188, + "step": 1242, + "time_per_iteration": 2.7268829345703125 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053623, + "balance_loss_mlp": 1.0055337, + "epoch": 0.2391304347826087, + "flos": 550317255168.0, + "grad_norm": 0.0320494810853186, + "language_loss": 0.87574649, + "learning_rate": 0.0008896360109004414, + "loss": 0.88628268, + "num_input_tokens_seen": 102983920, + "router_z_loss_mlp": 0.48071289, + "step": 1243, + "time_per_iteration": 2.6199252605438232 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050337, + "balance_loss_mlp": 1.00222456, + "epoch": 0.23932281646787226, + "flos": 517079250432.0, + "grad_norm": 0.0302458656306059, + "language_loss": 0.85177696, + "learning_rate": 0.0008894406961302742, + "loss": 0.86228031, + "num_input_tokens_seen": 103053328, + "router_z_loss_mlp": 0.48095703, + "step": 1244, + "time_per_iteration": 2.604508876800537 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052152, + "balance_loss_mlp": 1.00411069, + "epoch": 0.23951519815313582, + "flos": 745002548736.0, + "grad_norm": 0.03429303167053761, + "language_loss": 0.84712255, + "learning_rate": 0.0008892452301658201, + "loss": 0.85764414, + "num_input_tokens_seen": 103128208, + "router_z_loss_mlp": 0.48022461, + "step": 1245, + "time_per_iteration": 2.924288272857666 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054436, + "balance_loss_mlp": 1.00651395, + "epoch": 0.23970757983839938, + "flos": 555175663104.0, + "grad_norm": 0.03219666617279603, + "language_loss": 0.84054452, + "learning_rate": 0.0008890496130829653, + "loss": 0.85108888, + "num_input_tokens_seen": 103197392, + "router_z_loss_mlp": 0.47900391, + "step": 1246, + "time_per_iteration": 2.6700189113616943 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052243, + "balance_loss_mlp": 1.00441635, + "epoch": 0.23989996152366294, + "flos": 481618121472.0, + "grad_norm": 0.033578246726411604, + "language_loss": 0.86002076, + "learning_rate": 0.0008888538449576555, + "loss": 0.87054318, + "num_input_tokens_seen": 103265328, + "router_z_loss_mlp": 0.47802734, + "step": 1247, + "time_per_iteration": 2.6269826889038086 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01057022, + "balance_loss_mlp": 1.00886118, + "epoch": 0.2400923432089265, + "flos": 486281143296.0, + "grad_norm": 0.03580496599340432, + "language_loss": 0.83572984, + "learning_rate": 0.0008886579258658944, + "loss": 0.84630001, + "num_input_tokens_seen": 103331632, + "router_z_loss_mlp": 0.48144531, + "step": 1248, + "time_per_iteration": 2.577885389328003 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054529, + "balance_loss_mlp": 1.0065589, + "epoch": 0.24028472489419006, + "flos": 624793401600.0, + "grad_norm": 0.03296142515540601, + "language_loss": 0.85843956, + "learning_rate": 0.0008884618558837446, + "loss": 0.86898482, + "num_input_tokens_seen": 103405408, + "router_z_loss_mlp": 0.47949219, + "step": 1249, + "time_per_iteration": 2.874666929244995 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01056867, + "balance_loss_mlp": 1.00870681, + "epoch": 0.24047710657945365, + "flos": 602809056768.0, + "grad_norm": 0.033943651692576245, + "language_loss": 0.87474859, + "learning_rate": 0.0008882656350873273, + "loss": 0.88531733, + "num_input_tokens_seen": 103487216, + "router_z_loss_mlp": 0.48144531, + "step": 1250, + "time_per_iteration": 2.8647053241729736 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055385, + "balance_loss_mlp": 1.00748658, + "epoch": 0.2406694882647172, + "flos": 843001829376.0, + "grad_norm": 0.04142560607115463, + "language_loss": 0.87984931, + "learning_rate": 0.0008880692635528219, + "loss": 0.89040315, + "num_input_tokens_seen": 103568640, + "router_z_loss_mlp": 0.47875977, + "step": 1251, + "time_per_iteration": 3.0643107891082764 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105149, + "balance_loss_mlp": 1.00352037, + "epoch": 0.24086186994998077, + "flos": 528135578880.0, + "grad_norm": 0.03337559285192523, + "language_loss": 0.90356189, + "learning_rate": 0.0008878727413564669, + "loss": 0.91407681, + "num_input_tokens_seen": 103640784, + "router_z_loss_mlp": 0.47949219, + "step": 1252, + "time_per_iteration": 2.7680115699768066 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053848, + "balance_loss_mlp": 1.00826263, + "epoch": 0.24105425163524433, + "flos": 1341462028800.0, + "grad_norm": 0.009196650126926217, + "language_loss": 0.80135596, + "learning_rate": 0.0008876760685745588, + "loss": 0.81189448, + "num_input_tokens_seen": 103865824, + "router_z_loss_mlp": 0.45507812, + "step": 1253, + "time_per_iteration": 4.858070135116577 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054767, + "balance_loss_mlp": 1.00698781, + "epoch": 0.24124663332050789, + "flos": 615228230400.0, + "grad_norm": 0.036740782431925904, + "language_loss": 0.79496801, + "learning_rate": 0.0008874792452834528, + "loss": 0.80551577, + "num_input_tokens_seen": 103939872, + "router_z_loss_mlp": 0.47753906, + "step": 1254, + "time_per_iteration": 2.756243944168091 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01057417, + "balance_loss_mlp": 1.00954247, + "epoch": 0.24143901500577145, + "flos": 576593291520.0, + "grad_norm": 0.037714132300224086, + "language_loss": 0.87880921, + "learning_rate": 0.0008872822715595626, + "loss": 0.88938332, + "num_input_tokens_seen": 104011120, + "router_z_loss_mlp": 0.47851562, + "step": 1255, + "time_per_iteration": 2.6718733310699463 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01056118, + "balance_loss_mlp": 1.00812411, + "epoch": 0.241631396691035, + "flos": 496147658496.0, + "grad_norm": 0.038695693582970765, + "language_loss": 0.87873089, + "learning_rate": 0.0008870851474793598, + "loss": 0.88929206, + "num_input_tokens_seen": 104077040, + "router_z_loss_mlp": 0.47973633, + "step": 1256, + "time_per_iteration": 2.6313350200653076 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01058027, + "balance_loss_mlp": 1.009866, + "epoch": 0.24182377837629856, + "flos": 637397267712.0, + "grad_norm": 0.03630749648984725, + "language_loss": 0.904266, + "learning_rate": 0.0008868878731193752, + "loss": 0.9148463, + "num_input_tokens_seen": 104150880, + "router_z_loss_mlp": 0.48144531, + "step": 1257, + "time_per_iteration": 2.820671558380127 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052967, + "balance_loss_mlp": 1.00509274, + "epoch": 0.24201616006156215, + "flos": 516350195712.0, + "grad_norm": 0.04098435374075245, + "language_loss": 0.90631104, + "learning_rate": 0.0008866904485561973, + "loss": 0.91684067, + "num_input_tokens_seen": 104223696, + "router_z_loss_mlp": 0.47851562, + "step": 1258, + "time_per_iteration": 2.712970495223999 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053405, + "balance_loss_mlp": 1.0053165, + "epoch": 0.2422085417468257, + "flos": 616379159808.0, + "grad_norm": 0.03199149634406808, + "language_loss": 0.83463258, + "learning_rate": 0.000886492873866473, + "loss": 0.84516662, + "num_input_tokens_seen": 104301728, + "router_z_loss_mlp": 0.48071289, + "step": 1259, + "time_per_iteration": 2.8250985145568848 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051296, + "balance_loss_mlp": 1.00330269, + "epoch": 0.24240092343208927, + "flos": 586913762304.0, + "grad_norm": 0.03973618931504764, + "language_loss": 0.85183978, + "learning_rate": 0.000886295149126908, + "loss": 0.86235273, + "num_input_tokens_seen": 104374480, + "router_z_loss_mlp": 0.47973633, + "step": 1260, + "time_per_iteration": 2.7110049724578857 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051234, + "balance_loss_mlp": 1.00338328, + "epoch": 0.24259330511735283, + "flos": 763572010752.0, + "grad_norm": 0.03275678482299809, + "language_loss": 0.86485362, + "learning_rate": 0.0008860972744142655, + "loss": 0.87536597, + "num_input_tokens_seen": 104452384, + "router_z_loss_mlp": 0.47827148, + "step": 1261, + "time_per_iteration": 2.9053289890289307 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051398, + "balance_loss_mlp": 1.00361907, + "epoch": 0.2427856868026164, + "flos": 628134407424.0, + "grad_norm": 0.03196094686024711, + "language_loss": 0.82455611, + "learning_rate": 0.0008858992498053671, + "loss": 0.83507007, + "num_input_tokens_seen": 104532576, + "router_z_loss_mlp": 0.47753906, + "step": 1262, + "time_per_iteration": 2.8111376762390137 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054321, + "balance_loss_mlp": 1.00797272, + "epoch": 0.24297806848787995, + "flos": 1514922167808.0, + "grad_norm": 0.010120346862694057, + "language_loss": 0.7658875, + "learning_rate": 0.0008857010753770934, + "loss": 0.77643073, + "num_input_tokens_seen": 104765216, + "router_z_loss_mlp": 0.46289062, + "step": 1263, + "time_per_iteration": 4.84857177734375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052202, + "balance_loss_mlp": 1.00420785, + "epoch": 0.2431704501731435, + "flos": 543073384704.0, + "grad_norm": 0.030775668427347653, + "language_loss": 0.83837479, + "learning_rate": 0.0008855027512063817, + "loss": 0.84889686, + "num_input_tokens_seen": 104836912, + "router_z_loss_mlp": 0.47973633, + "step": 1264, + "time_per_iteration": 2.69954252243042 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055608, + "balance_loss_mlp": 1.0077095, + "epoch": 0.24336283185840707, + "flos": 524879143680.0, + "grad_norm": 0.03906981412635217, + "language_loss": 0.86655742, + "learning_rate": 0.0008853042773702292, + "loss": 0.87711346, + "num_input_tokens_seen": 104909280, + "router_z_loss_mlp": 0.47875977, + "step": 1265, + "time_per_iteration": 2.703227996826172 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053016, + "balance_loss_mlp": 1.00530863, + "epoch": 0.24355521354367063, + "flos": 538206228480.0, + "grad_norm": 0.030917867079500824, + "language_loss": 0.88497615, + "learning_rate": 0.0008851056539456896, + "loss": 0.89550632, + "num_input_tokens_seen": 104982560, + "router_z_loss_mlp": 0.47680664, + "step": 1266, + "time_per_iteration": 2.6844840049743652 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054186, + "balance_loss_mlp": 1.00655031, + "epoch": 0.24374759522893422, + "flos": 932109580032.0, + "grad_norm": 0.032880300158599975, + "language_loss": 0.82697207, + "learning_rate": 0.0008849068810098755, + "loss": 0.83751392, + "num_input_tokens_seen": 105075056, + "router_z_loss_mlp": 0.47607422, + "step": 1267, + "time_per_iteration": 3.274641513824463 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055314, + "balance_loss_mlp": 1.00789249, + "epoch": 0.24393997691419778, + "flos": 428685970176.0, + "grad_norm": 0.04273651221625489, + "language_loss": 0.84108871, + "learning_rate": 0.0008847079586399575, + "loss": 0.85164183, + "num_input_tokens_seen": 105137536, + "router_z_loss_mlp": 0.47387695, + "step": 1268, + "time_per_iteration": 2.475217819213867 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01057264, + "balance_loss_mlp": 1.00993788, + "epoch": 0.24413235859946134, + "flos": 579943045632.0, + "grad_norm": 0.03463136192779687, + "language_loss": 0.86878628, + "learning_rate": 0.0008845088869131641, + "loss": 0.87935889, + "num_input_tokens_seen": 105204848, + "router_z_loss_mlp": 0.47290039, + "step": 1269, + "time_per_iteration": 2.676954746246338 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054221, + "balance_loss_mlp": 1.00689447, + "epoch": 0.2443247402847249, + "flos": 530901120000.0, + "grad_norm": 0.04739098518835349, + "language_loss": 0.8972156, + "learning_rate": 0.0008843096659067818, + "loss": 0.90775776, + "num_input_tokens_seen": 105273456, + "router_z_loss_mlp": 0.47290039, + "step": 1270, + "time_per_iteration": 2.6031625270843506 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01056384, + "balance_loss_mlp": 1.00896251, + "epoch": 0.24451712196998845, + "flos": 697625779200.0, + "grad_norm": 0.03005687387855686, + "language_loss": 0.8676796, + "learning_rate": 0.000884110295698155, + "loss": 0.87824345, + "num_input_tokens_seen": 105355488, + "router_z_loss_mlp": 0.47387695, + "step": 1271, + "time_per_iteration": 2.946385145187378 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052146, + "balance_loss_mlp": 1.00460577, + "epoch": 0.24470950365525201, + "flos": 530864181504.0, + "grad_norm": 0.03542850047119753, + "language_loss": 0.86657912, + "learning_rate": 0.0008839107763646861, + "loss": 0.87710059, + "num_input_tokens_seen": 105421568, + "router_z_loss_mlp": 0.47509766, + "step": 1272, + "time_per_iteration": 2.6175343990325928 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01057047, + "balance_loss_mlp": 1.00955379, + "epoch": 0.24490188534051557, + "flos": 492348806400.0, + "grad_norm": 0.04294337139782129, + "language_loss": 0.9099223, + "learning_rate": 0.0008837111079838353, + "loss": 0.92049271, + "num_input_tokens_seen": 105493072, + "router_z_loss_mlp": 0.47460938, + "step": 1273, + "time_per_iteration": 2.699777126312256 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051504, + "balance_loss_mlp": 1.00393975, + "epoch": 0.24509426702577913, + "flos": 475112054016.0, + "grad_norm": 0.03233839715385124, + "language_loss": 0.90686411, + "learning_rate": 0.000883511290633121, + "loss": 0.91737914, + "num_input_tokens_seen": 105559840, + "router_z_loss_mlp": 0.4753418, + "step": 1274, + "time_per_iteration": 2.5347506999969482 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053736, + "balance_loss_mlp": 1.0061239, + "epoch": 0.24528664871104272, + "flos": 551648019456.0, + "grad_norm": 0.029596958484994024, + "language_loss": 0.9283247, + "learning_rate": 0.000883311324390119, + "loss": 0.93886209, + "num_input_tokens_seen": 105634448, + "router_z_loss_mlp": 0.47583008, + "step": 1275, + "time_per_iteration": 2.7105162143707275 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105458, + "balance_loss_mlp": 1.00703931, + "epoch": 0.24547903039630628, + "flos": 827336914176.0, + "grad_norm": 0.04026092464880397, + "language_loss": 0.8227402, + "learning_rate": 0.0008831112093324629, + "loss": 0.83328599, + "num_input_tokens_seen": 105711936, + "router_z_loss_mlp": 0.47509766, + "step": 1276, + "time_per_iteration": 3.0518436431884766 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052804, + "balance_loss_mlp": 1.00523984, + "epoch": 0.24567141208156984, + "flos": 592694665728.0, + "grad_norm": 0.0350541873914122, + "language_loss": 0.89993191, + "learning_rate": 0.0008829109455378444, + "loss": 0.91045994, + "num_input_tokens_seen": 105780240, + "router_z_loss_mlp": 0.4753418, + "step": 1277, + "time_per_iteration": 2.705888032913208 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053461, + "balance_loss_mlp": 1.00606322, + "epoch": 0.2458637937668334, + "flos": 548930110464.0, + "grad_norm": 0.03225743101348484, + "language_loss": 0.87107539, + "learning_rate": 0.000882710533084013, + "loss": 0.88161004, + "num_input_tokens_seen": 105849840, + "router_z_loss_mlp": 0.47363281, + "step": 1278, + "time_per_iteration": 2.6600000858306885 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051675, + "balance_loss_mlp": 1.00418186, + "epoch": 0.24605617545209696, + "flos": 516912054528.0, + "grad_norm": 0.031446449457072034, + "language_loss": 0.89965951, + "learning_rate": 0.0008825099720487755, + "loss": 0.91017628, + "num_input_tokens_seen": 105921488, + "router_z_loss_mlp": 0.47460938, + "step": 1279, + "time_per_iteration": 2.6381545066833496 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059078, + "balance_loss_mlp": 1.01320648, + "epoch": 0.24624855713736052, + "flos": 1515061173504.0, + "grad_norm": 0.006597619453236458, + "language_loss": 0.7526114, + "learning_rate": 0.0008823092625099967, + "loss": 0.76320213, + "num_input_tokens_seen": 106146816, + "router_z_loss_mlp": 0.45800781, + "step": 1280, + "time_per_iteration": 4.836413621902466 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01056755, + "balance_loss_mlp": 1.0109787, + "epoch": 0.24644093882262408, + "flos": 1530749421312.0, + "grad_norm": 0.006438131933853504, + "language_loss": 0.77944112, + "learning_rate": 0.0008821084045455987, + "loss": 0.79000866, + "num_input_tokens_seen": 106361568, + "router_z_loss_mlp": 0.45703125, + "step": 1281, + "time_per_iteration": 4.763012409210205 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055673, + "balance_loss_mlp": 1.00817966, + "epoch": 0.24663332050788764, + "flos": 660349794816.0, + "grad_norm": 0.03366863359794558, + "language_loss": 0.89743239, + "learning_rate": 0.0008819073982335619, + "loss": 0.90798908, + "num_input_tokens_seen": 106435296, + "router_z_loss_mlp": 0.47460938, + "step": 1282, + "time_per_iteration": 2.830066204071045 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051734, + "balance_loss_mlp": 1.00426519, + "epoch": 0.24682570219315123, + "flos": 542806066944.0, + "grad_norm": 0.034270358372240205, + "language_loss": 0.85323066, + "learning_rate": 0.0008817062436519235, + "loss": 0.86374807, + "num_input_tokens_seen": 106507184, + "router_z_loss_mlp": 0.47436523, + "step": 1283, + "time_per_iteration": 2.6451101303100586 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054467, + "balance_loss_mlp": 1.00680709, + "epoch": 0.24701808387841478, + "flos": 441659221248.0, + "grad_norm": 0.03422998600893363, + "language_loss": 0.90367711, + "learning_rate": 0.0008815049408787788, + "loss": 0.91422176, + "num_input_tokens_seen": 106571472, + "router_z_loss_mlp": 0.47631836, + "step": 1284, + "time_per_iteration": 2.5568699836730957 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054482, + "balance_loss_mlp": 1.00672722, + "epoch": 0.24721046556367834, + "flos": 469033697280.0, + "grad_norm": 0.036620952447016124, + "language_loss": 0.86045629, + "learning_rate": 0.0008813034899922805, + "loss": 0.87100112, + "num_input_tokens_seen": 106638368, + "router_z_loss_mlp": 0.47729492, + "step": 1285, + "time_per_iteration": 2.5571885108947754 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052621, + "balance_loss_mlp": 1.00498545, + "epoch": 0.2474028472489419, + "flos": 505408573440.0, + "grad_norm": 0.03938899634346209, + "language_loss": 0.90811062, + "learning_rate": 0.0008811018910706387, + "loss": 0.91863692, + "num_input_tokens_seen": 106705312, + "router_z_loss_mlp": 0.47607422, + "step": 1286, + "time_per_iteration": 2.5542702674865723 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105262, + "balance_loss_mlp": 1.00496054, + "epoch": 0.24759522893420546, + "flos": 480956140800.0, + "grad_norm": 0.04329385189604929, + "language_loss": 0.82886434, + "learning_rate": 0.0008809001441921211, + "loss": 0.83939052, + "num_input_tokens_seen": 106778624, + "router_z_loss_mlp": 0.47631836, + "step": 1287, + "time_per_iteration": 2.7426302433013916 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01056359, + "balance_loss_mlp": 1.00879443, + "epoch": 0.24778761061946902, + "flos": 534754407168.0, + "grad_norm": 0.03495005483538565, + "language_loss": 0.86372733, + "learning_rate": 0.0008806982494350528, + "loss": 0.87429094, + "num_input_tokens_seen": 106847744, + "router_z_loss_mlp": 0.4753418, + "step": 1288, + "time_per_iteration": 2.6200613975524902 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054037, + "balance_loss_mlp": 1.0063771, + "epoch": 0.24797999230473258, + "flos": 560943927552.0, + "grad_norm": 0.028534619779485338, + "language_loss": 0.90820038, + "learning_rate": 0.0008804962068778161, + "loss": 0.91874075, + "num_input_tokens_seen": 106927584, + "router_z_loss_mlp": 0.47631836, + "step": 1289, + "time_per_iteration": 2.8445866107940674 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050603, + "balance_loss_mlp": 1.00287127, + "epoch": 0.24817237398999614, + "flos": 625481627136.0, + "grad_norm": 0.033144052318390974, + "language_loss": 0.81476247, + "learning_rate": 0.0008802940165988511, + "loss": 0.82526851, + "num_input_tokens_seen": 107006656, + "router_z_loss_mlp": 0.47705078, + "step": 1290, + "time_per_iteration": 2.874469518661499 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052663, + "balance_loss_mlp": 1.00500298, + "epoch": 0.2483647556752597, + "flos": 613485306624.0, + "grad_norm": 0.033485904546120666, + "language_loss": 0.88976955, + "learning_rate": 0.000880091678676655, + "loss": 0.90029621, + "num_input_tokens_seen": 107084352, + "router_z_loss_mlp": 0.47631836, + "step": 1291, + "time_per_iteration": 2.8294923305511475 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049235, + "balance_loss_mlp": 1.00159943, + "epoch": 0.2485571373605233, + "flos": 584688692736.0, + "grad_norm": 0.030875088012072577, + "language_loss": 0.89826584, + "learning_rate": 0.0008798891931897821, + "loss": 0.90875816, + "num_input_tokens_seen": 107158368, + "router_z_loss_mlp": 0.47607422, + "step": 1292, + "time_per_iteration": 2.7068471908569336 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050902, + "balance_loss_mlp": 1.00359952, + "epoch": 0.24874951904578685, + "flos": 495737444352.0, + "grad_norm": 0.03670876005724945, + "language_loss": 0.84959131, + "learning_rate": 0.0008796865602168447, + "loss": 0.86010033, + "num_input_tokens_seen": 107224256, + "router_z_loss_mlp": 0.47265625, + "step": 1293, + "time_per_iteration": 2.550218343734741 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052588, + "balance_loss_mlp": 1.00526226, + "epoch": 0.2489419007310504, + "flos": 457174437120.0, + "grad_norm": 0.03243940706171699, + "language_loss": 0.89144397, + "learning_rate": 0.0008794837798365115, + "loss": 0.90196991, + "num_input_tokens_seen": 107292720, + "router_z_loss_mlp": 0.47290039, + "step": 1294, + "time_per_iteration": 2.6271979808807373 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051557, + "balance_loss_mlp": 1.00420678, + "epoch": 0.24913428241631397, + "flos": 486565957632.0, + "grad_norm": 0.03268946967982851, + "language_loss": 0.89255542, + "learning_rate": 0.0008792808521275089, + "loss": 0.90307105, + "num_input_tokens_seen": 107368576, + "router_z_loss_mlp": 0.47314453, + "step": 1295, + "time_per_iteration": 2.733107566833496 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052983, + "balance_loss_mlp": 1.00544262, + "epoch": 0.24932666410157753, + "flos": 519918668544.0, + "grad_norm": 0.031266052737173484, + "language_loss": 0.88015056, + "learning_rate": 0.0008790777771686206, + "loss": 0.89068043, + "num_input_tokens_seen": 107433856, + "router_z_loss_mlp": 0.47509766, + "step": 1296, + "time_per_iteration": 2.5860161781311035 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053628, + "balance_loss_mlp": 1.0059917, + "epoch": 0.2495190457868411, + "flos": 473557713408.0, + "grad_norm": 0.03428757295266267, + "language_loss": 0.86048388, + "learning_rate": 0.0008788745550386872, + "loss": 0.8710202, + "num_input_tokens_seen": 107500944, + "router_z_loss_mlp": 0.47607422, + "step": 1297, + "time_per_iteration": 2.599851608276367 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055496, + "balance_loss_mlp": 1.00776434, + "epoch": 0.24971142747210465, + "flos": 747199428096.0, + "grad_norm": 0.03345883603952397, + "language_loss": 0.80858141, + "learning_rate": 0.0008786711858166063, + "loss": 0.81913638, + "num_input_tokens_seen": 107580000, + "router_z_loss_mlp": 0.47705078, + "step": 1298, + "time_per_iteration": 2.9357736110687256 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055455, + "balance_loss_mlp": 1.00770009, + "epoch": 0.2499038091573682, + "flos": 750903015936.0, + "grad_norm": 0.03503874681650984, + "language_loss": 0.84951854, + "learning_rate": 0.0008784676695813332, + "loss": 0.86007309, + "num_input_tokens_seen": 107660384, + "router_z_loss_mlp": 0.47729492, + "step": 1299, + "time_per_iteration": 2.955172538757324 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055374, + "balance_loss_mlp": 1.00776184, + "epoch": 0.2500961908426318, + "flos": 746344006656.0, + "grad_norm": 0.032686560936085865, + "language_loss": 0.85840905, + "learning_rate": 0.0008782640064118796, + "loss": 0.86896276, + "num_input_tokens_seen": 107736320, + "router_z_loss_mlp": 0.47583008, + "step": 1300, + "time_per_iteration": 2.897998571395874 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055206, + "balance_loss_mlp": 1.00904846, + "epoch": 0.2502885725278953, + "flos": 1420526353152.0, + "grad_norm": 0.0075534145797937526, + "language_loss": 0.7618475, + "learning_rate": 0.0008780601963873149, + "loss": 0.77239954, + "num_input_tokens_seen": 107972608, + "router_z_loss_mlp": 0.4609375, + "step": 1301, + "time_per_iteration": 5.023081541061401 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105668, + "balance_loss_mlp": 1.00904393, + "epoch": 0.2504809542131589, + "flos": 516232577280.0, + "grad_norm": 0.03748206036604932, + "language_loss": 0.87484509, + "learning_rate": 0.0008778562395867648, + "loss": 0.88541192, + "num_input_tokens_seen": 108043312, + "router_z_loss_mlp": 0.47607422, + "step": 1302, + "time_per_iteration": 2.593972682952881 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105251, + "balance_loss_mlp": 1.00477886, + "epoch": 0.25067333589842244, + "flos": 526852446720.0, + "grad_norm": 0.031223058919554587, + "language_loss": 0.84117836, + "learning_rate": 0.0008776521360894127, + "loss": 0.85170352, + "num_input_tokens_seen": 108114144, + "router_z_loss_mlp": 0.47705078, + "step": 1303, + "time_per_iteration": 2.6153149604797363 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069588, + "balance_loss_mlp": 1.02342987, + "epoch": 0.25086571758368603, + "flos": 1477160146944.0, + "grad_norm": 0.014969332736355754, + "language_loss": 0.78962064, + "learning_rate": 0.0008774478859744984, + "loss": 0.80031657, + "num_input_tokens_seen": 108338720, + "router_z_loss_mlp": 0.4609375, + "step": 1304, + "time_per_iteration": 4.792739629745483 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053769, + "balance_loss_mlp": 1.00649047, + "epoch": 0.2510580992689496, + "flos": 529403159808.0, + "grad_norm": 0.03453306909815573, + "language_loss": 0.91369265, + "learning_rate": 0.0008772434893213186, + "loss": 0.92423034, + "num_input_tokens_seen": 108405456, + "router_z_loss_mlp": 0.47241211, + "step": 1305, + "time_per_iteration": 2.581268072128296 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01056214, + "balance_loss_mlp": 1.00919807, + "epoch": 0.25125048095421315, + "flos": 518466395136.0, + "grad_norm": 0.035319884850533015, + "language_loss": 0.84733635, + "learning_rate": 0.0008770389462092276, + "loss": 0.85789847, + "num_input_tokens_seen": 108474368, + "router_z_loss_mlp": 0.46972656, + "step": 1306, + "time_per_iteration": 2.627317428588867 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01056702, + "balance_loss_mlp": 1.00951862, + "epoch": 0.25144286263947674, + "flos": 621675972096.0, + "grad_norm": 0.03558379494917989, + "language_loss": 0.87486076, + "learning_rate": 0.0008768342567176357, + "loss": 0.88542777, + "num_input_tokens_seen": 108548864, + "router_z_loss_mlp": 0.47143555, + "step": 1307, + "time_per_iteration": 2.787318706512451 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052534, + "balance_loss_mlp": 1.00537527, + "epoch": 0.25163524432474027, + "flos": 504866156544.0, + "grad_norm": 0.03616031366836922, + "language_loss": 0.9109531, + "learning_rate": 0.0008766294209260107, + "loss": 0.92147839, + "num_input_tokens_seen": 108623072, + "router_z_loss_mlp": 0.47119141, + "step": 1308, + "time_per_iteration": 2.6384546756744385 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105248, + "balance_loss_mlp": 1.00510657, + "epoch": 0.25182762601000386, + "flos": 510080343552.0, + "grad_norm": 0.03702737725286332, + "language_loss": 0.92033225, + "learning_rate": 0.0008764244389138767, + "loss": 0.93085706, + "num_input_tokens_seen": 108690128, + "router_z_loss_mlp": 0.47338867, + "step": 1309, + "time_per_iteration": 2.5620551109313965 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053435, + "balance_loss_mlp": 1.006037, + "epoch": 0.2520200076952674, + "flos": 635098321152.0, + "grad_norm": 0.03928250470986306, + "language_loss": 0.83104628, + "learning_rate": 0.000876219310760815, + "loss": 0.84158063, + "num_input_tokens_seen": 108770272, + "router_z_loss_mlp": 0.47363281, + "step": 1310, + "time_per_iteration": 2.886335849761963 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053423, + "balance_loss_mlp": 1.00614405, + "epoch": 0.252212389380531, + "flos": 495652873728.0, + "grad_norm": 0.03544669215118347, + "language_loss": 0.82256365, + "learning_rate": 0.0008760140365464631, + "loss": 0.83309782, + "num_input_tokens_seen": 108840592, + "router_z_loss_mlp": 0.47241211, + "step": 1311, + "time_per_iteration": 2.607191801071167 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053107, + "balance_loss_mlp": 1.00592351, + "epoch": 0.2524047710657945, + "flos": 491530323456.0, + "grad_norm": 0.037974131054051216, + "language_loss": 0.87817502, + "learning_rate": 0.0008758086163505156, + "loss": 0.88870609, + "num_input_tokens_seen": 108910064, + "router_z_loss_mlp": 0.47143555, + "step": 1312, + "time_per_iteration": 2.6121339797973633 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052408, + "balance_loss_mlp": 1.00505757, + "epoch": 0.2525971527510581, + "flos": 648613989120.0, + "grad_norm": 0.03226827566126977, + "language_loss": 0.90228277, + "learning_rate": 0.0008756030502527239, + "loss": 0.91280687, + "num_input_tokens_seen": 108986336, + "router_z_loss_mlp": 0.47314453, + "step": 1313, + "time_per_iteration": 2.8256115913391113 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049721, + "balance_loss_mlp": 1.00234711, + "epoch": 0.2527895344363217, + "flos": 570373983744.0, + "grad_norm": 0.0325160066751772, + "language_loss": 0.907884, + "learning_rate": 0.0008753973383328954, + "loss": 0.91838121, + "num_input_tokens_seen": 109059712, + "router_z_loss_mlp": 0.47338867, + "step": 1314, + "time_per_iteration": 2.722231388092041 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051583, + "balance_loss_mlp": 1.00423265, + "epoch": 0.2529819161215852, + "flos": 515069008896.0, + "grad_norm": 0.040482030139478604, + "language_loss": 0.8500945, + "learning_rate": 0.0008751914806708952, + "loss": 0.86061025, + "num_input_tokens_seen": 109127504, + "router_z_loss_mlp": 0.47314453, + "step": 1315, + "time_per_iteration": 2.593076229095459 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051184, + "balance_loss_mlp": 1.00376213, + "epoch": 0.2531742978068488, + "flos": 532351448064.0, + "grad_norm": 0.03414491036051862, + "language_loss": 0.82694548, + "learning_rate": 0.0008749854773466439, + "loss": 0.8374573, + "num_input_tokens_seen": 109198080, + "router_z_loss_mlp": 0.47387695, + "step": 1316, + "time_per_iteration": 2.660116672515869 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054789, + "balance_loss_mlp": 1.00722456, + "epoch": 0.25336667949211233, + "flos": 597748459776.0, + "grad_norm": 0.03206754273868493, + "language_loss": 0.84984171, + "learning_rate": 0.0008747793284401192, + "loss": 0.86038959, + "num_input_tokens_seen": 109268368, + "router_z_loss_mlp": 0.4753418, + "step": 1317, + "time_per_iteration": 2.692183017730713 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105185, + "balance_loss_mlp": 1.00407052, + "epoch": 0.2535590611773759, + "flos": 603256209408.0, + "grad_norm": 0.034288977750124294, + "language_loss": 0.85941386, + "learning_rate": 0.0008745730340313551, + "loss": 0.86993235, + "num_input_tokens_seen": 109344112, + "router_z_loss_mlp": 0.47753906, + "step": 1318, + "time_per_iteration": 2.7932682037353516 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105299, + "balance_loss_mlp": 1.00525868, + "epoch": 0.25375144286263945, + "flos": 496323602688.0, + "grad_norm": 0.035249055653748196, + "language_loss": 0.8522734, + "learning_rate": 0.0008743665942004422, + "loss": 0.86280334, + "num_input_tokens_seen": 109414112, + "router_z_loss_mlp": 0.47705078, + "step": 1319, + "time_per_iteration": 2.6616318225860596 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052413, + "balance_loss_mlp": 1.00465751, + "epoch": 0.25394382454790304, + "flos": 513477729792.0, + "grad_norm": 0.032623992793633046, + "language_loss": 0.93257391, + "learning_rate": 0.0008741600090275277, + "loss": 0.94309807, + "num_input_tokens_seen": 109484336, + "router_z_loss_mlp": 0.47729492, + "step": 1320, + "time_per_iteration": 2.567985773086548 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051086, + "balance_loss_mlp": 1.00333035, + "epoch": 0.25413620623316663, + "flos": 960856616448.0, + "grad_norm": 0.03465281335593922, + "language_loss": 0.8488484, + "learning_rate": 0.0008739532785928151, + "loss": 0.85935926, + "num_input_tokens_seen": 109590128, + "router_z_loss_mlp": 0.47729492, + "step": 1321, + "time_per_iteration": 3.4506430625915527 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054222, + "balance_loss_mlp": 1.00882721, + "epoch": 0.25432858791843016, + "flos": 1580651625984.0, + "grad_norm": 0.01348888133328934, + "language_loss": 0.74893582, + "learning_rate": 0.0008737464029765639, + "loss": 0.75947809, + "num_input_tokens_seen": 109816592, + "router_z_loss_mlp": 0.453125, + "step": 1322, + "time_per_iteration": 4.819811820983887 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055371, + "balance_loss_mlp": 1.00752044, + "epoch": 0.25452096960369375, + "flos": 584894772480.0, + "grad_norm": 0.03690210205672512, + "language_loss": 0.83839363, + "learning_rate": 0.0008735393822590908, + "loss": 0.84894735, + "num_input_tokens_seen": 109890464, + "router_z_loss_mlp": 0.47827148, + "step": 1323, + "time_per_iteration": 2.680769681930542 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069306, + "balance_loss_mlp": 1.02138364, + "epoch": 0.2547133512889573, + "flos": 509641939200.0, + "grad_norm": 0.03795743442729459, + "language_loss": 0.87760162, + "learning_rate": 0.0008733322165207681, + "loss": 0.8882947, + "num_input_tokens_seen": 109963408, + "router_z_loss_mlp": 0.47900391, + "step": 1324, + "time_per_iteration": 2.6391303539276123 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01056249, + "balance_loss_mlp": 1.00856507, + "epoch": 0.25490573297422087, + "flos": 784037008128.0, + "grad_norm": 0.03625483542623235, + "language_loss": 0.83670151, + "learning_rate": 0.0008731249058420247, + "loss": 0.84726399, + "num_input_tokens_seen": 110048800, + "router_z_loss_mlp": 0.4765625, + "step": 1325, + "time_per_iteration": 3.0179827213287354 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062074, + "balance_loss_mlp": 1.01479542, + "epoch": 0.2550981146594844, + "flos": 510953261568.0, + "grad_norm": 0.03728184694741104, + "language_loss": 0.91373062, + "learning_rate": 0.0008729174503033459, + "loss": 0.92435133, + "num_input_tokens_seen": 110118096, + "router_z_loss_mlp": 0.47241211, + "step": 1326, + "time_per_iteration": 2.644351005554199 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059853, + "balance_loss_mlp": 1.01262248, + "epoch": 0.255290496344748, + "flos": 677931632640.0, + "grad_norm": 0.04262364220636159, + "language_loss": 0.83700824, + "learning_rate": 0.0008727098499852728, + "loss": 0.84760678, + "num_input_tokens_seen": 110190160, + "router_z_loss_mlp": 0.47192383, + "step": 1327, + "time_per_iteration": 2.8393821716308594 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059289, + "balance_loss_mlp": 1.01212943, + "epoch": 0.2554828780300115, + "flos": 538985827584.0, + "grad_norm": 0.0346626903619469, + "language_loss": 0.90499496, + "learning_rate": 0.0008725021049684034, + "loss": 0.91558784, + "num_input_tokens_seen": 110268000, + "router_z_loss_mlp": 0.47119141, + "step": 1328, + "time_per_iteration": 2.74480938911438 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052564, + "balance_loss_mlp": 1.00554764, + "epoch": 0.2556752597152751, + "flos": 825624125952.0, + "grad_norm": 0.0321884383853499, + "language_loss": 0.83690739, + "learning_rate": 0.000872294215333391, + "loss": 0.84743297, + "num_input_tokens_seen": 110354816, + "router_z_loss_mlp": 0.46972656, + "step": 1329, + "time_per_iteration": 3.177448034286499 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066156, + "balance_loss_mlp": 1.01880646, + "epoch": 0.2558676414005387, + "flos": 571891385856.0, + "grad_norm": 0.037080167806849716, + "language_loss": 0.84060931, + "learning_rate": 0.0008720861811609457, + "loss": 0.85127091, + "num_input_tokens_seen": 110427968, + "router_z_loss_mlp": 0.47314453, + "step": 1330, + "time_per_iteration": 2.7320711612701416 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054633, + "balance_loss_mlp": 1.00745046, + "epoch": 0.2560600230858022, + "flos": 487748967936.0, + "grad_norm": 0.03498979971426328, + "language_loss": 0.84052318, + "learning_rate": 0.0008718780025318338, + "loss": 0.85106957, + "num_input_tokens_seen": 110501184, + "router_z_loss_mlp": 0.47143555, + "step": 1331, + "time_per_iteration": 2.7297112941741943 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053924, + "balance_loss_mlp": 1.00705111, + "epoch": 0.2562524047710658, + "flos": 514120268544.0, + "grad_norm": 0.03699782349212247, + "language_loss": 0.84697664, + "learning_rate": 0.0008716696795268771, + "loss": 0.85751587, + "num_input_tokens_seen": 110573008, + "router_z_loss_mlp": 0.46826172, + "step": 1332, + "time_per_iteration": 2.6615397930145264 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054318, + "balance_loss_mlp": 1.00756466, + "epoch": 0.25644478645632934, + "flos": 636110244864.0, + "grad_norm": 0.03600089626817585, + "language_loss": 0.85914254, + "learning_rate": 0.0008714612122269538, + "loss": 0.86968577, + "num_input_tokens_seen": 110646704, + "router_z_loss_mlp": 0.46704102, + "step": 1333, + "time_per_iteration": 2.849813938140869 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01056443, + "balance_loss_mlp": 1.00968957, + "epoch": 0.25663716814159293, + "flos": 437545419264.0, + "grad_norm": 0.03932780780666976, + "language_loss": 0.90516675, + "learning_rate": 0.0008712526007129982, + "loss": 0.91573119, + "num_input_tokens_seen": 110712208, + "router_z_loss_mlp": 0.46704102, + "step": 1334, + "time_per_iteration": 2.520730972290039 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053556, + "balance_loss_mlp": 1.00675464, + "epoch": 0.25682954982685646, + "flos": 499243700736.0, + "grad_norm": 0.03395243638019146, + "language_loss": 0.9133085, + "learning_rate": 0.0008710438450660003, + "loss": 0.9238441, + "num_input_tokens_seen": 110783936, + "router_z_loss_mlp": 0.4675293, + "step": 1335, + "time_per_iteration": 2.6936721801757812 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053065, + "balance_loss_mlp": 1.00590599, + "epoch": 0.25702193151212005, + "flos": 458628655872.0, + "grad_norm": 0.038911849114865095, + "language_loss": 0.8791827, + "learning_rate": 0.0008708349453670064, + "loss": 0.88971329, + "num_input_tokens_seen": 110848560, + "router_z_loss_mlp": 0.47119141, + "step": 1336, + "time_per_iteration": 2.520390510559082 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074594, + "balance_loss_mlp": 1.02733934, + "epoch": 0.2572143131973836, + "flos": 599404867584.0, + "grad_norm": 0.03723585257139378, + "language_loss": 0.92015922, + "learning_rate": 0.0008706259016971185, + "loss": 0.93090516, + "num_input_tokens_seen": 110922672, + "router_z_loss_mlp": 0.47216797, + "step": 1337, + "time_per_iteration": 2.792436361312866 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055792, + "balance_loss_mlp": 1.00872791, + "epoch": 0.25740669488264717, + "flos": 699527150592.0, + "grad_norm": 0.04259016947882448, + "language_loss": 0.8355068, + "learning_rate": 0.0008704167141374944, + "loss": 0.84606469, + "num_input_tokens_seen": 110995456, + "router_z_loss_mlp": 0.47021484, + "step": 1338, + "time_per_iteration": 2.806931972503662 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01056758, + "balance_loss_mlp": 1.01014686, + "epoch": 0.25759907656791076, + "flos": 503378889984.0, + "grad_norm": 0.03686560218677495, + "language_loss": 0.88890558, + "learning_rate": 0.0008702073827693482, + "loss": 0.89947319, + "num_input_tokens_seen": 111069568, + "router_z_loss_mlp": 0.46557617, + "step": 1339, + "time_per_iteration": 2.7613115310668945 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01057595, + "balance_loss_mlp": 1.01112759, + "epoch": 0.2577914582531743, + "flos": 775242687744.0, + "grad_norm": 0.03484469931885578, + "language_loss": 0.89865053, + "learning_rate": 0.0008699979076739494, + "loss": 0.90922654, + "num_input_tokens_seen": 111142608, + "router_z_loss_mlp": 0.46411133, + "step": 1340, + "time_per_iteration": 2.9694418907165527 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052163, + "balance_loss_mlp": 1.00552797, + "epoch": 0.2579838399384379, + "flos": 460610707200.0, + "grad_norm": 0.04216529081594553, + "language_loss": 0.89380765, + "learning_rate": 0.0008697882889326234, + "loss": 0.9043293, + "num_input_tokens_seen": 111206336, + "router_z_loss_mlp": 0.46582031, + "step": 1341, + "time_per_iteration": 2.5050456523895264 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051599, + "balance_loss_mlp": 1.00482166, + "epoch": 0.2581762216237014, + "flos": 570263168256.0, + "grad_norm": 0.03742337984590145, + "language_loss": 0.87203884, + "learning_rate": 0.0008695785266267515, + "loss": 0.88255489, + "num_input_tokens_seen": 111276736, + "router_z_loss_mlp": 0.46728516, + "step": 1342, + "time_per_iteration": 2.677072763442993 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01057516, + "balance_loss_mlp": 1.01069069, + "epoch": 0.258368603308965, + "flos": 605387960064.0, + "grad_norm": 0.035138016776099276, + "language_loss": 0.83827055, + "learning_rate": 0.0008693686208377704, + "loss": 0.84884572, + "num_input_tokens_seen": 111353856, + "router_z_loss_mlp": 0.46777344, + "step": 1343, + "time_per_iteration": 2.826026439666748 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054134, + "balance_loss_mlp": 1.0075947, + "epoch": 0.2585609849942285, + "flos": 492487812096.0, + "grad_norm": 0.03194520317053949, + "language_loss": 0.89379156, + "learning_rate": 0.0008691585716471733, + "loss": 0.90433288, + "num_input_tokens_seen": 111424960, + "router_z_loss_mlp": 0.46484375, + "step": 1344, + "time_per_iteration": 2.6379647254943848 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053033, + "balance_loss_mlp": 1.00646937, + "epoch": 0.2587533666794921, + "flos": 641958222336.0, + "grad_norm": 0.03185107281306307, + "language_loss": 0.86602217, + "learning_rate": 0.0008689483791365079, + "loss": 0.87655246, + "num_input_tokens_seen": 111505248, + "router_z_loss_mlp": 0.46508789, + "step": 1345, + "time_per_iteration": 2.8372344970703125 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105515, + "balance_loss_mlp": 1.00868249, + "epoch": 0.2589457483647557, + "flos": 577995987456.0, + "grad_norm": 0.038033594557881883, + "language_loss": 0.90178049, + "learning_rate": 0.0008687380433873786, + "loss": 0.91233194, + "num_input_tokens_seen": 111581936, + "router_z_loss_mlp": 0.46411133, + "step": 1346, + "time_per_iteration": 2.7660248279571533 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105286, + "balance_loss_mlp": 1.00636888, + "epoch": 0.25913813005001923, + "flos": 536467195392.0, + "grad_norm": 0.03823400300780179, + "language_loss": 0.83192778, + "learning_rate": 0.0008685275644814448, + "loss": 0.8424564, + "num_input_tokens_seen": 111651456, + "router_z_loss_mlp": 0.46435547, + "step": 1347, + "time_per_iteration": 2.6657776832580566 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01058039, + "balance_loss_mlp": 1.01118934, + "epoch": 0.2593305117352828, + "flos": 722347474944.0, + "grad_norm": 0.04308500968206218, + "language_loss": 0.85215819, + "learning_rate": 0.0008683169425004216, + "loss": 0.86273861, + "num_input_tokens_seen": 111731712, + "router_z_loss_mlp": 0.46801758, + "step": 1348, + "time_per_iteration": 2.8938682079315186 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067463, + "balance_loss_mlp": 1.02058995, + "epoch": 0.25952289342054635, + "flos": 711356275200.0, + "grad_norm": 0.04420512127692048, + "language_loss": 0.84604859, + "learning_rate": 0.0008681061775260799, + "loss": 0.85672331, + "num_input_tokens_seen": 111800752, + "router_z_loss_mlp": 0.46826172, + "step": 1349, + "time_per_iteration": 2.8803627490997314 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105274, + "balance_loss_mlp": 1.00634348, + "epoch": 0.25971527510580994, + "flos": 456850738944.0, + "grad_norm": 0.03368144531989068, + "language_loss": 0.92376006, + "learning_rate": 0.0008678952696402458, + "loss": 0.93428755, + "num_input_tokens_seen": 111866752, + "router_z_loss_mlp": 0.46337891, + "step": 1350, + "time_per_iteration": 2.5544798374176025 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054358, + "balance_loss_mlp": 1.00824761, + "epoch": 0.25990765679107347, + "flos": 613754569728.0, + "grad_norm": 0.03011764192417466, + "language_loss": 0.87159944, + "learning_rate": 0.000867684218924801, + "loss": 0.88214302, + "num_input_tokens_seen": 111951328, + "router_z_loss_mlp": 0.46044922, + "step": 1351, + "time_per_iteration": 2.856372833251953 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069405, + "balance_loss_mlp": 1.02496338, + "epoch": 0.26010003847633706, + "flos": 1541407196160.0, + "grad_norm": 0.012951365709411706, + "language_loss": 0.78947091, + "learning_rate": 0.0008674730254616827, + "loss": 0.80016494, + "num_input_tokens_seen": 112182272, + "router_z_loss_mlp": 0.4453125, + "step": 1352, + "time_per_iteration": 4.943616628646851 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01058433, + "balance_loss_mlp": 1.01194191, + "epoch": 0.2602924201616006, + "flos": 717545447424.0, + "grad_norm": 0.029832851456929797, + "language_loss": 0.85926312, + "learning_rate": 0.0008672616893328834, + "loss": 0.86984742, + "num_input_tokens_seen": 112261760, + "router_z_loss_mlp": 0.46435547, + "step": 1353, + "time_per_iteration": 2.913235664367676 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01056755, + "balance_loss_mlp": 1.01012051, + "epoch": 0.2604848018468642, + "flos": 644686824960.0, + "grad_norm": 0.03749633937906014, + "language_loss": 0.91143578, + "learning_rate": 0.0008670502106204512, + "loss": 0.92200339, + "num_input_tokens_seen": 112339136, + "router_z_loss_mlp": 0.46582031, + "step": 1354, + "time_per_iteration": 2.821753978729248 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091569, + "balance_loss_mlp": 1.0442189, + "epoch": 0.26067718353212777, + "flos": 518038684416.0, + "grad_norm": 0.04686611644365056, + "language_loss": 0.82400739, + "learning_rate": 0.0008668385894064892, + "loss": 0.83492303, + "num_input_tokens_seen": 112409872, + "router_z_loss_mlp": 0.47314453, + "step": 1355, + "time_per_iteration": 2.642392158508301 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01056006, + "balance_loss_mlp": 1.00925195, + "epoch": 0.2608695652173913, + "flos": 824226287616.0, + "grad_norm": 0.03313451231790272, + "language_loss": 0.89331532, + "learning_rate": 0.0008666268257731562, + "loss": 0.90387547, + "num_input_tokens_seen": 112495616, + "router_z_loss_mlp": 0.46704102, + "step": 1356, + "time_per_iteration": 3.1127805709838867 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060563, + "balance_loss_mlp": 1.01414335, + "epoch": 0.2610619469026549, + "flos": 1009450422528.0, + "grad_norm": 0.04035878870854939, + "language_loss": 0.86687934, + "learning_rate": 0.0008664149198026662, + "loss": 0.87748504, + "num_input_tokens_seen": 112575168, + "router_z_loss_mlp": 0.46362305, + "step": 1357, + "time_per_iteration": 3.2328455448150635 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106616, + "balance_loss_mlp": 1.01971614, + "epoch": 0.2612543285879184, + "flos": 537826149888.0, + "grad_norm": 0.03943672852684058, + "language_loss": 0.8952527, + "learning_rate": 0.0008662028715772883, + "loss": 0.90591431, + "num_input_tokens_seen": 112648480, + "router_z_loss_mlp": 0.46386719, + "step": 1358, + "time_per_iteration": 2.621894359588623 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01058771, + "balance_loss_mlp": 1.01213586, + "epoch": 0.261446710273182, + "flos": 520439698176.0, + "grad_norm": 0.03590038892764462, + "language_loss": 0.86476588, + "learning_rate": 0.0008659906811793467, + "loss": 0.87535357, + "num_input_tokens_seen": 112719856, + "router_z_loss_mlp": 0.46582031, + "step": 1359, + "time_per_iteration": 2.6540629863739014 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054238, + "balance_loss_mlp": 1.00741243, + "epoch": 0.26163909195844554, + "flos": 584399987712.0, + "grad_norm": 0.03384500135634075, + "language_loss": 0.90458202, + "learning_rate": 0.0008657783486912215, + "loss": 0.91512442, + "num_input_tokens_seen": 112795088, + "router_z_loss_mlp": 0.46777344, + "step": 1360, + "time_per_iteration": 2.71598744392395 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063542, + "balance_loss_mlp": 1.01626348, + "epoch": 0.2618314736437091, + "flos": 960369613056.0, + "grad_norm": 0.03695926115068694, + "language_loss": 0.90376949, + "learning_rate": 0.0008655658741953472, + "loss": 0.91440493, + "num_input_tokens_seen": 112879888, + "router_z_loss_mlp": 0.47241211, + "step": 1361, + "time_per_iteration": 3.233081102371216 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061537, + "balance_loss_mlp": 1.01413929, + "epoch": 0.26202385532897265, + "flos": 575903120640.0, + "grad_norm": 0.032102410789184695, + "language_loss": 0.892542, + "learning_rate": 0.0008653532577742136, + "loss": 0.90315735, + "num_input_tokens_seen": 112952208, + "router_z_loss_mlp": 0.47363281, + "step": 1362, + "time_per_iteration": 2.671513319015503 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053634, + "balance_loss_mlp": 1.00673676, + "epoch": 0.26221623701423624, + "flos": 446398065408.0, + "grad_norm": 0.034188430773875136, + "language_loss": 0.88125902, + "learning_rate": 0.0008651404995103659, + "loss": 0.8917954, + "num_input_tokens_seen": 113017472, + "router_z_loss_mlp": 0.46850586, + "step": 1363, + "time_per_iteration": 2.5599000453948975 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064691, + "balance_loss_mlp": 1.01803255, + "epoch": 0.26240861869949983, + "flos": 536755900416.0, + "grad_norm": 0.03309695956224158, + "language_loss": 0.87925225, + "learning_rate": 0.0008649275994864041, + "loss": 0.88989913, + "num_input_tokens_seen": 113090000, + "router_z_loss_mlp": 0.46606445, + "step": 1364, + "time_per_iteration": 2.68673038482666 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061602, + "balance_loss_mlp": 1.01472914, + "epoch": 0.26260100038476336, + "flos": 566488615680.0, + "grad_norm": 0.0327166713474878, + "language_loss": 0.84653741, + "learning_rate": 0.0008647145577849834, + "loss": 0.85715348, + "num_input_tokens_seen": 113169424, + "router_z_loss_mlp": 0.46826172, + "step": 1365, + "time_per_iteration": 2.8294174671173096 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061641, + "balance_loss_mlp": 1.01471996, + "epoch": 0.26279338207002695, + "flos": 614321286144.0, + "grad_norm": 0.027467777319160957, + "language_loss": 0.83391041, + "learning_rate": 0.0008645013744888139, + "loss": 0.84452683, + "num_input_tokens_seen": 113256752, + "router_z_loss_mlp": 0.46875, + "step": 1366, + "time_per_iteration": 2.845019578933716 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059667, + "balance_loss_mlp": 1.01238823, + "epoch": 0.2629857637552905, + "flos": 523945954560.0, + "grad_norm": 0.034051307399065846, + "language_loss": 0.88423878, + "learning_rate": 0.0008642880496806607, + "loss": 0.89483547, + "num_input_tokens_seen": 113330512, + "router_z_loss_mlp": 0.47241211, + "step": 1367, + "time_per_iteration": 2.7665200233459473 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065935, + "balance_loss_mlp": 1.01832283, + "epoch": 0.26317814544055407, + "flos": 535655515392.0, + "grad_norm": 0.03476637042829631, + "language_loss": 0.85672963, + "learning_rate": 0.0008640745834433437, + "loss": 0.86738896, + "num_input_tokens_seen": 113409088, + "router_z_loss_mlp": 0.47583008, + "step": 1368, + "time_per_iteration": 2.7824857234954834 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105712, + "balance_loss_mlp": 1.00967455, + "epoch": 0.2633705271258176, + "flos": 556780548096.0, + "grad_norm": 0.035052832704740904, + "language_loss": 0.8778615, + "learning_rate": 0.000863860975859738, + "loss": 0.88843262, + "num_input_tokens_seen": 113486624, + "router_z_loss_mlp": 0.47412109, + "step": 1369, + "time_per_iteration": 2.938157796859741 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059209, + "balance_loss_mlp": 1.01214516, + "epoch": 0.2635629088110812, + "flos": 553462874880.0, + "grad_norm": 0.04030614296387141, + "language_loss": 0.89190161, + "learning_rate": 0.0008636472270127733, + "loss": 0.90249372, + "num_input_tokens_seen": 113555776, + "router_z_loss_mlp": 0.47021484, + "step": 1370, + "time_per_iteration": 2.6449878215789795 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105746, + "balance_loss_mlp": 1.0106585, + "epoch": 0.2637552904963448, + "flos": 456915867648.0, + "grad_norm": 0.03827203709322554, + "language_loss": 0.91134202, + "learning_rate": 0.0008634333369854345, + "loss": 0.9219166, + "num_input_tokens_seen": 113624208, + "router_z_loss_mlp": 0.4675293, + "step": 1371, + "time_per_iteration": 2.6090121269226074 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053294, + "balance_loss_mlp": 1.00642049, + "epoch": 0.2639476721816083, + "flos": 614260048128.0, + "grad_norm": 0.03299961926418253, + "language_loss": 0.88250023, + "learning_rate": 0.0008632193058607608, + "loss": 0.89303321, + "num_input_tokens_seen": 113698544, + "router_z_loss_mlp": 0.46826172, + "step": 1372, + "time_per_iteration": 2.6980674266815186 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052684, + "balance_loss_mlp": 1.00562024, + "epoch": 0.2641400538668719, + "flos": 573026764032.0, + "grad_norm": 0.03659842444989107, + "language_loss": 0.81553382, + "learning_rate": 0.0008630051337218466, + "loss": 0.82606065, + "num_input_tokens_seen": 113769024, + "router_z_loss_mlp": 0.47021484, + "step": 1373, + "time_per_iteration": 2.6634395122528076 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01056457, + "balance_loss_mlp": 1.00960791, + "epoch": 0.2643324355521354, + "flos": 583340431872.0, + "grad_norm": 0.03511173854729822, + "language_loss": 0.82885635, + "learning_rate": 0.0008627908206518409, + "loss": 0.83942091, + "num_input_tokens_seen": 113836320, + "router_z_loss_mlp": 0.46801758, + "step": 1374, + "time_per_iteration": 2.6550941467285156 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055428, + "balance_loss_mlp": 1.01022339, + "epoch": 0.264524817237399, + "flos": 1548027969792.0, + "grad_norm": 0.005864236448565476, + "language_loss": 0.75151253, + "learning_rate": 0.0008625763667339472, + "loss": 0.76206684, + "num_input_tokens_seen": 114065040, + "router_z_loss_mlp": 0.45117188, + "step": 1375, + "time_per_iteration": 4.995543718338013 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01058447, + "balance_loss_mlp": 1.01197898, + "epoch": 0.26471719892266254, + "flos": 519043805184.0, + "grad_norm": 0.03321674595186757, + "language_loss": 0.92123759, + "learning_rate": 0.0008623617720514241, + "loss": 0.93182206, + "num_input_tokens_seen": 114133488, + "router_z_loss_mlp": 0.46411133, + "step": 1376, + "time_per_iteration": 2.592569351196289 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061616, + "balance_loss_mlp": 1.0151242, + "epoch": 0.26490958060792613, + "flos": 518205880320.0, + "grad_norm": 0.036665073764434085, + "language_loss": 0.85824203, + "learning_rate": 0.0008621470366875848, + "loss": 0.8688581, + "num_input_tokens_seen": 114200704, + "router_z_loss_mlp": 0.46435547, + "step": 1377, + "time_per_iteration": 2.5636963844299316 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054184, + "balance_loss_mlp": 1.00766897, + "epoch": 0.26510196229318966, + "flos": 597683331072.0, + "grad_norm": 0.03396624681403314, + "language_loss": 0.88501984, + "learning_rate": 0.0008619321607257966, + "loss": 0.8955617, + "num_input_tokens_seen": 114272160, + "router_z_loss_mlp": 0.46459961, + "step": 1378, + "time_per_iteration": 2.687581777572632 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01056779, + "balance_loss_mlp": 1.010144, + "epoch": 0.26529434397845325, + "flos": 687053541888.0, + "grad_norm": 0.031207845572821406, + "language_loss": 0.82550275, + "learning_rate": 0.000861717144249482, + "loss": 0.83607054, + "num_input_tokens_seen": 114347904, + "router_z_loss_mlp": 0.46582031, + "step": 1379, + "time_per_iteration": 2.8333678245544434 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054921, + "balance_loss_mlp": 1.00819123, + "epoch": 0.26548672566371684, + "flos": 425260393728.0, + "grad_norm": 0.03047521662480035, + "language_loss": 0.90854567, + "learning_rate": 0.0008615019873421175, + "loss": 0.91909492, + "num_input_tokens_seen": 114409952, + "router_z_loss_mlp": 0.46679688, + "step": 1380, + "time_per_iteration": 2.47892689704895 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051055, + "balance_loss_mlp": 1.00437295, + "epoch": 0.26567910734898037, + "flos": 490850846208.0, + "grad_norm": 0.03515354974137605, + "language_loss": 0.8636173, + "learning_rate": 0.0008612866900872349, + "loss": 0.87412781, + "num_input_tokens_seen": 114474832, + "router_z_loss_mlp": 0.46630859, + "step": 1381, + "time_per_iteration": 2.558497428894043 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055093, + "balance_loss_mlp": 1.00833893, + "epoch": 0.26587148903424396, + "flos": 535229750016.0, + "grad_norm": 0.033124361732310995, + "language_loss": 0.88441265, + "learning_rate": 0.0008610712525684197, + "loss": 0.89496362, + "num_input_tokens_seen": 114545152, + "router_z_loss_mlp": 0.46704102, + "step": 1382, + "time_per_iteration": 2.6567015647888184 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01056513, + "balance_loss_mlp": 1.00997365, + "epoch": 0.2660638707195075, + "flos": 1019056422912.0, + "grad_norm": 0.038309225150243896, + "language_loss": 0.84641987, + "learning_rate": 0.0008608556748693121, + "loss": 0.85698497, + "num_input_tokens_seen": 114626512, + "router_z_loss_mlp": 0.46484375, + "step": 1383, + "time_per_iteration": 3.266127347946167 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054133, + "balance_loss_mlp": 1.00754607, + "epoch": 0.2662562524047711, + "flos": 525063836160.0, + "grad_norm": 0.03266135396779854, + "language_loss": 0.86478686, + "learning_rate": 0.000860639957073607, + "loss": 0.87532818, + "num_input_tokens_seen": 114701008, + "router_z_loss_mlp": 0.46533203, + "step": 1384, + "time_per_iteration": 2.701979398727417 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052082, + "balance_loss_mlp": 1.00542331, + "epoch": 0.2664486340900346, + "flos": 553480371456.0, + "grad_norm": 0.03507018041250785, + "language_loss": 0.88455647, + "learning_rate": 0.0008604240992650534, + "loss": 0.89507735, + "num_input_tokens_seen": 114771984, + "router_z_loss_mlp": 0.46606445, + "step": 1385, + "time_per_iteration": 2.6528589725494385 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051347, + "balance_loss_mlp": 1.00476038, + "epoch": 0.2666410157752982, + "flos": 471209189376.0, + "grad_norm": 0.03349459525563368, + "language_loss": 0.89804894, + "learning_rate": 0.0008602081015274545, + "loss": 0.90856242, + "num_input_tokens_seen": 114844800, + "router_z_loss_mlp": 0.46533203, + "step": 1386, + "time_per_iteration": 2.7359464168548584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053022, + "balance_loss_mlp": 1.00626826, + "epoch": 0.2668333974605617, + "flos": 571016522496.0, + "grad_norm": 0.027882929979452454, + "language_loss": 0.8367793, + "learning_rate": 0.0008599919639446684, + "loss": 0.84730947, + "num_input_tokens_seen": 114918544, + "router_z_loss_mlp": 0.46704102, + "step": 1387, + "time_per_iteration": 2.72188401222229 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052498, + "balance_loss_mlp": 1.00572038, + "epoch": 0.2670257791458253, + "flos": 399896159232.0, + "grad_norm": 0.038277743086958374, + "language_loss": 0.80995691, + "learning_rate": 0.000859775686600607, + "loss": 0.82048184, + "num_input_tokens_seen": 114984272, + "router_z_loss_mlp": 0.46728516, + "step": 1388, + "time_per_iteration": 2.5220229625701904 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051137, + "balance_loss_mlp": 1.00443089, + "epoch": 0.2672181608310889, + "flos": 516892612608.0, + "grad_norm": 0.03738976993969629, + "language_loss": 0.85769641, + "learning_rate": 0.0008595592695792367, + "loss": 0.86820781, + "num_input_tokens_seen": 115054800, + "router_z_loss_mlp": 0.46655273, + "step": 1389, + "time_per_iteration": 2.7041423320770264 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050805, + "balance_loss_mlp": 1.0042417, + "epoch": 0.26741054251635243, + "flos": 508526002944.0, + "grad_norm": 0.03398026188762752, + "language_loss": 0.91414082, + "learning_rate": 0.0008593427129645778, + "loss": 0.92464888, + "num_input_tokens_seen": 115120928, + "router_z_loss_mlp": 0.46508789, + "step": 1390, + "time_per_iteration": 2.563215732574463 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105357, + "balance_loss_mlp": 1.0067687, + "epoch": 0.267602924201616, + "flos": 577809349632.0, + "grad_norm": 0.03481446530036303, + "language_loss": 0.86254311, + "learning_rate": 0.0008591260168407052, + "loss": 0.87307882, + "num_input_tokens_seen": 115196688, + "router_z_loss_mlp": 0.4675293, + "step": 1391, + "time_per_iteration": 2.788869619369507 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051436, + "balance_loss_mlp": 1.00475395, + "epoch": 0.26779530588687955, + "flos": 525000652800.0, + "grad_norm": 0.029176301882166727, + "language_loss": 0.83413607, + "learning_rate": 0.0008589091812917479, + "loss": 0.84465045, + "num_input_tokens_seen": 115264912, + "router_z_loss_mlp": 0.46630859, + "step": 1392, + "time_per_iteration": 2.6471304893493652 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01057177, + "balance_loss_mlp": 1.0103997, + "epoch": 0.26798768757214314, + "flos": 557828443392.0, + "grad_norm": 0.034011915135398356, + "language_loss": 0.85611916, + "learning_rate": 0.0008586922064018887, + "loss": 0.86669087, + "num_input_tokens_seen": 115334672, + "router_z_loss_mlp": 0.46728516, + "step": 1393, + "time_per_iteration": 2.665710926055908 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051541, + "balance_loss_mlp": 1.00488269, + "epoch": 0.2681800692574067, + "flos": 932095974144.0, + "grad_norm": 0.035119979561623306, + "language_loss": 0.89861763, + "learning_rate": 0.0008584750922553651, + "loss": 0.90913308, + "num_input_tokens_seen": 115420032, + "router_z_loss_mlp": 0.46606445, + "step": 1394, + "time_per_iteration": 3.1556007862091064 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054605, + "balance_loss_mlp": 1.00813687, + "epoch": 0.26837245094267026, + "flos": 702318936576.0, + "grad_norm": 0.034220503648090136, + "language_loss": 0.84388494, + "learning_rate": 0.0008582578389364677, + "loss": 0.85443103, + "num_input_tokens_seen": 115492576, + "router_z_loss_mlp": 0.46411133, + "step": 1395, + "time_per_iteration": 2.8831770420074463 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054667, + "balance_loss_mlp": 1.00824666, + "epoch": 0.26856483262793385, + "flos": 594394814976.0, + "grad_norm": 0.030437239966241224, + "language_loss": 0.92446673, + "learning_rate": 0.0008580404465295422, + "loss": 0.93501341, + "num_input_tokens_seen": 115568368, + "router_z_loss_mlp": 0.46362305, + "step": 1396, + "time_per_iteration": 2.823685884475708 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052372, + "balance_loss_mlp": 1.00578523, + "epoch": 0.2687572143131974, + "flos": 715589640960.0, + "grad_norm": 0.035135728363153845, + "language_loss": 0.88714433, + "learning_rate": 0.0008578229151189876, + "loss": 0.89766812, + "num_input_tokens_seen": 115651536, + "router_z_loss_mlp": 0.46533203, + "step": 1397, + "time_per_iteration": 2.9427757263183594 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105508, + "balance_loss_mlp": 1.00858808, + "epoch": 0.26894959599846097, + "flos": 468671115264.0, + "grad_norm": 0.03944499035247069, + "language_loss": 0.82205743, + "learning_rate": 0.0008576052447892573, + "loss": 0.83260822, + "num_input_tokens_seen": 115715696, + "router_z_loss_mlp": 0.46435547, + "step": 1398, + "time_per_iteration": 2.570364475250244 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053764, + "balance_loss_mlp": 1.00712895, + "epoch": 0.2691419776837245, + "flos": 469630549248.0, + "grad_norm": 0.035560759826370754, + "language_loss": 0.87260717, + "learning_rate": 0.000857387435624858, + "loss": 0.88314486, + "num_input_tokens_seen": 115780928, + "router_z_loss_mlp": 0.46582031, + "step": 1399, + "time_per_iteration": 2.5241427421569824 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053425, + "balance_loss_mlp": 1.00698149, + "epoch": 0.2693343593689881, + "flos": 939286376448.0, + "grad_norm": 0.026228750880396605, + "language_loss": 0.88826966, + "learning_rate": 0.0008571694877103513, + "loss": 0.89880389, + "num_input_tokens_seen": 115874432, + "router_z_loss_mlp": 0.46386719, + "step": 1400, + "time_per_iteration": 3.2871432304382324 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049973, + "balance_loss_mlp": 1.00355244, + "epoch": 0.2695267410542516, + "flos": 578795028480.0, + "grad_norm": 0.031687518811048296, + "language_loss": 0.88370931, + "learning_rate": 0.0008569514011303515, + "loss": 0.89420903, + "num_input_tokens_seen": 115956608, + "router_z_loss_mlp": 0.46362305, + "step": 1401, + "time_per_iteration": 2.8385562896728516 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054148, + "balance_loss_mlp": 1.00763249, + "epoch": 0.2697191227395152, + "flos": 557965503744.0, + "grad_norm": 0.03646210542720766, + "language_loss": 0.89149171, + "learning_rate": 0.0008567331759695277, + "loss": 0.90203321, + "num_input_tokens_seen": 116031728, + "router_z_loss_mlp": 0.46459961, + "step": 1402, + "time_per_iteration": 2.73796010017395 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053059, + "balance_loss_mlp": 1.00663948, + "epoch": 0.26991150442477874, + "flos": 530314961664.0, + "grad_norm": 0.03368837159460442, + "language_loss": 0.86897242, + "learning_rate": 0.0008565148123126023, + "loss": 0.87950301, + "num_input_tokens_seen": 116104288, + "router_z_loss_mlp": 0.46362305, + "step": 1403, + "time_per_iteration": 2.654782772064209 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055763, + "balance_loss_mlp": 1.00970042, + "epoch": 0.2701038861100423, + "flos": 533087305728.0, + "grad_norm": 0.02742415368344255, + "language_loss": 0.86797845, + "learning_rate": 0.0008562963102443516, + "loss": 0.87853605, + "num_input_tokens_seen": 116177920, + "router_z_loss_mlp": 0.45996094, + "step": 1404, + "time_per_iteration": 2.6844303607940674 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01057243, + "balance_loss_mlp": 1.01122797, + "epoch": 0.2702962677953059, + "flos": 736505681664.0, + "grad_norm": 0.03794782730472634, + "language_loss": 0.85607296, + "learning_rate": 0.0008560776698496056, + "loss": 0.86664534, + "num_input_tokens_seen": 116251680, + "router_z_loss_mlp": 0.45947266, + "step": 1405, + "time_per_iteration": 2.9016945362091064 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054152, + "balance_loss_mlp": 1.00806534, + "epoch": 0.27048864948056944, + "flos": 576001297152.0, + "grad_norm": 0.03333453941991407, + "language_loss": 0.8661586, + "learning_rate": 0.0008558588912132481, + "loss": 0.8767001, + "num_input_tokens_seen": 116327664, + "router_z_loss_mlp": 0.46020508, + "step": 1406, + "time_per_iteration": 2.8187410831451416 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074676, + "balance_loss_mlp": 1.03042603, + "epoch": 0.27068103116583303, + "flos": 1426912856832.0, + "grad_norm": 0.025019447230712623, + "language_loss": 0.76458991, + "learning_rate": 0.0008556399744202163, + "loss": 0.77533662, + "num_input_tokens_seen": 116555152, + "router_z_loss_mlp": 0.44335938, + "step": 1407, + "time_per_iteration": 4.91855001449585 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059735, + "balance_loss_mlp": 1.01386356, + "epoch": 0.27087341285109656, + "flos": 533032870656.0, + "grad_norm": 0.03180107690871134, + "language_loss": 0.83613265, + "learning_rate": 0.0008554209195555016, + "loss": 0.84672999, + "num_input_tokens_seen": 116626016, + "router_z_loss_mlp": 0.45800781, + "step": 1408, + "time_per_iteration": 2.7004964351654053 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106761, + "balance_loss_mlp": 1.02188134, + "epoch": 0.27106579453636015, + "flos": 582465568512.0, + "grad_norm": 0.03644580883658202, + "language_loss": 0.89378774, + "learning_rate": 0.0008552017267041483, + "loss": 0.90446383, + "num_input_tokens_seen": 116699152, + "router_z_loss_mlp": 0.45654297, + "step": 1409, + "time_per_iteration": 2.7288694381713867 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067551, + "balance_loss_mlp": 1.0219177, + "epoch": 0.2712581762216237, + "flos": 507881518848.0, + "grad_norm": 0.03188220116364099, + "language_loss": 0.84328783, + "learning_rate": 0.0008549823959512549, + "loss": 0.85396332, + "num_input_tokens_seen": 116770912, + "router_z_loss_mlp": 0.45556641, + "step": 1410, + "time_per_iteration": 2.67370343208313 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060246, + "balance_loss_mlp": 1.01435077, + "epoch": 0.27145055790688727, + "flos": 999143557632.0, + "grad_norm": 0.03419744556224296, + "language_loss": 0.87478781, + "learning_rate": 0.0008547629273819728, + "loss": 0.88539028, + "num_input_tokens_seen": 116863088, + "router_z_loss_mlp": 0.45825195, + "step": 1411, + "time_per_iteration": 3.3728370666503906 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01057063, + "balance_loss_mlp": 1.0104996, + "epoch": 0.2716429395921508, + "flos": 547729603584.0, + "grad_norm": 0.037303619224495106, + "language_loss": 0.84070724, + "learning_rate": 0.0008545433210815074, + "loss": 0.85127789, + "num_input_tokens_seen": 116929504, + "router_z_loss_mlp": 0.46508789, + "step": 1412, + "time_per_iteration": 2.6812539100646973 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062187, + "balance_loss_mlp": 1.01536179, + "epoch": 0.2718353212774144, + "flos": 574311841536.0, + "grad_norm": 0.033089137280770606, + "language_loss": 0.8805269, + "learning_rate": 0.0008543235771351176, + "loss": 0.89114881, + "num_input_tokens_seen": 117004064, + "router_z_loss_mlp": 0.46777344, + "step": 1413, + "time_per_iteration": 2.713487148284912 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01056081, + "balance_loss_mlp": 1.00961292, + "epoch": 0.272027702962678, + "flos": 645585987840.0, + "grad_norm": 0.026077025600286987, + "language_loss": 0.85152733, + "learning_rate": 0.0008541036956281154, + "loss": 0.86208814, + "num_input_tokens_seen": 117081328, + "router_z_loss_mlp": 0.46411133, + "step": 1414, + "time_per_iteration": 2.9018056392669678 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062827, + "balance_loss_mlp": 1.01631117, + "epoch": 0.2722200846479415, + "flos": 654996602112.0, + "grad_norm": 0.04047455719590206, + "language_loss": 0.83293629, + "learning_rate": 0.0008538836766458665, + "loss": 0.84356457, + "num_input_tokens_seen": 117156544, + "router_z_loss_mlp": 0.46459961, + "step": 1415, + "time_per_iteration": 2.84184193611145 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106005, + "balance_loss_mlp": 1.01365411, + "epoch": 0.2724124663332051, + "flos": 580779025152.0, + "grad_norm": 0.0390255284508479, + "language_loss": 0.85920322, + "learning_rate": 0.0008536635202737897, + "loss": 0.86980367, + "num_input_tokens_seen": 117230208, + "router_z_loss_mlp": 0.46337891, + "step": 1416, + "time_per_iteration": 2.814687728881836 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059317, + "balance_loss_mlp": 1.01272988, + "epoch": 0.2726048480184686, + "flos": 538468688640.0, + "grad_norm": 0.03678906161491062, + "language_loss": 0.82951486, + "learning_rate": 0.0008534432265973573, + "loss": 0.8401081, + "num_input_tokens_seen": 117298080, + "router_z_loss_mlp": 0.46533203, + "step": 1417, + "time_per_iteration": 2.641660451889038 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01056276, + "balance_loss_mlp": 1.00930703, + "epoch": 0.2727972297037322, + "flos": 997550333184.0, + "grad_norm": 0.4222293446211692, + "language_loss": 0.88806397, + "learning_rate": 0.000853222795702095, + "loss": 0.89862669, + "num_input_tokens_seen": 117396256, + "router_z_loss_mlp": 0.46923828, + "step": 1418, + "time_per_iteration": 3.3743135929107666 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01181433, + "balance_loss_mlp": 1.1334635, + "epoch": 0.27298961138899575, + "flos": 607335018240.0, + "grad_norm": 0.06715989722341878, + "language_loss": 0.84640503, + "learning_rate": 0.0008530022276735813, + "loss": 0.85821939, + "num_input_tokens_seen": 117467936, + "router_z_loss_mlp": 0.47949219, + "step": 1419, + "time_per_iteration": 2.752645254135132 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069458, + "balance_loss_mlp": 1.02225161, + "epoch": 0.27318199307425933, + "flos": 530397586944.0, + "grad_norm": 0.040820608700474346, + "language_loss": 0.87344372, + "learning_rate": 0.0008527815225974489, + "loss": 0.88413835, + "num_input_tokens_seen": 117538256, + "router_z_loss_mlp": 0.47167969, + "step": 1420, + "time_per_iteration": 2.65108585357666 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085616, + "balance_loss_mlp": 1.03852844, + "epoch": 0.2733743747595229, + "flos": 409912373760.0, + "grad_norm": 0.06690132065136703, + "language_loss": 0.92052042, + "learning_rate": 0.0008525606805593829, + "loss": 0.93137658, + "num_input_tokens_seen": 117599488, + "router_z_loss_mlp": 0.47045898, + "step": 1421, + "time_per_iteration": 2.4201173782348633 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081246, + "balance_loss_mlp": 1.03422987, + "epoch": 0.27356675644478645, + "flos": 517228949760.0, + "grad_norm": 0.05290317096475839, + "language_loss": 0.85793996, + "learning_rate": 0.0008523397016451213, + "loss": 0.86875236, + "num_input_tokens_seen": 117664240, + "router_z_loss_mlp": 0.46972656, + "step": 1422, + "time_per_iteration": 2.632446765899658 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080251, + "balance_loss_mlp": 1.03328276, + "epoch": 0.27375913813005004, + "flos": 1054059705600.0, + "grad_norm": 0.039766191828199446, + "language_loss": 0.90321743, + "learning_rate": 0.0008521185859404564, + "loss": 0.91401994, + "num_input_tokens_seen": 117754768, + "router_z_loss_mlp": 0.46923828, + "step": 1423, + "time_per_iteration": 3.381535291671753 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107676, + "balance_loss_mlp": 1.02998257, + "epoch": 0.27395151981531357, + "flos": 626004602112.0, + "grad_norm": 0.042654551092476074, + "language_loss": 0.92207062, + "learning_rate": 0.0008518973335312326, + "loss": 0.9328382, + "num_input_tokens_seen": 117832816, + "router_z_loss_mlp": 0.46728516, + "step": 1424, + "time_per_iteration": 2.787799596786499 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070757, + "balance_loss_mlp": 1.0240984, + "epoch": 0.27414390150057716, + "flos": 551415694848.0, + "grad_norm": 0.04883209929837253, + "language_loss": 0.85839558, + "learning_rate": 0.0008516759445033477, + "loss": 0.86910313, + "num_input_tokens_seen": 117899168, + "router_z_loss_mlp": 0.46606445, + "step": 1425, + "time_per_iteration": 2.6206350326538086 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065621, + "balance_loss_mlp": 1.01881957, + "epoch": 0.2743362831858407, + "flos": 540952327680.0, + "grad_norm": 0.043467714857121094, + "language_loss": 0.87962419, + "learning_rate": 0.0008514544189427526, + "loss": 0.89028037, + "num_input_tokens_seen": 117972384, + "router_z_loss_mlp": 0.4675293, + "step": 1426, + "time_per_iteration": 2.679623603820801 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01058603, + "balance_loss_mlp": 1.0118494, + "epoch": 0.2745286648711043, + "flos": 469545978624.0, + "grad_norm": 0.04158543868721512, + "language_loss": 0.89037859, + "learning_rate": 0.0008512327569354511, + "loss": 0.90096468, + "num_input_tokens_seen": 118039584, + "router_z_loss_mlp": 0.46704102, + "step": 1427, + "time_per_iteration": 2.5345683097839355 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01057646, + "balance_loss_mlp": 1.01036775, + "epoch": 0.2747210465563678, + "flos": 473872663296.0, + "grad_norm": 0.05094281183667316, + "language_loss": 0.85685182, + "learning_rate": 0.0008510109585675001, + "loss": 0.8674283, + "num_input_tokens_seen": 118108352, + "router_z_loss_mlp": 0.47241211, + "step": 1428, + "time_per_iteration": 2.5991017818450928 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076946, + "balance_loss_mlp": 1.03031158, + "epoch": 0.2749134282416314, + "flos": 1318059436800.0, + "grad_norm": 0.019364160619571847, + "language_loss": 0.81153345, + "learning_rate": 0.0008507890239250093, + "loss": 0.82230288, + "num_input_tokens_seen": 118331120, + "router_z_loss_mlp": 0.46582031, + "step": 1429, + "time_per_iteration": 4.724486351013184 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081662, + "balance_loss_mlp": 1.03459787, + "epoch": 0.275105809926895, + "flos": 972533129472.0, + "grad_norm": 0.05143903496013185, + "language_loss": 0.82696635, + "learning_rate": 0.0008505669530941415, + "loss": 0.83778298, + "num_input_tokens_seen": 118415872, + "router_z_loss_mlp": 0.47021484, + "step": 1430, + "time_per_iteration": 3.3173024654388428 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01058611, + "balance_loss_mlp": 1.01231062, + "epoch": 0.2752981916121585, + "flos": 528369848832.0, + "grad_norm": 0.04649662222604448, + "language_loss": 0.87158883, + "learning_rate": 0.000850344746161112, + "loss": 0.88217485, + "num_input_tokens_seen": 118483008, + "router_z_loss_mlp": 0.46240234, + "step": 1431, + "time_per_iteration": 2.635831356048584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065544, + "balance_loss_mlp": 1.01943398, + "epoch": 0.2754905732974221, + "flos": 454599424512.0, + "grad_norm": 0.04970989937431765, + "language_loss": 0.90776384, + "learning_rate": 0.0008501224032121894, + "loss": 0.91841936, + "num_input_tokens_seen": 118545840, + "router_z_loss_mlp": 0.46044922, + "step": 1432, + "time_per_iteration": 2.531921148300171 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069408, + "balance_loss_mlp": 1.02339363, + "epoch": 0.27568295498268564, + "flos": 498509788416.0, + "grad_norm": 0.04336527805629792, + "language_loss": 0.84821916, + "learning_rate": 0.0008498999243336946, + "loss": 0.85891324, + "num_input_tokens_seen": 118615168, + "router_z_loss_mlp": 0.45947266, + "step": 1433, + "time_per_iteration": 2.6142802238464355 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068976, + "balance_loss_mlp": 1.02298498, + "epoch": 0.2758753366679492, + "flos": 609417191424.0, + "grad_norm": 0.03822636329404569, + "language_loss": 0.8997575, + "learning_rate": 0.0008496773096120021, + "loss": 0.91044724, + "num_input_tokens_seen": 118690384, + "router_z_loss_mlp": 0.45922852, + "step": 1434, + "time_per_iteration": 2.788863182067871 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066122, + "balance_loss_mlp": 1.01977372, + "epoch": 0.27606771835321275, + "flos": 741437966592.0, + "grad_norm": 0.04844453313229188, + "language_loss": 0.86675751, + "learning_rate": 0.0008494545591335381, + "loss": 0.87741876, + "num_input_tokens_seen": 118763024, + "router_z_loss_mlp": 0.46289062, + "step": 1435, + "time_per_iteration": 2.8883180618286133 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061614, + "balance_loss_mlp": 1.01516986, + "epoch": 0.27626010003847634, + "flos": 555749182464.0, + "grad_norm": 0.03304758436240527, + "language_loss": 0.88791698, + "learning_rate": 0.0008492316729847823, + "loss": 0.89853311, + "num_input_tokens_seen": 118845536, + "router_z_loss_mlp": 0.46386719, + "step": 1436, + "time_per_iteration": 2.794938087463379 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054312, + "balance_loss_mlp": 1.0072248, + "epoch": 0.2764524817237399, + "flos": 543696481536.0, + "grad_norm": 0.13725655625344893, + "language_loss": 0.82129836, + "learning_rate": 0.0008490086512522664, + "loss": 0.83184153, + "num_input_tokens_seen": 118919008, + "router_z_loss_mlp": 0.47045898, + "step": 1437, + "time_per_iteration": 2.6979260444641113 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062257, + "balance_loss_mlp": 1.01495445, + "epoch": 0.27664486340900346, + "flos": 407129336064.0, + "grad_norm": 0.04115092615815086, + "language_loss": 0.92702913, + "learning_rate": 0.0008487854940225755, + "loss": 0.93765163, + "num_input_tokens_seen": 118981376, + "router_z_loss_mlp": 0.47265625, + "step": 1438, + "time_per_iteration": 2.4361565113067627 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055116, + "balance_loss_mlp": 1.0080049, + "epoch": 0.27683724509426705, + "flos": 523157607168.0, + "grad_norm": 0.06281356926864295, + "language_loss": 0.92480713, + "learning_rate": 0.0008485622013823466, + "loss": 0.93535829, + "num_input_tokens_seen": 119050560, + "router_z_loss_mlp": 0.47070312, + "step": 1439, + "time_per_iteration": 2.588972568511963 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060631, + "balance_loss_mlp": 1.01332879, + "epoch": 0.2770296267795306, + "flos": 536410814976.0, + "grad_norm": 0.048827385499573994, + "language_loss": 0.8582921, + "learning_rate": 0.00084833877341827, + "loss": 0.86889839, + "num_input_tokens_seen": 119121104, + "router_z_loss_mlp": 0.47265625, + "step": 1440, + "time_per_iteration": 2.6215152740478516 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063403, + "balance_loss_mlp": 1.01648188, + "epoch": 0.27722200846479417, + "flos": 488970862080.0, + "grad_norm": 0.04074125375838667, + "language_loss": 0.82920921, + "learning_rate": 0.000848115210217088, + "loss": 0.83984327, + "num_input_tokens_seen": 119187712, + "router_z_loss_mlp": 0.46875, + "step": 1441, + "time_per_iteration": 2.578479290008545 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059768, + "balance_loss_mlp": 1.01244187, + "epoch": 0.2774143901500577, + "flos": 619444099584.0, + "grad_norm": 0.03981713509883016, + "language_loss": 0.84628934, + "learning_rate": 0.0008478915118655952, + "loss": 0.85688698, + "num_input_tokens_seen": 119259264, + "router_z_loss_mlp": 0.47290039, + "step": 1442, + "time_per_iteration": 2.697610855102539 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055568, + "balance_loss_mlp": 1.0080508, + "epoch": 0.2776067718353213, + "flos": 514845432576.0, + "grad_norm": 0.032345577367045, + "language_loss": 0.88479745, + "learning_rate": 0.0008476676784506393, + "loss": 0.89535314, + "num_input_tokens_seen": 119328304, + "router_z_loss_mlp": 0.47485352, + "step": 1443, + "time_per_iteration": 2.6315112113952637 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01056662, + "balance_loss_mlp": 1.00897789, + "epoch": 0.2777991535205848, + "flos": 1006042342656.0, + "grad_norm": 0.04008629757661371, + "language_loss": 0.8412413, + "learning_rate": 0.0008474437100591201, + "loss": 0.85180795, + "num_input_tokens_seen": 119412352, + "router_z_loss_mlp": 0.4765625, + "step": 1444, + "time_per_iteration": 3.3463656902313232 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051562, + "balance_loss_mlp": 1.00371146, + "epoch": 0.2779915352058484, + "flos": 551376811008.0, + "grad_norm": 0.033834103416723965, + "language_loss": 0.87362587, + "learning_rate": 0.0008472196067779898, + "loss": 0.88414145, + "num_input_tokens_seen": 119484464, + "router_z_loss_mlp": 0.47827148, + "step": 1445, + "time_per_iteration": 2.6647677421569824 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054302, + "balance_loss_mlp": 1.00649953, + "epoch": 0.278183916891112, + "flos": 875217216768.0, + "grad_norm": 0.0457526450580795, + "language_loss": 0.87776953, + "learning_rate": 0.0008469953686942531, + "loss": 0.88831258, + "num_input_tokens_seen": 119557280, + "router_z_loss_mlp": 0.4777832, + "step": 1446, + "time_per_iteration": 3.076035261154175 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01056504, + "balance_loss_mlp": 1.00882006, + "epoch": 0.2783762985763755, + "flos": 625196812800.0, + "grad_norm": 0.042452946668595545, + "language_loss": 0.85090148, + "learning_rate": 0.0008467709958949668, + "loss": 0.86146653, + "num_input_tokens_seen": 119631232, + "router_z_loss_mlp": 0.4765625, + "step": 1447, + "time_per_iteration": 2.744459629058838 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01056121, + "balance_loss_mlp": 1.00850928, + "epoch": 0.2785686802616391, + "flos": 582912721152.0, + "grad_norm": 0.04136143865758397, + "language_loss": 0.87796736, + "learning_rate": 0.0008465464884672403, + "loss": 0.88852853, + "num_input_tokens_seen": 119700224, + "router_z_loss_mlp": 0.47583008, + "step": 1448, + "time_per_iteration": 2.6887707710266113 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049992, + "balance_loss_mlp": 1.00235641, + "epoch": 0.27876106194690264, + "flos": 588540034560.0, + "grad_norm": 0.031263057988026755, + "language_loss": 0.87220562, + "learning_rate": 0.0008463218464982348, + "loss": 0.88270551, + "num_input_tokens_seen": 119781376, + "router_z_loss_mlp": 0.47607422, + "step": 1449, + "time_per_iteration": 2.8354454040527344 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050828, + "balance_loss_mlp": 1.00326335, + "epoch": 0.27895344363216623, + "flos": 877431592704.0, + "grad_norm": 0.03730856956989286, + "language_loss": 0.89626968, + "learning_rate": 0.0008460970700751645, + "loss": 0.90677798, + "num_input_tokens_seen": 119856672, + "router_z_loss_mlp": 0.4753418, + "step": 1450, + "time_per_iteration": 3.12705135345459 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062156, + "balance_loss_mlp": 1.01442492, + "epoch": 0.27914582531742976, + "flos": 605036071680.0, + "grad_norm": 0.0379360607610882, + "language_loss": 0.8910991, + "learning_rate": 0.000845872159285295, + "loss": 0.90172064, + "num_input_tokens_seen": 119929008, + "router_z_loss_mlp": 0.47705078, + "step": 1451, + "time_per_iteration": 2.792448043823242 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065174, + "balance_loss_mlp": 1.02025604, + "epoch": 0.27933820700269335, + "flos": 1501133346048.0, + "grad_norm": 0.01376981107013524, + "language_loss": 0.77766848, + "learning_rate": 0.0008456471142159447, + "loss": 0.7883203, + "num_input_tokens_seen": 120164032, + "router_z_loss_mlp": 0.44921875, + "step": 1452, + "time_per_iteration": 4.966037034988403 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01056615, + "balance_loss_mlp": 1.00921774, + "epoch": 0.2795305886879569, + "flos": 1033518885888.0, + "grad_norm": 0.037040263742322534, + "language_loss": 0.87809932, + "learning_rate": 0.0008454219349544836, + "loss": 0.88866544, + "num_input_tokens_seen": 120246784, + "router_z_loss_mlp": 0.47363281, + "step": 1453, + "time_per_iteration": 3.428589344024658 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055627, + "balance_loss_mlp": 1.00851548, + "epoch": 0.27972297037322047, + "flos": 608227378176.0, + "grad_norm": 0.03307542484781365, + "language_loss": 0.83086669, + "learning_rate": 0.000845196621588334, + "loss": 0.84142298, + "num_input_tokens_seen": 120318208, + "router_z_loss_mlp": 0.47070312, + "step": 1454, + "time_per_iteration": 2.7620909214019775 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053868, + "balance_loss_mlp": 1.00661373, + "epoch": 0.27991535205848406, + "flos": 631561929216.0, + "grad_norm": 0.034345141589198824, + "language_loss": 0.77104861, + "learning_rate": 0.0008449711742049706, + "loss": 0.78158724, + "num_input_tokens_seen": 120393248, + "router_z_loss_mlp": 0.47216797, + "step": 1455, + "time_per_iteration": 2.7629852294921875 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01057076, + "balance_loss_mlp": 1.009655, + "epoch": 0.2801077337437476, + "flos": 550354193664.0, + "grad_norm": 0.03843537360044117, + "language_loss": 0.85426688, + "learning_rate": 0.0008447455928919196, + "loss": 0.86483765, + "num_input_tokens_seen": 120461040, + "router_z_loss_mlp": 0.47387695, + "step": 1456, + "time_per_iteration": 2.672311782836914 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054361, + "balance_loss_mlp": 1.00670111, + "epoch": 0.2803001154290112, + "flos": 487742164992.0, + "grad_norm": 0.03308646323695097, + "language_loss": 0.8834334, + "learning_rate": 0.0008445198777367595, + "loss": 0.89397705, + "num_input_tokens_seen": 120530400, + "router_z_loss_mlp": 0.47631836, + "step": 1457, + "time_per_iteration": 2.5908620357513428 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054094, + "balance_loss_mlp": 1.00633848, + "epoch": 0.2804924971142747, + "flos": 523092478464.0, + "grad_norm": 0.036759152060528134, + "language_loss": 0.82140505, + "learning_rate": 0.0008442940288271208, + "loss": 0.8319459, + "num_input_tokens_seen": 120598304, + "router_z_loss_mlp": 0.47729492, + "step": 1458, + "time_per_iteration": 2.6980724334716797 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01057218, + "balance_loss_mlp": 1.00953484, + "epoch": 0.2806848787995383, + "flos": 528850049280.0, + "grad_norm": 0.03179596299998768, + "language_loss": 0.88266242, + "learning_rate": 0.0008440680462506856, + "loss": 0.89323461, + "num_input_tokens_seen": 120675712, + "router_z_loss_mlp": 0.4765625, + "step": 1459, + "time_per_iteration": 2.818169593811035 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01058111, + "balance_loss_mlp": 1.01047492, + "epoch": 0.2808772604848018, + "flos": 486485277696.0, + "grad_norm": 0.030255628698855237, + "language_loss": 0.87626624, + "learning_rate": 0.0008438419300951883, + "loss": 0.88684738, + "num_input_tokens_seen": 120746544, + "router_z_loss_mlp": 0.47607422, + "step": 1460, + "time_per_iteration": 2.644911527633667 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01056062, + "balance_loss_mlp": 1.00825953, + "epoch": 0.2810696421700654, + "flos": 619340087040.0, + "grad_norm": 0.03597967684758823, + "language_loss": 0.87670606, + "learning_rate": 0.0008436156804484148, + "loss": 0.88726676, + "num_input_tokens_seen": 120823520, + "router_z_loss_mlp": 0.4777832, + "step": 1461, + "time_per_iteration": 2.7725627422332764 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054255, + "balance_loss_mlp": 1.00657165, + "epoch": 0.28126202385532895, + "flos": 455687170560.0, + "grad_norm": 0.0394598317615188, + "language_loss": 0.89263237, + "learning_rate": 0.0008433892973982031, + "loss": 0.90317494, + "num_input_tokens_seen": 120889568, + "router_z_loss_mlp": 0.4765625, + "step": 1462, + "time_per_iteration": 2.5091495513916016 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063928, + "balance_loss_mlp": 1.0156002, + "epoch": 0.28145440554059253, + "flos": 531739044864.0, + "grad_norm": 0.041651284680957995, + "language_loss": 0.866346, + "learning_rate": 0.0008431627810324431, + "loss": 0.87698531, + "num_input_tokens_seen": 120958480, + "router_z_loss_mlp": 0.4831543, + "step": 1463, + "time_per_iteration": 2.6705899238586426 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01056697, + "balance_loss_mlp": 1.00872695, + "epoch": 0.2816467872258561, + "flos": 453164647680.0, + "grad_norm": 0.03544245246238935, + "language_loss": 0.81977493, + "learning_rate": 0.000842936131439076, + "loss": 0.83034194, + "num_input_tokens_seen": 121028032, + "router_z_loss_mlp": 0.47949219, + "step": 1464, + "time_per_iteration": 2.610419511795044 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055364, + "balance_loss_mlp": 1.00763226, + "epoch": 0.28183916891111965, + "flos": 473705467392.0, + "grad_norm": 0.034609246408770326, + "language_loss": 0.89094436, + "learning_rate": 0.0008427093487060951, + "loss": 0.90149802, + "num_input_tokens_seen": 121099280, + "router_z_loss_mlp": 0.47705078, + "step": 1465, + "time_per_iteration": 2.72540283203125 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054608, + "balance_loss_mlp": 1.00656629, + "epoch": 0.28203155059638324, + "flos": 558189080064.0, + "grad_norm": 0.02738603689522664, + "language_loss": 0.8552286, + "learning_rate": 0.000842482432921545, + "loss": 0.86577463, + "num_input_tokens_seen": 121180240, + "router_z_loss_mlp": 0.48022461, + "step": 1466, + "time_per_iteration": 2.8388257026672363 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105456, + "balance_loss_mlp": 1.00654304, + "epoch": 0.28222393228164677, + "flos": 417879462912.0, + "grad_norm": 0.03402242241185157, + "language_loss": 0.88381398, + "learning_rate": 0.0008422553841735225, + "loss": 0.89435959, + "num_input_tokens_seen": 121242736, + "router_z_loss_mlp": 0.47998047, + "step": 1467, + "time_per_iteration": 2.495126485824585 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01057213, + "balance_loss_mlp": 1.00917137, + "epoch": 0.28241631396691036, + "flos": 606041192448.0, + "grad_norm": 0.032675143321136885, + "language_loss": 0.86003613, + "learning_rate": 0.0008420282025501757, + "loss": 0.87060827, + "num_input_tokens_seen": 121319248, + "router_z_loss_mlp": 0.48022461, + "step": 1468, + "time_per_iteration": 2.7908880710601807 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052419, + "balance_loss_mlp": 1.00473487, + "epoch": 0.2826086956521739, + "flos": 574051326720.0, + "grad_norm": 0.03300906221563125, + "language_loss": 0.86686498, + "learning_rate": 0.0008418008881397043, + "loss": 0.87738919, + "num_input_tokens_seen": 121392064, + "router_z_loss_mlp": 0.4765625, + "step": 1469, + "time_per_iteration": 2.7646520137786865 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054478, + "balance_loss_mlp": 1.00693762, + "epoch": 0.2828010773374375, + "flos": 844319954688.0, + "grad_norm": 0.03195966631281891, + "language_loss": 0.84124947, + "learning_rate": 0.0008415734410303595, + "loss": 0.85179424, + "num_input_tokens_seen": 121475984, + "router_z_loss_mlp": 0.47509766, + "step": 1470, + "time_per_iteration": 3.1784656047821045 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059132, + "balance_loss_mlp": 1.01151943, + "epoch": 0.28299345902270107, + "flos": 543772303872.0, + "grad_norm": 0.0307788797974712, + "language_loss": 0.91781342, + "learning_rate": 0.0008413458613104444, + "loss": 0.92840481, + "num_input_tokens_seen": 121551024, + "router_z_loss_mlp": 0.47583008, + "step": 1471, + "time_per_iteration": 2.7000675201416016 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01057543, + "balance_loss_mlp": 1.00995505, + "epoch": 0.2831858407079646, + "flos": 572755555584.0, + "grad_norm": 0.03187726406761503, + "language_loss": 0.84024346, + "learning_rate": 0.0008411181490683129, + "loss": 0.85081899, + "num_input_tokens_seen": 121624528, + "router_z_loss_mlp": 0.47558594, + "step": 1472, + "time_per_iteration": 2.7358603477478027 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105744, + "balance_loss_mlp": 1.00958943, + "epoch": 0.2833782223932282, + "flos": 765172038144.0, + "grad_norm": 0.03258814259190176, + "language_loss": 0.83765668, + "learning_rate": 0.0008408903043923707, + "loss": 0.84823108, + "num_input_tokens_seen": 121706736, + "router_z_loss_mlp": 0.47827148, + "step": 1473, + "time_per_iteration": 3.016690492630005 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060961, + "balance_loss_mlp": 1.01291955, + "epoch": 0.2835706040784917, + "flos": 540088157952.0, + "grad_norm": 0.03783140599229066, + "language_loss": 0.82463539, + "learning_rate": 0.0008406623273710754, + "loss": 0.83524501, + "num_input_tokens_seen": 121773008, + "router_z_loss_mlp": 0.48022461, + "step": 1474, + "time_per_iteration": 2.651932954788208 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055359, + "balance_loss_mlp": 1.00736535, + "epoch": 0.2837629857637553, + "flos": 531654474240.0, + "grad_norm": 0.03425671969493541, + "language_loss": 0.84354198, + "learning_rate": 0.0008404342180929351, + "loss": 0.85409558, + "num_input_tokens_seen": 121840016, + "router_z_loss_mlp": 0.47973633, + "step": 1475, + "time_per_iteration": 2.6064491271972656 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105922, + "balance_loss_mlp": 1.01120257, + "epoch": 0.28395536744901884, + "flos": 541110775296.0, + "grad_norm": 0.03564784056716401, + "language_loss": 0.8245163, + "learning_rate": 0.00084020597664651, + "loss": 0.83510846, + "num_input_tokens_seen": 121915008, + "router_z_loss_mlp": 0.47998047, + "step": 1476, + "time_per_iteration": 2.7597527503967285 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01056946, + "balance_loss_mlp": 1.00890458, + "epoch": 0.2841477491342824, + "flos": 574802735616.0, + "grad_norm": 0.037292940254278956, + "language_loss": 0.8496412, + "learning_rate": 0.0008399776031204111, + "loss": 0.86021066, + "num_input_tokens_seen": 121987456, + "router_z_loss_mlp": 0.48022461, + "step": 1477, + "time_per_iteration": 2.759089231491089 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051956, + "balance_loss_mlp": 1.00412941, + "epoch": 0.28434013081954596, + "flos": 573139524864.0, + "grad_norm": 0.03522410712402375, + "language_loss": 0.80955458, + "learning_rate": 0.0008397490976033009, + "loss": 0.8200742, + "num_input_tokens_seen": 122058720, + "router_z_loss_mlp": 0.47802734, + "step": 1478, + "time_per_iteration": 2.6423845291137695 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01056133, + "balance_loss_mlp": 1.0100708, + "epoch": 0.28453251250480954, + "flos": 1556676481536.0, + "grad_norm": 0.010218347035897045, + "language_loss": 0.77879643, + "learning_rate": 0.000839520460183893, + "loss": 0.78935778, + "num_input_tokens_seen": 122285792, + "router_z_loss_mlp": 0.45996094, + "step": 1479, + "time_per_iteration": 4.732174396514893 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053334, + "balance_loss_mlp": 1.0056026, + "epoch": 0.28472489419007313, + "flos": 750427673088.0, + "grad_norm": 0.028762601306014927, + "language_loss": 0.86263019, + "learning_rate": 0.0008392916909509525, + "loss": 0.87316358, + "num_input_tokens_seen": 122366608, + "router_z_loss_mlp": 0.47705078, + "step": 1480, + "time_per_iteration": 3.0842366218566895 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105593, + "balance_loss_mlp": 1.00817478, + "epoch": 0.28491727587533666, + "flos": 491139551232.0, + "grad_norm": 0.03654292068957682, + "language_loss": 0.86134857, + "learning_rate": 0.0008390627899932954, + "loss": 0.87190789, + "num_input_tokens_seen": 122435536, + "router_z_loss_mlp": 0.47729492, + "step": 1481, + "time_per_iteration": 2.615267753601074 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053309, + "balance_loss_mlp": 1.0055064, + "epoch": 0.28510965756060025, + "flos": 730360250880.0, + "grad_norm": 0.03257927187729683, + "language_loss": 0.89633858, + "learning_rate": 0.000838833757399789, + "loss": 0.90687168, + "num_input_tokens_seen": 122515584, + "router_z_loss_mlp": 0.4777832, + "step": 1482, + "time_per_iteration": 2.9428212642669678 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053301, + "balance_loss_mlp": 1.00528359, + "epoch": 0.2853020392458638, + "flos": 552670636800.0, + "grad_norm": 0.036455185890550544, + "language_loss": 0.82055122, + "learning_rate": 0.0008386045932593515, + "loss": 0.83108419, + "num_input_tokens_seen": 122585552, + "router_z_loss_mlp": 0.47998047, + "step": 1483, + "time_per_iteration": 2.724045991897583 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052204, + "balance_loss_mlp": 1.00416255, + "epoch": 0.28549442093112737, + "flos": 756097761024.0, + "grad_norm": 0.02777472605390161, + "language_loss": 0.8718375, + "learning_rate": 0.0008383752976609525, + "loss": 0.8823595, + "num_input_tokens_seen": 122658928, + "router_z_loss_mlp": 0.48022461, + "step": 1484, + "time_per_iteration": 2.929905891418457 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054156, + "balance_loss_mlp": 1.00618601, + "epoch": 0.2856868026163909, + "flos": 539704188672.0, + "grad_norm": 0.028392575187028035, + "language_loss": 0.8111921, + "learning_rate": 0.0008381458706936123, + "loss": 0.82173365, + "num_input_tokens_seen": 122729056, + "router_z_loss_mlp": 0.47949219, + "step": 1485, + "time_per_iteration": 2.717545986175537 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053651, + "balance_loss_mlp": 1.00563323, + "epoch": 0.2858791843016545, + "flos": 584921017344.0, + "grad_norm": 0.03333139148622456, + "language_loss": 0.88664746, + "learning_rate": 0.0008379163124464025, + "loss": 0.8971839, + "num_input_tokens_seen": 122802832, + "router_z_loss_mlp": 0.47998047, + "step": 1486, + "time_per_iteration": 2.7234747409820557 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054605, + "balance_loss_mlp": 1.00685012, + "epoch": 0.286071565986918, + "flos": 646052582400.0, + "grad_norm": 0.03454926432429506, + "language_loss": 0.77946562, + "learning_rate": 0.0008376866230084452, + "loss": 0.79001164, + "num_input_tokens_seen": 122881328, + "router_z_loss_mlp": 0.47729492, + "step": 1487, + "time_per_iteration": 2.856128692626953 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105205, + "balance_loss_mlp": 1.00408018, + "epoch": 0.2862639476721816, + "flos": 492331309824.0, + "grad_norm": 0.034661288064865674, + "language_loss": 0.87705112, + "learning_rate": 0.000837456802468914, + "loss": 0.88757157, + "num_input_tokens_seen": 122949680, + "router_z_loss_mlp": 0.47949219, + "step": 1488, + "time_per_iteration": 2.57454514503479 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054997, + "balance_loss_mlp": 1.00700366, + "epoch": 0.2864563293574452, + "flos": 522745447680.0, + "grad_norm": 0.035472984165373166, + "language_loss": 0.86247557, + "learning_rate": 0.0008372268509170331, + "loss": 0.87302554, + "num_input_tokens_seen": 123024736, + "router_z_loss_mlp": 0.47973633, + "step": 1489, + "time_per_iteration": 2.661430597305298 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105259, + "balance_loss_mlp": 1.00452483, + "epoch": 0.2866487110427087, + "flos": 548257436160.0, + "grad_norm": 0.03357077125927176, + "language_loss": 0.85950172, + "learning_rate": 0.0008369967684420779, + "loss": 0.8700276, + "num_input_tokens_seen": 123097344, + "router_z_loss_mlp": 0.48046875, + "step": 1490, + "time_per_iteration": 2.703200101852417 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052654, + "balance_loss_mlp": 1.0047555, + "epoch": 0.2868410927279723, + "flos": 483218148864.0, + "grad_norm": 0.03511930922286833, + "language_loss": 0.8567192, + "learning_rate": 0.0008367665551333736, + "loss": 0.86724567, + "num_input_tokens_seen": 123166240, + "router_z_loss_mlp": 0.47875977, + "step": 1491, + "time_per_iteration": 2.6027045249938965 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051173, + "balance_loss_mlp": 1.00334597, + "epoch": 0.28703347441323585, + "flos": 726137578752.0, + "grad_norm": 0.03668604763704844, + "language_loss": 0.86648476, + "learning_rate": 0.0008365362110802977, + "loss": 0.87699652, + "num_input_tokens_seen": 123238160, + "router_z_loss_mlp": 0.47802734, + "step": 1492, + "time_per_iteration": 2.872743606567383 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054184, + "balance_loss_mlp": 1.00630987, + "epoch": 0.28722585609849943, + "flos": 636214257408.0, + "grad_norm": 0.0346446819062503, + "language_loss": 0.83264536, + "learning_rate": 0.0008363057363722773, + "loss": 0.84318721, + "num_input_tokens_seen": 123319504, + "router_z_loss_mlp": 0.47851562, + "step": 1493, + "time_per_iteration": 2.830925941467285 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055811, + "balance_loss_mlp": 1.00827014, + "epoch": 0.28741823778376296, + "flos": 511252660224.0, + "grad_norm": 0.03541460771255837, + "language_loss": 0.8481909, + "learning_rate": 0.0008360751310987906, + "loss": 0.85874903, + "num_input_tokens_seen": 123387008, + "router_z_loss_mlp": 0.47509766, + "step": 1494, + "time_per_iteration": 2.6102633476257324 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055778, + "balance_loss_mlp": 1.00840437, + "epoch": 0.28761061946902655, + "flos": 604932059136.0, + "grad_norm": 0.030521465086419404, + "language_loss": 0.86298919, + "learning_rate": 0.0008358443953493666, + "loss": 0.87354696, + "num_input_tokens_seen": 123471056, + "router_z_loss_mlp": 0.47338867, + "step": 1495, + "time_per_iteration": 2.8808648586273193 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053186, + "balance_loss_mlp": 1.00590765, + "epoch": 0.28780300115429014, + "flos": 408060579840.0, + "grad_norm": 0.03760103829607362, + "language_loss": 0.89352167, + "learning_rate": 0.0008356135292135851, + "loss": 0.90405357, + "num_input_tokens_seen": 123535024, + "router_z_loss_mlp": 0.47241211, + "step": 1496, + "time_per_iteration": 2.5025811195373535 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055101, + "balance_loss_mlp": 1.00794196, + "epoch": 0.28799538283955367, + "flos": 375745070592.0, + "grad_norm": 0.04396673202836768, + "language_loss": 0.93575335, + "learning_rate": 0.0008353825327810758, + "loss": 0.94630432, + "num_input_tokens_seen": 123596224, + "router_z_loss_mlp": 0.47119141, + "step": 1497, + "time_per_iteration": 2.4455389976501465 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053362, + "balance_loss_mlp": 1.00601161, + "epoch": 0.28818776452481726, + "flos": 593020309248.0, + "grad_norm": 0.03575929377279749, + "language_loss": 0.82620615, + "learning_rate": 0.00083515140614152, + "loss": 0.83673978, + "num_input_tokens_seen": 123668640, + "router_z_loss_mlp": 0.47314453, + "step": 1498, + "time_per_iteration": 2.7318496704101562 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059657, + "balance_loss_mlp": 1.01204443, + "epoch": 0.2883801462100808, + "flos": 536104613376.0, + "grad_norm": 0.03408677708994041, + "language_loss": 0.8771323, + "learning_rate": 0.0008349201493846485, + "loss": 0.88772887, + "num_input_tokens_seen": 123740816, + "router_z_loss_mlp": 0.47583008, + "step": 1499, + "time_per_iteration": 2.671473503112793 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105332, + "balance_loss_mlp": 1.00606573, + "epoch": 0.2885725278953444, + "flos": 481077649920.0, + "grad_norm": 0.037679681148910335, + "language_loss": 0.90198493, + "learning_rate": 0.0008346887626002432, + "loss": 0.91251814, + "num_input_tokens_seen": 123805968, + "router_z_loss_mlp": 0.47216797, + "step": 1500, + "time_per_iteration": 2.565556287765503 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050278, + "balance_loss_mlp": 1.00290418, + "epoch": 0.2887649095806079, + "flos": 465030710784.0, + "grad_norm": 0.03453406345592784, + "language_loss": 0.87256986, + "learning_rate": 0.000834457245878137, + "loss": 0.88307267, + "num_input_tokens_seen": 123876576, + "router_z_loss_mlp": 0.47338867, + "step": 1501, + "time_per_iteration": 2.6684980392456055 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051416, + "balance_loss_mlp": 1.00411427, + "epoch": 0.2889572912658715, + "flos": 932641303296.0, + "grad_norm": 0.034149555340210275, + "language_loss": 0.82079703, + "learning_rate": 0.000834225599308212, + "loss": 0.83131123, + "num_input_tokens_seen": 123967664, + "router_z_loss_mlp": 0.47265625, + "step": 1502, + "time_per_iteration": 3.2747607231140137 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052615, + "balance_loss_mlp": 1.00526536, + "epoch": 0.28914967295113503, + "flos": 571257595392.0, + "grad_norm": 0.03426641952710734, + "language_loss": 0.85934782, + "learning_rate": 0.0008339938229804016, + "loss": 0.869874, + "num_input_tokens_seen": 124039680, + "router_z_loss_mlp": 0.47314453, + "step": 1503, + "time_per_iteration": 2.7027056217193604 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062946, + "balance_loss_mlp": 1.01783752, + "epoch": 0.2893420546363986, + "flos": 1489874828544.0, + "grad_norm": 0.016861580481692767, + "language_loss": 0.75434822, + "learning_rate": 0.0008337619169846895, + "loss": 0.76497769, + "num_input_tokens_seen": 124278848, + "router_z_loss_mlp": 0.45019531, + "step": 1504, + "time_per_iteration": 4.9503560066223145 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010625, + "balance_loss_mlp": 1.01536465, + "epoch": 0.2895344363216622, + "flos": 471182944512.0, + "grad_norm": 0.04276572481675365, + "language_loss": 0.8589167, + "learning_rate": 0.0008335298814111094, + "loss": 0.86954165, + "num_input_tokens_seen": 124346736, + "router_z_loss_mlp": 0.47094727, + "step": 1505, + "time_per_iteration": 2.548398017883301 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063653, + "balance_loss_mlp": 1.01654112, + "epoch": 0.28972681800692573, + "flos": 649341098496.0, + "grad_norm": 0.03572405467889404, + "language_loss": 0.89211309, + "learning_rate": 0.0008332977163497455, + "loss": 0.90274966, + "num_input_tokens_seen": 124420816, + "router_z_loss_mlp": 0.47070312, + "step": 1506, + "time_per_iteration": 2.786355972290039 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059474, + "balance_loss_mlp": 1.01241064, + "epoch": 0.2899191996921893, + "flos": 573306720768.0, + "grad_norm": 0.03560254091063293, + "language_loss": 0.84471554, + "learning_rate": 0.0008330654218907325, + "loss": 0.85531026, + "num_input_tokens_seen": 124490480, + "router_z_loss_mlp": 0.47021484, + "step": 1507, + "time_per_iteration": 2.706066131591797 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054224, + "balance_loss_mlp": 1.00701702, + "epoch": 0.29011158137745285, + "flos": 662638047744.0, + "grad_norm": 0.03364876986368613, + "language_loss": 0.82771999, + "learning_rate": 0.0008328329981242548, + "loss": 0.8382622, + "num_input_tokens_seen": 124564960, + "router_z_loss_mlp": 0.47167969, + "step": 1508, + "time_per_iteration": 2.9025378227233887 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053376, + "balance_loss_mlp": 1.00607395, + "epoch": 0.29030396306271644, + "flos": 537403296768.0, + "grad_norm": 0.0314370875382877, + "language_loss": 0.88638061, + "learning_rate": 0.0008326004451405475, + "loss": 0.89691436, + "num_input_tokens_seen": 124637424, + "router_z_loss_mlp": 0.47265625, + "step": 1509, + "time_per_iteration": 2.740288496017456 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091124, + "balance_loss_mlp": 1.04370284, + "epoch": 0.29049634474798, + "flos": 512956700160.0, + "grad_norm": 0.04021928954994292, + "language_loss": 0.83711147, + "learning_rate": 0.0008323677630298957, + "loss": 0.84802264, + "num_input_tokens_seen": 124704832, + "router_z_loss_mlp": 0.47387695, + "step": 1510, + "time_per_iteration": 2.5700840950012207 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01056321, + "balance_loss_mlp": 1.00935256, + "epoch": 0.29068872643324356, + "flos": 614983266816.0, + "grad_norm": 0.03498537298994642, + "language_loss": 0.86212677, + "learning_rate": 0.0008321349518826345, + "loss": 0.87268996, + "num_input_tokens_seen": 124779600, + "router_z_loss_mlp": 0.46923828, + "step": 1511, + "time_per_iteration": 2.7968146800994873 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060609, + "balance_loss_mlp": 1.01364064, + "epoch": 0.2908811081185071, + "flos": 547469088768.0, + "grad_norm": 0.03734404843374857, + "language_loss": 0.95525789, + "learning_rate": 0.0008319020117891491, + "loss": 0.96586394, + "num_input_tokens_seen": 124844128, + "router_z_loss_mlp": 0.46923828, + "step": 1512, + "time_per_iteration": 2.646127939224243 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01058015, + "balance_loss_mlp": 1.01107061, + "epoch": 0.2910734898037707, + "flos": 605902186752.0, + "grad_norm": 0.03463533015087841, + "language_loss": 0.88378417, + "learning_rate": 0.0008316689428398751, + "loss": 0.89436436, + "num_input_tokens_seen": 124915376, + "router_z_loss_mlp": 0.46899414, + "step": 1513, + "time_per_iteration": 2.7310631275177 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01056228, + "balance_loss_mlp": 1.00935447, + "epoch": 0.29126587148903427, + "flos": 575836046592.0, + "grad_norm": 0.028150288904366032, + "language_loss": 0.89498413, + "learning_rate": 0.0008314357451252979, + "loss": 0.90554643, + "num_input_tokens_seen": 124995504, + "router_z_loss_mlp": 0.46826172, + "step": 1514, + "time_per_iteration": 2.8262994289398193 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054124, + "balance_loss_mlp": 1.00727487, + "epoch": 0.2914582531742978, + "flos": 572134404096.0, + "grad_norm": 0.05354948204009119, + "language_loss": 0.89001274, + "learning_rate": 0.0008312024187359527, + "loss": 0.90055394, + "num_input_tokens_seen": 125064192, + "router_z_loss_mlp": 0.46801758, + "step": 1515, + "time_per_iteration": 2.717780590057373 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105823, + "balance_loss_mlp": 1.01109469, + "epoch": 0.2916506348595614, + "flos": 732303418368.0, + "grad_norm": 0.032865630858266236, + "language_loss": 0.8831327, + "learning_rate": 0.000830968963762425, + "loss": 0.89371502, + "num_input_tokens_seen": 125150560, + "router_z_loss_mlp": 0.47094727, + "step": 1516, + "time_per_iteration": 3.080526828765869 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051181, + "balance_loss_mlp": 1.00383127, + "epoch": 0.2918430165448249, + "flos": 511467488256.0, + "grad_norm": 0.032871242995291323, + "language_loss": 0.84882748, + "learning_rate": 0.0008307353802953497, + "loss": 0.85933936, + "num_input_tokens_seen": 125219264, + "router_z_loss_mlp": 0.47314453, + "step": 1517, + "time_per_iteration": 2.744476318359375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084976, + "balance_loss_mlp": 1.03726828, + "epoch": 0.2920353982300885, + "flos": 631607616000.0, + "grad_norm": 0.03594729450056152, + "language_loss": 0.86997348, + "learning_rate": 0.0008305016684254125, + "loss": 0.88082325, + "num_input_tokens_seen": 125301904, + "router_z_loss_mlp": 0.47680664, + "step": 1518, + "time_per_iteration": 2.8340506553649902 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047384, + "balance_loss_mlp": 1.00001049, + "epoch": 0.29222777991535204, + "flos": 502671222528.0, + "grad_norm": 0.03192476620539529, + "language_loss": 0.87901479, + "learning_rate": 0.0008302678282433479, + "loss": 0.88948864, + "num_input_tokens_seen": 125367712, + "router_z_loss_mlp": 0.47338867, + "step": 1519, + "time_per_iteration": 2.5783281326293945 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048912, + "balance_loss_mlp": 1.00177681, + "epoch": 0.2924201616006156, + "flos": 487842286848.0, + "grad_norm": 0.03491462978028735, + "language_loss": 0.85667795, + "learning_rate": 0.0008300338598399411, + "loss": 0.86716712, + "num_input_tokens_seen": 125437648, + "router_z_loss_mlp": 0.47094727, + "step": 1520, + "time_per_iteration": 2.6763737201690674 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105218, + "balance_loss_mlp": 1.0049969, + "epoch": 0.2926125432858792, + "flos": 477411000576.0, + "grad_norm": 0.036990289889529016, + "language_loss": 0.957196, + "learning_rate": 0.0008297997633060263, + "loss": 0.96771777, + "num_input_tokens_seen": 125502432, + "router_z_loss_mlp": 0.47143555, + "step": 1521, + "time_per_iteration": 2.5368785858154297 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055222, + "balance_loss_mlp": 1.00799167, + "epoch": 0.29280492497114274, + "flos": 677868449280.0, + "grad_norm": 0.0362418142607002, + "language_loss": 0.86058486, + "learning_rate": 0.0008295655387324883, + "loss": 0.87113714, + "num_input_tokens_seen": 125575424, + "router_z_loss_mlp": 0.47192383, + "step": 1522, + "time_per_iteration": 2.8447062969207764 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055265, + "balance_loss_mlp": 1.0079869, + "epoch": 0.29299730665640633, + "flos": 459345071616.0, + "grad_norm": 0.03782463739456531, + "language_loss": 0.86245579, + "learning_rate": 0.0008293311862102609, + "loss": 0.87300849, + "num_input_tokens_seen": 125639040, + "router_z_loss_mlp": 0.47241211, + "step": 1523, + "time_per_iteration": 2.5397908687591553 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050568, + "balance_loss_mlp": 1.00328994, + "epoch": 0.29318968834166986, + "flos": 447496505088.0, + "grad_norm": 0.03500221637525105, + "language_loss": 0.90103561, + "learning_rate": 0.0008290967058303275, + "loss": 0.91154128, + "num_input_tokens_seen": 125701712, + "router_z_loss_mlp": 0.47241211, + "step": 1524, + "time_per_iteration": 2.4784419536590576 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081064, + "balance_loss_mlp": 1.03349924, + "epoch": 0.29338207002693345, + "flos": 451256473344.0, + "grad_norm": 0.038529021386844775, + "language_loss": 0.87365985, + "learning_rate": 0.0008288620976837219, + "loss": 0.88447046, + "num_input_tokens_seen": 125765088, + "router_z_loss_mlp": 0.4753418, + "step": 1525, + "time_per_iteration": 2.540762424468994 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054383, + "balance_loss_mlp": 1.00684249, + "epoch": 0.293574451712197, + "flos": 503285571072.0, + "grad_norm": 0.03477645959362119, + "language_loss": 0.8372373, + "learning_rate": 0.000828627361861527, + "loss": 0.84778112, + "num_input_tokens_seen": 125831328, + "router_z_loss_mlp": 0.47509766, + "step": 1526, + "time_per_iteration": 2.583862066268921 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01058639, + "balance_loss_mlp": 1.01124167, + "epoch": 0.29376683339746057, + "flos": 697684104960.0, + "grad_norm": 0.03858140978476568, + "language_loss": 0.85503912, + "learning_rate": 0.0008283924984548752, + "loss": 0.8656255, + "num_input_tokens_seen": 125903664, + "router_z_loss_mlp": 0.47363281, + "step": 1527, + "time_per_iteration": 2.848947525024414 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054231, + "balance_loss_mlp": 1.00680923, + "epoch": 0.2939592150827241, + "flos": 479542751232.0, + "grad_norm": 0.03208252397749005, + "language_loss": 0.8577444, + "learning_rate": 0.0008281575075549485, + "loss": 0.86828673, + "num_input_tokens_seen": 125971856, + "router_z_loss_mlp": 0.47387695, + "step": 1528, + "time_per_iteration": 2.6076998710632324 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063099, + "balance_loss_mlp": 1.01703644, + "epoch": 0.2941515967679877, + "flos": 1488389507328.0, + "grad_norm": 0.010941905571601225, + "language_loss": 0.77352691, + "learning_rate": 0.000827922389252979, + "loss": 0.78415793, + "num_input_tokens_seen": 126183968, + "router_z_loss_mlp": 0.45996094, + "step": 1529, + "time_per_iteration": 4.672811508178711 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01175133, + "balance_loss_mlp": 1.12690103, + "epoch": 0.2943439784532513, + "flos": 675400361472.0, + "grad_norm": 0.05299717257038309, + "language_loss": 0.90924174, + "learning_rate": 0.0008276871436402469, + "loss": 0.92099309, + "num_input_tokens_seen": 126254448, + "router_z_loss_mlp": 0.48217773, + "step": 1530, + "time_per_iteration": 2.8220977783203125 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010581, + "balance_loss_mlp": 1.01096439, + "epoch": 0.2945363601385148, + "flos": 577383584256.0, + "grad_norm": 0.03620573442946411, + "language_loss": 0.88955015, + "learning_rate": 0.000827451770808083, + "loss": 0.90013111, + "num_input_tokens_seen": 126328208, + "router_z_loss_mlp": 0.47094727, + "step": 1531, + "time_per_iteration": 2.6981046199798584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01057368, + "balance_loss_mlp": 1.01013768, + "epoch": 0.2947287418237784, + "flos": 481618121472.0, + "grad_norm": 0.03382548660060083, + "language_loss": 0.84345412, + "learning_rate": 0.0008272162708478674, + "loss": 0.85402787, + "num_input_tokens_seen": 126396464, + "router_z_loss_mlp": 0.47192383, + "step": 1532, + "time_per_iteration": 2.5975306034088135 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01058676, + "balance_loss_mlp": 1.01151645, + "epoch": 0.2949211235090419, + "flos": 559261274880.0, + "grad_norm": 0.03154442800865326, + "language_loss": 0.87544608, + "learning_rate": 0.000826980643851029, + "loss": 0.88603282, + "num_input_tokens_seen": 126468960, + "router_z_loss_mlp": 0.47119141, + "step": 1533, + "time_per_iteration": 2.6889007091522217 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063865, + "balance_loss_mlp": 1.01675379, + "epoch": 0.2951135051943055, + "flos": 484857060096.0, + "grad_norm": 0.03876668067992812, + "language_loss": 0.85914761, + "learning_rate": 0.0008267448899090464, + "loss": 0.86978626, + "num_input_tokens_seen": 126536496, + "router_z_loss_mlp": 0.47070312, + "step": 1534, + "time_per_iteration": 2.5630924701690674 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062291, + "balance_loss_mlp": 1.01498842, + "epoch": 0.29530588687956905, + "flos": 551422497792.0, + "grad_norm": 0.034923849251574525, + "language_loss": 0.81812191, + "learning_rate": 0.0008265090091134473, + "loss": 0.82874477, + "num_input_tokens_seen": 126614048, + "router_z_loss_mlp": 0.47265625, + "step": 1535, + "time_per_iteration": 2.8399465084075928 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105968, + "balance_loss_mlp": 1.01235437, + "epoch": 0.29549826856483263, + "flos": 674310670080.0, + "grad_norm": 0.028029616611284485, + "language_loss": 0.80873084, + "learning_rate": 0.0008262730015558088, + "loss": 0.81932771, + "num_input_tokens_seen": 126697248, + "router_z_loss_mlp": 0.47290039, + "step": 1536, + "time_per_iteration": 2.874537944793701 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059174, + "balance_loss_mlp": 1.01151371, + "epoch": 0.29569065025009617, + "flos": 766136329728.0, + "grad_norm": 0.03177117147053012, + "language_loss": 0.82803708, + "learning_rate": 0.0008260368673277574, + "loss": 0.83862883, + "num_input_tokens_seen": 126782496, + "router_z_loss_mlp": 0.47631836, + "step": 1537, + "time_per_iteration": 3.0976641178131104 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053728, + "balance_loss_mlp": 1.00573432, + "epoch": 0.29588303193535975, + "flos": 544831859712.0, + "grad_norm": 0.031452220479770684, + "language_loss": 0.84814745, + "learning_rate": 0.0008258006065209682, + "loss": 0.85868478, + "num_input_tokens_seen": 126857328, + "router_z_loss_mlp": 0.47973633, + "step": 1538, + "time_per_iteration": 2.7704694271087646 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115804, + "balance_loss_mlp": 1.06735778, + "epoch": 0.29607541362062334, + "flos": 598146034944.0, + "grad_norm": 0.04896094729194987, + "language_loss": 0.81966412, + "learning_rate": 0.0008255642192271657, + "loss": 0.83082211, + "num_input_tokens_seen": 126932608, + "router_z_loss_mlp": 0.484375, + "step": 1539, + "time_per_iteration": 2.774122714996338 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059901, + "balance_loss_mlp": 1.01219356, + "epoch": 0.29626779530588687, + "flos": 611038606080.0, + "grad_norm": 0.02837345788652225, + "language_loss": 0.84628069, + "learning_rate": 0.0008253277055381241, + "loss": 0.85687971, + "num_input_tokens_seen": 127008928, + "router_z_loss_mlp": 0.47680664, + "step": 1540, + "time_per_iteration": 2.837587833404541 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061212, + "balance_loss_mlp": 1.01340961, + "epoch": 0.29646017699115046, + "flos": 868959025152.0, + "grad_norm": 0.03662488769273821, + "language_loss": 0.86757702, + "learning_rate": 0.0008250910655456658, + "loss": 0.87818909, + "num_input_tokens_seen": 127097104, + "router_z_loss_mlp": 0.4777832, + "step": 1541, + "time_per_iteration": 3.123687982559204 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010574, + "balance_loss_mlp": 1.00954938, + "epoch": 0.296652558676414, + "flos": 496881570816.0, + "grad_norm": 0.03318095479066229, + "language_loss": 0.84889704, + "learning_rate": 0.0008248542993416625, + "loss": 0.85947102, + "num_input_tokens_seen": 127165264, + "router_z_loss_mlp": 0.47827148, + "step": 1542, + "time_per_iteration": 2.637747049331665 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068583, + "balance_loss_mlp": 1.02082753, + "epoch": 0.2968449403616776, + "flos": 572627243520.0, + "grad_norm": 0.03443634648546435, + "language_loss": 0.84426934, + "learning_rate": 0.0008246174070180352, + "loss": 0.8549552, + "num_input_tokens_seen": 127238992, + "router_z_loss_mlp": 0.47729492, + "step": 1543, + "time_per_iteration": 2.6872684955596924 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062899, + "balance_loss_mlp": 1.01511967, + "epoch": 0.2970373220469411, + "flos": 795651304704.0, + "grad_norm": 0.035080805136432934, + "language_loss": 0.85198414, + "learning_rate": 0.0008243803886667537, + "loss": 0.86261314, + "num_input_tokens_seen": 127328160, + "router_z_loss_mlp": 0.47753906, + "step": 1544, + "time_per_iteration": 3.13710618019104 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069403, + "balance_loss_mlp": 1.02145684, + "epoch": 0.2972297037322047, + "flos": 662249220864.0, + "grad_norm": 0.04094703338464919, + "language_loss": 0.80137819, + "learning_rate": 0.0008241432443798364, + "loss": 0.81207222, + "num_input_tokens_seen": 127407328, + "router_z_loss_mlp": 0.47924805, + "step": 1545, + "time_per_iteration": 2.841092109680176 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061565, + "balance_loss_mlp": 1.0138818, + "epoch": 0.29742208541746823, + "flos": 598232550912.0, + "grad_norm": 0.028624248431763765, + "language_loss": 0.86072361, + "learning_rate": 0.0008239059742493512, + "loss": 0.87133932, + "num_input_tokens_seen": 127477136, + "router_z_loss_mlp": 0.4765625, + "step": 1546, + "time_per_iteration": 2.7034194469451904 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01349258, + "balance_loss_mlp": 1.29957151, + "epoch": 0.2976144671027318, + "flos": 771339823104.0, + "grad_norm": 0.07377893489124947, + "language_loss": 0.88059306, + "learning_rate": 0.0008236685783674142, + "loss": 0.89408565, + "num_input_tokens_seen": 127565680, + "router_z_loss_mlp": 0.49584961, + "step": 1547, + "time_per_iteration": 3.063077688217163 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071266, + "balance_loss_mlp": 1.02510834, + "epoch": 0.2978068487879954, + "flos": 1487914164480.0, + "grad_norm": 0.01225569795264997, + "language_loss": 0.76221192, + "learning_rate": 0.0008234310568261911, + "loss": 0.7729246, + "num_input_tokens_seen": 127791584, + "router_z_loss_mlp": 0.4609375, + "step": 1548, + "time_per_iteration": 4.894561767578125 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073136, + "balance_loss_mlp": 1.02564275, + "epoch": 0.29799923047325894, + "flos": 476330057472.0, + "grad_norm": 0.041178192237982324, + "language_loss": 0.84313369, + "learning_rate": 0.0008231934097178955, + "loss": 0.85386503, + "num_input_tokens_seen": 127860112, + "router_z_loss_mlp": 0.47460938, + "step": 1549, + "time_per_iteration": 2.630146026611328 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081209, + "balance_loss_mlp": 1.03362012, + "epoch": 0.2981916121585225, + "flos": 761169051648.0, + "grad_norm": 0.037198017460407115, + "language_loss": 0.86745787, + "learning_rate": 0.0008229556371347903, + "loss": 0.87826997, + "num_input_tokens_seen": 127938752, + "router_z_loss_mlp": 0.47558594, + "step": 1550, + "time_per_iteration": 2.9614980220794678 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081757, + "balance_loss_mlp": 1.03416848, + "epoch": 0.29838399384378606, + "flos": 876517845504.0, + "grad_norm": 0.043512769843104544, + "language_loss": 0.80808616, + "learning_rate": 0.0008227177391691874, + "loss": 0.81890368, + "num_input_tokens_seen": 128022192, + "router_z_loss_mlp": 0.47558594, + "step": 1551, + "time_per_iteration": 3.11059832572937 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081803, + "balance_loss_mlp": 1.03445339, + "epoch": 0.29857637552904964, + "flos": 580752780288.0, + "grad_norm": 0.039547132323558824, + "language_loss": 0.90871334, + "learning_rate": 0.0008224797159134463, + "loss": 0.91953135, + "num_input_tokens_seen": 128097776, + "router_z_loss_mlp": 0.47314453, + "step": 1552, + "time_per_iteration": 2.7177717685699463 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077357, + "balance_loss_mlp": 1.03026903, + "epoch": 0.2987687572143132, + "flos": 837809029632.0, + "grad_norm": 0.03288289742732326, + "language_loss": 0.84735203, + "learning_rate": 0.0008222415674599765, + "loss": 0.85812569, + "num_input_tokens_seen": 128179888, + "router_z_loss_mlp": 0.47045898, + "step": 1553, + "time_per_iteration": 3.090768814086914 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072084, + "balance_loss_mlp": 1.02513897, + "epoch": 0.29896113889957676, + "flos": 568168356096.0, + "grad_norm": 0.03857517262144223, + "language_loss": 0.8489393, + "learning_rate": 0.0008220032939012349, + "loss": 0.85966009, + "num_input_tokens_seen": 128251152, + "router_z_loss_mlp": 0.46899414, + "step": 1554, + "time_per_iteration": 2.7050375938415527 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072322, + "balance_loss_mlp": 1.02554476, + "epoch": 0.29915352058484035, + "flos": 499836662016.0, + "grad_norm": 0.03341170745827686, + "language_loss": 0.89154899, + "learning_rate": 0.0008217648953297277, + "loss": 0.90227222, + "num_input_tokens_seen": 128327600, + "router_z_loss_mlp": 0.46728516, + "step": 1555, + "time_per_iteration": 2.8296022415161133 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106052, + "balance_loss_mlp": 1.01376653, + "epoch": 0.2993459022701039, + "flos": 593215695360.0, + "grad_norm": 0.042418434687241845, + "language_loss": 0.79395097, + "learning_rate": 0.0008215263718380095, + "loss": 0.80455619, + "num_input_tokens_seen": 128398432, + "router_z_loss_mlp": 0.46704102, + "step": 1556, + "time_per_iteration": 2.683760643005371 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02541041, + "balance_loss_mlp": 2.4871583, + "epoch": 0.29953828395536747, + "flos": 573473916672.0, + "grad_norm": 0.19828678552993478, + "language_loss": 0.85491472, + "learning_rate": 0.0008212877235186833, + "loss": 0.88032514, + "num_input_tokens_seen": 128469696, + "router_z_loss_mlp": 0.54003906, + "step": 1557, + "time_per_iteration": 2.6963422298431396 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086143, + "balance_loss_mlp": 1.0413208, + "epoch": 0.299730665640631, + "flos": 1508086566144.0, + "grad_norm": 0.015049722833054002, + "language_loss": 0.77737558, + "learning_rate": 0.0008210489504644005, + "loss": 0.78823709, + "num_input_tokens_seen": 128698560, + "router_z_loss_mlp": 0.44824219, + "step": 1558, + "time_per_iteration": 4.971554279327393 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098273, + "balance_loss_mlp": 1.05063736, + "epoch": 0.2999230473258946, + "flos": 514808494080.0, + "grad_norm": 0.04814176942398931, + "language_loss": 0.82249933, + "learning_rate": 0.0008208100527678611, + "loss": 0.83348203, + "num_input_tokens_seen": 128765952, + "router_z_loss_mlp": 0.47607422, + "step": 1559, + "time_per_iteration": 2.6210360527038574 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01130027, + "balance_loss_mlp": 1.08127058, + "epoch": 0.3001154290111581, + "flos": 835855168512.0, + "grad_norm": 0.05333171316141313, + "language_loss": 0.80031002, + "learning_rate": 0.0008205710305218135, + "loss": 0.81161028, + "num_input_tokens_seen": 128840048, + "router_z_loss_mlp": 0.48730469, + "step": 1560, + "time_per_iteration": 3.0021140575408936 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01168816, + "balance_loss_mlp": 1.11898673, + "epoch": 0.3003078106964217, + "flos": 557946061824.0, + "grad_norm": 0.05314988858528354, + "language_loss": 0.91578549, + "learning_rate": 0.0008203318838190541, + "loss": 0.92747366, + "num_input_tokens_seen": 128912496, + "router_z_loss_mlp": 0.49707031, + "step": 1561, + "time_per_iteration": 2.7369065284729004 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153064, + "balance_loss_mlp": 1.10247147, + "epoch": 0.30050019238168524, + "flos": 527169341952.0, + "grad_norm": 0.047834322975263, + "language_loss": 0.86778915, + "learning_rate": 0.0008200926127524281, + "loss": 0.87931979, + "num_input_tokens_seen": 128980624, + "router_z_loss_mlp": 0.50634766, + "step": 1562, + "time_per_iteration": 2.6357791423797607 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01157108, + "balance_loss_mlp": 1.10565686, + "epoch": 0.3006925740669488, + "flos": 578937924864.0, + "grad_norm": 0.04357261617021945, + "language_loss": 0.84502149, + "learning_rate": 0.0008198532174148289, + "loss": 0.85659254, + "num_input_tokens_seen": 129050576, + "router_z_loss_mlp": 0.51513672, + "step": 1563, + "time_per_iteration": 2.7241976261138916 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097195, + "balance_loss_mlp": 1.04941559, + "epoch": 0.3008849557522124, + "flos": 1493613409536.0, + "grad_norm": 0.019627167679756308, + "language_loss": 0.8068617, + "learning_rate": 0.0008196136978991977, + "loss": 0.8178336, + "num_input_tokens_seen": 129278880, + "router_z_loss_mlp": 0.47753906, + "step": 1564, + "time_per_iteration": 4.851420879364014 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122708, + "balance_loss_mlp": 1.07035148, + "epoch": 0.30107733743747594, + "flos": 510824949504.0, + "grad_norm": 0.045341503179798265, + "language_loss": 0.90611446, + "learning_rate": 0.0008193740542985244, + "loss": 0.91734147, + "num_input_tokens_seen": 129346560, + "router_z_loss_mlp": 0.52441406, + "step": 1565, + "time_per_iteration": 2.62724232673645 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113673, + "balance_loss_mlp": 1.06098223, + "epoch": 0.30126971912273953, + "flos": 588821936640.0, + "grad_norm": 0.04014967632238747, + "language_loss": 0.87587321, + "learning_rate": 0.0008191342867058467, + "loss": 0.88700998, + "num_input_tokens_seen": 129420448, + "router_z_loss_mlp": 0.52783203, + "step": 1566, + "time_per_iteration": 2.766045570373535 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01133038, + "balance_loss_mlp": 1.07991791, + "epoch": 0.30146210080800306, + "flos": 603221216256.0, + "grad_norm": 0.039455426947262194, + "language_loss": 0.84397018, + "learning_rate": 0.0008188943952142509, + "loss": 0.85530061, + "num_input_tokens_seen": 129494032, + "router_z_loss_mlp": 0.53222656, + "step": 1567, + "time_per_iteration": 2.798323154449463 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113428, + "balance_loss_mlp": 1.06030834, + "epoch": 0.30165448249326665, + "flos": 919287973632.0, + "grad_norm": 0.03836627098538091, + "language_loss": 0.83653766, + "learning_rate": 0.0008186543799168711, + "loss": 0.84767193, + "num_input_tokens_seen": 129569088, + "router_z_loss_mlp": 0.53222656, + "step": 1568, + "time_per_iteration": 3.1216585636138916 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112139, + "balance_loss_mlp": 1.0594008, + "epoch": 0.3018468641785302, + "flos": 778631325696.0, + "grad_norm": 0.037681015369085746, + "language_loss": 0.89441907, + "learning_rate": 0.0008184142409068892, + "loss": 0.90554047, + "num_input_tokens_seen": 129647968, + "router_z_loss_mlp": 0.52832031, + "step": 1569, + "time_per_iteration": 2.9987363815307617 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087962, + "balance_loss_mlp": 1.03584409, + "epoch": 0.30203924586379377, + "flos": 523389931776.0, + "grad_norm": 0.031063886155947292, + "language_loss": 0.87584674, + "learning_rate": 0.000818173978277536, + "loss": 0.88672638, + "num_input_tokens_seen": 129718928, + "router_z_loss_mlp": 0.52197266, + "step": 1570, + "time_per_iteration": 2.657801389694214 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092945, + "balance_loss_mlp": 1.04125619, + "epoch": 0.3022316275490573, + "flos": 525649994496.0, + "grad_norm": 0.03542742618693904, + "language_loss": 0.8460654, + "learning_rate": 0.000817933592122089, + "loss": 0.85699487, + "num_input_tokens_seen": 129790128, + "router_z_loss_mlp": 0.51757812, + "step": 1571, + "time_per_iteration": 2.699676752090454 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094536, + "balance_loss_mlp": 1.04289424, + "epoch": 0.3024240092343209, + "flos": 480873515520.0, + "grad_norm": 0.03710559119511486, + "language_loss": 0.84148443, + "learning_rate": 0.0008176930825338749, + "loss": 0.85242975, + "num_input_tokens_seen": 129857536, + "router_z_loss_mlp": 0.51708984, + "step": 1572, + "time_per_iteration": 2.560293197631836 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085585, + "balance_loss_mlp": 1.03446782, + "epoch": 0.3026163909195845, + "flos": 688431938304.0, + "grad_norm": 0.03769478699711506, + "language_loss": 0.89810324, + "learning_rate": 0.0008174524496062679, + "loss": 0.90895915, + "num_input_tokens_seen": 129931440, + "router_z_loss_mlp": 0.51171875, + "step": 1573, + "time_per_iteration": 2.9185256958007812 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083791, + "balance_loss_mlp": 1.03334129, + "epoch": 0.302808772604848, + "flos": 544087253760.0, + "grad_norm": 0.033203995249134796, + "language_loss": 0.86450267, + "learning_rate": 0.0008172116934326894, + "loss": 0.87534058, + "num_input_tokens_seen": 130005200, + "router_z_loss_mlp": 0.50488281, + "step": 1574, + "time_per_iteration": 2.77254056930542 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107482, + "balance_loss_mlp": 1.02456117, + "epoch": 0.3030011542901116, + "flos": 476052046080.0, + "grad_norm": 0.03232260410081742, + "language_loss": 0.88820696, + "learning_rate": 0.0008169708141066097, + "loss": 0.89895517, + "num_input_tokens_seen": 130069136, + "router_z_loss_mlp": 0.50268555, + "step": 1575, + "time_per_iteration": 2.5428524017333984 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083713, + "balance_loss_mlp": 1.03402615, + "epoch": 0.30319353597537513, + "flos": 482473542912.0, + "grad_norm": 0.035261838486320786, + "language_loss": 0.91478366, + "learning_rate": 0.0008167298117215465, + "loss": 0.92562079, + "num_input_tokens_seen": 130135456, + "router_z_loss_mlp": 0.49536133, + "step": 1576, + "time_per_iteration": 2.5388023853302 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064287, + "balance_loss_mlp": 1.0151732, + "epoch": 0.3033859176606387, + "flos": 706113897984.0, + "grad_norm": 0.033895137386355495, + "language_loss": 0.89157575, + "learning_rate": 0.0008164886863710649, + "loss": 0.90221858, + "num_input_tokens_seen": 130213712, + "router_z_loss_mlp": 0.49047852, + "step": 1577, + "time_per_iteration": 2.9326250553131104 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072249, + "balance_loss_mlp": 1.02363503, + "epoch": 0.30357829934590225, + "flos": 766110084864.0, + "grad_norm": 0.03320904121402137, + "language_loss": 0.87079322, + "learning_rate": 0.0008162474381487783, + "loss": 0.88151574, + "num_input_tokens_seen": 130290928, + "router_z_loss_mlp": 0.48608398, + "step": 1578, + "time_per_iteration": 3.0217320919036865 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069823, + "balance_loss_mlp": 1.02135277, + "epoch": 0.30377068103116583, + "flos": 533449887744.0, + "grad_norm": 0.035817825196195696, + "language_loss": 0.854909, + "learning_rate": 0.0008160060671483475, + "loss": 0.86560726, + "num_input_tokens_seen": 130362672, + "router_z_loss_mlp": 0.48461914, + "step": 1579, + "time_per_iteration": 2.6730797290802 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074874, + "balance_loss_mlp": 1.02647483, + "epoch": 0.3039630627164294, + "flos": 511224470016.0, + "grad_norm": 0.04566645575365512, + "language_loss": 0.84833682, + "learning_rate": 0.0008157645734634809, + "loss": 0.85908556, + "num_input_tokens_seen": 130428848, + "router_z_loss_mlp": 0.48388672, + "step": 1580, + "time_per_iteration": 2.5822741985321045 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01186287, + "balance_loss_mlp": 1.14089203, + "epoch": 0.30415544440169295, + "flos": 1509190841856.0, + "grad_norm": 0.045615209750242004, + "language_loss": 0.76896489, + "learning_rate": 0.000815522957187935, + "loss": 0.78082776, + "num_input_tokens_seen": 130665440, + "router_z_loss_mlp": 0.453125, + "step": 1581, + "time_per_iteration": 4.900806665420532 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01157879, + "balance_loss_mlp": 1.11257935, + "epoch": 0.30434782608695654, + "flos": 1461789772800.0, + "grad_norm": 0.04177274485031814, + "language_loss": 0.73214495, + "learning_rate": 0.0008152812184155132, + "loss": 0.74372375, + "num_input_tokens_seen": 130895248, + "router_z_loss_mlp": 0.45214844, + "step": 1582, + "time_per_iteration": 4.890560150146484 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071245, + "balance_loss_mlp": 1.02329922, + "epoch": 0.3045402077722201, + "flos": 483535044096.0, + "grad_norm": 0.03665669352532136, + "language_loss": 0.84926951, + "learning_rate": 0.000815039357240067, + "loss": 0.85998201, + "num_input_tokens_seen": 130964544, + "router_z_loss_mlp": 0.47924805, + "step": 1583, + "time_per_iteration": 2.655641555786133 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075238, + "balance_loss_mlp": 1.02695799, + "epoch": 0.30473258945748366, + "flos": 544627725312.0, + "grad_norm": 0.03699880598765725, + "language_loss": 0.86035675, + "learning_rate": 0.0008147973737554952, + "loss": 0.87110913, + "num_input_tokens_seen": 131041744, + "router_z_loss_mlp": 0.48266602, + "step": 1584, + "time_per_iteration": 2.8118185997009277 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066041, + "balance_loss_mlp": 1.01754665, + "epoch": 0.3049249711427472, + "flos": 568122669312.0, + "grad_norm": 0.039919187148179, + "language_loss": 0.86646891, + "learning_rate": 0.000814555268055744, + "loss": 0.87712932, + "num_input_tokens_seen": 131108864, + "router_z_loss_mlp": 0.48486328, + "step": 1585, + "time_per_iteration": 2.618649482727051 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067734, + "balance_loss_mlp": 1.01926374, + "epoch": 0.3051173528280108, + "flos": 529290398976.0, + "grad_norm": 0.034961032963054674, + "language_loss": 0.88066852, + "learning_rate": 0.0008143130402348073, + "loss": 0.89134592, + "num_input_tokens_seen": 131181104, + "router_z_loss_mlp": 0.48461914, + "step": 1586, + "time_per_iteration": 2.6645073890686035 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064545, + "balance_loss_mlp": 1.01593137, + "epoch": 0.3053097345132743, + "flos": 587600042496.0, + "grad_norm": 0.03198607314396223, + "language_loss": 0.79707628, + "learning_rate": 0.0008140706903867265, + "loss": 0.80772173, + "num_input_tokens_seen": 131258704, + "router_z_loss_mlp": 0.48608398, + "step": 1587, + "time_per_iteration": 2.772688150405884 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065913, + "balance_loss_mlp": 1.01706147, + "epoch": 0.3055021161985379, + "flos": 608201133312.0, + "grad_norm": 0.03820330265300666, + "language_loss": 0.90882033, + "learning_rate": 0.0008138282186055897, + "loss": 0.91947937, + "num_input_tokens_seen": 131325712, + "router_z_loss_mlp": 0.48803711, + "step": 1588, + "time_per_iteration": 2.6824429035186768 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106751, + "balance_loss_mlp": 1.01851535, + "epoch": 0.3056944978838015, + "flos": 574963128576.0, + "grad_norm": 0.03364087196891663, + "language_loss": 0.83419842, + "learning_rate": 0.0008135856249855331, + "loss": 0.84487349, + "num_input_tokens_seen": 131397568, + "router_z_loss_mlp": 0.48950195, + "step": 1589, + "time_per_iteration": 2.6829729080200195 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065757, + "balance_loss_mlp": 1.0164994, + "epoch": 0.305886879569065, + "flos": 635072076288.0, + "grad_norm": 0.036524553871552005, + "language_loss": 0.90591866, + "learning_rate": 0.0008133429096207398, + "loss": 0.91657621, + "num_input_tokens_seen": 131467632, + "router_z_loss_mlp": 0.4909668, + "step": 1590, + "time_per_iteration": 2.7734742164611816 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135399, + "balance_loss_mlp": 1.08351898, + "epoch": 0.3060792612543286, + "flos": 1372133769216.0, + "grad_norm": 0.023040785082221134, + "language_loss": 0.75312257, + "learning_rate": 0.0008131000726054403, + "loss": 0.76447666, + "num_input_tokens_seen": 131702224, + "router_z_loss_mlp": 0.51953125, + "step": 1591, + "time_per_iteration": 4.964044094085693 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106727, + "balance_loss_mlp": 1.01806068, + "epoch": 0.30627164293959214, + "flos": 519619269888.0, + "grad_norm": 0.029618090290997726, + "language_loss": 0.87174189, + "learning_rate": 0.0008128571140339123, + "loss": 0.88241458, + "num_input_tokens_seen": 131774608, + "router_z_loss_mlp": 0.49121094, + "step": 1592, + "time_per_iteration": 2.6813180446624756 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068394, + "balance_loss_mlp": 1.01942289, + "epoch": 0.3064640246248557, + "flos": 456533843712.0, + "grad_norm": 0.02963099688993501, + "language_loss": 0.87551641, + "learning_rate": 0.0008126140340004805, + "loss": 0.88620031, + "num_input_tokens_seen": 131841216, + "router_z_loss_mlp": 0.48876953, + "step": 1593, + "time_per_iteration": 2.5293447971343994 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064923, + "balance_loss_mlp": 1.01580834, + "epoch": 0.30665640631011926, + "flos": 851609511936.0, + "grad_norm": 0.028917997945976257, + "language_loss": 0.82855684, + "learning_rate": 0.0008123708325995172, + "loss": 0.8392061, + "num_input_tokens_seen": 131937584, + "router_z_loss_mlp": 0.49023438, + "step": 1594, + "time_per_iteration": 3.1976583003997803 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068322, + "balance_loss_mlp": 1.01937473, + "epoch": 0.30684878799538284, + "flos": 759616656384.0, + "grad_norm": 0.02786640270256765, + "language_loss": 0.80270225, + "learning_rate": 0.0008121275099254414, + "loss": 0.81338549, + "num_input_tokens_seen": 132012656, + "router_z_loss_mlp": 0.48901367, + "step": 1595, + "time_per_iteration": 2.9073448181152344 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105895, + "balance_loss_mlp": 1.01069379, + "epoch": 0.3070411696806464, + "flos": 518596652544.0, + "grad_norm": 0.02828411740511225, + "language_loss": 0.89261508, + "learning_rate": 0.0008118840660727194, + "loss": 0.90320462, + "num_input_tokens_seen": 132083728, + "router_z_loss_mlp": 0.48242188, + "step": 1596, + "time_per_iteration": 2.6137096881866455 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105679, + "balance_loss_mlp": 1.00855815, + "epoch": 0.30723355136590996, + "flos": 845791670016.0, + "grad_norm": 0.02807637717187332, + "language_loss": 0.8853125, + "learning_rate": 0.0008116405011358644, + "loss": 0.89588046, + "num_input_tokens_seen": 132170896, + "router_z_loss_mlp": 0.48217773, + "step": 1597, + "time_per_iteration": 3.1528680324554443 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059967, + "balance_loss_mlp": 1.01163971, + "epoch": 0.30742593305117355, + "flos": 467079836160.0, + "grad_norm": 0.032917462624290315, + "language_loss": 0.80716425, + "learning_rate": 0.0008113968152094369, + "loss": 0.81776392, + "num_input_tokens_seen": 132234592, + "router_z_loss_mlp": 0.4831543, + "step": 1598, + "time_per_iteration": 2.5390987396240234 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059252, + "balance_loss_mlp": 1.011235, + "epoch": 0.3076183147364371, + "flos": 687817589760.0, + "grad_norm": 0.03298344899906339, + "language_loss": 0.830042, + "learning_rate": 0.0008111530083880438, + "loss": 0.84063458, + "num_input_tokens_seen": 132314720, + "router_z_loss_mlp": 0.47998047, + "step": 1599, + "time_per_iteration": 2.904327154159546 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059695, + "balance_loss_mlp": 1.01170099, + "epoch": 0.30781069642170067, + "flos": 615180598272.0, + "grad_norm": 0.03364515132561045, + "language_loss": 0.86925042, + "learning_rate": 0.0008109090807663399, + "loss": 0.87984729, + "num_input_tokens_seen": 132388768, + "router_z_loss_mlp": 0.47973633, + "step": 1600, + "time_per_iteration": 2.794553756713867 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059845, + "balance_loss_mlp": 1.01206601, + "epoch": 0.3080030781069642, + "flos": 591509710080.0, + "grad_norm": 0.029450986393402313, + "language_loss": 0.89288217, + "learning_rate": 0.0008106650324390257, + "loss": 0.90348059, + "num_input_tokens_seen": 132472544, + "router_z_loss_mlp": 0.47753906, + "step": 1601, + "time_per_iteration": 2.825118064880371 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055341, + "balance_loss_mlp": 1.00744271, + "epoch": 0.3081954597922278, + "flos": 563691972096.0, + "grad_norm": 0.03217567830931305, + "language_loss": 0.82333392, + "learning_rate": 0.0008104208635008493, + "loss": 0.83388734, + "num_input_tokens_seen": 132541968, + "router_z_loss_mlp": 0.47875977, + "step": 1602, + "time_per_iteration": 2.7727856636047363 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01057631, + "balance_loss_mlp": 1.0099231, + "epoch": 0.3083878414774913, + "flos": 448762140672.0, + "grad_norm": 0.03928010080840531, + "language_loss": 0.82422024, + "learning_rate": 0.0008101765740466058, + "loss": 0.83479655, + "num_input_tokens_seen": 132606976, + "router_z_loss_mlp": 0.47680664, + "step": 1603, + "time_per_iteration": 2.5764591693878174 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106031, + "balance_loss_mlp": 1.01272202, + "epoch": 0.3085802231627549, + "flos": 494545685760.0, + "grad_norm": 0.03880240670965016, + "language_loss": 0.84925759, + "learning_rate": 0.0008099321641711364, + "loss": 0.85986066, + "num_input_tokens_seen": 132677984, + "router_z_loss_mlp": 0.47558594, + "step": 1604, + "time_per_iteration": 2.6562154293060303 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059833, + "balance_loss_mlp": 1.01262641, + "epoch": 0.3087726048480185, + "flos": 488690905344.0, + "grad_norm": 0.030963234073246262, + "language_loss": 0.84138477, + "learning_rate": 0.0008096876339693295, + "loss": 0.85198307, + "num_input_tokens_seen": 132749136, + "router_z_loss_mlp": 0.47167969, + "step": 1605, + "time_per_iteration": 2.6818747520446777 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01057926, + "balance_loss_mlp": 1.01083875, + "epoch": 0.308964986533282, + "flos": 731888346624.0, + "grad_norm": 0.03606871420254603, + "language_loss": 0.82584137, + "learning_rate": 0.0008094429835361206, + "loss": 0.83642066, + "num_input_tokens_seen": 132823824, + "router_z_loss_mlp": 0.47045898, + "step": 1606, + "time_per_iteration": 2.940202236175537 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059317, + "balance_loss_mlp": 1.01211011, + "epoch": 0.3091573682185456, + "flos": 606516535296.0, + "grad_norm": 0.033324674351776856, + "language_loss": 0.86802429, + "learning_rate": 0.0008091982129664908, + "loss": 0.87861747, + "num_input_tokens_seen": 132895936, + "router_z_loss_mlp": 0.47167969, + "step": 1607, + "time_per_iteration": 2.7152366638183594 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055819, + "balance_loss_mlp": 1.00858819, + "epoch": 0.30934974990380915, + "flos": 461307681024.0, + "grad_norm": 0.0316485976101594, + "language_loss": 0.83554763, + "learning_rate": 0.0008089533223554687, + "loss": 0.84610581, + "num_input_tokens_seen": 132968960, + "router_z_loss_mlp": 0.47192383, + "step": 1608, + "time_per_iteration": 2.73236083984375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054105, + "balance_loss_mlp": 1.00692201, + "epoch": 0.30954213158907273, + "flos": 554568117504.0, + "grad_norm": 0.03240022060424308, + "language_loss": 0.85798776, + "learning_rate": 0.0008087083117981294, + "loss": 0.86852884, + "num_input_tokens_seen": 133048448, + "router_z_loss_mlp": 0.47143555, + "step": 1609, + "time_per_iteration": 2.8992979526519775 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052885, + "balance_loss_mlp": 1.00543988, + "epoch": 0.30973451327433627, + "flos": 554114161920.0, + "grad_norm": 0.03509024741452312, + "language_loss": 0.88937026, + "learning_rate": 0.0008084631813895943, + "loss": 0.89989913, + "num_input_tokens_seen": 133121680, + "router_z_loss_mlp": 0.47412109, + "step": 1610, + "time_per_iteration": 2.8113343715667725 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104915, + "balance_loss_mlp": 1.00168037, + "epoch": 0.30992689495959985, + "flos": 566763714816.0, + "grad_norm": 0.03310460584308608, + "language_loss": 0.8446725, + "learning_rate": 0.0008082179312250315, + "loss": 0.85516399, + "num_input_tokens_seen": 133190176, + "router_z_loss_mlp": 0.47436523, + "step": 1611, + "time_per_iteration": 2.6286494731903076 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146973, + "balance_loss_mlp": 1.09509277, + "epoch": 0.3101192766448634, + "flos": 1445562998784.0, + "grad_norm": 0.022501740699277736, + "language_loss": 0.79855847, + "learning_rate": 0.0008079725613996555, + "loss": 0.8100282, + "num_input_tokens_seen": 133420512, + "router_z_loss_mlp": 0.51953125, + "step": 1612, + "time_per_iteration": 4.877255439758301 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01132568, + "balance_loss_mlp": 1.08087921, + "epoch": 0.31031165833012697, + "flos": 1535130541056.0, + "grad_norm": 0.020576462480935535, + "language_loss": 0.76629329, + "learning_rate": 0.0008077270720087273, + "loss": 0.777619, + "num_input_tokens_seen": 133651984, + "router_z_loss_mlp": 0.51757812, + "step": 1613, + "time_per_iteration": 5.064774751663208 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050941, + "balance_loss_mlp": 1.00363839, + "epoch": 0.31050404001539056, + "flos": 993633862656.0, + "grad_norm": 0.03245007970491877, + "language_loss": 0.83116508, + "learning_rate": 0.0008074814631475545, + "loss": 0.84167451, + "num_input_tokens_seen": 133741648, + "router_z_loss_mlp": 0.47265625, + "step": 1614, + "time_per_iteration": 3.322155714035034 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054745, + "balance_loss_mlp": 1.00741875, + "epoch": 0.3106964217006541, + "flos": 446973530112.0, + "grad_norm": 0.03235075185089818, + "language_loss": 0.80034411, + "learning_rate": 0.0008072357349114907, + "loss": 0.81089151, + "num_input_tokens_seen": 133813344, + "router_z_loss_mlp": 0.47290039, + "step": 1615, + "time_per_iteration": 2.699772596359253 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01056016, + "balance_loss_mlp": 1.00880885, + "epoch": 0.3108888033859177, + "flos": 511495678464.0, + "grad_norm": 0.0340106704308988, + "language_loss": 0.89603639, + "learning_rate": 0.0008069898873959363, + "loss": 0.90659654, + "num_input_tokens_seen": 133884192, + "router_z_loss_mlp": 0.47167969, + "step": 1616, + "time_per_iteration": 2.680640459060669 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051359, + "balance_loss_mlp": 1.0043664, + "epoch": 0.3110811850711812, + "flos": 521779210752.0, + "grad_norm": 0.029395602971080924, + "language_loss": 0.86344647, + "learning_rate": 0.0008067439206963375, + "loss": 0.87396008, + "num_input_tokens_seen": 133954848, + "router_z_loss_mlp": 0.46948242, + "step": 1617, + "time_per_iteration": 2.6484971046447754 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055386, + "balance_loss_mlp": 1.00844121, + "epoch": 0.3112735667564448, + "flos": 687731073792.0, + "grad_norm": 0.03406090033110643, + "language_loss": 0.87673247, + "learning_rate": 0.0008064978349081873, + "loss": 0.88728631, + "num_input_tokens_seen": 134031824, + "router_z_loss_mlp": 0.46899414, + "step": 1618, + "time_per_iteration": 2.92702579498291 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01056578, + "balance_loss_mlp": 1.00965679, + "epoch": 0.31146594844170833, + "flos": 534166303488.0, + "grad_norm": 0.030256910717709223, + "language_loss": 0.87292403, + "learning_rate": 0.0008062516301270245, + "loss": 0.88348979, + "num_input_tokens_seen": 134104480, + "router_z_loss_mlp": 0.46875, + "step": 1619, + "time_per_iteration": 2.7301478385925293 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055492, + "balance_loss_mlp": 1.00859511, + "epoch": 0.3116583301269719, + "flos": 680842982400.0, + "grad_norm": 0.027867683897015817, + "language_loss": 0.88937479, + "learning_rate": 0.0008060053064484343, + "loss": 0.89992964, + "num_input_tokens_seen": 134185632, + "router_z_loss_mlp": 0.46850586, + "step": 1620, + "time_per_iteration": 2.947906017303467 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048804, + "balance_loss_mlp": 1.00202632, + "epoch": 0.31185071181223545, + "flos": 587330779392.0, + "grad_norm": 0.03167203134142694, + "language_loss": 0.86095911, + "learning_rate": 0.0008057588639680482, + "loss": 0.87144709, + "num_input_tokens_seen": 134261600, + "router_z_loss_mlp": 0.46728516, + "step": 1621, + "time_per_iteration": 2.7836551666259766 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104944, + "balance_loss_mlp": 1.00282919, + "epoch": 0.31204309349749904, + "flos": 726658608384.0, + "grad_norm": 0.037979301866738396, + "language_loss": 0.83855367, + "learning_rate": 0.0008055123027815434, + "loss": 0.84904802, + "num_input_tokens_seen": 134334368, + "router_z_loss_mlp": 0.46557617, + "step": 1622, + "time_per_iteration": 2.9263358116149902 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051144, + "balance_loss_mlp": 1.00455689, + "epoch": 0.3122354751827626, + "flos": 577895865600.0, + "grad_norm": 0.032507776226150094, + "language_loss": 0.85607505, + "learning_rate": 0.0008052656229846436, + "loss": 0.86658645, + "num_input_tokens_seen": 134403824, + "router_z_loss_mlp": 0.46533203, + "step": 1623, + "time_per_iteration": 2.662386894226074 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051831, + "balance_loss_mlp": 1.00514877, + "epoch": 0.31242785686802615, + "flos": 577029750528.0, + "grad_norm": 0.03513403942618559, + "language_loss": 0.91195071, + "learning_rate": 0.0008050188246731182, + "loss": 0.92246902, + "num_input_tokens_seen": 134471296, + "router_z_loss_mlp": 0.46630859, + "step": 1624, + "time_per_iteration": 2.710176467895508 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052884, + "balance_loss_mlp": 1.00624907, + "epoch": 0.31262023855328974, + "flos": 738197082624.0, + "grad_norm": 0.0324646036152644, + "language_loss": 0.82931978, + "learning_rate": 0.0008047719079427834, + "loss": 0.83984858, + "num_input_tokens_seen": 134551360, + "router_z_loss_mlp": 0.46582031, + "step": 1625, + "time_per_iteration": 2.970287561416626 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082729, + "balance_loss_mlp": 1.03533173, + "epoch": 0.3128126202385533, + "flos": 1562594445312.0, + "grad_norm": 0.01743050972952843, + "language_loss": 0.74351704, + "learning_rate": 0.0008045248728895, + "loss": 0.75434434, + "num_input_tokens_seen": 134761328, + "router_z_loss_mlp": 0.47363281, + "step": 1626, + "time_per_iteration": 4.816533088684082 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053999, + "balance_loss_mlp": 1.0071733, + "epoch": 0.31300500192381686, + "flos": 515943872256.0, + "grad_norm": 0.030770809254638827, + "language_loss": 0.86711371, + "learning_rate": 0.0008042777196091757, + "loss": 0.87765372, + "num_input_tokens_seen": 134833136, + "router_z_loss_mlp": 0.46777344, + "step": 1627, + "time_per_iteration": 2.7191882133483887 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01057531, + "balance_loss_mlp": 1.01072919, + "epoch": 0.3131973836090804, + "flos": 527662181376.0, + "grad_norm": 0.031150181208545357, + "language_loss": 0.82488692, + "learning_rate": 0.0008040304481977643, + "loss": 0.83546221, + "num_input_tokens_seen": 134904352, + "router_z_loss_mlp": 0.4675293, + "step": 1628, + "time_per_iteration": 2.706782579421997 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01057336, + "balance_loss_mlp": 1.01065385, + "epoch": 0.313389765294344, + "flos": 824210736384.0, + "grad_norm": 0.032636383561425994, + "language_loss": 0.87568998, + "learning_rate": 0.0008037830587512649, + "loss": 0.88626337, + "num_input_tokens_seen": 134984880, + "router_z_loss_mlp": 0.46630859, + "step": 1629, + "time_per_iteration": 3.0928542613983154 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054937, + "balance_loss_mlp": 1.00820696, + "epoch": 0.31358214697960757, + "flos": 394703359488.0, + "grad_norm": 0.03241768310332359, + "language_loss": 0.79631239, + "learning_rate": 0.0008035355513657224, + "loss": 0.80686176, + "num_input_tokens_seen": 135047456, + "router_z_loss_mlp": 0.46679688, + "step": 1630, + "time_per_iteration": 2.449666738510132 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054798, + "balance_loss_mlp": 1.00806797, + "epoch": 0.3137745286648711, + "flos": 573098695680.0, + "grad_norm": 0.0293939817515363, + "language_loss": 0.93494189, + "learning_rate": 0.0008032879261372279, + "loss": 0.94548988, + "num_input_tokens_seen": 135124256, + "router_z_loss_mlp": 0.46679688, + "step": 1631, + "time_per_iteration": 2.766951084136963 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068432, + "balance_loss_mlp": 1.02256012, + "epoch": 0.3139669103501347, + "flos": 1501632021504.0, + "grad_norm": 0.011791019456215185, + "language_loss": 0.79635841, + "learning_rate": 0.0008030401831619178, + "loss": 0.80704272, + "num_input_tokens_seen": 135353024, + "router_z_loss_mlp": 0.45800781, + "step": 1632, + "time_per_iteration": 5.585620403289795 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050843, + "balance_loss_mlp": 1.00425589, + "epoch": 0.3141592920353982, + "flos": 526359607296.0, + "grad_norm": 0.030163528949794682, + "language_loss": 0.87607086, + "learning_rate": 0.0008027923225359748, + "loss": 0.88657928, + "num_input_tokens_seen": 135422464, + "router_z_loss_mlp": 0.46533203, + "step": 1633, + "time_per_iteration": 2.607407808303833 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105517, + "balance_loss_mlp": 1.0084641, + "epoch": 0.3143516737206618, + "flos": 594388012032.0, + "grad_norm": 0.030785944321789945, + "language_loss": 0.88644683, + "learning_rate": 0.0008025443443556267, + "loss": 0.89699847, + "num_input_tokens_seen": 135490928, + "router_z_loss_mlp": 0.46655273, + "step": 1634, + "time_per_iteration": 2.704568862915039 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053981, + "balance_loss_mlp": 1.00756085, + "epoch": 0.31454405540592534, + "flos": 649680347904.0, + "grad_norm": 0.028625636333363444, + "language_loss": 0.88813668, + "learning_rate": 0.000802296248717147, + "loss": 0.89867646, + "num_input_tokens_seen": 135576288, + "router_z_loss_mlp": 0.46362305, + "step": 1635, + "time_per_iteration": 2.914228916168213 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051205, + "balance_loss_mlp": 1.00461841, + "epoch": 0.3147364370911889, + "flos": 644070531072.0, + "grad_norm": 0.032412817231273386, + "language_loss": 0.79727387, + "learning_rate": 0.0008020480357168554, + "loss": 0.80778593, + "num_input_tokens_seen": 135652320, + "router_z_loss_mlp": 0.46533203, + "step": 1636, + "time_per_iteration": 2.8196966648101807 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051741, + "balance_loss_mlp": 1.00505865, + "epoch": 0.31492881877645246, + "flos": 472821855744.0, + "grad_norm": 0.028828485286514015, + "language_loss": 0.88662213, + "learning_rate": 0.0008017997054511165, + "loss": 0.89713949, + "num_input_tokens_seen": 135719632, + "router_z_loss_mlp": 0.46630859, + "step": 1637, + "time_per_iteration": 2.6545960903167725 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051567, + "balance_loss_mlp": 1.00486124, + "epoch": 0.31512120046171604, + "flos": 630630685440.0, + "grad_norm": 0.03463883423234526, + "language_loss": 0.86238796, + "learning_rate": 0.0008015512580163407, + "loss": 0.87290359, + "num_input_tokens_seen": 135796544, + "router_z_loss_mlp": 0.46655273, + "step": 1638, + "time_per_iteration": 2.775726795196533 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050883, + "balance_loss_mlp": 1.00429583, + "epoch": 0.31531358214697963, + "flos": 705054342144.0, + "grad_norm": 0.0328972983749375, + "language_loss": 0.81582069, + "learning_rate": 0.0008013026935089838, + "loss": 0.82632947, + "num_input_tokens_seen": 135871344, + "router_z_loss_mlp": 0.46533203, + "step": 1639, + "time_per_iteration": 2.859405040740967 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048793, + "balance_loss_mlp": 1.00182474, + "epoch": 0.31550596383224316, + "flos": 573632364288.0, + "grad_norm": 0.03266078051512415, + "language_loss": 0.84787768, + "learning_rate": 0.0008010540120255472, + "loss": 0.85836554, + "num_input_tokens_seen": 135944320, + "router_z_loss_mlp": 0.46923828, + "step": 1640, + "time_per_iteration": 2.654087781906128 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051523, + "balance_loss_mlp": 1.00457835, + "epoch": 0.31569834551750675, + "flos": 659513815296.0, + "grad_norm": 0.0373471738494659, + "language_loss": 0.87093472, + "learning_rate": 0.0008008052136625774, + "loss": 0.88144994, + "num_input_tokens_seen": 136019456, + "router_z_loss_mlp": 0.46899414, + "step": 1641, + "time_per_iteration": 2.7806570529937744 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054228, + "balance_loss_mlp": 1.00730693, + "epoch": 0.3158907272027703, + "flos": 567404308224.0, + "grad_norm": 0.028103315573088077, + "language_loss": 0.87394774, + "learning_rate": 0.0008005562985166666, + "loss": 0.88449007, + "num_input_tokens_seen": 136091232, + "router_z_loss_mlp": 0.46875, + "step": 1642, + "time_per_iteration": 2.6866798400878906 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053754, + "balance_loss_mlp": 1.00699973, + "epoch": 0.31608310888803387, + "flos": 537973903872.0, + "grad_norm": 0.024374019828786602, + "language_loss": 0.85555339, + "learning_rate": 0.0008003072666844524, + "loss": 0.86609089, + "num_input_tokens_seen": 136165088, + "router_z_loss_mlp": 0.46704102, + "step": 1643, + "time_per_iteration": 2.684518337249756 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055077, + "balance_loss_mlp": 1.00856149, + "epoch": 0.3162754905732974, + "flos": 487640097792.0, + "grad_norm": 0.037314537224785074, + "language_loss": 0.8350842, + "learning_rate": 0.0008000581182626173, + "loss": 0.84563494, + "num_input_tokens_seen": 136230368, + "router_z_loss_mlp": 0.46459961, + "step": 1644, + "time_per_iteration": 2.5574259757995605 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051572, + "balance_loss_mlp": 1.00481844, + "epoch": 0.316467872258561, + "flos": 531096506112.0, + "grad_norm": 0.03327277300757214, + "language_loss": 0.87005818, + "learning_rate": 0.0007998088533478894, + "loss": 0.88057387, + "num_input_tokens_seen": 136302512, + "router_z_loss_mlp": 0.46704102, + "step": 1645, + "time_per_iteration": 2.6987338066101074 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055532, + "balance_loss_mlp": 1.00894499, + "epoch": 0.3166602539438245, + "flos": 444414068736.0, + "grad_norm": 0.040202418156990175, + "language_loss": 0.85042381, + "learning_rate": 0.000799559472037042, + "loss": 0.8609792, + "num_input_tokens_seen": 136368064, + "router_z_loss_mlp": 0.46533203, + "step": 1646, + "time_per_iteration": 2.6219563484191895 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01056055, + "balance_loss_mlp": 1.00958765, + "epoch": 0.3168526356290881, + "flos": 647103389952.0, + "grad_norm": 0.026601574185044653, + "language_loss": 0.8823331, + "learning_rate": 0.0007993099744268932, + "loss": 0.89289367, + "num_input_tokens_seen": 136451520, + "router_z_loss_mlp": 0.46411133, + "step": 1647, + "time_per_iteration": 2.8902037143707275 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054808, + "balance_loss_mlp": 1.00817358, + "epoch": 0.3170450173143517, + "flos": 587258847744.0, + "grad_norm": 0.03281471441230887, + "language_loss": 0.8855083, + "learning_rate": 0.000799060360614307, + "loss": 0.89605635, + "num_input_tokens_seen": 136521184, + "router_z_loss_mlp": 0.46582031, + "step": 1648, + "time_per_iteration": 2.694293975830078 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055827, + "balance_loss_mlp": 1.00945473, + "epoch": 0.3172373989996152, + "flos": 828574359552.0, + "grad_norm": 0.03046931045185914, + "language_loss": 0.84284711, + "learning_rate": 0.0007988106306961917, + "loss": 0.85340536, + "num_input_tokens_seen": 136612592, + "router_z_loss_mlp": 0.46313477, + "step": 1649, + "time_per_iteration": 3.121788501739502 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01058474, + "balance_loss_mlp": 1.01195896, + "epoch": 0.3174297806848788, + "flos": 528434977536.0, + "grad_norm": 0.03563880571664149, + "language_loss": 0.85299373, + "learning_rate": 0.0007985607847695014, + "loss": 0.8635785, + "num_input_tokens_seen": 136684336, + "router_z_loss_mlp": 0.46459961, + "step": 1650, + "time_per_iteration": 2.625356912612915 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047323, + "balance_loss_mlp": 1.00107014, + "epoch": 0.31762216237014235, + "flos": 714482452992.0, + "grad_norm": 0.030498079123472206, + "language_loss": 0.83133662, + "learning_rate": 0.0007983108229312345, + "loss": 0.84180987, + "num_input_tokens_seen": 136766400, + "router_z_loss_mlp": 0.46191406, + "step": 1651, + "time_per_iteration": 2.894109010696411 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049755, + "balance_loss_mlp": 1.00362098, + "epoch": 0.31781454405540593, + "flos": 484800679680.0, + "grad_norm": 0.03387492306443982, + "language_loss": 0.86931884, + "learning_rate": 0.0007980607452784351, + "loss": 0.87981641, + "num_input_tokens_seen": 136834016, + "router_z_loss_mlp": 0.46069336, + "step": 1652, + "time_per_iteration": 2.5593390464782715 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048407, + "balance_loss_mlp": 1.00236845, + "epoch": 0.31800692574066947, + "flos": 549804973824.0, + "grad_norm": 0.04030851184116312, + "language_loss": 0.90997875, + "learning_rate": 0.0007978105519081919, + "loss": 0.92046285, + "num_input_tokens_seen": 136906288, + "router_z_loss_mlp": 0.4597168, + "step": 1653, + "time_per_iteration": 2.683809995651245 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045957, + "balance_loss_mlp": 0.99982309, + "epoch": 0.31819930742593305, + "flos": 517917175296.0, + "grad_norm": 0.033294821801319624, + "language_loss": 0.88831019, + "learning_rate": 0.0007975602429176385, + "loss": 0.89876974, + "num_input_tokens_seen": 136972416, + "router_z_loss_mlp": 0.46069336, + "step": 1654, + "time_per_iteration": 2.5786075592041016 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104835, + "balance_loss_mlp": 1.00238276, + "epoch": 0.31839168911119664, + "flos": 456970302720.0, + "grad_norm": 0.028947480678153642, + "language_loss": 0.82318926, + "learning_rate": 0.0007973098184039536, + "loss": 0.83367276, + "num_input_tokens_seen": 137044576, + "router_z_loss_mlp": 0.45898438, + "step": 1655, + "time_per_iteration": 2.651188611984253 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010453, + "balance_loss_mlp": 0.99921381, + "epoch": 0.3185840707964602, + "flos": 627296482560.0, + "grad_norm": 0.03276090001573999, + "language_loss": 0.8731916, + "learning_rate": 0.0007970592784643602, + "loss": 0.88364458, + "num_input_tokens_seen": 137125120, + "router_z_loss_mlp": 0.46020508, + "step": 1656, + "time_per_iteration": 2.8683595657348633 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045873, + "balance_loss_mlp": 0.99976265, + "epoch": 0.31877645248172376, + "flos": 568541631744.0, + "grad_norm": 0.035945607337745746, + "language_loss": 0.85986471, + "learning_rate": 0.0007968086231961272, + "loss": 0.87032342, + "num_input_tokens_seen": 137195344, + "router_z_loss_mlp": 0.46044922, + "step": 1657, + "time_per_iteration": 2.642733335494995 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047205, + "balance_loss_mlp": 1.00119007, + "epoch": 0.3189688341669873, + "flos": 490553392896.0, + "grad_norm": 0.04377426906704287, + "language_loss": 0.84065533, + "learning_rate": 0.0007965578526965671, + "loss": 0.85112733, + "num_input_tokens_seen": 137261040, + "router_z_loss_mlp": 0.45947266, + "step": 1658, + "time_per_iteration": 2.5638930797576904 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049099, + "balance_loss_mlp": 1.00291717, + "epoch": 0.3191612158522509, + "flos": 577381638912.0, + "grad_norm": 0.02931224295785387, + "language_loss": 0.86766565, + "learning_rate": 0.0007963069670630377, + "loss": 0.87815666, + "num_input_tokens_seen": 137334400, + "router_z_loss_mlp": 0.46118164, + "step": 1659, + "time_per_iteration": 2.7154479026794434 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051177, + "balance_loss_mlp": 1.00506639, + "epoch": 0.3193535975375144, + "flos": 539193852672.0, + "grad_norm": 0.03496177903686506, + "language_loss": 0.88776976, + "learning_rate": 0.0007960559663929416, + "loss": 0.89828151, + "num_input_tokens_seen": 137405344, + "router_z_loss_mlp": 0.46044922, + "step": 1660, + "time_per_iteration": 2.6322021484375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054675, + "balance_loss_mlp": 1.00868368, + "epoch": 0.319545979222778, + "flos": 735628872960.0, + "grad_norm": 0.030221795014758104, + "language_loss": 0.88154632, + "learning_rate": 0.0007958048507837259, + "loss": 0.89209306, + "num_input_tokens_seen": 137486016, + "router_z_loss_mlp": 0.45922852, + "step": 1661, + "time_per_iteration": 2.9221389293670654 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105245, + "balance_loss_mlp": 1.00648332, + "epoch": 0.31973836090804153, + "flos": 765768890112.0, + "grad_norm": 0.037416739988226255, + "language_loss": 0.87668484, + "learning_rate": 0.0007955536203328822, + "loss": 0.88720942, + "num_input_tokens_seen": 137562304, + "router_z_loss_mlp": 0.45898438, + "step": 1662, + "time_per_iteration": 2.9018445014953613 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048244, + "balance_loss_mlp": 1.00184774, + "epoch": 0.3199307425933051, + "flos": 561742968576.0, + "grad_norm": 0.03025687936293395, + "language_loss": 0.84124553, + "learning_rate": 0.0007953022751379469, + "loss": 0.85172796, + "num_input_tokens_seen": 137639248, + "router_z_loss_mlp": 0.46337891, + "step": 1663, + "time_per_iteration": 2.781562566757202 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085701, + "balance_loss_mlp": 1.03906643, + "epoch": 0.3201231242785687, + "flos": 752672184576.0, + "grad_norm": 0.03881407073457837, + "language_loss": 0.82717097, + "learning_rate": 0.000795050815296501, + "loss": 0.83802795, + "num_input_tokens_seen": 137718256, + "router_z_loss_mlp": 0.46582031, + "step": 1664, + "time_per_iteration": 2.9950287342071533 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050837, + "balance_loss_mlp": 1.00446498, + "epoch": 0.32031550596383224, + "flos": 497385103872.0, + "grad_norm": 0.02713287522590179, + "language_loss": 0.93810016, + "learning_rate": 0.0007947992409061695, + "loss": 0.94860852, + "num_input_tokens_seen": 137785216, + "router_z_loss_mlp": 0.46313477, + "step": 1665, + "time_per_iteration": 2.583118438720703 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01056564, + "balance_loss_mlp": 1.01045382, + "epoch": 0.3205078876490958, + "flos": 732875970816.0, + "grad_norm": 0.03263285268561658, + "language_loss": 0.86165506, + "learning_rate": 0.0007945475520646226, + "loss": 0.8722207, + "num_input_tokens_seen": 137863424, + "router_z_loss_mlp": 0.46044922, + "step": 1666, + "time_per_iteration": 2.903190851211548 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059471, + "balance_loss_mlp": 1.01324141, + "epoch": 0.32070026933435936, + "flos": 550475702784.0, + "grad_norm": 0.03801033406135743, + "language_loss": 0.85650241, + "learning_rate": 0.0007942957488695743, + "loss": 0.86709714, + "num_input_tokens_seen": 137930384, + "router_z_loss_mlp": 0.46166992, + "step": 1667, + "time_per_iteration": 2.661292791366577 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059024, + "balance_loss_mlp": 1.01277089, + "epoch": 0.32089265101962294, + "flos": 746685201408.0, + "grad_norm": 0.031638418068872444, + "language_loss": 0.81749988, + "learning_rate": 0.0007940438314187833, + "loss": 0.82809013, + "num_input_tokens_seen": 138017200, + "router_z_loss_mlp": 0.46191406, + "step": 1668, + "time_per_iteration": 3.0293474197387695 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01057511, + "balance_loss_mlp": 1.01144862, + "epoch": 0.3210850327048865, + "flos": 495196972800.0, + "grad_norm": 0.034120041175176606, + "language_loss": 0.81371748, + "learning_rate": 0.0007937917998100529, + "loss": 0.82429266, + "num_input_tokens_seen": 138084048, + "router_z_loss_mlp": 0.45996094, + "step": 1669, + "time_per_iteration": 2.5822434425354004 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.08258255, + "balance_loss_mlp": 8.0, + "epoch": 0.32127741439015006, + "flos": 531673916160.0, + "grad_norm": 0.043058724234977634, + "language_loss": 0.81425405, + "learning_rate": 0.0007935396541412302, + "loss": 0.89683664, + "num_input_tokens_seen": 138153280, + "router_z_loss_mlp": 2.58203125, + "step": 1670, + "time_per_iteration": 2.5968360900878906 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0830899, + "balance_loss_mlp": 8.0, + "epoch": 0.3214697960754136, + "flos": 502224069888.0, + "grad_norm": 0.0363513778225316, + "language_loss": 0.87401152, + "learning_rate": 0.0007932873945102068, + "loss": 0.9571014, + "num_input_tokens_seen": 138222320, + "router_z_loss_mlp": 3.0859375, + "step": 1671, + "time_per_iteration": 2.582617998123169 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.08312805, + "balance_loss_mlp": 8.0, + "epoch": 0.3216621777606772, + "flos": 1386404736768.0, + "grad_norm": 0.003686648730821959, + "language_loss": 0.75761777, + "learning_rate": 0.0007930350210149188, + "loss": 0.84074581, + "num_input_tokens_seen": 138449488, + "router_z_loss_mlp": 3.125, + "step": 1672, + "time_per_iteration": 4.829998970031738 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.08311279, + "balance_loss_mlp": 8.0, + "epoch": 0.32185455944594077, + "flos": 572635991808.0, + "grad_norm": 0.030782594356869853, + "language_loss": 0.88089788, + "learning_rate": 0.0007927825337533461, + "loss": 0.96401072, + "num_input_tokens_seen": 138522496, + "router_z_loss_mlp": 3.109375, + "step": 1673, + "time_per_iteration": 2.6633598804473877 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.08310516, + "balance_loss_mlp": 8.0, + "epoch": 0.3220469411312043, + "flos": 544937817600.0, + "grad_norm": 0.040711103761993876, + "language_loss": 0.86732781, + "learning_rate": 0.0007925299328235131, + "loss": 0.95043296, + "num_input_tokens_seen": 138590096, + "router_z_loss_mlp": 3.1015625, + "step": 1674, + "time_per_iteration": 2.634169578552246 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.08307083, + "balance_loss_mlp": 8.0, + "epoch": 0.3222393228164679, + "flos": 492162168576.0, + "grad_norm": 0.03938689136463286, + "language_loss": 0.86802006, + "learning_rate": 0.000792277218323488, + "loss": 0.95109081, + "num_input_tokens_seen": 138658224, + "router_z_loss_mlp": 3.06640625, + "step": 1675, + "time_per_iteration": 2.5893990993499756 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.08270843, + "balance_loss_mlp": 8.0, + "epoch": 0.3224317045017314, + "flos": 491363127552.0, + "grad_norm": 0.03386575094399551, + "language_loss": 0.86165106, + "learning_rate": 0.0007920243903513833, + "loss": 0.94435954, + "num_input_tokens_seen": 138722864, + "router_z_loss_mlp": 2.7109375, + "step": 1676, + "time_per_iteration": 2.5602426528930664 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02321873, + "balance_loss_mlp": 2.26942062, + "epoch": 0.322624086186995, + "flos": 576871302912.0, + "grad_norm": 0.12910494226103245, + "language_loss": 0.85448408, + "learning_rate": 0.0007917714490053556, + "loss": 0.87770277, + "num_input_tokens_seen": 138791472, + "router_z_loss_mlp": 0.52539062, + "step": 1677, + "time_per_iteration": 2.6558380126953125 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071536, + "balance_loss_mlp": 1.02492559, + "epoch": 0.32281646787225854, + "flos": 630572359680.0, + "grad_norm": 0.04049679721352166, + "language_loss": 0.87627459, + "learning_rate": 0.0007915183943836055, + "loss": 0.88698995, + "num_input_tokens_seen": 138873424, + "router_z_loss_mlp": 0.46557617, + "step": 1678, + "time_per_iteration": 2.898658037185669 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072761, + "balance_loss_mlp": 1.02631712, + "epoch": 0.3230088495575221, + "flos": 782808311040.0, + "grad_norm": 0.04272749105284559, + "language_loss": 0.85738349, + "learning_rate": 0.0007912652265843773, + "loss": 0.86811107, + "num_input_tokens_seen": 138956880, + "router_z_loss_mlp": 0.46386719, + "step": 1679, + "time_per_iteration": 3.049938917160034 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082142, + "balance_loss_mlp": 1.03557873, + "epoch": 0.3232012312427857, + "flos": 537201107712.0, + "grad_norm": 0.04201967602882564, + "language_loss": 0.83624417, + "learning_rate": 0.0007910119457059597, + "loss": 0.84706557, + "num_input_tokens_seen": 139031296, + "router_z_loss_mlp": 0.46508789, + "step": 1680, + "time_per_iteration": 2.7126853466033936 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108585, + "balance_loss_mlp": 1.03895342, + "epoch": 0.32339361292804925, + "flos": 706233461760.0, + "grad_norm": 0.044345030126194285, + "language_loss": 0.81981564, + "learning_rate": 0.0007907585518466849, + "loss": 0.83067411, + "num_input_tokens_seen": 139109776, + "router_z_loss_mlp": 0.46850586, + "step": 1681, + "time_per_iteration": 2.9758992195129395 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088847, + "balance_loss_mlp": 1.0419023, + "epoch": 0.32358599461331283, + "flos": 453257966592.0, + "grad_norm": 0.04210474159896445, + "language_loss": 0.91257876, + "learning_rate": 0.000790505045104929, + "loss": 0.92346722, + "num_input_tokens_seen": 139174736, + "router_z_loss_mlp": 0.46899414, + "step": 1682, + "time_per_iteration": 2.5105395317077637 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090368, + "balance_loss_mlp": 1.04337561, + "epoch": 0.32377837629857636, + "flos": 602092641024.0, + "grad_norm": 0.04465728550727914, + "language_loss": 0.88834655, + "learning_rate": 0.0007902514255791125, + "loss": 0.89925027, + "num_input_tokens_seen": 139252064, + "router_z_loss_mlp": 0.46948242, + "step": 1683, + "time_per_iteration": 2.7610387802124023 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089116, + "balance_loss_mlp": 1.04190934, + "epoch": 0.32397075798383995, + "flos": 808899654912.0, + "grad_norm": 0.04108658803287063, + "language_loss": 0.89801908, + "learning_rate": 0.0007899976933676986, + "loss": 0.90891027, + "num_input_tokens_seen": 139333328, + "router_z_loss_mlp": 0.47167969, + "step": 1684, + "time_per_iteration": 2.963387966156006 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089307, + "balance_loss_mlp": 1.04205263, + "epoch": 0.3241631396691035, + "flos": 602793505536.0, + "grad_norm": 0.046655842402160155, + "language_loss": 0.89137548, + "learning_rate": 0.0007897438485691955, + "loss": 0.90226853, + "num_input_tokens_seen": 139400976, + "router_z_loss_mlp": 0.47216797, + "step": 1685, + "time_per_iteration": 2.675910711288452 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079467, + "balance_loss_mlp": 1.03195012, + "epoch": 0.32435552135436707, + "flos": 475177182720.0, + "grad_norm": 0.045429866607221585, + "language_loss": 0.84063458, + "learning_rate": 0.0007894898912821542, + "loss": 0.85142922, + "num_input_tokens_seen": 139465664, + "router_z_loss_mlp": 0.47485352, + "step": 1686, + "time_per_iteration": 2.530951976776123 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077585, + "balance_loss_mlp": 1.02980566, + "epoch": 0.3245479030396306, + "flos": 539220097536.0, + "grad_norm": 0.03833008440392265, + "language_loss": 0.88029444, + "learning_rate": 0.0007892358216051695, + "loss": 0.89107037, + "num_input_tokens_seen": 139541984, + "router_z_loss_mlp": 0.47753906, + "step": 1687, + "time_per_iteration": 2.7729742527008057 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067846, + "balance_loss_mlp": 1.01963735, + "epoch": 0.3247402847248942, + "flos": 548697785856.0, + "grad_norm": 0.039082280310976325, + "language_loss": 0.93519121, + "learning_rate": 0.0007889816396368803, + "loss": 0.94586968, + "num_input_tokens_seen": 139607408, + "router_z_loss_mlp": 0.48193359, + "step": 1688, + "time_per_iteration": 2.625795602798462 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062474, + "balance_loss_mlp": 1.01371753, + "epoch": 0.3249326664101578, + "flos": 378992757504.0, + "grad_norm": 0.03548852277095179, + "language_loss": 0.86296374, + "learning_rate": 0.0007887273454759687, + "loss": 0.87358844, + "num_input_tokens_seen": 139670000, + "router_z_loss_mlp": 0.48754883, + "step": 1689, + "time_per_iteration": 2.4798507690429688 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070366, + "balance_loss_mlp": 1.02106154, + "epoch": 0.3251250480954213, + "flos": 529123203072.0, + "grad_norm": 0.03304707654173593, + "language_loss": 0.83602285, + "learning_rate": 0.0007884729392211603, + "loss": 0.84672654, + "num_input_tokens_seen": 139739872, + "router_z_loss_mlp": 0.49194336, + "step": 1690, + "time_per_iteration": 2.6475188732147217 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066964, + "balance_loss_mlp": 1.01732576, + "epoch": 0.3253174297806849, + "flos": 450559499520.0, + "grad_norm": 0.03986808198030794, + "language_loss": 0.86860085, + "learning_rate": 0.0007882184209712245, + "loss": 0.87927043, + "num_input_tokens_seen": 139802032, + "router_z_loss_mlp": 0.49609375, + "step": 1691, + "time_per_iteration": 2.5213029384613037 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089592, + "balance_loss_mlp": 1.03961909, + "epoch": 0.32550981146594843, + "flos": 705490801152.0, + "grad_norm": 0.03183986603149819, + "language_loss": 0.86227143, + "learning_rate": 0.000787963790824974, + "loss": 0.8731674, + "num_input_tokens_seen": 139885648, + "router_z_loss_mlp": 0.49975586, + "step": 1692, + "time_per_iteration": 2.9866673946380615 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086614, + "balance_loss_mlp": 1.03654587, + "epoch": 0.325702193151212, + "flos": 393559233024.0, + "grad_norm": 0.035135222587328305, + "language_loss": 0.90092403, + "learning_rate": 0.0007877090488812651, + "loss": 0.91179013, + "num_input_tokens_seen": 139947920, + "router_z_loss_mlp": 0.50073242, + "step": 1693, + "time_per_iteration": 2.443784475326538 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067298, + "balance_loss_mlp": 1.01708698, + "epoch": 0.32589457483647555, + "flos": 578584091136.0, + "grad_norm": 0.03604448220117138, + "language_loss": 0.84406531, + "learning_rate": 0.0007874541952389973, + "loss": 0.85473824, + "num_input_tokens_seen": 140020048, + "router_z_loss_mlp": 0.50219727, + "step": 1694, + "time_per_iteration": 2.6662275791168213 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069205, + "balance_loss_mlp": 1.01918459, + "epoch": 0.32608695652173914, + "flos": 499330216704.0, + "grad_norm": 0.03462929627838828, + "language_loss": 0.87473089, + "learning_rate": 0.0007871992299971136, + "loss": 0.88542295, + "num_input_tokens_seen": 140085600, + "router_z_loss_mlp": 0.50024414, + "step": 1695, + "time_per_iteration": 2.5501420497894287 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068106, + "balance_loss_mlp": 1.01803839, + "epoch": 0.32627933820700267, + "flos": 592301948160.0, + "grad_norm": 0.0349674772808078, + "language_loss": 0.85830671, + "learning_rate": 0.0007869441532546001, + "loss": 0.86898774, + "num_input_tokens_seen": 140155152, + "router_z_loss_mlp": 0.5, + "step": 1696, + "time_per_iteration": 2.7640528678894043 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065186, + "balance_loss_mlp": 1.01550007, + "epoch": 0.32647171989226625, + "flos": 610274558208.0, + "grad_norm": 0.03448959411295718, + "language_loss": 0.80548751, + "learning_rate": 0.0007866889651104867, + "loss": 0.81613934, + "num_input_tokens_seen": 140228560, + "router_z_loss_mlp": 0.49658203, + "step": 1697, + "time_per_iteration": 2.8403704166412354 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106687, + "balance_loss_mlp": 1.01723123, + "epoch": 0.32666410157752984, + "flos": 478190599680.0, + "grad_norm": 0.0393752309547029, + "language_loss": 0.84585583, + "learning_rate": 0.000786433665663846, + "loss": 0.85652447, + "num_input_tokens_seen": 140297952, + "router_z_loss_mlp": 0.49536133, + "step": 1698, + "time_per_iteration": 2.7460434436798096 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065434, + "balance_loss_mlp": 1.01603401, + "epoch": 0.3268564832627934, + "flos": 719694694656.0, + "grad_norm": 0.03598572558720647, + "language_loss": 0.87469888, + "learning_rate": 0.0007861782550137942, + "loss": 0.88535315, + "num_input_tokens_seen": 140373408, + "router_z_loss_mlp": 0.49291992, + "step": 1699, + "time_per_iteration": 2.922189474105835 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062059, + "balance_loss_mlp": 1.01299262, + "epoch": 0.32704886494805696, + "flos": 770106268416.0, + "grad_norm": 0.033319227910548664, + "language_loss": 0.86952895, + "learning_rate": 0.0007859227332594901, + "loss": 0.88014954, + "num_input_tokens_seen": 140451840, + "router_z_loss_mlp": 0.48999023, + "step": 1700, + "time_per_iteration": 2.8891940116882324 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01056582, + "balance_loss_mlp": 1.00782549, + "epoch": 0.3272412466333205, + "flos": 851405377536.0, + "grad_norm": 0.0384838580126543, + "language_loss": 0.85734528, + "learning_rate": 0.0007856671005001365, + "loss": 0.8679111, + "num_input_tokens_seen": 140537696, + "router_z_loss_mlp": 0.48730469, + "step": 1701, + "time_per_iteration": 3.169032573699951 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105609, + "balance_loss_mlp": 1.00728559, + "epoch": 0.3274336283185841, + "flos": 833041995264.0, + "grad_norm": 0.03605284930108709, + "language_loss": 0.82799482, + "learning_rate": 0.0007854113568349787, + "loss": 0.83855575, + "num_input_tokens_seen": 140623536, + "router_z_loss_mlp": 0.48779297, + "step": 1702, + "time_per_iteration": 3.123967170715332 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060179, + "balance_loss_mlp": 1.0117799, + "epoch": 0.3276260100038476, + "flos": 693253407744.0, + "grad_norm": 0.03564674283827795, + "language_loss": 0.81364781, + "learning_rate": 0.0007851555023633052, + "loss": 0.82424963, + "num_input_tokens_seen": 140700688, + "router_z_loss_mlp": 0.48388672, + "step": 1703, + "time_per_iteration": 2.8430581092834473 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059511, + "balance_loss_mlp": 1.01120698, + "epoch": 0.3278183916891112, + "flos": 436978702848.0, + "grad_norm": 0.03514994366577059, + "language_loss": 0.83518881, + "learning_rate": 0.0007848995371844474, + "loss": 0.84578383, + "num_input_tokens_seen": 140765808, + "router_z_loss_mlp": 0.48291016, + "step": 1704, + "time_per_iteration": 2.552917003631592 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01056561, + "balance_loss_mlp": 1.00861514, + "epoch": 0.3280107733743748, + "flos": 462017293824.0, + "grad_norm": 0.03278124420090015, + "language_loss": 0.81157213, + "learning_rate": 0.0007846434613977801, + "loss": 0.82213771, + "num_input_tokens_seen": 140830512, + "router_z_loss_mlp": 0.47924805, + "step": 1705, + "time_per_iteration": 2.496506929397583 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062567, + "balance_loss_mlp": 1.01483595, + "epoch": 0.3282031550596383, + "flos": 680529977856.0, + "grad_norm": 0.03615486988598079, + "language_loss": 0.79136091, + "learning_rate": 0.0007843872751027203, + "loss": 0.80198663, + "num_input_tokens_seen": 140902816, + "router_z_loss_mlp": 0.47705078, + "step": 1706, + "time_per_iteration": 2.8048393726348877 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048523, + "balance_loss_mlp": 1.00088739, + "epoch": 0.3283955367449019, + "flos": 546255942912.0, + "grad_norm": 0.030185021157442368, + "language_loss": 0.879673, + "learning_rate": 0.0007841309783987287, + "loss": 0.89015824, + "num_input_tokens_seen": 140975488, + "router_z_loss_mlp": 0.47607422, + "step": 1707, + "time_per_iteration": 2.7402358055114746 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053102, + "balance_loss_mlp": 1.00553715, + "epoch": 0.32858791843016544, + "flos": 482241218304.0, + "grad_norm": 0.035416956868504886, + "language_loss": 0.89878803, + "learning_rate": 0.0007838745713853084, + "loss": 0.90931904, + "num_input_tokens_seen": 141043248, + "router_z_loss_mlp": 0.4753418, + "step": 1708, + "time_per_iteration": 2.603816270828247 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054825, + "balance_loss_mlp": 1.00752318, + "epoch": 0.328780300115429, + "flos": 567916589568.0, + "grad_norm": 0.03507338685235107, + "language_loss": 0.84775996, + "learning_rate": 0.0007836180541620053, + "loss": 0.8583082, + "num_input_tokens_seen": 141119408, + "router_z_loss_mlp": 0.47265625, + "step": 1709, + "time_per_iteration": 2.7194666862487793 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054445, + "balance_loss_mlp": 1.00730944, + "epoch": 0.32897268180069256, + "flos": 476992038144.0, + "grad_norm": 0.03621825417570051, + "language_loss": 0.86992389, + "learning_rate": 0.0007833614268284082, + "loss": 0.88046837, + "num_input_tokens_seen": 141184112, + "router_z_loss_mlp": 0.47094727, + "step": 1710, + "time_per_iteration": 2.510921001434326 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01057716, + "balance_loss_mlp": 1.01346588, + "epoch": 0.32916506348595614, + "flos": 1580453327616.0, + "grad_norm": 0.014405511351568959, + "language_loss": 0.74109769, + "learning_rate": 0.0007831046894841489, + "loss": 0.75167489, + "num_input_tokens_seen": 141414960, + "router_z_loss_mlp": 0.44335938, + "step": 1711, + "time_per_iteration": 4.875708341598511 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051331, + "balance_loss_mlp": 1.00429153, + "epoch": 0.3293574451712197, + "flos": 483851939328.0, + "grad_norm": 0.03545808379065215, + "language_loss": 0.7916249, + "learning_rate": 0.0007828478422289016, + "loss": 0.80213821, + "num_input_tokens_seen": 141485744, + "router_z_loss_mlp": 0.4699707, + "step": 1712, + "time_per_iteration": 2.583045721054077 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052818, + "balance_loss_mlp": 1.00582564, + "epoch": 0.32954982685648326, + "flos": 623725097472.0, + "grad_norm": 0.0327870747371716, + "language_loss": 0.89787406, + "learning_rate": 0.0007825908851623833, + "loss": 0.9084022, + "num_input_tokens_seen": 141560592, + "router_z_loss_mlp": 0.46948242, + "step": 1713, + "time_per_iteration": 2.824685573577881 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050866, + "balance_loss_mlp": 1.00396931, + "epoch": 0.32974220854174685, + "flos": 546071250432.0, + "grad_norm": 0.03386258255996434, + "language_loss": 0.85659784, + "learning_rate": 0.0007823338183843533, + "loss": 0.8671065, + "num_input_tokens_seen": 141630400, + "router_z_loss_mlp": 0.46850586, + "step": 1714, + "time_per_iteration": 2.672525644302368 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051399, + "balance_loss_mlp": 1.00459802, + "epoch": 0.3299345902270104, + "flos": 983823727872.0, + "grad_norm": 0.03566876288837857, + "language_loss": 0.82096756, + "learning_rate": 0.0007820766419946141, + "loss": 0.83148158, + "num_input_tokens_seen": 141721552, + "router_z_loss_mlp": 0.4675293, + "step": 1715, + "time_per_iteration": 3.2718288898468018 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051636, + "balance_loss_mlp": 1.00662231, + "epoch": 0.33012697191227397, + "flos": 1406904727296.0, + "grad_norm": 0.0085720970679931, + "language_loss": 0.7967248, + "learning_rate": 0.0007818193560930102, + "loss": 0.80724114, + "num_input_tokens_seen": 141956464, + "router_z_loss_mlp": 0.44921875, + "step": 1716, + "time_per_iteration": 4.983957290649414 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065575, + "balance_loss_mlp": 1.01836789, + "epoch": 0.3303193535975375, + "flos": 506170675968.0, + "grad_norm": 0.038525927315114124, + "language_loss": 0.76583785, + "learning_rate": 0.0007815619607794288, + "loss": 0.77649361, + "num_input_tokens_seen": 142029552, + "router_z_loss_mlp": 0.47167969, + "step": 1717, + "time_per_iteration": 2.6315019130706787 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054249, + "balance_loss_mlp": 1.00713778, + "epoch": 0.3305117352828011, + "flos": 939485653248.0, + "grad_norm": 0.041342276741222116, + "language_loss": 0.83710063, + "learning_rate": 0.0007813044561538001, + "loss": 0.84764308, + "num_input_tokens_seen": 142117344, + "router_z_loss_mlp": 0.47070312, + "step": 1718, + "time_per_iteration": 3.127446174621582 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055308, + "balance_loss_mlp": 1.00814831, + "epoch": 0.3307041169680646, + "flos": 722794627584.0, + "grad_norm": 0.03526572402512133, + "language_loss": 0.88796169, + "learning_rate": 0.0007810468423160958, + "loss": 0.89851475, + "num_input_tokens_seen": 142190096, + "router_z_loss_mlp": 0.47119141, + "step": 1719, + "time_per_iteration": 2.8622305393218994 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054311, + "balance_loss_mlp": 1.00741386, + "epoch": 0.3308964986533282, + "flos": 584817004800.0, + "grad_norm": 0.029883098234782163, + "language_loss": 0.82424414, + "learning_rate": 0.0007807891193663306, + "loss": 0.83478725, + "num_input_tokens_seen": 142265584, + "router_z_loss_mlp": 0.46850586, + "step": 1720, + "time_per_iteration": 2.7917239665985107 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064031, + "balance_loss_mlp": 1.01715815, + "epoch": 0.33108888033859174, + "flos": 474525895680.0, + "grad_norm": 0.040993977150413745, + "language_loss": 0.82757467, + "learning_rate": 0.0007805312874045614, + "loss": 0.83821499, + "num_input_tokens_seen": 142330352, + "router_z_loss_mlp": 0.46826172, + "step": 1721, + "time_per_iteration": 2.516045331954956 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049599, + "balance_loss_mlp": 1.00279772, + "epoch": 0.3312812620238553, + "flos": 386996785152.0, + "grad_norm": 0.03885390252626127, + "language_loss": 0.87709427, + "learning_rate": 0.0007802733465308874, + "loss": 0.88759029, + "num_input_tokens_seen": 142392208, + "router_z_loss_mlp": 0.4675293, + "step": 1722, + "time_per_iteration": 2.4662280082702637 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047652, + "balance_loss_mlp": 1.00108933, + "epoch": 0.3314736437091189, + "flos": 495605241600.0, + "grad_norm": 0.03316625802825005, + "language_loss": 0.85110468, + "learning_rate": 0.0007800152968454501, + "loss": 0.86158121, + "num_input_tokens_seen": 142462112, + "router_z_loss_mlp": 0.46508789, + "step": 1723, + "time_per_iteration": 2.6313533782958984 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105186, + "balance_loss_mlp": 1.00515401, + "epoch": 0.33166602539438245, + "flos": 654931473408.0, + "grad_norm": 0.02722776998075876, + "language_loss": 0.90998107, + "learning_rate": 0.0007797571384484334, + "loss": 0.92049968, + "num_input_tokens_seen": 142539120, + "router_z_loss_mlp": 0.46655273, + "step": 1724, + "time_per_iteration": 2.8411970138549805 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049402, + "balance_loss_mlp": 1.00257659, + "epoch": 0.33185840707964603, + "flos": 521835591168.0, + "grad_norm": 0.03419077024576391, + "language_loss": 0.92796665, + "learning_rate": 0.0007794988714400633, + "loss": 0.93846071, + "num_input_tokens_seen": 142611520, + "router_z_loss_mlp": 0.46777344, + "step": 1725, + "time_per_iteration": 2.5964980125427246 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050523, + "balance_loss_mlp": 1.00367355, + "epoch": 0.33205078876490957, + "flos": 437899252992.0, + "grad_norm": 0.033932075991051254, + "language_loss": 0.86014992, + "learning_rate": 0.0007792404959206079, + "loss": 0.87065518, + "num_input_tokens_seen": 142676064, + "router_z_loss_mlp": 0.46801758, + "step": 1726, + "time_per_iteration": 2.491852283477783 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051801, + "balance_loss_mlp": 1.00497568, + "epoch": 0.33224317045017315, + "flos": 770095574784.0, + "grad_norm": 0.034529473302537826, + "language_loss": 0.82129228, + "learning_rate": 0.0007789820119903774, + "loss": 0.83181036, + "num_input_tokens_seen": 142750944, + "router_z_loss_mlp": 0.46777344, + "step": 1727, + "time_per_iteration": 2.9898605346679688 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01058285, + "balance_loss_mlp": 1.01260376, + "epoch": 0.3324355521354367, + "flos": 1469296103424.0, + "grad_norm": 0.013638873720884416, + "language_loss": 0.78492665, + "learning_rate": 0.0007787234197497242, + "loss": 0.79550946, + "num_input_tokens_seen": 142974032, + "router_z_loss_mlp": 0.45605469, + "step": 1728, + "time_per_iteration": 4.859704971313477 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050307, + "balance_loss_mlp": 1.00343382, + "epoch": 0.3326279338207003, + "flos": 497800175616.0, + "grad_norm": 0.033386991625918766, + "language_loss": 0.84234303, + "learning_rate": 0.0007784647192990428, + "loss": 0.85284609, + "num_input_tokens_seen": 143047280, + "router_z_loss_mlp": 0.46826172, + "step": 1729, + "time_per_iteration": 2.7268624305725098 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050854, + "balance_loss_mlp": 1.00419581, + "epoch": 0.33282031550596386, + "flos": 637054127616.0, + "grad_norm": 0.031138270474946127, + "language_loss": 0.81414318, + "learning_rate": 0.0007782059107387696, + "loss": 0.82465172, + "num_input_tokens_seen": 143124224, + "router_z_loss_mlp": 0.46606445, + "step": 1730, + "time_per_iteration": 2.85831618309021 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054467, + "balance_loss_mlp": 1.00752223, + "epoch": 0.3330126971912274, + "flos": 690722136576.0, + "grad_norm": 0.03556521205278414, + "language_loss": 0.89100444, + "learning_rate": 0.0007779469941693826, + "loss": 0.9015491, + "num_input_tokens_seen": 143194048, + "router_z_loss_mlp": 0.46899414, + "step": 1731, + "time_per_iteration": 2.8736839294433594 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01058267, + "balance_loss_mlp": 1.01168013, + "epoch": 0.333205078876491, + "flos": 567554007552.0, + "grad_norm": 0.03898705252222011, + "language_loss": 0.77083337, + "learning_rate": 0.0007776879696914029, + "loss": 0.78141606, + "num_input_tokens_seen": 143272976, + "router_z_loss_mlp": 0.46533203, + "step": 1732, + "time_per_iteration": 2.84578275680542 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055134, + "balance_loss_mlp": 1.00868976, + "epoch": 0.3333974605617545, + "flos": 642171105024.0, + "grad_norm": 0.028730663384365272, + "language_loss": 0.89631069, + "learning_rate": 0.000777428837405392, + "loss": 0.90686202, + "num_input_tokens_seen": 143346496, + "router_z_loss_mlp": 0.46386719, + "step": 1733, + "time_per_iteration": 2.8595433235168457 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049221, + "balance_loss_mlp": 1.00275302, + "epoch": 0.3335898422470181, + "flos": 462779396352.0, + "grad_norm": 0.03984590801707433, + "language_loss": 0.87746447, + "learning_rate": 0.0007771695974119544, + "loss": 0.88795674, + "num_input_tokens_seen": 143410448, + "router_z_loss_mlp": 0.46411133, + "step": 1734, + "time_per_iteration": 2.5200014114379883 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051768, + "balance_loss_mlp": 1.00537193, + "epoch": 0.33378222393228163, + "flos": 854338114560.0, + "grad_norm": 0.03554719013753984, + "language_loss": 0.76235908, + "learning_rate": 0.0007769102498117359, + "loss": 0.77287674, + "num_input_tokens_seen": 143492416, + "router_z_loss_mlp": 0.46337891, + "step": 1735, + "time_per_iteration": 3.1014633178710938 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052548, + "balance_loss_mlp": 1.00624716, + "epoch": 0.3339746056175452, + "flos": 956310246144.0, + "grad_norm": 0.03187783426815399, + "language_loss": 0.80701965, + "learning_rate": 0.000776650794705424, + "loss": 0.81754518, + "num_input_tokens_seen": 143590096, + "router_z_loss_mlp": 0.46240234, + "step": 1736, + "time_per_iteration": 3.253756046295166 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050739, + "balance_loss_mlp": 1.00434327, + "epoch": 0.33416698730280875, + "flos": 545895306240.0, + "grad_norm": 0.03238990381642275, + "language_loss": 0.83209848, + "learning_rate": 0.0007763912321937483, + "loss": 0.84260583, + "num_input_tokens_seen": 143663344, + "router_z_loss_mlp": 0.46337891, + "step": 1737, + "time_per_iteration": 2.712942361831665 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051632, + "balance_loss_mlp": 1.00525999, + "epoch": 0.33435936898807234, + "flos": 1015876776960.0, + "grad_norm": 0.036470780413058734, + "language_loss": 0.8337301, + "learning_rate": 0.0007761315623774799, + "loss": 0.84424639, + "num_input_tokens_seen": 143753072, + "router_z_loss_mlp": 0.46313477, + "step": 1738, + "time_per_iteration": 3.38946795463562 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053137, + "balance_loss_mlp": 1.00671661, + "epoch": 0.3345517506733359, + "flos": 616372356864.0, + "grad_norm": 0.034452353492031275, + "language_loss": 0.88688117, + "learning_rate": 0.0007758717853574313, + "loss": 0.89741254, + "num_input_tokens_seen": 143827280, + "router_z_loss_mlp": 0.46362305, + "step": 1739, + "time_per_iteration": 2.7438387870788574 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105524, + "balance_loss_mlp": 1.00896263, + "epoch": 0.33474413235859946, + "flos": 495570248448.0, + "grad_norm": 0.03665446817767542, + "language_loss": 0.90973008, + "learning_rate": 0.0007756119012344571, + "loss": 0.92028248, + "num_input_tokens_seen": 143895072, + "router_z_loss_mlp": 0.4621582, + "step": 1740, + "time_per_iteration": 2.5443572998046875 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105165, + "balance_loss_mlp": 1.0052774, + "epoch": 0.33493651404386304, + "flos": 629488504320.0, + "grad_norm": 0.0365358867260097, + "language_loss": 0.85516071, + "learning_rate": 0.0007753519101094535, + "loss": 0.86567724, + "num_input_tokens_seen": 143965728, + "router_z_loss_mlp": 0.46313477, + "step": 1741, + "time_per_iteration": 2.785595417022705 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050508, + "balance_loss_mlp": 1.00396836, + "epoch": 0.3351288957291266, + "flos": 514743365376.0, + "grad_norm": 0.038608286094447275, + "language_loss": 0.87042749, + "learning_rate": 0.0007750918120833575, + "loss": 0.88093251, + "num_input_tokens_seen": 144030272, + "router_z_loss_mlp": 0.46484375, + "step": 1742, + "time_per_iteration": 2.5612564086914062 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054744, + "balance_loss_mlp": 1.00825262, + "epoch": 0.33532127741439016, + "flos": 648483731712.0, + "grad_norm": 0.038902913238311417, + "language_loss": 0.88245445, + "learning_rate": 0.0007748316072571485, + "loss": 0.89300191, + "num_input_tokens_seen": 144104048, + "router_z_loss_mlp": 0.46435547, + "step": 1743, + "time_per_iteration": 2.8040030002593994 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01056064, + "balance_loss_mlp": 1.00969172, + "epoch": 0.3355136590996537, + "flos": 769789373184.0, + "grad_norm": 0.032744002461956113, + "language_loss": 0.80090916, + "learning_rate": 0.0007745712957318467, + "loss": 0.81146979, + "num_input_tokens_seen": 144180432, + "router_z_loss_mlp": 0.46313477, + "step": 1744, + "time_per_iteration": 2.955864429473877 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053057, + "balance_loss_mlp": 1.00656557, + "epoch": 0.3357060407849173, + "flos": 596650020096.0, + "grad_norm": 0.027209343707751667, + "language_loss": 0.86834347, + "learning_rate": 0.0007743108776085141, + "loss": 0.87887406, + "num_input_tokens_seen": 144258704, + "router_z_loss_mlp": 0.46435547, + "step": 1745, + "time_per_iteration": 2.8065922260284424 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059361, + "balance_loss_mlp": 1.01277399, + "epoch": 0.3358984224701808, + "flos": 599802442752.0, + "grad_norm": 0.030632877870575562, + "language_loss": 0.83193165, + "learning_rate": 0.0007740503529882543, + "loss": 0.84252524, + "num_input_tokens_seen": 144335104, + "router_z_loss_mlp": 0.46533203, + "step": 1746, + "time_per_iteration": 2.783057451248169 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01058625, + "balance_loss_mlp": 1.01218116, + "epoch": 0.3360908041554444, + "flos": 579430764288.0, + "grad_norm": 0.03209356344176002, + "language_loss": 0.91440552, + "learning_rate": 0.0007737897219722114, + "loss": 0.92499179, + "num_input_tokens_seen": 144402912, + "router_z_loss_mlp": 0.46386719, + "step": 1747, + "time_per_iteration": 2.6678693294525146 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053588, + "balance_loss_mlp": 1.00723922, + "epoch": 0.336283185840708, + "flos": 514621856256.0, + "grad_norm": 0.02947569275247992, + "language_loss": 0.81706387, + "learning_rate": 0.0007735289846615716, + "loss": 0.82759976, + "num_input_tokens_seen": 144475328, + "router_z_loss_mlp": 0.46289062, + "step": 1748, + "time_per_iteration": 2.664217948913574 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049184, + "balance_loss_mlp": 1.00312185, + "epoch": 0.3364755675259715, + "flos": 526014521856.0, + "grad_norm": 0.03437288512368296, + "language_loss": 0.83148289, + "learning_rate": 0.0007732681411575621, + "loss": 0.84197474, + "num_input_tokens_seen": 144548288, + "router_z_loss_mlp": 0.45996094, + "step": 1749, + "time_per_iteration": 2.679304361343384 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051983, + "balance_loss_mlp": 1.00613475, + "epoch": 0.3366679492112351, + "flos": 555974704128.0, + "grad_norm": 0.040002531784274646, + "language_loss": 0.88002014, + "learning_rate": 0.0007730071915614514, + "loss": 0.89053994, + "num_input_tokens_seen": 144619488, + "router_z_loss_mlp": 0.45776367, + "step": 1750, + "time_per_iteration": 2.6813647747039795 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053165, + "balance_loss_mlp": 1.00734055, + "epoch": 0.33686033089649864, + "flos": 428164940544.0, + "grad_norm": 0.03793638318473741, + "language_loss": 0.88937026, + "learning_rate": 0.0007727461359745489, + "loss": 0.89990187, + "num_input_tokens_seen": 144682560, + "router_z_loss_mlp": 0.45751953, + "step": 1751, + "time_per_iteration": 2.459137439727783 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050224, + "balance_loss_mlp": 1.00425673, + "epoch": 0.3370527125817622, + "flos": 542841060096.0, + "grad_norm": 0.030686532457312277, + "language_loss": 0.86821485, + "learning_rate": 0.0007724849744982056, + "loss": 0.87871712, + "num_input_tokens_seen": 144753328, + "router_z_loss_mlp": 0.45898438, + "step": 1752, + "time_per_iteration": 2.682023525238037 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050119, + "balance_loss_mlp": 1.00412822, + "epoch": 0.33724509426702576, + "flos": 543231832320.0, + "grad_norm": 0.03146587739195435, + "language_loss": 0.82788759, + "learning_rate": 0.0007722237072338131, + "loss": 0.8383888, + "num_input_tokens_seen": 144827312, + "router_z_loss_mlp": 0.45922852, + "step": 1753, + "time_per_iteration": 2.7289977073669434 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053084, + "balance_loss_mlp": 1.00735557, + "epoch": 0.33743747595228935, + "flos": 473753099520.0, + "grad_norm": 0.036309304678759154, + "language_loss": 0.86263937, + "learning_rate": 0.0007719623342828046, + "loss": 0.8731702, + "num_input_tokens_seen": 144893488, + "router_z_loss_mlp": 0.45654297, + "step": 1754, + "time_per_iteration": 2.5323400497436523 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046739, + "balance_loss_mlp": 1.00127256, + "epoch": 0.33762985763755293, + "flos": 470837859072.0, + "grad_norm": 0.037209700878319825, + "language_loss": 0.84580374, + "learning_rate": 0.000771700855746654, + "loss": 0.85627109, + "num_input_tokens_seen": 144961152, + "router_z_loss_mlp": 0.45385742, + "step": 1755, + "time_per_iteration": 2.585667848587036 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049151, + "balance_loss_mlp": 1.00366056, + "epoch": 0.33782223932281646, + "flos": 493251859968.0, + "grad_norm": 0.03059786996599164, + "language_loss": 0.89290714, + "learning_rate": 0.0007714392717268763, + "loss": 0.90339863, + "num_input_tokens_seen": 145030576, + "router_z_loss_mlp": 0.45410156, + "step": 1756, + "time_per_iteration": 2.5836589336395264 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048898, + "balance_loss_mlp": 1.00321686, + "epoch": 0.33801462100808005, + "flos": 466018334976.0, + "grad_norm": 0.035533831964213135, + "language_loss": 0.87473714, + "learning_rate": 0.0007711775823250273, + "loss": 0.88522607, + "num_input_tokens_seen": 145095648, + "router_z_loss_mlp": 0.45605469, + "step": 1757, + "time_per_iteration": 2.5619492530822754 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049668, + "balance_loss_mlp": 1.00417781, + "epoch": 0.3382070026933436, + "flos": 797068584960.0, + "grad_norm": 0.03198873828119691, + "language_loss": 0.84101963, + "learning_rate": 0.0007709157876427039, + "loss": 0.85151625, + "num_input_tokens_seen": 145181248, + "router_z_loss_mlp": 0.45410156, + "step": 1758, + "time_per_iteration": 3.084735870361328 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049654, + "balance_loss_mlp": 1.00414026, + "epoch": 0.33839938437860717, + "flos": 509429056512.0, + "grad_norm": 0.031347294296384644, + "language_loss": 0.86196065, + "learning_rate": 0.0007706538877815439, + "loss": 0.87245721, + "num_input_tokens_seen": 145252944, + "router_z_loss_mlp": 0.4543457, + "step": 1759, + "time_per_iteration": 2.6354048252105713 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049225, + "balance_loss_mlp": 1.00371122, + "epoch": 0.3385917660638707, + "flos": 485274077184.0, + "grad_norm": 0.03028112214235413, + "language_loss": 0.83875918, + "learning_rate": 0.0007703918828432259, + "loss": 0.84925139, + "num_input_tokens_seen": 145323168, + "router_z_loss_mlp": 0.4543457, + "step": 1760, + "time_per_iteration": 2.6017844676971436 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049146, + "balance_loss_mlp": 1.00358403, + "epoch": 0.3387841477491343, + "flos": 546416335872.0, + "grad_norm": 0.033680258429279644, + "language_loss": 0.89293355, + "learning_rate": 0.000770129772929469, + "loss": 0.90342498, + "num_input_tokens_seen": 145395776, + "router_z_loss_mlp": 0.45483398, + "step": 1761, + "time_per_iteration": 2.671287775039673 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048026, + "balance_loss_mlp": 1.00217831, + "epoch": 0.3389765294343978, + "flos": 721064342784.0, + "grad_norm": 0.03497277274463044, + "language_loss": 0.89180952, + "learning_rate": 0.0007698675581420334, + "loss": 0.90228981, + "num_input_tokens_seen": 145470576, + "router_z_loss_mlp": 0.45776367, + "step": 1762, + "time_per_iteration": 2.9236271381378174 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105243, + "balance_loss_mlp": 1.00677264, + "epoch": 0.3391689111196614, + "flos": 701264238336.0, + "grad_norm": 0.034268369898116914, + "language_loss": 0.79778481, + "learning_rate": 0.0007696052385827199, + "loss": 0.80830908, + "num_input_tokens_seen": 145548896, + "router_z_loss_mlp": 0.45581055, + "step": 1763, + "time_per_iteration": 2.9605488777160645 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055868, + "balance_loss_mlp": 1.01018691, + "epoch": 0.339361292804925, + "flos": 628249113600.0, + "grad_norm": 0.03454670185411084, + "language_loss": 0.78905737, + "learning_rate": 0.00076934281435337, + "loss": 0.79961604, + "num_input_tokens_seen": 145617136, + "router_z_loss_mlp": 0.45605469, + "step": 1764, + "time_per_iteration": 2.7454025745391846 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052159, + "balance_loss_mlp": 1.00647831, + "epoch": 0.33955367449018853, + "flos": 610795587840.0, + "grad_norm": 0.03693575970108084, + "language_loss": 0.86892688, + "learning_rate": 0.0007690802855558658, + "loss": 0.87944847, + "num_input_tokens_seen": 145696416, + "router_z_loss_mlp": 0.45605469, + "step": 1765, + "time_per_iteration": 2.8936946392059326 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054825, + "balance_loss_mlp": 1.01057434, + "epoch": 0.3397460561754521, + "flos": 1456589191680.0, + "grad_norm": 0.006269192400269108, + "language_loss": 0.76374954, + "learning_rate": 0.0007688176522921302, + "loss": 0.77429777, + "num_input_tokens_seen": 145919680, + "router_z_loss_mlp": 0.44335938, + "step": 1766, + "time_per_iteration": 4.913206100463867 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054352, + "balance_loss_mlp": 1.00855207, + "epoch": 0.33993843786071565, + "flos": 488291384832.0, + "grad_norm": 0.039386286306125895, + "language_loss": 0.89967024, + "learning_rate": 0.0007685549146641262, + "loss": 0.91021377, + "num_input_tokens_seen": 145984272, + "router_z_loss_mlp": 0.45727539, + "step": 1767, + "time_per_iteration": 2.593353271484375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050967, + "balance_loss_mlp": 1.00554788, + "epoch": 0.34013081954597923, + "flos": 418233296640.0, + "grad_norm": 0.032458575290873634, + "language_loss": 0.89062989, + "learning_rate": 0.0007682920727738579, + "loss": 0.90113962, + "num_input_tokens_seen": 146047248, + "router_z_loss_mlp": 0.45336914, + "step": 1768, + "time_per_iteration": 2.510331392288208 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054131, + "balance_loss_mlp": 1.00835514, + "epoch": 0.34032320123124277, + "flos": 438430976256.0, + "grad_norm": 0.037803385345055784, + "language_loss": 0.85379529, + "learning_rate": 0.000768029126723369, + "loss": 0.86433661, + "num_input_tokens_seen": 146111872, + "router_z_loss_mlp": 0.45703125, + "step": 1769, + "time_per_iteration": 2.5152533054351807 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054345, + "balance_loss_mlp": 1.00852144, + "epoch": 0.34051558291650635, + "flos": 458544085248.0, + "grad_norm": 0.04157155741286578, + "language_loss": 0.82432753, + "learning_rate": 0.0007677660766147447, + "loss": 0.83487099, + "num_input_tokens_seen": 146172608, + "router_z_loss_mlp": 0.45751953, + "step": 1770, + "time_per_iteration": 2.5669522285461426 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052738, + "balance_loss_mlp": 1.00858307, + "epoch": 0.3407079646017699, + "flos": 1562140489728.0, + "grad_norm": 0.006526141838203855, + "language_loss": 0.72470945, + "learning_rate": 0.0007675029225501102, + "loss": 0.73523682, + "num_input_tokens_seen": 146413584, + "router_z_loss_mlp": 0.44238281, + "step": 1771, + "time_per_iteration": 4.953578233718872 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051633, + "balance_loss_mlp": 1.00602317, + "epoch": 0.3409003462870335, + "flos": 493531816704.0, + "grad_norm": 0.043561887450476046, + "language_loss": 0.80659652, + "learning_rate": 0.0007672396646316306, + "loss": 0.81711292, + "num_input_tokens_seen": 146476992, + "router_z_loss_mlp": 0.45532227, + "step": 1772, + "time_per_iteration": 2.5720248222351074 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048985, + "balance_loss_mlp": 1.00356674, + "epoch": 0.34109272797229706, + "flos": 809822150400.0, + "grad_norm": 0.03735237922314452, + "language_loss": 0.80629146, + "learning_rate": 0.000766976302961512, + "loss": 0.81678128, + "num_input_tokens_seen": 146552848, + "router_z_loss_mlp": 0.45336914, + "step": 1773, + "time_per_iteration": 3.0438191890716553 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050599, + "balance_loss_mlp": 1.00513268, + "epoch": 0.3412851096575606, + "flos": 471100319232.0, + "grad_norm": 0.03730121261656314, + "language_loss": 0.82086515, + "learning_rate": 0.0007667128376420003, + "loss": 0.83137119, + "num_input_tokens_seen": 146617504, + "router_z_loss_mlp": 0.45385742, + "step": 1774, + "time_per_iteration": 2.5461959838867188 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052091, + "balance_loss_mlp": 1.00681531, + "epoch": 0.3414774913428242, + "flos": 596771529216.0, + "grad_norm": 0.03978671612524881, + "language_loss": 0.85611963, + "learning_rate": 0.0007664492687753817, + "loss": 0.86664057, + "num_input_tokens_seen": 146691568, + "router_z_loss_mlp": 0.4519043, + "step": 1775, + "time_per_iteration": 2.7454183101654053 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049019, + "balance_loss_mlp": 1.00362372, + "epoch": 0.3416698730280877, + "flos": 528508854528.0, + "grad_norm": 0.03225195621375244, + "language_loss": 0.82109249, + "learning_rate": 0.000766185596463983, + "loss": 0.83158267, + "num_input_tokens_seen": 146764208, + "router_z_loss_mlp": 0.453125, + "step": 1776, + "time_per_iteration": 2.636876106262207 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050456, + "balance_loss_mlp": 1.00513279, + "epoch": 0.3418622547133513, + "flos": 876118324992.0, + "grad_norm": 0.033083928099711564, + "language_loss": 0.77454132, + "learning_rate": 0.0007659218208101706, + "loss": 0.78504586, + "num_input_tokens_seen": 146847744, + "router_z_loss_mlp": 0.45239258, + "step": 1777, + "time_per_iteration": 3.097163677215576 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055706, + "balance_loss_mlp": 1.01031137, + "epoch": 0.34205463639861483, + "flos": 604877624064.0, + "grad_norm": 0.03453483859247358, + "language_loss": 0.86064076, + "learning_rate": 0.0007656579419163515, + "loss": 0.87119782, + "num_input_tokens_seen": 146918336, + "router_z_loss_mlp": 0.453125, + "step": 1778, + "time_per_iteration": 2.7452263832092285 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055225, + "balance_loss_mlp": 1.0096159, + "epoch": 0.3422470180838784, + "flos": 464715760896.0, + "grad_norm": 0.037184345749469765, + "language_loss": 0.77793133, + "learning_rate": 0.0007653939598849724, + "loss": 0.78848356, + "num_input_tokens_seen": 146982496, + "router_z_loss_mlp": 0.45532227, + "step": 1779, + "time_per_iteration": 2.5020663738250732 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01057175, + "balance_loss_mlp": 1.0134964, + "epoch": 0.34243939976914195, + "flos": 1589819222016.0, + "grad_norm": 0.009860928497574006, + "language_loss": 0.82880205, + "learning_rate": 0.0007651298748185204, + "loss": 0.83937383, + "num_input_tokens_seen": 147213600, + "router_z_loss_mlp": 0.4375, + "step": 1780, + "time_per_iteration": 4.958939552307129 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054148, + "balance_loss_mlp": 1.00849116, + "epoch": 0.34263178145440554, + "flos": 874444420608.0, + "grad_norm": 0.034671274665512654, + "language_loss": 0.80890739, + "learning_rate": 0.000764865686819522, + "loss": 0.81944883, + "num_input_tokens_seen": 147287664, + "router_z_loss_mlp": 0.45581055, + "step": 1781, + "time_per_iteration": 3.0468943119049072 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01057142, + "balance_loss_mlp": 1.01148522, + "epoch": 0.3428241631396691, + "flos": 507874715904.0, + "grad_norm": 0.02984044691012994, + "language_loss": 0.86276633, + "learning_rate": 0.0007646013959905449, + "loss": 0.87333775, + "num_input_tokens_seen": 147356800, + "router_z_loss_mlp": 0.45581055, + "step": 1782, + "time_per_iteration": 2.59788179397583 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01056783, + "balance_loss_mlp": 1.01114941, + "epoch": 0.34301654482493266, + "flos": 881525952768.0, + "grad_norm": 0.034646354408830966, + "language_loss": 0.81384498, + "learning_rate": 0.0007643370024341949, + "loss": 0.82441282, + "num_input_tokens_seen": 147432496, + "router_z_loss_mlp": 0.45556641, + "step": 1783, + "time_per_iteration": 3.0783512592315674 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048113, + "balance_loss_mlp": 1.00288546, + "epoch": 0.34320892651019624, + "flos": 432669514752.0, + "grad_norm": 0.031189947688426686, + "language_loss": 0.84145617, + "learning_rate": 0.0007640725062531195, + "loss": 0.85193729, + "num_input_tokens_seen": 147495856, + "router_z_loss_mlp": 0.45141602, + "step": 1784, + "time_per_iteration": 2.5152812004089355 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050969, + "balance_loss_mlp": 1.00559807, + "epoch": 0.3434013081954598, + "flos": 464594251776.0, + "grad_norm": 0.03760163078295718, + "language_loss": 0.86810297, + "learning_rate": 0.0007638079075500047, + "loss": 0.87861264, + "num_input_tokens_seen": 147559632, + "router_z_loss_mlp": 0.45288086, + "step": 1785, + "time_per_iteration": 2.5846633911132812 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045052, + "balance_loss_mlp": 1.0019455, + "epoch": 0.34359368988072336, + "flos": 1560677522688.0, + "grad_norm": 0.003111664808940008, + "language_loss": 0.75180668, + "learning_rate": 0.0007635432064275772, + "loss": 0.76225722, + "num_input_tokens_seen": 147794576, + "router_z_loss_mlp": 0.43164062, + "step": 1786, + "time_per_iteration": 4.94433856010437 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010488, + "balance_loss_mlp": 1.003739, + "epoch": 0.3437860715659869, + "flos": 496573423872.0, + "grad_norm": 0.03208809815455149, + "language_loss": 0.83580017, + "learning_rate": 0.0007632784029886026, + "loss": 0.8462882, + "num_input_tokens_seen": 147866960, + "router_z_loss_mlp": 0.45019531, + "step": 1787, + "time_per_iteration": 2.6222987174987793 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050031, + "balance_loss_mlp": 1.00523186, + "epoch": 0.3439784532512505, + "flos": 719610124032.0, + "grad_norm": 0.03771035877194531, + "language_loss": 0.86448389, + "learning_rate": 0.0007630134973358873, + "loss": 0.87498415, + "num_input_tokens_seen": 147947808, + "router_z_loss_mlp": 0.44799805, + "step": 1788, + "time_per_iteration": 2.9359545707702637 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047985, + "balance_loss_mlp": 1.00320995, + "epoch": 0.34417083493651407, + "flos": 566922162432.0, + "grad_norm": 0.0315223877917514, + "language_loss": 0.8730194, + "learning_rate": 0.0007627484895722763, + "loss": 0.88349926, + "num_input_tokens_seen": 148015936, + "router_z_loss_mlp": 0.44775391, + "step": 1789, + "time_per_iteration": 2.710433006286621 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048608, + "balance_loss_mlp": 1.00397587, + "epoch": 0.3443632166217776, + "flos": 797702375424.0, + "grad_norm": 0.034658336241014505, + "language_loss": 0.80973929, + "learning_rate": 0.0007624833798006552, + "loss": 0.82022536, + "num_input_tokens_seen": 148099776, + "router_z_loss_mlp": 0.4465332, + "step": 1790, + "time_per_iteration": 3.061995506286621 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049093, + "balance_loss_mlp": 1.00419891, + "epoch": 0.3445555983070412, + "flos": 570393425664.0, + "grad_norm": 0.0359941873064626, + "language_loss": 0.84664464, + "learning_rate": 0.0007622181681239483, + "loss": 0.85713559, + "num_input_tokens_seen": 148169616, + "router_z_loss_mlp": 0.44873047, + "step": 1791, + "time_per_iteration": 2.708204984664917 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046554, + "balance_loss_mlp": 1.00192165, + "epoch": 0.3447479799923047, + "flos": 569981266176.0, + "grad_norm": 0.030307911746310208, + "language_loss": 0.85264516, + "learning_rate": 0.0007619528546451202, + "loss": 0.86311066, + "num_input_tokens_seen": 148247824, + "router_z_loss_mlp": 0.4465332, + "step": 1792, + "time_per_iteration": 2.8142476081848145 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047842, + "balance_loss_mlp": 1.00323367, + "epoch": 0.3449403616775683, + "flos": 969333074688.0, + "grad_norm": 0.03266645448260783, + "language_loss": 0.84415537, + "learning_rate": 0.0007616874394671745, + "loss": 0.85463381, + "num_input_tokens_seen": 148333040, + "router_z_loss_mlp": 0.4465332, + "step": 1793, + "time_per_iteration": 3.340257406234741 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048671, + "balance_loss_mlp": 1.00411057, + "epoch": 0.34513274336283184, + "flos": 569677009920.0, + "grad_norm": 0.042713127170940564, + "language_loss": 0.85883492, + "learning_rate": 0.0007614219226931547, + "loss": 0.86932158, + "num_input_tokens_seen": 148401840, + "router_z_loss_mlp": 0.44604492, + "step": 1794, + "time_per_iteration": 2.666299343109131 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047529, + "balance_loss_mlp": 1.00301611, + "epoch": 0.3453251250480954, + "flos": 461858846208.0, + "grad_norm": 0.03409376285864792, + "language_loss": 0.85191298, + "learning_rate": 0.0007611563044261435, + "loss": 0.86238825, + "num_input_tokens_seen": 148466576, + "router_z_loss_mlp": 0.44580078, + "step": 1795, + "time_per_iteration": 2.509730577468872 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047985, + "balance_loss_mlp": 1.00340092, + "epoch": 0.34551750673335896, + "flos": 416520508416.0, + "grad_norm": 0.03871598691360063, + "language_loss": 0.87655377, + "learning_rate": 0.0007608905847692631, + "loss": 0.88703358, + "num_input_tokens_seen": 148530016, + "router_z_loss_mlp": 0.4465332, + "step": 1796, + "time_per_iteration": 2.468144416809082 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045751, + "balance_loss_mlp": 1.0012145, + "epoch": 0.34570988841862255, + "flos": 589115499264.0, + "grad_norm": 0.03133980127061019, + "language_loss": 0.87422049, + "learning_rate": 0.0007606247638256749, + "loss": 0.88467801, + "num_input_tokens_seen": 148610064, + "router_z_loss_mlp": 0.44580078, + "step": 1797, + "time_per_iteration": 2.8401029109954834 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050308, + "balance_loss_mlp": 1.00758362, + "epoch": 0.34590227010388613, + "flos": 1571145747456.0, + "grad_norm": 0.007450888717391324, + "language_loss": 0.78170294, + "learning_rate": 0.0007603588416985798, + "loss": 0.79220599, + "num_input_tokens_seen": 148835872, + "router_z_loss_mlp": 0.42773438, + "step": 1798, + "time_per_iteration": 4.913544178009033 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043892, + "balance_loss_mlp": 1.00097656, + "epoch": 0.34609465178914967, + "flos": 1540930886400.0, + "grad_norm": 0.004797214297707501, + "language_loss": 0.79327202, + "learning_rate": 0.0007600928184912179, + "loss": 0.80371094, + "num_input_tokens_seen": 149066864, + "router_z_loss_mlp": 0.4296875, + "step": 1799, + "time_per_iteration": 4.771878719329834 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049395, + "balance_loss_mlp": 1.00469148, + "epoch": 0.34628703347441325, + "flos": 610517576448.0, + "grad_norm": 0.037119753663607306, + "language_loss": 0.86850703, + "learning_rate": 0.0007598266943068686, + "loss": 0.8790009, + "num_input_tokens_seen": 149141600, + "router_z_loss_mlp": 0.44750977, + "step": 1800, + "time_per_iteration": 2.746819496154785 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050462, + "balance_loss_mlp": 1.00535274, + "epoch": 0.3464794151596768, + "flos": 474265380864.0, + "grad_norm": 0.03436691989893219, + "language_loss": 0.84791839, + "learning_rate": 0.0007595604692488507, + "loss": 0.85842299, + "num_input_tokens_seen": 149205888, + "router_z_loss_mlp": 0.45019531, + "step": 1801, + "time_per_iteration": 2.564328908920288 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050744, + "balance_loss_mlp": 1.00587356, + "epoch": 0.34667179684494037, + "flos": 606822736896.0, + "grad_norm": 0.03808690892272381, + "language_loss": 0.83437663, + "learning_rate": 0.0007592941434205215, + "loss": 0.8448841, + "num_input_tokens_seen": 149281280, + "router_z_loss_mlp": 0.44848633, + "step": 1802, + "time_per_iteration": 2.826420545578003 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059933, + "balance_loss_mlp": 1.016922, + "epoch": 0.3468641785302039, + "flos": 1568362709760.0, + "grad_norm": 0.013636299413791342, + "language_loss": 0.73571062, + "learning_rate": 0.0007590277169252782, + "loss": 0.74630988, + "num_input_tokens_seen": 149525008, + "router_z_loss_mlp": 0.43066406, + "step": 1803, + "time_per_iteration": 5.063625812530518 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050175, + "balance_loss_mlp": 1.00523341, + "epoch": 0.3470565602154675, + "flos": 908724484608.0, + "grad_norm": 0.03942668215130471, + "language_loss": 0.80763334, + "learning_rate": 0.0007587611898665566, + "loss": 0.81813502, + "num_input_tokens_seen": 149600624, + "router_z_loss_mlp": 0.44921875, + "step": 1804, + "time_per_iteration": 3.0834579467773438 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050119, + "balance_loss_mlp": 1.0052247, + "epoch": 0.347248941900731, + "flos": 640060741632.0, + "grad_norm": 0.031209613313051415, + "language_loss": 0.82727098, + "learning_rate": 0.0007584945623478315, + "loss": 0.83777213, + "num_input_tokens_seen": 149674224, + "router_z_loss_mlp": 0.44873047, + "step": 1805, + "time_per_iteration": 2.861560106277466 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051541, + "balance_loss_mlp": 1.00688517, + "epoch": 0.3474413235859946, + "flos": 848782732800.0, + "grad_norm": 0.03633023546687314, + "language_loss": 0.81859386, + "learning_rate": 0.000758227834472617, + "loss": 0.82910925, + "num_input_tokens_seen": 149758688, + "router_z_loss_mlp": 0.44702148, + "step": 1806, + "time_per_iteration": 3.0337021350860596 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052212, + "balance_loss_mlp": 1.00767589, + "epoch": 0.3476337052712582, + "flos": 516697226496.0, + "grad_norm": 0.035243207865769656, + "language_loss": 0.77929807, + "learning_rate": 0.0007579610063444664, + "loss": 0.78982013, + "num_input_tokens_seen": 149831648, + "router_z_loss_mlp": 0.44580078, + "step": 1807, + "time_per_iteration": 2.7339653968811035 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01056132, + "balance_loss_mlp": 1.01154768, + "epoch": 0.34782608695652173, + "flos": 915115845888.0, + "grad_norm": 0.03414685220945043, + "language_loss": 0.88006967, + "learning_rate": 0.0007576940780669712, + "loss": 0.89063108, + "num_input_tokens_seen": 149919440, + "router_z_loss_mlp": 0.4465332, + "step": 1808, + "time_per_iteration": 3.211806058883667 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051889, + "balance_loss_mlp": 1.00756717, + "epoch": 0.3480184686417853, + "flos": 775084240128.0, + "grad_norm": 0.07111913657628408, + "language_loss": 0.84903318, + "learning_rate": 0.0007574270497437624, + "loss": 0.85955209, + "num_input_tokens_seen": 150001632, + "router_z_loss_mlp": 0.4440918, + "step": 1809, + "time_per_iteration": 2.984511375427246 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049455, + "balance_loss_mlp": 1.00518048, + "epoch": 0.34821085032704885, + "flos": 578004735744.0, + "grad_norm": 0.031195535995176178, + "language_loss": 0.88877916, + "learning_rate": 0.000757159921478509, + "loss": 0.89927369, + "num_input_tokens_seen": 150077552, + "router_z_loss_mlp": 0.44360352, + "step": 1810, + "time_per_iteration": 2.778917074203491 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051422, + "balance_loss_mlp": 1.00888824, + "epoch": 0.34840323201231244, + "flos": 1528042205952.0, + "grad_norm": 0.009192534613281171, + "language_loss": 0.74450636, + "learning_rate": 0.0007568926933749201, + "loss": 0.75502062, + "num_input_tokens_seen": 150295328, + "router_z_loss_mlp": 0.42578125, + "step": 1811, + "time_per_iteration": 4.791734218597412 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048241, + "balance_loss_mlp": 1.0040617, + "epoch": 0.34859561369757597, + "flos": 510182410752.0, + "grad_norm": 0.038842956055274956, + "language_loss": 0.88272417, + "learning_rate": 0.0007566253655367423, + "loss": 0.89320654, + "num_input_tokens_seen": 150360496, + "router_z_loss_mlp": 0.44262695, + "step": 1812, + "time_per_iteration": 2.6542506217956543 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050997, + "balance_loss_mlp": 1.00689006, + "epoch": 0.34878799538283956, + "flos": 549757341696.0, + "grad_norm": 0.030689577509801048, + "language_loss": 0.90222162, + "learning_rate": 0.000756357938067762, + "loss": 0.91273159, + "num_input_tokens_seen": 150432064, + "router_z_loss_mlp": 0.44189453, + "step": 1813, + "time_per_iteration": 2.6897120475769043 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047642, + "balance_loss_mlp": 1.00346339, + "epoch": 0.34898037706810314, + "flos": 985195321344.0, + "grad_norm": 0.03422241032564105, + "language_loss": 0.83499646, + "learning_rate": 0.0007560904110718033, + "loss": 0.84547287, + "num_input_tokens_seen": 150512176, + "router_z_loss_mlp": 0.44262695, + "step": 1814, + "time_per_iteration": 3.3129422664642334 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045129, + "balance_loss_mlp": 1.00102115, + "epoch": 0.3491727587533667, + "flos": 682837672704.0, + "grad_norm": 0.03439092984945392, + "language_loss": 0.84187126, + "learning_rate": 0.0007558227846527297, + "loss": 0.85232258, + "num_input_tokens_seen": 150586416, + "router_z_loss_mlp": 0.44189453, + "step": 1815, + "time_per_iteration": 2.8228747844696045 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052765, + "balance_loss_mlp": 1.00880051, + "epoch": 0.34936514043863026, + "flos": 394889997312.0, + "grad_norm": 0.04066201843968592, + "language_loss": 0.84257603, + "learning_rate": 0.0007555550589144429, + "loss": 0.8531037, + "num_input_tokens_seen": 150648944, + "router_z_loss_mlp": 0.44042969, + "step": 1816, + "time_per_iteration": 2.4170055389404297 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053968, + "balance_loss_mlp": 1.01000416, + "epoch": 0.3495575221238938, + "flos": 462340992000.0, + "grad_norm": 0.036355924698056825, + "language_loss": 0.84744954, + "learning_rate": 0.000755287233960883, + "loss": 0.85798925, + "num_input_tokens_seen": 150717200, + "router_z_loss_mlp": 0.44042969, + "step": 1817, + "time_per_iteration": 2.577195405960083 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055263, + "balance_loss_mlp": 1.01115596, + "epoch": 0.3497499038091574, + "flos": 725429911296.0, + "grad_norm": 0.037028935917378006, + "language_loss": 0.78975379, + "learning_rate": 0.0007550193098960292, + "loss": 0.80030644, + "num_input_tokens_seen": 150790368, + "router_z_loss_mlp": 0.44189453, + "step": 1818, + "time_per_iteration": 2.9124276638031006 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050391, + "balance_loss_mlp": 1.00609303, + "epoch": 0.3499422854944209, + "flos": 829197456384.0, + "grad_norm": 0.03031702063556045, + "language_loss": 0.8721534, + "learning_rate": 0.0007547512868238988, + "loss": 0.88265729, + "num_input_tokens_seen": 150879872, + "router_z_loss_mlp": 0.44384766, + "step": 1819, + "time_per_iteration": 3.1275570392608643 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046383, + "balance_loss_mlp": 1.00203693, + "epoch": 0.3501346671796845, + "flos": 494543740416.0, + "grad_norm": 0.03689243892136314, + "language_loss": 0.8434422, + "learning_rate": 0.0007544831648485473, + "loss": 0.85390604, + "num_input_tokens_seen": 150953712, + "router_z_loss_mlp": 0.44433594, + "step": 1820, + "time_per_iteration": 2.6672415733337402 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053354, + "balance_loss_mlp": 1.00917482, + "epoch": 0.35032704886494803, + "flos": 579849726720.0, + "grad_norm": 0.04031883928972686, + "language_loss": 0.8166672, + "learning_rate": 0.0007542149440740694, + "loss": 0.82720077, + "num_input_tokens_seen": 151026192, + "router_z_loss_mlp": 0.44262695, + "step": 1821, + "time_per_iteration": 2.659205436706543 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051466, + "balance_loss_mlp": 1.0069536, + "epoch": 0.3505194305502116, + "flos": 585832819200.0, + "grad_norm": 0.035872862949689145, + "language_loss": 0.86380953, + "learning_rate": 0.000753946624604597, + "loss": 0.8743242, + "num_input_tokens_seen": 151100720, + "router_z_loss_mlp": 0.44604492, + "step": 1822, + "time_per_iteration": 2.748387575149536 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049848, + "balance_loss_mlp": 1.00528705, + "epoch": 0.3507118122354752, + "flos": 527979076608.0, + "grad_norm": 0.036265727976650085, + "language_loss": 0.88431466, + "learning_rate": 0.0007536782065443015, + "loss": 0.89481318, + "num_input_tokens_seen": 151166032, + "router_z_loss_mlp": 0.44628906, + "step": 1823, + "time_per_iteration": 2.608429193496704 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054753, + "balance_loss_mlp": 1.00997818, + "epoch": 0.35090419392073874, + "flos": 512546486016.0, + "grad_norm": 0.039277226542114754, + "language_loss": 0.75647306, + "learning_rate": 0.0007534096899973919, + "loss": 0.76702058, + "num_input_tokens_seen": 151232208, + "router_z_loss_mlp": 0.44799805, + "step": 1824, + "time_per_iteration": 2.702721118927002 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049221, + "balance_loss_mlp": 1.0046134, + "epoch": 0.3510965756060023, + "flos": 565196735232.0, + "grad_norm": 0.031185756782702443, + "language_loss": 0.83427215, + "learning_rate": 0.0007531410750681154, + "loss": 0.84476435, + "num_input_tokens_seen": 151308128, + "router_z_loss_mlp": 0.44677734, + "step": 1825, + "time_per_iteration": 2.7568912506103516 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053165, + "balance_loss_mlp": 1.00831807, + "epoch": 0.35128895729126586, + "flos": 1022254532352.0, + "grad_norm": 0.030666943866844928, + "language_loss": 0.87304175, + "learning_rate": 0.0007528723618607575, + "loss": 0.88357341, + "num_input_tokens_seen": 151402560, + "router_z_loss_mlp": 0.44848633, + "step": 1826, + "time_per_iteration": 3.4575371742248535 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049762, + "balance_loss_mlp": 1.00510669, + "epoch": 0.35148133897652944, + "flos": 589425591552.0, + "grad_norm": 0.04947505148138052, + "language_loss": 0.83428013, + "learning_rate": 0.0007526035504796422, + "loss": 0.84477776, + "num_input_tokens_seen": 151478816, + "router_z_loss_mlp": 0.44702148, + "step": 1827, + "time_per_iteration": 2.7913553714752197 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053021, + "balance_loss_mlp": 1.00838912, + "epoch": 0.351673720661793, + "flos": 496286664192.0, + "grad_norm": 0.03604129919469899, + "language_loss": 0.87358594, + "learning_rate": 0.0007523346410291312, + "loss": 0.88411617, + "num_input_tokens_seen": 151554528, + "router_z_loss_mlp": 0.44702148, + "step": 1828, + "time_per_iteration": 2.769817590713501 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049932, + "balance_loss_mlp": 1.00499058, + "epoch": 0.35186610234705656, + "flos": 763999721472.0, + "grad_norm": 0.036507155273352104, + "language_loss": 0.85486639, + "learning_rate": 0.0007520656336136245, + "loss": 0.86536574, + "num_input_tokens_seen": 151629440, + "router_z_loss_mlp": 0.44921875, + "step": 1829, + "time_per_iteration": 2.960890293121338 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048327, + "balance_loss_mlp": 1.00364745, + "epoch": 0.3520584840323201, + "flos": 627389801472.0, + "grad_norm": 0.0323509050656096, + "language_loss": 0.88885164, + "learning_rate": 0.0007517965283375599, + "loss": 0.89933491, + "num_input_tokens_seen": 151708544, + "router_z_loss_mlp": 0.44702148, + "step": 1830, + "time_per_iteration": 2.868405818939209 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047282, + "balance_loss_mlp": 1.00260293, + "epoch": 0.3522508657175837, + "flos": 538449246720.0, + "grad_norm": 0.03139560131485747, + "language_loss": 0.89993465, + "learning_rate": 0.0007515273253054132, + "loss": 0.91040754, + "num_input_tokens_seen": 151779152, + "router_z_loss_mlp": 0.44726562, + "step": 1831, + "time_per_iteration": 2.6341445446014404 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104766, + "balance_loss_mlp": 1.00298083, + "epoch": 0.35244324740284727, + "flos": 568502747904.0, + "grad_norm": 0.03545868131612223, + "language_loss": 0.83198845, + "learning_rate": 0.0007512580246216988, + "loss": 0.8424651, + "num_input_tokens_seen": 151853216, + "router_z_loss_mlp": 0.44726562, + "step": 1832, + "time_per_iteration": 2.691678524017334 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053022, + "balance_loss_mlp": 1.00860476, + "epoch": 0.3526356290881108, + "flos": 514055139840.0, + "grad_norm": 0.03517539350184397, + "language_loss": 0.85415643, + "learning_rate": 0.000750988626390968, + "loss": 0.86468661, + "num_input_tokens_seen": 151920416, + "router_z_loss_mlp": 0.44506836, + "step": 1833, + "time_per_iteration": 2.6027944087982178 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050095, + "balance_loss_mlp": 1.00577271, + "epoch": 0.3528280107733744, + "flos": 596973718272.0, + "grad_norm": 0.033457257877764275, + "language_loss": 0.85569251, + "learning_rate": 0.0007507191307178108, + "loss": 0.86619347, + "num_input_tokens_seen": 151990848, + "router_z_loss_mlp": 0.4440918, + "step": 1834, + "time_per_iteration": 2.8065004348754883 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054314, + "balance_loss_mlp": 1.00999165, + "epoch": 0.3530203924586379, + "flos": 552299306496.0, + "grad_norm": 0.040042804692427734, + "language_loss": 0.75668854, + "learning_rate": 0.0007504495377068543, + "loss": 0.76723164, + "num_input_tokens_seen": 152064864, + "router_z_loss_mlp": 0.4440918, + "step": 1835, + "time_per_iteration": 2.736536741256714 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052598, + "balance_loss_mlp": 1.00832355, + "epoch": 0.3532127741439015, + "flos": 654306431232.0, + "grad_norm": 0.0387965270782292, + "language_loss": 0.82353514, + "learning_rate": 0.0007501798474627642, + "loss": 0.83406115, + "num_input_tokens_seen": 152150096, + "router_z_loss_mlp": 0.44360352, + "step": 1836, + "time_per_iteration": 2.9019014835357666 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052616, + "balance_loss_mlp": 1.00824583, + "epoch": 0.35340515582916504, + "flos": 724151636736.0, + "grad_norm": 0.03634896017563763, + "language_loss": 0.84383756, + "learning_rate": 0.0007499100600902433, + "loss": 0.85436368, + "num_input_tokens_seen": 152232528, + "router_z_loss_mlp": 0.44458008, + "step": 1837, + "time_per_iteration": 3.0071663856506348 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105242, + "balance_loss_mlp": 1.00812232, + "epoch": 0.35359753751442863, + "flos": 595998733056.0, + "grad_norm": 0.039287132740407786, + "language_loss": 0.853827, + "learning_rate": 0.0007496401756940324, + "loss": 0.86435115, + "num_input_tokens_seen": 152299584, + "router_z_loss_mlp": 0.44384766, + "step": 1838, + "time_per_iteration": 2.6924545764923096 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052318, + "balance_loss_mlp": 1.00780547, + "epoch": 0.3537899191996922, + "flos": 633806440704.0, + "grad_norm": 0.041905435038062475, + "language_loss": 0.83424079, + "learning_rate": 0.0007493701943789098, + "loss": 0.84476393, + "num_input_tokens_seen": 152370368, + "router_z_loss_mlp": 0.44580078, + "step": 1839, + "time_per_iteration": 2.744781970977783 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051713, + "balance_loss_mlp": 1.00727141, + "epoch": 0.35398230088495575, + "flos": 507353686272.0, + "grad_norm": 0.0353986915713622, + "language_loss": 0.8339026, + "learning_rate": 0.000749100116249692, + "loss": 0.84441972, + "num_input_tokens_seen": 152436928, + "router_z_loss_mlp": 0.44506836, + "step": 1840, + "time_per_iteration": 2.5823822021484375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049367, + "balance_loss_mlp": 1.00490189, + "epoch": 0.35417468257021933, + "flos": 509047032576.0, + "grad_norm": 0.03988576427868324, + "language_loss": 0.86907303, + "learning_rate": 0.0007488299414112321, + "loss": 0.87956673, + "num_input_tokens_seen": 152505952, + "router_z_loss_mlp": 0.4453125, + "step": 1841, + "time_per_iteration": 2.6171295642852783 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055735, + "balance_loss_mlp": 1.01126969, + "epoch": 0.35436706425548287, + "flos": 657660076032.0, + "grad_norm": 0.035376771477334756, + "language_loss": 0.78015333, + "learning_rate": 0.0007485596699684215, + "loss": 0.79071069, + "num_input_tokens_seen": 152577408, + "router_z_loss_mlp": 0.44555664, + "step": 1842, + "time_per_iteration": 2.8393046855926514 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070571, + "balance_loss_mlp": 1.02572489, + "epoch": 0.35455944594074645, + "flos": 653889414144.0, + "grad_norm": 0.03498191670442302, + "language_loss": 0.86517459, + "learning_rate": 0.000748289302026189, + "loss": 0.87588024, + "num_input_tokens_seen": 152654480, + "router_z_loss_mlp": 0.44848633, + "step": 1843, + "time_per_iteration": 2.8524656295776367 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060154, + "balance_loss_mlp": 1.01566541, + "epoch": 0.35475182762601, + "flos": 850011429888.0, + "grad_norm": 0.03510464987001869, + "language_loss": 0.86422503, + "learning_rate": 0.0007480188376895004, + "loss": 0.87482655, + "num_input_tokens_seen": 152732304, + "router_z_loss_mlp": 0.4453125, + "step": 1844, + "time_per_iteration": 3.1228320598602295 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048378, + "balance_loss_mlp": 1.00584412, + "epoch": 0.3549442093112736, + "flos": 1524777989376.0, + "grad_norm": 0.00626506088035535, + "language_loss": 0.7381134, + "learning_rate": 0.0007477482770633596, + "loss": 0.74859715, + "num_input_tokens_seen": 152965952, + "router_z_loss_mlp": 0.42578125, + "step": 1845, + "time_per_iteration": 4.8881309032440186 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053267, + "balance_loss_mlp": 1.00906432, + "epoch": 0.3551365909965371, + "flos": 652715152128.0, + "grad_norm": 0.03760423595997357, + "language_loss": 0.78996736, + "learning_rate": 0.0007474776202528074, + "loss": 0.80050004, + "num_input_tokens_seen": 153053088, + "router_z_loss_mlp": 0.44287109, + "step": 1846, + "time_per_iteration": 2.9740474224090576 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055296, + "balance_loss_mlp": 1.01118839, + "epoch": 0.3553289726818007, + "flos": 898923098112.0, + "grad_norm": 0.04404679517400465, + "language_loss": 0.81547415, + "learning_rate": 0.000747206867362922, + "loss": 0.82602704, + "num_input_tokens_seen": 153129216, + "router_z_loss_mlp": 0.44189453, + "step": 1847, + "time_per_iteration": 3.0834994316101074 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052455, + "balance_loss_mlp": 1.00822854, + "epoch": 0.3555213543670643, + "flos": 689734512384.0, + "grad_norm": 0.03965516085145463, + "language_loss": 0.8451193, + "learning_rate": 0.0007469360184988194, + "loss": 0.85564387, + "num_input_tokens_seen": 153199360, + "router_z_loss_mlp": 0.44311523, + "step": 1848, + "time_per_iteration": 2.8074848651885986 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050526, + "balance_loss_mlp": 1.00632286, + "epoch": 0.3557137360523278, + "flos": 539604066816.0, + "grad_norm": 0.033414642983477745, + "language_loss": 0.87585986, + "learning_rate": 0.0007466650737656518, + "loss": 0.88636506, + "num_input_tokens_seen": 153269168, + "router_z_loss_mlp": 0.44287109, + "step": 1849, + "time_per_iteration": 2.604926347732544 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049824, + "balance_loss_mlp": 1.00562072, + "epoch": 0.3559061177375914, + "flos": 403154539776.0, + "grad_norm": 0.03235738057519393, + "language_loss": 0.9068622, + "learning_rate": 0.0007463940332686098, + "loss": 0.91736042, + "num_input_tokens_seen": 153333120, + "router_z_loss_mlp": 0.44287109, + "step": 1850, + "time_per_iteration": 2.4913558959960938 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01056333, + "balance_loss_mlp": 1.01196373, + "epoch": 0.35609849942285493, + "flos": 697895042304.0, + "grad_norm": 0.0320980052654178, + "language_loss": 0.85078359, + "learning_rate": 0.0007461228971129205, + "loss": 0.86134696, + "num_input_tokens_seen": 153407600, + "router_z_loss_mlp": 0.44458008, + "step": 1851, + "time_per_iteration": 2.898726463317871 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059942, + "balance_loss_mlp": 1.01557255, + "epoch": 0.3562908811081185, + "flos": 570002653440.0, + "grad_norm": 0.036011031747473804, + "language_loss": 0.86088216, + "learning_rate": 0.0007458516654038483, + "loss": 0.87148154, + "num_input_tokens_seen": 153477408, + "router_z_loss_mlp": 0.44458008, + "step": 1852, + "time_per_iteration": 2.6340625286102295 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050769, + "balance_loss_mlp": 1.00651896, + "epoch": 0.35648326279338205, + "flos": 683610468864.0, + "grad_norm": 0.03085087761867809, + "language_loss": 0.87196577, + "learning_rate": 0.0007455803382466946, + "loss": 0.88247347, + "num_input_tokens_seen": 153551888, + "router_z_loss_mlp": 0.44335938, + "step": 1853, + "time_per_iteration": 2.7936782836914062 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048744, + "balance_loss_mlp": 1.00468445, + "epoch": 0.35667564447864564, + "flos": 630341980416.0, + "grad_norm": 0.02905562967314866, + "language_loss": 0.8756358, + "learning_rate": 0.0007453089157467979, + "loss": 0.88612318, + "num_input_tokens_seen": 153626912, + "router_z_loss_mlp": 0.44140625, + "step": 1854, + "time_per_iteration": 2.8003768920898438 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053337, + "balance_loss_mlp": 1.00920558, + "epoch": 0.35686802616390917, + "flos": 815505844224.0, + "grad_norm": 0.03187136352260198, + "language_loss": 0.82840991, + "learning_rate": 0.0007450373980095341, + "loss": 0.83894324, + "num_input_tokens_seen": 153711312, + "router_z_loss_mlp": 0.44213867, + "step": 1855, + "time_per_iteration": 3.072218179702759 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052734, + "balance_loss_mlp": 1.00869787, + "epoch": 0.35706040784917276, + "flos": 527206280448.0, + "grad_norm": 0.03314729603592228, + "language_loss": 0.87318838, + "learning_rate": 0.0007447657851403155, + "loss": 0.88371575, + "num_input_tokens_seen": 153780208, + "router_z_loss_mlp": 0.44116211, + "step": 1856, + "time_per_iteration": 2.5849640369415283 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047421, + "balance_loss_mlp": 1.00338531, + "epoch": 0.35725278953443634, + "flos": 513065570304.0, + "grad_norm": 0.033114806318055315, + "language_loss": 0.79136717, + "learning_rate": 0.0007444940772445915, + "loss": 0.80184138, + "num_input_tokens_seen": 153853152, + "router_z_loss_mlp": 0.44116211, + "step": 1857, + "time_per_iteration": 2.729100227355957 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048076, + "balance_loss_mlp": 1.00404048, + "epoch": 0.3574451712196999, + "flos": 488493573888.0, + "grad_norm": 0.030889137628629628, + "language_loss": 0.80389744, + "learning_rate": 0.0007442222744278484, + "loss": 0.81437826, + "num_input_tokens_seen": 153924160, + "router_z_loss_mlp": 0.44116211, + "step": 1858, + "time_per_iteration": 2.673224687576294 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048438, + "balance_loss_mlp": 1.00433075, + "epoch": 0.35763755290496346, + "flos": 551822018304.0, + "grad_norm": 0.029026961526961815, + "language_loss": 0.8481214, + "learning_rate": 0.0007439503767956099, + "loss": 0.8586058, + "num_input_tokens_seen": 153998688, + "router_z_loss_mlp": 0.44189453, + "step": 1859, + "time_per_iteration": 2.7095680236816406 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104821, + "balance_loss_mlp": 1.00567627, + "epoch": 0.357829934590227, + "flos": 1507228232448.0, + "grad_norm": 0.007157576597672099, + "language_loss": 0.79671603, + "learning_rate": 0.0007436783844534352, + "loss": 0.80719817, + "num_input_tokens_seen": 154230960, + "router_z_loss_mlp": 0.42578125, + "step": 1860, + "time_per_iteration": 4.909587383270264 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049455, + "balance_loss_mlp": 1.00549006, + "epoch": 0.3580223162754906, + "flos": 569842260480.0, + "grad_norm": 0.027013738684289513, + "language_loss": 0.86190987, + "learning_rate": 0.000743406297506922, + "loss": 0.87240434, + "num_input_tokens_seen": 154309104, + "router_z_loss_mlp": 0.44042969, + "step": 1861, + "time_per_iteration": 2.7355735301971436 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104917, + "balance_loss_mlp": 1.00518215, + "epoch": 0.3582146979607541, + "flos": 627761131776.0, + "grad_norm": 0.0339710504259095, + "language_loss": 0.84903038, + "learning_rate": 0.0007431341160617031, + "loss": 0.8595221, + "num_input_tokens_seen": 154387424, + "router_z_loss_mlp": 0.44067383, + "step": 1862, + "time_per_iteration": 2.8932178020477295 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054928, + "balance_loss_mlp": 1.01082051, + "epoch": 0.3584070796460177, + "flos": 508319923200.0, + "grad_norm": 0.030700215862736833, + "language_loss": 0.88826722, + "learning_rate": 0.0007428618402234491, + "loss": 0.89881647, + "num_input_tokens_seen": 154459952, + "router_z_loss_mlp": 0.44189453, + "step": 1863, + "time_per_iteration": 2.6574699878692627 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105281, + "balance_loss_mlp": 1.00882196, + "epoch": 0.3585994613312813, + "flos": 607641219840.0, + "grad_norm": 0.030466419719222444, + "language_loss": 0.80836076, + "learning_rate": 0.0007425894700978668, + "loss": 0.8188889, + "num_input_tokens_seen": 154535456, + "router_z_loss_mlp": 0.44067383, + "step": 1864, + "time_per_iteration": 2.7388875484466553 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048686, + "balance_loss_mlp": 1.00467396, + "epoch": 0.3587918430165448, + "flos": 1415089579776.0, + "grad_norm": 0.030441642762586523, + "language_loss": 0.8033703, + "learning_rate": 0.0007423170057906996, + "loss": 0.8138572, + "num_input_tokens_seen": 154627568, + "router_z_loss_mlp": 0.44091797, + "step": 1865, + "time_per_iteration": 3.8431384563446045 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044851, + "balance_loss_mlp": 1.00086236, + "epoch": 0.3589842247018084, + "flos": 479514561024.0, + "grad_norm": 0.03198832631900347, + "language_loss": 0.8674798, + "learning_rate": 0.0007420444474077275, + "loss": 0.87792838, + "num_input_tokens_seen": 154694640, + "router_z_loss_mlp": 0.44067383, + "step": 1866, + "time_per_iteration": 2.5487258434295654 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046851, + "balance_loss_mlp": 1.0028863, + "epoch": 0.35917660638707194, + "flos": 505706026752.0, + "grad_norm": 0.036738697797889144, + "language_loss": 0.90374953, + "learning_rate": 0.0007417717950547671, + "loss": 0.91421801, + "num_input_tokens_seen": 154762048, + "router_z_loss_mlp": 0.44042969, + "step": 1867, + "time_per_iteration": 2.6784894466400146 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052044, + "balance_loss_mlp": 1.00960541, + "epoch": 0.3593689880723355, + "flos": 1495484645376.0, + "grad_norm": 0.0080630279180651, + "language_loss": 0.75996608, + "learning_rate": 0.0007414990488376713, + "loss": 0.77048653, + "num_input_tokens_seen": 154989952, + "router_z_loss_mlp": 0.42480469, + "step": 1868, + "time_per_iteration": 4.930212497711182 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104515, + "balance_loss_mlp": 1.00118589, + "epoch": 0.35956136975759906, + "flos": 529672422912.0, + "grad_norm": 0.03031015371847706, + "language_loss": 0.85577166, + "learning_rate": 0.0007412262088623299, + "loss": 0.86622322, + "num_input_tokens_seen": 155066992, + "router_z_loss_mlp": 0.44042969, + "step": 1869, + "time_per_iteration": 2.73066782951355 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047751, + "balance_loss_mlp": 1.00385797, + "epoch": 0.35975375144286265, + "flos": 536000600832.0, + "grad_norm": 0.03552204952813077, + "language_loss": 0.80084878, + "learning_rate": 0.0007409532752346684, + "loss": 0.81132627, + "num_input_tokens_seen": 155137616, + "router_z_loss_mlp": 0.43969727, + "step": 1870, + "time_per_iteration": 2.6379218101501465 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050376, + "balance_loss_mlp": 1.00638759, + "epoch": 0.3599461331281262, + "flos": 505929603072.0, + "grad_norm": 0.028943079800369927, + "language_loss": 0.8876543, + "learning_rate": 0.0007406802480606491, + "loss": 0.89815807, + "num_input_tokens_seen": 155209248, + "router_z_loss_mlp": 0.44067383, + "step": 1871, + "time_per_iteration": 2.6258225440979004 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049318, + "balance_loss_mlp": 1.00547302, + "epoch": 0.36013851481338977, + "flos": 512537737728.0, + "grad_norm": 0.03609789661305553, + "language_loss": 0.91903639, + "learning_rate": 0.0007404071274462707, + "loss": 0.92952955, + "num_input_tokens_seen": 155274176, + "router_z_loss_mlp": 0.43920898, + "step": 1872, + "time_per_iteration": 2.6111674308776855 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049057, + "balance_loss_mlp": 1.00494921, + "epoch": 0.36033089649865335, + "flos": 548632657152.0, + "grad_norm": 0.03255043761438457, + "language_loss": 0.84506214, + "learning_rate": 0.0007401339134975682, + "loss": 0.85555267, + "num_input_tokens_seen": 155343232, + "router_z_loss_mlp": 0.44189453, + "step": 1873, + "time_per_iteration": 2.6355786323547363 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049816, + "balance_loss_mlp": 1.00575614, + "epoch": 0.3605232781839169, + "flos": 459614334720.0, + "grad_norm": 0.03456024010205507, + "language_loss": 0.84983587, + "learning_rate": 0.0007398606063206122, + "loss": 0.86033404, + "num_input_tokens_seen": 155410080, + "router_z_loss_mlp": 0.44140625, + "step": 1874, + "time_per_iteration": 2.5788064002990723 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049812, + "balance_loss_mlp": 1.00577569, + "epoch": 0.36071565986918047, + "flos": 510564434688.0, + "grad_norm": 0.03262157431229983, + "language_loss": 0.79280519, + "learning_rate": 0.0007395872060215101, + "loss": 0.80330336, + "num_input_tokens_seen": 155476240, + "router_z_loss_mlp": 0.44116211, + "step": 1875, + "time_per_iteration": 2.59242582321167 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051751, + "balance_loss_mlp": 1.00785792, + "epoch": 0.360908041554444, + "flos": 560257647360.0, + "grad_norm": 0.03426029536230158, + "language_loss": 0.89306337, + "learning_rate": 0.0007393137127064056, + "loss": 0.9035809, + "num_input_tokens_seen": 155543392, + "router_z_loss_mlp": 0.43969727, + "step": 1876, + "time_per_iteration": 2.6217613220214844 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049616, + "balance_loss_mlp": 1.00577068, + "epoch": 0.3611004232397076, + "flos": 524879143680.0, + "grad_norm": 0.03313366432597027, + "language_loss": 0.84778088, + "learning_rate": 0.0007390401264814779, + "loss": 0.85827708, + "num_input_tokens_seen": 155613264, + "router_z_loss_mlp": 0.43920898, + "step": 1877, + "time_per_iteration": 2.621366262435913 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051372, + "balance_loss_mlp": 1.00752687, + "epoch": 0.3612928049249711, + "flos": 542033270784.0, + "grad_norm": 0.036139064810301956, + "language_loss": 0.85492337, + "learning_rate": 0.0007387664474529427, + "loss": 0.86543715, + "num_input_tokens_seen": 155683712, + "router_z_loss_mlp": 0.43920898, + "step": 1878, + "time_per_iteration": 2.6200942993164062 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051486, + "balance_loss_mlp": 1.00776029, + "epoch": 0.3614851866102347, + "flos": 553630070784.0, + "grad_norm": 0.03346030230294773, + "language_loss": 0.91826439, + "learning_rate": 0.0007384926757270518, + "loss": 0.92877924, + "num_input_tokens_seen": 155751760, + "router_z_loss_mlp": 0.43798828, + "step": 1879, + "time_per_iteration": 2.6367645263671875 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048472, + "balance_loss_mlp": 1.00481761, + "epoch": 0.36167756829549824, + "flos": 773427832320.0, + "grad_norm": 0.030641441804162946, + "language_loss": 0.80120707, + "learning_rate": 0.0007382188114100924, + "loss": 0.81169182, + "num_input_tokens_seen": 155830464, + "router_z_loss_mlp": 0.43725586, + "step": 1880, + "time_per_iteration": 2.9662272930145264 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048714, + "balance_loss_mlp": 1.0051316, + "epoch": 0.36186994998076183, + "flos": 713188627200.0, + "grad_norm": 0.030233131555612264, + "language_loss": 0.82161707, + "learning_rate": 0.0007379448546083884, + "loss": 0.83210421, + "num_input_tokens_seen": 155906208, + "router_z_loss_mlp": 0.43652344, + "step": 1881, + "time_per_iteration": 2.9433577060699463 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104791, + "balance_loss_mlp": 1.00420797, + "epoch": 0.3620623316660254, + "flos": 748901522688.0, + "grad_norm": 0.028477152913266954, + "language_loss": 0.88624489, + "learning_rate": 0.0007376708054282992, + "loss": 0.89672405, + "num_input_tokens_seen": 155983584, + "router_z_loss_mlp": 0.43774414, + "step": 1882, + "time_per_iteration": 2.9565789699554443 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047818, + "balance_loss_mlp": 1.00425851, + "epoch": 0.36225471335128895, + "flos": 483535044096.0, + "grad_norm": 0.03088815199044137, + "language_loss": 0.84632647, + "learning_rate": 0.0007373966639762201, + "loss": 0.85680467, + "num_input_tokens_seen": 156052464, + "router_z_loss_mlp": 0.4362793, + "step": 1883, + "time_per_iteration": 2.6308107376098633 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051227, + "balance_loss_mlp": 1.00762069, + "epoch": 0.36244709503655254, + "flos": 507911654400.0, + "grad_norm": 0.045291722940018896, + "language_loss": 0.89109468, + "learning_rate": 0.0007371224303585822, + "loss": 0.90160698, + "num_input_tokens_seen": 156121424, + "router_z_loss_mlp": 0.43676758, + "step": 1884, + "time_per_iteration": 2.5738682746887207 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053806, + "balance_loss_mlp": 1.01194, + "epoch": 0.36263947672181607, + "flos": 1397054741760.0, + "grad_norm": 0.007615502937667497, + "language_loss": 0.80357069, + "learning_rate": 0.0007368481046818524, + "loss": 0.81410873, + "num_input_tokens_seen": 156346144, + "router_z_loss_mlp": 0.41894531, + "step": 1885, + "time_per_iteration": 4.7547221183776855 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105859, + "balance_loss_mlp": 1.01500738, + "epoch": 0.36283185840707965, + "flos": 654523204608.0, + "grad_norm": 0.03432185210428161, + "language_loss": 0.83272493, + "learning_rate": 0.0007365736870525335, + "loss": 0.84331077, + "num_input_tokens_seen": 156420880, + "router_z_loss_mlp": 0.43652344, + "step": 1886, + "time_per_iteration": 2.8305654525756836 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049425, + "balance_loss_mlp": 1.00591362, + "epoch": 0.3630242400923432, + "flos": 489845725440.0, + "grad_norm": 0.036050619102321185, + "language_loss": 0.8310129, + "learning_rate": 0.000736299177577164, + "loss": 0.84150714, + "num_input_tokens_seen": 156485616, + "router_z_loss_mlp": 0.43579102, + "step": 1887, + "time_per_iteration": 2.632485866546631 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105207, + "balance_loss_mlp": 1.00853443, + "epoch": 0.3632166217776068, + "flos": 518232125184.0, + "grad_norm": 0.034844830144856315, + "language_loss": 0.84275633, + "learning_rate": 0.0007360245763623174, + "loss": 0.85327709, + "num_input_tokens_seen": 156557840, + "router_z_loss_mlp": 0.43603516, + "step": 1888, + "time_per_iteration": 2.6480350494384766 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049354, + "balance_loss_mlp": 1.00596213, + "epoch": 0.36340900346287036, + "flos": 647348353536.0, + "grad_norm": 0.03423797247490227, + "language_loss": 0.90607542, + "learning_rate": 0.0007357498835146039, + "loss": 0.91656893, + "num_input_tokens_seen": 156632496, + "router_z_loss_mlp": 0.43457031, + "step": 1889, + "time_per_iteration": 2.8152430057525635 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055179, + "balance_loss_mlp": 1.01154852, + "epoch": 0.3636013851481339, + "flos": 554411615232.0, + "grad_norm": 0.0362068794335816, + "language_loss": 0.87730169, + "learning_rate": 0.0007354750991406684, + "loss": 0.8878535, + "num_input_tokens_seen": 156705296, + "router_z_loss_mlp": 0.43701172, + "step": 1890, + "time_per_iteration": 2.71056866645813 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047867, + "balance_loss_mlp": 1.0042125, + "epoch": 0.3637937668333975, + "flos": 547692665088.0, + "grad_norm": 0.03762567530645649, + "language_loss": 0.81321651, + "learning_rate": 0.0007352002233471919, + "loss": 0.82369518, + "num_input_tokens_seen": 156773376, + "router_z_loss_mlp": 0.43725586, + "step": 1891, + "time_per_iteration": 2.6590068340301514 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054921, + "balance_loss_mlp": 1.01098096, + "epoch": 0.363986148518661, + "flos": 539211349248.0, + "grad_norm": 0.036762310622647384, + "language_loss": 0.79772675, + "learning_rate": 0.0007349252562408906, + "loss": 0.808276, + "num_input_tokens_seen": 156844336, + "router_z_loss_mlp": 0.44018555, + "step": 1892, + "time_per_iteration": 2.715721368789673 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0111044, + "balance_loss_mlp": 1.0663805, + "epoch": 0.3641785302039246, + "flos": 661511417856.0, + "grad_norm": 0.04360229312277944, + "language_loss": 0.82000142, + "learning_rate": 0.0007346501979285158, + "loss": 0.83110583, + "num_input_tokens_seen": 156918848, + "router_z_loss_mlp": 0.44140625, + "step": 1893, + "time_per_iteration": 2.927184820175171 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061302, + "balance_loss_mlp": 1.01934052, + "epoch": 0.36437091188918813, + "flos": 1472084965632.0, + "grad_norm": 0.015393341944361743, + "language_loss": 0.80539101, + "learning_rate": 0.0007343750485168551, + "loss": 0.81600404, + "num_input_tokens_seen": 157134736, + "router_z_loss_mlp": 0.41992188, + "step": 1894, + "time_per_iteration": 4.786630868911743 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050162, + "balance_loss_mlp": 1.00648379, + "epoch": 0.3645632935744517, + "flos": 598445433600.0, + "grad_norm": 0.030741456608760154, + "language_loss": 0.86771834, + "learning_rate": 0.0007340998081127308, + "loss": 0.87822002, + "num_input_tokens_seen": 157211920, + "router_z_loss_mlp": 0.4375, + "step": 1895, + "time_per_iteration": 2.7590408325195312 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046448, + "balance_loss_mlp": 1.00284135, + "epoch": 0.36475567525971525, + "flos": 600696748032.0, + "grad_norm": 0.032247737775586885, + "language_loss": 0.91682166, + "learning_rate": 0.0007338244768230007, + "loss": 0.92728615, + "num_input_tokens_seen": 157284224, + "router_z_loss_mlp": 0.43676758, + "step": 1896, + "time_per_iteration": 2.806001663208008 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048582, + "balance_loss_mlp": 1.00502336, + "epoch": 0.36494805694497884, + "flos": 799832180736.0, + "grad_norm": 0.03166243516623692, + "language_loss": 0.89817142, + "learning_rate": 0.0007335490547545578, + "loss": 0.90865725, + "num_input_tokens_seen": 157367920, + "router_z_loss_mlp": 0.4362793, + "step": 1897, + "time_per_iteration": 3.0448927879333496 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049034, + "balance_loss_mlp": 1.00535595, + "epoch": 0.3651404386302424, + "flos": 638478210816.0, + "grad_norm": 0.03536594015703217, + "language_loss": 0.82896376, + "learning_rate": 0.0007332735420143308, + "loss": 0.83945411, + "num_input_tokens_seen": 157438672, + "router_z_loss_mlp": 0.4375, + "step": 1898, + "time_per_iteration": 2.739990234375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047853, + "balance_loss_mlp": 1.00419891, + "epoch": 0.36533282031550596, + "flos": 492563634432.0, + "grad_norm": 0.03491103953335563, + "language_loss": 0.87321162, + "learning_rate": 0.0007329979387092826, + "loss": 0.88369012, + "num_input_tokens_seen": 157505888, + "router_z_loss_mlp": 0.43725586, + "step": 1899, + "time_per_iteration": 2.5661838054656982 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044023, + "balance_loss_mlp": 1.00020182, + "epoch": 0.36552520200076954, + "flos": 857509979136.0, + "grad_norm": 0.025671163998745472, + "language_loss": 0.84557235, + "learning_rate": 0.0007327222449464124, + "loss": 0.85601258, + "num_input_tokens_seen": 157601568, + "router_z_loss_mlp": 0.43896484, + "step": 1900, + "time_per_iteration": 3.2916476726531982 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049138, + "balance_loss_mlp": 1.00545931, + "epoch": 0.3657175836860331, + "flos": 484716109056.0, + "grad_norm": 0.033162883177173925, + "language_loss": 0.89287698, + "learning_rate": 0.0007324464608327538, + "loss": 0.90336835, + "num_input_tokens_seen": 157670992, + "router_z_loss_mlp": 0.4375, + "step": 1901, + "time_per_iteration": 2.6514644622802734 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050151, + "balance_loss_mlp": 1.00647259, + "epoch": 0.36590996537129666, + "flos": 435721815552.0, + "grad_norm": 0.0385016057803441, + "language_loss": 0.88887352, + "learning_rate": 0.0007321705864753758, + "loss": 0.89937502, + "num_input_tokens_seen": 157743616, + "router_z_loss_mlp": 0.4375, + "step": 1902, + "time_per_iteration": 2.6785683631896973 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045565, + "balance_loss_mlp": 1.00171947, + "epoch": 0.3661023470565602, + "flos": 713514270720.0, + "grad_norm": 0.027132815564249787, + "language_loss": 0.85073566, + "learning_rate": 0.0007318946219813823, + "loss": 0.86119133, + "num_input_tokens_seen": 157823520, + "router_z_loss_mlp": 0.43920898, + "step": 1903, + "time_per_iteration": 2.9874324798583984 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104835, + "balance_loss_mlp": 1.00431406, + "epoch": 0.3662947287418238, + "flos": 565823722752.0, + "grad_norm": 0.03452387251033087, + "language_loss": 0.90632051, + "learning_rate": 0.000731618567457912, + "loss": 0.91680402, + "num_input_tokens_seen": 157893248, + "router_z_loss_mlp": 0.44116211, + "step": 1904, + "time_per_iteration": 2.684290885925293 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049154, + "balance_loss_mlp": 1.00516582, + "epoch": 0.3664871104270873, + "flos": 791203110912.0, + "grad_norm": 0.032826620308443535, + "language_loss": 0.87174082, + "learning_rate": 0.000731342423012139, + "loss": 0.88223237, + "num_input_tokens_seen": 157973216, + "router_z_loss_mlp": 0.44067383, + "step": 1905, + "time_per_iteration": 3.0617177486419678 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051542, + "balance_loss_mlp": 1.00750625, + "epoch": 0.3666794921123509, + "flos": 753981561600.0, + "grad_norm": 0.03506961035904521, + "language_loss": 0.83108962, + "learning_rate": 0.0007310661887512722, + "loss": 0.84160507, + "num_input_tokens_seen": 158051088, + "router_z_loss_mlp": 0.44116211, + "step": 1906, + "time_per_iteration": 3.046901226043701 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045363, + "balance_loss_mlp": 1.0011121, + "epoch": 0.3668718737976145, + "flos": 524607935232.0, + "grad_norm": 0.03388484398579531, + "language_loss": 0.82964659, + "learning_rate": 0.0007307898647825549, + "loss": 0.84010023, + "num_input_tokens_seen": 158124368, + "router_z_loss_mlp": 0.44335938, + "step": 1907, + "time_per_iteration": 2.6592161655426025 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051828, + "balance_loss_mlp": 1.00767255, + "epoch": 0.367064255482878, + "flos": 573046205952.0, + "grad_norm": 0.03554957537225944, + "language_loss": 0.8992576, + "learning_rate": 0.0007305134512132659, + "loss": 0.90977585, + "num_input_tokens_seen": 158191472, + "router_z_loss_mlp": 0.44238281, + "step": 1908, + "time_per_iteration": 2.6961183547973633 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055033, + "balance_loss_mlp": 1.01078284, + "epoch": 0.3672566371681416, + "flos": 448054473216.0, + "grad_norm": 0.04018581054394134, + "language_loss": 0.843858, + "learning_rate": 0.0007302369481507183, + "loss": 0.85440832, + "num_input_tokens_seen": 158254384, + "router_z_loss_mlp": 0.44335938, + "step": 1909, + "time_per_iteration": 2.488203763961792 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01056026, + "balance_loss_mlp": 1.01358795, + "epoch": 0.36744901885340514, + "flos": 1543366893312.0, + "grad_norm": 0.00771809390988723, + "language_loss": 0.79961759, + "learning_rate": 0.00072996035570226, + "loss": 0.81017786, + "num_input_tokens_seen": 158486160, + "router_z_loss_mlp": 0.42480469, + "step": 1910, + "time_per_iteration": 4.828088045120239 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059395, + "balance_loss_mlp": 1.01457202, + "epoch": 0.36764140053866873, + "flos": 564762221568.0, + "grad_norm": 0.032014471163266715, + "language_loss": 0.86287534, + "learning_rate": 0.000729683673975274, + "loss": 0.87346923, + "num_input_tokens_seen": 158555616, + "router_z_loss_mlp": 0.44824219, + "step": 1911, + "time_per_iteration": 2.6982359886169434 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01058317, + "balance_loss_mlp": 1.01366162, + "epoch": 0.36783378222393226, + "flos": 1218652614144.0, + "grad_norm": 0.03007186425733569, + "language_loss": 0.8357197, + "learning_rate": 0.0007294069030771774, + "loss": 0.84630299, + "num_input_tokens_seen": 158653984, + "router_z_loss_mlp": 0.44702148, + "step": 1912, + "time_per_iteration": 3.6612210273742676 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049037, + "balance_loss_mlp": 1.0043577, + "epoch": 0.36802616390919585, + "flos": 499720988928.0, + "grad_norm": 0.03131225250708543, + "language_loss": 0.91280997, + "learning_rate": 0.0007291300431154224, + "loss": 0.92330033, + "num_input_tokens_seen": 158719728, + "router_z_loss_mlp": 0.44726562, + "step": 1913, + "time_per_iteration": 2.574129581451416 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053715, + "balance_loss_mlp": 1.01108551, + "epoch": 0.36821854559445943, + "flos": 1585618904064.0, + "grad_norm": 0.006266309435424964, + "language_loss": 0.70389736, + "learning_rate": 0.0007288530941974955, + "loss": 0.7144345, + "num_input_tokens_seen": 158952544, + "router_z_loss_mlp": 0.42675781, + "step": 1914, + "time_per_iteration": 4.960723876953125 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052577, + "balance_loss_mlp": 1.0082792, + "epoch": 0.36841092727972297, + "flos": 837090668544.0, + "grad_norm": 0.03136779226227803, + "language_loss": 0.80375087, + "learning_rate": 0.0007285760564309179, + "loss": 0.81427664, + "num_input_tokens_seen": 159039680, + "router_z_loss_mlp": 0.44384766, + "step": 1915, + "time_per_iteration": 3.0985960960388184 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010541, + "balance_loss_mlp": 1.00965917, + "epoch": 0.36860330896498655, + "flos": 691211085312.0, + "grad_norm": 0.031502418433557444, + "language_loss": 0.85988045, + "learning_rate": 0.0007282989299232448, + "loss": 0.87042141, + "num_input_tokens_seen": 159128128, + "router_z_loss_mlp": 0.4453125, + "step": 1916, + "time_per_iteration": 3.034715175628662 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055093, + "balance_loss_mlp": 1.01065195, + "epoch": 0.3687956906502501, + "flos": 555240791808.0, + "grad_norm": 0.03953946470073971, + "language_loss": 0.84794021, + "learning_rate": 0.0007280217147820668, + "loss": 0.85849106, + "num_input_tokens_seen": 159193248, + "router_z_loss_mlp": 0.4453125, + "step": 1917, + "time_per_iteration": 2.61297869682312 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053797, + "balance_loss_mlp": 1.0093317, + "epoch": 0.3689880723355137, + "flos": 577820043264.0, + "grad_norm": 0.030128455165502346, + "language_loss": 0.7994225, + "learning_rate": 0.0007277444111150079, + "loss": 0.80996048, + "num_input_tokens_seen": 159265824, + "router_z_loss_mlp": 0.44555664, + "step": 1918, + "time_per_iteration": 2.7244873046875 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052755, + "balance_loss_mlp": 1.00845671, + "epoch": 0.3691804540207772, + "flos": 529887250944.0, + "grad_norm": 0.035938670194894204, + "language_loss": 0.84948546, + "learning_rate": 0.0007274670190297272, + "loss": 0.86001301, + "num_input_tokens_seen": 159332992, + "router_z_loss_mlp": 0.44384766, + "step": 1919, + "time_per_iteration": 2.6209609508514404 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048627, + "balance_loss_mlp": 1.0041858, + "epoch": 0.3693728357060408, + "flos": 562181372928.0, + "grad_norm": 0.026922320390231402, + "language_loss": 0.82273662, + "learning_rate": 0.0007271895386339179, + "loss": 0.83322287, + "num_input_tokens_seen": 159409808, + "router_z_loss_mlp": 0.4453125, + "step": 1920, + "time_per_iteration": 2.7952609062194824 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047195, + "balance_loss_mlp": 1.00292087, + "epoch": 0.3695652173913043, + "flos": 580900534272.0, + "grad_norm": 0.03055527362799568, + "language_loss": 0.83712995, + "learning_rate": 0.0007269119700353073, + "loss": 0.84760189, + "num_input_tokens_seen": 159486128, + "router_z_loss_mlp": 0.44360352, + "step": 1921, + "time_per_iteration": 2.808595895767212 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049468, + "balance_loss_mlp": 1.00519335, + "epoch": 0.3697575990765679, + "flos": 514059997440.0, + "grad_norm": 0.029192022992987326, + "language_loss": 0.85655916, + "learning_rate": 0.0007266343133416571, + "loss": 0.86705387, + "num_input_tokens_seen": 159562224, + "router_z_loss_mlp": 0.44360352, + "step": 1922, + "time_per_iteration": 2.7229409217834473 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045757, + "balance_loss_mlp": 1.00255585, + "epoch": 0.3699499807618315, + "flos": 1573906430976.0, + "grad_norm": 0.004633598174219594, + "language_loss": 0.77116919, + "learning_rate": 0.0007263565686607632, + "loss": 0.7816267, + "num_input_tokens_seen": 159784768, + "router_z_loss_mlp": 0.43261719, + "step": 1923, + "time_per_iteration": 4.855220556259155 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049661, + "balance_loss_mlp": 1.00526702, + "epoch": 0.37014236244709503, + "flos": 498325095936.0, + "grad_norm": 0.04063724538866958, + "language_loss": 0.84789312, + "learning_rate": 0.0007260787361004556, + "loss": 0.85838968, + "num_input_tokens_seen": 159848608, + "router_z_loss_mlp": 0.44482422, + "step": 1924, + "time_per_iteration": 2.5634405612945557 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063278, + "balance_loss_mlp": 1.01998138, + "epoch": 0.3703347441323586, + "flos": 1447608233472.0, + "grad_norm": 0.011285785538321925, + "language_loss": 0.73761505, + "learning_rate": 0.0007258008157685987, + "loss": 0.7482478, + "num_input_tokens_seen": 160080928, + "router_z_loss_mlp": 0.43359375, + "step": 1925, + "time_per_iteration": 4.881471157073975 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050604, + "balance_loss_mlp": 1.00601971, + "epoch": 0.37052712581762215, + "flos": 564714589440.0, + "grad_norm": 0.030700116077417884, + "language_loss": 0.87676865, + "learning_rate": 0.0007255228077730903, + "loss": 0.88727468, + "num_input_tokens_seen": 160148976, + "router_z_loss_mlp": 0.44628906, + "step": 1926, + "time_per_iteration": 2.6604056358337402 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048583, + "balance_loss_mlp": 1.00426066, + "epoch": 0.37071950750288574, + "flos": 927571958016.0, + "grad_norm": 0.030848240929213684, + "language_loss": 0.82266426, + "learning_rate": 0.0007252447122218632, + "loss": 0.83315009, + "num_input_tokens_seen": 160233504, + "router_z_loss_mlp": 0.4440918, + "step": 1927, + "time_per_iteration": 3.189232110977173 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048396, + "balance_loss_mlp": 1.00424135, + "epoch": 0.37091188918814927, + "flos": 419201478912.0, + "grad_norm": 0.038028798643346066, + "language_loss": 0.88517463, + "learning_rate": 0.0007249665292228834, + "loss": 0.89565861, + "num_input_tokens_seen": 160299696, + "router_z_loss_mlp": 0.44238281, + "step": 1928, + "time_per_iteration": 2.6051783561706543 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048517, + "balance_loss_mlp": 1.00443351, + "epoch": 0.37110427087341286, + "flos": 464147099136.0, + "grad_norm": 0.03246756835091633, + "language_loss": 0.8426615, + "learning_rate": 0.000724688258884151, + "loss": 0.85314661, + "num_input_tokens_seen": 160367904, + "router_z_loss_mlp": 0.44165039, + "step": 1929, + "time_per_iteration": 2.5537402629852295 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105112, + "balance_loss_mlp": 1.00703681, + "epoch": 0.3712966525586764, + "flos": 851081679360.0, + "grad_norm": 0.026814038228573516, + "language_loss": 0.86998665, + "learning_rate": 0.0007244099013137002, + "loss": 0.88049793, + "num_input_tokens_seen": 160453600, + "router_z_loss_mlp": 0.44165039, + "step": 1930, + "time_per_iteration": 3.091195821762085 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052812, + "balance_loss_mlp": 1.00901484, + "epoch": 0.37148903424394, + "flos": 927559319040.0, + "grad_norm": 0.03484228463474462, + "language_loss": 0.89224607, + "learning_rate": 0.0007241314566195993, + "loss": 0.90277416, + "num_input_tokens_seen": 160543472, + "router_z_loss_mlp": 0.4387207, + "step": 1931, + "time_per_iteration": 3.2276151180267334 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050081, + "balance_loss_mlp": 1.00616395, + "epoch": 0.37168141592920356, + "flos": 520821722112.0, + "grad_norm": 0.033577876196724185, + "language_loss": 0.86394525, + "learning_rate": 0.0007238529249099496, + "loss": 0.87444603, + "num_input_tokens_seen": 160614016, + "router_z_loss_mlp": 0.43994141, + "step": 1932, + "time_per_iteration": 2.6099538803100586 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043194, + "balance_loss_mlp": 1.00075531, + "epoch": 0.3718737976144671, + "flos": 1449062452224.0, + "grad_norm": 0.005805601038449312, + "language_loss": 0.77856874, + "learning_rate": 0.0007235743062928872, + "loss": 0.78900075, + "num_input_tokens_seen": 160828640, + "router_z_loss_mlp": 0.42480469, + "step": 1933, + "time_per_iteration": 4.864013910293579 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051383, + "balance_loss_mlp": 1.00741839, + "epoch": 0.3720661792997307, + "flos": 760954223616.0, + "grad_norm": 0.031651541573232696, + "language_loss": 0.81381935, + "learning_rate": 0.000723295600876581, + "loss": 0.82433319, + "num_input_tokens_seen": 160913088, + "router_z_loss_mlp": 0.44042969, + "step": 1934, + "time_per_iteration": 3.003988742828369 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047402, + "balance_loss_mlp": 1.00353265, + "epoch": 0.3722585609849942, + "flos": 518045487360.0, + "grad_norm": 0.031160015664157277, + "language_loss": 0.88386387, + "learning_rate": 0.0007230168087692344, + "loss": 0.89433783, + "num_input_tokens_seen": 160982960, + "router_z_loss_mlp": 0.43945312, + "step": 1935, + "time_per_iteration": 2.6490824222564697 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045452, + "balance_loss_mlp": 1.00165451, + "epoch": 0.3724509426702578, + "flos": 783869812224.0, + "grad_norm": 0.03743087194604022, + "language_loss": 0.82867873, + "learning_rate": 0.0007227379300790839, + "loss": 0.83913326, + "num_input_tokens_seen": 161066000, + "router_z_loss_mlp": 0.4387207, + "step": 1936, + "time_per_iteration": 3.010700225830078 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044084, + "balance_loss_mlp": 1.00011992, + "epoch": 0.37264332435552133, + "flos": 392599799040.0, + "grad_norm": 0.032423549870759565, + "language_loss": 0.86443603, + "learning_rate": 0.0007224589649143997, + "loss": 0.87487686, + "num_input_tokens_seen": 161131040, + "router_z_loss_mlp": 0.44042969, + "step": 1937, + "time_per_iteration": 2.54010272026062 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044521, + "balance_loss_mlp": 1.00072384, + "epoch": 0.3728357060407849, + "flos": 543913254912.0, + "grad_norm": 0.03387233199209411, + "language_loss": 0.81436574, + "learning_rate": 0.0007221799133834861, + "loss": 0.82481098, + "num_input_tokens_seen": 161201248, + "router_z_loss_mlp": 0.4387207, + "step": 1938, + "time_per_iteration": 2.6355655193328857 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045317, + "balance_loss_mlp": 1.00154293, + "epoch": 0.3730280877260485, + "flos": 434484370176.0, + "grad_norm": 0.03416430777388856, + "language_loss": 0.82122993, + "learning_rate": 0.00072190077559468, + "loss": 0.83168304, + "num_input_tokens_seen": 161266288, + "router_z_loss_mlp": 0.43847656, + "step": 1939, + "time_per_iteration": 2.5033867359161377 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049643, + "balance_loss_mlp": 1.00579786, + "epoch": 0.37322046941131204, + "flos": 532511841024.0, + "grad_norm": 0.031902006564455146, + "language_loss": 0.89473069, + "learning_rate": 0.0007216215516563527, + "loss": 0.90522707, + "num_input_tokens_seen": 161335648, + "router_z_loss_mlp": 0.43920898, + "step": 1940, + "time_per_iteration": 2.685201406478882 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049207, + "balance_loss_mlp": 1.00538588, + "epoch": 0.3734128510965756, + "flos": 532576969728.0, + "grad_norm": 0.03682978505173481, + "language_loss": 0.83770883, + "learning_rate": 0.0007213422416769083, + "loss": 0.84820092, + "num_input_tokens_seen": 161403440, + "router_z_loss_mlp": 0.43896484, + "step": 1941, + "time_per_iteration": 2.5981826782226562 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104851, + "balance_loss_mlp": 1.00454593, + "epoch": 0.37360523278183916, + "flos": 501433777152.0, + "grad_norm": 0.029644951468961563, + "language_loss": 0.75750655, + "learning_rate": 0.0007210628457647849, + "loss": 0.76799166, + "num_input_tokens_seen": 161472864, + "router_z_loss_mlp": 0.44042969, + "step": 1942, + "time_per_iteration": 2.5780391693115234 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047618, + "balance_loss_mlp": 1.00365365, + "epoch": 0.37379761446710275, + "flos": 549112857600.0, + "grad_norm": 0.03283775645447924, + "language_loss": 0.79155779, + "learning_rate": 0.000720783364028453, + "loss": 0.80203396, + "num_input_tokens_seen": 161548096, + "router_z_loss_mlp": 0.44042969, + "step": 1943, + "time_per_iteration": 2.7498555183410645 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052612, + "balance_loss_mlp": 1.0085758, + "epoch": 0.3739899961523663, + "flos": 476740271616.0, + "grad_norm": 0.03229344723146533, + "language_loss": 0.88345349, + "learning_rate": 0.0007205037965764177, + "loss": 0.89397967, + "num_input_tokens_seen": 161615600, + "router_z_loss_mlp": 0.44116211, + "step": 1944, + "time_per_iteration": 2.559565305709839 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049461, + "balance_loss_mlp": 1.00533009, + "epoch": 0.37418237783762986, + "flos": 613077037824.0, + "grad_norm": 0.033726561022773015, + "language_loss": 0.85856438, + "learning_rate": 0.0007202241435172161, + "loss": 0.86905897, + "num_input_tokens_seen": 161687408, + "router_z_loss_mlp": 0.44213867, + "step": 1945, + "time_per_iteration": 2.7495012283325195 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105036, + "balance_loss_mlp": 1.00618136, + "epoch": 0.3743747595228934, + "flos": 767629432320.0, + "grad_norm": 0.030482282234963888, + "language_loss": 0.88839138, + "learning_rate": 0.0007199444049594198, + "loss": 0.89889503, + "num_input_tokens_seen": 161764224, + "router_z_loss_mlp": 0.44262695, + "step": 1946, + "time_per_iteration": 2.927438259124756 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105097, + "balance_loss_mlp": 1.00679135, + "epoch": 0.374567141208157, + "flos": 525491546880.0, + "grad_norm": 0.03274984488565387, + "language_loss": 0.84098482, + "learning_rate": 0.0007196645810116322, + "loss": 0.85149455, + "num_input_tokens_seen": 161835520, + "router_z_loss_mlp": 0.44262695, + "step": 1947, + "time_per_iteration": 2.669954538345337 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051405, + "balance_loss_mlp": 1.00717854, + "epoch": 0.37475952289342057, + "flos": 682614096384.0, + "grad_norm": 0.03500222096290466, + "language_loss": 0.84308642, + "learning_rate": 0.0007193846717824912, + "loss": 0.85360044, + "num_input_tokens_seen": 161912000, + "router_z_loss_mlp": 0.44311523, + "step": 1948, + "time_per_iteration": 2.873595714569092 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054386, + "balance_loss_mlp": 1.01018322, + "epoch": 0.3749519045786841, + "flos": 461216307456.0, + "grad_norm": 0.03758393676626501, + "language_loss": 0.89286113, + "learning_rate": 0.0007191046773806669, + "loss": 0.90340507, + "num_input_tokens_seen": 161977296, + "router_z_loss_mlp": 0.44287109, + "step": 1949, + "time_per_iteration": 2.5632805824279785 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052491, + "balance_loss_mlp": 1.00816894, + "epoch": 0.3751442862639477, + "flos": 956388013824.0, + "grad_norm": 0.04355990755149793, + "language_loss": 0.83803475, + "learning_rate": 0.0007188245979148631, + "loss": 0.84855968, + "num_input_tokens_seen": 162051888, + "router_z_loss_mlp": 0.4440918, + "step": 1950, + "time_per_iteration": 3.153048515319824 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050564, + "balance_loss_mlp": 1.00619411, + "epoch": 0.3753366679492112, + "flos": 528806307840.0, + "grad_norm": 0.034134677221205334, + "language_loss": 0.88437903, + "learning_rate": 0.0007185444334938157, + "loss": 0.89488459, + "num_input_tokens_seen": 162124384, + "router_z_loss_mlp": 0.44458008, + "step": 1951, + "time_per_iteration": 2.77795147895813 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052982, + "balance_loss_mlp": 1.0084213, + "epoch": 0.3755290496344748, + "flos": 522849460224.0, + "grad_norm": 0.03641649118573359, + "language_loss": 0.85489821, + "learning_rate": 0.0007182641842262947, + "loss": 0.86542803, + "num_input_tokens_seen": 162191440, + "router_z_loss_mlp": 0.44628906, + "step": 1952, + "time_per_iteration": 2.6038033962249756 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063178, + "balance_loss_mlp": 1.01852179, + "epoch": 0.37572143131973834, + "flos": 622372945920.0, + "grad_norm": 0.036303705105214745, + "language_loss": 0.78406018, + "learning_rate": 0.0007179838502211022, + "loss": 0.79469192, + "num_input_tokens_seen": 162268480, + "router_z_loss_mlp": 0.44702148, + "step": 1953, + "time_per_iteration": 2.8537991046905518 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050235, + "balance_loss_mlp": 1.00565112, + "epoch": 0.37591381300500193, + "flos": 772274957568.0, + "grad_norm": 0.033405608161133214, + "language_loss": 0.87193865, + "learning_rate": 0.0007177034315870738, + "loss": 0.88244104, + "num_input_tokens_seen": 162346752, + "router_z_loss_mlp": 0.44677734, + "step": 1954, + "time_per_iteration": 2.9944725036621094 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049324, + "balance_loss_mlp": 1.00469208, + "epoch": 0.37610619469026546, + "flos": 521481757440.0, + "grad_norm": 0.05036646851246907, + "language_loss": 0.91552407, + "learning_rate": 0.0007174229284330773, + "loss": 0.92601728, + "num_input_tokens_seen": 162415120, + "router_z_loss_mlp": 0.44702148, + "step": 1955, + "time_per_iteration": 2.607128143310547 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046853, + "balance_loss_mlp": 1.0023644, + "epoch": 0.37629857637552905, + "flos": 599971584000.0, + "grad_norm": 0.029911324472659546, + "language_loss": 0.87468076, + "learning_rate": 0.0007171423408680141, + "loss": 0.88514924, + "num_input_tokens_seen": 162493280, + "router_z_loss_mlp": 0.44555664, + "step": 1956, + "time_per_iteration": 2.8234241008758545 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047093, + "balance_loss_mlp": 1.00272334, + "epoch": 0.37649095806079264, + "flos": 566019108864.0, + "grad_norm": 0.03303955535560464, + "language_loss": 0.90624022, + "learning_rate": 0.0007168616690008176, + "loss": 0.91671115, + "num_input_tokens_seen": 162560736, + "router_z_loss_mlp": 0.44458008, + "step": 1957, + "time_per_iteration": 2.645219326019287 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047066, + "balance_loss_mlp": 1.00271976, + "epoch": 0.37668333974605617, + "flos": 593569529088.0, + "grad_norm": 0.03512927569377508, + "language_loss": 0.86650079, + "learning_rate": 0.0007165809129404545, + "loss": 0.87697142, + "num_input_tokens_seen": 162630688, + "router_z_loss_mlp": 0.44433594, + "step": 1958, + "time_per_iteration": 2.762319564819336 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105165, + "balance_loss_mlp": 1.00742376, + "epoch": 0.37687572143131975, + "flos": 420365047296.0, + "grad_norm": 0.03381206580119959, + "language_loss": 0.8673501, + "learning_rate": 0.0007163000727959239, + "loss": 0.87786663, + "num_input_tokens_seen": 162694304, + "router_z_loss_mlp": 0.44311523, + "step": 1959, + "time_per_iteration": 2.4887454509735107 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047108, + "balance_loss_mlp": 1.00466919, + "epoch": 0.3770681031165833, + "flos": 1360387269888.0, + "grad_norm": 0.007286715675134549, + "language_loss": 0.77959073, + "learning_rate": 0.0007160191486762575, + "loss": 0.79006183, + "num_input_tokens_seen": 162920336, + "router_z_loss_mlp": 0.42480469, + "step": 1960, + "time_per_iteration": 4.844388961791992 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053466, + "balance_loss_mlp": 1.00938201, + "epoch": 0.3772604848018469, + "flos": 646154649600.0, + "grad_norm": 0.030030705089392724, + "language_loss": 0.85244703, + "learning_rate": 0.00071573814069052, + "loss": 0.86298174, + "num_input_tokens_seen": 163000720, + "router_z_loss_mlp": 0.44165039, + "step": 1961, + "time_per_iteration": 2.93870210647583 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043853, + "balance_loss_mlp": 0.99976981, + "epoch": 0.3774528664871104, + "flos": 903202150656.0, + "grad_norm": 0.029467737659617427, + "language_loss": 0.88618672, + "learning_rate": 0.0007154570489478081, + "loss": 0.89662528, + "num_input_tokens_seen": 163085680, + "router_z_loss_mlp": 0.44165039, + "step": 1962, + "time_per_iteration": 3.2101829051971436 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046572, + "balance_loss_mlp": 1.00241697, + "epoch": 0.377645248172374, + "flos": 789464077824.0, + "grad_norm": 0.02894999631439154, + "language_loss": 0.87102842, + "learning_rate": 0.0007151758735572514, + "loss": 0.88149416, + "num_input_tokens_seen": 163162224, + "router_z_loss_mlp": 0.44238281, + "step": 1963, + "time_per_iteration": 3.0217864513397217 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046842, + "balance_loss_mlp": 1.00282979, + "epoch": 0.3778376298576376, + "flos": 587925686016.0, + "grad_norm": 0.035422959183698866, + "language_loss": 0.81287247, + "learning_rate": 0.0007148946146280119, + "loss": 0.82334089, + "num_input_tokens_seen": 163237920, + "router_z_loss_mlp": 0.44091797, + "step": 1964, + "time_per_iteration": 2.9066553115844727 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01056515, + "balance_loss_mlp": 1.01407623, + "epoch": 0.3780300115429011, + "flos": 1399672528896.0, + "grad_norm": 0.012885740561533653, + "language_loss": 0.72192144, + "learning_rate": 0.000714613272269284, + "loss": 0.73248661, + "num_input_tokens_seen": 163455760, + "router_z_loss_mlp": 0.42480469, + "step": 1965, + "time_per_iteration": 4.874085426330566 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055702, + "balance_loss_mlp": 1.01316833, + "epoch": 0.3782223932281647, + "flos": 1360634178816.0, + "grad_norm": 0.008484298942656315, + "language_loss": 0.75341946, + "learning_rate": 0.0007143318465902943, + "loss": 0.76397645, + "num_input_tokens_seen": 163678064, + "router_z_loss_mlp": 0.42578125, + "step": 1966, + "time_per_iteration": 4.964066743850708 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048666, + "balance_loss_mlp": 1.00467777, + "epoch": 0.37841477491342823, + "flos": 705517046016.0, + "grad_norm": 0.02737284959483133, + "language_loss": 0.8436377, + "learning_rate": 0.0007140503377003022, + "loss": 0.85412437, + "num_input_tokens_seen": 163764320, + "router_z_loss_mlp": 0.44067383, + "step": 1967, + "time_per_iteration": 3.014033555984497 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105175, + "balance_loss_mlp": 1.00764298, + "epoch": 0.3786071565986918, + "flos": 530156514048.0, + "grad_norm": 0.03014770490429956, + "language_loss": 0.85294402, + "learning_rate": 0.000713768745708599, + "loss": 0.86346149, + "num_input_tokens_seen": 163831808, + "router_z_loss_mlp": 0.44189453, + "step": 1968, + "time_per_iteration": 2.6359875202178955 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052555, + "balance_loss_mlp": 1.0084002, + "epoch": 0.37879953828395535, + "flos": 994901443584.0, + "grad_norm": 0.03323886334735767, + "language_loss": 0.78270096, + "learning_rate": 0.0007134870707245085, + "loss": 0.79322648, + "num_input_tokens_seen": 163918128, + "router_z_loss_mlp": 0.44238281, + "step": 1969, + "time_per_iteration": 3.276670455932617 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054281, + "balance_loss_mlp": 1.01010239, + "epoch": 0.37899191996921894, + "flos": 627793212672.0, + "grad_norm": 0.033324026165203316, + "language_loss": 0.84867144, + "learning_rate": 0.0007132053128573864, + "loss": 0.85921425, + "num_input_tokens_seen": 163987552, + "router_z_loss_mlp": 0.44262695, + "step": 1970, + "time_per_iteration": 2.747647523880005 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051473, + "balance_loss_mlp": 1.00727034, + "epoch": 0.37918430165448247, + "flos": 687520136448.0, + "grad_norm": 0.034311044198206936, + "language_loss": 0.84702653, + "learning_rate": 0.0007129234722166211, + "loss": 0.85754126, + "num_input_tokens_seen": 164063248, + "router_z_loss_mlp": 0.44287109, + "step": 1971, + "time_per_iteration": 2.8502755165100098 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104953, + "balance_loss_mlp": 1.00535131, + "epoch": 0.37937668333974606, + "flos": 476618762496.0, + "grad_norm": 0.028798969169212138, + "language_loss": 0.91637433, + "learning_rate": 0.0007126415489116328, + "loss": 0.92686969, + "num_input_tokens_seen": 164133776, + "router_z_loss_mlp": 0.44262695, + "step": 1972, + "time_per_iteration": 2.703598737716675 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049775, + "balance_loss_mlp": 1.00559556, + "epoch": 0.37956906502500964, + "flos": 708825004032.0, + "grad_norm": 0.033945121596029554, + "language_loss": 0.81780016, + "learning_rate": 0.0007123595430518736, + "loss": 0.82829797, + "num_input_tokens_seen": 164206672, + "router_z_loss_mlp": 0.44262695, + "step": 1973, + "time_per_iteration": 2.859210252761841 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047681, + "balance_loss_mlp": 1.00345445, + "epoch": 0.3797614467102732, + "flos": 427559340288.0, + "grad_norm": 0.03504063937858188, + "language_loss": 0.86830699, + "learning_rate": 0.0007120774547468282, + "loss": 0.87878382, + "num_input_tokens_seen": 164271968, + "router_z_loss_mlp": 0.44311523, + "step": 1974, + "time_per_iteration": 2.5465054512023926 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105377, + "balance_loss_mlp": 1.00944817, + "epoch": 0.37995382839553676, + "flos": 482881811712.0, + "grad_norm": 0.031503790568027705, + "language_loss": 0.82317638, + "learning_rate": 0.0007117952841060128, + "loss": 0.83371413, + "num_input_tokens_seen": 164342800, + "router_z_loss_mlp": 0.4440918, + "step": 1975, + "time_per_iteration": 2.789965867996216 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053612, + "balance_loss_mlp": 1.00924242, + "epoch": 0.3801462100808003, + "flos": 561671036928.0, + "grad_norm": 0.03572346778222672, + "language_loss": 0.84539783, + "learning_rate": 0.0007115130312389756, + "loss": 0.85593396, + "num_input_tokens_seen": 164414928, + "router_z_loss_mlp": 0.44433594, + "step": 1976, + "time_per_iteration": 2.7104804515838623 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046782, + "balance_loss_mlp": 1.00236499, + "epoch": 0.3803385917660639, + "flos": 465888077568.0, + "grad_norm": 0.03508123942848817, + "language_loss": 0.80071044, + "learning_rate": 0.0007112306962552973, + "loss": 0.81117821, + "num_input_tokens_seen": 164483312, + "router_z_loss_mlp": 0.44506836, + "step": 1977, + "time_per_iteration": 2.644700527191162 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053171, + "balance_loss_mlp": 1.00863445, + "epoch": 0.3805309734513274, + "flos": 522905840640.0, + "grad_norm": 0.0297417361696937, + "language_loss": 0.8625899, + "learning_rate": 0.0007109482792645896, + "loss": 0.87312162, + "num_input_tokens_seen": 164555760, + "router_z_loss_mlp": 0.44580078, + "step": 1978, + "time_per_iteration": 2.736924171447754 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052388, + "balance_loss_mlp": 1.00780404, + "epoch": 0.380723355136591, + "flos": 592553714688.0, + "grad_norm": 0.03207088172149068, + "language_loss": 0.84620887, + "learning_rate": 0.0007106657803764969, + "loss": 0.85673285, + "num_input_tokens_seen": 164626768, + "router_z_loss_mlp": 0.44628906, + "step": 1979, + "time_per_iteration": 2.797027111053467 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053174, + "balance_loss_mlp": 1.00851822, + "epoch": 0.38091573682185453, + "flos": 623855354880.0, + "grad_norm": 0.034228405400289826, + "language_loss": 0.82734859, + "learning_rate": 0.0007103831997006948, + "loss": 0.83788031, + "num_input_tokens_seen": 164698016, + "router_z_loss_mlp": 0.4465332, + "step": 1980, + "time_per_iteration": 2.774831771850586 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050596, + "balance_loss_mlp": 1.00601208, + "epoch": 0.3811081185071181, + "flos": 570176652288.0, + "grad_norm": 0.02916230611543443, + "language_loss": 0.85986841, + "learning_rate": 0.0007101005373468908, + "loss": 0.87037432, + "num_input_tokens_seen": 164780320, + "router_z_loss_mlp": 0.4465332, + "step": 1981, + "time_per_iteration": 2.889430284500122 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051202, + "balance_loss_mlp": 1.00647449, + "epoch": 0.3813005001923817, + "flos": 585991266816.0, + "grad_norm": 0.029260882769569122, + "language_loss": 0.87282979, + "learning_rate": 0.0007098177934248242, + "loss": 0.88334191, + "num_input_tokens_seen": 164854400, + "router_z_loss_mlp": 0.44726562, + "step": 1982, + "time_per_iteration": 2.734011173248291 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049702, + "balance_loss_mlp": 1.00509369, + "epoch": 0.38149288187764524, + "flos": 622811350272.0, + "grad_norm": 0.03279838714755621, + "language_loss": 0.86164075, + "learning_rate": 0.0007095349680442661, + "loss": 0.87213778, + "num_input_tokens_seen": 164932896, + "router_z_loss_mlp": 0.44677734, + "step": 1983, + "time_per_iteration": 2.8532214164733887 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049432, + "balance_loss_mlp": 1.00496709, + "epoch": 0.3816852635629088, + "flos": 571798066944.0, + "grad_norm": 0.03407469020321441, + "language_loss": 0.79342288, + "learning_rate": 0.0007092520613150188, + "loss": 0.80391723, + "num_input_tokens_seen": 165002896, + "router_z_loss_mlp": 0.4453125, + "step": 1984, + "time_per_iteration": 2.6656527519226074 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055001, + "balance_loss_mlp": 1.01058352, + "epoch": 0.38187764524817236, + "flos": 566679144192.0, + "grad_norm": 0.03287674379309895, + "language_loss": 0.81891948, + "learning_rate": 0.0007089690733469165, + "loss": 0.82946956, + "num_input_tokens_seen": 165074704, + "router_z_loss_mlp": 0.44506836, + "step": 1985, + "time_per_iteration": 2.6921868324279785 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104986, + "balance_loss_mlp": 1.00544298, + "epoch": 0.38207002693343595, + "flos": 632399854080.0, + "grad_norm": 0.03591516825864857, + "language_loss": 0.8265506, + "learning_rate": 0.000708686004249825, + "loss": 0.83704919, + "num_input_tokens_seen": 165149136, + "router_z_loss_mlp": 0.44506836, + "step": 1986, + "time_per_iteration": 2.771472454071045 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046713, + "balance_loss_mlp": 1.0026772, + "epoch": 0.3822624086186995, + "flos": 549841912320.0, + "grad_norm": 0.027805852633017242, + "language_loss": 0.91746366, + "learning_rate": 0.0007084028541336413, + "loss": 0.92793083, + "num_input_tokens_seen": 165220864, + "router_z_loss_mlp": 0.44116211, + "step": 1987, + "time_per_iteration": 2.7168381214141846 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049881, + "balance_loss_mlp": 1.00572634, + "epoch": 0.38245479030396307, + "flos": 615067837440.0, + "grad_norm": 0.03052630202850825, + "language_loss": 0.86906445, + "learning_rate": 0.0007081196231082942, + "loss": 0.87956333, + "num_input_tokens_seen": 165301568, + "router_z_loss_mlp": 0.44238281, + "step": 1988, + "time_per_iteration": 2.8021280765533447 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104727, + "balance_loss_mlp": 1.00325835, + "epoch": 0.38264717198922665, + "flos": 669304508160.0, + "grad_norm": 0.03253134732635267, + "language_loss": 0.8090933, + "learning_rate": 0.0007078363112837436, + "loss": 0.81956601, + "num_input_tokens_seen": 165373152, + "router_z_loss_mlp": 0.44091797, + "step": 1989, + "time_per_iteration": 2.812901020050049 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046216, + "balance_loss_mlp": 1.00232375, + "epoch": 0.3828395536744902, + "flos": 455687170560.0, + "grad_norm": 0.03353740504071411, + "language_loss": 0.8610149, + "learning_rate": 0.000707552918769981, + "loss": 0.87147707, + "num_input_tokens_seen": 165439136, + "router_z_loss_mlp": 0.43969727, + "step": 1990, + "time_per_iteration": 2.503817081451416 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047528, + "balance_loss_mlp": 1.0038017, + "epoch": 0.3830319353597538, + "flos": 500483091456.0, + "grad_norm": 0.030831133245435974, + "language_loss": 0.84298265, + "learning_rate": 0.000707269445677029, + "loss": 0.85345787, + "num_input_tokens_seen": 165514624, + "router_z_loss_mlp": 0.43798828, + "step": 1991, + "time_per_iteration": 2.77250599861145 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047533, + "balance_loss_mlp": 1.00373507, + "epoch": 0.3832243170450173, + "flos": 745467197952.0, + "grad_norm": 0.03142895241328533, + "language_loss": 0.85860848, + "learning_rate": 0.0007069858921149416, + "loss": 0.86908376, + "num_input_tokens_seen": 165594512, + "router_z_loss_mlp": 0.4387207, + "step": 1992, + "time_per_iteration": 3.001058578491211 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047239, + "balance_loss_mlp": 1.00363255, + "epoch": 0.3834166987302809, + "flos": 579346193664.0, + "grad_norm": 0.027707623231004064, + "language_loss": 0.86360574, + "learning_rate": 0.0007067022581938043, + "loss": 0.87407815, + "num_input_tokens_seen": 165673968, + "router_z_loss_mlp": 0.43676758, + "step": 1993, + "time_per_iteration": 2.896017551422119 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049302, + "balance_loss_mlp": 1.00579047, + "epoch": 0.3836090804155444, + "flos": 537609376512.0, + "grad_norm": 0.038344647976828676, + "language_loss": 0.83944476, + "learning_rate": 0.0007064185440237334, + "loss": 0.8499378, + "num_input_tokens_seen": 165747664, + "router_z_loss_mlp": 0.43579102, + "step": 1994, + "time_per_iteration": 2.8133461475372314 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051736, + "balance_loss_mlp": 1.00820076, + "epoch": 0.383801462100808, + "flos": 603052075008.0, + "grad_norm": 0.0304270283066245, + "language_loss": 0.85033917, + "learning_rate": 0.0007061347497148764, + "loss": 0.86085653, + "num_input_tokens_seen": 165824624, + "router_z_loss_mlp": 0.43603516, + "step": 1995, + "time_per_iteration": 2.829977035522461 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050646, + "balance_loss_mlp": 1.00694358, + "epoch": 0.38399384378607154, + "flos": 573799560192.0, + "grad_norm": 0.034646706108572276, + "language_loss": 0.86866224, + "learning_rate": 0.0007058508753774122, + "loss": 0.87916863, + "num_input_tokens_seen": 165896304, + "router_z_loss_mlp": 0.43774414, + "step": 1996, + "time_per_iteration": 2.684966564178467 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049901, + "balance_loss_mlp": 1.00629473, + "epoch": 0.38418622547133513, + "flos": 537780463104.0, + "grad_norm": 0.03333459391135046, + "language_loss": 0.87270373, + "learning_rate": 0.0007055669211215505, + "loss": 0.88320273, + "num_input_tokens_seen": 165961312, + "router_z_loss_mlp": 0.43676758, + "step": 1997, + "time_per_iteration": 2.623508930206299 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054497, + "balance_loss_mlp": 1.01079535, + "epoch": 0.3843786071565987, + "flos": 574014388224.0, + "grad_norm": 0.04127067736406929, + "language_loss": 0.78599155, + "learning_rate": 0.0007052828870575322, + "loss": 0.79653656, + "num_input_tokens_seen": 166028064, + "router_z_loss_mlp": 0.43774414, + "step": 1998, + "time_per_iteration": 2.644423723220825 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051343, + "balance_loss_mlp": 1.00761676, + "epoch": 0.38457098884186225, + "flos": 730080294144.0, + "grad_norm": 0.03146347648703673, + "language_loss": 0.87266672, + "learning_rate": 0.0007049987732956291, + "loss": 0.88318008, + "num_input_tokens_seen": 166110272, + "router_z_loss_mlp": 0.43798828, + "step": 1999, + "time_per_iteration": 2.963409185409546 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048157, + "balance_loss_mlp": 1.00447905, + "epoch": 0.38476337052712584, + "flos": 584621618688.0, + "grad_norm": 0.024706606255084192, + "language_loss": 0.83278054, + "learning_rate": 0.0007047145799461439, + "loss": 0.84326208, + "num_input_tokens_seen": 166193088, + "router_z_loss_mlp": 0.4375, + "step": 2000, + "time_per_iteration": 2.86661434173584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048325, + "balance_loss_mlp": 1.00459874, + "epoch": 0.38495575221238937, + "flos": 554159848704.0, + "grad_norm": 0.03147773281119346, + "language_loss": 0.83074015, + "learning_rate": 0.00070443030711941, + "loss": 0.84122348, + "num_input_tokens_seen": 166271776, + "router_z_loss_mlp": 0.43798828, + "step": 2001, + "time_per_iteration": 2.778719425201416 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045384, + "balance_loss_mlp": 1.00175321, + "epoch": 0.38514813389765296, + "flos": 655678024704.0, + "grad_norm": 0.03168685191580143, + "language_loss": 0.82975376, + "learning_rate": 0.0007041459549257924, + "loss": 0.84020758, + "num_input_tokens_seen": 166350000, + "router_z_loss_mlp": 0.43701172, + "step": 2002, + "time_per_iteration": 2.8597054481506348 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046243, + "balance_loss_mlp": 1.00261223, + "epoch": 0.3853405155829165, + "flos": 869647250688.0, + "grad_norm": 0.03552713767777679, + "language_loss": 0.78954732, + "learning_rate": 0.0007038615234756859, + "loss": 0.80000973, + "num_input_tokens_seen": 166434336, + "router_z_loss_mlp": 0.43701172, + "step": 2003, + "time_per_iteration": 3.167647123336792 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050486, + "balance_loss_mlp": 1.00697505, + "epoch": 0.3855328972681801, + "flos": 547469088768.0, + "grad_norm": 0.03596547507231522, + "language_loss": 0.84374714, + "learning_rate": 0.000703577012879517, + "loss": 0.85425198, + "num_input_tokens_seen": 166503952, + "router_z_loss_mlp": 0.43579102, + "step": 2004, + "time_per_iteration": 2.644718885421753 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047488, + "balance_loss_mlp": 1.00397706, + "epoch": 0.3857252789534436, + "flos": 535099492608.0, + "grad_norm": 0.03525407945169758, + "language_loss": 0.89214581, + "learning_rate": 0.0007032924232477423, + "loss": 0.90262067, + "num_input_tokens_seen": 166575168, + "router_z_loss_mlp": 0.43579102, + "step": 2005, + "time_per_iteration": 2.6340301036834717 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053847, + "balance_loss_mlp": 1.01023984, + "epoch": 0.3859176606387072, + "flos": 492767768832.0, + "grad_norm": 0.0325086763316175, + "language_loss": 0.80829036, + "learning_rate": 0.0007030077546908493, + "loss": 0.81882888, + "num_input_tokens_seen": 166647552, + "router_z_loss_mlp": 0.43676758, + "step": 2006, + "time_per_iteration": 2.6427574157714844 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051659, + "balance_loss_mlp": 1.00969696, + "epoch": 0.3861100423239708, + "flos": 1490158675968.0, + "grad_norm": 0.006099468603868092, + "language_loss": 0.83064663, + "learning_rate": 0.0007027230073193561, + "loss": 0.84116316, + "num_input_tokens_seen": 166875088, + "router_z_loss_mlp": 0.41992188, + "step": 2007, + "time_per_iteration": 4.792185068130493 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047511, + "balance_loss_mlp": 1.00383234, + "epoch": 0.3863024240092343, + "flos": 474693091584.0, + "grad_norm": 0.0379943815396184, + "language_loss": 0.79703128, + "learning_rate": 0.0007024381812438117, + "loss": 0.80750644, + "num_input_tokens_seen": 166939344, + "router_z_loss_mlp": 0.4375, + "step": 2008, + "time_per_iteration": 2.6320388317108154 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01058901, + "balance_loss_mlp": 1.0153178, + "epoch": 0.3864948056944979, + "flos": 717979961088.0, + "grad_norm": 0.04179543058298576, + "language_loss": 0.84345418, + "learning_rate": 0.0007021532765747951, + "loss": 0.85404319, + "num_input_tokens_seen": 167014992, + "router_z_loss_mlp": 0.43652344, + "step": 2009, + "time_per_iteration": 3.0408942699432373 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01057193, + "balance_loss_mlp": 1.01370513, + "epoch": 0.38668718737976143, + "flos": 728955609600.0, + "grad_norm": 0.033678441310908816, + "language_loss": 0.80296206, + "learning_rate": 0.0007018682934229162, + "loss": 0.81353402, + "num_input_tokens_seen": 167092096, + "router_z_loss_mlp": 0.43554688, + "step": 2010, + "time_per_iteration": 2.9119958877563477 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053718, + "balance_loss_mlp": 1.01025474, + "epoch": 0.386879569065025, + "flos": 526489864704.0, + "grad_norm": 0.031759350944825356, + "language_loss": 0.83489478, + "learning_rate": 0.0007015832318988152, + "loss": 0.84543192, + "num_input_tokens_seen": 167162144, + "router_z_loss_mlp": 0.43530273, + "step": 2011, + "time_per_iteration": 2.625828981399536 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048202, + "balance_loss_mlp": 1.00643158, + "epoch": 0.38707195075028855, + "flos": 1530727067136.0, + "grad_norm": 0.008010138125144308, + "language_loss": 0.73890078, + "learning_rate": 0.000701298092113163, + "loss": 0.74938273, + "num_input_tokens_seen": 167391536, + "router_z_loss_mlp": 0.41796875, + "step": 2012, + "time_per_iteration": 4.969848155975342 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049038, + "balance_loss_mlp": 1.00555038, + "epoch": 0.38726433243555214, + "flos": 558386411520.0, + "grad_norm": 0.029387859415775444, + "language_loss": 0.84841448, + "learning_rate": 0.0007010128741766604, + "loss": 0.85890484, + "num_input_tokens_seen": 167466000, + "router_z_loss_mlp": 0.43554688, + "step": 2013, + "time_per_iteration": 2.808583974838257 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045643, + "balance_loss_mlp": 1.00205982, + "epoch": 0.38745671412081567, + "flos": 554756700672.0, + "grad_norm": 0.037665143906504196, + "language_loss": 0.84820414, + "learning_rate": 0.0007007275782000391, + "loss": 0.85866058, + "num_input_tokens_seen": 167536144, + "router_z_loss_mlp": 0.43652344, + "step": 2014, + "time_per_iteration": 2.6201975345611572 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051317, + "balance_loss_mlp": 1.00775766, + "epoch": 0.38764909580607926, + "flos": 459345071616.0, + "grad_norm": 0.03590133597746071, + "language_loss": 0.85486585, + "learning_rate": 0.0007004422042940605, + "loss": 0.86537898, + "num_input_tokens_seen": 167600064, + "router_z_loss_mlp": 0.4362793, + "step": 2015, + "time_per_iteration": 2.5167059898376465 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051464, + "balance_loss_mlp": 1.00792837, + "epoch": 0.38784147749134285, + "flos": 523259674368.0, + "grad_norm": 0.036833384765870066, + "language_loss": 0.90223992, + "learning_rate": 0.0007001567525695169, + "loss": 0.9127546, + "num_input_tokens_seen": 167666576, + "router_z_loss_mlp": 0.43603516, + "step": 2016, + "time_per_iteration": 2.663416624069214 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042557, + "balance_loss_mlp": 0.99923599, + "epoch": 0.3880338591766064, + "flos": 667401191424.0, + "grad_norm": 0.027528515382714943, + "language_loss": 0.84397906, + "learning_rate": 0.0006998712231372303, + "loss": 0.85440457, + "num_input_tokens_seen": 167753296, + "router_z_loss_mlp": 0.43383789, + "step": 2017, + "time_per_iteration": 2.982222080230713 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047147, + "balance_loss_mlp": 1.00389743, + "epoch": 0.38822624086186996, + "flos": 595176359424.0, + "grad_norm": 0.028816590459513517, + "language_loss": 0.86776507, + "learning_rate": 0.0006995856161080532, + "loss": 0.87823659, + "num_input_tokens_seen": 167834080, + "router_z_loss_mlp": 0.43310547, + "step": 2018, + "time_per_iteration": 2.8449933528900146 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046566, + "balance_loss_mlp": 1.00300694, + "epoch": 0.3884186225471335, + "flos": 613682638080.0, + "grad_norm": 0.032032500930829794, + "language_loss": 0.82425624, + "learning_rate": 0.0006992999315928679, + "loss": 0.83472192, + "num_input_tokens_seen": 167912368, + "router_z_loss_mlp": 0.4362793, + "step": 2019, + "time_per_iteration": 2.803743362426758 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104734, + "balance_loss_mlp": 1.00401926, + "epoch": 0.3886110042323971, + "flos": 608244874752.0, + "grad_norm": 0.027721707471257077, + "language_loss": 0.86241317, + "learning_rate": 0.0006990141697025871, + "loss": 0.87288654, + "num_input_tokens_seen": 167991968, + "router_z_loss_mlp": 0.43383789, + "step": 2020, + "time_per_iteration": 2.7804739475250244 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046585, + "balance_loss_mlp": 1.00481415, + "epoch": 0.3888033859176606, + "flos": 1531196573952.0, + "grad_norm": 0.004554603876592686, + "language_loss": 0.76359642, + "learning_rate": 0.0006987283305481533, + "loss": 0.77406228, + "num_input_tokens_seen": 168212128, + "router_z_loss_mlp": 0.41796875, + "step": 2021, + "time_per_iteration": 4.76949667930603 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104808, + "balance_loss_mlp": 1.00478315, + "epoch": 0.3889957676029242, + "flos": 693672370176.0, + "grad_norm": 0.038162906437672096, + "language_loss": 0.8292582, + "learning_rate": 0.0006984424142405392, + "loss": 0.83973902, + "num_input_tokens_seen": 168287440, + "router_z_loss_mlp": 0.43359375, + "step": 2022, + "time_per_iteration": 2.7983930110931396 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049484, + "balance_loss_mlp": 1.00599611, + "epoch": 0.3891881492881878, + "flos": 516195638784.0, + "grad_norm": 0.03974199995652067, + "language_loss": 0.82402384, + "learning_rate": 0.0006981564208907474, + "loss": 0.83451867, + "num_input_tokens_seen": 168354704, + "router_z_loss_mlp": 0.43554688, + "step": 2023, + "time_per_iteration": 2.613600730895996 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050586, + "balance_loss_mlp": 1.00707471, + "epoch": 0.3893805309734513, + "flos": 630176729856.0, + "grad_norm": 0.03303002735023947, + "language_loss": 0.90586042, + "learning_rate": 0.0006978703506098102, + "loss": 0.91636622, + "num_input_tokens_seen": 168424272, + "router_z_loss_mlp": 0.43579102, + "step": 2024, + "time_per_iteration": 2.7258403301239014 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050901, + "balance_loss_mlp": 1.00748503, + "epoch": 0.3895729126587149, + "flos": 545207080704.0, + "grad_norm": 0.0334033578711094, + "language_loss": 0.88520938, + "learning_rate": 0.00069758420350879, + "loss": 0.89571834, + "num_input_tokens_seen": 168488912, + "router_z_loss_mlp": 0.43481445, + "step": 2025, + "time_per_iteration": 2.6406970024108887 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047711, + "balance_loss_mlp": 1.00427127, + "epoch": 0.38976529434397844, + "flos": 619407161088.0, + "grad_norm": 0.03600656764113765, + "language_loss": 0.86979783, + "learning_rate": 0.000697297979698779, + "loss": 0.88027489, + "num_input_tokens_seen": 168563248, + "router_z_loss_mlp": 0.43505859, + "step": 2026, + "time_per_iteration": 2.729025363922119 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046507, + "balance_loss_mlp": 1.00297225, + "epoch": 0.38995767602924203, + "flos": 836346062592.0, + "grad_norm": 0.030634369701250594, + "language_loss": 0.84155977, + "learning_rate": 0.0006970116792908992, + "loss": 0.85202479, + "num_input_tokens_seen": 168648272, + "router_z_loss_mlp": 0.43603516, + "step": 2027, + "time_per_iteration": 3.0780837535858154 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054265, + "balance_loss_mlp": 1.01070547, + "epoch": 0.39015005771450556, + "flos": 542647619328.0, + "grad_norm": 0.03376343400122794, + "language_loss": 0.81809974, + "learning_rate": 0.000696725302396302, + "loss": 0.82864237, + "num_input_tokens_seen": 168721760, + "router_z_loss_mlp": 0.4362793, + "step": 2028, + "time_per_iteration": 2.6632442474365234 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046425, + "balance_loss_mlp": 1.00277102, + "epoch": 0.39034243939976915, + "flos": 1009142275584.0, + "grad_norm": 0.030316104633677343, + "language_loss": 0.86213875, + "learning_rate": 0.0006964388491261692, + "loss": 0.872603, + "num_input_tokens_seen": 168803664, + "router_z_loss_mlp": 0.43725586, + "step": 2029, + "time_per_iteration": 3.2410776615142822 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052185, + "balance_loss_mlp": 1.00848317, + "epoch": 0.3905348210850327, + "flos": 680241272832.0, + "grad_norm": 0.03528753395725821, + "language_loss": 0.88294208, + "learning_rate": 0.0006961523195917114, + "loss": 0.89346391, + "num_input_tokens_seen": 168879184, + "router_z_loss_mlp": 0.43774414, + "step": 2030, + "time_per_iteration": 2.8754475116729736 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104739, + "balance_loss_mlp": 1.00375915, + "epoch": 0.39072720277029627, + "flos": 549989666304.0, + "grad_norm": 0.032806843563698423, + "language_loss": 0.78588331, + "learning_rate": 0.0006958657139041696, + "loss": 0.79635721, + "num_input_tokens_seen": 168957808, + "router_z_loss_mlp": 0.43701172, + "step": 2031, + "time_per_iteration": 2.7329561710357666 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047604, + "balance_loss_mlp": 1.00554657, + "epoch": 0.39091958445555985, + "flos": 1551054025728.0, + "grad_norm": 0.008088132411436895, + "language_loss": 0.76712966, + "learning_rate": 0.0006955790321748136, + "loss": 0.77760577, + "num_input_tokens_seen": 169194416, + "router_z_loss_mlp": 0.42089844, + "step": 2032, + "time_per_iteration": 4.958296298980713 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048975, + "balance_loss_mlp": 1.00529635, + "epoch": 0.3911119661408234, + "flos": 505052794368.0, + "grad_norm": 0.03533188094946227, + "language_loss": 0.78901434, + "learning_rate": 0.0006952922745149434, + "loss": 0.7995041, + "num_input_tokens_seen": 169263552, + "router_z_loss_mlp": 0.4375, + "step": 2033, + "time_per_iteration": 2.6192519664764404 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050645, + "balance_loss_mlp": 1.00684798, + "epoch": 0.391304347826087, + "flos": 558330031104.0, + "grad_norm": 0.032114717040763616, + "language_loss": 0.88009661, + "learning_rate": 0.000695005441035888, + "loss": 0.89060307, + "num_input_tokens_seen": 169333696, + "router_z_loss_mlp": 0.4387207, + "step": 2034, + "time_per_iteration": 2.6519060134887695 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045589, + "balance_loss_mlp": 1.00334167, + "epoch": 0.3914967295113505, + "flos": 1502944322304.0, + "grad_norm": 0.004600085335304226, + "language_loss": 0.73723435, + "learning_rate": 0.0006947185318490064, + "loss": 0.7476902, + "num_input_tokens_seen": 169556416, + "router_z_loss_mlp": 0.42285156, + "step": 2035, + "time_per_iteration": 4.875830888748169 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049757, + "balance_loss_mlp": 1.00581694, + "epoch": 0.3916891111966141, + "flos": 708330219264.0, + "grad_norm": 0.02756997110289995, + "language_loss": 0.81809461, + "learning_rate": 0.0006944315470656863, + "loss": 0.82859218, + "num_input_tokens_seen": 169643312, + "router_z_loss_mlp": 0.44018555, + "step": 2036, + "time_per_iteration": 2.9486818313598633 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104941, + "balance_loss_mlp": 1.00537384, + "epoch": 0.3918814928818776, + "flos": 557409480960.0, + "grad_norm": 0.03430912315299504, + "language_loss": 0.91194409, + "learning_rate": 0.000694144486797345, + "loss": 0.92243814, + "num_input_tokens_seen": 169712560, + "router_z_loss_mlp": 0.44116211, + "step": 2037, + "time_per_iteration": 2.661637783050537 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053711, + "balance_loss_mlp": 1.01155853, + "epoch": 0.3920738745671412, + "flos": 1541688131328.0, + "grad_norm": 0.009695617032389551, + "language_loss": 0.79520434, + "learning_rate": 0.0006938573511554296, + "loss": 0.80574143, + "num_input_tokens_seen": 169914912, + "router_z_loss_mlp": 0.421875, + "step": 2038, + "time_per_iteration": 4.676162004470825 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050597, + "balance_loss_mlp": 1.00672829, + "epoch": 0.39226625625240474, + "flos": 499805559552.0, + "grad_norm": 0.03059706599431713, + "language_loss": 0.9011066, + "learning_rate": 0.0006935701402514156, + "loss": 0.91161263, + "num_input_tokens_seen": 169978848, + "router_z_loss_mlp": 0.43945312, + "step": 2039, + "time_per_iteration": 2.5921828746795654 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01040813, + "balance_loss_mlp": 0.99837494, + "epoch": 0.39245863793766833, + "flos": 1350453680640.0, + "grad_norm": 0.0024785612799689367, + "language_loss": 0.73034894, + "learning_rate": 0.0006932828541968083, + "loss": 0.74075705, + "num_input_tokens_seen": 170211488, + "router_z_loss_mlp": 0.42480469, + "step": 2040, + "time_per_iteration": 4.920953273773193 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045626, + "balance_loss_mlp": 1.00180471, + "epoch": 0.3926510196229319, + "flos": 1348115873280.0, + "grad_norm": 0.032003611488688986, + "language_loss": 0.84899294, + "learning_rate": 0.0006929954931031422, + "loss": 0.85944915, + "num_input_tokens_seen": 170298528, + "router_z_loss_mlp": 0.43896484, + "step": 2041, + "time_per_iteration": 3.7454288005828857 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045245, + "balance_loss_mlp": 1.00144792, + "epoch": 0.39284340130819545, + "flos": 500604600576.0, + "grad_norm": 0.027328608847006428, + "language_loss": 0.89267606, + "learning_rate": 0.0006927080570819805, + "loss": 0.9031285, + "num_input_tokens_seen": 170365680, + "router_z_loss_mlp": 0.4387207, + "step": 2042, + "time_per_iteration": 2.6191000938415527 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049281, + "balance_loss_mlp": 1.00565004, + "epoch": 0.39303578299345904, + "flos": 521342751744.0, + "grad_norm": 0.03887631720492337, + "language_loss": 0.81479704, + "learning_rate": 0.0006924205462449161, + "loss": 0.82528985, + "num_input_tokens_seen": 170432224, + "router_z_loss_mlp": 0.43701172, + "step": 2043, + "time_per_iteration": 2.6156415939331055 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048281, + "balance_loss_mlp": 1.00467432, + "epoch": 0.39322816467872257, + "flos": 909539076864.0, + "grad_norm": 0.03230930456366714, + "language_loss": 0.82451463, + "learning_rate": 0.0006921329607035702, + "loss": 0.83499742, + "num_input_tokens_seen": 170517920, + "router_z_loss_mlp": 0.43676758, + "step": 2044, + "time_per_iteration": 3.248239040374756 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050512, + "balance_loss_mlp": 1.0066911, + "epoch": 0.39342054636398616, + "flos": 518642339328.0, + "grad_norm": 0.028076885263619615, + "language_loss": 0.88591248, + "learning_rate": 0.0006918453005695938, + "loss": 0.89641762, + "num_input_tokens_seen": 170589072, + "router_z_loss_mlp": 0.43896484, + "step": 2045, + "time_per_iteration": 2.6417062282562256 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048241, + "balance_loss_mlp": 1.00430059, + "epoch": 0.3936129280492497, + "flos": 549012735744.0, + "grad_norm": 0.027900695924135757, + "language_loss": 0.84910023, + "learning_rate": 0.0006915575659546662, + "loss": 0.85958266, + "num_input_tokens_seen": 170657856, + "router_z_loss_mlp": 0.44018555, + "step": 2046, + "time_per_iteration": 2.6784913539886475 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053966, + "balance_loss_mlp": 1.0100733, + "epoch": 0.3938053097345133, + "flos": 527141151744.0, + "grad_norm": 0.03448231278490725, + "language_loss": 0.81310439, + "learning_rate": 0.0006912697569704959, + "loss": 0.82364404, + "num_input_tokens_seen": 170723696, + "router_z_loss_mlp": 0.43969727, + "step": 2047, + "time_per_iteration": 2.6214752197265625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050494, + "balance_loss_mlp": 1.00679207, + "epoch": 0.39399769141977686, + "flos": 472589531136.0, + "grad_norm": 0.03168334850546869, + "language_loss": 0.87124646, + "learning_rate": 0.0006909818737288205, + "loss": 0.88175148, + "num_input_tokens_seen": 170789536, + "router_z_loss_mlp": 0.43774414, + "step": 2048, + "time_per_iteration": 2.6057982444763184 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051405, + "balance_loss_mlp": 1.00775015, + "epoch": 0.3941900731050404, + "flos": 502727602944.0, + "grad_norm": 0.03501112209435681, + "language_loss": 0.81578481, + "learning_rate": 0.000690693916341406, + "loss": 0.82629883, + "num_input_tokens_seen": 170859232, + "router_z_loss_mlp": 0.43725586, + "step": 2049, + "time_per_iteration": 2.6459243297576904 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052667, + "balance_loss_mlp": 1.00910771, + "epoch": 0.394382454790304, + "flos": 582007722240.0, + "grad_norm": 0.03071224069667877, + "language_loss": 0.83009964, + "learning_rate": 0.0006904058849200475, + "loss": 0.8406263, + "num_input_tokens_seen": 170931568, + "router_z_loss_mlp": 0.4362793, + "step": 2050, + "time_per_iteration": 2.766828775405884 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046395, + "balance_loss_mlp": 1.00243104, + "epoch": 0.3945748364755675, + "flos": 514845432576.0, + "grad_norm": 0.030877215482718844, + "language_loss": 0.85563171, + "learning_rate": 0.0006901177795765683, + "loss": 0.86609566, + "num_input_tokens_seen": 170999856, + "router_z_loss_mlp": 0.44042969, + "step": 2051, + "time_per_iteration": 2.659912109375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051919, + "balance_loss_mlp": 1.00807357, + "epoch": 0.3947672181608311, + "flos": 595058740992.0, + "grad_norm": 0.03343854917241654, + "language_loss": 0.821091, + "learning_rate": 0.0006898296004228213, + "loss": 0.8316102, + "num_input_tokens_seen": 171072320, + "router_z_loss_mlp": 0.43920898, + "step": 2052, + "time_per_iteration": 2.7115862369537354 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046516, + "balance_loss_mlp": 1.00455475, + "epoch": 0.39495959984609463, + "flos": 1551052080384.0, + "grad_norm": 0.003971648916451202, + "language_loss": 0.7812674, + "learning_rate": 0.0006895413475706873, + "loss": 0.79173255, + "num_input_tokens_seen": 171304128, + "router_z_loss_mlp": 0.41992188, + "step": 2053, + "time_per_iteration": 4.894740343093872 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051907, + "balance_loss_mlp": 1.00818145, + "epoch": 0.3951519815313582, + "flos": 497524109568.0, + "grad_norm": 0.03573797234588687, + "language_loss": 0.80267316, + "learning_rate": 0.0006892530211320763, + "loss": 0.81319225, + "num_input_tokens_seen": 171377392, + "router_z_loss_mlp": 0.43798828, + "step": 2054, + "time_per_iteration": 2.767686605453491 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104549, + "balance_loss_mlp": 1.00193131, + "epoch": 0.39534436321662175, + "flos": 532223136000.0, + "grad_norm": 0.03591265467553322, + "language_loss": 0.84680569, + "learning_rate": 0.000688964621218926, + "loss": 0.85726058, + "num_input_tokens_seen": 171447424, + "router_z_loss_mlp": 0.4362793, + "step": 2055, + "time_per_iteration": 2.6054694652557373 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048043, + "balance_loss_mlp": 1.004722, + "epoch": 0.39553674490188534, + "flos": 703725523200.0, + "grad_norm": 0.03424008758122415, + "language_loss": 0.8074584, + "learning_rate": 0.0006886761479432037, + "loss": 0.8179388, + "num_input_tokens_seen": 171519920, + "router_z_loss_mlp": 0.43383789, + "step": 2056, + "time_per_iteration": 2.8390727043151855 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047733, + "balance_loss_mlp": 1.0042696, + "epoch": 0.3957291265871489, + "flos": 410656979712.0, + "grad_norm": 0.03388460034269331, + "language_loss": 0.85256028, + "learning_rate": 0.0006883876014169045, + "loss": 0.86303759, + "num_input_tokens_seen": 171583856, + "router_z_loss_mlp": 0.43530273, + "step": 2057, + "time_per_iteration": 2.554170846939087 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051678, + "balance_loss_mlp": 1.00814319, + "epoch": 0.39592150827241246, + "flos": 619639485696.0, + "grad_norm": 0.03722447028160607, + "language_loss": 0.90694773, + "learning_rate": 0.000688098981752052, + "loss": 0.91746461, + "num_input_tokens_seen": 171656064, + "router_z_loss_mlp": 0.43603516, + "step": 2058, + "time_per_iteration": 2.733053684234619 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049177, + "balance_loss_mlp": 1.00568974, + "epoch": 0.39611388995767605, + "flos": 822721524480.0, + "grad_norm": 0.04279286873756595, + "language_loss": 0.80609208, + "learning_rate": 0.0006878102890606982, + "loss": 0.81658387, + "num_input_tokens_seen": 171738800, + "router_z_loss_mlp": 0.43554688, + "step": 2059, + "time_per_iteration": 3.084789752960205 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047553, + "balance_loss_mlp": 1.00416124, + "epoch": 0.3963062716429396, + "flos": 493214921472.0, + "grad_norm": 0.03961147378322192, + "language_loss": 0.81771576, + "learning_rate": 0.0006875215234549239, + "loss": 0.82819128, + "num_input_tokens_seen": 171803664, + "router_z_loss_mlp": 0.43457031, + "step": 2060, + "time_per_iteration": 2.5823421478271484 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046932, + "balance_loss_mlp": 1.00351596, + "epoch": 0.39649865332820317, + "flos": 585834764544.0, + "grad_norm": 0.03854635921535854, + "language_loss": 0.8654902, + "learning_rate": 0.0006872326850468376, + "loss": 0.87595946, + "num_input_tokens_seen": 171871968, + "router_z_loss_mlp": 0.43481445, + "step": 2061, + "time_per_iteration": 2.705690860748291 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048788, + "balance_loss_mlp": 1.0052762, + "epoch": 0.3966910350134667, + "flos": 459512267520.0, + "grad_norm": 0.037411346592439484, + "language_loss": 0.79843795, + "learning_rate": 0.0006869437739485762, + "loss": 0.80892581, + "num_input_tokens_seen": 171942368, + "router_z_loss_mlp": 0.43579102, + "step": 2062, + "time_per_iteration": 2.5978832244873047 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050469, + "balance_loss_mlp": 1.00710082, + "epoch": 0.3968834166987303, + "flos": 509615694336.0, + "grad_norm": 0.03224635872548594, + "language_loss": 0.93265009, + "learning_rate": 0.0006866547902723053, + "loss": 0.94315481, + "num_input_tokens_seen": 172012336, + "router_z_loss_mlp": 0.43432617, + "step": 2063, + "time_per_iteration": 2.7325148582458496 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048279, + "balance_loss_mlp": 1.00502992, + "epoch": 0.3970757983839938, + "flos": 573743179776.0, + "grad_norm": 0.0353853142482034, + "language_loss": 0.80804694, + "learning_rate": 0.000686365734130218, + "loss": 0.81852973, + "num_input_tokens_seen": 172084640, + "router_z_loss_mlp": 0.43310547, + "step": 2064, + "time_per_iteration": 2.719521999359131 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046826, + "balance_loss_mlp": 1.00350547, + "epoch": 0.3972681800692574, + "flos": 482586303744.0, + "grad_norm": 0.03284702600830507, + "language_loss": 0.8411094, + "learning_rate": 0.000686076605634536, + "loss": 0.8515777, + "num_input_tokens_seen": 172152992, + "router_z_loss_mlp": 0.43383789, + "step": 2065, + "time_per_iteration": 2.6333730220794678 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051782, + "balance_loss_mlp": 1.00822306, + "epoch": 0.397460561754521, + "flos": 488905733376.0, + "grad_norm": 0.0324228687482344, + "language_loss": 0.84781277, + "learning_rate": 0.0006857874048975088, + "loss": 0.85833061, + "num_input_tokens_seen": 172219312, + "router_z_loss_mlp": 0.4362793, + "step": 2066, + "time_per_iteration": 2.5906848907470703 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049991, + "balance_loss_mlp": 1.00659895, + "epoch": 0.3976529434397845, + "flos": 422896318464.0, + "grad_norm": 0.03171433053589848, + "language_loss": 0.8744958, + "learning_rate": 0.0006854981320314142, + "loss": 0.8849957, + "num_input_tokens_seen": 172282112, + "router_z_loss_mlp": 0.43457031, + "step": 2067, + "time_per_iteration": 2.4699788093566895 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045752, + "balance_loss_mlp": 1.00240779, + "epoch": 0.3978453251250481, + "flos": 546622415616.0, + "grad_norm": 0.03563960500295594, + "language_loss": 0.8728829, + "learning_rate": 0.0006852087871485579, + "loss": 0.88334048, + "num_input_tokens_seen": 172347872, + "router_z_loss_mlp": 0.43408203, + "step": 2068, + "time_per_iteration": 2.6414859294891357 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044908, + "balance_loss_mlp": 1.00163472, + "epoch": 0.39803770681031164, + "flos": 652002627072.0, + "grad_norm": 0.03732729296318665, + "language_loss": 0.82978511, + "learning_rate": 0.0006849193703612735, + "loss": 0.84023428, + "num_input_tokens_seen": 172418560, + "router_z_loss_mlp": 0.43334961, + "step": 2069, + "time_per_iteration": 2.791269063949585 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104332, + "balance_loss_mlp": 0.999928, + "epoch": 0.39823008849557523, + "flos": 741427272960.0, + "grad_norm": 0.030595728613543666, + "language_loss": 0.78243995, + "learning_rate": 0.0006846298817819225, + "loss": 0.79287314, + "num_input_tokens_seen": 172497984, + "router_z_loss_mlp": 0.43457031, + "step": 2070, + "time_per_iteration": 2.9561986923217773 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045511, + "balance_loss_mlp": 1.00235701, + "epoch": 0.39842247018083876, + "flos": 385889597184.0, + "grad_norm": 0.036398106493658954, + "language_loss": 0.81909132, + "learning_rate": 0.0006843403215228945, + "loss": 0.82954645, + "num_input_tokens_seen": 172560112, + "router_z_loss_mlp": 0.43212891, + "step": 2071, + "time_per_iteration": 2.4993679523468018 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045222, + "balance_loss_mlp": 1.00218797, + "epoch": 0.39861485186610235, + "flos": 534763155456.0, + "grad_norm": 0.028807086351499752, + "language_loss": 0.8150484, + "learning_rate": 0.0006840506896966065, + "loss": 0.82550067, + "num_input_tokens_seen": 172636192, + "router_z_loss_mlp": 0.4309082, + "step": 2072, + "time_per_iteration": 2.7684881687164307 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049113, + "balance_loss_mlp": 1.00595963, + "epoch": 0.39880723355136594, + "flos": 644413671168.0, + "grad_norm": 0.03625588542647267, + "language_loss": 0.83127856, + "learning_rate": 0.0006837609864155038, + "loss": 0.8417697, + "num_input_tokens_seen": 172715264, + "router_z_loss_mlp": 0.43212891, + "step": 2073, + "time_per_iteration": 2.8514270782470703 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051094, + "balance_loss_mlp": 1.00782108, + "epoch": 0.39899961523662947, + "flos": 516892612608.0, + "grad_norm": 0.031931162968107815, + "language_loss": 0.83936673, + "learning_rate": 0.0006834712117920592, + "loss": 0.84987766, + "num_input_tokens_seen": 172783456, + "router_z_loss_mlp": 0.43334961, + "step": 2074, + "time_per_iteration": 2.6099319458007812 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048921, + "balance_loss_mlp": 1.00583923, + "epoch": 0.39919199692189306, + "flos": 465338857728.0, + "grad_norm": 0.040350277752625376, + "language_loss": 0.86345923, + "learning_rate": 0.0006831813659387729, + "loss": 0.87394845, + "num_input_tokens_seen": 172848928, + "router_z_loss_mlp": 0.43139648, + "step": 2075, + "time_per_iteration": 2.5189003944396973 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047413, + "balance_loss_mlp": 1.00421119, + "epoch": 0.3993843786071566, + "flos": 532679036928.0, + "grad_norm": 0.031639049857806745, + "language_loss": 0.84865057, + "learning_rate": 0.0006828914489681733, + "loss": 0.85912478, + "num_input_tokens_seen": 172921152, + "router_z_loss_mlp": 0.43261719, + "step": 2076, + "time_per_iteration": 2.7052366733551025 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045652, + "balance_loss_mlp": 1.00252223, + "epoch": 0.3995767602924202, + "flos": 505024604160.0, + "grad_norm": 0.02906284980485529, + "language_loss": 0.85967886, + "learning_rate": 0.0006826014609928162, + "loss": 0.87013543, + "num_input_tokens_seen": 172998864, + "router_z_loss_mlp": 0.43188477, + "step": 2077, + "time_per_iteration": 2.7127158641815186 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046635, + "balance_loss_mlp": 1.00514984, + "epoch": 0.3997691419776837, + "flos": 1457473781760.0, + "grad_norm": 0.010869866041652092, + "language_loss": 0.83199388, + "learning_rate": 0.0006823114021252846, + "loss": 0.84246022, + "num_input_tokens_seen": 173219216, + "router_z_loss_mlp": 0.41503906, + "step": 2078, + "time_per_iteration": 4.8602213859558105 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048968, + "balance_loss_mlp": 1.00586236, + "epoch": 0.3999615236629473, + "flos": 531756541440.0, + "grad_norm": 0.03484656463436615, + "language_loss": 0.80513203, + "learning_rate": 0.0006820212724781896, + "loss": 0.81562173, + "num_input_tokens_seen": 173292000, + "router_z_loss_mlp": 0.43164062, + "step": 2079, + "time_per_iteration": 2.6769065856933594 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050357, + "balance_loss_mlp": 1.00732243, + "epoch": 0.4001539053482108, + "flos": 696362088960.0, + "grad_norm": 0.03370335981625205, + "language_loss": 0.84624374, + "learning_rate": 0.0006817310721641694, + "loss": 0.85674727, + "num_input_tokens_seen": 173365568, + "router_z_loss_mlp": 0.4309082, + "step": 2080, + "time_per_iteration": 2.8362321853637695 + } + ], + "logging_steps": 1.0, + "max_steps": 5198, + "num_input_tokens_seen": 173365568, + "num_train_epochs": 1, + "save_steps": 1040, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 4728136041234432.0, + "train_batch_size": 16, + "trial_name": null, + "trial_params": null +} diff --git a/sft_pretrain/Full_xmoe/checkpoint-2080/training_args.bin b/sft_pretrain/Full_xmoe/checkpoint-2080/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..e62437ed6fbf4cf3ea22fcfae3749bb9df2d0109 --- /dev/null +++ b/sft_pretrain/Full_xmoe/checkpoint-2080/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4144fbe7f1cf435dbbf0ef9621414cb3e97a5ff4a560571b878000caf2931b07 +size 7992 diff --git a/sft_pretrain/Full_xmoe/checkpoint-2080/zero_to_fp32.py b/sft_pretrain/Full_xmoe/checkpoint-2080/zero_to_fp32.py new file mode 100644 index 0000000000000000000000000000000000000000..24cc342e78d1a006c782b3a4cd68d9ce786d8fd8 --- /dev/null +++ b/sft_pretrain/Full_xmoe/checkpoint-2080/zero_to_fp32.py @@ -0,0 +1,604 @@ +#!/usr/bin/env python + +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +# This script extracts fp32 consolidated weights from a zero 1, 2 and 3 DeepSpeed checkpoints. It gets +# copied into the top level checkpoint dir, so the user can easily do the conversion at any point in +# the future. Once extracted, the weights don't require DeepSpeed and can be used in any +# application. +# +# example: python zero_to_fp32.py . pytorch_model.bin + +import argparse +import torch +import glob +import math +import os +import re +from collections import OrderedDict +from dataclasses import dataclass + +# while this script doesn't use deepspeed to recover data, since the checkpoints are pickled with +# DeepSpeed data structures it has to be available in the current python environment. +from deepspeed.utils import logger +from deepspeed.checkpoint.constants import (DS_VERSION, OPTIMIZER_STATE_DICT, SINGLE_PARTITION_OF_FP32_GROUPS, + FP32_FLAT_GROUPS, ZERO_STAGE, PARTITION_COUNT, PARAM_SHAPES, BUFFER_NAMES, + FROZEN_PARAM_SHAPES, FROZEN_PARAM_FRAGMENTS) + + +@dataclass +class zero_model_state: + buffers: dict() + param_shapes: dict() + shared_params: list + ds_version: int + frozen_param_shapes: dict() + frozen_param_fragments: dict() + + +debug = 0 + +# load to cpu +device = torch.device('cpu') + + +def atoi(text): + return int(text) if text.isdigit() else text + + +def natural_keys(text): + ''' + alist.sort(key=natural_keys) sorts in human order + http://nedbatchelder.com/blog/200712/human_sorting.html + (See Toothy's implementation in the comments) + ''' + return [atoi(c) for c in re.split(r'(\d+)', text)] + + +def get_model_state_file(checkpoint_dir, zero_stage): + if not os.path.isdir(checkpoint_dir): + raise FileNotFoundError(f"Directory '{checkpoint_dir}' doesn't exist") + + # there should be only one file + if zero_stage <= 2: + file = os.path.join(checkpoint_dir, "mp_rank_00_model_states.pt") + elif zero_stage == 3: + file = os.path.join(checkpoint_dir, "zero_pp_rank_0_mp_rank_00_model_states.pt") + + if not os.path.exists(file): + raise FileNotFoundError(f"can't find model states file at '{file}'") + + return file + + +def get_checkpoint_files(checkpoint_dir, glob_pattern): + # XXX: need to test that this simple glob rule works for multi-node setup too + ckpt_files = sorted(glob.glob(os.path.join(checkpoint_dir, glob_pattern)), key=natural_keys) + + if len(ckpt_files) == 0: + raise FileNotFoundError(f"can't find {glob_pattern} files in directory '{checkpoint_dir}'") + + return ckpt_files + + +def get_optim_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_optim_states.pt") + + +def get_model_state_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_model_states.pt") + + +def parse_model_states(files): + zero_model_states = [] + for file in files: + state_dict = torch.load(file, map_location=device) + + if BUFFER_NAMES not in state_dict: + raise ValueError(f"{file} is not a model state checkpoint") + buffer_names = state_dict[BUFFER_NAMES] + if debug: + print("Found buffers:", buffer_names) + + # recover just the buffers while restoring them to fp32 if they were saved in fp16 + buffers = {k: v.float() for k, v in state_dict["module"].items() if k in buffer_names} + param_shapes = state_dict[PARAM_SHAPES] + + # collect parameters that are included in param_shapes + param_names = [] + for s in param_shapes: + for name in s.keys(): + param_names.append(name) + + # update with frozen parameters + frozen_param_shapes = state_dict.get(FROZEN_PARAM_SHAPES, None) + if frozen_param_shapes is not None: + if debug: + print(f"Found frozen_param_shapes: {frozen_param_shapes}") + param_names += list(frozen_param_shapes.keys()) + + # handle shared params + shared_params = [[k, v] for k, v in state_dict["shared_params"].items()] + + ds_version = state_dict.get(DS_VERSION, None) + + frozen_param_fragments = state_dict.get(FROZEN_PARAM_FRAGMENTS, None) + + z_model_state = zero_model_state(buffers=buffers, + param_shapes=param_shapes, + shared_params=shared_params, + ds_version=ds_version, + frozen_param_shapes=frozen_param_shapes, + frozen_param_fragments=frozen_param_fragments) + zero_model_states.append(z_model_state) + + return zero_model_states + + +def parse_optim_states(files, ds_checkpoint_dir): + + total_files = len(files) + state_dicts = [] + for f in files: + state_dict = torch.load(f, map_location=device) + # immediately discard the potentially huge 2 optimizer states as we only care for fp32 master weights + # and also handle the case where it was already removed by another helper script + state_dict["optimizer_state_dict"].pop("optimizer_state_dict", None) + state_dicts.append(state_dict) + + if not ZERO_STAGE in state_dicts[0][OPTIMIZER_STATE_DICT]: + raise ValueError(f"{files[0]} is not a zero checkpoint") + zero_stage = state_dicts[0][OPTIMIZER_STATE_DICT][ZERO_STAGE] + world_size = state_dicts[0][OPTIMIZER_STATE_DICT][PARTITION_COUNT] + + # For ZeRO-2 each param group can have different partition_count as data parallelism for expert + # parameters can be different from data parallelism for non-expert parameters. So we can just + # use the max of the partition_count to get the dp world_size. + + if type(world_size) is list: + world_size = max(world_size) + + if world_size != total_files: + raise ValueError( + f"Expected {world_size} of '*_optim_states.pt' under '{ds_checkpoint_dir}' but found {total_files} files. " + "Possibly due to an overwrite of an old checkpoint, or a checkpoint didn't get saved by one or more processes." + ) + + # the groups are named differently in each stage + if zero_stage <= 2: + fp32_groups_key = SINGLE_PARTITION_OF_FP32_GROUPS + elif zero_stage == 3: + fp32_groups_key = FP32_FLAT_GROUPS + else: + raise ValueError(f"unknown zero stage {zero_stage}") + + if zero_stage <= 2: + fp32_flat_groups = [state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key] for i in range(len(state_dicts))] + elif zero_stage == 3: + # if there is more than one param group, there will be multiple flattened tensors - one + # flattened tensor per group - for simplicity merge them into a single tensor + # + # XXX: could make the script more memory efficient for when there are multiple groups - it + # will require matching the sub-lists of param_shapes for each param group flattened tensor + + fp32_flat_groups = [ + torch.cat(state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key], 0) for i in range(len(state_dicts)) + ] + + return zero_stage, world_size, fp32_flat_groups + + +def _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters): + """ + Returns fp32 state_dict reconstructed from ds checkpoint + + Args: + - ``ds_checkpoint_dir``: path to the deepspeed checkpoint folder (where the optimizer files are) + + """ + print(f"Processing zero checkpoint '{ds_checkpoint_dir}'") + + optim_files = get_optim_files(ds_checkpoint_dir) + zero_stage, world_size, fp32_flat_groups = parse_optim_states(optim_files, ds_checkpoint_dir) + print(f"Detected checkpoint of type zero stage {zero_stage}, world_size: {world_size}") + + model_files = get_model_state_files(ds_checkpoint_dir) + + zero_model_states = parse_model_states(model_files) + print(f'Parsing checkpoint created by deepspeed=={zero_model_states[0].ds_version}') + + if zero_stage <= 2: + return _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + elif zero_stage == 3: + return _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + + +def _zero2_merge_frozen_params(state_dict, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + frozen_param_fragments = zero_model_states[0].frozen_param_fragments + + if debug: + num_elem = sum(s.numel() for s in frozen_param_shapes.values()) + print(f'rank 0: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + wanted_params = len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in frozen_param_fragments.values()]) + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + state_dict[name] = frozen_param_fragments[name] + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +def _has_callable(obj, fn): + attr = getattr(obj, fn, None) + return callable(attr) + + +def _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + + # Reconstruction protocol: + # + # XXX: document this + + if debug: + for i in range(world_size): + for j in range(len(fp32_flat_groups[0])): + print(f"{FP32_FLAT_GROUPS}[{i}][{j}].shape={fp32_flat_groups[i][j].shape}") + + # XXX: memory usage doubles here (zero2) + num_param_groups = len(fp32_flat_groups[0]) + merged_single_partition_of_fp32_groups = [] + for i in range(num_param_groups): + merged_partitions = [sd[i] for sd in fp32_flat_groups] + full_single_fp32_vector = torch.cat(merged_partitions, 0) + merged_single_partition_of_fp32_groups.append(full_single_fp32_vector) + avail_numel = sum( + [full_single_fp32_vector.numel() for full_single_fp32_vector in merged_single_partition_of_fp32_groups]) + + if debug: + wanted_params = sum([len(shapes) for shapes in param_shapes]) + wanted_numel = sum([sum(shape.numel() for shape in shapes.values()) for shapes in param_shapes]) + # not asserting if there is a mismatch due to possible padding + print(f"Have {avail_numel} numels to process.") + print(f"Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + total_numel = 0 + total_params = 0 + for shapes, full_single_fp32_vector in zip(param_shapes, merged_single_partition_of_fp32_groups): + offset = 0 + avail_numel = full_single_fp32_vector.numel() + for name, shape in shapes.items(): + + unpartitioned_numel = shape.numel() if _has_callable(shape, 'numel') else math.prod(shape) + total_numel += unpartitioned_numel + total_params += 1 + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + state_dict[name] = full_single_fp32_vector.narrow(0, offset, unpartitioned_numel).view(shape) + offset += unpartitioned_numel + + # Z2 started to align to 2*world_size to improve nccl performance. Therefore both offset and + # avail_numel can differ by anywhere between 0..2*world_size. Due to two unrelated complex + # paddings performed in the code it's almost impossible to predict the exact numbers w/o the + # live optimizer object, so we are checking that the numbers are within the right range + align_to = 2 * world_size + + def zero2_align(x): + return align_to * math.ceil(x / align_to) + + if debug: + print(f"original offset={offset}, avail_numel={avail_numel}") + + offset = zero2_align(offset) + avail_numel = zero2_align(avail_numel) + + if debug: + print(f"aligned offset={offset}, avail_numel={avail_numel}") + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero2_merge_frozen_params(state_dict, zero_model_states) + + _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def zero3_partitioned_param_info(unpartitioned_numel, world_size): + remainder = unpartitioned_numel % world_size + padding_numel = (world_size - remainder) if remainder else 0 + partitioned_numel = math.ceil(unpartitioned_numel / world_size) + return partitioned_numel, padding_numel + + +def _zero3_merge_frozen_params(state_dict, world_size, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + if debug: + for i in range(world_size): + num_elem = sum(s.numel() for s in zero_model_states[i].frozen_param_fragments.values()) + print(f'rank {i}: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + wanted_params = len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in zero_model_states[0].frozen_param_fragments.values()]) * world_size + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in zero_model_states[0].frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + param_frags = tuple(model_state.frozen_param_fragments[name] for model_state in zero_model_states) + state_dict[name] = torch.cat(param_frags, 0).narrow(0, 0, unpartitioned_numel).view(shape) + + partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Frozen params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +def _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + avail_numel = fp32_flat_groups[0].numel() * world_size + # Reconstruction protocol: For zero3 we need to zip the partitions together at boundary of each + # param, re-consolidating each param, while dealing with padding if any + + # merge list of dicts, preserving order + param_shapes = {k: v for d in param_shapes for k, v in d.items()} + + if debug: + for i in range(world_size): + print(f"{FP32_FLAT_GROUPS}[{i}].shape={fp32_flat_groups[i].shape}") + + wanted_params = len(param_shapes) + wanted_numel = sum(shape.numel() for shape in param_shapes.values()) + # not asserting if there is a mismatch due to possible padding + avail_numel = fp32_flat_groups[0].numel() * world_size + print(f"Trainable params: Have {avail_numel} numels to process.") + print(f"Trainable params: Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + offset = 0 + total_numel = 0 + total_params = 0 + for name, shape in param_shapes.items(): + + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + total_params += 1 + + partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Trainable params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + # XXX: memory usage doubles here + state_dict[name] = torch.cat( + tuple(fp32_flat_groups[i].narrow(0, offset, partitioned_numel) for i in range(world_size)), + 0).narrow(0, 0, unpartitioned_numel).view(shape) + offset += partitioned_numel + + offset *= world_size + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed Trainable fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero3_merge_frozen_params(state_dict, world_size, zero_model_states) + + _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag=None, exclude_frozen_parameters=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated state_dict that can be loaded with + ``load_state_dict()`` and used for training without DeepSpeed or shared with others, for example + via a model hub. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in 'latest' file. e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + + Returns: + - pytorch ``state_dict`` + + Note: this approach may not work if your application doesn't have sufficient free CPU memory and + you may need to use the offline approach using the ``zero_to_fp32.py`` script that is saved with + the checkpoint. + + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint + # do the training and checkpoint saving + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir) # already on cpu + model = model.cpu() # move to cpu + model.load_state_dict(state_dict) + # submit to model hub or save the model to share with others + + In this example the ``model`` will no longer be usable in the deepspeed context of the same + application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + If you want it all done for you, use ``load_state_dict_from_zero_checkpoint`` instead. + + """ + if tag is None: + latest_path = os.path.join(checkpoint_dir, 'latest') + if os.path.isfile(latest_path): + with open(latest_path, 'r') as fd: + tag = fd.read().strip() + else: + raise ValueError(f"Unable to find 'latest' file at {latest_path}") + + ds_checkpoint_dir = os.path.join(checkpoint_dir, tag) + + if not os.path.isdir(ds_checkpoint_dir): + raise FileNotFoundError(f"Directory '{ds_checkpoint_dir}' doesn't exist") + + return _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters) + + +def convert_zero_checkpoint_to_fp32_state_dict(checkpoint_dir, output_file, tag=None, exclude_frozen_parameters=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` file that can be + loaded with ``torch.load(file)`` + ``load_state_dict()`` and used for training without DeepSpeed. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``output_file``: path to the pytorch fp32 state_dict output file (e.g. path/pytorch_model.bin) + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + """ + + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag, exclude_frozen_parameters) + print(f"Saving fp32 state dict to {output_file}") + torch.save(state_dict, output_file) + + +def load_state_dict_from_zero_checkpoint(model, checkpoint_dir, tag=None): + """ + 1. Put the provided model to cpu + 2. Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` + 3. Load it into the provided model + + Args: + - ``model``: the model object to update + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + + Returns: + - ``model`: modified model + + Make sure you have plenty of CPU memory available before you call this function. If you don't + have enough use the ``zero_to_fp32.py`` utility to do the conversion. You will find it + conveniently placed for you in the checkpoint folder. + + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import load_state_dict_from_zero_checkpoint + model = load_state_dict_from_zero_checkpoint(trainer.model, checkpoint_dir) + # submit to model hub or save the model to share with others + + Note, that once this was run, the ``model`` will no longer be usable in the deepspeed context + of the same application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + """ + logger.info(f"Extracting fp32 weights") + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag) + + logger.info(f"Overwriting model with fp32 weights") + model = model.cpu() + model.load_state_dict(state_dict, strict=False) + + return model + + +if __name__ == "__main__": + + parser = argparse.ArgumentParser() + parser.add_argument("checkpoint_dir", + type=str, + help="path to the desired checkpoint folder, e.g., path/checkpoint-12") + parser.add_argument( + "output_file", + type=str, + help="path to the pytorch fp32 state_dict output file (e.g. path/checkpoint-12/pytorch_model.bin)") + parser.add_argument("-t", + "--tag", + type=str, + default=None, + help="checkpoint tag used as a unique identifier for checkpoint. e.g., global_step1") + parser.add_argument("--exclude_frozen_parameters", action='store_true', help="exclude frozen parameters") + parser.add_argument("-d", "--debug", action='store_true', help="enable debug") + args = parser.parse_args() + + debug = args.debug + + convert_zero_checkpoint_to_fp32_state_dict(args.checkpoint_dir, + args.output_file, + tag=args.tag, + exclude_frozen_parameters=args.exclude_frozen_parameters) diff --git a/sft_pretrain/Full_xmoe/checkpoint-3120/added_tokens.json b/sft_pretrain/Full_xmoe/checkpoint-3120/added_tokens.json new file mode 100644 index 0000000000000000000000000000000000000000..c9d3d3a1b74d87e381e471f7b33784015d2dc0ea --- /dev/null +++ b/sft_pretrain/Full_xmoe/checkpoint-3120/added_tokens.json @@ -0,0 +1,13 @@ +{ + "<|assistant|>": 32001, + "<|endoftext|>": 32000, + "<|end|>": 32007, + "<|placeholder1|>": 32002, + "<|placeholder2|>": 32003, + "<|placeholder3|>": 32004, + "<|placeholder4|>": 32005, + "<|placeholder5|>": 32008, + "<|placeholder6|>": 32009, + "<|system|>": 32006, + "<|user|>": 32010 +} diff --git a/sft_pretrain/Full_xmoe/checkpoint-3120/config.json b/sft_pretrain/Full_xmoe/checkpoint-3120/config.json new file mode 100644 index 0000000000000000000000000000000000000000..5ed860286ec8c9b3f17e5234326d2ed728ca6a65 --- /dev/null +++ b/sft_pretrain/Full_xmoe/checkpoint-3120/config.json @@ -0,0 +1,200 @@ +{ + "_name_or_path": "/cm/archive/namnv78/checkpoints/phi35-siglip224/pft", + "architectures": [ + "LlavaPhiForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "auto_map": { + "AutoConfig": "configuration_phi3.Phi3Config", + "AutoModelForCausalLM": "modeling_phi3.Phi3ForCausalLM" + }, + "bal_comp_loss_coef": 0.01, + "balance_loss_coef": 0.01, + "bos_token_id": 1, + "clip_smoe": false, + "diversity_loss_coef": 0.01, + "dropout": false, + "e_loss_coef": 0.001, + "embd_pdrop": 0.0, + "entropy_advance_loss": false, + "eos_token_id": 32000, + "freeze_backbone": false, + "freeze_mm_mlp_adapter": false, + "hidden_act": "silu", + "hidden_size": 3072, + "hybrid": false, + "image_aspect_ratio": "pad", + "init_weight": true, + "initializer_range": 0.02, + "intermediate_size": 8192, + "is_cosine": false, + "is_norm_weight": false, + "local_rank": 0, + "loss1": "balanceloss", + "loss2": "zloss", + "luna": false, + "max_compete_in_iter": 3, + "max_position_embeddings": 131072, + "mlp_smoe": true, + "mm_hidden_size": 1152, + "mm_patch_merge_type": "flat", + "mm_projector_lr": null, + "mm_projector_type": "moe", + "mm_use_im_patch_token": false, + "mm_use_im_start_end": false, + "mm_vision_select_feature": "patch", + "mm_vision_select_layer": -2, + "mm_vision_tower": "google/siglip-so400m-patch14-224", + "model_name_or_path": "/cm/archive/namnv78/checkpoints/phi35-siglip224/pft", + "model_type": "llava_phi", + "moe_name": "xmoe", + "norm_softmax": false, + "normalization": false, + "num_attention_heads": 32, + "num_experts": 8, + "num_hidden_layers": 32, + "num_key_value_heads": 32, + "num_layers": 3, + "num_selected": 4, + "number_of_previous_tokens": 2, + "original_max_position_embeddings": 4096, + "pad_token_id": 32000, + "pretrain_mm_mlp_adapter": null, + "rate_compete": 0.2, + "rate_flip": 0.05, + "resid_pdrop": 0.0, + "rms_norm_eps": 1e-05, + "rope_scaling": { + "long_factor": [ + 1.0800000429153442, + 1.1100000143051147, + 1.1399999856948853, + 1.340000033378601, + 1.5899999141693115, + 1.600000023841858, + 1.6200000047683716, + 2.620000123977661, + 3.2300000190734863, + 3.2300000190734863, + 4.789999961853027, + 7.400000095367432, + 7.700000286102295, + 9.09000015258789, + 12.199999809265137, + 17.670000076293945, + 24.46000099182129, + 28.57000160217285, + 30.420001983642578, + 30.840002059936523, + 32.590003967285156, + 32.93000411987305, + 42.320003509521484, + 44.96000289916992, + 50.340003967285156, + 50.45000457763672, + 57.55000305175781, + 57.93000411987305, + 58.21000289916992, + 60.1400032043457, + 62.61000442504883, + 62.62000274658203, + 62.71000289916992, + 63.1400032043457, + 63.1400032043457, + 63.77000427246094, + 63.93000411987305, + 63.96000289916992, + 63.970001220703125, + 64.02999877929688, + 64.06999969482422, + 64.08000183105469, + 64.12000274658203, + 64.41000366210938, + 64.4800033569336, + 64.51000213623047, + 64.52999877929688, + 64.83999633789062 + ], + "short_factor": [ + 1.0, + 1.0199999809265137, + 1.0299999713897705, + 1.0299999713897705, + 1.0499999523162842, + 1.0499999523162842, + 1.0499999523162842, + 1.0499999523162842, + 1.0499999523162842, + 1.0699999332427979, + 1.0999999046325684, + 1.1099998950958252, + 1.1599998474121094, + 1.1599998474121094, + 1.1699998378753662, + 1.2899998426437378, + 1.339999794960022, + 1.679999828338623, + 1.7899998426437378, + 1.8199998140335083, + 1.8499997854232788, + 1.8799997568130493, + 1.9099997282028198, + 1.9399996995925903, + 1.9899996519088745, + 2.0199997425079346, + 2.0199997425079346, + 2.0199997425079346, + 2.0199997425079346, + 2.0199997425079346, + 2.0199997425079346, + 2.0299997329711914, + 2.0299997329711914, + 2.0299997329711914, + 2.0299997329711914, + 2.0299997329711914, + 2.0299997329711914, + 2.0299997329711914, + 2.0299997329711914, + 2.0299997329711914, + 2.0799996852874756, + 2.0899996757507324, + 2.189999580383301, + 2.2199995517730713, + 2.5899994373321533, + 2.729999542236328, + 2.749999523162842, + 2.8399994373321533 + ], + "type": "longrope" + }, + "rope_theta": 10000.0, + "router_loss_coef": 0.01, + "router_theta": 0.1, + "router_z_loss_coef": 0.001, + "scales": [ + 1, + 3 + ], + "sliding_window": 262144, + "sparse_upcycling": false, + "strategy_train": "base", + "tie_word_embeddings": false, + "tokenizer_model_max_length": 2048, + "tokenizer_padding_side": "right", + "topk_max": 2, + "topk_min": 1, + "torch_dtype": "bfloat16", + "training": true, + "transformers_version": "4.43.0", + "tune_mm_mlp_adapter": false, + "unit_test": true, + "use_cache": false, + "use_mm_proj": true, + "use_old": false, + "version": "phi35", + "vision_tower": "google/siglip-so400m-patch14-224", + "vision_tower_dir": "/cm/archive/namnv78/checkpoints/phi35-siglip224/pft/clip.bin", + "vocab_size": 32064, + "warm_up": 0.05 +} diff --git a/sft_pretrain/Full_xmoe/checkpoint-3120/generation_config.json b/sft_pretrain/Full_xmoe/checkpoint-3120/generation_config.json new file mode 100644 index 0000000000000000000000000000000000000000..dad5c4578f0dc5969b38755d095fc30c368bb54a --- /dev/null +++ b/sft_pretrain/Full_xmoe/checkpoint-3120/generation_config.json @@ -0,0 +1,12 @@ +{ + "_from_model_config": true, + "bos_token_id": 1, + "do_sample": true, + "eos_token_id": [ + 32007, + 32001, + 32000 + ], + "pad_token_id": 32000, + "transformers_version": "4.43.0" +} diff --git a/sft_pretrain/Full_xmoe/checkpoint-3120/global_step3120/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt b/sft_pretrain/Full_xmoe/checkpoint-3120/global_step3120/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..34e951f53b0bf50d9f941c025e39d6e0ab66123e --- /dev/null +++ b/sft_pretrain/Full_xmoe/checkpoint-3120/global_step3120/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3b5f6af1efc0fa6db8959a5e1005a3b99bea9a1f1d8bbe6f496f88a89460b05b +size 396609872 diff --git a/sft_pretrain/Full_xmoe/checkpoint-3120/global_step3120/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt b/sft_pretrain/Full_xmoe/checkpoint-3120/global_step3120/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..035d00b30cee5b71d0701ea671bab3bbd8f41ff1 --- /dev/null +++ b/sft_pretrain/Full_xmoe/checkpoint-3120/global_step3120/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a938351e84fd8eeebe2953a2b20e80cb1d8d437eb71309294159db4cb076ea3c +size 396609872 diff --git a/sft_pretrain/Full_xmoe/checkpoint-3120/global_step3120/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt b/sft_pretrain/Full_xmoe/checkpoint-3120/global_step3120/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..0668077a8889149d5c6de3830b621203afa5f9ea --- /dev/null +++ b/sft_pretrain/Full_xmoe/checkpoint-3120/global_step3120/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6352a4385594bae27576f56d78f1ba7b2436324488983efea19d83219330677d +size 396609872 diff --git a/sft_pretrain/Full_xmoe/checkpoint-3120/global_step3120/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt b/sft_pretrain/Full_xmoe/checkpoint-3120/global_step3120/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..e1d69a26d1c7263530e7e0416fa3fbbbf21ca0ff --- /dev/null +++ b/sft_pretrain/Full_xmoe/checkpoint-3120/global_step3120/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c33b1058b45a38beb882e68a22371f9c9c5057a909854ee89029679fa68ea9f0 +size 396609872 diff --git a/sft_pretrain/Full_xmoe/checkpoint-3120/global_step3120/zero_pp_rank_0_mp_rank_00_model_states.pt b/sft_pretrain/Full_xmoe/checkpoint-3120/global_step3120/zero_pp_rank_0_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..76bfb3b26d9111afd155f581c4735e12045366b4 --- /dev/null +++ b/sft_pretrain/Full_xmoe/checkpoint-3120/global_step3120/zero_pp_rank_0_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a7ed53f36818215cd4463eca44e0973a1386b60cae56f7fa1a2aded288a4664a +size 2117322914 diff --git a/sft_pretrain/Full_xmoe/checkpoint-3120/global_step3120/zero_pp_rank_1_mp_rank_00_model_states.pt b/sft_pretrain/Full_xmoe/checkpoint-3120/global_step3120/zero_pp_rank_1_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..fd7870d6ec0586cc9f33d8f730ccf63abf7f60d8 --- /dev/null +++ b/sft_pretrain/Full_xmoe/checkpoint-3120/global_step3120/zero_pp_rank_1_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e57a586bde58939ea9397b6b4a2e750b99d4bd987da49e74170c06590821b3fe +size 2117322914 diff --git a/sft_pretrain/Full_xmoe/checkpoint-3120/global_step3120/zero_pp_rank_2_mp_rank_00_model_states.pt b/sft_pretrain/Full_xmoe/checkpoint-3120/global_step3120/zero_pp_rank_2_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..e65853ccc904454001f4d02bc1b7577bde90ac01 --- /dev/null +++ b/sft_pretrain/Full_xmoe/checkpoint-3120/global_step3120/zero_pp_rank_2_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cef54d09f76cf096eb55b4671b101124bb631c0e224e0e3c80685d51641e58db +size 2117322914 diff --git a/sft_pretrain/Full_xmoe/checkpoint-3120/global_step3120/zero_pp_rank_3_mp_rank_00_model_states.pt b/sft_pretrain/Full_xmoe/checkpoint-3120/global_step3120/zero_pp_rank_3_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..74cb8fabdd1c6cef47a37bec9efae33788d429cb --- /dev/null +++ b/sft_pretrain/Full_xmoe/checkpoint-3120/global_step3120/zero_pp_rank_3_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a6dc9a7ddda81942377c835a584d289ebc5699927ed3f197397a3f8b71896dde +size 2117322914 diff --git a/sft_pretrain/Full_xmoe/checkpoint-3120/latest b/sft_pretrain/Full_xmoe/checkpoint-3120/latest new file mode 100644 index 0000000000000000000000000000000000000000..804da059f781bacb3f274fb2103e4bc7f9bb7407 --- /dev/null +++ b/sft_pretrain/Full_xmoe/checkpoint-3120/latest @@ -0,0 +1 @@ +global_step3120 \ No newline at end of file diff --git a/sft_pretrain/Full_xmoe/checkpoint-3120/model-00001-of-00002.safetensors b/sft_pretrain/Full_xmoe/checkpoint-3120/model-00001-of-00002.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..29d76f5d80605301aab2bba59b53a5e2582094c4 --- /dev/null +++ b/sft_pretrain/Full_xmoe/checkpoint-3120/model-00001-of-00002.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fe6c4f6ef38e8993629091331e0bbf23484cc88bdfd038f0dd17b6ec2800d855 +size 4972489328 diff --git a/sft_pretrain/Full_xmoe/checkpoint-3120/model-00002-of-00002.safetensors b/sft_pretrain/Full_xmoe/checkpoint-3120/model-00002-of-00002.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..e0ad3834b5f67b1758aa394199df1c991aa1848a --- /dev/null +++ b/sft_pretrain/Full_xmoe/checkpoint-3120/model-00002-of-00002.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:378c1cb30f4943cb4afbdb60dcda3cc5273b8bfc8f2a41c23b13f1a290a5fd87 +size 3759044016 diff --git a/sft_pretrain/Full_xmoe/checkpoint-3120/model.safetensors.index.json b/sft_pretrain/Full_xmoe/checkpoint-3120/model.safetensors.index.json new file mode 100644 index 0000000000000000000000000000000000000000..507806fb086ee2ffdb4c1df263574fc5a7cfa513 --- /dev/null +++ b/sft_pretrain/Full_xmoe/checkpoint-3120/model.safetensors.index.json @@ -0,0 +1,675 @@ +{ + "metadata": { + "total_size": 8731443248 + }, + "weight_map": { + "lm_head.weight": "model-00002-of-00002.safetensors", + "model.embed_tokens.weight": "model-00001-of-00002.safetensors", + "model.layers.0.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.0.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.0.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.0.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.0.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.0.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.1.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.1.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.1.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.1.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.1.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.1.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.10.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.10.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.10.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.10.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.10.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.10.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.11.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.11.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.11.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.11.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.11.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.11.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.12.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.12.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.12.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.12.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.12.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.12.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.13.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.13.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.13.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.13.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.13.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.13.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.14.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.14.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.14.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.14.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.14.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.14.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.15.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.15.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.15.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.15.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.15.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.15.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.16.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.16.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.16.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.16.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.16.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.16.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.17.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.17.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.17.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.17.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.17.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.17.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.18.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.18.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.18.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.18.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.18.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.18.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.19.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.19.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.19.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.19.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.19.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.19.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.2.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.2.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.2.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.2.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.2.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.2.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.20.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.20.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.20.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.20.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.20.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.20.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.21.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.21.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.21.mlp.gate_up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.21.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.21.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.21.self_attn.qkv_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.22.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.22.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.22.mlp.gate_up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.22.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.22.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.22.self_attn.qkv_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.23.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.23.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.23.mlp.gate_up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.23.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.23.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.23.self_attn.qkv_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.24.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.24.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.24.mlp.gate_up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.24.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.24.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.24.self_attn.qkv_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.25.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.25.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.25.mlp.gate_up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.25.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.25.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.25.self_attn.qkv_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.26.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.26.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.26.mlp.gate_up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.26.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.26.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.26.self_attn.qkv_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.27.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.27.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.27.mlp.gate_up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.27.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.27.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.27.self_attn.qkv_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.28.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.28.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.28.mlp.gate_up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.28.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.28.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.28.self_attn.qkv_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.29.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.29.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.29.mlp.gate_up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.29.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.29.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.29.self_attn.qkv_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.3.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.3.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.3.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.3.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.3.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.3.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.30.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.30.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.30.mlp.gate_up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.30.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.30.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.30.self_attn.qkv_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.31.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.31.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.31.mlp.gate_up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.31.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.31.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.31.self_attn.qkv_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.4.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.4.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.4.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.4.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.4.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.4.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.5.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.5.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.5.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.5.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.5.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.5.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.6.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.6.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.6.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.6.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.6.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.6.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.7.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.7.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.7.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.7.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.7.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.7.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.8.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.8.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.8.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.8.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.8.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.8.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.9.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.9.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.9.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.9.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.9.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.9.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.mm_projector.layer_norm.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.layer_norm.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.expert_embeddings": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.0.0.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.0.0.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.0.2.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.0.2.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.1.0.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.1.0.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.1.2.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.1.2.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.2.0.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.2.0.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.2.2.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.2.2.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.3.0.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.3.0.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.3.2.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.3.2.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.4.0.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.4.0.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.4.2.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.4.2.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.5.0.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.5.0.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.5.2.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.5.2.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.6.0.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.6.0.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.6.2.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.6.2.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.7.0.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.7.0.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.7.2.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.7.2.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.gate.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.gate.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.inp_reduction.weight": "model-00002-of-00002.safetensors", + "model.norm.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.embeddings.patch_embedding.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.embeddings.patch_embedding.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.embeddings.position_embedding.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.self_attn.v_proj.weight": "model-00002-of-00002.safetensors" + } +} diff --git a/sft_pretrain/Full_xmoe/checkpoint-3120/rng_state_0.pth b/sft_pretrain/Full_xmoe/checkpoint-3120/rng_state_0.pth new file mode 100644 index 0000000000000000000000000000000000000000..ef4849062bcdc8ffd2246c07673ba196a8d61a6d --- /dev/null +++ b/sft_pretrain/Full_xmoe/checkpoint-3120/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fae2114fffe9b1eea30e28bbdb4ce59046b0079ea5b8dc4682079f609d49d787 +size 14960 diff --git a/sft_pretrain/Full_xmoe/checkpoint-3120/rng_state_1.pth b/sft_pretrain/Full_xmoe/checkpoint-3120/rng_state_1.pth new file mode 100644 index 0000000000000000000000000000000000000000..2fcb2b640bc236c26aa841680d34a91240247970 --- /dev/null +++ b/sft_pretrain/Full_xmoe/checkpoint-3120/rng_state_1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d4ff5f3a53530ac868291e2667c8f824bfa1f4fa1ce880df8223a7165ef38e11 +size 14960 diff --git a/sft_pretrain/Full_xmoe/checkpoint-3120/rng_state_2.pth b/sft_pretrain/Full_xmoe/checkpoint-3120/rng_state_2.pth new file mode 100644 index 0000000000000000000000000000000000000000..00c3f989de00e6d58ca7345ae6f65fee0afcbdcd --- /dev/null +++ b/sft_pretrain/Full_xmoe/checkpoint-3120/rng_state_2.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:91f80a7779b0034e70106ba6cb0e3e686052334c20ce54453ee3977cc0219d15 +size 14960 diff --git a/sft_pretrain/Full_xmoe/checkpoint-3120/rng_state_3.pth b/sft_pretrain/Full_xmoe/checkpoint-3120/rng_state_3.pth new file mode 100644 index 0000000000000000000000000000000000000000..f289913854ee3fa52a86e282421da07d85b8a4c4 --- /dev/null +++ b/sft_pretrain/Full_xmoe/checkpoint-3120/rng_state_3.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ece3bc0d0e16c43ef245cc787cbd0d63d08d460f489c4cd52adf6501b9281a18 +size 14960 diff --git a/sft_pretrain/Full_xmoe/checkpoint-3120/special_tokens_map.json b/sft_pretrain/Full_xmoe/checkpoint-3120/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..3e4d5a5bc1cb51753cc9ae0305ece0da60052b10 --- /dev/null +++ b/sft_pretrain/Full_xmoe/checkpoint-3120/special_tokens_map.json @@ -0,0 +1,24 @@ +{ + "bos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "<|endoftext|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": "", + "unk_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } +} diff --git a/sft_pretrain/Full_xmoe/checkpoint-3120/tokenizer.model b/sft_pretrain/Full_xmoe/checkpoint-3120/tokenizer.model new file mode 100644 index 0000000000000000000000000000000000000000..6c00c742ce03c627d6cd5b795984876fa49fa899 --- /dev/null +++ b/sft_pretrain/Full_xmoe/checkpoint-3120/tokenizer.model @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9e556afd44213b6bd1be2b850ebbbd98f5481437a8021afaf58ee7fb1818d347 +size 499723 diff --git a/sft_pretrain/Full_xmoe/checkpoint-3120/tokenizer_config.json b/sft_pretrain/Full_xmoe/checkpoint-3120/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..d579bb0b91b24b214ea3c2e487e27a65017cdc4a --- /dev/null +++ b/sft_pretrain/Full_xmoe/checkpoint-3120/tokenizer_config.json @@ -0,0 +1,132 @@ +{ + "add_bos_token": false, + "add_eos_token": false, + "add_prefix_space": true, + "added_tokens_decoder": { + "0": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "1": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "2": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": false + }, + "32000": { + "content": "<|endoftext|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "32001": { + "content": "<|assistant|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "32002": { + "content": "<|placeholder1|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "32003": { + "content": "<|placeholder2|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "32004": { + "content": "<|placeholder3|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "32005": { + "content": "<|placeholder4|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "32006": { + "content": "<|system|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "32007": { + "content": "<|end|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "32008": { + "content": "<|placeholder5|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "32009": { + "content": "<|placeholder6|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "32010": { + "content": "<|user|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + } + }, + "bos_token": "", + "chat_template": "{% for message in messages %}{% if message['role'] == 'system' and message['content'] %}{{'<|system|>\n' + message['content'] + '<|end|>\n'}}{% elif message['role'] == 'user' %}{{'<|user|>\n' + message['content'] + '<|end|>\n'}}{% elif message['role'] == 'assistant' %}{{'<|assistant|>\n' + message['content'] + '<|end|>\n'}}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ '<|assistant|>\n' }}{% else %}{{ eos_token }}{% endif %}", + "clean_up_tokenization_spaces": false, + "eos_token": "<|endoftext|>", + "legacy": false, + "model_max_length": 2048, + "pad_token": "", + "padding_side": "right", + "sp_model_kwargs": {}, + "spaces_between_special_tokens": false, + "tokenizer_class": "LlamaTokenizer", + "unk_token": "", + "use_default_system_prompt": false +} diff --git a/sft_pretrain/Full_xmoe/checkpoint-3120/trainer_state.json b/sft_pretrain/Full_xmoe/checkpoint-3120/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..a7cc40e7e3262c1ca3d73cf9335abd915f7444c8 --- /dev/null +++ b/sft_pretrain/Full_xmoe/checkpoint-3120/trainer_state.json @@ -0,0 +1,46833 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.6002308580223162, + "eval_steps": 500, + "global_step": 3120, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0334678, + "balance_loss_mlp": 2.48847342, + "epoch": 0.00019238168526356292, + "flos": 471022563072.0, + "grad_norm": 15.010934477254423, + "language_loss": 2.91277003, + "learning_rate": 0.0, + "loss": 1.95375419, + "num_input_tokens_seen": 67104, + "router_z_loss_mlp": 8.6015625, + "step": 1, + "time_per_iteration": 23.313215732574463 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.03608113, + "balance_loss_mlp": 3.00043201, + "epoch": 0.00038476337052712584, + "flos": 505538830848.0, + "grad_norm": 25.821694542927546, + "language_loss": 10.7459116, + "learning_rate": 0.00013726078121135892, + "loss": 10.78199196, + "num_input_tokens_seen": 134080, + "router_z_loss_mlp": 6.06640625, + "step": 2, + "time_per_iteration": 2.6342098712921143 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.03648002, + "balance_loss_mlp": 3.03803182, + "epoch": 0.0005771450557906887, + "flos": 600334166016.0, + "grad_norm": 27.537763142134942, + "language_loss": 10.88985825, + "learning_rate": 0.00021755319103969496, + "loss": 10.9263401, + "num_input_tokens_seen": 205152, + "router_z_loss_mlp": 6.08984375, + "step": 3, + "time_per_iteration": 2.9129159450531006 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.03639085, + "balance_loss_mlp": 3.03521824, + "epoch": 0.0007695267410542517, + "flos": 581497386240.0, + "grad_norm": 10.719163482624658, + "language_loss": 8.79598808, + "learning_rate": 0.00027452156242271784, + "loss": 8.83237934, + "num_input_tokens_seen": 269664, + "router_z_loss_mlp": 6.02734375, + "step": 4, + "time_per_iteration": 2.72357439994812 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.03604871, + "balance_loss_mlp": 3.01435566, + "epoch": 0.0009619084263178145, + "flos": 487154061312.0, + "grad_norm": 22.68157363884245, + "language_loss": 9.41989708, + "learning_rate": 0.0003187096642208417, + "loss": 9.45594501, + "num_input_tokens_seen": 338560, + "router_z_loss_mlp": 5.8984375, + "step": 5, + "time_per_iteration": 2.6791844367980957 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.03472164, + "balance_loss_mlp": 2.9011035, + "epoch": 0.0011542901115813775, + "flos": 561167503872.0, + "grad_norm": 7.113488232519407, + "language_loss": 9.41725159, + "learning_rate": 0.0003548139722510539, + "loss": 9.45197296, + "num_input_tokens_seen": 410112, + "router_z_loss_mlp": 5.72265625, + "step": 6, + "time_per_iteration": 2.7308623790740967 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.03266853, + "balance_loss_mlp": 2.70799947, + "epoch": 0.0013466717968449403, + "flos": 534951738624.0, + "grad_norm": 3.189932925125429, + "language_loss": 8.01036549, + "learning_rate": 0.00038533972973918044, + "loss": 8.0430336, + "num_input_tokens_seen": 477552, + "router_z_loss_mlp": 5.59765625, + "step": 7, + "time_per_iteration": 2.6907436847686768 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02962571, + "balance_loss_mlp": 2.41211033, + "epoch": 0.0015390534821085034, + "flos": 493334485248.0, + "grad_norm": 5.13822781788523, + "language_loss": 7.84486008, + "learning_rate": 0.0004117823436340768, + "loss": 7.87448597, + "num_input_tokens_seen": 549184, + "router_z_loss_mlp": 5.51171875, + "step": 8, + "time_per_iteration": 2.6274044513702393 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02550478, + "balance_loss_mlp": 2.0114615, + "epoch": 0.0017314351673720662, + "flos": 565776090624.0, + "grad_norm": 3.8232757327488405, + "language_loss": 7.62468719, + "learning_rate": 0.00043510638207938993, + "loss": 7.65019178, + "num_input_tokens_seen": 622880, + "router_z_loss_mlp": 5.39453125, + "step": 9, + "time_per_iteration": 2.7688682079315186 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02337757, + "balance_loss_mlp": 1.81705093, + "epoch": 0.001923816852635629, + "flos": 594509521152.0, + "grad_norm": 3.0012265425900817, + "language_loss": 6.96830463, + "learning_rate": 0.00045597044543220066, + "loss": 6.99168253, + "num_input_tokens_seen": 693584, + "router_z_loss_mlp": 5.20703125, + "step": 10, + "time_per_iteration": 2.736985921859741 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02262083, + "balance_loss_mlp": 1.74290299, + "epoch": 0.002116198537899192, + "flos": 610895709696.0, + "grad_norm": 2.2728267884834983, + "language_loss": 6.92078686, + "learning_rate": 0.00047484428652143135, + "loss": 6.94340801, + "num_input_tokens_seen": 774432, + "router_z_loss_mlp": 5.19140625, + "step": 11, + "time_per_iteration": 2.8857340812683105 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02308547, + "balance_loss_mlp": 1.78135598, + "epoch": 0.002308580223162755, + "flos": 546175262976.0, + "grad_norm": 4.334726148282724, + "language_loss": 6.71077013, + "learning_rate": 0.0004920747534624128, + "loss": 6.73385572, + "num_input_tokens_seen": 844304, + "router_z_loss_mlp": 5.2734375, + "step": 12, + "time_per_iteration": 2.635601282119751 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02317905, + "balance_loss_mlp": 1.79147708, + "epoch": 0.002500961908426318, + "flos": 645924270336.0, + "grad_norm": 3.1568536142119923, + "language_loss": 6.53248501, + "learning_rate": 0.0005079252465375872, + "loss": 6.55566406, + "num_input_tokens_seen": 915104, + "router_z_loss_mlp": 5.265625, + "step": 13, + "time_per_iteration": 2.8112540245056152 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02242807, + "balance_loss_mlp": 1.72019386, + "epoch": 0.0026933435936898806, + "flos": 488849352960.0, + "grad_norm": 7.572425831928954, + "language_loss": 6.47189951, + "learning_rate": 0.0005226005109505393, + "loss": 6.49432755, + "num_input_tokens_seen": 982720, + "router_z_loss_mlp": 5.2265625, + "step": 14, + "time_per_iteration": 2.590078353881836 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02247915, + "balance_loss_mlp": 1.72415757, + "epoch": 0.0028857252789534437, + "flos": 435526429440.0, + "grad_norm": 2.3229781853457747, + "language_loss": 6.01724243, + "learning_rate": 0.0005362628552605367, + "loss": 6.03972149, + "num_input_tokens_seen": 1050528, + "router_z_loss_mlp": 5.23828125, + "step": 15, + "time_per_iteration": 2.636983871459961 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02135688, + "balance_loss_mlp": 1.62108541, + "epoch": 0.0030781069642170067, + "flos": 597841778688.0, + "grad_norm": 4.36506198708269, + "language_loss": 5.46747923, + "learning_rate": 0.0005490431248454357, + "loss": 5.48883629, + "num_input_tokens_seen": 1116512, + "router_z_loss_mlp": 5.14453125, + "step": 16, + "time_per_iteration": 2.6904103755950928 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02173305, + "balance_loss_mlp": 1.67586899, + "epoch": 0.0032704886494805694, + "flos": 1541513154048.0, + "grad_norm": 0.3693165783384919, + "language_loss": 0.75705111, + "learning_rate": 0.0005610483427624225, + "loss": 0.77878416, + "num_input_tokens_seen": 1351216, + "router_z_loss_mlp": 4.96875, + "step": 17, + "time_per_iteration": 6.815098285675049 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01958957, + "balance_loss_mlp": 1.45846832, + "epoch": 0.0034628703347441324, + "flos": 474971102976.0, + "grad_norm": 7.376330921510473, + "language_loss": 3.16160107, + "learning_rate": 0.0005723671632907488, + "loss": 3.18119049, + "num_input_tokens_seen": 1420512, + "router_z_loss_mlp": 5.0, + "step": 18, + "time_per_iteration": 2.7730185985565186 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01974299, + "balance_loss_mlp": 1.48144007, + "epoch": 0.0036552520200076955, + "flos": 449478556416.0, + "grad_norm": 2.0435067055151803, + "language_loss": 1.8205657, + "learning_rate": 0.0005830738490244919, + "loss": 1.84030867, + "num_input_tokens_seen": 1484976, + "router_z_loss_mlp": 4.921875, + "step": 19, + "time_per_iteration": 2.5196421146392822 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02215561, + "balance_loss_mlp": 1.73147547, + "epoch": 0.003847633705271258, + "flos": 637351580928.0, + "grad_norm": 2.199322832792736, + "language_loss": 1.81859815, + "learning_rate": 0.0005932312266435596, + "loss": 1.84075379, + "num_input_tokens_seen": 1557392, + "router_z_loss_mlp": 4.83203125, + "step": 20, + "time_per_iteration": 2.7772061824798584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02397049, + "balance_loss_mlp": 1.91639686, + "epoch": 0.004040015390534821, + "flos": 590591105280.0, + "grad_norm": 2.068137361611091, + "language_loss": 1.81285238, + "learning_rate": 0.0006028929207788754, + "loss": 1.83682299, + "num_input_tokens_seen": 1626064, + "router_z_loss_mlp": 4.796875, + "step": 21, + "time_per_iteration": 2.7197327613830566 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02949394, + "balance_loss_mlp": 2.47560835, + "epoch": 0.004232397075798384, + "flos": 757866929664.0, + "grad_norm": 0.9893066861855494, + "language_loss": 1.43565178, + "learning_rate": 0.0006121050677327902, + "loss": 1.46514571, + "num_input_tokens_seen": 1696528, + "router_z_loss_mlp": 4.7265625, + "step": 22, + "time_per_iteration": 2.8821635246276855 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.04240368, + "balance_loss_mlp": 3.77421188, + "epoch": 0.004424778761061947, + "flos": 527727310080.0, + "grad_norm": 1.6702760591351544, + "language_loss": 1.36044598, + "learning_rate": 0.0006209076479463684, + "loss": 1.40284979, + "num_input_tokens_seen": 1765936, + "router_z_loss_mlp": 4.6484375, + "step": 23, + "time_per_iteration": 2.6194069385528564 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.04405254, + "balance_loss_mlp": 3.93871665, + "epoch": 0.00461716044632551, + "flos": 549218815488.0, + "grad_norm": 1.6356367296774819, + "language_loss": 1.46302319, + "learning_rate": 0.0006293355346737718, + "loss": 1.50707567, + "num_input_tokens_seen": 1841632, + "router_z_loss_mlp": 4.65234375, + "step": 24, + "time_per_iteration": 2.741433620452881 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.03977472, + "balance_loss_mlp": 3.50483179, + "epoch": 0.004809542131589073, + "flos": 568752569088.0, + "grad_norm": 1.079559317914091, + "language_loss": 1.33177948, + "learning_rate": 0.0006374193284416834, + "loss": 1.37155437, + "num_input_tokens_seen": 1920256, + "router_z_loss_mlp": 4.71484375, + "step": 25, + "time_per_iteration": 2.902089834213257 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.03127712, + "balance_loss_mlp": 2.642483, + "epoch": 0.005001923816852636, + "flos": 471584410368.0, + "grad_norm": 0.4847890845471295, + "language_loss": 1.26058078, + "learning_rate": 0.0006451860277489461, + "loss": 1.29185796, + "num_input_tokens_seen": 1986528, + "router_z_loss_mlp": 4.84375, + "step": 26, + "time_per_iteration": 2.6045680046081543 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02733563, + "balance_loss_mlp": 2.23154879, + "epoch": 0.005194305502116198, + "flos": 416381502720.0, + "grad_norm": 0.2845036760864029, + "language_loss": 1.33193052, + "learning_rate": 0.0006526595731190848, + "loss": 1.35926616, + "num_input_tokens_seen": 2048016, + "router_z_loss_mlp": 5.015625, + "step": 27, + "time_per_iteration": 2.4412264823913574 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02759137, + "balance_loss_mlp": 2.2411015, + "epoch": 0.005386687187379761, + "flos": 629996894976.0, + "grad_norm": 0.34713687972437796, + "language_loss": 1.22031224, + "learning_rate": 0.0006598612921618983, + "loss": 1.24790359, + "num_input_tokens_seen": 2127664, + "router_z_loss_mlp": 5.1796875, + "step": 28, + "time_per_iteration": 2.80483078956604 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02575766, + "balance_loss_mlp": 2.05010033, + "epoch": 0.005579068872643324, + "flos": 888021326592.0, + "grad_norm": 0.3062478898066755, + "language_loss": 1.16221631, + "learning_rate": 0.0006668102665011454, + "loss": 1.18797398, + "num_input_tokens_seen": 2213952, + "router_z_loss_mlp": 5.2578125, + "step": 29, + "time_per_iteration": 3.243164300918579 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02507804, + "balance_loss_mlp": 1.97527242, + "epoch": 0.005771450557906887, + "flos": 548658902016.0, + "grad_norm": 0.22276861521731073, + "language_loss": 1.24634933, + "learning_rate": 0.0006735236364718957, + "loss": 1.27142727, + "num_input_tokens_seen": 2284736, + "router_z_loss_mlp": 5.328125, + "step": 30, + "time_per_iteration": 2.7701382637023926 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02465182, + "balance_loss_mlp": 1.93226886, + "epoch": 0.00596383224317045, + "flos": 533069809152.0, + "grad_norm": 0.21102664747409663, + "language_loss": 1.23222375, + "learning_rate": 0.0006800168558381346, + "loss": 1.25687563, + "num_input_tokens_seen": 2354384, + "router_z_loss_mlp": 5.33203125, + "step": 31, + "time_per_iteration": 2.635246515274048 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02445382, + "balance_loss_mlp": 1.91552007, + "epoch": 0.0061562139284340135, + "flos": 590163394560.0, + "grad_norm": 0.21886797396213825, + "language_loss": 1.26610851, + "learning_rate": 0.0006863039060567947, + "loss": 1.29056239, + "num_input_tokens_seen": 2419440, + "router_z_loss_mlp": 5.30078125, + "step": 32, + "time_per_iteration": 2.7791683673858643 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02338603, + "balance_loss_mlp": 1.80950415, + "epoch": 0.006348595613697576, + "flos": 619442154240.0, + "grad_norm": 0.18971916612404452, + "language_loss": 1.17543316, + "learning_rate": 0.0006923974775611263, + "loss": 1.19881916, + "num_input_tokens_seen": 2496368, + "router_z_loss_mlp": 5.29296875, + "step": 33, + "time_per_iteration": 2.836601495742798 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02160521, + "balance_loss_mlp": 1.64134097, + "epoch": 0.006540977298961139, + "flos": 779300109312.0, + "grad_norm": 0.13369632510289112, + "language_loss": 1.13907146, + "learning_rate": 0.0006983091239737814, + "loss": 1.16067672, + "num_input_tokens_seen": 2573280, + "router_z_loss_mlp": 5.19140625, + "step": 34, + "time_per_iteration": 3.021479606628418 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0221033, + "balance_loss_mlp": 1.69649041, + "epoch": 0.006733358984224702, + "flos": 668373264384.0, + "grad_norm": 0.11522706717853448, + "language_loss": 1.11973858, + "learning_rate": 0.0007040493939600222, + "loss": 1.14184177, + "num_input_tokens_seen": 2647248, + "router_z_loss_mlp": 5.13671875, + "step": 35, + "time_per_iteration": 2.9400346279144287 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0227657, + "balance_loss_mlp": 1.76997864, + "epoch": 0.006925740669488265, + "flos": 565496133888.0, + "grad_norm": 0.11143421895921844, + "language_loss": 1.12295914, + "learning_rate": 0.0007096279445021078, + "loss": 1.14572477, + "num_input_tokens_seen": 2720736, + "router_z_loss_mlp": 5.0625, + "step": 36, + "time_per_iteration": 2.698153495788574 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02284885, + "balance_loss_mlp": 1.78668559, + "epoch": 0.007118122354751828, + "flos": 551112405504.0, + "grad_norm": 0.11733654674395574, + "language_loss": 1.1734066, + "learning_rate": 0.0007150536386503726, + "loss": 1.19625545, + "num_input_tokens_seen": 2800336, + "router_z_loss_mlp": 4.9765625, + "step": 37, + "time_per_iteration": 2.8579084873199463 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02268399, + "balance_loss_mlp": 1.77782845, + "epoch": 0.007310504040015391, + "flos": 703814951424.0, + "grad_norm": 0.14208952684155102, + "language_loss": 1.10088778, + "learning_rate": 0.0007203346302358509, + "loss": 1.12357187, + "num_input_tokens_seen": 2883184, + "router_z_loss_mlp": 4.8984375, + "step": 38, + "time_per_iteration": 2.928835391998291 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02220017, + "balance_loss_mlp": 1.73555112, + "epoch": 0.007502885725278953, + "flos": 600501361920.0, + "grad_norm": 0.142042154575746, + "language_loss": 1.15486813, + "learning_rate": 0.000725478437577282, + "loss": 1.17706823, + "num_input_tokens_seen": 2960736, + "router_z_loss_mlp": 4.8359375, + "step": 39, + "time_per_iteration": 2.8706436157226562 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0209897, + "balance_loss_mlp": 1.62251425, + "epoch": 0.007695267410542516, + "flos": 561428018688.0, + "grad_norm": 0.13255726845543458, + "language_loss": 1.10233212, + "learning_rate": 0.0007304920078549186, + "loss": 1.12332189, + "num_input_tokens_seen": 3033472, + "router_z_loss_mlp": 4.75390625, + "step": 40, + "time_per_iteration": 2.6895179748535156 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01939831, + "balance_loss_mlp": 1.46986008, + "epoch": 0.007887649095806078, + "flos": 509231725056.0, + "grad_norm": 0.11166218824526469, + "language_loss": 1.12161303, + "learning_rate": 0.0007353817735343603, + "loss": 1.14101124, + "num_input_tokens_seen": 3107824, + "router_z_loss_mlp": 4.6875, + "step": 41, + "time_per_iteration": 2.709167957305908 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0184399, + "balance_loss_mlp": 1.3778342, + "epoch": 0.008080030781069641, + "flos": 504905040384.0, + "grad_norm": 0.06254207778511488, + "language_loss": 1.07663667, + "learning_rate": 0.0007401537019902344, + "loss": 1.09507656, + "num_input_tokens_seen": 3176528, + "router_z_loss_mlp": 4.6484375, + "step": 42, + "time_per_iteration": 2.5947837829589844 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01789021, + "balance_loss_mlp": 1.32896876, + "epoch": 0.008272412466333205, + "flos": 519106988544.0, + "grad_norm": 0.07012531219711775, + "language_loss": 1.09992051, + "learning_rate": 0.0007448133392900729, + "loss": 1.11781073, + "num_input_tokens_seen": 3254256, + "router_z_loss_mlp": 4.5859375, + "step": 43, + "time_per_iteration": 2.6997878551483154 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01787217, + "balance_loss_mlp": 1.32983518, + "epoch": 0.008464794151596768, + "flos": 609184866816.0, + "grad_norm": 0.09276066699658307, + "language_loss": 1.05755496, + "learning_rate": 0.0007493658489441491, + "loss": 1.07542706, + "num_input_tokens_seen": 3340224, + "router_z_loss_mlp": 4.56640625, + "step": 44, + "time_per_iteration": 2.8852477073669434 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0177156, + "balance_loss_mlp": 1.31913674, + "epoch": 0.00865717583686033, + "flos": 539007214848.0, + "grad_norm": 0.11478380715178954, + "language_loss": 1.09959674, + "learning_rate": 0.0007538160463002316, + "loss": 1.11731243, + "num_input_tokens_seen": 3409216, + "router_z_loss_mlp": 4.53125, + "step": 45, + "time_per_iteration": 2.685568332672119 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01802016, + "balance_loss_mlp": 1.35378933, + "epoch": 0.008849557522123894, + "flos": 509010094080.0, + "grad_norm": 0.14537339285711792, + "language_loss": 1.13533509, + "learning_rate": 0.0007581684291577274, + "loss": 1.15335524, + "num_input_tokens_seen": 3478352, + "router_z_loss_mlp": 4.49609375, + "step": 46, + "time_per_iteration": 2.5798568725585938 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01764716, + "balance_loss_mlp": 1.31915987, + "epoch": 0.009041939207387457, + "flos": 626508135168.0, + "grad_norm": 0.13285081251714825, + "language_loss": 1.15270185, + "learning_rate": 0.0007624272050891776, + "loss": 1.17034888, + "num_input_tokens_seen": 3555616, + "router_z_loss_mlp": 4.46875, + "step": 47, + "time_per_iteration": 2.822632312774658 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0175788, + "balance_loss_mlp": 1.31461263, + "epoch": 0.00923432089265102, + "flos": 550610817792.0, + "grad_norm": 0.11934546954286276, + "language_loss": 1.04916859, + "learning_rate": 0.0007665963158851307, + "loss": 1.06674731, + "num_input_tokens_seen": 3634512, + "router_z_loss_mlp": 4.4453125, + "step": 48, + "time_per_iteration": 2.7924864292144775 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01741735, + "balance_loss_mlp": 1.29846764, + "epoch": 0.009426702577914583, + "flos": 563679333120.0, + "grad_norm": 0.08548395668661983, + "language_loss": 1.13647461, + "learning_rate": 0.0007706794594783609, + "loss": 1.15389204, + "num_input_tokens_seen": 3708480, + "router_z_loss_mlp": 4.4453125, + "step": 49, + "time_per_iteration": 2.734813928604126 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01727457, + "balance_loss_mlp": 1.28838515, + "epoch": 0.009619084263178146, + "flos": 617926697472.0, + "grad_norm": 0.06892583067190382, + "language_loss": 1.12110853, + "learning_rate": 0.0007746801096530423, + "loss": 1.13838315, + "num_input_tokens_seen": 3783472, + "router_z_loss_mlp": 4.40234375, + "step": 50, + "time_per_iteration": 2.7447421550750732 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01719129, + "balance_loss_mlp": 1.28043914, + "epoch": 0.009811465948441709, + "flos": 542489171712.0, + "grad_norm": 0.04778558244894799, + "language_loss": 1.16797209, + "learning_rate": 0.0007786015338021173, + "loss": 1.1851635, + "num_input_tokens_seen": 3851360, + "router_z_loss_mlp": 4.3984375, + "step": 51, + "time_per_iteration": 2.65645694732666 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01722789, + "balance_loss_mlp": 1.28562462, + "epoch": 0.010003847633705272, + "flos": 536977531392.0, + "grad_norm": 0.06217135289779639, + "language_loss": 1.09074998, + "learning_rate": 0.0007824468089603051, + "loss": 1.10797799, + "num_input_tokens_seen": 3923056, + "router_z_loss_mlp": 4.3828125, + "step": 52, + "time_per_iteration": 2.7218713760375977 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01697539, + "balance_loss_mlp": 1.26380801, + "epoch": 0.010196229318968833, + "flos": 910806657792.0, + "grad_norm": 0.04206474108062499, + "language_loss": 1.08130515, + "learning_rate": 0.0007862188363098669, + "loss": 1.09828055, + "num_input_tokens_seen": 4004528, + "router_z_loss_mlp": 4.34765625, + "step": 53, + "time_per_iteration": 3.149973154067993 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01668333, + "balance_loss_mlp": 1.23765349, + "epoch": 0.010388611004232396, + "flos": 586970142720.0, + "grad_norm": 0.050634309517598654, + "language_loss": 1.08688021, + "learning_rate": 0.0007899203543304438, + "loss": 1.10356343, + "num_input_tokens_seen": 4078704, + "router_z_loss_mlp": 4.31640625, + "step": 54, + "time_per_iteration": 2.7033088207244873 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01691162, + "balance_loss_mlp": 1.26315343, + "epoch": 0.01058099268949596, + "flos": 503472208896.0, + "grad_norm": 0.06464656169002964, + "language_loss": 1.22991037, + "learning_rate": 0.0007935539507422731, + "loss": 1.246822, + "num_input_tokens_seen": 4143600, + "router_z_loss_mlp": 4.2890625, + "step": 55, + "time_per_iteration": 2.601745843887329 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.017059, + "balance_loss_mlp": 1.28017938, + "epoch": 0.010773374374759523, + "flos": 545558969088.0, + "grad_norm": 0.06403483907250343, + "language_loss": 1.12561536, + "learning_rate": 0.0007971220733732573, + "loss": 1.14267421, + "num_input_tokens_seen": 4217904, + "router_z_loss_mlp": 4.265625, + "step": 56, + "time_per_iteration": 2.677314281463623 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0169453, + "balance_loss_mlp": 1.27262425, + "epoch": 0.010965756060023086, + "flos": 527286960384.0, + "grad_norm": 0.061369678053330295, + "language_loss": 1.07931721, + "learning_rate": 0.0008006270400641869, + "loss": 1.09626245, + "num_input_tokens_seen": 4293920, + "router_z_loss_mlp": 4.2265625, + "step": 57, + "time_per_iteration": 2.7162468433380127 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01699229, + "balance_loss_mlp": 1.27846837, + "epoch": 0.011158137745286649, + "flos": 578098054656.0, + "grad_norm": 0.06126094216688289, + "language_loss": 1.08923888, + "learning_rate": 0.0008040710477125043, + "loss": 1.10623109, + "num_input_tokens_seen": 4370080, + "router_z_loss_mlp": 4.21484375, + "step": 58, + "time_per_iteration": 2.724116563796997 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01648065, + "balance_loss_mlp": 1.23150039, + "epoch": 0.011350519430550212, + "flos": 530314961664.0, + "grad_norm": 0.059594432794803906, + "language_loss": 1.09501219, + "learning_rate": 0.0008074561805429771, + "loss": 1.11149275, + "num_input_tokens_seen": 4439792, + "router_z_loss_mlp": 4.171875, + "step": 59, + "time_per_iteration": 2.613821268081665 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01628698, + "balance_loss_mlp": 1.21594822, + "epoch": 0.011542901115813775, + "flos": 556971076608.0, + "grad_norm": 0.046387810099464834, + "language_loss": 1.0703913, + "learning_rate": 0.0008107844176832545, + "loss": 1.08667827, + "num_input_tokens_seen": 4510800, + "router_z_loss_mlp": 4.1328125, + "step": 60, + "time_per_iteration": 2.6809566020965576 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01602811, + "balance_loss_mlp": 1.19349384, + "epoch": 0.011735282801077338, + "flos": 573176463360.0, + "grad_norm": 0.036957475185327084, + "language_loss": 1.08104563, + "learning_rate": 0.0008140576401132568, + "loss": 1.09707379, + "num_input_tokens_seen": 4581136, + "router_z_loss_mlp": 4.09765625, + "step": 61, + "time_per_iteration": 2.644085645675659 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01596506, + "balance_loss_mlp": 1.19024038, + "epoch": 0.0119276644863409, + "flos": 616717442304.0, + "grad_norm": 0.034032461682055544, + "language_loss": 1.09685671, + "learning_rate": 0.0008172776370494935, + "loss": 1.11282182, + "num_input_tokens_seen": 4650352, + "router_z_loss_mlp": 4.06640625, + "step": 62, + "time_per_iteration": 2.7589328289031982 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01605764, + "balance_loss_mlp": 1.20255029, + "epoch": 0.012120046171604464, + "flos": 502085064192.0, + "grad_norm": 0.035968497482949544, + "language_loss": 1.17104983, + "learning_rate": 0.0008204461118185703, + "loss": 1.18710756, + "num_input_tokens_seen": 4716336, + "router_z_loss_mlp": 4.03515625, + "step": 63, + "time_per_iteration": 2.594369411468506 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01603311, + "balance_loss_mlp": 1.20353031, + "epoch": 0.012312427856868027, + "flos": 474302319360.0, + "grad_norm": 0.04911792883083492, + "language_loss": 1.06295228, + "learning_rate": 0.0008235646872681536, + "loss": 1.07898545, + "num_input_tokens_seen": 4781648, + "router_z_loss_mlp": 3.99609375, + "step": 64, + "time_per_iteration": 2.5651702880859375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01599528, + "balance_loss_mlp": 1.20279896, + "epoch": 0.012504809542131588, + "flos": 539471864064.0, + "grad_norm": 0.049725750424410776, + "language_loss": 1.06296277, + "learning_rate": 0.0008266349107584288, + "loss": 1.07895803, + "num_input_tokens_seen": 4852320, + "router_z_loss_mlp": 3.95898438, + "step": 65, + "time_per_iteration": 2.6876485347747803 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01596697, + "balance_loss_mlp": 1.20492756, + "epoch": 0.012697191227395151, + "flos": 609857541120.0, + "grad_norm": 0.056540756097456804, + "language_loss": 1.08585978, + "learning_rate": 0.0008296582587724851, + "loss": 1.10182667, + "num_input_tokens_seen": 4922016, + "router_z_loss_mlp": 3.91210938, + "step": 66, + "time_per_iteration": 2.71223783493042 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01587883, + "balance_loss_mlp": 1.19821179, + "epoch": 0.012889572912658714, + "flos": 769398600960.0, + "grad_norm": 0.04465917834699911, + "language_loss": 1.0627861, + "learning_rate": 0.0008326361411800136, + "loss": 1.07866502, + "num_input_tokens_seen": 5000128, + "router_z_loss_mlp": 3.89648438, + "step": 67, + "time_per_iteration": 2.9413115978240967 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01577237, + "balance_loss_mlp": 1.19099891, + "epoch": 0.013081954597922277, + "flos": 535021724928.0, + "grad_norm": 0.05343660826588632, + "language_loss": 1.06744349, + "learning_rate": 0.0008355699051851403, + "loss": 1.08321595, + "num_input_tokens_seen": 5074512, + "router_z_loss_mlp": 3.86132812, + "step": 68, + "time_per_iteration": 2.726212501525879 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0157129, + "balance_loss_mlp": 1.18829489, + "epoch": 0.01327433628318584, + "flos": 574181584128.0, + "grad_norm": 0.041490887209285586, + "language_loss": 1.14052749, + "learning_rate": 0.0008384608389860635, + "loss": 1.15624034, + "num_input_tokens_seen": 5141856, + "router_z_loss_mlp": 3.828125, + "step": 69, + "time_per_iteration": 2.6679208278656006 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0156381, + "balance_loss_mlp": 1.18386579, + "epoch": 0.013466717968449404, + "flos": 498259967232.0, + "grad_norm": 0.03618836919088814, + "language_loss": 1.04182374, + "learning_rate": 0.000841310175171381, + "loss": 1.05746174, + "num_input_tokens_seen": 5209280, + "router_z_loss_mlp": 3.796875, + "step": 70, + "time_per_iteration": 2.6277127265930176 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01563963, + "balance_loss_mlp": 1.18592632, + "epoch": 0.013659099653712967, + "flos": 566622763776.0, + "grad_norm": 0.04320101591589407, + "language_loss": 1.02295327, + "learning_rate": 0.000844119093875517, + "loss": 1.03859293, + "num_input_tokens_seen": 5285424, + "router_z_loss_mlp": 3.77734375, + "step": 71, + "time_per_iteration": 2.7236883640289307 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01558639, + "balance_loss_mlp": 1.18403625, + "epoch": 0.01385148133897653, + "flos": 574943686656.0, + "grad_norm": 0.03416580025853519, + "language_loss": 1.06855714, + "learning_rate": 0.0008468887257134666, + "loss": 1.08414352, + "num_input_tokens_seen": 5358624, + "router_z_loss_mlp": 3.7421875, + "step": 72, + "time_per_iteration": 2.6696412563323975 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01558456, + "balance_loss_mlp": 1.18499684, + "epoch": 0.014043863024240093, + "flos": 577959048960.0, + "grad_norm": 0.037886537215891476, + "language_loss": 1.09368944, + "learning_rate": 0.0008496201545131264, + "loss": 1.10927403, + "num_input_tokens_seen": 5429792, + "router_z_loss_mlp": 3.73046875, + "step": 73, + "time_per_iteration": 2.701594591140747 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01545785, + "balance_loss_mlp": 1.17575896, + "epoch": 0.014236244709503656, + "flos": 940265252352.0, + "grad_norm": 0.04766211184506119, + "language_loss": 1.07240248, + "learning_rate": 0.0008523144198617317, + "loss": 1.08786011, + "num_input_tokens_seen": 5518608, + "router_z_loss_mlp": 3.6953125, + "step": 74, + "time_per_iteration": 3.1882145404815674 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01551426, + "balance_loss_mlp": 1.18387985, + "epoch": 0.014428626394767219, + "flos": 529496478720.0, + "grad_norm": 0.031986864242930464, + "language_loss": 1.06216824, + "learning_rate": 0.0008549725194813783, + "loss": 1.0776825, + "num_input_tokens_seen": 5590576, + "router_z_loss_mlp": 3.66992188, + "step": 75, + "time_per_iteration": 2.666274309158325 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01546662, + "balance_loss_mlp": 1.18102288, + "epoch": 0.014621008080030782, + "flos": 805283549952.0, + "grad_norm": 0.03321604497436844, + "language_loss": 1.05779314, + "learning_rate": 0.0008575954114472099, + "loss": 1.07325983, + "num_input_tokens_seen": 5674224, + "router_z_loss_mlp": 3.65039062, + "step": 76, + "time_per_iteration": 3.1192731857299805 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01547179, + "balance_loss_mlp": 1.18478322, + "epoch": 0.014813389765294343, + "flos": 698357746176.0, + "grad_norm": 0.03477979781895141, + "language_loss": 1.02737951, + "learning_rate": 0.0008601840162606118, + "loss": 1.04285145, + "num_input_tokens_seen": 5757648, + "router_z_loss_mlp": 3.6171875, + "step": 77, + "time_per_iteration": 3.0015783309936523 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01547226, + "balance_loss_mlp": 1.18788171, + "epoch": 0.015005771450557906, + "flos": 598165476864.0, + "grad_norm": 0.032631512960834254, + "language_loss": 1.09477437, + "learning_rate": 0.000862739218788641, + "loss": 1.11024666, + "num_input_tokens_seen": 5837600, + "router_z_loss_mlp": 3.58984375, + "step": 78, + "time_per_iteration": 2.790245771408081 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01536731, + "balance_loss_mlp": 1.18177319, + "epoch": 0.01519815313582147, + "flos": 550493199360.0, + "grad_norm": 0.0308447873241268, + "language_loss": 1.07131243, + "learning_rate": 0.0008652618700799138, + "loss": 1.0866797, + "num_input_tokens_seen": 5907248, + "router_z_loss_mlp": 3.55664062, + "step": 79, + "time_per_iteration": 2.6302430629730225 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01532812, + "balance_loss_mlp": 1.18033433, + "epoch": 0.015390534821085032, + "flos": 431440817664.0, + "grad_norm": 0.04595099678969376, + "language_loss": 1.06556606, + "learning_rate": 0.0008677527890662774, + "loss": 1.08089423, + "num_input_tokens_seen": 5970864, + "router_z_loss_mlp": 3.53125, + "step": 80, + "time_per_iteration": 2.4970459938049316 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01520539, + "balance_loss_mlp": 1.17130363, + "epoch": 0.015582916506348595, + "flos": 525185345280.0, + "grad_norm": 0.030530536654869142, + "language_loss": 1.07461143, + "learning_rate": 0.0008702127641587799, + "loss": 1.08981681, + "num_input_tokens_seen": 6040800, + "router_z_loss_mlp": 3.49804688, + "step": 81, + "time_per_iteration": 2.6258630752563477 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01512144, + "balance_loss_mlp": 1.16500628, + "epoch": 0.015775298191612157, + "flos": 576617591040.0, + "grad_norm": 0.026948447424875538, + "language_loss": 1.02672768, + "learning_rate": 0.0008726425547457192, + "loss": 1.04184914, + "num_input_tokens_seen": 6111840, + "router_z_loss_mlp": 3.4765625, + "step": 82, + "time_per_iteration": 2.7344956398010254 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01517079, + "balance_loss_mlp": 1.17375636, + "epoch": 0.01596767987687572, + "flos": 611440071936.0, + "grad_norm": 0.03479426421062965, + "language_loss": 1.02940345, + "learning_rate": 0.0008750428925998964, + "loss": 1.04457426, + "num_input_tokens_seen": 6183872, + "router_z_loss_mlp": 3.4375, + "step": 83, + "time_per_iteration": 2.738685369491577 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01509349, + "balance_loss_mlp": 1.16850555, + "epoch": 0.016160061562139283, + "flos": 568233484800.0, + "grad_norm": 0.05178756375238081, + "language_loss": 1.08039558, + "learning_rate": 0.0008774144832015932, + "loss": 1.09548914, + "num_input_tokens_seen": 6255760, + "router_z_loss_mlp": 3.41210938, + "step": 84, + "time_per_iteration": 2.6948299407958984 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02575775, + "balance_loss_mlp": 2.26144409, + "epoch": 0.016352443247402846, + "flos": 1414502431488.0, + "grad_norm": 0.37456313977874084, + "language_loss": 0.74774313, + "learning_rate": 0.0008797580069832641, + "loss": 0.7735008, + "num_input_tokens_seen": 6472960, + "router_z_loss_mlp": 3.140625, + "step": 85, + "time_per_iteration": 4.596364974975586 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01517697, + "balance_loss_mlp": 1.17895198, + "epoch": 0.01654482493266641, + "flos": 731786279424.0, + "grad_norm": 0.04138572693056026, + "language_loss": 1.03059626, + "learning_rate": 0.0008820741205014318, + "loss": 1.04577315, + "num_input_tokens_seen": 6548912, + "router_z_loss_mlp": 3.390625, + "step": 86, + "time_per_iteration": 2.901047706604004 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01566516, + "balance_loss_mlp": 1.22757995, + "epoch": 0.016737206617929972, + "flos": 537405242112.0, + "grad_norm": 0.0588613682629828, + "language_loss": 1.04849172, + "learning_rate": 0.0008843634575408404, + "loss": 1.06415701, + "num_input_tokens_seen": 6621520, + "router_z_loss_mlp": 3.39257812, + "step": 87, + "time_per_iteration": 2.6739823818206787 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01583525, + "balance_loss_mlp": 1.24497032, + "epoch": 0.016929588303193535, + "flos": 538130406144.0, + "grad_norm": 0.09131872689500015, + "language_loss": 1.06101418, + "learning_rate": 0.0008866266301555082, + "loss": 1.07684946, + "num_input_tokens_seen": 6698432, + "router_z_loss_mlp": 3.38867188, + "step": 88, + "time_per_iteration": 2.741093635559082 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0156783, + "balance_loss_mlp": 1.23118281, + "epoch": 0.017121969988457098, + "flos": 527792438784.0, + "grad_norm": 0.07103005743700296, + "language_loss": 1.07027078, + "learning_rate": 0.0008888642296509615, + "loss": 1.08594918, + "num_input_tokens_seen": 6764336, + "router_z_loss_mlp": 3.36914062, + "step": 89, + "time_per_iteration": 2.622267007827759 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01554346, + "balance_loss_mlp": 1.2196058, + "epoch": 0.01731435167372066, + "flos": 626768649984.0, + "grad_norm": 0.057543283798364535, + "language_loss": 1.11941445, + "learning_rate": 0.0008910768275115906, + "loss": 1.13495779, + "num_input_tokens_seen": 6839392, + "router_z_loss_mlp": 3.34960938, + "step": 90, + "time_per_iteration": 2.778939962387085 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01536545, + "balance_loss_mlp": 1.20409441, + "epoch": 0.017506733358984224, + "flos": 497385103872.0, + "grad_norm": 0.06951140803051024, + "language_loss": 1.07318401, + "learning_rate": 0.0008932649762767675, + "loss": 1.08854938, + "num_input_tokens_seen": 6907344, + "router_z_loss_mlp": 3.32617188, + "step": 91, + "time_per_iteration": 2.5841660499572754 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01529864, + "balance_loss_mlp": 1.20122755, + "epoch": 0.017699115044247787, + "flos": 747218870016.0, + "grad_norm": 0.037985069994816135, + "language_loss": 1.10022223, + "learning_rate": 0.0008954292103690864, + "loss": 1.11552095, + "num_input_tokens_seen": 6982464, + "router_z_loss_mlp": 3.28710938, + "step": 92, + "time_per_iteration": 2.976200580596924 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01525091, + "balance_loss_mlp": 1.19893408, + "epoch": 0.01789149672951135, + "flos": 516521282304.0, + "grad_norm": 0.05507041657686672, + "language_loss": 1.1172272, + "learning_rate": 0.0008975700468778296, + "loss": 1.13247812, + "num_input_tokens_seen": 7049712, + "router_z_loss_mlp": 3.26171875, + "step": 93, + "time_per_iteration": 2.5778274536132812 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01518338, + "balance_loss_mlp": 1.19427943, + "epoch": 0.018083878414774913, + "flos": 587230657536.0, + "grad_norm": 0.047907590915393955, + "language_loss": 1.05762661, + "learning_rate": 0.0008996879863005366, + "loss": 1.07280993, + "num_input_tokens_seen": 7120288, + "router_z_loss_mlp": 3.24023438, + "step": 94, + "time_per_iteration": 2.6827101707458496 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01506508, + "balance_loss_mlp": 1.18664575, + "epoch": 0.018276260100038477, + "flos": 498370782720.0, + "grad_norm": 0.03950158468897577, + "language_loss": 1.05640411, + "learning_rate": 0.0009017835132453337, + "loss": 1.07146931, + "num_input_tokens_seen": 7188896, + "router_z_loss_mlp": 3.19726562, + "step": 95, + "time_per_iteration": 2.5879104137420654 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01488471, + "balance_loss_mlp": 1.17223215, + "epoch": 0.01846864178530204, + "flos": 641233058304.0, + "grad_norm": 0.042611409633865054, + "language_loss": 1.05607677, + "learning_rate": 0.0009038570970964896, + "loss": 1.07096148, + "num_input_tokens_seen": 7259536, + "router_z_loss_mlp": 3.16015625, + "step": 96, + "time_per_iteration": 2.761634349822998 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01487316, + "balance_loss_mlp": 1.17374837, + "epoch": 0.018661023470565603, + "flos": 512667995136.0, + "grad_norm": 0.026597294022958493, + "language_loss": 1.02809072, + "learning_rate": 0.0009059091926454854, + "loss": 1.04296374, + "num_input_tokens_seen": 7326752, + "router_z_loss_mlp": 3.1328125, + "step": 97, + "time_per_iteration": 2.602036952972412 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01487556, + "balance_loss_mlp": 1.17742097, + "epoch": 0.018853405155829166, + "flos": 932697683712.0, + "grad_norm": 0.04097414840704221, + "language_loss": 1.01764143, + "learning_rate": 0.0009079402406897198, + "loss": 1.03251696, + "num_input_tokens_seen": 7417488, + "router_z_loss_mlp": 3.09765625, + "step": 98, + "time_per_iteration": 3.2514705657958984 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01483888, + "balance_loss_mlp": 1.17642295, + "epoch": 0.01904578684109273, + "flos": 577587718656.0, + "grad_norm": 0.027217181555243938, + "language_loss": 1.03385735, + "learning_rate": 0.0009099506686008212, + "loss": 1.04869628, + "num_input_tokens_seen": 7493136, + "router_z_loss_mlp": 3.0703125, + "step": 99, + "time_per_iteration": 2.7867672443389893 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01473245, + "balance_loss_mlp": 1.16883183, + "epoch": 0.019238168526356292, + "flos": 559521789696.0, + "grad_norm": 0.02943095981266107, + "language_loss": 1.06245995, + "learning_rate": 0.0009119408908644013, + "loss": 1.07719231, + "num_input_tokens_seen": 7560896, + "router_z_loss_mlp": 3.0390625, + "step": 100, + "time_per_iteration": 2.718982219696045 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01466855, + "balance_loss_mlp": 1.164922, + "epoch": 0.019430550211619855, + "flos": 725104267776.0, + "grad_norm": 0.035830377247789626, + "language_loss": 1.12020779, + "learning_rate": 0.0009139113095929519, + "loss": 1.13487625, + "num_input_tokens_seen": 7629040, + "router_z_loss_mlp": 3.01367188, + "step": 101, + "time_per_iteration": 2.9023444652557373 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0146708, + "balance_loss_mlp": 1.16781712, + "epoch": 0.019622931896883418, + "flos": 500456846592.0, + "grad_norm": 0.031534744220975436, + "language_loss": 1.0658195, + "learning_rate": 0.0009158623150134762, + "loss": 1.08049035, + "num_input_tokens_seen": 7694256, + "router_z_loss_mlp": 2.98632812, + "step": 102, + "time_per_iteration": 2.5731325149536133 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01479653, + "balance_loss_mlp": 1.1828692, + "epoch": 0.01981531358214698, + "flos": 510282532608.0, + "grad_norm": 0.0334583858191085, + "language_loss": 1.05968487, + "learning_rate": 0.000917794285931332, + "loss": 1.07448149, + "num_input_tokens_seen": 7762256, + "router_z_loss_mlp": 2.9609375, + "step": 103, + "time_per_iteration": 2.656132221221924 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01477107, + "balance_loss_mlp": 1.18184972, + "epoch": 0.020007695267410544, + "flos": 522393559296.0, + "grad_norm": 0.033386157220771755, + "language_loss": 0.97816026, + "learning_rate": 0.0009197075901716639, + "loss": 0.99293131, + "num_input_tokens_seen": 7834400, + "router_z_loss_mlp": 2.9453125, + "step": 104, + "time_per_iteration": 2.7207133769989014 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01472947, + "balance_loss_mlp": 1.1811223, + "epoch": 0.020200076952674107, + "flos": 534444314880.0, + "grad_norm": 0.03432724584635873, + "language_loss": 1.08410704, + "learning_rate": 0.0009216025849997171, + "loss": 1.09883642, + "num_input_tokens_seen": 7911184, + "router_z_loss_mlp": 2.92382812, + "step": 105, + "time_per_iteration": 2.783440113067627 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01461838, + "balance_loss_mlp": 1.17115784, + "epoch": 0.020392458637937667, + "flos": 686083414272.0, + "grad_norm": 0.04360543496830388, + "language_loss": 1.02907205, + "learning_rate": 0.0009234796175212258, + "loss": 1.04369044, + "num_input_tokens_seen": 7985280, + "router_z_loss_mlp": 2.9140625, + "step": 106, + "time_per_iteration": 2.914760112762451 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01450941, + "balance_loss_mlp": 1.1615957, + "epoch": 0.02058484032320123, + "flos": 703415430912.0, + "grad_norm": 0.03266429542390293, + "language_loss": 1.06572628, + "learning_rate": 0.000925339025064007, + "loss": 1.08023572, + "num_input_tokens_seen": 8068320, + "router_z_loss_mlp": 2.90039062, + "step": 107, + "time_per_iteration": 2.951838254928589 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01453976, + "balance_loss_mlp": 1.16558492, + "epoch": 0.020777222008464793, + "flos": 640328059392.0, + "grad_norm": 0.03192051704400644, + "language_loss": 0.99516582, + "learning_rate": 0.0009271811355418027, + "loss": 1.00970554, + "num_input_tokens_seen": 8148144, + "router_z_loss_mlp": 2.890625, + "step": 108, + "time_per_iteration": 2.897881507873535 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01449031, + "balance_loss_mlp": 1.16159379, + "epoch": 0.020969603693728356, + "flos": 683321763840.0, + "grad_norm": 0.04466737388011785, + "language_loss": 1.06219566, + "learning_rate": 0.0009290062678013548, + "loss": 1.07668602, + "num_input_tokens_seen": 8222256, + "router_z_loss_mlp": 2.88085938, + "step": 109, + "time_per_iteration": 2.8423218727111816 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01430047, + "balance_loss_mlp": 1.14413536, + "epoch": 0.02116198537899192, + "flos": 534420015360.0, + "grad_norm": 0.034258615277409615, + "language_loss": 1.04797208, + "learning_rate": 0.0009308147319536321, + "loss": 1.06227255, + "num_input_tokens_seen": 8292432, + "router_z_loss_mlp": 2.86523438, + "step": 110, + "time_per_iteration": 2.6316323280334473 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01425495, + "balance_loss_mlp": 1.14053667, + "epoch": 0.021354367064255482, + "flos": 718728457728.0, + "grad_norm": 0.048864006828935096, + "language_loss": 1.11352324, + "learning_rate": 0.0009326068296900676, + "loss": 1.12777817, + "num_input_tokens_seen": 8365024, + "router_z_loss_mlp": 2.85546875, + "step": 111, + "time_per_iteration": 2.8313205242156982 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01416698, + "balance_loss_mlp": 1.13269377, + "epoch": 0.021546748749519045, + "flos": 520624390656.0, + "grad_norm": 0.040751650479700946, + "language_loss": 1.01643181, + "learning_rate": 0.0009343828545846161, + "loss": 1.03059864, + "num_input_tokens_seen": 8442448, + "router_z_loss_mlp": 2.84570312, + "step": 112, + "time_per_iteration": 2.7729175090789795 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01401674, + "balance_loss_mlp": 1.11805177, + "epoch": 0.021739130434782608, + "flos": 506161927680.0, + "grad_norm": 0.042106341000359294, + "language_loss": 1.06266427, + "learning_rate": 0.0009361430923823841, + "loss": 1.07668102, + "num_input_tokens_seen": 8508992, + "router_z_loss_mlp": 2.84179688, + "step": 113, + "time_per_iteration": 2.5920841693878174 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01394311, + "balance_loss_mlp": 1.11126053, + "epoch": 0.02193151212004617, + "flos": 464427055872.0, + "grad_norm": 0.07156510336232694, + "language_loss": 1.09574234, + "learning_rate": 0.0009378878212755459, + "loss": 1.10968542, + "num_input_tokens_seen": 8574048, + "router_z_loss_mlp": 2.8359375, + "step": 114, + "time_per_iteration": 2.5213706493377686 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01376382, + "balance_loss_mlp": 1.09371293, + "epoch": 0.022123893805309734, + "flos": 553332617472.0, + "grad_norm": 0.03568103744776456, + "language_loss": 0.9948864, + "learning_rate": 0.0009396173121672103, + "loss": 1.0086503, + "num_input_tokens_seen": 8647808, + "router_z_loss_mlp": 2.83203125, + "step": 115, + "time_per_iteration": 2.654648780822754 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01351182, + "balance_loss_mlp": 1.0677501, + "epoch": 0.022316275490573297, + "flos": 637379771136.0, + "grad_norm": 0.04471438423319615, + "language_loss": 1.05214882, + "learning_rate": 0.0009413318289238633, + "loss": 1.06566072, + "num_input_tokens_seen": 8719760, + "router_z_loss_mlp": 2.83984375, + "step": 116, + "time_per_iteration": 2.7842695713043213 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01311882, + "balance_loss_mlp": 1.0282588, + "epoch": 0.02250865717583686, + "flos": 800316271872.0, + "grad_norm": 0.046340717018109684, + "language_loss": 0.97282118, + "learning_rate": 0.0009430316286169771, + "loss": 0.98593992, + "num_input_tokens_seen": 8798752, + "router_z_loss_mlp": 2.84179688, + "step": 117, + "time_per_iteration": 3.015839099884033 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01377985, + "balance_loss_mlp": 1.09283674, + "epoch": 0.022701038861100423, + "flos": 457063621632.0, + "grad_norm": 0.07808854544893538, + "language_loss": 1.02862036, + "learning_rate": 0.0009447169617543361, + "loss": 1.04240024, + "num_input_tokens_seen": 8866848, + "router_z_loss_mlp": 2.85742188, + "step": 118, + "time_per_iteration": 2.582919120788574 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01371579, + "balance_loss_mlp": 1.08871901, + "epoch": 0.022893420546363986, + "flos": 584187105024.0, + "grad_norm": 0.08661397198668377, + "language_loss": 1.09685123, + "learning_rate": 0.0009463880725016029, + "loss": 1.11056697, + "num_input_tokens_seen": 8935488, + "router_z_loss_mlp": 2.83398438, + "step": 119, + "time_per_iteration": 2.6932969093322754 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01312523, + "balance_loss_mlp": 1.03252411, + "epoch": 0.02308580223162755, + "flos": 562478826240.0, + "grad_norm": 0.04303328442288268, + "language_loss": 1.04977584, + "learning_rate": 0.0009480451988946134, + "loss": 1.06290102, + "num_input_tokens_seen": 9015344, + "router_z_loss_mlp": 2.8046875, + "step": 120, + "time_per_iteration": 2.8070547580718994 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01299284, + "balance_loss_mlp": 1.02252805, + "epoch": 0.023278183916891113, + "flos": 772646287872.0, + "grad_norm": 0.03799067846502037, + "language_loss": 1.05637264, + "learning_rate": 0.0009496885730428627, + "loss": 1.0693655, + "num_input_tokens_seen": 9094672, + "router_z_loss_mlp": 2.77148438, + "step": 121, + "time_per_iteration": 3.014753580093384 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0130842, + "balance_loss_mlp": 1.03376198, + "epoch": 0.023470565602154676, + "flos": 554431057152.0, + "grad_norm": 0.04194740398285866, + "language_loss": 1.04016769, + "learning_rate": 0.0009513184213246156, + "loss": 1.05325174, + "num_input_tokens_seen": 9160608, + "router_z_loss_mlp": 2.75, + "step": 122, + "time_per_iteration": 2.633074998855591 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01316034, + "balance_loss_mlp": 1.04442739, + "epoch": 0.02366294728741824, + "flos": 561167503872.0, + "grad_norm": 0.038872106950025416, + "language_loss": 1.07101583, + "learning_rate": 0.0009529349645740552, + "loss": 1.08417618, + "num_input_tokens_seen": 9228704, + "router_z_loss_mlp": 2.71875, + "step": 123, + "time_per_iteration": 2.6846470832824707 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01320226, + "balance_loss_mlp": 1.05014575, + "epoch": 0.0238553289726818, + "flos": 469517788416.0, + "grad_norm": 0.03403697644067516, + "language_loss": 1.05937934, + "learning_rate": 0.0009545384182608524, + "loss": 1.07258177, + "num_input_tokens_seen": 9294288, + "router_z_loss_mlp": 2.703125, + "step": 124, + "time_per_iteration": 2.5332376956939697 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01326404, + "balance_loss_mlp": 1.05880272, + "epoch": 0.024047710657945365, + "flos": 561104320512.0, + "grad_norm": 0.042208642163400256, + "language_loss": 1.03444421, + "learning_rate": 0.0009561289926625252, + "loss": 1.04770815, + "num_input_tokens_seen": 9368048, + "router_z_loss_mlp": 2.67773438, + "step": 125, + "time_per_iteration": 2.68180251121521 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01324487, + "balance_loss_mlp": 1.05841172, + "epoch": 0.024240092343208928, + "flos": 505771155456.0, + "grad_norm": 0.03944680997458598, + "language_loss": 1.08491933, + "learning_rate": 0.0009577068930299292, + "loss": 1.0981642, + "num_input_tokens_seen": 9434848, + "router_z_loss_mlp": 2.66210938, + "step": 126, + "time_per_iteration": 2.602088689804077 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01323529, + "balance_loss_mlp": 1.05936122, + "epoch": 0.02443247402847249, + "flos": 436753181184.0, + "grad_norm": 0.04017271590188075, + "language_loss": 1.04077768, + "learning_rate": 0.0009592723197462087, + "loss": 1.05401289, + "num_input_tokens_seen": 9504112, + "router_z_loss_mlp": 2.64257812, + "step": 127, + "time_per_iteration": 2.643617630004883 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01318089, + "balance_loss_mlp": 1.05563784, + "epoch": 0.024624855713736054, + "flos": 685069545216.0, + "grad_norm": 0.03549644551725154, + "language_loss": 1.0056293, + "learning_rate": 0.0009608254684795125, + "loss": 1.01881027, + "num_input_tokens_seen": 9590032, + "router_z_loss_mlp": 2.625, + "step": 128, + "time_per_iteration": 2.949061632156372 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01309242, + "balance_loss_mlp": 1.04831672, + "epoch": 0.024817237398999614, + "flos": 526114643712.0, + "grad_norm": 0.03183934804306691, + "language_loss": 1.03377914, + "learning_rate": 0.0009623665303297678, + "loss": 1.04687166, + "num_input_tokens_seen": 9663040, + "router_z_loss_mlp": 2.609375, + "step": 129, + "time_per_iteration": 2.7315783500671387 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0130104, + "balance_loss_mlp": 1.04106867, + "epoch": 0.025009619084263177, + "flos": 656887279872.0, + "grad_norm": 0.038944166016075116, + "language_loss": 1.07603359, + "learning_rate": 0.0009638956919697878, + "loss": 1.08904397, + "num_input_tokens_seen": 9736544, + "router_z_loss_mlp": 2.59960938, + "step": 130, + "time_per_iteration": 2.9588887691497803 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01293161, + "balance_loss_mlp": 1.03395224, + "epoch": 0.02520200076952674, + "flos": 455370275328.0, + "grad_norm": 0.03345888261117193, + "language_loss": 0.99743778, + "learning_rate": 0.0009654131357809714, + "loss": 1.0103693, + "num_input_tokens_seen": 9804656, + "router_z_loss_mlp": 2.59179688, + "step": 131, + "time_per_iteration": 2.5802786350250244 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01296775, + "balance_loss_mlp": 1.03966463, + "epoch": 0.025394382454790303, + "flos": 841269599232.0, + "grad_norm": 0.04496153180844387, + "language_loss": 1.08517051, + "learning_rate": 0.0009669190399838441, + "loss": 1.09813821, + "num_input_tokens_seen": 9888864, + "router_z_loss_mlp": 2.5703125, + "step": 132, + "time_per_iteration": 3.1034374237060547 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01297684, + "balance_loss_mlp": 1.04190826, + "epoch": 0.025586764140053866, + "flos": 582229353216.0, + "grad_norm": 0.044253016077327914, + "language_loss": 1.0183959, + "learning_rate": 0.0009684135787636724, + "loss": 1.03137255, + "num_input_tokens_seen": 9968208, + "router_z_loss_mlp": 2.55664062, + "step": 133, + "time_per_iteration": 2.8056888580322266 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01284726, + "balance_loss_mlp": 1.03066742, + "epoch": 0.02577914582531743, + "flos": 791678453760.0, + "grad_norm": 0.04023348500073193, + "language_loss": 1.06134284, + "learning_rate": 0.0009698969223913726, + "loss": 1.07419014, + "num_input_tokens_seen": 10049664, + "router_z_loss_mlp": 2.5390625, + "step": 134, + "time_per_iteration": 3.0520598888397217 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01279327, + "balance_loss_mlp": 1.02717578, + "epoch": 0.025971527510580992, + "flos": 596063861760.0, + "grad_norm": 0.02965492003563146, + "language_loss": 1.08660483, + "learning_rate": 0.0009713692373399265, + "loss": 1.09939814, + "num_input_tokens_seen": 10120096, + "router_z_loss_mlp": 2.51953125, + "step": 135, + "time_per_iteration": 2.679379463195801 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01931427, + "balance_loss_mlp": 1.66744995, + "epoch": 0.026163909195844555, + "flos": 1581077391360.0, + "grad_norm": 0.18396358569787127, + "language_loss": 0.79456228, + "learning_rate": 0.0009728306863964993, + "loss": 0.81387651, + "num_input_tokens_seen": 10348976, + "router_z_loss_mlp": 2.640625, + "step": 136, + "time_per_iteration": 5.69318151473999 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01580238, + "balance_loss_mlp": 1.32083893, + "epoch": 0.026356290881108118, + "flos": 1505163555840.0, + "grad_norm": 0.11058621392355464, + "language_loss": 0.77811038, + "learning_rate": 0.0009742814287704512, + "loss": 0.79391277, + "num_input_tokens_seen": 10576512, + "router_z_loss_mlp": 2.59375, + "step": 137, + "time_per_iteration": 4.930646896362305 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01336039, + "balance_loss_mlp": 1.08846498, + "epoch": 0.02654867256637168, + "flos": 598341421056.0, + "grad_norm": 0.05793494017899448, + "language_loss": 1.01254559, + "learning_rate": 0.0009757216201974225, + "loss": 1.02590609, + "num_input_tokens_seen": 10659168, + "router_z_loss_mlp": 2.47265625, + "step": 138, + "time_per_iteration": 2.8532111644744873 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01376264, + "balance_loss_mlp": 1.13059723, + "epoch": 0.026741054251635244, + "flos": 546136379136.0, + "grad_norm": 0.07027637242601113, + "language_loss": 1.06507492, + "learning_rate": 0.0009771514130396581, + "loss": 1.07883763, + "num_input_tokens_seen": 10731584, + "router_z_loss_mlp": 2.453125, + "step": 139, + "time_per_iteration": 2.742065668106079 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01373402, + "balance_loss_mlp": 1.12792611, + "epoch": 0.026933435936898807, + "flos": 507846525696.0, + "grad_norm": 0.06681977417406691, + "language_loss": 1.06790614, + "learning_rate": 0.00097857095638274, + "loss": 1.08164012, + "num_input_tokens_seen": 10799456, + "router_z_loss_mlp": 2.45117188, + "step": 140, + "time_per_iteration": 2.689812660217285 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01350241, + "balance_loss_mlp": 1.10533786, + "epoch": 0.02712581762216237, + "flos": 742254504192.0, + "grad_norm": 0.04346752833457442, + "language_loss": 0.97943556, + "learning_rate": 0.0009799803961288726, + "loss": 0.99293798, + "num_input_tokens_seen": 10886416, + "router_z_loss_mlp": 2.4453125, + "step": 141, + "time_per_iteration": 3.064852714538574 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01340988, + "balance_loss_mlp": 1.09684777, + "epoch": 0.027318199307425933, + "flos": 849779105280.0, + "grad_norm": 0.04419232462487818, + "language_loss": 1.04253626, + "learning_rate": 0.000981379875086876, + "loss": 1.05594611, + "num_input_tokens_seen": 10966064, + "router_z_loss_mlp": 2.4375, + "step": 142, + "time_per_iteration": 3.049978494644165 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01342845, + "balance_loss_mlp": 1.09870481, + "epoch": 0.027510580992689496, + "flos": 576638978304.0, + "grad_norm": 0.03936283820829166, + "language_loss": 0.99339008, + "learning_rate": 0.0009827695330590185, + "loss": 1.00681853, + "num_input_tokens_seen": 11039712, + "router_z_loss_mlp": 2.4375, + "step": 143, + "time_per_iteration": 2.677050828933716 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01360296, + "balance_loss_mlp": 1.11729932, + "epoch": 0.02770296267795306, + "flos": 773790414336.0, + "grad_norm": 0.036415015399305896, + "language_loss": 0.98794824, + "learning_rate": 0.0009841495069248256, + "loss": 1.00155115, + "num_input_tokens_seen": 11123984, + "router_z_loss_mlp": 2.42578125, + "step": 144, + "time_per_iteration": 2.9983932971954346 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01369915, + "balance_loss_mlp": 1.12768197, + "epoch": 0.027895344363216622, + "flos": 570449806080.0, + "grad_norm": 0.04357781303470995, + "language_loss": 0.98341697, + "learning_rate": 0.0009855199307219871, + "loss": 0.99711609, + "num_input_tokens_seen": 11192864, + "router_z_loss_mlp": 2.41796875, + "step": 145, + "time_per_iteration": 2.6622605323791504 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0136275, + "balance_loss_mlp": 1.12261522, + "epoch": 0.028087726048480186, + "flos": 548409080832.0, + "grad_norm": 0.032618269384273584, + "language_loss": 1.00131154, + "learning_rate": 0.0009868809357244854, + "loss": 1.01493907, + "num_input_tokens_seen": 11261760, + "router_z_loss_mlp": 2.39648438, + "step": 146, + "time_per_iteration": 2.7002813816070557 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01347166, + "balance_loss_mlp": 1.10836601, + "epoch": 0.02828010773374375, + "flos": 525873570816.0, + "grad_norm": 0.032542426789695725, + "language_loss": 1.04416764, + "learning_rate": 0.0009882326505180556, + "loss": 1.05763924, + "num_input_tokens_seen": 11334736, + "router_z_loss_mlp": 2.3828125, + "step": 147, + "time_per_iteration": 2.710149049758911 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01334853, + "balance_loss_mlp": 1.09815085, + "epoch": 0.02847248941900731, + "flos": 773772917760.0, + "grad_norm": 0.045451062042893155, + "language_loss": 1.02790403, + "learning_rate": 0.0009895752010730906, + "loss": 1.04125249, + "num_input_tokens_seen": 11409872, + "router_z_loss_mlp": 2.36132812, + "step": 148, + "time_per_iteration": 2.965888261795044 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01328294, + "balance_loss_mlp": 1.0936898, + "epoch": 0.028664871104270875, + "flos": 535470822912.0, + "grad_norm": 0.03549847888949514, + "language_loss": 1.08720016, + "learning_rate": 0.0009909087108150867, + "loss": 1.10048318, + "num_input_tokens_seen": 11481024, + "router_z_loss_mlp": 2.33984375, + "step": 149, + "time_per_iteration": 2.759585380554199 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01328431, + "balance_loss_mlp": 1.09649718, + "epoch": 0.028857252789534438, + "flos": 368605212672.0, + "grad_norm": 0.04584721914032896, + "language_loss": 1.09262538, + "learning_rate": 0.0009922333006927371, + "loss": 1.10590982, + "num_input_tokens_seen": 11544240, + "router_z_loss_mlp": 2.3125, + "step": 150, + "time_per_iteration": 2.5677716732025146 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0132956, + "balance_loss_mlp": 1.09896171, + "epoch": 0.029049634474798, + "flos": 516484343808.0, + "grad_norm": 0.054837011337671125, + "language_loss": 1.02855873, + "learning_rate": 0.0009935490892437632, + "loss": 1.04185438, + "num_input_tokens_seen": 11610416, + "router_z_loss_mlp": 2.29882812, + "step": 151, + "time_per_iteration": 2.5842795372009277 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01323589, + "balance_loss_mlp": 1.09623301, + "epoch": 0.029242016160061564, + "flos": 589349769216.0, + "grad_norm": 0.041624099188269474, + "language_loss": 1.01284385, + "learning_rate": 0.0009948561926585687, + "loss": 1.02607965, + "num_input_tokens_seen": 11687488, + "router_z_loss_mlp": 2.2734375, + "step": 152, + "time_per_iteration": 2.7717602252960205 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01309484, + "balance_loss_mlp": 1.08422625, + "epoch": 0.029434397845325123, + "flos": 553137231360.0, + "grad_norm": 0.04242067063834005, + "language_loss": 1.0541966, + "learning_rate": 0.0009961547248418122, + "loss": 1.0672915, + "num_input_tokens_seen": 11754576, + "router_z_loss_mlp": 2.25976562, + "step": 153, + "time_per_iteration": 2.6492583751678467 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01303287, + "balance_loss_mlp": 1.07898307, + "epoch": 0.029626779530588686, + "flos": 604608360960.0, + "grad_norm": 0.03242941124289258, + "language_loss": 1.02145946, + "learning_rate": 0.0009974447974719707, + "loss": 1.03449237, + "num_input_tokens_seen": 11831360, + "router_z_loss_mlp": 2.25, + "step": 154, + "time_per_iteration": 2.7111871242523193 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01303637, + "balance_loss_mlp": 1.08181214, + "epoch": 0.02981916121585225, + "flos": 622218388992.0, + "grad_norm": 0.03743420896054, + "language_loss": 1.03581393, + "learning_rate": 0.0009987265200589763, + "loss": 1.0488503, + "num_input_tokens_seen": 11902192, + "router_z_loss_mlp": 2.22460938, + "step": 155, + "time_per_iteration": 2.7590832710266113 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01281243, + "balance_loss_mlp": 1.06151628, + "epoch": 0.030011542901115813, + "flos": 662881065984.0, + "grad_norm": 0.03665146617631418, + "language_loss": 1.03448439, + "learning_rate": 0.001, + "loss": 1.04729688, + "num_input_tokens_seen": 11979088, + "router_z_loss_mlp": 2.203125, + "step": 156, + "time_per_iteration": 2.868732452392578 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01262046, + "balance_loss_mlp": 1.04441714, + "epoch": 0.030203924586379376, + "flos": 652819164672.0, + "grad_norm": 0.048414208125286275, + "language_loss": 1.0101347, + "learning_rate": 0.0009999999029413921, + "loss": 1.02275515, + "num_input_tokens_seen": 12059200, + "router_z_loss_mlp": 2.18164062, + "step": 157, + "time_per_iteration": 2.8458704948425293 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01249467, + "balance_loss_mlp": 1.03393674, + "epoch": 0.03039630627164294, + "flos": 532444766976.0, + "grad_norm": 0.038165698108555156, + "language_loss": 1.02398324, + "learning_rate": 0.0009999996117656068, + "loss": 1.03647804, + "num_input_tokens_seen": 12134944, + "router_z_loss_mlp": 2.16015625, + "step": 158, + "time_per_iteration": 2.7255747318267822 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01250196, + "balance_loss_mlp": 1.03657281, + "epoch": 0.030588687956906502, + "flos": 587295786240.0, + "grad_norm": 0.04636715302465643, + "language_loss": 0.95869231, + "learning_rate": 0.0009999991264727564, + "loss": 0.97119427, + "num_input_tokens_seen": 12207936, + "router_z_loss_mlp": 2.140625, + "step": 159, + "time_per_iteration": 2.7805936336517334 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0126418, + "balance_loss_mlp": 1.05284619, + "epoch": 0.030781069642170065, + "flos": 514287464448.0, + "grad_norm": 0.055354258548617474, + "language_loss": 1.07316554, + "learning_rate": 0.0009999984470630296, + "loss": 1.08580732, + "num_input_tokens_seen": 12273200, + "router_z_loss_mlp": 2.1171875, + "step": 160, + "time_per_iteration": 2.6011087894439697 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01284107, + "balance_loss_mlp": 1.07372677, + "epoch": 0.030973451327433628, + "flos": 719560546560.0, + "grad_norm": 0.03499871632601644, + "language_loss": 0.95530587, + "learning_rate": 0.0009999975735366902, + "loss": 0.96814692, + "num_input_tokens_seen": 12359600, + "router_z_loss_mlp": 2.10742188, + "step": 161, + "time_per_iteration": 3.083415985107422 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01283442, + "balance_loss_mlp": 1.07439709, + "epoch": 0.03116583301269719, + "flos": 1111615994880.0, + "grad_norm": 0.03722431710536786, + "language_loss": 0.96960843, + "learning_rate": 0.0009999965058940775, + "loss": 0.9824428, + "num_input_tokens_seen": 12443936, + "router_z_loss_mlp": 2.09375, + "step": 162, + "time_per_iteration": 3.5389657020568848 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01264072, + "balance_loss_mlp": 1.05655301, + "epoch": 0.031358214697960754, + "flos": 451833883392.0, + "grad_norm": 0.04231417263227255, + "language_loss": 1.04135799, + "learning_rate": 0.0009999952441356057, + "loss": 1.05399871, + "num_input_tokens_seen": 12507488, + "router_z_loss_mlp": 2.078125, + "step": 163, + "time_per_iteration": 2.5445146560668945 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01239952, + "balance_loss_mlp": 1.03357697, + "epoch": 0.031550596383224314, + "flos": 1257087309312.0, + "grad_norm": 0.03293922474511325, + "language_loss": 1.04807603, + "learning_rate": 0.000999993788261765, + "loss": 1.06047547, + "num_input_tokens_seen": 12594096, + "router_z_loss_mlp": 2.06640625, + "step": 164, + "time_per_iteration": 3.603273391723633 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01233685, + "balance_loss_mlp": 1.02769136, + "epoch": 0.03174297806848788, + "flos": 669323950080.0, + "grad_norm": 0.03785089383184646, + "language_loss": 1.05591631, + "learning_rate": 0.00099999213827312, + "loss": 1.06825328, + "num_input_tokens_seen": 12669424, + "router_z_loss_mlp": 2.0625, + "step": 165, + "time_per_iteration": 2.822242498397827 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01237294, + "balance_loss_mlp": 1.03206336, + "epoch": 0.03193535975375144, + "flos": 552364435200.0, + "grad_norm": 0.03413051380570177, + "language_loss": 1.00392842, + "learning_rate": 0.000999990294170312, + "loss": 1.01630139, + "num_input_tokens_seen": 12740080, + "router_z_loss_mlp": 2.0546875, + "step": 166, + "time_per_iteration": 2.6473989486694336 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0124218, + "balance_loss_mlp": 1.03790259, + "epoch": 0.032127741439015006, + "flos": 544740486144.0, + "grad_norm": 0.02951320831702663, + "language_loss": 1.04371905, + "learning_rate": 0.0009999882559540566, + "loss": 1.0561409, + "num_input_tokens_seen": 12810576, + "router_z_loss_mlp": 2.04492188, + "step": 167, + "time_per_iteration": 2.654994487762451 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01249753, + "balance_loss_mlp": 1.04661989, + "epoch": 0.032320123124278566, + "flos": 549514323456.0, + "grad_norm": 0.03217165834370848, + "language_loss": 1.01348543, + "learning_rate": 0.000999986023625145, + "loss": 1.02598298, + "num_input_tokens_seen": 12887904, + "router_z_loss_mlp": 2.03320312, + "step": 168, + "time_per_iteration": 2.759324550628662 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01736656, + "balance_loss_mlp": 1.53829193, + "epoch": 0.03251250480954213, + "flos": 1308817963776.0, + "grad_norm": 0.15145695156494207, + "language_loss": 0.78924417, + "learning_rate": 0.0009999835971844441, + "loss": 0.8066107, + "num_input_tokens_seen": 13107344, + "router_z_loss_mlp": 1.9765625, + "step": 169, + "time_per_iteration": 4.9954283237457275 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0125768, + "balance_loss_mlp": 1.05588245, + "epoch": 0.03270488649480569, + "flos": 562202760192.0, + "grad_norm": 0.04037677915440104, + "language_loss": 1.01481748, + "learning_rate": 0.0009999809766328958, + "loss": 1.02739429, + "num_input_tokens_seen": 13175552, + "router_z_loss_mlp": 2.01953125, + "step": 170, + "time_per_iteration": 2.6656970977783203 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01250876, + "balance_loss_mlp": 1.0494597, + "epoch": 0.03289726818006926, + "flos": 483339657984.0, + "grad_norm": 0.04232720535630845, + "language_loss": 1.03883123, + "learning_rate": 0.0009999781619715177, + "loss": 1.0513401, + "num_input_tokens_seen": 13242384, + "router_z_loss_mlp": 2.015625, + "step": 171, + "time_per_iteration": 2.5408902168273926 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01238141, + "balance_loss_mlp": 1.03786898, + "epoch": 0.03308964986533282, + "flos": 675821269248.0, + "grad_norm": 0.04278552863969592, + "language_loss": 1.04043615, + "learning_rate": 0.000999975153201402, + "loss": 1.05281758, + "num_input_tokens_seen": 13316160, + "router_z_loss_mlp": 2.00390625, + "step": 172, + "time_per_iteration": 2.85229754447937 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01233367, + "balance_loss_mlp": 1.03385854, + "epoch": 0.033282031550596385, + "flos": 610341632256.0, + "grad_norm": 0.04144744195910536, + "language_loss": 1.01965618, + "learning_rate": 0.0009999719503237174, + "loss": 1.03198993, + "num_input_tokens_seen": 13387664, + "router_z_loss_mlp": 1.9921875, + "step": 173, + "time_per_iteration": 2.7612979412078857 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01234993, + "balance_loss_mlp": 1.03739214, + "epoch": 0.033474413235859944, + "flos": 468996758784.0, + "grad_norm": 0.06741318195929925, + "language_loss": 1.10547054, + "learning_rate": 0.0009999685533397073, + "loss": 1.1178205, + "num_input_tokens_seen": 13454528, + "router_z_loss_mlp": 1.97265625, + "step": 174, + "time_per_iteration": 2.5750949382781982 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01246896, + "balance_loss_mlp": 1.05101097, + "epoch": 0.03366679492112351, + "flos": 580715841792.0, + "grad_norm": 0.0354258140398677, + "language_loss": 1.02665091, + "learning_rate": 0.00099996496225069, + "loss": 1.03911996, + "num_input_tokens_seen": 13522528, + "router_z_loss_mlp": 1.95605469, + "step": 175, + "time_per_iteration": 2.6886191368103027 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0124614, + "balance_loss_mlp": 1.05168545, + "epoch": 0.03385917660638707, + "flos": 638886479616.0, + "grad_norm": 0.036851717024697625, + "language_loss": 1.04551578, + "learning_rate": 0.0009999611770580604, + "loss": 1.0579772, + "num_input_tokens_seen": 13601120, + "router_z_loss_mlp": 1.94433594, + "step": 176, + "time_per_iteration": 2.8528547286987305 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01227252, + "balance_loss_mlp": 1.03422809, + "epoch": 0.03405155829165064, + "flos": 442740164352.0, + "grad_norm": 0.05003520598604069, + "language_loss": 1.03819132, + "learning_rate": 0.0009999571977632876, + "loss": 1.0504638, + "num_input_tokens_seen": 13666384, + "router_z_loss_mlp": 1.9296875, + "step": 177, + "time_per_iteration": 2.6220269203186035 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01224145, + "balance_loss_mlp": 1.03188384, + "epoch": 0.034243939976914196, + "flos": 467275222272.0, + "grad_norm": 0.0554689754659714, + "language_loss": 1.0658946, + "learning_rate": 0.0009999530243679166, + "loss": 1.07813609, + "num_input_tokens_seen": 13733968, + "router_z_loss_mlp": 1.921875, + "step": 178, + "time_per_iteration": 2.5593671798706055 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01235914, + "balance_loss_mlp": 1.04479802, + "epoch": 0.03443632166217776, + "flos": 780713498880.0, + "grad_norm": 0.03675993055709111, + "language_loss": 1.01102996, + "learning_rate": 0.0009999486568735675, + "loss": 1.02338898, + "num_input_tokens_seen": 13818960, + "router_z_loss_mlp": 1.91015625, + "step": 179, + "time_per_iteration": 3.083312749862671 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01235549, + "balance_loss_mlp": 1.04548192, + "epoch": 0.03462870334744132, + "flos": 1265760120576.0, + "grad_norm": 0.04656515886260978, + "language_loss": 1.01660061, + "learning_rate": 0.0009999440952819362, + "loss": 1.02895617, + "num_input_tokens_seen": 13912448, + "router_z_loss_mlp": 1.89941406, + "step": 180, + "time_per_iteration": 3.691354513168335 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01231777, + "balance_loss_mlp": 1.04390287, + "epoch": 0.03482108503270489, + "flos": 608303200512.0, + "grad_norm": 0.04339398829325753, + "language_loss": 1.02140999, + "learning_rate": 0.0009999393395947935, + "loss": 1.03372765, + "num_input_tokens_seen": 13990752, + "router_z_loss_mlp": 1.87695312, + "step": 181, + "time_per_iteration": 2.8826780319213867 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01222143, + "balance_loss_mlp": 1.03617644, + "epoch": 0.03501346671796845, + "flos": 539315361792.0, + "grad_norm": 0.033650569268787865, + "language_loss": 1.05363226, + "learning_rate": 0.0009999343898139858, + "loss": 1.06585371, + "num_input_tokens_seen": 14058608, + "router_z_loss_mlp": 1.85742188, + "step": 182, + "time_per_iteration": 2.6785037517547607 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01217643, + "balance_loss_mlp": 1.03329813, + "epoch": 0.035205848403232015, + "flos": 519499706112.0, + "grad_norm": 0.04889617812287003, + "language_loss": 1.03914642, + "learning_rate": 0.0009999292459414348, + "loss": 1.05132294, + "num_input_tokens_seen": 14126656, + "router_z_loss_mlp": 1.84082031, + "step": 183, + "time_per_iteration": 2.648263931274414 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01223183, + "balance_loss_mlp": 1.04103076, + "epoch": 0.035398230088495575, + "flos": 473334137088.0, + "grad_norm": 0.03546540132303448, + "language_loss": 1.08284354, + "learning_rate": 0.0009999239079791374, + "loss": 1.09507537, + "num_input_tokens_seen": 14195840, + "router_z_loss_mlp": 1.81835938, + "step": 184, + "time_per_iteration": 2.6003947257995605 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01229231, + "balance_loss_mlp": 1.04908144, + "epoch": 0.03559061177375914, + "flos": 513095705856.0, + "grad_norm": 0.03580873522044792, + "language_loss": 1.00877666, + "learning_rate": 0.0009999183759291659, + "loss": 1.02106905, + "num_input_tokens_seen": 14269936, + "router_z_loss_mlp": 1.79785156, + "step": 185, + "time_per_iteration": 2.7518959045410156 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01229953, + "balance_loss_mlp": 1.05161583, + "epoch": 0.0357829934590227, + "flos": 478350992640.0, + "grad_norm": 0.05401643684385997, + "language_loss": 1.03586912, + "learning_rate": 0.0009999126497936682, + "loss": 1.04816866, + "num_input_tokens_seen": 14334848, + "router_z_loss_mlp": 1.78710938, + "step": 186, + "time_per_iteration": 2.565373659133911 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01218003, + "balance_loss_mlp": 1.04052448, + "epoch": 0.03597537514428627, + "flos": 645885386496.0, + "grad_norm": 0.027605248849540943, + "language_loss": 1.06344712, + "learning_rate": 0.0009999067295748676, + "loss": 1.07562721, + "num_input_tokens_seen": 14407888, + "router_z_loss_mlp": 1.77832031, + "step": 187, + "time_per_iteration": 2.862023115158081 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01208675, + "balance_loss_mlp": 1.03167319, + "epoch": 0.03616775682954983, + "flos": 582270182400.0, + "grad_norm": 0.041753828035088196, + "language_loss": 1.04174721, + "learning_rate": 0.000999900615275062, + "loss": 1.05383396, + "num_input_tokens_seen": 14479072, + "router_z_loss_mlp": 1.7734375, + "step": 188, + "time_per_iteration": 2.7248780727386475 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01206757, + "balance_loss_mlp": 1.02994609, + "epoch": 0.03636013851481339, + "flos": 383265007104.0, + "grad_norm": 0.05119808239604003, + "language_loss": 1.10189009, + "learning_rate": 0.0009998943068966256, + "loss": 1.11395764, + "num_input_tokens_seen": 14540944, + "router_z_loss_mlp": 1.77148438, + "step": 189, + "time_per_iteration": 2.487445592880249 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01216253, + "balance_loss_mlp": 1.04010975, + "epoch": 0.03655252020007695, + "flos": 584308614144.0, + "grad_norm": 0.029643950017142998, + "language_loss": 1.04644084, + "learning_rate": 0.0009998878044420072, + "loss": 1.05860329, + "num_input_tokens_seen": 14611392, + "router_z_loss_mlp": 1.76464844, + "step": 190, + "time_per_iteration": 2.736809015274048 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.012186, + "balance_loss_mlp": 1.04321897, + "epoch": 0.03674490188534051, + "flos": 472598279424.0, + "grad_norm": 0.03987592529636011, + "language_loss": 1.00565469, + "learning_rate": 0.0009998811079137318, + "loss": 1.01784062, + "num_input_tokens_seen": 14679776, + "router_z_loss_mlp": 1.75683594, + "step": 191, + "time_per_iteration": 2.6006946563720703 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01214791, + "balance_loss_mlp": 1.04017353, + "epoch": 0.03693728357060408, + "flos": 529411908096.0, + "grad_norm": 0.03601320862003297, + "language_loss": 1.01597381, + "learning_rate": 0.0009998742173143987, + "loss": 1.02812171, + "num_input_tokens_seen": 14749712, + "router_z_loss_mlp": 1.74902344, + "step": 192, + "time_per_iteration": 2.6246893405914307 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01200861, + "balance_loss_mlp": 1.02719736, + "epoch": 0.03712966525586764, + "flos": 800346407424.0, + "grad_norm": 0.02962706666311765, + "language_loss": 1.0204885, + "learning_rate": 0.0009998671326466833, + "loss": 1.03249693, + "num_input_tokens_seen": 14827136, + "router_z_loss_mlp": 1.73925781, + "step": 193, + "time_per_iteration": 2.9852418899536133 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01194118, + "balance_loss_mlp": 1.02121651, + "epoch": 0.037322046941131205, + "flos": 831359342592.0, + "grad_norm": 0.049736474928026, + "language_loss": 1.0340569, + "learning_rate": 0.0009998598539133362, + "loss": 1.04599798, + "num_input_tokens_seen": 14902880, + "router_z_loss_mlp": 1.73144531, + "step": 194, + "time_per_iteration": 3.0510568618774414 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01194861, + "balance_loss_mlp": 1.02339077, + "epoch": 0.037514428626394765, + "flos": 438589423872.0, + "grad_norm": 0.030819097200883293, + "language_loss": 1.03682184, + "learning_rate": 0.0009998523811171828, + "loss": 1.04877055, + "num_input_tokens_seen": 14967264, + "router_z_loss_mlp": 1.71679688, + "step": 195, + "time_per_iteration": 2.5203936100006104 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01197718, + "balance_loss_mlp": 1.0269146, + "epoch": 0.03770681031165833, + "flos": 512639804928.0, + "grad_norm": 0.031890398221933944, + "language_loss": 1.04342675, + "learning_rate": 0.0009998447142611248, + "loss": 1.05540395, + "num_input_tokens_seen": 15039104, + "router_z_loss_mlp": 1.70996094, + "step": 196, + "time_per_iteration": 2.659193754196167 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01193737, + "balance_loss_mlp": 1.02341044, + "epoch": 0.03789919199692189, + "flos": 808843274496.0, + "grad_norm": 0.030368823498634023, + "language_loss": 0.97672093, + "learning_rate": 0.0009998368533481387, + "loss": 0.98865831, + "num_input_tokens_seen": 15124864, + "router_z_loss_mlp": 1.70507812, + "step": 197, + "time_per_iteration": 3.031437397003174 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01185957, + "balance_loss_mlp": 1.01677489, + "epoch": 0.03809157368218546, + "flos": 691792386048.0, + "grad_norm": 0.027429804092446938, + "language_loss": 1.00742936, + "learning_rate": 0.0009998287983812762, + "loss": 1.01928902, + "num_input_tokens_seen": 15199680, + "router_z_loss_mlp": 1.69335938, + "step": 198, + "time_per_iteration": 2.8533172607421875 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01186443, + "balance_loss_mlp": 1.01764262, + "epoch": 0.03828395536744902, + "flos": 519004921344.0, + "grad_norm": 0.029672573654994608, + "language_loss": 1.06761527, + "learning_rate": 0.0009998205493636646, + "loss": 1.07947969, + "num_input_tokens_seen": 15270176, + "router_z_loss_mlp": 1.68945312, + "step": 199, + "time_per_iteration": 2.6512415409088135 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01190294, + "balance_loss_mlp": 1.02197027, + "epoch": 0.038476337052712584, + "flos": 582763021824.0, + "grad_norm": 0.03300049351517658, + "language_loss": 0.99112457, + "learning_rate": 0.0009998121062985063, + "loss": 1.00302756, + "num_input_tokens_seen": 15343168, + "router_z_loss_mlp": 1.68457031, + "step": 200, + "time_per_iteration": 2.6979846954345703 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01187054, + "balance_loss_mlp": 1.01996994, + "epoch": 0.03866871873797614, + "flos": 578273998848.0, + "grad_norm": 0.03164459486115397, + "language_loss": 1.0110172, + "learning_rate": 0.0009998034691890794, + "loss": 1.02288771, + "num_input_tokens_seen": 15417328, + "router_z_loss_mlp": 1.671875, + "step": 201, + "time_per_iteration": 2.80670166015625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01183327, + "balance_loss_mlp": 1.01672018, + "epoch": 0.03886110042323971, + "flos": 541772755968.0, + "grad_norm": 0.032663388617215364, + "language_loss": 1.05587053, + "learning_rate": 0.0009997946380387369, + "loss": 1.06770372, + "num_input_tokens_seen": 15489488, + "router_z_loss_mlp": 1.66699219, + "step": 202, + "time_per_iteration": 2.6591310501098633 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01179406, + "balance_loss_mlp": 1.01394379, + "epoch": 0.03905348210850327, + "flos": 719240739072.0, + "grad_norm": 0.030305493428663434, + "language_loss": 1.08528447, + "learning_rate": 0.0009997856128509076, + "loss": 1.09707844, + "num_input_tokens_seen": 15558944, + "router_z_loss_mlp": 1.65527344, + "step": 203, + "time_per_iteration": 2.9006340503692627 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01181527, + "balance_loss_mlp": 1.01720893, + "epoch": 0.039245863793766836, + "flos": 428397265152.0, + "grad_norm": 0.03189317300504765, + "language_loss": 1.03375864, + "learning_rate": 0.0009997763936290952, + "loss": 1.04557395, + "num_input_tokens_seen": 15625024, + "router_z_loss_mlp": 1.64355469, + "step": 204, + "time_per_iteration": 2.5836358070373535 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01178747, + "balance_loss_mlp": 1.01538289, + "epoch": 0.039438245479030395, + "flos": 664270156032.0, + "grad_norm": 0.033629424624266296, + "language_loss": 1.0866276, + "learning_rate": 0.0009997669803768789, + "loss": 1.09841514, + "num_input_tokens_seen": 15697120, + "router_z_loss_mlp": 1.63378906, + "step": 205, + "time_per_iteration": 2.7809464931488037 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01180514, + "balance_loss_mlp": 1.01791251, + "epoch": 0.03963062716429396, + "flos": 636496159488.0, + "grad_norm": 0.025840840316256445, + "language_loss": 1.03755617, + "learning_rate": 0.0009997573730979134, + "loss": 1.04936123, + "num_input_tokens_seen": 15768752, + "router_z_loss_mlp": 1.62597656, + "step": 206, + "time_per_iteration": 2.7759904861450195 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01207138, + "balance_loss_mlp": 1.04272461, + "epoch": 0.03982300884955752, + "flos": 1421589799680.0, + "grad_norm": 0.03078548913711826, + "language_loss": 0.79193199, + "learning_rate": 0.0009997475717959284, + "loss": 0.80400336, + "num_input_tokens_seen": 15980624, + "router_z_loss_mlp": 1.64453125, + "step": 207, + "time_per_iteration": 4.6622114181518555 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01177297, + "balance_loss_mlp": 1.0162214, + "epoch": 0.04001539053482109, + "flos": 690520914432.0, + "grad_norm": 0.03233621027438014, + "language_loss": 1.02104092, + "learning_rate": 0.0009997375764747294, + "loss": 1.03281379, + "num_input_tokens_seen": 16067232, + "router_z_loss_mlp": 1.61035156, + "step": 208, + "time_per_iteration": 2.9808952808380127 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01181785, + "balance_loss_mlp": 1.02156758, + "epoch": 0.04020777222008465, + "flos": 534752461824.0, + "grad_norm": 0.037334696417832054, + "language_loss": 0.99876916, + "learning_rate": 0.0009997273871381967, + "loss": 1.01058698, + "num_input_tokens_seen": 16139808, + "router_z_loss_mlp": 1.6015625, + "step": 209, + "time_per_iteration": 2.6938650608062744 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01183132, + "balance_loss_mlp": 1.02396429, + "epoch": 0.040400153905348214, + "flos": 568997532672.0, + "grad_norm": 0.03228633343407045, + "language_loss": 1.04497194, + "learning_rate": 0.0009997170037902862, + "loss": 1.05680323, + "num_input_tokens_seen": 16210848, + "router_z_loss_mlp": 1.59082031, + "step": 210, + "time_per_iteration": 2.722900629043579 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01189763, + "balance_loss_mlp": 1.03145349, + "epoch": 0.040592535590611774, + "flos": 714679784448.0, + "grad_norm": 0.026587079094436805, + "language_loss": 1.0723207, + "learning_rate": 0.0009997064264350292, + "loss": 1.08421838, + "num_input_tokens_seen": 16283984, + "router_z_loss_mlp": 1.58203125, + "step": 211, + "time_per_iteration": 2.8636813163757324 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01186902, + "balance_loss_mlp": 1.02954614, + "epoch": 0.04078491727587533, + "flos": 579207187968.0, + "grad_norm": 0.028855359605628288, + "language_loss": 1.01311755, + "learning_rate": 0.0009996956550765317, + "loss": 1.02498662, + "num_input_tokens_seen": 16353904, + "router_z_loss_mlp": 1.57226562, + "step": 212, + "time_per_iteration": 2.6752002239227295 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01183355, + "balance_loss_mlp": 1.0270474, + "epoch": 0.0409772989611389, + "flos": 553369555968.0, + "grad_norm": 0.03615073574048419, + "language_loss": 0.96463609, + "learning_rate": 0.0009996846897189762, + "loss": 0.97646964, + "num_input_tokens_seen": 16425488, + "router_z_loss_mlp": 1.56152344, + "step": 213, + "time_per_iteration": 2.618417501449585 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01180441, + "balance_loss_mlp": 1.02470577, + "epoch": 0.04116968064640246, + "flos": 556764996864.0, + "grad_norm": 0.04473264124517712, + "language_loss": 1.02233624, + "learning_rate": 0.0009996735303666193, + "loss": 1.03414059, + "num_input_tokens_seen": 16498016, + "router_z_loss_mlp": 1.55566406, + "step": 214, + "time_per_iteration": 2.7398550510406494 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0118203, + "balance_loss_mlp": 1.026963, + "epoch": 0.041362062331666026, + "flos": 579652395264.0, + "grad_norm": 0.027182691243245845, + "language_loss": 1.04435229, + "learning_rate": 0.0009996621770237937, + "loss": 1.05617261, + "num_input_tokens_seen": 16573744, + "router_z_loss_mlp": 1.54882812, + "step": 215, + "time_per_iteration": 2.7773804664611816 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01182535, + "balance_loss_mlp": 1.02775347, + "epoch": 0.041554444016929586, + "flos": 612701816832.0, + "grad_norm": 0.028683660550217302, + "language_loss": 1.00582075, + "learning_rate": 0.0009996506296949073, + "loss": 1.01764607, + "num_input_tokens_seen": 16655344, + "router_z_loss_mlp": 1.54589844, + "step": 216, + "time_per_iteration": 2.877587080001831 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01180569, + "balance_loss_mlp": 1.02607429, + "epoch": 0.04174682570219315, + "flos": 529151393280.0, + "grad_norm": 0.031901868987761664, + "language_loss": 1.00452459, + "learning_rate": 0.0009996388883844428, + "loss": 1.01633024, + "num_input_tokens_seen": 16726480, + "router_z_loss_mlp": 1.54296875, + "step": 217, + "time_per_iteration": 2.6346311569213867 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01173664, + "balance_loss_mlp": 1.02002692, + "epoch": 0.04193920738745671, + "flos": 512500799232.0, + "grad_norm": 0.02715845750356807, + "language_loss": 1.03465486, + "learning_rate": 0.0009996269530969588, + "loss": 1.04639161, + "num_input_tokens_seen": 16792112, + "router_z_loss_mlp": 1.53417969, + "step": 218, + "time_per_iteration": 2.6205921173095703 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01170474, + "balance_loss_mlp": 1.0176959, + "epoch": 0.04213158907272028, + "flos": 572553366528.0, + "grad_norm": 0.03606301207395498, + "language_loss": 1.04169452, + "learning_rate": 0.0009996148238370888, + "loss": 1.05339921, + "num_input_tokens_seen": 16862960, + "router_z_loss_mlp": 1.52539062, + "step": 219, + "time_per_iteration": 2.8047173023223877 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01169557, + "balance_loss_mlp": 1.01725543, + "epoch": 0.04232397075798384, + "flos": 965905552896.0, + "grad_norm": 0.026524392964530758, + "language_loss": 0.99111861, + "learning_rate": 0.0009996025006095421, + "loss": 1.00281417, + "num_input_tokens_seen": 16950416, + "router_z_loss_mlp": 1.52050781, + "step": 220, + "time_per_iteration": 3.315859317779541 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147995, + "balance_loss_mlp": 0.99693298, + "epoch": 0.042516352443247404, + "flos": 1472733340416.0, + "grad_norm": 0.01509407607306266, + "language_loss": 0.77783144, + "learning_rate": 0.0009995899834191028, + "loss": 0.78931135, + "num_input_tokens_seen": 17180944, + "router_z_loss_mlp": 1.5078125, + "step": 221, + "time_per_iteration": 5.540910243988037 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01166927, + "balance_loss_mlp": 1.0164367, + "epoch": 0.042708734128510964, + "flos": 655892852736.0, + "grad_norm": 0.029367950869880366, + "language_loss": 0.99126619, + "learning_rate": 0.0009995772722706307, + "loss": 1.00293541, + "num_input_tokens_seen": 17257792, + "router_z_loss_mlp": 1.50195312, + "step": 222, + "time_per_iteration": 2.901489019393921 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01167445, + "balance_loss_mlp": 1.01705015, + "epoch": 0.04290111581377453, + "flos": 432734643456.0, + "grad_norm": 0.04040999725558835, + "language_loss": 1.13508129, + "learning_rate": 0.0009995643671690604, + "loss": 1.1467557, + "num_input_tokens_seen": 17320288, + "router_z_loss_mlp": 1.50097656, + "step": 223, + "time_per_iteration": 2.5576720237731934 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01168153, + "balance_loss_mlp": 1.01823533, + "epoch": 0.04309349749903809, + "flos": 645867889920.0, + "grad_norm": 0.02824445481068148, + "language_loss": 1.00763512, + "learning_rate": 0.0009995512681194023, + "loss": 1.01931667, + "num_input_tokens_seen": 17396672, + "router_z_loss_mlp": 1.49609375, + "step": 224, + "time_per_iteration": 2.9571568965911865 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01167559, + "balance_loss_mlp": 1.01840472, + "epoch": 0.04328587918430166, + "flos": 832897153536.0, + "grad_norm": 0.025764365733734692, + "language_loss": 0.98235118, + "learning_rate": 0.0009995379751267417, + "loss": 0.99402678, + "num_input_tokens_seen": 17488096, + "router_z_loss_mlp": 1.48828125, + "step": 225, + "time_per_iteration": 3.2627484798431396 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01166832, + "balance_loss_mlp": 1.01824963, + "epoch": 0.043478260869565216, + "flos": 526116589056.0, + "grad_norm": 0.03531387708455554, + "language_loss": 1.00006318, + "learning_rate": 0.0009995244881962398, + "loss": 1.01173151, + "num_input_tokens_seen": 17557632, + "router_z_loss_mlp": 1.48242188, + "step": 226, + "time_per_iteration": 2.624209403991699 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01170136, + "balance_loss_mlp": 1.02212548, + "epoch": 0.04367064255482878, + "flos": 440413027584.0, + "grad_norm": 0.039279482080902435, + "language_loss": 1.01293874, + "learning_rate": 0.0009995108073331323, + "loss": 1.02464008, + "num_input_tokens_seen": 17626672, + "router_z_loss_mlp": 1.4765625, + "step": 227, + "time_per_iteration": 2.6042520999908447 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01164096, + "balance_loss_mlp": 1.01742136, + "epoch": 0.04386302424009234, + "flos": 508467677184.0, + "grad_norm": 0.03801127181345805, + "language_loss": 1.03535032, + "learning_rate": 0.0009994969325427309, + "loss": 1.04699123, + "num_input_tokens_seen": 17698624, + "router_z_loss_mlp": 1.46582031, + "step": 228, + "time_per_iteration": 2.6691603660583496 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01163607, + "balance_loss_mlp": 1.01769507, + "epoch": 0.04405540592535591, + "flos": 541744565760.0, + "grad_norm": 0.03512041362752814, + "language_loss": 1.00143218, + "learning_rate": 0.0009994828638304218, + "loss": 1.0130682, + "num_input_tokens_seen": 17767760, + "router_z_loss_mlp": 1.46191406, + "step": 229, + "time_per_iteration": 2.627833366394043 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01164617, + "balance_loss_mlp": 1.01927722, + "epoch": 0.04424778761061947, + "flos": 447309867264.0, + "grad_norm": 0.03576658395893793, + "language_loss": 1.06260157, + "learning_rate": 0.0009994686012016675, + "loss": 1.07424784, + "num_input_tokens_seen": 17833664, + "router_z_loss_mlp": 1.45703125, + "step": 230, + "time_per_iteration": 2.515491247177124 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01159156, + "balance_loss_mlp": 1.01448417, + "epoch": 0.044440169295883035, + "flos": 701982599424.0, + "grad_norm": 0.03592315304636455, + "language_loss": 1.05298328, + "learning_rate": 0.000999454144662005, + "loss": 1.06457496, + "num_input_tokens_seen": 17908880, + "router_z_loss_mlp": 1.45019531, + "step": 231, + "time_per_iteration": 2.918896436691284 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01156937, + "balance_loss_mlp": 1.01274192, + "epoch": 0.044632550981146595, + "flos": 589427536896.0, + "grad_norm": 0.032106980286660924, + "language_loss": 0.996499, + "learning_rate": 0.0009994394942170468, + "loss": 1.00806844, + "num_input_tokens_seen": 17978208, + "router_z_loss_mlp": 1.4453125, + "step": 232, + "time_per_iteration": 2.700378179550171 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01169343, + "balance_loss_mlp": 1.02524316, + "epoch": 0.04482493266641016, + "flos": 555855140352.0, + "grad_norm": 0.03061962333593277, + "language_loss": 0.97402102, + "learning_rate": 0.0009994246498724808, + "loss": 0.9857145, + "num_input_tokens_seen": 18049296, + "router_z_loss_mlp": 1.44433594, + "step": 233, + "time_per_iteration": 2.692657232284546 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01171534, + "balance_loss_mlp": 1.02848291, + "epoch": 0.04501731435167372, + "flos": 724070956800.0, + "grad_norm": 0.03598428268947968, + "language_loss": 1.00358808, + "learning_rate": 0.00099940961163407, + "loss": 1.01530337, + "num_input_tokens_seen": 18123296, + "router_z_loss_mlp": 1.43359375, + "step": 234, + "time_per_iteration": 2.8496198654174805 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01167121, + "balance_loss_mlp": 1.02473748, + "epoch": 0.04520969603693728, + "flos": 512798252544.0, + "grad_norm": 0.03236637347420306, + "language_loss": 1.0231185, + "learning_rate": 0.0009993943795076528, + "loss": 1.03478956, + "num_input_tokens_seen": 18192784, + "router_z_loss_mlp": 1.42675781, + "step": 235, + "time_per_iteration": 2.6304001808166504 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01157951, + "balance_loss_mlp": 1.01623452, + "epoch": 0.04540207772220085, + "flos": 365878555392.0, + "grad_norm": 0.04557463461025321, + "language_loss": 1.04854226, + "learning_rate": 0.0009993789534991427, + "loss": 1.06012177, + "num_input_tokens_seen": 18254064, + "router_z_loss_mlp": 1.41992188, + "step": 236, + "time_per_iteration": 2.500347852706909 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01156422, + "balance_loss_mlp": 1.01613641, + "epoch": 0.045594459407464406, + "flos": 523724323584.0, + "grad_norm": 0.028810086143122388, + "language_loss": 0.99360317, + "learning_rate": 0.0009993633336145287, + "loss": 1.00516737, + "num_input_tokens_seen": 18325728, + "router_z_loss_mlp": 1.40527344, + "step": 237, + "time_per_iteration": 2.6991968154907227 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01156358, + "balance_loss_mlp": 1.01664495, + "epoch": 0.04578684109272797, + "flos": 673116966144.0, + "grad_norm": 0.036851747197037266, + "language_loss": 1.03695393, + "learning_rate": 0.0009993475198598752, + "loss": 1.04851758, + "num_input_tokens_seen": 18408608, + "router_z_loss_mlp": 1.39941406, + "step": 238, + "time_per_iteration": 3.0150160789489746 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01160083, + "balance_loss_mlp": 1.02084696, + "epoch": 0.04597922277799153, + "flos": 542621374464.0, + "grad_norm": 0.03967898438127139, + "language_loss": 1.00323462, + "learning_rate": 0.0009993315122413212, + "loss": 1.01483548, + "num_input_tokens_seen": 18471920, + "router_z_loss_mlp": 1.39453125, + "step": 239, + "time_per_iteration": 2.6226179599761963 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0115528, + "balance_loss_mlp": 1.01690221, + "epoch": 0.0461716044632551, + "flos": 459994413312.0, + "grad_norm": 0.029756199222484733, + "language_loss": 1.00536144, + "learning_rate": 0.0009993153107650818, + "loss": 1.01691425, + "num_input_tokens_seen": 18540496, + "router_z_loss_mlp": 1.38574219, + "step": 240, + "time_per_iteration": 2.635673999786377 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01154071, + "balance_loss_mlp": 1.01607406, + "epoch": 0.04636398614851866, + "flos": 456171261696.0, + "grad_norm": 0.03103837756937707, + "language_loss": 0.99882519, + "learning_rate": 0.0009992989154374468, + "loss": 1.01036584, + "num_input_tokens_seen": 18606944, + "router_z_loss_mlp": 1.38183594, + "step": 241, + "time_per_iteration": 2.5449135303497314 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0115588, + "balance_loss_mlp": 1.01836014, + "epoch": 0.046556367833782225, + "flos": 557902320384.0, + "grad_norm": 0.06487144756994469, + "language_loss": 1.0686537, + "learning_rate": 0.0009992823262647817, + "loss": 1.08021247, + "num_input_tokens_seen": 18679520, + "router_z_loss_mlp": 1.37695312, + "step": 242, + "time_per_iteration": 2.705120325088501 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011561, + "balance_loss_mlp": 1.01905739, + "epoch": 0.046748749519045785, + "flos": 594088613376.0, + "grad_norm": 0.03633512017688626, + "language_loss": 1.00915635, + "learning_rate": 0.0009992655432535264, + "loss": 1.02071738, + "num_input_tokens_seen": 18756656, + "router_z_loss_mlp": 1.37207031, + "step": 243, + "time_per_iteration": 2.8158721923828125 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01160044, + "balance_loss_mlp": 1.02347767, + "epoch": 0.04694113120430935, + "flos": 570942645504.0, + "grad_norm": 0.036353271768507285, + "language_loss": 1.01172018, + "learning_rate": 0.0009992485664101973, + "loss": 1.02332067, + "num_input_tokens_seen": 18829792, + "router_z_loss_mlp": 1.3671875, + "step": 244, + "time_per_iteration": 2.723409414291382 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01156505, + "balance_loss_mlp": 1.0207969, + "epoch": 0.04713351288957291, + "flos": 865246689024.0, + "grad_norm": 0.05316255083066814, + "language_loss": 1.03417325, + "learning_rate": 0.000999231395741385, + "loss": 1.04573822, + "num_input_tokens_seen": 18906864, + "router_z_loss_mlp": 1.35839844, + "step": 245, + "time_per_iteration": 3.1441562175750732 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01155254, + "balance_loss_mlp": 1.02011812, + "epoch": 0.04732589457483648, + "flos": 538236364032.0, + "grad_norm": 0.039550829703112036, + "language_loss": 1.01375949, + "learning_rate": 0.0009992140312537557, + "loss": 1.02531195, + "num_input_tokens_seen": 18973632, + "router_z_loss_mlp": 1.35253906, + "step": 246, + "time_per_iteration": 2.6407320499420166 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01158298, + "balance_loss_mlp": 1.02402055, + "epoch": 0.04751827626010004, + "flos": 763272612096.0, + "grad_norm": 0.029332271702031103, + "language_loss": 0.96132767, + "learning_rate": 0.000999196472954051, + "loss": 0.97291064, + "num_input_tokens_seen": 19052944, + "router_z_loss_mlp": 1.34375, + "step": 247, + "time_per_iteration": 2.9791386127471924 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0115741, + "balance_loss_mlp": 1.02313232, + "epoch": 0.0477106579453636, + "flos": 1583128462080.0, + "grad_norm": 0.019406803026512872, + "language_loss": 0.79424852, + "learning_rate": 0.0009991787208490878, + "loss": 0.80582267, + "num_input_tokens_seen": 19286288, + "router_z_loss_mlp": 1.34375, + "step": 248, + "time_per_iteration": 5.547277927398682 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0115733, + "balance_loss_mlp": 1.02457833, + "epoch": 0.04790303963062716, + "flos": 458693784576.0, + "grad_norm": 0.04949407998464004, + "language_loss": 1.04053593, + "learning_rate": 0.0009991607749457578, + "loss": 1.05210924, + "num_input_tokens_seen": 19349296, + "router_z_loss_mlp": 1.328125, + "step": 249, + "time_per_iteration": 2.610372304916382 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01158188, + "balance_loss_mlp": 1.02629459, + "epoch": 0.04809542131589073, + "flos": 783787186944.0, + "grad_norm": 0.03428496832179458, + "language_loss": 1.01565814, + "learning_rate": 0.0009991426352510286, + "loss": 1.02723992, + "num_input_tokens_seen": 19428416, + "router_z_loss_mlp": 1.31933594, + "step": 250, + "time_per_iteration": 2.9723451137542725 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01158477, + "balance_loss_mlp": 1.0272516, + "epoch": 0.04828780300115429, + "flos": 560322776064.0, + "grad_norm": 0.03370153589925739, + "language_loss": 1.02967048, + "learning_rate": 0.0009991243017719422, + "loss": 1.04125512, + "num_input_tokens_seen": 19498688, + "router_z_loss_mlp": 1.3125, + "step": 251, + "time_per_iteration": 2.691317319869995 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0115263, + "balance_loss_mlp": 1.02149975, + "epoch": 0.048480184686417856, + "flos": 502922989056.0, + "grad_norm": 0.033537523086657674, + "language_loss": 0.98110956, + "learning_rate": 0.0009991057745156165, + "loss": 0.99263585, + "num_input_tokens_seen": 19567568, + "router_z_loss_mlp": 1.31152344, + "step": 252, + "time_per_iteration": 2.615726947784424 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01126877, + "balance_loss_mlp": 0.99641418, + "epoch": 0.048672566371681415, + "flos": 1539471810048.0, + "grad_norm": 0.00943295316075806, + "language_loss": 0.81910986, + "learning_rate": 0.0009990870534892446, + "loss": 0.83037865, + "num_input_tokens_seen": 19796368, + "router_z_loss_mlp": 1.3046875, + "step": 253, + "time_per_iteration": 5.119662523269653 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01155145, + "balance_loss_mlp": 1.02439594, + "epoch": 0.04886494805694498, + "flos": 538952779776.0, + "grad_norm": 0.04101934284448647, + "language_loss": 1.06555986, + "learning_rate": 0.0009990681387000943, + "loss": 1.07711136, + "num_input_tokens_seen": 19870480, + "router_z_loss_mlp": 1.30761719, + "step": 254, + "time_per_iteration": 2.7494144439697266 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153346, + "balance_loss_mlp": 1.02316916, + "epoch": 0.04905732974220854, + "flos": 681485521152.0, + "grad_norm": 0.029284228955777224, + "language_loss": 1.01195645, + "learning_rate": 0.0009990490301555093, + "loss": 1.02348995, + "num_input_tokens_seen": 19956288, + "router_z_loss_mlp": 1.30175781, + "step": 255, + "time_per_iteration": 2.9595844745635986 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113356, + "balance_loss_mlp": 1.00462341, + "epoch": 0.04924971142747211, + "flos": 1424277573120.0, + "grad_norm": 0.011666997955433429, + "language_loss": 0.79215157, + "learning_rate": 0.0009990297278629078, + "loss": 0.80348712, + "num_input_tokens_seen": 20180080, + "router_z_loss_mlp": 1.2890625, + "step": 256, + "time_per_iteration": 4.918023347854614 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01126785, + "balance_loss_mlp": 0.99822998, + "epoch": 0.04944209311273567, + "flos": 1561239381504.0, + "grad_norm": 0.006197531934497474, + "language_loss": 0.79242742, + "learning_rate": 0.000999010231829784, + "loss": 0.80369532, + "num_input_tokens_seen": 20413456, + "router_z_loss_mlp": 1.28515625, + "step": 257, + "time_per_iteration": 4.996341228485107 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01127556, + "balance_loss_mlp": 0.99976349, + "epoch": 0.04963447479799923, + "flos": 1574173748736.0, + "grad_norm": 0.01126324229515774, + "language_loss": 0.69975883, + "learning_rate": 0.0009989905420637066, + "loss": 0.71103442, + "num_input_tokens_seen": 20644736, + "router_z_loss_mlp": 1.27734375, + "step": 258, + "time_per_iteration": 4.951507329940796 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01167552, + "balance_loss_mlp": 1.03966403, + "epoch": 0.049826856483262794, + "flos": 626499386880.0, + "grad_norm": 0.07394024090910019, + "language_loss": 0.96613419, + "learning_rate": 0.0009989706585723202, + "loss": 0.97780967, + "num_input_tokens_seen": 20719040, + "router_z_loss_mlp": 1.27832031, + "step": 259, + "time_per_iteration": 2.819796085357666 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01158437, + "balance_loss_mlp": 1.03073978, + "epoch": 0.05001923816852635, + "flos": 505156806912.0, + "grad_norm": 0.042054435700702504, + "language_loss": 1.02184892, + "learning_rate": 0.0009989505813633442, + "loss": 1.0334332, + "num_input_tokens_seen": 20789376, + "router_z_loss_mlp": 1.27636719, + "step": 260, + "time_per_iteration": 2.671597719192505 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149384, + "balance_loss_mlp": 1.02206886, + "epoch": 0.05021161985378992, + "flos": 588468102912.0, + "grad_norm": 0.05343186989039486, + "language_loss": 1.02308297, + "learning_rate": 0.000998930310444573, + "loss": 1.03457689, + "num_input_tokens_seen": 20857856, + "router_z_loss_mlp": 1.27246094, + "step": 261, + "time_per_iteration": 2.7573728561401367 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145576, + "balance_loss_mlp": 1.01883233, + "epoch": 0.05040400153905348, + "flos": 634403292672.0, + "grad_norm": 0.052960623500171895, + "language_loss": 1.00806391, + "learning_rate": 0.0009989098458238765, + "loss": 1.01951981, + "num_input_tokens_seen": 20931232, + "router_z_loss_mlp": 1.26660156, + "step": 262, + "time_per_iteration": 2.7937912940979004 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146699, + "balance_loss_mlp": 1.02033675, + "epoch": 0.050596383224317046, + "flos": 554809190400.0, + "grad_norm": 0.04531187332347281, + "language_loss": 0.99888676, + "learning_rate": 0.0009988891875091998, + "loss": 1.0103538, + "num_input_tokens_seen": 21012672, + "router_z_loss_mlp": 1.26269531, + "step": 263, + "time_per_iteration": 2.811218500137329 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145189, + "balance_loss_mlp": 1.01939976, + "epoch": 0.050788764909580605, + "flos": 550762462464.0, + "grad_norm": 0.03965392167411722, + "language_loss": 0.94696999, + "learning_rate": 0.0009988683355085636, + "loss": 0.95842183, + "num_input_tokens_seen": 21088592, + "router_z_loss_mlp": 1.25683594, + "step": 264, + "time_per_iteration": 2.7378242015838623 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141586, + "balance_loss_mlp": 1.01617777, + "epoch": 0.05098114659484417, + "flos": 606345448704.0, + "grad_norm": 0.024717188615823983, + "language_loss": 1.02827787, + "learning_rate": 0.000998847289830063, + "loss": 1.03969371, + "num_input_tokens_seen": 21169840, + "router_z_loss_mlp": 1.25292969, + "step": 265, + "time_per_iteration": 2.8625917434692383 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142152, + "balance_loss_mlp": 1.01693416, + "epoch": 0.05117352828010773, + "flos": 439473035520.0, + "grad_norm": 0.036783183293041616, + "language_loss": 0.96527213, + "learning_rate": 0.0009988260504818682, + "loss": 0.97669363, + "num_input_tokens_seen": 21236144, + "router_z_loss_mlp": 1.25097656, + "step": 266, + "time_per_iteration": 2.5658230781555176 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138539, + "balance_loss_mlp": 1.0135119, + "epoch": 0.0513659099653713, + "flos": 506031670272.0, + "grad_norm": 0.04116504124695153, + "language_loss": 1.03285778, + "learning_rate": 0.000998804617472226, + "loss": 1.0442431, + "num_input_tokens_seen": 21304864, + "router_z_loss_mlp": 1.24902344, + "step": 267, + "time_per_iteration": 2.63395094871521 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138256, + "balance_loss_mlp": 1.01418352, + "epoch": 0.05155829165063486, + "flos": 696715922688.0, + "grad_norm": 0.034853618125567455, + "language_loss": 0.98327756, + "learning_rate": 0.0009987829908094568, + "loss": 0.9946602, + "num_input_tokens_seen": 21377504, + "router_z_loss_mlp": 1.23925781, + "step": 268, + "time_per_iteration": 2.8239262104034424 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136912, + "balance_loss_mlp": 1.01331627, + "epoch": 0.051750673335898424, + "flos": 1350302059008.0, + "grad_norm": 0.042488112993129025, + "language_loss": 1.04893267, + "learning_rate": 0.0009987611705019569, + "loss": 1.0603019, + "num_input_tokens_seen": 21463840, + "router_z_loss_mlp": 1.234375, + "step": 269, + "time_per_iteration": 4.33854079246521 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137181, + "balance_loss_mlp": 1.01387095, + "epoch": 0.051943055021161984, + "flos": 490590331392.0, + "grad_norm": 0.037116049987967636, + "language_loss": 1.03026497, + "learning_rate": 0.0009987391565581978, + "loss": 1.04163671, + "num_input_tokens_seen": 21531184, + "router_z_loss_mlp": 1.23144531, + "step": 270, + "time_per_iteration": 2.609722852706909 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136969, + "balance_loss_mlp": 1.01365864, + "epoch": 0.05213543670642555, + "flos": 546880985088.0, + "grad_norm": 0.03927026934880779, + "language_loss": 0.95517516, + "learning_rate": 0.000998716948986726, + "loss": 0.96654487, + "num_input_tokens_seen": 21612224, + "router_z_loss_mlp": 1.23144531, + "step": 271, + "time_per_iteration": 2.797673225402832 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137765, + "balance_loss_mlp": 1.01512277, + "epoch": 0.05232781839168911, + "flos": 604673489664.0, + "grad_norm": 0.04118655717732696, + "language_loss": 0.97937191, + "learning_rate": 0.0009986945477961633, + "loss": 0.9907496, + "num_input_tokens_seen": 21681024, + "router_z_loss_mlp": 1.22460938, + "step": 272, + "time_per_iteration": 2.6988775730133057 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135438, + "balance_loss_mlp": 1.01336777, + "epoch": 0.052520200076952676, + "flos": 539656556544.0, + "grad_norm": 0.027940819886650203, + "language_loss": 1.02222085, + "learning_rate": 0.0009986719529952066, + "loss": 1.0335753, + "num_input_tokens_seen": 21761616, + "router_z_loss_mlp": 1.21875, + "step": 273, + "time_per_iteration": 2.9503016471862793 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01133251, + "balance_loss_mlp": 1.01175284, + "epoch": 0.052712581762216236, + "flos": 464333736960.0, + "grad_norm": 0.036678205813438995, + "language_loss": 1.02377117, + "learning_rate": 0.000998649164592628, + "loss": 1.0351038, + "num_input_tokens_seen": 21828416, + "router_z_loss_mlp": 1.21289062, + "step": 274, + "time_per_iteration": 2.575183868408203 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134193, + "balance_loss_mlp": 1.01279056, + "epoch": 0.0529049634474798, + "flos": 549106054656.0, + "grad_norm": 0.029580362230619023, + "language_loss": 1.00386071, + "learning_rate": 0.0009986261825972748, + "loss": 1.01520276, + "num_input_tokens_seen": 21901600, + "router_z_loss_mlp": 1.21191406, + "step": 275, + "time_per_iteration": 2.781388521194458 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136428, + "balance_loss_mlp": 1.01578796, + "epoch": 0.05309734513274336, + "flos": 619201081344.0, + "grad_norm": 0.028327187192750843, + "language_loss": 1.01742268, + "learning_rate": 0.000998603007018069, + "loss": 1.0287869, + "num_input_tokens_seen": 21979312, + "router_z_loss_mlp": 1.20410156, + "step": 276, + "time_per_iteration": 2.8231008052825928 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137197, + "balance_loss_mlp": 1.01665294, + "epoch": 0.05328972681800693, + "flos": 606618602496.0, + "grad_norm": 0.02408735734832513, + "language_loss": 1.00149679, + "learning_rate": 0.0009985796378640089, + "loss": 1.01286888, + "num_input_tokens_seen": 22053776, + "router_z_loss_mlp": 1.203125, + "step": 277, + "time_per_iteration": 2.721719264984131 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136169, + "balance_loss_mlp": 1.01610124, + "epoch": 0.05348210850327049, + "flos": 605731100160.0, + "grad_norm": 0.0319931943489141, + "language_loss": 0.99697894, + "learning_rate": 0.0009985560751441665, + "loss": 1.0083406, + "num_input_tokens_seen": 22134304, + "router_z_loss_mlp": 1.19824219, + "step": 278, + "time_per_iteration": 2.835160255432129 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01133809, + "balance_loss_mlp": 1.01412332, + "epoch": 0.053674490188534055, + "flos": 631998388224.0, + "grad_norm": 0.030840524384760076, + "language_loss": 1.0228467, + "learning_rate": 0.00099853231886769, + "loss": 1.03418469, + "num_input_tokens_seen": 22212896, + "router_z_loss_mlp": 1.19433594, + "step": 279, + "time_per_iteration": 2.8541102409362793 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01131641, + "balance_loss_mlp": 1.01243138, + "epoch": 0.053866871873797614, + "flos": 480174596352.0, + "grad_norm": 0.030057370429500904, + "language_loss": 1.01521945, + "learning_rate": 0.0009985083690438024, + "loss": 1.02653599, + "num_input_tokens_seen": 22287216, + "router_z_loss_mlp": 1.18945312, + "step": 280, + "time_per_iteration": 2.778996706008911 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01133594, + "balance_loss_mlp": 1.01514757, + "epoch": 0.054059253559061174, + "flos": 789490322688.0, + "grad_norm": 0.030570218765999514, + "language_loss": 0.92515564, + "learning_rate": 0.0009984842256818016, + "loss": 0.93649161, + "num_input_tokens_seen": 22370864, + "router_z_loss_mlp": 1.18164062, + "step": 281, + "time_per_iteration": 3.113694429397583 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137735, + "balance_loss_mlp": 1.01928854, + "epoch": 0.05425163524432474, + "flos": 629506000896.0, + "grad_norm": 0.043548376252248826, + "language_loss": 1.03102541, + "learning_rate": 0.0009984598887910613, + "loss": 1.04240274, + "num_input_tokens_seen": 22440080, + "router_z_loss_mlp": 1.18164062, + "step": 282, + "time_per_iteration": 2.8303444385528564 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01132534, + "balance_loss_mlp": 1.01504183, + "epoch": 0.0544440169295883, + "flos": 616993508352.0, + "grad_norm": 0.05077708884656826, + "language_loss": 0.98823464, + "learning_rate": 0.0009984353583810297, + "loss": 0.99956, + "num_input_tokens_seen": 22517936, + "router_z_loss_mlp": 1.171875, + "step": 283, + "time_per_iteration": 2.835850954055786 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01129981, + "balance_loss_mlp": 1.01315546, + "epoch": 0.05463639861485187, + "flos": 648930884352.0, + "grad_norm": 0.03524270200319673, + "language_loss": 1.0117259, + "learning_rate": 0.0009984106344612302, + "loss": 1.02302563, + "num_input_tokens_seen": 22590480, + "router_z_loss_mlp": 1.16503906, + "step": 284, + "time_per_iteration": 2.760528564453125 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01129453, + "balance_loss_mlp": 1.01319993, + "epoch": 0.054828780300115426, + "flos": 798585987072.0, + "grad_norm": 0.03078454247465455, + "language_loss": 0.96210134, + "learning_rate": 0.0009983857170412615, + "loss": 0.97339588, + "num_input_tokens_seen": 22668144, + "router_z_loss_mlp": 1.15917969, + "step": 285, + "time_per_iteration": 2.9911587238311768 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01131741, + "balance_loss_mlp": 1.01567924, + "epoch": 0.05502116198537899, + "flos": 550799400960.0, + "grad_norm": 0.028192528419898312, + "language_loss": 0.95645988, + "learning_rate": 0.000998360606130798, + "loss": 0.96777725, + "num_input_tokens_seen": 22749648, + "router_z_loss_mlp": 1.15722656, + "step": 286, + "time_per_iteration": 2.8603405952453613 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01119957, + "balance_loss_mlp": 1.00475311, + "epoch": 0.05521354367064255, + "flos": 1410909659136.0, + "grad_norm": 0.016802553847575376, + "language_loss": 0.69073117, + "learning_rate": 0.0009983353017395877, + "loss": 0.70193076, + "num_input_tokens_seen": 22982752, + "router_z_loss_mlp": 1.1484375, + "step": 287, + "time_per_iteration": 4.872994899749756 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139216, + "balance_loss_mlp": 1.02372622, + "epoch": 0.05540592535590612, + "flos": 646612495872.0, + "grad_norm": 0.03160477576624613, + "language_loss": 1.01500821, + "learning_rate": 0.0009983098038774552, + "loss": 1.02640033, + "num_input_tokens_seen": 23053584, + "router_z_loss_mlp": 1.15136719, + "step": 288, + "time_per_iteration": 2.7645044326782227 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01119652, + "balance_loss_mlp": 1.00521088, + "epoch": 0.05559830704116968, + "flos": 1514318512896.0, + "grad_norm": 0.011772143096286682, + "language_loss": 0.78170228, + "learning_rate": 0.0009982841125542993, + "loss": 0.79289877, + "num_input_tokens_seen": 23280256, + "router_z_loss_mlp": 1.140625, + "step": 289, + "time_per_iteration": 4.783201456069946 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150059, + "balance_loss_mlp": 1.03542745, + "epoch": 0.055790688726433245, + "flos": 509335737600.0, + "grad_norm": 0.037615798403722346, + "language_loss": 1.00063777, + "learning_rate": 0.0009982582277800948, + "loss": 1.01213825, + "num_input_tokens_seen": 23345760, + "router_z_loss_mlp": 1.14257812, + "step": 290, + "time_per_iteration": 2.5825588703155518 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142418, + "balance_loss_mlp": 1.02873969, + "epoch": 0.055983070411696804, + "flos": 659075410944.0, + "grad_norm": 0.03490310528255379, + "language_loss": 1.06654799, + "learning_rate": 0.0009982321495648908, + "loss": 1.07797217, + "num_input_tokens_seen": 23420720, + "router_z_loss_mlp": 1.13671875, + "step": 291, + "time_per_iteration": 2.8099231719970703 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137522, + "balance_loss_mlp": 1.02470279, + "epoch": 0.05617545209696037, + "flos": 588476851200.0, + "grad_norm": 0.035465642673631545, + "language_loss": 0.97683877, + "learning_rate": 0.0009982058779188115, + "loss": 0.98821402, + "num_input_tokens_seen": 23492576, + "router_z_loss_mlp": 1.13183594, + "step": 292, + "time_per_iteration": 2.7125580310821533 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136096, + "balance_loss_mlp": 1.02384841, + "epoch": 0.05636783378222393, + "flos": 612788332800.0, + "grad_norm": 0.032210362870472055, + "language_loss": 1.05647731, + "learning_rate": 0.0009981794128520567, + "loss": 1.06783831, + "num_input_tokens_seen": 23569824, + "router_z_loss_mlp": 1.12597656, + "step": 293, + "time_per_iteration": 2.7916390895843506 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135514, + "balance_loss_mlp": 1.0241251, + "epoch": 0.0565602154674875, + "flos": 669424071936.0, + "grad_norm": 0.03595229916115603, + "language_loss": 1.02550793, + "learning_rate": 0.000998152754374901, + "loss": 1.03686309, + "num_input_tokens_seen": 23649984, + "router_z_loss_mlp": 1.1171875, + "step": 294, + "time_per_iteration": 2.8770558834075928 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134115, + "balance_loss_mlp": 1.0227263, + "epoch": 0.05675259715275106, + "flos": 618365101824.0, + "grad_norm": 0.028486588423889302, + "language_loss": 0.98274708, + "learning_rate": 0.0009981259024976943, + "loss": 0.99408829, + "num_input_tokens_seen": 23722032, + "router_z_loss_mlp": 1.1171875, + "step": 295, + "time_per_iteration": 2.729853630065918 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01133246, + "balance_loss_mlp": 1.02204788, + "epoch": 0.05694497883801462, + "flos": 753154330368.0, + "grad_norm": 0.04188437456637708, + "language_loss": 0.968624, + "learning_rate": 0.0009980988572308612, + "loss": 0.97995651, + "num_input_tokens_seen": 23797376, + "router_z_loss_mlp": 1.11523438, + "step": 296, + "time_per_iteration": 3.0135345458984375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01132496, + "balance_loss_mlp": 1.02187026, + "epoch": 0.05713736052327818, + "flos": 713382067968.0, + "grad_norm": 0.0305883196599643, + "language_loss": 0.9903996, + "learning_rate": 0.0009980716185849015, + "loss": 1.0017246, + "num_input_tokens_seen": 23880496, + "router_z_loss_mlp": 1.109375, + "step": 297, + "time_per_iteration": 2.9962668418884277 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01129278, + "balance_loss_mlp": 1.01865172, + "epoch": 0.05732974220854175, + "flos": 469936750848.0, + "grad_norm": 0.029025981508343963, + "language_loss": 0.95620793, + "learning_rate": 0.0009980441865703904, + "loss": 0.96750069, + "num_input_tokens_seen": 23950016, + "router_z_loss_mlp": 1.109375, + "step": 298, + "time_per_iteration": 2.67486572265625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01126421, + "balance_loss_mlp": 1.0163666, + "epoch": 0.05752212389380531, + "flos": 602541739008.0, + "grad_norm": 0.028406065642448373, + "language_loss": 1.04190016, + "learning_rate": 0.000998016561197978, + "loss": 1.05316436, + "num_input_tokens_seen": 24020064, + "router_z_loss_mlp": 1.10351562, + "step": 299, + "time_per_iteration": 2.7435965538024902 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01127499, + "balance_loss_mlp": 1.01773107, + "epoch": 0.057714505579068875, + "flos": 679950622464.0, + "grad_norm": 0.02999406165417261, + "language_loss": 0.957955, + "learning_rate": 0.0009979887424783895, + "loss": 0.96922994, + "num_input_tokens_seen": 24095360, + "router_z_loss_mlp": 1.10058594, + "step": 300, + "time_per_iteration": 2.868412494659424 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01127678, + "balance_loss_mlp": 1.01800561, + "epoch": 0.057906887264332435, + "flos": 597012602112.0, + "grad_norm": 0.033381964405594114, + "language_loss": 0.95279002, + "learning_rate": 0.0009979607304224248, + "loss": 0.96406674, + "num_input_tokens_seen": 24164608, + "router_z_loss_mlp": 1.09960938, + "step": 301, + "time_per_iteration": 2.7196099758148193 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01127179, + "balance_loss_mlp": 1.01760185, + "epoch": 0.058099268949596, + "flos": 553165421568.0, + "grad_norm": 0.029428698202492602, + "language_loss": 1.02305853, + "learning_rate": 0.000997932525040959, + "loss": 1.03433037, + "num_input_tokens_seen": 24233840, + "router_z_loss_mlp": 1.09863281, + "step": 302, + "time_per_iteration": 2.645131826400757 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01126073, + "balance_loss_mlp": 1.0166868, + "epoch": 0.05829165063485956, + "flos": 509231725056.0, + "grad_norm": 0.033454482596205204, + "language_loss": 1.04832363, + "learning_rate": 0.000997904126344943, + "loss": 1.05958426, + "num_input_tokens_seen": 24302928, + "router_z_loss_mlp": 1.09667969, + "step": 303, + "time_per_iteration": 2.60955810546875 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125584, + "balance_loss_mlp": 1.0157212, + "epoch": 0.05848403232012313, + "flos": 616363608576.0, + "grad_norm": 0.0319979050325151, + "language_loss": 1.00779867, + "learning_rate": 0.0009978755343454018, + "loss": 1.01905453, + "num_input_tokens_seen": 24377024, + "router_z_loss_mlp": 1.1015625, + "step": 304, + "time_per_iteration": 2.733825206756592 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124254, + "balance_loss_mlp": 1.01467645, + "epoch": 0.05867641400538669, + "flos": 501079943424.0, + "grad_norm": 0.03385536533959698, + "language_loss": 1.01509869, + "learning_rate": 0.0009978467490534355, + "loss": 1.0263412, + "num_input_tokens_seen": 24442736, + "router_z_loss_mlp": 1.09863281, + "step": 305, + "time_per_iteration": 2.6263206005096436 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121932, + "balance_loss_mlp": 1.01292717, + "epoch": 0.05886879569065025, + "flos": 532379638272.0, + "grad_norm": 0.03088897761094542, + "language_loss": 0.98605353, + "learning_rate": 0.00099781777048022, + "loss": 0.99727285, + "num_input_tokens_seen": 24514800, + "router_z_loss_mlp": 1.09277344, + "step": 306, + "time_per_iteration": 2.7351841926574707 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122107, + "balance_loss_mlp": 1.01329267, + "epoch": 0.05906117737591381, + "flos": 490041111552.0, + "grad_norm": 0.034758856969872284, + "language_loss": 0.99957371, + "learning_rate": 0.0009977885986370057, + "loss": 1.01079476, + "num_input_tokens_seen": 24581648, + "router_z_loss_mlp": 1.09082031, + "step": 307, + "time_per_iteration": 2.566316843032837 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01120423, + "balance_loss_mlp": 1.01199007, + "epoch": 0.05925355906117737, + "flos": 592710216960.0, + "grad_norm": 0.0408216139096099, + "language_loss": 0.95604599, + "learning_rate": 0.000997759233535118, + "loss": 0.96725023, + "num_input_tokens_seen": 24658864, + "router_z_loss_mlp": 1.08691406, + "step": 308, + "time_per_iteration": 2.781667470932007 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01119623, + "balance_loss_mlp": 1.01147592, + "epoch": 0.05944594074644094, + "flos": 564788466432.0, + "grad_norm": 0.03543125546238922, + "language_loss": 1.01945186, + "learning_rate": 0.0009977296751859576, + "loss": 1.03064811, + "num_input_tokens_seen": 24735808, + "router_z_loss_mlp": 1.08398438, + "step": 309, + "time_per_iteration": 2.778700828552246 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121487, + "balance_loss_mlp": 1.0137223, + "epoch": 0.0596383224317045, + "flos": 539808201216.0, + "grad_norm": 0.03208598270087784, + "language_loss": 1.03591859, + "learning_rate": 0.0009976999236009998, + "loss": 1.04713345, + "num_input_tokens_seen": 24807744, + "router_z_loss_mlp": 1.08007812, + "step": 310, + "time_per_iteration": 2.790116786956787 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121449, + "balance_loss_mlp": 1.01387453, + "epoch": 0.059830704116968066, + "flos": 562053060864.0, + "grad_norm": 0.03260901983169028, + "language_loss": 1.05564129, + "learning_rate": 0.0009976699787917955, + "loss": 1.06685579, + "num_input_tokens_seen": 24876640, + "router_z_loss_mlp": 1.078125, + "step": 311, + "time_per_iteration": 2.6586148738861084 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108932, + "balance_loss_mlp": 1.00326538, + "epoch": 0.060023085802231625, + "flos": 1574050294272.0, + "grad_norm": 0.018314702584398344, + "language_loss": 0.73442996, + "learning_rate": 0.00099763984076997, + "loss": 0.74551928, + "num_input_tokens_seen": 25110864, + "router_z_loss_mlp": 1.05859375, + "step": 312, + "time_per_iteration": 4.943182945251465 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01128012, + "balance_loss_mlp": 1.02101004, + "epoch": 0.06021546748749519, + "flos": 483628363008.0, + "grad_norm": 0.04396023920554742, + "language_loss": 0.97026515, + "learning_rate": 0.0009976095095472243, + "loss": 0.98154521, + "num_input_tokens_seen": 25179328, + "router_z_loss_mlp": 1.07226562, + "step": 313, + "time_per_iteration": 2.619016408920288 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01131165, + "balance_loss_mlp": 1.02425838, + "epoch": 0.06040784917275875, + "flos": 621424205568.0, + "grad_norm": 0.03687701456451143, + "language_loss": 0.97965562, + "learning_rate": 0.0009975789851353334, + "loss": 0.99096727, + "num_input_tokens_seen": 25254128, + "router_z_loss_mlp": 1.07128906, + "step": 314, + "time_per_iteration": 2.8331894874572754 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125758, + "balance_loss_mlp": 1.01980519, + "epoch": 0.06060023085802232, + "flos": 484603348224.0, + "grad_norm": 0.029408756794299912, + "language_loss": 1.00726843, + "learning_rate": 0.0009975482675461487, + "loss": 1.01852608, + "num_input_tokens_seen": 25324624, + "router_z_loss_mlp": 1.06152344, + "step": 315, + "time_per_iteration": 2.659079074859619 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125971, + "balance_loss_mlp": 1.02001762, + "epoch": 0.06079261254328588, + "flos": 582986598144.0, + "grad_norm": 0.027344501346145803, + "language_loss": 0.98408186, + "learning_rate": 0.0009975173567915952, + "loss": 0.99534154, + "num_input_tokens_seen": 25393648, + "router_z_loss_mlp": 1.06152344, + "step": 316, + "time_per_iteration": 2.6947872638702393 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01123736, + "balance_loss_mlp": 1.01873684, + "epoch": 0.060984994228549444, + "flos": 689009348352.0, + "grad_norm": 0.03553374767777348, + "language_loss": 0.92618632, + "learning_rate": 0.000997486252883674, + "loss": 0.93742371, + "num_input_tokens_seen": 25469152, + "router_z_loss_mlp": 1.05175781, + "step": 317, + "time_per_iteration": 2.8523428440093994 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01123139, + "balance_loss_mlp": 1.01861632, + "epoch": 0.061177375913813004, + "flos": 1316749104384.0, + "grad_norm": 0.03506621320439297, + "language_loss": 0.97693729, + "learning_rate": 0.0009974549558344602, + "loss": 0.98816866, + "num_input_tokens_seen": 25560944, + "router_z_loss_mlp": 1.046875, + "step": 318, + "time_per_iteration": 3.705524206161499 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121913, + "balance_loss_mlp": 1.01805806, + "epoch": 0.06136975759907657, + "flos": 575401532928.0, + "grad_norm": 0.03493031867187039, + "language_loss": 1.07333064, + "learning_rate": 0.000997423465656105, + "loss": 1.08454978, + "num_input_tokens_seen": 25631424, + "router_z_loss_mlp": 1.04003906, + "step": 319, + "time_per_iteration": 2.75838565826416 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01119126, + "balance_loss_mlp": 1.01546133, + "epoch": 0.06156213928434013, + "flos": 528565234944.0, + "grad_norm": 0.037170039701900144, + "language_loss": 1.04350638, + "learning_rate": 0.0009973917823608335, + "loss": 1.05469775, + "num_input_tokens_seen": 25698176, + "router_z_loss_mlp": 1.03808594, + "step": 320, + "time_per_iteration": 2.6494460105895996 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117676, + "balance_loss_mlp": 1.01458335, + "epoch": 0.061754520969603696, + "flos": 496590920448.0, + "grad_norm": 0.030464742512101767, + "language_loss": 0.98981547, + "learning_rate": 0.0009973599059609462, + "loss": 1.00099218, + "num_input_tokens_seen": 25773472, + "router_z_loss_mlp": 1.03222656, + "step": 321, + "time_per_iteration": 2.7119081020355225 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116635, + "balance_loss_mlp": 1.01344728, + "epoch": 0.061946902654867256, + "flos": 441044872704.0, + "grad_norm": 0.031106795532346753, + "language_loss": 0.97035432, + "learning_rate": 0.000997327836468819, + "loss": 0.98152065, + "num_input_tokens_seen": 25841088, + "router_z_loss_mlp": 1.03320312, + "step": 322, + "time_per_iteration": 2.641977071762085 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121262, + "balance_loss_mlp": 1.01836073, + "epoch": 0.06213928434013082, + "flos": 600043515648.0, + "grad_norm": 0.031546338171402045, + "language_loss": 1.00120687, + "learning_rate": 0.000997295573896902, + "loss": 1.01241946, + "num_input_tokens_seen": 25919424, + "router_z_loss_mlp": 1.03027344, + "step": 323, + "time_per_iteration": 2.825425624847412 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113502, + "balance_loss_mlp": 1.01126862, + "epoch": 0.06233166602539438, + "flos": 1453116961536.0, + "grad_norm": 0.009515746361157745, + "language_loss": 0.8119604, + "learning_rate": 0.000997263118257721, + "loss": 0.82309544, + "num_input_tokens_seen": 26135504, + "router_z_loss_mlp": 1.0234375, + "step": 324, + "time_per_iteration": 4.7325074672698975 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108894, + "balance_loss_mlp": 1.0074234, + "epoch": 0.06252404771065795, + "flos": 1466631651072.0, + "grad_norm": 0.010337204897298672, + "language_loss": 0.78571939, + "learning_rate": 0.0009972304695638763, + "loss": 0.79680836, + "num_input_tokens_seen": 26358880, + "router_z_loss_mlp": 1.015625, + "step": 325, + "time_per_iteration": 4.845058917999268 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01131262, + "balance_loss_mlp": 1.02950513, + "epoch": 0.06271642939592151, + "flos": 465236790528.0, + "grad_norm": 0.04479189972062717, + "language_loss": 0.94122899, + "learning_rate": 0.000997197627828043, + "loss": 0.95254159, + "num_input_tokens_seen": 26425888, + "router_z_loss_mlp": 1.01855469, + "step": 326, + "time_per_iteration": 2.531477689743042 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136139, + "balance_loss_mlp": 1.03466833, + "epoch": 0.06290881108118507, + "flos": 533432391168.0, + "grad_norm": 0.03210871152906133, + "language_loss": 0.89633012, + "learning_rate": 0.0009971645930629716, + "loss": 0.9076916, + "num_input_tokens_seen": 26500656, + "router_z_loss_mlp": 1.015625, + "step": 327, + "time_per_iteration": 2.766155481338501 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01131438, + "balance_loss_mlp": 1.0305388, + "epoch": 0.06310119276644863, + "flos": 674768516352.0, + "grad_norm": 0.03217671154768682, + "language_loss": 1.03418863, + "learning_rate": 0.0009971313652814872, + "loss": 1.0455029, + "num_input_tokens_seen": 26577408, + "router_z_loss_mlp": 1.00976562, + "step": 328, + "time_per_iteration": 2.818718433380127 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125209, + "balance_loss_mlp": 1.02440596, + "epoch": 0.0632935744517122, + "flos": 772051381248.0, + "grad_norm": 0.03902843256426295, + "language_loss": 1.00692391, + "learning_rate": 0.0009970979444964903, + "loss": 1.01817608, + "num_input_tokens_seen": 26652048, + "router_z_loss_mlp": 1.00878906, + "step": 329, + "time_per_iteration": 2.9847218990325928 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01119216, + "balance_loss_mlp": 1.01869905, + "epoch": 0.06348595613697576, + "flos": 562975556352.0, + "grad_norm": 0.040034835413812295, + "language_loss": 1.01797342, + "learning_rate": 0.0009970643307209556, + "loss": 1.02916563, + "num_input_tokens_seen": 26728192, + "router_z_loss_mlp": 1.00585938, + "step": 330, + "time_per_iteration": 2.817711353302002 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112644, + "balance_loss_mlp": 1.01250839, + "epoch": 0.06367833782223932, + "flos": 677384358144.0, + "grad_norm": 0.031424074947949916, + "language_loss": 0.98358697, + "learning_rate": 0.0009970305239679334, + "loss": 0.99471337, + "num_input_tokens_seen": 26798016, + "router_z_loss_mlp": 1.00195312, + "step": 331, + "time_per_iteration": 2.8216280937194824 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011128, + "balance_loss_mlp": 1.01247358, + "epoch": 0.06387071950750288, + "flos": 496349847552.0, + "grad_norm": 0.04016029313197435, + "language_loss": 1.03082633, + "learning_rate": 0.0009969965242505483, + "loss": 1.04195428, + "num_input_tokens_seen": 26867536, + "router_z_loss_mlp": 1.00390625, + "step": 332, + "time_per_iteration": 2.631326675415039 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113411, + "balance_loss_mlp": 1.01317954, + "epoch": 0.06406310119276645, + "flos": 534557075712.0, + "grad_norm": 0.03761595064373852, + "language_loss": 0.99054992, + "learning_rate": 0.0009969623315820007, + "loss": 1.00168395, + "num_input_tokens_seen": 26941216, + "router_z_loss_mlp": 1.00292969, + "step": 333, + "time_per_iteration": 2.6700048446655273 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113642, + "balance_loss_mlp": 1.01369655, + "epoch": 0.06425548287803001, + "flos": 457165688832.0, + "grad_norm": 0.0356255093132357, + "language_loss": 0.99075055, + "learning_rate": 0.000996927945975565, + "loss": 1.00188696, + "num_input_tokens_seen": 27006560, + "router_z_loss_mlp": 0.99951172, + "step": 334, + "time_per_iteration": 2.567225933074951 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112774, + "balance_loss_mlp": 1.01282871, + "epoch": 0.06444786456329357, + "flos": 561123762432.0, + "grad_norm": 0.034265188200332725, + "language_loss": 0.96451521, + "learning_rate": 0.0009968933674445906, + "loss": 0.97564298, + "num_input_tokens_seen": 27076400, + "router_z_loss_mlp": 0.99951172, + "step": 335, + "time_per_iteration": 2.6834452152252197 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110954, + "balance_loss_mlp": 1.01100898, + "epoch": 0.06464024624855713, + "flos": 667357449984.0, + "grad_norm": 0.026754476738251005, + "language_loss": 0.980811, + "learning_rate": 0.0009968585960025028, + "loss": 0.99192053, + "num_input_tokens_seen": 27158672, + "router_z_loss_mlp": 0.99853516, + "step": 336, + "time_per_iteration": 2.9675402641296387 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112488, + "balance_loss_mlp": 1.01368713, + "epoch": 0.0648326279338207, + "flos": 1524558303744.0, + "grad_norm": 0.027483244216433014, + "language_loss": 0.77653188, + "learning_rate": 0.0009968236316628006, + "loss": 0.78765678, + "num_input_tokens_seen": 27380592, + "router_z_loss_mlp": 0.98632812, + "step": 337, + "time_per_iteration": 4.80242133140564 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115066, + "balance_loss_mlp": 1.01540756, + "epoch": 0.06502500961908426, + "flos": 1145216581632.0, + "grad_norm": 0.03509421691107687, + "language_loss": 0.96500707, + "learning_rate": 0.0009967884744390583, + "loss": 0.97615772, + "num_input_tokens_seen": 27469984, + "router_z_loss_mlp": 0.99414062, + "step": 338, + "time_per_iteration": 3.517488479614258 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118827, + "balance_loss_mlp": 1.01945412, + "epoch": 0.06521739130434782, + "flos": 583694265600.0, + "grad_norm": 0.03507378265000135, + "language_loss": 0.97375119, + "learning_rate": 0.0009967531243449256, + "loss": 0.98493946, + "num_input_tokens_seen": 27543904, + "router_z_loss_mlp": 0.9921875, + "step": 339, + "time_per_iteration": 2.713430404663086 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01119012, + "balance_loss_mlp": 1.02002037, + "epoch": 0.06540977298961138, + "flos": 498659487744.0, + "grad_norm": 0.03215705196534619, + "language_loss": 1.04762673, + "learning_rate": 0.000996717581394126, + "loss": 1.05881691, + "num_input_tokens_seen": 27609888, + "router_z_loss_mlp": 0.98876953, + "step": 340, + "time_per_iteration": 2.5391135215759277 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116775, + "balance_loss_mlp": 1.01787901, + "epoch": 0.06560215467487496, + "flos": 543904506624.0, + "grad_norm": 0.030763143460584817, + "language_loss": 1.05044627, + "learning_rate": 0.000996681845600459, + "loss": 1.06161404, + "num_input_tokens_seen": 27683936, + "router_z_loss_mlp": 0.98632812, + "step": 341, + "time_per_iteration": 2.670804262161255 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118249, + "balance_loss_mlp": 1.01963949, + "epoch": 0.06579453636013852, + "flos": 414351819264.0, + "grad_norm": 0.040583240554979534, + "language_loss": 0.9744029, + "learning_rate": 0.0009966459169777982, + "loss": 0.98558539, + "num_input_tokens_seen": 27747840, + "router_z_loss_mlp": 0.98388672, + "step": 342, + "time_per_iteration": 2.5040364265441895 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115105, + "balance_loss_mlp": 1.01706719, + "epoch": 0.06598691804540208, + "flos": 561681730560.0, + "grad_norm": 0.04164342519277061, + "language_loss": 1.05655766, + "learning_rate": 0.0009966097955400924, + "loss": 1.06770873, + "num_input_tokens_seen": 27819728, + "router_z_loss_mlp": 0.97949219, + "step": 343, + "time_per_iteration": 2.666548728942871 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112413, + "balance_loss_mlp": 1.01532912, + "epoch": 0.06617929973066564, + "flos": 573302830080.0, + "grad_norm": 0.03386977599556249, + "language_loss": 0.99970496, + "learning_rate": 0.0009965734813013652, + "loss": 1.01082909, + "num_input_tokens_seen": 27893536, + "router_z_loss_mlp": 0.97070312, + "step": 344, + "time_per_iteration": 2.8448328971862793 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109743, + "balance_loss_mlp": 1.01261127, + "epoch": 0.06637168141592921, + "flos": 491465194752.0, + "grad_norm": 0.03376822413453626, + "language_loss": 1.02026749, + "learning_rate": 0.0009965369742757151, + "loss": 1.03136492, + "num_input_tokens_seen": 27960976, + "router_z_loss_mlp": 0.97119141, + "step": 345, + "time_per_iteration": 2.568521738052368 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108276, + "balance_loss_mlp": 1.01176453, + "epoch": 0.06656406310119277, + "flos": 1081039518720.0, + "grad_norm": 0.03449730062562062, + "language_loss": 0.98245382, + "learning_rate": 0.0009965002744773152, + "loss": 0.99353665, + "num_input_tokens_seen": 28050864, + "router_z_loss_mlp": 0.96484375, + "step": 346, + "time_per_iteration": 3.501471519470215 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109602, + "balance_loss_mlp": 1.01347148, + "epoch": 0.06675644478645633, + "flos": 514723923456.0, + "grad_norm": 0.029121068034632647, + "language_loss": 0.95998263, + "learning_rate": 0.0009964633819204139, + "loss": 0.97107863, + "num_input_tokens_seen": 28122448, + "router_z_loss_mlp": 0.9609375, + "step": 347, + "time_per_iteration": 2.6675100326538086 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093636, + "balance_loss_mlp": 0.9986496, + "epoch": 0.06694882647171989, + "flos": 1450537079808.0, + "grad_norm": 0.008592618933675954, + "language_loss": 0.81801116, + "learning_rate": 0.0009964262966193338, + "loss": 0.82894754, + "num_input_tokens_seen": 28350352, + "router_z_loss_mlp": 0.94921875, + "step": 348, + "time_per_iteration": 4.92915415763855 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093124, + "balance_loss_mlp": 0.99832916, + "epoch": 0.06714120815698346, + "flos": 1555400152320.0, + "grad_norm": 0.006174818833869298, + "language_loss": 0.75153887, + "learning_rate": 0.000996389018588473, + "loss": 0.76247013, + "num_input_tokens_seen": 28585584, + "router_z_loss_mlp": 0.94726562, + "step": 349, + "time_per_iteration": 4.8783159255981445 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112042, + "balance_loss_mlp": 1.01719952, + "epoch": 0.06733358984224702, + "flos": 881617326336.0, + "grad_norm": 0.039044792628629706, + "language_loss": 0.95966816, + "learning_rate": 0.000996351547842304, + "loss": 0.97078854, + "num_input_tokens_seen": 28672512, + "router_z_loss_mlp": 0.94775391, + "step": 350, + "time_per_iteration": 3.151158094406128 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106972, + "balance_loss_mlp": 1.01222503, + "epoch": 0.06752597152751058, + "flos": 519918668544.0, + "grad_norm": 0.04011951728876299, + "language_loss": 0.94198334, + "learning_rate": 0.0009963138843953744, + "loss": 0.953053, + "num_input_tokens_seen": 28741520, + "router_z_loss_mlp": 0.94677734, + "step": 351, + "time_per_iteration": 2.6077194213867188 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111271, + "balance_loss_mlp": 1.01661849, + "epoch": 0.06771835321277414, + "flos": 540883308288.0, + "grad_norm": 0.02897454745239974, + "language_loss": 0.98297268, + "learning_rate": 0.000996276028262306, + "loss": 0.99408543, + "num_input_tokens_seen": 28814912, + "router_z_loss_mlp": 0.94580078, + "step": 352, + "time_per_iteration": 2.8440346717834473 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115128, + "balance_loss_mlp": 1.02052331, + "epoch": 0.0679107348980377, + "flos": 461615827968.0, + "grad_norm": 0.03358261828070724, + "language_loss": 1.05270672, + "learning_rate": 0.0009962379794577964, + "loss": 1.06385791, + "num_input_tokens_seen": 28882192, + "router_z_loss_mlp": 0.9453125, + "step": 353, + "time_per_iteration": 2.6153147220611572 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115897, + "balance_loss_mlp": 1.02129257, + "epoch": 0.06810311658330127, + "flos": 637208684544.0, + "grad_norm": 0.03193767698980152, + "language_loss": 0.94629884, + "learning_rate": 0.000996199737996617, + "loss": 0.95745778, + "num_input_tokens_seen": 28968576, + "router_z_loss_mlp": 0.9453125, + "step": 354, + "time_per_iteration": 2.9557363986968994 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01114833, + "balance_loss_mlp": 1.0208956, + "epoch": 0.06829549826856483, + "flos": 465627562752.0, + "grad_norm": 0.034421374529713736, + "language_loss": 1.03816652, + "learning_rate": 0.0009961613038936149, + "loss": 1.04931474, + "num_input_tokens_seen": 29036160, + "router_z_loss_mlp": 0.93847656, + "step": 355, + "time_per_iteration": 2.583648204803467 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112591, + "balance_loss_mlp": 1.01879704, + "epoch": 0.06848787995382839, + "flos": 635897362176.0, + "grad_norm": 0.027271592740405557, + "language_loss": 0.95725697, + "learning_rate": 0.000996122677163711, + "loss": 0.96838284, + "num_input_tokens_seen": 29112048, + "router_z_loss_mlp": 0.93701172, + "step": 356, + "time_per_iteration": 2.7997536659240723 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113642, + "balance_loss_mlp": 1.02022934, + "epoch": 0.06868026163909195, + "flos": 807781773312.0, + "grad_norm": 0.036098266403844226, + "language_loss": 1.02058005, + "learning_rate": 0.000996083857821902, + "loss": 1.03171647, + "num_input_tokens_seen": 29190960, + "router_z_loss_mlp": 0.93310547, + "step": 357, + "time_per_iteration": 3.0117554664611816 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113245, + "balance_loss_mlp": 1.01978505, + "epoch": 0.06887264332435553, + "flos": 440152512768.0, + "grad_norm": 0.03587140172627376, + "language_loss": 1.00045025, + "learning_rate": 0.0009960448458832588, + "loss": 1.01158273, + "num_input_tokens_seen": 29262832, + "router_z_loss_mlp": 0.93359375, + "step": 358, + "time_per_iteration": 2.6948373317718506 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110172, + "balance_loss_mlp": 1.01714087, + "epoch": 0.06906502500961909, + "flos": 485786358528.0, + "grad_norm": 0.028895953236024122, + "language_loss": 0.99980301, + "learning_rate": 0.000996005641362927, + "loss": 1.01090467, + "num_input_tokens_seen": 29329552, + "router_z_loss_mlp": 0.92919922, + "step": 359, + "time_per_iteration": 2.600889205932617 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110333, + "balance_loss_mlp": 1.01715922, + "epoch": 0.06925740669488265, + "flos": 734886212352.0, + "grad_norm": 0.03093408458560108, + "language_loss": 1.02453041, + "learning_rate": 0.0009959662442761274, + "loss": 1.0356338, + "num_input_tokens_seen": 29410784, + "router_z_loss_mlp": 0.93066406, + "step": 360, + "time_per_iteration": 2.9324746131896973 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107676, + "balance_loss_mlp": 1.01445436, + "epoch": 0.0694497883801462, + "flos": 553571745024.0, + "grad_norm": 0.03028505188811882, + "language_loss": 0.95860314, + "learning_rate": 0.000995926654638155, + "loss": 0.96967983, + "num_input_tokens_seen": 29486992, + "router_z_loss_mlp": 0.93115234, + "step": 361, + "time_per_iteration": 2.8280868530273438 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104746, + "balance_loss_mlp": 1.01157248, + "epoch": 0.06964217006540978, + "flos": 679244900352.0, + "grad_norm": 0.03450824772288923, + "language_loss": 0.98644811, + "learning_rate": 0.00099588687246438, + "loss": 0.99749553, + "num_input_tokens_seen": 29557232, + "router_z_loss_mlp": 0.93066406, + "step": 362, + "time_per_iteration": 2.8108932971954346 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108438, + "balance_loss_mlp": 1.01535928, + "epoch": 0.06983455175067334, + "flos": 525261167616.0, + "grad_norm": 0.03621302361184023, + "language_loss": 1.06105995, + "learning_rate": 0.0009958468977702471, + "loss": 1.07214439, + "num_input_tokens_seen": 29625344, + "router_z_loss_mlp": 0.9296875, + "step": 363, + "time_per_iteration": 2.6087372303009033 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135422, + "balance_loss_mlp": 1.04272461, + "epoch": 0.0700269334359369, + "flos": 1580176283136.0, + "grad_norm": 0.03651647631774479, + "language_loss": 0.79734707, + "learning_rate": 0.0009958067305712761, + "loss": 0.80870128, + "num_input_tokens_seen": 29843664, + "router_z_loss_mlp": 0.92578125, + "step": 364, + "time_per_iteration": 4.806072235107422 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104861, + "balance_loss_mlp": 1.01254511, + "epoch": 0.07021931512120046, + "flos": 1014858050304.0, + "grad_norm": 0.04058448706036458, + "language_loss": 0.94071019, + "learning_rate": 0.0009957663708830612, + "loss": 0.9517588, + "num_input_tokens_seen": 29927152, + "router_z_loss_mlp": 0.921875, + "step": 365, + "time_per_iteration": 3.30859637260437 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110656, + "balance_loss_mlp": 1.01862633, + "epoch": 0.07041169680646403, + "flos": 824432367360.0, + "grad_norm": 0.04186203278400794, + "language_loss": 0.98041129, + "learning_rate": 0.0009957258187212714, + "loss": 0.9915179, + "num_input_tokens_seen": 30004928, + "router_z_loss_mlp": 0.91894531, + "step": 366, + "time_per_iteration": 3.00058913230896 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097015, + "balance_loss_mlp": 1.00565338, + "epoch": 0.07060407849172759, + "flos": 1417293250560.0, + "grad_norm": 0.011820269564466843, + "language_loss": 0.79194862, + "learning_rate": 0.0009956850741016502, + "loss": 0.80291873, + "num_input_tokens_seen": 30230256, + "router_z_loss_mlp": 0.91210938, + "step": 367, + "time_per_iteration": 4.794500827789307 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113703, + "balance_loss_mlp": 1.02186394, + "epoch": 0.07079646017699115, + "flos": 513942379008.0, + "grad_norm": 0.041641563183133855, + "language_loss": 0.94691038, + "learning_rate": 0.0009956441370400167, + "loss": 0.95804739, + "num_input_tokens_seen": 30301200, + "router_z_loss_mlp": 0.91699219, + "step": 368, + "time_per_iteration": 2.63948917388916 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0111577, + "balance_loss_mlp": 1.02436066, + "epoch": 0.07098884186225471, + "flos": 541549179648.0, + "grad_norm": 0.03426405251061256, + "language_loss": 1.00885093, + "learning_rate": 0.0009956030075522636, + "loss": 1.02000868, + "num_input_tokens_seen": 30377024, + "router_z_loss_mlp": 0.91259766, + "step": 369, + "time_per_iteration": 2.74157452583313 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107449, + "balance_loss_mlp": 1.01613438, + "epoch": 0.07118122354751828, + "flos": 549739845120.0, + "grad_norm": 0.030296400642036637, + "language_loss": 1.0031743, + "learning_rate": 0.0009955616856543587, + "loss": 1.01424885, + "num_input_tokens_seen": 30448896, + "router_z_loss_mlp": 0.91162109, + "step": 370, + "time_per_iteration": 2.6210479736328125 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105786, + "balance_loss_mlp": 1.01475775, + "epoch": 0.07137360523278184, + "flos": 622077437952.0, + "grad_norm": 0.029509682347833893, + "language_loss": 0.92550498, + "learning_rate": 0.0009955201713623448, + "loss": 0.93656284, + "num_input_tokens_seen": 30523584, + "router_z_loss_mlp": 0.90869141, + "step": 371, + "time_per_iteration": 2.757277011871338 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092491, + "balance_loss_mlp": 1.00284576, + "epoch": 0.0715659869180454, + "flos": 1505976202752.0, + "grad_norm": 0.005566886599578838, + "language_loss": 0.76672721, + "learning_rate": 0.000995478464692339, + "loss": 0.77765214, + "num_input_tokens_seen": 30757920, + "router_z_loss_mlp": 0.89648438, + "step": 372, + "time_per_iteration": 4.947838306427002 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01126764, + "balance_loss_mlp": 1.0361172, + "epoch": 0.07175836860330896, + "flos": 496482050304.0, + "grad_norm": 0.040308561934975694, + "language_loss": 1.05629396, + "learning_rate": 0.0009954365656605333, + "loss": 1.06756163, + "num_input_tokens_seen": 30824960, + "router_z_loss_mlp": 0.90478516, + "step": 373, + "time_per_iteration": 2.5537302494049072 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124141, + "balance_loss_mlp": 1.03416181, + "epoch": 0.07195075028857253, + "flos": 787082505984.0, + "grad_norm": 0.034789914575730614, + "language_loss": 0.98912442, + "learning_rate": 0.0009953944742831947, + "loss": 1.00036585, + "num_input_tokens_seen": 30902224, + "router_z_loss_mlp": 0.89892578, + "step": 374, + "time_per_iteration": 2.976074695587158 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106044, + "balance_loss_mlp": 1.01678061, + "epoch": 0.0721431319738361, + "flos": 594347182848.0, + "grad_norm": 0.029628456658550576, + "language_loss": 1.02558136, + "learning_rate": 0.0009953521905766642, + "loss": 1.03664172, + "num_input_tokens_seen": 30984784, + "router_z_loss_mlp": 0.89404297, + "step": 375, + "time_per_iteration": 2.9556005001068115 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101275, + "balance_loss_mlp": 1.01234496, + "epoch": 0.07233551365909965, + "flos": 549329630976.0, + "grad_norm": 0.034208323574026145, + "language_loss": 1.01073325, + "learning_rate": 0.0009953097145573577, + "loss": 1.02174592, + "num_input_tokens_seen": 31055376, + "router_z_loss_mlp": 0.89111328, + "step": 376, + "time_per_iteration": 2.6449482440948486 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106433, + "balance_loss_mlp": 1.01759815, + "epoch": 0.07252789534436321, + "flos": 959169106176.0, + "grad_norm": 0.031040198427254525, + "language_loss": 0.98588479, + "learning_rate": 0.000995267046241766, + "loss": 0.99694908, + "num_input_tokens_seen": 31144944, + "router_z_loss_mlp": 0.89013672, + "step": 377, + "time_per_iteration": 3.2564361095428467 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106989, + "balance_loss_mlp": 1.01877415, + "epoch": 0.07272027702962677, + "flos": 508656260352.0, + "grad_norm": 0.029229214223645432, + "language_loss": 0.98238575, + "learning_rate": 0.0009952241856464547, + "loss": 0.99345565, + "num_input_tokens_seen": 31213392, + "router_z_loss_mlp": 0.88378906, + "step": 378, + "time_per_iteration": 2.5843191146850586 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108111, + "balance_loss_mlp": 1.02013505, + "epoch": 0.07291265871489035, + "flos": 613552380672.0, + "grad_norm": 0.03194005050639913, + "language_loss": 1.05557346, + "learning_rate": 0.0009951811327880632, + "loss": 1.06665444, + "num_input_tokens_seen": 31289840, + "router_z_loss_mlp": 0.88134766, + "step": 379, + "time_per_iteration": 2.727449655532837 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107323, + "balance_loss_mlp": 1.01934636, + "epoch": 0.0731050404001539, + "flos": 496742565120.0, + "grad_norm": 0.03092115392183015, + "language_loss": 0.98400533, + "learning_rate": 0.0009951378876833063, + "loss": 0.99507862, + "num_input_tokens_seen": 31357600, + "router_z_loss_mlp": 0.88134766, + "step": 380, + "time_per_iteration": 2.5320205688476562 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101258, + "balance_loss_mlp": 1.01332915, + "epoch": 0.07329742208541747, + "flos": 641130991104.0, + "grad_norm": 0.032065094183830696, + "language_loss": 1.04703462, + "learning_rate": 0.0009950944503489736, + "loss": 1.05804706, + "num_input_tokens_seen": 31428896, + "router_z_loss_mlp": 0.88085938, + "step": 381, + "time_per_iteration": 2.7422876358032227 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102134, + "balance_loss_mlp": 1.01453876, + "epoch": 0.07348980377068103, + "flos": 817741607424.0, + "grad_norm": 0.030510114485064205, + "language_loss": 0.99112171, + "learning_rate": 0.0009950508208019285, + "loss": 1.00214303, + "num_input_tokens_seen": 31507424, + "router_z_loss_mlp": 0.87744141, + "step": 382, + "time_per_iteration": 3.046475410461426 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101212, + "balance_loss_mlp": 1.01323569, + "epoch": 0.0736821854559446, + "flos": 509670129408.0, + "grad_norm": 0.035756321159612754, + "language_loss": 1.03789318, + "learning_rate": 0.0009950069990591096, + "loss": 1.04890537, + "num_input_tokens_seen": 31576768, + "router_z_loss_mlp": 0.88134766, + "step": 383, + "time_per_iteration": 2.620088577270508 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113144, + "balance_loss_mlp": 1.02674103, + "epoch": 0.07387456714120816, + "flos": 1558050987264.0, + "grad_norm": 0.043940663043905655, + "language_loss": 0.76401371, + "learning_rate": 0.0009949629851375302, + "loss": 0.77514511, + "num_input_tokens_seen": 31797312, + "router_z_loss_mlp": 0.86523438, + "step": 384, + "time_per_iteration": 4.87653374671936 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121594, + "balance_loss_mlp": 1.03299809, + "epoch": 0.07406694882647172, + "flos": 526644421632.0, + "grad_norm": 0.039102279996233, + "language_loss": 0.96614265, + "learning_rate": 0.0009949187790542777, + "loss": 0.97735858, + "num_input_tokens_seen": 31869568, + "router_z_loss_mlp": 0.88769531, + "step": 385, + "time_per_iteration": 2.734100580215454 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112471, + "balance_loss_mlp": 1.03625691, + "epoch": 0.07425933051173528, + "flos": 498824738304.0, + "grad_norm": 0.03701278047407747, + "language_loss": 0.92462552, + "learning_rate": 0.0009948743808265148, + "loss": 0.93587261, + "num_input_tokens_seen": 31941712, + "router_z_loss_mlp": 0.88623047, + "step": 386, + "time_per_iteration": 2.7154581546783447 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125455, + "balance_loss_mlp": 1.03704965, + "epoch": 0.07445171219699885, + "flos": 506057915136.0, + "grad_norm": 0.06663512882119103, + "language_loss": 1.02268195, + "learning_rate": 0.0009948297904714782, + "loss": 1.0339365, + "num_input_tokens_seen": 32015232, + "router_z_loss_mlp": 0.88574219, + "step": 387, + "time_per_iteration": 2.68532133102417 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112575, + "balance_loss_mlp": 1.03777313, + "epoch": 0.07464409388226241, + "flos": 555117337344.0, + "grad_norm": 0.036483324457394946, + "language_loss": 0.94151849, + "learning_rate": 0.0009947850080064796, + "loss": 0.95277596, + "num_input_tokens_seen": 32094640, + "router_z_loss_mlp": 0.88134766, + "step": 388, + "time_per_iteration": 2.789128303527832 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121204, + "balance_loss_mlp": 1.03370392, + "epoch": 0.07483647556752597, + "flos": 778275546624.0, + "grad_norm": 0.0421926900222792, + "language_loss": 0.99476451, + "learning_rate": 0.0009947400334489047, + "loss": 1.00597644, + "num_input_tokens_seen": 32176640, + "router_z_loss_mlp": 0.87646484, + "step": 389, + "time_per_iteration": 2.9937496185302734 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011085, + "balance_loss_mlp": 1.02133441, + "epoch": 0.07502885725278953, + "flos": 613682638080.0, + "grad_norm": 0.0417493031738284, + "language_loss": 0.90741575, + "learning_rate": 0.0009946948668162145, + "loss": 0.91850078, + "num_input_tokens_seen": 32246704, + "router_z_loss_mlp": 0.87304688, + "step": 390, + "time_per_iteration": 2.7264010906219482 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101473, + "balance_loss_mlp": 1.01502275, + "epoch": 0.0752212389380531, + "flos": 689856021504.0, + "grad_norm": 0.03330838563423677, + "language_loss": 0.95001, + "learning_rate": 0.0009946495081259441, + "loss": 0.9610247, + "num_input_tokens_seen": 32320032, + "router_z_loss_mlp": 0.86572266, + "step": 391, + "time_per_iteration": 2.832472085952759 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097898, + "balance_loss_mlp": 1.01182938, + "epoch": 0.07541362062331666, + "flos": 767052022272.0, + "grad_norm": 0.03859494705227578, + "language_loss": 0.99014449, + "learning_rate": 0.0009946039573957035, + "loss": 1.00112355, + "num_input_tokens_seen": 32398144, + "router_z_loss_mlp": 0.86181641, + "step": 392, + "time_per_iteration": 2.925933361053467 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101692, + "balance_loss_mlp": 1.01576602, + "epoch": 0.07560600230858022, + "flos": 589909682688.0, + "grad_norm": 0.039112379024015986, + "language_loss": 0.95485294, + "learning_rate": 0.000994558214643177, + "loss": 0.9658699, + "num_input_tokens_seen": 32471984, + "router_z_loss_mlp": 0.86035156, + "step": 393, + "time_per_iteration": 2.763448476791382 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095538, + "balance_loss_mlp": 1.00961244, + "epoch": 0.07579838399384378, + "flos": 751146034176.0, + "grad_norm": 0.03818992224284351, + "language_loss": 0.96862066, + "learning_rate": 0.000994512279886123, + "loss": 0.97957599, + "num_input_tokens_seen": 32550176, + "router_z_loss_mlp": 0.86035156, + "step": 394, + "time_per_iteration": 3.143615245819092 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101397, + "balance_loss_mlp": 1.01561391, + "epoch": 0.07599076567910736, + "flos": 524551554816.0, + "grad_norm": 0.030240351127206026, + "language_loss": 0.96659988, + "learning_rate": 0.0009944661531423758, + "loss": 0.97761387, + "num_input_tokens_seen": 32620768, + "router_z_loss_mlp": 0.85888672, + "step": 395, + "time_per_iteration": 2.6748764514923096 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107513, + "balance_loss_mlp": 1.02206361, + "epoch": 0.07618314736437092, + "flos": 552186545664.0, + "grad_norm": 0.03358451790414236, + "language_loss": 0.95614338, + "learning_rate": 0.000994419834429843, + "loss": 0.96721858, + "num_input_tokens_seen": 32693472, + "router_z_loss_mlp": 0.85546875, + "step": 396, + "time_per_iteration": 2.6525089740753174 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105372, + "balance_loss_mlp": 1.01987493, + "epoch": 0.07637552904963447, + "flos": 699433831680.0, + "grad_norm": 0.04315212632526892, + "language_loss": 1.00552011, + "learning_rate": 0.0009943733237665069, + "loss": 1.01657379, + "num_input_tokens_seen": 32764976, + "router_z_loss_mlp": 0.85595703, + "step": 397, + "time_per_iteration": 2.8678157329559326 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097353, + "balance_loss_mlp": 1.01218963, + "epoch": 0.07656791073489803, + "flos": 580636128768.0, + "grad_norm": 0.029538416941692198, + "language_loss": 0.99224108, + "learning_rate": 0.0009943266211704248, + "loss": 1.0032146, + "num_input_tokens_seen": 32853104, + "router_z_loss_mlp": 0.85253906, + "step": 398, + "time_per_iteration": 3.0023248195648193 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099387, + "balance_loss_mlp": 1.01460528, + "epoch": 0.0767602924201616, + "flos": 418037910528.0, + "grad_norm": 0.03167845871290285, + "language_loss": 1.01143491, + "learning_rate": 0.000994279726659728, + "loss": 1.02242875, + "num_input_tokens_seen": 32919376, + "router_z_loss_mlp": 0.84863281, + "step": 399, + "time_per_iteration": 2.527693271636963 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107007, + "balance_loss_mlp": 1.02246368, + "epoch": 0.07695267410542517, + "flos": 483888877824.0, + "grad_norm": 0.03414294034973106, + "language_loss": 0.9968133, + "learning_rate": 0.0009942326402526231, + "loss": 1.00788331, + "num_input_tokens_seen": 32988064, + "router_z_loss_mlp": 0.84619141, + "step": 400, + "time_per_iteration": 2.5610573291778564 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112837, + "balance_loss_mlp": 1.02848434, + "epoch": 0.07714505579068873, + "flos": 532027749888.0, + "grad_norm": 0.030264499227930883, + "language_loss": 0.97403878, + "learning_rate": 0.0009941853619673902, + "loss": 0.98516715, + "num_input_tokens_seen": 33059024, + "router_z_loss_mlp": 0.84423828, + "step": 401, + "time_per_iteration": 2.680175542831421 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107236, + "balance_loss_mlp": 1.02302694, + "epoch": 0.07733743747595229, + "flos": 806440315392.0, + "grad_norm": 0.03979329481069023, + "language_loss": 1.01160502, + "learning_rate": 0.0009941378918223844, + "loss": 1.02267742, + "num_input_tokens_seen": 33137712, + "router_z_loss_mlp": 0.84277344, + "step": 402, + "time_per_iteration": 3.0908427238464355 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098686, + "balance_loss_mlp": 1.01447606, + "epoch": 0.07752981916121585, + "flos": 623614281984.0, + "grad_norm": 0.03310929598543939, + "language_loss": 0.93567806, + "learning_rate": 0.0009940902298360354, + "loss": 0.94666493, + "num_input_tokens_seen": 33211296, + "router_z_loss_mlp": 0.84277344, + "step": 403, + "time_per_iteration": 2.7569308280944824 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094976, + "balance_loss_mlp": 1.01048076, + "epoch": 0.07772220084647942, + "flos": 729543713280.0, + "grad_norm": 0.03955766616265138, + "language_loss": 1.03173304, + "learning_rate": 0.0009940423760268473, + "loss": 1.04268289, + "num_input_tokens_seen": 33283632, + "router_z_loss_mlp": 0.84570312, + "step": 404, + "time_per_iteration": 2.8456103801727295 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098252, + "balance_loss_mlp": 1.01375628, + "epoch": 0.07791458253174298, + "flos": 556469488896.0, + "grad_norm": 0.042207617679060144, + "language_loss": 0.96929657, + "learning_rate": 0.0009939943304133982, + "loss": 0.98027909, + "num_input_tokens_seen": 33350704, + "router_z_loss_mlp": 0.84570312, + "step": 405, + "time_per_iteration": 2.615145444869995 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104796, + "balance_loss_mlp": 1.02044404, + "epoch": 0.07810696421700654, + "flos": 554235671040.0, + "grad_norm": 0.04104566792755741, + "language_loss": 1.03659868, + "learning_rate": 0.0009939460930143416, + "loss": 1.04764676, + "num_input_tokens_seen": 33416272, + "router_z_loss_mlp": 0.84423828, + "step": 406, + "time_per_iteration": 2.6304614543914795 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110157, + "balance_loss_mlp": 1.01745594, + "epoch": 0.0782993459022701, + "flos": 651879172608.0, + "grad_norm": 0.0317151282671847, + "language_loss": 0.97752666, + "learning_rate": 0.0009938976638484043, + "loss": 0.98854232, + "num_input_tokens_seen": 33501824, + "router_z_loss_mlp": 0.84179688, + "step": 407, + "time_per_iteration": 2.9032115936279297 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109564, + "balance_loss_mlp": 1.01205039, + "epoch": 0.07849172758753367, + "flos": 497161527552.0, + "grad_norm": 0.04013855375776475, + "language_loss": 0.97246277, + "learning_rate": 0.0009938490429343887, + "loss": 0.98341918, + "num_input_tokens_seen": 33571456, + "router_z_loss_mlp": 0.83642578, + "step": 408, + "time_per_iteration": 2.5688796043395996 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095618, + "balance_loss_mlp": 1.01236188, + "epoch": 0.07868410927279723, + "flos": 579076930560.0, + "grad_norm": 0.0397915036848884, + "language_loss": 0.97571141, + "learning_rate": 0.0009938002302911709, + "loss": 0.98666751, + "num_input_tokens_seen": 33646320, + "router_z_loss_mlp": 0.83300781, + "step": 409, + "time_per_iteration": 2.75036883354187 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096533, + "balance_loss_mlp": 1.01365864, + "epoch": 0.07887649095806079, + "flos": 524067463680.0, + "grad_norm": 0.03678821175613874, + "language_loss": 1.00230122, + "learning_rate": 0.0009937512259377015, + "loss": 1.01326644, + "num_input_tokens_seen": 33717664, + "router_z_loss_mlp": 0.82910156, + "step": 410, + "time_per_iteration": 2.6584975719451904 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110197, + "balance_loss_mlp": 1.01938236, + "epoch": 0.07906887264332435, + "flos": 558438901248.0, + "grad_norm": 0.04956969404692801, + "language_loss": 0.989124, + "learning_rate": 0.000993702029893006, + "loss": 1.00014377, + "num_input_tokens_seen": 33794720, + "router_z_loss_mlp": 0.82617188, + "step": 411, + "time_per_iteration": 2.7666263580322266 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102247, + "balance_loss_mlp": 1.0196116, + "epoch": 0.07926125432858792, + "flos": 823364063232.0, + "grad_norm": 0.03322797228086769, + "language_loss": 0.99091381, + "learning_rate": 0.0009936526421761838, + "loss": 1.00193632, + "num_input_tokens_seen": 33868304, + "router_z_loss_mlp": 0.82666016, + "step": 412, + "time_per_iteration": 3.0222113132476807 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102099, + "balance_loss_mlp": 1.01955855, + "epoch": 0.07945363601385148, + "flos": 563394518784.0, + "grad_norm": 0.04210923401756456, + "language_loss": 1.01423764, + "learning_rate": 0.000993603062806409, + "loss": 1.02525866, + "num_input_tokens_seen": 33937424, + "router_z_loss_mlp": 0.82568359, + "step": 413, + "time_per_iteration": 2.713226079940796 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100317, + "balance_loss_mlp": 1.0176332, + "epoch": 0.07964601769911504, + "flos": 518885357568.0, + "grad_norm": 0.041362228888401006, + "language_loss": 1.04903626, + "learning_rate": 0.0009935532918029298, + "loss": 1.06003952, + "num_input_tokens_seen": 34003984, + "router_z_loss_mlp": 0.82714844, + "step": 414, + "time_per_iteration": 2.59602689743042 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095709, + "balance_loss_mlp": 1.01326394, + "epoch": 0.0798383993843786, + "flos": 540301040640.0, + "grad_norm": 0.030384950019726516, + "language_loss": 0.97377884, + "learning_rate": 0.0009935033291850694, + "loss": 0.98473597, + "num_input_tokens_seen": 34072400, + "router_z_loss_mlp": 0.82470703, + "step": 415, + "time_per_iteration": 2.6417808532714844 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094851, + "balance_loss_mlp": 1.013026, + "epoch": 0.08003078106964218, + "flos": 486122695680.0, + "grad_norm": 0.03579523867672845, + "language_loss": 1.00004411, + "learning_rate": 0.0009934531749722247, + "loss": 1.01099253, + "num_input_tokens_seen": 34142448, + "router_z_loss_mlp": 0.81835938, + "step": 416, + "time_per_iteration": 2.593029737472534 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095566, + "balance_loss_mlp": 1.01383638, + "epoch": 0.08022316275490574, + "flos": 519276129792.0, + "grad_norm": 0.0354518245662521, + "language_loss": 0.98370755, + "learning_rate": 0.0009934028291838672, + "loss": 0.99466318, + "num_input_tokens_seen": 34214080, + "router_z_loss_mlp": 0.81738281, + "step": 417, + "time_per_iteration": 2.7351250648498535 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096643, + "balance_loss_mlp": 1.01496112, + "epoch": 0.0804155444401693, + "flos": 495047273472.0, + "grad_norm": 0.032920982329526526, + "language_loss": 0.93668723, + "learning_rate": 0.0009933522918395433, + "loss": 0.94765365, + "num_input_tokens_seen": 34288448, + "router_z_loss_mlp": 0.81689453, + "step": 418, + "time_per_iteration": 2.6427221298217773 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01114799, + "balance_loss_mlp": 1.03316498, + "epoch": 0.08060792612543285, + "flos": 1584856801536.0, + "grad_norm": 0.029973653623271358, + "language_loss": 0.782511, + "learning_rate": 0.0009933015629588731, + "loss": 0.79365897, + "num_input_tokens_seen": 34521632, + "router_z_loss_mlp": 0.81640625, + "step": 419, + "time_per_iteration": 4.8632917404174805 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096521, + "balance_loss_mlp": 1.01569724, + "epoch": 0.08080030781069643, + "flos": 526359607296.0, + "grad_norm": 0.04163447523548115, + "language_loss": 1.12134457, + "learning_rate": 0.000993250642561551, + "loss": 1.13230991, + "num_input_tokens_seen": 34590080, + "router_z_loss_mlp": 0.80810547, + "step": 420, + "time_per_iteration": 2.608396053314209 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109578, + "balance_loss_mlp": 1.01505113, + "epoch": 0.08099268949595999, + "flos": 547757793792.0, + "grad_norm": 0.04746808509414602, + "language_loss": 0.97398257, + "learning_rate": 0.0009931995306673466, + "loss": 0.98494035, + "num_input_tokens_seen": 34660512, + "router_z_loss_mlp": 0.80712891, + "step": 421, + "time_per_iteration": 2.7215850353240967 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097341, + "balance_loss_mlp": 1.01670778, + "epoch": 0.08118507118122355, + "flos": 511374169344.0, + "grad_norm": 0.04020038552675014, + "language_loss": 1.02514148, + "learning_rate": 0.000993148227296103, + "loss": 1.03611493, + "num_input_tokens_seen": 34732016, + "router_z_loss_mlp": 0.80615234, + "step": 422, + "time_per_iteration": 2.625366449356079 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010968, + "balance_loss_mlp": 1.01607168, + "epoch": 0.08137745286648711, + "flos": 722002389504.0, + "grad_norm": 0.03556088777041087, + "language_loss": 0.90137196, + "learning_rate": 0.000993096732467738, + "loss": 0.91233999, + "num_input_tokens_seen": 34810416, + "router_z_loss_mlp": 0.80712891, + "step": 423, + "time_per_iteration": 2.9795689582824707 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092343, + "balance_loss_mlp": 1.0118531, + "epoch": 0.08156983455175067, + "flos": 680818682880.0, + "grad_norm": 0.04422604915428747, + "language_loss": 0.99073571, + "learning_rate": 0.0009930450462022435, + "loss": 1.00165915, + "num_input_tokens_seen": 34879504, + "router_z_loss_mlp": 0.8046875, + "step": 424, + "time_per_iteration": 2.879889726638794 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087181, + "balance_loss_mlp": 1.00783539, + "epoch": 0.08176221623701424, + "flos": 1456591137024.0, + "grad_norm": 0.006453860192715822, + "language_loss": 0.79189807, + "learning_rate": 0.0009929931685196862, + "loss": 0.8027699, + "num_input_tokens_seen": 35111584, + "router_z_loss_mlp": 0.79296875, + "step": 425, + "time_per_iteration": 4.908784627914429 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095957, + "balance_loss_mlp": 1.01541877, + "epoch": 0.0819545979222778, + "flos": 1558885044480.0, + "grad_norm": 0.04271462185638088, + "language_loss": 0.96659774, + "learning_rate": 0.0009929410994402065, + "loss": 0.9775573, + "num_input_tokens_seen": 35205664, + "router_z_loss_mlp": 0.80517578, + "step": 426, + "time_per_iteration": 3.7266876697540283 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100573, + "balance_loss_mlp": 1.02013052, + "epoch": 0.08214697960754136, + "flos": 513801427968.0, + "grad_norm": 0.040597463537132866, + "language_loss": 1.00489211, + "learning_rate": 0.0009928888389840196, + "loss": 1.01589799, + "num_input_tokens_seen": 35280144, + "router_z_loss_mlp": 0.80419922, + "step": 427, + "time_per_iteration": 2.695010185241699 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098577, + "balance_loss_mlp": 1.01822996, + "epoch": 0.08233936129280492, + "flos": 596222309376.0, + "grad_norm": 0.03622779747664415, + "language_loss": 1.02622843, + "learning_rate": 0.0009928363871714147, + "loss": 1.03721428, + "num_input_tokens_seen": 35344768, + "router_z_loss_mlp": 0.80322266, + "step": 428, + "time_per_iteration": 2.66733455657959 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097324, + "balance_loss_mlp": 1.01721525, + "epoch": 0.08253174297806849, + "flos": 573165769728.0, + "grad_norm": 0.028981657602537042, + "language_loss": 0.97141832, + "learning_rate": 0.0009927837440227556, + "loss": 0.98239154, + "num_input_tokens_seen": 35425536, + "router_z_loss_mlp": 0.80078125, + "step": 429, + "time_per_iteration": 2.8499114513397217 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093938, + "balance_loss_mlp": 1.01392436, + "epoch": 0.08272412466333205, + "flos": 624643702272.0, + "grad_norm": 0.031878488957356683, + "language_loss": 0.91184896, + "learning_rate": 0.0009927309095584798, + "loss": 0.92278832, + "num_input_tokens_seen": 35515440, + "router_z_loss_mlp": 0.79980469, + "step": 430, + "time_per_iteration": 3.020768165588379 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097624, + "balance_loss_mlp": 1.01756275, + "epoch": 0.08291650634859561, + "flos": 514995131904.0, + "grad_norm": 0.040558959270141796, + "language_loss": 1.03523278, + "learning_rate": 0.0009926778837991, + "loss": 1.0462091, + "num_input_tokens_seen": 35580192, + "router_z_loss_mlp": 0.80029297, + "step": 431, + "time_per_iteration": 2.609189033508301 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101176, + "balance_loss_mlp": 1.02125835, + "epoch": 0.08310888803385917, + "flos": 668542405632.0, + "grad_norm": 0.035092839201242565, + "language_loss": 1.01323938, + "learning_rate": 0.000992624666765202, + "loss": 1.0242511, + "num_input_tokens_seen": 35649472, + "router_z_loss_mlp": 0.79882812, + "step": 432, + "time_per_iteration": 2.817399501800537 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101699, + "balance_loss_mlp": 1.02154219, + "epoch": 0.08330126971912274, + "flos": 584491361280.0, + "grad_norm": 0.0354530922421884, + "language_loss": 0.98992586, + "learning_rate": 0.000992571258477447, + "loss": 1.00094295, + "num_input_tokens_seen": 35722848, + "router_z_loss_mlp": 0.80126953, + "step": 433, + "time_per_iteration": 2.777506113052368 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010961, + "balance_loss_mlp": 1.0161345, + "epoch": 0.0834936514043863, + "flos": 562498268160.0, + "grad_norm": 0.03167346665720251, + "language_loss": 0.92772877, + "learning_rate": 0.0009925176589565695, + "loss": 0.93868983, + "num_input_tokens_seen": 35800944, + "router_z_loss_mlp": 0.79931641, + "step": 434, + "time_per_iteration": 2.801501512527466 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093857, + "balance_loss_mlp": 1.01398647, + "epoch": 0.08368603308964986, + "flos": 495513868032.0, + "grad_norm": 0.03411426988917409, + "language_loss": 1.03318536, + "learning_rate": 0.0009924638682233791, + "loss": 1.04412401, + "num_input_tokens_seen": 35866288, + "router_z_loss_mlp": 0.79833984, + "step": 435, + "time_per_iteration": 2.573282241821289 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092567, + "balance_loss_mlp": 1.01512909, + "epoch": 0.08387841477491342, + "flos": 1391811397632.0, + "grad_norm": 0.030642245427906535, + "language_loss": 0.79564589, + "learning_rate": 0.0009924098862987589, + "loss": 0.8065716, + "num_input_tokens_seen": 36083040, + "router_z_loss_mlp": 0.7734375, + "step": 436, + "time_per_iteration": 4.596274375915527 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099407, + "balance_loss_mlp": 1.02006125, + "epoch": 0.084070796460177, + "flos": 800355155712.0, + "grad_norm": 0.040681894877429646, + "language_loss": 0.92768085, + "learning_rate": 0.0009923557132036668, + "loss": 0.93867493, + "num_input_tokens_seen": 36158816, + "router_z_loss_mlp": 0.79296875, + "step": 437, + "time_per_iteration": 3.0366878509521484 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110232, + "balance_loss_mlp": 1.02364242, + "epoch": 0.08426317814544056, + "flos": 560097254400.0, + "grad_norm": 0.034275916488964116, + "language_loss": 0.96774155, + "learning_rate": 0.0009923013489591345, + "loss": 0.97876477, + "num_input_tokens_seen": 36236432, + "router_z_loss_mlp": 0.78613281, + "step": 438, + "time_per_iteration": 2.8060851097106934 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100668, + "balance_loss_mlp": 1.0219903, + "epoch": 0.08445555983070412, + "flos": 811884881664.0, + "grad_norm": 0.035250716051411925, + "language_loss": 0.95655745, + "learning_rate": 0.0009922467935862681, + "loss": 0.96756417, + "num_input_tokens_seen": 36327952, + "router_z_loss_mlp": 0.78613281, + "step": 439, + "time_per_iteration": 3.116757869720459 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098598, + "balance_loss_mlp": 1.0204916, + "epoch": 0.08464794151596768, + "flos": 511170034944.0, + "grad_norm": 0.03561138790794706, + "language_loss": 0.98418635, + "learning_rate": 0.0009921920471062478, + "loss": 0.99517238, + "num_input_tokens_seen": 36394896, + "router_z_loss_mlp": 0.78027344, + "step": 440, + "time_per_iteration": 2.6008944511413574 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093389, + "balance_loss_mlp": 1.01561701, + "epoch": 0.08484032320123125, + "flos": 557474609664.0, + "grad_norm": 0.02914226137027636, + "language_loss": 0.96590662, + "learning_rate": 0.0009921371095403281, + "loss": 0.97684056, + "num_input_tokens_seen": 36464656, + "router_z_loss_mlp": 0.77685547, + "step": 441, + "time_per_iteration": 2.638679265975952 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094838, + "balance_loss_mlp": 1.01697087, + "epoch": 0.08503270488649481, + "flos": 528361100544.0, + "grad_norm": 0.02987504029564206, + "language_loss": 0.99685514, + "learning_rate": 0.0009920819809098379, + "loss": 1.00780344, + "num_input_tokens_seen": 36532208, + "router_z_loss_mlp": 0.77783203, + "step": 442, + "time_per_iteration": 2.5915398597717285 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089542, + "balance_loss_mlp": 1.01172209, + "epoch": 0.08522508657175837, + "flos": 615386678016.0, + "grad_norm": 0.03983619354546574, + "language_loss": 0.95535469, + "learning_rate": 0.0009920266612361798, + "loss": 0.96625006, + "num_input_tokens_seen": 36607360, + "router_z_loss_mlp": 0.77734375, + "step": 443, + "time_per_iteration": 2.724025249481201 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091681, + "balance_loss_mlp": 1.01371801, + "epoch": 0.08541746825702193, + "flos": 620987746560.0, + "grad_norm": 0.032808156584867194, + "language_loss": 0.9504559, + "learning_rate": 0.0009919711505408308, + "loss": 0.96137273, + "num_input_tokens_seen": 36680688, + "router_z_loss_mlp": 0.77880859, + "step": 444, + "time_per_iteration": 2.780973434448242 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087177, + "balance_loss_mlp": 1.00926137, + "epoch": 0.08560984994228549, + "flos": 483888877824.0, + "grad_norm": 0.03232110076143325, + "language_loss": 0.92813373, + "learning_rate": 0.000991915448845342, + "loss": 0.93900549, + "num_input_tokens_seen": 36746288, + "router_z_loss_mlp": 0.77832031, + "step": 445, + "time_per_iteration": 2.6011459827423096 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090069, + "balance_loss_mlp": 1.01243973, + "epoch": 0.08580223162754906, + "flos": 518177690112.0, + "grad_norm": 0.03377956208163177, + "language_loss": 1.02285504, + "learning_rate": 0.000991859556171339, + "loss": 1.03375578, + "num_input_tokens_seen": 36812528, + "router_z_loss_mlp": 0.77539062, + "step": 446, + "time_per_iteration": 2.606220006942749 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088539, + "balance_loss_mlp": 1.01086187, + "epoch": 0.08599461331281262, + "flos": 532520589312.0, + "grad_norm": 0.037753212584348855, + "language_loss": 1.04541254, + "learning_rate": 0.000991803472540521, + "loss": 1.0562979, + "num_input_tokens_seen": 36879248, + "router_z_loss_mlp": 0.77587891, + "step": 447, + "time_per_iteration": 2.625401735305786 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088712, + "balance_loss_mlp": 1.01113105, + "epoch": 0.08618699499807618, + "flos": 791634712320.0, + "grad_norm": 0.030920782852134367, + "language_loss": 0.98781657, + "learning_rate": 0.0009917471979746615, + "loss": 0.99870372, + "num_input_tokens_seen": 36951376, + "router_z_loss_mlp": 0.77490234, + "step": 448, + "time_per_iteration": 3.0066978931427 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089961, + "balance_loss_mlp": 1.01195049, + "epoch": 0.08637937668333974, + "flos": 567115603200.0, + "grad_norm": 0.03238149886931097, + "language_loss": 0.98317528, + "learning_rate": 0.0009916907324956086, + "loss": 0.99407488, + "num_input_tokens_seen": 37025936, + "router_z_loss_mlp": 0.77929688, + "step": 449, + "time_per_iteration": 2.7561135292053223 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091057, + "balance_loss_mlp": 1.01333201, + "epoch": 0.08657175836860331, + "flos": 446118108672.0, + "grad_norm": 0.029046506526173844, + "language_loss": 0.94927382, + "learning_rate": 0.0009916340761252837, + "loss": 0.96018445, + "num_input_tokens_seen": 37095872, + "router_z_loss_mlp": 0.77636719, + "step": 450, + "time_per_iteration": 2.6452889442443848 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089525, + "balance_loss_mlp": 1.01222932, + "epoch": 0.08676414005386687, + "flos": 845589480960.0, + "grad_norm": 0.032144406787761336, + "language_loss": 0.91630232, + "learning_rate": 0.0009915772288856832, + "loss": 0.92719758, + "num_input_tokens_seen": 37179072, + "router_z_loss_mlp": 0.77197266, + "step": 451, + "time_per_iteration": 3.0991322994232178 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108797, + "balance_loss_mlp": 1.01086605, + "epoch": 0.08695652173913043, + "flos": 604484906496.0, + "grad_norm": 0.025568476728402203, + "language_loss": 0.93134868, + "learning_rate": 0.000991520190798877, + "loss": 0.94222844, + "num_input_tokens_seen": 37260288, + "router_z_loss_mlp": 0.77001953, + "step": 452, + "time_per_iteration": 2.833534002304077 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093662, + "balance_loss_mlp": 1.01660514, + "epoch": 0.08714890342439399, + "flos": 732001107456.0, + "grad_norm": 0.03795734255344977, + "language_loss": 1.02428043, + "learning_rate": 0.0009914629618870089, + "loss": 1.03521705, + "num_input_tokens_seen": 37331136, + "router_z_loss_mlp": 0.76953125, + "step": 453, + "time_per_iteration": 2.9043643474578857 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098724, + "balance_loss_mlp": 1.02319336, + "epoch": 0.08734128510965757, + "flos": 1485456770304.0, + "grad_norm": 0.019964198948139205, + "language_loss": 0.78675872, + "learning_rate": 0.0009914055421722976, + "loss": 0.79774594, + "num_input_tokens_seen": 37559040, + "router_z_loss_mlp": 0.75390625, + "step": 454, + "time_per_iteration": 2.093019723892212 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087936, + "balance_loss_mlp": 1.01278687, + "epoch": 0.08753366679492113, + "flos": 1526269146624.0, + "grad_norm": 0.012226751630218, + "language_loss": 0.81427962, + "learning_rate": 0.0009913479316770353, + "loss": 0.82515901, + "num_input_tokens_seen": 37785136, + "router_z_loss_mlp": 0.75, + "step": 455, + "time_per_iteration": 4.905871391296387 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091043, + "balance_loss_mlp": 1.01379561, + "epoch": 0.08772604848018468, + "flos": 722525364480.0, + "grad_norm": 0.044152825797527884, + "language_loss": 0.95217329, + "learning_rate": 0.0009912901304235883, + "loss": 0.96308374, + "num_input_tokens_seen": 37858832, + "router_z_loss_mlp": 0.77148438, + "step": 456, + "time_per_iteration": 2.850330352783203 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090876, + "balance_loss_mlp": 1.01396191, + "epoch": 0.08791843016544824, + "flos": 709467542784.0, + "grad_norm": 0.038854584599924205, + "language_loss": 0.92178857, + "learning_rate": 0.000991232138434397, + "loss": 0.9326973, + "num_input_tokens_seen": 37931856, + "router_z_loss_mlp": 0.76806641, + "step": 457, + "time_per_iteration": 2.868957757949829 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091399, + "balance_loss_mlp": 1.01477098, + "epoch": 0.08811081185071182, + "flos": 474022362624.0, + "grad_norm": 0.04035146689108268, + "language_loss": 0.99321103, + "learning_rate": 0.000991173955731976, + "loss": 1.00412512, + "num_input_tokens_seen": 38002432, + "router_z_loss_mlp": 0.76513672, + "step": 458, + "time_per_iteration": 2.6747970581054688 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089272, + "balance_loss_mlp": 1.01288271, + "epoch": 0.08830319353597538, + "flos": 686315738880.0, + "grad_norm": 0.033089720334054364, + "language_loss": 1.03213239, + "learning_rate": 0.0009911155823389137, + "loss": 1.04302514, + "num_input_tokens_seen": 38081648, + "router_z_loss_mlp": 0.76269531, + "step": 459, + "time_per_iteration": 2.9462268352508545 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085442, + "balance_loss_mlp": 1.00881398, + "epoch": 0.08849557522123894, + "flos": 574609294848.0, + "grad_norm": 0.035557366742091014, + "language_loss": 0.99025905, + "learning_rate": 0.000991057018277873, + "loss": 1.00111353, + "num_input_tokens_seen": 38153424, + "router_z_loss_mlp": 0.76513672, + "step": 460, + "time_per_iteration": 2.6903369426727295 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086551, + "balance_loss_mlp": 1.00968456, + "epoch": 0.0886879569065025, + "flos": 565628336640.0, + "grad_norm": 0.039664118418905284, + "language_loss": 1.00002789, + "learning_rate": 0.0009909982635715898, + "loss": 1.01089334, + "num_input_tokens_seen": 38223008, + "router_z_loss_mlp": 0.76757812, + "step": 461, + "time_per_iteration": 2.620046615600586 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010895, + "balance_loss_mlp": 1.0128243, + "epoch": 0.08888033859176607, + "flos": 564957607680.0, + "grad_norm": 0.03231802322071402, + "language_loss": 0.98670942, + "learning_rate": 0.0009909393182428751, + "loss": 0.99760437, + "num_input_tokens_seen": 38294592, + "router_z_loss_mlp": 0.765625, + "step": 462, + "time_per_iteration": 2.6466307640075684 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090991, + "balance_loss_mlp": 1.01412475, + "epoch": 0.08907272027702963, + "flos": 466743499008.0, + "grad_norm": 0.03344290639259395, + "language_loss": 0.93214953, + "learning_rate": 0.000990880182314614, + "loss": 0.94305944, + "num_input_tokens_seen": 38365792, + "router_z_loss_mlp": 0.76757812, + "step": 463, + "time_per_iteration": 2.6666839122772217 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086555, + "balance_loss_mlp": 1.0100224, + "epoch": 0.08926510196229319, + "flos": 682844475648.0, + "grad_norm": 0.03261982194681884, + "language_loss": 0.93093467, + "learning_rate": 0.0009908208558097643, + "loss": 0.94180012, + "num_input_tokens_seen": 38447776, + "router_z_loss_mlp": 0.76416016, + "step": 464, + "time_per_iteration": 2.9068925380706787 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089482, + "balance_loss_mlp": 1.01323605, + "epoch": 0.08945748364755675, + "flos": 597822336768.0, + "grad_norm": 0.03309433671244878, + "language_loss": 0.95414662, + "learning_rate": 0.000990761338751359, + "loss": 0.9650414, + "num_input_tokens_seen": 38521632, + "router_z_loss_mlp": 0.76123047, + "step": 465, + "time_per_iteration": 2.774606227874756 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079613, + "balance_loss_mlp": 1.00732422, + "epoch": 0.08964986533282032, + "flos": 1589343879168.0, + "grad_norm": 0.03434681355524106, + "language_loss": 0.73659623, + "learning_rate": 0.0009907016311625045, + "loss": 0.74739242, + "num_input_tokens_seen": 38760528, + "router_z_loss_mlp": 0.72460938, + "step": 466, + "time_per_iteration": 4.996358394622803 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092523, + "balance_loss_mlp": 1.01646745, + "epoch": 0.08984224701808388, + "flos": 534550272768.0, + "grad_norm": 0.03379784984504044, + "language_loss": 0.98391378, + "learning_rate": 0.0009906417330663815, + "loss": 0.99483901, + "num_input_tokens_seen": 38827200, + "router_z_loss_mlp": 0.75927734, + "step": 467, + "time_per_iteration": 2.6774964332580566 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092653, + "balance_loss_mlp": 1.01678836, + "epoch": 0.09003462870334744, + "flos": 479850898176.0, + "grad_norm": 0.04271038491910547, + "language_loss": 0.94838965, + "learning_rate": 0.0009905816444862442, + "loss": 0.95931625, + "num_input_tokens_seen": 38891984, + "router_z_loss_mlp": 0.75732422, + "step": 468, + "time_per_iteration": 2.6558451652526855 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092919, + "balance_loss_mlp": 1.01691103, + "epoch": 0.090227010388611, + "flos": 654903283200.0, + "grad_norm": 0.031716132767048565, + "language_loss": 0.92225289, + "learning_rate": 0.0009905213654454216, + "loss": 0.933182, + "num_input_tokens_seen": 38977136, + "router_z_loss_mlp": 0.75878906, + "step": 469, + "time_per_iteration": 2.9322757720947266 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093686, + "balance_loss_mlp": 1.01796389, + "epoch": 0.09041939207387456, + "flos": 619359528960.0, + "grad_norm": 0.03474651138537023, + "language_loss": 1.00819349, + "learning_rate": 0.0009904608959673158, + "loss": 1.01913023, + "num_input_tokens_seen": 39052224, + "router_z_loss_mlp": 0.75585938, + "step": 470, + "time_per_iteration": 2.7938003540039062 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091787, + "balance_loss_mlp": 1.01620793, + "epoch": 0.09061177375913813, + "flos": 455296398336.0, + "grad_norm": 0.04023106246537731, + "language_loss": 1.00852847, + "learning_rate": 0.000990400236075403, + "loss": 1.01944637, + "num_input_tokens_seen": 39116832, + "router_z_loss_mlp": 0.75439453, + "step": 471, + "time_per_iteration": 2.5231049060821533 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085542, + "balance_loss_mlp": 1.01024961, + "epoch": 0.0908041554444017, + "flos": 545309147904.0, + "grad_norm": 0.036372029021066864, + "language_loss": 0.97571105, + "learning_rate": 0.0009903393857932338, + "loss": 0.98656648, + "num_input_tokens_seen": 39190528, + "router_z_loss_mlp": 0.75146484, + "step": 472, + "time_per_iteration": 2.700449228286743 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082684, + "balance_loss_mlp": 1.00786841, + "epoch": 0.09099653712966525, + "flos": 565467943680.0, + "grad_norm": 0.03263919317425628, + "language_loss": 0.95124531, + "learning_rate": 0.0009902783451444317, + "loss": 0.96207213, + "num_input_tokens_seen": 39263168, + "router_z_loss_mlp": 0.74658203, + "step": 473, + "time_per_iteration": 2.7006537914276123 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081251, + "balance_loss_mlp": 1.00667381, + "epoch": 0.09118891881492881, + "flos": 475502826240.0, + "grad_norm": 0.036465550100162274, + "language_loss": 0.98778975, + "learning_rate": 0.0009902171141526956, + "loss": 0.99860233, + "num_input_tokens_seen": 39330784, + "router_z_loss_mlp": 0.74414062, + "step": 474, + "time_per_iteration": 2.565852403640747 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081522, + "balance_loss_mlp": 1.00732613, + "epoch": 0.09138130050019239, + "flos": 546991800576.0, + "grad_norm": 0.03189281102051162, + "language_loss": 0.86324012, + "learning_rate": 0.000990155692841797, + "loss": 0.87405533, + "num_input_tokens_seen": 39417472, + "router_z_loss_mlp": 0.74023438, + "step": 475, + "time_per_iteration": 2.9694621562957764 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081909, + "balance_loss_mlp": 1.0079515, + "epoch": 0.09157368218545595, + "flos": 733974410496.0, + "grad_norm": 0.03574286330183218, + "language_loss": 0.98287529, + "learning_rate": 0.0009900940812355818, + "loss": 0.99369442, + "num_input_tokens_seen": 39488656, + "router_z_loss_mlp": 0.73779297, + "step": 476, + "time_per_iteration": 2.8549702167510986 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082166, + "balance_loss_mlp": 1.00835192, + "epoch": 0.0917660638707195, + "flos": 612073862400.0, + "grad_norm": 0.03800316101532587, + "language_loss": 0.95275486, + "learning_rate": 0.00099003227935797, + "loss": 0.96357656, + "num_input_tokens_seen": 39558224, + "router_z_loss_mlp": 0.73632812, + "step": 477, + "time_per_iteration": 2.709808349609375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084092, + "balance_loss_mlp": 1.01051593, + "epoch": 0.09195844555598306, + "flos": 657019482624.0, + "grad_norm": 0.03875864993538346, + "language_loss": 0.99037415, + "learning_rate": 0.000989970287232955, + "loss": 1.0012151, + "num_input_tokens_seen": 39629856, + "router_z_loss_mlp": 0.73486328, + "step": 478, + "time_per_iteration": 2.7670538425445557 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085912, + "balance_loss_mlp": 1.01252699, + "epoch": 0.09215082724124664, + "flos": 477541257984.0, + "grad_norm": 0.03367109557456403, + "language_loss": 0.95731258, + "learning_rate": 0.0009899081048846043, + "loss": 0.96817166, + "num_input_tokens_seen": 39695984, + "router_z_loss_mlp": 0.73339844, + "step": 479, + "time_per_iteration": 2.588352918624878 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085141, + "balance_loss_mlp": 1.01208997, + "epoch": 0.0923432089265102, + "flos": 525326296320.0, + "grad_norm": 0.0462740033589213, + "language_loss": 1.00606585, + "learning_rate": 0.0009898457323370593, + "loss": 1.01691723, + "num_input_tokens_seen": 39760256, + "router_z_loss_mlp": 0.73046875, + "step": 480, + "time_per_iteration": 2.5808160305023193 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082132, + "balance_loss_mlp": 1.00936687, + "epoch": 0.09253559061177376, + "flos": 546639912192.0, + "grad_norm": 0.03676160983227949, + "language_loss": 0.9798522, + "learning_rate": 0.000989783169614535, + "loss": 0.99067354, + "num_input_tokens_seen": 39827984, + "router_z_loss_mlp": 0.72900391, + "step": 481, + "time_per_iteration": 2.624483108520508 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097145, + "balance_loss_mlp": 1.02485657, + "epoch": 0.09272797229703732, + "flos": 1541337209856.0, + "grad_norm": 0.023489610904585654, + "language_loss": 0.78752756, + "learning_rate": 0.0009897204167413206, + "loss": 0.79849905, + "num_input_tokens_seen": 40056688, + "router_z_loss_mlp": 0.72460938, + "step": 482, + "time_per_iteration": 4.897305965423584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085543, + "balance_loss_mlp": 1.01330173, + "epoch": 0.09292035398230089, + "flos": 691065276672.0, + "grad_norm": 0.04252493421314706, + "language_loss": 0.95552129, + "learning_rate": 0.000989657473741779, + "loss": 0.96637678, + "num_input_tokens_seen": 40133120, + "router_z_loss_mlp": 0.72412109, + "step": 483, + "time_per_iteration": 2.8165738582611084 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084133, + "balance_loss_mlp": 1.01184416, + "epoch": 0.09311273566756445, + "flos": 510823004160.0, + "grad_norm": 0.03895509426778844, + "language_loss": 0.97422099, + "learning_rate": 0.0009895943406403465, + "loss": 0.98506236, + "num_input_tokens_seen": 40206464, + "router_z_loss_mlp": 0.72460938, + "step": 484, + "time_per_iteration": 2.7523326873779297 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086134, + "balance_loss_mlp": 1.01384509, + "epoch": 0.09330511735282801, + "flos": 660584064768.0, + "grad_norm": 0.04754513437429821, + "language_loss": 0.90526009, + "learning_rate": 0.0009895310174615338, + "loss": 0.91612148, + "num_input_tokens_seen": 40277744, + "router_z_loss_mlp": 0.72460938, + "step": 485, + "time_per_iteration": 2.843790292739868 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070969, + "balance_loss_mlp": 0.99982452, + "epoch": 0.09349749903809157, + "flos": 1456024420608.0, + "grad_norm": 0.007982392205281765, + "language_loss": 0.75718516, + "learning_rate": 0.0009894675042299251, + "loss": 0.76789486, + "num_input_tokens_seen": 40503664, + "router_z_loss_mlp": 0.71289062, + "step": 486, + "time_per_iteration": 4.649716138839722 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080781, + "balance_loss_mlp": 1.00877845, + "epoch": 0.09368988072335514, + "flos": 521900719872.0, + "grad_norm": 0.0379904908867083, + "language_loss": 0.94096279, + "learning_rate": 0.0009894038009701782, + "loss": 0.95177054, + "num_input_tokens_seen": 40571376, + "router_z_loss_mlp": 0.72167969, + "step": 487, + "time_per_iteration": 2.615767002105713 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085039, + "balance_loss_mlp": 1.012941, + "epoch": 0.0938822624086187, + "flos": 498752806656.0, + "grad_norm": 0.041516659048387576, + "language_loss": 0.97017074, + "learning_rate": 0.0009893399077070253, + "loss": 0.98102111, + "num_input_tokens_seen": 40638096, + "router_z_loss_mlp": 0.72265625, + "step": 488, + "time_per_iteration": 2.592867612838745 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090191, + "balance_loss_mlp": 1.01828361, + "epoch": 0.09407464409388226, + "flos": 534224629248.0, + "grad_norm": 0.031087819309936707, + "language_loss": 0.91152203, + "learning_rate": 0.0009892758244652718, + "loss": 0.92242396, + "num_input_tokens_seen": 40710992, + "router_z_loss_mlp": 0.72070312, + "step": 489, + "time_per_iteration": 2.702681541442871 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080571, + "balance_loss_mlp": 1.00852132, + "epoch": 0.09426702577914582, + "flos": 587091651840.0, + "grad_norm": 0.037758062155454256, + "language_loss": 0.98290044, + "learning_rate": 0.0009892115512697968, + "loss": 0.99370617, + "num_input_tokens_seen": 40778896, + "router_z_loss_mlp": 0.72216797, + "step": 490, + "time_per_iteration": 2.7222015857696533 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088543, + "balance_loss_mlp": 1.01649261, + "epoch": 0.0944594074644094, + "flos": 504464690688.0, + "grad_norm": 0.03400132145466818, + "language_loss": 0.98617911, + "learning_rate": 0.0009891470881455537, + "loss": 0.99706453, + "num_input_tokens_seen": 40853376, + "router_z_loss_mlp": 0.72216797, + "step": 491, + "time_per_iteration": 2.6978650093078613 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087839, + "balance_loss_mlp": 1.01626599, + "epoch": 0.09465178914967295, + "flos": 572114962176.0, + "grad_norm": 0.03537229102294209, + "language_loss": 0.97051454, + "learning_rate": 0.0009890824351175692, + "loss": 0.98139298, + "num_input_tokens_seen": 40923776, + "router_z_loss_mlp": 0.71728516, + "step": 492, + "time_per_iteration": 2.7183802127838135 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087578, + "balance_loss_mlp": 1.01590919, + "epoch": 0.09484417083493651, + "flos": 550419322368.0, + "grad_norm": 0.028677449722299516, + "language_loss": 1.00688422, + "learning_rate": 0.0009890175922109435, + "loss": 1.01776004, + "num_input_tokens_seen": 40996848, + "router_z_loss_mlp": 0.71826172, + "step": 493, + "time_per_iteration": 2.680469512939453 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082456, + "balance_loss_mlp": 1.01088285, + "epoch": 0.09503655252020007, + "flos": 825272237568.0, + "grad_norm": 0.03488638846892438, + "language_loss": 0.98808897, + "learning_rate": 0.0009889525594508513, + "loss": 0.99891359, + "num_input_tokens_seen": 41071280, + "router_z_loss_mlp": 0.71728516, + "step": 494, + "time_per_iteration": 2.983400344848633 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083066, + "balance_loss_mlp": 1.01154041, + "epoch": 0.09522893420546363, + "flos": 405518615040.0, + "grad_norm": 0.028649644857800794, + "language_loss": 0.9245472, + "learning_rate": 0.0009888873368625404, + "loss": 0.93537784, + "num_input_tokens_seen": 41136304, + "router_z_loss_mlp": 0.71679688, + "step": 495, + "time_per_iteration": 2.497526168823242 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108369, + "balance_loss_mlp": 1.01206875, + "epoch": 0.0954213158907272, + "flos": 692257035264.0, + "grad_norm": 0.03396045626839725, + "language_loss": 0.96602595, + "learning_rate": 0.0009888219244713326, + "loss": 0.97686291, + "num_input_tokens_seen": 41212384, + "router_z_loss_mlp": 0.71777344, + "step": 496, + "time_per_iteration": 2.8588504791259766 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108171, + "balance_loss_mlp": 1.01018417, + "epoch": 0.09561369757599077, + "flos": 520075170816.0, + "grad_norm": 0.039869543083186736, + "language_loss": 0.97707164, + "learning_rate": 0.0009887563223026229, + "loss": 0.98788875, + "num_input_tokens_seen": 41282528, + "router_z_loss_mlp": 0.71679688, + "step": 497, + "time_per_iteration": 2.6856894493103027 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075874, + "balance_loss_mlp": 1.00644684, + "epoch": 0.09580607926125433, + "flos": 1388784363264.0, + "grad_norm": 0.01625235818526382, + "language_loss": 0.7906816, + "learning_rate": 0.0009886905303818805, + "loss": 0.80144036, + "num_input_tokens_seen": 41512256, + "router_z_loss_mlp": 0.6953125, + "step": 498, + "time_per_iteration": 4.882593393325806 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086748, + "balance_loss_mlp": 1.0150795, + "epoch": 0.09599846094651789, + "flos": 718826634240.0, + "grad_norm": 0.03326061844711544, + "language_loss": 0.95632416, + "learning_rate": 0.0009886245487346482, + "loss": 0.9671917, + "num_input_tokens_seen": 41596816, + "router_z_loss_mlp": 0.71826172, + "step": 499, + "time_per_iteration": 3.0426785945892334 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087163, + "balance_loss_mlp": 1.01568544, + "epoch": 0.09619084263178146, + "flos": 386894717952.0, + "grad_norm": 0.04298067648683731, + "language_loss": 0.98954022, + "learning_rate": 0.0009885583773865422, + "loss": 1.00041187, + "num_input_tokens_seen": 41658544, + "router_z_loss_mlp": 0.71630859, + "step": 500, + "time_per_iteration": 2.452941417694092 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086205, + "balance_loss_mlp": 1.01467967, + "epoch": 0.09638322431704502, + "flos": 535173369600.0, + "grad_norm": 0.04172266818012015, + "language_loss": 0.95971203, + "learning_rate": 0.0009884920163632524, + "loss": 0.97057414, + "num_input_tokens_seen": 41730736, + "router_z_loss_mlp": 0.71679688, + "step": 501, + "time_per_iteration": 2.657940626144409 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080406, + "balance_loss_mlp": 1.00911927, + "epoch": 0.09657560600230858, + "flos": 501657353472.0, + "grad_norm": 0.041437287127294276, + "language_loss": 0.9960922, + "learning_rate": 0.000988425465690543, + "loss": 1.00689626, + "num_input_tokens_seen": 41797824, + "router_z_loss_mlp": 0.71435547, + "step": 502, + "time_per_iteration": 2.5540428161621094 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077905, + "balance_loss_mlp": 1.00642741, + "epoch": 0.09676798768757214, + "flos": 530332458240.0, + "grad_norm": 0.03187665411612151, + "language_loss": 0.96807587, + "learning_rate": 0.0009883587253942505, + "loss": 0.97885495, + "num_input_tokens_seen": 41875520, + "router_z_loss_mlp": 0.71630859, + "step": 503, + "time_per_iteration": 2.7744338512420654 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086901, + "balance_loss_mlp": 1.01542282, + "epoch": 0.09696036937283571, + "flos": 464557313280.0, + "grad_norm": 0.038653015311582224, + "language_loss": 1.0234406, + "learning_rate": 0.0009882917955002862, + "loss": 1.03430974, + "num_input_tokens_seen": 41942224, + "router_z_loss_mlp": 0.71630859, + "step": 504, + "time_per_iteration": 2.500669479370117 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081799, + "balance_loss_mlp": 1.01074982, + "epoch": 0.09715275105809927, + "flos": 536011294464.0, + "grad_norm": 0.035792041916504785, + "language_loss": 0.94188601, + "learning_rate": 0.0009882246760346343, + "loss": 0.95270395, + "num_input_tokens_seen": 42007552, + "router_z_loss_mlp": 0.71191406, + "step": 505, + "time_per_iteration": 2.6442148685455322 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077575, + "balance_loss_mlp": 1.00652647, + "epoch": 0.09734513274336283, + "flos": 455882556672.0, + "grad_norm": 0.04461237962136338, + "language_loss": 1.00418711, + "learning_rate": 0.0009881573670233533, + "loss": 1.01496279, + "num_input_tokens_seen": 42071760, + "router_z_loss_mlp": 0.71191406, + "step": 506, + "time_per_iteration": 2.5102410316467285 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075502, + "balance_loss_mlp": 1.00450063, + "epoch": 0.09753751442862639, + "flos": 509828577024.0, + "grad_norm": 0.03506590591484262, + "language_loss": 0.93374205, + "learning_rate": 0.0009880898684925747, + "loss": 0.94449711, + "num_input_tokens_seen": 42140688, + "router_z_loss_mlp": 0.71142578, + "step": 507, + "time_per_iteration": 2.652381658554077 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077009, + "balance_loss_mlp": 1.00624609, + "epoch": 0.09772989611388996, + "flos": 485247832320.0, + "grad_norm": 0.03501422949918711, + "language_loss": 0.92606336, + "learning_rate": 0.0009880221804685037, + "loss": 0.9368335, + "num_input_tokens_seen": 42208544, + "router_z_loss_mlp": 0.70898438, + "step": 508, + "time_per_iteration": 2.5481274127960205 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073608, + "balance_loss_mlp": 1.00456238, + "epoch": 0.09792227779915352, + "flos": 1569319231488.0, + "grad_norm": 0.011873284077886747, + "language_loss": 0.79344422, + "learning_rate": 0.000987954302977419, + "loss": 0.80418032, + "num_input_tokens_seen": 42426624, + "router_z_loss_mlp": 0.69140625, + "step": 509, + "time_per_iteration": 4.725191354751587 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076044, + "balance_loss_mlp": 1.00590122, + "epoch": 0.09811465948441708, + "flos": 588915255552.0, + "grad_norm": 0.04172960474096109, + "language_loss": 0.98818666, + "learning_rate": 0.0009878862360456733, + "loss": 0.99894708, + "num_input_tokens_seen": 42494592, + "router_z_loss_mlp": 0.70263672, + "step": 510, + "time_per_iteration": 2.7094569206237793 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078332, + "balance_loss_mlp": 1.00828481, + "epoch": 0.09830704116968064, + "flos": 614129790720.0, + "grad_norm": 0.037035801977756785, + "language_loss": 0.90851068, + "learning_rate": 0.0009878179796996922, + "loss": 0.919294, + "num_input_tokens_seen": 42564944, + "router_z_loss_mlp": 0.70166016, + "step": 511, + "time_per_iteration": 2.6973366737365723 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079637, + "balance_loss_mlp": 1.00973296, + "epoch": 0.09849942285494422, + "flos": 539936513280.0, + "grad_norm": 0.0318668020933778, + "language_loss": 0.94484478, + "learning_rate": 0.0009877495339659754, + "loss": 0.95564115, + "num_input_tokens_seen": 42645616, + "router_z_loss_mlp": 0.70019531, + "step": 512, + "time_per_iteration": 2.7476089000701904 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083598, + "balance_loss_mlp": 1.0137887, + "epoch": 0.09869180454020778, + "flos": 621604040448.0, + "grad_norm": 0.03763698097825182, + "language_loss": 0.89467418, + "learning_rate": 0.000987680898871096, + "loss": 0.90551007, + "num_input_tokens_seen": 42713632, + "router_z_loss_mlp": 0.69921875, + "step": 513, + "time_per_iteration": 2.7254321575164795 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083583, + "balance_loss_mlp": 1.01382184, + "epoch": 0.09888418622547133, + "flos": 813061089024.0, + "grad_norm": 0.049179676158016515, + "language_loss": 0.91816097, + "learning_rate": 0.0009876120744417, + "loss": 0.9289968, + "num_input_tokens_seen": 42789088, + "router_z_loss_mlp": 0.69873047, + "step": 514, + "time_per_iteration": 2.9596974849700928 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083293, + "balance_loss_mlp": 1.01357901, + "epoch": 0.0990765679107349, + "flos": 536857967616.0, + "grad_norm": 0.03966041946019195, + "language_loss": 0.99294269, + "learning_rate": 0.0009875430607045078, + "loss": 1.0037756, + "num_input_tokens_seen": 42861168, + "router_z_loss_mlp": 0.69824219, + "step": 515, + "time_per_iteration": 2.7065181732177734 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083779, + "balance_loss_mlp": 1.01439941, + "epoch": 0.09926894959599845, + "flos": 588971635968.0, + "grad_norm": 0.037836000479060286, + "language_loss": 0.94664383, + "learning_rate": 0.000987473857686313, + "loss": 0.95748156, + "num_input_tokens_seen": 42934112, + "router_z_loss_mlp": 0.69482422, + "step": 516, + "time_per_iteration": 2.712947130203247 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085113, + "balance_loss_mlp": 1.01582849, + "epoch": 0.09946133128126203, + "flos": 642387878400.0, + "grad_norm": 0.04191957443387863, + "language_loss": 0.98466003, + "learning_rate": 0.0009874044654139824, + "loss": 0.99551111, + "num_input_tokens_seen": 43005248, + "router_z_loss_mlp": 0.69384766, + "step": 517, + "time_per_iteration": 2.7391469478607178 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081227, + "balance_loss_mlp": 1.01194227, + "epoch": 0.09965371296652559, + "flos": 466726002432.0, + "grad_norm": 0.049265237591549625, + "language_loss": 0.97911566, + "learning_rate": 0.0009873348839144563, + "loss": 0.98992795, + "num_input_tokens_seen": 43070576, + "router_z_loss_mlp": 0.69384766, + "step": 518, + "time_per_iteration": 2.5496554374694824 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081078, + "balance_loss_mlp": 1.01198411, + "epoch": 0.09984609465178915, + "flos": 484559606784.0, + "grad_norm": 0.04039588305244337, + "language_loss": 0.99084902, + "learning_rate": 0.000987265113214749, + "loss": 1.00165975, + "num_input_tokens_seen": 43138048, + "router_z_loss_mlp": 0.69189453, + "step": 519, + "time_per_iteration": 2.592350721359253 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081099, + "balance_loss_mlp": 1.01200545, + "epoch": 0.1000384763370527, + "flos": 570095972352.0, + "grad_norm": 0.04690738730083641, + "language_loss": 1.01784182, + "learning_rate": 0.0009871951533419476, + "loss": 1.02865279, + "num_input_tokens_seen": 43207600, + "router_z_loss_mlp": 0.69189453, + "step": 520, + "time_per_iteration": 2.699725866317749 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077691, + "balance_loss_mlp": 1.00854921, + "epoch": 0.10023085802231628, + "flos": 546926671872.0, + "grad_norm": 0.03422053119670882, + "language_loss": 0.91227025, + "learning_rate": 0.0009871250043232132, + "loss": 0.92304718, + "num_input_tokens_seen": 43285104, + "router_z_loss_mlp": 0.69238281, + "step": 521, + "time_per_iteration": 2.74124813079834 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078273, + "balance_loss_mlp": 1.00913203, + "epoch": 0.10042323970757984, + "flos": 504440391168.0, + "grad_norm": 0.0407416967929008, + "language_loss": 0.91114902, + "learning_rate": 0.0009870546661857797, + "loss": 0.92193174, + "num_input_tokens_seen": 43353312, + "router_z_loss_mlp": 0.69238281, + "step": 522, + "time_per_iteration": 2.6524126529693604 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080712, + "balance_loss_mlp": 1.01199949, + "epoch": 0.1006156213928434, + "flos": 771725737728.0, + "grad_norm": 0.04764395650012834, + "language_loss": 1.0071038, + "learning_rate": 0.0009869841389569553, + "loss": 1.01791096, + "num_input_tokens_seen": 43427680, + "router_z_loss_mlp": 0.68798828, + "step": 523, + "time_per_iteration": 2.9797816276550293 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081234, + "balance_loss_mlp": 1.01237857, + "epoch": 0.10080800307810696, + "flos": 491009293824.0, + "grad_norm": 0.04526617857315469, + "language_loss": 0.93126583, + "learning_rate": 0.0009869134226641206, + "loss": 0.94207817, + "num_input_tokens_seen": 43495200, + "router_z_loss_mlp": 0.68945312, + "step": 524, + "time_per_iteration": 2.624396562576294 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079355, + "balance_loss_mlp": 1.01064241, + "epoch": 0.10100038476337053, + "flos": 455713415424.0, + "grad_norm": 0.04976961118682096, + "language_loss": 0.93662071, + "learning_rate": 0.0009868425173347303, + "loss": 0.94741422, + "num_input_tokens_seen": 43566256, + "router_z_loss_mlp": 0.68798828, + "step": 525, + "time_per_iteration": 2.659106731414795 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077138, + "balance_loss_mlp": 1.00809169, + "epoch": 0.10119276644863409, + "flos": 557574731520.0, + "grad_norm": 0.04197638521891018, + "language_loss": 0.9924143, + "learning_rate": 0.0009867714229963125, + "loss": 1.00318575, + "num_input_tokens_seen": 43639696, + "router_z_loss_mlp": 0.69140625, + "step": 526, + "time_per_iteration": 2.7414495944976807 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080772, + "balance_loss_mlp": 1.01201165, + "epoch": 0.10138514813389765, + "flos": 517220201472.0, + "grad_norm": 0.044929109849797505, + "language_loss": 0.96641302, + "learning_rate": 0.000986700139676468, + "loss": 0.97722065, + "num_input_tokens_seen": 43703872, + "router_z_loss_mlp": 0.68847656, + "step": 527, + "time_per_iteration": 2.620313882827759 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083057, + "balance_loss_mlp": 1.01405847, + "epoch": 0.10157752981916121, + "flos": 501564034560.0, + "grad_norm": 0.03558874762709202, + "language_loss": 0.9424324, + "learning_rate": 0.0009866286674028717, + "loss": 0.95326293, + "num_input_tokens_seen": 43774416, + "router_z_loss_mlp": 0.69091797, + "step": 528, + "time_per_iteration": 2.632835865020752 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082511, + "balance_loss_mlp": 1.01379848, + "epoch": 0.10176991150442478, + "flos": 658094589696.0, + "grad_norm": 0.042026744727430246, + "language_loss": 0.91470444, + "learning_rate": 0.0009865570062032717, + "loss": 0.9255296, + "num_input_tokens_seen": 43853376, + "router_z_loss_mlp": 0.68798828, + "step": 529, + "time_per_iteration": 2.9185874462127686 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084193, + "balance_loss_mlp": 1.01519477, + "epoch": 0.10196229318968834, + "flos": 574403215104.0, + "grad_norm": 0.031693910674612406, + "language_loss": 0.95307148, + "learning_rate": 0.0009864851561054893, + "loss": 0.96391344, + "num_input_tokens_seen": 43929632, + "router_z_loss_mlp": 0.69091797, + "step": 530, + "time_per_iteration": 2.7826597690582275 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086656, + "balance_loss_mlp": 1.01765728, + "epoch": 0.1021546748749519, + "flos": 519256687872.0, + "grad_norm": 0.0418084670656813, + "language_loss": 0.94574928, + "learning_rate": 0.0009864131171374191, + "loss": 0.95661592, + "num_input_tokens_seen": 44002144, + "router_z_loss_mlp": 0.69091797, + "step": 531, + "time_per_iteration": 2.67000150680542 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088238, + "balance_loss_mlp": 1.01919198, + "epoch": 0.10234705656021546, + "flos": 610954035456.0, + "grad_norm": 0.03906444640078033, + "language_loss": 0.94287467, + "learning_rate": 0.0009863408893270292, + "loss": 0.95375705, + "num_input_tokens_seen": 44078272, + "router_z_loss_mlp": 0.69140625, + "step": 532, + "time_per_iteration": 2.7893166542053223 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089057, + "balance_loss_mlp": 1.02029741, + "epoch": 0.10253943824547904, + "flos": 602913069312.0, + "grad_norm": 0.046708965243717, + "language_loss": 0.90346718, + "learning_rate": 0.0009862684727023605, + "loss": 0.91435778, + "num_input_tokens_seen": 44152304, + "router_z_loss_mlp": 0.68847656, + "step": 533, + "time_per_iteration": 2.7212483882904053 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079068, + "balance_loss_mlp": 1.0105468, + "epoch": 0.1027318199307426, + "flos": 664157395200.0, + "grad_norm": 0.04923575085492922, + "language_loss": 0.9286049, + "learning_rate": 0.0009861958672915283, + "loss": 0.93939555, + "num_input_tokens_seen": 44226720, + "router_z_loss_mlp": 0.68603516, + "step": 534, + "time_per_iteration": 2.8216443061828613 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080998, + "balance_loss_mlp": 1.01271474, + "epoch": 0.10292420161600616, + "flos": 684531019008.0, + "grad_norm": 0.03566434899904423, + "language_loss": 0.91122925, + "learning_rate": 0.0009861230731227201, + "loss": 0.92203927, + "num_input_tokens_seen": 44303600, + "router_z_loss_mlp": 0.68359375, + "step": 535, + "time_per_iteration": 2.8432843685150146 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082908, + "balance_loss_mlp": 1.01514912, + "epoch": 0.10311658330126972, + "flos": 491269808640.0, + "grad_norm": 0.04656876258351904, + "language_loss": 0.9494285, + "learning_rate": 0.0009860500902241973, + "loss": 0.96025753, + "num_input_tokens_seen": 44370960, + "router_z_loss_mlp": 0.67822266, + "step": 536, + "time_per_iteration": 2.601234197616577 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085787, + "balance_loss_mlp": 1.01831496, + "epoch": 0.10330896498653329, + "flos": 432687011328.0, + "grad_norm": 0.046264109011482965, + "language_loss": 0.99409795, + "learning_rate": 0.0009859769186242942, + "loss": 1.00495577, + "num_input_tokens_seen": 44435584, + "router_z_loss_mlp": 0.67529297, + "step": 537, + "time_per_iteration": 2.527156114578247 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079891, + "balance_loss_mlp": 1.01265681, + "epoch": 0.10350134667179685, + "flos": 550642898688.0, + "grad_norm": 0.04274411195548745, + "language_loss": 0.92667055, + "learning_rate": 0.0009859035583514187, + "loss": 0.93746948, + "num_input_tokens_seen": 44505456, + "router_z_loss_mlp": 0.67285156, + "step": 538, + "time_per_iteration": 2.6489107608795166 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082236, + "balance_loss_mlp": 1.01505005, + "epoch": 0.10369372835706041, + "flos": 641827964928.0, + "grad_norm": 0.04978782417937993, + "language_loss": 0.95941103, + "learning_rate": 0.0009858300094340517, + "loss": 0.97023344, + "num_input_tokens_seen": 44580208, + "router_z_loss_mlp": 0.67236328, + "step": 539, + "time_per_iteration": 2.8078534603118896 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107826, + "balance_loss_mlp": 1.01102614, + "epoch": 0.10388611004232397, + "flos": 522766834944.0, + "grad_norm": 0.04233995967203171, + "language_loss": 0.8846426, + "learning_rate": 0.0009857562719007473, + "loss": 0.8954252, + "num_input_tokens_seen": 44646576, + "router_z_loss_mlp": 0.67285156, + "step": 540, + "time_per_iteration": 2.605253219604492 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108211, + "balance_loss_mlp": 1.01487637, + "epoch": 0.10407849172758753, + "flos": 703741074432.0, + "grad_norm": 0.04489314852578161, + "language_loss": 0.9024663, + "learning_rate": 0.0009856823457801331, + "loss": 0.91328734, + "num_input_tokens_seen": 44726752, + "router_z_loss_mlp": 0.67285156, + "step": 541, + "time_per_iteration": 2.8836264610290527 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074756, + "balance_loss_mlp": 1.00737894, + "epoch": 0.1042708734128511, + "flos": 503945606400.0, + "grad_norm": 0.04545070943505171, + "language_loss": 0.97841358, + "learning_rate": 0.00098560823110091, + "loss": 0.98916113, + "num_input_tokens_seen": 44795824, + "router_z_loss_mlp": 0.67431641, + "step": 542, + "time_per_iteration": 2.629241466522217 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078174, + "balance_loss_mlp": 1.01084471, + "epoch": 0.10446325509811466, + "flos": 486641779968.0, + "grad_norm": 0.04151430298304091, + "language_loss": 0.974545, + "learning_rate": 0.000985533927891851, + "loss": 0.98532677, + "num_input_tokens_seen": 44868496, + "router_z_loss_mlp": 0.67382812, + "step": 543, + "time_per_iteration": 2.712714195251465 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078049, + "balance_loss_mlp": 1.01110125, + "epoch": 0.10465563678337822, + "flos": 569713948416.0, + "grad_norm": 0.043537531534841835, + "language_loss": 0.9559319, + "learning_rate": 0.0009854594361818044, + "loss": 0.96671236, + "num_input_tokens_seen": 44939888, + "router_z_loss_mlp": 0.66992188, + "step": 544, + "time_per_iteration": 2.66324520111084 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075343, + "balance_loss_mlp": 1.00806153, + "epoch": 0.10484801846864178, + "flos": 627243992832.0, + "grad_norm": 0.042858245855360314, + "language_loss": 0.94459403, + "learning_rate": 0.0009853847559996897, + "loss": 0.95534742, + "num_input_tokens_seen": 45012720, + "router_z_loss_mlp": 0.67333984, + "step": 545, + "time_per_iteration": 2.749379873275757 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074542, + "balance_loss_mlp": 1.00697374, + "epoch": 0.10504040015390535, + "flos": 744813965568.0, + "grad_norm": 0.04113973833070077, + "language_loss": 0.93940508, + "learning_rate": 0.0009853098873745, + "loss": 0.95015049, + "num_input_tokens_seen": 45093744, + "router_z_loss_mlp": 0.67626953, + "step": 546, + "time_per_iteration": 3.0356035232543945 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082094, + "balance_loss_mlp": 1.01457405, + "epoch": 0.10523278183916891, + "flos": 587843060736.0, + "grad_norm": 0.04039468180414331, + "language_loss": 0.92498314, + "learning_rate": 0.0009852348303353027, + "loss": 0.93580401, + "num_input_tokens_seen": 45172784, + "router_z_loss_mlp": 0.67578125, + "step": 547, + "time_per_iteration": 2.787853479385376 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080495, + "balance_loss_mlp": 1.01283157, + "epoch": 0.10542516352443247, + "flos": 871147156224.0, + "grad_norm": 0.04319215205461418, + "language_loss": 0.86143011, + "learning_rate": 0.000985159584911237, + "loss": 0.872235, + "num_input_tokens_seen": 45255600, + "router_z_loss_mlp": 0.67724609, + "step": 548, + "time_per_iteration": 3.103173017501831 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077016, + "balance_loss_mlp": 1.00949633, + "epoch": 0.10561754520969603, + "flos": 506413694208.0, + "grad_norm": 0.04405333210851084, + "language_loss": 0.94064271, + "learning_rate": 0.0009850841511315162, + "loss": 0.95141286, + "num_input_tokens_seen": 45325072, + "router_z_loss_mlp": 0.67578125, + "step": 549, + "time_per_iteration": 2.647629737854004 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107948, + "balance_loss_mlp": 1.01176953, + "epoch": 0.1058099268949596, + "flos": 561148061952.0, + "grad_norm": 0.03728506713954383, + "language_loss": 0.9326818, + "learning_rate": 0.0009850085290254256, + "loss": 0.94347662, + "num_input_tokens_seen": 45401440, + "router_z_loss_mlp": 0.67773438, + "step": 550, + "time_per_iteration": 2.7680838108062744 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081585, + "balance_loss_mlp": 1.01411295, + "epoch": 0.10600230858022316, + "flos": 563160248832.0, + "grad_norm": 0.031635589688873186, + "language_loss": 0.90350562, + "learning_rate": 0.0009849327186223246, + "loss": 0.91432148, + "num_input_tokens_seen": 45479264, + "router_z_loss_mlp": 0.67529297, + "step": 551, + "time_per_iteration": 2.7540531158447266 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077249, + "balance_loss_mlp": 1.01001453, + "epoch": 0.10619469026548672, + "flos": 495318481920.0, + "grad_norm": 0.03875875468173829, + "language_loss": 0.97612774, + "learning_rate": 0.000984856719951646, + "loss": 0.98690015, + "num_input_tokens_seen": 45547328, + "router_z_loss_mlp": 0.67285156, + "step": 552, + "time_per_iteration": 2.5471906661987305 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080098, + "balance_loss_mlp": 1.01300704, + "epoch": 0.10638707195075028, + "flos": 677465038080.0, + "grad_norm": 0.04041077275123314, + "language_loss": 0.94560456, + "learning_rate": 0.0009847805330428943, + "loss": 0.95640558, + "num_input_tokens_seen": 45631152, + "router_z_loss_mlp": 0.67138672, + "step": 553, + "time_per_iteration": 2.879901647567749 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081173, + "balance_loss_mlp": 1.01398706, + "epoch": 0.10657945363601386, + "flos": 489035990784.0, + "grad_norm": 0.051524237529684984, + "language_loss": 0.97161597, + "learning_rate": 0.0009847041579256481, + "loss": 0.98242772, + "num_input_tokens_seen": 45698208, + "router_z_loss_mlp": 0.67236328, + "step": 554, + "time_per_iteration": 2.5838425159454346 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076637, + "balance_loss_mlp": 1.00997543, + "epoch": 0.10677183532127742, + "flos": 483971503104.0, + "grad_norm": 0.03890900728724459, + "language_loss": 0.96058643, + "learning_rate": 0.0009846275946295592, + "loss": 0.97135282, + "num_input_tokens_seen": 45766640, + "router_z_loss_mlp": 0.66699219, + "step": 555, + "time_per_iteration": 2.619490623474121 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074794, + "balance_loss_mlp": 1.00813222, + "epoch": 0.10696421700654098, + "flos": 657582308352.0, + "grad_norm": 0.03350037319549477, + "language_loss": 0.89189553, + "learning_rate": 0.0009845508431843518, + "loss": 0.9026435, + "num_input_tokens_seen": 45851408, + "router_z_loss_mlp": 0.66699219, + "step": 556, + "time_per_iteration": 3.0074055194854736 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075663, + "balance_loss_mlp": 1.00895333, + "epoch": 0.10715659869180454, + "flos": 568793398272.0, + "grad_norm": 0.03867425342149035, + "language_loss": 0.90383601, + "learning_rate": 0.0009844739036198233, + "loss": 0.91459262, + "num_input_tokens_seen": 45919824, + "router_z_loss_mlp": 0.66748047, + "step": 557, + "time_per_iteration": 2.719309091567993 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073849, + "balance_loss_mlp": 1.00756896, + "epoch": 0.10734898037706811, + "flos": 541744565760.0, + "grad_norm": 0.03845092177051005, + "language_loss": 0.97656357, + "learning_rate": 0.0009843967759658448, + "loss": 0.98730206, + "num_input_tokens_seen": 45991024, + "router_z_loss_mlp": 0.66308594, + "step": 558, + "time_per_iteration": 2.679964065551758 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077583, + "balance_loss_mlp": 1.01311493, + "epoch": 0.10754136206233167, + "flos": 1479734192640.0, + "grad_norm": 0.013283033162601723, + "language_loss": 0.72767758, + "learning_rate": 0.0009843194602523592, + "loss": 0.73845339, + "num_input_tokens_seen": 46212736, + "router_z_loss_mlp": 0.64453125, + "step": 559, + "time_per_iteration": 4.837440729141235 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107582, + "balance_loss_mlp": 1.00977802, + "epoch": 0.10773374374759523, + "flos": 513412601088.0, + "grad_norm": 0.03702065367467253, + "language_loss": 0.97501957, + "learning_rate": 0.000984241956509384, + "loss": 0.98577774, + "num_input_tokens_seen": 46283920, + "router_z_loss_mlp": 0.66064453, + "step": 560, + "time_per_iteration": 2.6579978466033936 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079218, + "balance_loss_mlp": 1.01312864, + "epoch": 0.10792612543285879, + "flos": 497478422784.0, + "grad_norm": 0.05173888564395698, + "language_loss": 0.9404971, + "learning_rate": 0.0009841642647670078, + "loss": 0.9512893, + "num_input_tokens_seen": 46349664, + "router_z_loss_mlp": 0.66113281, + "step": 561, + "time_per_iteration": 2.557605743408203 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080143, + "balance_loss_mlp": 1.01429176, + "epoch": 0.10811850711812235, + "flos": 736838128128.0, + "grad_norm": 0.0493873548723288, + "language_loss": 0.88547891, + "learning_rate": 0.0009840863850553944, + "loss": 0.89628035, + "num_input_tokens_seen": 46432688, + "router_z_loss_mlp": 0.65869141, + "step": 562, + "time_per_iteration": 2.949580669403076 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077668, + "balance_loss_mlp": 1.0115304, + "epoch": 0.10831088880338592, + "flos": 612677517312.0, + "grad_norm": 0.04173462884607535, + "language_loss": 0.94150907, + "learning_rate": 0.0009840083174047782, + "loss": 0.95228577, + "num_input_tokens_seen": 46507216, + "router_z_loss_mlp": 0.66162109, + "step": 563, + "time_per_iteration": 2.733344078063965 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081559, + "balance_loss_mlp": 1.01561248, + "epoch": 0.10850327048864948, + "flos": 557498909184.0, + "grad_norm": 0.034100755270258146, + "language_loss": 0.88515103, + "learning_rate": 0.0009839300618454685, + "loss": 0.89596659, + "num_input_tokens_seen": 46590464, + "router_z_loss_mlp": 0.65966797, + "step": 564, + "time_per_iteration": 2.8846256732940674 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080528, + "balance_loss_mlp": 1.0148201, + "epoch": 0.10869565217391304, + "flos": 604437274368.0, + "grad_norm": 0.036735298053950545, + "language_loss": 0.93941957, + "learning_rate": 0.0009838516184078466, + "loss": 0.95022488, + "num_input_tokens_seen": 46666240, + "router_z_loss_mlp": 0.65722656, + "step": 565, + "time_per_iteration": 2.813284158706665 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078727, + "balance_loss_mlp": 1.01297164, + "epoch": 0.1088880338591766, + "flos": 527206280448.0, + "grad_norm": 0.040314305725270186, + "language_loss": 0.91096556, + "learning_rate": 0.0009837729871223669, + "loss": 0.92175281, + "num_input_tokens_seen": 46734288, + "router_z_loss_mlp": 0.65771484, + "step": 566, + "time_per_iteration": 2.651611089706421 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078801, + "balance_loss_mlp": 1.01318836, + "epoch": 0.10908041554444017, + "flos": 621417402624.0, + "grad_norm": 0.042325065837349046, + "language_loss": 0.91458869, + "learning_rate": 0.0009836941680195568, + "loss": 0.92537665, + "num_input_tokens_seen": 46809920, + "router_z_loss_mlp": 0.65625, + "step": 567, + "time_per_iteration": 2.8296427726745605 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081032, + "balance_loss_mlp": 1.01508534, + "epoch": 0.10927279722970373, + "flos": 899674507008.0, + "grad_norm": 0.04990856516123606, + "language_loss": 0.87414277, + "learning_rate": 0.0009836151611300166, + "loss": 0.88495302, + "num_input_tokens_seen": 46889984, + "router_z_loss_mlp": 0.65966797, + "step": 568, + "time_per_iteration": 3.2401816844940186 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107825, + "balance_loss_mlp": 1.01206517, + "epoch": 0.10946517891496729, + "flos": 529700613120.0, + "grad_norm": 0.0427731854110213, + "language_loss": 0.96863574, + "learning_rate": 0.0009835359664844194, + "loss": 0.97941828, + "num_input_tokens_seen": 46959536, + "router_z_loss_mlp": 0.66210938, + "step": 569, + "time_per_iteration": 2.6190173625946045 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064438, + "balance_loss_mlp": 1.00092316, + "epoch": 0.10965756060023085, + "flos": 1563994228992.0, + "grad_norm": 0.005811935039235345, + "language_loss": 0.81036806, + "learning_rate": 0.0009834565841135114, + "loss": 0.8210125, + "num_input_tokens_seen": 47196960, + "router_z_loss_mlp": 0.63476562, + "step": 570, + "time_per_iteration": 4.957117795944214 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080699, + "balance_loss_mlp": 1.0151341, + "epoch": 0.10984994228549443, + "flos": 514100826624.0, + "grad_norm": 0.04369440603786518, + "language_loss": 0.94858396, + "learning_rate": 0.0009833770140481118, + "loss": 0.95939088, + "num_input_tokens_seen": 47266560, + "router_z_loss_mlp": 0.65576172, + "step": 571, + "time_per_iteration": 2.6529860496520996 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086256, + "balance_loss_mlp": 1.02059519, + "epoch": 0.11004232397075799, + "flos": 956275252992.0, + "grad_norm": 0.04378732511153692, + "language_loss": 0.85010409, + "learning_rate": 0.000983297256319112, + "loss": 0.86096668, + "num_input_tokens_seen": 47348512, + "router_z_loss_mlp": 0.65673828, + "step": 572, + "time_per_iteration": 3.2036497592926025 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080603, + "balance_loss_mlp": 1.01499045, + "epoch": 0.11023470565602154, + "flos": 489229431552.0, + "grad_norm": 0.043497603291787354, + "language_loss": 0.89141667, + "learning_rate": 0.000983217310957477, + "loss": 0.90222269, + "num_input_tokens_seen": 47425392, + "router_z_loss_mlp": 0.65625, + "step": 573, + "time_per_iteration": 2.7763278484344482 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078757, + "balance_loss_mlp": 1.01333535, + "epoch": 0.1104270873412851, + "flos": 656991292416.0, + "grad_norm": 0.04901418812727031, + "language_loss": 0.9269613, + "learning_rate": 0.000983137177994244, + "loss": 0.93774891, + "num_input_tokens_seen": 47502336, + "router_z_loss_mlp": 0.65429688, + "step": 574, + "time_per_iteration": 2.8529646396636963 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080019, + "balance_loss_mlp": 1.01474011, + "epoch": 0.11061946902654868, + "flos": 724748488704.0, + "grad_norm": 0.03457948694206611, + "language_loss": 0.87449324, + "learning_rate": 0.0009830568574605235, + "loss": 0.88529336, + "num_input_tokens_seen": 47583552, + "router_z_loss_mlp": 0.65283203, + "step": 575, + "time_per_iteration": 2.94710373878479 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010791, + "balance_loss_mlp": 1.01367807, + "epoch": 0.11081185071181224, + "flos": 836869037568.0, + "grad_norm": 0.04085001299476677, + "language_loss": 0.90086508, + "learning_rate": 0.0009829763493874992, + "loss": 0.91165602, + "num_input_tokens_seen": 47663440, + "router_z_loss_mlp": 0.65429688, + "step": 576, + "time_per_iteration": 3.0296730995178223 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107807, + "balance_loss_mlp": 1.01283884, + "epoch": 0.1110042323970758, + "flos": 610283306496.0, + "grad_norm": 0.03775485835018356, + "language_loss": 0.95256275, + "learning_rate": 0.0009828956538064264, + "loss": 0.9633435, + "num_input_tokens_seen": 47741920, + "router_z_loss_mlp": 0.65234375, + "step": 577, + "time_per_iteration": 2.7944416999816895 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073929, + "balance_loss_mlp": 1.00893569, + "epoch": 0.11119661408233936, + "flos": 597040792320.0, + "grad_norm": 0.04378674390965236, + "language_loss": 0.93033826, + "learning_rate": 0.0009828147707486344, + "loss": 0.94107759, + "num_input_tokens_seen": 47815136, + "router_z_loss_mlp": 0.64990234, + "step": 578, + "time_per_iteration": 2.7034592628479004 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075842, + "balance_loss_mlp": 1.01099229, + "epoch": 0.11138899576760293, + "flos": 556888451328.0, + "grad_norm": 0.05042820660432219, + "language_loss": 0.89312434, + "learning_rate": 0.0009827337002455245, + "loss": 0.90388274, + "num_input_tokens_seen": 47881360, + "router_z_loss_mlp": 0.6484375, + "step": 579, + "time_per_iteration": 2.6187195777893066 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074948, + "balance_loss_mlp": 1.01057482, + "epoch": 0.11158137745286649, + "flos": 691063331328.0, + "grad_norm": 0.03501309245374513, + "language_loss": 0.89977694, + "learning_rate": 0.0009826524423285712, + "loss": 0.91052639, + "num_input_tokens_seen": 47962720, + "router_z_loss_mlp": 0.64355469, + "step": 580, + "time_per_iteration": 2.9009909629821777 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079601, + "balance_loss_mlp": 1.0148946, + "epoch": 0.11177375913813005, + "flos": 764307868416.0, + "grad_norm": 0.04023884017549449, + "language_loss": 0.91280103, + "learning_rate": 0.0009825709970293218, + "loss": 0.92359698, + "num_input_tokens_seen": 48035472, + "router_z_loss_mlp": 0.64697266, + "step": 581, + "time_per_iteration": 2.9111618995666504 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074116, + "balance_loss_mlp": 1.0095998, + "epoch": 0.11196614082339361, + "flos": 808031594496.0, + "grad_norm": 0.038028140255108665, + "language_loss": 0.97163212, + "learning_rate": 0.0009824893643793956, + "loss": 0.98237336, + "num_input_tokens_seen": 48116944, + "router_z_loss_mlp": 0.64501953, + "step": 582, + "time_per_iteration": 3.0907368659973145 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072862, + "balance_loss_mlp": 1.00796497, + "epoch": 0.11215852250865718, + "flos": 559725924096.0, + "grad_norm": 0.04580369165919148, + "language_loss": 0.90464842, + "learning_rate": 0.0009824075444104857, + "loss": 0.91537702, + "num_input_tokens_seen": 48187808, + "router_z_loss_mlp": 0.64892578, + "step": 583, + "time_per_iteration": 2.7276525497436523 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107742, + "balance_loss_mlp": 1.01285601, + "epoch": 0.11235090419392074, + "flos": 514576169472.0, + "grad_norm": 0.03926612419770205, + "language_loss": 0.95381963, + "learning_rate": 0.000982325537154357, + "loss": 0.96459383, + "num_input_tokens_seen": 48254464, + "router_z_loss_mlp": 0.64550781, + "step": 584, + "time_per_iteration": 2.6261777877807617 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074375, + "balance_loss_mlp": 1.0100019, + "epoch": 0.1125432858791843, + "flos": 492433377024.0, + "grad_norm": 0.043221505898455144, + "language_loss": 0.96143711, + "learning_rate": 0.0009822433426428484, + "loss": 0.97218084, + "num_input_tokens_seen": 48318784, + "router_z_loss_mlp": 0.64355469, + "step": 585, + "time_per_iteration": 2.5630125999450684 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075075, + "balance_loss_mlp": 1.01079714, + "epoch": 0.11273566756444786, + "flos": 511728003072.0, + "grad_norm": 0.04466131563000304, + "language_loss": 0.88984096, + "learning_rate": 0.0009821609609078697, + "loss": 0.90059173, + "num_input_tokens_seen": 48389248, + "router_z_loss_mlp": 0.64257812, + "step": 586, + "time_per_iteration": 2.649122953414917 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075103, + "balance_loss_mlp": 1.01077783, + "epoch": 0.11292804924971142, + "flos": 623640526848.0, + "grad_norm": 0.03579172726266892, + "language_loss": 0.91595018, + "learning_rate": 0.0009820783919814045, + "loss": 0.92670119, + "num_input_tokens_seen": 48463312, + "router_z_loss_mlp": 0.64306641, + "step": 587, + "time_per_iteration": 2.7977845668792725 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072627, + "balance_loss_mlp": 1.00830126, + "epoch": 0.113120430934975, + "flos": 479039218176.0, + "grad_norm": 0.04738669495581529, + "language_loss": 0.85574889, + "learning_rate": 0.0009819956358955095, + "loss": 0.86647511, + "num_input_tokens_seen": 48531856, + "router_z_loss_mlp": 0.64306641, + "step": 588, + "time_per_iteration": 2.59133243560791 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076686, + "balance_loss_mlp": 1.01245642, + "epoch": 0.11331281262023855, + "flos": 467991638016.0, + "grad_norm": 0.048752038127388646, + "language_loss": 0.86982751, + "learning_rate": 0.0009819126926823127, + "loss": 0.88059437, + "num_input_tokens_seen": 48596640, + "router_z_loss_mlp": 0.64208984, + "step": 589, + "time_per_iteration": 2.511939764022827 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075971, + "balance_loss_mlp": 1.01174104, + "epoch": 0.11350519430550211, + "flos": 651611854848.0, + "grad_norm": 0.04204370934342767, + "language_loss": 0.89311969, + "learning_rate": 0.000981829562374016, + "loss": 0.9038794, + "num_input_tokens_seen": 48669648, + "router_z_loss_mlp": 0.64208984, + "step": 590, + "time_per_iteration": 2.798734426498413 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107506, + "balance_loss_mlp": 1.01111591, + "epoch": 0.11369757599076567, + "flos": 558861754368.0, + "grad_norm": 0.04723710161718091, + "language_loss": 0.99783856, + "learning_rate": 0.0009817462450028933, + "loss": 1.00858927, + "num_input_tokens_seen": 48737392, + "router_z_loss_mlp": 0.63916016, + "step": 591, + "time_per_iteration": 2.717622756958008 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076867, + "balance_loss_mlp": 1.01316178, + "epoch": 0.11388995767602925, + "flos": 572306457600.0, + "grad_norm": 0.041300229846526024, + "language_loss": 0.87103492, + "learning_rate": 0.0009816627406012916, + "loss": 0.88180363, + "num_input_tokens_seen": 48817136, + "router_z_loss_mlp": 0.63671875, + "step": 592, + "time_per_iteration": 2.783677339553833 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077614, + "balance_loss_mlp": 1.01376593, + "epoch": 0.1140823393612928, + "flos": 741744168192.0, + "grad_norm": 0.04574882804976793, + "language_loss": 0.87044728, + "learning_rate": 0.0009815790492016295, + "loss": 0.88122344, + "num_input_tokens_seen": 48895808, + "router_z_loss_mlp": 0.63818359, + "step": 593, + "time_per_iteration": 2.920262336730957 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079358, + "balance_loss_mlp": 1.01560438, + "epoch": 0.11427472104655637, + "flos": 700252314624.0, + "grad_norm": 0.042792726491020304, + "language_loss": 0.89086539, + "learning_rate": 0.0009814951708363993, + "loss": 0.90165901, + "num_input_tokens_seen": 48967456, + "router_z_loss_mlp": 0.63720703, + "step": 594, + "time_per_iteration": 2.8244025707244873 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069794, + "balance_loss_mlp": 1.00799561, + "epoch": 0.11446710273181993, + "flos": 1480355344128.0, + "grad_norm": 0.0135056408383676, + "language_loss": 0.77990985, + "learning_rate": 0.0009814111055381654, + "loss": 0.79060781, + "num_input_tokens_seen": 49193152, + "router_z_loss_mlp": 0.6171875, + "step": 595, + "time_per_iteration": 4.779642105102539 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075224, + "balance_loss_mlp": 1.01189995, + "epoch": 0.1146594844170835, + "flos": 495913388544.0, + "grad_norm": 0.038757735955663945, + "language_loss": 0.90035105, + "learning_rate": 0.0009813268533395648, + "loss": 0.91110331, + "num_input_tokens_seen": 49260960, + "router_z_loss_mlp": 0.6328125, + "step": 596, + "time_per_iteration": 2.5933825969696045 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082133, + "balance_loss_mlp": 1.01895213, + "epoch": 0.11485186610234706, + "flos": 475791531264.0, + "grad_norm": 0.0538004660752225, + "language_loss": 0.90474582, + "learning_rate": 0.0009812424142733073, + "loss": 0.9155671, + "num_input_tokens_seen": 49327616, + "router_z_loss_mlp": 0.63134766, + "step": 597, + "time_per_iteration": 2.528027296066284 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073976, + "balance_loss_mlp": 1.01089013, + "epoch": 0.11504424778761062, + "flos": 732620313600.0, + "grad_norm": 0.03283482462688361, + "language_loss": 0.87953097, + "learning_rate": 0.000981157788372175, + "loss": 0.89027071, + "num_input_tokens_seen": 49412864, + "router_z_loss_mlp": 0.63037109, + "step": 598, + "time_per_iteration": 3.008469343185425 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074784, + "balance_loss_mlp": 1.01160276, + "epoch": 0.11523662947287418, + "flos": 546963610368.0, + "grad_norm": 0.037424804687157906, + "language_loss": 0.91041148, + "learning_rate": 0.0009810729756690223, + "loss": 0.92115927, + "num_input_tokens_seen": 49483584, + "router_z_loss_mlp": 0.63134766, + "step": 599, + "time_per_iteration": 2.75840163230896 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077312, + "balance_loss_mlp": 1.01408339, + "epoch": 0.11542901115813775, + "flos": 776388759552.0, + "grad_norm": 0.04126969924944996, + "language_loss": 0.9391377, + "learning_rate": 0.0009809879761967766, + "loss": 0.94991082, + "num_input_tokens_seen": 49563568, + "router_z_loss_mlp": 0.63183594, + "step": 600, + "time_per_iteration": 2.9511778354644775 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081843, + "balance_loss_mlp": 1.01828074, + "epoch": 0.11562139284340131, + "flos": 732213990144.0, + "grad_norm": 0.05544181306164312, + "language_loss": 0.88981479, + "learning_rate": 0.0009809027899884378, + "loss": 0.90063322, + "num_input_tokens_seen": 49640800, + "router_z_loss_mlp": 0.63525391, + "step": 601, + "time_per_iteration": 2.888591766357422 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076176, + "balance_loss_mlp": 1.01256609, + "epoch": 0.11581377452866487, + "flos": 537040714752.0, + "grad_norm": 0.03483284203155477, + "language_loss": 0.90335476, + "learning_rate": 0.0009808174170770779, + "loss": 0.9141165, + "num_input_tokens_seen": 49721872, + "router_z_loss_mlp": 0.63574219, + "step": 602, + "time_per_iteration": 2.7933802604675293 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073479, + "balance_loss_mlp": 1.01263428, + "epoch": 0.11600615621392843, + "flos": 1559214555648.0, + "grad_norm": 0.012041981792172347, + "language_loss": 0.84898245, + "learning_rate": 0.0009807318574958418, + "loss": 0.85971725, + "num_input_tokens_seen": 49951472, + "router_z_loss_mlp": 0.60742188, + "step": 603, + "time_per_iteration": 4.875667572021484 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079811, + "balance_loss_mlp": 1.01658237, + "epoch": 0.116198537899192, + "flos": 538468688640.0, + "grad_norm": 0.046063141341509364, + "language_loss": 0.95944118, + "learning_rate": 0.0009806461112779462, + "loss": 0.97023928, + "num_input_tokens_seen": 50021136, + "router_z_loss_mlp": 0.63183594, + "step": 604, + "time_per_iteration": 2.708552360534668 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077772, + "balance_loss_mlp": 1.01444781, + "epoch": 0.11639091958445556, + "flos": 455137950720.0, + "grad_norm": 0.05737724930332189, + "language_loss": 0.90764457, + "learning_rate": 0.0009805601784566814, + "loss": 0.91842222, + "num_input_tokens_seen": 50083888, + "router_z_loss_mlp": 0.6328125, + "step": 605, + "time_per_iteration": 2.545696496963501 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076475, + "balance_loss_mlp": 1.01329422, + "epoch": 0.11658330126971912, + "flos": 556152593664.0, + "grad_norm": 0.04016687987230144, + "language_loss": 0.97276044, + "learning_rate": 0.0009804740590654089, + "loss": 0.98352522, + "num_input_tokens_seen": 50151744, + "router_z_loss_mlp": 0.63134766, + "step": 606, + "time_per_iteration": 2.6464574337005615 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077132, + "balance_loss_mlp": 1.01399851, + "epoch": 0.11677568295498268, + "flos": 717601827840.0, + "grad_norm": 0.0453344941203476, + "language_loss": 0.91881627, + "learning_rate": 0.0009803877531375635, + "loss": 0.9295876, + "num_input_tokens_seen": 50221248, + "router_z_loss_mlp": 0.63085938, + "step": 607, + "time_per_iteration": 2.8467392921447754 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074228, + "balance_loss_mlp": 1.0111903, + "epoch": 0.11696806464024626, + "flos": 610899600384.0, + "grad_norm": 0.04469679718872237, + "language_loss": 0.92976171, + "learning_rate": 0.0009803012607066523, + "loss": 0.94050401, + "num_input_tokens_seen": 50293792, + "router_z_loss_mlp": 0.62988281, + "step": 608, + "time_per_iteration": 2.7587811946868896 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073203, + "balance_loss_mlp": 1.01030838, + "epoch": 0.11716044632550981, + "flos": 521416628736.0, + "grad_norm": 0.04044307397502579, + "language_loss": 0.91207683, + "learning_rate": 0.0009802145818062543, + "loss": 0.92280889, + "num_input_tokens_seen": 50367760, + "router_z_loss_mlp": 0.62841797, + "step": 609, + "time_per_iteration": 2.7623538970947266 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107199, + "balance_loss_mlp": 1.00919068, + "epoch": 0.11735282801077337, + "flos": 508489064448.0, + "grad_norm": 0.04251091083777229, + "language_loss": 0.93763256, + "learning_rate": 0.0009801277164700212, + "loss": 0.9483524, + "num_input_tokens_seen": 50435664, + "router_z_loss_mlp": 0.62744141, + "step": 610, + "time_per_iteration": 2.6250369548797607 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079805, + "balance_loss_mlp": 1.0171963, + "epoch": 0.11754520969603693, + "flos": 687837031680.0, + "grad_norm": 0.044835447829723894, + "language_loss": 0.91796255, + "learning_rate": 0.0009800406647316776, + "loss": 0.92876053, + "num_input_tokens_seen": 50514144, + "router_z_loss_mlp": 0.62548828, + "step": 611, + "time_per_iteration": 2.81438946723938 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01058731, + "balance_loss_mlp": 0.99807739, + "epoch": 0.1177375913813005, + "flos": 1545759158784.0, + "grad_norm": 0.00493114536612535, + "language_loss": 0.76914459, + "learning_rate": 0.0009799534266250196, + "loss": 0.77973187, + "num_input_tokens_seen": 50738448, + "router_z_loss_mlp": 0.60546875, + "step": 612, + "time_per_iteration": 4.795796871185303 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073024, + "balance_loss_mlp": 1.01008153, + "epoch": 0.11792997306656407, + "flos": 521538137856.0, + "grad_norm": 0.049162221556570344, + "language_loss": 0.91035461, + "learning_rate": 0.000979866002183916, + "loss": 0.92108488, + "num_input_tokens_seen": 50809328, + "router_z_loss_mlp": 0.62890625, + "step": 613, + "time_per_iteration": 2.6470768451690674 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071385, + "balance_loss_mlp": 1.00820458, + "epoch": 0.11812235475182763, + "flos": 667489652736.0, + "grad_norm": 0.0453482214384289, + "language_loss": 0.92239928, + "learning_rate": 0.0009797783914423082, + "loss": 0.93311322, + "num_input_tokens_seen": 50887728, + "router_z_loss_mlp": 0.63134766, + "step": 614, + "time_per_iteration": 2.8020856380462646 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107539, + "balance_loss_mlp": 1.01220894, + "epoch": 0.11831473643709119, + "flos": 622505148672.0, + "grad_norm": 0.04034391423157231, + "language_loss": 0.86097217, + "learning_rate": 0.0009796905944342094, + "loss": 0.87172604, + "num_input_tokens_seen": 50966160, + "router_z_loss_mlp": 0.63134766, + "step": 615, + "time_per_iteration": 2.839617967605591 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079046, + "balance_loss_mlp": 1.0160079, + "epoch": 0.11850711812235475, + "flos": 457695466752.0, + "grad_norm": 0.03330066749319758, + "language_loss": 0.89949274, + "learning_rate": 0.0009796026111937057, + "loss": 0.91028321, + "num_input_tokens_seen": 51035712, + "router_z_loss_mlp": 0.62988281, + "step": 616, + "time_per_iteration": 2.6211540699005127 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077601, + "balance_loss_mlp": 1.0150882, + "epoch": 0.11869949980761832, + "flos": 514928057856.0, + "grad_norm": 0.034464018290856886, + "language_loss": 0.90251315, + "learning_rate": 0.0009795144417549552, + "loss": 0.91328913, + "num_input_tokens_seen": 51108656, + "router_z_loss_mlp": 0.62451172, + "step": 617, + "time_per_iteration": 2.6946897506713867 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080332, + "balance_loss_mlp": 1.01815259, + "epoch": 0.11889188149288188, + "flos": 536157103104.0, + "grad_norm": 0.035314864293198016, + "language_loss": 0.91583192, + "learning_rate": 0.0009794260861521883, + "loss": 0.92663527, + "num_input_tokens_seen": 51185552, + "router_z_loss_mlp": 0.62109375, + "step": 618, + "time_per_iteration": 2.77822208404541 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081689, + "balance_loss_mlp": 1.01979554, + "epoch": 0.11908426317814544, + "flos": 499645166592.0, + "grad_norm": 0.042334404758790994, + "language_loss": 0.88659471, + "learning_rate": 0.0009793375444197075, + "loss": 0.89741158, + "num_input_tokens_seen": 51255808, + "router_z_loss_mlp": 0.61816406, + "step": 619, + "time_per_iteration": 2.6199400424957275 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086152, + "balance_loss_mlp": 1.02416277, + "epoch": 0.119276644863409, + "flos": 661068155904.0, + "grad_norm": 0.043937618111938345, + "language_loss": 0.86906028, + "learning_rate": 0.000979248816591888, + "loss": 0.87992179, + "num_input_tokens_seen": 51329408, + "router_z_loss_mlp": 0.61914062, + "step": 620, + "time_per_iteration": 2.789858341217041 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081653, + "balance_loss_mlp": 1.01947308, + "epoch": 0.11946902654867257, + "flos": 760153237248.0, + "grad_norm": 0.04701199265522289, + "language_loss": 0.87992656, + "learning_rate": 0.0009791599027031766, + "loss": 0.89074314, + "num_input_tokens_seen": 51408784, + "router_z_loss_mlp": 0.62109375, + "step": 621, + "time_per_iteration": 3.026487350463867 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074317, + "balance_loss_mlp": 1.01223314, + "epoch": 0.11966140823393613, + "flos": 682214575872.0, + "grad_norm": 0.0506686420393155, + "language_loss": 0.88143325, + "learning_rate": 0.0009790708027880932, + "loss": 0.89217639, + "num_input_tokens_seen": 51482592, + "router_z_loss_mlp": 0.62011719, + "step": 622, + "time_per_iteration": 2.8321774005889893 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081512, + "balance_loss_mlp": 1.02143097, + "epoch": 0.11985378991919969, + "flos": 1454300938752.0, + "grad_norm": 0.023212611497014573, + "language_loss": 0.77427292, + "learning_rate": 0.0009789815168812293, + "loss": 0.78508806, + "num_input_tokens_seen": 51712240, + "router_z_loss_mlp": 0.59960938, + "step": 623, + "time_per_iteration": 4.862462759017944 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071261, + "balance_loss_mlp": 1.00936747, + "epoch": 0.12004617160446325, + "flos": 528899626752.0, + "grad_norm": 0.04437858339694968, + "language_loss": 0.95209736, + "learning_rate": 0.0009788920450172487, + "loss": 0.96280998, + "num_input_tokens_seen": 51781440, + "router_z_loss_mlp": 0.61816406, + "step": 624, + "time_per_iteration": 2.630764961242676 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078232, + "balance_loss_mlp": 1.01619518, + "epoch": 0.12023855328972682, + "flos": 475177182720.0, + "grad_norm": 0.048047229360432486, + "language_loss": 0.92430472, + "learning_rate": 0.0009788023872308875, + "loss": 0.93508708, + "num_input_tokens_seen": 51845424, + "router_z_loss_mlp": 0.61962891, + "step": 625, + "time_per_iteration": 2.5534780025482178 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076523, + "balance_loss_mlp": 1.01682281, + "epoch": 0.12043093497499038, + "flos": 1535054718720.0, + "grad_norm": 0.022021305117703366, + "language_loss": 0.75428998, + "learning_rate": 0.0009787125435569539, + "loss": 0.7650553, + "num_input_tokens_seen": 52076496, + "router_z_loss_mlp": 0.59570312, + "step": 626, + "time_per_iteration": 4.738527536392212 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108475, + "balance_loss_mlp": 1.023, + "epoch": 0.12062331666025394, + "flos": 540915389184.0, + "grad_norm": 0.04663901515177362, + "language_loss": 0.9603011, + "learning_rate": 0.0009786225140303285, + "loss": 0.97114861, + "num_input_tokens_seen": 52143072, + "router_z_loss_mlp": 0.61669922, + "step": 627, + "time_per_iteration": 2.634160280227661 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085928, + "balance_loss_mlp": 1.02427304, + "epoch": 0.1208156983455175, + "flos": 513000441600.0, + "grad_norm": 0.042540459475059536, + "language_loss": 0.94019556, + "learning_rate": 0.0009785322986859634, + "loss": 0.95105481, + "num_input_tokens_seen": 52211888, + "router_z_loss_mlp": 0.61572266, + "step": 628, + "time_per_iteration": 2.681070327758789 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078772, + "balance_loss_mlp": 1.01725972, + "epoch": 0.12100808003078108, + "flos": 597590012160.0, + "grad_norm": 0.03866803919075334, + "language_loss": 0.94614279, + "learning_rate": 0.0009784418975588838, + "loss": 0.95693052, + "num_input_tokens_seen": 52283696, + "router_z_loss_mlp": 0.61425781, + "step": 629, + "time_per_iteration": 2.7337839603424072 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073027, + "balance_loss_mlp": 1.01132393, + "epoch": 0.12120046171604464, + "flos": 524067463680.0, + "grad_norm": 0.03279843121618067, + "language_loss": 0.94581258, + "learning_rate": 0.0009783513106841862, + "loss": 0.95654285, + "num_input_tokens_seen": 52358624, + "router_z_loss_mlp": 0.61621094, + "step": 630, + "time_per_iteration": 2.702615737915039 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080086, + "balance_loss_mlp": 1.01981354, + "epoch": 0.1213928434013082, + "flos": 1557910036224.0, + "grad_norm": 0.01502333088768157, + "language_loss": 0.76732707, + "learning_rate": 0.00097826053809704, + "loss": 0.77812791, + "num_input_tokens_seen": 52591248, + "router_z_loss_mlp": 0.6015625, + "step": 631, + "time_per_iteration": 4.998409032821655 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080339, + "balance_loss_mlp": 1.01835024, + "epoch": 0.12158522508657175, + "flos": 496388731392.0, + "grad_norm": 0.04174070683076465, + "language_loss": 0.89320499, + "learning_rate": 0.0009781695798326854, + "loss": 0.90400839, + "num_input_tokens_seen": 52659920, + "router_z_loss_mlp": 0.61914062, + "step": 632, + "time_per_iteration": 2.5908379554748535 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079195, + "balance_loss_mlp": 1.01744485, + "epoch": 0.12177760677183531, + "flos": 476590572288.0, + "grad_norm": 0.04165368210868703, + "language_loss": 0.89744723, + "learning_rate": 0.0009780784359264365, + "loss": 0.90823919, + "num_input_tokens_seen": 52728832, + "router_z_loss_mlp": 0.61669922, + "step": 633, + "time_per_iteration": 2.689202070236206 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073334, + "balance_loss_mlp": 1.01382446, + "epoch": 0.12196998845709889, + "flos": 1471787512320.0, + "grad_norm": 0.011333314510513573, + "language_loss": 0.74188697, + "learning_rate": 0.0009779871064136778, + "loss": 0.75262028, + "num_input_tokens_seen": 52949776, + "router_z_loss_mlp": 0.59375, + "step": 634, + "time_per_iteration": 4.762145757675171 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073337, + "balance_loss_mlp": 1.01187229, + "epoch": 0.12216237014236245, + "flos": 587749741824.0, + "grad_norm": 0.03178889939160208, + "language_loss": 0.88649213, + "learning_rate": 0.000977895591329867, + "loss": 0.8972255, + "num_input_tokens_seen": 53027184, + "router_z_loss_mlp": 0.61376953, + "step": 635, + "time_per_iteration": 2.7996504306793213 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075051, + "balance_loss_mlp": 1.01372933, + "epoch": 0.12235475182762601, + "flos": 599107414272.0, + "grad_norm": 0.038321985001081305, + "language_loss": 0.88459468, + "learning_rate": 0.000977803890710533, + "loss": 0.89534515, + "num_input_tokens_seen": 53101072, + "router_z_loss_mlp": 0.61230469, + "step": 636, + "time_per_iteration": 2.7200405597686768 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072884, + "balance_loss_mlp": 1.0117538, + "epoch": 0.12254713351288957, + "flos": 498761554944.0, + "grad_norm": 0.03313527469264444, + "language_loss": 0.94808865, + "learning_rate": 0.0009777120045912774, + "loss": 0.95881748, + "num_input_tokens_seen": 53172992, + "router_z_loss_mlp": 0.61035156, + "step": 637, + "time_per_iteration": 2.6253507137298584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072019, + "balance_loss_mlp": 1.01084125, + "epoch": 0.12273951519815314, + "flos": 606981184512.0, + "grad_norm": 0.04065251745031248, + "language_loss": 0.91558111, + "learning_rate": 0.0009776199330077736, + "loss": 0.92630136, + "num_input_tokens_seen": 53248256, + "router_z_loss_mlp": 0.61083984, + "step": 638, + "time_per_iteration": 2.724416732788086 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069779, + "balance_loss_mlp": 1.0086484, + "epoch": 0.1229318968834167, + "flos": 598985905152.0, + "grad_norm": 0.04427923240085457, + "language_loss": 0.94062102, + "learning_rate": 0.0009775276759957667, + "loss": 0.9513188, + "num_input_tokens_seen": 53318960, + "router_z_loss_mlp": 0.61035156, + "step": 639, + "time_per_iteration": 2.756307601928711 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070032, + "balance_loss_mlp": 1.00851989, + "epoch": 0.12312427856868026, + "flos": 679589985792.0, + "grad_norm": 0.04435656949952303, + "language_loss": 0.91938198, + "learning_rate": 0.0009774352335910745, + "loss": 0.93008226, + "num_input_tokens_seen": 53389120, + "router_z_loss_mlp": 0.61425781, + "step": 640, + "time_per_iteration": 2.8135974407196045 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072128, + "balance_loss_mlp": 1.01095021, + "epoch": 0.12331666025394382, + "flos": 610044178944.0, + "grad_norm": 0.03352322480141845, + "language_loss": 0.95842457, + "learning_rate": 0.000977342605829586, + "loss": 0.96914589, + "num_input_tokens_seen": 53459056, + "router_z_loss_mlp": 0.61083984, + "step": 641, + "time_per_iteration": 2.734373092651367 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107341, + "balance_loss_mlp": 1.01208854, + "epoch": 0.12350904193920739, + "flos": 763841273856.0, + "grad_norm": 0.04166007448412618, + "language_loss": 0.87458932, + "learning_rate": 0.0009772497927472623, + "loss": 0.88532341, + "num_input_tokens_seen": 53541552, + "router_z_loss_mlp": 0.61230469, + "step": 642, + "time_per_iteration": 3.069495677947998 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107346, + "balance_loss_mlp": 1.01199543, + "epoch": 0.12370142362447095, + "flos": 542050767360.0, + "grad_norm": 0.04189965725350253, + "language_loss": 0.86664522, + "learning_rate": 0.0009771567943801368, + "loss": 0.87737978, + "num_input_tokens_seen": 53611520, + "router_z_loss_mlp": 0.61376953, + "step": 643, + "time_per_iteration": 2.6783955097198486 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071185, + "balance_loss_mlp": 1.01000655, + "epoch": 0.12389380530973451, + "flos": 549253808640.0, + "grad_norm": 0.03907898995026106, + "language_loss": 0.90534973, + "learning_rate": 0.0009770636107643152, + "loss": 0.91606158, + "num_input_tokens_seen": 53683888, + "router_z_loss_mlp": 0.61083984, + "step": 644, + "time_per_iteration": 2.7792532444000244 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107442, + "balance_loss_mlp": 1.01343274, + "epoch": 0.12408618699499807, + "flos": 541353793536.0, + "grad_norm": 0.03775088580197231, + "language_loss": 0.89077818, + "learning_rate": 0.0009769702419359738, + "loss": 0.9015224, + "num_input_tokens_seen": 53751888, + "router_z_loss_mlp": 0.60888672, + "step": 645, + "time_per_iteration": 2.6660075187683105 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071725, + "balance_loss_mlp": 1.01083338, + "epoch": 0.12427856868026164, + "flos": 747160544256.0, + "grad_norm": 0.03491310842571494, + "language_loss": 0.90435565, + "learning_rate": 0.000976876687931362, + "loss": 0.91507292, + "num_input_tokens_seen": 53827648, + "router_z_loss_mlp": 0.60791016, + "step": 646, + "time_per_iteration": 3.028578758239746 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074215, + "balance_loss_mlp": 1.01332271, + "epoch": 0.1244709503655252, + "flos": 534745658880.0, + "grad_norm": 0.04739554944994068, + "language_loss": 0.86433625, + "learning_rate": 0.0009767829487868005, + "loss": 0.87507832, + "num_input_tokens_seen": 53896400, + "router_z_loss_mlp": 0.60791016, + "step": 647, + "time_per_iteration": 2.6323471069335938 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075713, + "balance_loss_mlp": 1.01472592, + "epoch": 0.12466333205078876, + "flos": 509112161280.0, + "grad_norm": 0.0390766896094967, + "language_loss": 0.89632404, + "learning_rate": 0.000976689024538682, + "loss": 0.90708113, + "num_input_tokens_seen": 53965904, + "router_z_loss_mlp": 0.60888672, + "step": 648, + "time_per_iteration": 2.6233997344970703 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069043, + "balance_loss_mlp": 1.00819838, + "epoch": 0.12485571373605232, + "flos": 682640341248.0, + "grad_norm": 0.04106035596266842, + "language_loss": 0.87981439, + "learning_rate": 0.0009765949152234716, + "loss": 0.89050484, + "num_input_tokens_seen": 54049792, + "router_z_loss_mlp": 0.60742188, + "step": 649, + "time_per_iteration": 2.9135711193084717 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064964, + "balance_loss_mlp": 1.00659943, + "epoch": 0.1250480954213159, + "flos": 1333201377024.0, + "grad_norm": 0.013063081234142807, + "language_loss": 0.78686082, + "learning_rate": 0.0009765006208777055, + "loss": 0.79751045, + "num_input_tokens_seen": 54262432, + "router_z_loss_mlp": 0.58203125, + "step": 650, + "time_per_iteration": 4.696362495422363 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069932, + "balance_loss_mlp": 1.0093261, + "epoch": 0.12524047710657946, + "flos": 940198178304.0, + "grad_norm": 0.03723688894295025, + "language_loss": 0.82869852, + "learning_rate": 0.0009764061415379919, + "loss": 0.83939779, + "num_input_tokens_seen": 54351568, + "router_z_loss_mlp": 0.60498047, + "step": 651, + "time_per_iteration": 3.287029504776001 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071729, + "balance_loss_mlp": 1.01078951, + "epoch": 0.12543285879184302, + "flos": 514901812992.0, + "grad_norm": 0.03842788822410913, + "language_loss": 0.90123397, + "learning_rate": 0.0009763114772410109, + "loss": 0.91195124, + "num_input_tokens_seen": 54418944, + "router_z_loss_mlp": 0.60839844, + "step": 652, + "time_per_iteration": 2.5726470947265625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071215, + "balance_loss_mlp": 1.01075244, + "epoch": 0.12562524047710658, + "flos": 719684001024.0, + "grad_norm": 0.03790395950388449, + "language_loss": 0.88320071, + "learning_rate": 0.0009762166280235146, + "loss": 0.89391285, + "num_input_tokens_seen": 54495312, + "router_z_loss_mlp": 0.60351562, + "step": 653, + "time_per_iteration": 2.9728682041168213 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073992, + "balance_loss_mlp": 1.01372027, + "epoch": 0.12581762216237014, + "flos": 564799160064.0, + "grad_norm": 0.039966468352906216, + "language_loss": 0.88308495, + "learning_rate": 0.0009761215939223267, + "loss": 0.89382488, + "num_input_tokens_seen": 54566832, + "router_z_loss_mlp": 0.6015625, + "step": 654, + "time_per_iteration": 2.7552366256713867 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071616, + "balance_loss_mlp": 1.01100981, + "epoch": 0.1260100038476337, + "flos": 482901253632.0, + "grad_norm": 0.045851790315233704, + "language_loss": 0.87049586, + "learning_rate": 0.0009760263749743428, + "loss": 0.88121206, + "num_input_tokens_seen": 54632128, + "router_z_loss_mlp": 0.60498047, + "step": 655, + "time_per_iteration": 2.5859339237213135 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073482, + "balance_loss_mlp": 1.01301908, + "epoch": 0.12620238553289725, + "flos": 576702161664.0, + "grad_norm": 0.03680601760412016, + "language_loss": 0.91127861, + "learning_rate": 0.0009759309712165299, + "loss": 0.9220134, + "num_input_tokens_seen": 54707600, + "router_z_loss_mlp": 0.60351562, + "step": 656, + "time_per_iteration": 2.7411043643951416 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069023, + "balance_loss_mlp": 1.00841653, + "epoch": 0.12639476721816084, + "flos": 532186197504.0, + "grad_norm": 0.050748048847022796, + "language_loss": 0.94208288, + "learning_rate": 0.0009758353826859272, + "loss": 0.95277309, + "num_input_tokens_seen": 54776704, + "router_z_loss_mlp": 0.60498047, + "step": 657, + "time_per_iteration": 2.5851681232452393 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071924, + "balance_loss_mlp": 1.01117456, + "epoch": 0.1265871489034244, + "flos": 691232472576.0, + "grad_norm": 0.04052834214006204, + "language_loss": 0.90056133, + "learning_rate": 0.0009757396094196456, + "loss": 0.91128063, + "num_input_tokens_seen": 54851744, + "router_z_loss_mlp": 0.60644531, + "step": 658, + "time_per_iteration": 2.9119739532470703 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071395, + "balance_loss_mlp": 1.01083672, + "epoch": 0.12677953058868796, + "flos": 538243166976.0, + "grad_norm": 0.03305987481805703, + "language_loss": 0.85138786, + "learning_rate": 0.0009756436514548673, + "loss": 0.86210179, + "num_input_tokens_seen": 54932576, + "router_z_loss_mlp": 0.60449219, + "step": 659, + "time_per_iteration": 2.8146860599517822 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070631, + "balance_loss_mlp": 1.01021552, + "epoch": 0.12697191227395152, + "flos": 520120857600.0, + "grad_norm": 0.03322369158928612, + "language_loss": 0.89052176, + "learning_rate": 0.0009755475088288466, + "loss": 0.90122807, + "num_input_tokens_seen": 55007296, + "router_z_loss_mlp": 0.60302734, + "step": 660, + "time_per_iteration": 2.7092652320861816 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070351, + "balance_loss_mlp": 1.01007843, + "epoch": 0.12716429395921508, + "flos": 567666768384.0, + "grad_norm": 0.0427017471912124, + "language_loss": 0.91535795, + "learning_rate": 0.0009754511815789095, + "loss": 0.92606151, + "num_input_tokens_seen": 55079312, + "router_z_loss_mlp": 0.6015625, + "step": 661, + "time_per_iteration": 2.790198564529419 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068315, + "balance_loss_mlp": 1.00809085, + "epoch": 0.12735667564447864, + "flos": 515142885888.0, + "grad_norm": 0.0409493229321676, + "language_loss": 0.8685838, + "learning_rate": 0.0009753546697424533, + "loss": 0.87926698, + "num_input_tokens_seen": 55151824, + "router_z_loss_mlp": 0.60107422, + "step": 662, + "time_per_iteration": 2.6784565448760986 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070378, + "balance_loss_mlp": 1.01020074, + "epoch": 0.1275490573297422, + "flos": 542321975808.0, + "grad_norm": 0.039351291895580044, + "language_loss": 0.91270494, + "learning_rate": 0.0009752579733569475, + "loss": 0.92340875, + "num_input_tokens_seen": 55224368, + "router_z_loss_mlp": 0.60058594, + "step": 663, + "time_per_iteration": 2.679379940032959 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071762, + "balance_loss_mlp": 1.01358795, + "epoch": 0.12774143901500576, + "flos": 1562027728896.0, + "grad_norm": 0.016936801864205438, + "language_loss": 0.74881387, + "learning_rate": 0.0009751610924599328, + "loss": 0.7595315, + "num_input_tokens_seen": 55453584, + "router_z_loss_mlp": 0.58007812, + "step": 664, + "time_per_iteration": 4.936127424240112 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070961, + "balance_loss_mlp": 1.01092672, + "epoch": 0.12793382070026935, + "flos": 614874396672.0, + "grad_norm": 0.047422479810277696, + "language_loss": 0.90634137, + "learning_rate": 0.0009750640270890217, + "loss": 0.91705096, + "num_input_tokens_seen": 55528000, + "router_z_loss_mlp": 0.59912109, + "step": 665, + "time_per_iteration": 2.712202548980713 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073503, + "balance_loss_mlp": 1.01361179, + "epoch": 0.1281262023855329, + "flos": 709118566656.0, + "grad_norm": 0.04721256261198653, + "language_loss": 0.97348696, + "learning_rate": 0.0009749667772818983, + "loss": 0.98422199, + "num_input_tokens_seen": 55612416, + "router_z_loss_mlp": 0.59765625, + "step": 666, + "time_per_iteration": 2.959563732147217 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065521, + "balance_loss_mlp": 1.00791931, + "epoch": 0.12831858407079647, + "flos": 1428185295360.0, + "grad_norm": 0.00958948420866419, + "language_loss": 0.76935941, + "learning_rate": 0.0009748693430763185, + "loss": 0.78001463, + "num_input_tokens_seen": 55843664, + "router_z_loss_mlp": 0.57421875, + "step": 667, + "time_per_iteration": 4.823887825012207 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071916, + "balance_loss_mlp": 1.01259768, + "epoch": 0.12851096575606002, + "flos": 450019027968.0, + "grad_norm": 0.04331482152431362, + "language_loss": 0.96237415, + "learning_rate": 0.0009747717245101093, + "loss": 0.97309327, + "num_input_tokens_seen": 55909072, + "router_z_loss_mlp": 0.59179688, + "step": 668, + "time_per_iteration": 2.5234646797180176 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071255, + "balance_loss_mlp": 1.01193655, + "epoch": 0.12870334744132358, + "flos": 480910454016.0, + "grad_norm": 0.040015395826151615, + "language_loss": 0.86231172, + "learning_rate": 0.00097467392162117, + "loss": 0.87302423, + "num_input_tokens_seen": 55978544, + "router_z_loss_mlp": 0.59179688, + "step": 669, + "time_per_iteration": 2.620121717453003 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073025, + "balance_loss_mlp": 1.01342034, + "epoch": 0.12889572912658714, + "flos": 640152115200.0, + "grad_norm": 0.03307407171369126, + "language_loss": 0.91950834, + "learning_rate": 0.0009745759344474708, + "loss": 0.9302386, + "num_input_tokens_seen": 56054144, + "router_z_loss_mlp": 0.59472656, + "step": 670, + "time_per_iteration": 2.834406852722168 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070894, + "balance_loss_mlp": 1.01114607, + "epoch": 0.1290881108118507, + "flos": 510955206912.0, + "grad_norm": 0.03904079329345599, + "language_loss": 0.90752548, + "learning_rate": 0.0009744777630270536, + "loss": 0.91823441, + "num_input_tokens_seen": 56120960, + "router_z_loss_mlp": 0.59619141, + "step": 671, + "time_per_iteration": 2.5841259956359863 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069609, + "balance_loss_mlp": 1.00967062, + "epoch": 0.12928049249711426, + "flos": 672291680256.0, + "grad_norm": 0.0427916369984872, + "language_loss": 0.94394779, + "learning_rate": 0.000974379407398032, + "loss": 0.95464385, + "num_input_tokens_seen": 56202560, + "router_z_loss_mlp": 0.59814453, + "step": 672, + "time_per_iteration": 2.8698208332061768 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072071, + "balance_loss_mlp": 1.0120368, + "epoch": 0.12947287418237785, + "flos": 795000017664.0, + "grad_norm": 0.03399258645873994, + "language_loss": 0.83039552, + "learning_rate": 0.0009742808675985913, + "loss": 0.84111625, + "num_input_tokens_seen": 56289456, + "router_z_loss_mlp": 0.59912109, + "step": 673, + "time_per_iteration": 3.1018688678741455 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067279, + "balance_loss_mlp": 1.00729334, + "epoch": 0.1296652558676414, + "flos": 486448339200.0, + "grad_norm": 0.039807509100232605, + "language_loss": 0.91899526, + "learning_rate": 0.0009741821436669876, + "loss": 0.92966807, + "num_input_tokens_seen": 56354480, + "router_z_loss_mlp": 0.59863281, + "step": 674, + "time_per_iteration": 2.6348536014556885 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068278, + "balance_loss_mlp": 1.00853038, + "epoch": 0.12985763755290497, + "flos": 454393344768.0, + "grad_norm": 0.044170807310258554, + "language_loss": 0.93403888, + "learning_rate": 0.0009740832356415492, + "loss": 0.9447217, + "num_input_tokens_seen": 56418944, + "router_z_loss_mlp": 0.59619141, + "step": 675, + "time_per_iteration": 2.483262538909912 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072639, + "balance_loss_mlp": 1.01265311, + "epoch": 0.13005001923816853, + "flos": 826435805952.0, + "grad_norm": 0.043859966784303914, + "language_loss": 0.89693773, + "learning_rate": 0.0009739841435606756, + "loss": 0.90766412, + "num_input_tokens_seen": 56492368, + "router_z_loss_mlp": 0.59863281, + "step": 676, + "time_per_iteration": 2.992385149002075 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066961, + "balance_loss_mlp": 1.00726056, + "epoch": 0.1302424009234321, + "flos": 532481705472.0, + "grad_norm": 0.03559705023164985, + "language_loss": 0.91210669, + "learning_rate": 0.0009738848674628377, + "loss": 0.92277622, + "num_input_tokens_seen": 56568128, + "router_z_loss_mlp": 0.59570312, + "step": 677, + "time_per_iteration": 2.766364574432373 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106721, + "balance_loss_mlp": 1.00765288, + "epoch": 0.13043478260869565, + "flos": 526917575424.0, + "grad_norm": 0.03838556287658105, + "language_loss": 0.90382779, + "learning_rate": 0.000973785407386578, + "loss": 0.91449988, + "num_input_tokens_seen": 56646448, + "router_z_loss_mlp": 0.59423828, + "step": 678, + "time_per_iteration": 2.772854804992676 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070658, + "balance_loss_mlp": 1.01076782, + "epoch": 0.1306271642939592, + "flos": 627417991680.0, + "grad_norm": 0.03509098765963207, + "language_loss": 0.88142246, + "learning_rate": 0.0009736857633705103, + "loss": 0.89212906, + "num_input_tokens_seen": 56732080, + "router_z_loss_mlp": 0.59765625, + "step": 679, + "time_per_iteration": 2.851567268371582 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075866, + "balance_loss_mlp": 1.01602292, + "epoch": 0.13081954597922277, + "flos": 551841460224.0, + "grad_norm": 0.03859467755451503, + "language_loss": 0.94306064, + "learning_rate": 0.0009735859354533196, + "loss": 0.95381933, + "num_input_tokens_seen": 56804432, + "router_z_loss_mlp": 0.59716797, + "step": 680, + "time_per_iteration": 2.6908183097839355 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070228, + "balance_loss_mlp": 1.01038456, + "epoch": 0.13101192766448633, + "flos": 537956407296.0, + "grad_norm": 0.04695623305024525, + "language_loss": 0.92768431, + "learning_rate": 0.0009734859236737628, + "loss": 0.93838656, + "num_input_tokens_seen": 56872512, + "router_z_loss_mlp": 0.59716797, + "step": 681, + "time_per_iteration": 2.618556261062622 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065875, + "balance_loss_mlp": 1.00631785, + "epoch": 0.13120430934974991, + "flos": 504514268160.0, + "grad_norm": 0.03771498494962771, + "language_loss": 0.94425803, + "learning_rate": 0.0009733857280706678, + "loss": 0.95491678, + "num_input_tokens_seen": 56940928, + "router_z_loss_mlp": 0.59423828, + "step": 682, + "time_per_iteration": 2.607445240020752 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068204, + "balance_loss_mlp": 1.00883758, + "epoch": 0.13139669103501347, + "flos": 615423616512.0, + "grad_norm": 0.040497909024236244, + "language_loss": 0.85748106, + "learning_rate": 0.000973285348682934, + "loss": 0.86816311, + "num_input_tokens_seen": 57012736, + "router_z_loss_mlp": 0.59228516, + "step": 683, + "time_per_iteration": 2.749258518218994 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064545, + "balance_loss_mlp": 1.00846863, + "epoch": 0.13158907272027703, + "flos": 1488218420736.0, + "grad_norm": 0.017735586482065788, + "language_loss": 0.77898371, + "learning_rate": 0.0009731847855495323, + "loss": 0.78962922, + "num_input_tokens_seen": 57243136, + "router_z_loss_mlp": 0.5625, + "step": 684, + "time_per_iteration": 4.792337894439697 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069614, + "balance_loss_mlp": 1.01053405, + "epoch": 0.1317814544055406, + "flos": 987119046912.0, + "grad_norm": 0.04121230716493085, + "language_loss": 0.86815995, + "learning_rate": 0.0009730840387095046, + "loss": 0.87885606, + "num_input_tokens_seen": 57336160, + "router_z_loss_mlp": 0.58935547, + "step": 685, + "time_per_iteration": 3.324737071990967 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068902, + "balance_loss_mlp": 1.00972676, + "epoch": 0.13197383609080415, + "flos": 612629885184.0, + "grad_norm": 0.03769323902360627, + "language_loss": 0.91733027, + "learning_rate": 0.0009729831082019642, + "loss": 0.92801929, + "num_input_tokens_seen": 57418976, + "router_z_loss_mlp": 0.59033203, + "step": 686, + "time_per_iteration": 2.883368968963623 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069054, + "balance_loss_mlp": 1.0096879, + "epoch": 0.1321662177760677, + "flos": 495555664128.0, + "grad_norm": 0.03344682577786829, + "language_loss": 0.90060174, + "learning_rate": 0.0009728819940660958, + "loss": 0.91129231, + "num_input_tokens_seen": 57490288, + "router_z_loss_mlp": 0.59228516, + "step": 687, + "time_per_iteration": 2.7771294116973877 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069131, + "balance_loss_mlp": 1.00971675, + "epoch": 0.13235859946133127, + "flos": 496844632320.0, + "grad_norm": 0.041743180753116546, + "language_loss": 0.8673048, + "learning_rate": 0.0009727806963411557, + "loss": 0.87799615, + "num_input_tokens_seen": 57556064, + "router_z_loss_mlp": 0.59277344, + "step": 688, + "time_per_iteration": 2.5879924297332764 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069414, + "balance_loss_mlp": 1.00971425, + "epoch": 0.13255098114659483, + "flos": 512768116992.0, + "grad_norm": 0.035278095584539565, + "language_loss": 0.88457793, + "learning_rate": 0.000972679215066471, + "loss": 0.89527214, + "num_input_tokens_seen": 57627248, + "router_z_loss_mlp": 0.59570312, + "step": 689, + "time_per_iteration": 2.6660075187683105 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067632, + "balance_loss_mlp": 1.00826621, + "epoch": 0.13274336283185842, + "flos": 548400332544.0, + "grad_norm": 0.043703661342582356, + "language_loss": 1.0036962, + "learning_rate": 0.0009725775502814401, + "loss": 1.01437247, + "num_input_tokens_seen": 57694832, + "router_z_loss_mlp": 0.59228516, + "step": 690, + "time_per_iteration": 2.580975294113159 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072547, + "balance_loss_mlp": 1.01313293, + "epoch": 0.13293574451712198, + "flos": 642003909120.0, + "grad_norm": 0.041755939912029, + "language_loss": 0.86554468, + "learning_rate": 0.0009724757020255327, + "loss": 0.87627012, + "num_input_tokens_seen": 57771776, + "router_z_loss_mlp": 0.59277344, + "step": 691, + "time_per_iteration": 2.895805835723877 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074323, + "balance_loss_mlp": 1.01533794, + "epoch": 0.13312812620238554, + "flos": 492470315520.0, + "grad_norm": 0.04584738151589033, + "language_loss": 0.8907311, + "learning_rate": 0.0009723736703382902, + "loss": 0.90147436, + "num_input_tokens_seen": 57836272, + "router_z_loss_mlp": 0.58837891, + "step": 692, + "time_per_iteration": 2.593621253967285 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073259, + "balance_loss_mlp": 1.01427472, + "epoch": 0.1333205078876491, + "flos": 509950086144.0, + "grad_norm": 0.042207641511909956, + "language_loss": 0.84734881, + "learning_rate": 0.0009722714552593244, + "loss": 0.85808134, + "num_input_tokens_seen": 57907232, + "router_z_loss_mlp": 0.58837891, + "step": 693, + "time_per_iteration": 2.6628286838531494 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069094, + "balance_loss_mlp": 1.01010931, + "epoch": 0.13351288957291266, + "flos": 419592251136.0, + "grad_norm": 0.04342856140262568, + "language_loss": 0.95545483, + "learning_rate": 0.000972169056828319, + "loss": 0.96614575, + "num_input_tokens_seen": 57969808, + "router_z_loss_mlp": 0.58837891, + "step": 694, + "time_per_iteration": 2.491511821746826 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068144, + "balance_loss_mlp": 1.00896847, + "epoch": 0.13370527125817622, + "flos": 617051834112.0, + "grad_norm": 0.03328111889388194, + "language_loss": 0.87929142, + "learning_rate": 0.0009720664750850283, + "loss": 0.88997287, + "num_input_tokens_seen": 58042944, + "router_z_loss_mlp": 0.59033203, + "step": 695, + "time_per_iteration": 2.802238941192627 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066112, + "balance_loss_mlp": 1.00693631, + "epoch": 0.13389765294343978, + "flos": 627170115840.0, + "grad_norm": 0.04111883948503256, + "language_loss": 0.94899035, + "learning_rate": 0.0009719637100692784, + "loss": 0.95965147, + "num_input_tokens_seen": 58116080, + "router_z_loss_mlp": 0.59033203, + "step": 696, + "time_per_iteration": 2.752716541290283 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066724, + "balance_loss_mlp": 1.00764382, + "epoch": 0.13409003462870334, + "flos": 610897655040.0, + "grad_norm": 0.03903466400724949, + "language_loss": 0.84625083, + "learning_rate": 0.0009718607618209661, + "loss": 0.85691804, + "num_input_tokens_seen": 58197616, + "router_z_loss_mlp": 0.58935547, + "step": 697, + "time_per_iteration": 2.8612687587738037 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067901, + "balance_loss_mlp": 1.00915492, + "epoch": 0.13428241631396692, + "flos": 685088987136.0, + "grad_norm": 0.03548160791415639, + "language_loss": 0.8885181, + "learning_rate": 0.0009717576303800595, + "loss": 0.89919716, + "num_input_tokens_seen": 58280480, + "router_z_loss_mlp": 0.5859375, + "step": 698, + "time_per_iteration": 3.046081304550171 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067451, + "balance_loss_mlp": 1.00870502, + "epoch": 0.13447479799923048, + "flos": 509819828736.0, + "grad_norm": 0.04099621387271608, + "language_loss": 0.8689754, + "learning_rate": 0.0009716543157865975, + "loss": 0.87964994, + "num_input_tokens_seen": 58352464, + "router_z_loss_mlp": 0.5859375, + "step": 699, + "time_per_iteration": 2.7116739749908447 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067445, + "balance_loss_mlp": 1.00893724, + "epoch": 0.13466717968449404, + "flos": 899060158464.0, + "grad_norm": 0.03800712734159662, + "language_loss": 0.8517018, + "learning_rate": 0.0009715508180806907, + "loss": 0.86237621, + "num_input_tokens_seen": 58437216, + "router_z_loss_mlp": 0.58349609, + "step": 700, + "time_per_iteration": 3.184324026107788 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066328, + "balance_loss_mlp": 1.00777256, + "epoch": 0.1348595613697576, + "flos": 991695552768.0, + "grad_norm": 0.036541360765650906, + "language_loss": 0.91219282, + "learning_rate": 0.0009714471373025202, + "loss": 0.92285609, + "num_input_tokens_seen": 58533152, + "router_z_loss_mlp": 0.58398438, + "step": 701, + "time_per_iteration": 3.4654104709625244 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064657, + "balance_loss_mlp": 1.0059582, + "epoch": 0.13505194305502116, + "flos": 488812414464.0, + "grad_norm": 0.038284394577449095, + "language_loss": 0.90020943, + "learning_rate": 0.0009713432734923386, + "loss": 0.91085601, + "num_input_tokens_seen": 58601376, + "router_z_loss_mlp": 0.58544922, + "step": 702, + "time_per_iteration": 2.6416144371032715 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067313, + "balance_loss_mlp": 1.00842357, + "epoch": 0.13524432474028472, + "flos": 614520562944.0, + "grad_norm": 0.03635122731697363, + "language_loss": 0.87970936, + "learning_rate": 0.0009712392266904696, + "loss": 0.89038247, + "num_input_tokens_seen": 58676608, + "router_z_loss_mlp": 0.58740234, + "step": 703, + "time_per_iteration": 2.73490309715271 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066615, + "balance_loss_mlp": 1.00782144, + "epoch": 0.13543670642554828, + "flos": 906275838720.0, + "grad_norm": 0.040994558071305906, + "language_loss": 0.86788869, + "learning_rate": 0.0009711349969373076, + "loss": 0.87855482, + "num_input_tokens_seen": 58759264, + "router_z_loss_mlp": 0.58642578, + "step": 704, + "time_per_iteration": 3.1667368412017822 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066356, + "balance_loss_mlp": 1.00765777, + "epoch": 0.13562908811081184, + "flos": 551748141312.0, + "grad_norm": 0.040707128775991024, + "language_loss": 0.81448901, + "learning_rate": 0.0009710305842733178, + "loss": 0.82515258, + "num_input_tokens_seen": 58834800, + "router_z_loss_mlp": 0.58544922, + "step": 705, + "time_per_iteration": 2.7456798553466797 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064755, + "balance_loss_mlp": 1.00648558, + "epoch": 0.1358214697960754, + "flos": 509038284288.0, + "grad_norm": 0.04235852839756889, + "language_loss": 0.91048527, + "learning_rate": 0.0009709259887390373, + "loss": 0.9211328, + "num_input_tokens_seen": 58901712, + "router_z_loss_mlp": 0.58105469, + "step": 706, + "time_per_iteration": 2.614645481109619 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067613, + "balance_loss_mlp": 1.0098201, + "epoch": 0.136013851481339, + "flos": 529924189440.0, + "grad_norm": 0.045207837368539144, + "language_loss": 0.92539275, + "learning_rate": 0.0009708212103750737, + "loss": 0.93606889, + "num_input_tokens_seen": 58967824, + "router_z_loss_mlp": 0.57617188, + "step": 707, + "time_per_iteration": 2.5839250087738037 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073243, + "balance_loss_mlp": 1.01525927, + "epoch": 0.13620623316660255, + "flos": 660321604608.0, + "grad_norm": 0.04139663244511697, + "language_loss": 0.88690269, + "learning_rate": 0.0009707162492221051, + "loss": 0.8976351, + "num_input_tokens_seen": 59045040, + "router_z_loss_mlp": 0.578125, + "step": 708, + "time_per_iteration": 2.8753738403320312 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106855, + "balance_loss_mlp": 1.01051939, + "epoch": 0.1363986148518661, + "flos": 673083918336.0, + "grad_norm": 0.04870142688483653, + "language_loss": 0.89226341, + "learning_rate": 0.0009706111053208815, + "loss": 0.90294898, + "num_input_tokens_seen": 59117216, + "router_z_loss_mlp": 0.57861328, + "step": 709, + "time_per_iteration": 2.792555570602417 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065256, + "balance_loss_mlp": 1.0069865, + "epoch": 0.13659099653712967, + "flos": 474004866048.0, + "grad_norm": 0.041589756065930725, + "language_loss": 0.87875092, + "learning_rate": 0.0009705057787122232, + "loss": 0.88940346, + "num_input_tokens_seen": 59183056, + "router_z_loss_mlp": 0.58105469, + "step": 710, + "time_per_iteration": 2.5474488735198975 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106841, + "balance_loss_mlp": 1.00980711, + "epoch": 0.13678337822239323, + "flos": 453648738816.0, + "grad_norm": 0.03947638411835938, + "language_loss": 0.92397159, + "learning_rate": 0.0009704002694370216, + "loss": 0.93465567, + "num_input_tokens_seen": 59247312, + "router_z_loss_mlp": 0.58447266, + "step": 711, + "time_per_iteration": 2.5812153816223145 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107162, + "balance_loss_mlp": 1.01306474, + "epoch": 0.13697575990765679, + "flos": 520626336000.0, + "grad_norm": 0.04103000756090051, + "language_loss": 0.88202429, + "learning_rate": 0.0009702945775362388, + "loss": 0.89274049, + "num_input_tokens_seen": 59317968, + "router_z_loss_mlp": 0.58398438, + "step": 712, + "time_per_iteration": 2.6084940433502197 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067699, + "balance_loss_mlp": 1.00914371, + "epoch": 0.13716814159292035, + "flos": 481366354944.0, + "grad_norm": 0.04017855754763819, + "language_loss": 0.88458985, + "learning_rate": 0.0009701887030509086, + "loss": 0.89526689, + "num_input_tokens_seen": 59387936, + "router_z_loss_mlp": 0.58398438, + "step": 713, + "time_per_iteration": 2.6361663341522217 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072477, + "balance_loss_mlp": 1.01425505, + "epoch": 0.1373605232781839, + "flos": 546750727680.0, + "grad_norm": 0.04169009137316196, + "language_loss": 0.92536753, + "learning_rate": 0.0009700826460221346, + "loss": 0.93609238, + "num_input_tokens_seen": 59460624, + "router_z_loss_mlp": 0.58056641, + "step": 714, + "time_per_iteration": 2.6997907161712646 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068882, + "balance_loss_mlp": 1.01080275, + "epoch": 0.1375529049634475, + "flos": 710071197696.0, + "grad_norm": 0.042053375460334, + "language_loss": 0.94210052, + "learning_rate": 0.0009699764064910921, + "loss": 0.95278937, + "num_input_tokens_seen": 59536752, + "router_z_loss_mlp": 0.57910156, + "step": 715, + "time_per_iteration": 2.870835542678833 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069166, + "balance_loss_mlp": 1.01099169, + "epoch": 0.13774528664871105, + "flos": 487677036288.0, + "grad_norm": 0.04018028408764831, + "language_loss": 0.88572168, + "learning_rate": 0.0009698699844990268, + "loss": 0.89641333, + "num_input_tokens_seen": 59608128, + "router_z_loss_mlp": 0.58007812, + "step": 716, + "time_per_iteration": 2.6557233333587646 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106644, + "balance_loss_mlp": 1.00817037, + "epoch": 0.1379376683339746, + "flos": 681459276288.0, + "grad_norm": 0.03631196674856893, + "language_loss": 0.89737439, + "learning_rate": 0.0009697633800872555, + "loss": 0.90803885, + "num_input_tokens_seen": 59685120, + "router_z_loss_mlp": 0.58105469, + "step": 717, + "time_per_iteration": 2.9236202239990234 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068394, + "balance_loss_mlp": 1.00998127, + "epoch": 0.13813005001923817, + "flos": 612226473984.0, + "grad_norm": 0.040527486313319094, + "language_loss": 0.9214747, + "learning_rate": 0.0009696565932971655, + "loss": 0.93215865, + "num_input_tokens_seen": 59763376, + "router_z_loss_mlp": 0.58251953, + "step": 718, + "time_per_iteration": 2.8931636810302734 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072264, + "balance_loss_mlp": 1.01394677, + "epoch": 0.13832243170450173, + "flos": 589927179264.0, + "grad_norm": 0.042228364331249636, + "language_loss": 0.91184157, + "learning_rate": 0.0009695496241702153, + "loss": 0.92256421, + "num_input_tokens_seen": 59836800, + "router_z_loss_mlp": 0.58154297, + "step": 719, + "time_per_iteration": 2.8006720542907715 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010673, + "balance_loss_mlp": 1.00917327, + "epoch": 0.1385148133897653, + "flos": 701320618752.0, + "grad_norm": 0.04012183054192491, + "language_loss": 0.87174737, + "learning_rate": 0.0009694424727479339, + "loss": 0.88242036, + "num_input_tokens_seen": 59914720, + "router_z_loss_mlp": 0.57958984, + "step": 720, + "time_per_iteration": 2.9363977909088135 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066459, + "balance_loss_mlp": 1.0081414, + "epoch": 0.13870719507502885, + "flos": 599367929088.0, + "grad_norm": 0.04032336097495746, + "language_loss": 0.90803999, + "learning_rate": 0.0009693351390719213, + "loss": 0.91870457, + "num_input_tokens_seen": 59984544, + "router_z_loss_mlp": 0.58154297, + "step": 721, + "time_per_iteration": 2.7786271572113037 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070816, + "balance_loss_mlp": 1.01273775, + "epoch": 0.1388995767602924, + "flos": 587749741824.0, + "grad_norm": 0.04179929290372652, + "language_loss": 0.92465305, + "learning_rate": 0.000969227623183848, + "loss": 0.93536115, + "num_input_tokens_seen": 60057056, + "router_z_loss_mlp": 0.57910156, + "step": 722, + "time_per_iteration": 2.777453660964966 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066594, + "balance_loss_mlp": 1.00870621, + "epoch": 0.139091958445556, + "flos": 652363263744.0, + "grad_norm": 0.041578114374578125, + "language_loss": 0.92603219, + "learning_rate": 0.0009691199251254554, + "loss": 0.9366982, + "num_input_tokens_seen": 60133232, + "router_z_loss_mlp": 0.57714844, + "step": 723, + "time_per_iteration": 2.813610553741455 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063653, + "balance_loss_mlp": 1.00586045, + "epoch": 0.13928434013081956, + "flos": 576906296064.0, + "grad_norm": 0.03663552971403626, + "language_loss": 0.88541949, + "learning_rate": 0.0009690120449385555, + "loss": 0.89605606, + "num_input_tokens_seen": 60207104, + "router_z_loss_mlp": 0.57617188, + "step": 724, + "time_per_iteration": 2.7604424953460693 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063903, + "balance_loss_mlp": 1.00582433, + "epoch": 0.13947672181608312, + "flos": 564315068928.0, + "grad_norm": 0.034271197388489986, + "language_loss": 0.93926299, + "learning_rate": 0.0009689039826650312, + "loss": 0.94990206, + "num_input_tokens_seen": 60277920, + "router_z_loss_mlp": 0.57910156, + "step": 725, + "time_per_iteration": 2.7856695652008057 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095871, + "balance_loss_mlp": 1.03941345, + "epoch": 0.13966910350134668, + "flos": 1524951988224.0, + "grad_norm": 0.03128450212810151, + "language_loss": 0.76523066, + "learning_rate": 0.000968795738346836, + "loss": 0.77618933, + "num_input_tokens_seen": 60494224, + "router_z_loss_mlp": 0.56640625, + "step": 726, + "time_per_iteration": 4.903306245803833 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067061, + "balance_loss_mlp": 1.00902975, + "epoch": 0.13986148518661023, + "flos": 500856367104.0, + "grad_norm": 0.052764167671210026, + "language_loss": 0.89172196, + "learning_rate": 0.0009686873120259941, + "loss": 0.90239263, + "num_input_tokens_seen": 60562176, + "router_z_loss_mlp": 0.57861328, + "step": 727, + "time_per_iteration": 2.6450552940368652 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072692, + "balance_loss_mlp": 1.01518559, + "epoch": 0.1400538668718738, + "flos": 599850074880.0, + "grad_norm": 0.036488800736072635, + "language_loss": 0.88047451, + "learning_rate": 0.0009685787037446004, + "loss": 0.89120144, + "num_input_tokens_seen": 60631472, + "router_z_loss_mlp": 0.57324219, + "step": 728, + "time_per_iteration": 2.763434648513794 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072512, + "balance_loss_mlp": 1.01481462, + "epoch": 0.14024624855713735, + "flos": 595169556480.0, + "grad_norm": 0.047561697925478, + "language_loss": 0.88858587, + "learning_rate": 0.0009684699135448201, + "loss": 0.89931101, + "num_input_tokens_seen": 60703488, + "router_z_loss_mlp": 0.57519531, + "step": 729, + "time_per_iteration": 2.745037078857422 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067277, + "balance_loss_mlp": 1.00962722, + "epoch": 0.1404386302424009, + "flos": 507586010880.0, + "grad_norm": 0.03094406590189725, + "language_loss": 0.9291476, + "learning_rate": 0.0009683609414688895, + "loss": 0.93982029, + "num_input_tokens_seen": 60773936, + "router_z_loss_mlp": 0.57470703, + "step": 730, + "time_per_iteration": 2.7384650707244873 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068471, + "balance_loss_mlp": 1.01039195, + "epoch": 0.14063101192766447, + "flos": 574515975936.0, + "grad_norm": 0.037780385553924656, + "language_loss": 0.87345785, + "learning_rate": 0.0009682517875591154, + "loss": 0.88414258, + "num_input_tokens_seen": 60851120, + "router_z_loss_mlp": 0.57910156, + "step": 731, + "time_per_iteration": 2.752572536468506 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071345, + "balance_loss_mlp": 1.0129801, + "epoch": 0.14082339361292806, + "flos": 565765396992.0, + "grad_norm": 0.03832964150159033, + "language_loss": 0.87666118, + "learning_rate": 0.0009681424518578749, + "loss": 0.88737464, + "num_input_tokens_seen": 60924896, + "router_z_loss_mlp": 0.58203125, + "step": 732, + "time_per_iteration": 2.7323830127716064 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068462, + "balance_loss_mlp": 1.01028764, + "epoch": 0.14101577529819162, + "flos": 464583558144.0, + "grad_norm": 0.035957988569031644, + "language_loss": 0.88670099, + "learning_rate": 0.000968032934407616, + "loss": 0.8973856, + "num_input_tokens_seen": 60996016, + "router_z_loss_mlp": 0.58007812, + "step": 733, + "time_per_iteration": 2.6479005813598633 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064525, + "balance_loss_mlp": 1.00644577, + "epoch": 0.14120815698345518, + "flos": 597262423296.0, + "grad_norm": 0.039547782577588224, + "language_loss": 0.82413781, + "learning_rate": 0.0009679232352508571, + "loss": 0.83478296, + "num_input_tokens_seen": 61072016, + "router_z_loss_mlp": 0.57910156, + "step": 734, + "time_per_iteration": 2.7924795150756836 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063696, + "balance_loss_mlp": 1.00599897, + "epoch": 0.14140053866871874, + "flos": 536232925440.0, + "grad_norm": 0.03854566850595878, + "language_loss": 0.82520735, + "learning_rate": 0.0009678133544301871, + "loss": 0.83584428, + "num_input_tokens_seen": 61144528, + "router_z_loss_mlp": 0.57519531, + "step": 735, + "time_per_iteration": 2.658731698989868 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062396, + "balance_loss_mlp": 1.00498438, + "epoch": 0.1415929203539823, + "flos": 521277623040.0, + "grad_norm": 0.0297517777524564, + "language_loss": 0.92917788, + "learning_rate": 0.0009677032919882658, + "loss": 0.93980187, + "num_input_tokens_seen": 61216960, + "router_z_loss_mlp": 0.57226562, + "step": 736, + "time_per_iteration": 2.661276340484619 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068415, + "balance_loss_mlp": 1.0113374, + "epoch": 0.14178530203924586, + "flos": 483302719488.0, + "grad_norm": 0.041037110936195734, + "language_loss": 0.92867804, + "learning_rate": 0.000967593047967823, + "loss": 0.93936217, + "num_input_tokens_seen": 61281312, + "router_z_loss_mlp": 0.56982422, + "step": 737, + "time_per_iteration": 2.52840256690979 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068115, + "balance_loss_mlp": 1.01056099, + "epoch": 0.14197768372450942, + "flos": 677840259072.0, + "grad_norm": 0.04254557939420697, + "language_loss": 0.88126308, + "learning_rate": 0.0009674826224116593, + "loss": 0.89194429, + "num_input_tokens_seen": 61355888, + "router_z_loss_mlp": 0.57373047, + "step": 738, + "time_per_iteration": 2.858147144317627 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074304, + "balance_loss_mlp": 1.0167979, + "epoch": 0.14217006540977298, + "flos": 446992972032.0, + "grad_norm": 0.045930563119643074, + "language_loss": 0.87994051, + "learning_rate": 0.0009673720153626455, + "loss": 0.89068353, + "num_input_tokens_seen": 61424288, + "router_z_loss_mlp": 0.57324219, + "step": 739, + "time_per_iteration": 2.664236545562744 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069407, + "balance_loss_mlp": 1.01199603, + "epoch": 0.14236244709503657, + "flos": 497478422784.0, + "grad_norm": 0.040566684483093814, + "language_loss": 0.88105047, + "learning_rate": 0.0009672612268637235, + "loss": 0.89174449, + "num_input_tokens_seen": 61493344, + "router_z_loss_mlp": 0.57226562, + "step": 740, + "time_per_iteration": 2.634126901626587 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069933, + "balance_loss_mlp": 1.01304626, + "epoch": 0.14255482878030012, + "flos": 649480104192.0, + "grad_norm": 0.05086050125917657, + "language_loss": 0.85906518, + "learning_rate": 0.0009671502569579048, + "loss": 0.86976457, + "num_input_tokens_seen": 61565216, + "router_z_loss_mlp": 0.56884766, + "step": 741, + "time_per_iteration": 2.7642107009887695 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071964, + "balance_loss_mlp": 1.01564944, + "epoch": 0.14274721046556368, + "flos": 537274984704.0, + "grad_norm": 0.037356444744632025, + "language_loss": 0.90824854, + "learning_rate": 0.0009670391056882719, + "loss": 0.91896814, + "num_input_tokens_seen": 61640928, + "router_z_loss_mlp": 0.56445312, + "step": 742, + "time_per_iteration": 2.7307372093200684 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069292, + "balance_loss_mlp": 1.01288199, + "epoch": 0.14293959215082724, + "flos": 958584893184.0, + "grad_norm": 0.03744948002603285, + "language_loss": 0.89976203, + "learning_rate": 0.0009669277730979776, + "loss": 0.91045499, + "num_input_tokens_seen": 61717552, + "router_z_loss_mlp": 0.56494141, + "step": 743, + "time_per_iteration": 3.2251601219177246 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068321, + "balance_loss_mlp": 1.01162553, + "epoch": 0.1431319738360908, + "flos": 694386840576.0, + "grad_norm": 0.037398516399228816, + "language_loss": 0.86562485, + "learning_rate": 0.0009668162592302449, + "loss": 0.87630802, + "num_input_tokens_seen": 61800016, + "router_z_loss_mlp": 0.56738281, + "step": 744, + "time_per_iteration": 2.924435615539551 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067141, + "balance_loss_mlp": 1.01015854, + "epoch": 0.14332435552135436, + "flos": 566503200000.0, + "grad_norm": 0.037819132294000864, + "language_loss": 0.86981773, + "learning_rate": 0.0009667045641283676, + "loss": 0.88048917, + "num_input_tokens_seen": 61865904, + "router_z_loss_mlp": 0.56933594, + "step": 745, + "time_per_iteration": 2.6744887828826904 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071811, + "balance_loss_mlp": 1.01540148, + "epoch": 0.14351673720661792, + "flos": 739696988160.0, + "grad_norm": 0.042480690817339954, + "language_loss": 0.96115947, + "learning_rate": 0.0009665926878357092, + "loss": 0.97187757, + "num_input_tokens_seen": 61945728, + "router_z_loss_mlp": 0.56591797, + "step": 746, + "time_per_iteration": 2.9137520790100098 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069908, + "balance_loss_mlp": 1.0134027, + "epoch": 0.14370911889188148, + "flos": 550352248320.0, + "grad_norm": 0.037361960218361134, + "language_loss": 0.92219329, + "learning_rate": 0.0009664806303957043, + "loss": 0.93289238, + "num_input_tokens_seen": 62016288, + "router_z_loss_mlp": 0.56542969, + "step": 747, + "time_per_iteration": 2.7734382152557373 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010709, + "balance_loss_mlp": 1.01453757, + "epoch": 0.14390150057714507, + "flos": 591590390016.0, + "grad_norm": 0.040803275102161134, + "language_loss": 0.88578373, + "learning_rate": 0.0009663683918518571, + "loss": 0.89649272, + "num_input_tokens_seen": 62097904, + "router_z_loss_mlp": 0.56542969, + "step": 748, + "time_per_iteration": 2.93782114982605 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106688, + "balance_loss_mlp": 1.0103749, + "epoch": 0.14409388226240863, + "flos": 592145445888.0, + "grad_norm": 0.040391516566669984, + "language_loss": 0.87085271, + "learning_rate": 0.0009662559722477428, + "loss": 0.88152146, + "num_input_tokens_seen": 62166736, + "router_z_loss_mlp": 0.56640625, + "step": 749, + "time_per_iteration": 2.696570873260498 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140549, + "balance_loss_mlp": 1.08542633, + "epoch": 0.1442862639476722, + "flos": 1514657762304.0, + "grad_norm": 0.043557664449290004, + "language_loss": 0.7616297, + "learning_rate": 0.0009661433716270062, + "loss": 0.77303517, + "num_input_tokens_seen": 62402512, + "router_z_loss_mlp": 0.55273438, + "step": 750, + "time_per_iteration": 5.024984836578369 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106928, + "balance_loss_mlp": 1.01263177, + "epoch": 0.14447864563293575, + "flos": 497856556032.0, + "grad_norm": 0.03544029116038115, + "language_loss": 0.90697813, + "learning_rate": 0.0009660305900333632, + "loss": 0.91767091, + "num_input_tokens_seen": 62473408, + "router_z_loss_mlp": 0.56738281, + "step": 751, + "time_per_iteration": 2.678037166595459 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078207, + "balance_loss_mlp": 1.02165437, + "epoch": 0.1446710273181993, + "flos": 590795239680.0, + "grad_norm": 0.04141635113788076, + "language_loss": 0.83649188, + "learning_rate": 0.0009659176275105992, + "loss": 0.84727395, + "num_input_tokens_seen": 62547440, + "router_z_loss_mlp": 0.56640625, + "step": 752, + "time_per_iteration": 2.714871883392334 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076963, + "balance_loss_mlp": 1.02074409, + "epoch": 0.14486340900346287, + "flos": 587013884160.0, + "grad_norm": 0.03637909883196532, + "language_loss": 0.87195009, + "learning_rate": 0.0009658044841025701, + "loss": 0.88271976, + "num_input_tokens_seen": 62620224, + "router_z_loss_mlp": 0.56396484, + "step": 753, + "time_per_iteration": 2.7753467559814453 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075522, + "balance_loss_mlp": 1.01901722, + "epoch": 0.14505579068872643, + "flos": 505741019904.0, + "grad_norm": 0.041255413340114844, + "language_loss": 0.82866222, + "learning_rate": 0.0009656911598532021, + "loss": 0.83941746, + "num_input_tokens_seen": 62690464, + "router_z_loss_mlp": 0.56591797, + "step": 754, + "time_per_iteration": 2.657831907272339 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077419, + "balance_loss_mlp": 1.02119958, + "epoch": 0.14524817237399, + "flos": 487816041984.0, + "grad_norm": 0.03637506550278126, + "language_loss": 0.9138847, + "learning_rate": 0.0009655776548064917, + "loss": 0.92465889, + "num_input_tokens_seen": 62762240, + "router_z_loss_mlp": 0.56347656, + "step": 755, + "time_per_iteration": 2.6499805450439453 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070806, + "balance_loss_mlp": 1.01477778, + "epoch": 0.14544055405925355, + "flos": 729450394368.0, + "grad_norm": 0.037726189244012505, + "language_loss": 0.89799821, + "learning_rate": 0.0009654639690065054, + "loss": 0.90870631, + "num_input_tokens_seen": 62839760, + "router_z_loss_mlp": 0.56201172, + "step": 756, + "time_per_iteration": 2.913638114929199 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070686, + "balance_loss_mlp": 1.01461017, + "epoch": 0.14563293574451713, + "flos": 594787532544.0, + "grad_norm": 0.03772784195488967, + "language_loss": 0.8914414, + "learning_rate": 0.00096535010249738, + "loss": 0.90214825, + "num_input_tokens_seen": 62910336, + "router_z_loss_mlp": 0.5625, + "step": 757, + "time_per_iteration": 2.721640110015869 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067675, + "balance_loss_mlp": 1.01121712, + "epoch": 0.1458253174297807, + "flos": 561623404800.0, + "grad_norm": 0.04410713855467511, + "language_loss": 0.84106696, + "learning_rate": 0.0009652360553233224, + "loss": 0.8517437, + "num_input_tokens_seen": 62988160, + "router_z_loss_mlp": 0.56591797, + "step": 758, + "time_per_iteration": 2.771986484527588 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080421, + "balance_loss_mlp": 1.02625275, + "epoch": 0.14601769911504425, + "flos": 1561189804032.0, + "grad_norm": 0.021986445825835567, + "language_loss": 0.73773748, + "learning_rate": 0.0009651218275286093, + "loss": 0.74854165, + "num_input_tokens_seen": 63224704, + "router_z_loss_mlp": 0.54296875, + "step": 759, + "time_per_iteration": 4.951657056808472 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064246, + "balance_loss_mlp": 1.00712132, + "epoch": 0.1462100808003078, + "flos": 867823646976.0, + "grad_norm": 0.03532102179266325, + "language_loss": 0.82350075, + "learning_rate": 0.0009650074191575883, + "loss": 0.83414322, + "num_input_tokens_seen": 63312400, + "router_z_loss_mlp": 0.56982422, + "step": 760, + "time_per_iteration": 3.2275402545928955 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078155, + "balance_loss_mlp": 1.02083874, + "epoch": 0.14640246248557137, + "flos": 524030525184.0, + "grad_norm": 0.0394901057776484, + "language_loss": 0.87295806, + "learning_rate": 0.0009648928302546766, + "loss": 0.88373965, + "num_input_tokens_seen": 63387792, + "router_z_loss_mlp": 0.57177734, + "step": 761, + "time_per_iteration": 2.6739044189453125 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108061, + "balance_loss_mlp": 1.02319896, + "epoch": 0.14659484417083493, + "flos": 1032242556672.0, + "grad_norm": 0.0381114836464334, + "language_loss": 0.86423808, + "learning_rate": 0.0009647780608643613, + "loss": 0.87504417, + "num_input_tokens_seen": 63475632, + "router_z_loss_mlp": 0.57226562, + "step": 762, + "time_per_iteration": 3.355055332183838 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084881, + "balance_loss_mlp": 1.02742219, + "epoch": 0.1467872258560985, + "flos": 501657353472.0, + "grad_norm": 0.04884269069306727, + "language_loss": 0.89483184, + "learning_rate": 0.0009646631110312001, + "loss": 0.90568066, + "num_input_tokens_seen": 63546080, + "router_z_loss_mlp": 0.57275391, + "step": 763, + "time_per_iteration": 2.638404607772827 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074108, + "balance_loss_mlp": 1.01683939, + "epoch": 0.14697960754136205, + "flos": 548936913408.0, + "grad_norm": 0.030517371118051684, + "language_loss": 0.89587164, + "learning_rate": 0.0009645479807998203, + "loss": 0.90661263, + "num_input_tokens_seen": 63622464, + "router_z_loss_mlp": 0.57128906, + "step": 764, + "time_per_iteration": 2.7784340381622314 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066881, + "balance_loss_mlp": 1.0099467, + "epoch": 0.14717198922662564, + "flos": 518902854144.0, + "grad_norm": 0.03321738346858149, + "language_loss": 0.93693149, + "learning_rate": 0.0009644326702149196, + "loss": 0.94760031, + "num_input_tokens_seen": 63694736, + "router_z_loss_mlp": 0.56884766, + "step": 765, + "time_per_iteration": 2.712148904800415 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066407, + "balance_loss_mlp": 1.009902, + "epoch": 0.1473643709118892, + "flos": 733484483328.0, + "grad_norm": 0.042813367444357694, + "language_loss": 0.86227441, + "learning_rate": 0.0009643171793212653, + "loss": 0.87293845, + "num_input_tokens_seen": 63779072, + "router_z_loss_mlp": 0.56591797, + "step": 766, + "time_per_iteration": 3.0350003242492676 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069694, + "balance_loss_mlp": 1.01357007, + "epoch": 0.14755675259715276, + "flos": 621669169152.0, + "grad_norm": 0.04397904632105779, + "language_loss": 0.90884185, + "learning_rate": 0.0009642015081636952, + "loss": 0.91953874, + "num_input_tokens_seen": 63847472, + "router_z_loss_mlp": 0.56298828, + "step": 767, + "time_per_iteration": 2.6967811584472656 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067551, + "balance_loss_mlp": 1.01185656, + "epoch": 0.14774913428241632, + "flos": 453173395968.0, + "grad_norm": 0.040409537343205924, + "language_loss": 0.89756525, + "learning_rate": 0.0009640856567871166, + "loss": 0.90824074, + "num_input_tokens_seen": 63912496, + "router_z_loss_mlp": 0.55859375, + "step": 768, + "time_per_iteration": 2.5016207695007324 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063728, + "balance_loss_mlp": 1.00803363, + "epoch": 0.14794151596767988, + "flos": 838655702784.0, + "grad_norm": 0.03518214363191685, + "language_loss": 0.90024096, + "learning_rate": 0.0009639696252365072, + "loss": 0.91087824, + "num_input_tokens_seen": 63990832, + "router_z_loss_mlp": 0.55859375, + "step": 769, + "time_per_iteration": 3.0535316467285156 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064247, + "balance_loss_mlp": 1.00874364, + "epoch": 0.14813389765294344, + "flos": 687405430272.0, + "grad_norm": 0.03578436651039587, + "language_loss": 0.83073497, + "learning_rate": 0.0009638534135569144, + "loss": 0.8413775, + "num_input_tokens_seen": 64067552, + "router_z_loss_mlp": 0.55664062, + "step": 770, + "time_per_iteration": 2.8983683586120605 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065854, + "balance_loss_mlp": 1.01039767, + "epoch": 0.148326279338207, + "flos": 510944513280.0, + "grad_norm": 0.03931230706380594, + "language_loss": 0.91550887, + "learning_rate": 0.0009637370217934554, + "loss": 0.92616743, + "num_input_tokens_seen": 64140336, + "router_z_loss_mlp": 0.55615234, + "step": 771, + "time_per_iteration": 2.6311967372894287 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061556, + "balance_loss_mlp": 1.00590932, + "epoch": 0.14851866102347056, + "flos": 589332272640.0, + "grad_norm": 0.03214719611667013, + "language_loss": 0.8436957, + "learning_rate": 0.0009636204499913175, + "loss": 0.85431123, + "num_input_tokens_seen": 64223472, + "router_z_loss_mlp": 0.55810547, + "step": 772, + "time_per_iteration": 2.8748695850372314 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066056, + "balance_loss_mlp": 1.01069546, + "epoch": 0.14871104270873411, + "flos": 692248286976.0, + "grad_norm": 0.034034874980260935, + "language_loss": 0.89455193, + "learning_rate": 0.0009635036981957581, + "loss": 0.9052124, + "num_input_tokens_seen": 64299872, + "router_z_loss_mlp": 0.55517578, + "step": 773, + "time_per_iteration": 2.8526012897491455 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063541, + "balance_loss_mlp": 1.00789392, + "epoch": 0.1489034243939977, + "flos": 656283624960.0, + "grad_norm": 0.03841304714783139, + "language_loss": 0.91971016, + "learning_rate": 0.0009633867664521043, + "loss": 0.93034559, + "num_input_tokens_seen": 64377152, + "router_z_loss_mlp": 0.55810547, + "step": 774, + "time_per_iteration": 2.823320150375366 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063492, + "balance_loss_mlp": 1.00736797, + "epoch": 0.14909580607926126, + "flos": 476796652032.0, + "grad_norm": 0.0404919947218097, + "language_loss": 0.88328946, + "learning_rate": 0.0009632696548057527, + "loss": 0.89392436, + "num_input_tokens_seen": 64443008, + "router_z_loss_mlp": 0.56298828, + "step": 775, + "time_per_iteration": 2.5567190647125244 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072265, + "balance_loss_mlp": 1.01609385, + "epoch": 0.14928818776452482, + "flos": 612284799744.0, + "grad_norm": 0.03821441574416946, + "language_loss": 0.86270714, + "learning_rate": 0.0009631523633021704, + "loss": 0.87342978, + "num_input_tokens_seen": 64519776, + "router_z_loss_mlp": 0.56347656, + "step": 776, + "time_per_iteration": 2.783348321914673 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068042, + "balance_loss_mlp": 1.01187015, + "epoch": 0.14948056944978838, + "flos": 562917230592.0, + "grad_norm": 0.039790220133906304, + "language_loss": 0.90072912, + "learning_rate": 0.0009630348919868936, + "loss": 0.9114095, + "num_input_tokens_seen": 64593712, + "router_z_loss_mlp": 0.56347656, + "step": 777, + "time_per_iteration": 2.7115018367767334 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073438, + "balance_loss_mlp": 1.01736236, + "epoch": 0.14967295113505194, + "flos": 450112346880.0, + "grad_norm": 0.044777999480791836, + "language_loss": 0.82363755, + "learning_rate": 0.0009629172409055293, + "loss": 0.83437192, + "num_input_tokens_seen": 64658448, + "router_z_loss_mlp": 0.5625, + "step": 778, + "time_per_iteration": 2.578178882598877 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079079, + "balance_loss_mlp": 1.02319324, + "epoch": 0.1498653328203155, + "flos": 572429912064.0, + "grad_norm": 0.03699200582710457, + "language_loss": 0.8876617, + "learning_rate": 0.0009627994101037531, + "loss": 0.89845246, + "num_input_tokens_seen": 64734144, + "router_z_loss_mlp": 0.56054688, + "step": 779, + "time_per_iteration": 2.7733986377716064 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107155, + "balance_loss_mlp": 1.01556909, + "epoch": 0.15005771450557906, + "flos": 632408602368.0, + "grad_norm": 0.04036301028093645, + "language_loss": 0.90477651, + "learning_rate": 0.0009626813996273114, + "loss": 0.91549194, + "num_input_tokens_seen": 64813456, + "router_z_loss_mlp": 0.56152344, + "step": 780, + "time_per_iteration": 2.8476834297180176 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064638, + "balance_loss_mlp": 1.00884771, + "epoch": 0.15025009619084262, + "flos": 579166358784.0, + "grad_norm": 0.036574622666600026, + "language_loss": 0.89819682, + "learning_rate": 0.0009625632095220198, + "loss": 0.90884316, + "num_input_tokens_seen": 64896816, + "router_z_loss_mlp": 0.55957031, + "step": 781, + "time_per_iteration": 2.8279531002044678 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065406, + "balance_loss_mlp": 1.00961614, + "epoch": 0.1504424778761062, + "flos": 484857060096.0, + "grad_norm": 0.04416373966784989, + "language_loss": 0.8858574, + "learning_rate": 0.0009624448398337637, + "loss": 0.89651144, + "num_input_tokens_seen": 64964176, + "router_z_loss_mlp": 0.55957031, + "step": 782, + "time_per_iteration": 2.512742280960083 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062842, + "balance_loss_mlp": 1.0075767, + "epoch": 0.15063485956136977, + "flos": 763895708928.0, + "grad_norm": 0.03630111779859241, + "language_loss": 0.90811443, + "learning_rate": 0.0009623262906084984, + "loss": 0.9187429, + "num_input_tokens_seen": 65042592, + "router_z_loss_mlp": 0.55419922, + "step": 783, + "time_per_iteration": 3.0409936904907227 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066453, + "balance_loss_mlp": 1.01156867, + "epoch": 0.15082724124663333, + "flos": 498676984320.0, + "grad_norm": 0.03758683048429116, + "language_loss": 0.91324949, + "learning_rate": 0.0009622075618922486, + "loss": 0.92391407, + "num_input_tokens_seen": 65114576, + "router_z_loss_mlp": 0.55029297, + "step": 784, + "time_per_iteration": 2.716580629348755 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066623, + "balance_loss_mlp": 1.01188219, + "epoch": 0.15101962293189689, + "flos": 510722882304.0, + "grad_norm": 0.0361748672236624, + "language_loss": 0.88713133, + "learning_rate": 0.0009620886537311091, + "loss": 0.89779752, + "num_input_tokens_seen": 65186640, + "router_z_loss_mlp": 0.54882812, + "step": 785, + "time_per_iteration": 2.7197515964508057 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065435, + "balance_loss_mlp": 1.01069367, + "epoch": 0.15121200461716044, + "flos": 458702532864.0, + "grad_norm": 0.0476660620131034, + "language_loss": 0.86751854, + "learning_rate": 0.000961969566171244, + "loss": 0.87817287, + "num_input_tokens_seen": 65252112, + "router_z_loss_mlp": 0.54882812, + "step": 786, + "time_per_iteration": 2.519826650619507 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063397, + "balance_loss_mlp": 1.00865602, + "epoch": 0.151404386302424, + "flos": 539017908480.0, + "grad_norm": 0.0401982478312821, + "language_loss": 0.91594857, + "learning_rate": 0.0009618502992588873, + "loss": 0.92658257, + "num_input_tokens_seen": 65318912, + "router_z_loss_mlp": 0.54882812, + "step": 787, + "time_per_iteration": 2.6427645683288574 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076455, + "balance_loss_mlp": 1.02133262, + "epoch": 0.15159676798768756, + "flos": 689617860864.0, + "grad_norm": 0.04258050045209434, + "language_loss": 0.8916502, + "learning_rate": 0.0009617308530403424, + "loss": 0.9024148, + "num_input_tokens_seen": 65395424, + "router_z_loss_mlp": 0.55273438, + "step": 788, + "time_per_iteration": 3.0662577152252197 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106461, + "balance_loss_mlp": 1.00958323, + "epoch": 0.15178914967295112, + "flos": 546433832448.0, + "grad_norm": 0.03354297731817266, + "language_loss": 0.88695067, + "learning_rate": 0.0009616112275619825, + "loss": 0.89759684, + "num_input_tokens_seen": 65470480, + "router_z_loss_mlp": 0.55175781, + "step": 789, + "time_per_iteration": 2.7230606079101562 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065845, + "balance_loss_mlp": 1.01081765, + "epoch": 0.1519815313582147, + "flos": 512815749120.0, + "grad_norm": 0.03087624340708216, + "language_loss": 0.85391772, + "learning_rate": 0.0009614914228702503, + "loss": 0.86457616, + "num_input_tokens_seen": 65544720, + "router_z_loss_mlp": 0.55175781, + "step": 790, + "time_per_iteration": 2.6690316200256348 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075082, + "balance_loss_mlp": 1.02024603, + "epoch": 0.15217391304347827, + "flos": 685458372096.0, + "grad_norm": 0.03877155611381102, + "language_loss": 0.90952718, + "learning_rate": 0.0009613714390116581, + "loss": 0.92027801, + "num_input_tokens_seen": 65627872, + "router_z_loss_mlp": 0.54980469, + "step": 791, + "time_per_iteration": 3.006898880004883 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069036, + "balance_loss_mlp": 1.01396108, + "epoch": 0.15236629472874183, + "flos": 645446982144.0, + "grad_norm": 0.03750254169389994, + "language_loss": 0.87660968, + "learning_rate": 0.0009612512760327879, + "loss": 0.88730001, + "num_input_tokens_seen": 65705264, + "router_z_loss_mlp": 0.55224609, + "step": 792, + "time_per_iteration": 2.858262062072754 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068429, + "balance_loss_mlp": 1.01297235, + "epoch": 0.1525586764140054, + "flos": 413765660928.0, + "grad_norm": 0.044925092089749936, + "language_loss": 0.86468709, + "learning_rate": 0.0009611309339802909, + "loss": 0.87537134, + "num_input_tokens_seen": 65768592, + "router_z_loss_mlp": 0.55615234, + "step": 793, + "time_per_iteration": 2.498229742050171 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070738, + "balance_loss_mlp": 1.01485312, + "epoch": 0.15275105809926895, + "flos": 804234687744.0, + "grad_norm": 0.03634630877191588, + "language_loss": 0.85518378, + "learning_rate": 0.0009610104129008881, + "loss": 0.8658911, + "num_input_tokens_seen": 65852432, + "router_z_loss_mlp": 0.56054688, + "step": 794, + "time_per_iteration": 3.119896173477173 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064568, + "balance_loss_mlp": 1.0088737, + "epoch": 0.1529434397845325, + "flos": 613543632384.0, + "grad_norm": 0.039196324818253456, + "language_loss": 0.89691782, + "learning_rate": 0.0009608897128413701, + "loss": 0.90756351, + "num_input_tokens_seen": 65927904, + "router_z_loss_mlp": 0.55859375, + "step": 795, + "time_per_iteration": 2.7244484424591064 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065096, + "balance_loss_mlp": 1.00949657, + "epoch": 0.15313582146979607, + "flos": 616472478720.0, + "grad_norm": 0.031652256183926086, + "language_loss": 0.86697376, + "learning_rate": 0.0009607688338485965, + "loss": 0.87762469, + "num_input_tokens_seen": 66006800, + "router_z_loss_mlp": 0.55761719, + "step": 796, + "time_per_iteration": 2.859959363937378 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106534, + "balance_loss_mlp": 1.00997913, + "epoch": 0.15332820315505963, + "flos": 794993214720.0, + "grad_norm": 0.036135713167076366, + "language_loss": 0.91464871, + "learning_rate": 0.0009606477759694969, + "loss": 0.92530215, + "num_input_tokens_seen": 66088608, + "router_z_loss_mlp": 0.55517578, + "step": 797, + "time_per_iteration": 3.0383169651031494 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063804, + "balance_loss_mlp": 1.00806129, + "epoch": 0.1535205848403232, + "flos": 551257247232.0, + "grad_norm": 0.04267360012583918, + "language_loss": 0.89290035, + "learning_rate": 0.0009605265392510703, + "loss": 0.90353841, + "num_input_tokens_seen": 66153616, + "router_z_loss_mlp": 0.55908203, + "step": 798, + "time_per_iteration": 2.642423152923584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063071, + "balance_loss_mlp": 1.00732899, + "epoch": 0.15371296652558677, + "flos": 536979476736.0, + "grad_norm": 0.03662373873498648, + "language_loss": 0.93232477, + "learning_rate": 0.0009604051237403846, + "loss": 0.94295549, + "num_input_tokens_seen": 66219472, + "router_z_loss_mlp": 0.55908203, + "step": 799, + "time_per_iteration": 2.6661648750305176 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062966, + "balance_loss_mlp": 1.00693774, + "epoch": 0.15390534821085033, + "flos": 396090504192.0, + "grad_norm": 0.042222005302764924, + "language_loss": 0.87381375, + "learning_rate": 0.0009602835294845776, + "loss": 0.8844434, + "num_input_tokens_seen": 66281456, + "router_z_loss_mlp": 0.56201172, + "step": 800, + "time_per_iteration": 2.4529898166656494 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060851, + "balance_loss_mlp": 1.00520432, + "epoch": 0.1540977298961139, + "flos": 536886157824.0, + "grad_norm": 0.03888031973735598, + "language_loss": 0.91938102, + "learning_rate": 0.0009601617565308565, + "loss": 0.92998952, + "num_input_tokens_seen": 66348160, + "router_z_loss_mlp": 0.55810547, + "step": 801, + "time_per_iteration": 2.6380698680877686 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064031, + "balance_loss_mlp": 1.0085746, + "epoch": 0.15429011158137745, + "flos": 725091628800.0, + "grad_norm": 0.03523983772327724, + "language_loss": 0.87975162, + "learning_rate": 0.0009600398049264977, + "loss": 0.89039195, + "num_input_tokens_seen": 66430576, + "router_z_loss_mlp": 0.55615234, + "step": 802, + "time_per_iteration": 2.9610986709594727 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064973, + "balance_loss_mlp": 1.00970769, + "epoch": 0.154482493266641, + "flos": 621749849088.0, + "grad_norm": 0.04424510077845192, + "language_loss": 0.93353879, + "learning_rate": 0.0009599176747188469, + "loss": 0.94418848, + "num_input_tokens_seen": 66506480, + "router_z_loss_mlp": 0.55419922, + "step": 803, + "time_per_iteration": 2.883296251296997 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065036, + "balance_loss_mlp": 1.00981843, + "epoch": 0.15467487495190457, + "flos": 526720243968.0, + "grad_norm": 0.03833070581853241, + "language_loss": 0.84471631, + "learning_rate": 0.0009597953659553196, + "loss": 0.85536671, + "num_input_tokens_seen": 66577680, + "router_z_loss_mlp": 0.55371094, + "step": 804, + "time_per_iteration": 2.7128705978393555 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062393, + "balance_loss_mlp": 1.00712788, + "epoch": 0.15486725663716813, + "flos": 528760621056.0, + "grad_norm": 0.03896986919959599, + "language_loss": 0.90159577, + "learning_rate": 0.0009596728786833997, + "loss": 0.9122197, + "num_input_tokens_seen": 66648496, + "router_z_loss_mlp": 0.55419922, + "step": 805, + "time_per_iteration": 2.605398178100586 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062158, + "balance_loss_mlp": 1.00684452, + "epoch": 0.1550596383224317, + "flos": 1050280295424.0, + "grad_norm": 0.039312204875199507, + "language_loss": 0.90827858, + "learning_rate": 0.0009595502129506415, + "loss": 0.91890013, + "num_input_tokens_seen": 66735216, + "router_z_loss_mlp": 0.5546875, + "step": 806, + "time_per_iteration": 3.355556011199951 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062439, + "balance_loss_mlp": 1.00736439, + "epoch": 0.15525202000769528, + "flos": 614837458176.0, + "grad_norm": 0.03934214137038287, + "language_loss": 0.83726299, + "learning_rate": 0.0009594273688046678, + "loss": 0.8478874, + "num_input_tokens_seen": 66810672, + "router_z_loss_mlp": 0.55224609, + "step": 807, + "time_per_iteration": 2.765700101852417 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062118, + "balance_loss_mlp": 1.00728118, + "epoch": 0.15544440169295884, + "flos": 534103120128.0, + "grad_norm": 0.042258492962953934, + "language_loss": 0.86714661, + "learning_rate": 0.000959304346293171, + "loss": 0.8777678, + "num_input_tokens_seen": 66879824, + "router_z_loss_mlp": 0.54980469, + "step": 808, + "time_per_iteration": 2.6490986347198486 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064119, + "balance_loss_mlp": 1.00928247, + "epoch": 0.1556367833782224, + "flos": 645887331840.0, + "grad_norm": 0.047675746935091516, + "language_loss": 0.89139616, + "learning_rate": 0.0009591811454639125, + "loss": 0.90203738, + "num_input_tokens_seen": 66949424, + "router_z_loss_mlp": 0.54980469, + "step": 809, + "time_per_iteration": 2.7880568504333496 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059631, + "balance_loss_mlp": 1.00469911, + "epoch": 0.15582916506348596, + "flos": 544953368832.0, + "grad_norm": 0.05205155355433054, + "language_loss": 0.89500809, + "learning_rate": 0.0009590577663647234, + "loss": 0.90560436, + "num_input_tokens_seen": 67024000, + "router_z_loss_mlp": 0.55078125, + "step": 810, + "time_per_iteration": 2.743067741394043 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061907, + "balance_loss_mlp": 1.0068804, + "epoch": 0.15602154674874952, + "flos": 581215484160.0, + "grad_norm": 0.039153260843753375, + "language_loss": 0.87186325, + "learning_rate": 0.0009589342090435036, + "loss": 0.88248235, + "num_input_tokens_seen": 67100672, + "router_z_loss_mlp": 0.55175781, + "step": 811, + "time_per_iteration": 2.806425094604492 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106086, + "balance_loss_mlp": 1.00607169, + "epoch": 0.15621392843401308, + "flos": 536317496064.0, + "grad_norm": 0.04937652455074429, + "language_loss": 0.88453877, + "learning_rate": 0.0009588104735482223, + "loss": 0.89514732, + "num_input_tokens_seen": 67171584, + "router_z_loss_mlp": 0.54931641, + "step": 812, + "time_per_iteration": 2.647728204727173 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060077, + "balance_loss_mlp": 1.00538397, + "epoch": 0.15640631011927664, + "flos": 551982411264.0, + "grad_norm": 0.04402679292728805, + "language_loss": 0.85281312, + "learning_rate": 0.0009586865599269177, + "loss": 0.86341381, + "num_input_tokens_seen": 67240640, + "router_z_loss_mlp": 0.54833984, + "step": 813, + "time_per_iteration": 2.642218828201294 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061354, + "balance_loss_mlp": 1.0069474, + "epoch": 0.1565986918045402, + "flos": 638636658432.0, + "grad_norm": 0.0415768255708782, + "language_loss": 0.89702487, + "learning_rate": 0.0009585624682276977, + "loss": 0.90763843, + "num_input_tokens_seen": 67312976, + "router_z_loss_mlp": 0.54541016, + "step": 814, + "time_per_iteration": 2.7770931720733643 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01058453, + "balance_loss_mlp": 1.00366414, + "epoch": 0.15679107348980378, + "flos": 491782089984.0, + "grad_norm": 0.039213144049943555, + "language_loss": 0.88436091, + "learning_rate": 0.0009584381984987386, + "loss": 0.89494538, + "num_input_tokens_seen": 67378528, + "router_z_loss_mlp": 0.54931641, + "step": 815, + "time_per_iteration": 2.617560386657715 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061581, + "balance_loss_mlp": 1.00655353, + "epoch": 0.15698345517506734, + "flos": 531003187200.0, + "grad_norm": 0.030486806446719653, + "language_loss": 0.91117728, + "learning_rate": 0.0009583137507882864, + "loss": 0.92179304, + "num_input_tokens_seen": 67449728, + "router_z_loss_mlp": 0.55175781, + "step": 816, + "time_per_iteration": 2.6757051944732666 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060617, + "balance_loss_mlp": 1.00568497, + "epoch": 0.1571758368603309, + "flos": 547078316544.0, + "grad_norm": 0.03910336486934304, + "language_loss": 0.82217371, + "learning_rate": 0.000958189125144656, + "loss": 0.83277988, + "num_input_tokens_seen": 67520512, + "router_z_loss_mlp": 0.55078125, + "step": 817, + "time_per_iteration": 2.7065701484680176 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061392, + "balance_loss_mlp": 1.00655591, + "epoch": 0.15736821854559446, + "flos": 566744272896.0, + "grad_norm": 0.03730967846547413, + "language_loss": 0.89150202, + "learning_rate": 0.0009580643216162313, + "loss": 0.90211594, + "num_input_tokens_seen": 67592464, + "router_z_loss_mlp": 0.54980469, + "step": 818, + "time_per_iteration": 2.6849937438964844 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106252, + "balance_loss_mlp": 1.00792253, + "epoch": 0.15756060023085802, + "flos": 501954806784.0, + "grad_norm": 0.041127076818974775, + "language_loss": 0.80838168, + "learning_rate": 0.0009579393402514652, + "loss": 0.81900686, + "num_input_tokens_seen": 67658928, + "router_z_loss_mlp": 0.54736328, + "step": 819, + "time_per_iteration": 2.615342378616333 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060056, + "balance_loss_mlp": 1.00560164, + "epoch": 0.15775298191612158, + "flos": 520272502272.0, + "grad_norm": 0.037825026421493144, + "language_loss": 0.91941106, + "learning_rate": 0.0009578141810988801, + "loss": 0.93001157, + "num_input_tokens_seen": 67727936, + "router_z_loss_mlp": 0.54589844, + "step": 820, + "time_per_iteration": 2.6530544757843018 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061362, + "balance_loss_mlp": 1.00666904, + "epoch": 0.15794536360138514, + "flos": 467088584448.0, + "grad_norm": 0.039348813654249644, + "language_loss": 0.92238629, + "learning_rate": 0.0009576888442070668, + "loss": 0.93299985, + "num_input_tokens_seen": 67795488, + "router_z_loss_mlp": 0.54833984, + "step": 821, + "time_per_iteration": 2.5978658199310303 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062641, + "balance_loss_mlp": 1.00809062, + "epoch": 0.1581377452866487, + "flos": 518168941824.0, + "grad_norm": 0.03790806580601569, + "language_loss": 0.93657464, + "learning_rate": 0.0009575633296246854, + "loss": 0.94720107, + "num_input_tokens_seen": 67858896, + "router_z_loss_mlp": 0.546875, + "step": 822, + "time_per_iteration": 2.582139492034912 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061663, + "balance_loss_mlp": 1.00711334, + "epoch": 0.15833012697191226, + "flos": 550838284800.0, + "grad_norm": 0.03604802690546967, + "language_loss": 0.84146446, + "learning_rate": 0.0009574376374004652, + "loss": 0.85208106, + "num_input_tokens_seen": 67924864, + "router_z_loss_mlp": 0.546875, + "step": 823, + "time_per_iteration": 2.6182329654693604 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061584, + "balance_loss_mlp": 1.00703347, + "epoch": 0.15852250865717585, + "flos": 488467329024.0, + "grad_norm": 0.0382059884648543, + "language_loss": 0.82121176, + "learning_rate": 0.000957311767583204, + "loss": 0.83182758, + "num_input_tokens_seen": 67992912, + "router_z_loss_mlp": 0.546875, + "step": 824, + "time_per_iteration": 2.584266185760498 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01057571, + "balance_loss_mlp": 1.00531006, + "epoch": 0.1587148903424394, + "flos": 1312699441152.0, + "grad_norm": 0.00659207066158758, + "language_loss": 0.8207159, + "learning_rate": 0.0009571857202217691, + "loss": 0.83129162, + "num_input_tokens_seen": 68207408, + "router_z_loss_mlp": 0.5234375, + "step": 825, + "time_per_iteration": 4.734830856323242 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064159, + "balance_loss_mlp": 1.00965643, + "epoch": 0.15890727202770297, + "flos": 467833190400.0, + "grad_norm": 0.04624650490850591, + "language_loss": 0.92764026, + "learning_rate": 0.0009570594953650961, + "loss": 0.93828189, + "num_input_tokens_seen": 68270864, + "router_z_loss_mlp": 0.54638672, + "step": 826, + "time_per_iteration": 2.5117454528808594 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106388, + "balance_loss_mlp": 1.00937772, + "epoch": 0.15909965371296653, + "flos": 778607993088.0, + "grad_norm": 0.03976637787958364, + "language_loss": 0.81327987, + "learning_rate": 0.00095693309306219, + "loss": 0.8239187, + "num_input_tokens_seen": 68355408, + "router_z_loss_mlp": 0.54638672, + "step": 827, + "time_per_iteration": 3.1954681873321533 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060739, + "balance_loss_mlp": 1.00599849, + "epoch": 0.1592920353982301, + "flos": 1079964411648.0, + "grad_norm": 0.038150784713437476, + "language_loss": 0.89750922, + "learning_rate": 0.0009568065133621244, + "loss": 0.90811658, + "num_input_tokens_seen": 68437072, + "router_z_loss_mlp": 0.54882812, + "step": 828, + "time_per_iteration": 3.3355016708374023 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060884, + "balance_loss_mlp": 1.00642896, + "epoch": 0.15948441708349365, + "flos": 726890932992.0, + "grad_norm": 0.03986186218144037, + "language_loss": 0.85834098, + "learning_rate": 0.0009566797563140422, + "loss": 0.86894989, + "num_input_tokens_seen": 68511696, + "router_z_loss_mlp": 0.54589844, + "step": 829, + "time_per_iteration": 2.873845100402832 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059272, + "balance_loss_mlp": 1.00519884, + "epoch": 0.1596767987687572, + "flos": 580076215296.0, + "grad_norm": 0.03433333328837374, + "language_loss": 0.89395094, + "learning_rate": 0.0009565528219671547, + "loss": 0.90454364, + "num_input_tokens_seen": 68587488, + "router_z_loss_mlp": 0.54199219, + "step": 830, + "time_per_iteration": 2.9566032886505127 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063846, + "balance_loss_mlp": 1.00991619, + "epoch": 0.15986918045402077, + "flos": 530026256640.0, + "grad_norm": 0.037800776955081314, + "language_loss": 0.86586118, + "learning_rate": 0.0009564257103707418, + "loss": 0.87649965, + "num_input_tokens_seen": 68655760, + "router_z_loss_mlp": 0.54052734, + "step": 831, + "time_per_iteration": 2.6305205821990967 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062726, + "balance_loss_mlp": 1.00870061, + "epoch": 0.16006156213928435, + "flos": 575670796032.0, + "grad_norm": 0.04196239075383403, + "language_loss": 0.92502224, + "learning_rate": 0.0009562984215741533, + "loss": 0.93564951, + "num_input_tokens_seen": 68724560, + "router_z_loss_mlp": 0.54150391, + "step": 832, + "time_per_iteration": 2.6781210899353027 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061607, + "balance_loss_mlp": 1.00743783, + "epoch": 0.1602539438245479, + "flos": 516675839232.0, + "grad_norm": 0.039654673227061156, + "language_loss": 0.83729708, + "learning_rate": 0.0009561709556268065, + "loss": 0.84791321, + "num_input_tokens_seen": 68795440, + "router_z_loss_mlp": 0.54296875, + "step": 833, + "time_per_iteration": 2.732191801071167 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064816, + "balance_loss_mlp": 1.01021826, + "epoch": 0.16044632550981147, + "flos": 622162008576.0, + "grad_norm": 0.03600956841171521, + "language_loss": 0.95349514, + "learning_rate": 0.0009560433125781884, + "loss": 0.96414334, + "num_input_tokens_seen": 68868176, + "router_z_loss_mlp": 0.54736328, + "step": 834, + "time_per_iteration": 4.227160215377808 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063475, + "balance_loss_mlp": 1.008973, + "epoch": 0.16063870719507503, + "flos": 562128883200.0, + "grad_norm": 0.03652136008848007, + "language_loss": 0.94107795, + "learning_rate": 0.0009559154924778544, + "loss": 0.95171273, + "num_input_tokens_seen": 68939616, + "router_z_loss_mlp": 0.54638672, + "step": 835, + "time_per_iteration": 2.7238283157348633 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066589, + "balance_loss_mlp": 1.01251614, + "epoch": 0.1608310888803386, + "flos": 806561824512.0, + "grad_norm": 0.044196177378580975, + "language_loss": 0.86185992, + "learning_rate": 0.0009557874953754284, + "loss": 0.87252581, + "num_input_tokens_seen": 69016192, + "router_z_loss_mlp": 0.54199219, + "step": 836, + "time_per_iteration": 3.03965425491333 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063553, + "balance_loss_mlp": 1.00943184, + "epoch": 0.16102347056560215, + "flos": 601695065856.0, + "grad_norm": 0.04086380423696876, + "language_loss": 0.84961462, + "learning_rate": 0.0009556593213206038, + "loss": 0.86025023, + "num_input_tokens_seen": 69089360, + "router_z_loss_mlp": 0.54248047, + "step": 837, + "time_per_iteration": 2.714165687561035 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063546, + "balance_loss_mlp": 1.0095681, + "epoch": 0.1612158522508657, + "flos": 554615749632.0, + "grad_norm": 0.03942211179170501, + "language_loss": 0.88284755, + "learning_rate": 0.0009555309703631414, + "loss": 0.89348304, + "num_input_tokens_seen": 69161952, + "router_z_loss_mlp": 0.54101562, + "step": 838, + "time_per_iteration": 2.6616575717926025 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061318, + "balance_loss_mlp": 1.00729215, + "epoch": 0.16140823393612927, + "flos": 557018708736.0, + "grad_norm": 0.03970121061853926, + "language_loss": 0.88476837, + "learning_rate": 0.0009554024425528722, + "loss": 0.89538157, + "num_input_tokens_seen": 69232432, + "router_z_loss_mlp": 0.54150391, + "step": 839, + "time_per_iteration": 2.6778693199157715 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061761, + "balance_loss_mlp": 1.00792611, + "epoch": 0.16160061562139286, + "flos": 544909627392.0, + "grad_norm": 0.03616953348933095, + "language_loss": 0.90216744, + "learning_rate": 0.0009552737379396948, + "loss": 0.91278505, + "num_input_tokens_seen": 69297696, + "router_z_loss_mlp": 0.53955078, + "step": 840, + "time_per_iteration": 2.6190080642700195 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060215, + "balance_loss_mlp": 1.00638056, + "epoch": 0.16179299730665642, + "flos": 605007881472.0, + "grad_norm": 0.03485432207779616, + "language_loss": 0.88917094, + "learning_rate": 0.0009551448565735767, + "loss": 0.89977312, + "num_input_tokens_seen": 69373888, + "router_z_loss_mlp": 0.53955078, + "step": 841, + "time_per_iteration": 2.771730422973633 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059839, + "balance_loss_mlp": 1.00624251, + "epoch": 0.16198537899191998, + "flos": 788552275968.0, + "grad_norm": 0.040424272174261144, + "language_loss": 0.855564, + "learning_rate": 0.0009550157985045543, + "loss": 0.86616236, + "num_input_tokens_seen": 69449984, + "router_z_loss_mlp": 0.53710938, + "step": 842, + "time_per_iteration": 3.014448642730713 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063416, + "balance_loss_mlp": 1.00986671, + "epoch": 0.16217776067718354, + "flos": 520830470400.0, + "grad_norm": 0.03210449059239548, + "language_loss": 0.9010545, + "learning_rate": 0.0009548865637827321, + "loss": 0.91168869, + "num_input_tokens_seen": 69522736, + "router_z_loss_mlp": 0.53662109, + "step": 843, + "time_per_iteration": 2.663733959197998 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060664, + "balance_loss_mlp": 1.00725794, + "epoch": 0.1623701423624471, + "flos": 506255246592.0, + "grad_norm": 0.04236042945807781, + "language_loss": 0.91279781, + "learning_rate": 0.0009547571524582838, + "loss": 0.92340446, + "num_input_tokens_seen": 69587184, + "router_z_loss_mlp": 0.53515625, + "step": 844, + "time_per_iteration": 2.5841143131256104 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061607, + "balance_loss_mlp": 1.00848722, + "epoch": 0.16256252404771065, + "flos": 498157900032.0, + "grad_norm": 0.043042899099755685, + "language_loss": 0.93573415, + "learning_rate": 0.0009546275645814512, + "loss": 0.94635028, + "num_input_tokens_seen": 69656560, + "router_z_loss_mlp": 0.53222656, + "step": 845, + "time_per_iteration": 2.601743221282959 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064884, + "balance_loss_mlp": 1.01152599, + "epoch": 0.16275490573297421, + "flos": 503287516416.0, + "grad_norm": 0.046422900850994125, + "language_loss": 0.90658545, + "learning_rate": 0.0009544978002025446, + "loss": 0.9172343, + "num_input_tokens_seen": 69723872, + "router_z_loss_mlp": 0.53466797, + "step": 846, + "time_per_iteration": 2.582463502883911 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062884, + "balance_loss_mlp": 1.00957346, + "epoch": 0.16294728741823777, + "flos": 508354916352.0, + "grad_norm": 0.03474620131823351, + "language_loss": 0.88017273, + "learning_rate": 0.0009543678593719434, + "loss": 0.89080155, + "num_input_tokens_seen": 69795504, + "router_z_loss_mlp": 0.53417969, + "step": 847, + "time_per_iteration": 2.7039546966552734 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067088, + "balance_loss_mlp": 1.01334834, + "epoch": 0.16313966910350133, + "flos": 510757875456.0, + "grad_norm": 0.031134263506057067, + "language_loss": 0.88570058, + "learning_rate": 0.0009542377421400945, + "loss": 0.89637142, + "num_input_tokens_seen": 69873408, + "router_z_loss_mlp": 0.53857422, + "step": 848, + "time_per_iteration": 2.79311203956604 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061983, + "balance_loss_mlp": 1.00810015, + "epoch": 0.16333205078876492, + "flos": 545057381376.0, + "grad_norm": 0.03805815068737175, + "language_loss": 0.84448338, + "learning_rate": 0.0009541074485575145, + "loss": 0.85510319, + "num_input_tokens_seen": 69944112, + "router_z_loss_mlp": 0.54003906, + "step": 849, + "time_per_iteration": 2.714644193649292 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106829, + "balance_loss_mlp": 1.01450312, + "epoch": 0.16352443247402848, + "flos": 508712640768.0, + "grad_norm": 0.03447226436126556, + "language_loss": 0.93184924, + "learning_rate": 0.0009539769786747874, + "loss": 0.94253218, + "num_input_tokens_seen": 70012288, + "router_z_loss_mlp": 0.5390625, + "step": 850, + "time_per_iteration": 2.5857110023498535 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070929, + "balance_loss_mlp": 1.01709449, + "epoch": 0.16371681415929204, + "flos": 543223084032.0, + "grad_norm": 0.036141614394747515, + "language_loss": 0.82550752, + "learning_rate": 0.0009538463325425665, + "loss": 0.83621687, + "num_input_tokens_seen": 70086560, + "router_z_loss_mlp": 0.53955078, + "step": 851, + "time_per_iteration": 2.7186405658721924 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066598, + "balance_loss_mlp": 1.01242912, + "epoch": 0.1639091958445556, + "flos": 521761714176.0, + "grad_norm": 0.03784697093976771, + "language_loss": 0.87203169, + "learning_rate": 0.0009537155102115728, + "loss": 0.8826977, + "num_input_tokens_seen": 70153968, + "router_z_loss_mlp": 0.54296875, + "step": 852, + "time_per_iteration": 2.5761775970458984 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061784, + "balance_loss_mlp": 1.00771022, + "epoch": 0.16410157752981916, + "flos": 548482957824.0, + "grad_norm": 0.03731294741121226, + "language_loss": 0.85278255, + "learning_rate": 0.0009535845117325961, + "loss": 0.8634004, + "num_input_tokens_seen": 70222496, + "router_z_loss_mlp": 0.54199219, + "step": 853, + "time_per_iteration": 2.6968846321105957 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065026, + "balance_loss_mlp": 1.01085758, + "epoch": 0.16429395921508272, + "flos": 584026712064.0, + "grad_norm": 0.031860977478103375, + "language_loss": 0.9423098, + "learning_rate": 0.0009534533371564946, + "loss": 0.95296007, + "num_input_tokens_seen": 70301680, + "router_z_loss_mlp": 0.54296875, + "step": 854, + "time_per_iteration": 2.7640349864959717 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106098, + "balance_loss_mlp": 1.00709713, + "epoch": 0.16448634090034628, + "flos": 531962621184.0, + "grad_norm": 0.03950290113288642, + "language_loss": 0.89868152, + "learning_rate": 0.0009533219865341949, + "loss": 0.90929133, + "num_input_tokens_seen": 70371152, + "router_z_loss_mlp": 0.54003906, + "step": 855, + "time_per_iteration": 2.6025009155273438 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060489, + "balance_loss_mlp": 1.00693989, + "epoch": 0.16467872258560984, + "flos": 492961209600.0, + "grad_norm": 0.03645156199748424, + "language_loss": 0.87602645, + "learning_rate": 0.0009531904599166916, + "loss": 0.88663131, + "num_input_tokens_seen": 70440832, + "router_z_loss_mlp": 0.53662109, + "step": 856, + "time_per_iteration": 2.656604290008545 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060758, + "balance_loss_mlp": 1.00730467, + "epoch": 0.16487110427087343, + "flos": 507260367360.0, + "grad_norm": 0.04426557796634758, + "language_loss": 0.86560714, + "learning_rate": 0.0009530587573550478, + "loss": 0.87621474, + "num_input_tokens_seen": 70507424, + "router_z_loss_mlp": 0.53564453, + "step": 857, + "time_per_iteration": 2.610445261001587 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01056503, + "balance_loss_mlp": 1.00538635, + "epoch": 0.16506348595613698, + "flos": 1436111555328.0, + "grad_norm": 0.010874217326465607, + "language_loss": 0.74319386, + "learning_rate": 0.0009529268789003953, + "loss": 0.75375891, + "num_input_tokens_seen": 70742320, + "router_z_loss_mlp": 0.51171875, + "step": 858, + "time_per_iteration": 4.991516590118408 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060779, + "balance_loss_mlp": 1.00718212, + "epoch": 0.16525586764140054, + "flos": 478090477824.0, + "grad_norm": 0.04454190836652637, + "language_loss": 0.91544032, + "learning_rate": 0.0009527948246039337, + "loss": 0.9260481, + "num_input_tokens_seen": 70808400, + "router_z_loss_mlp": 0.53710938, + "step": 859, + "time_per_iteration": 2.538290500640869 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01058142, + "balance_loss_mlp": 1.00425971, + "epoch": 0.1654482493266641, + "flos": 882541767168.0, + "grad_norm": 0.03991834039284953, + "language_loss": 0.88867122, + "learning_rate": 0.000952662594516931, + "loss": 0.89925265, + "num_input_tokens_seen": 70886192, + "router_z_loss_mlp": 0.54003906, + "step": 860, + "time_per_iteration": 3.083786964416504 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065202, + "balance_loss_mlp": 1.01122451, + "epoch": 0.16564063101192766, + "flos": 628106217216.0, + "grad_norm": 0.03630731527649873, + "language_loss": 0.87934124, + "learning_rate": 0.0009525301886907234, + "loss": 0.88999331, + "num_input_tokens_seen": 70964816, + "router_z_loss_mlp": 0.54101562, + "step": 861, + "time_per_iteration": 2.8606412410736084 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062775, + "balance_loss_mlp": 1.00884438, + "epoch": 0.16583301269719122, + "flos": 562593532416.0, + "grad_norm": 0.03632506699489255, + "language_loss": 0.8885988, + "learning_rate": 0.0009523976071767155, + "loss": 0.89922649, + "num_input_tokens_seen": 71037456, + "router_z_loss_mlp": 0.54052734, + "step": 862, + "time_per_iteration": 2.651202440261841 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062989, + "balance_loss_mlp": 1.0094403, + "epoch": 0.16602539438245478, + "flos": 568984893696.0, + "grad_norm": 0.03883194498572106, + "language_loss": 0.88789731, + "learning_rate": 0.00095226485002638, + "loss": 0.8985272, + "num_input_tokens_seen": 71111872, + "router_z_loss_mlp": 0.53662109, + "step": 863, + "time_per_iteration": 2.798125982284546 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063787, + "balance_loss_mlp": 1.01019073, + "epoch": 0.16621777606771834, + "flos": 576022684416.0, + "grad_norm": 0.03638934937563812, + "language_loss": 0.89892161, + "learning_rate": 0.0009521319172912576, + "loss": 0.90955949, + "num_input_tokens_seen": 71187808, + "router_z_loss_mlp": 0.53710938, + "step": 864, + "time_per_iteration": 4.098716974258423 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105997, + "balance_loss_mlp": 1.00632548, + "epoch": 0.16641015775298193, + "flos": 515598786816.0, + "grad_norm": 0.037169751839881825, + "language_loss": 0.96108532, + "learning_rate": 0.0009519988090229579, + "loss": 0.97168505, + "num_input_tokens_seen": 71261728, + "router_z_loss_mlp": 0.53759766, + "step": 865, + "time_per_iteration": 2.659381628036499 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068447, + "balance_loss_mlp": 1.01489806, + "epoch": 0.1666025394382455, + "flos": 622850234112.0, + "grad_norm": 0.04388029559541895, + "language_loss": 0.88811028, + "learning_rate": 0.0009518655252731576, + "loss": 0.89879477, + "num_input_tokens_seen": 71338352, + "router_z_loss_mlp": 0.53662109, + "step": 866, + "time_per_iteration": 2.738511323928833 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061925, + "balance_loss_mlp": 1.00880551, + "epoch": 0.16679492112350905, + "flos": 549933285888.0, + "grad_norm": 0.03352631932153436, + "language_loss": 0.91113746, + "learning_rate": 0.0009517320660936022, + "loss": 0.92175674, + "num_input_tokens_seen": 71416544, + "router_z_loss_mlp": 0.53222656, + "step": 867, + "time_per_iteration": 2.7755699157714844 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066316, + "balance_loss_mlp": 1.01343453, + "epoch": 0.1669873028087726, + "flos": 666866555904.0, + "grad_norm": 0.04051359913494383, + "language_loss": 0.84396493, + "learning_rate": 0.0009515984315361051, + "loss": 0.85462809, + "num_input_tokens_seen": 71494080, + "router_z_loss_mlp": 0.52978516, + "step": 868, + "time_per_iteration": 2.8502533435821533 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062608, + "balance_loss_mlp": 1.00944042, + "epoch": 0.16717968449403617, + "flos": 539604066816.0, + "grad_norm": 0.03969494402961726, + "language_loss": 0.88029611, + "learning_rate": 0.000951464621652548, + "loss": 0.89092225, + "num_input_tokens_seen": 71562672, + "router_z_loss_mlp": 0.53271484, + "step": 869, + "time_per_iteration": 2.6079800128936768 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065881, + "balance_loss_mlp": 1.01233244, + "epoch": 0.16737206617929973, + "flos": 531279253248.0, + "grad_norm": 0.03349656106003216, + "language_loss": 0.7990135, + "learning_rate": 0.0009513306364948804, + "loss": 0.80967236, + "num_input_tokens_seen": 71641904, + "router_z_loss_mlp": 0.53662109, + "step": 870, + "time_per_iteration": 2.824232578277588 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106371, + "balance_loss_mlp": 1.00987494, + "epoch": 0.1675644478645633, + "flos": 481757127168.0, + "grad_norm": 0.04264569815750397, + "language_loss": 0.90229708, + "learning_rate": 0.0009511964761151197, + "loss": 0.91293418, + "num_input_tokens_seen": 71709616, + "router_z_loss_mlp": 0.53955078, + "step": 871, + "time_per_iteration": 2.6326816082000732 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106642, + "balance_loss_mlp": 1.01344323, + "epoch": 0.16775682954982685, + "flos": 495542058240.0, + "grad_norm": 0.04000245460937008, + "language_loss": 0.91825569, + "learning_rate": 0.0009510621405653521, + "loss": 0.92891991, + "num_input_tokens_seen": 71776592, + "router_z_loss_mlp": 0.53076172, + "step": 872, + "time_per_iteration": 2.5802783966064453 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074346, + "balance_loss_mlp": 1.02151191, + "epoch": 0.1679492112350904, + "flos": 753406096896.0, + "grad_norm": 0.04130745072346603, + "language_loss": 0.85908926, + "learning_rate": 0.0009509276298977309, + "loss": 0.86983275, + "num_input_tokens_seen": 71856352, + "router_z_loss_mlp": 0.52929688, + "step": 873, + "time_per_iteration": 2.9676413536071777 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069963, + "balance_loss_mlp": 1.01689136, + "epoch": 0.168141592920354, + "flos": 1137733583616.0, + "grad_norm": 0.036676349776393134, + "language_loss": 0.82925022, + "learning_rate": 0.0009507929441644778, + "loss": 0.83994985, + "num_input_tokens_seen": 71948480, + "router_z_loss_mlp": 0.53173828, + "step": 874, + "time_per_iteration": 3.5441927909851074 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062646, + "balance_loss_mlp": 1.00924039, + "epoch": 0.16833397460561755, + "flos": 633554674176.0, + "grad_norm": 0.03715311549034911, + "language_loss": 0.86810201, + "learning_rate": 0.0009506580834178826, + "loss": 0.87872851, + "num_input_tokens_seen": 72019200, + "router_z_loss_mlp": 0.53515625, + "step": 875, + "time_per_iteration": 2.767840623855591 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106879, + "balance_loss_mlp": 1.01524162, + "epoch": 0.1685263562908811, + "flos": 542543606784.0, + "grad_norm": 0.041322978640758234, + "language_loss": 0.92533737, + "learning_rate": 0.0009505230477103028, + "loss": 0.93602526, + "num_input_tokens_seen": 72088672, + "router_z_loss_mlp": 0.53662109, + "step": 876, + "time_per_iteration": 2.68626070022583 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064685, + "balance_loss_mlp": 1.01151776, + "epoch": 0.16871873797614467, + "flos": 620486158848.0, + "grad_norm": 0.04979097271806245, + "language_loss": 0.82312369, + "learning_rate": 0.0009503878370941641, + "loss": 0.83377057, + "num_input_tokens_seen": 72159952, + "router_z_loss_mlp": 0.53271484, + "step": 877, + "time_per_iteration": 2.738828182220459 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067432, + "balance_loss_mlp": 1.01455081, + "epoch": 0.16891111966140823, + "flos": 607456527360.0, + "grad_norm": 0.048240798926105125, + "language_loss": 0.90597415, + "learning_rate": 0.0009502524516219595, + "loss": 0.91664839, + "num_input_tokens_seen": 72231648, + "router_z_loss_mlp": 0.52978516, + "step": 878, + "time_per_iteration": 2.7533464431762695 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065414, + "balance_loss_mlp": 1.01234174, + "epoch": 0.1691035013466718, + "flos": 553406494464.0, + "grad_norm": 0.04285435284136928, + "language_loss": 0.91275579, + "learning_rate": 0.0009501168913462506, + "loss": 0.92340994, + "num_input_tokens_seen": 72298608, + "router_z_loss_mlp": 0.53173828, + "step": 879, + "time_per_iteration": 2.6498849391937256 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106115, + "balance_loss_mlp": 1.00946045, + "epoch": 0.16929588303193535, + "flos": 1479308427264.0, + "grad_norm": 0.010969186313753012, + "language_loss": 0.79121923, + "learning_rate": 0.0009499811563196665, + "loss": 0.80183077, + "num_input_tokens_seen": 72525312, + "router_z_loss_mlp": 0.51757812, + "step": 880, + "time_per_iteration": 4.8127734661102295 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065784, + "balance_loss_mlp": 1.01228285, + "epoch": 0.1694882647171989, + "flos": 927848024064.0, + "grad_norm": 0.04254449001590413, + "language_loss": 0.86211771, + "learning_rate": 0.0009498452465949042, + "loss": 0.87277561, + "num_input_tokens_seen": 72612976, + "router_z_loss_mlp": 0.53613281, + "step": 881, + "time_per_iteration": 3.242352247238159 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059657, + "balance_loss_mlp": 1.00668061, + "epoch": 0.1696806464024625, + "flos": 547152193536.0, + "grad_norm": 0.03842920637304405, + "language_loss": 0.92758489, + "learning_rate": 0.0009497091622247285, + "loss": 0.93818152, + "num_input_tokens_seen": 72686800, + "router_z_loss_mlp": 0.53076172, + "step": 882, + "time_per_iteration": 2.7538321018218994 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066328, + "balance_loss_mlp": 1.01363766, + "epoch": 0.16987302808772606, + "flos": 530295519744.0, + "grad_norm": 0.04346709327253658, + "language_loss": 0.94739175, + "learning_rate": 0.0009495729032619723, + "loss": 0.95805502, + "num_input_tokens_seen": 72759360, + "router_z_loss_mlp": 0.52783203, + "step": 883, + "time_per_iteration": 2.681851863861084 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061141, + "balance_loss_mlp": 1.00830746, + "epoch": 0.17006540977298962, + "flos": 756479784960.0, + "grad_norm": 0.03707996109728333, + "language_loss": 0.85065424, + "learning_rate": 0.0009494364697595354, + "loss": 0.86126566, + "num_input_tokens_seen": 72831424, + "router_z_loss_mlp": 0.52929688, + "step": 884, + "time_per_iteration": 2.886613607406616 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01058078, + "balance_loss_mlp": 1.00495851, + "epoch": 0.17025779145825318, + "flos": 559875623424.0, + "grad_norm": 0.04262534374301406, + "language_loss": 0.90753883, + "learning_rate": 0.0009492998617703867, + "loss": 0.91811961, + "num_input_tokens_seen": 72901536, + "router_z_loss_mlp": 0.53222656, + "step": 885, + "time_per_iteration": 2.7197954654693604 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069962, + "balance_loss_mlp": 1.01684284, + "epoch": 0.17045017314351674, + "flos": 513217214976.0, + "grad_norm": 0.04472607646913617, + "language_loss": 0.89151132, + "learning_rate": 0.0009491630793475619, + "loss": 0.90221095, + "num_input_tokens_seen": 72970480, + "router_z_loss_mlp": 0.53222656, + "step": 886, + "time_per_iteration": 2.6023643016815186 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059759, + "balance_loss_mlp": 1.00706899, + "epoch": 0.1706425548287803, + "flos": 510013269504.0, + "grad_norm": 0.03690999998020265, + "language_loss": 0.86250949, + "learning_rate": 0.0009490261225441643, + "loss": 0.87310708, + "num_input_tokens_seen": 73053376, + "router_z_loss_mlp": 0.52783203, + "step": 887, + "time_per_iteration": 2.8811516761779785 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070816, + "balance_loss_mlp": 1.01845872, + "epoch": 0.17083493651404386, + "flos": 718715818752.0, + "grad_norm": 0.037520519160069404, + "language_loss": 0.91723603, + "learning_rate": 0.0009488889914133656, + "loss": 0.92794418, + "num_input_tokens_seen": 73136032, + "router_z_loss_mlp": 0.52441406, + "step": 888, + "time_per_iteration": 2.983920097351074 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067559, + "balance_loss_mlp": 1.01515496, + "epoch": 0.17102731819930742, + "flos": 560201266944.0, + "grad_norm": 0.034570155262309, + "language_loss": 0.90050644, + "learning_rate": 0.0009487516860084047, + "loss": 0.91118205, + "num_input_tokens_seen": 73208544, + "router_z_loss_mlp": 0.52490234, + "step": 889, + "time_per_iteration": 2.739945888519287 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061028, + "balance_loss_mlp": 1.0078603, + "epoch": 0.17121969988457098, + "flos": 495765634560.0, + "grad_norm": 0.04354558177795279, + "language_loss": 0.9033885, + "learning_rate": 0.0009486142063825884, + "loss": 0.91399872, + "num_input_tokens_seen": 73274336, + "router_z_loss_mlp": 0.53271484, + "step": 890, + "time_per_iteration": 2.541325569152832 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107373, + "balance_loss_mlp": 1.02223206, + "epoch": 0.17141208156983456, + "flos": 1552108723968.0, + "grad_norm": 0.01766408052426257, + "language_loss": 0.72426212, + "learning_rate": 0.0009484765525892909, + "loss": 0.73499948, + "num_input_tokens_seen": 73506320, + "router_z_loss_mlp": 0.515625, + "step": 891, + "time_per_iteration": 4.968579053878784 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01058616, + "balance_loss_mlp": 1.00568736, + "epoch": 0.17160446325509812, + "flos": 620700986880.0, + "grad_norm": 0.037544702591063864, + "language_loss": 0.91210532, + "learning_rate": 0.0009483387246819542, + "loss": 0.92269152, + "num_input_tokens_seen": 73578048, + "router_z_loss_mlp": 0.53027344, + "step": 892, + "time_per_iteration": 2.7970938682556152 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071655, + "balance_loss_mlp": 1.0209198, + "epoch": 0.17179684494036168, + "flos": 1384695839232.0, + "grad_norm": 0.01601076320839161, + "language_loss": 0.82285583, + "learning_rate": 0.0009482007227140877, + "loss": 0.83357239, + "num_input_tokens_seen": 73798640, + "router_z_loss_mlp": 0.5078125, + "step": 893, + "time_per_iteration": 4.629605054855347 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066274, + "balance_loss_mlp": 1.01386988, + "epoch": 0.17198922662562524, + "flos": 493642632192.0, + "grad_norm": 0.03763004911158334, + "language_loss": 0.90241146, + "learning_rate": 0.0009480625467392688, + "loss": 0.91307414, + "num_input_tokens_seen": 73867328, + "router_z_loss_mlp": 0.52490234, + "step": 894, + "time_per_iteration": 2.6142358779907227 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068771, + "balance_loss_mlp": 1.01822662, + "epoch": 0.1721816083108888, + "flos": 1461488428800.0, + "grad_norm": 0.016749035753296605, + "language_loss": 0.77994668, + "learning_rate": 0.0009479241968111421, + "loss": 0.79063439, + "num_input_tokens_seen": 74093376, + "router_z_loss_mlp": 0.50585938, + "step": 895, + "time_per_iteration": 4.811494827270508 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112065, + "balance_loss_mlp": 1.06719661, + "epoch": 0.17237398999615236, + "flos": 529205828352.0, + "grad_norm": 0.05241044192650153, + "language_loss": 0.88738441, + "learning_rate": 0.0009477856729834196, + "loss": 0.89859092, + "num_input_tokens_seen": 74169136, + "router_z_loss_mlp": 0.53564453, + "step": 896, + "time_per_iteration": 2.7389612197875977 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066584, + "balance_loss_mlp": 1.01446557, + "epoch": 0.17256637168141592, + "flos": 605027323392.0, + "grad_norm": 0.03860455021635393, + "language_loss": 0.90989411, + "learning_rate": 0.0009476469753098809, + "loss": 0.92055988, + "num_input_tokens_seen": 74236912, + "router_z_loss_mlp": 0.52197266, + "step": 897, + "time_per_iteration": 2.7175238132476807 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077196, + "balance_loss_mlp": 1.02507758, + "epoch": 0.17275875336667948, + "flos": 510694692096.0, + "grad_norm": 0.040412661310783936, + "language_loss": 0.88453948, + "learning_rate": 0.0009475081038443738, + "loss": 0.89531147, + "num_input_tokens_seen": 74305968, + "router_z_loss_mlp": 0.52197266, + "step": 898, + "time_per_iteration": 2.6398110389709473 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079459, + "balance_loss_mlp": 1.02753115, + "epoch": 0.17295113505194307, + "flos": 666502028544.0, + "grad_norm": 0.045107808798334564, + "language_loss": 0.87902451, + "learning_rate": 0.0009473690586408124, + "loss": 0.88981915, + "num_input_tokens_seen": 74384144, + "router_z_loss_mlp": 0.52001953, + "step": 899, + "time_per_iteration": 2.817730665206909 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071678, + "balance_loss_mlp": 1.01965487, + "epoch": 0.17314351673720663, + "flos": 556432550400.0, + "grad_norm": 0.03870851432877784, + "language_loss": 0.87576568, + "learning_rate": 0.0009472298397531792, + "loss": 0.88648236, + "num_input_tokens_seen": 74455040, + "router_z_loss_mlp": 0.52099609, + "step": 900, + "time_per_iteration": 2.6932764053344727 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061802, + "balance_loss_mlp": 1.00892079, + "epoch": 0.17333589842247019, + "flos": 504607587072.0, + "grad_norm": 0.03631909976073519, + "language_loss": 0.87174571, + "learning_rate": 0.0009470904472355235, + "loss": 0.88236374, + "num_input_tokens_seen": 74525248, + "router_z_loss_mlp": 0.52978516, + "step": 901, + "time_per_iteration": 2.669405460357666 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099242, + "balance_loss_mlp": 1.04593205, + "epoch": 0.17352828010773375, + "flos": 557351155200.0, + "grad_norm": 0.04839261993488341, + "language_loss": 0.80976391, + "learning_rate": 0.0009469508811419626, + "loss": 0.82075632, + "num_input_tokens_seen": 74597328, + "router_z_loss_mlp": 0.53417969, + "step": 902, + "time_per_iteration": 2.7412211894989014 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083992, + "balance_loss_mlp": 1.033638, + "epoch": 0.1737206617929973, + "flos": 1557794363136.0, + "grad_norm": 0.02136399149953286, + "language_loss": 0.7161383, + "learning_rate": 0.0009468111415266806, + "loss": 0.72697818, + "num_input_tokens_seen": 74819664, + "router_z_loss_mlp": 0.50390625, + "step": 903, + "time_per_iteration": 4.800720930099487 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075494, + "balance_loss_mlp": 1.02318478, + "epoch": 0.17391304347826086, + "flos": 517756782336.0, + "grad_norm": 0.04178806719411302, + "language_loss": 0.85797513, + "learning_rate": 0.0009466712284439292, + "loss": 0.86873007, + "num_input_tokens_seen": 74896224, + "router_z_loss_mlp": 0.52392578, + "step": 904, + "time_per_iteration": 2.7409780025482178 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076244, + "balance_loss_mlp": 1.02360141, + "epoch": 0.17410542516352442, + "flos": 542161582848.0, + "grad_norm": 0.043268311729831165, + "language_loss": 0.90273786, + "learning_rate": 0.0009465311419480276, + "loss": 0.91350031, + "num_input_tokens_seen": 74966560, + "router_z_loss_mlp": 0.52734375, + "step": 905, + "time_per_iteration": 2.7310986518859863 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068245, + "balance_loss_mlp": 1.01526833, + "epoch": 0.17429780684878798, + "flos": 625082106624.0, + "grad_norm": 0.0375699532684124, + "language_loss": 0.89484948, + "learning_rate": 0.0009463908820933622, + "loss": 0.905532, + "num_input_tokens_seen": 75045248, + "router_z_loss_mlp": 0.53076172, + "step": 906, + "time_per_iteration": 2.8575551509857178 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086696, + "balance_loss_mlp": 1.03281319, + "epoch": 0.17449018853405157, + "flos": 576849915648.0, + "grad_norm": 0.04286783530345041, + "language_loss": 0.83513701, + "learning_rate": 0.0009462504489343868, + "loss": 0.84600401, + "num_input_tokens_seen": 75123952, + "router_z_loss_mlp": 0.54003906, + "step": 907, + "time_per_iteration": 2.83085036277771 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066078, + "balance_loss_mlp": 1.0128628, + "epoch": 0.17468257021931513, + "flos": 534773849088.0, + "grad_norm": 0.0408315501053547, + "language_loss": 0.90177906, + "learning_rate": 0.0009461098425256222, + "loss": 0.91243982, + "num_input_tokens_seen": 75191728, + "router_z_loss_mlp": 0.53320312, + "step": 908, + "time_per_iteration": 2.6000654697418213 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075411, + "balance_loss_mlp": 1.02257717, + "epoch": 0.1748749519045787, + "flos": 541809694464.0, + "grad_norm": 0.0381088809784924, + "language_loss": 0.87053907, + "learning_rate": 0.0009459690629216567, + "loss": 0.88129318, + "num_input_tokens_seen": 75262224, + "router_z_loss_mlp": 0.52929688, + "step": 909, + "time_per_iteration": 2.622178316116333 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080252, + "balance_loss_mlp": 1.02770495, + "epoch": 0.17506733358984225, + "flos": 499627670016.0, + "grad_norm": 0.039096197570908604, + "language_loss": 0.88898331, + "learning_rate": 0.0009458281101771457, + "loss": 0.89978582, + "num_input_tokens_seen": 75329760, + "router_z_loss_mlp": 0.52636719, + "step": 910, + "time_per_iteration": 2.5964770317077637 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064818, + "balance_loss_mlp": 1.01217556, + "epoch": 0.1752597152751058, + "flos": 624133366272.0, + "grad_norm": 0.035444142957055544, + "language_loss": 0.83730716, + "learning_rate": 0.0009456869843468122, + "loss": 0.84795535, + "num_input_tokens_seen": 75407920, + "router_z_loss_mlp": 0.52734375, + "step": 911, + "time_per_iteration": 2.834584951400757 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059336, + "balance_loss_mlp": 1.00650251, + "epoch": 0.17545209696036937, + "flos": 521994038784.0, + "grad_norm": 0.04587594362499167, + "language_loss": 0.79429859, + "learning_rate": 0.0009455456854854459, + "loss": 0.80489194, + "num_input_tokens_seen": 75476752, + "router_z_loss_mlp": 0.52929688, + "step": 912, + "time_per_iteration": 2.627058744430542 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107498, + "balance_loss_mlp": 1.0219084, + "epoch": 0.17564447864563293, + "flos": 462946592256.0, + "grad_norm": 0.044462507375804226, + "language_loss": 0.85522115, + "learning_rate": 0.0009454042136479039, + "loss": 0.86597091, + "num_input_tokens_seen": 75542944, + "router_z_loss_mlp": 0.53173828, + "step": 913, + "time_per_iteration": 2.562453031539917 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106477, + "balance_loss_mlp": 1.01250815, + "epoch": 0.1758368603308965, + "flos": 481618121472.0, + "grad_norm": 0.03599423435064716, + "language_loss": 0.84144086, + "learning_rate": 0.0009452625688891103, + "loss": 0.85208857, + "num_input_tokens_seen": 75609840, + "router_z_loss_mlp": 0.5234375, + "step": 914, + "time_per_iteration": 2.6025402545928955 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063728, + "balance_loss_mlp": 1.0137558, + "epoch": 0.17602924201616005, + "flos": 1482087574272.0, + "grad_norm": 0.013260252544834742, + "language_loss": 0.78734738, + "learning_rate": 0.0009451207512640567, + "loss": 0.79798466, + "num_input_tokens_seen": 75819312, + "router_z_loss_mlp": 0.49902344, + "step": 915, + "time_per_iteration": 4.572151184082031 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107428, + "balance_loss_mlp": 1.0219233, + "epoch": 0.17622162370142364, + "flos": 603471037440.0, + "grad_norm": 0.044830704586910027, + "language_loss": 0.94022703, + "learning_rate": 0.0009449787608278015, + "loss": 0.95096982, + "num_input_tokens_seen": 75893984, + "router_z_loss_mlp": 0.52441406, + "step": 916, + "time_per_iteration": 2.731264114379883 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062837, + "balance_loss_mlp": 1.0104804, + "epoch": 0.1764140053866872, + "flos": 443606279424.0, + "grad_norm": 0.0370205772569368, + "language_loss": 0.92972034, + "learning_rate": 0.0009448365976354704, + "loss": 0.94034874, + "num_input_tokens_seen": 75958944, + "router_z_loss_mlp": 0.52441406, + "step": 917, + "time_per_iteration": 2.478041648864746 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073342, + "balance_loss_mlp": 1.0204134, + "epoch": 0.17660638707195075, + "flos": 501592224768.0, + "grad_norm": 0.047363321454448416, + "language_loss": 0.907022, + "learning_rate": 0.0009446942617422558, + "loss": 0.91775542, + "num_input_tokens_seen": 76024240, + "router_z_loss_mlp": 0.53027344, + "step": 918, + "time_per_iteration": 2.5698564052581787 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060825, + "balance_loss_mlp": 1.00789583, + "epoch": 0.17679876875721431, + "flos": 539984145408.0, + "grad_norm": 0.03732253291641402, + "language_loss": 0.86447889, + "learning_rate": 0.0009445517532034176, + "loss": 0.87508708, + "num_input_tokens_seen": 76095264, + "router_z_loss_mlp": 0.53027344, + "step": 919, + "time_per_iteration": 2.6916563510894775 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062144, + "balance_loss_mlp": 1.00926292, + "epoch": 0.17699115044247787, + "flos": 498715868160.0, + "grad_norm": 0.04444616550081301, + "language_loss": 0.8994987, + "learning_rate": 0.0009444090720742824, + "loss": 0.91012013, + "num_input_tokens_seen": 76163520, + "router_z_loss_mlp": 0.52978516, + "step": 920, + "time_per_iteration": 2.5798380374908447 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069798, + "balance_loss_mlp": 1.01706016, + "epoch": 0.17718353212774143, + "flos": 663916322304.0, + "grad_norm": 0.04662040468857239, + "language_loss": 0.89399016, + "learning_rate": 0.0009442662184102439, + "loss": 0.90468818, + "num_input_tokens_seen": 76233760, + "router_z_loss_mlp": 0.52832031, + "step": 921, + "time_per_iteration": 2.755929708480835 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064095, + "balance_loss_mlp": 1.01164341, + "epoch": 0.177375913813005, + "flos": 583848822528.0, + "grad_norm": 0.03479566109485236, + "language_loss": 0.88455689, + "learning_rate": 0.000944123192266763, + "loss": 0.89519787, + "num_input_tokens_seen": 76310704, + "router_z_loss_mlp": 0.52539062, + "step": 922, + "time_per_iteration": 2.8776824474334717 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062973, + "balance_loss_mlp": 1.00980616, + "epoch": 0.17756829549826855, + "flos": 553684505856.0, + "grad_norm": 0.036018663808135676, + "language_loss": 0.84559548, + "learning_rate": 0.0009439799936993671, + "loss": 0.85622525, + "num_input_tokens_seen": 76386992, + "router_z_loss_mlp": 0.53271484, + "step": 923, + "time_per_iteration": 2.708897113800049 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063296, + "balance_loss_mlp": 1.01041508, + "epoch": 0.17776067718353214, + "flos": 557372542464.0, + "grad_norm": 0.06706828820902193, + "language_loss": 0.89721078, + "learning_rate": 0.0009438366227636511, + "loss": 0.90784371, + "num_input_tokens_seen": 76453328, + "router_z_loss_mlp": 0.52978516, + "step": 924, + "time_per_iteration": 2.6524295806884766 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062711, + "balance_loss_mlp": 1.01035416, + "epoch": 0.1779530588687957, + "flos": 659652820992.0, + "grad_norm": 0.03503923634288643, + "language_loss": 0.87549317, + "learning_rate": 0.0009436930795152763, + "loss": 0.8861202, + "num_input_tokens_seen": 76529040, + "router_z_loss_mlp": 0.52441406, + "step": 925, + "time_per_iteration": 2.8627374172210693 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070161, + "balance_loss_mlp": 1.01823378, + "epoch": 0.17814544055405926, + "flos": 645672503808.0, + "grad_norm": 0.03989967380061369, + "language_loss": 0.87815237, + "learning_rate": 0.0009435493640099713, + "loss": 0.88885403, + "num_input_tokens_seen": 76604080, + "router_z_loss_mlp": 0.52001953, + "step": 926, + "time_per_iteration": 2.7886180877685547 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065048, + "balance_loss_mlp": 1.01283479, + "epoch": 0.17833782223932282, + "flos": 461885091072.0, + "grad_norm": 0.040977111340993126, + "language_loss": 0.85709256, + "learning_rate": 0.0009434054763035314, + "loss": 0.86774307, + "num_input_tokens_seen": 76674096, + "router_z_loss_mlp": 0.52294922, + "step": 927, + "time_per_iteration": 2.635576009750366 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010578, + "balance_loss_mlp": 1.00520515, + "epoch": 0.17853020392458638, + "flos": 760854101760.0, + "grad_norm": 0.029435711646972902, + "language_loss": 0.86359227, + "learning_rate": 0.0009432614164518185, + "loss": 0.8741703, + "num_input_tokens_seen": 76752144, + "router_z_loss_mlp": 0.52685547, + "step": 928, + "time_per_iteration": 2.945253849029541 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074963, + "balance_loss_mlp": 1.02203369, + "epoch": 0.17872258560984994, + "flos": 784056450048.0, + "grad_norm": 0.039066121455708196, + "language_loss": 0.84876156, + "learning_rate": 0.000943117184510762, + "loss": 0.85951114, + "num_input_tokens_seen": 76830240, + "router_z_loss_mlp": 0.53027344, + "step": 929, + "time_per_iteration": 3.0016870498657227 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092369, + "balance_loss_mlp": 1.04201508, + "epoch": 0.1789149672951135, + "flos": 1463034021120.0, + "grad_norm": 0.03241390760866092, + "language_loss": 0.78789961, + "learning_rate": 0.0009429727805363575, + "loss": 0.79882336, + "num_input_tokens_seen": 77062464, + "router_z_loss_mlp": 0.50390625, + "step": 930, + "time_per_iteration": 5.0408923625946045 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091695, + "balance_loss_mlp": 1.04005396, + "epoch": 0.17910734898037706, + "flos": 504931285248.0, + "grad_norm": 0.037670754636037675, + "language_loss": 0.90276599, + "learning_rate": 0.0009428282045846674, + "loss": 0.91368294, + "num_input_tokens_seen": 77136672, + "router_z_loss_mlp": 0.51708984, + "step": 931, + "time_per_iteration": 2.699357509613037 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093914, + "balance_loss_mlp": 1.04260671, + "epoch": 0.17929973066564064, + "flos": 747670880256.0, + "grad_norm": 0.03557447538434831, + "language_loss": 0.91468316, + "learning_rate": 0.0009426834567118214, + "loss": 0.92562228, + "num_input_tokens_seen": 77227040, + "router_z_loss_mlp": 0.51367188, + "step": 932, + "time_per_iteration": 3.0888116359710693 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095199, + "balance_loss_mlp": 1.04370034, + "epoch": 0.1794921123509042, + "flos": 714573826560.0, + "grad_norm": 0.03713873812168088, + "language_loss": 0.82311261, + "learning_rate": 0.0009425385369740155, + "loss": 0.8340646, + "num_input_tokens_seen": 77319392, + "router_z_loss_mlp": 0.515625, + "step": 933, + "time_per_iteration": 3.0156304836273193 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109606, + "balance_loss_mlp": 1.04465711, + "epoch": 0.17968449403616776, + "flos": 634362463488.0, + "grad_norm": 0.04581160448205157, + "language_loss": 0.89044029, + "learning_rate": 0.0009423934454275125, + "loss": 0.90140092, + "num_input_tokens_seen": 77394688, + "router_z_loss_mlp": 0.51464844, + "step": 934, + "time_per_iteration": 2.8524558544158936 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095874, + "balance_loss_mlp": 1.04428041, + "epoch": 0.17987687572143132, + "flos": 537378997248.0, + "grad_norm": 0.045982575553228676, + "language_loss": 0.93734717, + "learning_rate": 0.0009422481821286418, + "loss": 0.94830596, + "num_input_tokens_seen": 77468288, + "router_z_loss_mlp": 0.51660156, + "step": 935, + "time_per_iteration": 2.7354249954223633 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096416, + "balance_loss_mlp": 1.0448221, + "epoch": 0.18006925740669488, + "flos": 539119975680.0, + "grad_norm": 0.04748543050697339, + "language_loss": 0.89948702, + "learning_rate": 0.0009421027471337998, + "loss": 0.91045117, + "num_input_tokens_seen": 77535840, + "router_z_loss_mlp": 0.51660156, + "step": 936, + "time_per_iteration": 2.660287380218506 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095184, + "balance_loss_mlp": 1.04363835, + "epoch": 0.18026163909195844, + "flos": 540535310592.0, + "grad_norm": 0.04911488628490749, + "language_loss": 0.84066534, + "learning_rate": 0.0009419571404994493, + "loss": 0.8516171, + "num_input_tokens_seen": 77604000, + "router_z_loss_mlp": 0.51611328, + "step": 937, + "time_per_iteration": 2.624769687652588 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090343, + "balance_loss_mlp": 1.03865409, + "epoch": 0.180454020777222, + "flos": 501683598336.0, + "grad_norm": 0.0468107226861285, + "language_loss": 0.92304778, + "learning_rate": 0.00094181136228212, + "loss": 0.9339512, + "num_input_tokens_seen": 77671488, + "router_z_loss_mlp": 0.51757812, + "step": 938, + "time_per_iteration": 2.6784133911132812 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092284, + "balance_loss_mlp": 1.04069054, + "epoch": 0.18064640246248556, + "flos": 500007748608.0, + "grad_norm": 0.039466745711782485, + "language_loss": 0.87082231, + "learning_rate": 0.0009416654125384077, + "loss": 0.8817451, + "num_input_tokens_seen": 77746240, + "router_z_loss_mlp": 0.51660156, + "step": 939, + "time_per_iteration": 2.7231576442718506 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081085, + "balance_loss_mlp": 1.03034973, + "epoch": 0.18083878414774912, + "flos": 1522293383424.0, + "grad_norm": 0.016406546431804496, + "language_loss": 0.79772377, + "learning_rate": 0.0009415192913249752, + "loss": 0.80853462, + "num_input_tokens_seen": 77966080, + "router_z_loss_mlp": 0.5078125, + "step": 940, + "time_per_iteration": 4.919930934906006 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01329403, + "balance_loss_mlp": 1.27490067, + "epoch": 0.1810311658330127, + "flos": 728666904576.0, + "grad_norm": 0.12503564718566265, + "language_loss": 0.85519916, + "learning_rate": 0.000941372998698552, + "loss": 0.8684932, + "num_input_tokens_seen": 78049200, + "router_z_loss_mlp": 0.54638672, + "step": 941, + "time_per_iteration": 2.9731380939483643 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093385, + "balance_loss_mlp": 1.04121876, + "epoch": 0.18122354751827627, + "flos": 566045353728.0, + "grad_norm": 0.05253753965114479, + "language_loss": 0.83319217, + "learning_rate": 0.0009412265347159336, + "loss": 0.84412599, + "num_input_tokens_seen": 78122752, + "router_z_loss_mlp": 0.52246094, + "step": 942, + "time_per_iteration": 2.7150988578796387 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103842, + "balance_loss_mlp": 1.05162799, + "epoch": 0.18141592920353983, + "flos": 520318189056.0, + "grad_norm": 0.046885904923641086, + "language_loss": 0.86687338, + "learning_rate": 0.0009410798994339829, + "loss": 0.87791175, + "num_input_tokens_seen": 78194064, + "router_z_loss_mlp": 0.52294922, + "step": 943, + "time_per_iteration": 2.598576545715332 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111341, + "balance_loss_mlp": 1.05831623, + "epoch": 0.1816083108888034, + "flos": 513477729792.0, + "grad_norm": 0.04639702407841738, + "language_loss": 0.8991158, + "learning_rate": 0.000940933092909628, + "loss": 0.91022921, + "num_input_tokens_seen": 78262048, + "router_z_loss_mlp": 0.53125, + "step": 944, + "time_per_iteration": 2.611694574356079 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104212, + "balance_loss_mlp": 1.05109203, + "epoch": 0.18180069257406695, + "flos": 493373369088.0, + "grad_norm": 0.04493061679832577, + "language_loss": 0.85416293, + "learning_rate": 0.0009407861151998649, + "loss": 0.86520505, + "num_input_tokens_seen": 78330624, + "router_z_loss_mlp": 0.53222656, + "step": 945, + "time_per_iteration": 2.5710983276367188 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110023, + "balance_loss_mlp": 1.04692006, + "epoch": 0.1819930742593305, + "flos": 571231350528.0, + "grad_norm": 0.04259629183686275, + "language_loss": 0.87787771, + "learning_rate": 0.0009406389663617552, + "loss": 0.88888001, + "num_input_tokens_seen": 78400672, + "router_z_loss_mlp": 0.53417969, + "step": 946, + "time_per_iteration": 2.6741456985473633 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100116, + "balance_loss_mlp": 1.04661465, + "epoch": 0.18218545594459407, + "flos": 607111441920.0, + "grad_norm": 0.04866460503106345, + "language_loss": 0.87927794, + "learning_rate": 0.000940491646452427, + "loss": 0.89027911, + "num_input_tokens_seen": 78467952, + "router_z_loss_mlp": 0.53613281, + "step": 947, + "time_per_iteration": 2.718358278274536 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101327, + "balance_loss_mlp": 1.04753995, + "epoch": 0.18237783762985763, + "flos": 549739845120.0, + "grad_norm": 0.042994543525894185, + "language_loss": 0.92601323, + "learning_rate": 0.000940344155529075, + "loss": 0.93702656, + "num_input_tokens_seen": 78538928, + "router_z_loss_mlp": 0.5390625, + "step": 948, + "time_per_iteration": 2.624303102493286 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097087, + "balance_loss_mlp": 1.04325247, + "epoch": 0.1825702193151212, + "flos": 451675435776.0, + "grad_norm": 0.046415524987670945, + "language_loss": 0.89178842, + "learning_rate": 0.0009401964936489605, + "loss": 0.90275931, + "num_input_tokens_seen": 78602144, + "router_z_loss_mlp": 0.53955078, + "step": 949, + "time_per_iteration": 2.5104119777679443 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088983, + "balance_loss_mlp": 1.03524303, + "epoch": 0.18276260100038477, + "flos": 590385025536.0, + "grad_norm": 0.0430347708706334, + "language_loss": 0.86972219, + "learning_rate": 0.0009400486608694108, + "loss": 0.88061202, + "num_input_tokens_seen": 78673152, + "router_z_loss_mlp": 0.53857422, + "step": 950, + "time_per_iteration": 2.744044065475464 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085663, + "balance_loss_mlp": 1.03154159, + "epoch": 0.18295498268564833, + "flos": 788710723584.0, + "grad_norm": 0.040810758702646055, + "language_loss": 0.88588369, + "learning_rate": 0.0009399006572478195, + "loss": 0.89674032, + "num_input_tokens_seen": 78753872, + "router_z_loss_mlp": 0.54248047, + "step": 951, + "time_per_iteration": 3.0828475952148438 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079439, + "balance_loss_mlp": 1.02493632, + "epoch": 0.1831473643709119, + "flos": 579226629888.0, + "grad_norm": 0.03747434947067488, + "language_loss": 0.92113942, + "learning_rate": 0.0009397524828416468, + "loss": 0.93193376, + "num_input_tokens_seen": 78822640, + "router_z_loss_mlp": 0.54638672, + "step": 952, + "time_per_iteration": 2.6881086826324463 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089843, + "balance_loss_mlp": 1.03405273, + "epoch": 0.18333974605617545, + "flos": 567964221696.0, + "grad_norm": 0.0419825959367211, + "language_loss": 0.97306633, + "learning_rate": 0.0009396041377084192, + "loss": 0.9839648, + "num_input_tokens_seen": 78893792, + "router_z_loss_mlp": 0.55957031, + "step": 953, + "time_per_iteration": 2.673654556274414 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097804, + "balance_loss_mlp": 1.04191864, + "epoch": 0.183532127741439, + "flos": 528070450176.0, + "grad_norm": 0.04203850234568462, + "language_loss": 0.89016271, + "learning_rate": 0.0009394556219057295, + "loss": 0.90114069, + "num_input_tokens_seen": 78964752, + "router_z_loss_mlp": 0.56054688, + "step": 954, + "time_per_iteration": 2.7255043983459473 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107099, + "balance_loss_mlp": 1.01610565, + "epoch": 0.18372450942670257, + "flos": 595644899328.0, + "grad_norm": 0.03789415730727427, + "language_loss": 0.84751296, + "learning_rate": 0.0009393069354912362, + "loss": 0.85822284, + "num_input_tokens_seen": 79034400, + "router_z_loss_mlp": 0.55029297, + "step": 955, + "time_per_iteration": 2.7474210262298584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084139, + "balance_loss_mlp": 1.02963698, + "epoch": 0.18391689111196613, + "flos": 646284907008.0, + "grad_norm": 0.04389714766773939, + "language_loss": 0.83882308, + "learning_rate": 0.0009391580785226649, + "loss": 0.84966445, + "num_input_tokens_seen": 79109488, + "router_z_loss_mlp": 0.54638672, + "step": 956, + "time_per_iteration": 2.844409465789795 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081024, + "balance_loss_mlp": 1.02990723, + "epoch": 0.18410927279722972, + "flos": 1460394846720.0, + "grad_norm": 0.013082177800516761, + "language_loss": 0.79340446, + "learning_rate": 0.0009390090510578067, + "loss": 0.80421472, + "num_input_tokens_seen": 79327712, + "router_z_loss_mlp": 0.51171875, + "step": 957, + "time_per_iteration": 4.792405843734741 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084483, + "balance_loss_mlp": 1.030267, + "epoch": 0.18430165448249328, + "flos": 660004709376.0, + "grad_norm": 0.04089111102732722, + "language_loss": 0.88231802, + "learning_rate": 0.0009388598531545196, + "loss": 0.89316285, + "num_input_tokens_seen": 79401504, + "router_z_loss_mlp": 0.54345703, + "step": 958, + "time_per_iteration": 2.900062084197998 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084489, + "balance_loss_mlp": 1.03017747, + "epoch": 0.18449403616775684, + "flos": 518950486272.0, + "grad_norm": 0.045948437313162956, + "language_loss": 0.87467843, + "learning_rate": 0.000938710484870727, + "loss": 0.88552332, + "num_input_tokens_seen": 79466688, + "router_z_loss_mlp": 0.54443359, + "step": 959, + "time_per_iteration": 2.5785140991210938 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085738, + "balance_loss_mlp": 1.031569, + "epoch": 0.1846864178530204, + "flos": 553825456896.0, + "grad_norm": 0.04362127254920589, + "language_loss": 0.87369549, + "learning_rate": 0.0009385609462644189, + "loss": 0.88455284, + "num_input_tokens_seen": 79540288, + "router_z_loss_mlp": 0.54296875, + "step": 960, + "time_per_iteration": 2.686221122741699 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082294, + "balance_loss_mlp": 1.02774417, + "epoch": 0.18487879953828396, + "flos": 467116774656.0, + "grad_norm": 0.04468558895083242, + "language_loss": 0.86931455, + "learning_rate": 0.0009384112373936514, + "loss": 0.88013744, + "num_input_tokens_seen": 79611872, + "router_z_loss_mlp": 0.546875, + "step": 961, + "time_per_iteration": 2.633582830429077 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064427, + "balance_loss_mlp": 1.00935197, + "epoch": 0.18507118122354752, + "flos": 649684238592.0, + "grad_norm": 0.03687654302408078, + "language_loss": 0.9259429, + "learning_rate": 0.0009382613583165467, + "loss": 0.93658715, + "num_input_tokens_seen": 79689504, + "router_z_loss_mlp": 0.55224609, + "step": 962, + "time_per_iteration": 2.7910635471343994 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01458915, + "balance_loss_mlp": 1.40078855, + "epoch": 0.18526356290881107, + "flos": 627923470080.0, + "grad_norm": 0.09306974449566385, + "language_loss": 0.90611041, + "learning_rate": 0.0009381113090912928, + "loss": 0.92069954, + "num_input_tokens_seen": 79759264, + "router_z_loss_mlp": 0.57958984, + "step": 963, + "time_per_iteration": 2.7445125579833984 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078714, + "balance_loss_mlp": 1.02464056, + "epoch": 0.18545594459407463, + "flos": 433646445312.0, + "grad_norm": 0.04076594680163087, + "language_loss": 0.91471934, + "learning_rate": 0.000937961089776144, + "loss": 0.92550647, + "num_input_tokens_seen": 79824464, + "router_z_loss_mlp": 0.54199219, + "step": 964, + "time_per_iteration": 2.5835955142974854 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089429, + "balance_loss_mlp": 1.03607059, + "epoch": 0.1856483262793382, + "flos": 750427673088.0, + "grad_norm": 0.041116434601540804, + "language_loss": 0.8449949, + "learning_rate": 0.0009378107004294208, + "loss": 0.8558892, + "num_input_tokens_seen": 79907152, + "router_z_loss_mlp": 0.53466797, + "step": 965, + "time_per_iteration": 2.9773664474487305 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090833, + "balance_loss_mlp": 1.03790379, + "epoch": 0.18584070796460178, + "flos": 531402707712.0, + "grad_norm": 0.04029010126422192, + "language_loss": 0.93043375, + "learning_rate": 0.0009376601411095096, + "loss": 0.94134206, + "num_input_tokens_seen": 79976944, + "router_z_loss_mlp": 0.53027344, + "step": 966, + "time_per_iteration": 2.6703643798828125 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088702, + "balance_loss_mlp": 1.03639269, + "epoch": 0.18603308964986534, + "flos": 484084263936.0, + "grad_norm": 0.03934020689435504, + "language_loss": 0.87718618, + "learning_rate": 0.0009375094118748622, + "loss": 0.88807321, + "num_input_tokens_seen": 80042112, + "router_z_loss_mlp": 0.52392578, + "step": 967, + "time_per_iteration": 2.5719969272613525 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091813, + "balance_loss_mlp": 1.03974187, + "epoch": 0.1862254713351289, + "flos": 802682292480.0, + "grad_norm": 0.042176858736630414, + "language_loss": 0.92643285, + "learning_rate": 0.0009373585127839976, + "loss": 0.93735105, + "num_input_tokens_seen": 80118896, + "router_z_loss_mlp": 0.52148438, + "step": 968, + "time_per_iteration": 2.956153392791748 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096332, + "balance_loss_mlp": 1.04483318, + "epoch": 0.18641785302039246, + "flos": 479290984704.0, + "grad_norm": 0.04307464179422831, + "language_loss": 0.92206955, + "learning_rate": 0.0009372074438954994, + "loss": 0.93303293, + "num_input_tokens_seen": 80183360, + "router_z_loss_mlp": 0.515625, + "step": 969, + "time_per_iteration": 2.512662410736084 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092255, + "balance_loss_mlp": 1.04085171, + "epoch": 0.18661023470565602, + "flos": 389779822848.0, + "grad_norm": 0.044792080488554424, + "language_loss": 0.93312657, + "learning_rate": 0.0009370562052680181, + "loss": 0.94404912, + "num_input_tokens_seen": 80247024, + "router_z_loss_mlp": 0.51464844, + "step": 970, + "time_per_iteration": 2.4642274379730225 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109029, + "balance_loss_mlp": 1.03926873, + "epoch": 0.18680261639091958, + "flos": 565776090624.0, + "grad_norm": 0.03666794569701081, + "language_loss": 0.90593827, + "learning_rate": 0.0009369047969602695, + "loss": 0.91684115, + "num_input_tokens_seen": 80318256, + "router_z_loss_mlp": 0.51074219, + "step": 971, + "time_per_iteration": 2.6925313472747803 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090519, + "balance_loss_mlp": 1.03968859, + "epoch": 0.18699499807618314, + "flos": 480230976768.0, + "grad_norm": 0.04959033368050126, + "language_loss": 0.88274431, + "learning_rate": 0.0009367532190310357, + "loss": 0.89364946, + "num_input_tokens_seen": 80384848, + "router_z_loss_mlp": 0.50878906, + "step": 972, + "time_per_iteration": 2.5632824897766113 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095286, + "balance_loss_mlp": 1.04464579, + "epoch": 0.1871873797614467, + "flos": 554328989952.0, + "grad_norm": 0.047101191533600484, + "language_loss": 0.90956879, + "learning_rate": 0.0009366014715391644, + "loss": 0.92052168, + "num_input_tokens_seen": 80453088, + "router_z_loss_mlp": 0.50683594, + "step": 973, + "time_per_iteration": 2.6131792068481445 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087331, + "balance_loss_mlp": 1.03669059, + "epoch": 0.18737976144671029, + "flos": 553953768960.0, + "grad_norm": 0.03277863870695053, + "language_loss": 0.85193431, + "learning_rate": 0.0009364495545435693, + "loss": 0.86280763, + "num_input_tokens_seen": 80528608, + "router_z_loss_mlp": 0.50683594, + "step": 974, + "time_per_iteration": 2.768160820007324 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077828, + "balance_loss_mlp": 1.02647221, + "epoch": 0.18757214313197385, + "flos": 503248632576.0, + "grad_norm": 0.03709252074476072, + "language_loss": 0.90046728, + "learning_rate": 0.0009362974681032297, + "loss": 0.91124547, + "num_input_tokens_seen": 80599600, + "router_z_loss_mlp": 0.51416016, + "step": 975, + "time_per_iteration": 2.596752405166626 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01358762, + "balance_loss_mlp": 1.30464137, + "epoch": 0.1877645248172374, + "flos": 676292721408.0, + "grad_norm": 0.11355211768831018, + "language_loss": 0.89691889, + "learning_rate": 0.0009361452122771907, + "loss": 0.91050649, + "num_input_tokens_seen": 80677264, + "router_z_loss_mlp": 0.54248047, + "step": 976, + "time_per_iteration": 2.841670036315918 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087707, + "balance_loss_mlp": 1.03649426, + "epoch": 0.18795690650250096, + "flos": 405863700480.0, + "grad_norm": 0.05182073733860081, + "language_loss": 0.85757113, + "learning_rate": 0.0009359927871245635, + "loss": 0.86844826, + "num_input_tokens_seen": 80739776, + "router_z_loss_mlp": 0.51269531, + "step": 977, + "time_per_iteration": 2.4593758583068848 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110124, + "balance_loss_mlp": 1.04988456, + "epoch": 0.18814928818776452, + "flos": 639064369152.0, + "grad_norm": 0.04599902588150218, + "language_loss": 0.8843354, + "learning_rate": 0.0009358401927045246, + "loss": 0.89534783, + "num_input_tokens_seen": 80815200, + "router_z_loss_mlp": 0.51416016, + "step": 978, + "time_per_iteration": 2.8043553829193115 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103516, + "balance_loss_mlp": 1.05197036, + "epoch": 0.18834166987302808, + "flos": 1140117100800.0, + "grad_norm": 0.05109113713971293, + "language_loss": 0.89583617, + "learning_rate": 0.0009356874290763166, + "loss": 0.90687132, + "num_input_tokens_seen": 80905024, + "router_z_loss_mlp": 0.51611328, + "step": 979, + "time_per_iteration": 3.4783685207366943 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105346, + "balance_loss_mlp": 1.0536567, + "epoch": 0.18853405155829164, + "flos": 505816842240.0, + "grad_norm": 0.03906189308485337, + "language_loss": 0.90395761, + "learning_rate": 0.0009355344962992474, + "loss": 0.91501105, + "num_input_tokens_seen": 80976704, + "router_z_loss_mlp": 0.51757812, + "step": 980, + "time_per_iteration": 2.6457359790802 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103049, + "balance_loss_mlp": 1.05116904, + "epoch": 0.1887264332435552, + "flos": 609371504640.0, + "grad_norm": 0.038270487176229884, + "language_loss": 0.89782834, + "learning_rate": 0.0009353813944326908, + "loss": 0.9088589, + "num_input_tokens_seen": 81057152, + "router_z_loss_mlp": 0.51953125, + "step": 981, + "time_per_iteration": 2.923243761062622 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102538, + "balance_loss_mlp": 1.05070543, + "epoch": 0.1889188149288188, + "flos": 553593132288.0, + "grad_norm": 0.04212053297292714, + "language_loss": 0.84181225, + "learning_rate": 0.0009352281235360863, + "loss": 0.85283768, + "num_input_tokens_seen": 81131520, + "router_z_loss_mlp": 0.51904297, + "step": 982, + "time_per_iteration": 2.674790620803833 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103043, + "balance_loss_mlp": 1.05135345, + "epoch": 0.18911119661408235, + "flos": 419470742016.0, + "grad_norm": 0.03892833341753514, + "language_loss": 0.86323905, + "learning_rate": 0.0009350746836689389, + "loss": 0.87426949, + "num_input_tokens_seen": 81195952, + "router_z_loss_mlp": 0.51757812, + "step": 983, + "time_per_iteration": 2.5294649600982666 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103523, + "balance_loss_mlp": 1.05335999, + "epoch": 0.1893035782993459, + "flos": 1485320676864.0, + "grad_norm": 0.016207020064155576, + "language_loss": 0.81439221, + "learning_rate": 0.0009349210748908193, + "loss": 0.82542741, + "num_input_tokens_seen": 81427312, + "router_z_loss_mlp": 0.50195312, + "step": 984, + "time_per_iteration": 5.031845569610596 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094201, + "balance_loss_mlp": 1.04227316, + "epoch": 0.18949595998460947, + "flos": 509457246720.0, + "grad_norm": 0.045438139941342374, + "language_loss": 0.84563899, + "learning_rate": 0.0009347672972613634, + "loss": 0.85658097, + "num_input_tokens_seen": 81494256, + "router_z_loss_mlp": 0.52001953, + "step": 985, + "time_per_iteration": 2.6333274841308594 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090244, + "balance_loss_mlp": 1.0384593, + "epoch": 0.18968834166987303, + "flos": 532193000448.0, + "grad_norm": 0.03993027053802703, + "language_loss": 0.8704083, + "learning_rate": 0.0009346133508402735, + "loss": 0.8813107, + "num_input_tokens_seen": 81569312, + "router_z_loss_mlp": 0.51855469, + "step": 986, + "time_per_iteration": 2.751340389251709 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089761, + "balance_loss_mlp": 1.03797686, + "epoch": 0.1898807233551366, + "flos": 500754299904.0, + "grad_norm": 0.04595906606263721, + "language_loss": 0.85852754, + "learning_rate": 0.0009344592356873166, + "loss": 0.86942512, + "num_input_tokens_seen": 81637024, + "router_z_loss_mlp": 0.51855469, + "step": 987, + "time_per_iteration": 2.6785645484924316 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084349, + "balance_loss_mlp": 1.03223073, + "epoch": 0.19007310504040015, + "flos": 603360221952.0, + "grad_norm": 0.042275439246703725, + "language_loss": 0.79788595, + "learning_rate": 0.0009343049518623255, + "loss": 0.80872947, + "num_input_tokens_seen": 81709488, + "router_z_loss_mlp": 0.52197266, + "step": 988, + "time_per_iteration": 2.709439516067505 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01365061, + "balance_loss_mlp": 1.30979574, + "epoch": 0.1902654867256637, + "flos": 602765315328.0, + "grad_norm": 0.1049262798815586, + "language_loss": 0.8386007, + "learning_rate": 0.0009341504994251985, + "loss": 0.85225129, + "num_input_tokens_seen": 81787152, + "router_z_loss_mlp": 0.55419922, + "step": 989, + "time_per_iteration": 2.925954818725586 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089684, + "balance_loss_mlp": 1.03952026, + "epoch": 0.19045786841092727, + "flos": 1579234345728.0, + "grad_norm": 0.01847097645999908, + "language_loss": 0.73520499, + "learning_rate": 0.0009339958784358994, + "loss": 0.74610186, + "num_input_tokens_seen": 82030608, + "router_z_loss_mlp": 0.50195312, + "step": 990, + "time_per_iteration": 5.025054216384888 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101957, + "balance_loss_mlp": 1.04845631, + "epoch": 0.19065025009619085, + "flos": 683055412992.0, + "grad_norm": 0.039739471389523856, + "language_loss": 0.8281374, + "learning_rate": 0.0009338410889544574, + "loss": 0.83915699, + "num_input_tokens_seen": 82119872, + "router_z_loss_mlp": 0.53613281, + "step": 991, + "time_per_iteration": 3.0653748512268066 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112616, + "balance_loss_mlp": 1.05868626, + "epoch": 0.1908426317814544, + "flos": 603442847232.0, + "grad_norm": 0.04383499470371995, + "language_loss": 0.89543211, + "learning_rate": 0.000933686131040967, + "loss": 0.90655828, + "num_input_tokens_seen": 82195552, + "router_z_loss_mlp": 0.54052734, + "step": 992, + "time_per_iteration": 2.7901530265808105 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106006, + "balance_loss_mlp": 1.0517416, + "epoch": 0.19103501346671797, + "flos": 587434791936.0, + "grad_norm": 0.04122735235002176, + "language_loss": 0.92173266, + "learning_rate": 0.0009335310047555883, + "loss": 0.93279278, + "num_input_tokens_seen": 82267040, + "router_z_loss_mlp": 0.54394531, + "step": 993, + "time_per_iteration": 2.7153608798980713 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097163, + "balance_loss_mlp": 1.04285157, + "epoch": 0.19122739515198153, + "flos": 546835298304.0, + "grad_norm": 0.04052898350535971, + "language_loss": 0.89637405, + "learning_rate": 0.0009333757101585467, + "loss": 0.90734565, + "num_input_tokens_seen": 82337680, + "router_z_loss_mlp": 0.54443359, + "step": 994, + "time_per_iteration": 2.6286795139312744 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091826, + "balance_loss_mlp": 1.03732359, + "epoch": 0.1914197768372451, + "flos": 522550061568.0, + "grad_norm": 0.03850908176124289, + "language_loss": 0.94694555, + "learning_rate": 0.0009332202473101329, + "loss": 0.95786381, + "num_input_tokens_seen": 82409600, + "router_z_loss_mlp": 0.54638672, + "step": 995, + "time_per_iteration": 2.649850368499756 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072176, + "balance_loss_mlp": 1.01714945, + "epoch": 0.19161215852250865, + "flos": 612388812288.0, + "grad_norm": 0.03654296504823072, + "language_loss": 0.83743644, + "learning_rate": 0.0009330646162707028, + "loss": 0.84815824, + "num_input_tokens_seen": 82480288, + "router_z_loss_mlp": 0.55175781, + "step": 996, + "time_per_iteration": 2.7329981327056885 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059087, + "balance_loss_mlp": 1.0033443, + "epoch": 0.1918045402077722, + "flos": 848183935488.0, + "grad_norm": 0.03315860340701524, + "language_loss": 0.85236025, + "learning_rate": 0.0009329088171006779, + "loss": 0.8629511, + "num_input_tokens_seen": 82568960, + "router_z_loss_mlp": 0.55908203, + "step": 997, + "time_per_iteration": 3.135049343109131 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01290698, + "balance_loss_mlp": 1.2330482, + "epoch": 0.19199692189303577, + "flos": 466893198336.0, + "grad_norm": 0.06463762674453556, + "language_loss": 0.86239529, + "learning_rate": 0.0009327528498605446, + "loss": 0.87530231, + "num_input_tokens_seen": 82634128, + "router_z_loss_mlp": 0.57470703, + "step": 998, + "time_per_iteration": 2.5807580947875977 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072398, + "balance_loss_mlp": 1.01727533, + "epoch": 0.19218930357829936, + "flos": 532613908224.0, + "grad_norm": 0.04280698068802137, + "language_loss": 0.90856296, + "learning_rate": 0.0009325967146108548, + "loss": 0.91928697, + "num_input_tokens_seen": 82707472, + "router_z_loss_mlp": 0.55273438, + "step": 999, + "time_per_iteration": 2.637840986251831 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086346, + "balance_loss_mlp": 1.03217781, + "epoch": 0.19238168526356292, + "flos": 602728376832.0, + "grad_norm": 0.04847652630230049, + "language_loss": 0.88902158, + "learning_rate": 0.0009324404114122258, + "loss": 0.89988506, + "num_input_tokens_seen": 82775232, + "router_z_loss_mlp": 0.54296875, + "step": 1000, + "time_per_iteration": 4.1391942501068115 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090902, + "balance_loss_mlp": 1.03701913, + "epoch": 0.19257406694882648, + "flos": 573155076096.0, + "grad_norm": 0.04193719314851312, + "language_loss": 0.88362414, + "learning_rate": 0.0009322839403253397, + "loss": 0.89453316, + "num_input_tokens_seen": 82850032, + "router_z_loss_mlp": 0.54003906, + "step": 1001, + "time_per_iteration": 2.8266265392303467 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087834, + "balance_loss_mlp": 1.03395164, + "epoch": 0.19276644863409004, + "flos": 803157635328.0, + "grad_norm": 0.04353601683576214, + "language_loss": 0.85235333, + "learning_rate": 0.0009321273014109439, + "loss": 0.86323166, + "num_input_tokens_seen": 82926080, + "router_z_loss_mlp": 0.54003906, + "step": 1002, + "time_per_iteration": 2.9539175033569336 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094525, + "balance_loss_mlp": 1.04068995, + "epoch": 0.1929588303193536, + "flos": 564480319488.0, + "grad_norm": 0.03718563884895513, + "language_loss": 0.86078906, + "learning_rate": 0.0009319704947298513, + "loss": 0.87173432, + "num_input_tokens_seen": 83005200, + "router_z_loss_mlp": 0.53955078, + "step": 1003, + "time_per_iteration": 2.8760387897491455 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091693, + "balance_loss_mlp": 1.0380007, + "epoch": 0.19315121200461716, + "flos": 627988598784.0, + "grad_norm": 0.03744955738150477, + "language_loss": 0.89579475, + "learning_rate": 0.0009318135203429393, + "loss": 0.9067117, + "num_input_tokens_seen": 83077280, + "router_z_loss_mlp": 0.53808594, + "step": 1004, + "time_per_iteration": 2.7069175243377686 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094654, + "balance_loss_mlp": 1.04058087, + "epoch": 0.19334359368988072, + "flos": 518584013568.0, + "grad_norm": 0.03742742378220975, + "language_loss": 0.89228511, + "learning_rate": 0.0009316563783111511, + "loss": 0.90323162, + "num_input_tokens_seen": 83145456, + "router_z_loss_mlp": 0.54199219, + "step": 1005, + "time_per_iteration": 2.7024500370025635 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090205, + "balance_loss_mlp": 1.03598833, + "epoch": 0.19353597537514428, + "flos": 695400709632.0, + "grad_norm": 0.036019255491177425, + "language_loss": 0.83731771, + "learning_rate": 0.0009314990686954943, + "loss": 0.84821975, + "num_input_tokens_seen": 83225392, + "router_z_loss_mlp": 0.54345703, + "step": 1006, + "time_per_iteration": 2.901319980621338 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092974, + "balance_loss_mlp": 1.03866184, + "epoch": 0.19372835706040784, + "flos": 1212200981760.0, + "grad_norm": 0.03507497873235563, + "language_loss": 0.82359284, + "learning_rate": 0.000931341591557042, + "loss": 0.8345226, + "num_input_tokens_seen": 83331296, + "router_z_loss_mlp": 0.54443359, + "step": 1007, + "time_per_iteration": 3.70509672164917 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088892, + "balance_loss_mlp": 1.03467596, + "epoch": 0.19392073874567142, + "flos": 521685891840.0, + "grad_norm": 0.04354230775215961, + "language_loss": 0.88703787, + "learning_rate": 0.0009311839469569325, + "loss": 0.89792681, + "num_input_tokens_seen": 83399952, + "router_z_loss_mlp": 0.54345703, + "step": 1008, + "time_per_iteration": 2.632070302963257 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088222, + "balance_loss_mlp": 1.03386211, + "epoch": 0.19411312043093498, + "flos": 589911628032.0, + "grad_norm": 0.044503426382111445, + "language_loss": 0.88821465, + "learning_rate": 0.0009310261349563687, + "loss": 0.89909685, + "num_input_tokens_seen": 83468384, + "router_z_loss_mlp": 0.54492188, + "step": 1009, + "time_per_iteration": 2.7138211727142334 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061835, + "balance_loss_mlp": 1.0067606, + "epoch": 0.19430550211619854, + "flos": 580572945408.0, + "grad_norm": 0.029375689409949213, + "language_loss": 0.86173785, + "learning_rate": 0.0009308681556166186, + "loss": 0.87235624, + "num_input_tokens_seen": 83547952, + "router_z_loss_mlp": 0.55224609, + "step": 1010, + "time_per_iteration": 2.834946870803833 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.05162705, + "balance_loss_mlp": 5.08607721, + "epoch": 0.1944978838014621, + "flos": 622246579200.0, + "grad_norm": 0.2884784307389343, + "language_loss": 0.88793403, + "learning_rate": 0.0009307100089990152, + "loss": 0.93956107, + "num_input_tokens_seen": 83615712, + "router_z_loss_mlp": 0.76513672, + "step": 1011, + "time_per_iteration": 2.705335855484009 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094303, + "balance_loss_mlp": 1.04189909, + "epoch": 0.19469026548672566, + "flos": 599815081728.0, + "grad_norm": 0.04633555371791679, + "language_loss": 0.85740912, + "learning_rate": 0.0009305516951649568, + "loss": 0.86835217, + "num_input_tokens_seen": 83687296, + "router_z_loss_mlp": 0.52490234, + "step": 1012, + "time_per_iteration": 2.7048773765563965 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01164213, + "balance_loss_mlp": 1.11281013, + "epoch": 0.19488264717198922, + "flos": 553248046848.0, + "grad_norm": 0.04991787894778298, + "language_loss": 0.87912452, + "learning_rate": 0.0009303932141759057, + "loss": 0.89076668, + "num_input_tokens_seen": 83763168, + "router_z_loss_mlp": 0.51464844, + "step": 1013, + "time_per_iteration": 2.8072102069854736 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01211245, + "balance_loss_mlp": 1.15984225, + "epoch": 0.19507502885725278, + "flos": 667313708544.0, + "grad_norm": 0.06529111316537192, + "language_loss": 0.85445917, + "learning_rate": 0.0009302345660933902, + "loss": 0.86657166, + "num_input_tokens_seen": 83837312, + "router_z_loss_mlp": 0.51464844, + "step": 1014, + "time_per_iteration": 2.7895615100860596 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01244014, + "balance_loss_mlp": 1.19265878, + "epoch": 0.19526741054251634, + "flos": 672328618752.0, + "grad_norm": 0.06071591874537116, + "language_loss": 0.86587232, + "learning_rate": 0.0009300757509790026, + "loss": 0.87831247, + "num_input_tokens_seen": 83917120, + "router_z_loss_mlp": 0.51416016, + "step": 1015, + "time_per_iteration": 2.8867006301879883 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.012313, + "balance_loss_mlp": 1.18008745, + "epoch": 0.19545979222777993, + "flos": 448147792128.0, + "grad_norm": 0.057262662434688416, + "language_loss": 0.91914976, + "learning_rate": 0.0009299167688944005, + "loss": 0.93146276, + "num_input_tokens_seen": 83982992, + "router_z_loss_mlp": 0.51269531, + "step": 1016, + "time_per_iteration": 2.526421546936035 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01226901, + "balance_loss_mlp": 1.17568827, + "epoch": 0.1956521739130435, + "flos": 570169849344.0, + "grad_norm": 0.05343522997619492, + "language_loss": 0.87454194, + "learning_rate": 0.0009297576199013063, + "loss": 0.8868109, + "num_input_tokens_seen": 84057296, + "router_z_loss_mlp": 0.51269531, + "step": 1017, + "time_per_iteration": 2.7184784412384033 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.012071, + "balance_loss_mlp": 1.15884399, + "epoch": 0.19584455559830705, + "flos": 1458883280640.0, + "grad_norm": 0.03399393552013433, + "language_loss": 0.73002136, + "learning_rate": 0.0009295983040615071, + "loss": 0.74209231, + "num_input_tokens_seen": 84292640, + "router_z_loss_mlp": 0.48242188, + "step": 1018, + "time_per_iteration": 4.916393756866455 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01159874, + "balance_loss_mlp": 1.11199951, + "epoch": 0.1960369372835706, + "flos": 1594484189184.0, + "grad_norm": 0.02523442502037962, + "language_loss": 0.79426301, + "learning_rate": 0.0009294388214368547, + "loss": 0.80586171, + "num_input_tokens_seen": 84524448, + "router_z_loss_mlp": 0.47851562, + "step": 1019, + "time_per_iteration": 5.5991902351379395 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01163202, + "balance_loss_mlp": 1.11241901, + "epoch": 0.19622931896883417, + "flos": 617254023168.0, + "grad_norm": 0.06792637193668423, + "language_loss": 0.88615566, + "learning_rate": 0.0009292791720892659, + "loss": 0.89778763, + "num_input_tokens_seen": 84600208, + "router_z_loss_mlp": 0.50830078, + "step": 1020, + "time_per_iteration": 2.8419806957244873 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01132702, + "balance_loss_mlp": 1.08191884, + "epoch": 0.19642170065409773, + "flos": 467208148224.0, + "grad_norm": 0.044541966790476714, + "language_loss": 0.90245676, + "learning_rate": 0.0009291193560807218, + "loss": 0.91378373, + "num_input_tokens_seen": 84668032, + "router_z_loss_mlp": 0.50830078, + "step": 1021, + "time_per_iteration": 2.60357403755188 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0111942, + "balance_loss_mlp": 1.06858945, + "epoch": 0.19661408233936128, + "flos": 516288957696.0, + "grad_norm": 0.03957164107654416, + "language_loss": 0.88134921, + "learning_rate": 0.0009289593734732688, + "loss": 0.89254344, + "num_input_tokens_seen": 84738176, + "router_z_loss_mlp": 0.50878906, + "step": 1022, + "time_per_iteration": 2.6077988147735596 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115639, + "balance_loss_mlp": 1.06461763, + "epoch": 0.19680646402462484, + "flos": 393494104320.0, + "grad_norm": 0.03618938319364158, + "language_loss": 0.94921708, + "learning_rate": 0.0009287992243290175, + "loss": 0.96037352, + "num_input_tokens_seen": 84799936, + "router_z_loss_mlp": 0.51074219, + "step": 1023, + "time_per_iteration": 2.486910820007324 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104277, + "balance_loss_mlp": 1.05263603, + "epoch": 0.19699884570988843, + "flos": 627624071424.0, + "grad_norm": 0.04088238638674664, + "language_loss": 0.91379654, + "learning_rate": 0.0009286389087101435, + "loss": 0.92483938, + "num_input_tokens_seen": 84877216, + "router_z_loss_mlp": 0.51708984, + "step": 1024, + "time_per_iteration": 2.7762300968170166 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083626, + "balance_loss_mlp": 1.03126919, + "epoch": 0.197191227395152, + "flos": 559074637056.0, + "grad_norm": 0.038177798611856564, + "language_loss": 0.89866579, + "learning_rate": 0.0009284784266788864, + "loss": 0.90950203, + "num_input_tokens_seen": 84952464, + "router_z_loss_mlp": 0.52441406, + "step": 1025, + "time_per_iteration": 2.7595441341400146 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105402, + "balance_loss_mlp": 1.05275905, + "epoch": 0.19738360908041555, + "flos": 666250262016.0, + "grad_norm": 0.08120700653890094, + "language_loss": 0.93505025, + "learning_rate": 0.0009283177782975512, + "loss": 0.94610423, + "num_input_tokens_seen": 85031488, + "router_z_loss_mlp": 0.52734375, + "step": 1026, + "time_per_iteration": 2.9439735412597656 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01158523, + "balance_loss_mlp": 1.10511732, + "epoch": 0.1975759907656791, + "flos": 523511440896.0, + "grad_norm": 0.05175943009769999, + "language_loss": 0.89213437, + "learning_rate": 0.000928156963628507, + "loss": 0.9037196, + "num_input_tokens_seen": 85098384, + "router_z_loss_mlp": 0.53515625, + "step": 1027, + "time_per_iteration": 2.5648727416992188 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124606, + "balance_loss_mlp": 1.0717721, + "epoch": 0.19776837245094267, + "flos": 463485118464.0, + "grad_norm": 0.0380471847687272, + "language_loss": 0.89530945, + "learning_rate": 0.0009279959827341877, + "loss": 0.90655547, + "num_input_tokens_seen": 85172944, + "router_z_loss_mlp": 0.52929688, + "step": 1028, + "time_per_iteration": 2.7482099533081055 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01114747, + "balance_loss_mlp": 1.0622474, + "epoch": 0.19796075413620623, + "flos": 504058367232.0, + "grad_norm": 0.038077776452832945, + "language_loss": 0.88821751, + "learning_rate": 0.0009278348356770915, + "loss": 0.89936495, + "num_input_tokens_seen": 85241632, + "router_z_loss_mlp": 0.52587891, + "step": 1029, + "time_per_iteration": 2.5559866428375244 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125843, + "balance_loss_mlp": 1.07362974, + "epoch": 0.1981531358214698, + "flos": 508571689728.0, + "grad_norm": 0.03906482091144459, + "language_loss": 0.87010926, + "learning_rate": 0.0009276735225197814, + "loss": 0.88136768, + "num_input_tokens_seen": 85308992, + "router_z_loss_mlp": 0.52294922, + "step": 1030, + "time_per_iteration": 2.598353862762451 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116206, + "balance_loss_mlp": 1.06418335, + "epoch": 0.19834551750673335, + "flos": 532640153088.0, + "grad_norm": 0.039761606091750314, + "language_loss": 0.8715511, + "learning_rate": 0.0009275120433248847, + "loss": 0.88271314, + "num_input_tokens_seen": 85381936, + "router_z_loss_mlp": 0.52099609, + "step": 1031, + "time_per_iteration": 2.691051483154297 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105688, + "balance_loss_mlp": 1.05414224, + "epoch": 0.1985378991919969, + "flos": 776971027200.0, + "grad_norm": 0.03650424605094363, + "language_loss": 0.87217546, + "learning_rate": 0.0009273503981550931, + "loss": 0.88323236, + "num_input_tokens_seen": 85474352, + "router_z_loss_mlp": 0.51611328, + "step": 1032, + "time_per_iteration": 3.05829119682312 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094626, + "balance_loss_mlp": 1.04336572, + "epoch": 0.1987302808772605, + "flos": 435192037632.0, + "grad_norm": 0.04492232470085823, + "language_loss": 0.88675368, + "learning_rate": 0.0009271885870731626, + "loss": 0.89769995, + "num_input_tokens_seen": 85538416, + "router_z_loss_mlp": 0.51318359, + "step": 1033, + "time_per_iteration": 2.5097644329071045 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091384, + "balance_loss_mlp": 1.04036272, + "epoch": 0.19892266256252406, + "flos": 554654633472.0, + "grad_norm": 0.041410721104386976, + "language_loss": 0.89478087, + "learning_rate": 0.0009270266101419143, + "loss": 0.90569472, + "num_input_tokens_seen": 85604416, + "router_z_loss_mlp": 0.51074219, + "step": 1034, + "time_per_iteration": 2.6359710693359375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091336, + "balance_loss_mlp": 1.04026711, + "epoch": 0.19911504424778761, + "flos": 550949100288.0, + "grad_norm": 0.034987230226667505, + "language_loss": 0.86329561, + "learning_rate": 0.0009268644674242328, + "loss": 0.87420899, + "num_input_tokens_seen": 85677008, + "router_z_loss_mlp": 0.51123047, + "step": 1035, + "time_per_iteration": 2.679041624069214 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091574, + "balance_loss_mlp": 1.04045713, + "epoch": 0.19930742593305117, + "flos": 519313068288.0, + "grad_norm": 0.035495194235479824, + "language_loss": 0.81977046, + "learning_rate": 0.0009267021589830678, + "loss": 0.83068615, + "num_input_tokens_seen": 85745200, + "router_z_loss_mlp": 0.51171875, + "step": 1036, + "time_per_iteration": 2.6109251976013184 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01330025, + "balance_loss_mlp": 1.27871704, + "epoch": 0.19949980761831473, + "flos": 1512640717824.0, + "grad_norm": 0.0530000786951376, + "language_loss": 0.77627081, + "learning_rate": 0.0009265396848814328, + "loss": 0.78957105, + "num_input_tokens_seen": 85980608, + "router_z_loss_mlp": 0.51367188, + "step": 1037, + "time_per_iteration": 5.041083097457886 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097572, + "balance_loss_mlp": 1.04635978, + "epoch": 0.1996921893035783, + "flos": 699440634624.0, + "grad_norm": 0.03827221066614039, + "language_loss": 0.93735194, + "learning_rate": 0.000926377045182406, + "loss": 0.94832766, + "num_input_tokens_seen": 86055952, + "router_z_loss_mlp": 0.51269531, + "step": 1038, + "time_per_iteration": 2.921194314956665 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106072, + "balance_loss_mlp": 1.05443072, + "epoch": 0.19988457098884185, + "flos": 728395696128.0, + "grad_norm": 0.0388450926907903, + "language_loss": 0.89164472, + "learning_rate": 0.0009262142399491296, + "loss": 0.90270543, + "num_input_tokens_seen": 86145536, + "router_z_loss_mlp": 0.51708984, + "step": 1039, + "time_per_iteration": 3.0543293952941895 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102477, + "balance_loss_mlp": 1.05093122, + "epoch": 0.2000769526741054, + "flos": 561625350144.0, + "grad_norm": 0.04341407711707897, + "language_loss": 0.8911137, + "learning_rate": 0.0009260512692448105, + "loss": 0.90213847, + "num_input_tokens_seen": 86214480, + "router_z_loss_mlp": 0.51611328, + "step": 1040, + "time_per_iteration": 2.6906111240386963 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097091, + "balance_loss_mlp": 1.04549766, + "epoch": 0.200269334359369, + "flos": 573165769728.0, + "grad_norm": 0.03433464693573298, + "language_loss": 0.85109496, + "learning_rate": 0.000925888133132719, + "loss": 0.86206591, + "num_input_tokens_seen": 86289824, + "router_z_loss_mlp": 0.51660156, + "step": 1041, + "time_per_iteration": 2.77327561378479 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112812, + "balance_loss_mlp": 1.06465149, + "epoch": 0.20046171604463256, + "flos": 1489155500544.0, + "grad_norm": 0.023433110981570023, + "language_loss": 0.79610431, + "learning_rate": 0.0009257248316761906, + "loss": 0.8072325, + "num_input_tokens_seen": 86516384, + "router_z_loss_mlp": 0.48144531, + "step": 1042, + "time_per_iteration": 4.926042318344116 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116989, + "balance_loss_mlp": 1.06525254, + "epoch": 0.20065409772989612, + "flos": 497578544640.0, + "grad_norm": 0.04254485219096875, + "language_loss": 0.82304472, + "learning_rate": 0.0009255613649386244, + "loss": 0.83421457, + "num_input_tokens_seen": 86587296, + "router_z_loss_mlp": 0.51806641, + "step": 1043, + "time_per_iteration": 2.6593456268310547 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0111366, + "balance_loss_mlp": 1.06144655, + "epoch": 0.20084647941515968, + "flos": 580464075264.0, + "grad_norm": 0.040062947145422745, + "language_loss": 0.79980814, + "learning_rate": 0.0009253977329834838, + "loss": 0.81094474, + "num_input_tokens_seen": 86662656, + "router_z_loss_mlp": 0.52294922, + "step": 1044, + "time_per_iteration": 2.765777111053467 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110762, + "balance_loss_mlp": 1.0584054, + "epoch": 0.20103886110042324, + "flos": 643288986624.0, + "grad_norm": 0.040441822708095716, + "language_loss": 0.87291706, + "learning_rate": 0.0009252339358742965, + "loss": 0.88402474, + "num_input_tokens_seen": 86734704, + "router_z_loss_mlp": 0.52441406, + "step": 1045, + "time_per_iteration": 2.825388193130493 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105894, + "balance_loss_mlp": 1.05353701, + "epoch": 0.2012312427856868, + "flos": 442970543616.0, + "grad_norm": 0.03567593499019723, + "language_loss": 0.84250462, + "learning_rate": 0.000925069973674654, + "loss": 0.85356355, + "num_input_tokens_seen": 86806512, + "router_z_loss_mlp": 0.52441406, + "step": 1046, + "time_per_iteration": 2.609393358230591 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103182, + "balance_loss_mlp": 1.05082524, + "epoch": 0.20142362447095036, + "flos": 555473116416.0, + "grad_norm": 0.03147198417726023, + "language_loss": 0.89562172, + "learning_rate": 0.000924905846448212, + "loss": 0.90665352, + "num_input_tokens_seen": 86883440, + "router_z_loss_mlp": 0.52441406, + "step": 1047, + "time_per_iteration": 2.7771337032318115 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108941, + "balance_loss_mlp": 1.0364331, + "epoch": 0.20161600615621392, + "flos": 671555822592.0, + "grad_norm": 0.0352448826174341, + "language_loss": 0.86282432, + "learning_rate": 0.0009247415542586906, + "loss": 0.87371844, + "num_input_tokens_seen": 86960208, + "router_z_loss_mlp": 0.53076172, + "step": 1048, + "time_per_iteration": 2.8992083072662354 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089273, + "balance_loss_mlp": 1.03624833, + "epoch": 0.2018083878414775, + "flos": 574307950848.0, + "grad_norm": 0.02930747529675645, + "language_loss": 0.83574796, + "learning_rate": 0.0009245770971698735, + "loss": 0.84664071, + "num_input_tokens_seen": 87044144, + "router_z_loss_mlp": 0.53125, + "step": 1049, + "time_per_iteration": 2.890824317932129 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092759, + "balance_loss_mlp": 1.03992498, + "epoch": 0.20200076952674106, + "flos": 426795292416.0, + "grad_norm": 0.03785140598382088, + "language_loss": 0.89288604, + "learning_rate": 0.0009244124752456087, + "loss": 0.9038136, + "num_input_tokens_seen": 87109136, + "router_z_loss_mlp": 0.52929688, + "step": 1050, + "time_per_iteration": 2.5022785663604736 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078262, + "balance_loss_mlp": 1.02566695, + "epoch": 0.20219315121200462, + "flos": 537685198848.0, + "grad_norm": 0.03140637951028952, + "language_loss": 0.86254251, + "learning_rate": 0.0009242476885498081, + "loss": 0.87332511, + "num_input_tokens_seen": 87184320, + "router_z_loss_mlp": 0.52685547, + "step": 1051, + "time_per_iteration": 2.732915163040161 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080259, + "balance_loss_mlp": 1.02771127, + "epoch": 0.20238553289726818, + "flos": 478835083776.0, + "grad_norm": 0.042472274730814934, + "language_loss": 0.82148528, + "learning_rate": 0.0009240827371464474, + "loss": 0.83228779, + "num_input_tokens_seen": 87248224, + "router_z_loss_mlp": 0.52636719, + "step": 1052, + "time_per_iteration": 2.577660322189331 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076456, + "balance_loss_mlp": 1.02448094, + "epoch": 0.20257791458253174, + "flos": 1153847596800.0, + "grad_norm": 0.038862673250338535, + "language_loss": 0.85609984, + "learning_rate": 0.0009239176210995666, + "loss": 0.86686444, + "num_input_tokens_seen": 87333088, + "router_z_loss_mlp": 0.52050781, + "step": 1053, + "time_per_iteration": 3.517408609390259 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076589, + "balance_loss_mlp": 1.02485228, + "epoch": 0.2027702962677953, + "flos": 668149688064.0, + "grad_norm": 0.03591644261584591, + "language_loss": 0.94691521, + "learning_rate": 0.0009237523404732695, + "loss": 0.95768112, + "num_input_tokens_seen": 87413840, + "router_z_loss_mlp": 0.51806641, + "step": 1054, + "time_per_iteration": 2.9073944091796875 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010707, + "balance_loss_mlp": 1.01934481, + "epoch": 0.20296267795305886, + "flos": 642453007104.0, + "grad_norm": 0.03829830750428097, + "language_loss": 0.85043323, + "learning_rate": 0.0009235868953317235, + "loss": 0.86114025, + "num_input_tokens_seen": 87487168, + "router_z_loss_mlp": 0.51416016, + "step": 1055, + "time_per_iteration": 2.8769731521606445 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063995, + "balance_loss_mlp": 1.01249659, + "epoch": 0.20315505963832242, + "flos": 932130967296.0, + "grad_norm": 0.03371739794492534, + "language_loss": 0.86243355, + "learning_rate": 0.0009234212857391602, + "loss": 0.87307346, + "num_input_tokens_seen": 87573184, + "router_z_loss_mlp": 0.515625, + "step": 1056, + "time_per_iteration": 3.1701345443725586 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062013, + "balance_loss_mlp": 1.01075327, + "epoch": 0.20334744132358598, + "flos": 563288560896.0, + "grad_norm": 0.028023058598955305, + "language_loss": 0.9034453, + "learning_rate": 0.000923255511759875, + "loss": 0.91406548, + "num_input_tokens_seen": 87651968, + "router_z_loss_mlp": 0.51318359, + "step": 1057, + "time_per_iteration": 2.8186585903167725 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105938, + "balance_loss_mlp": 1.00840592, + "epoch": 0.20353982300884957, + "flos": 645429485568.0, + "grad_norm": 0.03599363132321351, + "language_loss": 0.85699975, + "learning_rate": 0.000923089573458227, + "loss": 0.86759359, + "num_input_tokens_seen": 87727792, + "router_z_loss_mlp": 0.51025391, + "step": 1058, + "time_per_iteration": 2.829428195953369 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063312, + "balance_loss_mlp": 1.01248097, + "epoch": 0.20373220469411313, + "flos": 652706403840.0, + "grad_norm": 0.03721325608628497, + "language_loss": 0.84890962, + "learning_rate": 0.0009229234708986392, + "loss": 0.85954273, + "num_input_tokens_seen": 87806048, + "router_z_loss_mlp": 0.50878906, + "step": 1059, + "time_per_iteration": 2.9125583171844482 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01119614, + "balance_loss_mlp": 1.06964111, + "epoch": 0.2039245863793767, + "flos": 1440399367680.0, + "grad_norm": 0.026200157549973457, + "language_loss": 0.81666899, + "learning_rate": 0.0009227572041455982, + "loss": 0.82786512, + "num_input_tokens_seen": 88018160, + "router_z_loss_mlp": 0.49902344, + "step": 1060, + "time_per_iteration": 4.70502233505249 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105542, + "balance_loss_mlp": 1.00468493, + "epoch": 0.20411696806464025, + "flos": 598128538368.0, + "grad_norm": 0.03644056871626998, + "language_loss": 0.85909504, + "learning_rate": 0.0009225907732636548, + "loss": 0.86964923, + "num_input_tokens_seen": 88090864, + "router_z_loss_mlp": 0.5078125, + "step": 1061, + "time_per_iteration": 2.7681198120117188 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01057413, + "balance_loss_mlp": 1.00672543, + "epoch": 0.2043093497499038, + "flos": 574897999872.0, + "grad_norm": 0.03243635340085092, + "language_loss": 0.87862682, + "learning_rate": 0.0009224241783174227, + "loss": 0.88920105, + "num_input_tokens_seen": 88161360, + "router_z_loss_mlp": 0.50732422, + "step": 1062, + "time_per_iteration": 2.682659864425659 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01058744, + "balance_loss_mlp": 1.00819898, + "epoch": 0.20450173143516737, + "flos": 631524990720.0, + "grad_norm": 0.033151959510572516, + "language_loss": 0.86810422, + "learning_rate": 0.0009222574193715802, + "loss": 0.87869167, + "num_input_tokens_seen": 88234960, + "router_z_loss_mlp": 0.50585938, + "step": 1063, + "time_per_iteration": 2.7470076084136963 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01057209, + "balance_loss_mlp": 1.00656855, + "epoch": 0.20469411312043093, + "flos": 575147821056.0, + "grad_norm": 0.03442752078644266, + "language_loss": 0.86910367, + "learning_rate": 0.000922090496490869, + "loss": 0.87967575, + "num_input_tokens_seen": 88308176, + "router_z_loss_mlp": 0.50683594, + "step": 1064, + "time_per_iteration": 2.789161443710327 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055147, + "balance_loss_mlp": 1.00465047, + "epoch": 0.20488649480569449, + "flos": 638280879360.0, + "grad_norm": 0.029149473365885022, + "language_loss": 0.90671569, + "learning_rate": 0.0009219234097400937, + "loss": 0.91726714, + "num_input_tokens_seen": 88386768, + "router_z_loss_mlp": 0.50537109, + "step": 1065, + "time_per_iteration": 2.8469130992889404 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055191, + "balance_loss_mlp": 1.00483656, + "epoch": 0.20507887649095807, + "flos": 977439169536.0, + "grad_norm": 0.03225683406068631, + "language_loss": 0.83590472, + "learning_rate": 0.0009217561591841237, + "loss": 0.84645659, + "num_input_tokens_seen": 88476576, + "router_z_loss_mlp": 0.50390625, + "step": 1066, + "time_per_iteration": 3.331498622894287 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105396, + "balance_loss_mlp": 1.00332034, + "epoch": 0.20527125817622163, + "flos": 487156006656.0, + "grad_norm": 0.037421781664849635, + "language_loss": 0.81758374, + "learning_rate": 0.0009215887448878913, + "loss": 0.82812333, + "num_input_tokens_seen": 88541968, + "router_z_loss_mlp": 0.50683594, + "step": 1067, + "time_per_iteration": 2.5782346725463867 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054496, + "balance_loss_mlp": 1.00414193, + "epoch": 0.2054636398614852, + "flos": 528211401216.0, + "grad_norm": 0.031680985043262715, + "language_loss": 0.86063826, + "learning_rate": 0.0009214211669163922, + "loss": 0.87118322, + "num_input_tokens_seen": 88615296, + "router_z_loss_mlp": 0.50390625, + "step": 1068, + "time_per_iteration": 2.689772129058838 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054583, + "balance_loss_mlp": 1.00403798, + "epoch": 0.20565602154674875, + "flos": 559324458240.0, + "grad_norm": 0.03119808154519671, + "language_loss": 0.94868428, + "learning_rate": 0.0009212534253346862, + "loss": 0.95923012, + "num_input_tokens_seen": 88691584, + "router_z_loss_mlp": 0.50585938, + "step": 1069, + "time_per_iteration": 2.760840654373169 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060819, + "balance_loss_mlp": 1.01027393, + "epoch": 0.2058484032320123, + "flos": 505221935616.0, + "grad_norm": 0.042999288209875815, + "language_loss": 0.85068119, + "learning_rate": 0.0009210855202078964, + "loss": 0.86128938, + "num_input_tokens_seen": 88756592, + "router_z_loss_mlp": 0.50585938, + "step": 1070, + "time_per_iteration": 2.6273016929626465 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01057609, + "balance_loss_mlp": 1.00687337, + "epoch": 0.20604078491727587, + "flos": 434047911168.0, + "grad_norm": 0.03672139626538296, + "language_loss": 0.88035965, + "learning_rate": 0.0009209174516012091, + "loss": 0.89093566, + "num_input_tokens_seen": 88820928, + "router_z_loss_mlp": 0.5078125, + "step": 1071, + "time_per_iteration": 2.5263099670410156 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055827, + "balance_loss_mlp": 1.0049957, + "epoch": 0.20623316660253943, + "flos": 609875037696.0, + "grad_norm": 0.03118890610347894, + "language_loss": 0.89938867, + "learning_rate": 0.0009207492195798747, + "loss": 0.90994692, + "num_input_tokens_seen": 88895440, + "router_z_loss_mlp": 0.50878906, + "step": 1072, + "time_per_iteration": 2.773094654083252 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059091, + "balance_loss_mlp": 1.00816524, + "epoch": 0.206425548287803, + "flos": 481394545152.0, + "grad_norm": 0.034846135669383375, + "language_loss": 0.85408926, + "learning_rate": 0.0009205808242092061, + "loss": 0.86468017, + "num_input_tokens_seen": 88964400, + "router_z_loss_mlp": 0.50976562, + "step": 1073, + "time_per_iteration": 2.6704161167144775 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061083, + "balance_loss_mlp": 1.01025188, + "epoch": 0.20661792997306658, + "flos": 951124249344.0, + "grad_norm": 0.036438983488896924, + "language_loss": 0.83303434, + "learning_rate": 0.0009204122655545808, + "loss": 0.84364516, + "num_input_tokens_seen": 89049600, + "router_z_loss_mlp": 0.50878906, + "step": 1074, + "time_per_iteration": 3.3605480194091797 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059315, + "balance_loss_mlp": 1.00857949, + "epoch": 0.20681031165833014, + "flos": 604617109248.0, + "grad_norm": 0.03238632395719984, + "language_loss": 0.81744164, + "learning_rate": 0.0009202435436814388, + "loss": 0.82803476, + "num_input_tokens_seen": 89119024, + "router_z_loss_mlp": 0.5078125, + "step": 1075, + "time_per_iteration": 2.6966288089752197 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106261, + "balance_loss_mlp": 1.01163661, + "epoch": 0.2070026933435937, + "flos": 710266583808.0, + "grad_norm": 0.03297439165012413, + "language_loss": 0.90137285, + "learning_rate": 0.0009200746586552836, + "loss": 0.91199899, + "num_input_tokens_seen": 89197344, + "router_z_loss_mlp": 0.51025391, + "step": 1076, + "time_per_iteration": 2.919851779937744 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01057537, + "balance_loss_mlp": 1.00675428, + "epoch": 0.20719507502885726, + "flos": 831255330048.0, + "grad_norm": 0.031928056401627374, + "language_loss": 0.84964621, + "learning_rate": 0.0009199056105416825, + "loss": 0.86022151, + "num_input_tokens_seen": 89280464, + "router_z_loss_mlp": 0.50830078, + "step": 1077, + "time_per_iteration": 3.0944886207580566 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059646, + "balance_loss_mlp": 1.00881469, + "epoch": 0.20738745671412082, + "flos": 639500828160.0, + "grad_norm": 0.033227407694906064, + "language_loss": 0.87196565, + "learning_rate": 0.0009197363994062654, + "loss": 0.88256204, + "num_input_tokens_seen": 89353344, + "router_z_loss_mlp": 0.50878906, + "step": 1078, + "time_per_iteration": 2.8505265712738037 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059879, + "balance_loss_mlp": 1.00933433, + "epoch": 0.20757983839938438, + "flos": 686984522496.0, + "grad_norm": 0.03258152966614613, + "language_loss": 0.84972161, + "learning_rate": 0.0009195670253147262, + "loss": 0.86032039, + "num_input_tokens_seen": 89439328, + "router_z_loss_mlp": 0.50585938, + "step": 1079, + "time_per_iteration": 3.0077526569366455 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064209, + "balance_loss_mlp": 1.01375961, + "epoch": 0.20777222008464794, + "flos": 520318189056.0, + "grad_norm": 0.03575722766779635, + "language_loss": 0.83075011, + "learning_rate": 0.0009193974883328216, + "loss": 0.84139216, + "num_input_tokens_seen": 89510160, + "router_z_loss_mlp": 0.50488281, + "step": 1080, + "time_per_iteration": 2.6277496814727783 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062434, + "balance_loss_mlp": 1.01212776, + "epoch": 0.2079646017699115, + "flos": 512470663680.0, + "grad_norm": 0.03316952161345372, + "language_loss": 0.87936002, + "learning_rate": 0.0009192277885263718, + "loss": 0.88998437, + "num_input_tokens_seen": 89582960, + "router_z_loss_mlp": 0.50341797, + "step": 1081, + "time_per_iteration": 2.6486003398895264 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01056126, + "balance_loss_mlp": 1.00596321, + "epoch": 0.20815698345517505, + "flos": 933468534528.0, + "grad_norm": 0.031694408237267754, + "language_loss": 0.87043977, + "learning_rate": 0.0009190579259612602, + "loss": 0.881001, + "num_input_tokens_seen": 89675488, + "router_z_loss_mlp": 0.50195312, + "step": 1082, + "time_per_iteration": 3.280133008956909 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062428, + "balance_loss_mlp": 1.01202655, + "epoch": 0.20834936514043864, + "flos": 633554674176.0, + "grad_norm": 0.03367407497844021, + "language_loss": 0.87446159, + "learning_rate": 0.000918887900703433, + "loss": 0.88508588, + "num_input_tokens_seen": 89747872, + "router_z_loss_mlp": 0.50439453, + "step": 1083, + "time_per_iteration": 2.7914657592773438 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060649, + "balance_loss_mlp": 1.01024699, + "epoch": 0.2085417468257022, + "flos": 395243831040.0, + "grad_norm": 0.03354838448754016, + "language_loss": 0.91036344, + "learning_rate": 0.0009187177128188999, + "loss": 0.92096996, + "num_input_tokens_seen": 89810176, + "router_z_loss_mlp": 0.50439453, + "step": 1084, + "time_per_iteration": 2.4803311824798584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107357, + "balance_loss_mlp": 1.02455139, + "epoch": 0.20873412851096576, + "flos": 1405197775104.0, + "grad_norm": 0.012085868941934568, + "language_loss": 0.77156538, + "learning_rate": 0.0009185473623737339, + "loss": 0.78230107, + "num_input_tokens_seen": 90038432, + "router_z_loss_mlp": 0.48925781, + "step": 1085, + "time_per_iteration": 4.883121728897095 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055927, + "balance_loss_mlp": 1.00562024, + "epoch": 0.20892651019622932, + "flos": 448762140672.0, + "grad_norm": 0.03493036575467998, + "language_loss": 0.8691588, + "learning_rate": 0.000918376849434071, + "loss": 0.87971807, + "num_input_tokens_seen": 90101568, + "router_z_loss_mlp": 0.50317383, + "step": 1086, + "time_per_iteration": 2.537820816040039 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065129, + "balance_loss_mlp": 1.01444149, + "epoch": 0.20911889188149288, + "flos": 494081036544.0, + "grad_norm": 0.040745363066357655, + "language_loss": 0.91673005, + "learning_rate": 0.0009182061740661098, + "loss": 0.9273814, + "num_input_tokens_seen": 90169344, + "router_z_loss_mlp": 0.50732422, + "step": 1087, + "time_per_iteration": 2.5920886993408203 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01056749, + "balance_loss_mlp": 1.00615633, + "epoch": 0.20931127356675644, + "flos": 842750062848.0, + "grad_norm": 0.02822254108426211, + "language_loss": 0.85810733, + "learning_rate": 0.0009180353363361127, + "loss": 0.86867487, + "num_input_tokens_seen": 90252416, + "router_z_loss_mlp": 0.50634766, + "step": 1088, + "time_per_iteration": 3.1376798152923584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060338, + "balance_loss_mlp": 1.00979316, + "epoch": 0.20950365525202, + "flos": 758525019648.0, + "grad_norm": 0.03922038165748564, + "language_loss": 0.83160806, + "learning_rate": 0.0009178643363104044, + "loss": 0.84221143, + "num_input_tokens_seen": 90337952, + "router_z_loss_mlp": 0.50585938, + "step": 1089, + "time_per_iteration": 3.124352216720581 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059844, + "balance_loss_mlp": 1.00939417, + "epoch": 0.20969603693728356, + "flos": 473492584704.0, + "grad_norm": 0.04272734591158297, + "language_loss": 0.920385, + "learning_rate": 0.0009176931740553735, + "loss": 0.93098342, + "num_input_tokens_seen": 90401488, + "router_z_loss_mlp": 0.50488281, + "step": 1090, + "time_per_iteration": 2.556528091430664 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067623, + "balance_loss_mlp": 1.01731646, + "epoch": 0.20988841862254715, + "flos": 978628982784.0, + "grad_norm": 0.03590255199570226, + "language_loss": 0.83530974, + "learning_rate": 0.0009175218496374708, + "loss": 0.84598601, + "num_input_tokens_seen": 90486144, + "router_z_loss_mlp": 0.50341797, + "step": 1091, + "time_per_iteration": 3.328984260559082 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059931, + "balance_loss_mlp": 1.00976801, + "epoch": 0.2100808003078107, + "flos": 1094819592192.0, + "grad_norm": 0.03766723451938342, + "language_loss": 0.86626744, + "learning_rate": 0.0009173503631232103, + "loss": 0.87686676, + "num_input_tokens_seen": 90571504, + "router_z_loss_mlp": 0.50170898, + "step": 1092, + "time_per_iteration": 3.4216480255126953 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01058636, + "balance_loss_mlp": 1.00832939, + "epoch": 0.21027318199307427, + "flos": 1014560596992.0, + "grad_norm": 0.047058286401960234, + "language_loss": 0.82703817, + "learning_rate": 0.0009171787145791691, + "loss": 0.83762449, + "num_input_tokens_seen": 90646016, + "router_z_loss_mlp": 0.50341797, + "step": 1093, + "time_per_iteration": 3.2454655170440674 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059608, + "balance_loss_mlp": 1.00911129, + "epoch": 0.21046556367833782, + "flos": 522413001216.0, + "grad_norm": 0.043211200123957835, + "language_loss": 0.80955076, + "learning_rate": 0.000917006904071987, + "loss": 0.8201468, + "num_input_tokens_seen": 90713440, + "router_z_loss_mlp": 0.50537109, + "step": 1094, + "time_per_iteration": 2.6560592651367188 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061033, + "balance_loss_mlp": 1.01053584, + "epoch": 0.21065794536360138, + "flos": 604840685568.0, + "grad_norm": 0.03488627405352903, + "language_loss": 0.87964189, + "learning_rate": 0.0009168349316683669, + "loss": 0.89025223, + "num_input_tokens_seen": 90788208, + "router_z_loss_mlp": 0.50537109, + "step": 1095, + "time_per_iteration": 2.794358253479004 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106293, + "balance_loss_mlp": 1.01243329, + "epoch": 0.21085032704886494, + "flos": 604558783488.0, + "grad_norm": 0.031199931973452354, + "language_loss": 0.82918072, + "learning_rate": 0.0009166627974350741, + "loss": 0.83981001, + "num_input_tokens_seen": 90873776, + "router_z_loss_mlp": 0.50537109, + "step": 1096, + "time_per_iteration": 2.89837384223938 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062823, + "balance_loss_mlp": 1.01242077, + "epoch": 0.2110427087341285, + "flos": 638832044544.0, + "grad_norm": 0.03623978918327459, + "language_loss": 0.90394479, + "learning_rate": 0.0009164905014389373, + "loss": 0.91457301, + "num_input_tokens_seen": 90945872, + "router_z_loss_mlp": 0.50439453, + "step": 1097, + "time_per_iteration": 2.79203462600708 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055619, + "balance_loss_mlp": 1.00559878, + "epoch": 0.21123509041939206, + "flos": 523930403328.0, + "grad_norm": 0.03351990521185014, + "language_loss": 0.87381279, + "learning_rate": 0.0009163180437468476, + "loss": 0.88436902, + "num_input_tokens_seen": 91016224, + "router_z_loss_mlp": 0.50024414, + "step": 1098, + "time_per_iteration": 2.6110002994537354 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01056208, + "balance_loss_mlp": 1.00647402, + "epoch": 0.21142747210465565, + "flos": 452194520064.0, + "grad_norm": 0.03619268995909484, + "language_loss": 0.86631316, + "learning_rate": 0.000916145424425759, + "loss": 0.87687522, + "num_input_tokens_seen": 91086752, + "router_z_loss_mlp": 0.49658203, + "step": 1099, + "time_per_iteration": 2.67106294631958 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060107, + "balance_loss_mlp": 1.01027727, + "epoch": 0.2116198537899192, + "flos": 877626978816.0, + "grad_norm": 0.042483916895571405, + "language_loss": 0.91832745, + "learning_rate": 0.0009159726435426885, + "loss": 0.92892849, + "num_input_tokens_seen": 91162960, + "router_z_loss_mlp": 0.49780273, + "step": 1100, + "time_per_iteration": 3.095250129699707 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052771, + "balance_loss_mlp": 1.00275087, + "epoch": 0.21181223547518277, + "flos": 524675009280.0, + "grad_norm": 0.035590136232614346, + "language_loss": 0.91126454, + "learning_rate": 0.0009157997011647154, + "loss": 0.92179227, + "num_input_tokens_seen": 91229840, + "router_z_loss_mlp": 0.49926758, + "step": 1101, + "time_per_iteration": 2.61954665184021 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01056268, + "balance_loss_mlp": 1.00615227, + "epoch": 0.21200461716044633, + "flos": 573426284544.0, + "grad_norm": 0.03167271765745466, + "language_loss": 0.86759949, + "learning_rate": 0.0009156265973589817, + "loss": 0.87816215, + "num_input_tokens_seen": 91307936, + "router_z_loss_mlp": 0.50146484, + "step": 1102, + "time_per_iteration": 2.7851946353912354 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053549, + "balance_loss_mlp": 1.00348067, + "epoch": 0.2121969988457099, + "flos": 546175262976.0, + "grad_norm": 0.033324702660241096, + "language_loss": 0.90598941, + "learning_rate": 0.0009154533321926926, + "loss": 0.91652489, + "num_input_tokens_seen": 91372848, + "router_z_loss_mlp": 0.50073242, + "step": 1103, + "time_per_iteration": 2.658358573913574 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01056655, + "balance_loss_mlp": 1.00663483, + "epoch": 0.21238938053097345, + "flos": 845355211008.0, + "grad_norm": 0.03290940631262569, + "language_loss": 0.88234645, + "learning_rate": 0.0009152799057331156, + "loss": 0.89291298, + "num_input_tokens_seen": 91452768, + "router_z_loss_mlp": 0.50024414, + "step": 1104, + "time_per_iteration": 3.1174561977386475 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01056205, + "balance_loss_mlp": 1.00623202, + "epoch": 0.212581762216237, + "flos": 447142671360.0, + "grad_norm": 0.035279899791186564, + "language_loss": 0.91767001, + "learning_rate": 0.0009151063180475805, + "loss": 0.92823207, + "num_input_tokens_seen": 91519888, + "router_z_loss_mlp": 0.5, + "step": 1105, + "time_per_iteration": 2.538922071456909 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054823, + "balance_loss_mlp": 1.00489795, + "epoch": 0.21277414390150057, + "flos": 515385904128.0, + "grad_norm": 0.03737857831356842, + "language_loss": 0.85410213, + "learning_rate": 0.0009149325692034803, + "loss": 0.86465037, + "num_input_tokens_seen": 91585744, + "router_z_loss_mlp": 0.49853516, + "step": 1106, + "time_per_iteration": 2.588087558746338 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055756, + "balance_loss_mlp": 1.00788116, + "epoch": 0.21296652558676413, + "flos": 1488514907136.0, + "grad_norm": 0.005769411809131762, + "language_loss": 0.79203427, + "learning_rate": 0.0009147586592682702, + "loss": 0.80259192, + "num_input_tokens_seen": 91805840, + "router_z_loss_mlp": 0.47851562, + "step": 1107, + "time_per_iteration": 4.901995658874512 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055609, + "balance_loss_mlp": 1.00596976, + "epoch": 0.21315890727202771, + "flos": 847451968512.0, + "grad_norm": 0.03679321288402367, + "language_loss": 0.87994891, + "learning_rate": 0.0009145845883094678, + "loss": 0.89050496, + "num_input_tokens_seen": 91885936, + "router_z_loss_mlp": 0.49584961, + "step": 1108, + "time_per_iteration": 3.034179925918579 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01057835, + "balance_loss_mlp": 1.00833917, + "epoch": 0.21335128895729127, + "flos": 630556808448.0, + "grad_norm": 0.040833312538100186, + "language_loss": 0.86006308, + "learning_rate": 0.000914410356394654, + "loss": 0.87064135, + "num_input_tokens_seen": 91959888, + "router_z_loss_mlp": 0.49438477, + "step": 1109, + "time_per_iteration": 2.793839931488037 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01058379, + "balance_loss_mlp": 1.00878823, + "epoch": 0.21354367064255483, + "flos": 712285573632.0, + "grad_norm": 0.029526159769499145, + "language_loss": 0.85111213, + "learning_rate": 0.0009142359635914709, + "loss": 0.86169595, + "num_input_tokens_seen": 92043728, + "router_z_loss_mlp": 0.49560547, + "step": 1110, + "time_per_iteration": 3.0403430461883545 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063298, + "balance_loss_mlp": 1.01375508, + "epoch": 0.2137360523278184, + "flos": 457211375616.0, + "grad_norm": 0.03547311640481051, + "language_loss": 0.85051197, + "learning_rate": 0.0009140614099676245, + "loss": 0.8611449, + "num_input_tokens_seen": 92114096, + "router_z_loss_mlp": 0.49414062, + "step": 1111, + "time_per_iteration": 2.6027371883392334 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054693, + "balance_loss_mlp": 1.00495887, + "epoch": 0.21392843401308195, + "flos": 667266076416.0, + "grad_norm": 0.03139007596896344, + "language_loss": 0.8342849, + "learning_rate": 0.0009138866955908821, + "loss": 0.84483182, + "num_input_tokens_seen": 92193552, + "router_z_loss_mlp": 0.49658203, + "step": 1112, + "time_per_iteration": 2.924180269241333 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055191, + "balance_loss_mlp": 1.00517082, + "epoch": 0.2141208156983455, + "flos": 750362544384.0, + "grad_norm": 0.03405304612319473, + "language_loss": 0.81477892, + "learning_rate": 0.0009137118205290738, + "loss": 0.82533085, + "num_input_tokens_seen": 92279248, + "router_z_loss_mlp": 0.49951172, + "step": 1113, + "time_per_iteration": 2.956289768218994 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01057131, + "balance_loss_mlp": 1.00711048, + "epoch": 0.21431319738360907, + "flos": 420011213568.0, + "grad_norm": 0.037812047895131755, + "language_loss": 0.90930229, + "learning_rate": 0.0009135367848500924, + "loss": 0.9198736, + "num_input_tokens_seen": 92344064, + "router_z_loss_mlp": 0.49975586, + "step": 1114, + "time_per_iteration": 2.5228912830352783 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106081, + "balance_loss_mlp": 1.01079023, + "epoch": 0.21450557906887263, + "flos": 610239565056.0, + "grad_norm": 0.04455846969282107, + "language_loss": 0.87261575, + "learning_rate": 0.0009133615886218927, + "loss": 0.88322389, + "num_input_tokens_seen": 92410544, + "router_z_loss_mlp": 0.5, + "step": 1115, + "time_per_iteration": 2.7146785259246826 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105379, + "balance_loss_mlp": 1.00367427, + "epoch": 0.21469796075413622, + "flos": 562975556352.0, + "grad_norm": 0.04025415931658291, + "language_loss": 0.88754129, + "learning_rate": 0.0009131862319124917, + "loss": 0.89807916, + "num_input_tokens_seen": 92480272, + "router_z_loss_mlp": 0.50097656, + "step": 1116, + "time_per_iteration": 2.702315092086792 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01058084, + "balance_loss_mlp": 1.0081588, + "epoch": 0.21489034243939978, + "flos": 595738218240.0, + "grad_norm": 0.036347556106983744, + "language_loss": 0.84819156, + "learning_rate": 0.0009130107147899691, + "loss": 0.8587724, + "num_input_tokens_seen": 92555584, + "router_z_loss_mlp": 0.49902344, + "step": 1117, + "time_per_iteration": 2.705153226852417 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055436, + "balance_loss_mlp": 1.00555849, + "epoch": 0.21508272412466334, + "flos": 442850979840.0, + "grad_norm": 0.032390780355026266, + "language_loss": 0.85796201, + "learning_rate": 0.0009128350373224665, + "loss": 0.86851633, + "num_input_tokens_seen": 92623136, + "router_z_loss_mlp": 0.49804688, + "step": 1118, + "time_per_iteration": 2.5689737796783447 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055817, + "balance_loss_mlp": 1.00775146, + "epoch": 0.2152751058099269, + "flos": 1499234898432.0, + "grad_norm": 0.005802610423144338, + "language_loss": 0.81456429, + "learning_rate": 0.0009126591995781883, + "loss": 0.82512248, + "num_input_tokens_seen": 92842608, + "router_z_loss_mlp": 0.48046875, + "step": 1119, + "time_per_iteration": 4.659603834152222 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054629, + "balance_loss_mlp": 1.00475144, + "epoch": 0.21546748749519046, + "flos": 494992838400.0, + "grad_norm": 0.03550503890551413, + "language_loss": 0.86117166, + "learning_rate": 0.0009124832016254005, + "loss": 0.87171793, + "num_input_tokens_seen": 92912960, + "router_z_loss_mlp": 0.4987793, + "step": 1120, + "time_per_iteration": 2.6080243587493896 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054572, + "balance_loss_mlp": 1.00450444, + "epoch": 0.21565986918045402, + "flos": 635695173120.0, + "grad_norm": 0.03761657282592244, + "language_loss": 0.88987935, + "learning_rate": 0.0009123070435324316, + "loss": 0.90042508, + "num_input_tokens_seen": 92982272, + "router_z_loss_mlp": 0.50097656, + "step": 1121, + "time_per_iteration": 2.8451340198516846 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062664, + "balance_loss_mlp": 1.01450348, + "epoch": 0.21585225086571758, + "flos": 1586801914368.0, + "grad_norm": 0.011675507285583616, + "language_loss": 0.77875781, + "learning_rate": 0.0009121307253676722, + "loss": 0.78938448, + "num_input_tokens_seen": 93218752, + "router_z_loss_mlp": 0.48144531, + "step": 1122, + "time_per_iteration": 5.018117666244507 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055198, + "balance_loss_mlp": 1.00541639, + "epoch": 0.21604463255098114, + "flos": 685323257088.0, + "grad_norm": 0.03443856201457266, + "language_loss": 0.87021005, + "learning_rate": 0.0009119542471995752, + "loss": 0.8807621, + "num_input_tokens_seen": 93293968, + "router_z_loss_mlp": 0.49682617, + "step": 1123, + "time_per_iteration": 2.8631908893585205 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01057142, + "balance_loss_mlp": 1.00755107, + "epoch": 0.2162370142362447, + "flos": 782308668672.0, + "grad_norm": 0.034966150945184314, + "language_loss": 0.82536203, + "learning_rate": 0.0009117776090966554, + "loss": 0.83593345, + "num_input_tokens_seen": 93367088, + "router_z_loss_mlp": 0.49511719, + "step": 1124, + "time_per_iteration": 2.9458060264587402 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01058319, + "balance_loss_mlp": 1.00877571, + "epoch": 0.21642939592150828, + "flos": 1003762838016.0, + "grad_norm": 0.03795033166932298, + "language_loss": 0.87775326, + "learning_rate": 0.0009116008111274899, + "loss": 0.88833648, + "num_input_tokens_seen": 93452944, + "router_z_loss_mlp": 0.49511719, + "step": 1125, + "time_per_iteration": 3.2748866081237793 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053055, + "balance_loss_mlp": 1.00556183, + "epoch": 0.21662177760677184, + "flos": 1485764917248.0, + "grad_norm": 0.008195913283110022, + "language_loss": 0.79106927, + "learning_rate": 0.0009114238533607176, + "loss": 0.8015998, + "num_input_tokens_seen": 93677328, + "router_z_loss_mlp": 0.47460938, + "step": 1126, + "time_per_iteration": 4.803825616836548 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105355, + "balance_loss_mlp": 1.00391161, + "epoch": 0.2168141592920354, + "flos": 888861196800.0, + "grad_norm": 0.03626284425770287, + "language_loss": 0.85553163, + "learning_rate": 0.0009112467358650396, + "loss": 0.86606717, + "num_input_tokens_seen": 93756848, + "router_z_loss_mlp": 0.49609375, + "step": 1127, + "time_per_iteration": 3.155856132507324 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01057313, + "balance_loss_mlp": 1.00753081, + "epoch": 0.21700654097729896, + "flos": 547085119488.0, + "grad_norm": 0.03272511127748384, + "language_loss": 0.87140059, + "learning_rate": 0.0009110694587092192, + "loss": 0.88197374, + "num_input_tokens_seen": 93834704, + "router_z_loss_mlp": 0.49682617, + "step": 1128, + "time_per_iteration": 2.7438507080078125 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01057646, + "balance_loss_mlp": 1.00781655, + "epoch": 0.21719892266256252, + "flos": 510536244480.0, + "grad_norm": 0.0385378102776186, + "language_loss": 0.81826651, + "learning_rate": 0.0009108920219620815, + "loss": 0.82884294, + "num_input_tokens_seen": 93904448, + "router_z_loss_mlp": 0.49829102, + "step": 1129, + "time_per_iteration": 2.6256754398345947 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105661, + "balance_loss_mlp": 1.00682795, + "epoch": 0.21739130434782608, + "flos": 544462474752.0, + "grad_norm": 0.03288593298355655, + "language_loss": 0.9021399, + "learning_rate": 0.0009107144256925133, + "loss": 0.91270602, + "num_input_tokens_seen": 93979312, + "router_z_loss_mlp": 0.49707031, + "step": 1130, + "time_per_iteration": 2.665764808654785 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055495, + "balance_loss_mlp": 1.00566518, + "epoch": 0.21758368603308964, + "flos": 617983077888.0, + "grad_norm": 0.04004849400109536, + "language_loss": 0.83221352, + "learning_rate": 0.0009105366699694638, + "loss": 0.84276843, + "num_input_tokens_seen": 94052032, + "router_z_loss_mlp": 0.49755859, + "step": 1131, + "time_per_iteration": 2.7092785835266113 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055334, + "balance_loss_mlp": 1.0055995, + "epoch": 0.2177760677183532, + "flos": 636335766528.0, + "grad_norm": 0.03327692114185805, + "language_loss": 0.82139939, + "learning_rate": 0.0009103587548619439, + "loss": 0.83195269, + "num_input_tokens_seen": 94124944, + "router_z_loss_mlp": 0.49658203, + "step": 1132, + "time_per_iteration": 2.833617925643921 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055546, + "balance_loss_mlp": 1.00585985, + "epoch": 0.2179684494036168, + "flos": 533597641728.0, + "grad_norm": 0.036557340203022134, + "language_loss": 0.8721149, + "learning_rate": 0.0009101806804390261, + "loss": 0.8826704, + "num_input_tokens_seen": 94200384, + "router_z_loss_mlp": 0.49609375, + "step": 1133, + "time_per_iteration": 2.7880306243896484 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054716, + "balance_loss_mlp": 1.0050298, + "epoch": 0.21816083108888035, + "flos": 476182303488.0, + "grad_norm": 0.03701280834454915, + "language_loss": 0.917292, + "learning_rate": 0.0009100024467698453, + "loss": 0.92783916, + "num_input_tokens_seen": 94266992, + "router_z_loss_mlp": 0.49560547, + "step": 1134, + "time_per_iteration": 2.592986822128296 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054821, + "balance_loss_mlp": 1.00513422, + "epoch": 0.2183532127741439, + "flos": 578547152640.0, + "grad_norm": 0.04183992577645213, + "language_loss": 0.83309305, + "learning_rate": 0.0009098240539235981, + "loss": 0.84364122, + "num_input_tokens_seen": 94334304, + "router_z_loss_mlp": 0.49658203, + "step": 1135, + "time_per_iteration": 2.693387269973755 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055318, + "balance_loss_mlp": 1.00558341, + "epoch": 0.21854559445940747, + "flos": 595280371968.0, + "grad_norm": 0.03379290176549673, + "language_loss": 0.88387418, + "learning_rate": 0.0009096455019695423, + "loss": 0.89442736, + "num_input_tokens_seen": 94413296, + "router_z_loss_mlp": 0.49609375, + "step": 1136, + "time_per_iteration": 2.781304359436035 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059818, + "balance_loss_mlp": 1.0098455, + "epoch": 0.21873797614467103, + "flos": 409549791744.0, + "grad_norm": 0.03874067782032871, + "language_loss": 0.90736896, + "learning_rate": 0.000909466790976998, + "loss": 0.91796714, + "num_input_tokens_seen": 94475840, + "router_z_loss_mlp": 0.49951172, + "step": 1137, + "time_per_iteration": 2.4837231636047363 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055988, + "balance_loss_mlp": 1.00620675, + "epoch": 0.21893035782993459, + "flos": 895655969280.0, + "grad_norm": 0.03281311030157744, + "language_loss": 0.83296013, + "learning_rate": 0.0009092879210153473, + "loss": 0.84352005, + "num_input_tokens_seen": 94555184, + "router_z_loss_mlp": 0.49682617, + "step": 1138, + "time_per_iteration": 3.156329870223999 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01058359, + "balance_loss_mlp": 1.00862455, + "epoch": 0.21912273951519814, + "flos": 468569048064.0, + "grad_norm": 0.03332829582894704, + "language_loss": 0.89480728, + "learning_rate": 0.0009091088921540333, + "loss": 0.90539086, + "num_input_tokens_seen": 94622656, + "router_z_loss_mlp": 0.49731445, + "step": 1139, + "time_per_iteration": 2.5444674491882324 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060322, + "balance_loss_mlp": 1.01197052, + "epoch": 0.2193151212004617, + "flos": 1535180118528.0, + "grad_norm": 0.009447727830516332, + "language_loss": 0.75508678, + "learning_rate": 0.0009089297044625615, + "loss": 0.76569003, + "num_input_tokens_seen": 94856496, + "router_z_loss_mlp": 0.48339844, + "step": 1140, + "time_per_iteration": 4.993603944778442 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105127, + "balance_loss_mlp": 1.00158358, + "epoch": 0.2195075028857253, + "flos": 592275703296.0, + "grad_norm": 0.039648398816974934, + "language_loss": 0.85201681, + "learning_rate": 0.0009087503580104985, + "loss": 0.86252946, + "num_input_tokens_seen": 94926880, + "router_z_loss_mlp": 0.49560547, + "step": 1141, + "time_per_iteration": 2.6736245155334473 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053862, + "balance_loss_mlp": 1.00436676, + "epoch": 0.21969988457098885, + "flos": 637518776832.0, + "grad_norm": 0.03678403810630545, + "language_loss": 0.8005864, + "learning_rate": 0.0009085708528674728, + "loss": 0.81112504, + "num_input_tokens_seen": 95000528, + "router_z_loss_mlp": 0.49414062, + "step": 1142, + "time_per_iteration": 2.799607038497925 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053259, + "balance_loss_mlp": 1.00362051, + "epoch": 0.2198922662562524, + "flos": 913860903936.0, + "grad_norm": 0.040969430424554455, + "language_loss": 0.86853033, + "learning_rate": 0.0009083911891031745, + "loss": 0.87906301, + "num_input_tokens_seen": 95081040, + "router_z_loss_mlp": 0.49487305, + "step": 1143, + "time_per_iteration": 3.1043601036071777 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010518, + "balance_loss_mlp": 1.00235164, + "epoch": 0.22008464794151597, + "flos": 824495550720.0, + "grad_norm": 0.03475506353694162, + "language_loss": 0.91937912, + "learning_rate": 0.0009082113667873553, + "loss": 0.92989707, + "num_input_tokens_seen": 95167328, + "router_z_loss_mlp": 0.4934082, + "step": 1144, + "time_per_iteration": 3.114678144454956 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055626, + "balance_loss_mlp": 1.00636888, + "epoch": 0.22027702962677953, + "flos": 460619455488.0, + "grad_norm": 0.047183367988671336, + "language_loss": 0.91319406, + "learning_rate": 0.0009080313859898283, + "loss": 0.92375034, + "num_input_tokens_seen": 95230304, + "router_z_loss_mlp": 0.49145508, + "step": 1145, + "time_per_iteration": 2.529627799987793 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01058128, + "balance_loss_mlp": 1.00877535, + "epoch": 0.2204694113120431, + "flos": 532288264704.0, + "grad_norm": 0.034289556826903954, + "language_loss": 0.91988164, + "learning_rate": 0.0009078512467804684, + "loss": 0.93046296, + "num_input_tokens_seen": 95299520, + "router_z_loss_mlp": 0.49243164, + "step": 1146, + "time_per_iteration": 2.692556381225586 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01056493, + "balance_loss_mlp": 1.00737858, + "epoch": 0.22066179299730665, + "flos": 523687385088.0, + "grad_norm": 0.03628724645244133, + "language_loss": 0.91349947, + "learning_rate": 0.0009076709492292119, + "loss": 0.9240644, + "num_input_tokens_seen": 95368912, + "router_z_loss_mlp": 0.49023438, + "step": 1147, + "time_per_iteration": 2.6262857913970947 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01056943, + "balance_loss_mlp": 1.00799513, + "epoch": 0.2208541746825702, + "flos": 547506027264.0, + "grad_norm": 0.0383258843164557, + "language_loss": 0.89899343, + "learning_rate": 0.0009074904934060562, + "loss": 0.90956283, + "num_input_tokens_seen": 95440800, + "router_z_loss_mlp": 0.48901367, + "step": 1148, + "time_per_iteration": 2.710716962814331 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054195, + "balance_loss_mlp": 1.00498509, + "epoch": 0.22104655636783377, + "flos": 710060504064.0, + "grad_norm": 0.034028934421108444, + "language_loss": 0.85814822, + "learning_rate": 0.0009073098793810607, + "loss": 0.86869013, + "num_input_tokens_seen": 95519904, + "router_z_loss_mlp": 0.4909668, + "step": 1149, + "time_per_iteration": 2.986891269683838 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01056627, + "balance_loss_mlp": 1.00758433, + "epoch": 0.22123893805309736, + "flos": 585965021952.0, + "grad_norm": 0.03641392016248804, + "language_loss": 0.88886124, + "learning_rate": 0.000907129107224346, + "loss": 0.89942753, + "num_input_tokens_seen": 95591568, + "router_z_loss_mlp": 0.48999023, + "step": 1150, + "time_per_iteration": 2.7348337173461914 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055224, + "balance_loss_mlp": 1.00601482, + "epoch": 0.22143131973836092, + "flos": 493251859968.0, + "grad_norm": 0.02984339906163832, + "language_loss": 0.89448893, + "learning_rate": 0.0009069481770060939, + "loss": 0.90504116, + "num_input_tokens_seen": 95664480, + "router_z_loss_mlp": 0.49121094, + "step": 1151, + "time_per_iteration": 2.688180685043335 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055578, + "balance_loss_mlp": 1.00593948, + "epoch": 0.22162370142362448, + "flos": 1081469174784.0, + "grad_norm": 0.034516826316188534, + "language_loss": 0.8487525, + "learning_rate": 0.000906767088796548, + "loss": 0.85930824, + "num_input_tokens_seen": 95754400, + "router_z_loss_mlp": 0.49584961, + "step": 1152, + "time_per_iteration": 3.4747724533081055 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01057959, + "balance_loss_mlp": 1.00841522, + "epoch": 0.22181608310888803, + "flos": 493512374784.0, + "grad_norm": 0.03114695536209251, + "language_loss": 0.87880313, + "learning_rate": 0.0009065858426660127, + "loss": 0.88938272, + "num_input_tokens_seen": 95826944, + "router_z_loss_mlp": 0.49462891, + "step": 1153, + "time_per_iteration": 2.6112635135650635 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060109, + "balance_loss_mlp": 1.0103749, + "epoch": 0.2220084647941516, + "flos": 725325898752.0, + "grad_norm": 0.04119971901255946, + "language_loss": 0.85662532, + "learning_rate": 0.0009064044386848543, + "loss": 0.86722642, + "num_input_tokens_seen": 95902688, + "router_z_loss_mlp": 0.49658203, + "step": 1154, + "time_per_iteration": 2.893120288848877 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105547, + "balance_loss_mlp": 1.00564086, + "epoch": 0.22220084647941515, + "flos": 490245245952.0, + "grad_norm": 0.04012578927121656, + "language_loss": 0.89651787, + "learning_rate": 0.0009062228769234997, + "loss": 0.9070726, + "num_input_tokens_seen": 95969952, + "router_z_loss_mlp": 0.49731445, + "step": 1155, + "time_per_iteration": 2.544904947280884 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053369, + "balance_loss_mlp": 1.00344408, + "epoch": 0.2223932281646787, + "flos": 537296371968.0, + "grad_norm": 0.03814815821860503, + "language_loss": 0.82016486, + "learning_rate": 0.0009060411574524376, + "loss": 0.83069855, + "num_input_tokens_seen": 96037344, + "router_z_loss_mlp": 0.49804688, + "step": 1156, + "time_per_iteration": 2.6412572860717773 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01056148, + "balance_loss_mlp": 1.00660419, + "epoch": 0.22258560984994227, + "flos": 932968892160.0, + "grad_norm": 0.0415511709861084, + "language_loss": 0.88770878, + "learning_rate": 0.0009058592803422178, + "loss": 0.89827025, + "num_input_tokens_seen": 96115616, + "router_z_loss_mlp": 0.49462891, + "step": 1157, + "time_per_iteration": 4.623233079910278 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055229, + "balance_loss_mlp": 1.00792694, + "epoch": 0.22277799153520586, + "flos": 1202397638400.0, + "grad_norm": 0.007067436666665483, + "language_loss": 0.78710288, + "learning_rate": 0.0009056772456634512, + "loss": 0.79765517, + "num_input_tokens_seen": 96333600, + "router_z_loss_mlp": 0.47265625, + "step": 1158, + "time_per_iteration": 4.805820465087891 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053661, + "balance_loss_mlp": 1.00397491, + "epoch": 0.22297037322046942, + "flos": 502317388800.0, + "grad_norm": 0.032485949168455416, + "language_loss": 0.91067338, + "learning_rate": 0.00090549505348681, + "loss": 0.92121005, + "num_input_tokens_seen": 96402544, + "router_z_loss_mlp": 0.49633789, + "step": 1159, + "time_per_iteration": 2.5877561569213867 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054105, + "balance_loss_mlp": 1.00427544, + "epoch": 0.22316275490573298, + "flos": 754113764352.0, + "grad_norm": 0.0354615562345569, + "language_loss": 0.84617937, + "learning_rate": 0.0009053127038830275, + "loss": 0.85672045, + "num_input_tokens_seen": 96487600, + "router_z_loss_mlp": 0.49731445, + "step": 1160, + "time_per_iteration": 3.0164098739624023 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01057554, + "balance_loss_mlp": 1.00777233, + "epoch": 0.22335513659099654, + "flos": 515804866560.0, + "grad_norm": 0.03692799991821936, + "language_loss": 0.87995219, + "learning_rate": 0.000905130196922898, + "loss": 0.89052767, + "num_input_tokens_seen": 96554912, + "router_z_loss_mlp": 0.49682617, + "step": 1161, + "time_per_iteration": 2.603769063949585 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01058076, + "balance_loss_mlp": 1.00848484, + "epoch": 0.2235475182762601, + "flos": 485508347136.0, + "grad_norm": 0.031071089964746976, + "language_loss": 0.8758713, + "learning_rate": 0.0009049475326772769, + "loss": 0.88645208, + "num_input_tokens_seen": 96624192, + "router_z_loss_mlp": 0.49511719, + "step": 1162, + "time_per_iteration": 2.6613070964813232 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052796, + "balance_loss_mlp": 1.00334835, + "epoch": 0.22373989996152366, + "flos": 471068238336.0, + "grad_norm": 0.03308636607962537, + "language_loss": 0.83887613, + "learning_rate": 0.0009047647112170811, + "loss": 0.84940416, + "num_input_tokens_seen": 96701040, + "router_z_loss_mlp": 0.49389648, + "step": 1163, + "time_per_iteration": 2.8056106567382812 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105253, + "balance_loss_mlp": 1.00322485, + "epoch": 0.22393228164678722, + "flos": 1273019542272.0, + "grad_norm": 0.035987441954907426, + "language_loss": 0.88180983, + "learning_rate": 0.0009045817326132876, + "loss": 0.89233518, + "num_input_tokens_seen": 96791200, + "router_z_loss_mlp": 0.49243164, + "step": 1164, + "time_per_iteration": 3.7020320892333984 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055012, + "balance_loss_mlp": 1.00575495, + "epoch": 0.22412466333205078, + "flos": 597468503040.0, + "grad_norm": 0.03371692057767332, + "language_loss": 0.84342653, + "learning_rate": 0.0009043985969369357, + "loss": 0.85397661, + "num_input_tokens_seen": 96869360, + "router_z_loss_mlp": 0.49145508, + "step": 1165, + "time_per_iteration": 2.8581626415252686 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052347, + "balance_loss_mlp": 1.00299454, + "epoch": 0.22431704501731436, + "flos": 609632019456.0, + "grad_norm": 0.03010954873673584, + "language_loss": 0.84869868, + "learning_rate": 0.0009042153042591245, + "loss": 0.85922217, + "num_input_tokens_seen": 96945840, + "router_z_loss_mlp": 0.49243164, + "step": 1166, + "time_per_iteration": 2.810300827026367 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054301, + "balance_loss_mlp": 1.0050199, + "epoch": 0.22450942670257792, + "flos": 908108190720.0, + "grad_norm": 0.030118647676053625, + "language_loss": 0.86120874, + "learning_rate": 0.0009040318546510146, + "loss": 0.87175173, + "num_input_tokens_seen": 97029296, + "router_z_loss_mlp": 0.49169922, + "step": 1167, + "time_per_iteration": 3.129802942276001 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01057032, + "balance_loss_mlp": 1.00791764, + "epoch": 0.22470180838784148, + "flos": 566381690880.0, + "grad_norm": 0.035718478093575166, + "language_loss": 0.85780692, + "learning_rate": 0.0009038482481838275, + "loss": 0.86837721, + "num_input_tokens_seen": 97097776, + "router_z_loss_mlp": 0.49047852, + "step": 1168, + "time_per_iteration": 2.674471855163574 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010578, + "balance_loss_mlp": 1.00880456, + "epoch": 0.22489419007310504, + "flos": 835918351872.0, + "grad_norm": 0.03078757560697398, + "language_loss": 0.88093269, + "learning_rate": 0.0009036644849288455, + "loss": 0.89151073, + "num_input_tokens_seen": 97181424, + "router_z_loss_mlp": 0.48925781, + "step": 1169, + "time_per_iteration": 3.126168727874756 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052146, + "balance_loss_mlp": 1.00331759, + "epoch": 0.2250865717583686, + "flos": 582139924992.0, + "grad_norm": 0.03503818002335677, + "language_loss": 0.86431491, + "learning_rate": 0.0009034805649574118, + "loss": 0.87483639, + "num_input_tokens_seen": 97252128, + "router_z_loss_mlp": 0.48779297, + "step": 1170, + "time_per_iteration": 2.6982839107513428 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01056949, + "balance_loss_mlp": 1.0084312, + "epoch": 0.22527895344363216, + "flos": 601671733248.0, + "grad_norm": 0.031992933731526396, + "language_loss": 0.85811341, + "learning_rate": 0.0009032964883409308, + "loss": 0.86868292, + "num_input_tokens_seen": 97326640, + "router_z_loss_mlp": 0.48510742, + "step": 1171, + "time_per_iteration": 2.9468932151794434 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055088, + "balance_loss_mlp": 1.00826263, + "epoch": 0.22547133512889572, + "flos": 1443734537472.0, + "grad_norm": 0.010800983830845337, + "language_loss": 0.73050535, + "learning_rate": 0.000903112255150867, + "loss": 0.7410562, + "num_input_tokens_seen": 97553952, + "router_z_loss_mlp": 0.46777344, + "step": 1172, + "time_per_iteration": 5.044191360473633 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105371, + "balance_loss_mlp": 1.0051204, + "epoch": 0.22566371681415928, + "flos": 491586703872.0, + "grad_norm": 0.034976527569036825, + "language_loss": 0.88142014, + "learning_rate": 0.0009029278654587462, + "loss": 0.89195722, + "num_input_tokens_seen": 97623584, + "router_z_loss_mlp": 0.48583984, + "step": 1173, + "time_per_iteration": 2.5891120433807373 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105611, + "balance_loss_mlp": 1.00749624, + "epoch": 0.22585609849942284, + "flos": 605752487424.0, + "grad_norm": 0.03629905495680353, + "language_loss": 0.82793885, + "learning_rate": 0.0009027433193361548, + "loss": 0.83850002, + "num_input_tokens_seen": 97695952, + "router_z_loss_mlp": 0.48583984, + "step": 1174, + "time_per_iteration": 2.707061290740967 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105926, + "balance_loss_mlp": 1.01064646, + "epoch": 0.22604848018468643, + "flos": 636728484096.0, + "grad_norm": 0.035409171913978986, + "language_loss": 0.87780964, + "learning_rate": 0.00090255861685474, + "loss": 0.88840234, + "num_input_tokens_seen": 97764544, + "router_z_loss_mlp": 0.48608398, + "step": 1175, + "time_per_iteration": 2.7910189628601074 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01056162, + "balance_loss_mlp": 1.00752461, + "epoch": 0.22624086186995, + "flos": 480845325312.0, + "grad_norm": 0.040136392489239156, + "language_loss": 0.91905487, + "learning_rate": 0.0009023737580862095, + "loss": 0.92961645, + "num_input_tokens_seen": 97830976, + "router_z_loss_mlp": 0.48632812, + "step": 1176, + "time_per_iteration": 2.5489909648895264 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054427, + "balance_loss_mlp": 1.00600469, + "epoch": 0.22643324355521355, + "flos": 496807693824.0, + "grad_norm": 0.032828642541270554, + "language_loss": 0.83966863, + "learning_rate": 0.0009021887431023321, + "loss": 0.85021293, + "num_input_tokens_seen": 97898800, + "router_z_loss_mlp": 0.48413086, + "step": 1177, + "time_per_iteration": 2.679046392440796 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060571, + "balance_loss_mlp": 1.01224387, + "epoch": 0.2266256252404771, + "flos": 562684905984.0, + "grad_norm": 0.03431341234676521, + "language_loss": 0.8836711, + "learning_rate": 0.0009020035719749369, + "loss": 0.89427686, + "num_input_tokens_seen": 97974112, + "router_z_loss_mlp": 0.4831543, + "step": 1178, + "time_per_iteration": 2.777273416519165 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053755, + "balance_loss_mlp": 1.00516534, + "epoch": 0.22681800692574067, + "flos": 581033703936.0, + "grad_norm": 0.0422995660898389, + "language_loss": 0.78512251, + "learning_rate": 0.0009018182447759136, + "loss": 0.79566014, + "num_input_tokens_seen": 98056640, + "router_z_loss_mlp": 0.48583984, + "step": 1179, + "time_per_iteration": 2.9779903888702393 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105363, + "balance_loss_mlp": 1.00508785, + "epoch": 0.22701038861100423, + "flos": 741466156800.0, + "grad_norm": 0.03672617722264385, + "language_loss": 0.80683887, + "learning_rate": 0.0009016327615772126, + "loss": 0.81737518, + "num_input_tokens_seen": 98135952, + "router_z_loss_mlp": 0.48535156, + "step": 1180, + "time_per_iteration": 2.953355312347412 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054935, + "balance_loss_mlp": 1.00636911, + "epoch": 0.2272027702962678, + "flos": 578306079744.0, + "grad_norm": 0.03924605706365315, + "language_loss": 0.88551408, + "learning_rate": 0.0009014471224508451, + "loss": 0.89606345, + "num_input_tokens_seen": 98204288, + "router_z_loss_mlp": 0.4855957, + "step": 1181, + "time_per_iteration": 2.7092630863189697 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01056081, + "balance_loss_mlp": 1.00744355, + "epoch": 0.22739515198153135, + "flos": 545291651328.0, + "grad_norm": 0.04038062834310644, + "language_loss": 0.83949769, + "learning_rate": 0.0009012613274688823, + "loss": 0.85005856, + "num_input_tokens_seen": 98269856, + "router_z_loss_mlp": 0.48632812, + "step": 1182, + "time_per_iteration": 2.642143964767456 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055555, + "balance_loss_mlp": 1.00689363, + "epoch": 0.22758753366679493, + "flos": 441092504832.0, + "grad_norm": 0.03566258536478163, + "language_loss": 0.88506091, + "learning_rate": 0.0009010753767034565, + "loss": 0.89561647, + "num_input_tokens_seen": 98335632, + "router_z_loss_mlp": 0.48632812, + "step": 1183, + "time_per_iteration": 2.599167585372925 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053804, + "balance_loss_mlp": 1.00526154, + "epoch": 0.2277799153520585, + "flos": 730824900096.0, + "grad_norm": 0.03354089847275564, + "language_loss": 0.79992342, + "learning_rate": 0.0009008892702267599, + "loss": 0.81046152, + "num_input_tokens_seen": 98420592, + "router_z_loss_mlp": 0.48535156, + "step": 1184, + "time_per_iteration": 2.9798924922943115 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01057068, + "balance_loss_mlp": 1.00855029, + "epoch": 0.22797229703732205, + "flos": 527913947904.0, + "grad_norm": 0.04184098346005727, + "language_loss": 0.89975739, + "learning_rate": 0.0009007030081110457, + "loss": 0.91032803, + "num_input_tokens_seen": 98488096, + "router_z_loss_mlp": 0.48510742, + "step": 1185, + "time_per_iteration": 2.6349968910217285 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01057531, + "balance_loss_mlp": 1.00910807, + "epoch": 0.2281646787225856, + "flos": 536521630464.0, + "grad_norm": 0.03583751901003141, + "language_loss": 0.85487026, + "learning_rate": 0.000900516590428627, + "loss": 0.86544555, + "num_input_tokens_seen": 98561664, + "router_z_loss_mlp": 0.48413086, + "step": 1186, + "time_per_iteration": 2.669015407562256 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054531, + "balance_loss_mlp": 1.00596476, + "epoch": 0.22835706040784917, + "flos": 542478478080.0, + "grad_norm": 0.03191556588332838, + "language_loss": 0.9033947, + "learning_rate": 0.0009003300172518778, + "loss": 0.91394001, + "num_input_tokens_seen": 98634336, + "router_z_loss_mlp": 0.4855957, + "step": 1187, + "time_per_iteration": 2.7164688110351562 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01056755, + "balance_loss_mlp": 1.00804579, + "epoch": 0.22854944209311273, + "flos": 792006042624.0, + "grad_norm": 0.0322044633529041, + "language_loss": 0.85374159, + "learning_rate": 0.0009001432886532321, + "loss": 0.86430913, + "num_input_tokens_seen": 98709600, + "router_z_loss_mlp": 0.48681641, + "step": 1188, + "time_per_iteration": 2.9621965885162354 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054799, + "balance_loss_mlp": 1.00568485, + "epoch": 0.2287418237783763, + "flos": 470216707584.0, + "grad_norm": 0.03536870053258389, + "language_loss": 0.87358034, + "learning_rate": 0.0008999564047051843, + "loss": 0.88412833, + "num_input_tokens_seen": 98775024, + "router_z_loss_mlp": 0.49047852, + "step": 1189, + "time_per_iteration": 2.5233154296875 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01058932, + "balance_loss_mlp": 1.01003218, + "epoch": 0.22893420546363985, + "flos": 469005507072.0, + "grad_norm": 0.030491923293758834, + "language_loss": 0.8554523, + "learning_rate": 0.0008997693654802894, + "loss": 0.86604154, + "num_input_tokens_seen": 98845248, + "router_z_loss_mlp": 0.48852539, + "step": 1190, + "time_per_iteration": 2.6391589641571045 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01058724, + "balance_loss_mlp": 1.00965738, + "epoch": 0.22912658714890344, + "flos": 627402440448.0, + "grad_norm": 0.0331512035559832, + "language_loss": 0.87166977, + "learning_rate": 0.0008995821710511625, + "loss": 0.88225698, + "num_input_tokens_seen": 98913584, + "router_z_loss_mlp": 0.49023438, + "step": 1191, + "time_per_iteration": 2.7549567222595215 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054803, + "balance_loss_mlp": 1.00599909, + "epoch": 0.229318968834167, + "flos": 504021428736.0, + "grad_norm": 0.030936804790582927, + "language_loss": 0.85688579, + "learning_rate": 0.0008993948214904786, + "loss": 0.86743385, + "num_input_tokens_seen": 98978608, + "router_z_loss_mlp": 0.48779297, + "step": 1192, + "time_per_iteration": 2.596224784851074 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061855, + "balance_loss_mlp": 1.01483917, + "epoch": 0.22951135051943056, + "flos": 1377716374272.0, + "grad_norm": 0.008909469382289665, + "language_loss": 0.78422213, + "learning_rate": 0.0008992073168709733, + "loss": 0.79484069, + "num_input_tokens_seen": 99207424, + "router_z_loss_mlp": 0.46972656, + "step": 1193, + "time_per_iteration": 4.853066921234131 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062271, + "balance_loss_mlp": 1.01356232, + "epoch": 0.22970373220469412, + "flos": 645550994688.0, + "grad_norm": 0.0389743097765726, + "language_loss": 0.78935194, + "learning_rate": 0.0008990196572654427, + "loss": 0.79997468, + "num_input_tokens_seen": 99290592, + "router_z_loss_mlp": 0.48681641, + "step": 1194, + "time_per_iteration": 2.8869853019714355 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01056276, + "balance_loss_mlp": 1.00771046, + "epoch": 0.22989611388995768, + "flos": 501273384192.0, + "grad_norm": 0.02988304738122761, + "language_loss": 0.88486552, + "learning_rate": 0.0008988318427467426, + "loss": 0.8954283, + "num_input_tokens_seen": 99366096, + "router_z_loss_mlp": 0.4855957, + "step": 1195, + "time_per_iteration": 2.6931521892547607 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053709, + "balance_loss_mlp": 1.00514269, + "epoch": 0.23008849557522124, + "flos": 1098334596864.0, + "grad_norm": 0.03694163801075408, + "language_loss": 0.87307864, + "learning_rate": 0.0008986438733877887, + "loss": 0.88361579, + "num_input_tokens_seen": 99456768, + "router_z_loss_mlp": 0.4855957, + "step": 1196, + "time_per_iteration": 3.4505865573883057 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053065, + "balance_loss_mlp": 1.00447488, + "epoch": 0.2302808772604848, + "flos": 684993722880.0, + "grad_norm": 0.030674764969734848, + "language_loss": 0.85086071, + "learning_rate": 0.0008984557492615576, + "loss": 0.86139137, + "num_input_tokens_seen": 99539616, + "router_z_loss_mlp": 0.48583984, + "step": 1197, + "time_per_iteration": 2.936891794204712 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01056985, + "balance_loss_mlp": 1.00837183, + "epoch": 0.23047325894574835, + "flos": 529961127936.0, + "grad_norm": 0.03469763625730159, + "language_loss": 0.90249604, + "learning_rate": 0.0008982674704410854, + "loss": 0.91306591, + "num_input_tokens_seen": 99612064, + "router_z_loss_mlp": 0.48608398, + "step": 1198, + "time_per_iteration": 2.6928677558898926 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055103, + "balance_loss_mlp": 1.00653744, + "epoch": 0.23066564063101191, + "flos": 684127607808.0, + "grad_norm": 0.03582939263118032, + "language_loss": 0.78263444, + "learning_rate": 0.0008980790369994682, + "loss": 0.79318547, + "num_input_tokens_seen": 99691040, + "router_z_loss_mlp": 0.4855957, + "step": 1199, + "time_per_iteration": 2.941063642501831 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105528, + "balance_loss_mlp": 1.00692904, + "epoch": 0.2308580223162755, + "flos": 559632605184.0, + "grad_norm": 0.03400437188822284, + "language_loss": 0.87868834, + "learning_rate": 0.000897890449009863, + "loss": 0.88924116, + "num_input_tokens_seen": 99762016, + "router_z_loss_mlp": 0.48339844, + "step": 1200, + "time_per_iteration": 2.6677346229553223 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01058779, + "balance_loss_mlp": 1.01061893, + "epoch": 0.23105040400153906, + "flos": 556730003712.0, + "grad_norm": 0.030515141355108834, + "language_loss": 0.90571141, + "learning_rate": 0.0008977017065454853, + "loss": 0.91629916, + "num_input_tokens_seen": 99835552, + "router_z_loss_mlp": 0.48144531, + "step": 1201, + "time_per_iteration": 2.7204995155334473 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053158, + "balance_loss_mlp": 1.00506902, + "epoch": 0.23124278568680262, + "flos": 706050714624.0, + "grad_norm": 0.034769733982414605, + "language_loss": 0.81452352, + "learning_rate": 0.0008975128096796121, + "loss": 0.82505512, + "num_input_tokens_seen": 99910784, + "router_z_loss_mlp": 0.48071289, + "step": 1202, + "time_per_iteration": 2.861058473587036 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105345, + "balance_loss_mlp": 1.00517035, + "epoch": 0.23143516737206618, + "flos": 613969397760.0, + "grad_norm": 0.03845725381901349, + "language_loss": 0.86815399, + "learning_rate": 0.0008973237584855794, + "loss": 0.87868845, + "num_input_tokens_seen": 99991120, + "router_z_loss_mlp": 0.48266602, + "step": 1203, + "time_per_iteration": 2.907670021057129 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055623, + "balance_loss_mlp": 1.00715244, + "epoch": 0.23162754905732974, + "flos": 390096718080.0, + "grad_norm": 0.03680581416715809, + "language_loss": 0.82972479, + "learning_rate": 0.0008971345530367832, + "loss": 0.84028101, + "num_input_tokens_seen": 100053888, + "router_z_loss_mlp": 0.48461914, + "step": 1204, + "time_per_iteration": 2.4500131607055664 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050803, + "balance_loss_mlp": 1.00190353, + "epoch": 0.2318199307425933, + "flos": 668970116352.0, + "grad_norm": 0.03636020946200237, + "language_loss": 0.86001658, + "learning_rate": 0.0008969451934066799, + "loss": 0.87052464, + "num_input_tokens_seen": 100124176, + "router_z_loss_mlp": 0.48828125, + "step": 1205, + "time_per_iteration": 2.786860704421997 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054481, + "balance_loss_mlp": 1.00558126, + "epoch": 0.23201231242785686, + "flos": 667628658432.0, + "grad_norm": 0.042825772722853955, + "language_loss": 0.80798173, + "learning_rate": 0.0008967556796687854, + "loss": 0.81852657, + "num_input_tokens_seen": 100205296, + "router_z_loss_mlp": 0.48852539, + "step": 1206, + "time_per_iteration": 2.9043900966644287 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106058, + "balance_loss_mlp": 1.01153755, + "epoch": 0.23220469411312042, + "flos": 750095226624.0, + "grad_norm": 0.036226897286377145, + "language_loss": 0.84918714, + "learning_rate": 0.0008965660118966752, + "loss": 0.85979295, + "num_input_tokens_seen": 100279440, + "router_z_loss_mlp": 0.48974609, + "step": 1207, + "time_per_iteration": 2.8989100456237793 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054609, + "balance_loss_mlp": 1.00597119, + "epoch": 0.232397075798384, + "flos": 668262448896.0, + "grad_norm": 0.03230217319227319, + "language_loss": 0.90859735, + "learning_rate": 0.0008963761901639851, + "loss": 0.91914344, + "num_input_tokens_seen": 100354512, + "router_z_loss_mlp": 0.48632812, + "step": 1208, + "time_per_iteration": 2.801715612411499 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050654, + "balance_loss_mlp": 1.00204051, + "epoch": 0.23258945748364757, + "flos": 611346753024.0, + "grad_norm": 0.038379048380249, + "language_loss": 0.83753544, + "learning_rate": 0.0008961862145444103, + "loss": 0.84804195, + "num_input_tokens_seen": 100426848, + "router_z_loss_mlp": 0.48608398, + "step": 1209, + "time_per_iteration": 2.6739237308502197 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105504, + "balance_loss_mlp": 1.00656986, + "epoch": 0.23278183916891113, + "flos": 490672956672.0, + "grad_norm": 0.04093378826068356, + "language_loss": 0.86382735, + "learning_rate": 0.0008959960851117059, + "loss": 0.87437773, + "num_input_tokens_seen": 100496176, + "router_z_loss_mlp": 0.48461914, + "step": 1210, + "time_per_iteration": 2.635650634765625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01056695, + "balance_loss_mlp": 1.00808144, + "epoch": 0.23297422085417469, + "flos": 512674798080.0, + "grad_norm": 0.0354403494585401, + "language_loss": 0.84509313, + "learning_rate": 0.0008958058019396868, + "loss": 0.85566002, + "num_input_tokens_seen": 100575072, + "router_z_loss_mlp": 0.48608398, + "step": 1211, + "time_per_iteration": 2.788318157196045 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105326, + "balance_loss_mlp": 1.00462246, + "epoch": 0.23316660253943824, + "flos": 547532272128.0, + "grad_norm": 0.03263062148431384, + "language_loss": 0.87462825, + "learning_rate": 0.0008956153651022274, + "loss": 0.8851608, + "num_input_tokens_seen": 100648304, + "router_z_loss_mlp": 0.48608398, + "step": 1212, + "time_per_iteration": 2.725313901901245 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105588, + "balance_loss_mlp": 1.00709951, + "epoch": 0.2333589842247018, + "flos": 511289598720.0, + "grad_norm": 0.03371055024816449, + "language_loss": 0.84886169, + "learning_rate": 0.0008954247746732618, + "loss": 0.85942048, + "num_input_tokens_seen": 100717616, + "router_z_loss_mlp": 0.48754883, + "step": 1213, + "time_per_iteration": 2.592165470123291 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01057864, + "balance_loss_mlp": 1.00894058, + "epoch": 0.23355136590996536, + "flos": 664407216384.0, + "grad_norm": 0.030798488974581865, + "language_loss": 0.9124192, + "learning_rate": 0.0008952340307267837, + "loss": 0.92299783, + "num_input_tokens_seen": 100797056, + "router_z_loss_mlp": 0.48876953, + "step": 1214, + "time_per_iteration": 2.887542724609375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051914, + "balance_loss_mlp": 1.00332439, + "epoch": 0.23374374759522892, + "flos": 509465995008.0, + "grad_norm": 0.038631928770240895, + "language_loss": 0.8442086, + "learning_rate": 0.0008950431333368468, + "loss": 0.85472775, + "num_input_tokens_seen": 100863632, + "router_z_loss_mlp": 0.48583984, + "step": 1215, + "time_per_iteration": 2.5713701248168945 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051571, + "balance_loss_mlp": 1.00283849, + "epoch": 0.2339361292804925, + "flos": 1296429915648.0, + "grad_norm": 0.03446682830311694, + "language_loss": 0.8584398, + "learning_rate": 0.0008948520825775634, + "loss": 0.86895549, + "num_input_tokens_seen": 100950272, + "router_z_loss_mlp": 0.48706055, + "step": 1216, + "time_per_iteration": 3.631596565246582 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054055, + "balance_loss_mlp": 1.00541723, + "epoch": 0.23412851096575607, + "flos": 707177344512.0, + "grad_norm": 0.031791306217448204, + "language_loss": 0.84468639, + "learning_rate": 0.0008946608785231067, + "loss": 0.85522687, + "num_input_tokens_seen": 101031008, + "router_z_loss_mlp": 0.48632812, + "step": 1217, + "time_per_iteration": 2.878099203109741 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053689, + "balance_loss_mlp": 1.00517046, + "epoch": 0.23432089265101963, + "flos": 439175582208.0, + "grad_norm": 0.03486793229645632, + "language_loss": 0.85493773, + "learning_rate": 0.0008944695212477084, + "loss": 0.86547458, + "num_input_tokens_seen": 101094688, + "router_z_loss_mlp": 0.48510742, + "step": 1218, + "time_per_iteration": 2.5141704082489014 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053338, + "balance_loss_mlp": 1.00498641, + "epoch": 0.2345132743362832, + "flos": 481915574784.0, + "grad_norm": 0.03047714423600347, + "language_loss": 0.87145793, + "learning_rate": 0.0008942780108256599, + "loss": 0.88199133, + "num_input_tokens_seen": 101163744, + "router_z_loss_mlp": 0.48339844, + "step": 1219, + "time_per_iteration": 2.6020901203155518 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050373, + "balance_loss_mlp": 1.00180733, + "epoch": 0.23470565602154675, + "flos": 412341577728.0, + "grad_norm": 0.03328064907126118, + "language_loss": 0.87382472, + "learning_rate": 0.0008940863473313121, + "loss": 0.88432848, + "num_input_tokens_seen": 101226480, + "router_z_loss_mlp": 0.4855957, + "step": 1220, + "time_per_iteration": 2.4561610221862793 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053637, + "balance_loss_mlp": 1.00483322, + "epoch": 0.2348980377068103, + "flos": 546500906496.0, + "grad_norm": 0.04239569524538178, + "language_loss": 0.88751769, + "learning_rate": 0.0008938945308390756, + "loss": 0.89805412, + "num_input_tokens_seen": 101291824, + "router_z_loss_mlp": 0.48779297, + "step": 1221, + "time_per_iteration": 2.657763719558716 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01057653, + "balance_loss_mlp": 1.00906336, + "epoch": 0.23509041939207387, + "flos": 576843112704.0, + "grad_norm": 0.04482007629740174, + "language_loss": 0.88039029, + "learning_rate": 0.00089370256142342, + "loss": 0.89096677, + "num_input_tokens_seen": 101367216, + "router_z_loss_mlp": 0.48583984, + "step": 1222, + "time_per_iteration": 2.7348928451538086 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054727, + "balance_loss_mlp": 1.00616074, + "epoch": 0.23528280107733743, + "flos": 589948566528.0, + "grad_norm": 0.030112791330182954, + "language_loss": 0.85687798, + "learning_rate": 0.0008935104391588746, + "loss": 0.86742526, + "num_input_tokens_seen": 101438992, + "router_z_loss_mlp": 0.4855957, + "step": 1223, + "time_per_iteration": 2.7620511054992676 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052122, + "balance_loss_mlp": 1.00350857, + "epoch": 0.235475182762601, + "flos": 824858132736.0, + "grad_norm": 0.028710207733723417, + "language_loss": 0.83630896, + "learning_rate": 0.0008933181641200276, + "loss": 0.84683019, + "num_input_tokens_seen": 101534464, + "router_z_loss_mlp": 0.48608398, + "step": 1224, + "time_per_iteration": 3.1587913036346436 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053568, + "balance_loss_mlp": 1.00531197, + "epoch": 0.23566756444786457, + "flos": 681367902720.0, + "grad_norm": 0.03430983930689064, + "language_loss": 0.86561936, + "learning_rate": 0.0008931257363815271, + "loss": 0.87615514, + "num_input_tokens_seen": 101616496, + "router_z_loss_mlp": 0.48242188, + "step": 1225, + "time_per_iteration": 2.9277396202087402 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01056611, + "balance_loss_mlp": 1.00849795, + "epoch": 0.23585994613312813, + "flos": 703135474176.0, + "grad_norm": 0.029906055234585397, + "language_loss": 0.90256047, + "learning_rate": 0.0008929331560180798, + "loss": 0.91312659, + "num_input_tokens_seen": 101694496, + "router_z_loss_mlp": 0.48095703, + "step": 1226, + "time_per_iteration": 2.911451578140259 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055064, + "balance_loss_mlp": 1.00676012, + "epoch": 0.2360523278183917, + "flos": 525196038912.0, + "grad_norm": 0.030679819106685022, + "language_loss": 0.9186613, + "learning_rate": 0.0008927404231044525, + "loss": 0.92921197, + "num_input_tokens_seen": 101766160, + "router_z_loss_mlp": 0.48291016, + "step": 1227, + "time_per_iteration": 2.6848785877227783 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055842, + "balance_loss_mlp": 1.00756276, + "epoch": 0.23624470950365525, + "flos": 525443914752.0, + "grad_norm": 0.030207709240370546, + "language_loss": 0.82286787, + "learning_rate": 0.0008925475377154703, + "loss": 0.83342624, + "num_input_tokens_seen": 101844160, + "router_z_loss_mlp": 0.48266602, + "step": 1228, + "time_per_iteration": 2.7278709411621094 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01058098, + "balance_loss_mlp": 1.00974643, + "epoch": 0.2364370911889188, + "flos": 597961342464.0, + "grad_norm": 0.04301213480645635, + "language_loss": 0.82405227, + "learning_rate": 0.0008923544999260183, + "loss": 0.83463323, + "num_input_tokens_seen": 101917968, + "router_z_loss_mlp": 0.48339844, + "step": 1229, + "time_per_iteration": 2.7282724380493164 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055873, + "balance_loss_mlp": 1.00766432, + "epoch": 0.23662947287418237, + "flos": 758173131264.0, + "grad_norm": 0.03660169780759576, + "language_loss": 0.92488217, + "learning_rate": 0.00089216130981104, + "loss": 0.93544096, + "num_input_tokens_seen": 101996880, + "router_z_loss_mlp": 0.48193359, + "step": 1230, + "time_per_iteration": 3.0333714485168457 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051461, + "balance_loss_mlp": 1.00337219, + "epoch": 0.23682185455944593, + "flos": 547208573952.0, + "grad_norm": 0.03138155314794734, + "language_loss": 0.83336782, + "learning_rate": 0.000891967967445539, + "loss": 0.8438825, + "num_input_tokens_seen": 102067936, + "router_z_loss_mlp": 0.48071289, + "step": 1231, + "time_per_iteration": 2.7093472480773926 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053937, + "balance_loss_mlp": 1.00587165, + "epoch": 0.2370142362447095, + "flos": 663523604736.0, + "grad_norm": 0.02795314572038805, + "language_loss": 0.89439881, + "learning_rate": 0.0008917744729045772, + "loss": 0.90493822, + "num_input_tokens_seen": 102147552, + "router_z_loss_mlp": 0.48046875, + "step": 1232, + "time_per_iteration": 2.8760838508605957 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01057809, + "balance_loss_mlp": 1.00974393, + "epoch": 0.23720661792997308, + "flos": 684913042944.0, + "grad_norm": 0.03460859048974857, + "language_loss": 0.8446126, + "learning_rate": 0.0008915808262632757, + "loss": 0.85519075, + "num_input_tokens_seen": 102224480, + "router_z_loss_mlp": 0.48046875, + "step": 1233, + "time_per_iteration": 2.889141321182251 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01058483, + "balance_loss_mlp": 1.01058459, + "epoch": 0.23739899961523664, + "flos": 560023377408.0, + "grad_norm": 0.03296017154749467, + "language_loss": 0.94079709, + "learning_rate": 0.0008913870275968148, + "loss": 0.95138192, + "num_input_tokens_seen": 102297392, + "router_z_loss_mlp": 0.47875977, + "step": 1234, + "time_per_iteration": 2.7432892322540283 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054598, + "balance_loss_mlp": 1.00655627, + "epoch": 0.2375913813005002, + "flos": 891165000960.0, + "grad_norm": 0.03128077017401229, + "language_loss": 0.88428569, + "learning_rate": 0.0008911930769804342, + "loss": 0.89483166, + "num_input_tokens_seen": 102386032, + "router_z_loss_mlp": 0.48022461, + "step": 1235, + "time_per_iteration": 3.261483669281006 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105528, + "balance_loss_mlp": 1.00692844, + "epoch": 0.23778376298576376, + "flos": 642366491136.0, + "grad_norm": 0.029107844015886564, + "language_loss": 0.91850013, + "learning_rate": 0.0008909989744894318, + "loss": 0.92905295, + "num_input_tokens_seen": 102463504, + "router_z_loss_mlp": 0.48339844, + "step": 1236, + "time_per_iteration": 2.8673832416534424 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061412, + "balance_loss_mlp": 1.01287031, + "epoch": 0.23797614467102732, + "flos": 617946139392.0, + "grad_norm": 0.034095811880077646, + "language_loss": 0.82566786, + "learning_rate": 0.0008908047201991649, + "loss": 0.83628196, + "num_input_tokens_seen": 102529632, + "router_z_loss_mlp": 0.48535156, + "step": 1237, + "time_per_iteration": 2.7810442447662354 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053588, + "balance_loss_mlp": 1.00511789, + "epoch": 0.23816852635629088, + "flos": 625464130560.0, + "grad_norm": 0.032663011960307756, + "language_loss": 0.87081301, + "learning_rate": 0.0008906103141850502, + "loss": 0.88134885, + "num_input_tokens_seen": 102610192, + "router_z_loss_mlp": 0.48461914, + "step": 1238, + "time_per_iteration": 2.880305528640747 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052354, + "balance_loss_mlp": 1.00416911, + "epoch": 0.23836090804155444, + "flos": 522441191424.0, + "grad_norm": 0.03474425243888252, + "language_loss": 0.88862967, + "learning_rate": 0.0008904157565225621, + "loss": 0.89915323, + "num_input_tokens_seen": 102681216, + "router_z_loss_mlp": 0.48168945, + "step": 1239, + "time_per_iteration": 2.648766040802002 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052394, + "balance_loss_mlp": 1.00423324, + "epoch": 0.238553289726818, + "flos": 1155855892992.0, + "grad_norm": 0.034399895266541865, + "language_loss": 0.82445645, + "learning_rate": 0.000890221047287235, + "loss": 0.83498037, + "num_input_tokens_seen": 102777184, + "router_z_loss_mlp": 0.48144531, + "step": 1240, + "time_per_iteration": 3.5001280307769775 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055, + "balance_loss_mlp": 1.00703037, + "epoch": 0.23874567141208156, + "flos": 500910802176.0, + "grad_norm": 0.03306053891413694, + "language_loss": 0.91726851, + "learning_rate": 0.0008900261865546615, + "loss": 0.92781848, + "num_input_tokens_seen": 102845744, + "router_z_loss_mlp": 0.47949219, + "step": 1241, + "time_per_iteration": 2.6465680599212646 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052437, + "balance_loss_mlp": 1.00418115, + "epoch": 0.23893805309734514, + "flos": 558050074368.0, + "grad_norm": 0.0354259641755878, + "language_loss": 0.85598528, + "learning_rate": 0.0008898311744004936, + "loss": 0.86650962, + "num_input_tokens_seen": 102918064, + "router_z_loss_mlp": 0.48242188, + "step": 1242, + "time_per_iteration": 2.7268829345703125 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053623, + "balance_loss_mlp": 1.0055337, + "epoch": 0.2391304347826087, + "flos": 550317255168.0, + "grad_norm": 0.0320494810853186, + "language_loss": 0.87574649, + "learning_rate": 0.0008896360109004414, + "loss": 0.88628268, + "num_input_tokens_seen": 102983920, + "router_z_loss_mlp": 0.48071289, + "step": 1243, + "time_per_iteration": 2.6199252605438232 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050337, + "balance_loss_mlp": 1.00222456, + "epoch": 0.23932281646787226, + "flos": 517079250432.0, + "grad_norm": 0.0302458656306059, + "language_loss": 0.85177696, + "learning_rate": 0.0008894406961302742, + "loss": 0.86228031, + "num_input_tokens_seen": 103053328, + "router_z_loss_mlp": 0.48095703, + "step": 1244, + "time_per_iteration": 2.604508876800537 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052152, + "balance_loss_mlp": 1.00411069, + "epoch": 0.23951519815313582, + "flos": 745002548736.0, + "grad_norm": 0.03429303167053761, + "language_loss": 0.84712255, + "learning_rate": 0.0008892452301658201, + "loss": 0.85764414, + "num_input_tokens_seen": 103128208, + "router_z_loss_mlp": 0.48022461, + "step": 1245, + "time_per_iteration": 2.924288272857666 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054436, + "balance_loss_mlp": 1.00651395, + "epoch": 0.23970757983839938, + "flos": 555175663104.0, + "grad_norm": 0.03219666617279603, + "language_loss": 0.84054452, + "learning_rate": 0.0008890496130829653, + "loss": 0.85108888, + "num_input_tokens_seen": 103197392, + "router_z_loss_mlp": 0.47900391, + "step": 1246, + "time_per_iteration": 2.6700189113616943 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052243, + "balance_loss_mlp": 1.00441635, + "epoch": 0.23989996152366294, + "flos": 481618121472.0, + "grad_norm": 0.033578246726411604, + "language_loss": 0.86002076, + "learning_rate": 0.0008888538449576555, + "loss": 0.87054318, + "num_input_tokens_seen": 103265328, + "router_z_loss_mlp": 0.47802734, + "step": 1247, + "time_per_iteration": 2.6269826889038086 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01057022, + "balance_loss_mlp": 1.00886118, + "epoch": 0.2400923432089265, + "flos": 486281143296.0, + "grad_norm": 0.03580496599340432, + "language_loss": 0.83572984, + "learning_rate": 0.0008886579258658944, + "loss": 0.84630001, + "num_input_tokens_seen": 103331632, + "router_z_loss_mlp": 0.48144531, + "step": 1248, + "time_per_iteration": 2.577885389328003 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054529, + "balance_loss_mlp": 1.0065589, + "epoch": 0.24028472489419006, + "flos": 624793401600.0, + "grad_norm": 0.03296142515540601, + "language_loss": 0.85843956, + "learning_rate": 0.0008884618558837446, + "loss": 0.86898482, + "num_input_tokens_seen": 103405408, + "router_z_loss_mlp": 0.47949219, + "step": 1249, + "time_per_iteration": 2.874666929244995 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01056867, + "balance_loss_mlp": 1.00870681, + "epoch": 0.24047710657945365, + "flos": 602809056768.0, + "grad_norm": 0.033943651692576245, + "language_loss": 0.87474859, + "learning_rate": 0.0008882656350873273, + "loss": 0.88531733, + "num_input_tokens_seen": 103487216, + "router_z_loss_mlp": 0.48144531, + "step": 1250, + "time_per_iteration": 2.8647053241729736 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055385, + "balance_loss_mlp": 1.00748658, + "epoch": 0.2406694882647172, + "flos": 843001829376.0, + "grad_norm": 0.04142560607115463, + "language_loss": 0.87984931, + "learning_rate": 0.0008880692635528219, + "loss": 0.89040315, + "num_input_tokens_seen": 103568640, + "router_z_loss_mlp": 0.47875977, + "step": 1251, + "time_per_iteration": 3.0643107891082764 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105149, + "balance_loss_mlp": 1.00352037, + "epoch": 0.24086186994998077, + "flos": 528135578880.0, + "grad_norm": 0.03337559285192523, + "language_loss": 0.90356189, + "learning_rate": 0.0008878727413564669, + "loss": 0.91407681, + "num_input_tokens_seen": 103640784, + "router_z_loss_mlp": 0.47949219, + "step": 1252, + "time_per_iteration": 2.7680115699768066 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053848, + "balance_loss_mlp": 1.00826263, + "epoch": 0.24105425163524433, + "flos": 1341462028800.0, + "grad_norm": 0.009196650126926217, + "language_loss": 0.80135596, + "learning_rate": 0.0008876760685745588, + "loss": 0.81189448, + "num_input_tokens_seen": 103865824, + "router_z_loss_mlp": 0.45507812, + "step": 1253, + "time_per_iteration": 4.858070135116577 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054767, + "balance_loss_mlp": 1.00698781, + "epoch": 0.24124663332050789, + "flos": 615228230400.0, + "grad_norm": 0.036740782431925904, + "language_loss": 0.79496801, + "learning_rate": 0.0008874792452834528, + "loss": 0.80551577, + "num_input_tokens_seen": 103939872, + "router_z_loss_mlp": 0.47753906, + "step": 1254, + "time_per_iteration": 2.756243944168091 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01057417, + "balance_loss_mlp": 1.00954247, + "epoch": 0.24143901500577145, + "flos": 576593291520.0, + "grad_norm": 0.037714132300224086, + "language_loss": 0.87880921, + "learning_rate": 0.0008872822715595626, + "loss": 0.88938332, + "num_input_tokens_seen": 104011120, + "router_z_loss_mlp": 0.47851562, + "step": 1255, + "time_per_iteration": 2.6718733310699463 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01056118, + "balance_loss_mlp": 1.00812411, + "epoch": 0.241631396691035, + "flos": 496147658496.0, + "grad_norm": 0.038695693582970765, + "language_loss": 0.87873089, + "learning_rate": 0.0008870851474793598, + "loss": 0.88929206, + "num_input_tokens_seen": 104077040, + "router_z_loss_mlp": 0.47973633, + "step": 1256, + "time_per_iteration": 2.6313350200653076 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01058027, + "balance_loss_mlp": 1.009866, + "epoch": 0.24182377837629856, + "flos": 637397267712.0, + "grad_norm": 0.03630749648984725, + "language_loss": 0.904266, + "learning_rate": 0.0008868878731193752, + "loss": 0.9148463, + "num_input_tokens_seen": 104150880, + "router_z_loss_mlp": 0.48144531, + "step": 1257, + "time_per_iteration": 2.820671558380127 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052967, + "balance_loss_mlp": 1.00509274, + "epoch": 0.24201616006156215, + "flos": 516350195712.0, + "grad_norm": 0.04098435374075245, + "language_loss": 0.90631104, + "learning_rate": 0.0008866904485561973, + "loss": 0.91684067, + "num_input_tokens_seen": 104223696, + "router_z_loss_mlp": 0.47851562, + "step": 1258, + "time_per_iteration": 2.712970495223999 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053405, + "balance_loss_mlp": 1.0053165, + "epoch": 0.2422085417468257, + "flos": 616379159808.0, + "grad_norm": 0.03199149634406808, + "language_loss": 0.83463258, + "learning_rate": 0.000886492873866473, + "loss": 0.84516662, + "num_input_tokens_seen": 104301728, + "router_z_loss_mlp": 0.48071289, + "step": 1259, + "time_per_iteration": 2.8250985145568848 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051296, + "balance_loss_mlp": 1.00330269, + "epoch": 0.24240092343208927, + "flos": 586913762304.0, + "grad_norm": 0.03973618931504764, + "language_loss": 0.85183978, + "learning_rate": 0.000886295149126908, + "loss": 0.86235273, + "num_input_tokens_seen": 104374480, + "router_z_loss_mlp": 0.47973633, + "step": 1260, + "time_per_iteration": 2.7110049724578857 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051234, + "balance_loss_mlp": 1.00338328, + "epoch": 0.24259330511735283, + "flos": 763572010752.0, + "grad_norm": 0.03275678482299809, + "language_loss": 0.86485362, + "learning_rate": 0.0008860972744142655, + "loss": 0.87536597, + "num_input_tokens_seen": 104452384, + "router_z_loss_mlp": 0.47827148, + "step": 1261, + "time_per_iteration": 2.9053289890289307 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051398, + "balance_loss_mlp": 1.00361907, + "epoch": 0.2427856868026164, + "flos": 628134407424.0, + "grad_norm": 0.03196094686024711, + "language_loss": 0.82455611, + "learning_rate": 0.0008858992498053671, + "loss": 0.83507007, + "num_input_tokens_seen": 104532576, + "router_z_loss_mlp": 0.47753906, + "step": 1262, + "time_per_iteration": 2.8111376762390137 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054321, + "balance_loss_mlp": 1.00797272, + "epoch": 0.24297806848787995, + "flos": 1514922167808.0, + "grad_norm": 0.010120346862694057, + "language_loss": 0.7658875, + "learning_rate": 0.0008857010753770934, + "loss": 0.77643073, + "num_input_tokens_seen": 104765216, + "router_z_loss_mlp": 0.46289062, + "step": 1263, + "time_per_iteration": 4.84857177734375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052202, + "balance_loss_mlp": 1.00420785, + "epoch": 0.2431704501731435, + "flos": 543073384704.0, + "grad_norm": 0.030775668427347653, + "language_loss": 0.83837479, + "learning_rate": 0.0008855027512063817, + "loss": 0.84889686, + "num_input_tokens_seen": 104836912, + "router_z_loss_mlp": 0.47973633, + "step": 1264, + "time_per_iteration": 2.69954252243042 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055608, + "balance_loss_mlp": 1.0077095, + "epoch": 0.24336283185840707, + "flos": 524879143680.0, + "grad_norm": 0.03906981412635217, + "language_loss": 0.86655742, + "learning_rate": 0.0008853042773702292, + "loss": 0.87711346, + "num_input_tokens_seen": 104909280, + "router_z_loss_mlp": 0.47875977, + "step": 1265, + "time_per_iteration": 2.703227996826172 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053016, + "balance_loss_mlp": 1.00530863, + "epoch": 0.24355521354367063, + "flos": 538206228480.0, + "grad_norm": 0.030917867079500824, + "language_loss": 0.88497615, + "learning_rate": 0.0008851056539456896, + "loss": 0.89550632, + "num_input_tokens_seen": 104982560, + "router_z_loss_mlp": 0.47680664, + "step": 1266, + "time_per_iteration": 2.6844840049743652 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054186, + "balance_loss_mlp": 1.00655031, + "epoch": 0.24374759522893422, + "flos": 932109580032.0, + "grad_norm": 0.032880300158599975, + "language_loss": 0.82697207, + "learning_rate": 0.0008849068810098755, + "loss": 0.83751392, + "num_input_tokens_seen": 105075056, + "router_z_loss_mlp": 0.47607422, + "step": 1267, + "time_per_iteration": 3.274641513824463 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055314, + "balance_loss_mlp": 1.00789249, + "epoch": 0.24393997691419778, + "flos": 428685970176.0, + "grad_norm": 0.04273651221625489, + "language_loss": 0.84108871, + "learning_rate": 0.0008847079586399575, + "loss": 0.85164183, + "num_input_tokens_seen": 105137536, + "router_z_loss_mlp": 0.47387695, + "step": 1268, + "time_per_iteration": 2.475217819213867 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01057264, + "balance_loss_mlp": 1.00993788, + "epoch": 0.24413235859946134, + "flos": 579943045632.0, + "grad_norm": 0.03463136192779687, + "language_loss": 0.86878628, + "learning_rate": 0.0008845088869131641, + "loss": 0.87935889, + "num_input_tokens_seen": 105204848, + "router_z_loss_mlp": 0.47290039, + "step": 1269, + "time_per_iteration": 2.676954746246338 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054221, + "balance_loss_mlp": 1.00689447, + "epoch": 0.2443247402847249, + "flos": 530901120000.0, + "grad_norm": 0.04739098518835349, + "language_loss": 0.8972156, + "learning_rate": 0.0008843096659067818, + "loss": 0.90775776, + "num_input_tokens_seen": 105273456, + "router_z_loss_mlp": 0.47290039, + "step": 1270, + "time_per_iteration": 2.6031625270843506 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01056384, + "balance_loss_mlp": 1.00896251, + "epoch": 0.24451712196998845, + "flos": 697625779200.0, + "grad_norm": 0.03005687387855686, + "language_loss": 0.8676796, + "learning_rate": 0.000884110295698155, + "loss": 0.87824345, + "num_input_tokens_seen": 105355488, + "router_z_loss_mlp": 0.47387695, + "step": 1271, + "time_per_iteration": 2.946385145187378 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052146, + "balance_loss_mlp": 1.00460577, + "epoch": 0.24470950365525201, + "flos": 530864181504.0, + "grad_norm": 0.03542850047119753, + "language_loss": 0.86657912, + "learning_rate": 0.0008839107763646861, + "loss": 0.87710059, + "num_input_tokens_seen": 105421568, + "router_z_loss_mlp": 0.47509766, + "step": 1272, + "time_per_iteration": 2.6175343990325928 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01057047, + "balance_loss_mlp": 1.00955379, + "epoch": 0.24490188534051557, + "flos": 492348806400.0, + "grad_norm": 0.04294337139782129, + "language_loss": 0.9099223, + "learning_rate": 0.0008837111079838353, + "loss": 0.92049271, + "num_input_tokens_seen": 105493072, + "router_z_loss_mlp": 0.47460938, + "step": 1273, + "time_per_iteration": 2.699777126312256 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051504, + "balance_loss_mlp": 1.00393975, + "epoch": 0.24509426702577913, + "flos": 475112054016.0, + "grad_norm": 0.03233839715385124, + "language_loss": 0.90686411, + "learning_rate": 0.000883511290633121, + "loss": 0.91737914, + "num_input_tokens_seen": 105559840, + "router_z_loss_mlp": 0.4753418, + "step": 1274, + "time_per_iteration": 2.5347506999969482 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053736, + "balance_loss_mlp": 1.0061239, + "epoch": 0.24528664871104272, + "flos": 551648019456.0, + "grad_norm": 0.029596958484994024, + "language_loss": 0.9283247, + "learning_rate": 0.000883311324390119, + "loss": 0.93886209, + "num_input_tokens_seen": 105634448, + "router_z_loss_mlp": 0.47583008, + "step": 1275, + "time_per_iteration": 2.7105162143707275 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105458, + "balance_loss_mlp": 1.00703931, + "epoch": 0.24547903039630628, + "flos": 827336914176.0, + "grad_norm": 0.04026092464880397, + "language_loss": 0.8227402, + "learning_rate": 0.0008831112093324629, + "loss": 0.83328599, + "num_input_tokens_seen": 105711936, + "router_z_loss_mlp": 0.47509766, + "step": 1276, + "time_per_iteration": 3.0518436431884766 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052804, + "balance_loss_mlp": 1.00523984, + "epoch": 0.24567141208156984, + "flos": 592694665728.0, + "grad_norm": 0.0350541873914122, + "language_loss": 0.89993191, + "learning_rate": 0.0008829109455378444, + "loss": 0.91045994, + "num_input_tokens_seen": 105780240, + "router_z_loss_mlp": 0.4753418, + "step": 1277, + "time_per_iteration": 2.705888032913208 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053461, + "balance_loss_mlp": 1.00606322, + "epoch": 0.2458637937668334, + "flos": 548930110464.0, + "grad_norm": 0.03225743101348484, + "language_loss": 0.87107539, + "learning_rate": 0.000882710533084013, + "loss": 0.88161004, + "num_input_tokens_seen": 105849840, + "router_z_loss_mlp": 0.47363281, + "step": 1278, + "time_per_iteration": 2.6600000858306885 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051675, + "balance_loss_mlp": 1.00418186, + "epoch": 0.24605617545209696, + "flos": 516912054528.0, + "grad_norm": 0.031446449457072034, + "language_loss": 0.89965951, + "learning_rate": 0.0008825099720487755, + "loss": 0.91017628, + "num_input_tokens_seen": 105921488, + "router_z_loss_mlp": 0.47460938, + "step": 1279, + "time_per_iteration": 2.6381545066833496 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059078, + "balance_loss_mlp": 1.01320648, + "epoch": 0.24624855713736052, + "flos": 1515061173504.0, + "grad_norm": 0.006597619453236458, + "language_loss": 0.7526114, + "learning_rate": 0.0008823092625099967, + "loss": 0.76320213, + "num_input_tokens_seen": 106146816, + "router_z_loss_mlp": 0.45800781, + "step": 1280, + "time_per_iteration": 4.836413621902466 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01056755, + "balance_loss_mlp": 1.0109787, + "epoch": 0.24644093882262408, + "flos": 1530749421312.0, + "grad_norm": 0.006438131933853504, + "language_loss": 0.77944112, + "learning_rate": 0.0008821084045455987, + "loss": 0.79000866, + "num_input_tokens_seen": 106361568, + "router_z_loss_mlp": 0.45703125, + "step": 1281, + "time_per_iteration": 4.763012409210205 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055673, + "balance_loss_mlp": 1.00817966, + "epoch": 0.24663332050788764, + "flos": 660349794816.0, + "grad_norm": 0.03366863359794558, + "language_loss": 0.89743239, + "learning_rate": 0.0008819073982335619, + "loss": 0.90798908, + "num_input_tokens_seen": 106435296, + "router_z_loss_mlp": 0.47460938, + "step": 1282, + "time_per_iteration": 2.830066204071045 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051734, + "balance_loss_mlp": 1.00426519, + "epoch": 0.24682570219315123, + "flos": 542806066944.0, + "grad_norm": 0.034270358372240205, + "language_loss": 0.85323066, + "learning_rate": 0.0008817062436519235, + "loss": 0.86374807, + "num_input_tokens_seen": 106507184, + "router_z_loss_mlp": 0.47436523, + "step": 1283, + "time_per_iteration": 2.6451101303100586 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054467, + "balance_loss_mlp": 1.00680709, + "epoch": 0.24701808387841478, + "flos": 441659221248.0, + "grad_norm": 0.03422998600893363, + "language_loss": 0.90367711, + "learning_rate": 0.0008815049408787788, + "loss": 0.91422176, + "num_input_tokens_seen": 106571472, + "router_z_loss_mlp": 0.47631836, + "step": 1284, + "time_per_iteration": 2.5568699836730957 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054482, + "balance_loss_mlp": 1.00672722, + "epoch": 0.24721046556367834, + "flos": 469033697280.0, + "grad_norm": 0.036620952447016124, + "language_loss": 0.86045629, + "learning_rate": 0.0008813034899922805, + "loss": 0.87100112, + "num_input_tokens_seen": 106638368, + "router_z_loss_mlp": 0.47729492, + "step": 1285, + "time_per_iteration": 2.5571885108947754 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052621, + "balance_loss_mlp": 1.00498545, + "epoch": 0.2474028472489419, + "flos": 505408573440.0, + "grad_norm": 0.03938899634346209, + "language_loss": 0.90811062, + "learning_rate": 0.0008811018910706387, + "loss": 0.91863692, + "num_input_tokens_seen": 106705312, + "router_z_loss_mlp": 0.47607422, + "step": 1286, + "time_per_iteration": 2.5542702674865723 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105262, + "balance_loss_mlp": 1.00496054, + "epoch": 0.24759522893420546, + "flos": 480956140800.0, + "grad_norm": 0.04329385189604929, + "language_loss": 0.82886434, + "learning_rate": 0.0008809001441921211, + "loss": 0.83939052, + "num_input_tokens_seen": 106778624, + "router_z_loss_mlp": 0.47631836, + "step": 1287, + "time_per_iteration": 2.7426302433013916 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01056359, + "balance_loss_mlp": 1.00879443, + "epoch": 0.24778761061946902, + "flos": 534754407168.0, + "grad_norm": 0.03495005483538565, + "language_loss": 0.86372733, + "learning_rate": 0.0008806982494350528, + "loss": 0.87429094, + "num_input_tokens_seen": 106847744, + "router_z_loss_mlp": 0.4753418, + "step": 1288, + "time_per_iteration": 2.6200613975524902 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054037, + "balance_loss_mlp": 1.0063771, + "epoch": 0.24797999230473258, + "flos": 560943927552.0, + "grad_norm": 0.028534619779485338, + "language_loss": 0.90820038, + "learning_rate": 0.0008804962068778161, + "loss": 0.91874075, + "num_input_tokens_seen": 106927584, + "router_z_loss_mlp": 0.47631836, + "step": 1289, + "time_per_iteration": 2.8445866107940674 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050603, + "balance_loss_mlp": 1.00287127, + "epoch": 0.24817237398999614, + "flos": 625481627136.0, + "grad_norm": 0.033144052318390974, + "language_loss": 0.81476247, + "learning_rate": 0.0008802940165988511, + "loss": 0.82526851, + "num_input_tokens_seen": 107006656, + "router_z_loss_mlp": 0.47705078, + "step": 1290, + "time_per_iteration": 2.874469518661499 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052663, + "balance_loss_mlp": 1.00500298, + "epoch": 0.2483647556752597, + "flos": 613485306624.0, + "grad_norm": 0.033485904546120666, + "language_loss": 0.88976955, + "learning_rate": 0.000880091678676655, + "loss": 0.90029621, + "num_input_tokens_seen": 107084352, + "router_z_loss_mlp": 0.47631836, + "step": 1291, + "time_per_iteration": 2.8294923305511475 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049235, + "balance_loss_mlp": 1.00159943, + "epoch": 0.2485571373605233, + "flos": 584688692736.0, + "grad_norm": 0.030875088012072577, + "language_loss": 0.89826584, + "learning_rate": 0.0008798891931897821, + "loss": 0.90875816, + "num_input_tokens_seen": 107158368, + "router_z_loss_mlp": 0.47607422, + "step": 1292, + "time_per_iteration": 2.7068471908569336 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050902, + "balance_loss_mlp": 1.00359952, + "epoch": 0.24874951904578685, + "flos": 495737444352.0, + "grad_norm": 0.03670876005724945, + "language_loss": 0.84959131, + "learning_rate": 0.0008796865602168447, + "loss": 0.86010033, + "num_input_tokens_seen": 107224256, + "router_z_loss_mlp": 0.47265625, + "step": 1293, + "time_per_iteration": 2.550218343734741 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052588, + "balance_loss_mlp": 1.00526226, + "epoch": 0.2489419007310504, + "flos": 457174437120.0, + "grad_norm": 0.03243940706171699, + "language_loss": 0.89144397, + "learning_rate": 0.0008794837798365115, + "loss": 0.90196991, + "num_input_tokens_seen": 107292720, + "router_z_loss_mlp": 0.47290039, + "step": 1294, + "time_per_iteration": 2.6271979808807373 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051557, + "balance_loss_mlp": 1.00420678, + "epoch": 0.24913428241631397, + "flos": 486565957632.0, + "grad_norm": 0.03268946967982851, + "language_loss": 0.89255542, + "learning_rate": 0.0008792808521275089, + "loss": 0.90307105, + "num_input_tokens_seen": 107368576, + "router_z_loss_mlp": 0.47314453, + "step": 1295, + "time_per_iteration": 2.733107566833496 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052983, + "balance_loss_mlp": 1.00544262, + "epoch": 0.24932666410157753, + "flos": 519918668544.0, + "grad_norm": 0.031266052737173484, + "language_loss": 0.88015056, + "learning_rate": 0.0008790777771686206, + "loss": 0.89068043, + "num_input_tokens_seen": 107433856, + "router_z_loss_mlp": 0.47509766, + "step": 1296, + "time_per_iteration": 2.5860161781311035 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053628, + "balance_loss_mlp": 1.0059917, + "epoch": 0.2495190457868411, + "flos": 473557713408.0, + "grad_norm": 0.03428757295266267, + "language_loss": 0.86048388, + "learning_rate": 0.0008788745550386872, + "loss": 0.8710202, + "num_input_tokens_seen": 107500944, + "router_z_loss_mlp": 0.47607422, + "step": 1297, + "time_per_iteration": 2.599851608276367 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055496, + "balance_loss_mlp": 1.00776434, + "epoch": 0.24971142747210465, + "flos": 747199428096.0, + "grad_norm": 0.03345883603952397, + "language_loss": 0.80858141, + "learning_rate": 0.0008786711858166063, + "loss": 0.81913638, + "num_input_tokens_seen": 107580000, + "router_z_loss_mlp": 0.47705078, + "step": 1298, + "time_per_iteration": 2.9357736110687256 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055455, + "balance_loss_mlp": 1.00770009, + "epoch": 0.2499038091573682, + "flos": 750903015936.0, + "grad_norm": 0.03503874681650984, + "language_loss": 0.84951854, + "learning_rate": 0.0008784676695813332, + "loss": 0.86007309, + "num_input_tokens_seen": 107660384, + "router_z_loss_mlp": 0.47729492, + "step": 1299, + "time_per_iteration": 2.955172538757324 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055374, + "balance_loss_mlp": 1.00776184, + "epoch": 0.2500961908426318, + "flos": 746344006656.0, + "grad_norm": 0.032686560936085865, + "language_loss": 0.85840905, + "learning_rate": 0.0008782640064118796, + "loss": 0.86896276, + "num_input_tokens_seen": 107736320, + "router_z_loss_mlp": 0.47583008, + "step": 1300, + "time_per_iteration": 2.897998571395874 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055206, + "balance_loss_mlp": 1.00904846, + "epoch": 0.2502885725278953, + "flos": 1420526353152.0, + "grad_norm": 0.0075534145797937526, + "language_loss": 0.7618475, + "learning_rate": 0.0008780601963873149, + "loss": 0.77239954, + "num_input_tokens_seen": 107972608, + "router_z_loss_mlp": 0.4609375, + "step": 1301, + "time_per_iteration": 5.023081541061401 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105668, + "balance_loss_mlp": 1.00904393, + "epoch": 0.2504809542131589, + "flos": 516232577280.0, + "grad_norm": 0.03748206036604932, + "language_loss": 0.87484509, + "learning_rate": 0.0008778562395867648, + "loss": 0.88541192, + "num_input_tokens_seen": 108043312, + "router_z_loss_mlp": 0.47607422, + "step": 1302, + "time_per_iteration": 2.593972682952881 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105251, + "balance_loss_mlp": 1.00477886, + "epoch": 0.25067333589842244, + "flos": 526852446720.0, + "grad_norm": 0.031223058919554587, + "language_loss": 0.84117836, + "learning_rate": 0.0008776521360894127, + "loss": 0.85170352, + "num_input_tokens_seen": 108114144, + "router_z_loss_mlp": 0.47705078, + "step": 1303, + "time_per_iteration": 2.6153149604797363 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069588, + "balance_loss_mlp": 1.02342987, + "epoch": 0.25086571758368603, + "flos": 1477160146944.0, + "grad_norm": 0.014969332736355754, + "language_loss": 0.78962064, + "learning_rate": 0.0008774478859744984, + "loss": 0.80031657, + "num_input_tokens_seen": 108338720, + "router_z_loss_mlp": 0.4609375, + "step": 1304, + "time_per_iteration": 4.792739629745483 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053769, + "balance_loss_mlp": 1.00649047, + "epoch": 0.2510580992689496, + "flos": 529403159808.0, + "grad_norm": 0.03453306909815573, + "language_loss": 0.91369265, + "learning_rate": 0.0008772434893213186, + "loss": 0.92423034, + "num_input_tokens_seen": 108405456, + "router_z_loss_mlp": 0.47241211, + "step": 1305, + "time_per_iteration": 2.581268072128296 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01056214, + "balance_loss_mlp": 1.00919807, + "epoch": 0.25125048095421315, + "flos": 518466395136.0, + "grad_norm": 0.035319884850533015, + "language_loss": 0.84733635, + "learning_rate": 0.0008770389462092276, + "loss": 0.85789847, + "num_input_tokens_seen": 108474368, + "router_z_loss_mlp": 0.46972656, + "step": 1306, + "time_per_iteration": 2.627317428588867 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01056702, + "balance_loss_mlp": 1.00951862, + "epoch": 0.25144286263947674, + "flos": 621675972096.0, + "grad_norm": 0.03558379494917989, + "language_loss": 0.87486076, + "learning_rate": 0.0008768342567176357, + "loss": 0.88542777, + "num_input_tokens_seen": 108548864, + "router_z_loss_mlp": 0.47143555, + "step": 1307, + "time_per_iteration": 2.787318706512451 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052534, + "balance_loss_mlp": 1.00537527, + "epoch": 0.25163524432474027, + "flos": 504866156544.0, + "grad_norm": 0.03616031366836922, + "language_loss": 0.9109531, + "learning_rate": 0.0008766294209260107, + "loss": 0.92147839, + "num_input_tokens_seen": 108623072, + "router_z_loss_mlp": 0.47119141, + "step": 1308, + "time_per_iteration": 2.6384546756744385 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105248, + "balance_loss_mlp": 1.00510657, + "epoch": 0.25182762601000386, + "flos": 510080343552.0, + "grad_norm": 0.03702737725286332, + "language_loss": 0.92033225, + "learning_rate": 0.0008764244389138767, + "loss": 0.93085706, + "num_input_tokens_seen": 108690128, + "router_z_loss_mlp": 0.47338867, + "step": 1309, + "time_per_iteration": 2.5620551109313965 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053435, + "balance_loss_mlp": 1.006037, + "epoch": 0.2520200076952674, + "flos": 635098321152.0, + "grad_norm": 0.03928250470986306, + "language_loss": 0.83104628, + "learning_rate": 0.000876219310760815, + "loss": 0.84158063, + "num_input_tokens_seen": 108770272, + "router_z_loss_mlp": 0.47363281, + "step": 1310, + "time_per_iteration": 2.886335849761963 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053423, + "balance_loss_mlp": 1.00614405, + "epoch": 0.252212389380531, + "flos": 495652873728.0, + "grad_norm": 0.03544669215118347, + "language_loss": 0.82256365, + "learning_rate": 0.0008760140365464631, + "loss": 0.83309782, + "num_input_tokens_seen": 108840592, + "router_z_loss_mlp": 0.47241211, + "step": 1311, + "time_per_iteration": 2.607191801071167 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053107, + "balance_loss_mlp": 1.00592351, + "epoch": 0.2524047710657945, + "flos": 491530323456.0, + "grad_norm": 0.037974131054051216, + "language_loss": 0.87817502, + "learning_rate": 0.0008758086163505156, + "loss": 0.88870609, + "num_input_tokens_seen": 108910064, + "router_z_loss_mlp": 0.47143555, + "step": 1312, + "time_per_iteration": 2.6121339797973633 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052408, + "balance_loss_mlp": 1.00505757, + "epoch": 0.2525971527510581, + "flos": 648613989120.0, + "grad_norm": 0.03226827566126977, + "language_loss": 0.90228277, + "learning_rate": 0.0008756030502527239, + "loss": 0.91280687, + "num_input_tokens_seen": 108986336, + "router_z_loss_mlp": 0.47314453, + "step": 1313, + "time_per_iteration": 2.8256115913391113 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049721, + "balance_loss_mlp": 1.00234711, + "epoch": 0.2527895344363217, + "flos": 570373983744.0, + "grad_norm": 0.0325160066751772, + "language_loss": 0.907884, + "learning_rate": 0.0008753973383328954, + "loss": 0.91838121, + "num_input_tokens_seen": 109059712, + "router_z_loss_mlp": 0.47338867, + "step": 1314, + "time_per_iteration": 2.722231388092041 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051583, + "balance_loss_mlp": 1.00423265, + "epoch": 0.2529819161215852, + "flos": 515069008896.0, + "grad_norm": 0.040482030139478604, + "language_loss": 0.8500945, + "learning_rate": 0.0008751914806708952, + "loss": 0.86061025, + "num_input_tokens_seen": 109127504, + "router_z_loss_mlp": 0.47314453, + "step": 1315, + "time_per_iteration": 2.593076229095459 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051184, + "balance_loss_mlp": 1.00376213, + "epoch": 0.2531742978068488, + "flos": 532351448064.0, + "grad_norm": 0.03414491036051862, + "language_loss": 0.82694548, + "learning_rate": 0.0008749854773466439, + "loss": 0.8374573, + "num_input_tokens_seen": 109198080, + "router_z_loss_mlp": 0.47387695, + "step": 1316, + "time_per_iteration": 2.660116672515869 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054789, + "balance_loss_mlp": 1.00722456, + "epoch": 0.25336667949211233, + "flos": 597748459776.0, + "grad_norm": 0.03206754273868493, + "language_loss": 0.84984171, + "learning_rate": 0.0008747793284401192, + "loss": 0.86038959, + "num_input_tokens_seen": 109268368, + "router_z_loss_mlp": 0.4753418, + "step": 1317, + "time_per_iteration": 2.692183017730713 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105185, + "balance_loss_mlp": 1.00407052, + "epoch": 0.2535590611773759, + "flos": 603256209408.0, + "grad_norm": 0.034288977750124294, + "language_loss": 0.85941386, + "learning_rate": 0.0008745730340313551, + "loss": 0.86993235, + "num_input_tokens_seen": 109344112, + "router_z_loss_mlp": 0.47753906, + "step": 1318, + "time_per_iteration": 2.7932682037353516 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105299, + "balance_loss_mlp": 1.00525868, + "epoch": 0.25375144286263945, + "flos": 496323602688.0, + "grad_norm": 0.035249055653748196, + "language_loss": 0.8522734, + "learning_rate": 0.0008743665942004422, + "loss": 0.86280334, + "num_input_tokens_seen": 109414112, + "router_z_loss_mlp": 0.47705078, + "step": 1319, + "time_per_iteration": 2.6616318225860596 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052413, + "balance_loss_mlp": 1.00465751, + "epoch": 0.25394382454790304, + "flos": 513477729792.0, + "grad_norm": 0.032623992793633046, + "language_loss": 0.93257391, + "learning_rate": 0.0008741600090275277, + "loss": 0.94309807, + "num_input_tokens_seen": 109484336, + "router_z_loss_mlp": 0.47729492, + "step": 1320, + "time_per_iteration": 2.567985773086548 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051086, + "balance_loss_mlp": 1.00333035, + "epoch": 0.25413620623316663, + "flos": 960856616448.0, + "grad_norm": 0.03465281335593922, + "language_loss": 0.8488484, + "learning_rate": 0.0008739532785928151, + "loss": 0.85935926, + "num_input_tokens_seen": 109590128, + "router_z_loss_mlp": 0.47729492, + "step": 1321, + "time_per_iteration": 3.4506430625915527 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054222, + "balance_loss_mlp": 1.00882721, + "epoch": 0.25432858791843016, + "flos": 1580651625984.0, + "grad_norm": 0.01348888133328934, + "language_loss": 0.74893582, + "learning_rate": 0.0008737464029765639, + "loss": 0.75947809, + "num_input_tokens_seen": 109816592, + "router_z_loss_mlp": 0.453125, + "step": 1322, + "time_per_iteration": 4.819811820983887 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055371, + "balance_loss_mlp": 1.00752044, + "epoch": 0.25452096960369375, + "flos": 584894772480.0, + "grad_norm": 0.03690210205672512, + "language_loss": 0.83839363, + "learning_rate": 0.0008735393822590908, + "loss": 0.84894735, + "num_input_tokens_seen": 109890464, + "router_z_loss_mlp": 0.47827148, + "step": 1323, + "time_per_iteration": 2.680769681930542 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069306, + "balance_loss_mlp": 1.02138364, + "epoch": 0.2547133512889573, + "flos": 509641939200.0, + "grad_norm": 0.03795743442729459, + "language_loss": 0.87760162, + "learning_rate": 0.0008733322165207681, + "loss": 0.8882947, + "num_input_tokens_seen": 109963408, + "router_z_loss_mlp": 0.47900391, + "step": 1324, + "time_per_iteration": 2.6391303539276123 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01056249, + "balance_loss_mlp": 1.00856507, + "epoch": 0.25490573297422087, + "flos": 784037008128.0, + "grad_norm": 0.03625483542623235, + "language_loss": 0.83670151, + "learning_rate": 0.0008731249058420247, + "loss": 0.84726399, + "num_input_tokens_seen": 110048800, + "router_z_loss_mlp": 0.4765625, + "step": 1325, + "time_per_iteration": 3.0179827213287354 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062074, + "balance_loss_mlp": 1.01479542, + "epoch": 0.2550981146594844, + "flos": 510953261568.0, + "grad_norm": 0.03728184694741104, + "language_loss": 0.91373062, + "learning_rate": 0.0008729174503033459, + "loss": 0.92435133, + "num_input_tokens_seen": 110118096, + "router_z_loss_mlp": 0.47241211, + "step": 1326, + "time_per_iteration": 2.644351005554199 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059853, + "balance_loss_mlp": 1.01262248, + "epoch": 0.255290496344748, + "flos": 677931632640.0, + "grad_norm": 0.04262364220636159, + "language_loss": 0.83700824, + "learning_rate": 0.0008727098499852728, + "loss": 0.84760678, + "num_input_tokens_seen": 110190160, + "router_z_loss_mlp": 0.47192383, + "step": 1327, + "time_per_iteration": 2.8393821716308594 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059289, + "balance_loss_mlp": 1.01212943, + "epoch": 0.2554828780300115, + "flos": 538985827584.0, + "grad_norm": 0.0346626903619469, + "language_loss": 0.90499496, + "learning_rate": 0.0008725021049684034, + "loss": 0.91558784, + "num_input_tokens_seen": 110268000, + "router_z_loss_mlp": 0.47119141, + "step": 1328, + "time_per_iteration": 2.74480938911438 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052564, + "balance_loss_mlp": 1.00554764, + "epoch": 0.2556752597152751, + "flos": 825624125952.0, + "grad_norm": 0.0321884383853499, + "language_loss": 0.83690739, + "learning_rate": 0.000872294215333391, + "loss": 0.84743297, + "num_input_tokens_seen": 110354816, + "router_z_loss_mlp": 0.46972656, + "step": 1329, + "time_per_iteration": 3.177448034286499 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066156, + "balance_loss_mlp": 1.01880646, + "epoch": 0.2558676414005387, + "flos": 571891385856.0, + "grad_norm": 0.037080167806849716, + "language_loss": 0.84060931, + "learning_rate": 0.0008720861811609457, + "loss": 0.85127091, + "num_input_tokens_seen": 110427968, + "router_z_loss_mlp": 0.47314453, + "step": 1330, + "time_per_iteration": 2.7320711612701416 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054633, + "balance_loss_mlp": 1.00745046, + "epoch": 0.2560600230858022, + "flos": 487748967936.0, + "grad_norm": 0.03498979971426328, + "language_loss": 0.84052318, + "learning_rate": 0.0008718780025318338, + "loss": 0.85106957, + "num_input_tokens_seen": 110501184, + "router_z_loss_mlp": 0.47143555, + "step": 1331, + "time_per_iteration": 2.7297112941741943 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053924, + "balance_loss_mlp": 1.00705111, + "epoch": 0.2562524047710658, + "flos": 514120268544.0, + "grad_norm": 0.03699782349212247, + "language_loss": 0.84697664, + "learning_rate": 0.0008716696795268771, + "loss": 0.85751587, + "num_input_tokens_seen": 110573008, + "router_z_loss_mlp": 0.46826172, + "step": 1332, + "time_per_iteration": 2.6615397930145264 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054318, + "balance_loss_mlp": 1.00756466, + "epoch": 0.25644478645632934, + "flos": 636110244864.0, + "grad_norm": 0.03600089626817585, + "language_loss": 0.85914254, + "learning_rate": 0.0008714612122269538, + "loss": 0.86968577, + "num_input_tokens_seen": 110646704, + "router_z_loss_mlp": 0.46704102, + "step": 1333, + "time_per_iteration": 2.849813938140869 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01056443, + "balance_loss_mlp": 1.00968957, + "epoch": 0.25663716814159293, + "flos": 437545419264.0, + "grad_norm": 0.03932780780666976, + "language_loss": 0.90516675, + "learning_rate": 0.0008712526007129982, + "loss": 0.91573119, + "num_input_tokens_seen": 110712208, + "router_z_loss_mlp": 0.46704102, + "step": 1334, + "time_per_iteration": 2.520730972290039 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053556, + "balance_loss_mlp": 1.00675464, + "epoch": 0.25682954982685646, + "flos": 499243700736.0, + "grad_norm": 0.03395243638019146, + "language_loss": 0.9133085, + "learning_rate": 0.0008710438450660003, + "loss": 0.9238441, + "num_input_tokens_seen": 110783936, + "router_z_loss_mlp": 0.4675293, + "step": 1335, + "time_per_iteration": 2.6936721801757812 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053065, + "balance_loss_mlp": 1.00590599, + "epoch": 0.25702193151212005, + "flos": 458628655872.0, + "grad_norm": 0.038911849114865095, + "language_loss": 0.8791827, + "learning_rate": 0.0008708349453670064, + "loss": 0.88971329, + "num_input_tokens_seen": 110848560, + "router_z_loss_mlp": 0.47119141, + "step": 1336, + "time_per_iteration": 2.520390510559082 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074594, + "balance_loss_mlp": 1.02733934, + "epoch": 0.2572143131973836, + "flos": 599404867584.0, + "grad_norm": 0.03723585257139378, + "language_loss": 0.92015922, + "learning_rate": 0.0008706259016971185, + "loss": 0.93090516, + "num_input_tokens_seen": 110922672, + "router_z_loss_mlp": 0.47216797, + "step": 1337, + "time_per_iteration": 2.792436361312866 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055792, + "balance_loss_mlp": 1.00872791, + "epoch": 0.25740669488264717, + "flos": 699527150592.0, + "grad_norm": 0.04259016947882448, + "language_loss": 0.8355068, + "learning_rate": 0.0008704167141374944, + "loss": 0.84606469, + "num_input_tokens_seen": 110995456, + "router_z_loss_mlp": 0.47021484, + "step": 1338, + "time_per_iteration": 2.806931972503662 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01056758, + "balance_loss_mlp": 1.01014686, + "epoch": 0.25759907656791076, + "flos": 503378889984.0, + "grad_norm": 0.03686560218677495, + "language_loss": 0.88890558, + "learning_rate": 0.0008702073827693482, + "loss": 0.89947319, + "num_input_tokens_seen": 111069568, + "router_z_loss_mlp": 0.46557617, + "step": 1339, + "time_per_iteration": 2.7613115310668945 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01057595, + "balance_loss_mlp": 1.01112759, + "epoch": 0.2577914582531743, + "flos": 775242687744.0, + "grad_norm": 0.03484469931885578, + "language_loss": 0.89865053, + "learning_rate": 0.0008699979076739494, + "loss": 0.90922654, + "num_input_tokens_seen": 111142608, + "router_z_loss_mlp": 0.46411133, + "step": 1340, + "time_per_iteration": 2.9694418907165527 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052163, + "balance_loss_mlp": 1.00552797, + "epoch": 0.2579838399384379, + "flos": 460610707200.0, + "grad_norm": 0.04216529081594553, + "language_loss": 0.89380765, + "learning_rate": 0.0008697882889326234, + "loss": 0.9043293, + "num_input_tokens_seen": 111206336, + "router_z_loss_mlp": 0.46582031, + "step": 1341, + "time_per_iteration": 2.5050456523895264 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051599, + "balance_loss_mlp": 1.00482166, + "epoch": 0.2581762216237014, + "flos": 570263168256.0, + "grad_norm": 0.03742337984590145, + "language_loss": 0.87203884, + "learning_rate": 0.0008695785266267515, + "loss": 0.88255489, + "num_input_tokens_seen": 111276736, + "router_z_loss_mlp": 0.46728516, + "step": 1342, + "time_per_iteration": 2.677072763442993 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01057516, + "balance_loss_mlp": 1.01069069, + "epoch": 0.258368603308965, + "flos": 605387960064.0, + "grad_norm": 0.035138016776099276, + "language_loss": 0.83827055, + "learning_rate": 0.0008693686208377704, + "loss": 0.84884572, + "num_input_tokens_seen": 111353856, + "router_z_loss_mlp": 0.46777344, + "step": 1343, + "time_per_iteration": 2.826026439666748 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054134, + "balance_loss_mlp": 1.0075947, + "epoch": 0.2585609849942285, + "flos": 492487812096.0, + "grad_norm": 0.03194520317053949, + "language_loss": 0.89379156, + "learning_rate": 0.0008691585716471733, + "loss": 0.90433288, + "num_input_tokens_seen": 111424960, + "router_z_loss_mlp": 0.46484375, + "step": 1344, + "time_per_iteration": 2.6379647254943848 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053033, + "balance_loss_mlp": 1.00646937, + "epoch": 0.2587533666794921, + "flos": 641958222336.0, + "grad_norm": 0.03185107281306307, + "language_loss": 0.86602217, + "learning_rate": 0.0008689483791365079, + "loss": 0.87655246, + "num_input_tokens_seen": 111505248, + "router_z_loss_mlp": 0.46508789, + "step": 1345, + "time_per_iteration": 2.8372344970703125 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105515, + "balance_loss_mlp": 1.00868249, + "epoch": 0.2589457483647557, + "flos": 577995987456.0, + "grad_norm": 0.038033594557881883, + "language_loss": 0.90178049, + "learning_rate": 0.0008687380433873786, + "loss": 0.91233194, + "num_input_tokens_seen": 111581936, + "router_z_loss_mlp": 0.46411133, + "step": 1346, + "time_per_iteration": 2.7660248279571533 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105286, + "balance_loss_mlp": 1.00636888, + "epoch": 0.25913813005001923, + "flos": 536467195392.0, + "grad_norm": 0.03823400300780179, + "language_loss": 0.83192778, + "learning_rate": 0.0008685275644814448, + "loss": 0.8424564, + "num_input_tokens_seen": 111651456, + "router_z_loss_mlp": 0.46435547, + "step": 1347, + "time_per_iteration": 2.6657776832580566 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01058039, + "balance_loss_mlp": 1.01118934, + "epoch": 0.2593305117352828, + "flos": 722347474944.0, + "grad_norm": 0.04308500968206218, + "language_loss": 0.85215819, + "learning_rate": 0.0008683169425004216, + "loss": 0.86273861, + "num_input_tokens_seen": 111731712, + "router_z_loss_mlp": 0.46801758, + "step": 1348, + "time_per_iteration": 2.8938682079315186 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067463, + "balance_loss_mlp": 1.02058995, + "epoch": 0.25952289342054635, + "flos": 711356275200.0, + "grad_norm": 0.04420512127692048, + "language_loss": 0.84604859, + "learning_rate": 0.0008681061775260799, + "loss": 0.85672331, + "num_input_tokens_seen": 111800752, + "router_z_loss_mlp": 0.46826172, + "step": 1349, + "time_per_iteration": 2.8803627490997314 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105274, + "balance_loss_mlp": 1.00634348, + "epoch": 0.25971527510580994, + "flos": 456850738944.0, + "grad_norm": 0.03368144531989068, + "language_loss": 0.92376006, + "learning_rate": 0.0008678952696402458, + "loss": 0.93428755, + "num_input_tokens_seen": 111866752, + "router_z_loss_mlp": 0.46337891, + "step": 1350, + "time_per_iteration": 2.5544798374176025 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054358, + "balance_loss_mlp": 1.00824761, + "epoch": 0.25990765679107347, + "flos": 613754569728.0, + "grad_norm": 0.03011764192417466, + "language_loss": 0.87159944, + "learning_rate": 0.000867684218924801, + "loss": 0.88214302, + "num_input_tokens_seen": 111951328, + "router_z_loss_mlp": 0.46044922, + "step": 1351, + "time_per_iteration": 2.856372833251953 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069405, + "balance_loss_mlp": 1.02496338, + "epoch": 0.26010003847633706, + "flos": 1541407196160.0, + "grad_norm": 0.012951365709411706, + "language_loss": 0.78947091, + "learning_rate": 0.0008674730254616827, + "loss": 0.80016494, + "num_input_tokens_seen": 112182272, + "router_z_loss_mlp": 0.4453125, + "step": 1352, + "time_per_iteration": 4.943616628646851 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01058433, + "balance_loss_mlp": 1.01194191, + "epoch": 0.2602924201616006, + "flos": 717545447424.0, + "grad_norm": 0.029832851456929797, + "language_loss": 0.85926312, + "learning_rate": 0.0008672616893328834, + "loss": 0.86984742, + "num_input_tokens_seen": 112261760, + "router_z_loss_mlp": 0.46435547, + "step": 1353, + "time_per_iteration": 2.913235664367676 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01056755, + "balance_loss_mlp": 1.01012051, + "epoch": 0.2604848018468642, + "flos": 644686824960.0, + "grad_norm": 0.03749633937906014, + "language_loss": 0.91143578, + "learning_rate": 0.0008670502106204512, + "loss": 0.92200339, + "num_input_tokens_seen": 112339136, + "router_z_loss_mlp": 0.46582031, + "step": 1354, + "time_per_iteration": 2.821753978729248 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091569, + "balance_loss_mlp": 1.0442189, + "epoch": 0.26067718353212777, + "flos": 518038684416.0, + "grad_norm": 0.04686611644365056, + "language_loss": 0.82400739, + "learning_rate": 0.0008668385894064892, + "loss": 0.83492303, + "num_input_tokens_seen": 112409872, + "router_z_loss_mlp": 0.47314453, + "step": 1355, + "time_per_iteration": 2.642392158508301 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01056006, + "balance_loss_mlp": 1.00925195, + "epoch": 0.2608695652173913, + "flos": 824226287616.0, + "grad_norm": 0.03313451231790272, + "language_loss": 0.89331532, + "learning_rate": 0.0008666268257731562, + "loss": 0.90387547, + "num_input_tokens_seen": 112495616, + "router_z_loss_mlp": 0.46704102, + "step": 1356, + "time_per_iteration": 3.1127805709838867 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060563, + "balance_loss_mlp": 1.01414335, + "epoch": 0.2610619469026549, + "flos": 1009450422528.0, + "grad_norm": 0.04035878870854939, + "language_loss": 0.86687934, + "learning_rate": 0.0008664149198026662, + "loss": 0.87748504, + "num_input_tokens_seen": 112575168, + "router_z_loss_mlp": 0.46362305, + "step": 1357, + "time_per_iteration": 3.2328455448150635 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106616, + "balance_loss_mlp": 1.01971614, + "epoch": 0.2612543285879184, + "flos": 537826149888.0, + "grad_norm": 0.03943672852684058, + "language_loss": 0.8952527, + "learning_rate": 0.0008662028715772883, + "loss": 0.90591431, + "num_input_tokens_seen": 112648480, + "router_z_loss_mlp": 0.46386719, + "step": 1358, + "time_per_iteration": 2.621894359588623 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01058771, + "balance_loss_mlp": 1.01213586, + "epoch": 0.261446710273182, + "flos": 520439698176.0, + "grad_norm": 0.03590038892764462, + "language_loss": 0.86476588, + "learning_rate": 0.0008659906811793467, + "loss": 0.87535357, + "num_input_tokens_seen": 112719856, + "router_z_loss_mlp": 0.46582031, + "step": 1359, + "time_per_iteration": 2.6540629863739014 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054238, + "balance_loss_mlp": 1.00741243, + "epoch": 0.26163909195844554, + "flos": 584399987712.0, + "grad_norm": 0.03384500135634075, + "language_loss": 0.90458202, + "learning_rate": 0.0008657783486912215, + "loss": 0.91512442, + "num_input_tokens_seen": 112795088, + "router_z_loss_mlp": 0.46777344, + "step": 1360, + "time_per_iteration": 2.71598744392395 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063542, + "balance_loss_mlp": 1.01626348, + "epoch": 0.2618314736437091, + "flos": 960369613056.0, + "grad_norm": 0.03695926115068694, + "language_loss": 0.90376949, + "learning_rate": 0.0008655658741953472, + "loss": 0.91440493, + "num_input_tokens_seen": 112879888, + "router_z_loss_mlp": 0.47241211, + "step": 1361, + "time_per_iteration": 3.233081102371216 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061537, + "balance_loss_mlp": 1.01413929, + "epoch": 0.26202385532897265, + "flos": 575903120640.0, + "grad_norm": 0.032102410789184695, + "language_loss": 0.892542, + "learning_rate": 0.0008653532577742136, + "loss": 0.90315735, + "num_input_tokens_seen": 112952208, + "router_z_loss_mlp": 0.47363281, + "step": 1362, + "time_per_iteration": 2.671513319015503 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053634, + "balance_loss_mlp": 1.00673676, + "epoch": 0.26221623701423624, + "flos": 446398065408.0, + "grad_norm": 0.034188430773875136, + "language_loss": 0.88125902, + "learning_rate": 0.0008651404995103659, + "loss": 0.8917954, + "num_input_tokens_seen": 113017472, + "router_z_loss_mlp": 0.46850586, + "step": 1363, + "time_per_iteration": 2.5599000453948975 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064691, + "balance_loss_mlp": 1.01803255, + "epoch": 0.26240861869949983, + "flos": 536755900416.0, + "grad_norm": 0.03309695956224158, + "language_loss": 0.87925225, + "learning_rate": 0.0008649275994864041, + "loss": 0.88989913, + "num_input_tokens_seen": 113090000, + "router_z_loss_mlp": 0.46606445, + "step": 1364, + "time_per_iteration": 2.68673038482666 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061602, + "balance_loss_mlp": 1.01472914, + "epoch": 0.26260100038476336, + "flos": 566488615680.0, + "grad_norm": 0.0327166713474878, + "language_loss": 0.84653741, + "learning_rate": 0.0008647145577849834, + "loss": 0.85715348, + "num_input_tokens_seen": 113169424, + "router_z_loss_mlp": 0.46826172, + "step": 1365, + "time_per_iteration": 2.8294174671173096 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061641, + "balance_loss_mlp": 1.01471996, + "epoch": 0.26279338207002695, + "flos": 614321286144.0, + "grad_norm": 0.027467777319160957, + "language_loss": 0.83391041, + "learning_rate": 0.0008645013744888139, + "loss": 0.84452683, + "num_input_tokens_seen": 113256752, + "router_z_loss_mlp": 0.46875, + "step": 1366, + "time_per_iteration": 2.845019578933716 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059667, + "balance_loss_mlp": 1.01238823, + "epoch": 0.2629857637552905, + "flos": 523945954560.0, + "grad_norm": 0.034051307399065846, + "language_loss": 0.88423878, + "learning_rate": 0.0008642880496806607, + "loss": 0.89483547, + "num_input_tokens_seen": 113330512, + "router_z_loss_mlp": 0.47241211, + "step": 1367, + "time_per_iteration": 2.7665200233459473 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065935, + "balance_loss_mlp": 1.01832283, + "epoch": 0.26317814544055407, + "flos": 535655515392.0, + "grad_norm": 0.03476637042829631, + "language_loss": 0.85672963, + "learning_rate": 0.0008640745834433437, + "loss": 0.86738896, + "num_input_tokens_seen": 113409088, + "router_z_loss_mlp": 0.47583008, + "step": 1368, + "time_per_iteration": 2.7824857234954834 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105712, + "balance_loss_mlp": 1.00967455, + "epoch": 0.2633705271258176, + "flos": 556780548096.0, + "grad_norm": 0.035052832704740904, + "language_loss": 0.8778615, + "learning_rate": 0.000863860975859738, + "loss": 0.88843262, + "num_input_tokens_seen": 113486624, + "router_z_loss_mlp": 0.47412109, + "step": 1369, + "time_per_iteration": 2.938157796859741 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059209, + "balance_loss_mlp": 1.01214516, + "epoch": 0.2635629088110812, + "flos": 553462874880.0, + "grad_norm": 0.04030614296387141, + "language_loss": 0.89190161, + "learning_rate": 0.0008636472270127733, + "loss": 0.90249372, + "num_input_tokens_seen": 113555776, + "router_z_loss_mlp": 0.47021484, + "step": 1370, + "time_per_iteration": 2.6449878215789795 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105746, + "balance_loss_mlp": 1.0106585, + "epoch": 0.2637552904963448, + "flos": 456915867648.0, + "grad_norm": 0.03827203709322554, + "language_loss": 0.91134202, + "learning_rate": 0.0008634333369854345, + "loss": 0.9219166, + "num_input_tokens_seen": 113624208, + "router_z_loss_mlp": 0.4675293, + "step": 1371, + "time_per_iteration": 2.6090121269226074 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053294, + "balance_loss_mlp": 1.00642049, + "epoch": 0.2639476721816083, + "flos": 614260048128.0, + "grad_norm": 0.03299961926418253, + "language_loss": 0.88250023, + "learning_rate": 0.0008632193058607608, + "loss": 0.89303321, + "num_input_tokens_seen": 113698544, + "router_z_loss_mlp": 0.46826172, + "step": 1372, + "time_per_iteration": 2.6980674266815186 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052684, + "balance_loss_mlp": 1.00562024, + "epoch": 0.2641400538668719, + "flos": 573026764032.0, + "grad_norm": 0.03659842444989107, + "language_loss": 0.81553382, + "learning_rate": 0.0008630051337218466, + "loss": 0.82606065, + "num_input_tokens_seen": 113769024, + "router_z_loss_mlp": 0.47021484, + "step": 1373, + "time_per_iteration": 2.6634395122528076 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01056457, + "balance_loss_mlp": 1.00960791, + "epoch": 0.2643324355521354, + "flos": 583340431872.0, + "grad_norm": 0.03511173854729822, + "language_loss": 0.82885635, + "learning_rate": 0.0008627908206518409, + "loss": 0.83942091, + "num_input_tokens_seen": 113836320, + "router_z_loss_mlp": 0.46801758, + "step": 1374, + "time_per_iteration": 2.6550941467285156 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055428, + "balance_loss_mlp": 1.01022339, + "epoch": 0.264524817237399, + "flos": 1548027969792.0, + "grad_norm": 0.005864236448565476, + "language_loss": 0.75151253, + "learning_rate": 0.0008625763667339472, + "loss": 0.76206684, + "num_input_tokens_seen": 114065040, + "router_z_loss_mlp": 0.45117188, + "step": 1375, + "time_per_iteration": 4.995543718338013 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01058447, + "balance_loss_mlp": 1.01197898, + "epoch": 0.26471719892266254, + "flos": 519043805184.0, + "grad_norm": 0.03321674595186757, + "language_loss": 0.92123759, + "learning_rate": 0.0008623617720514241, + "loss": 0.93182206, + "num_input_tokens_seen": 114133488, + "router_z_loss_mlp": 0.46411133, + "step": 1376, + "time_per_iteration": 2.592569351196289 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061616, + "balance_loss_mlp": 1.0151242, + "epoch": 0.26490958060792613, + "flos": 518205880320.0, + "grad_norm": 0.036665073764434085, + "language_loss": 0.85824203, + "learning_rate": 0.0008621470366875848, + "loss": 0.8688581, + "num_input_tokens_seen": 114200704, + "router_z_loss_mlp": 0.46435547, + "step": 1377, + "time_per_iteration": 2.5636963844299316 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054184, + "balance_loss_mlp": 1.00766897, + "epoch": 0.26510196229318966, + "flos": 597683331072.0, + "grad_norm": 0.03396624681403314, + "language_loss": 0.88501984, + "learning_rate": 0.0008619321607257966, + "loss": 0.8955617, + "num_input_tokens_seen": 114272160, + "router_z_loss_mlp": 0.46459961, + "step": 1378, + "time_per_iteration": 2.687581777572632 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01056779, + "balance_loss_mlp": 1.010144, + "epoch": 0.26529434397845325, + "flos": 687053541888.0, + "grad_norm": 0.031207845572821406, + "language_loss": 0.82550275, + "learning_rate": 0.000861717144249482, + "loss": 0.83607054, + "num_input_tokens_seen": 114347904, + "router_z_loss_mlp": 0.46582031, + "step": 1379, + "time_per_iteration": 2.8333678245544434 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054921, + "balance_loss_mlp": 1.00819123, + "epoch": 0.26548672566371684, + "flos": 425260393728.0, + "grad_norm": 0.03047521662480035, + "language_loss": 0.90854567, + "learning_rate": 0.0008615019873421175, + "loss": 0.91909492, + "num_input_tokens_seen": 114409952, + "router_z_loss_mlp": 0.46679688, + "step": 1380, + "time_per_iteration": 2.47892689704895 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051055, + "balance_loss_mlp": 1.00437295, + "epoch": 0.26567910734898037, + "flos": 490850846208.0, + "grad_norm": 0.03515354974137605, + "language_loss": 0.8636173, + "learning_rate": 0.0008612866900872349, + "loss": 0.87412781, + "num_input_tokens_seen": 114474832, + "router_z_loss_mlp": 0.46630859, + "step": 1381, + "time_per_iteration": 2.558497428894043 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055093, + "balance_loss_mlp": 1.00833893, + "epoch": 0.26587148903424396, + "flos": 535229750016.0, + "grad_norm": 0.033124361732310995, + "language_loss": 0.88441265, + "learning_rate": 0.0008610712525684197, + "loss": 0.89496362, + "num_input_tokens_seen": 114545152, + "router_z_loss_mlp": 0.46704102, + "step": 1382, + "time_per_iteration": 2.6567015647888184 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01056513, + "balance_loss_mlp": 1.00997365, + "epoch": 0.2660638707195075, + "flos": 1019056422912.0, + "grad_norm": 0.038309225150243896, + "language_loss": 0.84641987, + "learning_rate": 0.0008608556748693121, + "loss": 0.85698497, + "num_input_tokens_seen": 114626512, + "router_z_loss_mlp": 0.46484375, + "step": 1383, + "time_per_iteration": 3.266127347946167 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054133, + "balance_loss_mlp": 1.00754607, + "epoch": 0.2662562524047711, + "flos": 525063836160.0, + "grad_norm": 0.03266135396779854, + "language_loss": 0.86478686, + "learning_rate": 0.000860639957073607, + "loss": 0.87532818, + "num_input_tokens_seen": 114701008, + "router_z_loss_mlp": 0.46533203, + "step": 1384, + "time_per_iteration": 2.701979398727417 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052082, + "balance_loss_mlp": 1.00542331, + "epoch": 0.2664486340900346, + "flos": 553480371456.0, + "grad_norm": 0.03507018041250785, + "language_loss": 0.88455647, + "learning_rate": 0.0008604240992650534, + "loss": 0.89507735, + "num_input_tokens_seen": 114771984, + "router_z_loss_mlp": 0.46606445, + "step": 1385, + "time_per_iteration": 2.6528589725494385 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051347, + "balance_loss_mlp": 1.00476038, + "epoch": 0.2666410157752982, + "flos": 471209189376.0, + "grad_norm": 0.03349459525563368, + "language_loss": 0.89804894, + "learning_rate": 0.0008602081015274545, + "loss": 0.90856242, + "num_input_tokens_seen": 114844800, + "router_z_loss_mlp": 0.46533203, + "step": 1386, + "time_per_iteration": 2.7359464168548584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053022, + "balance_loss_mlp": 1.00626826, + "epoch": 0.2668333974605617, + "flos": 571016522496.0, + "grad_norm": 0.027882929979452454, + "language_loss": 0.8367793, + "learning_rate": 0.0008599919639446684, + "loss": 0.84730947, + "num_input_tokens_seen": 114918544, + "router_z_loss_mlp": 0.46704102, + "step": 1387, + "time_per_iteration": 2.72188401222229 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052498, + "balance_loss_mlp": 1.00572038, + "epoch": 0.2670257791458253, + "flos": 399896159232.0, + "grad_norm": 0.038277743086958374, + "language_loss": 0.80995691, + "learning_rate": 0.000859775686600607, + "loss": 0.82048184, + "num_input_tokens_seen": 114984272, + "router_z_loss_mlp": 0.46728516, + "step": 1388, + "time_per_iteration": 2.5220229625701904 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051137, + "balance_loss_mlp": 1.00443089, + "epoch": 0.2672181608310889, + "flos": 516892612608.0, + "grad_norm": 0.03738976993969629, + "language_loss": 0.85769641, + "learning_rate": 0.0008595592695792367, + "loss": 0.86820781, + "num_input_tokens_seen": 115054800, + "router_z_loss_mlp": 0.46655273, + "step": 1389, + "time_per_iteration": 2.7041423320770264 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050805, + "balance_loss_mlp": 1.0042417, + "epoch": 0.26741054251635243, + "flos": 508526002944.0, + "grad_norm": 0.03398026188762752, + "language_loss": 0.91414082, + "learning_rate": 0.0008593427129645778, + "loss": 0.92464888, + "num_input_tokens_seen": 115120928, + "router_z_loss_mlp": 0.46508789, + "step": 1390, + "time_per_iteration": 2.563215732574463 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105357, + "balance_loss_mlp": 1.0067687, + "epoch": 0.267602924201616, + "flos": 577809349632.0, + "grad_norm": 0.03481446530036303, + "language_loss": 0.86254311, + "learning_rate": 0.0008591260168407052, + "loss": 0.87307882, + "num_input_tokens_seen": 115196688, + "router_z_loss_mlp": 0.4675293, + "step": 1391, + "time_per_iteration": 2.788869619369507 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051436, + "balance_loss_mlp": 1.00475395, + "epoch": 0.26779530588687955, + "flos": 525000652800.0, + "grad_norm": 0.029176301882166727, + "language_loss": 0.83413607, + "learning_rate": 0.0008589091812917479, + "loss": 0.84465045, + "num_input_tokens_seen": 115264912, + "router_z_loss_mlp": 0.46630859, + "step": 1392, + "time_per_iteration": 2.6471304893493652 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01057177, + "balance_loss_mlp": 1.0103997, + "epoch": 0.26798768757214314, + "flos": 557828443392.0, + "grad_norm": 0.034011915135398356, + "language_loss": 0.85611916, + "learning_rate": 0.0008586922064018887, + "loss": 0.86669087, + "num_input_tokens_seen": 115334672, + "router_z_loss_mlp": 0.46728516, + "step": 1393, + "time_per_iteration": 2.665710926055908 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051541, + "balance_loss_mlp": 1.00488269, + "epoch": 0.2681800692574067, + "flos": 932095974144.0, + "grad_norm": 0.035119979561623306, + "language_loss": 0.89861763, + "learning_rate": 0.0008584750922553651, + "loss": 0.90913308, + "num_input_tokens_seen": 115420032, + "router_z_loss_mlp": 0.46606445, + "step": 1394, + "time_per_iteration": 3.1556007862091064 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054605, + "balance_loss_mlp": 1.00813687, + "epoch": 0.26837245094267026, + "flos": 702318936576.0, + "grad_norm": 0.034220503648090136, + "language_loss": 0.84388494, + "learning_rate": 0.0008582578389364677, + "loss": 0.85443103, + "num_input_tokens_seen": 115492576, + "router_z_loss_mlp": 0.46411133, + "step": 1395, + "time_per_iteration": 2.8831770420074463 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054667, + "balance_loss_mlp": 1.00824666, + "epoch": 0.26856483262793385, + "flos": 594394814976.0, + "grad_norm": 0.030437239966241224, + "language_loss": 0.92446673, + "learning_rate": 0.0008580404465295422, + "loss": 0.93501341, + "num_input_tokens_seen": 115568368, + "router_z_loss_mlp": 0.46362305, + "step": 1396, + "time_per_iteration": 2.823685884475708 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052372, + "balance_loss_mlp": 1.00578523, + "epoch": 0.2687572143131974, + "flos": 715589640960.0, + "grad_norm": 0.035135728363153845, + "language_loss": 0.88714433, + "learning_rate": 0.0008578229151189876, + "loss": 0.89766812, + "num_input_tokens_seen": 115651536, + "router_z_loss_mlp": 0.46533203, + "step": 1397, + "time_per_iteration": 2.9427757263183594 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105508, + "balance_loss_mlp": 1.00858808, + "epoch": 0.26894959599846097, + "flos": 468671115264.0, + "grad_norm": 0.03944499035247069, + "language_loss": 0.82205743, + "learning_rate": 0.0008576052447892573, + "loss": 0.83260822, + "num_input_tokens_seen": 115715696, + "router_z_loss_mlp": 0.46435547, + "step": 1398, + "time_per_iteration": 2.570364475250244 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053764, + "balance_loss_mlp": 1.00712895, + "epoch": 0.2691419776837245, + "flos": 469630549248.0, + "grad_norm": 0.035560759826370754, + "language_loss": 0.87260717, + "learning_rate": 0.000857387435624858, + "loss": 0.88314486, + "num_input_tokens_seen": 115780928, + "router_z_loss_mlp": 0.46582031, + "step": 1399, + "time_per_iteration": 2.5241427421569824 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053425, + "balance_loss_mlp": 1.00698149, + "epoch": 0.2693343593689881, + "flos": 939286376448.0, + "grad_norm": 0.026228750880396605, + "language_loss": 0.88826966, + "learning_rate": 0.0008571694877103513, + "loss": 0.89880389, + "num_input_tokens_seen": 115874432, + "router_z_loss_mlp": 0.46386719, + "step": 1400, + "time_per_iteration": 3.2871432304382324 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049973, + "balance_loss_mlp": 1.00355244, + "epoch": 0.2695267410542516, + "flos": 578795028480.0, + "grad_norm": 0.031687518811048296, + "language_loss": 0.88370931, + "learning_rate": 0.0008569514011303515, + "loss": 0.89420903, + "num_input_tokens_seen": 115956608, + "router_z_loss_mlp": 0.46362305, + "step": 1401, + "time_per_iteration": 2.8385562896728516 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054148, + "balance_loss_mlp": 1.00763249, + "epoch": 0.2697191227395152, + "flos": 557965503744.0, + "grad_norm": 0.03646210542720766, + "language_loss": 0.89149171, + "learning_rate": 0.0008567331759695277, + "loss": 0.90203321, + "num_input_tokens_seen": 116031728, + "router_z_loss_mlp": 0.46459961, + "step": 1402, + "time_per_iteration": 2.73796010017395 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053059, + "balance_loss_mlp": 1.00663948, + "epoch": 0.26991150442477874, + "flos": 530314961664.0, + "grad_norm": 0.03368837159460442, + "language_loss": 0.86897242, + "learning_rate": 0.0008565148123126023, + "loss": 0.87950301, + "num_input_tokens_seen": 116104288, + "router_z_loss_mlp": 0.46362305, + "step": 1403, + "time_per_iteration": 2.654782772064209 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055763, + "balance_loss_mlp": 1.00970042, + "epoch": 0.2701038861100423, + "flos": 533087305728.0, + "grad_norm": 0.02742415368344255, + "language_loss": 0.86797845, + "learning_rate": 0.0008562963102443516, + "loss": 0.87853605, + "num_input_tokens_seen": 116177920, + "router_z_loss_mlp": 0.45996094, + "step": 1404, + "time_per_iteration": 2.6844303607940674 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01057243, + "balance_loss_mlp": 1.01122797, + "epoch": 0.2702962677953059, + "flos": 736505681664.0, + "grad_norm": 0.03794782730472634, + "language_loss": 0.85607296, + "learning_rate": 0.0008560776698496056, + "loss": 0.86664534, + "num_input_tokens_seen": 116251680, + "router_z_loss_mlp": 0.45947266, + "step": 1405, + "time_per_iteration": 2.9016945362091064 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054152, + "balance_loss_mlp": 1.00806534, + "epoch": 0.27048864948056944, + "flos": 576001297152.0, + "grad_norm": 0.03333453941991407, + "language_loss": 0.8661586, + "learning_rate": 0.0008558588912132481, + "loss": 0.8767001, + "num_input_tokens_seen": 116327664, + "router_z_loss_mlp": 0.46020508, + "step": 1406, + "time_per_iteration": 2.8187410831451416 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074676, + "balance_loss_mlp": 1.03042603, + "epoch": 0.27068103116583303, + "flos": 1426912856832.0, + "grad_norm": 0.025019447230712623, + "language_loss": 0.76458991, + "learning_rate": 0.0008556399744202163, + "loss": 0.77533662, + "num_input_tokens_seen": 116555152, + "router_z_loss_mlp": 0.44335938, + "step": 1407, + "time_per_iteration": 4.91855001449585 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059735, + "balance_loss_mlp": 1.01386356, + "epoch": 0.27087341285109656, + "flos": 533032870656.0, + "grad_norm": 0.03180107690871134, + "language_loss": 0.83613265, + "learning_rate": 0.0008554209195555016, + "loss": 0.84672999, + "num_input_tokens_seen": 116626016, + "router_z_loss_mlp": 0.45800781, + "step": 1408, + "time_per_iteration": 2.7004964351654053 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106761, + "balance_loss_mlp": 1.02188134, + "epoch": 0.27106579453636015, + "flos": 582465568512.0, + "grad_norm": 0.03644580883658202, + "language_loss": 0.89378774, + "learning_rate": 0.0008552017267041483, + "loss": 0.90446383, + "num_input_tokens_seen": 116699152, + "router_z_loss_mlp": 0.45654297, + "step": 1409, + "time_per_iteration": 2.7288694381713867 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067551, + "balance_loss_mlp": 1.0219177, + "epoch": 0.2712581762216237, + "flos": 507881518848.0, + "grad_norm": 0.03188220116364099, + "language_loss": 0.84328783, + "learning_rate": 0.0008549823959512549, + "loss": 0.85396332, + "num_input_tokens_seen": 116770912, + "router_z_loss_mlp": 0.45556641, + "step": 1410, + "time_per_iteration": 2.67370343208313 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060246, + "balance_loss_mlp": 1.01435077, + "epoch": 0.27145055790688727, + "flos": 999143557632.0, + "grad_norm": 0.03419744556224296, + "language_loss": 0.87478781, + "learning_rate": 0.0008547629273819728, + "loss": 0.88539028, + "num_input_tokens_seen": 116863088, + "router_z_loss_mlp": 0.45825195, + "step": 1411, + "time_per_iteration": 3.3728370666503906 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01057063, + "balance_loss_mlp": 1.0104996, + "epoch": 0.2716429395921508, + "flos": 547729603584.0, + "grad_norm": 0.037303619224495106, + "language_loss": 0.84070724, + "learning_rate": 0.0008545433210815074, + "loss": 0.85127789, + "num_input_tokens_seen": 116929504, + "router_z_loss_mlp": 0.46508789, + "step": 1412, + "time_per_iteration": 2.6812539100646973 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062187, + "balance_loss_mlp": 1.01536179, + "epoch": 0.2718353212774144, + "flos": 574311841536.0, + "grad_norm": 0.033089137280770606, + "language_loss": 0.8805269, + "learning_rate": 0.0008543235771351176, + "loss": 0.89114881, + "num_input_tokens_seen": 117004064, + "router_z_loss_mlp": 0.46777344, + "step": 1413, + "time_per_iteration": 2.713487148284912 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01056081, + "balance_loss_mlp": 1.00961292, + "epoch": 0.272027702962678, + "flos": 645585987840.0, + "grad_norm": 0.026077025600286987, + "language_loss": 0.85152733, + "learning_rate": 0.0008541036956281154, + "loss": 0.86208814, + "num_input_tokens_seen": 117081328, + "router_z_loss_mlp": 0.46411133, + "step": 1414, + "time_per_iteration": 2.9018056392669678 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062827, + "balance_loss_mlp": 1.01631117, + "epoch": 0.2722200846479415, + "flos": 654996602112.0, + "grad_norm": 0.04047455719590206, + "language_loss": 0.83293629, + "learning_rate": 0.0008538836766458665, + "loss": 0.84356457, + "num_input_tokens_seen": 117156544, + "router_z_loss_mlp": 0.46459961, + "step": 1415, + "time_per_iteration": 2.84184193611145 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106005, + "balance_loss_mlp": 1.01365411, + "epoch": 0.2724124663332051, + "flos": 580779025152.0, + "grad_norm": 0.0390255284508479, + "language_loss": 0.85920322, + "learning_rate": 0.0008536635202737897, + "loss": 0.86980367, + "num_input_tokens_seen": 117230208, + "router_z_loss_mlp": 0.46337891, + "step": 1416, + "time_per_iteration": 2.814687728881836 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059317, + "balance_loss_mlp": 1.01272988, + "epoch": 0.2726048480184686, + "flos": 538468688640.0, + "grad_norm": 0.03678906161491062, + "language_loss": 0.82951486, + "learning_rate": 0.0008534432265973573, + "loss": 0.8401081, + "num_input_tokens_seen": 117298080, + "router_z_loss_mlp": 0.46533203, + "step": 1417, + "time_per_iteration": 2.641660451889038 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01056276, + "balance_loss_mlp": 1.00930703, + "epoch": 0.2727972297037322, + "flos": 997550333184.0, + "grad_norm": 0.4222293446211692, + "language_loss": 0.88806397, + "learning_rate": 0.000853222795702095, + "loss": 0.89862669, + "num_input_tokens_seen": 117396256, + "router_z_loss_mlp": 0.46923828, + "step": 1418, + "time_per_iteration": 3.3743135929107666 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01181433, + "balance_loss_mlp": 1.1334635, + "epoch": 0.27298961138899575, + "flos": 607335018240.0, + "grad_norm": 0.06715989722341878, + "language_loss": 0.84640503, + "learning_rate": 0.0008530022276735813, + "loss": 0.85821939, + "num_input_tokens_seen": 117467936, + "router_z_loss_mlp": 0.47949219, + "step": 1419, + "time_per_iteration": 2.752645254135132 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069458, + "balance_loss_mlp": 1.02225161, + "epoch": 0.27318199307425933, + "flos": 530397586944.0, + "grad_norm": 0.040820608700474346, + "language_loss": 0.87344372, + "learning_rate": 0.0008527815225974489, + "loss": 0.88413835, + "num_input_tokens_seen": 117538256, + "router_z_loss_mlp": 0.47167969, + "step": 1420, + "time_per_iteration": 2.65108585357666 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085616, + "balance_loss_mlp": 1.03852844, + "epoch": 0.2733743747595229, + "flos": 409912373760.0, + "grad_norm": 0.06690132065136703, + "language_loss": 0.92052042, + "learning_rate": 0.0008525606805593829, + "loss": 0.93137658, + "num_input_tokens_seen": 117599488, + "router_z_loss_mlp": 0.47045898, + "step": 1421, + "time_per_iteration": 2.4201173782348633 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081246, + "balance_loss_mlp": 1.03422987, + "epoch": 0.27356675644478645, + "flos": 517228949760.0, + "grad_norm": 0.05290317096475839, + "language_loss": 0.85793996, + "learning_rate": 0.0008523397016451213, + "loss": 0.86875236, + "num_input_tokens_seen": 117664240, + "router_z_loss_mlp": 0.46972656, + "step": 1422, + "time_per_iteration": 2.632446765899658 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080251, + "balance_loss_mlp": 1.03328276, + "epoch": 0.27375913813005004, + "flos": 1054059705600.0, + "grad_norm": 0.039766191828199446, + "language_loss": 0.90321743, + "learning_rate": 0.0008521185859404564, + "loss": 0.91401994, + "num_input_tokens_seen": 117754768, + "router_z_loss_mlp": 0.46923828, + "step": 1423, + "time_per_iteration": 3.381535291671753 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107676, + "balance_loss_mlp": 1.02998257, + "epoch": 0.27395151981531357, + "flos": 626004602112.0, + "grad_norm": 0.042654551092476074, + "language_loss": 0.92207062, + "learning_rate": 0.0008518973335312326, + "loss": 0.9328382, + "num_input_tokens_seen": 117832816, + "router_z_loss_mlp": 0.46728516, + "step": 1424, + "time_per_iteration": 2.787799596786499 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070757, + "balance_loss_mlp": 1.0240984, + "epoch": 0.27414390150057716, + "flos": 551415694848.0, + "grad_norm": 0.04883209929837253, + "language_loss": 0.85839558, + "learning_rate": 0.0008516759445033477, + "loss": 0.86910313, + "num_input_tokens_seen": 117899168, + "router_z_loss_mlp": 0.46606445, + "step": 1425, + "time_per_iteration": 2.6206350326538086 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065621, + "balance_loss_mlp": 1.01881957, + "epoch": 0.2743362831858407, + "flos": 540952327680.0, + "grad_norm": 0.043467714857121094, + "language_loss": 0.87962419, + "learning_rate": 0.0008514544189427526, + "loss": 0.89028037, + "num_input_tokens_seen": 117972384, + "router_z_loss_mlp": 0.4675293, + "step": 1426, + "time_per_iteration": 2.679623603820801 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01058603, + "balance_loss_mlp": 1.0118494, + "epoch": 0.2745286648711043, + "flos": 469545978624.0, + "grad_norm": 0.04158543868721512, + "language_loss": 0.89037859, + "learning_rate": 0.0008512327569354511, + "loss": 0.90096468, + "num_input_tokens_seen": 118039584, + "router_z_loss_mlp": 0.46704102, + "step": 1427, + "time_per_iteration": 2.5345683097839355 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01057646, + "balance_loss_mlp": 1.01036775, + "epoch": 0.2747210465563678, + "flos": 473872663296.0, + "grad_norm": 0.05094281183667316, + "language_loss": 0.85685182, + "learning_rate": 0.0008510109585675001, + "loss": 0.8674283, + "num_input_tokens_seen": 118108352, + "router_z_loss_mlp": 0.47241211, + "step": 1428, + "time_per_iteration": 2.5991017818450928 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076946, + "balance_loss_mlp": 1.03031158, + "epoch": 0.2749134282416314, + "flos": 1318059436800.0, + "grad_norm": 0.019364160619571847, + "language_loss": 0.81153345, + "learning_rate": 0.0008507890239250093, + "loss": 0.82230288, + "num_input_tokens_seen": 118331120, + "router_z_loss_mlp": 0.46582031, + "step": 1429, + "time_per_iteration": 4.724486351013184 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081662, + "balance_loss_mlp": 1.03459787, + "epoch": 0.275105809926895, + "flos": 972533129472.0, + "grad_norm": 0.05143903496013185, + "language_loss": 0.82696635, + "learning_rate": 0.0008505669530941415, + "loss": 0.83778298, + "num_input_tokens_seen": 118415872, + "router_z_loss_mlp": 0.47021484, + "step": 1430, + "time_per_iteration": 3.3173024654388428 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01058611, + "balance_loss_mlp": 1.01231062, + "epoch": 0.2752981916121585, + "flos": 528369848832.0, + "grad_norm": 0.04649662222604448, + "language_loss": 0.87158883, + "learning_rate": 0.000850344746161112, + "loss": 0.88217485, + "num_input_tokens_seen": 118483008, + "router_z_loss_mlp": 0.46240234, + "step": 1431, + "time_per_iteration": 2.635831356048584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065544, + "balance_loss_mlp": 1.01943398, + "epoch": 0.2754905732974221, + "flos": 454599424512.0, + "grad_norm": 0.04970989937431765, + "language_loss": 0.90776384, + "learning_rate": 0.0008501224032121894, + "loss": 0.91841936, + "num_input_tokens_seen": 118545840, + "router_z_loss_mlp": 0.46044922, + "step": 1432, + "time_per_iteration": 2.531921148300171 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069408, + "balance_loss_mlp": 1.02339363, + "epoch": 0.27568295498268564, + "flos": 498509788416.0, + "grad_norm": 0.04336527805629792, + "language_loss": 0.84821916, + "learning_rate": 0.0008498999243336946, + "loss": 0.85891324, + "num_input_tokens_seen": 118615168, + "router_z_loss_mlp": 0.45947266, + "step": 1433, + "time_per_iteration": 2.6142802238464355 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068976, + "balance_loss_mlp": 1.02298498, + "epoch": 0.2758753366679492, + "flos": 609417191424.0, + "grad_norm": 0.03822636329404569, + "language_loss": 0.8997575, + "learning_rate": 0.0008496773096120021, + "loss": 0.91044724, + "num_input_tokens_seen": 118690384, + "router_z_loss_mlp": 0.45922852, + "step": 1434, + "time_per_iteration": 2.788863182067871 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066122, + "balance_loss_mlp": 1.01977372, + "epoch": 0.27606771835321275, + "flos": 741437966592.0, + "grad_norm": 0.04844453313229188, + "language_loss": 0.86675751, + "learning_rate": 0.0008494545591335381, + "loss": 0.87741876, + "num_input_tokens_seen": 118763024, + "router_z_loss_mlp": 0.46289062, + "step": 1435, + "time_per_iteration": 2.8883180618286133 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061614, + "balance_loss_mlp": 1.01516986, + "epoch": 0.27626010003847634, + "flos": 555749182464.0, + "grad_norm": 0.03304758436240527, + "language_loss": 0.88791698, + "learning_rate": 0.0008492316729847823, + "loss": 0.89853311, + "num_input_tokens_seen": 118845536, + "router_z_loss_mlp": 0.46386719, + "step": 1436, + "time_per_iteration": 2.794938087463379 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054312, + "balance_loss_mlp": 1.0072248, + "epoch": 0.2764524817237399, + "flos": 543696481536.0, + "grad_norm": 0.13725655625344893, + "language_loss": 0.82129836, + "learning_rate": 0.0008490086512522664, + "loss": 0.83184153, + "num_input_tokens_seen": 118919008, + "router_z_loss_mlp": 0.47045898, + "step": 1437, + "time_per_iteration": 2.6979260444641113 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062257, + "balance_loss_mlp": 1.01495445, + "epoch": 0.27664486340900346, + "flos": 407129336064.0, + "grad_norm": 0.04115092615815086, + "language_loss": 0.92702913, + "learning_rate": 0.0008487854940225755, + "loss": 0.93765163, + "num_input_tokens_seen": 118981376, + "router_z_loss_mlp": 0.47265625, + "step": 1438, + "time_per_iteration": 2.4361565113067627 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055116, + "balance_loss_mlp": 1.0080049, + "epoch": 0.27683724509426705, + "flos": 523157607168.0, + "grad_norm": 0.06281356926864295, + "language_loss": 0.92480713, + "learning_rate": 0.0008485622013823466, + "loss": 0.93535829, + "num_input_tokens_seen": 119050560, + "router_z_loss_mlp": 0.47070312, + "step": 1439, + "time_per_iteration": 2.588972568511963 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060631, + "balance_loss_mlp": 1.01332879, + "epoch": 0.2770296267795306, + "flos": 536410814976.0, + "grad_norm": 0.048827385499573994, + "language_loss": 0.8582921, + "learning_rate": 0.00084833877341827, + "loss": 0.86889839, + "num_input_tokens_seen": 119121104, + "router_z_loss_mlp": 0.47265625, + "step": 1440, + "time_per_iteration": 2.6215152740478516 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063403, + "balance_loss_mlp": 1.01648188, + "epoch": 0.27722200846479417, + "flos": 488970862080.0, + "grad_norm": 0.04074125375838667, + "language_loss": 0.82920921, + "learning_rate": 0.000848115210217088, + "loss": 0.83984327, + "num_input_tokens_seen": 119187712, + "router_z_loss_mlp": 0.46875, + "step": 1441, + "time_per_iteration": 2.578479290008545 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059768, + "balance_loss_mlp": 1.01244187, + "epoch": 0.2774143901500577, + "flos": 619444099584.0, + "grad_norm": 0.03981713509883016, + "language_loss": 0.84628934, + "learning_rate": 0.0008478915118655952, + "loss": 0.85688698, + "num_input_tokens_seen": 119259264, + "router_z_loss_mlp": 0.47290039, + "step": 1442, + "time_per_iteration": 2.697610855102539 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055568, + "balance_loss_mlp": 1.0080508, + "epoch": 0.2776067718353213, + "flos": 514845432576.0, + "grad_norm": 0.032345577367045, + "language_loss": 0.88479745, + "learning_rate": 0.0008476676784506393, + "loss": 0.89535314, + "num_input_tokens_seen": 119328304, + "router_z_loss_mlp": 0.47485352, + "step": 1443, + "time_per_iteration": 2.6315112113952637 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01056662, + "balance_loss_mlp": 1.00897789, + "epoch": 0.2777991535205848, + "flos": 1006042342656.0, + "grad_norm": 0.04008629757661371, + "language_loss": 0.8412413, + "learning_rate": 0.0008474437100591201, + "loss": 0.85180795, + "num_input_tokens_seen": 119412352, + "router_z_loss_mlp": 0.4765625, + "step": 1444, + "time_per_iteration": 3.3463656902313232 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051562, + "balance_loss_mlp": 1.00371146, + "epoch": 0.2779915352058484, + "flos": 551376811008.0, + "grad_norm": 0.033834103416723965, + "language_loss": 0.87362587, + "learning_rate": 0.0008472196067779898, + "loss": 0.88414145, + "num_input_tokens_seen": 119484464, + "router_z_loss_mlp": 0.47827148, + "step": 1445, + "time_per_iteration": 2.6647677421569824 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054302, + "balance_loss_mlp": 1.00649953, + "epoch": 0.278183916891112, + "flos": 875217216768.0, + "grad_norm": 0.0457526450580795, + "language_loss": 0.87776953, + "learning_rate": 0.0008469953686942531, + "loss": 0.88831258, + "num_input_tokens_seen": 119557280, + "router_z_loss_mlp": 0.4777832, + "step": 1446, + "time_per_iteration": 3.076035261154175 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01056504, + "balance_loss_mlp": 1.00882006, + "epoch": 0.2783762985763755, + "flos": 625196812800.0, + "grad_norm": 0.042452946668595545, + "language_loss": 0.85090148, + "learning_rate": 0.0008467709958949668, + "loss": 0.86146653, + "num_input_tokens_seen": 119631232, + "router_z_loss_mlp": 0.4765625, + "step": 1447, + "time_per_iteration": 2.744459629058838 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01056121, + "balance_loss_mlp": 1.00850928, + "epoch": 0.2785686802616391, + "flos": 582912721152.0, + "grad_norm": 0.04136143865758397, + "language_loss": 0.87796736, + "learning_rate": 0.0008465464884672403, + "loss": 0.88852853, + "num_input_tokens_seen": 119700224, + "router_z_loss_mlp": 0.47583008, + "step": 1448, + "time_per_iteration": 2.6887707710266113 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049992, + "balance_loss_mlp": 1.00235641, + "epoch": 0.27876106194690264, + "flos": 588540034560.0, + "grad_norm": 0.031263057988026755, + "language_loss": 0.87220562, + "learning_rate": 0.0008463218464982348, + "loss": 0.88270551, + "num_input_tokens_seen": 119781376, + "router_z_loss_mlp": 0.47607422, + "step": 1449, + "time_per_iteration": 2.8354454040527344 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050828, + "balance_loss_mlp": 1.00326335, + "epoch": 0.27895344363216623, + "flos": 877431592704.0, + "grad_norm": 0.03730856956989286, + "language_loss": 0.89626968, + "learning_rate": 0.0008460970700751645, + "loss": 0.90677798, + "num_input_tokens_seen": 119856672, + "router_z_loss_mlp": 0.4753418, + "step": 1450, + "time_per_iteration": 3.12705135345459 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062156, + "balance_loss_mlp": 1.01442492, + "epoch": 0.27914582531742976, + "flos": 605036071680.0, + "grad_norm": 0.0379360607610882, + "language_loss": 0.8910991, + "learning_rate": 0.000845872159285295, + "loss": 0.90172064, + "num_input_tokens_seen": 119929008, + "router_z_loss_mlp": 0.47705078, + "step": 1451, + "time_per_iteration": 2.792448043823242 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065174, + "balance_loss_mlp": 1.02025604, + "epoch": 0.27933820700269335, + "flos": 1501133346048.0, + "grad_norm": 0.01376981107013524, + "language_loss": 0.77766848, + "learning_rate": 0.0008456471142159447, + "loss": 0.7883203, + "num_input_tokens_seen": 120164032, + "router_z_loss_mlp": 0.44921875, + "step": 1452, + "time_per_iteration": 4.966037034988403 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01056615, + "balance_loss_mlp": 1.00921774, + "epoch": 0.2795305886879569, + "flos": 1033518885888.0, + "grad_norm": 0.037040263742322534, + "language_loss": 0.87809932, + "learning_rate": 0.0008454219349544836, + "loss": 0.88866544, + "num_input_tokens_seen": 120246784, + "router_z_loss_mlp": 0.47363281, + "step": 1453, + "time_per_iteration": 3.428589344024658 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055627, + "balance_loss_mlp": 1.00851548, + "epoch": 0.27972297037322047, + "flos": 608227378176.0, + "grad_norm": 0.03307542484781365, + "language_loss": 0.83086669, + "learning_rate": 0.000845196621588334, + "loss": 0.84142298, + "num_input_tokens_seen": 120318208, + "router_z_loss_mlp": 0.47070312, + "step": 1454, + "time_per_iteration": 2.7620909214019775 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053868, + "balance_loss_mlp": 1.00661373, + "epoch": 0.27991535205848406, + "flos": 631561929216.0, + "grad_norm": 0.034345141589198824, + "language_loss": 0.77104861, + "learning_rate": 0.0008449711742049706, + "loss": 0.78158724, + "num_input_tokens_seen": 120393248, + "router_z_loss_mlp": 0.47216797, + "step": 1455, + "time_per_iteration": 2.7629852294921875 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01057076, + "balance_loss_mlp": 1.009655, + "epoch": 0.2801077337437476, + "flos": 550354193664.0, + "grad_norm": 0.03843537360044117, + "language_loss": 0.85426688, + "learning_rate": 0.0008447455928919196, + "loss": 0.86483765, + "num_input_tokens_seen": 120461040, + "router_z_loss_mlp": 0.47387695, + "step": 1456, + "time_per_iteration": 2.672311782836914 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054361, + "balance_loss_mlp": 1.00670111, + "epoch": 0.2803001154290112, + "flos": 487742164992.0, + "grad_norm": 0.03308646323695097, + "language_loss": 0.8834334, + "learning_rate": 0.0008445198777367595, + "loss": 0.89397705, + "num_input_tokens_seen": 120530400, + "router_z_loss_mlp": 0.47631836, + "step": 1457, + "time_per_iteration": 2.5908620357513428 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054094, + "balance_loss_mlp": 1.00633848, + "epoch": 0.2804924971142747, + "flos": 523092478464.0, + "grad_norm": 0.036759152060528134, + "language_loss": 0.82140505, + "learning_rate": 0.0008442940288271208, + "loss": 0.8319459, + "num_input_tokens_seen": 120598304, + "router_z_loss_mlp": 0.47729492, + "step": 1458, + "time_per_iteration": 2.6980724334716797 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01057218, + "balance_loss_mlp": 1.00953484, + "epoch": 0.2806848787995383, + "flos": 528850049280.0, + "grad_norm": 0.03179596299998768, + "language_loss": 0.88266242, + "learning_rate": 0.0008440680462506856, + "loss": 0.89323461, + "num_input_tokens_seen": 120675712, + "router_z_loss_mlp": 0.4765625, + "step": 1459, + "time_per_iteration": 2.818169593811035 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01058111, + "balance_loss_mlp": 1.01047492, + "epoch": 0.2808772604848018, + "flos": 486485277696.0, + "grad_norm": 0.030255628698855237, + "language_loss": 0.87626624, + "learning_rate": 0.0008438419300951883, + "loss": 0.88684738, + "num_input_tokens_seen": 120746544, + "router_z_loss_mlp": 0.47607422, + "step": 1460, + "time_per_iteration": 2.644911527633667 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01056062, + "balance_loss_mlp": 1.00825953, + "epoch": 0.2810696421700654, + "flos": 619340087040.0, + "grad_norm": 0.03597967684758823, + "language_loss": 0.87670606, + "learning_rate": 0.0008436156804484148, + "loss": 0.88726676, + "num_input_tokens_seen": 120823520, + "router_z_loss_mlp": 0.4777832, + "step": 1461, + "time_per_iteration": 2.7725627422332764 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054255, + "balance_loss_mlp": 1.00657165, + "epoch": 0.28126202385532895, + "flos": 455687170560.0, + "grad_norm": 0.0394598317615188, + "language_loss": 0.89263237, + "learning_rate": 0.0008433892973982031, + "loss": 0.90317494, + "num_input_tokens_seen": 120889568, + "router_z_loss_mlp": 0.4765625, + "step": 1462, + "time_per_iteration": 2.5091495513916016 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063928, + "balance_loss_mlp": 1.0156002, + "epoch": 0.28145440554059253, + "flos": 531739044864.0, + "grad_norm": 0.041651284680957995, + "language_loss": 0.866346, + "learning_rate": 0.0008431627810324431, + "loss": 0.87698531, + "num_input_tokens_seen": 120958480, + "router_z_loss_mlp": 0.4831543, + "step": 1463, + "time_per_iteration": 2.6705899238586426 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01056697, + "balance_loss_mlp": 1.00872695, + "epoch": 0.2816467872258561, + "flos": 453164647680.0, + "grad_norm": 0.03544245246238935, + "language_loss": 0.81977493, + "learning_rate": 0.000842936131439076, + "loss": 0.83034194, + "num_input_tokens_seen": 121028032, + "router_z_loss_mlp": 0.47949219, + "step": 1464, + "time_per_iteration": 2.610419511795044 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055364, + "balance_loss_mlp": 1.00763226, + "epoch": 0.28183916891111965, + "flos": 473705467392.0, + "grad_norm": 0.034609246408770326, + "language_loss": 0.89094436, + "learning_rate": 0.0008427093487060951, + "loss": 0.90149802, + "num_input_tokens_seen": 121099280, + "router_z_loss_mlp": 0.47705078, + "step": 1465, + "time_per_iteration": 2.72540283203125 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054608, + "balance_loss_mlp": 1.00656629, + "epoch": 0.28203155059638324, + "flos": 558189080064.0, + "grad_norm": 0.02738603689522664, + "language_loss": 0.8552286, + "learning_rate": 0.000842482432921545, + "loss": 0.86577463, + "num_input_tokens_seen": 121180240, + "router_z_loss_mlp": 0.48022461, + "step": 1466, + "time_per_iteration": 2.8388257026672363 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105456, + "balance_loss_mlp": 1.00654304, + "epoch": 0.28222393228164677, + "flos": 417879462912.0, + "grad_norm": 0.03402242241185157, + "language_loss": 0.88381398, + "learning_rate": 0.0008422553841735225, + "loss": 0.89435959, + "num_input_tokens_seen": 121242736, + "router_z_loss_mlp": 0.47998047, + "step": 1467, + "time_per_iteration": 2.495126485824585 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01057213, + "balance_loss_mlp": 1.00917137, + "epoch": 0.28241631396691036, + "flos": 606041192448.0, + "grad_norm": 0.032675143321136885, + "language_loss": 0.86003613, + "learning_rate": 0.0008420282025501757, + "loss": 0.87060827, + "num_input_tokens_seen": 121319248, + "router_z_loss_mlp": 0.48022461, + "step": 1468, + "time_per_iteration": 2.7908880710601807 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052419, + "balance_loss_mlp": 1.00473487, + "epoch": 0.2826086956521739, + "flos": 574051326720.0, + "grad_norm": 0.03300906221563125, + "language_loss": 0.86686498, + "learning_rate": 0.0008418008881397043, + "loss": 0.87738919, + "num_input_tokens_seen": 121392064, + "router_z_loss_mlp": 0.4765625, + "step": 1469, + "time_per_iteration": 2.7646520137786865 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054478, + "balance_loss_mlp": 1.00693762, + "epoch": 0.2828010773374375, + "flos": 844319954688.0, + "grad_norm": 0.03195966631281891, + "language_loss": 0.84124947, + "learning_rate": 0.0008415734410303595, + "loss": 0.85179424, + "num_input_tokens_seen": 121475984, + "router_z_loss_mlp": 0.47509766, + "step": 1470, + "time_per_iteration": 3.1784656047821045 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059132, + "balance_loss_mlp": 1.01151943, + "epoch": 0.28299345902270107, + "flos": 543772303872.0, + "grad_norm": 0.0307788797974712, + "language_loss": 0.91781342, + "learning_rate": 0.0008413458613104444, + "loss": 0.92840481, + "num_input_tokens_seen": 121551024, + "router_z_loss_mlp": 0.47583008, + "step": 1471, + "time_per_iteration": 2.7000675201416016 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01057543, + "balance_loss_mlp": 1.00995505, + "epoch": 0.2831858407079646, + "flos": 572755555584.0, + "grad_norm": 0.03187726406761503, + "language_loss": 0.84024346, + "learning_rate": 0.0008411181490683129, + "loss": 0.85081899, + "num_input_tokens_seen": 121624528, + "router_z_loss_mlp": 0.47558594, + "step": 1472, + "time_per_iteration": 2.7358603477478027 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105744, + "balance_loss_mlp": 1.00958943, + "epoch": 0.2833782223932282, + "flos": 765172038144.0, + "grad_norm": 0.03258814259190176, + "language_loss": 0.83765668, + "learning_rate": 0.0008408903043923707, + "loss": 0.84823108, + "num_input_tokens_seen": 121706736, + "router_z_loss_mlp": 0.47827148, + "step": 1473, + "time_per_iteration": 3.016690492630005 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060961, + "balance_loss_mlp": 1.01291955, + "epoch": 0.2835706040784917, + "flos": 540088157952.0, + "grad_norm": 0.03783140599229066, + "language_loss": 0.82463539, + "learning_rate": 0.0008406623273710754, + "loss": 0.83524501, + "num_input_tokens_seen": 121773008, + "router_z_loss_mlp": 0.48022461, + "step": 1474, + "time_per_iteration": 2.651932954788208 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055359, + "balance_loss_mlp": 1.00736535, + "epoch": 0.2837629857637553, + "flos": 531654474240.0, + "grad_norm": 0.03425671969493541, + "language_loss": 0.84354198, + "learning_rate": 0.0008404342180929351, + "loss": 0.85409558, + "num_input_tokens_seen": 121840016, + "router_z_loss_mlp": 0.47973633, + "step": 1475, + "time_per_iteration": 2.6064491271972656 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105922, + "balance_loss_mlp": 1.01120257, + "epoch": 0.28395536744901884, + "flos": 541110775296.0, + "grad_norm": 0.03564784056716401, + "language_loss": 0.8245163, + "learning_rate": 0.00084020597664651, + "loss": 0.83510846, + "num_input_tokens_seen": 121915008, + "router_z_loss_mlp": 0.47998047, + "step": 1476, + "time_per_iteration": 2.7597527503967285 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01056946, + "balance_loss_mlp": 1.00890458, + "epoch": 0.2841477491342824, + "flos": 574802735616.0, + "grad_norm": 0.037292940254278956, + "language_loss": 0.8496412, + "learning_rate": 0.0008399776031204111, + "loss": 0.86021066, + "num_input_tokens_seen": 121987456, + "router_z_loss_mlp": 0.48022461, + "step": 1477, + "time_per_iteration": 2.759089231491089 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051956, + "balance_loss_mlp": 1.00412941, + "epoch": 0.28434013081954596, + "flos": 573139524864.0, + "grad_norm": 0.03522410712402375, + "language_loss": 0.80955458, + "learning_rate": 0.0008397490976033009, + "loss": 0.8200742, + "num_input_tokens_seen": 122058720, + "router_z_loss_mlp": 0.47802734, + "step": 1478, + "time_per_iteration": 2.6423845291137695 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01056133, + "balance_loss_mlp": 1.0100708, + "epoch": 0.28453251250480954, + "flos": 1556676481536.0, + "grad_norm": 0.010218347035897045, + "language_loss": 0.77879643, + "learning_rate": 0.000839520460183893, + "loss": 0.78935778, + "num_input_tokens_seen": 122285792, + "router_z_loss_mlp": 0.45996094, + "step": 1479, + "time_per_iteration": 4.732174396514893 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053334, + "balance_loss_mlp": 1.0056026, + "epoch": 0.28472489419007313, + "flos": 750427673088.0, + "grad_norm": 0.028762601306014927, + "language_loss": 0.86263019, + "learning_rate": 0.0008392916909509525, + "loss": 0.87316358, + "num_input_tokens_seen": 122366608, + "router_z_loss_mlp": 0.47705078, + "step": 1480, + "time_per_iteration": 3.0842366218566895 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105593, + "balance_loss_mlp": 1.00817478, + "epoch": 0.28491727587533666, + "flos": 491139551232.0, + "grad_norm": 0.03654292068957682, + "language_loss": 0.86134857, + "learning_rate": 0.0008390627899932954, + "loss": 0.87190789, + "num_input_tokens_seen": 122435536, + "router_z_loss_mlp": 0.47729492, + "step": 1481, + "time_per_iteration": 2.615267753601074 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053309, + "balance_loss_mlp": 1.0055064, + "epoch": 0.28510965756060025, + "flos": 730360250880.0, + "grad_norm": 0.03257927187729683, + "language_loss": 0.89633858, + "learning_rate": 0.000838833757399789, + "loss": 0.90687168, + "num_input_tokens_seen": 122515584, + "router_z_loss_mlp": 0.4777832, + "step": 1482, + "time_per_iteration": 2.9428212642669678 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053301, + "balance_loss_mlp": 1.00528359, + "epoch": 0.2853020392458638, + "flos": 552670636800.0, + "grad_norm": 0.036455185890550544, + "language_loss": 0.82055122, + "learning_rate": 0.0008386045932593515, + "loss": 0.83108419, + "num_input_tokens_seen": 122585552, + "router_z_loss_mlp": 0.47998047, + "step": 1483, + "time_per_iteration": 2.724045991897583 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052204, + "balance_loss_mlp": 1.00416255, + "epoch": 0.28549442093112737, + "flos": 756097761024.0, + "grad_norm": 0.02777472605390161, + "language_loss": 0.8718375, + "learning_rate": 0.0008383752976609525, + "loss": 0.8823595, + "num_input_tokens_seen": 122658928, + "router_z_loss_mlp": 0.48022461, + "step": 1484, + "time_per_iteration": 2.929905891418457 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054156, + "balance_loss_mlp": 1.00618601, + "epoch": 0.2856868026163909, + "flos": 539704188672.0, + "grad_norm": 0.028392575187028035, + "language_loss": 0.8111921, + "learning_rate": 0.0008381458706936123, + "loss": 0.82173365, + "num_input_tokens_seen": 122729056, + "router_z_loss_mlp": 0.47949219, + "step": 1485, + "time_per_iteration": 2.717545986175537 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053651, + "balance_loss_mlp": 1.00563323, + "epoch": 0.2858791843016545, + "flos": 584921017344.0, + "grad_norm": 0.03333139148622456, + "language_loss": 0.88664746, + "learning_rate": 0.0008379163124464025, + "loss": 0.8971839, + "num_input_tokens_seen": 122802832, + "router_z_loss_mlp": 0.47998047, + "step": 1486, + "time_per_iteration": 2.7234747409820557 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054605, + "balance_loss_mlp": 1.00685012, + "epoch": 0.286071565986918, + "flos": 646052582400.0, + "grad_norm": 0.03454926432429506, + "language_loss": 0.77946562, + "learning_rate": 0.0008376866230084452, + "loss": 0.79001164, + "num_input_tokens_seen": 122881328, + "router_z_loss_mlp": 0.47729492, + "step": 1487, + "time_per_iteration": 2.856128692626953 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105205, + "balance_loss_mlp": 1.00408018, + "epoch": 0.2862639476721816, + "flos": 492331309824.0, + "grad_norm": 0.034661288064865674, + "language_loss": 0.87705112, + "learning_rate": 0.000837456802468914, + "loss": 0.88757157, + "num_input_tokens_seen": 122949680, + "router_z_loss_mlp": 0.47949219, + "step": 1488, + "time_per_iteration": 2.57454514503479 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054997, + "balance_loss_mlp": 1.00700366, + "epoch": 0.2864563293574452, + "flos": 522745447680.0, + "grad_norm": 0.035472984165373166, + "language_loss": 0.86247557, + "learning_rate": 0.0008372268509170331, + "loss": 0.87302554, + "num_input_tokens_seen": 123024736, + "router_z_loss_mlp": 0.47973633, + "step": 1489, + "time_per_iteration": 2.661430597305298 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105259, + "balance_loss_mlp": 1.00452483, + "epoch": 0.2866487110427087, + "flos": 548257436160.0, + "grad_norm": 0.03357077125927176, + "language_loss": 0.85950172, + "learning_rate": 0.0008369967684420779, + "loss": 0.8700276, + "num_input_tokens_seen": 123097344, + "router_z_loss_mlp": 0.48046875, + "step": 1490, + "time_per_iteration": 2.703200101852417 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052654, + "balance_loss_mlp": 1.0047555, + "epoch": 0.2868410927279723, + "flos": 483218148864.0, + "grad_norm": 0.03511930922286833, + "language_loss": 0.8567192, + "learning_rate": 0.0008367665551333736, + "loss": 0.86724567, + "num_input_tokens_seen": 123166240, + "router_z_loss_mlp": 0.47875977, + "step": 1491, + "time_per_iteration": 2.6027045249938965 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051173, + "balance_loss_mlp": 1.00334597, + "epoch": 0.28703347441323585, + "flos": 726137578752.0, + "grad_norm": 0.03668604763704844, + "language_loss": 0.86648476, + "learning_rate": 0.0008365362110802977, + "loss": 0.87699652, + "num_input_tokens_seen": 123238160, + "router_z_loss_mlp": 0.47802734, + "step": 1492, + "time_per_iteration": 2.872743606567383 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054184, + "balance_loss_mlp": 1.00630987, + "epoch": 0.28722585609849943, + "flos": 636214257408.0, + "grad_norm": 0.0346446819062503, + "language_loss": 0.83264536, + "learning_rate": 0.0008363057363722773, + "loss": 0.84318721, + "num_input_tokens_seen": 123319504, + "router_z_loss_mlp": 0.47851562, + "step": 1493, + "time_per_iteration": 2.830925941467285 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055811, + "balance_loss_mlp": 1.00827014, + "epoch": 0.28741823778376296, + "flos": 511252660224.0, + "grad_norm": 0.03541460771255837, + "language_loss": 0.8481909, + "learning_rate": 0.0008360751310987906, + "loss": 0.85874903, + "num_input_tokens_seen": 123387008, + "router_z_loss_mlp": 0.47509766, + "step": 1494, + "time_per_iteration": 2.6102633476257324 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055778, + "balance_loss_mlp": 1.00840437, + "epoch": 0.28761061946902655, + "flos": 604932059136.0, + "grad_norm": 0.030521465086419404, + "language_loss": 0.86298919, + "learning_rate": 0.0008358443953493666, + "loss": 0.87354696, + "num_input_tokens_seen": 123471056, + "router_z_loss_mlp": 0.47338867, + "step": 1495, + "time_per_iteration": 2.8808648586273193 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053186, + "balance_loss_mlp": 1.00590765, + "epoch": 0.28780300115429014, + "flos": 408060579840.0, + "grad_norm": 0.03760103829607362, + "language_loss": 0.89352167, + "learning_rate": 0.0008356135292135851, + "loss": 0.90405357, + "num_input_tokens_seen": 123535024, + "router_z_loss_mlp": 0.47241211, + "step": 1496, + "time_per_iteration": 2.5025811195373535 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055101, + "balance_loss_mlp": 1.00794196, + "epoch": 0.28799538283955367, + "flos": 375745070592.0, + "grad_norm": 0.04396673202836768, + "language_loss": 0.93575335, + "learning_rate": 0.0008353825327810758, + "loss": 0.94630432, + "num_input_tokens_seen": 123596224, + "router_z_loss_mlp": 0.47119141, + "step": 1497, + "time_per_iteration": 2.4455389976501465 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053362, + "balance_loss_mlp": 1.00601161, + "epoch": 0.28818776452481726, + "flos": 593020309248.0, + "grad_norm": 0.03575929377279749, + "language_loss": 0.82620615, + "learning_rate": 0.00083515140614152, + "loss": 0.83673978, + "num_input_tokens_seen": 123668640, + "router_z_loss_mlp": 0.47314453, + "step": 1498, + "time_per_iteration": 2.7318496704101562 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059657, + "balance_loss_mlp": 1.01204443, + "epoch": 0.2883801462100808, + "flos": 536104613376.0, + "grad_norm": 0.03408677708994041, + "language_loss": 0.8771323, + "learning_rate": 0.0008349201493846485, + "loss": 0.88772887, + "num_input_tokens_seen": 123740816, + "router_z_loss_mlp": 0.47583008, + "step": 1499, + "time_per_iteration": 2.671473503112793 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105332, + "balance_loss_mlp": 1.00606573, + "epoch": 0.2885725278953444, + "flos": 481077649920.0, + "grad_norm": 0.037679681148910335, + "language_loss": 0.90198493, + "learning_rate": 0.0008346887626002432, + "loss": 0.91251814, + "num_input_tokens_seen": 123805968, + "router_z_loss_mlp": 0.47216797, + "step": 1500, + "time_per_iteration": 2.565556287765503 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050278, + "balance_loss_mlp": 1.00290418, + "epoch": 0.2887649095806079, + "flos": 465030710784.0, + "grad_norm": 0.03453406345592784, + "language_loss": 0.87256986, + "learning_rate": 0.000834457245878137, + "loss": 0.88307267, + "num_input_tokens_seen": 123876576, + "router_z_loss_mlp": 0.47338867, + "step": 1501, + "time_per_iteration": 2.6684980392456055 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051416, + "balance_loss_mlp": 1.00411427, + "epoch": 0.2889572912658715, + "flos": 932641303296.0, + "grad_norm": 0.034149555340210275, + "language_loss": 0.82079703, + "learning_rate": 0.000834225599308212, + "loss": 0.83131123, + "num_input_tokens_seen": 123967664, + "router_z_loss_mlp": 0.47265625, + "step": 1502, + "time_per_iteration": 3.2747607231140137 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052615, + "balance_loss_mlp": 1.00526536, + "epoch": 0.28914967295113503, + "flos": 571257595392.0, + "grad_norm": 0.03426641952710734, + "language_loss": 0.85934782, + "learning_rate": 0.0008339938229804016, + "loss": 0.869874, + "num_input_tokens_seen": 124039680, + "router_z_loss_mlp": 0.47314453, + "step": 1503, + "time_per_iteration": 2.7027056217193604 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062946, + "balance_loss_mlp": 1.01783752, + "epoch": 0.2893420546363986, + "flos": 1489874828544.0, + "grad_norm": 0.016861580481692767, + "language_loss": 0.75434822, + "learning_rate": 0.0008337619169846895, + "loss": 0.76497769, + "num_input_tokens_seen": 124278848, + "router_z_loss_mlp": 0.45019531, + "step": 1504, + "time_per_iteration": 4.9503560066223145 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010625, + "balance_loss_mlp": 1.01536465, + "epoch": 0.2895344363216622, + "flos": 471182944512.0, + "grad_norm": 0.04276572481675365, + "language_loss": 0.8589167, + "learning_rate": 0.0008335298814111094, + "loss": 0.86954165, + "num_input_tokens_seen": 124346736, + "router_z_loss_mlp": 0.47094727, + "step": 1505, + "time_per_iteration": 2.548398017883301 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063653, + "balance_loss_mlp": 1.01654112, + "epoch": 0.28972681800692573, + "flos": 649341098496.0, + "grad_norm": 0.03572405467889404, + "language_loss": 0.89211309, + "learning_rate": 0.0008332977163497455, + "loss": 0.90274966, + "num_input_tokens_seen": 124420816, + "router_z_loss_mlp": 0.47070312, + "step": 1506, + "time_per_iteration": 2.786355972290039 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059474, + "balance_loss_mlp": 1.01241064, + "epoch": 0.2899191996921893, + "flos": 573306720768.0, + "grad_norm": 0.03560254091063293, + "language_loss": 0.84471554, + "learning_rate": 0.0008330654218907325, + "loss": 0.85531026, + "num_input_tokens_seen": 124490480, + "router_z_loss_mlp": 0.47021484, + "step": 1507, + "time_per_iteration": 2.706066131591797 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054224, + "balance_loss_mlp": 1.00701702, + "epoch": 0.29011158137745285, + "flos": 662638047744.0, + "grad_norm": 0.03364876986368613, + "language_loss": 0.82771999, + "learning_rate": 0.0008328329981242548, + "loss": 0.8382622, + "num_input_tokens_seen": 124564960, + "router_z_loss_mlp": 0.47167969, + "step": 1508, + "time_per_iteration": 2.9025378227233887 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053376, + "balance_loss_mlp": 1.00607395, + "epoch": 0.29030396306271644, + "flos": 537403296768.0, + "grad_norm": 0.0314370875382877, + "language_loss": 0.88638061, + "learning_rate": 0.0008326004451405475, + "loss": 0.89691436, + "num_input_tokens_seen": 124637424, + "router_z_loss_mlp": 0.47265625, + "step": 1509, + "time_per_iteration": 2.740288496017456 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091124, + "balance_loss_mlp": 1.04370284, + "epoch": 0.29049634474798, + "flos": 512956700160.0, + "grad_norm": 0.04021928954994292, + "language_loss": 0.83711147, + "learning_rate": 0.0008323677630298957, + "loss": 0.84802264, + "num_input_tokens_seen": 124704832, + "router_z_loss_mlp": 0.47387695, + "step": 1510, + "time_per_iteration": 2.5700840950012207 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01056321, + "balance_loss_mlp": 1.00935256, + "epoch": 0.29068872643324356, + "flos": 614983266816.0, + "grad_norm": 0.03498537298994642, + "language_loss": 0.86212677, + "learning_rate": 0.0008321349518826345, + "loss": 0.87268996, + "num_input_tokens_seen": 124779600, + "router_z_loss_mlp": 0.46923828, + "step": 1511, + "time_per_iteration": 2.7968146800994873 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060609, + "balance_loss_mlp": 1.01364064, + "epoch": 0.2908811081185071, + "flos": 547469088768.0, + "grad_norm": 0.03734404843374857, + "language_loss": 0.95525789, + "learning_rate": 0.0008319020117891491, + "loss": 0.96586394, + "num_input_tokens_seen": 124844128, + "router_z_loss_mlp": 0.46923828, + "step": 1512, + "time_per_iteration": 2.646127939224243 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01058015, + "balance_loss_mlp": 1.01107061, + "epoch": 0.2910734898037707, + "flos": 605902186752.0, + "grad_norm": 0.03463533015087841, + "language_loss": 0.88378417, + "learning_rate": 0.0008316689428398751, + "loss": 0.89436436, + "num_input_tokens_seen": 124915376, + "router_z_loss_mlp": 0.46899414, + "step": 1513, + "time_per_iteration": 2.7310631275177 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01056228, + "balance_loss_mlp": 1.00935447, + "epoch": 0.29126587148903427, + "flos": 575836046592.0, + "grad_norm": 0.028150288904366032, + "language_loss": 0.89498413, + "learning_rate": 0.0008314357451252979, + "loss": 0.90554643, + "num_input_tokens_seen": 124995504, + "router_z_loss_mlp": 0.46826172, + "step": 1514, + "time_per_iteration": 2.8262994289398193 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054124, + "balance_loss_mlp": 1.00727487, + "epoch": 0.2914582531742978, + "flos": 572134404096.0, + "grad_norm": 0.05354948204009119, + "language_loss": 0.89001274, + "learning_rate": 0.0008312024187359527, + "loss": 0.90055394, + "num_input_tokens_seen": 125064192, + "router_z_loss_mlp": 0.46801758, + "step": 1515, + "time_per_iteration": 2.717780590057373 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105823, + "balance_loss_mlp": 1.01109469, + "epoch": 0.2916506348595614, + "flos": 732303418368.0, + "grad_norm": 0.032865630858266236, + "language_loss": 0.8831327, + "learning_rate": 0.000830968963762425, + "loss": 0.89371502, + "num_input_tokens_seen": 125150560, + "router_z_loss_mlp": 0.47094727, + "step": 1516, + "time_per_iteration": 3.080526828765869 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051181, + "balance_loss_mlp": 1.00383127, + "epoch": 0.2918430165448249, + "flos": 511467488256.0, + "grad_norm": 0.032871242995291323, + "language_loss": 0.84882748, + "learning_rate": 0.0008307353802953497, + "loss": 0.85933936, + "num_input_tokens_seen": 125219264, + "router_z_loss_mlp": 0.47314453, + "step": 1517, + "time_per_iteration": 2.744476318359375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084976, + "balance_loss_mlp": 1.03726828, + "epoch": 0.2920353982300885, + "flos": 631607616000.0, + "grad_norm": 0.03594729450056152, + "language_loss": 0.86997348, + "learning_rate": 0.0008305016684254125, + "loss": 0.88082325, + "num_input_tokens_seen": 125301904, + "router_z_loss_mlp": 0.47680664, + "step": 1518, + "time_per_iteration": 2.8340506553649902 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047384, + "balance_loss_mlp": 1.00001049, + "epoch": 0.29222777991535204, + "flos": 502671222528.0, + "grad_norm": 0.03192476620539529, + "language_loss": 0.87901479, + "learning_rate": 0.0008302678282433479, + "loss": 0.88948864, + "num_input_tokens_seen": 125367712, + "router_z_loss_mlp": 0.47338867, + "step": 1519, + "time_per_iteration": 2.5783281326293945 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048912, + "balance_loss_mlp": 1.00177681, + "epoch": 0.2924201616006156, + "flos": 487842286848.0, + "grad_norm": 0.03491462978028735, + "language_loss": 0.85667795, + "learning_rate": 0.0008300338598399411, + "loss": 0.86716712, + "num_input_tokens_seen": 125437648, + "router_z_loss_mlp": 0.47094727, + "step": 1520, + "time_per_iteration": 2.6763737201690674 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105218, + "balance_loss_mlp": 1.0049969, + "epoch": 0.2926125432858792, + "flos": 477411000576.0, + "grad_norm": 0.036990289889529016, + "language_loss": 0.957196, + "learning_rate": 0.0008297997633060263, + "loss": 0.96771777, + "num_input_tokens_seen": 125502432, + "router_z_loss_mlp": 0.47143555, + "step": 1521, + "time_per_iteration": 2.5368785858154297 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055222, + "balance_loss_mlp": 1.00799167, + "epoch": 0.29280492497114274, + "flos": 677868449280.0, + "grad_norm": 0.0362418142607002, + "language_loss": 0.86058486, + "learning_rate": 0.0008295655387324883, + "loss": 0.87113714, + "num_input_tokens_seen": 125575424, + "router_z_loss_mlp": 0.47192383, + "step": 1522, + "time_per_iteration": 2.8447062969207764 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055265, + "balance_loss_mlp": 1.0079869, + "epoch": 0.29299730665640633, + "flos": 459345071616.0, + "grad_norm": 0.03782463739456531, + "language_loss": 0.86245579, + "learning_rate": 0.0008293311862102609, + "loss": 0.87300849, + "num_input_tokens_seen": 125639040, + "router_z_loss_mlp": 0.47241211, + "step": 1523, + "time_per_iteration": 2.5397908687591553 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050568, + "balance_loss_mlp": 1.00328994, + "epoch": 0.29318968834166986, + "flos": 447496505088.0, + "grad_norm": 0.03500221637525105, + "language_loss": 0.90103561, + "learning_rate": 0.0008290967058303275, + "loss": 0.91154128, + "num_input_tokens_seen": 125701712, + "router_z_loss_mlp": 0.47241211, + "step": 1524, + "time_per_iteration": 2.4784419536590576 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081064, + "balance_loss_mlp": 1.03349924, + "epoch": 0.29338207002693345, + "flos": 451256473344.0, + "grad_norm": 0.038529021386844775, + "language_loss": 0.87365985, + "learning_rate": 0.0008288620976837219, + "loss": 0.88447046, + "num_input_tokens_seen": 125765088, + "router_z_loss_mlp": 0.4753418, + "step": 1525, + "time_per_iteration": 2.540762424468994 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054383, + "balance_loss_mlp": 1.00684249, + "epoch": 0.293574451712197, + "flos": 503285571072.0, + "grad_norm": 0.03477645959362119, + "language_loss": 0.8372373, + "learning_rate": 0.000828627361861527, + "loss": 0.84778112, + "num_input_tokens_seen": 125831328, + "router_z_loss_mlp": 0.47509766, + "step": 1526, + "time_per_iteration": 2.583862066268921 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01058639, + "balance_loss_mlp": 1.01124167, + "epoch": 0.29376683339746057, + "flos": 697684104960.0, + "grad_norm": 0.03858140978476568, + "language_loss": 0.85503912, + "learning_rate": 0.0008283924984548752, + "loss": 0.8656255, + "num_input_tokens_seen": 125903664, + "router_z_loss_mlp": 0.47363281, + "step": 1527, + "time_per_iteration": 2.848947525024414 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054231, + "balance_loss_mlp": 1.00680923, + "epoch": 0.2939592150827241, + "flos": 479542751232.0, + "grad_norm": 0.03208252397749005, + "language_loss": 0.8577444, + "learning_rate": 0.0008281575075549485, + "loss": 0.86828673, + "num_input_tokens_seen": 125971856, + "router_z_loss_mlp": 0.47387695, + "step": 1528, + "time_per_iteration": 2.6076998710632324 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063099, + "balance_loss_mlp": 1.01703644, + "epoch": 0.2941515967679877, + "flos": 1488389507328.0, + "grad_norm": 0.010941905571601225, + "language_loss": 0.77352691, + "learning_rate": 0.000827922389252979, + "loss": 0.78415793, + "num_input_tokens_seen": 126183968, + "router_z_loss_mlp": 0.45996094, + "step": 1529, + "time_per_iteration": 4.672811508178711 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01175133, + "balance_loss_mlp": 1.12690103, + "epoch": 0.2943439784532513, + "flos": 675400361472.0, + "grad_norm": 0.05299717257038309, + "language_loss": 0.90924174, + "learning_rate": 0.0008276871436402469, + "loss": 0.92099309, + "num_input_tokens_seen": 126254448, + "router_z_loss_mlp": 0.48217773, + "step": 1530, + "time_per_iteration": 2.8220977783203125 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010581, + "balance_loss_mlp": 1.01096439, + "epoch": 0.2945363601385148, + "flos": 577383584256.0, + "grad_norm": 0.03620573442946411, + "language_loss": 0.88955015, + "learning_rate": 0.000827451770808083, + "loss": 0.90013111, + "num_input_tokens_seen": 126328208, + "router_z_loss_mlp": 0.47094727, + "step": 1531, + "time_per_iteration": 2.6981046199798584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01057368, + "balance_loss_mlp": 1.01013768, + "epoch": 0.2947287418237784, + "flos": 481618121472.0, + "grad_norm": 0.03382548660060083, + "language_loss": 0.84345412, + "learning_rate": 0.0008272162708478674, + "loss": 0.85402787, + "num_input_tokens_seen": 126396464, + "router_z_loss_mlp": 0.47192383, + "step": 1532, + "time_per_iteration": 2.5975306034088135 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01058676, + "balance_loss_mlp": 1.01151645, + "epoch": 0.2949211235090419, + "flos": 559261274880.0, + "grad_norm": 0.03154442800865326, + "language_loss": 0.87544608, + "learning_rate": 0.000826980643851029, + "loss": 0.88603282, + "num_input_tokens_seen": 126468960, + "router_z_loss_mlp": 0.47119141, + "step": 1533, + "time_per_iteration": 2.6889007091522217 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063865, + "balance_loss_mlp": 1.01675379, + "epoch": 0.2951135051943055, + "flos": 484857060096.0, + "grad_norm": 0.03876668067992812, + "language_loss": 0.85914761, + "learning_rate": 0.0008267448899090464, + "loss": 0.86978626, + "num_input_tokens_seen": 126536496, + "router_z_loss_mlp": 0.47070312, + "step": 1534, + "time_per_iteration": 2.5630924701690674 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062291, + "balance_loss_mlp": 1.01498842, + "epoch": 0.29530588687956905, + "flos": 551422497792.0, + "grad_norm": 0.034923849251574525, + "language_loss": 0.81812191, + "learning_rate": 0.0008265090091134473, + "loss": 0.82874477, + "num_input_tokens_seen": 126614048, + "router_z_loss_mlp": 0.47265625, + "step": 1535, + "time_per_iteration": 2.8399465084075928 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105968, + "balance_loss_mlp": 1.01235437, + "epoch": 0.29549826856483263, + "flos": 674310670080.0, + "grad_norm": 0.028029616611284485, + "language_loss": 0.80873084, + "learning_rate": 0.0008262730015558088, + "loss": 0.81932771, + "num_input_tokens_seen": 126697248, + "router_z_loss_mlp": 0.47290039, + "step": 1536, + "time_per_iteration": 2.874537944793701 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059174, + "balance_loss_mlp": 1.01151371, + "epoch": 0.29569065025009617, + "flos": 766136329728.0, + "grad_norm": 0.03177117147053012, + "language_loss": 0.82803708, + "learning_rate": 0.0008260368673277574, + "loss": 0.83862883, + "num_input_tokens_seen": 126782496, + "router_z_loss_mlp": 0.47631836, + "step": 1537, + "time_per_iteration": 3.0976641178131104 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053728, + "balance_loss_mlp": 1.00573432, + "epoch": 0.29588303193535975, + "flos": 544831859712.0, + "grad_norm": 0.031452220479770684, + "language_loss": 0.84814745, + "learning_rate": 0.0008258006065209682, + "loss": 0.85868478, + "num_input_tokens_seen": 126857328, + "router_z_loss_mlp": 0.47973633, + "step": 1538, + "time_per_iteration": 2.7704694271087646 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115804, + "balance_loss_mlp": 1.06735778, + "epoch": 0.29607541362062334, + "flos": 598146034944.0, + "grad_norm": 0.04896094729194987, + "language_loss": 0.81966412, + "learning_rate": 0.0008255642192271657, + "loss": 0.83082211, + "num_input_tokens_seen": 126932608, + "router_z_loss_mlp": 0.484375, + "step": 1539, + "time_per_iteration": 2.774122714996338 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059901, + "balance_loss_mlp": 1.01219356, + "epoch": 0.29626779530588687, + "flos": 611038606080.0, + "grad_norm": 0.02837345788652225, + "language_loss": 0.84628069, + "learning_rate": 0.0008253277055381241, + "loss": 0.85687971, + "num_input_tokens_seen": 127008928, + "router_z_loss_mlp": 0.47680664, + "step": 1540, + "time_per_iteration": 2.837587833404541 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061212, + "balance_loss_mlp": 1.01340961, + "epoch": 0.29646017699115046, + "flos": 868959025152.0, + "grad_norm": 0.03662488769273821, + "language_loss": 0.86757702, + "learning_rate": 0.0008250910655456658, + "loss": 0.87818909, + "num_input_tokens_seen": 127097104, + "router_z_loss_mlp": 0.4777832, + "step": 1541, + "time_per_iteration": 3.123687982559204 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010574, + "balance_loss_mlp": 1.00954938, + "epoch": 0.296652558676414, + "flos": 496881570816.0, + "grad_norm": 0.03318095479066229, + "language_loss": 0.84889704, + "learning_rate": 0.0008248542993416625, + "loss": 0.85947102, + "num_input_tokens_seen": 127165264, + "router_z_loss_mlp": 0.47827148, + "step": 1542, + "time_per_iteration": 2.637747049331665 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068583, + "balance_loss_mlp": 1.02082753, + "epoch": 0.2968449403616776, + "flos": 572627243520.0, + "grad_norm": 0.03443634648546435, + "language_loss": 0.84426934, + "learning_rate": 0.0008246174070180352, + "loss": 0.8549552, + "num_input_tokens_seen": 127238992, + "router_z_loss_mlp": 0.47729492, + "step": 1543, + "time_per_iteration": 2.6872684955596924 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062899, + "balance_loss_mlp": 1.01511967, + "epoch": 0.2970373220469411, + "flos": 795651304704.0, + "grad_norm": 0.035080805136432934, + "language_loss": 0.85198414, + "learning_rate": 0.0008243803886667537, + "loss": 0.86261314, + "num_input_tokens_seen": 127328160, + "router_z_loss_mlp": 0.47753906, + "step": 1544, + "time_per_iteration": 3.13710618019104 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069403, + "balance_loss_mlp": 1.02145684, + "epoch": 0.2972297037322047, + "flos": 662249220864.0, + "grad_norm": 0.04094703338464919, + "language_loss": 0.80137819, + "learning_rate": 0.0008241432443798364, + "loss": 0.81207222, + "num_input_tokens_seen": 127407328, + "router_z_loss_mlp": 0.47924805, + "step": 1545, + "time_per_iteration": 2.841092109680176 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061565, + "balance_loss_mlp": 1.0138818, + "epoch": 0.29742208541746823, + "flos": 598232550912.0, + "grad_norm": 0.028624248431763765, + "language_loss": 0.86072361, + "learning_rate": 0.0008239059742493512, + "loss": 0.87133932, + "num_input_tokens_seen": 127477136, + "router_z_loss_mlp": 0.4765625, + "step": 1546, + "time_per_iteration": 2.7034194469451904 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01349258, + "balance_loss_mlp": 1.29957151, + "epoch": 0.2976144671027318, + "flos": 771339823104.0, + "grad_norm": 0.07377893489124947, + "language_loss": 0.88059306, + "learning_rate": 0.0008236685783674142, + "loss": 0.89408565, + "num_input_tokens_seen": 127565680, + "router_z_loss_mlp": 0.49584961, + "step": 1547, + "time_per_iteration": 3.063077688217163 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071266, + "balance_loss_mlp": 1.02510834, + "epoch": 0.2978068487879954, + "flos": 1487914164480.0, + "grad_norm": 0.01225569795264997, + "language_loss": 0.76221192, + "learning_rate": 0.0008234310568261911, + "loss": 0.7729246, + "num_input_tokens_seen": 127791584, + "router_z_loss_mlp": 0.4609375, + "step": 1548, + "time_per_iteration": 4.894561767578125 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073136, + "balance_loss_mlp": 1.02564275, + "epoch": 0.29799923047325894, + "flos": 476330057472.0, + "grad_norm": 0.041178192237982324, + "language_loss": 0.84313369, + "learning_rate": 0.0008231934097178955, + "loss": 0.85386503, + "num_input_tokens_seen": 127860112, + "router_z_loss_mlp": 0.47460938, + "step": 1549, + "time_per_iteration": 2.630146026611328 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081209, + "balance_loss_mlp": 1.03362012, + "epoch": 0.2981916121585225, + "flos": 761169051648.0, + "grad_norm": 0.037198017460407115, + "language_loss": 0.86745787, + "learning_rate": 0.0008229556371347903, + "loss": 0.87826997, + "num_input_tokens_seen": 127938752, + "router_z_loss_mlp": 0.47558594, + "step": 1550, + "time_per_iteration": 2.9614980220794678 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081757, + "balance_loss_mlp": 1.03416848, + "epoch": 0.29838399384378606, + "flos": 876517845504.0, + "grad_norm": 0.043512769843104544, + "language_loss": 0.80808616, + "learning_rate": 0.0008227177391691874, + "loss": 0.81890368, + "num_input_tokens_seen": 128022192, + "router_z_loss_mlp": 0.47558594, + "step": 1551, + "time_per_iteration": 3.11059832572937 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081803, + "balance_loss_mlp": 1.03445339, + "epoch": 0.29857637552904964, + "flos": 580752780288.0, + "grad_norm": 0.039547132323558824, + "language_loss": 0.90871334, + "learning_rate": 0.0008224797159134463, + "loss": 0.91953135, + "num_input_tokens_seen": 128097776, + "router_z_loss_mlp": 0.47314453, + "step": 1552, + "time_per_iteration": 2.7177717685699463 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077357, + "balance_loss_mlp": 1.03026903, + "epoch": 0.2987687572143132, + "flos": 837809029632.0, + "grad_norm": 0.03288289742732326, + "language_loss": 0.84735203, + "learning_rate": 0.0008222415674599765, + "loss": 0.85812569, + "num_input_tokens_seen": 128179888, + "router_z_loss_mlp": 0.47045898, + "step": 1553, + "time_per_iteration": 3.090768814086914 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072084, + "balance_loss_mlp": 1.02513897, + "epoch": 0.29896113889957676, + "flos": 568168356096.0, + "grad_norm": 0.03857517262144223, + "language_loss": 0.8489393, + "learning_rate": 0.0008220032939012349, + "loss": 0.85966009, + "num_input_tokens_seen": 128251152, + "router_z_loss_mlp": 0.46899414, + "step": 1554, + "time_per_iteration": 2.7050375938415527 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072322, + "balance_loss_mlp": 1.02554476, + "epoch": 0.29915352058484035, + "flos": 499836662016.0, + "grad_norm": 0.03341170745827686, + "language_loss": 0.89154899, + "learning_rate": 0.0008217648953297277, + "loss": 0.90227222, + "num_input_tokens_seen": 128327600, + "router_z_loss_mlp": 0.46728516, + "step": 1555, + "time_per_iteration": 2.8296022415161133 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106052, + "balance_loss_mlp": 1.01376653, + "epoch": 0.2993459022701039, + "flos": 593215695360.0, + "grad_norm": 0.042418434687241845, + "language_loss": 0.79395097, + "learning_rate": 0.0008215263718380095, + "loss": 0.80455619, + "num_input_tokens_seen": 128398432, + "router_z_loss_mlp": 0.46704102, + "step": 1556, + "time_per_iteration": 2.683760643005371 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02541041, + "balance_loss_mlp": 2.4871583, + "epoch": 0.29953828395536747, + "flos": 573473916672.0, + "grad_norm": 0.19828678552993478, + "language_loss": 0.85491472, + "learning_rate": 0.0008212877235186833, + "loss": 0.88032514, + "num_input_tokens_seen": 128469696, + "router_z_loss_mlp": 0.54003906, + "step": 1557, + "time_per_iteration": 2.6963422298431396 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086143, + "balance_loss_mlp": 1.0413208, + "epoch": 0.299730665640631, + "flos": 1508086566144.0, + "grad_norm": 0.015049722833054002, + "language_loss": 0.77737558, + "learning_rate": 0.0008210489504644005, + "loss": 0.78823709, + "num_input_tokens_seen": 128698560, + "router_z_loss_mlp": 0.44824219, + "step": 1558, + "time_per_iteration": 4.971554279327393 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098273, + "balance_loss_mlp": 1.05063736, + "epoch": 0.2999230473258946, + "flos": 514808494080.0, + "grad_norm": 0.04814176942398931, + "language_loss": 0.82249933, + "learning_rate": 0.0008208100527678611, + "loss": 0.83348203, + "num_input_tokens_seen": 128765952, + "router_z_loss_mlp": 0.47607422, + "step": 1559, + "time_per_iteration": 2.6210360527038574 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01130027, + "balance_loss_mlp": 1.08127058, + "epoch": 0.3001154290111581, + "flos": 835855168512.0, + "grad_norm": 0.05333171316141313, + "language_loss": 0.80031002, + "learning_rate": 0.0008205710305218135, + "loss": 0.81161028, + "num_input_tokens_seen": 128840048, + "router_z_loss_mlp": 0.48730469, + "step": 1560, + "time_per_iteration": 3.0021140575408936 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01168816, + "balance_loss_mlp": 1.11898673, + "epoch": 0.3003078106964217, + "flos": 557946061824.0, + "grad_norm": 0.05314988858528354, + "language_loss": 0.91578549, + "learning_rate": 0.0008203318838190541, + "loss": 0.92747366, + "num_input_tokens_seen": 128912496, + "router_z_loss_mlp": 0.49707031, + "step": 1561, + "time_per_iteration": 2.7369065284729004 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153064, + "balance_loss_mlp": 1.10247147, + "epoch": 0.30050019238168524, + "flos": 527169341952.0, + "grad_norm": 0.047834322975263, + "language_loss": 0.86778915, + "learning_rate": 0.0008200926127524281, + "loss": 0.87931979, + "num_input_tokens_seen": 128980624, + "router_z_loss_mlp": 0.50634766, + "step": 1562, + "time_per_iteration": 2.6357791423797607 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01157108, + "balance_loss_mlp": 1.10565686, + "epoch": 0.3006925740669488, + "flos": 578937924864.0, + "grad_norm": 0.04357261617021945, + "language_loss": 0.84502149, + "learning_rate": 0.0008198532174148289, + "loss": 0.85659254, + "num_input_tokens_seen": 129050576, + "router_z_loss_mlp": 0.51513672, + "step": 1563, + "time_per_iteration": 2.7241976261138916 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097195, + "balance_loss_mlp": 1.04941559, + "epoch": 0.3008849557522124, + "flos": 1493613409536.0, + "grad_norm": 0.019627167679756308, + "language_loss": 0.8068617, + "learning_rate": 0.0008196136978991977, + "loss": 0.8178336, + "num_input_tokens_seen": 129278880, + "router_z_loss_mlp": 0.47753906, + "step": 1564, + "time_per_iteration": 4.851420879364014 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122708, + "balance_loss_mlp": 1.07035148, + "epoch": 0.30107733743747594, + "flos": 510824949504.0, + "grad_norm": 0.045341503179798265, + "language_loss": 0.90611446, + "learning_rate": 0.0008193740542985244, + "loss": 0.91734147, + "num_input_tokens_seen": 129346560, + "router_z_loss_mlp": 0.52441406, + "step": 1565, + "time_per_iteration": 2.62724232673645 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113673, + "balance_loss_mlp": 1.06098223, + "epoch": 0.30126971912273953, + "flos": 588821936640.0, + "grad_norm": 0.04014967632238747, + "language_loss": 0.87587321, + "learning_rate": 0.0008191342867058467, + "loss": 0.88700998, + "num_input_tokens_seen": 129420448, + "router_z_loss_mlp": 0.52783203, + "step": 1566, + "time_per_iteration": 2.766045570373535 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01133038, + "balance_loss_mlp": 1.07991791, + "epoch": 0.30146210080800306, + "flos": 603221216256.0, + "grad_norm": 0.039455426947262194, + "language_loss": 0.84397018, + "learning_rate": 0.0008188943952142509, + "loss": 0.85530061, + "num_input_tokens_seen": 129494032, + "router_z_loss_mlp": 0.53222656, + "step": 1567, + "time_per_iteration": 2.798323154449463 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113428, + "balance_loss_mlp": 1.06030834, + "epoch": 0.30165448249326665, + "flos": 919287973632.0, + "grad_norm": 0.03836627098538091, + "language_loss": 0.83653766, + "learning_rate": 0.0008186543799168711, + "loss": 0.84767193, + "num_input_tokens_seen": 129569088, + "router_z_loss_mlp": 0.53222656, + "step": 1568, + "time_per_iteration": 3.1216585636138916 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112139, + "balance_loss_mlp": 1.0594008, + "epoch": 0.3018468641785302, + "flos": 778631325696.0, + "grad_norm": 0.037681015369085746, + "language_loss": 0.89441907, + "learning_rate": 0.0008184142409068892, + "loss": 0.90554047, + "num_input_tokens_seen": 129647968, + "router_z_loss_mlp": 0.52832031, + "step": 1569, + "time_per_iteration": 2.9987363815307617 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087962, + "balance_loss_mlp": 1.03584409, + "epoch": 0.30203924586379377, + "flos": 523389931776.0, + "grad_norm": 0.031063886155947292, + "language_loss": 0.87584674, + "learning_rate": 0.000818173978277536, + "loss": 0.88672638, + "num_input_tokens_seen": 129718928, + "router_z_loss_mlp": 0.52197266, + "step": 1570, + "time_per_iteration": 2.657801389694214 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092945, + "balance_loss_mlp": 1.04125619, + "epoch": 0.3022316275490573, + "flos": 525649994496.0, + "grad_norm": 0.03542742618693904, + "language_loss": 0.8460654, + "learning_rate": 0.000817933592122089, + "loss": 0.85699487, + "num_input_tokens_seen": 129790128, + "router_z_loss_mlp": 0.51757812, + "step": 1571, + "time_per_iteration": 2.699676752090454 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094536, + "balance_loss_mlp": 1.04289424, + "epoch": 0.3024240092343209, + "flos": 480873515520.0, + "grad_norm": 0.03710559119511486, + "language_loss": 0.84148443, + "learning_rate": 0.0008176930825338749, + "loss": 0.85242975, + "num_input_tokens_seen": 129857536, + "router_z_loss_mlp": 0.51708984, + "step": 1572, + "time_per_iteration": 2.560293197631836 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085585, + "balance_loss_mlp": 1.03446782, + "epoch": 0.3026163909195845, + "flos": 688431938304.0, + "grad_norm": 0.03769478699711506, + "language_loss": 0.89810324, + "learning_rate": 0.0008174524496062679, + "loss": 0.90895915, + "num_input_tokens_seen": 129931440, + "router_z_loss_mlp": 0.51171875, + "step": 1573, + "time_per_iteration": 2.9185256958007812 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083791, + "balance_loss_mlp": 1.03334129, + "epoch": 0.302808772604848, + "flos": 544087253760.0, + "grad_norm": 0.033203995249134796, + "language_loss": 0.86450267, + "learning_rate": 0.0008172116934326894, + "loss": 0.87534058, + "num_input_tokens_seen": 130005200, + "router_z_loss_mlp": 0.50488281, + "step": 1574, + "time_per_iteration": 2.77254056930542 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107482, + "balance_loss_mlp": 1.02456117, + "epoch": 0.3030011542901116, + "flos": 476052046080.0, + "grad_norm": 0.03232260410081742, + "language_loss": 0.88820696, + "learning_rate": 0.0008169708141066097, + "loss": 0.89895517, + "num_input_tokens_seen": 130069136, + "router_z_loss_mlp": 0.50268555, + "step": 1575, + "time_per_iteration": 2.5428524017333984 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083713, + "balance_loss_mlp": 1.03402615, + "epoch": 0.30319353597537513, + "flos": 482473542912.0, + "grad_norm": 0.035261838486320786, + "language_loss": 0.91478366, + "learning_rate": 0.0008167298117215465, + "loss": 0.92562079, + "num_input_tokens_seen": 130135456, + "router_z_loss_mlp": 0.49536133, + "step": 1576, + "time_per_iteration": 2.5388023853302 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064287, + "balance_loss_mlp": 1.0151732, + "epoch": 0.3033859176606387, + "flos": 706113897984.0, + "grad_norm": 0.033895137386355495, + "language_loss": 0.89157575, + "learning_rate": 0.0008164886863710649, + "loss": 0.90221858, + "num_input_tokens_seen": 130213712, + "router_z_loss_mlp": 0.49047852, + "step": 1577, + "time_per_iteration": 2.9326250553131104 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072249, + "balance_loss_mlp": 1.02363503, + "epoch": 0.30357829934590225, + "flos": 766110084864.0, + "grad_norm": 0.03320904121402137, + "language_loss": 0.87079322, + "learning_rate": 0.0008162474381487783, + "loss": 0.88151574, + "num_input_tokens_seen": 130290928, + "router_z_loss_mlp": 0.48608398, + "step": 1578, + "time_per_iteration": 3.0217320919036865 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069823, + "balance_loss_mlp": 1.02135277, + "epoch": 0.30377068103116583, + "flos": 533449887744.0, + "grad_norm": 0.035817825196195696, + "language_loss": 0.854909, + "learning_rate": 0.0008160060671483475, + "loss": 0.86560726, + "num_input_tokens_seen": 130362672, + "router_z_loss_mlp": 0.48461914, + "step": 1579, + "time_per_iteration": 2.6730797290802 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074874, + "balance_loss_mlp": 1.02647483, + "epoch": 0.3039630627164294, + "flos": 511224470016.0, + "grad_norm": 0.04566645575365512, + "language_loss": 0.84833682, + "learning_rate": 0.0008157645734634809, + "loss": 0.85908556, + "num_input_tokens_seen": 130428848, + "router_z_loss_mlp": 0.48388672, + "step": 1580, + "time_per_iteration": 2.5822741985321045 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01186287, + "balance_loss_mlp": 1.14089203, + "epoch": 0.30415544440169295, + "flos": 1509190841856.0, + "grad_norm": 0.045615209750242004, + "language_loss": 0.76896489, + "learning_rate": 0.000815522957187935, + "loss": 0.78082776, + "num_input_tokens_seen": 130665440, + "router_z_loss_mlp": 0.453125, + "step": 1581, + "time_per_iteration": 4.900806665420532 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01157879, + "balance_loss_mlp": 1.11257935, + "epoch": 0.30434782608695654, + "flos": 1461789772800.0, + "grad_norm": 0.04177274485031814, + "language_loss": 0.73214495, + "learning_rate": 0.0008152812184155132, + "loss": 0.74372375, + "num_input_tokens_seen": 130895248, + "router_z_loss_mlp": 0.45214844, + "step": 1582, + "time_per_iteration": 4.890560150146484 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071245, + "balance_loss_mlp": 1.02329922, + "epoch": 0.3045402077722201, + "flos": 483535044096.0, + "grad_norm": 0.03665669352532136, + "language_loss": 0.84926951, + "learning_rate": 0.000815039357240067, + "loss": 0.85998201, + "num_input_tokens_seen": 130964544, + "router_z_loss_mlp": 0.47924805, + "step": 1583, + "time_per_iteration": 2.655641555786133 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075238, + "balance_loss_mlp": 1.02695799, + "epoch": 0.30473258945748366, + "flos": 544627725312.0, + "grad_norm": 0.03699880598765725, + "language_loss": 0.86035675, + "learning_rate": 0.0008147973737554952, + "loss": 0.87110913, + "num_input_tokens_seen": 131041744, + "router_z_loss_mlp": 0.48266602, + "step": 1584, + "time_per_iteration": 2.8118185997009277 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066041, + "balance_loss_mlp": 1.01754665, + "epoch": 0.3049249711427472, + "flos": 568122669312.0, + "grad_norm": 0.039919187148179, + "language_loss": 0.86646891, + "learning_rate": 0.000814555268055744, + "loss": 0.87712932, + "num_input_tokens_seen": 131108864, + "router_z_loss_mlp": 0.48486328, + "step": 1585, + "time_per_iteration": 2.618649482727051 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067734, + "balance_loss_mlp": 1.01926374, + "epoch": 0.3051173528280108, + "flos": 529290398976.0, + "grad_norm": 0.034961032963054674, + "language_loss": 0.88066852, + "learning_rate": 0.0008143130402348073, + "loss": 0.89134592, + "num_input_tokens_seen": 131181104, + "router_z_loss_mlp": 0.48461914, + "step": 1586, + "time_per_iteration": 2.6645073890686035 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064545, + "balance_loss_mlp": 1.01593137, + "epoch": 0.3053097345132743, + "flos": 587600042496.0, + "grad_norm": 0.03198607314396223, + "language_loss": 0.79707628, + "learning_rate": 0.0008140706903867265, + "loss": 0.80772173, + "num_input_tokens_seen": 131258704, + "router_z_loss_mlp": 0.48608398, + "step": 1587, + "time_per_iteration": 2.772688150405884 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065913, + "balance_loss_mlp": 1.01706147, + "epoch": 0.3055021161985379, + "flos": 608201133312.0, + "grad_norm": 0.03820330265300666, + "language_loss": 0.90882033, + "learning_rate": 0.0008138282186055897, + "loss": 0.91947937, + "num_input_tokens_seen": 131325712, + "router_z_loss_mlp": 0.48803711, + "step": 1588, + "time_per_iteration": 2.6824429035186768 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106751, + "balance_loss_mlp": 1.01851535, + "epoch": 0.3056944978838015, + "flos": 574963128576.0, + "grad_norm": 0.03364087196891663, + "language_loss": 0.83419842, + "learning_rate": 0.0008135856249855331, + "loss": 0.84487349, + "num_input_tokens_seen": 131397568, + "router_z_loss_mlp": 0.48950195, + "step": 1589, + "time_per_iteration": 2.6829729080200195 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065757, + "balance_loss_mlp": 1.0164994, + "epoch": 0.305886879569065, + "flos": 635072076288.0, + "grad_norm": 0.036524553871552005, + "language_loss": 0.90591866, + "learning_rate": 0.0008133429096207398, + "loss": 0.91657621, + "num_input_tokens_seen": 131467632, + "router_z_loss_mlp": 0.4909668, + "step": 1590, + "time_per_iteration": 2.7734742164611816 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135399, + "balance_loss_mlp": 1.08351898, + "epoch": 0.3060792612543286, + "flos": 1372133769216.0, + "grad_norm": 0.023040785082221134, + "language_loss": 0.75312257, + "learning_rate": 0.0008131000726054403, + "loss": 0.76447666, + "num_input_tokens_seen": 131702224, + "router_z_loss_mlp": 0.51953125, + "step": 1591, + "time_per_iteration": 4.964044094085693 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106727, + "balance_loss_mlp": 1.01806068, + "epoch": 0.30627164293959214, + "flos": 519619269888.0, + "grad_norm": 0.029618090290997726, + "language_loss": 0.87174189, + "learning_rate": 0.0008128571140339123, + "loss": 0.88241458, + "num_input_tokens_seen": 131774608, + "router_z_loss_mlp": 0.49121094, + "step": 1592, + "time_per_iteration": 2.6813180446624756 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068394, + "balance_loss_mlp": 1.01942289, + "epoch": 0.3064640246248557, + "flos": 456533843712.0, + "grad_norm": 0.02963099688993501, + "language_loss": 0.87551641, + "learning_rate": 0.0008126140340004805, + "loss": 0.88620031, + "num_input_tokens_seen": 131841216, + "router_z_loss_mlp": 0.48876953, + "step": 1593, + "time_per_iteration": 2.5293447971343994 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064923, + "balance_loss_mlp": 1.01580834, + "epoch": 0.30665640631011926, + "flos": 851609511936.0, + "grad_norm": 0.028917997945976257, + "language_loss": 0.82855684, + "learning_rate": 0.0008123708325995172, + "loss": 0.8392061, + "num_input_tokens_seen": 131937584, + "router_z_loss_mlp": 0.49023438, + "step": 1594, + "time_per_iteration": 3.1976583003997803 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068322, + "balance_loss_mlp": 1.01937473, + "epoch": 0.30684878799538284, + "flos": 759616656384.0, + "grad_norm": 0.02786640270256765, + "language_loss": 0.80270225, + "learning_rate": 0.0008121275099254414, + "loss": 0.81338549, + "num_input_tokens_seen": 132012656, + "router_z_loss_mlp": 0.48901367, + "step": 1595, + "time_per_iteration": 2.9073448181152344 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105895, + "balance_loss_mlp": 1.01069379, + "epoch": 0.3070411696806464, + "flos": 518596652544.0, + "grad_norm": 0.02828411740511225, + "language_loss": 0.89261508, + "learning_rate": 0.0008118840660727194, + "loss": 0.90320462, + "num_input_tokens_seen": 132083728, + "router_z_loss_mlp": 0.48242188, + "step": 1596, + "time_per_iteration": 2.6137096881866455 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105679, + "balance_loss_mlp": 1.00855815, + "epoch": 0.30723355136590996, + "flos": 845791670016.0, + "grad_norm": 0.02807637717187332, + "language_loss": 0.8853125, + "learning_rate": 0.0008116405011358644, + "loss": 0.89588046, + "num_input_tokens_seen": 132170896, + "router_z_loss_mlp": 0.48217773, + "step": 1597, + "time_per_iteration": 3.1528680324554443 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059967, + "balance_loss_mlp": 1.01163971, + "epoch": 0.30742593305117355, + "flos": 467079836160.0, + "grad_norm": 0.032917462624290315, + "language_loss": 0.80716425, + "learning_rate": 0.0008113968152094369, + "loss": 0.81776392, + "num_input_tokens_seen": 132234592, + "router_z_loss_mlp": 0.4831543, + "step": 1598, + "time_per_iteration": 2.5390987396240234 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059252, + "balance_loss_mlp": 1.011235, + "epoch": 0.3076183147364371, + "flos": 687817589760.0, + "grad_norm": 0.03298344899906339, + "language_loss": 0.830042, + "learning_rate": 0.0008111530083880438, + "loss": 0.84063458, + "num_input_tokens_seen": 132314720, + "router_z_loss_mlp": 0.47998047, + "step": 1599, + "time_per_iteration": 2.904327154159546 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059695, + "balance_loss_mlp": 1.01170099, + "epoch": 0.30781069642170067, + "flos": 615180598272.0, + "grad_norm": 0.03364515132561045, + "language_loss": 0.86925042, + "learning_rate": 0.0008109090807663399, + "loss": 0.87984729, + "num_input_tokens_seen": 132388768, + "router_z_loss_mlp": 0.47973633, + "step": 1600, + "time_per_iteration": 2.794553756713867 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059845, + "balance_loss_mlp": 1.01206601, + "epoch": 0.3080030781069642, + "flos": 591509710080.0, + "grad_norm": 0.029450986393402313, + "language_loss": 0.89288217, + "learning_rate": 0.0008106650324390257, + "loss": 0.90348059, + "num_input_tokens_seen": 132472544, + "router_z_loss_mlp": 0.47753906, + "step": 1601, + "time_per_iteration": 2.825118064880371 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055341, + "balance_loss_mlp": 1.00744271, + "epoch": 0.3081954597922278, + "flos": 563691972096.0, + "grad_norm": 0.03217567830931305, + "language_loss": 0.82333392, + "learning_rate": 0.0008104208635008493, + "loss": 0.83388734, + "num_input_tokens_seen": 132541968, + "router_z_loss_mlp": 0.47875977, + "step": 1602, + "time_per_iteration": 2.7727856636047363 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01057631, + "balance_loss_mlp": 1.0099231, + "epoch": 0.3083878414774913, + "flos": 448762140672.0, + "grad_norm": 0.03928010080840531, + "language_loss": 0.82422024, + "learning_rate": 0.0008101765740466058, + "loss": 0.83479655, + "num_input_tokens_seen": 132606976, + "router_z_loss_mlp": 0.47680664, + "step": 1603, + "time_per_iteration": 2.5764591693878174 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106031, + "balance_loss_mlp": 1.01272202, + "epoch": 0.3085802231627549, + "flos": 494545685760.0, + "grad_norm": 0.03880240670965016, + "language_loss": 0.84925759, + "learning_rate": 0.0008099321641711364, + "loss": 0.85986066, + "num_input_tokens_seen": 132677984, + "router_z_loss_mlp": 0.47558594, + "step": 1604, + "time_per_iteration": 2.6562154293060303 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059833, + "balance_loss_mlp": 1.01262641, + "epoch": 0.3087726048480185, + "flos": 488690905344.0, + "grad_norm": 0.030963234073246262, + "language_loss": 0.84138477, + "learning_rate": 0.0008096876339693295, + "loss": 0.85198307, + "num_input_tokens_seen": 132749136, + "router_z_loss_mlp": 0.47167969, + "step": 1605, + "time_per_iteration": 2.6818747520446777 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01057926, + "balance_loss_mlp": 1.01083875, + "epoch": 0.308964986533282, + "flos": 731888346624.0, + "grad_norm": 0.03606871420254603, + "language_loss": 0.82584137, + "learning_rate": 0.0008094429835361206, + "loss": 0.83642066, + "num_input_tokens_seen": 132823824, + "router_z_loss_mlp": 0.47045898, + "step": 1606, + "time_per_iteration": 2.940202236175537 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059317, + "balance_loss_mlp": 1.01211011, + "epoch": 0.3091573682185456, + "flos": 606516535296.0, + "grad_norm": 0.033324674351776856, + "language_loss": 0.86802429, + "learning_rate": 0.0008091982129664908, + "loss": 0.87861747, + "num_input_tokens_seen": 132895936, + "router_z_loss_mlp": 0.47167969, + "step": 1607, + "time_per_iteration": 2.7152366638183594 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055819, + "balance_loss_mlp": 1.00858819, + "epoch": 0.30934974990380915, + "flos": 461307681024.0, + "grad_norm": 0.0316485976101594, + "language_loss": 0.83554763, + "learning_rate": 0.0008089533223554687, + "loss": 0.84610581, + "num_input_tokens_seen": 132968960, + "router_z_loss_mlp": 0.47192383, + "step": 1608, + "time_per_iteration": 2.73236083984375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054105, + "balance_loss_mlp": 1.00692201, + "epoch": 0.30954213158907273, + "flos": 554568117504.0, + "grad_norm": 0.03240022060424308, + "language_loss": 0.85798776, + "learning_rate": 0.0008087083117981294, + "loss": 0.86852884, + "num_input_tokens_seen": 133048448, + "router_z_loss_mlp": 0.47143555, + "step": 1609, + "time_per_iteration": 2.8992979526519775 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052885, + "balance_loss_mlp": 1.00543988, + "epoch": 0.30973451327433627, + "flos": 554114161920.0, + "grad_norm": 0.03509024741452312, + "language_loss": 0.88937026, + "learning_rate": 0.0008084631813895943, + "loss": 0.89989913, + "num_input_tokens_seen": 133121680, + "router_z_loss_mlp": 0.47412109, + "step": 1610, + "time_per_iteration": 2.8113343715667725 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104915, + "balance_loss_mlp": 1.00168037, + "epoch": 0.30992689495959985, + "flos": 566763714816.0, + "grad_norm": 0.03310460584308608, + "language_loss": 0.8446725, + "learning_rate": 0.0008082179312250315, + "loss": 0.85516399, + "num_input_tokens_seen": 133190176, + "router_z_loss_mlp": 0.47436523, + "step": 1611, + "time_per_iteration": 2.6286494731903076 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146973, + "balance_loss_mlp": 1.09509277, + "epoch": 0.3101192766448634, + "flos": 1445562998784.0, + "grad_norm": 0.022501740699277736, + "language_loss": 0.79855847, + "learning_rate": 0.0008079725613996555, + "loss": 0.8100282, + "num_input_tokens_seen": 133420512, + "router_z_loss_mlp": 0.51953125, + "step": 1612, + "time_per_iteration": 4.877255439758301 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01132568, + "balance_loss_mlp": 1.08087921, + "epoch": 0.31031165833012697, + "flos": 1535130541056.0, + "grad_norm": 0.020576462480935535, + "language_loss": 0.76629329, + "learning_rate": 0.0008077270720087273, + "loss": 0.777619, + "num_input_tokens_seen": 133651984, + "router_z_loss_mlp": 0.51757812, + "step": 1613, + "time_per_iteration": 5.064774751663208 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050941, + "balance_loss_mlp": 1.00363839, + "epoch": 0.31050404001539056, + "flos": 993633862656.0, + "grad_norm": 0.03245007970491877, + "language_loss": 0.83116508, + "learning_rate": 0.0008074814631475545, + "loss": 0.84167451, + "num_input_tokens_seen": 133741648, + "router_z_loss_mlp": 0.47265625, + "step": 1614, + "time_per_iteration": 3.322155714035034 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054745, + "balance_loss_mlp": 1.00741875, + "epoch": 0.3106964217006541, + "flos": 446973530112.0, + "grad_norm": 0.03235075185089818, + "language_loss": 0.80034411, + "learning_rate": 0.0008072357349114907, + "loss": 0.81089151, + "num_input_tokens_seen": 133813344, + "router_z_loss_mlp": 0.47290039, + "step": 1615, + "time_per_iteration": 2.699772596359253 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01056016, + "balance_loss_mlp": 1.00880885, + "epoch": 0.3108888033859177, + "flos": 511495678464.0, + "grad_norm": 0.0340106704308988, + "language_loss": 0.89603639, + "learning_rate": 0.0008069898873959363, + "loss": 0.90659654, + "num_input_tokens_seen": 133884192, + "router_z_loss_mlp": 0.47167969, + "step": 1616, + "time_per_iteration": 2.680640459060669 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051359, + "balance_loss_mlp": 1.0043664, + "epoch": 0.3110811850711812, + "flos": 521779210752.0, + "grad_norm": 0.029395602971080924, + "language_loss": 0.86344647, + "learning_rate": 0.0008067439206963375, + "loss": 0.87396008, + "num_input_tokens_seen": 133954848, + "router_z_loss_mlp": 0.46948242, + "step": 1617, + "time_per_iteration": 2.6484971046447754 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055386, + "balance_loss_mlp": 1.00844121, + "epoch": 0.3112735667564448, + "flos": 687731073792.0, + "grad_norm": 0.03406090033110643, + "language_loss": 0.87673247, + "learning_rate": 0.0008064978349081873, + "loss": 0.88728631, + "num_input_tokens_seen": 134031824, + "router_z_loss_mlp": 0.46899414, + "step": 1618, + "time_per_iteration": 2.92702579498291 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01056578, + "balance_loss_mlp": 1.00965679, + "epoch": 0.31146594844170833, + "flos": 534166303488.0, + "grad_norm": 0.030256910717709223, + "language_loss": 0.87292403, + "learning_rate": 0.0008062516301270245, + "loss": 0.88348979, + "num_input_tokens_seen": 134104480, + "router_z_loss_mlp": 0.46875, + "step": 1619, + "time_per_iteration": 2.7301478385925293 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055492, + "balance_loss_mlp": 1.00859511, + "epoch": 0.3116583301269719, + "flos": 680842982400.0, + "grad_norm": 0.027867683897015817, + "language_loss": 0.88937479, + "learning_rate": 0.0008060053064484343, + "loss": 0.89992964, + "num_input_tokens_seen": 134185632, + "router_z_loss_mlp": 0.46850586, + "step": 1620, + "time_per_iteration": 2.947906017303467 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048804, + "balance_loss_mlp": 1.00202632, + "epoch": 0.31185071181223545, + "flos": 587330779392.0, + "grad_norm": 0.03167203134142694, + "language_loss": 0.86095911, + "learning_rate": 0.0008057588639680482, + "loss": 0.87144709, + "num_input_tokens_seen": 134261600, + "router_z_loss_mlp": 0.46728516, + "step": 1621, + "time_per_iteration": 2.7836551666259766 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104944, + "balance_loss_mlp": 1.00282919, + "epoch": 0.31204309349749904, + "flos": 726658608384.0, + "grad_norm": 0.037979301866738396, + "language_loss": 0.83855367, + "learning_rate": 0.0008055123027815434, + "loss": 0.84904802, + "num_input_tokens_seen": 134334368, + "router_z_loss_mlp": 0.46557617, + "step": 1622, + "time_per_iteration": 2.9263358116149902 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051144, + "balance_loss_mlp": 1.00455689, + "epoch": 0.3122354751827626, + "flos": 577895865600.0, + "grad_norm": 0.032507776226150094, + "language_loss": 0.85607505, + "learning_rate": 0.0008052656229846436, + "loss": 0.86658645, + "num_input_tokens_seen": 134403824, + "router_z_loss_mlp": 0.46533203, + "step": 1623, + "time_per_iteration": 2.662386894226074 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051831, + "balance_loss_mlp": 1.00514877, + "epoch": 0.31242785686802615, + "flos": 577029750528.0, + "grad_norm": 0.03513403942618559, + "language_loss": 0.91195071, + "learning_rate": 0.0008050188246731182, + "loss": 0.92246902, + "num_input_tokens_seen": 134471296, + "router_z_loss_mlp": 0.46630859, + "step": 1624, + "time_per_iteration": 2.710176467895508 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052884, + "balance_loss_mlp": 1.00624907, + "epoch": 0.31262023855328974, + "flos": 738197082624.0, + "grad_norm": 0.0324646036152644, + "language_loss": 0.82931978, + "learning_rate": 0.0008047719079427834, + "loss": 0.83984858, + "num_input_tokens_seen": 134551360, + "router_z_loss_mlp": 0.46582031, + "step": 1625, + "time_per_iteration": 2.970287561416626 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082729, + "balance_loss_mlp": 1.03533173, + "epoch": 0.3128126202385533, + "flos": 1562594445312.0, + "grad_norm": 0.01743050972952843, + "language_loss": 0.74351704, + "learning_rate": 0.0008045248728895, + "loss": 0.75434434, + "num_input_tokens_seen": 134761328, + "router_z_loss_mlp": 0.47363281, + "step": 1626, + "time_per_iteration": 4.816533088684082 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053999, + "balance_loss_mlp": 1.0071733, + "epoch": 0.31300500192381686, + "flos": 515943872256.0, + "grad_norm": 0.030770809254638827, + "language_loss": 0.86711371, + "learning_rate": 0.0008042777196091757, + "loss": 0.87765372, + "num_input_tokens_seen": 134833136, + "router_z_loss_mlp": 0.46777344, + "step": 1627, + "time_per_iteration": 2.7191882133483887 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01057531, + "balance_loss_mlp": 1.01072919, + "epoch": 0.3131973836090804, + "flos": 527662181376.0, + "grad_norm": 0.031150181208545357, + "language_loss": 0.82488692, + "learning_rate": 0.0008040304481977643, + "loss": 0.83546221, + "num_input_tokens_seen": 134904352, + "router_z_loss_mlp": 0.4675293, + "step": 1628, + "time_per_iteration": 2.706782579421997 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01057336, + "balance_loss_mlp": 1.01065385, + "epoch": 0.313389765294344, + "flos": 824210736384.0, + "grad_norm": 0.032636383561425994, + "language_loss": 0.87568998, + "learning_rate": 0.0008037830587512649, + "loss": 0.88626337, + "num_input_tokens_seen": 134984880, + "router_z_loss_mlp": 0.46630859, + "step": 1629, + "time_per_iteration": 3.0928542613983154 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054937, + "balance_loss_mlp": 1.00820696, + "epoch": 0.31358214697960757, + "flos": 394703359488.0, + "grad_norm": 0.03241768310332359, + "language_loss": 0.79631239, + "learning_rate": 0.0008035355513657224, + "loss": 0.80686176, + "num_input_tokens_seen": 135047456, + "router_z_loss_mlp": 0.46679688, + "step": 1630, + "time_per_iteration": 2.449666738510132 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054798, + "balance_loss_mlp": 1.00806797, + "epoch": 0.3137745286648711, + "flos": 573098695680.0, + "grad_norm": 0.0293939817515363, + "language_loss": 0.93494189, + "learning_rate": 0.0008032879261372279, + "loss": 0.94548988, + "num_input_tokens_seen": 135124256, + "router_z_loss_mlp": 0.46679688, + "step": 1631, + "time_per_iteration": 2.766951084136963 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068432, + "balance_loss_mlp": 1.02256012, + "epoch": 0.3139669103501347, + "flos": 1501632021504.0, + "grad_norm": 0.011791019456215185, + "language_loss": 0.79635841, + "learning_rate": 0.0008030401831619178, + "loss": 0.80704272, + "num_input_tokens_seen": 135353024, + "router_z_loss_mlp": 0.45800781, + "step": 1632, + "time_per_iteration": 5.585620403289795 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050843, + "balance_loss_mlp": 1.00425589, + "epoch": 0.3141592920353982, + "flos": 526359607296.0, + "grad_norm": 0.030163528949794682, + "language_loss": 0.87607086, + "learning_rate": 0.0008027923225359748, + "loss": 0.88657928, + "num_input_tokens_seen": 135422464, + "router_z_loss_mlp": 0.46533203, + "step": 1633, + "time_per_iteration": 2.607407808303833 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105517, + "balance_loss_mlp": 1.0084641, + "epoch": 0.3143516737206618, + "flos": 594388012032.0, + "grad_norm": 0.030785944321789945, + "language_loss": 0.88644683, + "learning_rate": 0.0008025443443556267, + "loss": 0.89699847, + "num_input_tokens_seen": 135490928, + "router_z_loss_mlp": 0.46655273, + "step": 1634, + "time_per_iteration": 2.704568862915039 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053981, + "balance_loss_mlp": 1.00756085, + "epoch": 0.31454405540592534, + "flos": 649680347904.0, + "grad_norm": 0.028625636333363444, + "language_loss": 0.88813668, + "learning_rate": 0.000802296248717147, + "loss": 0.89867646, + "num_input_tokens_seen": 135576288, + "router_z_loss_mlp": 0.46362305, + "step": 1635, + "time_per_iteration": 2.914228916168213 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051205, + "balance_loss_mlp": 1.00461841, + "epoch": 0.3147364370911889, + "flos": 644070531072.0, + "grad_norm": 0.032412817231273386, + "language_loss": 0.79727387, + "learning_rate": 0.0008020480357168554, + "loss": 0.80778593, + "num_input_tokens_seen": 135652320, + "router_z_loss_mlp": 0.46533203, + "step": 1636, + "time_per_iteration": 2.8196966648101807 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051741, + "balance_loss_mlp": 1.00505865, + "epoch": 0.31492881877645246, + "flos": 472821855744.0, + "grad_norm": 0.028828485286514015, + "language_loss": 0.88662213, + "learning_rate": 0.0008017997054511165, + "loss": 0.89713949, + "num_input_tokens_seen": 135719632, + "router_z_loss_mlp": 0.46630859, + "step": 1637, + "time_per_iteration": 2.6545960903167725 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051567, + "balance_loss_mlp": 1.00486124, + "epoch": 0.31512120046171604, + "flos": 630630685440.0, + "grad_norm": 0.03463883423234526, + "language_loss": 0.86238796, + "learning_rate": 0.0008015512580163407, + "loss": 0.87290359, + "num_input_tokens_seen": 135796544, + "router_z_loss_mlp": 0.46655273, + "step": 1638, + "time_per_iteration": 2.775726795196533 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050883, + "balance_loss_mlp": 1.00429583, + "epoch": 0.31531358214697963, + "flos": 705054342144.0, + "grad_norm": 0.0328972983749375, + "language_loss": 0.81582069, + "learning_rate": 0.0008013026935089838, + "loss": 0.82632947, + "num_input_tokens_seen": 135871344, + "router_z_loss_mlp": 0.46533203, + "step": 1639, + "time_per_iteration": 2.859405040740967 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048793, + "balance_loss_mlp": 1.00182474, + "epoch": 0.31550596383224316, + "flos": 573632364288.0, + "grad_norm": 0.03266078051512415, + "language_loss": 0.84787768, + "learning_rate": 0.0008010540120255472, + "loss": 0.85836554, + "num_input_tokens_seen": 135944320, + "router_z_loss_mlp": 0.46923828, + "step": 1640, + "time_per_iteration": 2.654087781906128 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051523, + "balance_loss_mlp": 1.00457835, + "epoch": 0.31569834551750675, + "flos": 659513815296.0, + "grad_norm": 0.0373471738494659, + "language_loss": 0.87093472, + "learning_rate": 0.0008008052136625774, + "loss": 0.88144994, + "num_input_tokens_seen": 136019456, + "router_z_loss_mlp": 0.46899414, + "step": 1641, + "time_per_iteration": 2.7806570529937744 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054228, + "balance_loss_mlp": 1.00730693, + "epoch": 0.3158907272027703, + "flos": 567404308224.0, + "grad_norm": 0.028103315573088077, + "language_loss": 0.87394774, + "learning_rate": 0.0008005562985166666, + "loss": 0.88449007, + "num_input_tokens_seen": 136091232, + "router_z_loss_mlp": 0.46875, + "step": 1642, + "time_per_iteration": 2.6866798400878906 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053754, + "balance_loss_mlp": 1.00699973, + "epoch": 0.31608310888803387, + "flos": 537973903872.0, + "grad_norm": 0.024374019828786602, + "language_loss": 0.85555339, + "learning_rate": 0.0008003072666844524, + "loss": 0.86609089, + "num_input_tokens_seen": 136165088, + "router_z_loss_mlp": 0.46704102, + "step": 1643, + "time_per_iteration": 2.684518337249756 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055077, + "balance_loss_mlp": 1.00856149, + "epoch": 0.3162754905732974, + "flos": 487640097792.0, + "grad_norm": 0.037314537224785074, + "language_loss": 0.8350842, + "learning_rate": 0.0008000581182626173, + "loss": 0.84563494, + "num_input_tokens_seen": 136230368, + "router_z_loss_mlp": 0.46459961, + "step": 1644, + "time_per_iteration": 2.5574259757995605 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051572, + "balance_loss_mlp": 1.00481844, + "epoch": 0.316467872258561, + "flos": 531096506112.0, + "grad_norm": 0.03327277300757214, + "language_loss": 0.87005818, + "learning_rate": 0.0007998088533478894, + "loss": 0.88057387, + "num_input_tokens_seen": 136302512, + "router_z_loss_mlp": 0.46704102, + "step": 1645, + "time_per_iteration": 2.6987338066101074 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055532, + "balance_loss_mlp": 1.00894499, + "epoch": 0.3166602539438245, + "flos": 444414068736.0, + "grad_norm": 0.040202418156990175, + "language_loss": 0.85042381, + "learning_rate": 0.000799559472037042, + "loss": 0.8609792, + "num_input_tokens_seen": 136368064, + "router_z_loss_mlp": 0.46533203, + "step": 1646, + "time_per_iteration": 2.6219563484191895 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01056055, + "balance_loss_mlp": 1.00958765, + "epoch": 0.3168526356290881, + "flos": 647103389952.0, + "grad_norm": 0.026601574185044653, + "language_loss": 0.8823331, + "learning_rate": 0.0007993099744268932, + "loss": 0.89289367, + "num_input_tokens_seen": 136451520, + "router_z_loss_mlp": 0.46411133, + "step": 1647, + "time_per_iteration": 2.8902037143707275 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054808, + "balance_loss_mlp": 1.00817358, + "epoch": 0.3170450173143517, + "flos": 587258847744.0, + "grad_norm": 0.03281471441230887, + "language_loss": 0.8855083, + "learning_rate": 0.000799060360614307, + "loss": 0.89605635, + "num_input_tokens_seen": 136521184, + "router_z_loss_mlp": 0.46582031, + "step": 1648, + "time_per_iteration": 2.694293975830078 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055827, + "balance_loss_mlp": 1.00945473, + "epoch": 0.3172373989996152, + "flos": 828574359552.0, + "grad_norm": 0.03046931045185914, + "language_loss": 0.84284711, + "learning_rate": 0.0007988106306961917, + "loss": 0.85340536, + "num_input_tokens_seen": 136612592, + "router_z_loss_mlp": 0.46313477, + "step": 1649, + "time_per_iteration": 3.121788501739502 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01058474, + "balance_loss_mlp": 1.01195896, + "epoch": 0.3174297806848788, + "flos": 528434977536.0, + "grad_norm": 0.03563880571664149, + "language_loss": 0.85299373, + "learning_rate": 0.0007985607847695014, + "loss": 0.8635785, + "num_input_tokens_seen": 136684336, + "router_z_loss_mlp": 0.46459961, + "step": 1650, + "time_per_iteration": 2.625356912612915 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047323, + "balance_loss_mlp": 1.00107014, + "epoch": 0.31762216237014235, + "flos": 714482452992.0, + "grad_norm": 0.030498079123472206, + "language_loss": 0.83133662, + "learning_rate": 0.0007983108229312345, + "loss": 0.84180987, + "num_input_tokens_seen": 136766400, + "router_z_loss_mlp": 0.46191406, + "step": 1651, + "time_per_iteration": 2.894109010696411 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049755, + "balance_loss_mlp": 1.00362098, + "epoch": 0.31781454405540593, + "flos": 484800679680.0, + "grad_norm": 0.03387492306443982, + "language_loss": 0.86931884, + "learning_rate": 0.0007980607452784351, + "loss": 0.87981641, + "num_input_tokens_seen": 136834016, + "router_z_loss_mlp": 0.46069336, + "step": 1652, + "time_per_iteration": 2.5593390464782715 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048407, + "balance_loss_mlp": 1.00236845, + "epoch": 0.31800692574066947, + "flos": 549804973824.0, + "grad_norm": 0.04030851184116312, + "language_loss": 0.90997875, + "learning_rate": 0.0007978105519081919, + "loss": 0.92046285, + "num_input_tokens_seen": 136906288, + "router_z_loss_mlp": 0.4597168, + "step": 1653, + "time_per_iteration": 2.683809995651245 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045957, + "balance_loss_mlp": 0.99982309, + "epoch": 0.31819930742593305, + "flos": 517917175296.0, + "grad_norm": 0.033294821801319624, + "language_loss": 0.88831019, + "learning_rate": 0.0007975602429176385, + "loss": 0.89876974, + "num_input_tokens_seen": 136972416, + "router_z_loss_mlp": 0.46069336, + "step": 1654, + "time_per_iteration": 2.5786075592041016 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104835, + "balance_loss_mlp": 1.00238276, + "epoch": 0.31839168911119664, + "flos": 456970302720.0, + "grad_norm": 0.028947480678153642, + "language_loss": 0.82318926, + "learning_rate": 0.0007973098184039536, + "loss": 0.83367276, + "num_input_tokens_seen": 137044576, + "router_z_loss_mlp": 0.45898438, + "step": 1655, + "time_per_iteration": 2.651188611984253 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010453, + "balance_loss_mlp": 0.99921381, + "epoch": 0.3185840707964602, + "flos": 627296482560.0, + "grad_norm": 0.03276090001573999, + "language_loss": 0.8731916, + "learning_rate": 0.0007970592784643602, + "loss": 0.88364458, + "num_input_tokens_seen": 137125120, + "router_z_loss_mlp": 0.46020508, + "step": 1656, + "time_per_iteration": 2.8683595657348633 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045873, + "balance_loss_mlp": 0.99976265, + "epoch": 0.31877645248172376, + "flos": 568541631744.0, + "grad_norm": 0.035945607337745746, + "language_loss": 0.85986471, + "learning_rate": 0.0007968086231961272, + "loss": 0.87032342, + "num_input_tokens_seen": 137195344, + "router_z_loss_mlp": 0.46044922, + "step": 1657, + "time_per_iteration": 2.642733335494995 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047205, + "balance_loss_mlp": 1.00119007, + "epoch": 0.3189688341669873, + "flos": 490553392896.0, + "grad_norm": 0.04377426906704287, + "language_loss": 0.84065533, + "learning_rate": 0.0007965578526965671, + "loss": 0.85112733, + "num_input_tokens_seen": 137261040, + "router_z_loss_mlp": 0.45947266, + "step": 1658, + "time_per_iteration": 2.5638930797576904 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049099, + "balance_loss_mlp": 1.00291717, + "epoch": 0.3191612158522509, + "flos": 577381638912.0, + "grad_norm": 0.02931224295785387, + "language_loss": 0.86766565, + "learning_rate": 0.0007963069670630377, + "loss": 0.87815666, + "num_input_tokens_seen": 137334400, + "router_z_loss_mlp": 0.46118164, + "step": 1659, + "time_per_iteration": 2.7154479026794434 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051177, + "balance_loss_mlp": 1.00506639, + "epoch": 0.3193535975375144, + "flos": 539193852672.0, + "grad_norm": 0.03496177903686506, + "language_loss": 0.88776976, + "learning_rate": 0.0007960559663929416, + "loss": 0.89828151, + "num_input_tokens_seen": 137405344, + "router_z_loss_mlp": 0.46044922, + "step": 1660, + "time_per_iteration": 2.6322021484375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054675, + "balance_loss_mlp": 1.00868368, + "epoch": 0.319545979222778, + "flos": 735628872960.0, + "grad_norm": 0.030221795014758104, + "language_loss": 0.88154632, + "learning_rate": 0.0007958048507837259, + "loss": 0.89209306, + "num_input_tokens_seen": 137486016, + "router_z_loss_mlp": 0.45922852, + "step": 1661, + "time_per_iteration": 2.9221389293670654 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105245, + "balance_loss_mlp": 1.00648332, + "epoch": 0.31973836090804153, + "flos": 765768890112.0, + "grad_norm": 0.037416739988226255, + "language_loss": 0.87668484, + "learning_rate": 0.0007955536203328822, + "loss": 0.88720942, + "num_input_tokens_seen": 137562304, + "router_z_loss_mlp": 0.45898438, + "step": 1662, + "time_per_iteration": 2.9018445014953613 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048244, + "balance_loss_mlp": 1.00184774, + "epoch": 0.3199307425933051, + "flos": 561742968576.0, + "grad_norm": 0.03025687936293395, + "language_loss": 0.84124553, + "learning_rate": 0.0007953022751379469, + "loss": 0.85172796, + "num_input_tokens_seen": 137639248, + "router_z_loss_mlp": 0.46337891, + "step": 1663, + "time_per_iteration": 2.781562566757202 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085701, + "balance_loss_mlp": 1.03906643, + "epoch": 0.3201231242785687, + "flos": 752672184576.0, + "grad_norm": 0.03881407073457837, + "language_loss": 0.82717097, + "learning_rate": 0.000795050815296501, + "loss": 0.83802795, + "num_input_tokens_seen": 137718256, + "router_z_loss_mlp": 0.46582031, + "step": 1664, + "time_per_iteration": 2.9950287342071533 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050837, + "balance_loss_mlp": 1.00446498, + "epoch": 0.32031550596383224, + "flos": 497385103872.0, + "grad_norm": 0.02713287522590179, + "language_loss": 0.93810016, + "learning_rate": 0.0007947992409061695, + "loss": 0.94860852, + "num_input_tokens_seen": 137785216, + "router_z_loss_mlp": 0.46313477, + "step": 1665, + "time_per_iteration": 2.583118438720703 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01056564, + "balance_loss_mlp": 1.01045382, + "epoch": 0.3205078876490958, + "flos": 732875970816.0, + "grad_norm": 0.03263285268561658, + "language_loss": 0.86165506, + "learning_rate": 0.0007945475520646226, + "loss": 0.8722207, + "num_input_tokens_seen": 137863424, + "router_z_loss_mlp": 0.46044922, + "step": 1666, + "time_per_iteration": 2.903190851211548 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059471, + "balance_loss_mlp": 1.01324141, + "epoch": 0.32070026933435936, + "flos": 550475702784.0, + "grad_norm": 0.03801033406135743, + "language_loss": 0.85650241, + "learning_rate": 0.0007942957488695743, + "loss": 0.86709714, + "num_input_tokens_seen": 137930384, + "router_z_loss_mlp": 0.46166992, + "step": 1667, + "time_per_iteration": 2.661292791366577 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059024, + "balance_loss_mlp": 1.01277089, + "epoch": 0.32089265101962294, + "flos": 746685201408.0, + "grad_norm": 0.031638418068872444, + "language_loss": 0.81749988, + "learning_rate": 0.0007940438314187833, + "loss": 0.82809013, + "num_input_tokens_seen": 138017200, + "router_z_loss_mlp": 0.46191406, + "step": 1668, + "time_per_iteration": 3.0293474197387695 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01057511, + "balance_loss_mlp": 1.01144862, + "epoch": 0.3210850327048865, + "flos": 495196972800.0, + "grad_norm": 0.034120041175176606, + "language_loss": 0.81371748, + "learning_rate": 0.0007937917998100529, + "loss": 0.82429266, + "num_input_tokens_seen": 138084048, + "router_z_loss_mlp": 0.45996094, + "step": 1669, + "time_per_iteration": 2.5822434425354004 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.08258255, + "balance_loss_mlp": 8.0, + "epoch": 0.32127741439015006, + "flos": 531673916160.0, + "grad_norm": 0.043058724234977634, + "language_loss": 0.81425405, + "learning_rate": 0.0007935396541412302, + "loss": 0.89683664, + "num_input_tokens_seen": 138153280, + "router_z_loss_mlp": 2.58203125, + "step": 1670, + "time_per_iteration": 2.5968360900878906 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0830899, + "balance_loss_mlp": 8.0, + "epoch": 0.3214697960754136, + "flos": 502224069888.0, + "grad_norm": 0.0363513778225316, + "language_loss": 0.87401152, + "learning_rate": 0.0007932873945102068, + "loss": 0.9571014, + "num_input_tokens_seen": 138222320, + "router_z_loss_mlp": 3.0859375, + "step": 1671, + "time_per_iteration": 2.582617998123169 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.08312805, + "balance_loss_mlp": 8.0, + "epoch": 0.3216621777606772, + "flos": 1386404736768.0, + "grad_norm": 0.003686648730821959, + "language_loss": 0.75761777, + "learning_rate": 0.0007930350210149188, + "loss": 0.84074581, + "num_input_tokens_seen": 138449488, + "router_z_loss_mlp": 3.125, + "step": 1672, + "time_per_iteration": 4.829998970031738 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.08311279, + "balance_loss_mlp": 8.0, + "epoch": 0.32185455944594077, + "flos": 572635991808.0, + "grad_norm": 0.030782594356869853, + "language_loss": 0.88089788, + "learning_rate": 0.0007927825337533461, + "loss": 0.96401072, + "num_input_tokens_seen": 138522496, + "router_z_loss_mlp": 3.109375, + "step": 1673, + "time_per_iteration": 2.6633598804473877 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.08310516, + "balance_loss_mlp": 8.0, + "epoch": 0.3220469411312043, + "flos": 544937817600.0, + "grad_norm": 0.040711103761993876, + "language_loss": 0.86732781, + "learning_rate": 0.0007925299328235131, + "loss": 0.95043296, + "num_input_tokens_seen": 138590096, + "router_z_loss_mlp": 3.1015625, + "step": 1674, + "time_per_iteration": 2.634169578552246 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.08307083, + "balance_loss_mlp": 8.0, + "epoch": 0.3222393228164679, + "flos": 492162168576.0, + "grad_norm": 0.03938689136463286, + "language_loss": 0.86802006, + "learning_rate": 0.000792277218323488, + "loss": 0.95109081, + "num_input_tokens_seen": 138658224, + "router_z_loss_mlp": 3.06640625, + "step": 1675, + "time_per_iteration": 2.5893990993499756 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.08270843, + "balance_loss_mlp": 8.0, + "epoch": 0.3224317045017314, + "flos": 491363127552.0, + "grad_norm": 0.03386575094399551, + "language_loss": 0.86165106, + "learning_rate": 0.0007920243903513833, + "loss": 0.94435954, + "num_input_tokens_seen": 138722864, + "router_z_loss_mlp": 2.7109375, + "step": 1676, + "time_per_iteration": 2.5602426528930664 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02321873, + "balance_loss_mlp": 2.26942062, + "epoch": 0.322624086186995, + "flos": 576871302912.0, + "grad_norm": 0.12910494226103245, + "language_loss": 0.85448408, + "learning_rate": 0.0007917714490053556, + "loss": 0.87770277, + "num_input_tokens_seen": 138791472, + "router_z_loss_mlp": 0.52539062, + "step": 1677, + "time_per_iteration": 2.6558380126953125 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071536, + "balance_loss_mlp": 1.02492559, + "epoch": 0.32281646787225854, + "flos": 630572359680.0, + "grad_norm": 0.04049679721352166, + "language_loss": 0.87627459, + "learning_rate": 0.0007915183943836055, + "loss": 0.88698995, + "num_input_tokens_seen": 138873424, + "router_z_loss_mlp": 0.46557617, + "step": 1678, + "time_per_iteration": 2.898658037185669 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072761, + "balance_loss_mlp": 1.02631712, + "epoch": 0.3230088495575221, + "flos": 782808311040.0, + "grad_norm": 0.04272749105284559, + "language_loss": 0.85738349, + "learning_rate": 0.0007912652265843773, + "loss": 0.86811107, + "num_input_tokens_seen": 138956880, + "router_z_loss_mlp": 0.46386719, + "step": 1679, + "time_per_iteration": 3.049938917160034 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082142, + "balance_loss_mlp": 1.03557873, + "epoch": 0.3232012312427857, + "flos": 537201107712.0, + "grad_norm": 0.04201967602882564, + "language_loss": 0.83624417, + "learning_rate": 0.0007910119457059597, + "loss": 0.84706557, + "num_input_tokens_seen": 139031296, + "router_z_loss_mlp": 0.46508789, + "step": 1680, + "time_per_iteration": 2.7126853466033936 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108585, + "balance_loss_mlp": 1.03895342, + "epoch": 0.32339361292804925, + "flos": 706233461760.0, + "grad_norm": 0.044345030126194285, + "language_loss": 0.81981564, + "learning_rate": 0.0007907585518466849, + "loss": 0.83067411, + "num_input_tokens_seen": 139109776, + "router_z_loss_mlp": 0.46850586, + "step": 1681, + "time_per_iteration": 2.9758992195129395 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088847, + "balance_loss_mlp": 1.0419023, + "epoch": 0.32358599461331283, + "flos": 453257966592.0, + "grad_norm": 0.04210474159896445, + "language_loss": 0.91257876, + "learning_rate": 0.000790505045104929, + "loss": 0.92346722, + "num_input_tokens_seen": 139174736, + "router_z_loss_mlp": 0.46899414, + "step": 1682, + "time_per_iteration": 2.5105395317077637 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090368, + "balance_loss_mlp": 1.04337561, + "epoch": 0.32377837629857636, + "flos": 602092641024.0, + "grad_norm": 0.04465728550727914, + "language_loss": 0.88834655, + "learning_rate": 0.0007902514255791125, + "loss": 0.89925027, + "num_input_tokens_seen": 139252064, + "router_z_loss_mlp": 0.46948242, + "step": 1683, + "time_per_iteration": 2.7610387802124023 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089116, + "balance_loss_mlp": 1.04190934, + "epoch": 0.32397075798383995, + "flos": 808899654912.0, + "grad_norm": 0.04108658803287063, + "language_loss": 0.89801908, + "learning_rate": 0.0007899976933676986, + "loss": 0.90891027, + "num_input_tokens_seen": 139333328, + "router_z_loss_mlp": 0.47167969, + "step": 1684, + "time_per_iteration": 2.963387966156006 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089307, + "balance_loss_mlp": 1.04205263, + "epoch": 0.3241631396691035, + "flos": 602793505536.0, + "grad_norm": 0.046655842402160155, + "language_loss": 0.89137548, + "learning_rate": 0.0007897438485691955, + "loss": 0.90226853, + "num_input_tokens_seen": 139400976, + "router_z_loss_mlp": 0.47216797, + "step": 1685, + "time_per_iteration": 2.675910711288452 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079467, + "balance_loss_mlp": 1.03195012, + "epoch": 0.32435552135436707, + "flos": 475177182720.0, + "grad_norm": 0.045429866607221585, + "language_loss": 0.84063458, + "learning_rate": 0.0007894898912821542, + "loss": 0.85142922, + "num_input_tokens_seen": 139465664, + "router_z_loss_mlp": 0.47485352, + "step": 1686, + "time_per_iteration": 2.530951976776123 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077585, + "balance_loss_mlp": 1.02980566, + "epoch": 0.3245479030396306, + "flos": 539220097536.0, + "grad_norm": 0.03833008440392265, + "language_loss": 0.88029444, + "learning_rate": 0.0007892358216051695, + "loss": 0.89107037, + "num_input_tokens_seen": 139541984, + "router_z_loss_mlp": 0.47753906, + "step": 1687, + "time_per_iteration": 2.7729742527008057 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067846, + "balance_loss_mlp": 1.01963735, + "epoch": 0.3247402847248942, + "flos": 548697785856.0, + "grad_norm": 0.039082280310976325, + "language_loss": 0.93519121, + "learning_rate": 0.0007889816396368803, + "loss": 0.94586968, + "num_input_tokens_seen": 139607408, + "router_z_loss_mlp": 0.48193359, + "step": 1688, + "time_per_iteration": 2.625795602798462 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062474, + "balance_loss_mlp": 1.01371753, + "epoch": 0.3249326664101578, + "flos": 378992757504.0, + "grad_norm": 0.03548852277095179, + "language_loss": 0.86296374, + "learning_rate": 0.0007887273454759687, + "loss": 0.87358844, + "num_input_tokens_seen": 139670000, + "router_z_loss_mlp": 0.48754883, + "step": 1689, + "time_per_iteration": 2.4798507690429688 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070366, + "balance_loss_mlp": 1.02106154, + "epoch": 0.3251250480954213, + "flos": 529123203072.0, + "grad_norm": 0.03304707654173593, + "language_loss": 0.83602285, + "learning_rate": 0.0007884729392211603, + "loss": 0.84672654, + "num_input_tokens_seen": 139739872, + "router_z_loss_mlp": 0.49194336, + "step": 1690, + "time_per_iteration": 2.6475188732147217 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066964, + "balance_loss_mlp": 1.01732576, + "epoch": 0.3253174297806849, + "flos": 450559499520.0, + "grad_norm": 0.03986808198030794, + "language_loss": 0.86860085, + "learning_rate": 0.0007882184209712245, + "loss": 0.87927043, + "num_input_tokens_seen": 139802032, + "router_z_loss_mlp": 0.49609375, + "step": 1691, + "time_per_iteration": 2.5213029384613037 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089592, + "balance_loss_mlp": 1.03961909, + "epoch": 0.32550981146594843, + "flos": 705490801152.0, + "grad_norm": 0.03183986603149819, + "language_loss": 0.86227143, + "learning_rate": 0.000787963790824974, + "loss": 0.8731674, + "num_input_tokens_seen": 139885648, + "router_z_loss_mlp": 0.49975586, + "step": 1692, + "time_per_iteration": 2.9866673946380615 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086614, + "balance_loss_mlp": 1.03654587, + "epoch": 0.325702193151212, + "flos": 393559233024.0, + "grad_norm": 0.035135222587328305, + "language_loss": 0.90092403, + "learning_rate": 0.0007877090488812651, + "loss": 0.91179013, + "num_input_tokens_seen": 139947920, + "router_z_loss_mlp": 0.50073242, + "step": 1693, + "time_per_iteration": 2.443784475326538 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067298, + "balance_loss_mlp": 1.01708698, + "epoch": 0.32589457483647555, + "flos": 578584091136.0, + "grad_norm": 0.03604448220117138, + "language_loss": 0.84406531, + "learning_rate": 0.0007874541952389973, + "loss": 0.85473824, + "num_input_tokens_seen": 140020048, + "router_z_loss_mlp": 0.50219727, + "step": 1694, + "time_per_iteration": 2.6662275791168213 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069205, + "balance_loss_mlp": 1.01918459, + "epoch": 0.32608695652173914, + "flos": 499330216704.0, + "grad_norm": 0.03462929627838828, + "language_loss": 0.87473089, + "learning_rate": 0.0007871992299971136, + "loss": 0.88542295, + "num_input_tokens_seen": 140085600, + "router_z_loss_mlp": 0.50024414, + "step": 1695, + "time_per_iteration": 2.5501420497894287 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068106, + "balance_loss_mlp": 1.01803839, + "epoch": 0.32627933820700267, + "flos": 592301948160.0, + "grad_norm": 0.0349674772808078, + "language_loss": 0.85830671, + "learning_rate": 0.0007869441532546001, + "loss": 0.86898774, + "num_input_tokens_seen": 140155152, + "router_z_loss_mlp": 0.5, + "step": 1696, + "time_per_iteration": 2.7640528678894043 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065186, + "balance_loss_mlp": 1.01550007, + "epoch": 0.32647171989226625, + "flos": 610274558208.0, + "grad_norm": 0.03448959411295718, + "language_loss": 0.80548751, + "learning_rate": 0.0007866889651104867, + "loss": 0.81613934, + "num_input_tokens_seen": 140228560, + "router_z_loss_mlp": 0.49658203, + "step": 1697, + "time_per_iteration": 2.8403704166412354 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106687, + "balance_loss_mlp": 1.01723123, + "epoch": 0.32666410157752984, + "flos": 478190599680.0, + "grad_norm": 0.0393752309547029, + "language_loss": 0.84585583, + "learning_rate": 0.000786433665663846, + "loss": 0.85652447, + "num_input_tokens_seen": 140297952, + "router_z_loss_mlp": 0.49536133, + "step": 1698, + "time_per_iteration": 2.7460434436798096 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065434, + "balance_loss_mlp": 1.01603401, + "epoch": 0.3268564832627934, + "flos": 719694694656.0, + "grad_norm": 0.03598572558720647, + "language_loss": 0.87469888, + "learning_rate": 0.0007861782550137942, + "loss": 0.88535315, + "num_input_tokens_seen": 140373408, + "router_z_loss_mlp": 0.49291992, + "step": 1699, + "time_per_iteration": 2.922189474105835 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062059, + "balance_loss_mlp": 1.01299262, + "epoch": 0.32704886494805696, + "flos": 770106268416.0, + "grad_norm": 0.033319227910548664, + "language_loss": 0.86952895, + "learning_rate": 0.0007859227332594901, + "loss": 0.88014954, + "num_input_tokens_seen": 140451840, + "router_z_loss_mlp": 0.48999023, + "step": 1700, + "time_per_iteration": 2.8891940116882324 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01056582, + "balance_loss_mlp": 1.00782549, + "epoch": 0.3272412466333205, + "flos": 851405377536.0, + "grad_norm": 0.0384838580126543, + "language_loss": 0.85734528, + "learning_rate": 0.0007856671005001365, + "loss": 0.8679111, + "num_input_tokens_seen": 140537696, + "router_z_loss_mlp": 0.48730469, + "step": 1701, + "time_per_iteration": 3.169032573699951 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105609, + "balance_loss_mlp": 1.00728559, + "epoch": 0.3274336283185841, + "flos": 833041995264.0, + "grad_norm": 0.03605284930108709, + "language_loss": 0.82799482, + "learning_rate": 0.0007854113568349787, + "loss": 0.83855575, + "num_input_tokens_seen": 140623536, + "router_z_loss_mlp": 0.48779297, + "step": 1702, + "time_per_iteration": 3.123967170715332 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060179, + "balance_loss_mlp": 1.0117799, + "epoch": 0.3276260100038476, + "flos": 693253407744.0, + "grad_norm": 0.03564674283827795, + "language_loss": 0.81364781, + "learning_rate": 0.0007851555023633052, + "loss": 0.82424963, + "num_input_tokens_seen": 140700688, + "router_z_loss_mlp": 0.48388672, + "step": 1703, + "time_per_iteration": 2.8430581092834473 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059511, + "balance_loss_mlp": 1.01120698, + "epoch": 0.3278183916891112, + "flos": 436978702848.0, + "grad_norm": 0.03514994366577059, + "language_loss": 0.83518881, + "learning_rate": 0.0007848995371844474, + "loss": 0.84578383, + "num_input_tokens_seen": 140765808, + "router_z_loss_mlp": 0.48291016, + "step": 1704, + "time_per_iteration": 2.552917003631592 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01056561, + "balance_loss_mlp": 1.00861514, + "epoch": 0.3280107733743748, + "flos": 462017293824.0, + "grad_norm": 0.03278124420090015, + "language_loss": 0.81157213, + "learning_rate": 0.0007846434613977801, + "loss": 0.82213771, + "num_input_tokens_seen": 140830512, + "router_z_loss_mlp": 0.47924805, + "step": 1705, + "time_per_iteration": 2.496506929397583 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062567, + "balance_loss_mlp": 1.01483595, + "epoch": 0.3282031550596383, + "flos": 680529977856.0, + "grad_norm": 0.03615486988598079, + "language_loss": 0.79136091, + "learning_rate": 0.0007843872751027203, + "loss": 0.80198663, + "num_input_tokens_seen": 140902816, + "router_z_loss_mlp": 0.47705078, + "step": 1706, + "time_per_iteration": 2.8048393726348877 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048523, + "balance_loss_mlp": 1.00088739, + "epoch": 0.3283955367449019, + "flos": 546255942912.0, + "grad_norm": 0.030185021157442368, + "language_loss": 0.879673, + "learning_rate": 0.0007841309783987287, + "loss": 0.89015824, + "num_input_tokens_seen": 140975488, + "router_z_loss_mlp": 0.47607422, + "step": 1707, + "time_per_iteration": 2.7402358055114746 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053102, + "balance_loss_mlp": 1.00553715, + "epoch": 0.32858791843016544, + "flos": 482241218304.0, + "grad_norm": 0.035416956868504886, + "language_loss": 0.89878803, + "learning_rate": 0.0007838745713853084, + "loss": 0.90931904, + "num_input_tokens_seen": 141043248, + "router_z_loss_mlp": 0.4753418, + "step": 1708, + "time_per_iteration": 2.603816270828247 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054825, + "balance_loss_mlp": 1.00752318, + "epoch": 0.328780300115429, + "flos": 567916589568.0, + "grad_norm": 0.03507338685235107, + "language_loss": 0.84775996, + "learning_rate": 0.0007836180541620053, + "loss": 0.8583082, + "num_input_tokens_seen": 141119408, + "router_z_loss_mlp": 0.47265625, + "step": 1709, + "time_per_iteration": 2.7194666862487793 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054445, + "balance_loss_mlp": 1.00730944, + "epoch": 0.32897268180069256, + "flos": 476992038144.0, + "grad_norm": 0.03621825417570051, + "language_loss": 0.86992389, + "learning_rate": 0.0007833614268284082, + "loss": 0.88046837, + "num_input_tokens_seen": 141184112, + "router_z_loss_mlp": 0.47094727, + "step": 1710, + "time_per_iteration": 2.510921001434326 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01057716, + "balance_loss_mlp": 1.01346588, + "epoch": 0.32916506348595614, + "flos": 1580453327616.0, + "grad_norm": 0.014405511351568959, + "language_loss": 0.74109769, + "learning_rate": 0.0007831046894841489, + "loss": 0.75167489, + "num_input_tokens_seen": 141414960, + "router_z_loss_mlp": 0.44335938, + "step": 1711, + "time_per_iteration": 4.875708341598511 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051331, + "balance_loss_mlp": 1.00429153, + "epoch": 0.3293574451712197, + "flos": 483851939328.0, + "grad_norm": 0.03545808379065215, + "language_loss": 0.7916249, + "learning_rate": 0.0007828478422289016, + "loss": 0.80213821, + "num_input_tokens_seen": 141485744, + "router_z_loss_mlp": 0.4699707, + "step": 1712, + "time_per_iteration": 2.583045721054077 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052818, + "balance_loss_mlp": 1.00582564, + "epoch": 0.32954982685648326, + "flos": 623725097472.0, + "grad_norm": 0.0327870747371716, + "language_loss": 0.89787406, + "learning_rate": 0.0007825908851623833, + "loss": 0.9084022, + "num_input_tokens_seen": 141560592, + "router_z_loss_mlp": 0.46948242, + "step": 1713, + "time_per_iteration": 2.824685573577881 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050866, + "balance_loss_mlp": 1.00396931, + "epoch": 0.32974220854174685, + "flos": 546071250432.0, + "grad_norm": 0.03386258255996434, + "language_loss": 0.85659784, + "learning_rate": 0.0007823338183843533, + "loss": 0.8671065, + "num_input_tokens_seen": 141630400, + "router_z_loss_mlp": 0.46850586, + "step": 1714, + "time_per_iteration": 2.672525644302368 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051399, + "balance_loss_mlp": 1.00459802, + "epoch": 0.3299345902270104, + "flos": 983823727872.0, + "grad_norm": 0.03566876288837857, + "language_loss": 0.82096756, + "learning_rate": 0.0007820766419946141, + "loss": 0.83148158, + "num_input_tokens_seen": 141721552, + "router_z_loss_mlp": 0.4675293, + "step": 1715, + "time_per_iteration": 3.2718288898468018 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051636, + "balance_loss_mlp": 1.00662231, + "epoch": 0.33012697191227397, + "flos": 1406904727296.0, + "grad_norm": 0.0085720970679931, + "language_loss": 0.7967248, + "learning_rate": 0.0007818193560930102, + "loss": 0.80724114, + "num_input_tokens_seen": 141956464, + "router_z_loss_mlp": 0.44921875, + "step": 1716, + "time_per_iteration": 4.983957290649414 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065575, + "balance_loss_mlp": 1.01836789, + "epoch": 0.3303193535975375, + "flos": 506170675968.0, + "grad_norm": 0.038525927315114124, + "language_loss": 0.76583785, + "learning_rate": 0.0007815619607794288, + "loss": 0.77649361, + "num_input_tokens_seen": 142029552, + "router_z_loss_mlp": 0.47167969, + "step": 1717, + "time_per_iteration": 2.6315019130706787 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054249, + "balance_loss_mlp": 1.00713778, + "epoch": 0.3305117352828011, + "flos": 939485653248.0, + "grad_norm": 0.041342276741222116, + "language_loss": 0.83710063, + "learning_rate": 0.0007813044561538001, + "loss": 0.84764308, + "num_input_tokens_seen": 142117344, + "router_z_loss_mlp": 0.47070312, + "step": 1718, + "time_per_iteration": 3.127446174621582 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055308, + "balance_loss_mlp": 1.00814831, + "epoch": 0.3307041169680646, + "flos": 722794627584.0, + "grad_norm": 0.03526572402512133, + "language_loss": 0.88796169, + "learning_rate": 0.0007810468423160958, + "loss": 0.89851475, + "num_input_tokens_seen": 142190096, + "router_z_loss_mlp": 0.47119141, + "step": 1719, + "time_per_iteration": 2.8622305393218994 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054311, + "balance_loss_mlp": 1.00741386, + "epoch": 0.3308964986533282, + "flos": 584817004800.0, + "grad_norm": 0.029883098234782163, + "language_loss": 0.82424414, + "learning_rate": 0.0007807891193663306, + "loss": 0.83478725, + "num_input_tokens_seen": 142265584, + "router_z_loss_mlp": 0.46850586, + "step": 1720, + "time_per_iteration": 2.7917239665985107 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064031, + "balance_loss_mlp": 1.01715815, + "epoch": 0.33108888033859174, + "flos": 474525895680.0, + "grad_norm": 0.040993977150413745, + "language_loss": 0.82757467, + "learning_rate": 0.0007805312874045614, + "loss": 0.83821499, + "num_input_tokens_seen": 142330352, + "router_z_loss_mlp": 0.46826172, + "step": 1721, + "time_per_iteration": 2.516045331954956 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049599, + "balance_loss_mlp": 1.00279772, + "epoch": 0.3312812620238553, + "flos": 386996785152.0, + "grad_norm": 0.03885390252626127, + "language_loss": 0.87709427, + "learning_rate": 0.0007802733465308874, + "loss": 0.88759029, + "num_input_tokens_seen": 142392208, + "router_z_loss_mlp": 0.4675293, + "step": 1722, + "time_per_iteration": 2.4662280082702637 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047652, + "balance_loss_mlp": 1.00108933, + "epoch": 0.3314736437091189, + "flos": 495605241600.0, + "grad_norm": 0.03316625802825005, + "language_loss": 0.85110468, + "learning_rate": 0.0007800152968454501, + "loss": 0.86158121, + "num_input_tokens_seen": 142462112, + "router_z_loss_mlp": 0.46508789, + "step": 1723, + "time_per_iteration": 2.6313533782958984 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105186, + "balance_loss_mlp": 1.00515401, + "epoch": 0.33166602539438245, + "flos": 654931473408.0, + "grad_norm": 0.02722776998075876, + "language_loss": 0.90998107, + "learning_rate": 0.0007797571384484334, + "loss": 0.92049968, + "num_input_tokens_seen": 142539120, + "router_z_loss_mlp": 0.46655273, + "step": 1724, + "time_per_iteration": 2.8411970138549805 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049402, + "balance_loss_mlp": 1.00257659, + "epoch": 0.33185840707964603, + "flos": 521835591168.0, + "grad_norm": 0.03419077024576391, + "language_loss": 0.92796665, + "learning_rate": 0.0007794988714400633, + "loss": 0.93846071, + "num_input_tokens_seen": 142611520, + "router_z_loss_mlp": 0.46777344, + "step": 1725, + "time_per_iteration": 2.5964980125427246 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050523, + "balance_loss_mlp": 1.00367355, + "epoch": 0.33205078876490957, + "flos": 437899252992.0, + "grad_norm": 0.033932075991051254, + "language_loss": 0.86014992, + "learning_rate": 0.0007792404959206079, + "loss": 0.87065518, + "num_input_tokens_seen": 142676064, + "router_z_loss_mlp": 0.46801758, + "step": 1726, + "time_per_iteration": 2.491852283477783 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051801, + "balance_loss_mlp": 1.00497568, + "epoch": 0.33224317045017315, + "flos": 770095574784.0, + "grad_norm": 0.034529473302537826, + "language_loss": 0.82129228, + "learning_rate": 0.0007789820119903774, + "loss": 0.83181036, + "num_input_tokens_seen": 142750944, + "router_z_loss_mlp": 0.46777344, + "step": 1727, + "time_per_iteration": 2.9898605346679688 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01058285, + "balance_loss_mlp": 1.01260376, + "epoch": 0.3324355521354367, + "flos": 1469296103424.0, + "grad_norm": 0.013638873720884416, + "language_loss": 0.78492665, + "learning_rate": 0.0007787234197497242, + "loss": 0.79550946, + "num_input_tokens_seen": 142974032, + "router_z_loss_mlp": 0.45605469, + "step": 1728, + "time_per_iteration": 4.859704971313477 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050307, + "balance_loss_mlp": 1.00343382, + "epoch": 0.3326279338207003, + "flos": 497800175616.0, + "grad_norm": 0.033386991625918766, + "language_loss": 0.84234303, + "learning_rate": 0.0007784647192990428, + "loss": 0.85284609, + "num_input_tokens_seen": 143047280, + "router_z_loss_mlp": 0.46826172, + "step": 1729, + "time_per_iteration": 2.7268624305725098 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050854, + "balance_loss_mlp": 1.00419581, + "epoch": 0.33282031550596386, + "flos": 637054127616.0, + "grad_norm": 0.031138270474946127, + "language_loss": 0.81414318, + "learning_rate": 0.0007782059107387696, + "loss": 0.82465172, + "num_input_tokens_seen": 143124224, + "router_z_loss_mlp": 0.46606445, + "step": 1730, + "time_per_iteration": 2.85831618309021 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054467, + "balance_loss_mlp": 1.00752223, + "epoch": 0.3330126971912274, + "flos": 690722136576.0, + "grad_norm": 0.03556521205278414, + "language_loss": 0.89100444, + "learning_rate": 0.0007779469941693826, + "loss": 0.9015491, + "num_input_tokens_seen": 143194048, + "router_z_loss_mlp": 0.46899414, + "step": 1731, + "time_per_iteration": 2.8736839294433594 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01058267, + "balance_loss_mlp": 1.01168013, + "epoch": 0.333205078876491, + "flos": 567554007552.0, + "grad_norm": 0.03898705252222011, + "language_loss": 0.77083337, + "learning_rate": 0.0007776879696914029, + "loss": 0.78141606, + "num_input_tokens_seen": 143272976, + "router_z_loss_mlp": 0.46533203, + "step": 1732, + "time_per_iteration": 2.84578275680542 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055134, + "balance_loss_mlp": 1.00868976, + "epoch": 0.3333974605617545, + "flos": 642171105024.0, + "grad_norm": 0.028730663384365272, + "language_loss": 0.89631069, + "learning_rate": 0.000777428837405392, + "loss": 0.90686202, + "num_input_tokens_seen": 143346496, + "router_z_loss_mlp": 0.46386719, + "step": 1733, + "time_per_iteration": 2.8595433235168457 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049221, + "balance_loss_mlp": 1.00275302, + "epoch": 0.3335898422470181, + "flos": 462779396352.0, + "grad_norm": 0.03984590801707433, + "language_loss": 0.87746447, + "learning_rate": 0.0007771695974119544, + "loss": 0.88795674, + "num_input_tokens_seen": 143410448, + "router_z_loss_mlp": 0.46411133, + "step": 1734, + "time_per_iteration": 2.5200014114379883 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051768, + "balance_loss_mlp": 1.00537193, + "epoch": 0.33378222393228163, + "flos": 854338114560.0, + "grad_norm": 0.03554719013753984, + "language_loss": 0.76235908, + "learning_rate": 0.0007769102498117359, + "loss": 0.77287674, + "num_input_tokens_seen": 143492416, + "router_z_loss_mlp": 0.46337891, + "step": 1735, + "time_per_iteration": 3.1014633178710938 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052548, + "balance_loss_mlp": 1.00624716, + "epoch": 0.3339746056175452, + "flos": 956310246144.0, + "grad_norm": 0.03187783426815399, + "language_loss": 0.80701965, + "learning_rate": 0.000776650794705424, + "loss": 0.81754518, + "num_input_tokens_seen": 143590096, + "router_z_loss_mlp": 0.46240234, + "step": 1736, + "time_per_iteration": 3.253756046295166 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050739, + "balance_loss_mlp": 1.00434327, + "epoch": 0.33416698730280875, + "flos": 545895306240.0, + "grad_norm": 0.03238990381642275, + "language_loss": 0.83209848, + "learning_rate": 0.0007763912321937483, + "loss": 0.84260583, + "num_input_tokens_seen": 143663344, + "router_z_loss_mlp": 0.46337891, + "step": 1737, + "time_per_iteration": 2.712942361831665 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051632, + "balance_loss_mlp": 1.00525999, + "epoch": 0.33435936898807234, + "flos": 1015876776960.0, + "grad_norm": 0.036470780413058734, + "language_loss": 0.8337301, + "learning_rate": 0.0007761315623774799, + "loss": 0.84424639, + "num_input_tokens_seen": 143753072, + "router_z_loss_mlp": 0.46313477, + "step": 1738, + "time_per_iteration": 3.38946795463562 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053137, + "balance_loss_mlp": 1.00671661, + "epoch": 0.3345517506733359, + "flos": 616372356864.0, + "grad_norm": 0.034452353492031275, + "language_loss": 0.88688117, + "learning_rate": 0.0007758717853574313, + "loss": 0.89741254, + "num_input_tokens_seen": 143827280, + "router_z_loss_mlp": 0.46362305, + "step": 1739, + "time_per_iteration": 2.7438387870788574 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105524, + "balance_loss_mlp": 1.00896263, + "epoch": 0.33474413235859946, + "flos": 495570248448.0, + "grad_norm": 0.03665446817767542, + "language_loss": 0.90973008, + "learning_rate": 0.0007756119012344571, + "loss": 0.92028248, + "num_input_tokens_seen": 143895072, + "router_z_loss_mlp": 0.4621582, + "step": 1740, + "time_per_iteration": 2.5443572998046875 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105165, + "balance_loss_mlp": 1.0052774, + "epoch": 0.33493651404386304, + "flos": 629488504320.0, + "grad_norm": 0.0365358867260097, + "language_loss": 0.85516071, + "learning_rate": 0.0007753519101094535, + "loss": 0.86567724, + "num_input_tokens_seen": 143965728, + "router_z_loss_mlp": 0.46313477, + "step": 1741, + "time_per_iteration": 2.785595417022705 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050508, + "balance_loss_mlp": 1.00396836, + "epoch": 0.3351288957291266, + "flos": 514743365376.0, + "grad_norm": 0.038608286094447275, + "language_loss": 0.87042749, + "learning_rate": 0.0007750918120833575, + "loss": 0.88093251, + "num_input_tokens_seen": 144030272, + "router_z_loss_mlp": 0.46484375, + "step": 1742, + "time_per_iteration": 2.5612564086914062 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054744, + "balance_loss_mlp": 1.00825262, + "epoch": 0.33532127741439016, + "flos": 648483731712.0, + "grad_norm": 0.038902913238311417, + "language_loss": 0.88245445, + "learning_rate": 0.0007748316072571485, + "loss": 0.89300191, + "num_input_tokens_seen": 144104048, + "router_z_loss_mlp": 0.46435547, + "step": 1743, + "time_per_iteration": 2.8040030002593994 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01056064, + "balance_loss_mlp": 1.00969172, + "epoch": 0.3355136590996537, + "flos": 769789373184.0, + "grad_norm": 0.032744002461956113, + "language_loss": 0.80090916, + "learning_rate": 0.0007745712957318467, + "loss": 0.81146979, + "num_input_tokens_seen": 144180432, + "router_z_loss_mlp": 0.46313477, + "step": 1744, + "time_per_iteration": 2.955864429473877 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053057, + "balance_loss_mlp": 1.00656557, + "epoch": 0.3357060407849173, + "flos": 596650020096.0, + "grad_norm": 0.027209343707751667, + "language_loss": 0.86834347, + "learning_rate": 0.0007743108776085141, + "loss": 0.87887406, + "num_input_tokens_seen": 144258704, + "router_z_loss_mlp": 0.46435547, + "step": 1745, + "time_per_iteration": 2.8065922260284424 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059361, + "balance_loss_mlp": 1.01277399, + "epoch": 0.3358984224701808, + "flos": 599802442752.0, + "grad_norm": 0.030632877870575562, + "language_loss": 0.83193165, + "learning_rate": 0.0007740503529882543, + "loss": 0.84252524, + "num_input_tokens_seen": 144335104, + "router_z_loss_mlp": 0.46533203, + "step": 1746, + "time_per_iteration": 2.783057451248169 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01058625, + "balance_loss_mlp": 1.01218116, + "epoch": 0.3360908041554444, + "flos": 579430764288.0, + "grad_norm": 0.03209356344176002, + "language_loss": 0.91440552, + "learning_rate": 0.0007737897219722114, + "loss": 0.92499179, + "num_input_tokens_seen": 144402912, + "router_z_loss_mlp": 0.46386719, + "step": 1747, + "time_per_iteration": 2.6678693294525146 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053588, + "balance_loss_mlp": 1.00723922, + "epoch": 0.336283185840708, + "flos": 514621856256.0, + "grad_norm": 0.02947569275247992, + "language_loss": 0.81706387, + "learning_rate": 0.0007735289846615716, + "loss": 0.82759976, + "num_input_tokens_seen": 144475328, + "router_z_loss_mlp": 0.46289062, + "step": 1748, + "time_per_iteration": 2.664217948913574 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049184, + "balance_loss_mlp": 1.00312185, + "epoch": 0.3364755675259715, + "flos": 526014521856.0, + "grad_norm": 0.03437288512368296, + "language_loss": 0.83148289, + "learning_rate": 0.0007732681411575621, + "loss": 0.84197474, + "num_input_tokens_seen": 144548288, + "router_z_loss_mlp": 0.45996094, + "step": 1749, + "time_per_iteration": 2.679304361343384 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051983, + "balance_loss_mlp": 1.00613475, + "epoch": 0.3366679492112351, + "flos": 555974704128.0, + "grad_norm": 0.040002531784274646, + "language_loss": 0.88002014, + "learning_rate": 0.0007730071915614514, + "loss": 0.89053994, + "num_input_tokens_seen": 144619488, + "router_z_loss_mlp": 0.45776367, + "step": 1750, + "time_per_iteration": 2.6813647747039795 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053165, + "balance_loss_mlp": 1.00734055, + "epoch": 0.33686033089649864, + "flos": 428164940544.0, + "grad_norm": 0.03793638318473741, + "language_loss": 0.88937026, + "learning_rate": 0.0007727461359745489, + "loss": 0.89990187, + "num_input_tokens_seen": 144682560, + "router_z_loss_mlp": 0.45751953, + "step": 1751, + "time_per_iteration": 2.459137439727783 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050224, + "balance_loss_mlp": 1.00425673, + "epoch": 0.3370527125817622, + "flos": 542841060096.0, + "grad_norm": 0.030686532457312277, + "language_loss": 0.86821485, + "learning_rate": 0.0007724849744982056, + "loss": 0.87871712, + "num_input_tokens_seen": 144753328, + "router_z_loss_mlp": 0.45898438, + "step": 1752, + "time_per_iteration": 2.682023525238037 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050119, + "balance_loss_mlp": 1.00412822, + "epoch": 0.33724509426702576, + "flos": 543231832320.0, + "grad_norm": 0.03146587739195435, + "language_loss": 0.82788759, + "learning_rate": 0.0007722237072338131, + "loss": 0.8383888, + "num_input_tokens_seen": 144827312, + "router_z_loss_mlp": 0.45922852, + "step": 1753, + "time_per_iteration": 2.7289977073669434 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053084, + "balance_loss_mlp": 1.00735557, + "epoch": 0.33743747595228935, + "flos": 473753099520.0, + "grad_norm": 0.036309304678759154, + "language_loss": 0.86263937, + "learning_rate": 0.0007719623342828046, + "loss": 0.8731702, + "num_input_tokens_seen": 144893488, + "router_z_loss_mlp": 0.45654297, + "step": 1754, + "time_per_iteration": 2.5323400497436523 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046739, + "balance_loss_mlp": 1.00127256, + "epoch": 0.33762985763755293, + "flos": 470837859072.0, + "grad_norm": 0.037209700878319825, + "language_loss": 0.84580374, + "learning_rate": 0.000771700855746654, + "loss": 0.85627109, + "num_input_tokens_seen": 144961152, + "router_z_loss_mlp": 0.45385742, + "step": 1755, + "time_per_iteration": 2.585667848587036 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049151, + "balance_loss_mlp": 1.00366056, + "epoch": 0.33782223932281646, + "flos": 493251859968.0, + "grad_norm": 0.03059786996599164, + "language_loss": 0.89290714, + "learning_rate": 0.0007714392717268763, + "loss": 0.90339863, + "num_input_tokens_seen": 145030576, + "router_z_loss_mlp": 0.45410156, + "step": 1756, + "time_per_iteration": 2.5836589336395264 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048898, + "balance_loss_mlp": 1.00321686, + "epoch": 0.33801462100808005, + "flos": 466018334976.0, + "grad_norm": 0.035533831964213135, + "language_loss": 0.87473714, + "learning_rate": 0.0007711775823250273, + "loss": 0.88522607, + "num_input_tokens_seen": 145095648, + "router_z_loss_mlp": 0.45605469, + "step": 1757, + "time_per_iteration": 2.5619492530822754 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049668, + "balance_loss_mlp": 1.00417781, + "epoch": 0.3382070026933436, + "flos": 797068584960.0, + "grad_norm": 0.03198873828119691, + "language_loss": 0.84101963, + "learning_rate": 0.0007709157876427039, + "loss": 0.85151625, + "num_input_tokens_seen": 145181248, + "router_z_loss_mlp": 0.45410156, + "step": 1758, + "time_per_iteration": 3.084735870361328 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049654, + "balance_loss_mlp": 1.00414026, + "epoch": 0.33839938437860717, + "flos": 509429056512.0, + "grad_norm": 0.031347294296384644, + "language_loss": 0.86196065, + "learning_rate": 0.0007706538877815439, + "loss": 0.87245721, + "num_input_tokens_seen": 145252944, + "router_z_loss_mlp": 0.4543457, + "step": 1759, + "time_per_iteration": 2.6354048252105713 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049225, + "balance_loss_mlp": 1.00371122, + "epoch": 0.3385917660638707, + "flos": 485274077184.0, + "grad_norm": 0.03028112214235413, + "language_loss": 0.83875918, + "learning_rate": 0.0007703918828432259, + "loss": 0.84925139, + "num_input_tokens_seen": 145323168, + "router_z_loss_mlp": 0.4543457, + "step": 1760, + "time_per_iteration": 2.6017844676971436 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049146, + "balance_loss_mlp": 1.00358403, + "epoch": 0.3387841477491343, + "flos": 546416335872.0, + "grad_norm": 0.033680258429279644, + "language_loss": 0.89293355, + "learning_rate": 0.000770129772929469, + "loss": 0.90342498, + "num_input_tokens_seen": 145395776, + "router_z_loss_mlp": 0.45483398, + "step": 1761, + "time_per_iteration": 2.671287775039673 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048026, + "balance_loss_mlp": 1.00217831, + "epoch": 0.3389765294343978, + "flos": 721064342784.0, + "grad_norm": 0.03497277274463044, + "language_loss": 0.89180952, + "learning_rate": 0.0007698675581420334, + "loss": 0.90228981, + "num_input_tokens_seen": 145470576, + "router_z_loss_mlp": 0.45776367, + "step": 1762, + "time_per_iteration": 2.9236271381378174 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105243, + "balance_loss_mlp": 1.00677264, + "epoch": 0.3391689111196614, + "flos": 701264238336.0, + "grad_norm": 0.034268369898116914, + "language_loss": 0.79778481, + "learning_rate": 0.0007696052385827199, + "loss": 0.80830908, + "num_input_tokens_seen": 145548896, + "router_z_loss_mlp": 0.45581055, + "step": 1763, + "time_per_iteration": 2.9605488777160645 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055868, + "balance_loss_mlp": 1.01018691, + "epoch": 0.339361292804925, + "flos": 628249113600.0, + "grad_norm": 0.03454670185411084, + "language_loss": 0.78905737, + "learning_rate": 0.00076934281435337, + "loss": 0.79961604, + "num_input_tokens_seen": 145617136, + "router_z_loss_mlp": 0.45605469, + "step": 1764, + "time_per_iteration": 2.7454025745391846 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052159, + "balance_loss_mlp": 1.00647831, + "epoch": 0.33955367449018853, + "flos": 610795587840.0, + "grad_norm": 0.03693575970108084, + "language_loss": 0.86892688, + "learning_rate": 0.0007690802855558658, + "loss": 0.87944847, + "num_input_tokens_seen": 145696416, + "router_z_loss_mlp": 0.45605469, + "step": 1765, + "time_per_iteration": 2.8936946392059326 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054825, + "balance_loss_mlp": 1.01057434, + "epoch": 0.3397460561754521, + "flos": 1456589191680.0, + "grad_norm": 0.006269192400269108, + "language_loss": 0.76374954, + "learning_rate": 0.0007688176522921302, + "loss": 0.77429777, + "num_input_tokens_seen": 145919680, + "router_z_loss_mlp": 0.44335938, + "step": 1766, + "time_per_iteration": 4.913206100463867 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054352, + "balance_loss_mlp": 1.00855207, + "epoch": 0.33993843786071565, + "flos": 488291384832.0, + "grad_norm": 0.039386286306125895, + "language_loss": 0.89967024, + "learning_rate": 0.0007685549146641262, + "loss": 0.91021377, + "num_input_tokens_seen": 145984272, + "router_z_loss_mlp": 0.45727539, + "step": 1767, + "time_per_iteration": 2.593353271484375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050967, + "balance_loss_mlp": 1.00554788, + "epoch": 0.34013081954597923, + "flos": 418233296640.0, + "grad_norm": 0.032458575290873634, + "language_loss": 0.89062989, + "learning_rate": 0.0007682920727738579, + "loss": 0.90113962, + "num_input_tokens_seen": 146047248, + "router_z_loss_mlp": 0.45336914, + "step": 1768, + "time_per_iteration": 2.510331392288208 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054131, + "balance_loss_mlp": 1.00835514, + "epoch": 0.34032320123124277, + "flos": 438430976256.0, + "grad_norm": 0.037803385345055784, + "language_loss": 0.85379529, + "learning_rate": 0.000768029126723369, + "loss": 0.86433661, + "num_input_tokens_seen": 146111872, + "router_z_loss_mlp": 0.45703125, + "step": 1769, + "time_per_iteration": 2.5152533054351807 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054345, + "balance_loss_mlp": 1.00852144, + "epoch": 0.34051558291650635, + "flos": 458544085248.0, + "grad_norm": 0.04157155741286578, + "language_loss": 0.82432753, + "learning_rate": 0.0007677660766147447, + "loss": 0.83487099, + "num_input_tokens_seen": 146172608, + "router_z_loss_mlp": 0.45751953, + "step": 1770, + "time_per_iteration": 2.5669522285461426 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052738, + "balance_loss_mlp": 1.00858307, + "epoch": 0.3407079646017699, + "flos": 1562140489728.0, + "grad_norm": 0.006526141838203855, + "language_loss": 0.72470945, + "learning_rate": 0.0007675029225501102, + "loss": 0.73523682, + "num_input_tokens_seen": 146413584, + "router_z_loss_mlp": 0.44238281, + "step": 1771, + "time_per_iteration": 4.953578233718872 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051633, + "balance_loss_mlp": 1.00602317, + "epoch": 0.3409003462870335, + "flos": 493531816704.0, + "grad_norm": 0.043561887450476046, + "language_loss": 0.80659652, + "learning_rate": 0.0007672396646316306, + "loss": 0.81711292, + "num_input_tokens_seen": 146476992, + "router_z_loss_mlp": 0.45532227, + "step": 1772, + "time_per_iteration": 2.5720248222351074 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048985, + "balance_loss_mlp": 1.00356674, + "epoch": 0.34109272797229706, + "flos": 809822150400.0, + "grad_norm": 0.03735237922314452, + "language_loss": 0.80629146, + "learning_rate": 0.000766976302961512, + "loss": 0.81678128, + "num_input_tokens_seen": 146552848, + "router_z_loss_mlp": 0.45336914, + "step": 1773, + "time_per_iteration": 3.0438191890716553 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050599, + "balance_loss_mlp": 1.00513268, + "epoch": 0.3412851096575606, + "flos": 471100319232.0, + "grad_norm": 0.03730121261656314, + "language_loss": 0.82086515, + "learning_rate": 0.0007667128376420003, + "loss": 0.83137119, + "num_input_tokens_seen": 146617504, + "router_z_loss_mlp": 0.45385742, + "step": 1774, + "time_per_iteration": 2.5461959838867188 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052091, + "balance_loss_mlp": 1.00681531, + "epoch": 0.3414774913428242, + "flos": 596771529216.0, + "grad_norm": 0.03978671612524881, + "language_loss": 0.85611963, + "learning_rate": 0.0007664492687753817, + "loss": 0.86664057, + "num_input_tokens_seen": 146691568, + "router_z_loss_mlp": 0.4519043, + "step": 1775, + "time_per_iteration": 2.7454183101654053 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049019, + "balance_loss_mlp": 1.00362372, + "epoch": 0.3416698730280877, + "flos": 528508854528.0, + "grad_norm": 0.03225195621375244, + "language_loss": 0.82109249, + "learning_rate": 0.000766185596463983, + "loss": 0.83158267, + "num_input_tokens_seen": 146764208, + "router_z_loss_mlp": 0.453125, + "step": 1776, + "time_per_iteration": 2.636876106262207 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050456, + "balance_loss_mlp": 1.00513279, + "epoch": 0.3418622547133513, + "flos": 876118324992.0, + "grad_norm": 0.033083928099711564, + "language_loss": 0.77454132, + "learning_rate": 0.0007659218208101706, + "loss": 0.78504586, + "num_input_tokens_seen": 146847744, + "router_z_loss_mlp": 0.45239258, + "step": 1777, + "time_per_iteration": 3.097163677215576 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055706, + "balance_loss_mlp": 1.01031137, + "epoch": 0.34205463639861483, + "flos": 604877624064.0, + "grad_norm": 0.03453483859247358, + "language_loss": 0.86064076, + "learning_rate": 0.0007656579419163515, + "loss": 0.87119782, + "num_input_tokens_seen": 146918336, + "router_z_loss_mlp": 0.453125, + "step": 1778, + "time_per_iteration": 2.7452263832092285 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055225, + "balance_loss_mlp": 1.0096159, + "epoch": 0.3422470180838784, + "flos": 464715760896.0, + "grad_norm": 0.037184345749469765, + "language_loss": 0.77793133, + "learning_rate": 0.0007653939598849724, + "loss": 0.78848356, + "num_input_tokens_seen": 146982496, + "router_z_loss_mlp": 0.45532227, + "step": 1779, + "time_per_iteration": 2.5020663738250732 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01057175, + "balance_loss_mlp": 1.0134964, + "epoch": 0.34243939976914195, + "flos": 1589819222016.0, + "grad_norm": 0.009860928497574006, + "language_loss": 0.82880205, + "learning_rate": 0.0007651298748185204, + "loss": 0.83937383, + "num_input_tokens_seen": 147213600, + "router_z_loss_mlp": 0.4375, + "step": 1780, + "time_per_iteration": 4.958939552307129 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054148, + "balance_loss_mlp": 1.00849116, + "epoch": 0.34263178145440554, + "flos": 874444420608.0, + "grad_norm": 0.034671274665512654, + "language_loss": 0.80890739, + "learning_rate": 0.000764865686819522, + "loss": 0.81944883, + "num_input_tokens_seen": 147287664, + "router_z_loss_mlp": 0.45581055, + "step": 1781, + "time_per_iteration": 3.0468943119049072 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01057142, + "balance_loss_mlp": 1.01148522, + "epoch": 0.3428241631396691, + "flos": 507874715904.0, + "grad_norm": 0.02984044691012994, + "language_loss": 0.86276633, + "learning_rate": 0.0007646013959905449, + "loss": 0.87333775, + "num_input_tokens_seen": 147356800, + "router_z_loss_mlp": 0.45581055, + "step": 1782, + "time_per_iteration": 2.59788179397583 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01056783, + "balance_loss_mlp": 1.01114941, + "epoch": 0.34301654482493266, + "flos": 881525952768.0, + "grad_norm": 0.034646354408830966, + "language_loss": 0.81384498, + "learning_rate": 0.0007643370024341949, + "loss": 0.82441282, + "num_input_tokens_seen": 147432496, + "router_z_loss_mlp": 0.45556641, + "step": 1783, + "time_per_iteration": 3.0783512592315674 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048113, + "balance_loss_mlp": 1.00288546, + "epoch": 0.34320892651019624, + "flos": 432669514752.0, + "grad_norm": 0.031189947688426686, + "language_loss": 0.84145617, + "learning_rate": 0.0007640725062531195, + "loss": 0.85193729, + "num_input_tokens_seen": 147495856, + "router_z_loss_mlp": 0.45141602, + "step": 1784, + "time_per_iteration": 2.5152812004089355 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050969, + "balance_loss_mlp": 1.00559807, + "epoch": 0.3434013081954598, + "flos": 464594251776.0, + "grad_norm": 0.03760163078295718, + "language_loss": 0.86810297, + "learning_rate": 0.0007638079075500047, + "loss": 0.87861264, + "num_input_tokens_seen": 147559632, + "router_z_loss_mlp": 0.45288086, + "step": 1785, + "time_per_iteration": 2.5846633911132812 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045052, + "balance_loss_mlp": 1.0019455, + "epoch": 0.34359368988072336, + "flos": 1560677522688.0, + "grad_norm": 0.003111664808940008, + "language_loss": 0.75180668, + "learning_rate": 0.0007635432064275772, + "loss": 0.76225722, + "num_input_tokens_seen": 147794576, + "router_z_loss_mlp": 0.43164062, + "step": 1786, + "time_per_iteration": 4.94433856010437 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010488, + "balance_loss_mlp": 1.003739, + "epoch": 0.3437860715659869, + "flos": 496573423872.0, + "grad_norm": 0.03208809815455149, + "language_loss": 0.83580017, + "learning_rate": 0.0007632784029886026, + "loss": 0.8462882, + "num_input_tokens_seen": 147866960, + "router_z_loss_mlp": 0.45019531, + "step": 1787, + "time_per_iteration": 2.6222987174987793 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050031, + "balance_loss_mlp": 1.00523186, + "epoch": 0.3439784532512505, + "flos": 719610124032.0, + "grad_norm": 0.03771035877194531, + "language_loss": 0.86448389, + "learning_rate": 0.0007630134973358873, + "loss": 0.87498415, + "num_input_tokens_seen": 147947808, + "router_z_loss_mlp": 0.44799805, + "step": 1788, + "time_per_iteration": 2.9359545707702637 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047985, + "balance_loss_mlp": 1.00320995, + "epoch": 0.34417083493651407, + "flos": 566922162432.0, + "grad_norm": 0.0315223877917514, + "language_loss": 0.8730194, + "learning_rate": 0.0007627484895722763, + "loss": 0.88349926, + "num_input_tokens_seen": 148015936, + "router_z_loss_mlp": 0.44775391, + "step": 1789, + "time_per_iteration": 2.710433006286621 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048608, + "balance_loss_mlp": 1.00397587, + "epoch": 0.3443632166217776, + "flos": 797702375424.0, + "grad_norm": 0.034658336241014505, + "language_loss": 0.80973929, + "learning_rate": 0.0007624833798006552, + "loss": 0.82022536, + "num_input_tokens_seen": 148099776, + "router_z_loss_mlp": 0.4465332, + "step": 1790, + "time_per_iteration": 3.061995506286621 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049093, + "balance_loss_mlp": 1.00419891, + "epoch": 0.3445555983070412, + "flos": 570393425664.0, + "grad_norm": 0.0359941873064626, + "language_loss": 0.84664464, + "learning_rate": 0.0007622181681239483, + "loss": 0.85713559, + "num_input_tokens_seen": 148169616, + "router_z_loss_mlp": 0.44873047, + "step": 1791, + "time_per_iteration": 2.708204984664917 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046554, + "balance_loss_mlp": 1.00192165, + "epoch": 0.3447479799923047, + "flos": 569981266176.0, + "grad_norm": 0.030307911746310208, + "language_loss": 0.85264516, + "learning_rate": 0.0007619528546451202, + "loss": 0.86311066, + "num_input_tokens_seen": 148247824, + "router_z_loss_mlp": 0.4465332, + "step": 1792, + "time_per_iteration": 2.8142476081848145 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047842, + "balance_loss_mlp": 1.00323367, + "epoch": 0.3449403616775683, + "flos": 969333074688.0, + "grad_norm": 0.03266645448260783, + "language_loss": 0.84415537, + "learning_rate": 0.0007616874394671745, + "loss": 0.85463381, + "num_input_tokens_seen": 148333040, + "router_z_loss_mlp": 0.4465332, + "step": 1793, + "time_per_iteration": 3.340257406234741 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048671, + "balance_loss_mlp": 1.00411057, + "epoch": 0.34513274336283184, + "flos": 569677009920.0, + "grad_norm": 0.042713127170940564, + "language_loss": 0.85883492, + "learning_rate": 0.0007614219226931547, + "loss": 0.86932158, + "num_input_tokens_seen": 148401840, + "router_z_loss_mlp": 0.44604492, + "step": 1794, + "time_per_iteration": 2.666299343109131 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047529, + "balance_loss_mlp": 1.00301611, + "epoch": 0.3453251250480954, + "flos": 461858846208.0, + "grad_norm": 0.03409376285864792, + "language_loss": 0.85191298, + "learning_rate": 0.0007611563044261435, + "loss": 0.86238825, + "num_input_tokens_seen": 148466576, + "router_z_loss_mlp": 0.44580078, + "step": 1795, + "time_per_iteration": 2.509730577468872 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047985, + "balance_loss_mlp": 1.00340092, + "epoch": 0.34551750673335896, + "flos": 416520508416.0, + "grad_norm": 0.03871598691360063, + "language_loss": 0.87655377, + "learning_rate": 0.0007608905847692631, + "loss": 0.88703358, + "num_input_tokens_seen": 148530016, + "router_z_loss_mlp": 0.4465332, + "step": 1796, + "time_per_iteration": 2.468144416809082 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045751, + "balance_loss_mlp": 1.0012145, + "epoch": 0.34570988841862255, + "flos": 589115499264.0, + "grad_norm": 0.03133980127061019, + "language_loss": 0.87422049, + "learning_rate": 0.0007606247638256749, + "loss": 0.88467801, + "num_input_tokens_seen": 148610064, + "router_z_loss_mlp": 0.44580078, + "step": 1797, + "time_per_iteration": 2.8401029109954834 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050308, + "balance_loss_mlp": 1.00758362, + "epoch": 0.34590227010388613, + "flos": 1571145747456.0, + "grad_norm": 0.007450888717391324, + "language_loss": 0.78170294, + "learning_rate": 0.0007603588416985798, + "loss": 0.79220599, + "num_input_tokens_seen": 148835872, + "router_z_loss_mlp": 0.42773438, + "step": 1798, + "time_per_iteration": 4.913544178009033 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043892, + "balance_loss_mlp": 1.00097656, + "epoch": 0.34609465178914967, + "flos": 1540930886400.0, + "grad_norm": 0.004797214297707501, + "language_loss": 0.79327202, + "learning_rate": 0.0007600928184912179, + "loss": 0.80371094, + "num_input_tokens_seen": 149066864, + "router_z_loss_mlp": 0.4296875, + "step": 1799, + "time_per_iteration": 4.771878719329834 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049395, + "balance_loss_mlp": 1.00469148, + "epoch": 0.34628703347441325, + "flos": 610517576448.0, + "grad_norm": 0.037119753663607306, + "language_loss": 0.86850703, + "learning_rate": 0.0007598266943068686, + "loss": 0.8790009, + "num_input_tokens_seen": 149141600, + "router_z_loss_mlp": 0.44750977, + "step": 1800, + "time_per_iteration": 2.746819496154785 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050462, + "balance_loss_mlp": 1.00535274, + "epoch": 0.3464794151596768, + "flos": 474265380864.0, + "grad_norm": 0.03436691989893219, + "language_loss": 0.84791839, + "learning_rate": 0.0007595604692488507, + "loss": 0.85842299, + "num_input_tokens_seen": 149205888, + "router_z_loss_mlp": 0.45019531, + "step": 1801, + "time_per_iteration": 2.564328908920288 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050744, + "balance_loss_mlp": 1.00587356, + "epoch": 0.34667179684494037, + "flos": 606822736896.0, + "grad_norm": 0.03808690892272381, + "language_loss": 0.83437663, + "learning_rate": 0.0007592941434205215, + "loss": 0.8448841, + "num_input_tokens_seen": 149281280, + "router_z_loss_mlp": 0.44848633, + "step": 1802, + "time_per_iteration": 2.826420545578003 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059933, + "balance_loss_mlp": 1.016922, + "epoch": 0.3468641785302039, + "flos": 1568362709760.0, + "grad_norm": 0.013636299413791342, + "language_loss": 0.73571062, + "learning_rate": 0.0007590277169252782, + "loss": 0.74630988, + "num_input_tokens_seen": 149525008, + "router_z_loss_mlp": 0.43066406, + "step": 1803, + "time_per_iteration": 5.063625812530518 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050175, + "balance_loss_mlp": 1.00523341, + "epoch": 0.3470565602154675, + "flos": 908724484608.0, + "grad_norm": 0.03942668215130471, + "language_loss": 0.80763334, + "learning_rate": 0.0007587611898665566, + "loss": 0.81813502, + "num_input_tokens_seen": 149600624, + "router_z_loss_mlp": 0.44921875, + "step": 1804, + "time_per_iteration": 3.0834579467773438 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050119, + "balance_loss_mlp": 1.0052247, + "epoch": 0.347248941900731, + "flos": 640060741632.0, + "grad_norm": 0.031209613313051415, + "language_loss": 0.82727098, + "learning_rate": 0.0007584945623478315, + "loss": 0.83777213, + "num_input_tokens_seen": 149674224, + "router_z_loss_mlp": 0.44873047, + "step": 1805, + "time_per_iteration": 2.861560106277466 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051541, + "balance_loss_mlp": 1.00688517, + "epoch": 0.3474413235859946, + "flos": 848782732800.0, + "grad_norm": 0.03633023546687314, + "language_loss": 0.81859386, + "learning_rate": 0.000758227834472617, + "loss": 0.82910925, + "num_input_tokens_seen": 149758688, + "router_z_loss_mlp": 0.44702148, + "step": 1806, + "time_per_iteration": 3.0337021350860596 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052212, + "balance_loss_mlp": 1.00767589, + "epoch": 0.3476337052712582, + "flos": 516697226496.0, + "grad_norm": 0.035243207865769656, + "language_loss": 0.77929807, + "learning_rate": 0.0007579610063444664, + "loss": 0.78982013, + "num_input_tokens_seen": 149831648, + "router_z_loss_mlp": 0.44580078, + "step": 1807, + "time_per_iteration": 2.7339653968811035 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01056132, + "balance_loss_mlp": 1.01154768, + "epoch": 0.34782608695652173, + "flos": 915115845888.0, + "grad_norm": 0.03414685220945043, + "language_loss": 0.88006967, + "learning_rate": 0.0007576940780669712, + "loss": 0.89063108, + "num_input_tokens_seen": 149919440, + "router_z_loss_mlp": 0.4465332, + "step": 1808, + "time_per_iteration": 3.211806058883667 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051889, + "balance_loss_mlp": 1.00756717, + "epoch": 0.3480184686417853, + "flos": 775084240128.0, + "grad_norm": 0.07111913657628408, + "language_loss": 0.84903318, + "learning_rate": 0.0007574270497437624, + "loss": 0.85955209, + "num_input_tokens_seen": 150001632, + "router_z_loss_mlp": 0.4440918, + "step": 1809, + "time_per_iteration": 2.984511375427246 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049455, + "balance_loss_mlp": 1.00518048, + "epoch": 0.34821085032704885, + "flos": 578004735744.0, + "grad_norm": 0.031195535995176178, + "language_loss": 0.88877916, + "learning_rate": 0.000757159921478509, + "loss": 0.89927369, + "num_input_tokens_seen": 150077552, + "router_z_loss_mlp": 0.44360352, + "step": 1810, + "time_per_iteration": 2.778917074203491 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051422, + "balance_loss_mlp": 1.00888824, + "epoch": 0.34840323201231244, + "flos": 1528042205952.0, + "grad_norm": 0.009192534613281171, + "language_loss": 0.74450636, + "learning_rate": 0.0007568926933749201, + "loss": 0.75502062, + "num_input_tokens_seen": 150295328, + "router_z_loss_mlp": 0.42578125, + "step": 1811, + "time_per_iteration": 4.791734218597412 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048241, + "balance_loss_mlp": 1.0040617, + "epoch": 0.34859561369757597, + "flos": 510182410752.0, + "grad_norm": 0.038842956055274956, + "language_loss": 0.88272417, + "learning_rate": 0.0007566253655367423, + "loss": 0.89320654, + "num_input_tokens_seen": 150360496, + "router_z_loss_mlp": 0.44262695, + "step": 1812, + "time_per_iteration": 2.6542506217956543 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050997, + "balance_loss_mlp": 1.00689006, + "epoch": 0.34878799538283956, + "flos": 549757341696.0, + "grad_norm": 0.030689577509801048, + "language_loss": 0.90222162, + "learning_rate": 0.000756357938067762, + "loss": 0.91273159, + "num_input_tokens_seen": 150432064, + "router_z_loss_mlp": 0.44189453, + "step": 1813, + "time_per_iteration": 2.6897120475769043 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047642, + "balance_loss_mlp": 1.00346339, + "epoch": 0.34898037706810314, + "flos": 985195321344.0, + "grad_norm": 0.03422241032564105, + "language_loss": 0.83499646, + "learning_rate": 0.0007560904110718033, + "loss": 0.84547287, + "num_input_tokens_seen": 150512176, + "router_z_loss_mlp": 0.44262695, + "step": 1814, + "time_per_iteration": 3.3129422664642334 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045129, + "balance_loss_mlp": 1.00102115, + "epoch": 0.3491727587533667, + "flos": 682837672704.0, + "grad_norm": 0.03439092984945392, + "language_loss": 0.84187126, + "learning_rate": 0.0007558227846527297, + "loss": 0.85232258, + "num_input_tokens_seen": 150586416, + "router_z_loss_mlp": 0.44189453, + "step": 1815, + "time_per_iteration": 2.8228747844696045 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052765, + "balance_loss_mlp": 1.00880051, + "epoch": 0.34936514043863026, + "flos": 394889997312.0, + "grad_norm": 0.04066201843968592, + "language_loss": 0.84257603, + "learning_rate": 0.0007555550589144429, + "loss": 0.8531037, + "num_input_tokens_seen": 150648944, + "router_z_loss_mlp": 0.44042969, + "step": 1816, + "time_per_iteration": 2.4170055389404297 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053968, + "balance_loss_mlp": 1.01000416, + "epoch": 0.3495575221238938, + "flos": 462340992000.0, + "grad_norm": 0.036355924698056825, + "language_loss": 0.84744954, + "learning_rate": 0.000755287233960883, + "loss": 0.85798925, + "num_input_tokens_seen": 150717200, + "router_z_loss_mlp": 0.44042969, + "step": 1817, + "time_per_iteration": 2.577195405960083 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055263, + "balance_loss_mlp": 1.01115596, + "epoch": 0.3497499038091574, + "flos": 725429911296.0, + "grad_norm": 0.037028935917378006, + "language_loss": 0.78975379, + "learning_rate": 0.0007550193098960292, + "loss": 0.80030644, + "num_input_tokens_seen": 150790368, + "router_z_loss_mlp": 0.44189453, + "step": 1818, + "time_per_iteration": 2.9124276638031006 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050391, + "balance_loss_mlp": 1.00609303, + "epoch": 0.3499422854944209, + "flos": 829197456384.0, + "grad_norm": 0.03031702063556045, + "language_loss": 0.8721534, + "learning_rate": 0.0007547512868238988, + "loss": 0.88265729, + "num_input_tokens_seen": 150879872, + "router_z_loss_mlp": 0.44384766, + "step": 1819, + "time_per_iteration": 3.1275570392608643 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046383, + "balance_loss_mlp": 1.00203693, + "epoch": 0.3501346671796845, + "flos": 494543740416.0, + "grad_norm": 0.03689243892136314, + "language_loss": 0.8434422, + "learning_rate": 0.0007544831648485473, + "loss": 0.85390604, + "num_input_tokens_seen": 150953712, + "router_z_loss_mlp": 0.44433594, + "step": 1820, + "time_per_iteration": 2.6672415733337402 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053354, + "balance_loss_mlp": 1.00917482, + "epoch": 0.35032704886494803, + "flos": 579849726720.0, + "grad_norm": 0.04031883928972686, + "language_loss": 0.8166672, + "learning_rate": 0.0007542149440740694, + "loss": 0.82720077, + "num_input_tokens_seen": 151026192, + "router_z_loss_mlp": 0.44262695, + "step": 1821, + "time_per_iteration": 2.659205436706543 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051466, + "balance_loss_mlp": 1.0069536, + "epoch": 0.3505194305502116, + "flos": 585832819200.0, + "grad_norm": 0.035872862949689145, + "language_loss": 0.86380953, + "learning_rate": 0.000753946624604597, + "loss": 0.8743242, + "num_input_tokens_seen": 151100720, + "router_z_loss_mlp": 0.44604492, + "step": 1822, + "time_per_iteration": 2.748387575149536 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049848, + "balance_loss_mlp": 1.00528705, + "epoch": 0.3507118122354752, + "flos": 527979076608.0, + "grad_norm": 0.036265727976650085, + "language_loss": 0.88431466, + "learning_rate": 0.0007536782065443015, + "loss": 0.89481318, + "num_input_tokens_seen": 151166032, + "router_z_loss_mlp": 0.44628906, + "step": 1823, + "time_per_iteration": 2.608429193496704 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054753, + "balance_loss_mlp": 1.00997818, + "epoch": 0.35090419392073874, + "flos": 512546486016.0, + "grad_norm": 0.039277226542114754, + "language_loss": 0.75647306, + "learning_rate": 0.0007534096899973919, + "loss": 0.76702058, + "num_input_tokens_seen": 151232208, + "router_z_loss_mlp": 0.44799805, + "step": 1824, + "time_per_iteration": 2.702721118927002 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049221, + "balance_loss_mlp": 1.0046134, + "epoch": 0.3510965756060023, + "flos": 565196735232.0, + "grad_norm": 0.031185756782702443, + "language_loss": 0.83427215, + "learning_rate": 0.0007531410750681154, + "loss": 0.84476435, + "num_input_tokens_seen": 151308128, + "router_z_loss_mlp": 0.44677734, + "step": 1825, + "time_per_iteration": 2.7568912506103516 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053165, + "balance_loss_mlp": 1.00831807, + "epoch": 0.35128895729126586, + "flos": 1022254532352.0, + "grad_norm": 0.030666943866844928, + "language_loss": 0.87304175, + "learning_rate": 0.0007528723618607575, + "loss": 0.88357341, + "num_input_tokens_seen": 151402560, + "router_z_loss_mlp": 0.44848633, + "step": 1826, + "time_per_iteration": 3.4575371742248535 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049762, + "balance_loss_mlp": 1.00510669, + "epoch": 0.35148133897652944, + "flos": 589425591552.0, + "grad_norm": 0.04947505148138052, + "language_loss": 0.83428013, + "learning_rate": 0.0007526035504796422, + "loss": 0.84477776, + "num_input_tokens_seen": 151478816, + "router_z_loss_mlp": 0.44702148, + "step": 1827, + "time_per_iteration": 2.7913553714752197 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053021, + "balance_loss_mlp": 1.00838912, + "epoch": 0.351673720661793, + "flos": 496286664192.0, + "grad_norm": 0.03604129919469899, + "language_loss": 0.87358594, + "learning_rate": 0.0007523346410291312, + "loss": 0.88411617, + "num_input_tokens_seen": 151554528, + "router_z_loss_mlp": 0.44702148, + "step": 1828, + "time_per_iteration": 2.769817590713501 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049932, + "balance_loss_mlp": 1.00499058, + "epoch": 0.35186610234705656, + "flos": 763999721472.0, + "grad_norm": 0.036507155273352104, + "language_loss": 0.85486639, + "learning_rate": 0.0007520656336136245, + "loss": 0.86536574, + "num_input_tokens_seen": 151629440, + "router_z_loss_mlp": 0.44921875, + "step": 1829, + "time_per_iteration": 2.960890293121338 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048327, + "balance_loss_mlp": 1.00364745, + "epoch": 0.3520584840323201, + "flos": 627389801472.0, + "grad_norm": 0.0323509050656096, + "language_loss": 0.88885164, + "learning_rate": 0.0007517965283375599, + "loss": 0.89933491, + "num_input_tokens_seen": 151708544, + "router_z_loss_mlp": 0.44702148, + "step": 1830, + "time_per_iteration": 2.868405818939209 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047282, + "balance_loss_mlp": 1.00260293, + "epoch": 0.3522508657175837, + "flos": 538449246720.0, + "grad_norm": 0.03139560131485747, + "language_loss": 0.89993465, + "learning_rate": 0.0007515273253054132, + "loss": 0.91040754, + "num_input_tokens_seen": 151779152, + "router_z_loss_mlp": 0.44726562, + "step": 1831, + "time_per_iteration": 2.6341445446014404 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104766, + "balance_loss_mlp": 1.00298083, + "epoch": 0.35244324740284727, + "flos": 568502747904.0, + "grad_norm": 0.03545868131612223, + "language_loss": 0.83198845, + "learning_rate": 0.0007512580246216988, + "loss": 0.8424651, + "num_input_tokens_seen": 151853216, + "router_z_loss_mlp": 0.44726562, + "step": 1832, + "time_per_iteration": 2.691678524017334 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053022, + "balance_loss_mlp": 1.00860476, + "epoch": 0.3526356290881108, + "flos": 514055139840.0, + "grad_norm": 0.03517539350184397, + "language_loss": 0.85415643, + "learning_rate": 0.000750988626390968, + "loss": 0.86468661, + "num_input_tokens_seen": 151920416, + "router_z_loss_mlp": 0.44506836, + "step": 1833, + "time_per_iteration": 2.6027944087982178 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050095, + "balance_loss_mlp": 1.00577271, + "epoch": 0.3528280107733744, + "flos": 596973718272.0, + "grad_norm": 0.033457257877764275, + "language_loss": 0.85569251, + "learning_rate": 0.0007507191307178108, + "loss": 0.86619347, + "num_input_tokens_seen": 151990848, + "router_z_loss_mlp": 0.4440918, + "step": 1834, + "time_per_iteration": 2.8065004348754883 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054314, + "balance_loss_mlp": 1.00999165, + "epoch": 0.3530203924586379, + "flos": 552299306496.0, + "grad_norm": 0.040042804692427734, + "language_loss": 0.75668854, + "learning_rate": 0.0007504495377068543, + "loss": 0.76723164, + "num_input_tokens_seen": 152064864, + "router_z_loss_mlp": 0.4440918, + "step": 1835, + "time_per_iteration": 2.736536741256714 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052598, + "balance_loss_mlp": 1.00832355, + "epoch": 0.3532127741439015, + "flos": 654306431232.0, + "grad_norm": 0.0387965270782292, + "language_loss": 0.82353514, + "learning_rate": 0.0007501798474627642, + "loss": 0.83406115, + "num_input_tokens_seen": 152150096, + "router_z_loss_mlp": 0.44360352, + "step": 1836, + "time_per_iteration": 2.9019014835357666 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052616, + "balance_loss_mlp": 1.00824583, + "epoch": 0.35340515582916504, + "flos": 724151636736.0, + "grad_norm": 0.03634896017563763, + "language_loss": 0.84383756, + "learning_rate": 0.0007499100600902433, + "loss": 0.85436368, + "num_input_tokens_seen": 152232528, + "router_z_loss_mlp": 0.44458008, + "step": 1837, + "time_per_iteration": 3.0071663856506348 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105242, + "balance_loss_mlp": 1.00812232, + "epoch": 0.35359753751442863, + "flos": 595998733056.0, + "grad_norm": 0.039287132740407786, + "language_loss": 0.853827, + "learning_rate": 0.0007496401756940324, + "loss": 0.86435115, + "num_input_tokens_seen": 152299584, + "router_z_loss_mlp": 0.44384766, + "step": 1838, + "time_per_iteration": 2.6924545764923096 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052318, + "balance_loss_mlp": 1.00780547, + "epoch": 0.3537899191996922, + "flos": 633806440704.0, + "grad_norm": 0.041905435038062475, + "language_loss": 0.83424079, + "learning_rate": 0.0007493701943789098, + "loss": 0.84476393, + "num_input_tokens_seen": 152370368, + "router_z_loss_mlp": 0.44580078, + "step": 1839, + "time_per_iteration": 2.744781970977783 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051713, + "balance_loss_mlp": 1.00727141, + "epoch": 0.35398230088495575, + "flos": 507353686272.0, + "grad_norm": 0.0353986915713622, + "language_loss": 0.8339026, + "learning_rate": 0.000749100116249692, + "loss": 0.84441972, + "num_input_tokens_seen": 152436928, + "router_z_loss_mlp": 0.44506836, + "step": 1840, + "time_per_iteration": 2.5823822021484375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049367, + "balance_loss_mlp": 1.00490189, + "epoch": 0.35417468257021933, + "flos": 509047032576.0, + "grad_norm": 0.03988576427868324, + "language_loss": 0.86907303, + "learning_rate": 0.0007488299414112321, + "loss": 0.87956673, + "num_input_tokens_seen": 152505952, + "router_z_loss_mlp": 0.4453125, + "step": 1841, + "time_per_iteration": 2.6171295642852783 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055735, + "balance_loss_mlp": 1.01126969, + "epoch": 0.35436706425548287, + "flos": 657660076032.0, + "grad_norm": 0.035376771477334756, + "language_loss": 0.78015333, + "learning_rate": 0.0007485596699684215, + "loss": 0.79071069, + "num_input_tokens_seen": 152577408, + "router_z_loss_mlp": 0.44555664, + "step": 1842, + "time_per_iteration": 2.8393046855926514 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070571, + "balance_loss_mlp": 1.02572489, + "epoch": 0.35455944594074645, + "flos": 653889414144.0, + "grad_norm": 0.03498191670442302, + "language_loss": 0.86517459, + "learning_rate": 0.000748289302026189, + "loss": 0.87588024, + "num_input_tokens_seen": 152654480, + "router_z_loss_mlp": 0.44848633, + "step": 1843, + "time_per_iteration": 2.8524656295776367 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060154, + "balance_loss_mlp": 1.01566541, + "epoch": 0.35475182762601, + "flos": 850011429888.0, + "grad_norm": 0.03510464987001869, + "language_loss": 0.86422503, + "learning_rate": 0.0007480188376895004, + "loss": 0.87482655, + "num_input_tokens_seen": 152732304, + "router_z_loss_mlp": 0.4453125, + "step": 1844, + "time_per_iteration": 3.1228320598602295 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048378, + "balance_loss_mlp": 1.00584412, + "epoch": 0.3549442093112736, + "flos": 1524777989376.0, + "grad_norm": 0.00626506088035535, + "language_loss": 0.7381134, + "learning_rate": 0.0007477482770633596, + "loss": 0.74859715, + "num_input_tokens_seen": 152965952, + "router_z_loss_mlp": 0.42578125, + "step": 1845, + "time_per_iteration": 4.8881309032440186 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053267, + "balance_loss_mlp": 1.00906432, + "epoch": 0.3551365909965371, + "flos": 652715152128.0, + "grad_norm": 0.03760423595997357, + "language_loss": 0.78996736, + "learning_rate": 0.0007474776202528074, + "loss": 0.80050004, + "num_input_tokens_seen": 153053088, + "router_z_loss_mlp": 0.44287109, + "step": 1846, + "time_per_iteration": 2.9740474224090576 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055296, + "balance_loss_mlp": 1.01118839, + "epoch": 0.3553289726818007, + "flos": 898923098112.0, + "grad_norm": 0.04404679517400465, + "language_loss": 0.81547415, + "learning_rate": 0.000747206867362922, + "loss": 0.82602704, + "num_input_tokens_seen": 153129216, + "router_z_loss_mlp": 0.44189453, + "step": 1847, + "time_per_iteration": 3.0834994316101074 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052455, + "balance_loss_mlp": 1.00822854, + "epoch": 0.3555213543670643, + "flos": 689734512384.0, + "grad_norm": 0.03965516085145463, + "language_loss": 0.8451193, + "learning_rate": 0.0007469360184988194, + "loss": 0.85564387, + "num_input_tokens_seen": 153199360, + "router_z_loss_mlp": 0.44311523, + "step": 1848, + "time_per_iteration": 2.8074848651885986 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050526, + "balance_loss_mlp": 1.00632286, + "epoch": 0.3557137360523278, + "flos": 539604066816.0, + "grad_norm": 0.033414642983477745, + "language_loss": 0.87585986, + "learning_rate": 0.0007466650737656518, + "loss": 0.88636506, + "num_input_tokens_seen": 153269168, + "router_z_loss_mlp": 0.44287109, + "step": 1849, + "time_per_iteration": 2.604926347732544 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049824, + "balance_loss_mlp": 1.00562072, + "epoch": 0.3559061177375914, + "flos": 403154539776.0, + "grad_norm": 0.03235738057519393, + "language_loss": 0.9068622, + "learning_rate": 0.0007463940332686098, + "loss": 0.91736042, + "num_input_tokens_seen": 153333120, + "router_z_loss_mlp": 0.44287109, + "step": 1850, + "time_per_iteration": 2.4913558959960938 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01056333, + "balance_loss_mlp": 1.01196373, + "epoch": 0.35609849942285493, + "flos": 697895042304.0, + "grad_norm": 0.0320980052654178, + "language_loss": 0.85078359, + "learning_rate": 0.0007461228971129205, + "loss": 0.86134696, + "num_input_tokens_seen": 153407600, + "router_z_loss_mlp": 0.44458008, + "step": 1851, + "time_per_iteration": 2.898726463317871 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059942, + "balance_loss_mlp": 1.01557255, + "epoch": 0.3562908811081185, + "flos": 570002653440.0, + "grad_norm": 0.036011031747473804, + "language_loss": 0.86088216, + "learning_rate": 0.0007458516654038483, + "loss": 0.87148154, + "num_input_tokens_seen": 153477408, + "router_z_loss_mlp": 0.44458008, + "step": 1852, + "time_per_iteration": 2.6340625286102295 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050769, + "balance_loss_mlp": 1.00651896, + "epoch": 0.35648326279338205, + "flos": 683610468864.0, + "grad_norm": 0.03085087761867809, + "language_loss": 0.87196577, + "learning_rate": 0.0007455803382466946, + "loss": 0.88247347, + "num_input_tokens_seen": 153551888, + "router_z_loss_mlp": 0.44335938, + "step": 1853, + "time_per_iteration": 2.7936782836914062 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048744, + "balance_loss_mlp": 1.00468445, + "epoch": 0.35667564447864564, + "flos": 630341980416.0, + "grad_norm": 0.02905562967314866, + "language_loss": 0.8756358, + "learning_rate": 0.0007453089157467979, + "loss": 0.88612318, + "num_input_tokens_seen": 153626912, + "router_z_loss_mlp": 0.44140625, + "step": 1854, + "time_per_iteration": 2.8003768920898438 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053337, + "balance_loss_mlp": 1.00920558, + "epoch": 0.35686802616390917, + "flos": 815505844224.0, + "grad_norm": 0.03187136352260198, + "language_loss": 0.82840991, + "learning_rate": 0.0007450373980095341, + "loss": 0.83894324, + "num_input_tokens_seen": 153711312, + "router_z_loss_mlp": 0.44213867, + "step": 1855, + "time_per_iteration": 3.072218179702759 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052734, + "balance_loss_mlp": 1.00869787, + "epoch": 0.35706040784917276, + "flos": 527206280448.0, + "grad_norm": 0.03314729603592228, + "language_loss": 0.87318838, + "learning_rate": 0.0007447657851403155, + "loss": 0.88371575, + "num_input_tokens_seen": 153780208, + "router_z_loss_mlp": 0.44116211, + "step": 1856, + "time_per_iteration": 2.5849640369415283 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047421, + "balance_loss_mlp": 1.00338531, + "epoch": 0.35725278953443634, + "flos": 513065570304.0, + "grad_norm": 0.033114806318055315, + "language_loss": 0.79136717, + "learning_rate": 0.0007444940772445915, + "loss": 0.80184138, + "num_input_tokens_seen": 153853152, + "router_z_loss_mlp": 0.44116211, + "step": 1857, + "time_per_iteration": 2.729100227355957 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048076, + "balance_loss_mlp": 1.00404048, + "epoch": 0.3574451712196999, + "flos": 488493573888.0, + "grad_norm": 0.030889137628629628, + "language_loss": 0.80389744, + "learning_rate": 0.0007442222744278484, + "loss": 0.81437826, + "num_input_tokens_seen": 153924160, + "router_z_loss_mlp": 0.44116211, + "step": 1858, + "time_per_iteration": 2.673224687576294 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048438, + "balance_loss_mlp": 1.00433075, + "epoch": 0.35763755290496346, + "flos": 551822018304.0, + "grad_norm": 0.029026961526961815, + "language_loss": 0.8481214, + "learning_rate": 0.0007439503767956099, + "loss": 0.8586058, + "num_input_tokens_seen": 153998688, + "router_z_loss_mlp": 0.44189453, + "step": 1859, + "time_per_iteration": 2.7095680236816406 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104821, + "balance_loss_mlp": 1.00567627, + "epoch": 0.357829934590227, + "flos": 1507228232448.0, + "grad_norm": 0.007157576597672099, + "language_loss": 0.79671603, + "learning_rate": 0.0007436783844534352, + "loss": 0.80719817, + "num_input_tokens_seen": 154230960, + "router_z_loss_mlp": 0.42578125, + "step": 1860, + "time_per_iteration": 4.909587383270264 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049455, + "balance_loss_mlp": 1.00549006, + "epoch": 0.3580223162754906, + "flos": 569842260480.0, + "grad_norm": 0.027013738684289513, + "language_loss": 0.86190987, + "learning_rate": 0.000743406297506922, + "loss": 0.87240434, + "num_input_tokens_seen": 154309104, + "router_z_loss_mlp": 0.44042969, + "step": 1861, + "time_per_iteration": 2.7355735301971436 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104917, + "balance_loss_mlp": 1.00518215, + "epoch": 0.3582146979607541, + "flos": 627761131776.0, + "grad_norm": 0.0339710504259095, + "language_loss": 0.84903038, + "learning_rate": 0.0007431341160617031, + "loss": 0.8595221, + "num_input_tokens_seen": 154387424, + "router_z_loss_mlp": 0.44067383, + "step": 1862, + "time_per_iteration": 2.8932178020477295 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054928, + "balance_loss_mlp": 1.01082051, + "epoch": 0.3584070796460177, + "flos": 508319923200.0, + "grad_norm": 0.030700215862736833, + "language_loss": 0.88826722, + "learning_rate": 0.0007428618402234491, + "loss": 0.89881647, + "num_input_tokens_seen": 154459952, + "router_z_loss_mlp": 0.44189453, + "step": 1863, + "time_per_iteration": 2.6574699878692627 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105281, + "balance_loss_mlp": 1.00882196, + "epoch": 0.3585994613312813, + "flos": 607641219840.0, + "grad_norm": 0.030466419719222444, + "language_loss": 0.80836076, + "learning_rate": 0.0007425894700978668, + "loss": 0.8188889, + "num_input_tokens_seen": 154535456, + "router_z_loss_mlp": 0.44067383, + "step": 1864, + "time_per_iteration": 2.7388875484466553 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048686, + "balance_loss_mlp": 1.00467396, + "epoch": 0.3587918430165448, + "flos": 1415089579776.0, + "grad_norm": 0.030441642762586523, + "language_loss": 0.8033703, + "learning_rate": 0.0007423170057906996, + "loss": 0.8138572, + "num_input_tokens_seen": 154627568, + "router_z_loss_mlp": 0.44091797, + "step": 1865, + "time_per_iteration": 3.8431384563446045 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044851, + "balance_loss_mlp": 1.00086236, + "epoch": 0.3589842247018084, + "flos": 479514561024.0, + "grad_norm": 0.03198832631900347, + "language_loss": 0.8674798, + "learning_rate": 0.0007420444474077275, + "loss": 0.87792838, + "num_input_tokens_seen": 154694640, + "router_z_loss_mlp": 0.44067383, + "step": 1866, + "time_per_iteration": 2.5487258434295654 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046851, + "balance_loss_mlp": 1.0028863, + "epoch": 0.35917660638707194, + "flos": 505706026752.0, + "grad_norm": 0.036738697797889144, + "language_loss": 0.90374953, + "learning_rate": 0.0007417717950547671, + "loss": 0.91421801, + "num_input_tokens_seen": 154762048, + "router_z_loss_mlp": 0.44042969, + "step": 1867, + "time_per_iteration": 2.6784894466400146 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052044, + "balance_loss_mlp": 1.00960541, + "epoch": 0.3593689880723355, + "flos": 1495484645376.0, + "grad_norm": 0.0080630279180651, + "language_loss": 0.75996608, + "learning_rate": 0.0007414990488376713, + "loss": 0.77048653, + "num_input_tokens_seen": 154989952, + "router_z_loss_mlp": 0.42480469, + "step": 1868, + "time_per_iteration": 4.930212497711182 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104515, + "balance_loss_mlp": 1.00118589, + "epoch": 0.35956136975759906, + "flos": 529672422912.0, + "grad_norm": 0.03031015371847706, + "language_loss": 0.85577166, + "learning_rate": 0.0007412262088623299, + "loss": 0.86622322, + "num_input_tokens_seen": 155066992, + "router_z_loss_mlp": 0.44042969, + "step": 1869, + "time_per_iteration": 2.73066782951355 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047751, + "balance_loss_mlp": 1.00385797, + "epoch": 0.35975375144286265, + "flos": 536000600832.0, + "grad_norm": 0.03552204952813077, + "language_loss": 0.80084878, + "learning_rate": 0.0007409532752346684, + "loss": 0.81132627, + "num_input_tokens_seen": 155137616, + "router_z_loss_mlp": 0.43969727, + "step": 1870, + "time_per_iteration": 2.6379218101501465 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050376, + "balance_loss_mlp": 1.00638759, + "epoch": 0.3599461331281262, + "flos": 505929603072.0, + "grad_norm": 0.028943079800369927, + "language_loss": 0.8876543, + "learning_rate": 0.0007406802480606491, + "loss": 0.89815807, + "num_input_tokens_seen": 155209248, + "router_z_loss_mlp": 0.44067383, + "step": 1871, + "time_per_iteration": 2.6258225440979004 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049318, + "balance_loss_mlp": 1.00547302, + "epoch": 0.36013851481338977, + "flos": 512537737728.0, + "grad_norm": 0.03609789661305553, + "language_loss": 0.91903639, + "learning_rate": 0.0007404071274462707, + "loss": 0.92952955, + "num_input_tokens_seen": 155274176, + "router_z_loss_mlp": 0.43920898, + "step": 1872, + "time_per_iteration": 2.6111674308776855 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049057, + "balance_loss_mlp": 1.00494921, + "epoch": 0.36033089649865335, + "flos": 548632657152.0, + "grad_norm": 0.03255043761438457, + "language_loss": 0.84506214, + "learning_rate": 0.0007401339134975682, + "loss": 0.85555267, + "num_input_tokens_seen": 155343232, + "router_z_loss_mlp": 0.44189453, + "step": 1873, + "time_per_iteration": 2.6355786323547363 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049816, + "balance_loss_mlp": 1.00575614, + "epoch": 0.3605232781839169, + "flos": 459614334720.0, + "grad_norm": 0.03456024010205507, + "language_loss": 0.84983587, + "learning_rate": 0.0007398606063206122, + "loss": 0.86033404, + "num_input_tokens_seen": 155410080, + "router_z_loss_mlp": 0.44140625, + "step": 1874, + "time_per_iteration": 2.5788064002990723 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049812, + "balance_loss_mlp": 1.00577569, + "epoch": 0.36071565986918047, + "flos": 510564434688.0, + "grad_norm": 0.03262157431229983, + "language_loss": 0.79280519, + "learning_rate": 0.0007395872060215101, + "loss": 0.80330336, + "num_input_tokens_seen": 155476240, + "router_z_loss_mlp": 0.44116211, + "step": 1875, + "time_per_iteration": 2.59242582321167 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051751, + "balance_loss_mlp": 1.00785792, + "epoch": 0.360908041554444, + "flos": 560257647360.0, + "grad_norm": 0.03426029536230158, + "language_loss": 0.89306337, + "learning_rate": 0.0007393137127064056, + "loss": 0.9035809, + "num_input_tokens_seen": 155543392, + "router_z_loss_mlp": 0.43969727, + "step": 1876, + "time_per_iteration": 2.6217613220214844 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049616, + "balance_loss_mlp": 1.00577068, + "epoch": 0.3611004232397076, + "flos": 524879143680.0, + "grad_norm": 0.03313366432597027, + "language_loss": 0.84778088, + "learning_rate": 0.0007390401264814779, + "loss": 0.85827708, + "num_input_tokens_seen": 155613264, + "router_z_loss_mlp": 0.43920898, + "step": 1877, + "time_per_iteration": 2.621366262435913 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051372, + "balance_loss_mlp": 1.00752687, + "epoch": 0.3612928049249711, + "flos": 542033270784.0, + "grad_norm": 0.036139064810301956, + "language_loss": 0.85492337, + "learning_rate": 0.0007387664474529427, + "loss": 0.86543715, + "num_input_tokens_seen": 155683712, + "router_z_loss_mlp": 0.43920898, + "step": 1878, + "time_per_iteration": 2.6200942993164062 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051486, + "balance_loss_mlp": 1.00776029, + "epoch": 0.3614851866102347, + "flos": 553630070784.0, + "grad_norm": 0.03346030230294773, + "language_loss": 0.91826439, + "learning_rate": 0.0007384926757270518, + "loss": 0.92877924, + "num_input_tokens_seen": 155751760, + "router_z_loss_mlp": 0.43798828, + "step": 1879, + "time_per_iteration": 2.6367645263671875 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048472, + "balance_loss_mlp": 1.00481761, + "epoch": 0.36167756829549824, + "flos": 773427832320.0, + "grad_norm": 0.030641441804162946, + "language_loss": 0.80120707, + "learning_rate": 0.0007382188114100924, + "loss": 0.81169182, + "num_input_tokens_seen": 155830464, + "router_z_loss_mlp": 0.43725586, + "step": 1880, + "time_per_iteration": 2.9662272930145264 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048714, + "balance_loss_mlp": 1.0051316, + "epoch": 0.36186994998076183, + "flos": 713188627200.0, + "grad_norm": 0.030233131555612264, + "language_loss": 0.82161707, + "learning_rate": 0.0007379448546083884, + "loss": 0.83210421, + "num_input_tokens_seen": 155906208, + "router_z_loss_mlp": 0.43652344, + "step": 1881, + "time_per_iteration": 2.9433577060699463 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104791, + "balance_loss_mlp": 1.00420797, + "epoch": 0.3620623316660254, + "flos": 748901522688.0, + "grad_norm": 0.028477152913266954, + "language_loss": 0.88624489, + "learning_rate": 0.0007376708054282992, + "loss": 0.89672405, + "num_input_tokens_seen": 155983584, + "router_z_loss_mlp": 0.43774414, + "step": 1882, + "time_per_iteration": 2.9565789699554443 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047818, + "balance_loss_mlp": 1.00425851, + "epoch": 0.36225471335128895, + "flos": 483535044096.0, + "grad_norm": 0.03088815199044137, + "language_loss": 0.84632647, + "learning_rate": 0.0007373966639762201, + "loss": 0.85680467, + "num_input_tokens_seen": 156052464, + "router_z_loss_mlp": 0.4362793, + "step": 1883, + "time_per_iteration": 2.6308107376098633 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051227, + "balance_loss_mlp": 1.00762069, + "epoch": 0.36244709503655254, + "flos": 507911654400.0, + "grad_norm": 0.045291722940018896, + "language_loss": 0.89109468, + "learning_rate": 0.0007371224303585822, + "loss": 0.90160698, + "num_input_tokens_seen": 156121424, + "router_z_loss_mlp": 0.43676758, + "step": 1884, + "time_per_iteration": 2.5738682746887207 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053806, + "balance_loss_mlp": 1.01194, + "epoch": 0.36263947672181607, + "flos": 1397054741760.0, + "grad_norm": 0.007615502937667497, + "language_loss": 0.80357069, + "learning_rate": 0.0007368481046818524, + "loss": 0.81410873, + "num_input_tokens_seen": 156346144, + "router_z_loss_mlp": 0.41894531, + "step": 1885, + "time_per_iteration": 4.7547221183776855 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105859, + "balance_loss_mlp": 1.01500738, + "epoch": 0.36283185840707965, + "flos": 654523204608.0, + "grad_norm": 0.03432185210428161, + "language_loss": 0.83272493, + "learning_rate": 0.0007365736870525335, + "loss": 0.84331077, + "num_input_tokens_seen": 156420880, + "router_z_loss_mlp": 0.43652344, + "step": 1886, + "time_per_iteration": 2.8305654525756836 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049425, + "balance_loss_mlp": 1.00591362, + "epoch": 0.3630242400923432, + "flos": 489845725440.0, + "grad_norm": 0.036050619102321185, + "language_loss": 0.8310129, + "learning_rate": 0.000736299177577164, + "loss": 0.84150714, + "num_input_tokens_seen": 156485616, + "router_z_loss_mlp": 0.43579102, + "step": 1887, + "time_per_iteration": 2.632485866546631 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105207, + "balance_loss_mlp": 1.00853443, + "epoch": 0.3632166217776068, + "flos": 518232125184.0, + "grad_norm": 0.034844830144856315, + "language_loss": 0.84275633, + "learning_rate": 0.0007360245763623174, + "loss": 0.85327709, + "num_input_tokens_seen": 156557840, + "router_z_loss_mlp": 0.43603516, + "step": 1888, + "time_per_iteration": 2.6480350494384766 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049354, + "balance_loss_mlp": 1.00596213, + "epoch": 0.36340900346287036, + "flos": 647348353536.0, + "grad_norm": 0.03423797247490227, + "language_loss": 0.90607542, + "learning_rate": 0.0007357498835146039, + "loss": 0.91656893, + "num_input_tokens_seen": 156632496, + "router_z_loss_mlp": 0.43457031, + "step": 1889, + "time_per_iteration": 2.8152430057525635 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055179, + "balance_loss_mlp": 1.01154852, + "epoch": 0.3636013851481339, + "flos": 554411615232.0, + "grad_norm": 0.0362068794335816, + "language_loss": 0.87730169, + "learning_rate": 0.0007354750991406684, + "loss": 0.8878535, + "num_input_tokens_seen": 156705296, + "router_z_loss_mlp": 0.43701172, + "step": 1890, + "time_per_iteration": 2.71056866645813 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047867, + "balance_loss_mlp": 1.0042125, + "epoch": 0.3637937668333975, + "flos": 547692665088.0, + "grad_norm": 0.03762567530645649, + "language_loss": 0.81321651, + "learning_rate": 0.0007352002233471919, + "loss": 0.82369518, + "num_input_tokens_seen": 156773376, + "router_z_loss_mlp": 0.43725586, + "step": 1891, + "time_per_iteration": 2.6590068340301514 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054921, + "balance_loss_mlp": 1.01098096, + "epoch": 0.363986148518661, + "flos": 539211349248.0, + "grad_norm": 0.036762310622647384, + "language_loss": 0.79772675, + "learning_rate": 0.0007349252562408906, + "loss": 0.808276, + "num_input_tokens_seen": 156844336, + "router_z_loss_mlp": 0.44018555, + "step": 1892, + "time_per_iteration": 2.715721368789673 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0111044, + "balance_loss_mlp": 1.0663805, + "epoch": 0.3641785302039246, + "flos": 661511417856.0, + "grad_norm": 0.04360229312277944, + "language_loss": 0.82000142, + "learning_rate": 0.0007346501979285158, + "loss": 0.83110583, + "num_input_tokens_seen": 156918848, + "router_z_loss_mlp": 0.44140625, + "step": 1893, + "time_per_iteration": 2.927184820175171 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061302, + "balance_loss_mlp": 1.01934052, + "epoch": 0.36437091188918813, + "flos": 1472084965632.0, + "grad_norm": 0.015393341944361743, + "language_loss": 0.80539101, + "learning_rate": 0.0007343750485168551, + "loss": 0.81600404, + "num_input_tokens_seen": 157134736, + "router_z_loss_mlp": 0.41992188, + "step": 1894, + "time_per_iteration": 4.786630868911743 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050162, + "balance_loss_mlp": 1.00648379, + "epoch": 0.3645632935744517, + "flos": 598445433600.0, + "grad_norm": 0.030741456608760154, + "language_loss": 0.86771834, + "learning_rate": 0.0007340998081127308, + "loss": 0.87822002, + "num_input_tokens_seen": 157211920, + "router_z_loss_mlp": 0.4375, + "step": 1895, + "time_per_iteration": 2.7590408325195312 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046448, + "balance_loss_mlp": 1.00284135, + "epoch": 0.36475567525971525, + "flos": 600696748032.0, + "grad_norm": 0.032247737775586885, + "language_loss": 0.91682166, + "learning_rate": 0.0007338244768230007, + "loss": 0.92728615, + "num_input_tokens_seen": 157284224, + "router_z_loss_mlp": 0.43676758, + "step": 1896, + "time_per_iteration": 2.806001663208008 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048582, + "balance_loss_mlp": 1.00502336, + "epoch": 0.36494805694497884, + "flos": 799832180736.0, + "grad_norm": 0.03166243516623692, + "language_loss": 0.89817142, + "learning_rate": 0.0007335490547545578, + "loss": 0.90865725, + "num_input_tokens_seen": 157367920, + "router_z_loss_mlp": 0.4362793, + "step": 1897, + "time_per_iteration": 3.0448927879333496 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049034, + "balance_loss_mlp": 1.00535595, + "epoch": 0.3651404386302424, + "flos": 638478210816.0, + "grad_norm": 0.03536594015703217, + "language_loss": 0.82896376, + "learning_rate": 0.0007332735420143308, + "loss": 0.83945411, + "num_input_tokens_seen": 157438672, + "router_z_loss_mlp": 0.4375, + "step": 1898, + "time_per_iteration": 2.739990234375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047853, + "balance_loss_mlp": 1.00419891, + "epoch": 0.36533282031550596, + "flos": 492563634432.0, + "grad_norm": 0.03491103953335563, + "language_loss": 0.87321162, + "learning_rate": 0.0007329979387092826, + "loss": 0.88369012, + "num_input_tokens_seen": 157505888, + "router_z_loss_mlp": 0.43725586, + "step": 1899, + "time_per_iteration": 2.5661838054656982 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044023, + "balance_loss_mlp": 1.00020182, + "epoch": 0.36552520200076954, + "flos": 857509979136.0, + "grad_norm": 0.025671163998745472, + "language_loss": 0.84557235, + "learning_rate": 0.0007327222449464124, + "loss": 0.85601258, + "num_input_tokens_seen": 157601568, + "router_z_loss_mlp": 0.43896484, + "step": 1900, + "time_per_iteration": 3.2916476726531982 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049138, + "balance_loss_mlp": 1.00545931, + "epoch": 0.3657175836860331, + "flos": 484716109056.0, + "grad_norm": 0.033162883177173925, + "language_loss": 0.89287698, + "learning_rate": 0.0007324464608327538, + "loss": 0.90336835, + "num_input_tokens_seen": 157670992, + "router_z_loss_mlp": 0.4375, + "step": 1901, + "time_per_iteration": 2.6514644622802734 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050151, + "balance_loss_mlp": 1.00647259, + "epoch": 0.36590996537129666, + "flos": 435721815552.0, + "grad_norm": 0.0385016057803441, + "language_loss": 0.88887352, + "learning_rate": 0.0007321705864753758, + "loss": 0.89937502, + "num_input_tokens_seen": 157743616, + "router_z_loss_mlp": 0.4375, + "step": 1902, + "time_per_iteration": 2.6785683631896973 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045565, + "balance_loss_mlp": 1.00171947, + "epoch": 0.3661023470565602, + "flos": 713514270720.0, + "grad_norm": 0.027132815564249787, + "language_loss": 0.85073566, + "learning_rate": 0.0007318946219813823, + "loss": 0.86119133, + "num_input_tokens_seen": 157823520, + "router_z_loss_mlp": 0.43920898, + "step": 1903, + "time_per_iteration": 2.9874324798583984 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104835, + "balance_loss_mlp": 1.00431406, + "epoch": 0.3662947287418238, + "flos": 565823722752.0, + "grad_norm": 0.03452387251033087, + "language_loss": 0.90632051, + "learning_rate": 0.000731618567457912, + "loss": 0.91680402, + "num_input_tokens_seen": 157893248, + "router_z_loss_mlp": 0.44116211, + "step": 1904, + "time_per_iteration": 2.684290885925293 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049154, + "balance_loss_mlp": 1.00516582, + "epoch": 0.3664871104270873, + "flos": 791203110912.0, + "grad_norm": 0.032826620308443535, + "language_loss": 0.87174082, + "learning_rate": 0.000731342423012139, + "loss": 0.88223237, + "num_input_tokens_seen": 157973216, + "router_z_loss_mlp": 0.44067383, + "step": 1905, + "time_per_iteration": 3.0617177486419678 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051542, + "balance_loss_mlp": 1.00750625, + "epoch": 0.3666794921123509, + "flos": 753981561600.0, + "grad_norm": 0.03506961035904521, + "language_loss": 0.83108962, + "learning_rate": 0.0007310661887512722, + "loss": 0.84160507, + "num_input_tokens_seen": 158051088, + "router_z_loss_mlp": 0.44116211, + "step": 1906, + "time_per_iteration": 3.046901226043701 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045363, + "balance_loss_mlp": 1.0011121, + "epoch": 0.3668718737976145, + "flos": 524607935232.0, + "grad_norm": 0.03388484398579531, + "language_loss": 0.82964659, + "learning_rate": 0.0007307898647825549, + "loss": 0.84010023, + "num_input_tokens_seen": 158124368, + "router_z_loss_mlp": 0.44335938, + "step": 1907, + "time_per_iteration": 2.6592161655426025 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051828, + "balance_loss_mlp": 1.00767255, + "epoch": 0.367064255482878, + "flos": 573046205952.0, + "grad_norm": 0.03554957537225944, + "language_loss": 0.8992576, + "learning_rate": 0.0007305134512132659, + "loss": 0.90977585, + "num_input_tokens_seen": 158191472, + "router_z_loss_mlp": 0.44238281, + "step": 1908, + "time_per_iteration": 2.6961183547973633 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055033, + "balance_loss_mlp": 1.01078284, + "epoch": 0.3672566371681416, + "flos": 448054473216.0, + "grad_norm": 0.04018581054394134, + "language_loss": 0.843858, + "learning_rate": 0.0007302369481507183, + "loss": 0.85440832, + "num_input_tokens_seen": 158254384, + "router_z_loss_mlp": 0.44335938, + "step": 1909, + "time_per_iteration": 2.488203763961792 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01056026, + "balance_loss_mlp": 1.01358795, + "epoch": 0.36744901885340514, + "flos": 1543366893312.0, + "grad_norm": 0.00771809390988723, + "language_loss": 0.79961759, + "learning_rate": 0.00072996035570226, + "loss": 0.81017786, + "num_input_tokens_seen": 158486160, + "router_z_loss_mlp": 0.42480469, + "step": 1910, + "time_per_iteration": 4.828088045120239 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059395, + "balance_loss_mlp": 1.01457202, + "epoch": 0.36764140053866873, + "flos": 564762221568.0, + "grad_norm": 0.032014471163266715, + "language_loss": 0.86287534, + "learning_rate": 0.000729683673975274, + "loss": 0.87346923, + "num_input_tokens_seen": 158555616, + "router_z_loss_mlp": 0.44824219, + "step": 1911, + "time_per_iteration": 2.6982359886169434 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01058317, + "balance_loss_mlp": 1.01366162, + "epoch": 0.36783378222393226, + "flos": 1218652614144.0, + "grad_norm": 0.03007186425733569, + "language_loss": 0.8357197, + "learning_rate": 0.0007294069030771774, + "loss": 0.84630299, + "num_input_tokens_seen": 158653984, + "router_z_loss_mlp": 0.44702148, + "step": 1912, + "time_per_iteration": 3.6612210273742676 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049037, + "balance_loss_mlp": 1.0043577, + "epoch": 0.36802616390919585, + "flos": 499720988928.0, + "grad_norm": 0.03131225250708543, + "language_loss": 0.91280997, + "learning_rate": 0.0007291300431154224, + "loss": 0.92330033, + "num_input_tokens_seen": 158719728, + "router_z_loss_mlp": 0.44726562, + "step": 1913, + "time_per_iteration": 2.574129581451416 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053715, + "balance_loss_mlp": 1.01108551, + "epoch": 0.36821854559445943, + "flos": 1585618904064.0, + "grad_norm": 0.006266309435424964, + "language_loss": 0.70389736, + "learning_rate": 0.0007288530941974955, + "loss": 0.7144345, + "num_input_tokens_seen": 158952544, + "router_z_loss_mlp": 0.42675781, + "step": 1914, + "time_per_iteration": 4.960723876953125 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052577, + "balance_loss_mlp": 1.0082792, + "epoch": 0.36841092727972297, + "flos": 837090668544.0, + "grad_norm": 0.03136779226227803, + "language_loss": 0.80375087, + "learning_rate": 0.0007285760564309179, + "loss": 0.81427664, + "num_input_tokens_seen": 159039680, + "router_z_loss_mlp": 0.44384766, + "step": 1915, + "time_per_iteration": 3.0985960960388184 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010541, + "balance_loss_mlp": 1.00965917, + "epoch": 0.36860330896498655, + "flos": 691211085312.0, + "grad_norm": 0.031502418433557444, + "language_loss": 0.85988045, + "learning_rate": 0.0007282989299232448, + "loss": 0.87042141, + "num_input_tokens_seen": 159128128, + "router_z_loss_mlp": 0.4453125, + "step": 1916, + "time_per_iteration": 3.034715175628662 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055093, + "balance_loss_mlp": 1.01065195, + "epoch": 0.3687956906502501, + "flos": 555240791808.0, + "grad_norm": 0.03953946470073971, + "language_loss": 0.84794021, + "learning_rate": 0.0007280217147820668, + "loss": 0.85849106, + "num_input_tokens_seen": 159193248, + "router_z_loss_mlp": 0.4453125, + "step": 1917, + "time_per_iteration": 2.61297869682312 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053797, + "balance_loss_mlp": 1.0093317, + "epoch": 0.3689880723355137, + "flos": 577820043264.0, + "grad_norm": 0.030128455165502346, + "language_loss": 0.7994225, + "learning_rate": 0.0007277444111150079, + "loss": 0.80996048, + "num_input_tokens_seen": 159265824, + "router_z_loss_mlp": 0.44555664, + "step": 1918, + "time_per_iteration": 2.7244873046875 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052755, + "balance_loss_mlp": 1.00845671, + "epoch": 0.3691804540207772, + "flos": 529887250944.0, + "grad_norm": 0.035938670194894204, + "language_loss": 0.84948546, + "learning_rate": 0.0007274670190297272, + "loss": 0.86001301, + "num_input_tokens_seen": 159332992, + "router_z_loss_mlp": 0.44384766, + "step": 1919, + "time_per_iteration": 2.6209609508514404 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048627, + "balance_loss_mlp": 1.0041858, + "epoch": 0.3693728357060408, + "flos": 562181372928.0, + "grad_norm": 0.026922320390231402, + "language_loss": 0.82273662, + "learning_rate": 0.0007271895386339179, + "loss": 0.83322287, + "num_input_tokens_seen": 159409808, + "router_z_loss_mlp": 0.4453125, + "step": 1920, + "time_per_iteration": 2.7952609062194824 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047195, + "balance_loss_mlp": 1.00292087, + "epoch": 0.3695652173913043, + "flos": 580900534272.0, + "grad_norm": 0.03055527362799568, + "language_loss": 0.83712995, + "learning_rate": 0.0007269119700353073, + "loss": 0.84760189, + "num_input_tokens_seen": 159486128, + "router_z_loss_mlp": 0.44360352, + "step": 1921, + "time_per_iteration": 2.808595895767212 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049468, + "balance_loss_mlp": 1.00519335, + "epoch": 0.3697575990765679, + "flos": 514059997440.0, + "grad_norm": 0.029192022992987326, + "language_loss": 0.85655916, + "learning_rate": 0.0007266343133416571, + "loss": 0.86705387, + "num_input_tokens_seen": 159562224, + "router_z_loss_mlp": 0.44360352, + "step": 1922, + "time_per_iteration": 2.7229409217834473 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045757, + "balance_loss_mlp": 1.00255585, + "epoch": 0.3699499807618315, + "flos": 1573906430976.0, + "grad_norm": 0.004633598174219594, + "language_loss": 0.77116919, + "learning_rate": 0.0007263565686607632, + "loss": 0.7816267, + "num_input_tokens_seen": 159784768, + "router_z_loss_mlp": 0.43261719, + "step": 1923, + "time_per_iteration": 4.855220556259155 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049661, + "balance_loss_mlp": 1.00526702, + "epoch": 0.37014236244709503, + "flos": 498325095936.0, + "grad_norm": 0.04063724538866958, + "language_loss": 0.84789312, + "learning_rate": 0.0007260787361004556, + "loss": 0.85838968, + "num_input_tokens_seen": 159848608, + "router_z_loss_mlp": 0.44482422, + "step": 1924, + "time_per_iteration": 2.5634405612945557 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063278, + "balance_loss_mlp": 1.01998138, + "epoch": 0.3703347441323586, + "flos": 1447608233472.0, + "grad_norm": 0.011285785538321925, + "language_loss": 0.73761505, + "learning_rate": 0.0007258008157685987, + "loss": 0.7482478, + "num_input_tokens_seen": 160080928, + "router_z_loss_mlp": 0.43359375, + "step": 1925, + "time_per_iteration": 4.881471157073975 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050604, + "balance_loss_mlp": 1.00601971, + "epoch": 0.37052712581762215, + "flos": 564714589440.0, + "grad_norm": 0.030700116077417884, + "language_loss": 0.87676865, + "learning_rate": 0.0007255228077730903, + "loss": 0.88727468, + "num_input_tokens_seen": 160148976, + "router_z_loss_mlp": 0.44628906, + "step": 1926, + "time_per_iteration": 2.6604056358337402 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048583, + "balance_loss_mlp": 1.00426066, + "epoch": 0.37071950750288574, + "flos": 927571958016.0, + "grad_norm": 0.030848240929213684, + "language_loss": 0.82266426, + "learning_rate": 0.0007252447122218632, + "loss": 0.83315009, + "num_input_tokens_seen": 160233504, + "router_z_loss_mlp": 0.4440918, + "step": 1927, + "time_per_iteration": 3.189232110977173 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048396, + "balance_loss_mlp": 1.00424135, + "epoch": 0.37091188918814927, + "flos": 419201478912.0, + "grad_norm": 0.038028798643346066, + "language_loss": 0.88517463, + "learning_rate": 0.0007249665292228834, + "loss": 0.89565861, + "num_input_tokens_seen": 160299696, + "router_z_loss_mlp": 0.44238281, + "step": 1928, + "time_per_iteration": 2.6051783561706543 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048517, + "balance_loss_mlp": 1.00443351, + "epoch": 0.37110427087341286, + "flos": 464147099136.0, + "grad_norm": 0.03246756835091633, + "language_loss": 0.8426615, + "learning_rate": 0.000724688258884151, + "loss": 0.85314661, + "num_input_tokens_seen": 160367904, + "router_z_loss_mlp": 0.44165039, + "step": 1929, + "time_per_iteration": 2.5537402629852295 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105112, + "balance_loss_mlp": 1.00703681, + "epoch": 0.3712966525586764, + "flos": 851081679360.0, + "grad_norm": 0.026814038228573516, + "language_loss": 0.86998665, + "learning_rate": 0.0007244099013137002, + "loss": 0.88049793, + "num_input_tokens_seen": 160453600, + "router_z_loss_mlp": 0.44165039, + "step": 1930, + "time_per_iteration": 3.091195821762085 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052812, + "balance_loss_mlp": 1.00901484, + "epoch": 0.37148903424394, + "flos": 927559319040.0, + "grad_norm": 0.03484228463474462, + "language_loss": 0.89224607, + "learning_rate": 0.0007241314566195993, + "loss": 0.90277416, + "num_input_tokens_seen": 160543472, + "router_z_loss_mlp": 0.4387207, + "step": 1931, + "time_per_iteration": 3.2276151180267334 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050081, + "balance_loss_mlp": 1.00616395, + "epoch": 0.37168141592920356, + "flos": 520821722112.0, + "grad_norm": 0.033577876196724185, + "language_loss": 0.86394525, + "learning_rate": 0.0007238529249099496, + "loss": 0.87444603, + "num_input_tokens_seen": 160614016, + "router_z_loss_mlp": 0.43994141, + "step": 1932, + "time_per_iteration": 2.6099538803100586 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043194, + "balance_loss_mlp": 1.00075531, + "epoch": 0.3718737976144671, + "flos": 1449062452224.0, + "grad_norm": 0.005805601038449312, + "language_loss": 0.77856874, + "learning_rate": 0.0007235743062928872, + "loss": 0.78900075, + "num_input_tokens_seen": 160828640, + "router_z_loss_mlp": 0.42480469, + "step": 1933, + "time_per_iteration": 4.864013910293579 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051383, + "balance_loss_mlp": 1.00741839, + "epoch": 0.3720661792997307, + "flos": 760954223616.0, + "grad_norm": 0.031651541573232696, + "language_loss": 0.81381935, + "learning_rate": 0.000723295600876581, + "loss": 0.82433319, + "num_input_tokens_seen": 160913088, + "router_z_loss_mlp": 0.44042969, + "step": 1934, + "time_per_iteration": 3.003988742828369 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047402, + "balance_loss_mlp": 1.00353265, + "epoch": 0.3722585609849942, + "flos": 518045487360.0, + "grad_norm": 0.031160015664157277, + "language_loss": 0.88386387, + "learning_rate": 0.0007230168087692344, + "loss": 0.89433783, + "num_input_tokens_seen": 160982960, + "router_z_loss_mlp": 0.43945312, + "step": 1935, + "time_per_iteration": 2.6490824222564697 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045452, + "balance_loss_mlp": 1.00165451, + "epoch": 0.3724509426702578, + "flos": 783869812224.0, + "grad_norm": 0.03743087194604022, + "language_loss": 0.82867873, + "learning_rate": 0.0007227379300790839, + "loss": 0.83913326, + "num_input_tokens_seen": 161066000, + "router_z_loss_mlp": 0.4387207, + "step": 1936, + "time_per_iteration": 3.010700225830078 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044084, + "balance_loss_mlp": 1.00011992, + "epoch": 0.37264332435552133, + "flos": 392599799040.0, + "grad_norm": 0.032423549870759565, + "language_loss": 0.86443603, + "learning_rate": 0.0007224589649143997, + "loss": 0.87487686, + "num_input_tokens_seen": 161131040, + "router_z_loss_mlp": 0.44042969, + "step": 1937, + "time_per_iteration": 2.54010272026062 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044521, + "balance_loss_mlp": 1.00072384, + "epoch": 0.3728357060407849, + "flos": 543913254912.0, + "grad_norm": 0.03387233199209411, + "language_loss": 0.81436574, + "learning_rate": 0.0007221799133834861, + "loss": 0.82481098, + "num_input_tokens_seen": 161201248, + "router_z_loss_mlp": 0.4387207, + "step": 1938, + "time_per_iteration": 2.6355655193328857 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045317, + "balance_loss_mlp": 1.00154293, + "epoch": 0.3730280877260485, + "flos": 434484370176.0, + "grad_norm": 0.03416430777388856, + "language_loss": 0.82122993, + "learning_rate": 0.00072190077559468, + "loss": 0.83168304, + "num_input_tokens_seen": 161266288, + "router_z_loss_mlp": 0.43847656, + "step": 1939, + "time_per_iteration": 2.5033867359161377 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049643, + "balance_loss_mlp": 1.00579786, + "epoch": 0.37322046941131204, + "flos": 532511841024.0, + "grad_norm": 0.031902006564455146, + "language_loss": 0.89473069, + "learning_rate": 0.0007216215516563527, + "loss": 0.90522707, + "num_input_tokens_seen": 161335648, + "router_z_loss_mlp": 0.43920898, + "step": 1940, + "time_per_iteration": 2.685201406478882 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049207, + "balance_loss_mlp": 1.00538588, + "epoch": 0.3734128510965756, + "flos": 532576969728.0, + "grad_norm": 0.03682978505173481, + "language_loss": 0.83770883, + "learning_rate": 0.0007213422416769083, + "loss": 0.84820092, + "num_input_tokens_seen": 161403440, + "router_z_loss_mlp": 0.43896484, + "step": 1941, + "time_per_iteration": 2.5981826782226562 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104851, + "balance_loss_mlp": 1.00454593, + "epoch": 0.37360523278183916, + "flos": 501433777152.0, + "grad_norm": 0.029644951468961563, + "language_loss": 0.75750655, + "learning_rate": 0.0007210628457647849, + "loss": 0.76799166, + "num_input_tokens_seen": 161472864, + "router_z_loss_mlp": 0.44042969, + "step": 1942, + "time_per_iteration": 2.5780391693115234 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047618, + "balance_loss_mlp": 1.00365365, + "epoch": 0.37379761446710275, + "flos": 549112857600.0, + "grad_norm": 0.03283775645447924, + "language_loss": 0.79155779, + "learning_rate": 0.000720783364028453, + "loss": 0.80203396, + "num_input_tokens_seen": 161548096, + "router_z_loss_mlp": 0.44042969, + "step": 1943, + "time_per_iteration": 2.7498555183410645 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052612, + "balance_loss_mlp": 1.0085758, + "epoch": 0.3739899961523663, + "flos": 476740271616.0, + "grad_norm": 0.03229344723146533, + "language_loss": 0.88345349, + "learning_rate": 0.0007205037965764177, + "loss": 0.89397967, + "num_input_tokens_seen": 161615600, + "router_z_loss_mlp": 0.44116211, + "step": 1944, + "time_per_iteration": 2.559565305709839 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049461, + "balance_loss_mlp": 1.00533009, + "epoch": 0.37418237783762986, + "flos": 613077037824.0, + "grad_norm": 0.033726561022773015, + "language_loss": 0.85856438, + "learning_rate": 0.0007202241435172161, + "loss": 0.86905897, + "num_input_tokens_seen": 161687408, + "router_z_loss_mlp": 0.44213867, + "step": 1945, + "time_per_iteration": 2.7495012283325195 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105036, + "balance_loss_mlp": 1.00618136, + "epoch": 0.3743747595228934, + "flos": 767629432320.0, + "grad_norm": 0.030482282234963888, + "language_loss": 0.88839138, + "learning_rate": 0.0007199444049594198, + "loss": 0.89889503, + "num_input_tokens_seen": 161764224, + "router_z_loss_mlp": 0.44262695, + "step": 1946, + "time_per_iteration": 2.927438259124756 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105097, + "balance_loss_mlp": 1.00679135, + "epoch": 0.374567141208157, + "flos": 525491546880.0, + "grad_norm": 0.03274984488565387, + "language_loss": 0.84098482, + "learning_rate": 0.0007196645810116322, + "loss": 0.85149455, + "num_input_tokens_seen": 161835520, + "router_z_loss_mlp": 0.44262695, + "step": 1947, + "time_per_iteration": 2.669954538345337 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051405, + "balance_loss_mlp": 1.00717854, + "epoch": 0.37475952289342057, + "flos": 682614096384.0, + "grad_norm": 0.03500222096290466, + "language_loss": 0.84308642, + "learning_rate": 0.0007193846717824912, + "loss": 0.85360044, + "num_input_tokens_seen": 161912000, + "router_z_loss_mlp": 0.44311523, + "step": 1948, + "time_per_iteration": 2.873595714569092 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054386, + "balance_loss_mlp": 1.01018322, + "epoch": 0.3749519045786841, + "flos": 461216307456.0, + "grad_norm": 0.03758393676626501, + "language_loss": 0.89286113, + "learning_rate": 0.0007191046773806669, + "loss": 0.90340507, + "num_input_tokens_seen": 161977296, + "router_z_loss_mlp": 0.44287109, + "step": 1949, + "time_per_iteration": 2.5632805824279785 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052491, + "balance_loss_mlp": 1.00816894, + "epoch": 0.3751442862639477, + "flos": 956388013824.0, + "grad_norm": 0.04355990755149793, + "language_loss": 0.83803475, + "learning_rate": 0.0007188245979148631, + "loss": 0.84855968, + "num_input_tokens_seen": 162051888, + "router_z_loss_mlp": 0.4440918, + "step": 1950, + "time_per_iteration": 3.153048515319824 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050564, + "balance_loss_mlp": 1.00619411, + "epoch": 0.3753366679492112, + "flos": 528806307840.0, + "grad_norm": 0.034134677221205334, + "language_loss": 0.88437903, + "learning_rate": 0.0007185444334938157, + "loss": 0.89488459, + "num_input_tokens_seen": 162124384, + "router_z_loss_mlp": 0.44458008, + "step": 1951, + "time_per_iteration": 2.77795147895813 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052982, + "balance_loss_mlp": 1.0084213, + "epoch": 0.3755290496344748, + "flos": 522849460224.0, + "grad_norm": 0.03641649118573359, + "language_loss": 0.85489821, + "learning_rate": 0.0007182641842262947, + "loss": 0.86542803, + "num_input_tokens_seen": 162191440, + "router_z_loss_mlp": 0.44628906, + "step": 1952, + "time_per_iteration": 2.6038033962249756 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063178, + "balance_loss_mlp": 1.01852179, + "epoch": 0.37572143131973834, + "flos": 622372945920.0, + "grad_norm": 0.036303705105214745, + "language_loss": 0.78406018, + "learning_rate": 0.0007179838502211022, + "loss": 0.79469192, + "num_input_tokens_seen": 162268480, + "router_z_loss_mlp": 0.44702148, + "step": 1953, + "time_per_iteration": 2.8537991046905518 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050235, + "balance_loss_mlp": 1.00565112, + "epoch": 0.37591381300500193, + "flos": 772274957568.0, + "grad_norm": 0.033405608161133214, + "language_loss": 0.87193865, + "learning_rate": 0.0007177034315870738, + "loss": 0.88244104, + "num_input_tokens_seen": 162346752, + "router_z_loss_mlp": 0.44677734, + "step": 1954, + "time_per_iteration": 2.9944725036621094 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049324, + "balance_loss_mlp": 1.00469208, + "epoch": 0.37610619469026546, + "flos": 521481757440.0, + "grad_norm": 0.05036646851246907, + "language_loss": 0.91552407, + "learning_rate": 0.0007174229284330773, + "loss": 0.92601728, + "num_input_tokens_seen": 162415120, + "router_z_loss_mlp": 0.44702148, + "step": 1955, + "time_per_iteration": 2.607128143310547 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046853, + "balance_loss_mlp": 1.0023644, + "epoch": 0.37629857637552905, + "flos": 599971584000.0, + "grad_norm": 0.029911324472659546, + "language_loss": 0.87468076, + "learning_rate": 0.0007171423408680141, + "loss": 0.88514924, + "num_input_tokens_seen": 162493280, + "router_z_loss_mlp": 0.44555664, + "step": 1956, + "time_per_iteration": 2.8234241008758545 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047093, + "balance_loss_mlp": 1.00272334, + "epoch": 0.37649095806079264, + "flos": 566019108864.0, + "grad_norm": 0.03303955535560464, + "language_loss": 0.90624022, + "learning_rate": 0.0007168616690008176, + "loss": 0.91671115, + "num_input_tokens_seen": 162560736, + "router_z_loss_mlp": 0.44458008, + "step": 1957, + "time_per_iteration": 2.645219326019287 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047066, + "balance_loss_mlp": 1.00271976, + "epoch": 0.37668333974605617, + "flos": 593569529088.0, + "grad_norm": 0.03512927569377508, + "language_loss": 0.86650079, + "learning_rate": 0.0007165809129404545, + "loss": 0.87697142, + "num_input_tokens_seen": 162630688, + "router_z_loss_mlp": 0.44433594, + "step": 1958, + "time_per_iteration": 2.762319564819336 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105165, + "balance_loss_mlp": 1.00742376, + "epoch": 0.37687572143131975, + "flos": 420365047296.0, + "grad_norm": 0.03381206580119959, + "language_loss": 0.8673501, + "learning_rate": 0.0007163000727959239, + "loss": 0.87786663, + "num_input_tokens_seen": 162694304, + "router_z_loss_mlp": 0.44311523, + "step": 1959, + "time_per_iteration": 2.4887454509735107 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047108, + "balance_loss_mlp": 1.00466919, + "epoch": 0.3770681031165833, + "flos": 1360387269888.0, + "grad_norm": 0.007286715675134549, + "language_loss": 0.77959073, + "learning_rate": 0.0007160191486762575, + "loss": 0.79006183, + "num_input_tokens_seen": 162920336, + "router_z_loss_mlp": 0.42480469, + "step": 1960, + "time_per_iteration": 4.844388961791992 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053466, + "balance_loss_mlp": 1.00938201, + "epoch": 0.3772604848018469, + "flos": 646154649600.0, + "grad_norm": 0.030030705089392724, + "language_loss": 0.85244703, + "learning_rate": 0.00071573814069052, + "loss": 0.86298174, + "num_input_tokens_seen": 163000720, + "router_z_loss_mlp": 0.44165039, + "step": 1961, + "time_per_iteration": 2.93870210647583 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043853, + "balance_loss_mlp": 0.99976981, + "epoch": 0.3774528664871104, + "flos": 903202150656.0, + "grad_norm": 0.029467737659617427, + "language_loss": 0.88618672, + "learning_rate": 0.0007154570489478081, + "loss": 0.89662528, + "num_input_tokens_seen": 163085680, + "router_z_loss_mlp": 0.44165039, + "step": 1962, + "time_per_iteration": 3.2101829051971436 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046572, + "balance_loss_mlp": 1.00241697, + "epoch": 0.377645248172374, + "flos": 789464077824.0, + "grad_norm": 0.02894999631439154, + "language_loss": 0.87102842, + "learning_rate": 0.0007151758735572514, + "loss": 0.88149416, + "num_input_tokens_seen": 163162224, + "router_z_loss_mlp": 0.44238281, + "step": 1963, + "time_per_iteration": 3.0217864513397217 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046842, + "balance_loss_mlp": 1.00282979, + "epoch": 0.3778376298576376, + "flos": 587925686016.0, + "grad_norm": 0.035422959183698866, + "language_loss": 0.81287247, + "learning_rate": 0.0007148946146280119, + "loss": 0.82334089, + "num_input_tokens_seen": 163237920, + "router_z_loss_mlp": 0.44091797, + "step": 1964, + "time_per_iteration": 2.9066553115844727 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01056515, + "balance_loss_mlp": 1.01407623, + "epoch": 0.3780300115429011, + "flos": 1399672528896.0, + "grad_norm": 0.012885740561533653, + "language_loss": 0.72192144, + "learning_rate": 0.000714613272269284, + "loss": 0.73248661, + "num_input_tokens_seen": 163455760, + "router_z_loss_mlp": 0.42480469, + "step": 1965, + "time_per_iteration": 4.874085426330566 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055702, + "balance_loss_mlp": 1.01316833, + "epoch": 0.3782223932281647, + "flos": 1360634178816.0, + "grad_norm": 0.008484298942656315, + "language_loss": 0.75341946, + "learning_rate": 0.0007143318465902943, + "loss": 0.76397645, + "num_input_tokens_seen": 163678064, + "router_z_loss_mlp": 0.42578125, + "step": 1966, + "time_per_iteration": 4.964066743850708 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048666, + "balance_loss_mlp": 1.00467777, + "epoch": 0.37841477491342823, + "flos": 705517046016.0, + "grad_norm": 0.02737284959483133, + "language_loss": 0.8436377, + "learning_rate": 0.0007140503377003022, + "loss": 0.85412437, + "num_input_tokens_seen": 163764320, + "router_z_loss_mlp": 0.44067383, + "step": 1967, + "time_per_iteration": 3.014033555984497 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105175, + "balance_loss_mlp": 1.00764298, + "epoch": 0.3786071565986918, + "flos": 530156514048.0, + "grad_norm": 0.03014770490429956, + "language_loss": 0.85294402, + "learning_rate": 0.000713768745708599, + "loss": 0.86346149, + "num_input_tokens_seen": 163831808, + "router_z_loss_mlp": 0.44189453, + "step": 1968, + "time_per_iteration": 2.6359875202178955 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052555, + "balance_loss_mlp": 1.0084002, + "epoch": 0.37879953828395535, + "flos": 994901443584.0, + "grad_norm": 0.03323886334735767, + "language_loss": 0.78270096, + "learning_rate": 0.0007134870707245085, + "loss": 0.79322648, + "num_input_tokens_seen": 163918128, + "router_z_loss_mlp": 0.44238281, + "step": 1969, + "time_per_iteration": 3.276670455932617 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054281, + "balance_loss_mlp": 1.01010239, + "epoch": 0.37899191996921894, + "flos": 627793212672.0, + "grad_norm": 0.033324026165203316, + "language_loss": 0.84867144, + "learning_rate": 0.0007132053128573864, + "loss": 0.85921425, + "num_input_tokens_seen": 163987552, + "router_z_loss_mlp": 0.44262695, + "step": 1970, + "time_per_iteration": 2.747647523880005 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051473, + "balance_loss_mlp": 1.00727034, + "epoch": 0.37918430165448247, + "flos": 687520136448.0, + "grad_norm": 0.034311044198206936, + "language_loss": 0.84702653, + "learning_rate": 0.0007129234722166211, + "loss": 0.85754126, + "num_input_tokens_seen": 164063248, + "router_z_loss_mlp": 0.44287109, + "step": 1971, + "time_per_iteration": 2.8502755165100098 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104953, + "balance_loss_mlp": 1.00535131, + "epoch": 0.37937668333974606, + "flos": 476618762496.0, + "grad_norm": 0.028798969169212138, + "language_loss": 0.91637433, + "learning_rate": 0.0007126415489116328, + "loss": 0.92686969, + "num_input_tokens_seen": 164133776, + "router_z_loss_mlp": 0.44262695, + "step": 1972, + "time_per_iteration": 2.703598737716675 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049775, + "balance_loss_mlp": 1.00559556, + "epoch": 0.37956906502500964, + "flos": 708825004032.0, + "grad_norm": 0.033945121596029554, + "language_loss": 0.81780016, + "learning_rate": 0.0007123595430518736, + "loss": 0.82829797, + "num_input_tokens_seen": 164206672, + "router_z_loss_mlp": 0.44262695, + "step": 1973, + "time_per_iteration": 2.859210252761841 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047681, + "balance_loss_mlp": 1.00345445, + "epoch": 0.3797614467102732, + "flos": 427559340288.0, + "grad_norm": 0.03504063937858188, + "language_loss": 0.86830699, + "learning_rate": 0.0007120774547468282, + "loss": 0.87878382, + "num_input_tokens_seen": 164271968, + "router_z_loss_mlp": 0.44311523, + "step": 1974, + "time_per_iteration": 2.5465054512023926 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105377, + "balance_loss_mlp": 1.00944817, + "epoch": 0.37995382839553676, + "flos": 482881811712.0, + "grad_norm": 0.031503790568027705, + "language_loss": 0.82317638, + "learning_rate": 0.0007117952841060128, + "loss": 0.83371413, + "num_input_tokens_seen": 164342800, + "router_z_loss_mlp": 0.4440918, + "step": 1975, + "time_per_iteration": 2.789965867996216 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053612, + "balance_loss_mlp": 1.00924242, + "epoch": 0.3801462100808003, + "flos": 561671036928.0, + "grad_norm": 0.03572346778222672, + "language_loss": 0.84539783, + "learning_rate": 0.0007115130312389756, + "loss": 0.85593396, + "num_input_tokens_seen": 164414928, + "router_z_loss_mlp": 0.44433594, + "step": 1976, + "time_per_iteration": 2.7104804515838623 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046782, + "balance_loss_mlp": 1.00236499, + "epoch": 0.3803385917660639, + "flos": 465888077568.0, + "grad_norm": 0.03508123942848817, + "language_loss": 0.80071044, + "learning_rate": 0.0007112306962552973, + "loss": 0.81117821, + "num_input_tokens_seen": 164483312, + "router_z_loss_mlp": 0.44506836, + "step": 1977, + "time_per_iteration": 2.644700527191162 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053171, + "balance_loss_mlp": 1.00863445, + "epoch": 0.3805309734513274, + "flos": 522905840640.0, + "grad_norm": 0.0297417361696937, + "language_loss": 0.8625899, + "learning_rate": 0.0007109482792645896, + "loss": 0.87312162, + "num_input_tokens_seen": 164555760, + "router_z_loss_mlp": 0.44580078, + "step": 1978, + "time_per_iteration": 2.736924171447754 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052388, + "balance_loss_mlp": 1.00780404, + "epoch": 0.380723355136591, + "flos": 592553714688.0, + "grad_norm": 0.03207088172149068, + "language_loss": 0.84620887, + "learning_rate": 0.0007106657803764969, + "loss": 0.85673285, + "num_input_tokens_seen": 164626768, + "router_z_loss_mlp": 0.44628906, + "step": 1979, + "time_per_iteration": 2.797027111053467 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053174, + "balance_loss_mlp": 1.00851822, + "epoch": 0.38091573682185453, + "flos": 623855354880.0, + "grad_norm": 0.034228405400289826, + "language_loss": 0.82734859, + "learning_rate": 0.0007103831997006948, + "loss": 0.83788031, + "num_input_tokens_seen": 164698016, + "router_z_loss_mlp": 0.4465332, + "step": 1980, + "time_per_iteration": 2.774831771850586 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050596, + "balance_loss_mlp": 1.00601208, + "epoch": 0.3811081185071181, + "flos": 570176652288.0, + "grad_norm": 0.02916230611543443, + "language_loss": 0.85986841, + "learning_rate": 0.0007101005373468908, + "loss": 0.87037432, + "num_input_tokens_seen": 164780320, + "router_z_loss_mlp": 0.4465332, + "step": 1981, + "time_per_iteration": 2.889430284500122 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051202, + "balance_loss_mlp": 1.00647449, + "epoch": 0.3813005001923817, + "flos": 585991266816.0, + "grad_norm": 0.029260882769569122, + "language_loss": 0.87282979, + "learning_rate": 0.0007098177934248242, + "loss": 0.88334191, + "num_input_tokens_seen": 164854400, + "router_z_loss_mlp": 0.44726562, + "step": 1982, + "time_per_iteration": 2.734011173248291 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049702, + "balance_loss_mlp": 1.00509369, + "epoch": 0.38149288187764524, + "flos": 622811350272.0, + "grad_norm": 0.03279838714755621, + "language_loss": 0.86164075, + "learning_rate": 0.0007095349680442661, + "loss": 0.87213778, + "num_input_tokens_seen": 164932896, + "router_z_loss_mlp": 0.44677734, + "step": 1983, + "time_per_iteration": 2.8532214164733887 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049432, + "balance_loss_mlp": 1.00496709, + "epoch": 0.3816852635629088, + "flos": 571798066944.0, + "grad_norm": 0.03407469020321441, + "language_loss": 0.79342288, + "learning_rate": 0.0007092520613150188, + "loss": 0.80391723, + "num_input_tokens_seen": 165002896, + "router_z_loss_mlp": 0.4453125, + "step": 1984, + "time_per_iteration": 2.6656527519226074 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055001, + "balance_loss_mlp": 1.01058352, + "epoch": 0.38187764524817236, + "flos": 566679144192.0, + "grad_norm": 0.03287674379309895, + "language_loss": 0.81891948, + "learning_rate": 0.0007089690733469165, + "loss": 0.82946956, + "num_input_tokens_seen": 165074704, + "router_z_loss_mlp": 0.44506836, + "step": 1985, + "time_per_iteration": 2.6921868324279785 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104986, + "balance_loss_mlp": 1.00544298, + "epoch": 0.38207002693343595, + "flos": 632399854080.0, + "grad_norm": 0.03591516825864857, + "language_loss": 0.8265506, + "learning_rate": 0.000708686004249825, + "loss": 0.83704919, + "num_input_tokens_seen": 165149136, + "router_z_loss_mlp": 0.44506836, + "step": 1986, + "time_per_iteration": 2.771472454071045 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046713, + "balance_loss_mlp": 1.0026772, + "epoch": 0.3822624086186995, + "flos": 549841912320.0, + "grad_norm": 0.027805852633017242, + "language_loss": 0.91746366, + "learning_rate": 0.0007084028541336413, + "loss": 0.92793083, + "num_input_tokens_seen": 165220864, + "router_z_loss_mlp": 0.44116211, + "step": 1987, + "time_per_iteration": 2.7168381214141846 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049881, + "balance_loss_mlp": 1.00572634, + "epoch": 0.38245479030396307, + "flos": 615067837440.0, + "grad_norm": 0.03052630202850825, + "language_loss": 0.86906445, + "learning_rate": 0.0007081196231082942, + "loss": 0.87956333, + "num_input_tokens_seen": 165301568, + "router_z_loss_mlp": 0.44238281, + "step": 1988, + "time_per_iteration": 2.8021280765533447 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104727, + "balance_loss_mlp": 1.00325835, + "epoch": 0.38264717198922665, + "flos": 669304508160.0, + "grad_norm": 0.03253134732635267, + "language_loss": 0.8090933, + "learning_rate": 0.0007078363112837436, + "loss": 0.81956601, + "num_input_tokens_seen": 165373152, + "router_z_loss_mlp": 0.44091797, + "step": 1989, + "time_per_iteration": 2.812901020050049 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046216, + "balance_loss_mlp": 1.00232375, + "epoch": 0.3828395536744902, + "flos": 455687170560.0, + "grad_norm": 0.03353740504071411, + "language_loss": 0.8610149, + "learning_rate": 0.000707552918769981, + "loss": 0.87147707, + "num_input_tokens_seen": 165439136, + "router_z_loss_mlp": 0.43969727, + "step": 1990, + "time_per_iteration": 2.503817081451416 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047528, + "balance_loss_mlp": 1.0038017, + "epoch": 0.3830319353597538, + "flos": 500483091456.0, + "grad_norm": 0.030831133245435974, + "language_loss": 0.84298265, + "learning_rate": 0.000707269445677029, + "loss": 0.85345787, + "num_input_tokens_seen": 165514624, + "router_z_loss_mlp": 0.43798828, + "step": 1991, + "time_per_iteration": 2.77250599861145 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047533, + "balance_loss_mlp": 1.00373507, + "epoch": 0.3832243170450173, + "flos": 745467197952.0, + "grad_norm": 0.03142895241328533, + "language_loss": 0.85860848, + "learning_rate": 0.0007069858921149416, + "loss": 0.86908376, + "num_input_tokens_seen": 165594512, + "router_z_loss_mlp": 0.4387207, + "step": 1992, + "time_per_iteration": 3.001058578491211 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047239, + "balance_loss_mlp": 1.00363255, + "epoch": 0.3834166987302809, + "flos": 579346193664.0, + "grad_norm": 0.027707623231004064, + "language_loss": 0.86360574, + "learning_rate": 0.0007067022581938043, + "loss": 0.87407815, + "num_input_tokens_seen": 165673968, + "router_z_loss_mlp": 0.43676758, + "step": 1993, + "time_per_iteration": 2.896017551422119 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049302, + "balance_loss_mlp": 1.00579047, + "epoch": 0.3836090804155444, + "flos": 537609376512.0, + "grad_norm": 0.038344647976828676, + "language_loss": 0.83944476, + "learning_rate": 0.0007064185440237334, + "loss": 0.8499378, + "num_input_tokens_seen": 165747664, + "router_z_loss_mlp": 0.43579102, + "step": 1994, + "time_per_iteration": 2.8133461475372314 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051736, + "balance_loss_mlp": 1.00820076, + "epoch": 0.383801462100808, + "flos": 603052075008.0, + "grad_norm": 0.0304270283066245, + "language_loss": 0.85033917, + "learning_rate": 0.0007061347497148764, + "loss": 0.86085653, + "num_input_tokens_seen": 165824624, + "router_z_loss_mlp": 0.43603516, + "step": 1995, + "time_per_iteration": 2.829977035522461 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050646, + "balance_loss_mlp": 1.00694358, + "epoch": 0.38399384378607154, + "flos": 573799560192.0, + "grad_norm": 0.034646706108572276, + "language_loss": 0.86866224, + "learning_rate": 0.0007058508753774122, + "loss": 0.87916863, + "num_input_tokens_seen": 165896304, + "router_z_loss_mlp": 0.43774414, + "step": 1996, + "time_per_iteration": 2.684966564178467 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049901, + "balance_loss_mlp": 1.00629473, + "epoch": 0.38418622547133513, + "flos": 537780463104.0, + "grad_norm": 0.03333459391135046, + "language_loss": 0.87270373, + "learning_rate": 0.0007055669211215505, + "loss": 0.88320273, + "num_input_tokens_seen": 165961312, + "router_z_loss_mlp": 0.43676758, + "step": 1997, + "time_per_iteration": 2.623508930206299 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054497, + "balance_loss_mlp": 1.01079535, + "epoch": 0.3843786071565987, + "flos": 574014388224.0, + "grad_norm": 0.04127067736406929, + "language_loss": 0.78599155, + "learning_rate": 0.0007052828870575322, + "loss": 0.79653656, + "num_input_tokens_seen": 166028064, + "router_z_loss_mlp": 0.43774414, + "step": 1998, + "time_per_iteration": 2.644423723220825 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051343, + "balance_loss_mlp": 1.00761676, + "epoch": 0.38457098884186225, + "flos": 730080294144.0, + "grad_norm": 0.03146347648703673, + "language_loss": 0.87266672, + "learning_rate": 0.0007049987732956291, + "loss": 0.88318008, + "num_input_tokens_seen": 166110272, + "router_z_loss_mlp": 0.43798828, + "step": 1999, + "time_per_iteration": 2.963409185409546 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048157, + "balance_loss_mlp": 1.00447905, + "epoch": 0.38476337052712584, + "flos": 584621618688.0, + "grad_norm": 0.024706606255084192, + "language_loss": 0.83278054, + "learning_rate": 0.0007047145799461439, + "loss": 0.84326208, + "num_input_tokens_seen": 166193088, + "router_z_loss_mlp": 0.4375, + "step": 2000, + "time_per_iteration": 2.86661434173584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048325, + "balance_loss_mlp": 1.00459874, + "epoch": 0.38495575221238937, + "flos": 554159848704.0, + "grad_norm": 0.03147773281119346, + "language_loss": 0.83074015, + "learning_rate": 0.00070443030711941, + "loss": 0.84122348, + "num_input_tokens_seen": 166271776, + "router_z_loss_mlp": 0.43798828, + "step": 2001, + "time_per_iteration": 2.778719425201416 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045384, + "balance_loss_mlp": 1.00175321, + "epoch": 0.38514813389765296, + "flos": 655678024704.0, + "grad_norm": 0.03168685191580143, + "language_loss": 0.82975376, + "learning_rate": 0.0007041459549257924, + "loss": 0.84020758, + "num_input_tokens_seen": 166350000, + "router_z_loss_mlp": 0.43701172, + "step": 2002, + "time_per_iteration": 2.8597054481506348 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046243, + "balance_loss_mlp": 1.00261223, + "epoch": 0.3853405155829165, + "flos": 869647250688.0, + "grad_norm": 0.03552713767777679, + "language_loss": 0.78954732, + "learning_rate": 0.0007038615234756859, + "loss": 0.80000973, + "num_input_tokens_seen": 166434336, + "router_z_loss_mlp": 0.43701172, + "step": 2003, + "time_per_iteration": 3.167647123336792 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050486, + "balance_loss_mlp": 1.00697505, + "epoch": 0.3855328972681801, + "flos": 547469088768.0, + "grad_norm": 0.03596547507231522, + "language_loss": 0.84374714, + "learning_rate": 0.000703577012879517, + "loss": 0.85425198, + "num_input_tokens_seen": 166503952, + "router_z_loss_mlp": 0.43579102, + "step": 2004, + "time_per_iteration": 2.644718885421753 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047488, + "balance_loss_mlp": 1.00397706, + "epoch": 0.3857252789534436, + "flos": 535099492608.0, + "grad_norm": 0.03525407945169758, + "language_loss": 0.89214581, + "learning_rate": 0.0007032924232477423, + "loss": 0.90262067, + "num_input_tokens_seen": 166575168, + "router_z_loss_mlp": 0.43579102, + "step": 2005, + "time_per_iteration": 2.6340301036834717 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053847, + "balance_loss_mlp": 1.01023984, + "epoch": 0.3859176606387072, + "flos": 492767768832.0, + "grad_norm": 0.0325086763316175, + "language_loss": 0.80829036, + "learning_rate": 0.0007030077546908493, + "loss": 0.81882888, + "num_input_tokens_seen": 166647552, + "router_z_loss_mlp": 0.43676758, + "step": 2006, + "time_per_iteration": 2.6427574157714844 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051659, + "balance_loss_mlp": 1.00969696, + "epoch": 0.3861100423239708, + "flos": 1490158675968.0, + "grad_norm": 0.006099468603868092, + "language_loss": 0.83064663, + "learning_rate": 0.0007027230073193561, + "loss": 0.84116316, + "num_input_tokens_seen": 166875088, + "router_z_loss_mlp": 0.41992188, + "step": 2007, + "time_per_iteration": 4.792185068130493 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047511, + "balance_loss_mlp": 1.00383234, + "epoch": 0.3863024240092343, + "flos": 474693091584.0, + "grad_norm": 0.0379943815396184, + "language_loss": 0.79703128, + "learning_rate": 0.0007024381812438117, + "loss": 0.80750644, + "num_input_tokens_seen": 166939344, + "router_z_loss_mlp": 0.4375, + "step": 2008, + "time_per_iteration": 2.6320388317108154 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01058901, + "balance_loss_mlp": 1.0153178, + "epoch": 0.3864948056944979, + "flos": 717979961088.0, + "grad_norm": 0.04179543058298576, + "language_loss": 0.84345418, + "learning_rate": 0.0007021532765747951, + "loss": 0.85404319, + "num_input_tokens_seen": 167014992, + "router_z_loss_mlp": 0.43652344, + "step": 2009, + "time_per_iteration": 3.0408942699432373 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01057193, + "balance_loss_mlp": 1.01370513, + "epoch": 0.38668718737976143, + "flos": 728955609600.0, + "grad_norm": 0.033678441310908816, + "language_loss": 0.80296206, + "learning_rate": 0.0007018682934229162, + "loss": 0.81353402, + "num_input_tokens_seen": 167092096, + "router_z_loss_mlp": 0.43554688, + "step": 2010, + "time_per_iteration": 2.9119958877563477 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053718, + "balance_loss_mlp": 1.01025474, + "epoch": 0.386879569065025, + "flos": 526489864704.0, + "grad_norm": 0.031759350944825356, + "language_loss": 0.83489478, + "learning_rate": 0.0007015832318988152, + "loss": 0.84543192, + "num_input_tokens_seen": 167162144, + "router_z_loss_mlp": 0.43530273, + "step": 2011, + "time_per_iteration": 2.625828981399536 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048202, + "balance_loss_mlp": 1.00643158, + "epoch": 0.38707195075028855, + "flos": 1530727067136.0, + "grad_norm": 0.008010138125144308, + "language_loss": 0.73890078, + "learning_rate": 0.000701298092113163, + "loss": 0.74938273, + "num_input_tokens_seen": 167391536, + "router_z_loss_mlp": 0.41796875, + "step": 2012, + "time_per_iteration": 4.969848155975342 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049038, + "balance_loss_mlp": 1.00555038, + "epoch": 0.38726433243555214, + "flos": 558386411520.0, + "grad_norm": 0.029387859415775444, + "language_loss": 0.84841448, + "learning_rate": 0.0007010128741766604, + "loss": 0.85890484, + "num_input_tokens_seen": 167466000, + "router_z_loss_mlp": 0.43554688, + "step": 2013, + "time_per_iteration": 2.808583974838257 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045643, + "balance_loss_mlp": 1.00205982, + "epoch": 0.38745671412081567, + "flos": 554756700672.0, + "grad_norm": 0.037665143906504196, + "language_loss": 0.84820414, + "learning_rate": 0.0007007275782000391, + "loss": 0.85866058, + "num_input_tokens_seen": 167536144, + "router_z_loss_mlp": 0.43652344, + "step": 2014, + "time_per_iteration": 2.6201975345611572 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051317, + "balance_loss_mlp": 1.00775766, + "epoch": 0.38764909580607926, + "flos": 459345071616.0, + "grad_norm": 0.03590133597746071, + "language_loss": 0.85486585, + "learning_rate": 0.0007004422042940605, + "loss": 0.86537898, + "num_input_tokens_seen": 167600064, + "router_z_loss_mlp": 0.4362793, + "step": 2015, + "time_per_iteration": 2.5167059898376465 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051464, + "balance_loss_mlp": 1.00792837, + "epoch": 0.38784147749134285, + "flos": 523259674368.0, + "grad_norm": 0.036833384765870066, + "language_loss": 0.90223992, + "learning_rate": 0.0007001567525695169, + "loss": 0.9127546, + "num_input_tokens_seen": 167666576, + "router_z_loss_mlp": 0.43603516, + "step": 2016, + "time_per_iteration": 2.663416624069214 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042557, + "balance_loss_mlp": 0.99923599, + "epoch": 0.3880338591766064, + "flos": 667401191424.0, + "grad_norm": 0.027528515382714943, + "language_loss": 0.84397906, + "learning_rate": 0.0006998712231372303, + "loss": 0.85440457, + "num_input_tokens_seen": 167753296, + "router_z_loss_mlp": 0.43383789, + "step": 2017, + "time_per_iteration": 2.982222080230713 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047147, + "balance_loss_mlp": 1.00389743, + "epoch": 0.38822624086186996, + "flos": 595176359424.0, + "grad_norm": 0.028816590459513517, + "language_loss": 0.86776507, + "learning_rate": 0.0006995856161080532, + "loss": 0.87823659, + "num_input_tokens_seen": 167834080, + "router_z_loss_mlp": 0.43310547, + "step": 2018, + "time_per_iteration": 2.8449933528900146 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046566, + "balance_loss_mlp": 1.00300694, + "epoch": 0.3884186225471335, + "flos": 613682638080.0, + "grad_norm": 0.032032500930829794, + "language_loss": 0.82425624, + "learning_rate": 0.0006992999315928679, + "loss": 0.83472192, + "num_input_tokens_seen": 167912368, + "router_z_loss_mlp": 0.4362793, + "step": 2019, + "time_per_iteration": 2.803743362426758 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104734, + "balance_loss_mlp": 1.00401926, + "epoch": 0.3886110042323971, + "flos": 608244874752.0, + "grad_norm": 0.027721707471257077, + "language_loss": 0.86241317, + "learning_rate": 0.0006990141697025871, + "loss": 0.87288654, + "num_input_tokens_seen": 167991968, + "router_z_loss_mlp": 0.43383789, + "step": 2020, + "time_per_iteration": 2.7804739475250244 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046585, + "balance_loss_mlp": 1.00481415, + "epoch": 0.3888033859176606, + "flos": 1531196573952.0, + "grad_norm": 0.004554603876592686, + "language_loss": 0.76359642, + "learning_rate": 0.0006987283305481533, + "loss": 0.77406228, + "num_input_tokens_seen": 168212128, + "router_z_loss_mlp": 0.41796875, + "step": 2021, + "time_per_iteration": 4.76949667930603 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104808, + "balance_loss_mlp": 1.00478315, + "epoch": 0.3889957676029242, + "flos": 693672370176.0, + "grad_norm": 0.038162906437672096, + "language_loss": 0.8292582, + "learning_rate": 0.0006984424142405392, + "loss": 0.83973902, + "num_input_tokens_seen": 168287440, + "router_z_loss_mlp": 0.43359375, + "step": 2022, + "time_per_iteration": 2.7983930110931396 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049484, + "balance_loss_mlp": 1.00599611, + "epoch": 0.3891881492881878, + "flos": 516195638784.0, + "grad_norm": 0.03974199995652067, + "language_loss": 0.82402384, + "learning_rate": 0.0006981564208907474, + "loss": 0.83451867, + "num_input_tokens_seen": 168354704, + "router_z_loss_mlp": 0.43554688, + "step": 2023, + "time_per_iteration": 2.613600730895996 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050586, + "balance_loss_mlp": 1.00707471, + "epoch": 0.3893805309734513, + "flos": 630176729856.0, + "grad_norm": 0.03303002735023947, + "language_loss": 0.90586042, + "learning_rate": 0.0006978703506098102, + "loss": 0.91636622, + "num_input_tokens_seen": 168424272, + "router_z_loss_mlp": 0.43579102, + "step": 2024, + "time_per_iteration": 2.7258403301239014 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050901, + "balance_loss_mlp": 1.00748503, + "epoch": 0.3895729126587149, + "flos": 545207080704.0, + "grad_norm": 0.0334033578711094, + "language_loss": 0.88520938, + "learning_rate": 0.00069758420350879, + "loss": 0.89571834, + "num_input_tokens_seen": 168488912, + "router_z_loss_mlp": 0.43481445, + "step": 2025, + "time_per_iteration": 2.6406970024108887 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047711, + "balance_loss_mlp": 1.00427127, + "epoch": 0.38976529434397844, + "flos": 619407161088.0, + "grad_norm": 0.03600656764113765, + "language_loss": 0.86979783, + "learning_rate": 0.000697297979698779, + "loss": 0.88027489, + "num_input_tokens_seen": 168563248, + "router_z_loss_mlp": 0.43505859, + "step": 2026, + "time_per_iteration": 2.729025363922119 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046507, + "balance_loss_mlp": 1.00297225, + "epoch": 0.38995767602924203, + "flos": 836346062592.0, + "grad_norm": 0.030634369701250594, + "language_loss": 0.84155977, + "learning_rate": 0.0006970116792908992, + "loss": 0.85202479, + "num_input_tokens_seen": 168648272, + "router_z_loss_mlp": 0.43603516, + "step": 2027, + "time_per_iteration": 3.0780837535858154 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054265, + "balance_loss_mlp": 1.01070547, + "epoch": 0.39015005771450556, + "flos": 542647619328.0, + "grad_norm": 0.03376343400122794, + "language_loss": 0.81809974, + "learning_rate": 0.000696725302396302, + "loss": 0.82864237, + "num_input_tokens_seen": 168721760, + "router_z_loss_mlp": 0.4362793, + "step": 2028, + "time_per_iteration": 2.6632442474365234 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046425, + "balance_loss_mlp": 1.00277102, + "epoch": 0.39034243939976915, + "flos": 1009142275584.0, + "grad_norm": 0.030316104633677343, + "language_loss": 0.86213875, + "learning_rate": 0.0006964388491261692, + "loss": 0.872603, + "num_input_tokens_seen": 168803664, + "router_z_loss_mlp": 0.43725586, + "step": 2029, + "time_per_iteration": 3.2410776615142822 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052185, + "balance_loss_mlp": 1.00848317, + "epoch": 0.3905348210850327, + "flos": 680241272832.0, + "grad_norm": 0.03528753395725821, + "language_loss": 0.88294208, + "learning_rate": 0.0006961523195917114, + "loss": 0.89346391, + "num_input_tokens_seen": 168879184, + "router_z_loss_mlp": 0.43774414, + "step": 2030, + "time_per_iteration": 2.8754475116729736 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104739, + "balance_loss_mlp": 1.00375915, + "epoch": 0.39072720277029627, + "flos": 549989666304.0, + "grad_norm": 0.032806843563698423, + "language_loss": 0.78588331, + "learning_rate": 0.0006958657139041696, + "loss": 0.79635721, + "num_input_tokens_seen": 168957808, + "router_z_loss_mlp": 0.43701172, + "step": 2031, + "time_per_iteration": 2.7329561710357666 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047604, + "balance_loss_mlp": 1.00554657, + "epoch": 0.39091958445555985, + "flos": 1551054025728.0, + "grad_norm": 0.008088132411436895, + "language_loss": 0.76712966, + "learning_rate": 0.0006955790321748136, + "loss": 0.77760577, + "num_input_tokens_seen": 169194416, + "router_z_loss_mlp": 0.42089844, + "step": 2032, + "time_per_iteration": 4.958296298980713 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048975, + "balance_loss_mlp": 1.00529635, + "epoch": 0.3911119661408234, + "flos": 505052794368.0, + "grad_norm": 0.03533188094946227, + "language_loss": 0.78901434, + "learning_rate": 0.0006952922745149434, + "loss": 0.7995041, + "num_input_tokens_seen": 169263552, + "router_z_loss_mlp": 0.4375, + "step": 2033, + "time_per_iteration": 2.6192519664764404 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050645, + "balance_loss_mlp": 1.00684798, + "epoch": 0.391304347826087, + "flos": 558330031104.0, + "grad_norm": 0.032114717040763616, + "language_loss": 0.88009661, + "learning_rate": 0.000695005441035888, + "loss": 0.89060307, + "num_input_tokens_seen": 169333696, + "router_z_loss_mlp": 0.4387207, + "step": 2034, + "time_per_iteration": 2.6519060134887695 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045589, + "balance_loss_mlp": 1.00334167, + "epoch": 0.3914967295113505, + "flos": 1502944322304.0, + "grad_norm": 0.004600085335304226, + "language_loss": 0.73723435, + "learning_rate": 0.0006947185318490064, + "loss": 0.7476902, + "num_input_tokens_seen": 169556416, + "router_z_loss_mlp": 0.42285156, + "step": 2035, + "time_per_iteration": 4.875830888748169 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049757, + "balance_loss_mlp": 1.00581694, + "epoch": 0.3916891111966141, + "flos": 708330219264.0, + "grad_norm": 0.02756997110289995, + "language_loss": 0.81809461, + "learning_rate": 0.0006944315470656863, + "loss": 0.82859218, + "num_input_tokens_seen": 169643312, + "router_z_loss_mlp": 0.44018555, + "step": 2036, + "time_per_iteration": 2.9486818313598633 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104941, + "balance_loss_mlp": 1.00537384, + "epoch": 0.3918814928818776, + "flos": 557409480960.0, + "grad_norm": 0.03430912315299504, + "language_loss": 0.91194409, + "learning_rate": 0.000694144486797345, + "loss": 0.92243814, + "num_input_tokens_seen": 169712560, + "router_z_loss_mlp": 0.44116211, + "step": 2037, + "time_per_iteration": 2.661637783050537 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053711, + "balance_loss_mlp": 1.01155853, + "epoch": 0.3920738745671412, + "flos": 1541688131328.0, + "grad_norm": 0.009695617032389551, + "language_loss": 0.79520434, + "learning_rate": 0.0006938573511554296, + "loss": 0.80574143, + "num_input_tokens_seen": 169914912, + "router_z_loss_mlp": 0.421875, + "step": 2038, + "time_per_iteration": 4.676162004470825 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050597, + "balance_loss_mlp": 1.00672829, + "epoch": 0.39226625625240474, + "flos": 499805559552.0, + "grad_norm": 0.03059706599431713, + "language_loss": 0.9011066, + "learning_rate": 0.0006935701402514156, + "loss": 0.91161263, + "num_input_tokens_seen": 169978848, + "router_z_loss_mlp": 0.43945312, + "step": 2039, + "time_per_iteration": 2.5921828746795654 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01040813, + "balance_loss_mlp": 0.99837494, + "epoch": 0.39245863793766833, + "flos": 1350453680640.0, + "grad_norm": 0.0024785612799689367, + "language_loss": 0.73034894, + "learning_rate": 0.0006932828541968083, + "loss": 0.74075705, + "num_input_tokens_seen": 170211488, + "router_z_loss_mlp": 0.42480469, + "step": 2040, + "time_per_iteration": 4.920953273773193 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045626, + "balance_loss_mlp": 1.00180471, + "epoch": 0.3926510196229319, + "flos": 1348115873280.0, + "grad_norm": 0.032003611488688986, + "language_loss": 0.84899294, + "learning_rate": 0.0006929954931031422, + "loss": 0.85944915, + "num_input_tokens_seen": 170298528, + "router_z_loss_mlp": 0.43896484, + "step": 2041, + "time_per_iteration": 3.7454288005828857 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045245, + "balance_loss_mlp": 1.00144792, + "epoch": 0.39284340130819545, + "flos": 500604600576.0, + "grad_norm": 0.027328608847006428, + "language_loss": 0.89267606, + "learning_rate": 0.0006927080570819805, + "loss": 0.9031285, + "num_input_tokens_seen": 170365680, + "router_z_loss_mlp": 0.4387207, + "step": 2042, + "time_per_iteration": 2.6191000938415527 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049281, + "balance_loss_mlp": 1.00565004, + "epoch": 0.39303578299345904, + "flos": 521342751744.0, + "grad_norm": 0.03887631720492337, + "language_loss": 0.81479704, + "learning_rate": 0.0006924205462449161, + "loss": 0.82528985, + "num_input_tokens_seen": 170432224, + "router_z_loss_mlp": 0.43701172, + "step": 2043, + "time_per_iteration": 2.6156415939331055 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048281, + "balance_loss_mlp": 1.00467432, + "epoch": 0.39322816467872257, + "flos": 909539076864.0, + "grad_norm": 0.03230930456366714, + "language_loss": 0.82451463, + "learning_rate": 0.0006921329607035702, + "loss": 0.83499742, + "num_input_tokens_seen": 170517920, + "router_z_loss_mlp": 0.43676758, + "step": 2044, + "time_per_iteration": 3.248239040374756 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050512, + "balance_loss_mlp": 1.0066911, + "epoch": 0.39342054636398616, + "flos": 518642339328.0, + "grad_norm": 0.028076885263619615, + "language_loss": 0.88591248, + "learning_rate": 0.0006918453005695938, + "loss": 0.89641762, + "num_input_tokens_seen": 170589072, + "router_z_loss_mlp": 0.43896484, + "step": 2045, + "time_per_iteration": 2.6417062282562256 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048241, + "balance_loss_mlp": 1.00430059, + "epoch": 0.3936129280492497, + "flos": 549012735744.0, + "grad_norm": 0.027900695924135757, + "language_loss": 0.84910023, + "learning_rate": 0.0006915575659546662, + "loss": 0.85958266, + "num_input_tokens_seen": 170657856, + "router_z_loss_mlp": 0.44018555, + "step": 2046, + "time_per_iteration": 2.6784913539886475 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053966, + "balance_loss_mlp": 1.0100733, + "epoch": 0.3938053097345133, + "flos": 527141151744.0, + "grad_norm": 0.03448231278490725, + "language_loss": 0.81310439, + "learning_rate": 0.0006912697569704959, + "loss": 0.82364404, + "num_input_tokens_seen": 170723696, + "router_z_loss_mlp": 0.43969727, + "step": 2047, + "time_per_iteration": 2.6214752197265625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050494, + "balance_loss_mlp": 1.00679207, + "epoch": 0.39399769141977686, + "flos": 472589531136.0, + "grad_norm": 0.03168334850546869, + "language_loss": 0.87124646, + "learning_rate": 0.0006909818737288205, + "loss": 0.88175148, + "num_input_tokens_seen": 170789536, + "router_z_loss_mlp": 0.43774414, + "step": 2048, + "time_per_iteration": 2.6057982444763184 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051405, + "balance_loss_mlp": 1.00775015, + "epoch": 0.3941900731050404, + "flos": 502727602944.0, + "grad_norm": 0.03501112209435681, + "language_loss": 0.81578481, + "learning_rate": 0.000690693916341406, + "loss": 0.82629883, + "num_input_tokens_seen": 170859232, + "router_z_loss_mlp": 0.43725586, + "step": 2049, + "time_per_iteration": 2.6459243297576904 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052667, + "balance_loss_mlp": 1.00910771, + "epoch": 0.394382454790304, + "flos": 582007722240.0, + "grad_norm": 0.03071224069667877, + "language_loss": 0.83009964, + "learning_rate": 0.0006904058849200475, + "loss": 0.8406263, + "num_input_tokens_seen": 170931568, + "router_z_loss_mlp": 0.4362793, + "step": 2050, + "time_per_iteration": 2.766828775405884 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046395, + "balance_loss_mlp": 1.00243104, + "epoch": 0.3945748364755675, + "flos": 514845432576.0, + "grad_norm": 0.030877215482718844, + "language_loss": 0.85563171, + "learning_rate": 0.0006901177795765683, + "loss": 0.86609566, + "num_input_tokens_seen": 170999856, + "router_z_loss_mlp": 0.44042969, + "step": 2051, + "time_per_iteration": 2.659912109375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051919, + "balance_loss_mlp": 1.00807357, + "epoch": 0.3947672181608311, + "flos": 595058740992.0, + "grad_norm": 0.03343854917241654, + "language_loss": 0.821091, + "learning_rate": 0.0006898296004228213, + "loss": 0.8316102, + "num_input_tokens_seen": 171072320, + "router_z_loss_mlp": 0.43920898, + "step": 2052, + "time_per_iteration": 2.7115862369537354 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046516, + "balance_loss_mlp": 1.00455475, + "epoch": 0.39495959984609463, + "flos": 1551052080384.0, + "grad_norm": 0.003971648916451202, + "language_loss": 0.7812674, + "learning_rate": 0.0006895413475706873, + "loss": 0.79173255, + "num_input_tokens_seen": 171304128, + "router_z_loss_mlp": 0.41992188, + "step": 2053, + "time_per_iteration": 4.894740343093872 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051907, + "balance_loss_mlp": 1.00818145, + "epoch": 0.3951519815313582, + "flos": 497524109568.0, + "grad_norm": 0.03573797234588687, + "language_loss": 0.80267316, + "learning_rate": 0.0006892530211320763, + "loss": 0.81319225, + "num_input_tokens_seen": 171377392, + "router_z_loss_mlp": 0.43798828, + "step": 2054, + "time_per_iteration": 2.767686605453491 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104549, + "balance_loss_mlp": 1.00193131, + "epoch": 0.39534436321662175, + "flos": 532223136000.0, + "grad_norm": 0.03591265467553322, + "language_loss": 0.84680569, + "learning_rate": 0.000688964621218926, + "loss": 0.85726058, + "num_input_tokens_seen": 171447424, + "router_z_loss_mlp": 0.4362793, + "step": 2055, + "time_per_iteration": 2.6054694652557373 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048043, + "balance_loss_mlp": 1.004722, + "epoch": 0.39553674490188534, + "flos": 703725523200.0, + "grad_norm": 0.03424008758122415, + "language_loss": 0.8074584, + "learning_rate": 0.0006886761479432037, + "loss": 0.8179388, + "num_input_tokens_seen": 171519920, + "router_z_loss_mlp": 0.43383789, + "step": 2056, + "time_per_iteration": 2.8390727043151855 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047733, + "balance_loss_mlp": 1.0042696, + "epoch": 0.3957291265871489, + "flos": 410656979712.0, + "grad_norm": 0.03388460034269331, + "language_loss": 0.85256028, + "learning_rate": 0.0006883876014169045, + "loss": 0.86303759, + "num_input_tokens_seen": 171583856, + "router_z_loss_mlp": 0.43530273, + "step": 2057, + "time_per_iteration": 2.554170846939087 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051678, + "balance_loss_mlp": 1.00814319, + "epoch": 0.39592150827241246, + "flos": 619639485696.0, + "grad_norm": 0.03722447028160607, + "language_loss": 0.90694773, + "learning_rate": 0.000688098981752052, + "loss": 0.91746461, + "num_input_tokens_seen": 171656064, + "router_z_loss_mlp": 0.43603516, + "step": 2058, + "time_per_iteration": 2.733053684234619 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049177, + "balance_loss_mlp": 1.00568974, + "epoch": 0.39611388995767605, + "flos": 822721524480.0, + "grad_norm": 0.04279286873756595, + "language_loss": 0.80609208, + "learning_rate": 0.0006878102890606982, + "loss": 0.81658387, + "num_input_tokens_seen": 171738800, + "router_z_loss_mlp": 0.43554688, + "step": 2059, + "time_per_iteration": 3.084789752960205 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047553, + "balance_loss_mlp": 1.00416124, + "epoch": 0.3963062716429396, + "flos": 493214921472.0, + "grad_norm": 0.03961147378322192, + "language_loss": 0.81771576, + "learning_rate": 0.0006875215234549239, + "loss": 0.82819128, + "num_input_tokens_seen": 171803664, + "router_z_loss_mlp": 0.43457031, + "step": 2060, + "time_per_iteration": 2.5823421478271484 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046932, + "balance_loss_mlp": 1.00351596, + "epoch": 0.39649865332820317, + "flos": 585834764544.0, + "grad_norm": 0.03854635921535854, + "language_loss": 0.8654902, + "learning_rate": 0.0006872326850468376, + "loss": 0.87595946, + "num_input_tokens_seen": 171871968, + "router_z_loss_mlp": 0.43481445, + "step": 2061, + "time_per_iteration": 2.705690860748291 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048788, + "balance_loss_mlp": 1.0052762, + "epoch": 0.3966910350134667, + "flos": 459512267520.0, + "grad_norm": 0.037411346592439484, + "language_loss": 0.79843795, + "learning_rate": 0.0006869437739485762, + "loss": 0.80892581, + "num_input_tokens_seen": 171942368, + "router_z_loss_mlp": 0.43579102, + "step": 2062, + "time_per_iteration": 2.5978832244873047 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050469, + "balance_loss_mlp": 1.00710082, + "epoch": 0.3968834166987303, + "flos": 509615694336.0, + "grad_norm": 0.03224635872548594, + "language_loss": 0.93265009, + "learning_rate": 0.0006866547902723053, + "loss": 0.94315481, + "num_input_tokens_seen": 172012336, + "router_z_loss_mlp": 0.43432617, + "step": 2063, + "time_per_iteration": 2.7325148582458496 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048279, + "balance_loss_mlp": 1.00502992, + "epoch": 0.3970757983839938, + "flos": 573743179776.0, + "grad_norm": 0.0353853142482034, + "language_loss": 0.80804694, + "learning_rate": 0.000686365734130218, + "loss": 0.81852973, + "num_input_tokens_seen": 172084640, + "router_z_loss_mlp": 0.43310547, + "step": 2064, + "time_per_iteration": 2.719521999359131 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046826, + "balance_loss_mlp": 1.00350547, + "epoch": 0.3972681800692574, + "flos": 482586303744.0, + "grad_norm": 0.03284702600830507, + "language_loss": 0.8411094, + "learning_rate": 0.000686076605634536, + "loss": 0.8515777, + "num_input_tokens_seen": 172152992, + "router_z_loss_mlp": 0.43383789, + "step": 2065, + "time_per_iteration": 2.6333730220794678 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051782, + "balance_loss_mlp": 1.00822306, + "epoch": 0.397460561754521, + "flos": 488905733376.0, + "grad_norm": 0.0324228687482344, + "language_loss": 0.84781277, + "learning_rate": 0.0006857874048975088, + "loss": 0.85833061, + "num_input_tokens_seen": 172219312, + "router_z_loss_mlp": 0.4362793, + "step": 2066, + "time_per_iteration": 2.5906848907470703 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049991, + "balance_loss_mlp": 1.00659895, + "epoch": 0.3976529434397845, + "flos": 422896318464.0, + "grad_norm": 0.03171433053589848, + "language_loss": 0.8744958, + "learning_rate": 0.0006854981320314142, + "loss": 0.8849957, + "num_input_tokens_seen": 172282112, + "router_z_loss_mlp": 0.43457031, + "step": 2067, + "time_per_iteration": 2.4699788093566895 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045752, + "balance_loss_mlp": 1.00240779, + "epoch": 0.3978453251250481, + "flos": 546622415616.0, + "grad_norm": 0.03563960500295594, + "language_loss": 0.8728829, + "learning_rate": 0.0006852087871485579, + "loss": 0.88334048, + "num_input_tokens_seen": 172347872, + "router_z_loss_mlp": 0.43408203, + "step": 2068, + "time_per_iteration": 2.6414859294891357 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044908, + "balance_loss_mlp": 1.00163472, + "epoch": 0.39803770681031164, + "flos": 652002627072.0, + "grad_norm": 0.03732729296318665, + "language_loss": 0.82978511, + "learning_rate": 0.0006849193703612735, + "loss": 0.84023428, + "num_input_tokens_seen": 172418560, + "router_z_loss_mlp": 0.43334961, + "step": 2069, + "time_per_iteration": 2.791269063949585 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104332, + "balance_loss_mlp": 0.999928, + "epoch": 0.39823008849557523, + "flos": 741427272960.0, + "grad_norm": 0.030595728613543666, + "language_loss": 0.78243995, + "learning_rate": 0.0006846298817819225, + "loss": 0.79287314, + "num_input_tokens_seen": 172497984, + "router_z_loss_mlp": 0.43457031, + "step": 2070, + "time_per_iteration": 2.9561986923217773 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045511, + "balance_loss_mlp": 1.00235701, + "epoch": 0.39842247018083876, + "flos": 385889597184.0, + "grad_norm": 0.036398106493658954, + "language_loss": 0.81909132, + "learning_rate": 0.0006843403215228945, + "loss": 0.82954645, + "num_input_tokens_seen": 172560112, + "router_z_loss_mlp": 0.43212891, + "step": 2071, + "time_per_iteration": 2.4993679523468018 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045222, + "balance_loss_mlp": 1.00218797, + "epoch": 0.39861485186610235, + "flos": 534763155456.0, + "grad_norm": 0.028807086351499752, + "language_loss": 0.8150484, + "learning_rate": 0.0006840506896966065, + "loss": 0.82550067, + "num_input_tokens_seen": 172636192, + "router_z_loss_mlp": 0.4309082, + "step": 2072, + "time_per_iteration": 2.7684881687164307 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049113, + "balance_loss_mlp": 1.00595963, + "epoch": 0.39880723355136594, + "flos": 644413671168.0, + "grad_norm": 0.03625588542647267, + "language_loss": 0.83127856, + "learning_rate": 0.0006837609864155038, + "loss": 0.8417697, + "num_input_tokens_seen": 172715264, + "router_z_loss_mlp": 0.43212891, + "step": 2073, + "time_per_iteration": 2.8514270782470703 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051094, + "balance_loss_mlp": 1.00782108, + "epoch": 0.39899961523662947, + "flos": 516892612608.0, + "grad_norm": 0.031931162968107815, + "language_loss": 0.83936673, + "learning_rate": 0.0006834712117920592, + "loss": 0.84987766, + "num_input_tokens_seen": 172783456, + "router_z_loss_mlp": 0.43334961, + "step": 2074, + "time_per_iteration": 2.6099319458007812 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048921, + "balance_loss_mlp": 1.00583923, + "epoch": 0.39919199692189306, + "flos": 465338857728.0, + "grad_norm": 0.040350277752625376, + "language_loss": 0.86345923, + "learning_rate": 0.0006831813659387729, + "loss": 0.87394845, + "num_input_tokens_seen": 172848928, + "router_z_loss_mlp": 0.43139648, + "step": 2075, + "time_per_iteration": 2.5189003944396973 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047413, + "balance_loss_mlp": 1.00421119, + "epoch": 0.3993843786071566, + "flos": 532679036928.0, + "grad_norm": 0.031639049857806745, + "language_loss": 0.84865057, + "learning_rate": 0.0006828914489681733, + "loss": 0.85912478, + "num_input_tokens_seen": 172921152, + "router_z_loss_mlp": 0.43261719, + "step": 2076, + "time_per_iteration": 2.7052366733551025 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045652, + "balance_loss_mlp": 1.00252223, + "epoch": 0.3995767602924202, + "flos": 505024604160.0, + "grad_norm": 0.02906284980485529, + "language_loss": 0.85967886, + "learning_rate": 0.0006826014609928162, + "loss": 0.87013543, + "num_input_tokens_seen": 172998864, + "router_z_loss_mlp": 0.43188477, + "step": 2077, + "time_per_iteration": 2.7127158641815186 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046635, + "balance_loss_mlp": 1.00514984, + "epoch": 0.3997691419776837, + "flos": 1457473781760.0, + "grad_norm": 0.010869866041652092, + "language_loss": 0.83199388, + "learning_rate": 0.0006823114021252846, + "loss": 0.84246022, + "num_input_tokens_seen": 173219216, + "router_z_loss_mlp": 0.41503906, + "step": 2078, + "time_per_iteration": 4.8602213859558105 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048968, + "balance_loss_mlp": 1.00586236, + "epoch": 0.3999615236629473, + "flos": 531756541440.0, + "grad_norm": 0.03484656463436615, + "language_loss": 0.80513203, + "learning_rate": 0.0006820212724781896, + "loss": 0.81562173, + "num_input_tokens_seen": 173292000, + "router_z_loss_mlp": 0.43164062, + "step": 2079, + "time_per_iteration": 2.6769065856933594 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050357, + "balance_loss_mlp": 1.00732243, + "epoch": 0.4001539053482108, + "flos": 696362088960.0, + "grad_norm": 0.03370335981625205, + "language_loss": 0.84624374, + "learning_rate": 0.0006817310721641694, + "loss": 0.85674727, + "num_input_tokens_seen": 173365568, + "router_z_loss_mlp": 0.4309082, + "step": 2080, + "time_per_iteration": 2.8362321853637695 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049278, + "balance_loss_mlp": 1.00619566, + "epoch": 0.4003462870334744, + "flos": 521379690240.0, + "grad_norm": 0.0372462453928972, + "language_loss": 0.84107649, + "learning_rate": 0.00068144080129589, + "loss": 0.85156924, + "num_input_tokens_seen": 173430144, + "router_z_loss_mlp": 0.43139648, + "step": 2081, + "time_per_iteration": 2.673391342163086 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047044, + "balance_loss_mlp": 1.00400949, + "epoch": 0.400538668718738, + "flos": 493503626496.0, + "grad_norm": 0.03624950820375382, + "language_loss": 0.83452618, + "learning_rate": 0.0006811504599860441, + "loss": 0.84499657, + "num_input_tokens_seen": 173494464, + "router_z_loss_mlp": 0.4309082, + "step": 2082, + "time_per_iteration": 2.5872161388397217 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048557, + "balance_loss_mlp": 1.0056175, + "epoch": 0.40073105040400153, + "flos": 491452555776.0, + "grad_norm": 0.03058886918361784, + "language_loss": 0.86615109, + "learning_rate": 0.0006808600483473526, + "loss": 0.87663668, + "num_input_tokens_seen": 173577168, + "router_z_loss_mlp": 0.42993164, + "step": 2083, + "time_per_iteration": 2.9167916774749756 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044713, + "balance_loss_mlp": 1.00165451, + "epoch": 0.4009234320892651, + "flos": 563540327424.0, + "grad_norm": 0.029579631805043773, + "language_loss": 0.86442864, + "learning_rate": 0.0006805695664925629, + "loss": 0.87487578, + "num_input_tokens_seen": 173655632, + "router_z_loss_mlp": 0.43115234, + "step": 2084, + "time_per_iteration": 2.8129522800445557 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046102, + "balance_loss_mlp": 1.00328159, + "epoch": 0.40111581377452865, + "flos": 426853618176.0, + "grad_norm": 0.03869673141168483, + "language_loss": 0.84653956, + "learning_rate": 0.0006802790145344506, + "loss": 0.85700059, + "num_input_tokens_seen": 173719040, + "router_z_loss_mlp": 0.42871094, + "step": 2085, + "time_per_iteration": 2.4816439151763916 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047549, + "balance_loss_mlp": 1.00480056, + "epoch": 0.40130819545979224, + "flos": 613643754240.0, + "grad_norm": 0.033294901740297575, + "language_loss": 0.87748265, + "learning_rate": 0.0006799883925858176, + "loss": 0.88795811, + "num_input_tokens_seen": 173796704, + "router_z_loss_mlp": 0.42797852, + "step": 2086, + "time_per_iteration": 2.883460760116577 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010467, + "balance_loss_mlp": 1.00397515, + "epoch": 0.40150057714505577, + "flos": 524451432960.0, + "grad_norm": 0.03567087941007639, + "language_loss": 0.85852945, + "learning_rate": 0.0006796977007594933, + "loss": 0.86899644, + "num_input_tokens_seen": 173862352, + "router_z_loss_mlp": 0.42773438, + "step": 2087, + "time_per_iteration": 2.6274635791778564 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049167, + "balance_loss_mlp": 1.00641906, + "epoch": 0.40169295883031936, + "flos": 562554648576.0, + "grad_norm": 0.03237434691106299, + "language_loss": 0.86948609, + "learning_rate": 0.0006794069391683345, + "loss": 0.87997776, + "num_input_tokens_seen": 173935408, + "router_z_loss_mlp": 0.42797852, + "step": 2088, + "time_per_iteration": 2.7452995777130127 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044795, + "balance_loss_mlp": 1.00204611, + "epoch": 0.4018853405155829, + "flos": 520020735744.0, + "grad_norm": 0.03787206100605993, + "language_loss": 0.81785774, + "learning_rate": 0.0006791161079252248, + "loss": 0.82830572, + "num_input_tokens_seen": 174007152, + "router_z_loss_mlp": 0.42797852, + "step": 2089, + "time_per_iteration": 2.7205429077148438 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104968, + "balance_loss_mlp": 1.00683641, + "epoch": 0.4020777222008465, + "flos": 527288905728.0, + "grad_norm": 0.03117280194599123, + "language_loss": 0.83103907, + "learning_rate": 0.0006788252071430747, + "loss": 0.84153581, + "num_input_tokens_seen": 174074976, + "router_z_loss_mlp": 0.42895508, + "step": 2090, + "time_per_iteration": 2.659057378768921 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105285, + "balance_loss_mlp": 1.01000619, + "epoch": 0.40227010388611006, + "flos": 526841753088.0, + "grad_norm": 0.038447003118097976, + "language_loss": 0.86962426, + "learning_rate": 0.0006785342369348222, + "loss": 0.88015276, + "num_input_tokens_seen": 174149392, + "router_z_loss_mlp": 0.42895508, + "step": 2091, + "time_per_iteration": 2.7038679122924805 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046593, + "balance_loss_mlp": 1.00374973, + "epoch": 0.4024624855713736, + "flos": 433227482880.0, + "grad_norm": 0.04129881296644863, + "language_loss": 0.80178273, + "learning_rate": 0.0006782431974134316, + "loss": 0.81224871, + "num_input_tokens_seen": 174214656, + "router_z_loss_mlp": 0.42895508, + "step": 2092, + "time_per_iteration": 2.522822618484497 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044742, + "balance_loss_mlp": 1.00185025, + "epoch": 0.4026548672566372, + "flos": 768092136192.0, + "grad_norm": 0.028161411572745265, + "language_loss": 0.89556634, + "learning_rate": 0.0006779520886918949, + "loss": 0.90601373, + "num_input_tokens_seen": 174296064, + "router_z_loss_mlp": 0.42944336, + "step": 2093, + "time_per_iteration": 3.059269905090332 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051245, + "balance_loss_mlp": 1.00847256, + "epoch": 0.4028472489419007, + "flos": 644118163200.0, + "grad_norm": 0.031871945568835235, + "language_loss": 0.81586826, + "learning_rate": 0.0006776609108832301, + "loss": 0.82638067, + "num_input_tokens_seen": 174370896, + "router_z_loss_mlp": 0.42822266, + "step": 2094, + "time_per_iteration": 2.824986457824707 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050062, + "balance_loss_mlp": 1.00707567, + "epoch": 0.4030396306271643, + "flos": 492824149248.0, + "grad_norm": 0.03027887325873737, + "language_loss": 0.85679066, + "learning_rate": 0.0006773696641004828, + "loss": 0.86729133, + "num_input_tokens_seen": 174438448, + "router_z_loss_mlp": 0.43041992, + "step": 2095, + "time_per_iteration": 2.575521230697632 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050315, + "balance_loss_mlp": 1.00742352, + "epoch": 0.40323201231242783, + "flos": 903195347712.0, + "grad_norm": 0.03549236004367387, + "language_loss": 0.78398442, + "learning_rate": 0.0006770783484567247, + "loss": 0.7944876, + "num_input_tokens_seen": 174525952, + "router_z_loss_mlp": 0.42944336, + "step": 2096, + "time_per_iteration": 3.1476502418518066 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047183, + "balance_loss_mlp": 1.00417244, + "epoch": 0.4034243939976914, + "flos": 571730992896.0, + "grad_norm": 0.04456027219971551, + "language_loss": 0.86790794, + "learning_rate": 0.000676786964065055, + "loss": 0.87837982, + "num_input_tokens_seen": 174607200, + "router_z_loss_mlp": 0.43066406, + "step": 2097, + "time_per_iteration": 2.826936960220337 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049393, + "balance_loss_mlp": 1.00635874, + "epoch": 0.403616775682955, + "flos": 508460874240.0, + "grad_norm": 0.03200015951198879, + "language_loss": 0.79479361, + "learning_rate": 0.0006764955110385986, + "loss": 0.80528748, + "num_input_tokens_seen": 174680976, + "router_z_loss_mlp": 0.4309082, + "step": 2098, + "time_per_iteration": 2.732429027557373 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105173, + "balance_loss_mlp": 1.0086236, + "epoch": 0.40380915736821854, + "flos": 520411507968.0, + "grad_norm": 0.033549102084289066, + "language_loss": 0.81161886, + "learning_rate": 0.0006762039894905083, + "loss": 0.82213616, + "num_input_tokens_seen": 174753152, + "router_z_loss_mlp": 0.43164062, + "step": 2099, + "time_per_iteration": 2.638117790222168 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104845, + "balance_loss_mlp": 1.00524902, + "epoch": 0.40400153905348213, + "flos": 442887918336.0, + "grad_norm": 0.03592642868139018, + "language_loss": 0.80970824, + "learning_rate": 0.000675912399533962, + "loss": 0.82019281, + "num_input_tokens_seen": 174817184, + "router_z_loss_mlp": 0.43261719, + "step": 2100, + "time_per_iteration": 2.58172345161438 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049035, + "balance_loss_mlp": 1.00585735, + "epoch": 0.40419392073874566, + "flos": 773705843712.0, + "grad_norm": 0.032245854328407444, + "language_loss": 0.85358262, + "learning_rate": 0.0006756207412821656, + "loss": 0.86407304, + "num_input_tokens_seen": 174898128, + "router_z_loss_mlp": 0.43237305, + "step": 2101, + "time_per_iteration": 3.0158467292785645 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053218, + "balance_loss_mlp": 1.01006424, + "epoch": 0.40438630242400925, + "flos": 767990068992.0, + "grad_norm": 0.03424537155124627, + "language_loss": 0.81043333, + "learning_rate": 0.0006753290148483505, + "loss": 0.82096547, + "num_input_tokens_seen": 174981872, + "router_z_loss_mlp": 0.43212891, + "step": 2102, + "time_per_iteration": 3.0169148445129395 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050406, + "balance_loss_mlp": 1.0073241, + "epoch": 0.4045786841092728, + "flos": 416129736192.0, + "grad_norm": 0.032341452227877814, + "language_loss": 0.79544723, + "learning_rate": 0.0006750372203457752, + "loss": 0.80595136, + "num_input_tokens_seen": 175044976, + "router_z_loss_mlp": 0.43139648, + "step": 2103, + "time_per_iteration": 2.459439277648926 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104576, + "balance_loss_mlp": 1.00274944, + "epoch": 0.40477106579453637, + "flos": 540309788928.0, + "grad_norm": 0.028365330829485943, + "language_loss": 0.87031502, + "learning_rate": 0.0006747453578877242, + "loss": 0.88077265, + "num_input_tokens_seen": 175121104, + "router_z_loss_mlp": 0.43066406, + "step": 2104, + "time_per_iteration": 2.704583168029785 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047141, + "balance_loss_mlp": 1.00413048, + "epoch": 0.4049634474797999, + "flos": 828092213760.0, + "grad_norm": 0.03564801319951872, + "language_loss": 0.83885705, + "learning_rate": 0.0006744534275875085, + "loss": 0.84932852, + "num_input_tokens_seen": 175194512, + "router_z_loss_mlp": 0.43066406, + "step": 2105, + "time_per_iteration": 3.070952892303467 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049443, + "balance_loss_mlp": 1.00631273, + "epoch": 0.4051558291650635, + "flos": 573753873408.0, + "grad_norm": 0.03321600555114549, + "language_loss": 0.86069483, + "learning_rate": 0.0006741614295584657, + "loss": 0.87118924, + "num_input_tokens_seen": 175264176, + "router_z_loss_mlp": 0.43188477, + "step": 2106, + "time_per_iteration": 2.677860736846924 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051059, + "balance_loss_mlp": 1.00802493, + "epoch": 0.4053482108503271, + "flos": 733245355776.0, + "grad_norm": 0.034313991245887424, + "language_loss": 0.78860825, + "learning_rate": 0.0006738693639139595, + "loss": 0.79911888, + "num_input_tokens_seen": 175347488, + "router_z_loss_mlp": 0.4309082, + "step": 2107, + "time_per_iteration": 3.021329402923584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104746, + "balance_loss_mlp": 1.0043304, + "epoch": 0.4055405925355906, + "flos": 1214950971648.0, + "grad_norm": 0.03202932182515954, + "language_loss": 0.77947468, + "learning_rate": 0.0006735772307673796, + "loss": 0.7899493, + "num_input_tokens_seen": 175438336, + "router_z_loss_mlp": 0.43188477, + "step": 2108, + "time_per_iteration": 3.524618148803711 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104775, + "balance_loss_mlp": 1.00476336, + "epoch": 0.4057329742208542, + "flos": 717108988416.0, + "grad_norm": 0.03284224075250963, + "language_loss": 0.84037805, + "learning_rate": 0.0006732850302321421, + "loss": 0.85085559, + "num_input_tokens_seen": 175510912, + "router_z_loss_mlp": 0.43041992, + "step": 2109, + "time_per_iteration": 2.9528980255126953 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047125, + "balance_loss_mlp": 1.00423336, + "epoch": 0.4059253559061177, + "flos": 565953980160.0, + "grad_norm": 0.033245578967332844, + "language_loss": 0.85031784, + "learning_rate": 0.00067299276242169, + "loss": 0.86078906, + "num_input_tokens_seen": 175583040, + "router_z_loss_mlp": 0.42944336, + "step": 2110, + "time_per_iteration": 2.715207815170288 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046326, + "balance_loss_mlp": 1.00493622, + "epoch": 0.4061177375913813, + "flos": 1597189459200.0, + "grad_norm": 0.00881896921345328, + "language_loss": 0.74382168, + "learning_rate": 0.0006727004274494908, + "loss": 0.75428492, + "num_input_tokens_seen": 175817952, + "router_z_loss_mlp": 0.4140625, + "step": 2111, + "time_per_iteration": 4.921623468399048 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045683, + "balance_loss_mlp": 1.00276768, + "epoch": 0.40631011927664484, + "flos": 616622178048.0, + "grad_norm": 0.03872377126422628, + "language_loss": 0.78301811, + "learning_rate": 0.0006724080254290395, + "loss": 0.79347491, + "num_input_tokens_seen": 175896352, + "router_z_loss_mlp": 0.4296875, + "step": 2112, + "time_per_iteration": 2.7997756004333496 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104896, + "balance_loss_mlp": 1.00606835, + "epoch": 0.40650250096190843, + "flos": 558748993536.0, + "grad_norm": 0.03550284292845091, + "language_loss": 0.90693575, + "learning_rate": 0.0006721155564738566, + "loss": 0.91742539, + "num_input_tokens_seen": 175967152, + "router_z_loss_mlp": 0.42944336, + "step": 2113, + "time_per_iteration": 2.6585686206817627 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01041939, + "balance_loss_mlp": 1.00054932, + "epoch": 0.40669488264717196, + "flos": 1583545479168.0, + "grad_norm": 0.009767435928617773, + "language_loss": 0.78622639, + "learning_rate": 0.0006718230206974884, + "loss": 0.79664576, + "num_input_tokens_seen": 176205248, + "router_z_loss_mlp": 0.4140625, + "step": 2114, + "time_per_iteration": 4.948775053024292 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047453, + "balance_loss_mlp": 1.00460887, + "epoch": 0.40688726433243555, + "flos": 508656260352.0, + "grad_norm": 0.031160170727070474, + "language_loss": 0.86169994, + "learning_rate": 0.0006715304182135078, + "loss": 0.8721745, + "num_input_tokens_seen": 176276208, + "router_z_loss_mlp": 0.42895508, + "step": 2115, + "time_per_iteration": 2.6279850006103516 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047221, + "balance_loss_mlp": 1.00449646, + "epoch": 0.40707964601769914, + "flos": 590352944640.0, + "grad_norm": 0.04782787246513916, + "language_loss": 0.89337373, + "learning_rate": 0.0006712377491355127, + "loss": 0.90384591, + "num_input_tokens_seen": 176355072, + "router_z_loss_mlp": 0.42773438, + "step": 2116, + "time_per_iteration": 2.863960027694702 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047149, + "balance_loss_mlp": 1.00449598, + "epoch": 0.40727202770296267, + "flos": 581651943168.0, + "grad_norm": 0.026696862883813798, + "language_loss": 0.81451207, + "learning_rate": 0.0006709450135771274, + "loss": 0.8249836, + "num_input_tokens_seen": 176444592, + "router_z_loss_mlp": 0.42700195, + "step": 2117, + "time_per_iteration": 2.94854998588562 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104874, + "balance_loss_mlp": 1.00589585, + "epoch": 0.40746440938822626, + "flos": 505109174784.0, + "grad_norm": 0.029498043522937258, + "language_loss": 0.87031925, + "learning_rate": 0.0006706522116520023, + "loss": 0.88080668, + "num_input_tokens_seen": 176516144, + "router_z_loss_mlp": 0.42895508, + "step": 2118, + "time_per_iteration": 2.6655611991882324 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051527, + "balance_loss_mlp": 1.00880289, + "epoch": 0.4076567910734898, + "flos": 606711921408.0, + "grad_norm": 0.03542644850365937, + "language_loss": 0.83226359, + "learning_rate": 0.0006703593434738127, + "loss": 0.84277886, + "num_input_tokens_seen": 176585712, + "router_z_loss_mlp": 0.42773438, + "step": 2119, + "time_per_iteration": 2.7478883266448975 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049424, + "balance_loss_mlp": 1.00662768, + "epoch": 0.4078491727587534, + "flos": 480519681792.0, + "grad_norm": 0.032767120193604775, + "language_loss": 0.788118, + "learning_rate": 0.0006700664091562604, + "loss": 0.79861224, + "num_input_tokens_seen": 176654736, + "router_z_loss_mlp": 0.4284668, + "step": 2120, + "time_per_iteration": 2.532407760620117 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054491, + "balance_loss_mlp": 1.01167095, + "epoch": 0.4080415544440169, + "flos": 511419856128.0, + "grad_norm": 0.031947051498113735, + "language_loss": 0.85428649, + "learning_rate": 0.0006697734088130725, + "loss": 0.86483139, + "num_input_tokens_seen": 176722800, + "router_z_loss_mlp": 0.42871094, + "step": 2121, + "time_per_iteration": 2.6053290367126465 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051862, + "balance_loss_mlp": 1.00899482, + "epoch": 0.4082339361292805, + "flos": 735928271616.0, + "grad_norm": 0.0331707162631359, + "language_loss": 0.86154819, + "learning_rate": 0.0006694803425580018, + "loss": 0.87206686, + "num_input_tokens_seen": 176800320, + "router_z_loss_mlp": 0.42919922, + "step": 2122, + "time_per_iteration": 2.995340585708618 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051457, + "balance_loss_mlp": 1.00863671, + "epoch": 0.4084263178145441, + "flos": 458405079552.0, + "grad_norm": 0.03582566166827548, + "language_loss": 0.85069245, + "learning_rate": 0.0006691872105048268, + "loss": 0.86120701, + "num_input_tokens_seen": 176867440, + "router_z_loss_mlp": 0.42871094, + "step": 2123, + "time_per_iteration": 2.6434147357940674 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049594, + "balance_loss_mlp": 1.00655949, + "epoch": 0.4086186994998076, + "flos": 564026363904.0, + "grad_norm": 0.030981369506813725, + "language_loss": 0.84940457, + "learning_rate": 0.0006688940127673513, + "loss": 0.85990047, + "num_input_tokens_seen": 176942048, + "router_z_loss_mlp": 0.4309082, + "step": 2124, + "time_per_iteration": 2.677267074584961 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051992, + "balance_loss_mlp": 1.00914872, + "epoch": 0.4088110811850712, + "flos": 574894109184.0, + "grad_norm": 0.03166953679677798, + "language_loss": 0.86061293, + "learning_rate": 0.0006686007494594049, + "loss": 0.87113285, + "num_input_tokens_seen": 177025104, + "router_z_loss_mlp": 0.42895508, + "step": 2125, + "time_per_iteration": 2.806321620941162 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051345, + "balance_loss_mlp": 1.00845325, + "epoch": 0.40900346287033473, + "flos": 457847111424.0, + "grad_norm": 0.04138148105998068, + "language_loss": 0.81154513, + "learning_rate": 0.0006683074206948425, + "loss": 0.82205856, + "num_input_tokens_seen": 177089296, + "router_z_loss_mlp": 0.42944336, + "step": 2126, + "time_per_iteration": 2.5422966480255127 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051657, + "balance_loss_mlp": 1.00878966, + "epoch": 0.4091958445555983, + "flos": 618595481088.0, + "grad_norm": 0.03139043933990307, + "language_loss": 0.81871778, + "learning_rate": 0.0006680140265875443, + "loss": 0.82923436, + "num_input_tokens_seen": 177163648, + "router_z_loss_mlp": 0.42919922, + "step": 2127, + "time_per_iteration": 2.8402438163757324 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047483, + "balance_loss_mlp": 1.0048064, + "epoch": 0.40938822624086185, + "flos": 473371075584.0, + "grad_norm": 0.031125843736347292, + "language_loss": 0.96506268, + "learning_rate": 0.0006677205672514162, + "loss": 0.97553754, + "num_input_tokens_seen": 177233856, + "router_z_loss_mlp": 0.42724609, + "step": 2128, + "time_per_iteration": 2.6291539669036865 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047334, + "balance_loss_mlp": 1.00460982, + "epoch": 0.40958060792612544, + "flos": 571118589696.0, + "grad_norm": 0.02838685720934929, + "language_loss": 0.89474666, + "learning_rate": 0.000667427042800389, + "loss": 0.90522003, + "num_input_tokens_seen": 177309824, + "router_z_loss_mlp": 0.42773438, + "step": 2129, + "time_per_iteration": 2.749999761581421 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047174, + "balance_loss_mlp": 1.00435364, + "epoch": 0.40977298961138897, + "flos": 610471889664.0, + "grad_norm": 0.033304274322438925, + "language_loss": 0.8343153, + "learning_rate": 0.0006671334533484192, + "loss": 0.84478706, + "num_input_tokens_seen": 177380592, + "router_z_loss_mlp": 0.42871094, + "step": 2130, + "time_per_iteration": 2.778238296508789 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049285, + "balance_loss_mlp": 1.00636995, + "epoch": 0.40996537129665256, + "flos": 582873837312.0, + "grad_norm": 0.027360354791446346, + "language_loss": 0.83860981, + "learning_rate": 0.0006668397990094881, + "loss": 0.84910274, + "num_input_tokens_seen": 177454720, + "router_z_loss_mlp": 0.4296875, + "step": 2131, + "time_per_iteration": 2.711257219314575 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104432, + "balance_loss_mlp": 1.00145221, + "epoch": 0.41015775298191615, + "flos": 517554593280.0, + "grad_norm": 0.031461982022778785, + "language_loss": 0.85118818, + "learning_rate": 0.0006665460798976027, + "loss": 0.86163139, + "num_input_tokens_seen": 177528224, + "router_z_loss_mlp": 0.42919922, + "step": 2132, + "time_per_iteration": 2.7143847942352295 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046552, + "balance_loss_mlp": 1.00370777, + "epoch": 0.4103501346671797, + "flos": 511446100992.0, + "grad_norm": 0.02874706903740214, + "language_loss": 0.82064044, + "learning_rate": 0.0006662522961267947, + "loss": 0.83110595, + "num_input_tokens_seen": 177598176, + "router_z_loss_mlp": 0.42895508, + "step": 2133, + "time_per_iteration": 2.683544635772705 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104504, + "balance_loss_mlp": 1.00212467, + "epoch": 0.41054251635244327, + "flos": 550927713024.0, + "grad_norm": 0.027003210560574007, + "language_loss": 0.87900901, + "learning_rate": 0.0006659584478111211, + "loss": 0.88945937, + "num_input_tokens_seen": 177675840, + "router_z_loss_mlp": 0.4296875, + "step": 2134, + "time_per_iteration": 2.781217336654663 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104529, + "balance_loss_mlp": 1.00254142, + "epoch": 0.4107348980377068, + "flos": 841299734784.0, + "grad_norm": 0.03651700728131785, + "language_loss": 0.83066756, + "learning_rate": 0.000665664535064664, + "loss": 0.84112048, + "num_input_tokens_seen": 177751376, + "router_z_loss_mlp": 0.42797852, + "step": 2135, + "time_per_iteration": 3.067751169204712 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104661, + "balance_loss_mlp": 1.00390935, + "epoch": 0.4109272797229704, + "flos": 504764089344.0, + "grad_norm": 0.03160666135819327, + "language_loss": 0.83225, + "learning_rate": 0.0006653705580015303, + "loss": 0.84271616, + "num_input_tokens_seen": 177825264, + "router_z_loss_mlp": 0.42749023, + "step": 2136, + "time_per_iteration": 2.6899030208587646 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048088, + "balance_loss_mlp": 1.00521994, + "epoch": 0.4111196614082339, + "flos": 612024284928.0, + "grad_norm": 0.02957451828286975, + "language_loss": 0.87109792, + "learning_rate": 0.0006650765167358523, + "loss": 0.8815788, + "num_input_tokens_seen": 177901680, + "router_z_loss_mlp": 0.42919922, + "step": 2137, + "time_per_iteration": 2.8179140090942383 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048701, + "balance_loss_mlp": 1.00590456, + "epoch": 0.4113120430934975, + "flos": 454104639744.0, + "grad_norm": 0.033800673848535426, + "language_loss": 0.91012341, + "learning_rate": 0.0006647824113817864, + "loss": 0.92061043, + "num_input_tokens_seen": 177965264, + "router_z_loss_mlp": 0.4284668, + "step": 2138, + "time_per_iteration": 2.5263419151306152 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049295, + "balance_loss_mlp": 1.00635624, + "epoch": 0.41150442477876104, + "flos": 542710802688.0, + "grad_norm": 0.028316546184043286, + "language_loss": 0.818874, + "learning_rate": 0.000664488242053515, + "loss": 0.82936704, + "num_input_tokens_seen": 178039712, + "router_z_loss_mlp": 0.42993164, + "step": 2139, + "time_per_iteration": 2.770169258117676 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046579, + "balance_loss_mlp": 1.0037353, + "epoch": 0.4116968064640246, + "flos": 577392332544.0, + "grad_norm": 0.027329597632332964, + "language_loss": 0.84529692, + "learning_rate": 0.0006641940088652445, + "loss": 0.8557626, + "num_input_tokens_seen": 178114080, + "router_z_loss_mlp": 0.42895508, + "step": 2140, + "time_per_iteration": 2.761660575866699 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046986, + "balance_loss_mlp": 1.00416613, + "epoch": 0.4118891881492882, + "flos": 497150833920.0, + "grad_norm": 0.03165424709394261, + "language_loss": 0.82833397, + "learning_rate": 0.0006638997119312065, + "loss": 0.83880383, + "num_input_tokens_seen": 178188032, + "router_z_loss_mlp": 0.42871094, + "step": 2141, + "time_per_iteration": 2.6978652477264404 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071571, + "balance_loss_mlp": 1.02980042, + "epoch": 0.41208156983455174, + "flos": 1541573425152.0, + "grad_norm": 0.013007961614308571, + "language_loss": 0.75063306, + "learning_rate": 0.0006636053513656568, + "loss": 0.76134878, + "num_input_tokens_seen": 178395328, + "router_z_loss_mlp": 0.41796875, + "step": 2142, + "time_per_iteration": 4.915013551712036 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048885, + "balance_loss_mlp": 1.00611305, + "epoch": 0.41227395151981533, + "flos": 586058340864.0, + "grad_norm": 0.033991757131589403, + "language_loss": 0.85150123, + "learning_rate": 0.000663310927282877, + "loss": 0.86199009, + "num_input_tokens_seen": 178471952, + "router_z_loss_mlp": 0.42822266, + "step": 2143, + "time_per_iteration": 2.7552297115325928 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049148, + "balance_loss_mlp": 1.00635242, + "epoch": 0.41246633320507886, + "flos": 443893039104.0, + "grad_norm": 0.031026250164357557, + "language_loss": 0.8627826, + "learning_rate": 0.000663016439797172, + "loss": 0.87327409, + "num_input_tokens_seen": 178542192, + "router_z_loss_mlp": 0.4284668, + "step": 2144, + "time_per_iteration": 2.627795934677124 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048727, + "balance_loss_mlp": 1.00593042, + "epoch": 0.41265871489034245, + "flos": 581095920384.0, + "grad_norm": 0.032902127624834396, + "language_loss": 0.81700695, + "learning_rate": 0.0006627218890228724, + "loss": 0.82749426, + "num_input_tokens_seen": 178622736, + "router_z_loss_mlp": 0.4284668, + "step": 2145, + "time_per_iteration": 2.7726335525512695 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051442, + "balance_loss_mlp": 1.00852692, + "epoch": 0.412851096575606, + "flos": 762529951488.0, + "grad_norm": 0.03700396426728773, + "language_loss": 0.8427214, + "learning_rate": 0.0006624272750743326, + "loss": 0.85323578, + "num_input_tokens_seen": 178705808, + "router_z_loss_mlp": 0.4296875, + "step": 2146, + "time_per_iteration": 3.047786235809326 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051555, + "balance_loss_mlp": 1.00854468, + "epoch": 0.41304347826086957, + "flos": 556521978624.0, + "grad_norm": 0.0279029176228374, + "language_loss": 0.83148611, + "learning_rate": 0.0006621325980659322, + "loss": 0.84200168, + "num_input_tokens_seen": 178781200, + "router_z_loss_mlp": 0.43066406, + "step": 2147, + "time_per_iteration": 2.7805261611938477 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105086, + "balance_loss_mlp": 1.00796807, + "epoch": 0.41323585994613315, + "flos": 666894746112.0, + "grad_norm": 0.03289726182172815, + "language_loss": 0.82395911, + "learning_rate": 0.000661837858112075, + "loss": 0.83446777, + "num_input_tokens_seen": 178855072, + "router_z_loss_mlp": 0.42944336, + "step": 2148, + "time_per_iteration": 2.8236329555511475 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044478, + "balance_loss_mlp": 1.00153887, + "epoch": 0.4134282416313967, + "flos": 549785531904.0, + "grad_norm": 0.03194652549549522, + "language_loss": 0.89158356, + "learning_rate": 0.0006615430553271888, + "loss": 0.90202832, + "num_input_tokens_seen": 178927936, + "router_z_loss_mlp": 0.42993164, + "step": 2149, + "time_per_iteration": 2.7931926250457764 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043808, + "balance_loss_mlp": 1.00101149, + "epoch": 0.4136206233166603, + "flos": 647513604096.0, + "grad_norm": 0.02946183128139913, + "language_loss": 0.8604427, + "learning_rate": 0.0006612481898257264, + "loss": 0.87088078, + "num_input_tokens_seen": 179007792, + "router_z_loss_mlp": 0.4284668, + "step": 2150, + "time_per_iteration": 2.853116512298584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045658, + "balance_loss_mlp": 1.00279057, + "epoch": 0.4138130050019238, + "flos": 518364327936.0, + "grad_norm": 0.034556300996824205, + "language_loss": 0.85756087, + "learning_rate": 0.000660953261722165, + "loss": 0.86801755, + "num_input_tokens_seen": 179075200, + "router_z_loss_mlp": 0.42919922, + "step": 2151, + "time_per_iteration": 2.5899548530578613 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048523, + "balance_loss_mlp": 1.00575054, + "epoch": 0.4140053866871874, + "flos": 610369822464.0, + "grad_norm": 0.032804683798420206, + "language_loss": 0.83155799, + "learning_rate": 0.0006606582711310055, + "loss": 0.84204322, + "num_input_tokens_seen": 179144448, + "router_z_loss_mlp": 0.42822266, + "step": 2152, + "time_per_iteration": 2.7591912746429443 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045382, + "balance_loss_mlp": 1.00258613, + "epoch": 0.4141977683724509, + "flos": 580846099200.0, + "grad_norm": 0.031179869336458114, + "language_loss": 0.84146237, + "learning_rate": 0.0006603632181667736, + "loss": 0.85191619, + "num_input_tokens_seen": 179215776, + "router_z_loss_mlp": 0.4284668, + "step": 2153, + "time_per_iteration": 2.661051034927368 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045906, + "balance_loss_mlp": 1.00470734, + "epoch": 0.4143901500577145, + "flos": 1310178863616.0, + "grad_norm": 0.005957353398288201, + "language_loss": 0.78943324, + "learning_rate": 0.0006600681029440187, + "loss": 0.79989231, + "num_input_tokens_seen": 179436688, + "router_z_loss_mlp": 0.41210938, + "step": 2154, + "time_per_iteration": 4.908870458602905 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046845, + "balance_loss_mlp": 1.00416827, + "epoch": 0.41458253174297804, + "flos": 461122988544.0, + "grad_norm": 0.03503771604154275, + "language_loss": 0.82412434, + "learning_rate": 0.0006597729255773153, + "loss": 0.83459282, + "num_input_tokens_seen": 179503264, + "router_z_loss_mlp": 0.42724609, + "step": 2155, + "time_per_iteration": 2.51566481590271 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048208, + "balance_loss_mlp": 1.00531614, + "epoch": 0.41477491342824163, + "flos": 554439805440.0, + "grad_norm": 0.033219020360443, + "language_loss": 0.82733047, + "learning_rate": 0.0006594776861812608, + "loss": 0.83781254, + "num_input_tokens_seen": 179574864, + "router_z_loss_mlp": 0.42944336, + "step": 2156, + "time_per_iteration": 2.7139203548431396 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047832, + "balance_loss_mlp": 1.00501156, + "epoch": 0.4149672951135052, + "flos": 699086800896.0, + "grad_norm": 0.029687792529517126, + "language_loss": 0.87240821, + "learning_rate": 0.0006591823848704776, + "loss": 0.88288647, + "num_input_tokens_seen": 179658208, + "router_z_loss_mlp": 0.42871094, + "step": 2157, + "time_per_iteration": 2.950136661529541 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104915, + "balance_loss_mlp": 1.00647271, + "epoch": 0.41515967679876875, + "flos": 566837591808.0, + "grad_norm": 0.02753963183350331, + "language_loss": 0.82045114, + "learning_rate": 0.0006588870217596117, + "loss": 0.83094263, + "num_input_tokens_seen": 179732320, + "router_z_loss_mlp": 0.42724609, + "step": 2158, + "time_per_iteration": 2.742954730987549 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047178, + "balance_loss_mlp": 1.00440574, + "epoch": 0.41535205848403234, + "flos": 502178383104.0, + "grad_norm": 0.03782519840746282, + "language_loss": 0.86309534, + "learning_rate": 0.0006585915969633334, + "loss": 0.8735671, + "num_input_tokens_seen": 179801616, + "router_z_loss_mlp": 0.42822266, + "step": 2159, + "time_per_iteration": 2.6314492225646973 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048815, + "balance_loss_mlp": 1.00599504, + "epoch": 0.41554444016929587, + "flos": 608702721024.0, + "grad_norm": 0.03160589415450587, + "language_loss": 0.8965854, + "learning_rate": 0.0006582961105963366, + "loss": 0.90707356, + "num_input_tokens_seen": 179876112, + "router_z_loss_mlp": 0.42871094, + "step": 2160, + "time_per_iteration": 2.779524564743042 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052466, + "balance_loss_mlp": 1.0094316, + "epoch": 0.41573682185455946, + "flos": 530156514048.0, + "grad_norm": 0.0316987683946157, + "language_loss": 0.78011453, + "learning_rate": 0.0006580005627733395, + "loss": 0.79063922, + "num_input_tokens_seen": 179949936, + "router_z_loss_mlp": 0.4309082, + "step": 2161, + "time_per_iteration": 2.655961275100708 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053795, + "balance_loss_mlp": 1.01095116, + "epoch": 0.415929203539823, + "flos": 506038473216.0, + "grad_norm": 0.030200496407476712, + "language_loss": 0.82344484, + "learning_rate": 0.0006577049536090838, + "loss": 0.83398283, + "num_input_tokens_seen": 180023184, + "router_z_loss_mlp": 0.42895508, + "step": 2162, + "time_per_iteration": 2.734727144241333 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048283, + "balance_loss_mlp": 1.00536776, + "epoch": 0.4161215852250866, + "flos": 583824523008.0, + "grad_norm": 0.03528478058898885, + "language_loss": 0.86106777, + "learning_rate": 0.000657409283218335, + "loss": 0.87155068, + "num_input_tokens_seen": 180091728, + "router_z_loss_mlp": 0.4296875, + "step": 2163, + "time_per_iteration": 2.659733533859253 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051194, + "balance_loss_mlp": 1.00844538, + "epoch": 0.4163139669103501, + "flos": 491760702720.0, + "grad_norm": 0.03176725688202085, + "language_loss": 0.81183624, + "learning_rate": 0.0006571135517158829, + "loss": 0.82234824, + "num_input_tokens_seen": 180162096, + "router_z_loss_mlp": 0.42797852, + "step": 2164, + "time_per_iteration": 2.639364004135132 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104361, + "balance_loss_mlp": 1.00241089, + "epoch": 0.4165063485956137, + "flos": 1291023243264.0, + "grad_norm": 0.009317160244550511, + "language_loss": 0.76764059, + "learning_rate": 0.0006568177592165404, + "loss": 0.77807671, + "num_input_tokens_seen": 180380912, + "router_z_loss_mlp": 0.41210938, + "step": 2165, + "time_per_iteration": 4.755609750747681 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048778, + "balance_loss_mlp": 1.00600576, + "epoch": 0.4166987302808773, + "flos": 496258473984.0, + "grad_norm": 0.03907979296448248, + "language_loss": 0.83556676, + "learning_rate": 0.0006565219058351444, + "loss": 0.84605455, + "num_input_tokens_seen": 180447424, + "router_z_loss_mlp": 0.42822266, + "step": 2166, + "time_per_iteration": 2.549835443496704 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043838, + "balance_loss_mlp": 1.00087476, + "epoch": 0.4168911119661408, + "flos": 465067649280.0, + "grad_norm": 0.0316582334519174, + "language_loss": 0.83126116, + "learning_rate": 0.0006562259916865553, + "loss": 0.8416996, + "num_input_tokens_seen": 180516336, + "router_z_loss_mlp": 0.43017578, + "step": 2167, + "time_per_iteration": 2.577807664871216 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045446, + "balance_loss_mlp": 1.00253069, + "epoch": 0.4170834936514044, + "flos": 537943768320.0, + "grad_norm": 0.03263228805326442, + "language_loss": 0.79910517, + "learning_rate": 0.0006559300168856573, + "loss": 0.8095597, + "num_input_tokens_seen": 180589824, + "router_z_loss_mlp": 0.4296875, + "step": 2168, + "time_per_iteration": 2.716322898864746 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051109, + "balance_loss_mlp": 1.00819373, + "epoch": 0.41727587533666793, + "flos": 551750086656.0, + "grad_norm": 0.029704951266317694, + "language_loss": 0.86753178, + "learning_rate": 0.0006556339815473577, + "loss": 0.87804294, + "num_input_tokens_seen": 180661296, + "router_z_loss_mlp": 0.4296875, + "step": 2169, + "time_per_iteration": 2.627387762069702 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044935, + "balance_loss_mlp": 1.00204313, + "epoch": 0.4174682570219315, + "flos": 632378466816.0, + "grad_norm": 0.03018462927838879, + "language_loss": 0.86615288, + "learning_rate": 0.000655337885786588, + "loss": 0.87660229, + "num_input_tokens_seen": 180744896, + "router_z_loss_mlp": 0.42944336, + "step": 2170, + "time_per_iteration": 2.8836913108825684 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045073, + "balance_loss_mlp": 1.00211012, + "epoch": 0.41766063870719505, + "flos": 520756593408.0, + "grad_norm": 0.03274558076895909, + "language_loss": 0.85911119, + "learning_rate": 0.0006550417297183025, + "loss": 0.86956197, + "num_input_tokens_seen": 180813008, + "router_z_loss_mlp": 0.43017578, + "step": 2171, + "time_per_iteration": 2.6085855960845947 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054021, + "balance_loss_mlp": 1.0111295, + "epoch": 0.41785302039245864, + "flos": 559055195136.0, + "grad_norm": 0.03215226267597247, + "language_loss": 0.82142568, + "learning_rate": 0.0006547455134574793, + "loss": 0.83196592, + "num_input_tokens_seen": 180886480, + "router_z_loss_mlp": 0.42944336, + "step": 2172, + "time_per_iteration": 2.7207438945770264 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048636, + "balance_loss_mlp": 1.0057919, + "epoch": 0.41804540207772223, + "flos": 790028848896.0, + "grad_norm": 0.03152263917705172, + "language_loss": 0.84573895, + "learning_rate": 0.0006544492371191198, + "loss": 0.85622525, + "num_input_tokens_seen": 180973776, + "router_z_loss_mlp": 0.42895508, + "step": 2173, + "time_per_iteration": 3.1091549396514893 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050974, + "balance_loss_mlp": 1.00791526, + "epoch": 0.41823778376298576, + "flos": 905891869440.0, + "grad_norm": 0.03158772894298815, + "language_loss": 0.83616948, + "learning_rate": 0.0006541529008182485, + "loss": 0.84667921, + "num_input_tokens_seen": 181062768, + "router_z_loss_mlp": 0.43115234, + "step": 2174, + "time_per_iteration": 3.1934547424316406 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050617, + "balance_loss_mlp": 1.0074867, + "epoch": 0.41843016544824935, + "flos": 512574676224.0, + "grad_norm": 0.036197783568866736, + "language_loss": 0.87799633, + "learning_rate": 0.0006538565046699136, + "loss": 0.88850248, + "num_input_tokens_seen": 181129872, + "router_z_loss_mlp": 0.43188477, + "step": 2175, + "time_per_iteration": 2.6156668663024902 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043414, + "balance_loss_mlp": 1.00047445, + "epoch": 0.4186225471335129, + "flos": 654290880000.0, + "grad_norm": 0.03486733903162065, + "language_loss": 0.81864989, + "learning_rate": 0.0006535600487891862, + "loss": 0.82908404, + "num_input_tokens_seen": 181208112, + "router_z_loss_mlp": 0.42993164, + "step": 2176, + "time_per_iteration": 2.7715044021606445 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050534, + "balance_loss_mlp": 1.00778568, + "epoch": 0.41881492881877647, + "flos": 570226229760.0, + "grad_norm": 0.03182850960977162, + "language_loss": 0.89874047, + "learning_rate": 0.0006532635332911603, + "loss": 0.90924585, + "num_input_tokens_seen": 181278736, + "router_z_loss_mlp": 0.42797852, + "step": 2177, + "time_per_iteration": 2.714635133743286 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046295, + "balance_loss_mlp": 1.00352275, + "epoch": 0.41900731050404, + "flos": 913485682944.0, + "grad_norm": 0.031061931256926825, + "language_loss": 0.81313407, + "learning_rate": 0.0006529669582909541, + "loss": 0.82359695, + "num_input_tokens_seen": 181362512, + "router_z_loss_mlp": 0.42822266, + "step": 2178, + "time_per_iteration": 3.2592601776123047 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052105, + "balance_loss_mlp": 1.00923753, + "epoch": 0.4191996921893036, + "flos": 536784090624.0, + "grad_norm": 0.03590517964257674, + "language_loss": 0.86468148, + "learning_rate": 0.0006526703239037077, + "loss": 0.87520254, + "num_input_tokens_seen": 181432080, + "router_z_loss_mlp": 0.42919922, + "step": 2179, + "time_per_iteration": 2.6636452674865723 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045539, + "balance_loss_mlp": 1.00259995, + "epoch": 0.4193920738745671, + "flos": 583731204096.0, + "grad_norm": 0.030716470700417473, + "language_loss": 0.86737585, + "learning_rate": 0.0006523736302445851, + "loss": 0.87783122, + "num_input_tokens_seen": 181507296, + "router_z_loss_mlp": 0.42993164, + "step": 2180, + "time_per_iteration": 2.801374673843384 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048274, + "balance_loss_mlp": 1.00535846, + "epoch": 0.4195844555598307, + "flos": 1337802205440.0, + "grad_norm": 0.03692120158624074, + "language_loss": 0.77735525, + "learning_rate": 0.0006520768774287728, + "loss": 0.78783798, + "num_input_tokens_seen": 181599408, + "router_z_loss_mlp": 0.4296875, + "step": 2181, + "time_per_iteration": 3.781163454055786 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048528, + "balance_loss_mlp": 1.00568438, + "epoch": 0.4197768372450943, + "flos": 599997828864.0, + "grad_norm": 0.02986751846873145, + "language_loss": 0.85868645, + "learning_rate": 0.0006517800655714806, + "loss": 0.86917174, + "num_input_tokens_seen": 181674944, + "router_z_loss_mlp": 0.42895508, + "step": 2182, + "time_per_iteration": 2.8340775966644287 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047266, + "balance_loss_mlp": 1.00454116, + "epoch": 0.4199692189303578, + "flos": 736597055232.0, + "grad_norm": 0.031915917751050384, + "language_loss": 0.8544265, + "learning_rate": 0.0006514831947879407, + "loss": 0.86489916, + "num_input_tokens_seen": 181756704, + "router_z_loss_mlp": 0.42773438, + "step": 2183, + "time_per_iteration": 2.943141460418701 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048279, + "balance_loss_mlp": 1.005602, + "epoch": 0.4201616006156214, + "flos": 751663173120.0, + "grad_norm": 0.03318909585917556, + "language_loss": 0.78676963, + "learning_rate": 0.0006511862651934091, + "loss": 0.79725242, + "num_input_tokens_seen": 181837952, + "router_z_loss_mlp": 0.42724609, + "step": 2184, + "time_per_iteration": 3.0779521465301514 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049999, + "balance_loss_mlp": 1.00713122, + "epoch": 0.42035398230088494, + "flos": 548092185600.0, + "grad_norm": 0.030200903128349884, + "language_loss": 0.82675183, + "learning_rate": 0.0006508892769031638, + "loss": 0.83725178, + "num_input_tokens_seen": 181906896, + "router_z_loss_mlp": 0.42919922, + "step": 2185, + "time_per_iteration": 2.6862621307373047 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052696, + "balance_loss_mlp": 1.0098995, + "epoch": 0.42054636398614853, + "flos": 618048206592.0, + "grad_norm": 0.035053166321698394, + "language_loss": 0.87309551, + "learning_rate": 0.000650592230032506, + "loss": 0.88362241, + "num_input_tokens_seen": 181974976, + "router_z_loss_mlp": 0.4284668, + "step": 2186, + "time_per_iteration": 2.7250919342041016 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051072, + "balance_loss_mlp": 1.00813246, + "epoch": 0.42073874567141206, + "flos": 641667571968.0, + "grad_norm": 0.033545410607481084, + "language_loss": 0.85750729, + "learning_rate": 0.0006502951246967595, + "loss": 0.86801797, + "num_input_tokens_seen": 182054704, + "router_z_loss_mlp": 0.42993164, + "step": 2187, + "time_per_iteration": 2.8897902965545654 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051911, + "balance_loss_mlp": 1.00911534, + "epoch": 0.42093112735667565, + "flos": 494823697152.0, + "grad_norm": 0.02963421973388752, + "language_loss": 0.87416923, + "learning_rate": 0.0006499979610112706, + "loss": 0.88468838, + "num_input_tokens_seen": 182129696, + "router_z_loss_mlp": 0.4284668, + "step": 2188, + "time_per_iteration": 2.690762519836426 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044871, + "balance_loss_mlp": 1.00219369, + "epoch": 0.4211235090419392, + "flos": 543437912064.0, + "grad_norm": 0.03405892185917734, + "language_loss": 0.84498167, + "learning_rate": 0.000649700739091409, + "loss": 0.85543036, + "num_input_tokens_seen": 182203792, + "router_z_loss_mlp": 0.42724609, + "step": 2189, + "time_per_iteration": 2.7150561809539795 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050289, + "balance_loss_mlp": 1.00918579, + "epoch": 0.42131589072720277, + "flos": 1535391055872.0, + "grad_norm": 0.006162303642849888, + "language_loss": 0.73836273, + "learning_rate": 0.0006494034590525657, + "loss": 0.7488656, + "num_input_tokens_seen": 182432080, + "router_z_loss_mlp": 0.41113281, + "step": 2190, + "time_per_iteration": 4.829074382781982 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047388, + "balance_loss_mlp": 1.00466371, + "epoch": 0.42150827241246636, + "flos": 567936031488.0, + "grad_norm": 0.029782751851152003, + "language_loss": 0.85824835, + "learning_rate": 0.0006491061210101557, + "loss": 0.8687222, + "num_input_tokens_seen": 182500256, + "router_z_loss_mlp": 0.42773438, + "step": 2191, + "time_per_iteration": 2.7018613815307617 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044672, + "balance_loss_mlp": 1.00197124, + "epoch": 0.4217006540977299, + "flos": 708842500608.0, + "grad_norm": 0.03166528206992478, + "language_loss": 0.84430063, + "learning_rate": 0.0006488087250796157, + "loss": 0.85474735, + "num_input_tokens_seen": 182582912, + "router_z_loss_mlp": 0.42749023, + "step": 2192, + "time_per_iteration": 2.907424211502075 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045117, + "balance_loss_mlp": 1.00236881, + "epoch": 0.4218930357829935, + "flos": 628562118144.0, + "grad_norm": 0.02920565844268777, + "language_loss": 0.82024074, + "learning_rate": 0.0006485112713764049, + "loss": 0.83069193, + "num_input_tokens_seen": 182670304, + "router_z_loss_mlp": 0.42797852, + "step": 2193, + "time_per_iteration": 2.9393887519836426 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047298, + "balance_loss_mlp": 1.00435925, + "epoch": 0.422085417468257, + "flos": 461290184448.0, + "grad_norm": 0.02925244938415649, + "language_loss": 0.84264457, + "learning_rate": 0.0006482137600160051, + "loss": 0.85311759, + "num_input_tokens_seen": 182735024, + "router_z_loss_mlp": 0.42993164, + "step": 2194, + "time_per_iteration": 2.549301862716675 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050742, + "balance_loss_mlp": 1.00780332, + "epoch": 0.4222777991535206, + "flos": 474981796608.0, + "grad_norm": 0.030629871462955913, + "language_loss": 0.85158336, + "learning_rate": 0.0006479161911139206, + "loss": 0.86209077, + "num_input_tokens_seen": 182805024, + "router_z_loss_mlp": 0.42993164, + "step": 2195, + "time_per_iteration": 2.6384336948394775 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105418, + "balance_loss_mlp": 1.01116967, + "epoch": 0.4224701808387841, + "flos": 471844925184.0, + "grad_norm": 0.03651823295441523, + "language_loss": 0.8580153, + "learning_rate": 0.0006476185647856778, + "loss": 0.8685571, + "num_input_tokens_seen": 182871360, + "router_z_loss_mlp": 0.43066406, + "step": 2196, + "time_per_iteration": 2.61171817779541 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050335, + "balance_loss_mlp": 1.00737166, + "epoch": 0.4226625625240477, + "flos": 678823992576.0, + "grad_norm": 0.03269819945270571, + "language_loss": 0.81914455, + "learning_rate": 0.0006473208811468255, + "loss": 0.8296479, + "num_input_tokens_seen": 182952912, + "router_z_loss_mlp": 0.43017578, + "step": 2197, + "time_per_iteration": 2.892245292663574 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049097, + "balance_loss_mlp": 1.00611031, + "epoch": 0.4228549442093113, + "flos": 504559954944.0, + "grad_norm": 0.030930986611316814, + "language_loss": 0.84766257, + "learning_rate": 0.0006470231403129347, + "loss": 0.85815352, + "num_input_tokens_seen": 183022016, + "router_z_loss_mlp": 0.43041992, + "step": 2198, + "time_per_iteration": 2.64943265914917 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104433, + "balance_loss_mlp": 1.00119996, + "epoch": 0.42304732589457483, + "flos": 613075092480.0, + "grad_norm": 0.027263393707605364, + "language_loss": 0.81978631, + "learning_rate": 0.0006467253423995988, + "loss": 0.83022958, + "num_input_tokens_seen": 183101776, + "router_z_loss_mlp": 0.43188477, + "step": 2199, + "time_per_iteration": 2.8850364685058594 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048401, + "balance_loss_mlp": 1.00527155, + "epoch": 0.4232397075798384, + "flos": 516649594368.0, + "grad_norm": 0.03785502815659436, + "language_loss": 0.79452145, + "learning_rate": 0.000646427487522433, + "loss": 0.80500549, + "num_input_tokens_seen": 183171392, + "router_z_loss_mlp": 0.43188477, + "step": 2200, + "time_per_iteration": 2.694916009902954 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050341, + "balance_loss_mlp": 1.00713968, + "epoch": 0.42343208926510195, + "flos": 590934245376.0, + "grad_norm": 0.030735047123199966, + "language_loss": 0.83900952, + "learning_rate": 0.0006461295757970749, + "loss": 0.84951293, + "num_input_tokens_seen": 183253936, + "router_z_loss_mlp": 0.43261719, + "step": 2201, + "time_per_iteration": 2.835726737976074 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046215, + "balance_loss_mlp": 1.00320446, + "epoch": 0.42362447095036554, + "flos": 641819216640.0, + "grad_norm": 0.03465447846020762, + "language_loss": 0.82287079, + "learning_rate": 0.0006458316073391839, + "loss": 0.83333296, + "num_input_tokens_seen": 183333744, + "router_z_loss_mlp": 0.43066406, + "step": 2202, + "time_per_iteration": 2.8503153324127197 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045543, + "balance_loss_mlp": 1.00241327, + "epoch": 0.42381685263562907, + "flos": 513718802688.0, + "grad_norm": 0.030503622319833546, + "language_loss": 0.88278598, + "learning_rate": 0.0006455335822644422, + "loss": 0.89324141, + "num_input_tokens_seen": 183401904, + "router_z_loss_mlp": 0.43188477, + "step": 2203, + "time_per_iteration": 2.6294915676116943 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050025, + "balance_loss_mlp": 1.00689554, + "epoch": 0.42400923432089266, + "flos": 547822922496.0, + "grad_norm": 0.03601428124518316, + "language_loss": 0.78504658, + "learning_rate": 0.0006452355006885527, + "loss": 0.79554689, + "num_input_tokens_seen": 183471312, + "router_z_loss_mlp": 0.43188477, + "step": 2204, + "time_per_iteration": 2.7194669246673584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050575, + "balance_loss_mlp": 1.00756454, + "epoch": 0.4242016160061562, + "flos": 623288638464.0, + "grad_norm": 0.038292152226624715, + "language_loss": 0.88211453, + "learning_rate": 0.0006449373627272412, + "loss": 0.89262021, + "num_input_tokens_seen": 183539184, + "router_z_loss_mlp": 0.43066406, + "step": 2205, + "time_per_iteration": 2.760643243789673 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048111, + "balance_loss_mlp": 1.00495708, + "epoch": 0.4243939976914198, + "flos": 572972328960.0, + "grad_norm": 0.03657249930928273, + "language_loss": 0.83085704, + "learning_rate": 0.0006446391684962553, + "loss": 0.84133816, + "num_input_tokens_seen": 183607504, + "router_z_loss_mlp": 0.43212891, + "step": 2206, + "time_per_iteration": 2.656205892562866 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050624, + "balance_loss_mlp": 1.00766063, + "epoch": 0.42458637937668336, + "flos": 449665194240.0, + "grad_norm": 0.03531472123955245, + "language_loss": 0.83588743, + "learning_rate": 0.000644340918111364, + "loss": 0.84639364, + "num_input_tokens_seen": 183674720, + "router_z_loss_mlp": 0.43017578, + "step": 2207, + "time_per_iteration": 2.563599109649658 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047594, + "balance_loss_mlp": 1.00460744, + "epoch": 0.4247787610619469, + "flos": 436336164096.0, + "grad_norm": 0.035922125926704504, + "language_loss": 0.8567791, + "learning_rate": 0.0006440426116883585, + "loss": 0.86725497, + "num_input_tokens_seen": 183740448, + "router_z_loss_mlp": 0.43041992, + "step": 2208, + "time_per_iteration": 2.5554726123809814 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050276, + "balance_loss_mlp": 1.00743186, + "epoch": 0.4249711427472105, + "flos": 497122643712.0, + "grad_norm": 0.02878008588010938, + "language_loss": 0.86522639, + "learning_rate": 0.0006437442493430519, + "loss": 0.87572914, + "num_input_tokens_seen": 183812640, + "router_z_loss_mlp": 0.42895508, + "step": 2209, + "time_per_iteration": 2.698664426803589 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046212, + "balance_loss_mlp": 1.00334466, + "epoch": 0.425163524432474, + "flos": 657108910848.0, + "grad_norm": 0.03332162137783894, + "language_loss": 0.87084454, + "learning_rate": 0.000643445831191278, + "loss": 0.88130671, + "num_input_tokens_seen": 183895312, + "router_z_loss_mlp": 0.42919922, + "step": 2210, + "time_per_iteration": 2.919759750366211 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104789, + "balance_loss_mlp": 1.00526094, + "epoch": 0.4253559061177376, + "flos": 651779050752.0, + "grad_norm": 0.0360276634161647, + "language_loss": 0.82163692, + "learning_rate": 0.0006431473573488937, + "loss": 0.83211577, + "num_input_tokens_seen": 183966384, + "router_z_loss_mlp": 0.42675781, + "step": 2211, + "time_per_iteration": 2.7520995140075684 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051038, + "balance_loss_mlp": 1.00836086, + "epoch": 0.42554828780300114, + "flos": 555203853312.0, + "grad_norm": 0.03839138543396186, + "language_loss": 0.85743141, + "learning_rate": 0.0006428488279317765, + "loss": 0.86794186, + "num_input_tokens_seen": 184031728, + "router_z_loss_mlp": 0.42724609, + "step": 2212, + "time_per_iteration": 2.6509060859680176 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046098, + "balance_loss_mlp": 1.00356376, + "epoch": 0.4257406694882647, + "flos": 515422842624.0, + "grad_norm": 0.03572196481521071, + "language_loss": 0.88174772, + "learning_rate": 0.0006425502430558259, + "loss": 0.89220864, + "num_input_tokens_seen": 184096160, + "router_z_loss_mlp": 0.42578125, + "step": 2213, + "time_per_iteration": 2.6220855712890625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104908, + "balance_loss_mlp": 1.00623667, + "epoch": 0.42593305117352825, + "flos": 516705974784.0, + "grad_norm": 0.03258136107598633, + "language_loss": 0.85395515, + "learning_rate": 0.0006422516028369628, + "loss": 0.86444604, + "num_input_tokens_seen": 184169664, + "router_z_loss_mlp": 0.42895508, + "step": 2214, + "time_per_iteration": 2.6463093757629395 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043634, + "balance_loss_mlp": 1.00069499, + "epoch": 0.42612543285879184, + "flos": 589238953728.0, + "grad_norm": 0.0291937048711678, + "language_loss": 0.83896095, + "learning_rate": 0.0006419529073911296, + "loss": 0.8493973, + "num_input_tokens_seen": 184249152, + "router_z_loss_mlp": 0.42993164, + "step": 2215, + "time_per_iteration": 2.910792112350464 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048088, + "balance_loss_mlp": 1.0052923, + "epoch": 0.42631781454405543, + "flos": 636752783616.0, + "grad_norm": 0.03192715722055512, + "language_loss": 0.86142385, + "learning_rate": 0.0006416541568342901, + "loss": 0.87190473, + "num_input_tokens_seen": 184326816, + "router_z_loss_mlp": 0.4284668, + "step": 2216, + "time_per_iteration": 2.846374750137329 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046508, + "balance_loss_mlp": 1.00366437, + "epoch": 0.42651019622931896, + "flos": 542246153472.0, + "grad_norm": 0.029068811164029314, + "language_loss": 0.84547782, + "learning_rate": 0.0006413553512824297, + "loss": 0.8559429, + "num_input_tokens_seen": 184404336, + "router_z_loss_mlp": 0.42895508, + "step": 2217, + "time_per_iteration": 2.7738640308380127 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047446, + "balance_loss_mlp": 1.00467396, + "epoch": 0.42670257791458255, + "flos": 559224336384.0, + "grad_norm": 0.03125487953761627, + "language_loss": 0.85257965, + "learning_rate": 0.0006410564908515549, + "loss": 0.86305416, + "num_input_tokens_seen": 184472320, + "router_z_loss_mlp": 0.42822266, + "step": 2218, + "time_per_iteration": 2.654423713684082 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050321, + "balance_loss_mlp": 1.00757229, + "epoch": 0.4268949595998461, + "flos": 622450713600.0, + "grad_norm": 0.03350458888486861, + "language_loss": 0.85655409, + "learning_rate": 0.0006407575756576935, + "loss": 0.86705726, + "num_input_tokens_seen": 184544704, + "router_z_loss_mlp": 0.42797852, + "step": 2219, + "time_per_iteration": 2.7789905071258545 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047517, + "balance_loss_mlp": 1.00479233, + "epoch": 0.42708734128510967, + "flos": 539015963136.0, + "grad_norm": 0.029341516559542476, + "language_loss": 0.87978554, + "learning_rate": 0.0006404586058168951, + "loss": 0.8902607, + "num_input_tokens_seen": 184622544, + "router_z_loss_mlp": 0.42773438, + "step": 2220, + "time_per_iteration": 2.7526872158050537 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047218, + "balance_loss_mlp": 1.00456524, + "epoch": 0.4272797229703732, + "flos": 503862981120.0, + "grad_norm": 0.03177497968579407, + "language_loss": 0.87384629, + "learning_rate": 0.0006401595814452296, + "loss": 0.88431847, + "num_input_tokens_seen": 184692544, + "router_z_loss_mlp": 0.42700195, + "step": 2221, + "time_per_iteration": 2.620292901992798 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045405, + "balance_loss_mlp": 1.00282323, + "epoch": 0.4274721046556368, + "flos": 493438497792.0, + "grad_norm": 0.03138650703960668, + "language_loss": 0.81104958, + "learning_rate": 0.000639860502658789, + "loss": 0.82150364, + "num_input_tokens_seen": 184760480, + "router_z_loss_mlp": 0.42626953, + "step": 2222, + "time_per_iteration": 2.6335668563842773 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052846, + "balance_loss_mlp": 1.01007414, + "epoch": 0.4276644863409004, + "flos": 569462181888.0, + "grad_norm": 0.029337527326174825, + "language_loss": 0.84956491, + "learning_rate": 0.0006395613695736853, + "loss": 0.86009336, + "num_input_tokens_seen": 184834080, + "router_z_loss_mlp": 0.42822266, + "step": 2223, + "time_per_iteration": 2.69158935546875 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053205, + "balance_loss_mlp": 1.01059997, + "epoch": 0.4278568680261639, + "flos": 608563715328.0, + "grad_norm": 0.03527650476558936, + "language_loss": 0.8254534, + "learning_rate": 0.0006392621823060529, + "loss": 0.83598542, + "num_input_tokens_seen": 184905872, + "router_z_loss_mlp": 0.42651367, + "step": 2224, + "time_per_iteration": 2.7607972621917725 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042531, + "balance_loss_mlp": 0.99978256, + "epoch": 0.4280492497114275, + "flos": 561579663360.0, + "grad_norm": 0.03854840542263403, + "language_loss": 0.8576616, + "learning_rate": 0.0006389629409720465, + "loss": 0.86808693, + "num_input_tokens_seen": 184972320, + "router_z_loss_mlp": 0.42797852, + "step": 2225, + "time_per_iteration": 2.675492525100708 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046106, + "balance_loss_mlp": 1.00333333, + "epoch": 0.428241631396691, + "flos": 721902267648.0, + "grad_norm": 0.035169952304445494, + "language_loss": 0.89023572, + "learning_rate": 0.0006386636456878417, + "loss": 0.90069675, + "num_input_tokens_seen": 185051040, + "router_z_loss_mlp": 0.42822266, + "step": 2226, + "time_per_iteration": 2.8786110877990723 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046721, + "balance_loss_mlp": 1.00397301, + "epoch": 0.4284340130819546, + "flos": 430370568192.0, + "grad_norm": 0.04053005061098929, + "language_loss": 0.92206526, + "learning_rate": 0.0006383642965696353, + "loss": 0.93253243, + "num_input_tokens_seen": 185113552, + "router_z_loss_mlp": 0.42797852, + "step": 2227, + "time_per_iteration": 2.468848705291748 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048091, + "balance_loss_mlp": 1.00519955, + "epoch": 0.42862639476721814, + "flos": 526160330496.0, + "grad_norm": 0.0312355764309364, + "language_loss": 0.83643448, + "learning_rate": 0.000638064893733645, + "loss": 0.84691536, + "num_input_tokens_seen": 185185056, + "router_z_loss_mlp": 0.42944336, + "step": 2228, + "time_per_iteration": 2.7273313999176025 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048787, + "balance_loss_mlp": 1.0059433, + "epoch": 0.42881877645248173, + "flos": 466378971648.0, + "grad_norm": 0.033088247906643435, + "language_loss": 0.90412128, + "learning_rate": 0.000637765437296109, + "loss": 0.91460913, + "num_input_tokens_seen": 185257248, + "router_z_loss_mlp": 0.42895508, + "step": 2229, + "time_per_iteration": 2.6459994316101074 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104348, + "balance_loss_mlp": 1.00051713, + "epoch": 0.42901115813774526, + "flos": 561356087040.0, + "grad_norm": 0.033851055909267555, + "language_loss": 0.85812581, + "learning_rate": 0.000637465927373287, + "loss": 0.86856055, + "num_input_tokens_seen": 185324800, + "router_z_loss_mlp": 0.43017578, + "step": 2230, + "time_per_iteration": 2.6650984287261963 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051305, + "balance_loss_mlp": 1.00843728, + "epoch": 0.42920353982300885, + "flos": 562528403712.0, + "grad_norm": 0.03941473686966497, + "language_loss": 0.79439276, + "learning_rate": 0.000637166364081459, + "loss": 0.80490577, + "num_input_tokens_seen": 185393408, + "router_z_loss_mlp": 0.42919922, + "step": 2231, + "time_per_iteration": 2.6497089862823486 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045172, + "balance_loss_mlp": 1.00242341, + "epoch": 0.42939592150827244, + "flos": 557316162048.0, + "grad_norm": 0.0345529023969128, + "language_loss": 0.84757453, + "learning_rate": 0.0006368667475369256, + "loss": 0.85802627, + "num_input_tokens_seen": 185467968, + "router_z_loss_mlp": 0.42797852, + "step": 2232, + "time_per_iteration": 2.7934672832489014 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048443, + "balance_loss_mlp": 1.00753021, + "epoch": 0.42958830319353597, + "flos": 1524945185280.0, + "grad_norm": 0.006396251355867503, + "language_loss": 0.78527778, + "learning_rate": 0.0006365670778560084, + "loss": 0.79576218, + "num_input_tokens_seen": 185705232, + "router_z_loss_mlp": 0.40917969, + "step": 2233, + "time_per_iteration": 6.342620372772217 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045082, + "balance_loss_mlp": 1.0040741, + "epoch": 0.42978068487879956, + "flos": 1498872316416.0, + "grad_norm": 0.003657386104401554, + "language_loss": 0.78895426, + "learning_rate": 0.0006362673551550494, + "loss": 0.7994051, + "num_input_tokens_seen": 185932672, + "router_z_loss_mlp": 0.41015625, + "step": 2234, + "time_per_iteration": 4.862509250640869 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044747, + "balance_loss_mlp": 1.00209367, + "epoch": 0.4299730665640631, + "flos": 548063995392.0, + "grad_norm": 0.029617650166464796, + "language_loss": 0.86346197, + "learning_rate": 0.0006359675795504112, + "loss": 0.87390947, + "num_input_tokens_seen": 186006288, + "router_z_loss_mlp": 0.42700195, + "step": 2235, + "time_per_iteration": 2.747687339782715 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044967, + "balance_loss_mlp": 1.0022428, + "epoch": 0.4301654482493267, + "flos": 1131116700672.0, + "grad_norm": 0.034530900471349386, + "language_loss": 0.74852663, + "learning_rate": 0.0006356677511584775, + "loss": 0.75897634, + "num_input_tokens_seen": 186097168, + "router_z_loss_mlp": 0.42773438, + "step": 2236, + "time_per_iteration": 3.4453399181365967 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104557, + "balance_loss_mlp": 1.00291729, + "epoch": 0.4303578299345902, + "flos": 496742565120.0, + "grad_norm": 0.03572959525697719, + "language_loss": 0.8668766, + "learning_rate": 0.0006353678700956511, + "loss": 0.87733233, + "num_input_tokens_seen": 186163904, + "router_z_loss_mlp": 0.42700195, + "step": 2237, + "time_per_iteration": 2.562898874282837 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044768, + "balance_loss_mlp": 1.00228131, + "epoch": 0.4305502116198538, + "flos": 616930324992.0, + "grad_norm": 0.03185512314906856, + "language_loss": 0.84350532, + "learning_rate": 0.0006350679364783569, + "loss": 0.853953, + "num_input_tokens_seen": 186233888, + "router_z_loss_mlp": 0.42529297, + "step": 2238, + "time_per_iteration": 2.7968668937683105 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044024, + "balance_loss_mlp": 1.00139523, + "epoch": 0.4307425933051173, + "flos": 560322776064.0, + "grad_norm": 0.03209283293682184, + "language_loss": 0.85997605, + "learning_rate": 0.0006347679504230393, + "loss": 0.87041628, + "num_input_tokens_seen": 186301168, + "router_z_loss_mlp": 0.42675781, + "step": 2239, + "time_per_iteration": 2.634075880050659 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042998, + "balance_loss_mlp": 1.00039279, + "epoch": 0.4309349749903809, + "flos": 973818206976.0, + "grad_norm": 0.03253096283776471, + "language_loss": 0.77016532, + "learning_rate": 0.0006344679120461632, + "loss": 0.7805953, + "num_input_tokens_seen": 186392096, + "router_z_loss_mlp": 0.42651367, + "step": 2240, + "time_per_iteration": 3.334874153137207 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044857, + "balance_loss_mlp": 1.00222731, + "epoch": 0.4311273566756445, + "flos": 542973262848.0, + "grad_norm": 0.034862997803941254, + "language_loss": 0.8043505, + "learning_rate": 0.0006341678214642134, + "loss": 0.81479907, + "num_input_tokens_seen": 186458000, + "router_z_loss_mlp": 0.42675781, + "step": 2241, + "time_per_iteration": 2.6504814624786377 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046418, + "balance_loss_mlp": 1.00386059, + "epoch": 0.43131973836090803, + "flos": 763112219136.0, + "grad_norm": 0.032836493574204505, + "language_loss": 0.83329326, + "learning_rate": 0.0006338676787936963, + "loss": 0.84375745, + "num_input_tokens_seen": 186544992, + "router_z_loss_mlp": 0.42602539, + "step": 2242, + "time_per_iteration": 3.0819406509399414 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049013, + "balance_loss_mlp": 1.0064075, + "epoch": 0.4315121200461716, + "flos": 555603373824.0, + "grad_norm": 0.03474898353682057, + "language_loss": 0.8436116, + "learning_rate": 0.0006335674841511367, + "loss": 0.85410172, + "num_input_tokens_seen": 186614960, + "router_z_loss_mlp": 0.42651367, + "step": 2243, + "time_per_iteration": 2.688323974609375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044865, + "balance_loss_mlp": 1.00395203, + "epoch": 0.43170450173143515, + "flos": 1488689872896.0, + "grad_norm": 0.005657229041031833, + "language_loss": 0.7918117, + "learning_rate": 0.000633267237653081, + "loss": 0.80226028, + "num_input_tokens_seen": 186854288, + "router_z_loss_mlp": 0.40917969, + "step": 2244, + "time_per_iteration": 5.0437562465667725 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01041748, + "balance_loss_mlp": 1.00093079, + "epoch": 0.43189688341669874, + "flos": 1476910325760.0, + "grad_norm": 0.004174711640612148, + "language_loss": 0.77365553, + "learning_rate": 0.0006329669394160953, + "loss": 0.784073, + "num_input_tokens_seen": 187090272, + "router_z_loss_mlp": 0.40820312, + "step": 2245, + "time_per_iteration": 4.930269002914429 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105129, + "balance_loss_mlp": 1.00870872, + "epoch": 0.43208926510196227, + "flos": 493985772288.0, + "grad_norm": 0.03367129883883542, + "language_loss": 0.83325648, + "learning_rate": 0.0006326665895567652, + "loss": 0.84376937, + "num_input_tokens_seen": 187157584, + "router_z_loss_mlp": 0.42626953, + "step": 2246, + "time_per_iteration": 2.6496520042419434 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045175, + "balance_loss_mlp": 1.0025456, + "epoch": 0.43228164678722586, + "flos": 521303867904.0, + "grad_norm": 0.0373506965449987, + "language_loss": 0.88340402, + "learning_rate": 0.0006323661881916976, + "loss": 0.89385581, + "num_input_tokens_seen": 187229408, + "router_z_loss_mlp": 0.42675781, + "step": 2247, + "time_per_iteration": 2.7220535278320312 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104444, + "balance_loss_mlp": 1.00188208, + "epoch": 0.4324740284724894, + "flos": 797396173824.0, + "grad_norm": 0.03547023876634794, + "language_loss": 0.8184936, + "learning_rate": 0.0006320657354375179, + "loss": 0.82893801, + "num_input_tokens_seen": 187304384, + "router_z_loss_mlp": 0.42602539, + "step": 2248, + "time_per_iteration": 2.939730405807495 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047221, + "balance_loss_mlp": 1.00463986, + "epoch": 0.432666410157753, + "flos": 483098585088.0, + "grad_norm": 0.03653679675435745, + "language_loss": 0.87333679, + "learning_rate": 0.0006317652314108726, + "loss": 0.88380903, + "num_input_tokens_seen": 187368064, + "router_z_loss_mlp": 0.42626953, + "step": 2249, + "time_per_iteration": 2.554605007171631 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104756, + "balance_loss_mlp": 1.00512183, + "epoch": 0.43285879184301657, + "flos": 501210200832.0, + "grad_norm": 0.035110898136686476, + "language_loss": 0.91870761, + "learning_rate": 0.0006314646762284277, + "loss": 0.92918324, + "num_input_tokens_seen": 187436320, + "router_z_loss_mlp": 0.42480469, + "step": 2250, + "time_per_iteration": 2.6592071056365967 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051212, + "balance_loss_mlp": 1.01029968, + "epoch": 0.4330511735282801, + "flos": 1513793592576.0, + "grad_norm": 0.004753866691066904, + "language_loss": 0.75425828, + "learning_rate": 0.0006311640700068691, + "loss": 0.76477039, + "num_input_tokens_seen": 187670912, + "router_z_loss_mlp": 0.40917969, + "step": 2251, + "time_per_iteration": 4.880429267883301 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050666, + "balance_loss_mlp": 1.00837088, + "epoch": 0.4332435552135437, + "flos": 700838472960.0, + "grad_norm": 0.03213295924784481, + "language_loss": 0.77973437, + "learning_rate": 0.0006308634128629022, + "loss": 0.790241, + "num_input_tokens_seen": 187746432, + "router_z_loss_mlp": 0.42333984, + "step": 2252, + "time_per_iteration": 2.882138729095459 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048462, + "balance_loss_mlp": 1.00621462, + "epoch": 0.4334359368988072, + "flos": 593483013120.0, + "grad_norm": 0.03310670466815904, + "language_loss": 0.87855673, + "learning_rate": 0.0006305627049132531, + "loss": 0.8890413, + "num_input_tokens_seen": 187820032, + "router_z_loss_mlp": 0.42285156, + "step": 2253, + "time_per_iteration": 2.756601095199585 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052718, + "balance_loss_mlp": 1.01049364, + "epoch": 0.4336283185840708, + "flos": 844276213248.0, + "grad_norm": 0.028181128656308053, + "language_loss": 0.86222875, + "learning_rate": 0.0006302619462746662, + "loss": 0.87275594, + "num_input_tokens_seen": 187904400, + "router_z_loss_mlp": 0.42260742, + "step": 2254, + "time_per_iteration": 3.1384341716766357 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049827, + "balance_loss_mlp": 1.00748384, + "epoch": 0.43382070026933434, + "flos": 627402440448.0, + "grad_norm": 0.031912731462448586, + "language_loss": 0.90840006, + "learning_rate": 0.0006299611370639069, + "loss": 0.91889828, + "num_input_tokens_seen": 187973264, + "router_z_loss_mlp": 0.42382812, + "step": 2255, + "time_per_iteration": 2.712411642074585 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049827, + "balance_loss_mlp": 1.00746036, + "epoch": 0.4340130819545979, + "flos": 592210574592.0, + "grad_norm": 0.034079381595113686, + "language_loss": 0.79521996, + "learning_rate": 0.0006296602773977593, + "loss": 0.80571818, + "num_input_tokens_seen": 188039984, + "router_z_loss_mlp": 0.42407227, + "step": 2256, + "time_per_iteration": 2.714035987854004 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044192, + "balance_loss_mlp": 1.00182462, + "epoch": 0.4342054636398615, + "flos": 491956088832.0, + "grad_norm": 0.031173748742501443, + "language_loss": 0.88170785, + "learning_rate": 0.0006293593673930277, + "loss": 0.89214981, + "num_input_tokens_seen": 188113456, + "router_z_loss_mlp": 0.42407227, + "step": 2257, + "time_per_iteration": 2.6403400897979736 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050154, + "balance_loss_mlp": 1.00771534, + "epoch": 0.43439784532512504, + "flos": 700261062912.0, + "grad_norm": 0.031956889919079245, + "language_loss": 0.79138076, + "learning_rate": 0.0006290584071665358, + "loss": 0.80188227, + "num_input_tokens_seen": 188192480, + "router_z_loss_mlp": 0.42480469, + "step": 2258, + "time_per_iteration": 2.88726544380188 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051787, + "balance_loss_mlp": 1.00942004, + "epoch": 0.43459022701038863, + "flos": 486802172928.0, + "grad_norm": 0.03220669099915263, + "language_loss": 0.82764459, + "learning_rate": 0.0006287573968351266, + "loss": 0.83816242, + "num_input_tokens_seen": 188258784, + "router_z_loss_mlp": 0.42407227, + "step": 2259, + "time_per_iteration": 2.556873083114624 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045397, + "balance_loss_mlp": 1.00314939, + "epoch": 0.43478260869565216, + "flos": 644267862528.0, + "grad_norm": 0.0421666552527836, + "language_loss": 0.83019865, + "learning_rate": 0.0006284563365156626, + "loss": 0.84065259, + "num_input_tokens_seen": 188331312, + "router_z_loss_mlp": 0.42285156, + "step": 2260, + "time_per_iteration": 2.7845253944396973 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044012, + "balance_loss_mlp": 1.0014782, + "epoch": 0.43497499038091575, + "flos": 427010120448.0, + "grad_norm": 0.03632893260701325, + "language_loss": 0.87946701, + "learning_rate": 0.0006281552263250261, + "loss": 0.88990712, + "num_input_tokens_seen": 188393712, + "router_z_loss_mlp": 0.42578125, + "step": 2261, + "time_per_iteration": 2.4605414867401123 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050743, + "balance_loss_mlp": 1.00973511, + "epoch": 0.4351673720661793, + "flos": 1541527738368.0, + "grad_norm": 0.007050141628338806, + "language_loss": 0.80691534, + "learning_rate": 0.000627854066380118, + "loss": 0.81742275, + "num_input_tokens_seen": 188621152, + "router_z_loss_mlp": 0.41015625, + "step": 2262, + "time_per_iteration": 4.901712656021118 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105392, + "balance_loss_mlp": 1.01160097, + "epoch": 0.43535975375144287, + "flos": 750466556928.0, + "grad_norm": 0.036118497785784055, + "language_loss": 0.8206706, + "learning_rate": 0.0006275528567978593, + "loss": 0.83120978, + "num_input_tokens_seen": 188697120, + "router_z_loss_mlp": 0.42358398, + "step": 2263, + "time_per_iteration": 2.9023561477661133 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049817, + "balance_loss_mlp": 1.00749719, + "epoch": 0.4355521354367064, + "flos": 862752356352.0, + "grad_norm": 0.037575674234966834, + "language_loss": 0.82972687, + "learning_rate": 0.0006272515976951898, + "loss": 0.84022498, + "num_input_tokens_seen": 188778480, + "router_z_loss_mlp": 0.42358398, + "step": 2264, + "time_per_iteration": 3.062626361846924 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043084, + "balance_loss_mlp": 1.00086057, + "epoch": 0.43574451712197, + "flos": 735843700992.0, + "grad_norm": 0.027621901281680974, + "language_loss": 0.7971707, + "learning_rate": 0.0006269502891890687, + "loss": 0.80760157, + "num_input_tokens_seen": 188863616, + "router_z_loss_mlp": 0.42260742, + "step": 2265, + "time_per_iteration": 3.006544351577759 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047735, + "balance_loss_mlp": 1.00548732, + "epoch": 0.4359368988072336, + "flos": 571713496320.0, + "grad_norm": 0.03795602123750952, + "language_loss": 0.88080567, + "learning_rate": 0.0006266489313964743, + "loss": 0.89128304, + "num_input_tokens_seen": 188933984, + "router_z_loss_mlp": 0.42285156, + "step": 2266, + "time_per_iteration": 2.7217609882354736 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048608, + "balance_loss_mlp": 1.00633645, + "epoch": 0.4361292804924971, + "flos": 556671677952.0, + "grad_norm": 0.02985944883667051, + "language_loss": 0.86046827, + "learning_rate": 0.0006263475244344041, + "loss": 0.87095433, + "num_input_tokens_seen": 189012976, + "router_z_loss_mlp": 0.4230957, + "step": 2267, + "time_per_iteration": 2.844616651535034 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048894, + "balance_loss_mlp": 1.00688469, + "epoch": 0.4363216621777607, + "flos": 558349473024.0, + "grad_norm": 0.03645132335916721, + "language_loss": 0.84930134, + "learning_rate": 0.0006260460684198746, + "loss": 0.85979033, + "num_input_tokens_seen": 189079664, + "router_z_loss_mlp": 0.42041016, + "step": 2268, + "time_per_iteration": 2.6209938526153564 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046517, + "balance_loss_mlp": 1.00457883, + "epoch": 0.4365140438630242, + "flos": 479197665792.0, + "grad_norm": 0.03681259693925087, + "language_loss": 0.84888554, + "learning_rate": 0.0006257445634699213, + "loss": 0.85935068, + "num_input_tokens_seen": 189144688, + "router_z_loss_mlp": 0.41967773, + "step": 2269, + "time_per_iteration": 2.5371193885803223 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048717, + "balance_loss_mlp": 1.00675571, + "epoch": 0.4367064255482878, + "flos": 580008174336.0, + "grad_norm": 0.03379370609735099, + "language_loss": 0.83707798, + "learning_rate": 0.0006254430097015993, + "loss": 0.84756517, + "num_input_tokens_seen": 189213984, + "router_z_loss_mlp": 0.41992188, + "step": 2270, + "time_per_iteration": 2.663670539855957 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053223, + "balance_loss_mlp": 1.01278687, + "epoch": 0.43689880723355135, + "flos": 1462274830848.0, + "grad_norm": 0.005499517712732893, + "language_loss": 0.76479089, + "learning_rate": 0.0006251414072319815, + "loss": 0.77532315, + "num_input_tokens_seen": 189434416, + "router_z_loss_mlp": 0.40429688, + "step": 2271, + "time_per_iteration": 4.872848033905029 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051249, + "balance_loss_mlp": 1.00945389, + "epoch": 0.43709118891881493, + "flos": 668874852096.0, + "grad_norm": 0.028346757116800847, + "language_loss": 0.85555887, + "learning_rate": 0.0006248397561781609, + "loss": 0.86607134, + "num_input_tokens_seen": 189513248, + "router_z_loss_mlp": 0.41821289, + "step": 2272, + "time_per_iteration": 2.8525848388671875 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052054, + "balance_loss_mlp": 1.01004434, + "epoch": 0.43728357060407846, + "flos": 545914748160.0, + "grad_norm": 0.03971939435737374, + "language_loss": 0.86681366, + "learning_rate": 0.0006245380566572482, + "loss": 0.87733418, + "num_input_tokens_seen": 189585392, + "router_z_loss_mlp": 0.42041016, + "step": 2273, + "time_per_iteration": 2.65950608253479 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052441, + "balance_loss_mlp": 1.01047897, + "epoch": 0.43747595228934205, + "flos": 748185106944.0, + "grad_norm": 0.03474296828051499, + "language_loss": 0.764799, + "learning_rate": 0.0006242363087863744, + "loss": 0.77532339, + "num_input_tokens_seen": 189667552, + "router_z_loss_mlp": 0.41992188, + "step": 2274, + "time_per_iteration": 3.009678363800049 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044115, + "balance_loss_mlp": 1.00212932, + "epoch": 0.43766833397460564, + "flos": 632530111488.0, + "grad_norm": 0.043644038275203835, + "language_loss": 0.86733937, + "learning_rate": 0.0006239345126826878, + "loss": 0.87778056, + "num_input_tokens_seen": 189742048, + "router_z_loss_mlp": 0.42016602, + "step": 2275, + "time_per_iteration": 2.7913572788238525 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042942, + "balance_loss_mlp": 1.00093269, + "epoch": 0.43786071565986917, + "flos": 532099681536.0, + "grad_norm": 0.03488456741245989, + "language_loss": 0.84520668, + "learning_rate": 0.0006236326684633561, + "loss": 0.85563612, + "num_input_tokens_seen": 189817968, + "router_z_loss_mlp": 0.42041016, + "step": 2276, + "time_per_iteration": 2.868460178375244 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047564, + "balance_loss_mlp": 1.00567341, + "epoch": 0.43805309734513276, + "flos": 539558380032.0, + "grad_norm": 0.04090877877929134, + "language_loss": 0.75841373, + "learning_rate": 0.0006233307762455658, + "loss": 0.76888937, + "num_input_tokens_seen": 189882608, + "router_z_loss_mlp": 0.41918945, + "step": 2277, + "time_per_iteration": 2.675471782684326 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047689, + "balance_loss_mlp": 1.00563169, + "epoch": 0.4382454790303963, + "flos": 865965050112.0, + "grad_norm": 0.057141626101515054, + "language_loss": 0.83989596, + "learning_rate": 0.0006230288361465216, + "loss": 0.85037291, + "num_input_tokens_seen": 189960608, + "router_z_loss_mlp": 0.42089844, + "step": 2278, + "time_per_iteration": 3.0322673320770264 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047559, + "balance_loss_mlp": 1.005216, + "epoch": 0.4384378607156599, + "flos": 766802201088.0, + "grad_norm": 0.03709867443192191, + "language_loss": 0.85241038, + "learning_rate": 0.0006227268482834473, + "loss": 0.86288601, + "num_input_tokens_seen": 190035472, + "router_z_loss_mlp": 0.42382812, + "step": 2279, + "time_per_iteration": 2.900203227996826 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044865, + "balance_loss_mlp": 1.0024029, + "epoch": 0.4386302424009234, + "flos": 669797347584.0, + "grad_norm": 0.03112976006735108, + "language_loss": 0.87510288, + "learning_rate": 0.000622424812773585, + "loss": 0.88555157, + "num_input_tokens_seen": 190109312, + "router_z_loss_mlp": 0.42504883, + "step": 2280, + "time_per_iteration": 2.8384146690368652 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048428, + "balance_loss_mlp": 1.00591767, + "epoch": 0.438822624086187, + "flos": 486150885888.0, + "grad_norm": 0.037274279546085635, + "language_loss": 0.8020004, + "learning_rate": 0.000622122729734195, + "loss": 0.81248468, + "num_input_tokens_seen": 190174176, + "router_z_loss_mlp": 0.42553711, + "step": 2281, + "time_per_iteration": 2.6004860401153564 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048001, + "balance_loss_mlp": 1.00549114, + "epoch": 0.4390150057714506, + "flos": 500259515136.0, + "grad_norm": 0.032261530197162686, + "language_loss": 0.88006121, + "learning_rate": 0.0006218205992825566, + "loss": 0.8905412, + "num_input_tokens_seen": 190243888, + "router_z_loss_mlp": 0.42553711, + "step": 2282, + "time_per_iteration": 2.619781494140625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049893, + "balance_loss_mlp": 1.00745404, + "epoch": 0.4392073874567141, + "flos": 559352648448.0, + "grad_norm": 0.035010140104523226, + "language_loss": 0.8217926, + "learning_rate": 0.0006215184215359671, + "loss": 0.83229148, + "num_input_tokens_seen": 190317504, + "router_z_loss_mlp": 0.42480469, + "step": 2283, + "time_per_iteration": 2.7295265197753906 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104749, + "balance_loss_mlp": 1.00495577, + "epoch": 0.4393997691419777, + "flos": 606423216384.0, + "grad_norm": 0.031848598857185544, + "language_loss": 0.86998332, + "learning_rate": 0.0006212161966117425, + "loss": 0.88045812, + "num_input_tokens_seen": 190390160, + "router_z_loss_mlp": 0.42578125, + "step": 2284, + "time_per_iteration": 2.718440532684326 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048609, + "balance_loss_mlp": 1.00607538, + "epoch": 0.43959215082724123, + "flos": 805484772096.0, + "grad_norm": 0.035712970592664255, + "language_loss": 0.82239711, + "learning_rate": 0.0006209139246272164, + "loss": 0.83288318, + "num_input_tokens_seen": 190467600, + "router_z_loss_mlp": 0.42578125, + "step": 2285, + "time_per_iteration": 2.9688222408294678 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050536, + "balance_loss_mlp": 1.00793087, + "epoch": 0.4397845325125048, + "flos": 488608280064.0, + "grad_norm": 0.03687327973299051, + "language_loss": 0.82202113, + "learning_rate": 0.0006206116056997421, + "loss": 0.8325265, + "num_input_tokens_seen": 190534192, + "router_z_loss_mlp": 0.42651367, + "step": 2286, + "time_per_iteration": 2.5476558208465576 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048309, + "balance_loss_mlp": 1.00579894, + "epoch": 0.43997691419776835, + "flos": 481785317376.0, + "grad_norm": 0.030160303580515496, + "language_loss": 0.8299154, + "learning_rate": 0.0006203092399466892, + "loss": 0.84039849, + "num_input_tokens_seen": 190601440, + "router_z_loss_mlp": 0.42553711, + "step": 2287, + "time_per_iteration": 2.5308852195739746 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047877, + "balance_loss_mlp": 1.00539064, + "epoch": 0.44016929588303194, + "flos": 484129950720.0, + "grad_norm": 0.02729114822665251, + "language_loss": 0.85650307, + "learning_rate": 0.0006200068274854473, + "loss": 0.8669818, + "num_input_tokens_seen": 190672528, + "router_z_loss_mlp": 0.42529297, + "step": 2288, + "time_per_iteration": 2.6596133708953857 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045912, + "balance_loss_mlp": 1.00361645, + "epoch": 0.4403616775682955, + "flos": 573024818688.0, + "grad_norm": 0.028573956325372987, + "language_loss": 0.86632061, + "learning_rate": 0.0006197043684334229, + "loss": 0.87677968, + "num_input_tokens_seen": 190750704, + "router_z_loss_mlp": 0.42333984, + "step": 2289, + "time_per_iteration": 2.773327350616455 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047356, + "balance_loss_mlp": 1.00496542, + "epoch": 0.44055405925355906, + "flos": 632000333568.0, + "grad_norm": 0.03542319310998882, + "language_loss": 0.80357343, + "learning_rate": 0.0006194018629080411, + "loss": 0.81404698, + "num_input_tokens_seen": 190821664, + "router_z_loss_mlp": 0.42431641, + "step": 2290, + "time_per_iteration": 2.7465741634368896 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046814, + "balance_loss_mlp": 1.00444698, + "epoch": 0.44074644093882265, + "flos": 537826149888.0, + "grad_norm": 0.033710926441732514, + "language_loss": 0.82429153, + "learning_rate": 0.0006190993110267451, + "loss": 0.83475971, + "num_input_tokens_seen": 190893888, + "router_z_loss_mlp": 0.42407227, + "step": 2291, + "time_per_iteration": 2.734936237335205 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104638, + "balance_loss_mlp": 1.00401258, + "epoch": 0.4409388226240862, + "flos": 464166541056.0, + "grad_norm": 0.03677198311176373, + "language_loss": 0.84841394, + "learning_rate": 0.0006187967129069958, + "loss": 0.85887772, + "num_input_tokens_seen": 190956800, + "router_z_loss_mlp": 0.42407227, + "step": 2292, + "time_per_iteration": 2.491478443145752 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048193, + "balance_loss_mlp": 1.00604105, + "epoch": 0.44113120430934977, + "flos": 567161289984.0, + "grad_norm": 0.027373577802651455, + "language_loss": 0.87309539, + "learning_rate": 0.0006184940686662722, + "loss": 0.88357735, + "num_input_tokens_seen": 191032048, + "router_z_loss_mlp": 0.421875, + "step": 2293, + "time_per_iteration": 2.7358779907226562 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045965, + "balance_loss_mlp": 1.00371683, + "epoch": 0.4413235859946133, + "flos": 544675357440.0, + "grad_norm": 0.03072432375615432, + "language_loss": 0.9056381, + "learning_rate": 0.0006181913784220714, + "loss": 0.91609776, + "num_input_tokens_seen": 191099952, + "router_z_loss_mlp": 0.42285156, + "step": 2294, + "time_per_iteration": 2.6358015537261963 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045582, + "balance_loss_mlp": 1.00485992, + "epoch": 0.4415159676798769, + "flos": 1573305688320.0, + "grad_norm": 0.007789835090792861, + "language_loss": 0.80553782, + "learning_rate": 0.0006178886422919078, + "loss": 0.81599367, + "num_input_tokens_seen": 191335968, + "router_z_loss_mlp": 0.40722656, + "step": 2295, + "time_per_iteration": 4.902246713638306 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044044, + "balance_loss_mlp": 1.00181961, + "epoch": 0.4417083493651404, + "flos": 660013457664.0, + "grad_norm": 0.029698143477661094, + "language_loss": 0.80193049, + "learning_rate": 0.0006175858603933146, + "loss": 0.8123709, + "num_input_tokens_seen": 191410112, + "router_z_loss_mlp": 0.42260742, + "step": 2296, + "time_per_iteration": 2.8894712924957275 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010469, + "balance_loss_mlp": 1.00477171, + "epoch": 0.441900731050404, + "flos": 741818045184.0, + "grad_norm": 0.03343125158047759, + "language_loss": 0.81235009, + "learning_rate": 0.0006172830328438416, + "loss": 0.82281911, + "num_input_tokens_seen": 191491552, + "router_z_loss_mlp": 0.42163086, + "step": 2297, + "time_per_iteration": 3.03363299369812 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043079, + "balance_loss_mlp": 1.00080705, + "epoch": 0.44209311273566754, + "flos": 540596548608.0, + "grad_norm": 0.03516131163144532, + "language_loss": 0.87775767, + "learning_rate": 0.0006169801597610572, + "loss": 0.88818848, + "num_input_tokens_seen": 191567872, + "router_z_loss_mlp": 0.4230957, + "step": 2298, + "time_per_iteration": 2.7615511417388916 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047937, + "balance_loss_mlp": 1.00580859, + "epoch": 0.4422854944209311, + "flos": 622730670336.0, + "grad_norm": 0.03691263796350213, + "language_loss": 0.90342188, + "learning_rate": 0.0006166772412625469, + "loss": 0.91390121, + "num_input_tokens_seen": 191638032, + "router_z_loss_mlp": 0.42163086, + "step": 2299, + "time_per_iteration": 2.757885456085205 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044546, + "balance_loss_mlp": 1.00208378, + "epoch": 0.4424778761061947, + "flos": 660061089792.0, + "grad_norm": 0.03315959572172903, + "language_loss": 0.82509053, + "learning_rate": 0.0006163742774659141, + "loss": 0.835536, + "num_input_tokens_seen": 191709104, + "router_z_loss_mlp": 0.42504883, + "step": 2300, + "time_per_iteration": 2.8489365577697754 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045508, + "balance_loss_mlp": 1.00316477, + "epoch": 0.44267025779145824, + "flos": 569703254784.0, + "grad_norm": 0.02877714461404429, + "language_loss": 0.86486191, + "learning_rate": 0.0006160712684887801, + "loss": 0.87531698, + "num_input_tokens_seen": 191787072, + "router_z_loss_mlp": 0.42382812, + "step": 2301, + "time_per_iteration": 2.783581495285034 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043512, + "balance_loss_mlp": 1.00126386, + "epoch": 0.44286263947672183, + "flos": 497819617536.0, + "grad_norm": 0.032325076823307486, + "language_loss": 0.82883227, + "learning_rate": 0.0006157682144487832, + "loss": 0.83926737, + "num_input_tokens_seen": 191863040, + "router_z_loss_mlp": 0.42285156, + "step": 2302, + "time_per_iteration": 2.8138058185577393 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046294, + "balance_loss_mlp": 1.00395119, + "epoch": 0.44305502116198536, + "flos": 610608950016.0, + "grad_norm": 0.032307808069359366, + "language_loss": 0.83262819, + "learning_rate": 0.0006154651154635793, + "loss": 0.84309107, + "num_input_tokens_seen": 191940352, + "router_z_loss_mlp": 0.42382812, + "step": 2303, + "time_per_iteration": 2.9065494537353516 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045793, + "balance_loss_mlp": 1.00349796, + "epoch": 0.44324740284724895, + "flos": 471742857984.0, + "grad_norm": 0.03422426159351285, + "language_loss": 0.85742319, + "learning_rate": 0.0006151619716508421, + "loss": 0.86788118, + "num_input_tokens_seen": 192006896, + "router_z_loss_mlp": 0.42333984, + "step": 2304, + "time_per_iteration": 2.5973682403564453 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104609, + "balance_loss_mlp": 1.00377095, + "epoch": 0.4434397845325125, + "flos": 579812788224.0, + "grad_norm": 0.032225909976612614, + "language_loss": 0.87212336, + "learning_rate": 0.0006148587831282625, + "loss": 0.88258433, + "num_input_tokens_seen": 192075312, + "router_z_loss_mlp": 0.42358398, + "step": 2305, + "time_per_iteration": 2.6349332332611084 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046734, + "balance_loss_mlp": 1.00563049, + "epoch": 0.44363216621777607, + "flos": 1499997967872.0, + "grad_norm": 0.0072841640427745245, + "language_loss": 0.79176068, + "learning_rate": 0.0006145555500135483, + "loss": 0.80222803, + "num_input_tokens_seen": 192304816, + "router_z_loss_mlp": 0.41113281, + "step": 2306, + "time_per_iteration": 4.920953989028931 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047952, + "balance_loss_mlp": 1.00565624, + "epoch": 0.44382454790303966, + "flos": 478285863936.0, + "grad_norm": 0.035350800366555836, + "language_loss": 0.87850344, + "learning_rate": 0.0006142522724244255, + "loss": 0.88898295, + "num_input_tokens_seen": 192369232, + "router_z_loss_mlp": 0.42333984, + "step": 2307, + "time_per_iteration": 2.5206384658813477 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044529, + "balance_loss_mlp": 1.00361633, + "epoch": 0.4440169295883032, + "flos": 1547306696448.0, + "grad_norm": 0.0037013242818687312, + "language_loss": 0.76484716, + "learning_rate": 0.0006139489504786368, + "loss": 0.77529252, + "num_input_tokens_seen": 192600176, + "router_z_loss_mlp": 0.40917969, + "step": 2308, + "time_per_iteration": 4.906585454940796 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047856, + "balance_loss_mlp": 1.00570333, + "epoch": 0.4442093112735668, + "flos": 592291254528.0, + "grad_norm": 0.03559804859588436, + "language_loss": 0.78114909, + "learning_rate": 0.000613645584293942, + "loss": 0.79162765, + "num_input_tokens_seen": 192675424, + "router_z_loss_mlp": 0.421875, + "step": 2309, + "time_per_iteration": 2.9084970951080322 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049948, + "balance_loss_mlp": 1.00767648, + "epoch": 0.4444016929588303, + "flos": 531328830720.0, + "grad_norm": 0.036447190975963356, + "language_loss": 0.83448339, + "learning_rate": 0.0006133421739881185, + "loss": 0.84498286, + "num_input_tokens_seen": 192747552, + "router_z_loss_mlp": 0.4230957, + "step": 2310, + "time_per_iteration": 2.652672052383423 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044821, + "balance_loss_mlp": 1.0026927, + "epoch": 0.4445940746440939, + "flos": 621389212416.0, + "grad_norm": 0.035906278639006764, + "language_loss": 0.83511341, + "learning_rate": 0.0006130387196789605, + "loss": 0.84556162, + "num_input_tokens_seen": 192819984, + "router_z_loss_mlp": 0.42163086, + "step": 2311, + "time_per_iteration": 2.747197151184082 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045555, + "balance_loss_mlp": 1.00328362, + "epoch": 0.4447864563293574, + "flos": 630376973568.0, + "grad_norm": 0.027043038636915952, + "language_loss": 0.84677482, + "learning_rate": 0.0006127352214842795, + "loss": 0.85723037, + "num_input_tokens_seen": 192906080, + "router_z_loss_mlp": 0.4230957, + "step": 2312, + "time_per_iteration": 3.0515668392181396 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045637, + "balance_loss_mlp": 1.00327015, + "epoch": 0.444978838014621, + "flos": 652002627072.0, + "grad_norm": 0.034195517498726076, + "language_loss": 0.85929281, + "learning_rate": 0.0006124316795219041, + "loss": 0.86974919, + "num_input_tokens_seen": 192972336, + "router_z_loss_mlp": 0.42407227, + "step": 2313, + "time_per_iteration": 2.778184652328491 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050022, + "balance_loss_mlp": 1.00786984, + "epoch": 0.44517121969988455, + "flos": 613589319168.0, + "grad_norm": 0.029604729226228255, + "language_loss": 0.82924336, + "learning_rate": 0.0006121280939096794, + "loss": 0.83974361, + "num_input_tokens_seen": 193045744, + "router_z_loss_mlp": 0.421875, + "step": 2314, + "time_per_iteration": 2.7615392208099365 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045707, + "balance_loss_mlp": 1.00350666, + "epoch": 0.44536360138514813, + "flos": 489715468032.0, + "grad_norm": 0.036472505020621125, + "language_loss": 0.8826952, + "learning_rate": 0.000611824464765468, + "loss": 0.89315224, + "num_input_tokens_seen": 193115248, + "router_z_loss_mlp": 0.42236328, + "step": 2315, + "time_per_iteration": 2.67606782913208 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01058411, + "balance_loss_mlp": 1.01759338, + "epoch": 0.4455559830704117, + "flos": 1519056390144.0, + "grad_norm": 0.01193419136680653, + "language_loss": 0.78594941, + "learning_rate": 0.0006115207922071492, + "loss": 0.79653352, + "num_input_tokens_seen": 193330816, + "router_z_loss_mlp": 0.40820312, + "step": 2316, + "time_per_iteration": 4.725375652313232 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045978, + "balance_loss_mlp": 1.00384891, + "epoch": 0.44574836475567525, + "flos": 616817564160.0, + "grad_norm": 0.032139423648612636, + "language_loss": 0.85745513, + "learning_rate": 0.000611217076352619, + "loss": 0.86791497, + "num_input_tokens_seen": 193407616, + "router_z_loss_mlp": 0.42163086, + "step": 2317, + "time_per_iteration": 2.8277692794799805 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046443, + "balance_loss_mlp": 1.00429094, + "epoch": 0.44594074644093884, + "flos": 507434366208.0, + "grad_norm": 0.030845694350894858, + "language_loss": 0.83782113, + "learning_rate": 0.0006109133173197905, + "loss": 0.84828556, + "num_input_tokens_seen": 193482624, + "router_z_loss_mlp": 0.421875, + "step": 2318, + "time_per_iteration": 2.740814685821533 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044287, + "balance_loss_mlp": 1.0021348, + "epoch": 0.44613312812620237, + "flos": 728313070848.0, + "grad_norm": 0.03532114030566384, + "language_loss": 0.86011016, + "learning_rate": 0.0006106095152265935, + "loss": 0.87055302, + "num_input_tokens_seen": 193555952, + "router_z_loss_mlp": 0.421875, + "step": 2319, + "time_per_iteration": 2.982090473175049 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048334, + "balance_loss_mlp": 1.00615764, + "epoch": 0.44632550981146596, + "flos": 637058985216.0, + "grad_norm": 0.029959494040304766, + "language_loss": 0.85331011, + "learning_rate": 0.0006103056701909739, + "loss": 0.86379343, + "num_input_tokens_seen": 193636672, + "router_z_loss_mlp": 0.42211914, + "step": 2320, + "time_per_iteration": 2.911764621734619 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050916, + "balance_loss_mlp": 1.00878716, + "epoch": 0.4465178914967295, + "flos": 828618100992.0, + "grad_norm": 0.026414177364328564, + "language_loss": 0.83389866, + "learning_rate": 0.0006100017823308956, + "loss": 0.8444078, + "num_input_tokens_seen": 193721728, + "router_z_loss_mlp": 0.42163086, + "step": 2321, + "time_per_iteration": 3.166370153427124 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048729, + "balance_loss_mlp": 1.00672007, + "epoch": 0.4467102731819931, + "flos": 667033751808.0, + "grad_norm": 0.03675396641442824, + "language_loss": 0.80177474, + "learning_rate": 0.0006096978517643377, + "loss": 0.81226206, + "num_input_tokens_seen": 193795456, + "router_z_loss_mlp": 0.42041016, + "step": 2322, + "time_per_iteration": 2.7839677333831787 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049863, + "balance_loss_mlp": 1.00780618, + "epoch": 0.4469026548672566, + "flos": 513970569216.0, + "grad_norm": 0.036357166954029595, + "language_loss": 0.84299958, + "learning_rate": 0.0006093938786092968, + "loss": 0.85349822, + "num_input_tokens_seen": 193865520, + "router_z_loss_mlp": 0.42089844, + "step": 2323, + "time_per_iteration": 2.6366002559661865 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052625, + "balance_loss_mlp": 1.01054394, + "epoch": 0.4470950365525202, + "flos": 685286318592.0, + "grad_norm": 0.03621901423501995, + "language_loss": 0.9042533, + "learning_rate": 0.0006090898629837857, + "loss": 0.91477954, + "num_input_tokens_seen": 193935040, + "router_z_loss_mlp": 0.42114258, + "step": 2324, + "time_per_iteration": 2.8338427543640137 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047182, + "balance_loss_mlp": 1.00514829, + "epoch": 0.4472874182377838, + "flos": 628535873280.0, + "grad_norm": 0.028780974393906523, + "language_loss": 0.87792349, + "learning_rate": 0.0006087858050058337, + "loss": 0.88839531, + "num_input_tokens_seen": 194009120, + "router_z_loss_mlp": 0.4206543, + "step": 2325, + "time_per_iteration": 2.7868492603302 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047425, + "balance_loss_mlp": 1.00534439, + "epoch": 0.4474797999230473, + "flos": 548241884928.0, + "grad_norm": 0.03362424978515615, + "language_loss": 0.83227015, + "learning_rate": 0.0006084817047934866, + "loss": 0.84274435, + "num_input_tokens_seen": 194076672, + "router_z_loss_mlp": 0.42114258, + "step": 2326, + "time_per_iteration": 2.6603922843933105 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105345, + "balance_loss_mlp": 1.01144028, + "epoch": 0.4476721816083109, + "flos": 456757420032.0, + "grad_norm": 0.033869443234677665, + "language_loss": 0.90294945, + "learning_rate": 0.0006081775624648066, + "loss": 0.91348392, + "num_input_tokens_seen": 194142320, + "router_z_loss_mlp": 0.42041016, + "step": 2327, + "time_per_iteration": 2.563965082168579 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049502, + "balance_loss_mlp": 1.00730181, + "epoch": 0.44786456329357444, + "flos": 482501733120.0, + "grad_norm": 0.03973119590818811, + "language_loss": 0.83093679, + "learning_rate": 0.0006078733781378721, + "loss": 0.8414318, + "num_input_tokens_seen": 194208560, + "router_z_loss_mlp": 0.42236328, + "step": 2328, + "time_per_iteration": 2.5500621795654297 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01056217, + "balance_loss_mlp": 1.01401651, + "epoch": 0.448056944978838, + "flos": 553237353216.0, + "grad_norm": 0.0336771809947293, + "language_loss": 0.82818258, + "learning_rate": 0.0006075691519307781, + "loss": 0.83874476, + "num_input_tokens_seen": 194288080, + "router_z_loss_mlp": 0.42236328, + "step": 2329, + "time_per_iteration": 2.8369436264038086 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053357, + "balance_loss_mlp": 1.01125205, + "epoch": 0.44824932666410156, + "flos": 551917282560.0, + "grad_norm": 0.03290883990888194, + "language_loss": 0.81853932, + "learning_rate": 0.0006072648839616356, + "loss": 0.82907289, + "num_input_tokens_seen": 194358464, + "router_z_loss_mlp": 0.42138672, + "step": 2330, + "time_per_iteration": 2.707853078842163 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050692, + "balance_loss_mlp": 1.00861132, + "epoch": 0.44844170834936514, + "flos": 990273414912.0, + "grad_norm": 0.029288900679948552, + "language_loss": 0.83132529, + "learning_rate": 0.0006069605743485718, + "loss": 0.84183216, + "num_input_tokens_seen": 194456112, + "router_z_loss_mlp": 0.42114258, + "step": 2331, + "time_per_iteration": 3.347529649734497 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053351, + "balance_loss_mlp": 1.011127, + "epoch": 0.44863409003462873, + "flos": 592451647488.0, + "grad_norm": 0.033148459483392366, + "language_loss": 0.84139442, + "learning_rate": 0.0006066562232097303, + "loss": 0.85192794, + "num_input_tokens_seen": 194526880, + "router_z_loss_mlp": 0.42260742, + "step": 2332, + "time_per_iteration": 2.7059993743896484 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048256, + "balance_loss_mlp": 1.00600874, + "epoch": 0.44882647171989226, + "flos": 725985934080.0, + "grad_norm": 0.033171968523288915, + "language_loss": 0.86700636, + "learning_rate": 0.0006063518306632708, + "loss": 0.87748891, + "num_input_tokens_seen": 194606800, + "router_z_loss_mlp": 0.42285156, + "step": 2333, + "time_per_iteration": 2.9296460151672363 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048294, + "balance_loss_mlp": 1.00607038, + "epoch": 0.44901885340515585, + "flos": 535991852544.0, + "grad_norm": 0.03657763323068719, + "language_loss": 0.83056581, + "learning_rate": 0.0006060473968273688, + "loss": 0.84104872, + "num_input_tokens_seen": 194679856, + "router_z_loss_mlp": 0.42260742, + "step": 2334, + "time_per_iteration": 2.6368448734283447 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104747, + "balance_loss_mlp": 1.0070343, + "epoch": 0.4492112350904194, + "flos": 1558693526016.0, + "grad_norm": 0.008278759352477436, + "language_loss": 0.77879542, + "learning_rate": 0.000605742921820216, + "loss": 0.7892701, + "num_input_tokens_seen": 194906320, + "router_z_loss_mlp": 0.40429688, + "step": 2335, + "time_per_iteration": 4.866518497467041 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050045, + "balance_loss_mlp": 1.00951385, + "epoch": 0.44940361677568297, + "flos": 1526703660288.0, + "grad_norm": 0.009772749846677187, + "language_loss": 0.81005216, + "learning_rate": 0.0006054384057600202, + "loss": 0.82055259, + "num_input_tokens_seen": 195129152, + "router_z_loss_mlp": 0.40527344, + "step": 2336, + "time_per_iteration": 4.832434892654419 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049508, + "balance_loss_mlp": 1.00759399, + "epoch": 0.4495959984609465, + "flos": 383321387520.0, + "grad_norm": 0.039418428301582195, + "language_loss": 0.88819385, + "learning_rate": 0.0006051338487650047, + "loss": 0.89868897, + "num_input_tokens_seen": 195189792, + "router_z_loss_mlp": 0.41943359, + "step": 2337, + "time_per_iteration": 2.4261343479156494 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104738, + "balance_loss_mlp": 1.00537109, + "epoch": 0.4497883801462101, + "flos": 498883064064.0, + "grad_norm": 0.03829280299631375, + "language_loss": 0.83062887, + "learning_rate": 0.0006048292509534095, + "loss": 0.84110272, + "num_input_tokens_seen": 195258640, + "router_z_loss_mlp": 0.42041016, + "step": 2338, + "time_per_iteration": 2.5792438983917236 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046244, + "balance_loss_mlp": 1.00425851, + "epoch": 0.4499807618314736, + "flos": 615590812416.0, + "grad_norm": 0.03236488600067343, + "language_loss": 0.78186011, + "learning_rate": 0.0006045246124434895, + "loss": 0.79232258, + "num_input_tokens_seen": 195327984, + "router_z_loss_mlp": 0.42016602, + "step": 2339, + "time_per_iteration": 2.736332654953003 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049178, + "balance_loss_mlp": 1.00704992, + "epoch": 0.4501731435167372, + "flos": 1007068850688.0, + "grad_norm": 0.0336222564343559, + "language_loss": 0.8735106, + "learning_rate": 0.0006042199333535162, + "loss": 0.88400233, + "num_input_tokens_seen": 195409504, + "router_z_loss_mlp": 0.42163086, + "step": 2340, + "time_per_iteration": 3.3217411041259766 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048828, + "balance_loss_mlp": 1.0066278, + "epoch": 0.4503655252020008, + "flos": 822328806912.0, + "grad_norm": 0.031746848330129245, + "language_loss": 0.8445214, + "learning_rate": 0.0006039152138017763, + "loss": 0.85500968, + "num_input_tokens_seen": 195489424, + "router_z_loss_mlp": 0.42236328, + "step": 2341, + "time_per_iteration": 3.027831792831421 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046938, + "balance_loss_mlp": 1.00464213, + "epoch": 0.4505579068872643, + "flos": 487414576128.0, + "grad_norm": 0.03971234339866032, + "language_loss": 0.84330553, + "learning_rate": 0.0006036104539065726, + "loss": 0.85377491, + "num_input_tokens_seen": 195562128, + "router_z_loss_mlp": 0.42333984, + "step": 2342, + "time_per_iteration": 2.6650640964508057 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042622, + "balance_loss_mlp": 1.00030267, + "epoch": 0.4507502885725279, + "flos": 886336728576.0, + "grad_norm": 0.030953760348096254, + "language_loss": 0.8473978, + "learning_rate": 0.000603305653786223, + "loss": 0.85782403, + "num_input_tokens_seen": 195646800, + "router_z_loss_mlp": 0.42358398, + "step": 2343, + "time_per_iteration": 3.146728277206421 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045238, + "balance_loss_mlp": 1.00284708, + "epoch": 0.45094267025779144, + "flos": 579422016000.0, + "grad_norm": 0.032254310776320565, + "language_loss": 0.84862161, + "learning_rate": 0.0006030008135590622, + "loss": 0.859074, + "num_input_tokens_seen": 195719648, + "router_z_loss_mlp": 0.42431641, + "step": 2344, + "time_per_iteration": 2.716326951980591 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046244, + "balance_loss_mlp": 1.00387657, + "epoch": 0.45113505194305503, + "flos": 526442232576.0, + "grad_norm": 0.029625683171065443, + "language_loss": 0.81110835, + "learning_rate": 0.0006026959333434387, + "loss": 0.82157081, + "num_input_tokens_seen": 195794800, + "router_z_loss_mlp": 0.42407227, + "step": 2345, + "time_per_iteration": 2.757293939590454 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046277, + "balance_loss_mlp": 1.00379133, + "epoch": 0.45132743362831856, + "flos": 503116429824.0, + "grad_norm": 0.029442245536271623, + "language_loss": 0.77997512, + "learning_rate": 0.0006023910132577181, + "loss": 0.79043788, + "num_input_tokens_seen": 195866848, + "router_z_loss_mlp": 0.42529297, + "step": 2346, + "time_per_iteration": 2.6643226146698 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044307, + "balance_loss_mlp": 1.00201178, + "epoch": 0.45151981531358215, + "flos": 432836710656.0, + "grad_norm": 0.03508285710405181, + "language_loss": 0.85304409, + "learning_rate": 0.0006020860534202806, + "loss": 0.86348718, + "num_input_tokens_seen": 195930640, + "router_z_loss_mlp": 0.42333984, + "step": 2347, + "time_per_iteration": 2.508922815322876 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046743, + "balance_loss_mlp": 1.00444722, + "epoch": 0.4517121969988457, + "flos": 713494828800.0, + "grad_norm": 0.031320840574665956, + "language_loss": 0.81720173, + "learning_rate": 0.0006017810539495224, + "loss": 0.8276692, + "num_input_tokens_seen": 196014240, + "router_z_loss_mlp": 0.42333984, + "step": 2348, + "time_per_iteration": 2.916851282119751 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046985, + "balance_loss_mlp": 1.00459409, + "epoch": 0.45190457868410927, + "flos": 580557394176.0, + "grad_norm": 0.03199810496833265, + "language_loss": 0.82887936, + "learning_rate": 0.0006014760149638547, + "loss": 0.83934915, + "num_input_tokens_seen": 196083296, + "router_z_loss_mlp": 0.42431641, + "step": 2349, + "time_per_iteration": 2.6583147048950195 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044305, + "balance_loss_mlp": 1.00189018, + "epoch": 0.45209696036937286, + "flos": 483628363008.0, + "grad_norm": 0.034942038630734404, + "language_loss": 0.89322019, + "learning_rate": 0.000601170936581704, + "loss": 0.90366322, + "num_input_tokens_seen": 196147840, + "router_z_loss_mlp": 0.42456055, + "step": 2350, + "time_per_iteration": 2.5171234607696533 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051553, + "balance_loss_mlp": 1.00906706, + "epoch": 0.4522893420546364, + "flos": 541260474624.0, + "grad_norm": 0.03828852417675836, + "language_loss": 0.85383743, + "learning_rate": 0.0006008658189215121, + "loss": 0.86435294, + "num_input_tokens_seen": 196219008, + "router_z_loss_mlp": 0.42529297, + "step": 2351, + "time_per_iteration": 2.6463332176208496 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049763, + "balance_loss_mlp": 1.00725281, + "epoch": 0.4524817237399, + "flos": 497691305472.0, + "grad_norm": 0.039190213199739796, + "language_loss": 0.80507791, + "learning_rate": 0.0006005606621017366, + "loss": 0.81557548, + "num_input_tokens_seen": 196287792, + "router_z_loss_mlp": 0.42553711, + "step": 2352, + "time_per_iteration": 2.5637879371643066 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048153, + "balance_loss_mlp": 1.00597668, + "epoch": 0.4526741054251635, + "flos": 653841782016.0, + "grad_norm": 0.04275245206988235, + "language_loss": 0.80476063, + "learning_rate": 0.0006002554662408496, + "loss": 0.81524217, + "num_input_tokens_seen": 196371776, + "router_z_loss_mlp": 0.42211914, + "step": 2353, + "time_per_iteration": 2.8951141834259033 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047375, + "balance_loss_mlp": 1.00500786, + "epoch": 0.4528664871104271, + "flos": 572004146688.0, + "grad_norm": 0.03654890079235127, + "language_loss": 0.91683698, + "learning_rate": 0.0005999502314573388, + "loss": 0.92731076, + "num_input_tokens_seen": 196441840, + "router_z_loss_mlp": 0.42407227, + "step": 2354, + "time_per_iteration": 2.64512300491333 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051522, + "balance_loss_mlp": 1.00927448, + "epoch": 0.45305886879569063, + "flos": 459679463424.0, + "grad_norm": 0.03675635166201985, + "language_loss": 0.86984789, + "learning_rate": 0.0005996449578697066, + "loss": 0.88036311, + "num_input_tokens_seen": 196510464, + "router_z_loss_mlp": 0.42285156, + "step": 2355, + "time_per_iteration": 2.6577048301696777 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048974, + "balance_loss_mlp": 1.0069412, + "epoch": 0.4532512504809542, + "flos": 506207614464.0, + "grad_norm": 0.033984488129296754, + "language_loss": 0.81732345, + "learning_rate": 0.0005993396455964709, + "loss": 0.82781321, + "num_input_tokens_seen": 196583888, + "router_z_loss_mlp": 0.4206543, + "step": 2356, + "time_per_iteration": 2.7086563110351562 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048885, + "balance_loss_mlp": 1.0067569, + "epoch": 0.4534436321662178, + "flos": 583312241664.0, + "grad_norm": 0.03467705138292274, + "language_loss": 0.82385033, + "learning_rate": 0.0005990342947561647, + "loss": 0.8343392, + "num_input_tokens_seen": 196652816, + "router_z_loss_mlp": 0.42163086, + "step": 2357, + "time_per_iteration": 2.6705219745635986 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047018, + "balance_loss_mlp": 1.00484145, + "epoch": 0.45363601385148133, + "flos": 550773156096.0, + "grad_norm": 0.03186226313127573, + "language_loss": 0.78742826, + "learning_rate": 0.0005987289054673351, + "loss": 0.79789847, + "num_input_tokens_seen": 196720208, + "router_z_loss_mlp": 0.42211914, + "step": 2358, + "time_per_iteration": 2.6073710918426514 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105212, + "balance_loss_mlp": 1.01063538, + "epoch": 0.4538283955367449, + "flos": 1477793937408.0, + "grad_norm": 0.008894510659601113, + "language_loss": 0.76575738, + "learning_rate": 0.0005984234778485451, + "loss": 0.77627861, + "num_input_tokens_seen": 196947696, + "router_z_loss_mlp": 0.41503906, + "step": 2359, + "time_per_iteration": 4.796559810638428 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044324, + "balance_loss_mlp": 1.00245762, + "epoch": 0.45402077722200845, + "flos": 585797826048.0, + "grad_norm": 0.043889208643714143, + "language_loss": 0.91937214, + "learning_rate": 0.0005981180120183722, + "loss": 0.92981529, + "num_input_tokens_seen": 197015712, + "router_z_loss_mlp": 0.41894531, + "step": 2360, + "time_per_iteration": 2.6962461471557617 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104759, + "balance_loss_mlp": 1.00584316, + "epoch": 0.45421315890727204, + "flos": 532889974272.0, + "grad_norm": 0.05191452902852925, + "language_loss": 0.85740328, + "learning_rate": 0.0005978125080954089, + "loss": 0.86787915, + "num_input_tokens_seen": 197094880, + "router_z_loss_mlp": 0.41772461, + "step": 2361, + "time_per_iteration": 2.777160882949829 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049759, + "balance_loss_mlp": 1.00794065, + "epoch": 0.4544055405925356, + "flos": 786552728064.0, + "grad_norm": 0.0404371323010207, + "language_loss": 0.77941048, + "learning_rate": 0.000597506966198262, + "loss": 0.78990805, + "num_input_tokens_seen": 197176448, + "router_z_loss_mlp": 0.41845703, + "step": 2362, + "time_per_iteration": 2.9561667442321777 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048479, + "balance_loss_mlp": 1.00663614, + "epoch": 0.45459792227779916, + "flos": 519202252800.0, + "grad_norm": 0.0386377549927772, + "language_loss": 0.84570003, + "learning_rate": 0.0005972013864455536, + "loss": 0.85618478, + "num_input_tokens_seen": 197243520, + "router_z_loss_mlp": 0.41870117, + "step": 2363, + "time_per_iteration": 2.577075958251953 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049274, + "balance_loss_mlp": 1.00757432, + "epoch": 0.4547903039630627, + "flos": 538598946048.0, + "grad_norm": 0.03734609962487706, + "language_loss": 0.86156821, + "learning_rate": 0.0005968957689559203, + "loss": 0.87206089, + "num_input_tokens_seen": 197311536, + "router_z_loss_mlp": 0.41723633, + "step": 2364, + "time_per_iteration": 2.663912773132324 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047351, + "balance_loss_mlp": 1.00543737, + "epoch": 0.4549826856483263, + "flos": 529691864832.0, + "grad_norm": 0.03600076061776594, + "language_loss": 0.89443278, + "learning_rate": 0.0005965901138480131, + "loss": 0.90490627, + "num_input_tokens_seen": 197382752, + "router_z_loss_mlp": 0.41943359, + "step": 2365, + "time_per_iteration": 2.635735034942627 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048292, + "balance_loss_mlp": 1.00633037, + "epoch": 0.45517506733358987, + "flos": 521983345152.0, + "grad_norm": 0.04096543812015268, + "language_loss": 0.87860775, + "learning_rate": 0.0005962844212404982, + "loss": 0.88909072, + "num_input_tokens_seen": 197456592, + "router_z_loss_mlp": 0.41992188, + "step": 2366, + "time_per_iteration": 2.675039291381836 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049904, + "balance_loss_mlp": 1.00799048, + "epoch": 0.4553674490188534, + "flos": 452009827584.0, + "grad_norm": 0.02917585056549172, + "language_loss": 0.88090932, + "learning_rate": 0.0005959786912520558, + "loss": 0.89140838, + "num_input_tokens_seen": 197525408, + "router_z_loss_mlp": 0.41943359, + "step": 2367, + "time_per_iteration": 2.605693817138672 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046028, + "balance_loss_mlp": 1.00399494, + "epoch": 0.455559830704117, + "flos": 547745154816.0, + "grad_norm": 0.029185999772899627, + "language_loss": 0.84459692, + "learning_rate": 0.0005956729240013806, + "loss": 0.85505724, + "num_input_tokens_seen": 197608480, + "router_z_loss_mlp": 0.4206543, + "step": 2368, + "time_per_iteration": 2.792929172515869 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104665, + "balance_loss_mlp": 1.00447345, + "epoch": 0.4557522123893805, + "flos": 584866582272.0, + "grad_norm": 0.02991931447914949, + "language_loss": 0.92050606, + "learning_rate": 0.0005953671196071824, + "loss": 0.93097258, + "num_input_tokens_seen": 197678416, + "router_z_loss_mlp": 0.42211914, + "step": 2369, + "time_per_iteration": 2.7024593353271484 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052055, + "balance_loss_mlp": 1.00992644, + "epoch": 0.4559445940746441, + "flos": 527484291840.0, + "grad_norm": 0.03299201390628513, + "language_loss": 0.80723774, + "learning_rate": 0.0005950612781881846, + "loss": 0.81775832, + "num_input_tokens_seen": 197753424, + "router_z_loss_mlp": 0.42163086, + "step": 2370, + "time_per_iteration": 2.7288575172424316 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048274, + "balance_loss_mlp": 1.0061928, + "epoch": 0.45613697575990764, + "flos": 653368384512.0, + "grad_norm": 0.034012751150725565, + "language_loss": 0.76432264, + "learning_rate": 0.0005947553998631259, + "loss": 0.77480543, + "num_input_tokens_seen": 197832080, + "router_z_loss_mlp": 0.42114258, + "step": 2371, + "time_per_iteration": 2.865060567855835 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051777, + "balance_loss_mlp": 1.00976777, + "epoch": 0.4563293574451712, + "flos": 868624633344.0, + "grad_norm": 0.02789239974176414, + "language_loss": 0.79458821, + "learning_rate": 0.000594449484750758, + "loss": 0.80510592, + "num_input_tokens_seen": 197919536, + "router_z_loss_mlp": 0.42041016, + "step": 2372, + "time_per_iteration": 3.147550344467163 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044503, + "balance_loss_mlp": 1.00242209, + "epoch": 0.45652173913043476, + "flos": 499132885248.0, + "grad_norm": 0.03342359133343608, + "language_loss": 0.83513892, + "learning_rate": 0.0005941435329698484, + "loss": 0.84558398, + "num_input_tokens_seen": 197991872, + "router_z_loss_mlp": 0.42114258, + "step": 2373, + "time_per_iteration": 2.6924219131469727 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046399, + "balance_loss_mlp": 1.00441325, + "epoch": 0.45671412081569834, + "flos": 561959741952.0, + "grad_norm": 0.03267163379038315, + "language_loss": 0.83796972, + "learning_rate": 0.0005938375446391778, + "loss": 0.84843373, + "num_input_tokens_seen": 198063392, + "router_z_loss_mlp": 0.42016602, + "step": 2374, + "time_per_iteration": 2.731687307357788 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044896, + "balance_loss_mlp": 1.00281477, + "epoch": 0.45690650250096193, + "flos": 504123495936.0, + "grad_norm": 0.03711297965033783, + "language_loss": 0.89367199, + "learning_rate": 0.0005935315198775415, + "loss": 0.90412098, + "num_input_tokens_seen": 198131232, + "router_z_loss_mlp": 0.42114258, + "step": 2375, + "time_per_iteration": 2.679049015045166 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046184, + "balance_loss_mlp": 1.0040555, + "epoch": 0.45709888418622546, + "flos": 431599265280.0, + "grad_norm": 0.033405413713201326, + "language_loss": 0.87559128, + "learning_rate": 0.0005932254588037486, + "loss": 0.88605309, + "num_input_tokens_seen": 198194944, + "router_z_loss_mlp": 0.42163086, + "step": 2376, + "time_per_iteration": 2.5139987468719482 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045834, + "balance_loss_mlp": 1.00384891, + "epoch": 0.45729126587148905, + "flos": 526693999104.0, + "grad_norm": 0.034118342932564036, + "language_loss": 0.86638731, + "learning_rate": 0.000592919361536623, + "loss": 0.87684566, + "num_input_tokens_seen": 198265728, + "router_z_loss_mlp": 0.42016602, + "step": 2377, + "time_per_iteration": 2.652921438217163 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047367, + "balance_loss_mlp": 1.00545263, + "epoch": 0.4574836475567526, + "flos": 639148939776.0, + "grad_norm": 0.03214355149845838, + "language_loss": 0.89487022, + "learning_rate": 0.0005926132281950017, + "loss": 0.90534389, + "num_input_tokens_seen": 198336640, + "router_z_loss_mlp": 0.41943359, + "step": 2378, + "time_per_iteration": 2.7740533351898193 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050302, + "balance_loss_mlp": 1.00819683, + "epoch": 0.45767602924201617, + "flos": 650791426560.0, + "grad_norm": 0.03291422707035226, + "language_loss": 0.85368007, + "learning_rate": 0.0005923070588977367, + "loss": 0.86418307, + "num_input_tokens_seen": 198413552, + "router_z_loss_mlp": 0.42138672, + "step": 2379, + "time_per_iteration": 2.8456881046295166 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050793, + "balance_loss_mlp": 1.00873554, + "epoch": 0.4578684109272797, + "flos": 747963475968.0, + "grad_norm": 0.03509802642472786, + "language_loss": 0.86739749, + "learning_rate": 0.0005920008537636931, + "loss": 0.87790543, + "num_input_tokens_seen": 198490864, + "router_z_loss_mlp": 0.42089844, + "step": 2380, + "time_per_iteration": 2.910720109939575 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048858, + "balance_loss_mlp": 1.00692058, + "epoch": 0.4580607926125433, + "flos": 642729073152.0, + "grad_norm": 0.029242782263759974, + "language_loss": 0.87235177, + "learning_rate": 0.0005916946129117504, + "loss": 0.88284034, + "num_input_tokens_seen": 198571200, + "router_z_loss_mlp": 0.41967773, + "step": 2381, + "time_per_iteration": 2.8813161849975586 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051516, + "balance_loss_mlp": 1.00948262, + "epoch": 0.4582531742978069, + "flos": 803240260608.0, + "grad_norm": 0.03239264438363608, + "language_loss": 0.81130052, + "learning_rate": 0.0005913883364608017, + "loss": 0.82181567, + "num_input_tokens_seen": 198658624, + "router_z_loss_mlp": 0.4206543, + "step": 2382, + "time_per_iteration": 3.062751531600952 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105103, + "balance_loss_mlp": 1.00914025, + "epoch": 0.4584455559830704, + "flos": 685518643200.0, + "grad_norm": 0.031797549541833704, + "language_loss": 0.88895178, + "learning_rate": 0.0005910820245297542, + "loss": 0.8994621, + "num_input_tokens_seen": 198731312, + "router_z_loss_mlp": 0.41918945, + "step": 2383, + "time_per_iteration": 2.8653757572174072 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045812, + "balance_loss_mlp": 1.00387442, + "epoch": 0.458637937668334, + "flos": 519282932736.0, + "grad_norm": 0.03550111139800055, + "language_loss": 0.80986464, + "learning_rate": 0.000590775677237529, + "loss": 0.82032269, + "num_input_tokens_seen": 198805296, + "router_z_loss_mlp": 0.41967773, + "step": 2384, + "time_per_iteration": 2.7324440479278564 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046116, + "balance_loss_mlp": 1.0042969, + "epoch": 0.4588303193535975, + "flos": 506533257984.0, + "grad_norm": 0.03366806840699952, + "language_loss": 0.80683196, + "learning_rate": 0.0005904692947030601, + "loss": 0.81729311, + "num_input_tokens_seen": 198872112, + "router_z_loss_mlp": 0.41845703, + "step": 2385, + "time_per_iteration": 2.5837819576263428 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043605, + "balance_loss_mlp": 1.00176287, + "epoch": 0.4590227010388611, + "flos": 496909761024.0, + "grad_norm": 0.03855013464211847, + "language_loss": 0.89966094, + "learning_rate": 0.0005901628770452963, + "loss": 0.91009706, + "num_input_tokens_seen": 198938480, + "router_z_loss_mlp": 0.41870117, + "step": 2386, + "time_per_iteration": 2.60300350189209 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043124, + "balance_loss_mlp": 1.00132906, + "epoch": 0.45921508272412465, + "flos": 494602066176.0, + "grad_norm": 0.034718704885035666, + "language_loss": 0.87768519, + "learning_rate": 0.000589856424383199, + "loss": 0.88811642, + "num_input_tokens_seen": 199008608, + "router_z_loss_mlp": 0.41821289, + "step": 2387, + "time_per_iteration": 2.6108267307281494 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044169, + "balance_loss_mlp": 1.00232685, + "epoch": 0.45940746440938823, + "flos": 692593372416.0, + "grad_norm": 0.03330437261727838, + "language_loss": 0.83652228, + "learning_rate": 0.000589549936835744, + "loss": 0.846964, + "num_input_tokens_seen": 199084592, + "router_z_loss_mlp": 0.41870117, + "step": 2388, + "time_per_iteration": 2.8968546390533447 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104723, + "balance_loss_mlp": 1.00545883, + "epoch": 0.45959984609465176, + "flos": 504737844480.0, + "grad_norm": 0.03238722342606361, + "language_loss": 0.79404306, + "learning_rate": 0.0005892434145219202, + "loss": 0.80451536, + "num_input_tokens_seen": 199151504, + "router_z_loss_mlp": 0.41796875, + "step": 2389, + "time_per_iteration": 2.6019601821899414 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045231, + "balance_loss_mlp": 1.00350749, + "epoch": 0.45979222777991535, + "flos": 677840259072.0, + "grad_norm": 0.03571192687498619, + "language_loss": 0.83136904, + "learning_rate": 0.0005889368575607303, + "loss": 0.84182131, + "num_input_tokens_seen": 199224528, + "router_z_loss_mlp": 0.41748047, + "step": 2390, + "time_per_iteration": 2.8418307304382324 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042843, + "balance_loss_mlp": 1.00107241, + "epoch": 0.45998460946517894, + "flos": 779039594496.0, + "grad_norm": 0.031212653964934608, + "language_loss": 0.79287618, + "learning_rate": 0.00058863026607119, + "loss": 0.80330467, + "num_input_tokens_seen": 199312512, + "router_z_loss_mlp": 0.41796875, + "step": 2391, + "time_per_iteration": 3.0931389331817627 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045182, + "balance_loss_mlp": 1.00333977, + "epoch": 0.46017699115044247, + "flos": 853022901504.0, + "grad_norm": 0.035796836390277, + "language_loss": 0.80142331, + "learning_rate": 0.0005883236401723287, + "loss": 0.8118751, + "num_input_tokens_seen": 199397216, + "router_z_loss_mlp": 0.41870117, + "step": 2392, + "time_per_iteration": 3.170374631881714 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044116, + "balance_loss_mlp": 1.00222623, + "epoch": 0.46036937283570606, + "flos": 576964621824.0, + "grad_norm": 0.03330985308732758, + "language_loss": 0.84980971, + "learning_rate": 0.0005880169799831893, + "loss": 0.86025083, + "num_input_tokens_seen": 199464288, + "router_z_loss_mlp": 0.41918945, + "step": 2393, + "time_per_iteration": 2.693976879119873 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048284, + "balance_loss_mlp": 1.00641727, + "epoch": 0.4605617545209696, + "flos": 613120779264.0, + "grad_norm": 0.03386951364717573, + "language_loss": 0.82288468, + "learning_rate": 0.0005877102856228278, + "loss": 0.83336759, + "num_input_tokens_seen": 199538096, + "router_z_loss_mlp": 0.41894531, + "step": 2394, + "time_per_iteration": 2.8137876987457275 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104836, + "balance_loss_mlp": 1.0063504, + "epoch": 0.4607541362062332, + "flos": 534159500544.0, + "grad_norm": 0.06543347642857557, + "language_loss": 0.85095239, + "learning_rate": 0.0005874035572103133, + "loss": 0.86143595, + "num_input_tokens_seen": 199609504, + "router_z_loss_mlp": 0.42041016, + "step": 2395, + "time_per_iteration": 2.6604816913604736 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046268, + "balance_loss_mlp": 1.0043298, + "epoch": 0.4609465178914967, + "flos": 648474983424.0, + "grad_norm": 0.04503809754512356, + "language_loss": 0.83026469, + "learning_rate": 0.0005870967948647288, + "loss": 0.84072733, + "num_input_tokens_seen": 199678960, + "router_z_loss_mlp": 0.41967773, + "step": 2396, + "time_per_iteration": 2.8022336959838867 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047306, + "balance_loss_mlp": 1.00658417, + "epoch": 0.4611388995767603, + "flos": 1469501204736.0, + "grad_norm": 0.004136605290049959, + "language_loss": 0.743083, + "learning_rate": 0.0005867899987051693, + "loss": 0.75355613, + "num_input_tokens_seen": 199903568, + "router_z_loss_mlp": 0.40722656, + "step": 2397, + "time_per_iteration": 5.5826334953308105 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045565, + "balance_loss_mlp": 1.00350773, + "epoch": 0.46133128126202383, + "flos": 724477280256.0, + "grad_norm": 0.03194619056097999, + "language_loss": 0.86316049, + "learning_rate": 0.0005864831688507443, + "loss": 0.8736161, + "num_input_tokens_seen": 199988672, + "router_z_loss_mlp": 0.42089844, + "step": 2398, + "time_per_iteration": 3.0160725116729736 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051143, + "balance_loss_mlp": 1.00903809, + "epoch": 0.4615236629472874, + "flos": 549114802944.0, + "grad_norm": 0.0336665595141197, + "language_loss": 0.75746781, + "learning_rate": 0.0005861763054205754, + "loss": 0.76797926, + "num_input_tokens_seen": 200062304, + "router_z_loss_mlp": 0.42138672, + "step": 2399, + "time_per_iteration": 2.7720346450805664 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052341, + "balance_loss_mlp": 1.01011705, + "epoch": 0.461716044632551, + "flos": 603460343808.0, + "grad_norm": 0.030278987672658065, + "language_loss": 0.80694187, + "learning_rate": 0.0005858694085337976, + "loss": 0.81746531, + "num_input_tokens_seen": 200138464, + "router_z_loss_mlp": 0.42260742, + "step": 2400, + "time_per_iteration": 2.790825366973877 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049426, + "balance_loss_mlp": 1.00722611, + "epoch": 0.46190842631781454, + "flos": 475437697536.0, + "grad_norm": 0.03561782978750914, + "language_loss": 0.83960855, + "learning_rate": 0.0005855624783095589, + "loss": 0.85010278, + "num_input_tokens_seen": 200205728, + "router_z_loss_mlp": 0.42236328, + "step": 2401, + "time_per_iteration": 2.5512595176696777 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051216, + "balance_loss_mlp": 1.00930238, + "epoch": 0.4621008080030781, + "flos": 438402786048.0, + "grad_norm": 0.034731386600305836, + "language_loss": 0.85895813, + "learning_rate": 0.00058525551486702, + "loss": 0.86947024, + "num_input_tokens_seen": 200269824, + "router_z_loss_mlp": 0.41943359, + "step": 2402, + "time_per_iteration": 2.5168349742889404 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049789, + "balance_loss_mlp": 1.0077796, + "epoch": 0.46229318968834165, + "flos": 526498612992.0, + "grad_norm": 0.03903258697063272, + "language_loss": 0.81848848, + "learning_rate": 0.0005849485183253548, + "loss": 0.82898641, + "num_input_tokens_seen": 200341264, + "router_z_loss_mlp": 0.42041016, + "step": 2403, + "time_per_iteration": 2.640596389770508 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043904, + "balance_loss_mlp": 1.00213277, + "epoch": 0.46248557137360524, + "flos": 440534536704.0, + "grad_norm": 0.0318215105397156, + "language_loss": 0.87703103, + "learning_rate": 0.0005846414888037501, + "loss": 0.88747007, + "num_input_tokens_seen": 200405632, + "router_z_loss_mlp": 0.41796875, + "step": 2404, + "time_per_iteration": 2.4814634323120117 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046869, + "balance_loss_mlp": 1.00516927, + "epoch": 0.4626779530588688, + "flos": 618773370624.0, + "grad_norm": 0.036713203920182555, + "language_loss": 0.8266353, + "learning_rate": 0.0005843344264214049, + "loss": 0.83710396, + "num_input_tokens_seen": 200479312, + "router_z_loss_mlp": 0.41723633, + "step": 2405, + "time_per_iteration": 2.7493507862091064 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046289, + "balance_loss_mlp": 1.00461316, + "epoch": 0.46287033474413236, + "flos": 671360436480.0, + "grad_norm": 0.031131832431387497, + "language_loss": 0.85281026, + "learning_rate": 0.0005840273312975317, + "loss": 0.86327314, + "num_input_tokens_seen": 200552976, + "router_z_loss_mlp": 0.41699219, + "step": 2406, + "time_per_iteration": 2.8235156536102295 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045049, + "balance_loss_mlp": 1.00332618, + "epoch": 0.46306271642939595, + "flos": 481199159040.0, + "grad_norm": 0.037353418102982906, + "language_loss": 0.90573472, + "learning_rate": 0.0005837202035513555, + "loss": 0.91618526, + "num_input_tokens_seen": 200621088, + "router_z_loss_mlp": 0.41748047, + "step": 2407, + "time_per_iteration": 2.5672457218170166 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043171, + "balance_loss_mlp": 1.001472, + "epoch": 0.4632550981146595, + "flos": 581858022912.0, + "grad_norm": 0.03272683029516706, + "language_loss": 0.81903768, + "learning_rate": 0.0005834130433021136, + "loss": 0.82946944, + "num_input_tokens_seen": 200698400, + "router_z_loss_mlp": 0.41723633, + "step": 2408, + "time_per_iteration": 4.229294538497925 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042173, + "balance_loss_mlp": 1.00044954, + "epoch": 0.46344747979992307, + "flos": 525018149376.0, + "grad_norm": 0.030754893265702864, + "language_loss": 0.73835284, + "learning_rate": 0.0005831058506690563, + "loss": 0.74877453, + "num_input_tokens_seen": 200767264, + "router_z_loss_mlp": 0.41748047, + "step": 2409, + "time_per_iteration": 2.614616632461548 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043301, + "balance_loss_mlp": 1.00183976, + "epoch": 0.4636398614851866, + "flos": 747813776640.0, + "grad_norm": 0.03608107183813509, + "language_loss": 0.86105043, + "learning_rate": 0.0005827986257714464, + "loss": 0.87148345, + "num_input_tokens_seen": 200841440, + "router_z_loss_mlp": 0.41479492, + "step": 2410, + "time_per_iteration": 2.953162670135498 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051055, + "balance_loss_mlp": 1.00935507, + "epoch": 0.4638322431704502, + "flos": 597646392576.0, + "grad_norm": 0.032192415237476964, + "language_loss": 0.89042687, + "learning_rate": 0.0005824913687285591, + "loss": 0.90093744, + "num_input_tokens_seen": 200911296, + "router_z_loss_mlp": 0.41723633, + "step": 2411, + "time_per_iteration": 2.685081958770752 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045225, + "balance_loss_mlp": 1.00357294, + "epoch": 0.4640246248557137, + "flos": 540533365248.0, + "grad_norm": 0.03324810257023632, + "language_loss": 0.82180583, + "learning_rate": 0.0005821840796596821, + "loss": 0.83225811, + "num_input_tokens_seen": 200981920, + "router_z_loss_mlp": 0.41674805, + "step": 2412, + "time_per_iteration": 2.7183375358581543 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045715, + "balance_loss_mlp": 1.00403953, + "epoch": 0.4642170065409773, + "flos": 563809590528.0, + "grad_norm": 0.030050486484180242, + "language_loss": 0.80926406, + "learning_rate": 0.0005818767586841158, + "loss": 0.81972128, + "num_input_tokens_seen": 201059392, + "router_z_loss_mlp": 0.41699219, + "step": 2413, + "time_per_iteration": 2.7701165676116943 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050594, + "balance_loss_mlp": 1.00884688, + "epoch": 0.46440938822624084, + "flos": 532062743040.0, + "grad_norm": 0.027541485530404662, + "language_loss": 0.86138541, + "learning_rate": 0.0005815694059211726, + "loss": 0.87189138, + "num_input_tokens_seen": 201130192, + "router_z_loss_mlp": 0.41772461, + "step": 2414, + "time_per_iteration": 2.668760061264038 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104752, + "balance_loss_mlp": 1.00717926, + "epoch": 0.4646017699115044, + "flos": 1529627649024.0, + "grad_norm": 0.008676045744997887, + "language_loss": 0.80873632, + "learning_rate": 0.0005812620214901778, + "loss": 0.81921148, + "num_input_tokens_seen": 201354720, + "router_z_loss_mlp": 0.40332031, + "step": 2415, + "time_per_iteration": 4.801916599273682 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054371, + "balance_loss_mlp": 1.01403046, + "epoch": 0.464794151596768, + "flos": 1544174682624.0, + "grad_norm": 0.009441918844152984, + "language_loss": 0.7694506, + "learning_rate": 0.000580954605510468, + "loss": 0.77999437, + "num_input_tokens_seen": 201592096, + "router_z_loss_mlp": 0.40332031, + "step": 2416, + "time_per_iteration": 4.990759372711182 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045947, + "balance_loss_mlp": 1.0040803, + "epoch": 0.46498653328203154, + "flos": 502539019776.0, + "grad_norm": 0.03083676606802021, + "language_loss": 0.86654723, + "learning_rate": 0.0005806471581013931, + "loss": 0.87700671, + "num_input_tokens_seen": 201666160, + "router_z_loss_mlp": 0.41894531, + "step": 2417, + "time_per_iteration": 2.6697516441345215 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046587, + "balance_loss_mlp": 1.00452995, + "epoch": 0.46517891496729513, + "flos": 677301732864.0, + "grad_norm": 0.03671323650301262, + "language_loss": 0.79226685, + "learning_rate": 0.0005803396793823146, + "loss": 0.80273271, + "num_input_tokens_seen": 201733552, + "router_z_loss_mlp": 0.42089844, + "step": 2418, + "time_per_iteration": 2.8375697135925293 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054201, + "balance_loss_mlp": 1.01212037, + "epoch": 0.46537129665255866, + "flos": 586512296448.0, + "grad_norm": 0.037063881541601694, + "language_loss": 0.86435425, + "learning_rate": 0.0005800321694726065, + "loss": 0.87489623, + "num_input_tokens_seen": 201806128, + "router_z_loss_mlp": 0.42114258, + "step": 2419, + "time_per_iteration": 2.7743778228759766 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053515, + "balance_loss_mlp": 1.01136279, + "epoch": 0.46556367833782225, + "flos": 588821936640.0, + "grad_norm": 0.0340005426894483, + "language_loss": 0.87128568, + "learning_rate": 0.0005797246284916545, + "loss": 0.8818208, + "num_input_tokens_seen": 201874224, + "router_z_loss_mlp": 0.421875, + "step": 2420, + "time_per_iteration": 2.6835851669311523 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049313, + "balance_loss_mlp": 1.00878143, + "epoch": 0.4657560600230858, + "flos": 1488584893440.0, + "grad_norm": 0.006163961209168608, + "language_loss": 0.77505189, + "learning_rate": 0.0005794170565588569, + "loss": 0.78554499, + "num_input_tokens_seen": 202111648, + "router_z_loss_mlp": 0.40527344, + "step": 2421, + "time_per_iteration": 4.943193197250366 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047739, + "balance_loss_mlp": 1.00570607, + "epoch": 0.46594844170834937, + "flos": 581393373696.0, + "grad_norm": 0.03388172676180004, + "language_loss": 0.8850925, + "learning_rate": 0.0005791094537936233, + "loss": 0.89556992, + "num_input_tokens_seen": 202183344, + "router_z_loss_mlp": 0.4206543, + "step": 2422, + "time_per_iteration": 2.694913148880005 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047655, + "balance_loss_mlp": 1.00559843, + "epoch": 0.4661408233936129, + "flos": 513571048704.0, + "grad_norm": 0.036220885297141736, + "language_loss": 0.82194817, + "learning_rate": 0.0005788018203153762, + "loss": 0.83242476, + "num_input_tokens_seen": 202252512, + "router_z_loss_mlp": 0.42089844, + "step": 2423, + "time_per_iteration": 2.582130193710327 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104833, + "balance_loss_mlp": 1.006392, + "epoch": 0.4663332050788765, + "flos": 492033856512.0, + "grad_norm": 0.03516767090589214, + "language_loss": 0.86157548, + "learning_rate": 0.000578494156243549, + "loss": 0.87205875, + "num_input_tokens_seen": 202320096, + "router_z_loss_mlp": 0.41967773, + "step": 2424, + "time_per_iteration": 2.569465160369873 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047846, + "balance_loss_mlp": 1.0060271, + "epoch": 0.4665255867641401, + "flos": 513708109056.0, + "grad_norm": 0.03097112252036683, + "language_loss": 0.89247042, + "learning_rate": 0.0005781864616975878, + "loss": 0.90294886, + "num_input_tokens_seen": 202391552, + "router_z_loss_mlp": 0.41845703, + "step": 2425, + "time_per_iteration": 2.6580159664154053 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043502, + "balance_loss_mlp": 1.00175464, + "epoch": 0.4667179684494036, + "flos": 425707546368.0, + "grad_norm": 0.0331787429652153, + "language_loss": 0.84786129, + "learning_rate": 0.0005778787367969502, + "loss": 0.85829628, + "num_input_tokens_seen": 202457328, + "router_z_loss_mlp": 0.41772461, + "step": 2426, + "time_per_iteration": 2.577146291732788 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046987, + "balance_loss_mlp": 1.00526416, + "epoch": 0.4669103501346672, + "flos": 709224524544.0, + "grad_norm": 0.030186535385466236, + "language_loss": 0.81415391, + "learning_rate": 0.0005775709816611053, + "loss": 0.82462376, + "num_input_tokens_seen": 202535888, + "router_z_loss_mlp": 0.41748047, + "step": 2427, + "time_per_iteration": 2.946763515472412 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044811, + "balance_loss_mlp": 1.00294447, + "epoch": 0.4671027318199307, + "flos": 555946513920.0, + "grad_norm": 0.029160974795623382, + "language_loss": 0.83887118, + "learning_rate": 0.0005772631964095346, + "loss": 0.84931928, + "num_input_tokens_seen": 202608400, + "router_z_loss_mlp": 0.41894531, + "step": 2428, + "time_per_iteration": 2.7246575355529785 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047997, + "balance_loss_mlp": 1.0062499, + "epoch": 0.4672951135051943, + "flos": 568196546304.0, + "grad_norm": 0.03470882192857659, + "language_loss": 0.86100912, + "learning_rate": 0.000576955381161731, + "loss": 0.87148911, + "num_input_tokens_seen": 202677712, + "router_z_loss_mlp": 0.41772461, + "step": 2429, + "time_per_iteration": 2.6618916988372803 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051296, + "balance_loss_mlp": 1.00959647, + "epoch": 0.46748749519045785, + "flos": 425418841344.0, + "grad_norm": 0.034295751127670006, + "language_loss": 0.86858582, + "learning_rate": 0.0005766475360371985, + "loss": 0.87909877, + "num_input_tokens_seen": 202743824, + "router_z_loss_mlp": 0.41723633, + "step": 2430, + "time_per_iteration": 2.6010043621063232 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048678, + "balance_loss_mlp": 1.00697899, + "epoch": 0.46767987687572143, + "flos": 539371742208.0, + "grad_norm": 0.034969896754344705, + "language_loss": 0.85521102, + "learning_rate": 0.0005763396611554536, + "loss": 0.86569786, + "num_input_tokens_seen": 202813072, + "router_z_loss_mlp": 0.41723633, + "step": 2431, + "time_per_iteration": 2.6345412731170654 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045821, + "balance_loss_mlp": 1.00409806, + "epoch": 0.467872258560985, + "flos": 825076851456.0, + "grad_norm": 0.03589185796451142, + "language_loss": 0.80950278, + "learning_rate": 0.0005760317566360237, + "loss": 0.81996095, + "num_input_tokens_seen": 202886576, + "router_z_loss_mlp": 0.41748047, + "step": 2432, + "time_per_iteration": 3.0410006046295166 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050036, + "balance_loss_mlp": 1.0083127, + "epoch": 0.46806464024624855, + "flos": 662854821120.0, + "grad_norm": 0.03375923289076794, + "language_loss": 0.86271471, + "learning_rate": 0.000575723822598448, + "loss": 0.87321508, + "num_input_tokens_seen": 202956736, + "router_z_loss_mlp": 0.41748047, + "step": 2433, + "time_per_iteration": 2.7712388038635254 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044282, + "balance_loss_mlp": 1.00251079, + "epoch": 0.46825702193151214, + "flos": 757055249664.0, + "grad_norm": 0.029730946872360612, + "language_loss": 0.82302332, + "learning_rate": 0.0005754158591622773, + "loss": 0.83346617, + "num_input_tokens_seen": 203036432, + "router_z_loss_mlp": 0.41796875, + "step": 2434, + "time_per_iteration": 2.9708468914031982 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049984, + "balance_loss_mlp": 1.00818896, + "epoch": 0.4684494036167757, + "flos": 440310960384.0, + "grad_norm": 0.03563934149764459, + "language_loss": 0.83011699, + "learning_rate": 0.0005751078664470732, + "loss": 0.84061682, + "num_input_tokens_seen": 203101904, + "router_z_loss_mlp": 0.41821289, + "step": 2435, + "time_per_iteration": 2.5696167945861816 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046457, + "balance_loss_mlp": 1.00468564, + "epoch": 0.46864178530203926, + "flos": 533749286400.0, + "grad_norm": 0.031914354194682755, + "language_loss": 0.86557531, + "learning_rate": 0.0005747998445724094, + "loss": 0.87603986, + "num_input_tokens_seen": 203170272, + "router_z_loss_mlp": 0.41796875, + "step": 2436, + "time_per_iteration": 2.6336376667022705 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047101, + "balance_loss_mlp": 1.00535429, + "epoch": 0.4688341669873028, + "flos": 577826846208.0, + "grad_norm": 0.03221336233810001, + "language_loss": 0.89470494, + "learning_rate": 0.0005744917936578707, + "loss": 0.90517592, + "num_input_tokens_seen": 203243920, + "router_z_loss_mlp": 0.41772461, + "step": 2437, + "time_per_iteration": 2.7748000621795654 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054309, + "balance_loss_mlp": 1.0126332, + "epoch": 0.4690265486725664, + "flos": 540718057728.0, + "grad_norm": 0.029623138174113085, + "language_loss": 0.84520715, + "learning_rate": 0.0005741837138230526, + "loss": 0.85575026, + "num_input_tokens_seen": 203321760, + "router_z_loss_mlp": 0.41699219, + "step": 2438, + "time_per_iteration": 2.717194080352783 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047928, + "balance_loss_mlp": 1.0061574, + "epoch": 0.4692189303578299, + "flos": 771882240000.0, + "grad_norm": 0.03250588789777806, + "language_loss": 0.86937356, + "learning_rate": 0.0005738756051875627, + "loss": 0.87985283, + "num_input_tokens_seen": 203409088, + "router_z_loss_mlp": 0.41796875, + "step": 2439, + "time_per_iteration": 3.0656278133392334 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050138, + "balance_loss_mlp": 1.00846255, + "epoch": 0.4694113120430935, + "flos": 572514482688.0, + "grad_norm": 0.03167805631394848, + "language_loss": 0.84031767, + "learning_rate": 0.0005735674678710192, + "loss": 0.85081905, + "num_input_tokens_seen": 203481680, + "router_z_loss_mlp": 0.41699219, + "step": 2440, + "time_per_iteration": 2.6962802410125732 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010485, + "balance_loss_mlp": 1.00675285, + "epoch": 0.4696036937283571, + "flos": 750095226624.0, + "grad_norm": 0.037443971636707395, + "language_loss": 0.82144701, + "learning_rate": 0.0005732593019930517, + "loss": 0.83193195, + "num_input_tokens_seen": 203554848, + "router_z_loss_mlp": 0.41772461, + "step": 2441, + "time_per_iteration": 2.9041428565979004 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050874, + "balance_loss_mlp": 1.00915074, + "epoch": 0.4697960754136206, + "flos": 494443618560.0, + "grad_norm": 0.033679899008564836, + "language_loss": 0.87957233, + "learning_rate": 0.0005729511076733008, + "loss": 0.89008105, + "num_input_tokens_seen": 203624816, + "router_z_loss_mlp": 0.41748047, + "step": 2442, + "time_per_iteration": 2.6734514236450195 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01056181, + "balance_loss_mlp": 1.01433861, + "epoch": 0.4699884570988842, + "flos": 726361155072.0, + "grad_norm": 0.036289078656904894, + "language_loss": 0.85521489, + "learning_rate": 0.000572642885031418, + "loss": 0.86577672, + "num_input_tokens_seen": 203698256, + "router_z_loss_mlp": 0.41870117, + "step": 2443, + "time_per_iteration": 2.9099576473236084 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052417, + "balance_loss_mlp": 1.01062274, + "epoch": 0.47018083878414774, + "flos": 556578359040.0, + "grad_norm": 0.03125880297204364, + "language_loss": 0.81027329, + "learning_rate": 0.0005723346341870662, + "loss": 0.82079738, + "num_input_tokens_seen": 203772672, + "router_z_loss_mlp": 0.41821289, + "step": 2444, + "time_per_iteration": 2.7017409801483154 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046603, + "balance_loss_mlp": 1.00480783, + "epoch": 0.4703732204694113, + "flos": 424962940416.0, + "grad_norm": 0.03329454905005034, + "language_loss": 0.86812586, + "learning_rate": 0.0005720263552599188, + "loss": 0.8785919, + "num_input_tokens_seen": 203835904, + "router_z_loss_mlp": 0.41821289, + "step": 2445, + "time_per_iteration": 2.462155818939209 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044259, + "balance_loss_mlp": 1.00239313, + "epoch": 0.47056560215467486, + "flos": 704756888832.0, + "grad_norm": 0.03166905827629482, + "language_loss": 0.80339378, + "learning_rate": 0.0005717180483696604, + "loss": 0.81383634, + "num_input_tokens_seen": 203914704, + "router_z_loss_mlp": 0.41894531, + "step": 2446, + "time_per_iteration": 2.8927905559539795 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043265, + "balance_loss_mlp": 1.00115991, + "epoch": 0.47075798383993844, + "flos": 556013587968.0, + "grad_norm": 0.03197533000624638, + "language_loss": 0.8331126, + "learning_rate": 0.0005714097136359862, + "loss": 0.8435452, + "num_input_tokens_seen": 203985072, + "router_z_loss_mlp": 0.42138672, + "step": 2447, + "time_per_iteration": 2.632544994354248 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043755, + "balance_loss_mlp": 1.00169826, + "epoch": 0.470950365525202, + "flos": 565494188544.0, + "grad_norm": 0.028044805803111937, + "language_loss": 0.87163484, + "learning_rate": 0.0005711013511786027, + "loss": 0.88207239, + "num_input_tokens_seen": 204061904, + "router_z_loss_mlp": 0.42089844, + "step": 2448, + "time_per_iteration": 2.781325578689575 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049475, + "balance_loss_mlp": 1.00768065, + "epoch": 0.47114274721046556, + "flos": 535499013120.0, + "grad_norm": 0.029728682222295192, + "language_loss": 0.84444499, + "learning_rate": 0.0005707929611172263, + "loss": 0.8549397, + "num_input_tokens_seen": 204137392, + "router_z_loss_mlp": 0.41821289, + "step": 2449, + "time_per_iteration": 2.704754114151001 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104782, + "balance_loss_mlp": 1.00576317, + "epoch": 0.47133512889572915, + "flos": 474078743040.0, + "grad_norm": 0.03341999970225476, + "language_loss": 0.84505057, + "learning_rate": 0.000570484543571585, + "loss": 0.85552877, + "num_input_tokens_seen": 204202752, + "router_z_loss_mlp": 0.42089844, + "step": 2450, + "time_per_iteration": 2.56648850440979 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043207, + "balance_loss_mlp": 1.00129259, + "epoch": 0.4715275105809927, + "flos": 459968168448.0, + "grad_norm": 0.03640704052870178, + "language_loss": 0.83504367, + "learning_rate": 0.0005701760986614171, + "loss": 0.84547579, + "num_input_tokens_seen": 204266960, + "router_z_loss_mlp": 0.41943359, + "step": 2451, + "time_per_iteration": 2.5392374992370605 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047118, + "balance_loss_mlp": 1.00522745, + "epoch": 0.47171989226625627, + "flos": 422887570176.0, + "grad_norm": 0.0300201122524448, + "language_loss": 0.87997985, + "learning_rate": 0.0005698676265064714, + "loss": 0.89045107, + "num_input_tokens_seen": 204331216, + "router_z_loss_mlp": 0.41918945, + "step": 2452, + "time_per_iteration": 2.501518487930298 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045823, + "balance_loss_mlp": 1.00378954, + "epoch": 0.4719122739515198, + "flos": 458376889344.0, + "grad_norm": 0.036567202146268483, + "language_loss": 0.89326543, + "learning_rate": 0.0005695591272265074, + "loss": 0.90372366, + "num_input_tokens_seen": 204397216, + "router_z_loss_mlp": 0.4206543, + "step": 2453, + "time_per_iteration": 2.5203113555908203 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049066, + "balance_loss_mlp": 1.00703239, + "epoch": 0.4721046556367834, + "flos": 516017749248.0, + "grad_norm": 0.03590555599096038, + "language_loss": 0.82296801, + "learning_rate": 0.0005692506009412954, + "loss": 0.83345866, + "num_input_tokens_seen": 204469952, + "router_z_loss_mlp": 0.4206543, + "step": 2454, + "time_per_iteration": 2.703277826309204 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050072, + "balance_loss_mlp": 1.00982666, + "epoch": 0.4722970373220469, + "flos": 1575706702080.0, + "grad_norm": 0.007700978657663942, + "language_loss": 0.7755127, + "learning_rate": 0.0005689420477706156, + "loss": 0.78601336, + "num_input_tokens_seen": 204701152, + "router_z_loss_mlp": 0.40234375, + "step": 2455, + "time_per_iteration": 4.935078859329224 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045884, + "balance_loss_mlp": 1.00380278, + "epoch": 0.4724894190073105, + "flos": 587395908096.0, + "grad_norm": 0.032995428661028114, + "language_loss": 0.90020776, + "learning_rate": 0.0005686334678342593, + "loss": 0.91066664, + "num_input_tokens_seen": 204778144, + "router_z_loss_mlp": 0.42114258, + "step": 2456, + "time_per_iteration": 2.913954019546509 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104492, + "balance_loss_mlp": 1.00291097, + "epoch": 0.4726818006925741, + "flos": 869073731328.0, + "grad_norm": 0.0323844824027511, + "language_loss": 0.82033843, + "learning_rate": 0.0005683248612520274, + "loss": 0.83078766, + "num_input_tokens_seen": 204853376, + "router_z_loss_mlp": 0.42041016, + "step": 2457, + "time_per_iteration": 4.4027345180511475 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104796, + "balance_loss_mlp": 1.0055697, + "epoch": 0.4728741823778376, + "flos": 754228470528.0, + "grad_norm": 0.03548497467281451, + "language_loss": 0.84315181, + "learning_rate": 0.0005680162281437321, + "loss": 0.85363138, + "num_input_tokens_seen": 204925280, + "router_z_loss_mlp": 0.42431641, + "step": 2458, + "time_per_iteration": 2.8824384212493896 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048551, + "balance_loss_mlp": 1.00649393, + "epoch": 0.4730665640631012, + "flos": 539658501888.0, + "grad_norm": 0.029540383226657484, + "language_loss": 0.85216498, + "learning_rate": 0.000567707568629195, + "loss": 0.86265045, + "num_input_tokens_seen": 205000592, + "router_z_loss_mlp": 0.42089844, + "step": 2459, + "time_per_iteration": 2.7024879455566406 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105311, + "balance_loss_mlp": 1.01088595, + "epoch": 0.47325894574836475, + "flos": 492683198208.0, + "grad_norm": 0.02914158825310119, + "language_loss": 0.8318013, + "learning_rate": 0.0005673988828282486, + "loss": 0.84233236, + "num_input_tokens_seen": 205073968, + "router_z_loss_mlp": 0.42260742, + "step": 2460, + "time_per_iteration": 2.680508852005005 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045398, + "balance_loss_mlp": 1.00341213, + "epoch": 0.47345132743362833, + "flos": 765832073472.0, + "grad_norm": 0.11223827549321637, + "language_loss": 0.8158704, + "learning_rate": 0.0005670901708607352, + "loss": 0.82632446, + "num_input_tokens_seen": 205153536, + "router_z_loss_mlp": 0.42016602, + "step": 2461, + "time_per_iteration": 2.963573455810547 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105079, + "balance_loss_mlp": 1.00873268, + "epoch": 0.47364370911889186, + "flos": 541169101056.0, + "grad_norm": 0.03621241484942453, + "language_loss": 0.84821182, + "learning_rate": 0.0005667814328465076, + "loss": 0.85871977, + "num_input_tokens_seen": 205220944, + "router_z_loss_mlp": 0.42089844, + "step": 2462, + "time_per_iteration": 2.623180389404297 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052459, + "balance_loss_mlp": 1.01042545, + "epoch": 0.47383609080415545, + "flos": 407092397568.0, + "grad_norm": 0.0408736366196423, + "language_loss": 0.82667732, + "learning_rate": 0.0005664726689054285, + "loss": 0.83720195, + "num_input_tokens_seen": 205282688, + "router_z_loss_mlp": 0.4206543, + "step": 2463, + "time_per_iteration": 2.463602304458618 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054545, + "balance_loss_mlp": 1.01253569, + "epoch": 0.474028472489419, + "flos": 454439031552.0, + "grad_norm": 0.030418063351129263, + "language_loss": 0.81695265, + "learning_rate": 0.0005661638791573704, + "loss": 0.82749808, + "num_input_tokens_seen": 205357360, + "router_z_loss_mlp": 0.42041016, + "step": 2464, + "time_per_iteration": 2.736748695373535 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048425, + "balance_loss_mlp": 1.00651097, + "epoch": 0.47422085417468257, + "flos": 493195479552.0, + "grad_norm": 0.029840540723241396, + "language_loss": 0.87200695, + "learning_rate": 0.0005658550637222164, + "loss": 0.88249123, + "num_input_tokens_seen": 205424352, + "router_z_loss_mlp": 0.41943359, + "step": 2465, + "time_per_iteration": 2.618978261947632 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047266, + "balance_loss_mlp": 1.00532842, + "epoch": 0.47441323585994616, + "flos": 740126644224.0, + "grad_norm": 0.027711669007488924, + "language_loss": 0.82591414, + "learning_rate": 0.0005655462227198592, + "loss": 0.8363868, + "num_input_tokens_seen": 205502912, + "router_z_loss_mlp": 0.41967773, + "step": 2466, + "time_per_iteration": 2.9003212451934814 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045592, + "balance_loss_mlp": 1.00363016, + "epoch": 0.4746056175452097, + "flos": 485675543040.0, + "grad_norm": 0.03086334809399425, + "language_loss": 0.84889436, + "learning_rate": 0.0005652373562702016, + "loss": 0.85935026, + "num_input_tokens_seen": 205571168, + "router_z_loss_mlp": 0.41992188, + "step": 2467, + "time_per_iteration": 2.635524272918701 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050978, + "balance_loss_mlp": 1.00913572, + "epoch": 0.4747979992304733, + "flos": 462006600192.0, + "grad_norm": 0.030700027016666232, + "language_loss": 0.89103687, + "learning_rate": 0.000564928464493156, + "loss": 0.9015466, + "num_input_tokens_seen": 205639648, + "router_z_loss_mlp": 0.41870117, + "step": 2468, + "time_per_iteration": 2.5902397632598877 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050963, + "balance_loss_mlp": 1.00900185, + "epoch": 0.4749903809157368, + "flos": 865880479488.0, + "grad_norm": 0.04027391649848807, + "language_loss": 0.82258296, + "learning_rate": 0.000564619547508645, + "loss": 0.83309263, + "num_input_tokens_seen": 205721536, + "router_z_loss_mlp": 0.41992188, + "step": 2469, + "time_per_iteration": 3.071483850479126 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050762, + "balance_loss_mlp": 1.00877666, + "epoch": 0.4751827626010004, + "flos": 506552699904.0, + "grad_norm": 0.03439249398490307, + "language_loss": 0.83728659, + "learning_rate": 0.0005643106054366008, + "loss": 0.84779418, + "num_input_tokens_seen": 205788512, + "router_z_loss_mlp": 0.42016602, + "step": 2470, + "time_per_iteration": 2.5717906951904297 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054243, + "balance_loss_mlp": 1.01240063, + "epoch": 0.47537514428626393, + "flos": 560453033472.0, + "grad_norm": 0.030831302101538484, + "language_loss": 0.80302799, + "learning_rate": 0.000564001638396965, + "loss": 0.81357038, + "num_input_tokens_seen": 205863104, + "router_z_loss_mlp": 0.41870117, + "step": 2471, + "time_per_iteration": 2.807666540145874 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010519, + "balance_loss_mlp": 1.01008177, + "epoch": 0.4755675259715275, + "flos": 835677278976.0, + "grad_norm": 0.03000607606640632, + "language_loss": 0.82444054, + "learning_rate": 0.0005636926465096897, + "loss": 0.83495951, + "num_input_tokens_seen": 205940688, + "router_z_loss_mlp": 0.41845703, + "step": 2472, + "time_per_iteration": 3.0930862426757812 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052858, + "balance_loss_mlp": 1.01106381, + "epoch": 0.47575990765679105, + "flos": 509233670400.0, + "grad_norm": 0.03423576863830587, + "language_loss": 0.88083971, + "learning_rate": 0.0005633836298947363, + "loss": 0.89136827, + "num_input_tokens_seen": 206008352, + "router_z_loss_mlp": 0.41821289, + "step": 2473, + "time_per_iteration": 2.5820775032043457 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050107, + "balance_loss_mlp": 1.00819325, + "epoch": 0.47595228934205464, + "flos": 592963928832.0, + "grad_norm": 0.03298724569498326, + "language_loss": 0.71285135, + "learning_rate": 0.000563074588672075, + "loss": 0.72335243, + "num_input_tokens_seen": 206078240, + "router_z_loss_mlp": 0.41943359, + "step": 2474, + "time_per_iteration": 2.693268299102783 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054207, + "balance_loss_mlp": 1.01231647, + "epoch": 0.4761446710273182, + "flos": 581684024064.0, + "grad_norm": 0.03213378714772974, + "language_loss": 0.85775197, + "learning_rate": 0.0005627655229616868, + "loss": 0.86829406, + "num_input_tokens_seen": 206148896, + "router_z_loss_mlp": 0.41918945, + "step": 2475, + "time_per_iteration": 2.719207286834717 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051223, + "balance_loss_mlp": 1.00933242, + "epoch": 0.47633705271258175, + "flos": 674080290816.0, + "grad_norm": 0.026991444464169446, + "language_loss": 0.9029963, + "learning_rate": 0.0005624564328835616, + "loss": 0.91350853, + "num_input_tokens_seen": 206223792, + "router_z_loss_mlp": 0.41918945, + "step": 2476, + "time_per_iteration": 2.793189764022827 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054365, + "balance_loss_mlp": 1.0125705, + "epoch": 0.47652943439784534, + "flos": 542971317504.0, + "grad_norm": 0.02962321585608733, + "language_loss": 0.84663439, + "learning_rate": 0.0005621473185576986, + "loss": 0.85717803, + "num_input_tokens_seen": 206299376, + "router_z_loss_mlp": 0.41821289, + "step": 2477, + "time_per_iteration": 2.7773327827453613 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050086, + "balance_loss_mlp": 1.00822008, + "epoch": 0.4767218160831089, + "flos": 525847325952.0, + "grad_norm": 0.03556533386707064, + "language_loss": 0.87709439, + "learning_rate": 0.0005618381801041068, + "loss": 0.8875953, + "num_input_tokens_seen": 206367936, + "router_z_loss_mlp": 0.41894531, + "step": 2478, + "time_per_iteration": 2.6155920028686523 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053209, + "balance_loss_mlp": 1.0111047, + "epoch": 0.47691419776837246, + "flos": 569127790080.0, + "grad_norm": 0.035286823129286084, + "language_loss": 0.83750623, + "learning_rate": 0.0005615290176428044, + "loss": 0.84803832, + "num_input_tokens_seen": 206438864, + "router_z_loss_mlp": 0.42138672, + "step": 2479, + "time_per_iteration": 2.6538074016571045 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049988, + "balance_loss_mlp": 1.00802612, + "epoch": 0.477106579453636, + "flos": 532025804544.0, + "grad_norm": 0.0314839310376407, + "language_loss": 0.85928833, + "learning_rate": 0.0005612198312938187, + "loss": 0.86978817, + "num_input_tokens_seen": 206516656, + "router_z_loss_mlp": 0.41992188, + "step": 2480, + "time_per_iteration": 2.781107187271118 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051481, + "balance_loss_mlp": 1.00937629, + "epoch": 0.4772989611388996, + "flos": 595502002944.0, + "grad_norm": 0.03185012593036433, + "language_loss": 0.79825139, + "learning_rate": 0.0005609106211771868, + "loss": 0.80876625, + "num_input_tokens_seen": 206595040, + "router_z_loss_mlp": 0.42138672, + "step": 2481, + "time_per_iteration": 2.854200839996338 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049319, + "balance_loss_mlp": 1.00702322, + "epoch": 0.4774913428241631, + "flos": 545708668416.0, + "grad_norm": 0.032298555104441296, + "language_loss": 0.89798552, + "learning_rate": 0.0005606013874129543, + "loss": 0.90847874, + "num_input_tokens_seen": 206670192, + "router_z_loss_mlp": 0.42333984, + "step": 2482, + "time_per_iteration": 2.8364884853363037 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044489, + "balance_loss_mlp": 1.00214577, + "epoch": 0.4776837245094267, + "flos": 541130217216.0, + "grad_norm": 0.031860038244933726, + "language_loss": 0.8004725, + "learning_rate": 0.0005602921301211768, + "loss": 0.81091738, + "num_input_tokens_seen": 206746992, + "router_z_loss_mlp": 0.42382812, + "step": 2483, + "time_per_iteration": 2.719606399536133 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044245, + "balance_loss_mlp": 1.00185454, + "epoch": 0.4778761061946903, + "flos": 472756727040.0, + "grad_norm": 0.037639636071959574, + "language_loss": 0.82567894, + "learning_rate": 0.0005599828494219185, + "loss": 0.83612138, + "num_input_tokens_seen": 206813584, + "router_z_loss_mlp": 0.42431641, + "step": 2484, + "time_per_iteration": 2.5541560649871826 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047753, + "balance_loss_mlp": 1.00548136, + "epoch": 0.4780684878799538, + "flos": 727338085632.0, + "grad_norm": 0.033674716450053835, + "language_loss": 0.89748895, + "learning_rate": 0.0005596735454352527, + "loss": 0.90796649, + "num_input_tokens_seen": 206885840, + "router_z_loss_mlp": 0.4230957, + "step": 2485, + "time_per_iteration": 2.9516124725341797 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051411, + "balance_loss_mlp": 1.00921071, + "epoch": 0.4782608695652174, + "flos": 549954673152.0, + "grad_norm": 0.03622289239904689, + "language_loss": 0.86092174, + "learning_rate": 0.0005593642182812619, + "loss": 0.87143582, + "num_input_tokens_seen": 206955104, + "router_z_loss_mlp": 0.42236328, + "step": 2486, + "time_per_iteration": 2.643221139907837 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054193, + "balance_loss_mlp": 1.01192153, + "epoch": 0.47845325125048094, + "flos": 831403084032.0, + "grad_norm": 0.035916445699024475, + "language_loss": 0.84163451, + "learning_rate": 0.0005590548680800378, + "loss": 0.85217643, + "num_input_tokens_seen": 207039792, + "router_z_loss_mlp": 0.4230957, + "step": 2487, + "time_per_iteration": 3.1013588905334473 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105581, + "balance_loss_mlp": 1.01356208, + "epoch": 0.4786456329357445, + "flos": 515271197952.0, + "grad_norm": 0.032399463516541584, + "language_loss": 0.76797146, + "learning_rate": 0.0005587454949516804, + "loss": 0.77852952, + "num_input_tokens_seen": 207115632, + "router_z_loss_mlp": 0.42285156, + "step": 2488, + "time_per_iteration": 2.7681314945220947 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105205, + "balance_loss_mlp": 1.00992179, + "epoch": 0.47883801462100806, + "flos": 565730403840.0, + "grad_norm": 0.034669501918414815, + "language_loss": 0.88538134, + "learning_rate": 0.0005584360990162993, + "loss": 0.89590186, + "num_input_tokens_seen": 207184336, + "router_z_loss_mlp": 0.42163086, + "step": 2489, + "time_per_iteration": 2.6323490142822266 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105419, + "balance_loss_mlp": 1.01196563, + "epoch": 0.47903039630627164, + "flos": 580705148160.0, + "grad_norm": 0.028676455513171533, + "language_loss": 0.85944891, + "learning_rate": 0.0005581266803940124, + "loss": 0.86999071, + "num_input_tokens_seen": 207258720, + "router_z_loss_mlp": 0.42260742, + "step": 2490, + "time_per_iteration": 2.758180856704712 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051095, + "balance_loss_mlp": 1.00891864, + "epoch": 0.47922277799153523, + "flos": 620086638336.0, + "grad_norm": 0.029629924190795385, + "language_loss": 0.8824507, + "learning_rate": 0.0005578172392049471, + "loss": 0.89296162, + "num_input_tokens_seen": 207329216, + "router_z_loss_mlp": 0.42211914, + "step": 2491, + "time_per_iteration": 2.733055353164673 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049931, + "balance_loss_mlp": 1.00787377, + "epoch": 0.47941515967679876, + "flos": 640859782656.0, + "grad_norm": 0.03401187912624355, + "language_loss": 0.84927547, + "learning_rate": 0.0005575077755692386, + "loss": 0.85977477, + "num_input_tokens_seen": 207403712, + "router_z_loss_mlp": 0.42089844, + "step": 2492, + "time_per_iteration": 2.7897393703460693 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051779, + "balance_loss_mlp": 1.00988865, + "epoch": 0.47960754136206235, + "flos": 520876157184.0, + "grad_norm": 0.02611914925979928, + "language_loss": 0.8632732, + "learning_rate": 0.0005571982896070316, + "loss": 0.87379098, + "num_input_tokens_seen": 207477120, + "router_z_loss_mlp": 0.41918945, + "step": 2493, + "time_per_iteration": 2.667999744415283 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051996, + "balance_loss_mlp": 1.01010633, + "epoch": 0.4797999230473259, + "flos": 476032604160.0, + "grad_norm": 0.03441931276085345, + "language_loss": 0.90227294, + "learning_rate": 0.0005568887814384792, + "loss": 0.9127928, + "num_input_tokens_seen": 207544592, + "router_z_loss_mlp": 0.41918945, + "step": 2494, + "time_per_iteration": 2.5400681495666504 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105198, + "balance_loss_mlp": 1.01023245, + "epoch": 0.47999230473258947, + "flos": 533069809152.0, + "grad_norm": 0.031194267436751296, + "language_loss": 0.87632048, + "learning_rate": 0.000556579251183743, + "loss": 0.88684028, + "num_input_tokens_seen": 207613808, + "router_z_loss_mlp": 0.41772461, + "step": 2495, + "time_per_iteration": 2.662360906600952 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047828, + "balance_loss_mlp": 1.00615287, + "epoch": 0.480184686417853, + "flos": 602606867712.0, + "grad_norm": 0.03455941378420467, + "language_loss": 0.8073976, + "learning_rate": 0.0005562696989629936, + "loss": 0.81787586, + "num_input_tokens_seen": 207684464, + "router_z_loss_mlp": 0.41699219, + "step": 2496, + "time_per_iteration": 2.677384614944458 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049959, + "balance_loss_mlp": 1.00837922, + "epoch": 0.4803770681031166, + "flos": 529262208768.0, + "grad_norm": 0.02987635047659329, + "language_loss": 0.83264202, + "learning_rate": 0.0005559601248964095, + "loss": 0.84314156, + "num_input_tokens_seen": 207754016, + "router_z_loss_mlp": 0.41601562, + "step": 2497, + "time_per_iteration": 2.629697322845459 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052806, + "balance_loss_mlp": 1.01132119, + "epoch": 0.4805694497883801, + "flos": 512229590784.0, + "grad_norm": 0.031958617017597245, + "language_loss": 0.86286914, + "learning_rate": 0.0005556505291041783, + "loss": 0.87339711, + "num_input_tokens_seen": 207827104, + "router_z_loss_mlp": 0.41503906, + "step": 2498, + "time_per_iteration": 2.6821835041046143 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105189, + "balance_loss_mlp": 1.0103811, + "epoch": 0.4807618314736437, + "flos": 601606604544.0, + "grad_norm": 0.02993690761083535, + "language_loss": 0.84804475, + "learning_rate": 0.0005553409117064954, + "loss": 0.85856366, + "num_input_tokens_seen": 207907824, + "router_z_loss_mlp": 0.4152832, + "step": 2499, + "time_per_iteration": 2.868149518966675 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047722, + "balance_loss_mlp": 1.00626087, + "epoch": 0.4809542131589073, + "flos": 570030843648.0, + "grad_norm": 0.03218775088546566, + "language_loss": 0.85501659, + "learning_rate": 0.0005550312728235654, + "loss": 0.86549377, + "num_input_tokens_seen": 207975632, + "router_z_loss_mlp": 0.41479492, + "step": 2500, + "time_per_iteration": 2.6775684356689453 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049066, + "balance_loss_mlp": 1.00767624, + "epoch": 0.4811465948441708, + "flos": 577166810880.0, + "grad_norm": 0.03560315442462447, + "language_loss": 0.84339613, + "learning_rate": 0.0005547216125756003, + "loss": 0.85388672, + "num_input_tokens_seen": 208048000, + "router_z_loss_mlp": 0.4140625, + "step": 2501, + "time_per_iteration": 2.730938196182251 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051501, + "balance_loss_mlp": 1.01011145, + "epoch": 0.4813389765294344, + "flos": 825298482432.0, + "grad_norm": 0.030150461655227775, + "language_loss": 0.82324314, + "learning_rate": 0.0005544119310828211, + "loss": 0.83375812, + "num_input_tokens_seen": 208132592, + "router_z_loss_mlp": 0.4140625, + "step": 2502, + "time_per_iteration": 3.113402843475342 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053515, + "balance_loss_mlp": 1.01203024, + "epoch": 0.48153135821469795, + "flos": 636700293888.0, + "grad_norm": 0.03404405348604493, + "language_loss": 0.85394537, + "learning_rate": 0.0005541022284654568, + "loss": 0.8644805, + "num_input_tokens_seen": 208215824, + "router_z_loss_mlp": 0.41503906, + "step": 2503, + "time_per_iteration": 2.946800708770752 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055542, + "balance_loss_mlp": 1.01393807, + "epoch": 0.48172373989996153, + "flos": 504709654272.0, + "grad_norm": 0.029988445312160498, + "language_loss": 0.84392428, + "learning_rate": 0.0005537925048437446, + "loss": 0.85447979, + "num_input_tokens_seen": 208284304, + "router_z_loss_mlp": 0.41625977, + "step": 2504, + "time_per_iteration": 2.5928125381469727 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053543, + "balance_loss_mlp": 1.0131073, + "epoch": 0.48191612158522507, + "flos": 1535568945408.0, + "grad_norm": 0.009640282548559968, + "language_loss": 0.75751472, + "learning_rate": 0.00055348276033793, + "loss": 0.76805007, + "num_input_tokens_seen": 208510224, + "router_z_loss_mlp": 0.40429688, + "step": 2505, + "time_per_iteration": 4.956170320510864 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105109, + "balance_loss_mlp": 1.00936711, + "epoch": 0.48210850327048865, + "flos": 703813006080.0, + "grad_norm": 0.02927379087328487, + "language_loss": 0.88880217, + "learning_rate": 0.0005531729950682664, + "loss": 0.89931303, + "num_input_tokens_seen": 208596816, + "router_z_loss_mlp": 0.41748047, + "step": 2506, + "time_per_iteration": 2.9935836791992188 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052672, + "balance_loss_mlp": 1.01106763, + "epoch": 0.4823008849557522, + "flos": 440701732608.0, + "grad_norm": 0.04047033106809228, + "language_loss": 0.85417378, + "learning_rate": 0.000552863209155015, + "loss": 0.86470056, + "num_input_tokens_seen": 208659616, + "router_z_loss_mlp": 0.41625977, + "step": 2507, + "time_per_iteration": 2.4729647636413574 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053204, + "balance_loss_mlp": 1.01157653, + "epoch": 0.48249326664101577, + "flos": 472813107456.0, + "grad_norm": 0.04603508602748786, + "language_loss": 0.82726657, + "learning_rate": 0.0005525534027184461, + "loss": 0.8377986, + "num_input_tokens_seen": 208728080, + "router_z_loss_mlp": 0.41650391, + "step": 2508, + "time_per_iteration": 2.5513370037078857 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055774, + "balance_loss_mlp": 1.01421785, + "epoch": 0.48268564832627936, + "flos": 564315068928.0, + "grad_norm": 0.02879273586569962, + "language_loss": 0.83137357, + "learning_rate": 0.0005522435758788365, + "loss": 0.84193128, + "num_input_tokens_seen": 208803376, + "router_z_loss_mlp": 0.41577148, + "step": 2509, + "time_per_iteration": 2.753450393676758 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055715, + "balance_loss_mlp": 1.01415896, + "epoch": 0.4828780300115429, + "flos": 630843568128.0, + "grad_norm": 0.03460020680283242, + "language_loss": 0.80409563, + "learning_rate": 0.0005519337287564721, + "loss": 0.8146528, + "num_input_tokens_seen": 208876656, + "router_z_loss_mlp": 0.41577148, + "step": 2510, + "time_per_iteration": 2.790820360183716 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051713, + "balance_loss_mlp": 1.01020396, + "epoch": 0.4830704116968065, + "flos": 633005454336.0, + "grad_norm": 0.032398618840687954, + "language_loss": 0.83713245, + "learning_rate": 0.000551623861471646, + "loss": 0.84764957, + "num_input_tokens_seen": 208950224, + "router_z_loss_mlp": 0.4152832, + "step": 2511, + "time_per_iteration": 2.750471353530884 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01056118, + "balance_loss_mlp": 1.01596832, + "epoch": 0.48326279338207, + "flos": 1572619408128.0, + "grad_norm": 0.008656675131842123, + "language_loss": 0.78818834, + "learning_rate": 0.0005513139741446594, + "loss": 0.79874945, + "num_input_tokens_seen": 209173984, + "router_z_loss_mlp": 0.40136719, + "step": 2512, + "time_per_iteration": 4.832056999206543 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048019, + "balance_loss_mlp": 1.00636733, + "epoch": 0.4834551750673336, + "flos": 510238791168.0, + "grad_norm": 0.030652937711335218, + "language_loss": 0.87039137, + "learning_rate": 0.0005510040668958211, + "loss": 0.88087165, + "num_input_tokens_seen": 209242832, + "router_z_loss_mlp": 0.41674805, + "step": 2513, + "time_per_iteration": 2.593559741973877 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053741, + "balance_loss_mlp": 1.0134964, + "epoch": 0.48364755675259713, + "flos": 1531828419072.0, + "grad_norm": 0.007806244380112886, + "language_loss": 0.77760583, + "learning_rate": 0.0005506941398454483, + "loss": 0.78814328, + "num_input_tokens_seen": 209473520, + "router_z_loss_mlp": 0.40234375, + "step": 2514, + "time_per_iteration": 4.834583282470703 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049855, + "balance_loss_mlp": 1.00810826, + "epoch": 0.4838399384378607, + "flos": 566047299072.0, + "grad_norm": 0.0392841259920432, + "language_loss": 0.83837014, + "learning_rate": 0.0005503841931138645, + "loss": 0.84886873, + "num_input_tokens_seen": 209544208, + "router_z_loss_mlp": 0.41772461, + "step": 2515, + "time_per_iteration": 2.704660177230835 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049181, + "balance_loss_mlp": 1.00741005, + "epoch": 0.4840323201231243, + "flos": 388542377472.0, + "grad_norm": 0.03590543250931975, + "language_loss": 0.82853907, + "learning_rate": 0.0005500742268214025, + "loss": 0.83903086, + "num_input_tokens_seen": 209607408, + "router_z_loss_mlp": 0.41796875, + "step": 2516, + "time_per_iteration": 2.4684557914733887 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048399, + "balance_loss_mlp": 1.00662851, + "epoch": 0.48422470180838784, + "flos": 632176277760.0, + "grad_norm": 0.031370714323768, + "language_loss": 0.8605336, + "learning_rate": 0.0005497642410884014, + "loss": 0.87101769, + "num_input_tokens_seen": 209683392, + "router_z_loss_mlp": 0.41796875, + "step": 2517, + "time_per_iteration": 2.7523274421691895 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049853, + "balance_loss_mlp": 1.00808239, + "epoch": 0.4844170834936514, + "flos": 500313950208.0, + "grad_norm": 0.02829147010426611, + "language_loss": 0.85602349, + "learning_rate": 0.0005494542360352085, + "loss": 0.86652207, + "num_input_tokens_seen": 209753184, + "router_z_loss_mlp": 0.41796875, + "step": 2518, + "time_per_iteration": 2.635472059249878 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050712, + "balance_loss_mlp": 1.00882208, + "epoch": 0.48460946517891496, + "flos": 552195293952.0, + "grad_norm": 0.029973626664194793, + "language_loss": 0.86134493, + "learning_rate": 0.0005491442117821783, + "loss": 0.87185204, + "num_input_tokens_seen": 209829568, + "router_z_loss_mlp": 0.41918945, + "step": 2519, + "time_per_iteration": 2.686150550842285 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050979, + "balance_loss_mlp": 1.00916088, + "epoch": 0.48480184686417854, + "flos": 530462715648.0, + "grad_norm": 0.03547836116600895, + "language_loss": 0.87863553, + "learning_rate": 0.0005488341684496732, + "loss": 0.88914526, + "num_input_tokens_seen": 209902176, + "router_z_loss_mlp": 0.41845703, + "step": 2520, + "time_per_iteration": 2.6380345821380615 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053374, + "balance_loss_mlp": 1.01155508, + "epoch": 0.4849942285494421, + "flos": 533048421888.0, + "grad_norm": 0.030317982530802673, + "language_loss": 0.92374247, + "learning_rate": 0.0005485241061580624, + "loss": 0.93427622, + "num_input_tokens_seen": 209969168, + "router_z_loss_mlp": 0.41845703, + "step": 2521, + "time_per_iteration": 2.7106375694274902 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048164, + "balance_loss_mlp": 1.00639331, + "epoch": 0.48518661023470566, + "flos": 723973747200.0, + "grad_norm": 0.029300799536016952, + "language_loss": 0.85061228, + "learning_rate": 0.0005482140250277228, + "loss": 0.86109388, + "num_input_tokens_seen": 210049616, + "router_z_loss_mlp": 0.41796875, + "step": 2522, + "time_per_iteration": 2.998014450073242 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050292, + "balance_loss_mlp": 1.00859261, + "epoch": 0.4853789919199692, + "flos": 507156354816.0, + "grad_norm": 0.033835684591452045, + "language_loss": 0.87858051, + "learning_rate": 0.0005479039251790387, + "loss": 0.88908345, + "num_input_tokens_seen": 210118512, + "router_z_loss_mlp": 0.41723633, + "step": 2523, + "time_per_iteration": 2.6554031372070312 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046448, + "balance_loss_mlp": 1.00470078, + "epoch": 0.4855713736052328, + "flos": 661700001024.0, + "grad_norm": 0.033801552668461764, + "language_loss": 0.85375023, + "learning_rate": 0.0005475938067324014, + "loss": 0.86421466, + "num_input_tokens_seen": 210193728, + "router_z_loss_mlp": 0.41772461, + "step": 2524, + "time_per_iteration": 2.8294761180877686 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105012, + "balance_loss_mlp": 1.00839663, + "epoch": 0.48576375529049637, + "flos": 437890504704.0, + "grad_norm": 0.03215141471545655, + "language_loss": 0.84198898, + "learning_rate": 0.0005472836698082098, + "loss": 0.85249019, + "num_input_tokens_seen": 210258832, + "router_z_loss_mlp": 0.41748047, + "step": 2525, + "time_per_iteration": 2.553400754928589 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050288, + "balance_loss_mlp": 1.00858843, + "epoch": 0.4859561369757599, + "flos": 582845647104.0, + "grad_norm": 0.029048493067812663, + "language_loss": 0.84421259, + "learning_rate": 0.0005469735145268694, + "loss": 0.85471547, + "num_input_tokens_seen": 210335280, + "router_z_loss_mlp": 0.41723633, + "step": 2526, + "time_per_iteration": 2.741071939468384 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052939, + "balance_loss_mlp": 1.01121581, + "epoch": 0.4861485186610235, + "flos": 488933923584.0, + "grad_norm": 0.035658567470948505, + "language_loss": 0.81546867, + "learning_rate": 0.0005466633410087933, + "loss": 0.82599807, + "num_input_tokens_seen": 210407072, + "router_z_loss_mlp": 0.41748047, + "step": 2527, + "time_per_iteration": 2.7008073329925537 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01057697, + "balance_loss_mlp": 1.01735687, + "epoch": 0.486340900346287, + "flos": 1561113981696.0, + "grad_norm": 0.006481424575109751, + "language_loss": 0.77260822, + "learning_rate": 0.0005463531493744017, + "loss": 0.78318518, + "num_input_tokens_seen": 210644544, + "router_z_loss_mlp": 0.40332031, + "step": 2528, + "time_per_iteration": 4.889545679092407 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048632, + "balance_loss_mlp": 1.00719464, + "epoch": 0.4865332820315506, + "flos": 483990945024.0, + "grad_norm": 0.029120047594960542, + "language_loss": 0.88662624, + "learning_rate": 0.0005460429397441214, + "loss": 0.89711249, + "num_input_tokens_seen": 210711760, + "router_z_loss_mlp": 0.41455078, + "step": 2529, + "time_per_iteration": 4.04598331451416 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048668, + "balance_loss_mlp": 1.00706387, + "epoch": 0.48672566371681414, + "flos": 536857967616.0, + "grad_norm": 0.030816613356667605, + "language_loss": 0.87420261, + "learning_rate": 0.0005457327122383866, + "loss": 0.88468921, + "num_input_tokens_seen": 210783040, + "router_z_loss_mlp": 0.41625977, + "step": 2530, + "time_per_iteration": 2.613560676574707 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055492, + "balance_loss_mlp": 1.01515198, + "epoch": 0.4869180454020777, + "flos": 1415833195776.0, + "grad_norm": 0.0094125035005948, + "language_loss": 0.74636483, + "learning_rate": 0.0005454224669776385, + "loss": 0.75691986, + "num_input_tokens_seen": 211002128, + "router_z_loss_mlp": 0.40332031, + "step": 2531, + "time_per_iteration": 4.826287269592285 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104985, + "balance_loss_mlp": 1.00831711, + "epoch": 0.48711042708734126, + "flos": 574227270912.0, + "grad_norm": 0.03266780624208146, + "language_loss": 0.76332569, + "learning_rate": 0.0005451122040823244, + "loss": 0.77382421, + "num_input_tokens_seen": 211080080, + "router_z_loss_mlp": 0.41552734, + "step": 2532, + "time_per_iteration": 2.805912494659424 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046061, + "balance_loss_mlp": 1.00438511, + "epoch": 0.48730280877260485, + "flos": 627817512192.0, + "grad_norm": 0.03502227574741412, + "language_loss": 0.77874511, + "learning_rate": 0.0005448019236728997, + "loss": 0.78920573, + "num_input_tokens_seen": 211162944, + "router_z_loss_mlp": 0.41699219, + "step": 2533, + "time_per_iteration": 2.865936040878296 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048209, + "balance_loss_mlp": 1.00670052, + "epoch": 0.48749519045786843, + "flos": 513468981504.0, + "grad_norm": 0.035197852276093636, + "language_loss": 0.85303891, + "learning_rate": 0.0005444916258698255, + "loss": 0.86352104, + "num_input_tokens_seen": 211230448, + "router_z_loss_mlp": 0.4152832, + "step": 2534, + "time_per_iteration": 2.6375105381011963 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045312, + "balance_loss_mlp": 1.00399435, + "epoch": 0.48768757214313196, + "flos": 526479171072.0, + "grad_norm": 0.030578272272676787, + "language_loss": 0.86534977, + "learning_rate": 0.0005441813107935704, + "loss": 0.87580293, + "num_input_tokens_seen": 211301248, + "router_z_loss_mlp": 0.41333008, + "step": 2535, + "time_per_iteration": 2.6708908081054688 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044819, + "balance_loss_mlp": 1.0033108, + "epoch": 0.48787995382839555, + "flos": 506031670272.0, + "grad_norm": 0.03128667529665633, + "language_loss": 0.86385322, + "learning_rate": 0.0005438709785646091, + "loss": 0.87430143, + "num_input_tokens_seen": 211369888, + "router_z_loss_mlp": 0.4152832, + "step": 2536, + "time_per_iteration": 2.587376117706299 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047364, + "balance_loss_mlp": 1.00599802, + "epoch": 0.4880723355136591, + "flos": 576248206080.0, + "grad_norm": 0.031424284702784445, + "language_loss": 0.87241846, + "learning_rate": 0.0005435606293034234, + "loss": 0.88289213, + "num_input_tokens_seen": 211441808, + "router_z_loss_mlp": 0.41381836, + "step": 2537, + "time_per_iteration": 2.6678061485290527 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045759, + "balance_loss_mlp": 1.00425005, + "epoch": 0.48826471719892267, + "flos": 562537152000.0, + "grad_norm": 0.03574143188627203, + "language_loss": 0.85282528, + "learning_rate": 0.0005432502631305016, + "loss": 0.8632828, + "num_input_tokens_seen": 211511216, + "router_z_loss_mlp": 0.4152832, + "step": 2538, + "time_per_iteration": 2.7138583660125732 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104917, + "balance_loss_mlp": 1.00763726, + "epoch": 0.4884570988841862, + "flos": 727549022976.0, + "grad_norm": 0.02708673321136359, + "language_loss": 0.84024864, + "learning_rate": 0.0005429398801663386, + "loss": 0.85074031, + "num_input_tokens_seen": 211589264, + "router_z_loss_mlp": 0.41552734, + "step": 2539, + "time_per_iteration": 2.964188814163208 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049407, + "balance_loss_mlp": 1.00797033, + "epoch": 0.4886494805694498, + "flos": 431924908800.0, + "grad_norm": 0.037537890597472735, + "language_loss": 0.83715379, + "learning_rate": 0.0005426294805314355, + "loss": 0.84764791, + "num_input_tokens_seen": 211652928, + "router_z_loss_mlp": 0.41455078, + "step": 2540, + "time_per_iteration": 2.5386080741882324 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044162, + "balance_loss_mlp": 1.00251019, + "epoch": 0.4888418622547134, + "flos": 674345663232.0, + "grad_norm": 0.02795943805212824, + "language_loss": 0.80757105, + "learning_rate": 0.0005423190643463003, + "loss": 0.81801265, + "num_input_tokens_seen": 211741664, + "router_z_loss_mlp": 0.41674805, + "step": 2541, + "time_per_iteration": 3.0026512145996094 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043307, + "balance_loss_mlp": 1.00182211, + "epoch": 0.4890342439399769, + "flos": 542936324352.0, + "grad_norm": 0.03490297591946719, + "language_loss": 0.83297753, + "learning_rate": 0.0005420086317314473, + "loss": 0.84341061, + "num_input_tokens_seen": 211809136, + "router_z_loss_mlp": 0.41503906, + "step": 2542, + "time_per_iteration": 2.713738441467285 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104604, + "balance_loss_mlp": 1.00457919, + "epoch": 0.4892266256252405, + "flos": 591863543808.0, + "grad_norm": 0.03220316860335889, + "language_loss": 0.81509852, + "learning_rate": 0.0005416981828073971, + "loss": 0.8255589, + "num_input_tokens_seen": 211883136, + "router_z_loss_mlp": 0.41479492, + "step": 2543, + "time_per_iteration": 2.833582639694214 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049698, + "balance_loss_mlp": 1.00983429, + "epoch": 0.48941900731050403, + "flos": 1519657121280.0, + "grad_norm": 0.011925691275285389, + "language_loss": 0.77115011, + "learning_rate": 0.0005413877176946765, + "loss": 0.78164709, + "num_input_tokens_seen": 212117488, + "router_z_loss_mlp": 0.3984375, + "step": 2544, + "time_per_iteration": 4.825795412063599 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044942, + "balance_loss_mlp": 1.00319445, + "epoch": 0.4896113889957676, + "flos": 471519281664.0, + "grad_norm": 0.035595787649594084, + "language_loss": 0.85265428, + "learning_rate": 0.000541077236513819, + "loss": 0.86310375, + "num_input_tokens_seen": 212181952, + "router_z_loss_mlp": 0.41772461, + "step": 2545, + "time_per_iteration": 2.5318596363067627 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046977, + "balance_loss_mlp": 1.00515878, + "epoch": 0.48980377068103115, + "flos": 497552299776.0, + "grad_norm": 0.029954814135253697, + "language_loss": 0.8290776, + "learning_rate": 0.0005407667393853638, + "loss": 0.8395474, + "num_input_tokens_seen": 212252608, + "router_z_loss_mlp": 0.41845703, + "step": 2546, + "time_per_iteration": 2.6808276176452637 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049449, + "balance_loss_mlp": 1.00765431, + "epoch": 0.48999615236629473, + "flos": 694108829184.0, + "grad_norm": 0.033072726692276254, + "language_loss": 0.83875388, + "learning_rate": 0.0005404562264298569, + "loss": 0.84924835, + "num_input_tokens_seen": 212328560, + "router_z_loss_mlp": 0.41821289, + "step": 2547, + "time_per_iteration": 2.8665168285369873 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105088, + "balance_loss_mlp": 1.00894189, + "epoch": 0.49018853405155827, + "flos": 542749686528.0, + "grad_norm": 0.0323259245637504, + "language_loss": 0.84166187, + "learning_rate": 0.0005401456977678498, + "loss": 0.85217071, + "num_input_tokens_seen": 212399616, + "router_z_loss_mlp": 0.41967773, + "step": 2548, + "time_per_iteration": 2.646385431289673 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054184, + "balance_loss_mlp": 1.01248467, + "epoch": 0.49038091573682185, + "flos": 697109607168.0, + "grad_norm": 0.03434023749691101, + "language_loss": 0.7811271, + "learning_rate": 0.0005398351535199008, + "loss": 0.79166895, + "num_input_tokens_seen": 212482352, + "router_z_loss_mlp": 0.41723633, + "step": 2549, + "time_per_iteration": 3.0581490993499756 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01056036, + "balance_loss_mlp": 1.01443195, + "epoch": 0.49057329742208544, + "flos": 598063409664.0, + "grad_norm": 0.032237778563639685, + "language_loss": 0.84733725, + "learning_rate": 0.0005395245938065735, + "loss": 0.85789764, + "num_input_tokens_seen": 212559504, + "router_z_loss_mlp": 0.41625977, + "step": 2550, + "time_per_iteration": 2.7877790927886963 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052801, + "balance_loss_mlp": 1.01105404, + "epoch": 0.490765679107349, + "flos": 514417721856.0, + "grad_norm": 0.03812364840268788, + "language_loss": 0.82968283, + "learning_rate": 0.0005392140187484379, + "loss": 0.84021086, + "num_input_tokens_seen": 212625664, + "router_z_loss_mlp": 0.41772461, + "step": 2551, + "time_per_iteration": 2.59513521194458 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052602, + "balance_loss_mlp": 1.01097441, + "epoch": 0.49095806079261256, + "flos": 630843568128.0, + "grad_norm": 0.028435741934699065, + "language_loss": 0.8977747, + "learning_rate": 0.0005389034284660701, + "loss": 0.90830076, + "num_input_tokens_seen": 212702000, + "router_z_loss_mlp": 0.41650391, + "step": 2552, + "time_per_iteration": 2.8811471462249756 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051565, + "balance_loss_mlp": 1.00979364, + "epoch": 0.4911504424778761, + "flos": 916793640960.0, + "grad_norm": 0.038088038632412044, + "language_loss": 0.82567823, + "learning_rate": 0.000538592823080052, + "loss": 0.83619392, + "num_input_tokens_seen": 212785376, + "router_z_loss_mlp": 0.41796875, + "step": 2553, + "time_per_iteration": 3.147981882095337 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104899, + "balance_loss_mlp": 1.00736189, + "epoch": 0.4913428241631397, + "flos": 439855059456.0, + "grad_norm": 0.03635352086596181, + "language_loss": 0.85271204, + "learning_rate": 0.000538282202710971, + "loss": 0.86320198, + "num_input_tokens_seen": 212848176, + "router_z_loss_mlp": 0.41650391, + "step": 2554, + "time_per_iteration": 2.5295345783233643 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050306, + "balance_loss_mlp": 1.00865471, + "epoch": 0.4915352058484032, + "flos": 637240765440.0, + "grad_norm": 0.03576310950851386, + "language_loss": 0.82746387, + "learning_rate": 0.000537971567479421, + "loss": 0.83796692, + "num_input_tokens_seen": 212917888, + "router_z_loss_mlp": 0.41674805, + "step": 2555, + "time_per_iteration": 2.7715530395507812 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047308, + "balance_loss_mlp": 1.00567997, + "epoch": 0.4917275875336668, + "flos": 505510640640.0, + "grad_norm": 0.03586911519664752, + "language_loss": 0.88338435, + "learning_rate": 0.0005376609175060011, + "loss": 0.89385736, + "num_input_tokens_seen": 212986288, + "router_z_loss_mlp": 0.41650391, + "step": 2556, + "time_per_iteration": 2.6225156784057617 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044015, + "balance_loss_mlp": 1.00252998, + "epoch": 0.49191996921893033, + "flos": 655734405120.0, + "grad_norm": 0.03188042342455107, + "language_loss": 0.80798948, + "learning_rate": 0.0005373502529113162, + "loss": 0.81842965, + "num_input_tokens_seen": 213059504, + "router_z_loss_mlp": 0.41503906, + "step": 2557, + "time_per_iteration": 2.809008836746216 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046392, + "balance_loss_mlp": 1.00485921, + "epoch": 0.4921123509041939, + "flos": 493399613952.0, + "grad_norm": 0.03491285747037794, + "language_loss": 0.8216666, + "learning_rate": 0.0005370395738159773, + "loss": 0.83213049, + "num_input_tokens_seen": 213129984, + "router_z_loss_mlp": 0.41552734, + "step": 2558, + "time_per_iteration": 2.6442172527313232 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047723, + "balance_loss_mlp": 1.00619018, + "epoch": 0.4923047325894575, + "flos": 547208573952.0, + "grad_norm": 0.0376599347248576, + "language_loss": 0.83764005, + "learning_rate": 0.0005367288803406003, + "loss": 0.84811723, + "num_input_tokens_seen": 213199184, + "router_z_loss_mlp": 0.41552734, + "step": 2559, + "time_per_iteration": 2.6496431827545166 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044601, + "balance_loss_mlp": 1.00299704, + "epoch": 0.49249711427472104, + "flos": 597590012160.0, + "grad_norm": 0.034513710641845094, + "language_loss": 0.81748044, + "learning_rate": 0.0005364181726058073, + "loss": 0.8279264, + "num_input_tokens_seen": 213272480, + "router_z_loss_mlp": 0.41625977, + "step": 2560, + "time_per_iteration": 2.677976608276367 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046275, + "balance_loss_mlp": 1.0049566, + "epoch": 0.4926894959599846, + "flos": 498809187072.0, + "grad_norm": 0.0360523922041074, + "language_loss": 0.83156157, + "learning_rate": 0.0005361074507322261, + "loss": 0.84202433, + "num_input_tokens_seen": 213338704, + "router_z_loss_mlp": 0.41333008, + "step": 2561, + "time_per_iteration": 2.5902929306030273 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047121, + "balance_loss_mlp": 1.00575542, + "epoch": 0.49288187764524816, + "flos": 537183611136.0, + "grad_norm": 0.03594243708601782, + "language_loss": 0.81942439, + "learning_rate": 0.000535796714840489, + "loss": 0.82989568, + "num_input_tokens_seen": 213406016, + "router_z_loss_mlp": 0.41381836, + "step": 2562, + "time_per_iteration": 2.6181418895721436 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047955, + "balance_loss_mlp": 1.00658977, + "epoch": 0.49307425933051174, + "flos": 642713521920.0, + "grad_norm": 0.03700989683335547, + "language_loss": 0.84345794, + "learning_rate": 0.0005354859650512348, + "loss": 0.85393751, + "num_input_tokens_seen": 213474016, + "router_z_loss_mlp": 0.41381836, + "step": 2563, + "time_per_iteration": 2.7921204566955566 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048021, + "balance_loss_mlp": 1.00670326, + "epoch": 0.4932666410157753, + "flos": 517265888256.0, + "grad_norm": 0.0348037560143354, + "language_loss": 0.8771596, + "learning_rate": 0.0005351752014851074, + "loss": 0.88763982, + "num_input_tokens_seen": 213539696, + "router_z_loss_mlp": 0.41333008, + "step": 2564, + "time_per_iteration": 2.602555990219116 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048335, + "balance_loss_mlp": 1.00694537, + "epoch": 0.49345902270103886, + "flos": 602652554496.0, + "grad_norm": 0.04115766537624956, + "language_loss": 0.83900678, + "learning_rate": 0.0005348644242627553, + "loss": 0.84949011, + "num_input_tokens_seen": 213609504, + "router_z_loss_mlp": 0.4140625, + "step": 2565, + "time_per_iteration": 2.7332029342651367 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010522, + "balance_loss_mlp": 1.01195526, + "epoch": 0.49365140438630245, + "flos": 1496984550912.0, + "grad_norm": 0.005471138804527184, + "language_loss": 0.75286627, + "learning_rate": 0.0005345536335048336, + "loss": 0.76338828, + "num_input_tokens_seen": 213846064, + "router_z_loss_mlp": 0.40234375, + "step": 2566, + "time_per_iteration": 4.974903583526611 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051209, + "balance_loss_mlp": 1.00991523, + "epoch": 0.493843786071566, + "flos": 630789133056.0, + "grad_norm": 0.031108020693620165, + "language_loss": 0.8259182, + "learning_rate": 0.0005342428293320013, + "loss": 0.83643031, + "num_input_tokens_seen": 213923216, + "router_z_loss_mlp": 0.41308594, + "step": 2567, + "time_per_iteration": 2.774355173110962 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054719, + "balance_loss_mlp": 1.01332963, + "epoch": 0.49403616775682957, + "flos": 618690745344.0, + "grad_norm": 0.04042101882964004, + "language_loss": 0.84698522, + "learning_rate": 0.0005339320118649238, + "loss": 0.85753244, + "num_input_tokens_seen": 213994096, + "router_z_loss_mlp": 0.4140625, + "step": 2568, + "time_per_iteration": 2.7593345642089844 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050518, + "balance_loss_mlp": 1.0091759, + "epoch": 0.4942285494420931, + "flos": 578814470400.0, + "grad_norm": 0.03306097920847627, + "language_loss": 0.87056893, + "learning_rate": 0.000533621181224271, + "loss": 0.88107407, + "num_input_tokens_seen": 214069104, + "router_z_loss_mlp": 0.41357422, + "step": 2569, + "time_per_iteration": 2.815171957015991 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045042, + "balance_loss_mlp": 1.00358069, + "epoch": 0.4944209311273567, + "flos": 631466664960.0, + "grad_norm": 0.04400973771206172, + "language_loss": 0.82116252, + "learning_rate": 0.0005333103375307182, + "loss": 0.83161294, + "num_input_tokens_seen": 214150368, + "router_z_loss_mlp": 0.41479492, + "step": 2570, + "time_per_iteration": 2.86649227142334 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048831, + "balance_loss_mlp": 1.00751352, + "epoch": 0.4946133128126202, + "flos": 588719869440.0, + "grad_norm": 0.030724614795269025, + "language_loss": 0.86645854, + "learning_rate": 0.0005329994809049451, + "loss": 0.87694681, + "num_input_tokens_seen": 214220112, + "router_z_loss_mlp": 0.41333008, + "step": 2571, + "time_per_iteration": 2.717759847640991 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044319, + "balance_loss_mlp": 1.00297725, + "epoch": 0.4948056944978838, + "flos": 584847140352.0, + "grad_norm": 0.02937251460087377, + "language_loss": 0.88108343, + "learning_rate": 0.0005326886114676375, + "loss": 0.89152658, + "num_input_tokens_seen": 214294480, + "router_z_loss_mlp": 0.41357422, + "step": 2572, + "time_per_iteration": 2.767547369003296 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043439, + "balance_loss_mlp": 1.00207376, + "epoch": 0.49499807618314734, + "flos": 482781689856.0, + "grad_norm": 0.032763972727654474, + "language_loss": 0.88217831, + "learning_rate": 0.0005323777293394854, + "loss": 0.8926127, + "num_input_tokens_seen": 214359568, + "router_z_loss_mlp": 0.41381836, + "step": 2573, + "time_per_iteration": 2.557117223739624 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044617, + "balance_loss_mlp": 1.00318027, + "epoch": 0.4951904578684109, + "flos": 520038232320.0, + "grad_norm": 0.044201740478413694, + "language_loss": 0.82535017, + "learning_rate": 0.000532066834641184, + "loss": 0.83579636, + "num_input_tokens_seen": 214432032, + "router_z_loss_mlp": 0.41455078, + "step": 2574, + "time_per_iteration": 2.6565427780151367 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043705, + "balance_loss_mlp": 1.00202954, + "epoch": 0.4953828395536745, + "flos": 536578010880.0, + "grad_norm": 0.03171877270725238, + "language_loss": 0.85277009, + "learning_rate": 0.0005317559274934334, + "loss": 0.8632071, + "num_input_tokens_seen": 214504096, + "router_z_loss_mlp": 0.41699219, + "step": 2575, + "time_per_iteration": 2.720740795135498 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048187, + "balance_loss_mlp": 1.00653565, + "epoch": 0.49557522123893805, + "flos": 529607294208.0, + "grad_norm": 0.03640176927698583, + "language_loss": 0.81348443, + "learning_rate": 0.0005314450080169382, + "loss": 0.82396632, + "num_input_tokens_seen": 214575920, + "router_z_loss_mlp": 0.41674805, + "step": 2576, + "time_per_iteration": 2.6694118976593018 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048974, + "balance_loss_mlp": 1.00729847, + "epoch": 0.49576760292420163, + "flos": 428918294784.0, + "grad_norm": 0.03343170538339807, + "language_loss": 0.81225574, + "learning_rate": 0.0005311340763324083, + "loss": 0.82274544, + "num_input_tokens_seen": 214641664, + "router_z_loss_mlp": 0.41699219, + "step": 2577, + "time_per_iteration": 2.5676074028015137 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050384, + "balance_loss_mlp": 1.00866091, + "epoch": 0.49595998460946517, + "flos": 566316562176.0, + "grad_norm": 0.031028578783915843, + "language_loss": 0.83262658, + "learning_rate": 0.0005308231325605578, + "loss": 0.84313035, + "num_input_tokens_seen": 214711744, + "router_z_loss_mlp": 0.41748047, + "step": 2578, + "time_per_iteration": 2.6750431060791016 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050534, + "balance_loss_mlp": 1.00893033, + "epoch": 0.49615236629472875, + "flos": 703814951424.0, + "grad_norm": 0.16493684193156796, + "language_loss": 0.7742933, + "learning_rate": 0.0005305121768221061, + "loss": 0.78479862, + "num_input_tokens_seen": 214802256, + "router_z_loss_mlp": 0.41625977, + "step": 2579, + "time_per_iteration": 3.083477020263672 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047222, + "balance_loss_mlp": 1.00688171, + "epoch": 0.4963447479799923, + "flos": 1444755209472.0, + "grad_norm": 0.004557610476670616, + "language_loss": 0.75038326, + "learning_rate": 0.000530201209237777, + "loss": 0.76085544, + "num_input_tokens_seen": 215023648, + "router_z_loss_mlp": 0.40332031, + "step": 2580, + "time_per_iteration": 4.820146083831787 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047965, + "balance_loss_mlp": 1.00602686, + "epoch": 0.49653712966525587, + "flos": 538664074752.0, + "grad_norm": 0.031551785699882776, + "language_loss": 0.92325974, + "learning_rate": 0.0005298902299282984, + "loss": 0.93373942, + "num_input_tokens_seen": 215094080, + "router_z_loss_mlp": 0.41967773, + "step": 2581, + "time_per_iteration": 2.619842529296875 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050513, + "balance_loss_mlp": 1.00840831, + "epoch": 0.4967295113505194, + "flos": 608396519424.0, + "grad_norm": 0.03377113658216861, + "language_loss": 0.8488903, + "learning_rate": 0.0005295792390144033, + "loss": 0.8593955, + "num_input_tokens_seen": 215165456, + "router_z_loss_mlp": 0.42138672, + "step": 2582, + "time_per_iteration": 2.722321033477783 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050969, + "balance_loss_mlp": 1.00872111, + "epoch": 0.496921893035783, + "flos": 475531016448.0, + "grad_norm": 0.04081472802053015, + "language_loss": 0.84166956, + "learning_rate": 0.0005292682366168294, + "loss": 0.85217929, + "num_input_tokens_seen": 215229344, + "router_z_loss_mlp": 0.42285156, + "step": 2583, + "time_per_iteration": 2.5314435958862305 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104609, + "balance_loss_mlp": 1.00393724, + "epoch": 0.4971142747210466, + "flos": 598603881216.0, + "grad_norm": 0.03300753756436905, + "language_loss": 0.80573511, + "learning_rate": 0.0005289572228563181, + "loss": 0.81619596, + "num_input_tokens_seen": 215305616, + "router_z_loss_mlp": 0.421875, + "step": 2584, + "time_per_iteration": 2.7332074642181396 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050666, + "balance_loss_mlp": 1.00846612, + "epoch": 0.4973066564063101, + "flos": 600735631872.0, + "grad_norm": 0.03199938195942058, + "language_loss": 0.83498567, + "learning_rate": 0.000528646197853616, + "loss": 0.8454923, + "num_input_tokens_seen": 215378128, + "router_z_loss_mlp": 0.42236328, + "step": 2585, + "time_per_iteration": 2.748955249786377 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051581, + "balance_loss_mlp": 1.00938058, + "epoch": 0.4974990380915737, + "flos": 650770039296.0, + "grad_norm": 0.03327645798274956, + "language_loss": 0.86559486, + "learning_rate": 0.0005283351617294735, + "loss": 0.87611067, + "num_input_tokens_seen": 215453536, + "router_z_loss_mlp": 0.42236328, + "step": 2586, + "time_per_iteration": 2.9175055027008057 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051655, + "balance_loss_mlp": 1.01093292, + "epoch": 0.49769141977683723, + "flos": 1532442767616.0, + "grad_norm": 0.005920405298637117, + "language_loss": 0.7663666, + "learning_rate": 0.0005280241146046456, + "loss": 0.77688324, + "num_input_tokens_seen": 215689440, + "router_z_loss_mlp": 0.40722656, + "step": 2587, + "time_per_iteration": 4.992246627807617 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051641, + "balance_loss_mlp": 1.00936949, + "epoch": 0.4978838014621008, + "flos": 537398439168.0, + "grad_norm": 0.03485872476270145, + "language_loss": 0.87171799, + "learning_rate": 0.0005277130565998916, + "loss": 0.88223433, + "num_input_tokens_seen": 215759600, + "router_z_loss_mlp": 0.4230957, + "step": 2588, + "time_per_iteration": 2.7742838859558105 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048839, + "balance_loss_mlp": 1.00666261, + "epoch": 0.49807618314736435, + "flos": 540746247936.0, + "grad_norm": 0.02719767735149213, + "language_loss": 0.82424593, + "learning_rate": 0.0005274019878359748, + "loss": 0.83473426, + "num_input_tokens_seen": 215833920, + "router_z_loss_mlp": 0.42211914, + "step": 2589, + "time_per_iteration": 2.7111029624938965 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049239, + "balance_loss_mlp": 1.00699103, + "epoch": 0.49826856483262794, + "flos": 543522482688.0, + "grad_norm": 0.03488772819740132, + "language_loss": 0.87582624, + "learning_rate": 0.0005270909084336628, + "loss": 0.88631868, + "num_input_tokens_seen": 215903616, + "router_z_loss_mlp": 0.42285156, + "step": 2590, + "time_per_iteration": 2.6801702976226807 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051383, + "balance_loss_mlp": 1.00911105, + "epoch": 0.4984609465178915, + "flos": 523361741568.0, + "grad_norm": 0.03538182267925601, + "language_loss": 0.89689445, + "learning_rate": 0.0005267798185137276, + "loss": 0.90740824, + "num_input_tokens_seen": 215974832, + "router_z_loss_mlp": 0.4230957, + "step": 2591, + "time_per_iteration": 2.673933506011963 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048091, + "balance_loss_mlp": 1.00577164, + "epoch": 0.49865332820315506, + "flos": 575705789184.0, + "grad_norm": 0.03191547825845594, + "language_loss": 0.90023857, + "learning_rate": 0.0005264687181969444, + "loss": 0.91071951, + "num_input_tokens_seen": 216045024, + "router_z_loss_mlp": 0.42358398, + "step": 2592, + "time_per_iteration": 2.729825735092163 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047144, + "balance_loss_mlp": 1.00484908, + "epoch": 0.49884570988841864, + "flos": 1015211884032.0, + "grad_norm": 0.03571151562514848, + "language_loss": 0.75975507, + "learning_rate": 0.0005261576076040937, + "loss": 0.77022654, + "num_input_tokens_seen": 216129024, + "router_z_loss_mlp": 0.42333984, + "step": 2593, + "time_per_iteration": 3.284675359725952 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047205, + "balance_loss_mlp": 1.00502849, + "epoch": 0.4990380915736822, + "flos": 560648419584.0, + "grad_norm": 0.032935336602121515, + "language_loss": 0.84734505, + "learning_rate": 0.0005258464868559591, + "loss": 0.85781705, + "num_input_tokens_seen": 216197648, + "router_z_loss_mlp": 0.42211914, + "step": 2594, + "time_per_iteration": 2.638974905014038 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049735, + "balance_loss_mlp": 1.00772595, + "epoch": 0.49923047325894576, + "flos": 499944565248.0, + "grad_norm": 0.031535831762229155, + "language_loss": 0.89198703, + "learning_rate": 0.0005255353560733284, + "loss": 0.90248442, + "num_input_tokens_seen": 216263904, + "router_z_loss_mlp": 0.42041016, + "step": 2595, + "time_per_iteration": 2.5665078163146973 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044674, + "balance_loss_mlp": 1.00414276, + "epoch": 0.4994228549442093, + "flos": 1499790921216.0, + "grad_norm": 0.005502914482473529, + "language_loss": 0.75578642, + "learning_rate": 0.0005252242153769931, + "loss": 0.76623321, + "num_input_tokens_seen": 216493152, + "router_z_loss_mlp": 0.40527344, + "step": 2596, + "time_per_iteration": 4.774062395095825 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050249, + "balance_loss_mlp": 1.0082401, + "epoch": 0.4996152366294729, + "flos": 558514723584.0, + "grad_norm": 0.032060383149289634, + "language_loss": 0.83672047, + "learning_rate": 0.0005249130648877492, + "loss": 0.84722298, + "num_input_tokens_seen": 216567216, + "router_z_loss_mlp": 0.42041016, + "step": 2597, + "time_per_iteration": 2.7558000087738037 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051517, + "balance_loss_mlp": 1.00950754, + "epoch": 0.4998076183147364, + "flos": 416483569920.0, + "grad_norm": 0.036130927396763525, + "language_loss": 0.85007888, + "learning_rate": 0.0005246019047263953, + "loss": 0.86059409, + "num_input_tokens_seen": 216630624, + "router_z_loss_mlp": 0.42041016, + "step": 2598, + "time_per_iteration": 2.4761478900909424 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045154, + "balance_loss_mlp": 1.00300181, + "epoch": 0.5, + "flos": 468326029824.0, + "grad_norm": 0.035928472301153966, + "language_loss": 0.83319026, + "learning_rate": 0.0005242907350137353, + "loss": 0.84364176, + "num_input_tokens_seen": 216696576, + "router_z_loss_mlp": 0.421875, + "step": 2599, + "time_per_iteration": 2.551312208175659 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046402, + "balance_loss_mlp": 1.00439322, + "epoch": 0.5001923816852636, + "flos": 483756675072.0, + "grad_norm": 0.03511658446114867, + "language_loss": 0.79463625, + "learning_rate": 0.0005239795558705754, + "loss": 0.80510032, + "num_input_tokens_seen": 216767584, + "router_z_loss_mlp": 0.42041016, + "step": 2600, + "time_per_iteration": 2.6441214084625244 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044749, + "balance_loss_mlp": 1.00278771, + "epoch": 0.5003847633705272, + "flos": 534856474368.0, + "grad_norm": 0.03015144944524051, + "language_loss": 0.89835393, + "learning_rate": 0.0005236683674177264, + "loss": 0.90880144, + "num_input_tokens_seen": 216834320, + "router_z_loss_mlp": 0.41992188, + "step": 2601, + "time_per_iteration": 2.669487953186035 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049517, + "balance_loss_mlp": 1.00746012, + "epoch": 0.5005771450557907, + "flos": 739056394752.0, + "grad_norm": 0.03236196452732128, + "language_loss": 0.82869333, + "learning_rate": 0.0005233571697760021, + "loss": 0.83918852, + "num_input_tokens_seen": 216907312, + "router_z_loss_mlp": 0.42089844, + "step": 2602, + "time_per_iteration": 2.85748028755188 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044698, + "balance_loss_mlp": 1.00264096, + "epoch": 0.5007695267410542, + "flos": 780307175424.0, + "grad_norm": 0.03720253600362933, + "language_loss": 0.83658135, + "learning_rate": 0.0005230459630662203, + "loss": 0.84702832, + "num_input_tokens_seen": 216979872, + "router_z_loss_mlp": 0.42089844, + "step": 2603, + "time_per_iteration": 2.9300596714019775 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045575, + "balance_loss_mlp": 1.00358939, + "epoch": 0.5009619084263178, + "flos": 624619402752.0, + "grad_norm": 0.038089595528021734, + "language_loss": 0.82175541, + "learning_rate": 0.0005227347474092022, + "loss": 0.83221114, + "num_input_tokens_seen": 217054000, + "router_z_loss_mlp": 0.42016602, + "step": 2604, + "time_per_iteration": 2.7056775093078613 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048274, + "balance_loss_mlp": 1.00621724, + "epoch": 0.5011542901115814, + "flos": 532193000448.0, + "grad_norm": 0.026542730624890497, + "language_loss": 0.84019673, + "learning_rate": 0.0005224235229257724, + "loss": 0.85067946, + "num_input_tokens_seen": 217126784, + "router_z_loss_mlp": 0.42089844, + "step": 2605, + "time_per_iteration": 2.6953065395355225 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048861, + "balance_loss_mlp": 1.00680435, + "epoch": 0.5013466717968449, + "flos": 528628418304.0, + "grad_norm": 0.028335807962849974, + "language_loss": 0.87261045, + "learning_rate": 0.0005221122897367589, + "loss": 0.88309902, + "num_input_tokens_seen": 217203056, + "router_z_loss_mlp": 0.42089844, + "step": 2606, + "time_per_iteration": 2.7901618480682373 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051336, + "balance_loss_mlp": 1.00939834, + "epoch": 0.5015390534821085, + "flos": 567089358336.0, + "grad_norm": 0.03672669743645021, + "language_loss": 0.81618142, + "learning_rate": 0.0005218010479629932, + "loss": 0.82669473, + "num_input_tokens_seen": 217273280, + "router_z_loss_mlp": 0.41967773, + "step": 2607, + "time_per_iteration": 2.6298229694366455 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047474, + "balance_loss_mlp": 1.00551248, + "epoch": 0.5017314351673721, + "flos": 567768835584.0, + "grad_norm": 0.038374388481505664, + "language_loss": 0.82467473, + "learning_rate": 0.0005214897977253102, + "loss": 0.83514941, + "num_input_tokens_seen": 217345568, + "router_z_loss_mlp": 0.41992188, + "step": 2608, + "time_per_iteration": 2.6571240425109863 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044229, + "balance_loss_mlp": 1.00231516, + "epoch": 0.5019238168526357, + "flos": 523387986432.0, + "grad_norm": 0.030375370520194293, + "language_loss": 0.84678638, + "learning_rate": 0.0005211785391445473, + "loss": 0.85722864, + "num_input_tokens_seen": 217422848, + "router_z_loss_mlp": 0.41943359, + "step": 2609, + "time_per_iteration": 2.7354485988616943 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045007, + "balance_loss_mlp": 1.00309336, + "epoch": 0.5021161985378992, + "flos": 642637699584.0, + "grad_norm": 0.0345609683707489, + "language_loss": 0.80034763, + "learning_rate": 0.0005208672723415467, + "loss": 0.81079769, + "num_input_tokens_seen": 217502896, + "router_z_loss_mlp": 0.41943359, + "step": 2610, + "time_per_iteration": 2.8003506660461426 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104625, + "balance_loss_mlp": 1.00431252, + "epoch": 0.5023085802231627, + "flos": 592423457280.0, + "grad_norm": 0.034384432252957974, + "language_loss": 0.79919124, + "learning_rate": 0.0005205559974371525, + "loss": 0.8096537, + "num_input_tokens_seen": 217575072, + "router_z_loss_mlp": 0.41967773, + "step": 2611, + "time_per_iteration": 2.801931142807007 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044798, + "balance_loss_mlp": 1.00283635, + "epoch": 0.5025009619084263, + "flos": 473334137088.0, + "grad_norm": 0.0314075616675113, + "language_loss": 0.83085155, + "learning_rate": 0.0005202447145522123, + "loss": 0.84129953, + "num_input_tokens_seen": 217644976, + "router_z_loss_mlp": 0.41992188, + "step": 2612, + "time_per_iteration": 2.7084405422210693 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104663, + "balance_loss_mlp": 1.00476372, + "epoch": 0.5026933435936899, + "flos": 456077942784.0, + "grad_norm": 0.03248187925620893, + "language_loss": 0.79969329, + "learning_rate": 0.0005199334238075769, + "loss": 0.81015956, + "num_input_tokens_seen": 217712816, + "router_z_loss_mlp": 0.41894531, + "step": 2613, + "time_per_iteration": 2.5416245460510254 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045641, + "balance_loss_mlp": 1.00367975, + "epoch": 0.5028857252789535, + "flos": 492722082048.0, + "grad_norm": 0.030734349084793038, + "language_loss": 0.92369366, + "learning_rate": 0.0005196221253241, + "loss": 0.93415004, + "num_input_tokens_seen": 217780256, + "router_z_loss_mlp": 0.41992188, + "step": 2614, + "time_per_iteration": 2.5504183769226074 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045477, + "balance_loss_mlp": 1.00344431, + "epoch": 0.503078106964217, + "flos": 626731711488.0, + "grad_norm": 0.0333228394962432, + "language_loss": 0.83482671, + "learning_rate": 0.0005193108192226383, + "loss": 0.84528148, + "num_input_tokens_seen": 217848496, + "router_z_loss_mlp": 0.4206543, + "step": 2615, + "time_per_iteration": 2.7415342330932617 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046371, + "balance_loss_mlp": 1.00445676, + "epoch": 0.5032704886494805, + "flos": 580138431744.0, + "grad_norm": 0.028161477664975402, + "language_loss": 0.87796414, + "learning_rate": 0.000518999505624052, + "loss": 0.88842779, + "num_input_tokens_seen": 217919216, + "router_z_loss_mlp": 0.41943359, + "step": 2616, + "time_per_iteration": 2.703958749771118 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044834, + "balance_loss_mlp": 1.00289583, + "epoch": 0.5034628703347441, + "flos": 472846155264.0, + "grad_norm": 0.026579731156649716, + "language_loss": 0.83874726, + "learning_rate": 0.000518688184649203, + "loss": 0.84919554, + "num_input_tokens_seen": 217996096, + "router_z_loss_mlp": 0.41967773, + "step": 2617, + "time_per_iteration": 2.7804102897644043 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046877, + "balance_loss_mlp": 1.00501108, + "epoch": 0.5036552520200077, + "flos": 490813907712.0, + "grad_norm": 0.028739225931260208, + "language_loss": 0.84081781, + "learning_rate": 0.0005183768564189577, + "loss": 0.85128659, + "num_input_tokens_seen": 218063072, + "router_z_loss_mlp": 0.41894531, + "step": 2618, + "time_per_iteration": 2.559967517852783 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049251, + "balance_loss_mlp": 1.00724185, + "epoch": 0.5038476337052713, + "flos": 495216414720.0, + "grad_norm": 0.040417435174145346, + "language_loss": 0.82122672, + "learning_rate": 0.0005180655210541838, + "loss": 0.83171928, + "num_input_tokens_seen": 218131056, + "router_z_loss_mlp": 0.42041016, + "step": 2619, + "time_per_iteration": 2.569495677947998 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046465, + "balance_loss_mlp": 1.00471759, + "epoch": 0.5040400153905348, + "flos": 601740752640.0, + "grad_norm": 0.03616333015321602, + "language_loss": 0.83923668, + "learning_rate": 0.0005177541786757527, + "loss": 0.84970129, + "num_input_tokens_seen": 218203536, + "router_z_loss_mlp": 0.41772461, + "step": 2620, + "time_per_iteration": 2.7744040489196777 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048715, + "balance_loss_mlp": 1.0068723, + "epoch": 0.5042323970757984, + "flos": 812920137984.0, + "grad_norm": 0.03309299686066053, + "language_loss": 0.83304209, + "learning_rate": 0.000517442829404538, + "loss": 0.84352922, + "num_input_tokens_seen": 218283008, + "router_z_loss_mlp": 0.41870117, + "step": 2621, + "time_per_iteration": 2.97257137298584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048042, + "balance_loss_mlp": 1.00610471, + "epoch": 0.504424778761062, + "flos": 628607804928.0, + "grad_norm": 0.035914844760130495, + "language_loss": 0.87778026, + "learning_rate": 0.0005171314733614166, + "loss": 0.88826072, + "num_input_tokens_seen": 218362096, + "router_z_loss_mlp": 0.41967773, + "step": 2622, + "time_per_iteration": 2.8732259273529053 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051056, + "balance_loss_mlp": 1.0091418, + "epoch": 0.5046171604463255, + "flos": 516957741312.0, + "grad_norm": 0.03505567711141955, + "language_loss": 0.79205, + "learning_rate": 0.0005168201106672671, + "loss": 0.80256051, + "num_input_tokens_seen": 218439440, + "router_z_loss_mlp": 0.41943359, + "step": 2623, + "time_per_iteration": 2.773688316345215 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047941, + "balance_loss_mlp": 1.00590754, + "epoch": 0.504809542131589, + "flos": 528853939968.0, + "grad_norm": 0.0377301000829576, + "language_loss": 0.8564831, + "learning_rate": 0.0005165087414429717, + "loss": 0.86696255, + "num_input_tokens_seen": 218505936, + "router_z_loss_mlp": 0.4206543, + "step": 2624, + "time_per_iteration": 2.6755454540252686 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051892, + "balance_loss_mlp": 1.0100261, + "epoch": 0.5050019238168526, + "flos": 555175663104.0, + "grad_norm": 0.03350143092818485, + "language_loss": 0.83751678, + "learning_rate": 0.0005161973658094144, + "loss": 0.84803575, + "num_input_tokens_seen": 218573824, + "router_z_loss_mlp": 0.41894531, + "step": 2625, + "time_per_iteration": 2.6260385513305664 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105232, + "balance_loss_mlp": 1.01057339, + "epoch": 0.5051943055021162, + "flos": 575929365504.0, + "grad_norm": 0.030667351452066165, + "language_loss": 0.83093894, + "learning_rate": 0.000515885983887482, + "loss": 0.84146214, + "num_input_tokens_seen": 218648016, + "router_z_loss_mlp": 0.41772461, + "step": 2626, + "time_per_iteration": 2.7437500953674316 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104859, + "balance_loss_mlp": 1.00686646, + "epoch": 0.5053866871873798, + "flos": 497682557184.0, + "grad_norm": 0.033924054159163435, + "language_loss": 0.84715843, + "learning_rate": 0.0005155745957980636, + "loss": 0.85764432, + "num_input_tokens_seen": 218714128, + "router_z_loss_mlp": 0.41748047, + "step": 2627, + "time_per_iteration": 2.625260353088379 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048009, + "balance_loss_mlp": 1.00638068, + "epoch": 0.5055790688726434, + "flos": 503220442368.0, + "grad_norm": 0.03037314022037546, + "language_loss": 0.89067703, + "learning_rate": 0.000515263201662051, + "loss": 0.90115714, + "num_input_tokens_seen": 218784800, + "router_z_loss_mlp": 0.41650391, + "step": 2628, + "time_per_iteration": 2.68068265914917 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047259, + "balance_loss_mlp": 1.00565541, + "epoch": 0.5057714505579068, + "flos": 846768600576.0, + "grad_norm": 0.031311962044338205, + "language_loss": 0.83074951, + "learning_rate": 0.0005149518016003378, + "loss": 0.84122205, + "num_input_tokens_seen": 218868256, + "router_z_loss_mlp": 0.41625977, + "step": 2629, + "time_per_iteration": 3.208085060119629 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048667, + "balance_loss_mlp": 1.00720644, + "epoch": 0.5059638322431704, + "flos": 498809187072.0, + "grad_norm": 0.03517894489413756, + "language_loss": 0.82677329, + "learning_rate": 0.0005146403957338206, + "loss": 0.83725995, + "num_input_tokens_seen": 218932496, + "router_z_loss_mlp": 0.41479492, + "step": 2630, + "time_per_iteration": 2.5591788291931152 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045923, + "balance_loss_mlp": 1.0044378, + "epoch": 0.506156213928434, + "flos": 619114565376.0, + "grad_norm": 0.029747387185900163, + "language_loss": 0.82375658, + "learning_rate": 0.0005143289841833975, + "loss": 0.83421576, + "num_input_tokens_seen": 219010672, + "router_z_loss_mlp": 0.41503906, + "step": 2631, + "time_per_iteration": 2.8919997215270996 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045386, + "balance_loss_mlp": 1.00394928, + "epoch": 0.5063485956136976, + "flos": 425790171648.0, + "grad_norm": 0.040524041139339724, + "language_loss": 0.82811654, + "learning_rate": 0.0005140175670699696, + "loss": 0.83857036, + "num_input_tokens_seen": 219077104, + "router_z_loss_mlp": 0.41455078, + "step": 2632, + "time_per_iteration": 2.6062378883361816 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045222, + "balance_loss_mlp": 1.0038327, + "epoch": 0.5065409772989612, + "flos": 571070957568.0, + "grad_norm": 0.026263595366118216, + "language_loss": 0.83201623, + "learning_rate": 0.0005137061445144395, + "loss": 0.84246838, + "num_input_tokens_seen": 219164880, + "router_z_loss_mlp": 0.4140625, + "step": 2633, + "time_per_iteration": 2.9138190746307373 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044282, + "balance_loss_mlp": 1.00282133, + "epoch": 0.5067333589842247, + "flos": 629970650112.0, + "grad_norm": 0.032671607566671305, + "language_loss": 0.87714005, + "learning_rate": 0.000513394716637712, + "loss": 0.8875829, + "num_input_tokens_seen": 219237376, + "router_z_loss_mlp": 0.41479492, + "step": 2634, + "time_per_iteration": 2.7618257999420166 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044567, + "balance_loss_mlp": 1.00422668, + "epoch": 0.5069257406694883, + "flos": 1451098938624.0, + "grad_norm": 0.004578936312393245, + "language_loss": 0.79191709, + "learning_rate": 0.0005130832835606946, + "loss": 0.8023628, + "num_input_tokens_seen": 219467632, + "router_z_loss_mlp": 0.40332031, + "step": 2635, + "time_per_iteration": 4.85358738899231 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050199, + "balance_loss_mlp": 1.00869, + "epoch": 0.5071181223547518, + "flos": 640058796288.0, + "grad_norm": 0.03342633817994969, + "language_loss": 0.81428993, + "learning_rate": 0.0005127718454042958, + "loss": 0.82479185, + "num_input_tokens_seen": 219545392, + "router_z_loss_mlp": 0.4152832, + "step": 2636, + "time_per_iteration": 2.8021318912506104 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044553, + "balance_loss_mlp": 1.00304461, + "epoch": 0.5073105040400154, + "flos": 714873225216.0, + "grad_norm": 0.031182962990379204, + "language_loss": 0.85094464, + "learning_rate": 0.0005124604022894269, + "loss": 0.86139023, + "num_input_tokens_seen": 219623104, + "router_z_loss_mlp": 0.4152832, + "step": 2637, + "time_per_iteration": 2.934414863586426 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047203, + "balance_loss_mlp": 1.00676727, + "epoch": 0.5075028857252789, + "flos": 1439614899456.0, + "grad_norm": 0.007557162842452459, + "language_loss": 0.77188224, + "learning_rate": 0.000512148954337001, + "loss": 0.7823543, + "num_input_tokens_seen": 219853328, + "router_z_loss_mlp": 0.40429688, + "step": 2638, + "time_per_iteration": 4.820345878601074 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104601, + "balance_loss_mlp": 1.00435817, + "epoch": 0.5076952674105425, + "flos": 572308402944.0, + "grad_norm": 0.03427455588588844, + "language_loss": 0.83839953, + "learning_rate": 0.0005118375016679325, + "loss": 0.84885961, + "num_input_tokens_seen": 219925024, + "router_z_loss_mlp": 0.41674805, + "step": 2639, + "time_per_iteration": 2.753891706466675 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104483, + "balance_loss_mlp": 1.00327373, + "epoch": 0.5078876490958061, + "flos": 517713040896.0, + "grad_norm": 0.0397313189962262, + "language_loss": 0.81205344, + "learning_rate": 0.0005115260444031382, + "loss": 0.82250178, + "num_input_tokens_seen": 219992752, + "router_z_loss_mlp": 0.41577148, + "step": 2640, + "time_per_iteration": 2.5884034633636475 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104464, + "balance_loss_mlp": 1.0042038, + "epoch": 0.5080800307810697, + "flos": 1587622342656.0, + "grad_norm": 0.00452780467183982, + "language_loss": 0.78731823, + "learning_rate": 0.000511214582663537, + "loss": 0.79776466, + "num_input_tokens_seen": 220224160, + "router_z_loss_mlp": 0.40429688, + "step": 2641, + "time_per_iteration": 5.021141290664673 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048765, + "balance_loss_mlp": 1.0071131, + "epoch": 0.5082724124663333, + "flos": 486187824384.0, + "grad_norm": 0.03665123216497768, + "language_loss": 0.87927556, + "learning_rate": 0.0005109031165700483, + "loss": 0.88976324, + "num_input_tokens_seen": 220289504, + "router_z_loss_mlp": 0.41674805, + "step": 2642, + "time_per_iteration": 2.564768075942993 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044813, + "balance_loss_mlp": 1.00313723, + "epoch": 0.5084647941515967, + "flos": 683443272960.0, + "grad_norm": 0.03222315683418769, + "language_loss": 0.84105259, + "learning_rate": 0.0005105916462435945, + "loss": 0.85150075, + "num_input_tokens_seen": 220361376, + "router_z_loss_mlp": 0.41699219, + "step": 2643, + "time_per_iteration": 2.8432576656341553 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046727, + "balance_loss_mlp": 1.0049082, + "epoch": 0.5086571758368603, + "flos": 549813722112.0, + "grad_norm": 0.031341979306324576, + "language_loss": 0.85911554, + "learning_rate": 0.0005102801718050989, + "loss": 0.86958289, + "num_input_tokens_seen": 220434720, + "router_z_loss_mlp": 0.41845703, + "step": 2644, + "time_per_iteration": 2.7012667655944824 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104828, + "balance_loss_mlp": 1.00658011, + "epoch": 0.5088495575221239, + "flos": 565079116800.0, + "grad_norm": 0.03553781912080262, + "language_loss": 0.89604807, + "learning_rate": 0.0005099686933754867, + "loss": 0.90653086, + "num_input_tokens_seen": 220506208, + "router_z_loss_mlp": 0.41723633, + "step": 2645, + "time_per_iteration": 2.774092197418213 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047355, + "balance_loss_mlp": 1.00551212, + "epoch": 0.5090419392073875, + "flos": 552512189184.0, + "grad_norm": 0.03374447512064937, + "language_loss": 0.84807706, + "learning_rate": 0.0005096572110756845, + "loss": 0.85855055, + "num_input_tokens_seen": 220577456, + "router_z_loss_mlp": 0.41870117, + "step": 2646, + "time_per_iteration": 2.691534996032715 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050517, + "balance_loss_mlp": 1.00857961, + "epoch": 0.509234320892651, + "flos": 568884771840.0, + "grad_norm": 0.0280586539552875, + "language_loss": 0.86222303, + "learning_rate": 0.0005093457250266205, + "loss": 0.87272823, + "num_input_tokens_seen": 220649648, + "router_z_loss_mlp": 0.41967773, + "step": 2647, + "time_per_iteration": 2.669032573699951 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049346, + "balance_loss_mlp": 1.00750375, + "epoch": 0.5094267025779146, + "flos": 583694265600.0, + "grad_norm": 0.03456739808544309, + "language_loss": 0.83707237, + "learning_rate": 0.000509034235349224, + "loss": 0.84756589, + "num_input_tokens_seen": 220721168, + "router_z_loss_mlp": 0.41870117, + "step": 2648, + "time_per_iteration": 2.7174429893493652 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048368, + "balance_loss_mlp": 1.00657344, + "epoch": 0.5096190842631781, + "flos": 593139873024.0, + "grad_norm": 0.03190176036185227, + "language_loss": 0.81830442, + "learning_rate": 0.0005087227421644266, + "loss": 0.82878816, + "num_input_tokens_seen": 220796464, + "router_z_loss_mlp": 0.41821289, + "step": 2649, + "time_per_iteration": 2.730527877807617 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104451, + "balance_loss_mlp": 1.00278723, + "epoch": 0.5098114659484417, + "flos": 514584917760.0, + "grad_norm": 0.03166339002539628, + "language_loss": 0.86503744, + "learning_rate": 0.0005084112455931602, + "loss": 0.87548256, + "num_input_tokens_seen": 220862976, + "router_z_loss_mlp": 0.41748047, + "step": 2650, + "time_per_iteration": 2.588543176651001 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048046, + "balance_loss_mlp": 1.00627494, + "epoch": 0.5100038476337053, + "flos": 485601666048.0, + "grad_norm": 0.03514605484852806, + "language_loss": 0.85810292, + "learning_rate": 0.0005080997457563586, + "loss": 0.86858344, + "num_input_tokens_seen": 220926432, + "router_z_loss_mlp": 0.41796875, + "step": 2651, + "time_per_iteration": 2.547510862350464 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053822, + "balance_loss_mlp": 1.01214612, + "epoch": 0.5101962293189688, + "flos": 462555820032.0, + "grad_norm": 0.03981395249249623, + "language_loss": 0.79794431, + "learning_rate": 0.0005077882427749569, + "loss": 0.80848241, + "num_input_tokens_seen": 220993008, + "router_z_loss_mlp": 0.41699219, + "step": 2652, + "time_per_iteration": 2.5867154598236084 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052855, + "balance_loss_mlp": 1.0111798, + "epoch": 0.5103886110042324, + "flos": 588133711104.0, + "grad_norm": 0.03576387090025985, + "language_loss": 0.8527801, + "learning_rate": 0.0005074767367698913, + "loss": 0.86330867, + "num_input_tokens_seen": 221059248, + "router_z_loss_mlp": 0.41699219, + "step": 2653, + "time_per_iteration": 2.668619155883789 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052633, + "balance_loss_mlp": 1.01083803, + "epoch": 0.510580992689496, + "flos": 846679172352.0, + "grad_norm": 0.03324234024932545, + "language_loss": 0.84336531, + "learning_rate": 0.0005071652278620988, + "loss": 0.85389161, + "num_input_tokens_seen": 221133712, + "router_z_loss_mlp": 0.41821289, + "step": 2654, + "time_per_iteration": 3.0502736568450928 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052226, + "balance_loss_mlp": 1.01043141, + "epoch": 0.5107733743747596, + "flos": 659811268608.0, + "grad_norm": 0.033221976859431776, + "language_loss": 0.83371234, + "learning_rate": 0.0005068537161725186, + "loss": 0.84423465, + "num_input_tokens_seen": 221202192, + "router_z_loss_mlp": 0.41821289, + "step": 2655, + "time_per_iteration": 2.7732832431793213 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049493, + "balance_loss_mlp": 1.00784123, + "epoch": 0.510965756060023, + "flos": 702961475328.0, + "grad_norm": 0.03652104464060243, + "language_loss": 0.84970605, + "learning_rate": 0.0005065422018220893, + "loss": 0.860201, + "num_input_tokens_seen": 221277104, + "router_z_loss_mlp": 0.41674805, + "step": 2656, + "time_per_iteration": 2.8670201301574707 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045888, + "balance_loss_mlp": 1.00430822, + "epoch": 0.5111581377452866, + "flos": 560941982208.0, + "grad_norm": 0.03459233510222537, + "language_loss": 0.80690587, + "learning_rate": 0.0005062306849317521, + "loss": 0.81736469, + "num_input_tokens_seen": 221352320, + "router_z_loss_mlp": 0.41601562, + "step": 2657, + "time_per_iteration": 2.8002302646636963 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043485, + "balance_loss_mlp": 1.00202358, + "epoch": 0.5113505194305502, + "flos": 610146246144.0, + "grad_norm": 0.03554743150534212, + "language_loss": 0.83936596, + "learning_rate": 0.0005059191656224487, + "loss": 0.84980083, + "num_input_tokens_seen": 221421056, + "router_z_loss_mlp": 0.41479492, + "step": 2658, + "time_per_iteration": 2.716935157775879 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045233, + "balance_loss_mlp": 1.0037955, + "epoch": 0.5115429011158138, + "flos": 535535951616.0, + "grad_norm": 0.03199868953010379, + "language_loss": 0.89635181, + "learning_rate": 0.0005056076440151212, + "loss": 0.90680414, + "num_input_tokens_seen": 221492064, + "router_z_loss_mlp": 0.41455078, + "step": 2659, + "time_per_iteration": 2.6661012172698975 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042381, + "balance_loss_mlp": 1.0019455, + "epoch": 0.5117352828010774, + "flos": 1365275813376.0, + "grad_norm": 0.005851878799964376, + "language_loss": 0.76288116, + "learning_rate": 0.0005052961202307133, + "loss": 0.773305, + "num_input_tokens_seen": 221724672, + "router_z_loss_mlp": 0.40429688, + "step": 2660, + "time_per_iteration": 4.8821775913238525 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047021, + "balance_loss_mlp": 1.00551248, + "epoch": 0.5119276644863409, + "flos": 634931125248.0, + "grad_norm": 0.030472593638878876, + "language_loss": 0.87624103, + "learning_rate": 0.0005049845943901691, + "loss": 0.88671124, + "num_input_tokens_seen": 221800144, + "router_z_loss_mlp": 0.4152832, + "step": 2661, + "time_per_iteration": 2.868314743041992 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045807, + "balance_loss_mlp": 1.00434649, + "epoch": 0.5121200461716044, + "flos": 586781559552.0, + "grad_norm": 0.035240788892260635, + "language_loss": 0.87104362, + "learning_rate": 0.0005046730666144338, + "loss": 0.88150167, + "num_input_tokens_seen": 221877168, + "router_z_loss_mlp": 0.41479492, + "step": 2662, + "time_per_iteration": 2.7716057300567627 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044648, + "balance_loss_mlp": 1.00323498, + "epoch": 0.512312427856868, + "flos": 1034224608000.0, + "grad_norm": 0.027938837780362106, + "language_loss": 0.8826527, + "learning_rate": 0.0005043615370244532, + "loss": 0.89309919, + "num_input_tokens_seen": 221964208, + "router_z_loss_mlp": 0.41430664, + "step": 2663, + "time_per_iteration": 3.4280622005462646 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046261, + "balance_loss_mlp": 1.00611115, + "epoch": 0.5125048095421316, + "flos": 1540901729280.0, + "grad_norm": 0.006786755652655265, + "language_loss": 0.78244388, + "learning_rate": 0.0005040500057411736, + "loss": 0.7929064, + "num_input_tokens_seen": 222179264, + "router_z_loss_mlp": 0.40136719, + "step": 2664, + "time_per_iteration": 4.68994140625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045195, + "balance_loss_mlp": 1.003901, + "epoch": 0.5126971912273951, + "flos": 592328193024.0, + "grad_norm": 0.02608573212926663, + "language_loss": 0.86075294, + "learning_rate": 0.0005037384728855425, + "loss": 0.87120485, + "num_input_tokens_seen": 222259504, + "router_z_loss_mlp": 0.41308594, + "step": 2665, + "time_per_iteration": 2.7917027473449707 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046893, + "balance_loss_mlp": 1.00552762, + "epoch": 0.5128895729126587, + "flos": 552718268928.0, + "grad_norm": 0.03821611985083245, + "language_loss": 0.85252321, + "learning_rate": 0.0005034269385785075, + "loss": 0.86299217, + "num_input_tokens_seen": 222330512, + "router_z_loss_mlp": 0.41381836, + "step": 2666, + "time_per_iteration": 2.63472318649292 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047467, + "balance_loss_mlp": 1.00605392, + "epoch": 0.5130819545979223, + "flos": 482232470016.0, + "grad_norm": 0.03834683208397515, + "language_loss": 0.85133517, + "learning_rate": 0.0005031154029410168, + "loss": 0.86180985, + "num_input_tokens_seen": 222394000, + "router_z_loss_mlp": 0.41430664, + "step": 2667, + "time_per_iteration": 2.517110824584961 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049696, + "balance_loss_mlp": 1.00837803, + "epoch": 0.5132743362831859, + "flos": 476768461824.0, + "grad_norm": 0.033096203996997774, + "language_loss": 0.87656248, + "learning_rate": 0.0005028038660940197, + "loss": 0.88705945, + "num_input_tokens_seen": 222459344, + "router_z_loss_mlp": 0.41333008, + "step": 2668, + "time_per_iteration": 2.5096347332000732 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105007, + "balance_loss_mlp": 1.00870478, + "epoch": 0.5134667179684494, + "flos": 504903095040.0, + "grad_norm": 0.028882778070319505, + "language_loss": 0.84998578, + "learning_rate": 0.0005024923281584648, + "loss": 0.86048645, + "num_input_tokens_seen": 222528912, + "router_z_loss_mlp": 0.41381836, + "step": 2669, + "time_per_iteration": 2.6474804878234863 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048807, + "balance_loss_mlp": 1.0076561, + "epoch": 0.5136590996537129, + "flos": 505005162240.0, + "grad_norm": 0.03165719334287126, + "language_loss": 0.8319236, + "learning_rate": 0.0005021807892553026, + "loss": 0.84241164, + "num_input_tokens_seen": 222604704, + "router_z_loss_mlp": 0.41162109, + "step": 2670, + "time_per_iteration": 2.7183725833892822 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044063, + "balance_loss_mlp": 1.00269723, + "epoch": 0.5138514813389765, + "flos": 625800467712.0, + "grad_norm": 0.030310171756311025, + "language_loss": 0.85420138, + "learning_rate": 0.0005018692495054828, + "loss": 0.86464202, + "num_input_tokens_seen": 222677888, + "router_z_loss_mlp": 0.41381836, + "step": 2671, + "time_per_iteration": 2.772813081741333 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043444, + "balance_loss_mlp": 1.00224543, + "epoch": 0.5140438630242401, + "flos": 584634257664.0, + "grad_norm": 0.030896406933945995, + "language_loss": 0.80988181, + "learning_rate": 0.0005015577090299561, + "loss": 0.82031626, + "num_input_tokens_seen": 222751936, + "router_z_loss_mlp": 0.41210938, + "step": 2672, + "time_per_iteration": 2.6667463779449463 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049805, + "balance_loss_mlp": 1.00858212, + "epoch": 0.5142362447095037, + "flos": 488905733376.0, + "grad_norm": 0.032429697018958814, + "language_loss": 0.87124586, + "learning_rate": 0.0005012461679496729, + "loss": 0.88174391, + "num_input_tokens_seen": 222819616, + "router_z_loss_mlp": 0.41235352, + "step": 2673, + "time_per_iteration": 2.6442089080810547 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104556, + "balance_loss_mlp": 1.00431406, + "epoch": 0.5144286263947672, + "flos": 527885757696.0, + "grad_norm": 0.03122591363863073, + "language_loss": 0.88052714, + "learning_rate": 0.0005009346263855848, + "loss": 0.89098281, + "num_input_tokens_seen": 222888448, + "router_z_loss_mlp": 0.41259766, + "step": 2674, + "time_per_iteration": 2.602527379989624 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048679, + "balance_loss_mlp": 1.00736094, + "epoch": 0.5146210080800308, + "flos": 487590520320.0, + "grad_norm": 0.029060606816111258, + "language_loss": 0.84209937, + "learning_rate": 0.0005006230844586422, + "loss": 0.85258621, + "num_input_tokens_seen": 222964736, + "router_z_loss_mlp": 0.41333008, + "step": 2675, + "time_per_iteration": 2.8685102462768555 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043387, + "balance_loss_mlp": 1.00216484, + "epoch": 0.5148133897652943, + "flos": 516975237888.0, + "grad_norm": 0.028587045609365692, + "language_loss": 0.79492688, + "learning_rate": 0.0005003115422897968, + "loss": 0.80536079, + "num_input_tokens_seen": 223040944, + "router_z_loss_mlp": 0.41235352, + "step": 2676, + "time_per_iteration": 2.765714168548584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01041586, + "balance_loss_mlp": 1.00024414, + "epoch": 0.5150057714505579, + "flos": 512212094208.0, + "grad_norm": 0.033131913333961045, + "language_loss": 0.87827182, + "learning_rate": 0.0005, + "loss": 0.88868773, + "num_input_tokens_seen": 223109632, + "router_z_loss_mlp": 0.41357422, + "step": 2677, + "time_per_iteration": 2.705502986907959 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047696, + "balance_loss_mlp": 1.00623488, + "epoch": 0.5151981531358215, + "flos": 912391133952.0, + "grad_norm": 0.03328612222334398, + "language_loss": 0.79844034, + "learning_rate": 0.0004996884577102033, + "loss": 0.80891728, + "num_input_tokens_seen": 223191648, + "router_z_loss_mlp": 0.41479492, + "step": 2678, + "time_per_iteration": 3.112602949142456 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049572, + "balance_loss_mlp": 1.00801528, + "epoch": 0.515390534821085, + "flos": 472930725888.0, + "grad_norm": 0.03414850275815592, + "language_loss": 0.85192269, + "learning_rate": 0.000499376915541358, + "loss": 0.86241841, + "num_input_tokens_seen": 223265920, + "router_z_loss_mlp": 0.41577148, + "step": 2679, + "time_per_iteration": 2.732088565826416 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046331, + "balance_loss_mlp": 1.00475073, + "epoch": 0.5155829165063486, + "flos": 651358142976.0, + "grad_norm": 0.0316115868451719, + "language_loss": 0.81490767, + "learning_rate": 0.0004990653736144155, + "loss": 0.82537097, + "num_input_tokens_seen": 223340688, + "router_z_loss_mlp": 0.41601562, + "step": 2680, + "time_per_iteration": 2.9006052017211914 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104584, + "balance_loss_mlp": 1.00425994, + "epoch": 0.5157752981916122, + "flos": 415161553920.0, + "grad_norm": 0.034873868180568895, + "language_loss": 0.86566359, + "learning_rate": 0.0004987538320503271, + "loss": 0.876122, + "num_input_tokens_seen": 223404064, + "router_z_loss_mlp": 0.41601562, + "step": 2681, + "time_per_iteration": 2.5385584831237793 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049108, + "balance_loss_mlp": 1.00750434, + "epoch": 0.5159676798768758, + "flos": 554932644864.0, + "grad_norm": 0.03448939758068617, + "language_loss": 0.83127022, + "learning_rate": 0.0004984422909700442, + "loss": 0.84176129, + "num_input_tokens_seen": 223476784, + "router_z_loss_mlp": 0.41625977, + "step": 2682, + "time_per_iteration": 2.7167794704437256 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105107, + "balance_loss_mlp": 1.00944197, + "epoch": 0.5161600615621393, + "flos": 587621429760.0, + "grad_norm": 0.033752660754493145, + "language_loss": 0.84206975, + "learning_rate": 0.0004981307504945173, + "loss": 0.85258043, + "num_input_tokens_seen": 223542832, + "router_z_loss_mlp": 0.41650391, + "step": 2683, + "time_per_iteration": 2.6896650791168213 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050267, + "balance_loss_mlp": 1.00856805, + "epoch": 0.5163524432474028, + "flos": 589948566528.0, + "grad_norm": 0.03498305011402451, + "language_loss": 0.90086776, + "learning_rate": 0.0004978192107446976, + "loss": 0.9113704, + "num_input_tokens_seen": 223617968, + "router_z_loss_mlp": 0.41723633, + "step": 2684, + "time_per_iteration": 2.7550315856933594 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046286, + "balance_loss_mlp": 1.00456297, + "epoch": 0.5165448249326664, + "flos": 504905040384.0, + "grad_norm": 0.03233825392148911, + "language_loss": 0.87956327, + "learning_rate": 0.0004975076718415353, + "loss": 0.89002615, + "num_input_tokens_seen": 223689504, + "router_z_loss_mlp": 0.41748047, + "step": 2685, + "time_per_iteration": 2.5969831943511963 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046859, + "balance_loss_mlp": 1.00515938, + "epoch": 0.51673720661793, + "flos": 417647138304.0, + "grad_norm": 0.0327603501643271, + "language_loss": 0.91275072, + "learning_rate": 0.0004971961339059806, + "loss": 0.9232192, + "num_input_tokens_seen": 223752288, + "router_z_loss_mlp": 0.41723633, + "step": 2686, + "time_per_iteration": 2.488780975341797 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048798, + "balance_loss_mlp": 1.00714636, + "epoch": 0.5169295883031936, + "flos": 600075596544.0, + "grad_norm": 0.03249247039046824, + "language_loss": 0.84663117, + "learning_rate": 0.0004968845970589832, + "loss": 0.8571192, + "num_input_tokens_seen": 223822304, + "router_z_loss_mlp": 0.41674805, + "step": 2687, + "time_per_iteration": 2.7266340255737305 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047672, + "balance_loss_mlp": 1.00597274, + "epoch": 0.517121969988457, + "flos": 557911068672.0, + "grad_norm": 0.03510688251477249, + "language_loss": 0.85442108, + "learning_rate": 0.0004965730614214926, + "loss": 0.86489779, + "num_input_tokens_seen": 223888592, + "router_z_loss_mlp": 0.41723633, + "step": 2688, + "time_per_iteration": 2.669203758239746 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048775, + "balance_loss_mlp": 1.00721848, + "epoch": 0.5173143516737206, + "flos": 470375155200.0, + "grad_norm": 0.031768698442390816, + "language_loss": 0.85484231, + "learning_rate": 0.0004962615271144576, + "loss": 0.86533004, + "num_input_tokens_seen": 223952880, + "router_z_loss_mlp": 0.41577148, + "step": 2689, + "time_per_iteration": 2.508864164352417 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047195, + "balance_loss_mlp": 1.00578225, + "epoch": 0.5175067333589842, + "flos": 721379292672.0, + "grad_norm": 0.036604011276375, + "language_loss": 0.83442801, + "learning_rate": 0.0004959499942588264, + "loss": 0.84490001, + "num_input_tokens_seen": 224030000, + "router_z_loss_mlp": 0.41430664, + "step": 2690, + "time_per_iteration": 2.937147617340088 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054985, + "balance_loss_mlp": 1.01473999, + "epoch": 0.5176991150442478, + "flos": 1469344702464.0, + "grad_norm": 0.008104040921495323, + "language_loss": 0.78200024, + "learning_rate": 0.0004956384629755469, + "loss": 0.79255009, + "num_input_tokens_seen": 224252384, + "router_z_loss_mlp": 0.40234375, + "step": 2691, + "time_per_iteration": 4.793481111526489 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047518, + "balance_loss_mlp": 1.00593746, + "epoch": 0.5178914967295114, + "flos": 613784705280.0, + "grad_norm": 0.029651978346564224, + "language_loss": 0.85819978, + "learning_rate": 0.0004953269333855661, + "loss": 0.86867493, + "num_input_tokens_seen": 224324640, + "router_z_loss_mlp": 0.41601562, + "step": 2692, + "time_per_iteration": 2.7456183433532715 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054253, + "balance_loss_mlp": 1.01293516, + "epoch": 0.5180838784147749, + "flos": 501981051648.0, + "grad_norm": 0.03275547277888071, + "language_loss": 0.85017627, + "learning_rate": 0.0004950154056098309, + "loss": 0.86071873, + "num_input_tokens_seen": 224398368, + "router_z_loss_mlp": 0.41333008, + "step": 2693, + "time_per_iteration": 2.710204839706421 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052407, + "balance_loss_mlp": 1.01108897, + "epoch": 0.5182762601000385, + "flos": 690042659328.0, + "grad_norm": 0.03430000909694698, + "language_loss": 0.84476924, + "learning_rate": 0.0004947038797692867, + "loss": 0.85529327, + "num_input_tokens_seen": 224465456, + "router_z_loss_mlp": 0.41333008, + "step": 2694, + "time_per_iteration": 2.846104860305786 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053132, + "balance_loss_mlp": 1.01169479, + "epoch": 0.518468641785302, + "flos": 666801427200.0, + "grad_norm": 0.031372779584062496, + "language_loss": 0.77936417, + "learning_rate": 0.0004943923559848789, + "loss": 0.78989553, + "num_input_tokens_seen": 224540960, + "router_z_loss_mlp": 0.41455078, + "step": 2695, + "time_per_iteration": 2.780346155166626 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054261, + "balance_loss_mlp": 1.01303816, + "epoch": 0.5186610234705656, + "flos": 567814522368.0, + "grad_norm": 0.025403978054072948, + "language_loss": 0.9097802, + "learning_rate": 0.0004940808343775515, + "loss": 0.92032284, + "num_input_tokens_seen": 224613200, + "router_z_loss_mlp": 0.41235352, + "step": 2696, + "time_per_iteration": 2.6940221786499023 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052298, + "balance_loss_mlp": 1.01093256, + "epoch": 0.5188534051558291, + "flos": 429793158144.0, + "grad_norm": 0.033988353521974116, + "language_loss": 0.8254481, + "learning_rate": 0.0004937693150682479, + "loss": 0.83597112, + "num_input_tokens_seen": 224677456, + "router_z_loss_mlp": 0.41381836, + "step": 2697, + "time_per_iteration": 2.5146913528442383 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048575, + "balance_loss_mlp": 1.00725734, + "epoch": 0.5190457868410927, + "flos": 547412708352.0, + "grad_norm": 0.031596370266791504, + "language_loss": 0.77111042, + "learning_rate": 0.0004934577981779107, + "loss": 0.78159618, + "num_input_tokens_seen": 224745600, + "router_z_loss_mlp": 0.41333008, + "step": 2698, + "time_per_iteration": 2.6567137241363525 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044661, + "balance_loss_mlp": 1.00327134, + "epoch": 0.5192381685263563, + "flos": 549746648064.0, + "grad_norm": 0.029705122804042017, + "language_loss": 0.81764138, + "learning_rate": 0.0004931462838274817, + "loss": 0.82808805, + "num_input_tokens_seen": 224826944, + "router_z_loss_mlp": 0.4140625, + "step": 2699, + "time_per_iteration": 2.817087173461914 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050227, + "balance_loss_mlp": 1.00895715, + "epoch": 0.5194305502116199, + "flos": 576350273280.0, + "grad_norm": 0.03619468074242637, + "language_loss": 0.84569639, + "learning_rate": 0.0004928347721379011, + "loss": 0.85619867, + "num_input_tokens_seen": 224895280, + "router_z_loss_mlp": 0.4128418, + "step": 2700, + "time_per_iteration": 2.6439361572265625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049587, + "balance_loss_mlp": 1.00831699, + "epoch": 0.5196229318968835, + "flos": 435218282496.0, + "grad_norm": 0.03299749227833017, + "language_loss": 0.82266027, + "learning_rate": 0.0004925232632301089, + "loss": 0.83315617, + "num_input_tokens_seen": 224961632, + "router_z_loss_mlp": 0.4128418, + "step": 2701, + "time_per_iteration": 2.5564098358154297 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045313, + "balance_loss_mlp": 1.00409007, + "epoch": 0.5198153135821469, + "flos": 559986438912.0, + "grad_norm": 0.03181007655018395, + "language_loss": 0.79940033, + "learning_rate": 0.0004922117572250431, + "loss": 0.80985349, + "num_input_tokens_seen": 225032816, + "router_z_loss_mlp": 0.41235352, + "step": 2702, + "time_per_iteration": 2.651662826538086 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048783, + "balance_loss_mlp": 1.00763237, + "epoch": 0.5200076952674105, + "flos": 566835646464.0, + "grad_norm": 0.030877309828348475, + "language_loss": 0.81538028, + "learning_rate": 0.0004919002542436414, + "loss": 0.82586813, + "num_input_tokens_seen": 225112736, + "router_z_loss_mlp": 0.41162109, + "step": 2703, + "time_per_iteration": 2.829218864440918 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051433, + "balance_loss_mlp": 1.01028192, + "epoch": 0.5202000769526741, + "flos": 572273409792.0, + "grad_norm": 0.031996161034096735, + "language_loss": 0.81638157, + "learning_rate": 0.0004915887544068399, + "loss": 0.82689589, + "num_input_tokens_seen": 225182672, + "router_z_loss_mlp": 0.41162109, + "step": 2704, + "time_per_iteration": 2.6583306789398193 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052276, + "balance_loss_mlp": 1.01110101, + "epoch": 0.5203924586379377, + "flos": 695467783680.0, + "grad_norm": 0.03456723160752419, + "language_loss": 0.7851603, + "learning_rate": 0.0004912772578355736, + "loss": 0.79568309, + "num_input_tokens_seen": 225260272, + "router_z_loss_mlp": 0.41186523, + "step": 2705, + "time_per_iteration": 2.9061107635498047 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051355, + "balance_loss_mlp": 1.01010871, + "epoch": 0.5205848403232012, + "flos": 567691067904.0, + "grad_norm": 0.03253184462937942, + "language_loss": 0.83445644, + "learning_rate": 0.000490965764650776, + "loss": 0.84497005, + "num_input_tokens_seen": 225337120, + "router_z_loss_mlp": 0.41259766, + "step": 2706, + "time_per_iteration": 2.8724799156188965 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051791, + "balance_loss_mlp": 1.01042521, + "epoch": 0.5207772220084648, + "flos": 1216205913600.0, + "grad_norm": 0.03130848752928153, + "language_loss": 0.83192623, + "learning_rate": 0.0004906542749733798, + "loss": 0.84244412, + "num_input_tokens_seen": 225433984, + "router_z_loss_mlp": 0.41381836, + "step": 2707, + "time_per_iteration": 3.6585958003997803 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049141, + "balance_loss_mlp": 1.00770402, + "epoch": 0.5209696036937284, + "flos": 594032232960.0, + "grad_norm": 0.02732760694007456, + "language_loss": 0.85709697, + "learning_rate": 0.0004903427889243156, + "loss": 0.86758834, + "num_input_tokens_seen": 225512112, + "router_z_loss_mlp": 0.41455078, + "step": 2708, + "time_per_iteration": 2.871150016784668 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044527, + "balance_loss_mlp": 1.00294721, + "epoch": 0.5211619853789919, + "flos": 523956648192.0, + "grad_norm": 0.03352920522422817, + "language_loss": 0.85979593, + "learning_rate": 0.0004900313066245134, + "loss": 0.87024117, + "num_input_tokens_seen": 225586944, + "router_z_loss_mlp": 0.41601562, + "step": 2709, + "time_per_iteration": 2.6438417434692383 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104781, + "balance_loss_mlp": 1.00632536, + "epoch": 0.5213543670642555, + "flos": 503861035776.0, + "grad_norm": 0.03205745002268137, + "language_loss": 0.81327069, + "learning_rate": 0.0004897198281949012, + "loss": 0.82374883, + "num_input_tokens_seen": 225657184, + "router_z_loss_mlp": 0.41503906, + "step": 2710, + "time_per_iteration": 2.693906307220459 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049613, + "balance_loss_mlp": 1.00800931, + "epoch": 0.521546748749519, + "flos": 587072209920.0, + "grad_norm": 0.036857631666753196, + "language_loss": 0.78204525, + "learning_rate": 0.0004894083537564057, + "loss": 0.79254138, + "num_input_tokens_seen": 225729968, + "router_z_loss_mlp": 0.41625977, + "step": 2711, + "time_per_iteration": 2.7300491333007812 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045008, + "balance_loss_mlp": 1.00333273, + "epoch": 0.5217391304347826, + "flos": 571266343680.0, + "grad_norm": 0.030696577254243577, + "language_loss": 0.81681752, + "learning_rate": 0.0004890968834299519, + "loss": 0.82726759, + "num_input_tokens_seen": 225801808, + "router_z_loss_mlp": 0.41699219, + "step": 2712, + "time_per_iteration": 2.746556043624878 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049825, + "balance_loss_mlp": 1.00831652, + "epoch": 0.5219315121200462, + "flos": 543920057856.0, + "grad_norm": 0.028956363679279982, + "language_loss": 0.79082847, + "learning_rate": 0.0004887854173364633, + "loss": 0.80132675, + "num_input_tokens_seen": 225878576, + "router_z_loss_mlp": 0.4152832, + "step": 2713, + "time_per_iteration": 2.733306884765625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051945, + "balance_loss_mlp": 1.01045978, + "epoch": 0.5221238938053098, + "flos": 551531367936.0, + "grad_norm": 0.030815907554272836, + "language_loss": 0.82228422, + "learning_rate": 0.0004884739555968617, + "loss": 0.83280361, + "num_input_tokens_seen": 225960096, + "router_z_loss_mlp": 0.41503906, + "step": 2714, + "time_per_iteration": 2.815034866333008 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054211, + "balance_loss_mlp": 1.01425171, + "epoch": 0.5223162754905732, + "flos": 1358392579584.0, + "grad_norm": 0.009025254493072253, + "language_loss": 0.78977054, + "learning_rate": 0.0004881624983320676, + "loss": 0.80031264, + "num_input_tokens_seen": 226184960, + "router_z_loss_mlp": 0.39941406, + "step": 2715, + "time_per_iteration": 5.005860090255737 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047232, + "balance_loss_mlp": 1.00550854, + "epoch": 0.5225086571758368, + "flos": 568974200064.0, + "grad_norm": 0.030755982791586634, + "language_loss": 0.87142956, + "learning_rate": 0.0004878510456629992, + "loss": 0.88190192, + "num_input_tokens_seen": 226271328, + "router_z_loss_mlp": 0.41748047, + "step": 2716, + "time_per_iteration": 2.9582624435424805 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048687, + "balance_loss_mlp": 1.00713038, + "epoch": 0.5227010388611004, + "flos": 501136323840.0, + "grad_norm": 0.03155972783921746, + "language_loss": 0.85419679, + "learning_rate": 0.00048753959771057314, + "loss": 0.86468375, + "num_input_tokens_seen": 226340080, + "router_z_loss_mlp": 0.41577148, + "step": 2717, + "time_per_iteration": 2.623081684112549 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104979, + "balance_loss_mlp": 1.00832856, + "epoch": 0.522893420546364, + "flos": 598799267328.0, + "grad_norm": 0.035176839616525644, + "language_loss": 0.83230948, + "learning_rate": 0.0004872281545957044, + "loss": 0.84280741, + "num_input_tokens_seen": 226415120, + "router_z_loss_mlp": 0.41479492, + "step": 2718, + "time_per_iteration": 2.7231285572052 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059587, + "balance_loss_mlp": 1.01800716, + "epoch": 0.5230858022316276, + "flos": 665922673152.0, + "grad_norm": 0.03224340083556492, + "language_loss": 0.86415994, + "learning_rate": 0.0004869167164393055, + "loss": 0.8747558, + "num_input_tokens_seen": 226501200, + "router_z_loss_mlp": 0.41601562, + "step": 2719, + "time_per_iteration": 2.9305646419525146 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054772, + "balance_loss_mlp": 1.0132159, + "epoch": 0.5232781839168911, + "flos": 605034126336.0, + "grad_norm": 0.0287825993415993, + "language_loss": 0.89917624, + "learning_rate": 0.00048660528336228793, + "loss": 0.909724, + "num_input_tokens_seen": 226582064, + "router_z_loss_mlp": 0.41577148, + "step": 2720, + "time_per_iteration": 2.788072347640991 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049748, + "balance_loss_mlp": 1.0080725, + "epoch": 0.5234705656021547, + "flos": 551841460224.0, + "grad_norm": 0.02763684671666484, + "language_loss": 0.90116215, + "learning_rate": 0.0004862938554855606, + "loss": 0.91165972, + "num_input_tokens_seen": 226656448, + "router_z_loss_mlp": 0.41699219, + "step": 2721, + "time_per_iteration": 2.775818109512329 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051445, + "balance_loss_mlp": 1.00965011, + "epoch": 0.5236629472874182, + "flos": 505295812608.0, + "grad_norm": 0.03601660428487822, + "language_loss": 0.86817378, + "learning_rate": 0.0004859824329300304, + "loss": 0.87868822, + "num_input_tokens_seen": 226725568, + "router_z_loss_mlp": 0.41821289, + "step": 2722, + "time_per_iteration": 2.587228536605835 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053208, + "balance_loss_mlp": 1.01138973, + "epoch": 0.5238553289726818, + "flos": 548697785856.0, + "grad_norm": 0.03170706554102953, + "language_loss": 0.83958352, + "learning_rate": 0.00048567101581660244, + "loss": 0.85011566, + "num_input_tokens_seen": 226795728, + "router_z_loss_mlp": 0.41845703, + "step": 2723, + "time_per_iteration": 2.6208062171936035 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050325, + "balance_loss_mlp": 1.00843501, + "epoch": 0.5240477106579453, + "flos": 533004680448.0, + "grad_norm": 0.03335820140898581, + "language_loss": 0.87488234, + "learning_rate": 0.00048535960426617956, + "loss": 0.88538557, + "num_input_tokens_seen": 226865344, + "router_z_loss_mlp": 0.41918945, + "step": 2724, + "time_per_iteration": 2.5951199531555176 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050726, + "balance_loss_mlp": 1.00883543, + "epoch": 0.5242400923432089, + "flos": 619090265856.0, + "grad_norm": 0.03212273913620546, + "language_loss": 0.8244487, + "learning_rate": 0.0004850481983996621, + "loss": 0.83495593, + "num_input_tokens_seen": 226936800, + "router_z_loss_mlp": 0.41918945, + "step": 2725, + "time_per_iteration": 2.747008800506592 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049694, + "balance_loss_mlp": 1.00785124, + "epoch": 0.5244324740284725, + "flos": 417590757888.0, + "grad_norm": 0.03280670580990367, + "language_loss": 0.88229245, + "learning_rate": 0.0004847367983379492, + "loss": 0.89278936, + "num_input_tokens_seen": 226998448, + "router_z_loss_mlp": 0.41870117, + "step": 2726, + "time_per_iteration": 2.437721014022827 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049333, + "balance_loss_mlp": 1.00770533, + "epoch": 0.5246248557137361, + "flos": 627732941568.0, + "grad_norm": 0.03120006141405487, + "language_loss": 0.79435945, + "learning_rate": 0.00048442540420193643, + "loss": 0.80485278, + "num_input_tokens_seen": 227081872, + "router_z_loss_mlp": 0.41650391, + "step": 2727, + "time_per_iteration": 2.927518844604492 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105055, + "balance_loss_mlp": 1.00911331, + "epoch": 0.5248172373989997, + "flos": 1250403352320.0, + "grad_norm": 0.03663625191481743, + "language_loss": 0.7991612, + "learning_rate": 0.0004841140161125182, + "loss": 0.80966663, + "num_input_tokens_seen": 227167744, + "router_z_loss_mlp": 0.41455078, + "step": 2728, + "time_per_iteration": 3.574690818786621 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053819, + "balance_loss_mlp": 1.01250064, + "epoch": 0.5250096190842631, + "flos": 507883464192.0, + "grad_norm": 0.03360211420143325, + "language_loss": 0.85387456, + "learning_rate": 0.0004838026341905857, + "loss": 0.86441278, + "num_input_tokens_seen": 227239136, + "router_z_loss_mlp": 0.41333008, + "step": 2729, + "time_per_iteration": 2.7263481616973877 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046844, + "balance_loss_mlp": 1.00547838, + "epoch": 0.5252020007695267, + "flos": 612508376064.0, + "grad_norm": 0.029211194306351093, + "language_loss": 0.85320604, + "learning_rate": 0.00048349125855702844, + "loss": 0.86367452, + "num_input_tokens_seen": 227311968, + "router_z_loss_mlp": 0.41381836, + "step": 2730, + "time_per_iteration": 2.775851011276245 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047856, + "balance_loss_mlp": 1.00649071, + "epoch": 0.5253943824547903, + "flos": 540292292352.0, + "grad_norm": 0.02938539212610817, + "language_loss": 0.81675971, + "learning_rate": 0.00048317988933273287, + "loss": 0.82723826, + "num_input_tokens_seen": 227385248, + "router_z_loss_mlp": 0.41381836, + "step": 2731, + "time_per_iteration": 2.7763831615448 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047356, + "balance_loss_mlp": 1.00613368, + "epoch": 0.5255867641400539, + "flos": 699338567424.0, + "grad_norm": 0.033934632058623626, + "language_loss": 0.82549971, + "learning_rate": 0.00048286852663858367, + "loss": 0.83597326, + "num_input_tokens_seen": 227464640, + "router_z_loss_mlp": 0.41235352, + "step": 2732, + "time_per_iteration": 2.96213698387146 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052513, + "balance_loss_mlp": 1.01131439, + "epoch": 0.5257791458253175, + "flos": 668549208576.0, + "grad_norm": 0.03297641476237434, + "language_loss": 0.84432375, + "learning_rate": 0.000482557170595462, + "loss": 0.85484892, + "num_input_tokens_seen": 227542192, + "router_z_loss_mlp": 0.41210938, + "step": 2733, + "time_per_iteration": 2.840514659881592 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050533, + "balance_loss_mlp": 1.00943005, + "epoch": 0.525971527510581, + "flos": 484605293568.0, + "grad_norm": 0.032410991276381265, + "language_loss": 0.88272679, + "learning_rate": 0.0004822458213242475, + "loss": 0.89323211, + "num_input_tokens_seen": 227606096, + "router_z_loss_mlp": 0.41113281, + "step": 2734, + "time_per_iteration": 2.560474157333374 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047172, + "balance_loss_mlp": 1.00613987, + "epoch": 0.5261639091958445, + "flos": 831348648960.0, + "grad_norm": 0.03341440797603734, + "language_loss": 0.86630881, + "learning_rate": 0.00048193447894581627, + "loss": 0.87678051, + "num_input_tokens_seen": 227689552, + "router_z_loss_mlp": 0.41040039, + "step": 2735, + "time_per_iteration": 3.1240243911743164 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105002, + "balance_loss_mlp": 1.00886869, + "epoch": 0.5263562908811081, + "flos": 521733523968.0, + "grad_norm": 0.03226346413051534, + "language_loss": 0.88327318, + "learning_rate": 0.00048162314358104243, + "loss": 0.89377338, + "num_input_tokens_seen": 227760784, + "router_z_loss_mlp": 0.41162109, + "step": 2736, + "time_per_iteration": 2.599510669708252 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047062, + "balance_loss_mlp": 1.00581563, + "epoch": 0.5265486725663717, + "flos": 576098506752.0, + "grad_norm": 0.03477073688653673, + "language_loss": 0.84006953, + "learning_rate": 0.0004813118153507969, + "loss": 0.85054016, + "num_input_tokens_seen": 227834304, + "router_z_loss_mlp": 0.41259766, + "step": 2737, + "time_per_iteration": 2.7309916019439697 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01057175, + "balance_loss_mlp": 1.0173111, + "epoch": 0.5267410542516352, + "flos": 1550561186304.0, + "grad_norm": 0.008968329145720436, + "language_loss": 0.82447124, + "learning_rate": 0.0004810004943759482, + "loss": 0.83504307, + "num_input_tokens_seen": 228057232, + "router_z_loss_mlp": 0.3984375, + "step": 2738, + "time_per_iteration": 4.815824747085571 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054104, + "balance_loss_mlp": 1.01311994, + "epoch": 0.5269334359368988, + "flos": 931462183680.0, + "grad_norm": 0.03276977156640091, + "language_loss": 0.84196591, + "learning_rate": 0.00048068918077736163, + "loss": 0.85250694, + "num_input_tokens_seen": 228140816, + "router_z_loss_mlp": 0.40991211, + "step": 2739, + "time_per_iteration": 3.2470173835754395 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051244, + "balance_loss_mlp": 1.01004505, + "epoch": 0.5271258176221624, + "flos": 656635513344.0, + "grad_norm": 0.03436954846361053, + "language_loss": 0.82138938, + "learning_rate": 0.0004803778746759001, + "loss": 0.83190179, + "num_input_tokens_seen": 228216208, + "router_z_loss_mlp": 0.41210938, + "step": 2740, + "time_per_iteration": 2.920330286026001 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051176, + "balance_loss_mlp": 1.01007247, + "epoch": 0.527318199307426, + "flos": 544062954240.0, + "grad_norm": 0.045913237701965745, + "language_loss": 0.82631075, + "learning_rate": 0.00048006657619242317, + "loss": 0.83682251, + "num_input_tokens_seen": 228283184, + "router_z_loss_mlp": 0.41113281, + "step": 2741, + "time_per_iteration": 2.612001419067383 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045962, + "balance_loss_mlp": 1.00462067, + "epoch": 0.5275105809926895, + "flos": 448899201024.0, + "grad_norm": 0.036563153452021165, + "language_loss": 0.78434455, + "learning_rate": 0.00047975528544778775, + "loss": 0.7948041, + "num_input_tokens_seen": 228351328, + "router_z_loss_mlp": 0.41357422, + "step": 2742, + "time_per_iteration": 2.590146064758301 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042619, + "balance_loss_mlp": 1.00130069, + "epoch": 0.527702962677953, + "flos": 580053861120.0, + "grad_norm": 0.038221984800347206, + "language_loss": 0.89132345, + "learning_rate": 0.00047944400256284754, + "loss": 0.90174961, + "num_input_tokens_seen": 228423632, + "router_z_loss_mlp": 0.41333008, + "step": 2743, + "time_per_iteration": 2.691096305847168 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046686, + "balance_loss_mlp": 1.00548708, + "epoch": 0.5278953443632166, + "flos": 654010923264.0, + "grad_norm": 0.03476413811576821, + "language_loss": 0.80653423, + "learning_rate": 0.0004791327276584532, + "loss": 0.8170011, + "num_input_tokens_seen": 228498736, + "router_z_loss_mlp": 0.41210938, + "step": 2744, + "time_per_iteration": 2.8089282512664795 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048783, + "balance_loss_mlp": 1.00753677, + "epoch": 0.5280877260484802, + "flos": 515049566976.0, + "grad_norm": 0.03187296499214836, + "language_loss": 0.81036532, + "learning_rate": 0.00047882146085545264, + "loss": 0.82085317, + "num_input_tokens_seen": 228569056, + "router_z_loss_mlp": 0.41259766, + "step": 2745, + "time_per_iteration": 2.646883010864258 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055283, + "balance_loss_mlp": 1.01541901, + "epoch": 0.5282801077337438, + "flos": 1448715421440.0, + "grad_norm": 0.006687794222264933, + "language_loss": 0.75402379, + "learning_rate": 0.00047851020227469, + "loss": 0.76457667, + "num_input_tokens_seen": 228800560, + "router_z_loss_mlp": 0.3984375, + "step": 2746, + "time_per_iteration": 4.967897653579712 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048206, + "balance_loss_mlp": 1.00703144, + "epoch": 0.5284724894190073, + "flos": 605967315456.0, + "grad_norm": 0.03667028691338261, + "language_loss": 0.80105197, + "learning_rate": 0.00047819895203700684, + "loss": 0.81153399, + "num_input_tokens_seen": 228869216, + "router_z_loss_mlp": 0.41186523, + "step": 2747, + "time_per_iteration": 2.7146098613739014 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105164, + "balance_loss_mlp": 1.01187134, + "epoch": 0.5286648711042709, + "flos": 1498106323200.0, + "grad_norm": 0.006729060992495368, + "language_loss": 0.75512433, + "learning_rate": 0.0004778877102632412, + "loss": 0.76564074, + "num_input_tokens_seen": 229085520, + "router_z_loss_mlp": 0.39746094, + "step": 2748, + "time_per_iteration": 4.6327197551727295 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045337, + "balance_loss_mlp": 1.00416195, + "epoch": 0.5288572527895344, + "flos": 598834260480.0, + "grad_norm": 0.03692084834433464, + "language_loss": 0.89385319, + "learning_rate": 0.0004775764770742277, + "loss": 0.90430653, + "num_input_tokens_seen": 229160912, + "router_z_loss_mlp": 0.41186523, + "step": 2749, + "time_per_iteration": 2.807567834854126 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045141, + "balance_loss_mlp": 1.00394237, + "epoch": 0.529049634474798, + "flos": 558440846592.0, + "grad_norm": 0.03911259999059639, + "language_loss": 0.87067056, + "learning_rate": 0.00047726525259079777, + "loss": 0.88112199, + "num_input_tokens_seen": 229235792, + "router_z_loss_mlp": 0.41210938, + "step": 2750, + "time_per_iteration": 2.7838735580444336 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044638, + "balance_loss_mlp": 1.00348663, + "epoch": 0.5292420161600616, + "flos": 582435432960.0, + "grad_norm": 0.03406590895995427, + "language_loss": 0.89342177, + "learning_rate": 0.0004769540369337798, + "loss": 0.9038682, + "num_input_tokens_seen": 229309984, + "router_z_loss_mlp": 0.41162109, + "step": 2751, + "time_per_iteration": 2.716430902481079 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010453, + "balance_loss_mlp": 1.00405395, + "epoch": 0.5294343978453251, + "flos": 609564945408.0, + "grad_norm": 0.0303004693379624, + "language_loss": 0.8646909, + "learning_rate": 0.00047664283022399794, + "loss": 0.87514395, + "num_input_tokens_seen": 229394000, + "router_z_loss_mlp": 0.41259766, + "step": 2752, + "time_per_iteration": 2.8746426105499268 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048518, + "balance_loss_mlp": 1.00736678, + "epoch": 0.5296267795305887, + "flos": 647710935552.0, + "grad_norm": 0.032209809873809676, + "language_loss": 0.81781971, + "learning_rate": 0.00047633163258227376, + "loss": 0.82830489, + "num_input_tokens_seen": 229474320, + "router_z_loss_mlp": 0.41162109, + "step": 2753, + "time_per_iteration": 2.859628677368164 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048376, + "balance_loss_mlp": 1.0070343, + "epoch": 0.5298191612158523, + "flos": 560806867200.0, + "grad_norm": 0.034095977821307535, + "language_loss": 0.85918152, + "learning_rate": 0.0004760204441294247, + "loss": 0.86966527, + "num_input_tokens_seen": 229543072, + "router_z_loss_mlp": 0.41357422, + "step": 2754, + "time_per_iteration": 2.642761707305908 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049622, + "balance_loss_mlp": 1.00842357, + "epoch": 0.5300115429011159, + "flos": 515132192256.0, + "grad_norm": 0.03324074908377848, + "language_loss": 0.86806327, + "learning_rate": 0.00047570926498626486, + "loss": 0.87855953, + "num_input_tokens_seen": 229615296, + "router_z_loss_mlp": 0.41210938, + "step": 2755, + "time_per_iteration": 2.688204765319824 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048264, + "balance_loss_mlp": 1.00699341, + "epoch": 0.5302039245863793, + "flos": 674050155264.0, + "grad_norm": 0.032282959747224574, + "language_loss": 0.82332271, + "learning_rate": 0.00047539809527360474, + "loss": 0.83380532, + "num_input_tokens_seen": 229693728, + "router_z_loss_mlp": 0.4128418, + "step": 2756, + "time_per_iteration": 2.891369104385376 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051726, + "balance_loss_mlp": 1.01052761, + "epoch": 0.5303963062716429, + "flos": 732157609728.0, + "grad_norm": 0.027910460797545535, + "language_loss": 0.82830453, + "learning_rate": 0.0004750869351122511, + "loss": 0.83882177, + "num_input_tokens_seen": 229772144, + "router_z_loss_mlp": 0.41210938, + "step": 2757, + "time_per_iteration": 2.9782614707946777 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051325, + "balance_loss_mlp": 1.01015055, + "epoch": 0.5305886879569065, + "flos": 574552914432.0, + "grad_norm": 0.03118318769242836, + "language_loss": 0.82440865, + "learning_rate": 0.00047477578462300685, + "loss": 0.83492196, + "num_input_tokens_seen": 229847024, + "router_z_loss_mlp": 0.41186523, + "step": 2758, + "time_per_iteration": 2.7210254669189453 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104635, + "balance_loss_mlp": 1.00498474, + "epoch": 0.5307810696421701, + "flos": 696729528576.0, + "grad_norm": 0.03181982217221047, + "language_loss": 0.79867083, + "learning_rate": 0.0004744646439266718, + "loss": 0.8091343, + "num_input_tokens_seen": 229932416, + "router_z_loss_mlp": 0.41381836, + "step": 2759, + "time_per_iteration": 2.997299909591675 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046488, + "balance_loss_mlp": 1.005265, + "epoch": 0.5309734513274337, + "flos": 650203322880.0, + "grad_norm": 0.04897119780065821, + "language_loss": 0.92728293, + "learning_rate": 0.000474153513144041, + "loss": 0.93774784, + "num_input_tokens_seen": 230010976, + "router_z_loss_mlp": 0.41235352, + "step": 2760, + "time_per_iteration": 2.9030909538269043 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047525, + "balance_loss_mlp": 1.00618315, + "epoch": 0.5311658330126972, + "flos": 606056743680.0, + "grad_norm": 0.03383323202633534, + "language_loss": 0.87311566, + "learning_rate": 0.00047384239239590633, + "loss": 0.88359094, + "num_input_tokens_seen": 230093344, + "router_z_loss_mlp": 0.41357422, + "step": 2761, + "time_per_iteration": 2.8522770404815674 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049914, + "balance_loss_mlp": 1.00859571, + "epoch": 0.5313582146979607, + "flos": 559317655296.0, + "grad_norm": 0.03320129260812799, + "language_loss": 0.89026552, + "learning_rate": 0.0004735312818030556, + "loss": 0.90076458, + "num_input_tokens_seen": 230165520, + "router_z_loss_mlp": 0.41333008, + "step": 2762, + "time_per_iteration": 2.6917500495910645 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045201, + "balance_loss_mlp": 1.00390708, + "epoch": 0.5315505963832243, + "flos": 509446553088.0, + "grad_norm": 0.032512052220750494, + "language_loss": 0.8324827, + "learning_rate": 0.0004732201814862727, + "loss": 0.84293473, + "num_input_tokens_seen": 230237808, + "router_z_loss_mlp": 0.41308594, + "step": 2763, + "time_per_iteration": 2.7620086669921875 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045981, + "balance_loss_mlp": 1.00461555, + "epoch": 0.5317429780684879, + "flos": 627669758208.0, + "grad_norm": 0.03302669202039023, + "language_loss": 0.81508183, + "learning_rate": 0.0004729090915663373, + "loss": 0.82554156, + "num_input_tokens_seen": 230321568, + "router_z_loss_mlp": 0.41381836, + "step": 2764, + "time_per_iteration": 2.827430248260498 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044748, + "balance_loss_mlp": 1.00333464, + "epoch": 0.5319353597537514, + "flos": 477699705600.0, + "grad_norm": 0.039772813062738895, + "language_loss": 0.85676539, + "learning_rate": 0.00047259801216402534, + "loss": 0.86721289, + "num_input_tokens_seen": 230385376, + "router_z_loss_mlp": 0.41430664, + "step": 2765, + "time_per_iteration": 2.5082104206085205 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104674, + "balance_loss_mlp": 1.00535059, + "epoch": 0.532127741439015, + "flos": 502634284032.0, + "grad_norm": 0.03926492526470634, + "language_loss": 0.86841261, + "learning_rate": 0.00047228694340010845, + "loss": 0.87888008, + "num_input_tokens_seen": 230449760, + "router_z_loss_mlp": 0.4140625, + "step": 2766, + "time_per_iteration": 2.549739360809326 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047042, + "balance_loss_mlp": 1.00555718, + "epoch": 0.5323201231242786, + "flos": 1166484510720.0, + "grad_norm": 0.033303639033777616, + "language_loss": 0.86118937, + "learning_rate": 0.0004719758853953544, + "loss": 0.87165976, + "num_input_tokens_seen": 230536592, + "router_z_loss_mlp": 0.41503906, + "step": 2767, + "time_per_iteration": 3.5872445106506348 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050417, + "balance_loss_mlp": 1.00888503, + "epoch": 0.5325125048095422, + "flos": 379541977344.0, + "grad_norm": 0.045646551162954616, + "language_loss": 0.84812796, + "learning_rate": 0.00047166483827052645, + "loss": 0.85863209, + "num_input_tokens_seen": 230596688, + "router_z_loss_mlp": 0.41552734, + "step": 2768, + "time_per_iteration": 2.4177846908569336 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01057545, + "balance_loss_mlp": 1.01796722, + "epoch": 0.5327048864948057, + "flos": 1544750147328.0, + "grad_norm": 0.015563445131555704, + "language_loss": 0.77078491, + "learning_rate": 0.00047135380214638413, + "loss": 0.78136033, + "num_input_tokens_seen": 230829408, + "router_z_loss_mlp": 0.39550781, + "step": 2769, + "time_per_iteration": 4.974437236785889 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045958, + "balance_loss_mlp": 1.00447309, + "epoch": 0.5328972681800692, + "flos": 912862586112.0, + "grad_norm": 0.03252924413682995, + "language_loss": 0.84066141, + "learning_rate": 0.000471042777143682, + "loss": 0.85112101, + "num_input_tokens_seen": 230912528, + "router_z_loss_mlp": 0.41503906, + "step": 2770, + "time_per_iteration": 3.204782724380493 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104834, + "balance_loss_mlp": 1.00680697, + "epoch": 0.5330896498653328, + "flos": 474851539200.0, + "grad_norm": 0.03462661973501109, + "language_loss": 0.80093729, + "learning_rate": 0.0004707317633831707, + "loss": 0.81142068, + "num_input_tokens_seen": 230979424, + "router_z_loss_mlp": 0.41552734, + "step": 2771, + "time_per_iteration": 2.566772699356079 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049499, + "balance_loss_mlp": 1.00789511, + "epoch": 0.5332820315505964, + "flos": 502634284032.0, + "grad_norm": 0.03484250248812788, + "language_loss": 0.78787035, + "learning_rate": 0.00047042076098559673, + "loss": 0.79836535, + "num_input_tokens_seen": 231046416, + "router_z_loss_mlp": 0.41625977, + "step": 2772, + "time_per_iteration": 2.5929906368255615 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046414, + "balance_loss_mlp": 1.00454724, + "epoch": 0.53347441323586, + "flos": 926033168640.0, + "grad_norm": 0.038112679556298976, + "language_loss": 0.74248701, + "learning_rate": 0.00047010977007170174, + "loss": 0.75295115, + "num_input_tokens_seen": 231136064, + "router_z_loss_mlp": 0.41894531, + "step": 2773, + "time_per_iteration": 3.221947193145752 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051169, + "balance_loss_mlp": 1.00956452, + "epoch": 0.5336667949211235, + "flos": 575540538624.0, + "grad_norm": 0.03388488907034337, + "language_loss": 0.83005095, + "learning_rate": 0.00046979879076222334, + "loss": 0.8405627, + "num_input_tokens_seen": 231203616, + "router_z_loss_mlp": 0.41625977, + "step": 2774, + "time_per_iteration": 2.7014822959899902 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049211, + "balance_loss_mlp": 1.00767875, + "epoch": 0.533859176606387, + "flos": 1066392363264.0, + "grad_norm": 0.03095569704566717, + "language_loss": 0.85300922, + "learning_rate": 0.0004694878231778939, + "loss": 0.86350143, + "num_input_tokens_seen": 231287008, + "router_z_loss_mlp": 0.41552734, + "step": 2775, + "time_per_iteration": 3.368795156478882 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048588, + "balance_loss_mlp": 1.00700808, + "epoch": 0.5340515582916506, + "flos": 747907095552.0, + "grad_norm": 0.030429614039409136, + "language_loss": 0.84799051, + "learning_rate": 0.0004691768674394423, + "loss": 0.8584764, + "num_input_tokens_seen": 231365296, + "router_z_loss_mlp": 0.41601562, + "step": 2776, + "time_per_iteration": 2.958280324935913 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052456, + "balance_loss_mlp": 1.01230621, + "epoch": 0.5342439399769142, + "flos": 1448821379328.0, + "grad_norm": 0.012202915272427423, + "language_loss": 0.84484011, + "learning_rate": 0.0004688659236675918, + "loss": 0.85536468, + "num_input_tokens_seen": 231579040, + "router_z_loss_mlp": 0.40136719, + "step": 2777, + "time_per_iteration": 4.774897575378418 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049236, + "balance_loss_mlp": 1.00908661, + "epoch": 0.5344363216621778, + "flos": 1430699069952.0, + "grad_norm": 0.005918596107012712, + "language_loss": 0.76653534, + "learning_rate": 0.00046855499198306187, + "loss": 0.77702767, + "num_input_tokens_seen": 231812736, + "router_z_loss_mlp": 0.40136719, + "step": 2778, + "time_per_iteration": 4.978635549545288 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051694, + "balance_loss_mlp": 1.01039958, + "epoch": 0.5346287033474413, + "flos": 528676050432.0, + "grad_norm": 0.029867236989907914, + "language_loss": 0.79874206, + "learning_rate": 0.00046824407250656676, + "loss": 0.80925894, + "num_input_tokens_seen": 231883840, + "router_z_loss_mlp": 0.41308594, + "step": 2779, + "time_per_iteration": 2.610321044921875 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049124, + "balance_loss_mlp": 1.00790143, + "epoch": 0.5348210850327049, + "flos": 511756193280.0, + "grad_norm": 0.03028632537310572, + "language_loss": 0.83974576, + "learning_rate": 0.0004679331653588161, + "loss": 0.85023701, + "num_input_tokens_seen": 231955360, + "router_z_loss_mlp": 0.41235352, + "step": 2780, + "time_per_iteration": 2.641401529312134 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046669, + "balance_loss_mlp": 1.00530362, + "epoch": 0.5350134667179685, + "flos": 463626069504.0, + "grad_norm": 0.032724184133620285, + "language_loss": 0.86073065, + "learning_rate": 0.0004676222706605147, + "loss": 0.87119734, + "num_input_tokens_seen": 232027088, + "router_z_loss_mlp": 0.41381836, + "step": 2781, + "time_per_iteration": 2.6093719005584717 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046994, + "balance_loss_mlp": 1.005795, + "epoch": 0.535205848403232, + "flos": 710118829824.0, + "grad_norm": 0.033538440780340566, + "language_loss": 0.85521388, + "learning_rate": 0.0004673113885323626, + "loss": 0.86568379, + "num_input_tokens_seen": 232099472, + "router_z_loss_mlp": 0.41210938, + "step": 2782, + "time_per_iteration": 2.8278369903564453 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044719, + "balance_loss_mlp": 1.00337684, + "epoch": 0.5353982300884956, + "flos": 895793029632.0, + "grad_norm": 0.03115315889801346, + "language_loss": 0.79367262, + "learning_rate": 0.00046700051909505494, + "loss": 0.80411977, + "num_input_tokens_seen": 232182528, + "router_z_loss_mlp": 0.41357422, + "step": 2783, + "time_per_iteration": 3.181025743484497 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045329, + "balance_loss_mlp": 1.00410628, + "epoch": 0.5355906117737591, + "flos": 537025163520.0, + "grad_norm": 0.03272022966866855, + "language_loss": 0.84359205, + "learning_rate": 0.000466689662469282, + "loss": 0.85404533, + "num_input_tokens_seen": 232253344, + "router_z_loss_mlp": 0.41235352, + "step": 2784, + "time_per_iteration": 2.623128890991211 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045278, + "balance_loss_mlp": 1.00419891, + "epoch": 0.5357829934590227, + "flos": 870328673280.0, + "grad_norm": 0.0344669350963294, + "language_loss": 0.84610772, + "learning_rate": 0.00046637881877572917, + "loss": 0.85656047, + "num_input_tokens_seen": 232337232, + "router_z_loss_mlp": 0.41088867, + "step": 2785, + "time_per_iteration": 3.079174757003784 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010433, + "balance_loss_mlp": 1.00229168, + "epoch": 0.5359753751442863, + "flos": 554446608384.0, + "grad_norm": 0.028858393123854686, + "language_loss": 0.85135722, + "learning_rate": 0.0004660679881350764, + "loss": 0.86179018, + "num_input_tokens_seen": 232412864, + "router_z_loss_mlp": 0.41015625, + "step": 2786, + "time_per_iteration": 2.7473020553588867 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01041367, + "balance_loss_mlp": 1.00150299, + "epoch": 0.5361677568295499, + "flos": 1483759533312.0, + "grad_norm": 0.0067453290840893895, + "language_loss": 0.75608146, + "learning_rate": 0.0004657571706679988, + "loss": 0.76649511, + "num_input_tokens_seen": 232639888, + "router_z_loss_mlp": 0.3984375, + "step": 2787, + "time_per_iteration": 5.041473627090454 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043788, + "balance_loss_mlp": 1.0027802, + "epoch": 0.5363601385148133, + "flos": 807642767616.0, + "grad_norm": 0.03504389904677532, + "language_loss": 0.78613555, + "learning_rate": 0.0004654463664951667, + "loss": 0.79657346, + "num_input_tokens_seen": 232719248, + "router_z_loss_mlp": 0.41015625, + "step": 2788, + "time_per_iteration": 2.9798529148101807 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048738, + "balance_loss_mlp": 1.00775349, + "epoch": 0.5365525202000769, + "flos": 508879836672.0, + "grad_norm": 0.03320853792290129, + "language_loss": 0.8327626, + "learning_rate": 0.0004651355757372447, + "loss": 0.84325004, + "num_input_tokens_seen": 232788464, + "router_z_loss_mlp": 0.40991211, + "step": 2789, + "time_per_iteration": 2.643827438354492 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048286, + "balance_loss_mlp": 1.00720656, + "epoch": 0.5367449018853405, + "flos": 530015563008.0, + "grad_norm": 0.032066447391342436, + "language_loss": 0.8626231, + "learning_rate": 0.00046482479851489274, + "loss": 0.87310588, + "num_input_tokens_seen": 232859792, + "router_z_loss_mlp": 0.41088867, + "step": 2790, + "time_per_iteration": 2.7637765407562256 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046715, + "balance_loss_mlp": 1.0056597, + "epoch": 0.5369372835706041, + "flos": 651217191936.0, + "grad_norm": 0.038515792328953954, + "language_loss": 0.78515691, + "learning_rate": 0.00046451403494876525, + "loss": 0.79562402, + "num_input_tokens_seen": 232941472, + "router_z_loss_mlp": 0.41064453, + "step": 2791, + "time_per_iteration": 2.9090025424957275 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046127, + "balance_loss_mlp": 1.00504696, + "epoch": 0.5371296652558677, + "flos": 585628684800.0, + "grad_norm": 0.03231753899308558, + "language_loss": 0.84747189, + "learning_rate": 0.0004642032851595111, + "loss": 0.85793316, + "num_input_tokens_seen": 233017120, + "router_z_loss_mlp": 0.41088867, + "step": 2792, + "time_per_iteration": 2.775444507598877 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048329, + "balance_loss_mlp": 1.00717819, + "epoch": 0.5373220469411312, + "flos": 597084533760.0, + "grad_norm": 0.03483653357210067, + "language_loss": 0.85361469, + "learning_rate": 0.00046389254926777404, + "loss": 0.86409795, + "num_input_tokens_seen": 233095408, + "router_z_loss_mlp": 0.41162109, + "step": 2793, + "time_per_iteration": 2.8168118000030518 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045027, + "balance_loss_mlp": 1.00378096, + "epoch": 0.5375144286263948, + "flos": 1116279016704.0, + "grad_norm": 0.03171846878783484, + "language_loss": 0.78282589, + "learning_rate": 0.0004635818273941926, + "loss": 0.79327619, + "num_input_tokens_seen": 233191056, + "router_z_loss_mlp": 0.41259766, + "step": 2794, + "time_per_iteration": 3.5206284523010254 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044308, + "balance_loss_mlp": 1.00301409, + "epoch": 0.5377068103116583, + "flos": 596769583872.0, + "grad_norm": 0.0416500636560626, + "language_loss": 0.82705241, + "learning_rate": 0.0004632711196593997, + "loss": 0.83749551, + "num_input_tokens_seen": 233265536, + "router_z_loss_mlp": 0.41308594, + "step": 2795, + "time_per_iteration": 2.81925892829895 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010512, + "balance_loss_mlp": 1.0100255, + "epoch": 0.5378991919969219, + "flos": 885650448384.0, + "grad_norm": 0.03764518727969069, + "language_loss": 0.85939819, + "learning_rate": 0.00046296042618402297, + "loss": 0.86991024, + "num_input_tokens_seen": 233348224, + "router_z_loss_mlp": 0.41186523, + "step": 2796, + "time_per_iteration": 3.076819658279419 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047791, + "balance_loss_mlp": 1.00666356, + "epoch": 0.5380915736821854, + "flos": 711951181824.0, + "grad_norm": 0.02842771896049368, + "language_loss": 0.79539001, + "learning_rate": 0.0004626497470886839, + "loss": 0.80586791, + "num_input_tokens_seen": 233429344, + "router_z_loss_mlp": 0.41137695, + "step": 2797, + "time_per_iteration": 2.9846107959747314 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049616, + "balance_loss_mlp": 1.00844073, + "epoch": 0.538283955367449, + "flos": 558115203072.0, + "grad_norm": 0.029565541443496178, + "language_loss": 0.82388103, + "learning_rate": 0.00046233908249399897, + "loss": 0.83437717, + "num_input_tokens_seen": 233504944, + "router_z_loss_mlp": 0.41186523, + "step": 2798, + "time_per_iteration": 2.7782254219055176 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053826, + "balance_loss_mlp": 1.01255548, + "epoch": 0.5384763370527126, + "flos": 514482850560.0, + "grad_norm": 0.03320479864481119, + "language_loss": 0.78804994, + "learning_rate": 0.00046202843252057905, + "loss": 0.79858828, + "num_input_tokens_seen": 233573072, + "router_z_loss_mlp": 0.4128418, + "step": 2799, + "time_per_iteration": 2.60296368598938 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051127, + "balance_loss_mlp": 1.00985634, + "epoch": 0.5386687187379762, + "flos": 490720588800.0, + "grad_norm": 0.036707180351256564, + "language_loss": 0.84230787, + "learning_rate": 0.00046171779728902896, + "loss": 0.8528192, + "num_input_tokens_seen": 233640896, + "router_z_loss_mlp": 0.4128418, + "step": 2800, + "time_per_iteration": 2.5585505962371826 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046392, + "balance_loss_mlp": 1.00514555, + "epoch": 0.5388611004232398, + "flos": 483628363008.0, + "grad_norm": 0.04683117604826235, + "language_loss": 0.86678994, + "learning_rate": 0.000461407176919948, + "loss": 0.87725389, + "num_input_tokens_seen": 233703904, + "router_z_loss_mlp": 0.41259766, + "step": 2801, + "time_per_iteration": 2.5158677101135254 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045703, + "balance_loss_mlp": 1.00440919, + "epoch": 0.5390534821085032, + "flos": 562089999360.0, + "grad_norm": 0.033429611400543416, + "language_loss": 0.85806906, + "learning_rate": 0.00046109657153392997, + "loss": 0.8685261, + "num_input_tokens_seen": 233779248, + "router_z_loss_mlp": 0.41308594, + "step": 2802, + "time_per_iteration": 2.685462236404419 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047109, + "balance_loss_mlp": 1.00591016, + "epoch": 0.5392458637937668, + "flos": 489361634304.0, + "grad_norm": 0.036955437438287664, + "language_loss": 0.83497781, + "learning_rate": 0.0004607859812515622, + "loss": 0.84544891, + "num_input_tokens_seen": 233847520, + "router_z_loss_mlp": 0.41210938, + "step": 2803, + "time_per_iteration": 2.6187045574188232 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054182, + "balance_loss_mlp": 1.01300752, + "epoch": 0.5394382454790304, + "flos": 513050019072.0, + "grad_norm": 0.03744234433888121, + "language_loss": 0.88279247, + "learning_rate": 0.00046047540619342667, + "loss": 0.89333427, + "num_input_tokens_seen": 233911328, + "router_z_loss_mlp": 0.41186523, + "step": 2804, + "time_per_iteration": 2.5895795822143555 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046381, + "balance_loss_mlp": 1.00525355, + "epoch": 0.539630627164294, + "flos": 568689385728.0, + "grad_norm": 0.033797229327163864, + "language_loss": 0.80605161, + "learning_rate": 0.00046016484648009933, + "loss": 0.81651545, + "num_input_tokens_seen": 233987104, + "router_z_loss_mlp": 0.41137695, + "step": 2805, + "time_per_iteration": 2.691092014312744 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047278, + "balance_loss_mlp": 1.00612748, + "epoch": 0.5398230088495575, + "flos": 527503733760.0, + "grad_norm": 0.03721333567310717, + "language_loss": 0.8141259, + "learning_rate": 0.0004598543022321501, + "loss": 0.82459861, + "num_input_tokens_seen": 234057216, + "router_z_loss_mlp": 0.41162109, + "step": 2806, + "time_per_iteration": 2.6083474159240723 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044352, + "balance_loss_mlp": 1.00312901, + "epoch": 0.5400153905348211, + "flos": 539853888000.0, + "grad_norm": 0.03209862982455251, + "language_loss": 0.80560988, + "learning_rate": 0.0004595437735701433, + "loss": 0.81605339, + "num_input_tokens_seen": 234129984, + "router_z_loss_mlp": 0.41235352, + "step": 2807, + "time_per_iteration": 2.688770055770874 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104467, + "balance_loss_mlp": 1.00354242, + "epoch": 0.5402077722200846, + "flos": 514665597696.0, + "grad_norm": 0.03651112385557252, + "language_loss": 0.83778703, + "learning_rate": 0.00045923326061463623, + "loss": 0.84823376, + "num_input_tokens_seen": 234203920, + "router_z_loss_mlp": 0.41137695, + "step": 2808, + "time_per_iteration": 2.761165142059326 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046543, + "balance_loss_mlp": 1.00534451, + "epoch": 0.5404001539053482, + "flos": 677567105280.0, + "grad_norm": 0.031915220360544935, + "language_loss": 0.81941223, + "learning_rate": 0.00045892276348618113, + "loss": 0.82987767, + "num_input_tokens_seen": 234285440, + "router_z_loss_mlp": 0.41210938, + "step": 2809, + "time_per_iteration": 2.9716503620147705 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105954, + "balance_loss_mlp": 1.01948547, + "epoch": 0.5405925355906118, + "flos": 1558191938304.0, + "grad_norm": 0.009079850654737754, + "language_loss": 0.78260827, + "learning_rate": 0.0004586122823053235, + "loss": 0.79320371, + "num_input_tokens_seen": 234521424, + "router_z_loss_mlp": 0.40039062, + "step": 2810, + "time_per_iteration": 4.989593029022217 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051922, + "balance_loss_mlp": 1.01069915, + "epoch": 0.5407849172758753, + "flos": 648538166784.0, + "grad_norm": 0.030063831285765737, + "language_loss": 0.81372178, + "learning_rate": 0.000458301817192603, + "loss": 0.82424104, + "num_input_tokens_seen": 234601632, + "router_z_loss_mlp": 0.41235352, + "step": 2811, + "time_per_iteration": 2.855461359024048 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063999, + "balance_loss_mlp": 1.02404022, + "epoch": 0.5409772989611389, + "flos": 1410483893760.0, + "grad_norm": 0.010433444863556941, + "language_loss": 0.8084178, + "learning_rate": 0.00045799136826855263, + "loss": 0.81905782, + "num_input_tokens_seen": 234825776, + "router_z_loss_mlp": 0.39941406, + "step": 2812, + "time_per_iteration": 4.82320761680603 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048585, + "balance_loss_mlp": 1.00748193, + "epoch": 0.5411696806464025, + "flos": 555545048064.0, + "grad_norm": 0.0337189850887645, + "language_loss": 0.87703073, + "learning_rate": 0.00045768093565369983, + "loss": 0.88751662, + "num_input_tokens_seen": 234901504, + "router_z_loss_mlp": 0.41113281, + "step": 2813, + "time_per_iteration": 2.7693569660186768 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047899, + "balance_loss_mlp": 1.00660491, + "epoch": 0.5413620623316661, + "flos": 529205828352.0, + "grad_norm": 0.032417929995103685, + "language_loss": 0.82523155, + "learning_rate": 0.0004573705194685646, + "loss": 0.83571053, + "num_input_tokens_seen": 234970288, + "router_z_loss_mlp": 0.41308594, + "step": 2814, + "time_per_iteration": 2.6525402069091797 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047552, + "balance_loss_mlp": 1.00637758, + "epoch": 0.5415544440169295, + "flos": 599852020224.0, + "grad_norm": 0.03532378336462207, + "language_loss": 0.85743833, + "learning_rate": 0.00045706011983366157, + "loss": 0.86791384, + "num_input_tokens_seen": 235039984, + "router_z_loss_mlp": 0.41186523, + "step": 2815, + "time_per_iteration": 2.67850661277771 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049782, + "balance_loss_mlp": 1.0085113, + "epoch": 0.5417468257021931, + "flos": 471714667776.0, + "grad_norm": 0.039926593194372036, + "language_loss": 0.83561838, + "learning_rate": 0.00045674973686949847, + "loss": 0.84611619, + "num_input_tokens_seen": 235105232, + "router_z_loss_mlp": 0.4128418, + "step": 2816, + "time_per_iteration": 2.56265926361084 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049575, + "balance_loss_mlp": 1.00839996, + "epoch": 0.5419392073874567, + "flos": 682191243264.0, + "grad_norm": 0.04027281254885066, + "language_loss": 0.85790694, + "learning_rate": 0.0004564393706965766, + "loss": 0.86840272, + "num_input_tokens_seen": 235192560, + "router_z_loss_mlp": 0.41186523, + "step": 2817, + "time_per_iteration": 2.955655574798584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048253, + "balance_loss_mlp": 1.00700641, + "epoch": 0.5421315890727203, + "flos": 463337364480.0, + "grad_norm": 0.033241337033607515, + "language_loss": 0.82050943, + "learning_rate": 0.00045612902143539116, + "loss": 0.83099198, + "num_input_tokens_seen": 235258448, + "router_z_loss_mlp": 0.41259766, + "step": 2818, + "time_per_iteration": 2.546567440032959 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043229, + "balance_loss_mlp": 1.0021013, + "epoch": 0.5423239707579839, + "flos": 437890504704.0, + "grad_norm": 0.03727551718578137, + "language_loss": 0.82264733, + "learning_rate": 0.00045581868920642986, + "loss": 0.83307964, + "num_input_tokens_seen": 235322176, + "router_z_loss_mlp": 0.41137695, + "step": 2819, + "time_per_iteration": 2.4746038913726807 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043245, + "balance_loss_mlp": 1.00197434, + "epoch": 0.5425163524432474, + "flos": 459306187776.0, + "grad_norm": 0.035271404401503774, + "language_loss": 0.80009091, + "learning_rate": 0.00045550837413017457, + "loss": 0.81052339, + "num_input_tokens_seen": 235390960, + "router_z_loss_mlp": 0.4128418, + "step": 2820, + "time_per_iteration": 2.598879098892212 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044733, + "balance_loss_mlp": 1.00348616, + "epoch": 0.542708734128511, + "flos": 420410734080.0, + "grad_norm": 0.029285477013781286, + "language_loss": 0.8579312, + "learning_rate": 0.0004551980763271005, + "loss": 0.86837852, + "num_input_tokens_seen": 235460976, + "router_z_loss_mlp": 0.41259766, + "step": 2821, + "time_per_iteration": 2.650609254837036 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050376, + "balance_loss_mlp": 1.00917685, + "epoch": 0.5429011158137745, + "flos": 679709549568.0, + "grad_norm": 0.038877958454501954, + "language_loss": 0.84286433, + "learning_rate": 0.0004548877959176756, + "loss": 0.8533681, + "num_input_tokens_seen": 235540912, + "router_z_loss_mlp": 0.41210938, + "step": 2822, + "time_per_iteration": 2.831773042678833 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049844, + "balance_loss_mlp": 1.00857341, + "epoch": 0.5430934974990381, + "flos": 541968142080.0, + "grad_norm": 0.03541809911924704, + "language_loss": 0.8707608, + "learning_rate": 0.00045457753302236166, + "loss": 0.8812592, + "num_input_tokens_seen": 235608736, + "router_z_loss_mlp": 0.4128418, + "step": 2823, + "time_per_iteration": 2.609090805053711 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048491, + "balance_loss_mlp": 1.00726891, + "epoch": 0.5432858791843016, + "flos": 659644072704.0, + "grad_norm": 0.03671475643697152, + "language_loss": 0.87739956, + "learning_rate": 0.00045426728776161353, + "loss": 0.8878845, + "num_input_tokens_seen": 235678720, + "router_z_loss_mlp": 0.41235352, + "step": 2824, + "time_per_iteration": 2.802915334701538 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046992, + "balance_loss_mlp": 1.00574553, + "epoch": 0.5434782608695652, + "flos": 532967741952.0, + "grad_norm": 0.03427907044877429, + "language_loss": 0.82057846, + "learning_rate": 0.00045395706025587863, + "loss": 0.83104837, + "num_input_tokens_seen": 235748704, + "router_z_loss_mlp": 0.41259766, + "step": 2825, + "time_per_iteration": 2.6308939456939697 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043048, + "balance_loss_mlp": 1.00194418, + "epoch": 0.5436706425548288, + "flos": 609633964800.0, + "grad_norm": 0.034616126048734014, + "language_loss": 0.8290934, + "learning_rate": 0.00045364685062559843, + "loss": 0.83952391, + "num_input_tokens_seen": 235828224, + "router_z_loss_mlp": 0.41113281, + "step": 2826, + "time_per_iteration": 2.8231375217437744 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047689, + "balance_loss_mlp": 1.006657, + "epoch": 0.5438630242400924, + "flos": 706773933312.0, + "grad_norm": 0.03098010756730768, + "language_loss": 0.92170852, + "learning_rate": 0.0004533366589912067, + "loss": 0.93218541, + "num_input_tokens_seen": 235909392, + "router_z_loss_mlp": 0.41040039, + "step": 2827, + "time_per_iteration": 2.9529805183410645 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105042, + "balance_loss_mlp": 1.00912547, + "epoch": 0.544055405925356, + "flos": 857839513344.0, + "grad_norm": 0.036966152235284246, + "language_loss": 0.78087002, + "learning_rate": 0.0004530264854731306, + "loss": 0.79137421, + "num_input_tokens_seen": 235983888, + "router_z_loss_mlp": 0.41308594, + "step": 2828, + "time_per_iteration": 3.0584123134613037 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050357, + "balance_loss_mlp": 1.00913441, + "epoch": 0.5442477876106194, + "flos": 572968438272.0, + "grad_norm": 0.03388858680916364, + "language_loss": 0.84792554, + "learning_rate": 0.00045271633019179034, + "loss": 0.85842907, + "num_input_tokens_seen": 236063056, + "router_z_loss_mlp": 0.41235352, + "step": 2829, + "time_per_iteration": 2.827160596847534 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046647, + "balance_loss_mlp": 1.00532901, + "epoch": 0.544440169295883, + "flos": 626803643136.0, + "grad_norm": 0.02947280635893411, + "language_loss": 0.88373405, + "learning_rate": 0.0004524061932675986, + "loss": 0.89420056, + "num_input_tokens_seen": 236141104, + "router_z_loss_mlp": 0.41333008, + "step": 2830, + "time_per_iteration": 2.8206188678741455 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048863, + "balance_loss_mlp": 1.00768852, + "epoch": 0.5446325509811466, + "flos": 837641833728.0, + "grad_norm": 0.03760239902604625, + "language_loss": 0.87454915, + "learning_rate": 0.00045209607482096125, + "loss": 0.88503784, + "num_input_tokens_seen": 236220320, + "router_z_loss_mlp": 0.41186523, + "step": 2831, + "time_per_iteration": 3.0359649658203125 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047225, + "balance_loss_mlp": 1.00600255, + "epoch": 0.5448249326664102, + "flos": 484390465536.0, + "grad_norm": 0.03560900416786153, + "language_loss": 0.8480038, + "learning_rate": 0.0004517859749722772, + "loss": 0.85847604, + "num_input_tokens_seen": 236288208, + "router_z_loss_mlp": 0.41235352, + "step": 2832, + "time_per_iteration": 2.689295768737793 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050991, + "balance_loss_mlp": 1.00972044, + "epoch": 0.5450173143516738, + "flos": 562346623488.0, + "grad_norm": 0.03426430427633819, + "language_loss": 0.79531574, + "learning_rate": 0.0004514758938419376, + "loss": 0.80582559, + "num_input_tokens_seen": 236366864, + "router_z_loss_mlp": 0.4128418, + "step": 2833, + "time_per_iteration": 2.8727176189422607 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049419, + "balance_loss_mlp": 1.00965118, + "epoch": 0.5452096960369373, + "flos": 1473588761856.0, + "grad_norm": 0.014550980978032766, + "language_loss": 0.76920587, + "learning_rate": 0.0004511658315503268, + "loss": 0.77970004, + "num_input_tokens_seen": 236597120, + "router_z_loss_mlp": 0.39746094, + "step": 2834, + "time_per_iteration": 4.9399590492248535 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046862, + "balance_loss_mlp": 1.00556791, + "epoch": 0.5454020777222008, + "flos": 466018334976.0, + "grad_norm": 0.03248736316688099, + "language_loss": 0.84558713, + "learning_rate": 0.00045085578821782175, + "loss": 0.85605574, + "num_input_tokens_seen": 236664192, + "router_z_loss_mlp": 0.41308594, + "step": 2835, + "time_per_iteration": 2.5900182723999023 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01057518, + "balance_loss_mlp": 1.01784515, + "epoch": 0.5455944594074644, + "flos": 1472617667328.0, + "grad_norm": 0.013168056581512213, + "language_loss": 0.76134741, + "learning_rate": 0.0004505457639647917, + "loss": 0.77192259, + "num_input_tokens_seen": 236888784, + "router_z_loss_mlp": 0.39648438, + "step": 2836, + "time_per_iteration": 4.910645961761475 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052223, + "balance_loss_mlp": 1.01100063, + "epoch": 0.545786841092728, + "flos": 534305309184.0, + "grad_norm": 0.02738620901632673, + "language_loss": 0.81102663, + "learning_rate": 0.00045023575891159866, + "loss": 0.82154894, + "num_input_tokens_seen": 236962528, + "router_z_loss_mlp": 0.41235352, + "step": 2837, + "time_per_iteration": 2.7457492351531982 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046684, + "balance_loss_mlp": 1.00682068, + "epoch": 0.5459792227779915, + "flos": 1355428740096.0, + "grad_norm": 0.008010480990562174, + "language_loss": 0.74763811, + "learning_rate": 0.00044992577317859764, + "loss": 0.75810492, + "num_input_tokens_seen": 237179360, + "router_z_loss_mlp": 0.3984375, + "step": 2838, + "time_per_iteration": 4.94202995300293 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048729, + "balance_loss_mlp": 1.00748277, + "epoch": 0.5461716044632551, + "flos": 639073117440.0, + "grad_norm": 0.02877585305336934, + "language_loss": 0.78956163, + "learning_rate": 0.0004496158068861354, + "loss": 0.80004895, + "num_input_tokens_seen": 237256240, + "router_z_loss_mlp": 0.41259766, + "step": 2839, + "time_per_iteration": 2.808370590209961 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047651, + "balance_loss_mlp": 1.00642872, + "epoch": 0.5463639861485187, + "flos": 603926938368.0, + "grad_norm": 0.03433602558833516, + "language_loss": 0.81297666, + "learning_rate": 0.00044930586015455207, + "loss": 0.82345319, + "num_input_tokens_seen": 237334272, + "router_z_loss_mlp": 0.41235352, + "step": 2840, + "time_per_iteration": 2.782735824584961 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048126, + "balance_loss_mlp": 1.00695133, + "epoch": 0.5465563678337823, + "flos": 643753635840.0, + "grad_norm": 0.02662038136573285, + "language_loss": 0.89087546, + "learning_rate": 0.000448995933104179, + "loss": 0.9013567, + "num_input_tokens_seen": 237415408, + "router_z_loss_mlp": 0.41186523, + "step": 2841, + "time_per_iteration": 2.869476318359375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050304, + "balance_loss_mlp": 1.0090816, + "epoch": 0.5467487495190458, + "flos": 615365290752.0, + "grad_norm": 0.03719587304070891, + "language_loss": 0.80725658, + "learning_rate": 0.00044868602585534077, + "loss": 0.81775963, + "num_input_tokens_seen": 237493232, + "router_z_loss_mlp": 0.41235352, + "step": 2842, + "time_per_iteration": 2.843027353286743 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046698, + "balance_loss_mlp": 1.00552344, + "epoch": 0.5469411312043093, + "flos": 462128109312.0, + "grad_norm": 0.03959126806850753, + "language_loss": 0.89450765, + "learning_rate": 0.0004483761385283541, + "loss": 0.90497464, + "num_input_tokens_seen": 237556624, + "router_z_loss_mlp": 0.41186523, + "step": 2843, + "time_per_iteration": 2.5162315368652344 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044431, + "balance_loss_mlp": 1.00332797, + "epoch": 0.5471335128895729, + "flos": 562267888896.0, + "grad_norm": 0.03475490738980998, + "language_loss": 0.82207608, + "learning_rate": 0.0004480662712435281, + "loss": 0.83252037, + "num_input_tokens_seen": 237632048, + "router_z_loss_mlp": 0.41113281, + "step": 2844, + "time_per_iteration": 2.7367589473724365 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045579, + "balance_loss_mlp": 1.0045476, + "epoch": 0.5473258945748365, + "flos": 519686343936.0, + "grad_norm": 0.032685207895773144, + "language_loss": 0.8903448, + "learning_rate": 0.0004477564241211635, + "loss": 0.90080059, + "num_input_tokens_seen": 237699840, + "router_z_loss_mlp": 0.41040039, + "step": 2845, + "time_per_iteration": 2.6059961318969727 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047529, + "balance_loss_mlp": 1.00640178, + "epoch": 0.5475182762601001, + "flos": 434744884992.0, + "grad_norm": 0.035185291050346845, + "language_loss": 0.87463105, + "learning_rate": 0.0004474465972815541, + "loss": 0.88510644, + "num_input_tokens_seen": 237762560, + "router_z_loss_mlp": 0.41137695, + "step": 2846, + "time_per_iteration": 2.5159108638763428 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049143, + "balance_loss_mlp": 1.00808775, + "epoch": 0.5477106579453636, + "flos": 512574676224.0, + "grad_norm": 0.03033857724648134, + "language_loss": 0.88145, + "learning_rate": 0.000447136790844985, + "loss": 0.89194143, + "num_input_tokens_seen": 237837152, + "router_z_loss_mlp": 0.41064453, + "step": 2847, + "time_per_iteration": 2.7494916915893555 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049923, + "balance_loss_mlp": 1.00889075, + "epoch": 0.5479030396306271, + "flos": 677141339904.0, + "grad_norm": 0.030728657632270156, + "language_loss": 0.81529921, + "learning_rate": 0.00044682700493173385, + "loss": 0.82579845, + "num_input_tokens_seen": 237909488, + "router_z_loss_mlp": 0.41040039, + "step": 2848, + "time_per_iteration": 2.8558499813079834 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043616, + "balance_loss_mlp": 1.00260758, + "epoch": 0.5480954213158907, + "flos": 877579346688.0, + "grad_norm": 0.03576262257130289, + "language_loss": 0.80969125, + "learning_rate": 0.00044651723966207004, + "loss": 0.82012743, + "num_input_tokens_seen": 237991056, + "router_z_loss_mlp": 0.41015625, + "step": 2849, + "time_per_iteration": 3.1599223613739014 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048693, + "balance_loss_mlp": 1.00768459, + "epoch": 0.5482878030011543, + "flos": 623175877632.0, + "grad_norm": 0.0450385792128453, + "language_loss": 0.79220605, + "learning_rate": 0.00044620749515625536, + "loss": 0.80269301, + "num_input_tokens_seen": 238064576, + "router_z_loss_mlp": 0.41015625, + "step": 2850, + "time_per_iteration": 2.816164255142212 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044849, + "balance_loss_mlp": 1.00376952, + "epoch": 0.5484801846864179, + "flos": 498258021888.0, + "grad_norm": 0.033687612572946876, + "language_loss": 0.85353971, + "learning_rate": 0.00044589777153454334, + "loss": 0.86398828, + "num_input_tokens_seen": 238136464, + "router_z_loss_mlp": 0.41088867, + "step": 2851, + "time_per_iteration": 2.767086982727051 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042735, + "balance_loss_mlp": 1.00158429, + "epoch": 0.5486725663716814, + "flos": 443354512896.0, + "grad_norm": 0.032917884516517996, + "language_loss": 0.84102762, + "learning_rate": 0.00044558806891717895, + "loss": 0.85145497, + "num_input_tokens_seen": 238198912, + "router_z_loss_mlp": 0.41162109, + "step": 2852, + "time_per_iteration": 2.4791274070739746 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046594, + "balance_loss_mlp": 1.00560999, + "epoch": 0.548864948056945, + "flos": 656348753664.0, + "grad_norm": 0.02926310360240776, + "language_loss": 0.80048501, + "learning_rate": 0.0004452783874243998, + "loss": 0.81095093, + "num_input_tokens_seen": 238275184, + "router_z_loss_mlp": 0.40991211, + "step": 2853, + "time_per_iteration": 2.8510489463806152 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051891, + "balance_loss_mlp": 1.01100183, + "epoch": 0.5490573297422086, + "flos": 547141499904.0, + "grad_norm": 0.035598285504377866, + "language_loss": 0.85552013, + "learning_rate": 0.00044496872717643475, + "loss": 0.86603898, + "num_input_tokens_seen": 238348496, + "router_z_loss_mlp": 0.40893555, + "step": 2854, + "time_per_iteration": 2.6640069484710693 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107375, + "balance_loss_mlp": 1.03398132, + "epoch": 0.5492497114274721, + "flos": 1593763882752.0, + "grad_norm": 0.015003928091872471, + "language_loss": 0.77089292, + "learning_rate": 0.00044465908829350453, + "loss": 0.7816304, + "num_input_tokens_seen": 238578464, + "router_z_loss_mlp": 0.39746094, + "step": 2855, + "time_per_iteration": 4.924941778182983 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048775, + "balance_loss_mlp": 1.00791013, + "epoch": 0.5494420931127356, + "flos": 752270718720.0, + "grad_norm": 0.03382110809465603, + "language_loss": 0.82668245, + "learning_rate": 0.0004443494708958217, + "loss": 0.83717024, + "num_input_tokens_seen": 238660256, + "router_z_loss_mlp": 0.40869141, + "step": 2856, + "time_per_iteration": 2.9736838340759277 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049194, + "balance_loss_mlp": 1.00837672, + "epoch": 0.5496344747979992, + "flos": 627305230848.0, + "grad_norm": 0.02827813290363101, + "language_loss": 0.81289691, + "learning_rate": 0.0004440398751035906, + "loss": 0.82338881, + "num_input_tokens_seen": 238745856, + "router_z_loss_mlp": 0.40820312, + "step": 2857, + "time_per_iteration": 2.943936347961426 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053668, + "balance_loss_mlp": 1.01289868, + "epoch": 0.5498268564832628, + "flos": 524125789440.0, + "grad_norm": 0.04150845511788398, + "language_loss": 0.8407867, + "learning_rate": 0.00044373030103700645, + "loss": 0.85132337, + "num_input_tokens_seen": 238813888, + "router_z_loss_mlp": 0.40771484, + "step": 2858, + "time_per_iteration": 2.5977840423583984 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047985, + "balance_loss_mlp": 1.00719178, + "epoch": 0.5500192381685264, + "flos": 605778732288.0, + "grad_norm": 0.03313045470580536, + "language_loss": 0.80440414, + "learning_rate": 0.000443420748816257, + "loss": 0.81488407, + "num_input_tokens_seen": 238885440, + "router_z_loss_mlp": 0.40795898, + "step": 2859, + "time_per_iteration": 2.7645347118377686 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049935, + "balance_loss_mlp": 1.00914145, + "epoch": 0.55021161985379, + "flos": 521655756288.0, + "grad_norm": 0.037659665058523445, + "language_loss": 0.79047614, + "learning_rate": 0.0004431112185615208, + "loss": 0.8009755, + "num_input_tokens_seen": 238960944, + "router_z_loss_mlp": 0.40795898, + "step": 2860, + "time_per_iteration": 2.7862706184387207 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043913, + "balance_loss_mlp": 1.00302446, + "epoch": 0.5504040015390534, + "flos": 490655460096.0, + "grad_norm": 0.03348154415794888, + "language_loss": 0.8037793, + "learning_rate": 0.00044280171039296845, + "loss": 0.8142184, + "num_input_tokens_seen": 239030592, + "router_z_loss_mlp": 0.40893555, + "step": 2861, + "time_per_iteration": 2.6561086177825928 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052487, + "balance_loss_mlp": 1.01166964, + "epoch": 0.550596383224317, + "flos": 576862554624.0, + "grad_norm": 0.03513860333112342, + "language_loss": 0.88868964, + "learning_rate": 0.0004424922244307616, + "loss": 0.89921451, + "num_input_tokens_seen": 239097440, + "router_z_loss_mlp": 0.40820312, + "step": 2862, + "time_per_iteration": 2.7066099643707275 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053826, + "balance_loss_mlp": 1.01298499, + "epoch": 0.5507887649095806, + "flos": 643634072064.0, + "grad_norm": 0.03653258974946179, + "language_loss": 0.82663441, + "learning_rate": 0.00044218276079505315, + "loss": 0.83717263, + "num_input_tokens_seen": 239179872, + "router_z_loss_mlp": 0.40844727, + "step": 2863, + "time_per_iteration": 2.87058162689209 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049384, + "balance_loss_mlp": 1.00856698, + "epoch": 0.5509811465948442, + "flos": 532865674752.0, + "grad_norm": 0.034931125724459874, + "language_loss": 0.75083911, + "learning_rate": 0.0004418733196059876, + "loss": 0.76133299, + "num_input_tokens_seen": 239251264, + "router_z_loss_mlp": 0.40820312, + "step": 2864, + "time_per_iteration": 2.690927743911743 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048154, + "balance_loss_mlp": 1.00719357, + "epoch": 0.5511735282801077, + "flos": 655984226304.0, + "grad_norm": 0.03582782743987034, + "language_loss": 0.80482149, + "learning_rate": 0.0004415639009837008, + "loss": 0.81530309, + "num_input_tokens_seen": 239326688, + "router_z_loss_mlp": 0.40966797, + "step": 2865, + "time_per_iteration": 2.8515002727508545 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050322, + "balance_loss_mlp": 1.00948107, + "epoch": 0.5513659099653713, + "flos": 530610469632.0, + "grad_norm": 0.03216902856467023, + "language_loss": 0.82250589, + "learning_rate": 0.00044125450504831955, + "loss": 0.83300906, + "num_input_tokens_seen": 239401248, + "router_z_loss_mlp": 0.40844727, + "step": 2866, + "time_per_iteration": 2.743833303451538 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053087, + "balance_loss_mlp": 1.01229346, + "epoch": 0.5515582916506349, + "flos": 555974704128.0, + "grad_norm": 0.03636447949545943, + "language_loss": 0.827411, + "learning_rate": 0.0004409451319199622, + "loss": 0.83794183, + "num_input_tokens_seen": 239471600, + "router_z_loss_mlp": 0.40795898, + "step": 2867, + "time_per_iteration": 2.654466390609741 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045403, + "balance_loss_mlp": 1.00439477, + "epoch": 0.5517506733358984, + "flos": 736772999424.0, + "grad_norm": 0.03752588301556939, + "language_loss": 0.85160595, + "learning_rate": 0.0004406357817187381, + "loss": 0.86206001, + "num_input_tokens_seen": 239548592, + "router_z_loss_mlp": 0.41015625, + "step": 2868, + "time_per_iteration": 2.9610273838043213 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051499, + "balance_loss_mlp": 1.01065779, + "epoch": 0.551943055021162, + "flos": 1117190818560.0, + "grad_norm": 0.028811275091252902, + "language_loss": 0.81857193, + "learning_rate": 0.0004403264545647474, + "loss": 0.8290869, + "num_input_tokens_seen": 239644432, + "router_z_loss_mlp": 0.40844727, + "step": 2869, + "time_per_iteration": 3.511462450027466 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043012, + "balance_loss_mlp": 1.00195587, + "epoch": 0.5521354367064255, + "flos": 545502588672.0, + "grad_norm": 0.03184831617373855, + "language_loss": 0.85004073, + "learning_rate": 0.00044001715057808154, + "loss": 0.86047089, + "num_input_tokens_seen": 239723392, + "router_z_loss_mlp": 0.41064453, + "step": 2870, + "time_per_iteration": 2.744248390197754 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048495, + "balance_loss_mlp": 1.00746286, + "epoch": 0.5523278183916891, + "flos": 937872986880.0, + "grad_norm": 0.03348956391566461, + "language_loss": 0.81933939, + "learning_rate": 0.0004397078698788232, + "loss": 0.82982433, + "num_input_tokens_seen": 239806896, + "router_z_loss_mlp": 0.41040039, + "step": 2871, + "time_per_iteration": 3.193040132522583 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052254, + "balance_loss_mlp": 1.01277161, + "epoch": 0.5525202000769527, + "flos": 1469101684224.0, + "grad_norm": 0.00853782264427079, + "language_loss": 0.80442369, + "learning_rate": 0.0004393986125870456, + "loss": 0.81494617, + "num_input_tokens_seen": 240037824, + "router_z_loss_mlp": 0.39453125, + "step": 2872, + "time_per_iteration": 4.887877702713013 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050207, + "balance_loss_mlp": 1.00917542, + "epoch": 0.5527125817622163, + "flos": 490785717504.0, + "grad_norm": 0.036240955421061, + "language_loss": 0.78392744, + "learning_rate": 0.00043908937882281343, + "loss": 0.79442948, + "num_input_tokens_seen": 240107952, + "router_z_loss_mlp": 0.41040039, + "step": 2873, + "time_per_iteration": 2.5992209911346436 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045201, + "balance_loss_mlp": 1.00414526, + "epoch": 0.5529049634474797, + "flos": 636149128704.0, + "grad_norm": 0.03461125376652938, + "language_loss": 0.82969832, + "learning_rate": 0.0004387801687061814, + "loss": 0.84015036, + "num_input_tokens_seen": 240183824, + "router_z_loss_mlp": 0.41064453, + "step": 2874, + "time_per_iteration": 2.8166332244873047 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045216, + "balance_loss_mlp": 1.00408852, + "epoch": 0.5530973451327433, + "flos": 582435432960.0, + "grad_norm": 0.031639900781256135, + "language_loss": 0.81371784, + "learning_rate": 0.0004384709823571958, + "loss": 0.82416999, + "num_input_tokens_seen": 240259296, + "router_z_loss_mlp": 0.41137695, + "step": 2875, + "time_per_iteration": 2.7777786254882812 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045687, + "balance_loss_mlp": 1.00458348, + "epoch": 0.5532897268180069, + "flos": 1124330676480.0, + "grad_norm": 0.03430168550584483, + "language_loss": 0.83714402, + "learning_rate": 0.0004381618198958932, + "loss": 0.84760094, + "num_input_tokens_seen": 240346768, + "router_z_loss_mlp": 0.41113281, + "step": 2876, + "time_per_iteration": 3.517432451248169 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046562, + "balance_loss_mlp": 1.00536335, + "epoch": 0.5534821085032705, + "flos": 638513203968.0, + "grad_norm": 0.03082674119581989, + "language_loss": 0.83886576, + "learning_rate": 0.00043785268144230137, + "loss": 0.84933138, + "num_input_tokens_seen": 240429344, + "router_z_loss_mlp": 0.41210938, + "step": 2877, + "time_per_iteration": 2.9488272666931152 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048169, + "balance_loss_mlp": 1.0069226, + "epoch": 0.5536744901885341, + "flos": 572217029376.0, + "grad_norm": 0.037462471463683845, + "language_loss": 0.8303535, + "learning_rate": 0.00043754356711643837, + "loss": 0.84083521, + "num_input_tokens_seen": 240497008, + "router_z_loss_mlp": 0.41259766, + "step": 2878, + "time_per_iteration": 2.669304370880127 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045925, + "balance_loss_mlp": 1.00479829, + "epoch": 0.5538668718737976, + "flos": 596917337856.0, + "grad_norm": 0.03146432649645385, + "language_loss": 0.84558415, + "learning_rate": 0.0004372344770383132, + "loss": 0.8560434, + "num_input_tokens_seen": 240578432, + "router_z_loss_mlp": 0.41137695, + "step": 2879, + "time_per_iteration": 2.855231761932373 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050888, + "balance_loss_mlp": 1.0097847, + "epoch": 0.5540592535590612, + "flos": 533719150848.0, + "grad_norm": 0.0358528854453713, + "language_loss": 0.83432066, + "learning_rate": 0.00043692541132792507, + "loss": 0.84482956, + "num_input_tokens_seen": 240649136, + "router_z_loss_mlp": 0.41113281, + "step": 2880, + "time_per_iteration": 2.662008047103882 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051879, + "balance_loss_mlp": 1.01070428, + "epoch": 0.5542516352443247, + "flos": 413505146112.0, + "grad_norm": 0.035032849721931915, + "language_loss": 0.83894408, + "learning_rate": 0.00043661637010526384, + "loss": 0.84946287, + "num_input_tokens_seen": 240714240, + "router_z_loss_mlp": 0.41186523, + "step": 2881, + "time_per_iteration": 2.507699489593506 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104607, + "balance_loss_mlp": 1.00484717, + "epoch": 0.5544440169295883, + "flos": 548678343936.0, + "grad_norm": 0.03314086611141918, + "language_loss": 0.83246458, + "learning_rate": 0.00043630735349031025, + "loss": 0.84292531, + "num_input_tokens_seen": 240786928, + "router_z_loss_mlp": 0.41235352, + "step": 2882, + "time_per_iteration": 2.70409893989563 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047409, + "balance_loss_mlp": 1.00623393, + "epoch": 0.5546363986148518, + "flos": 623034926592.0, + "grad_norm": 0.03282028788454341, + "language_loss": 0.82495463, + "learning_rate": 0.00043599836160303495, + "loss": 0.83542871, + "num_input_tokens_seen": 240865328, + "router_z_loss_mlp": 0.41186523, + "step": 2883, + "time_per_iteration": 2.900757312774658 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046682, + "balance_loss_mlp": 1.00550687, + "epoch": 0.5548287803001154, + "flos": 706580492544.0, + "grad_norm": 0.029978122278870225, + "language_loss": 0.78110325, + "learning_rate": 0.0004356893945633995, + "loss": 0.79157007, + "num_input_tokens_seen": 240945680, + "router_z_loss_mlp": 0.41186523, + "step": 2884, + "time_per_iteration": 2.975062608718872 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046147, + "balance_loss_mlp": 1.00501966, + "epoch": 0.555021161985379, + "flos": 505184997120.0, + "grad_norm": 0.033025085572570244, + "language_loss": 0.82143605, + "learning_rate": 0.0004353804524913551, + "loss": 0.83189756, + "num_input_tokens_seen": 241010800, + "router_z_loss_mlp": 0.41137695, + "step": 2885, + "time_per_iteration": 2.6369645595550537 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046205, + "balance_loss_mlp": 1.00512528, + "epoch": 0.5552135436706426, + "flos": 617210281728.0, + "grad_norm": 0.0369840001422722, + "language_loss": 0.82350749, + "learning_rate": 0.0004350715355068441, + "loss": 0.83396947, + "num_input_tokens_seen": 241085328, + "router_z_loss_mlp": 0.41088867, + "step": 2886, + "time_per_iteration": 2.727186441421509 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044964, + "balance_loss_mlp": 1.00393176, + "epoch": 0.5554059253559062, + "flos": 464817828096.0, + "grad_norm": 0.043659618464352824, + "language_loss": 0.80073905, + "learning_rate": 0.00043476264372979847, + "loss": 0.8111887, + "num_input_tokens_seen": 241149600, + "router_z_loss_mlp": 0.41040039, + "step": 2887, + "time_per_iteration": 2.5368049144744873 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044535, + "balance_loss_mlp": 1.00357509, + "epoch": 0.5555983070411696, + "flos": 1564876885248.0, + "grad_norm": 0.03408551435207337, + "language_loss": 0.79322737, + "learning_rate": 0.0004344537772801408, + "loss": 0.80367273, + "num_input_tokens_seen": 241244832, + "router_z_loss_mlp": 0.40966797, + "step": 2888, + "time_per_iteration": 3.869920015335083 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01057491, + "balance_loss_mlp": 1.01791382, + "epoch": 0.5557906887264332, + "flos": 1471229544192.0, + "grad_norm": 0.014769088101488215, + "language_loss": 0.73422456, + "learning_rate": 0.0004341449362777836, + "loss": 0.74479944, + "num_input_tokens_seen": 241479728, + "router_z_loss_mlp": 0.39550781, + "step": 2889, + "time_per_iteration": 4.936699867248535 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047136, + "balance_loss_mlp": 1.00608003, + "epoch": 0.5559830704116968, + "flos": 530864181504.0, + "grad_norm": 0.0376436874687178, + "language_loss": 0.83696067, + "learning_rate": 0.0004338361208426298, + "loss": 0.84743202, + "num_input_tokens_seen": 241545616, + "router_z_loss_mlp": 0.41064453, + "step": 2890, + "time_per_iteration": 2.6094541549682617 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051706, + "balance_loss_mlp": 1.01069844, + "epoch": 0.5561754520969604, + "flos": 652519766016.0, + "grad_norm": 0.029226912064567154, + "language_loss": 0.81876659, + "learning_rate": 0.00043352733109457164, + "loss": 0.82928365, + "num_input_tokens_seen": 241629040, + "router_z_loss_mlp": 0.41015625, + "step": 2891, + "time_per_iteration": 2.8833718299865723 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050883, + "balance_loss_mlp": 1.00985098, + "epoch": 0.556367833782224, + "flos": 735620124672.0, + "grad_norm": 0.029092214279724596, + "language_loss": 0.84975475, + "learning_rate": 0.00043321856715349244, + "loss": 0.86026359, + "num_input_tokens_seen": 241706272, + "router_z_loss_mlp": 0.41040039, + "step": 2892, + "time_per_iteration": 2.9798240661621094 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046881, + "balance_loss_mlp": 1.00575387, + "epoch": 0.5565602154674875, + "flos": 673641886464.0, + "grad_norm": 0.03553967461394851, + "language_loss": 0.81101406, + "learning_rate": 0.00043290982913926466, + "loss": 0.8214829, + "num_input_tokens_seen": 241782304, + "router_z_loss_mlp": 0.41137695, + "step": 2893, + "time_per_iteration": 2.8139491081237793 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045687, + "balance_loss_mlp": 1.00463176, + "epoch": 0.556752597152751, + "flos": 587504778240.0, + "grad_norm": 0.036653967015968944, + "language_loss": 0.84921324, + "learning_rate": 0.0004326011171717514, + "loss": 0.85967016, + "num_input_tokens_seen": 241868576, + "router_z_loss_mlp": 0.41064453, + "step": 2894, + "time_per_iteration": 2.9087953567504883 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046707, + "balance_loss_mlp": 1.00555551, + "epoch": 0.5569449788380146, + "flos": 438691491072.0, + "grad_norm": 0.03515530628910635, + "language_loss": 0.81422639, + "learning_rate": 0.0004322924313708051, + "loss": 0.82469344, + "num_input_tokens_seen": 241933696, + "router_z_loss_mlp": 0.41162109, + "step": 2895, + "time_per_iteration": 2.529937505722046 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051552, + "balance_loss_mlp": 1.01054382, + "epoch": 0.5571373605232782, + "flos": 503248632576.0, + "grad_norm": 0.03724847922393753, + "language_loss": 0.84896851, + "learning_rate": 0.0004319837718562681, + "loss": 0.85948396, + "num_input_tokens_seen": 242003056, + "router_z_loss_mlp": 0.41015625, + "step": 2896, + "time_per_iteration": 2.6142115592956543 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047151, + "balance_loss_mlp": 1.00599957, + "epoch": 0.5573297422085417, + "flos": 578590894080.0, + "grad_norm": 0.04905398235042313, + "language_loss": 0.83417499, + "learning_rate": 0.0004316751387479726, + "loss": 0.84464645, + "num_input_tokens_seen": 242076368, + "router_z_loss_mlp": 0.41162109, + "step": 2897, + "time_per_iteration": 2.7738893032073975 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046687, + "balance_loss_mlp": 1.00555933, + "epoch": 0.5575221238938053, + "flos": 1346049251328.0, + "grad_norm": 0.03588075887117774, + "language_loss": 0.82779884, + "learning_rate": 0.0004313665321657409, + "loss": 0.83826572, + "num_input_tokens_seen": 242161600, + "router_z_loss_mlp": 0.41137695, + "step": 2898, + "time_per_iteration": 3.725510835647583 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047713, + "balance_loss_mlp": 1.00672877, + "epoch": 0.5577145055790689, + "flos": 603099707136.0, + "grad_norm": 0.03720848090960627, + "language_loss": 0.80283779, + "learning_rate": 0.00043105795222938436, + "loss": 0.81331486, + "num_input_tokens_seen": 242237904, + "router_z_loss_mlp": 0.40991211, + "step": 2899, + "time_per_iteration": 2.7282700538635254 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049139, + "balance_loss_mlp": 1.00829744, + "epoch": 0.5579068872643325, + "flos": 563691972096.0, + "grad_norm": 0.03568825250494595, + "language_loss": 0.79214776, + "learning_rate": 0.00043074939905870467, + "loss": 0.80263913, + "num_input_tokens_seen": 242306736, + "router_z_loss_mlp": 0.40844727, + "step": 2900, + "time_per_iteration": 2.696354389190674 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104484, + "balance_loss_mlp": 1.00399923, + "epoch": 0.558099268949596, + "flos": 545589104640.0, + "grad_norm": 0.04035642488371941, + "language_loss": 0.81151342, + "learning_rate": 0.0004304408727734927, + "loss": 0.82196188, + "num_input_tokens_seen": 242376000, + "router_z_loss_mlp": 0.40844727, + "step": 2901, + "time_per_iteration": 2.6394877433776855 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044507, + "balance_loss_mlp": 1.00366592, + "epoch": 0.5582916506348595, + "flos": 553853647104.0, + "grad_norm": 0.036813902208390564, + "language_loss": 0.89428526, + "learning_rate": 0.0004301323734935288, + "loss": 0.90473032, + "num_input_tokens_seen": 242447056, + "router_z_loss_mlp": 0.40844727, + "step": 2902, + "time_per_iteration": 2.659945249557495 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047193, + "balance_loss_mlp": 1.00635207, + "epoch": 0.5584840323201231, + "flos": 544425536256.0, + "grad_norm": 0.03290970227186249, + "language_loss": 0.87933898, + "learning_rate": 0.000429823901338583, + "loss": 0.88981086, + "num_input_tokens_seen": 242514400, + "router_z_loss_mlp": 0.40844727, + "step": 2903, + "time_per_iteration": 2.643388032913208 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045236, + "balance_loss_mlp": 1.00432324, + "epoch": 0.5586764140053867, + "flos": 817023246336.0, + "grad_norm": 0.03162840926526219, + "language_loss": 0.87249023, + "learning_rate": 0.00042951545642841513, + "loss": 0.88294262, + "num_input_tokens_seen": 242601616, + "router_z_loss_mlp": 0.40917969, + "step": 2904, + "time_per_iteration": 3.0901763439178467 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047509, + "balance_loss_mlp": 1.00642967, + "epoch": 0.5588687956906503, + "flos": 487416521472.0, + "grad_norm": 0.02951660315659268, + "language_loss": 0.87151515, + "learning_rate": 0.0004292070388827737, + "loss": 0.88199031, + "num_input_tokens_seen": 242669648, + "router_z_loss_mlp": 0.41088867, + "step": 2905, + "time_per_iteration": 2.6241614818573 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050702, + "balance_loss_mlp": 1.00967062, + "epoch": 0.5590611773759138, + "flos": 453069383424.0, + "grad_norm": 0.03428125950398782, + "language_loss": 0.81863332, + "learning_rate": 0.00042889864882139753, + "loss": 0.82914031, + "num_input_tokens_seen": 242737456, + "router_z_loss_mlp": 0.41040039, + "step": 2906, + "time_per_iteration": 2.6295247077941895 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051332, + "balance_loss_mlp": 1.01025224, + "epoch": 0.5592535590611774, + "flos": 521957100288.0, + "grad_norm": 0.03203389874594117, + "language_loss": 0.82458705, + "learning_rate": 0.0004285902863640139, + "loss": 0.83510035, + "num_input_tokens_seen": 242807008, + "router_z_loss_mlp": 0.41088867, + "step": 2907, + "time_per_iteration": 2.6310994625091553 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044647, + "balance_loss_mlp": 1.00366294, + "epoch": 0.5594459407464409, + "flos": 553601880576.0, + "grad_norm": 0.029509403523767207, + "language_loss": 0.86282808, + "learning_rate": 0.00042828195163033966, + "loss": 0.87327456, + "num_input_tokens_seen": 242877328, + "router_z_loss_mlp": 0.40991211, + "step": 2908, + "time_per_iteration": 2.720059871673584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104389, + "balance_loss_mlp": 1.00285828, + "epoch": 0.5596383224317045, + "flos": 485788303872.0, + "grad_norm": 0.032784621074408576, + "language_loss": 0.796462, + "learning_rate": 0.0004279736447400812, + "loss": 0.80690086, + "num_input_tokens_seen": 242943152, + "router_z_loss_mlp": 0.41040039, + "step": 2909, + "time_per_iteration": 2.562958240509033 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044175, + "balance_loss_mlp": 1.00323904, + "epoch": 0.5598307041169681, + "flos": 612380064000.0, + "grad_norm": 0.03125271468065307, + "language_loss": 0.78822809, + "learning_rate": 0.00042766536581293385, + "loss": 0.79866982, + "num_input_tokens_seen": 243014656, + "router_z_loss_mlp": 0.40942383, + "step": 2910, + "time_per_iteration": 2.742727041244507 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104403, + "balance_loss_mlp": 1.00297463, + "epoch": 0.5600230858022316, + "flos": 489917657088.0, + "grad_norm": 0.033084161668713065, + "language_loss": 0.80192208, + "learning_rate": 0.0004273571149685819, + "loss": 0.81236243, + "num_input_tokens_seen": 243089040, + "router_z_loss_mlp": 0.41064453, + "step": 2911, + "time_per_iteration": 2.7333109378814697 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01041877, + "balance_loss_mlp": 1.00091636, + "epoch": 0.5602154674874952, + "flos": 599982277632.0, + "grad_norm": 0.033670817346998394, + "language_loss": 0.84396589, + "learning_rate": 0.00042704889232669937, + "loss": 0.8543846, + "num_input_tokens_seen": 243162480, + "router_z_loss_mlp": 0.40966797, + "step": 2912, + "time_per_iteration": 2.7085225582122803 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044153, + "balance_loss_mlp": 1.00336003, + "epoch": 0.5604078491727588, + "flos": 587063461632.0, + "grad_norm": 0.043754524068974454, + "language_loss": 0.8611334, + "learning_rate": 0.0004267406980069484, + "loss": 0.87157494, + "num_input_tokens_seen": 243232880, + "router_z_loss_mlp": 0.40795898, + "step": 2913, + "time_per_iteration": 2.747812271118164 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043762, + "balance_loss_mlp": 1.00275385, + "epoch": 0.5606002308580224, + "flos": 542328778752.0, + "grad_norm": 0.02876490223829942, + "language_loss": 0.7993964, + "learning_rate": 0.0004264325321289808, + "loss": 0.80983406, + "num_input_tokens_seen": 243309168, + "router_z_loss_mlp": 0.41015625, + "step": 2914, + "time_per_iteration": 2.8028316497802734 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043869, + "balance_loss_mlp": 1.0028609, + "epoch": 0.5607926125432858, + "flos": 585079464960.0, + "grad_norm": 0.03419971609404561, + "language_loss": 0.86714381, + "learning_rate": 0.00042612439481243736, + "loss": 0.87758255, + "num_input_tokens_seen": 243382064, + "router_z_loss_mlp": 0.41015625, + "step": 2915, + "time_per_iteration": 2.7691102027893066 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045259, + "balance_loss_mlp": 1.00417948, + "epoch": 0.5609849942285494, + "flos": 628631137536.0, + "grad_norm": 0.0372312942186238, + "language_loss": 0.90099525, + "learning_rate": 0.00042581628617694735, + "loss": 0.91144788, + "num_input_tokens_seen": 243452064, + "router_z_loss_mlp": 0.41088867, + "step": 2916, + "time_per_iteration": 2.7420172691345215 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043547, + "balance_loss_mlp": 1.00261009, + "epoch": 0.561177375913813, + "flos": 589455727104.0, + "grad_norm": 0.03338895186153077, + "language_loss": 0.82208467, + "learning_rate": 0.0004255082063421296, + "loss": 0.83252013, + "num_input_tokens_seen": 243525600, + "router_z_loss_mlp": 0.40942383, + "step": 2917, + "time_per_iteration": 2.673243999481201 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042752, + "balance_loss_mlp": 1.0016005, + "epoch": 0.5613697575990766, + "flos": 528144327168.0, + "grad_norm": 0.03066260992789867, + "language_loss": 0.85543269, + "learning_rate": 0.00042520015542759065, + "loss": 0.86586022, + "num_input_tokens_seen": 243605536, + "router_z_loss_mlp": 0.41162109, + "step": 2918, + "time_per_iteration": 2.879850387573242 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104302, + "balance_loss_mlp": 1.00201178, + "epoch": 0.5615621392843402, + "flos": 643875144960.0, + "grad_norm": 0.028477148441929827, + "language_loss": 0.88382292, + "learning_rate": 0.00042489213355292687, + "loss": 0.89425319, + "num_input_tokens_seen": 243684208, + "router_z_loss_mlp": 0.41015625, + "step": 2919, + "time_per_iteration": 2.9279518127441406 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044234, + "balance_loss_mlp": 1.00315475, + "epoch": 0.5617545209696037, + "flos": 428657779968.0, + "grad_norm": 0.03756668389237789, + "language_loss": 0.81703657, + "learning_rate": 0.00042458414083772276, + "loss": 0.82747889, + "num_input_tokens_seen": 243749376, + "router_z_loss_mlp": 0.41088867, + "step": 2920, + "time_per_iteration": 2.5474023818969727 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044671, + "balance_loss_mlp": 1.00371051, + "epoch": 0.5619469026548672, + "flos": 569590493952.0, + "grad_norm": 0.029467937694277743, + "language_loss": 0.85509026, + "learning_rate": 0.000424276177401552, + "loss": 0.86553693, + "num_input_tokens_seen": 243828096, + "router_z_loss_mlp": 0.40966797, + "step": 2921, + "time_per_iteration": 2.797123670578003 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043566, + "balance_loss_mlp": 1.00260556, + "epoch": 0.5621392843401308, + "flos": 506244552960.0, + "grad_norm": 0.03575401527758356, + "language_loss": 0.86372185, + "learning_rate": 0.0004239682433639763, + "loss": 0.87415743, + "num_input_tokens_seen": 243896752, + "router_z_loss_mlp": 0.40966797, + "step": 2922, + "time_per_iteration": 2.6631922721862793 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043748, + "balance_loss_mlp": 1.00281191, + "epoch": 0.5623316660253944, + "flos": 518010494208.0, + "grad_norm": 0.03518251960287723, + "language_loss": 0.86062789, + "learning_rate": 0.0004236603388445467, + "loss": 0.87106532, + "num_input_tokens_seen": 243964592, + "router_z_loss_mlp": 0.40942383, + "step": 2923, + "time_per_iteration": 2.60380482673645 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044966, + "balance_loss_mlp": 1.00410116, + "epoch": 0.5625240477106579, + "flos": 607139632128.0, + "grad_norm": 0.03089029411800112, + "language_loss": 0.82301855, + "learning_rate": 0.00042335246396280166, + "loss": 0.8334682, + "num_input_tokens_seen": 244036656, + "router_z_loss_mlp": 0.40869141, + "step": 2924, + "time_per_iteration": 2.7605555057525635 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045584, + "balance_loss_mlp": 1.00462389, + "epoch": 0.5627164293959215, + "flos": 451341043968.0, + "grad_norm": 0.04701230911743114, + "language_loss": 0.91272092, + "learning_rate": 0.0004230446188382693, + "loss": 0.92317677, + "num_input_tokens_seen": 244102704, + "router_z_loss_mlp": 0.40966797, + "step": 2925, + "time_per_iteration": 2.5571765899658203 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042518, + "balance_loss_mlp": 1.00158191, + "epoch": 0.5629088110811851, + "flos": 743437514496.0, + "grad_norm": 0.0349005963329915, + "language_loss": 0.81125653, + "learning_rate": 0.0004227368035904654, + "loss": 0.82168174, + "num_input_tokens_seen": 244186640, + "router_z_loss_mlp": 0.40942383, + "step": 2926, + "time_per_iteration": 3.0334270000457764 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043071, + "balance_loss_mlp": 1.00211096, + "epoch": 0.5631011927664487, + "flos": 497980010496.0, + "grad_norm": 0.0467260030557379, + "language_loss": 0.83361161, + "learning_rate": 0.00042242901833889474, + "loss": 0.84404236, + "num_input_tokens_seen": 244257680, + "router_z_loss_mlp": 0.40966797, + "step": 2927, + "time_per_iteration": 2.6271822452545166 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042546, + "balance_loss_mlp": 1.00153816, + "epoch": 0.5632935744517122, + "flos": 887595561216.0, + "grad_norm": 0.03653524957968277, + "language_loss": 0.8629514, + "learning_rate": 0.0004221212632030501, + "loss": 0.87337685, + "num_input_tokens_seen": 244331248, + "router_z_loss_mlp": 0.41015625, + "step": 2928, + "time_per_iteration": 3.1174416542053223 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046315, + "balance_loss_mlp": 1.00542605, + "epoch": 0.5634859561369757, + "flos": 605902186752.0, + "grad_norm": 0.04110669316721802, + "language_loss": 0.80746865, + "learning_rate": 0.0004218135383024124, + "loss": 0.81793177, + "num_input_tokens_seen": 244403920, + "router_z_loss_mlp": 0.40893555, + "step": 2929, + "time_per_iteration": 2.705615758895874 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01041411, + "balance_loss_mlp": 1.00056946, + "epoch": 0.5636783378222393, + "flos": 454903680768.0, + "grad_norm": 0.0339470495466753, + "language_loss": 0.85614669, + "learning_rate": 0.0004215058437564511, + "loss": 0.86656082, + "num_input_tokens_seen": 244470464, + "router_z_loss_mlp": 0.40844727, + "step": 2930, + "time_per_iteration": 2.5682146549224854 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01040814, + "balance_loss_mlp": 1.00006831, + "epoch": 0.5638707195075029, + "flos": 519462767616.0, + "grad_norm": 0.03372410984042782, + "language_loss": 0.82691574, + "learning_rate": 0.00042119817968462397, + "loss": 0.83732378, + "num_input_tokens_seen": 244536864, + "router_z_loss_mlp": 0.4074707, + "step": 2931, + "time_per_iteration": 2.6308341026306152 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105222, + "balance_loss_mlp": 1.01135468, + "epoch": 0.5640631011927665, + "flos": 565845110016.0, + "grad_norm": 0.03794773284405352, + "language_loss": 0.87544155, + "learning_rate": 0.0004208905462063766, + "loss": 0.88596374, + "num_input_tokens_seen": 244603344, + "router_z_loss_mlp": 0.40869141, + "step": 2932, + "time_per_iteration": 2.6615707874298096 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049556, + "balance_loss_mlp": 1.00866711, + "epoch": 0.56425548287803, + "flos": 518038684416.0, + "grad_norm": 0.03232798556838129, + "language_loss": 0.84722394, + "learning_rate": 0.00042058294344114315, + "loss": 0.85771948, + "num_input_tokens_seen": 244671984, + "router_z_loss_mlp": 0.40893555, + "step": 2933, + "time_per_iteration": 2.6182868480682373 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049116, + "balance_loss_mlp": 1.0083226, + "epoch": 0.5644478645632935, + "flos": 855670824192.0, + "grad_norm": 0.03170317888214056, + "language_loss": 0.78432804, + "learning_rate": 0.0004202753715083456, + "loss": 0.79481918, + "num_input_tokens_seen": 244754000, + "router_z_loss_mlp": 0.40795898, + "step": 2934, + "time_per_iteration": 3.0613481998443604 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045, + "balance_loss_mlp": 1.00420666, + "epoch": 0.5646402462485571, + "flos": 554496185856.0, + "grad_norm": 0.03929055225526713, + "language_loss": 0.81611717, + "learning_rate": 0.0004199678305273936, + "loss": 0.82656717, + "num_input_tokens_seen": 244820896, + "router_z_loss_mlp": 0.40795898, + "step": 2935, + "time_per_iteration": 2.634765386581421 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046418, + "balance_loss_mlp": 1.00552905, + "epoch": 0.5648326279338207, + "flos": 687312111360.0, + "grad_norm": 0.02956036273454178, + "language_loss": 0.8172124, + "learning_rate": 0.0004196603206176854, + "loss": 0.82767659, + "num_input_tokens_seen": 244904464, + "router_z_loss_mlp": 0.40893555, + "step": 2936, + "time_per_iteration": 2.9358084201812744 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048743, + "balance_loss_mlp": 1.00783014, + "epoch": 0.5650250096190843, + "flos": 804683785728.0, + "grad_norm": 0.03257366451462874, + "language_loss": 0.84142041, + "learning_rate": 0.000419352841898607, + "loss": 0.85190785, + "num_input_tokens_seen": 244983760, + "router_z_loss_mlp": 0.40917969, + "step": 2937, + "time_per_iteration": 2.9652152061462402 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049809, + "balance_loss_mlp": 1.00891984, + "epoch": 0.5652173913043478, + "flos": 583145045760.0, + "grad_norm": 0.037245032295536384, + "language_loss": 0.7792089, + "learning_rate": 0.000419045394489532, + "loss": 0.78970701, + "num_input_tokens_seen": 245053184, + "router_z_loss_mlp": 0.40893555, + "step": 2938, + "time_per_iteration": 2.6814448833465576 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048464, + "balance_loss_mlp": 1.00752795, + "epoch": 0.5654097729896114, + "flos": 822168413952.0, + "grad_norm": 0.03166469527574581, + "language_loss": 0.76863134, + "learning_rate": 0.0004187379785098224, + "loss": 0.77911597, + "num_input_tokens_seen": 245137408, + "router_z_loss_mlp": 0.40942383, + "step": 2939, + "time_per_iteration": 3.1437690258026123 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049709, + "balance_loss_mlp": 1.00881994, + "epoch": 0.565602154674875, + "flos": 785482478592.0, + "grad_norm": 0.035451368889273006, + "language_loss": 0.84531581, + "learning_rate": 0.00041843059407882744, + "loss": 0.85581291, + "num_input_tokens_seen": 245215504, + "router_z_loss_mlp": 0.40893555, + "step": 2940, + "time_per_iteration": 2.9561386108398438 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046361, + "balance_loss_mlp": 1.00554383, + "epoch": 0.5657945363601385, + "flos": 550744965888.0, + "grad_norm": 0.033205673863039784, + "language_loss": 0.83385015, + "learning_rate": 0.0004181232413158842, + "loss": 0.84431374, + "num_input_tokens_seen": 245286032, + "router_z_loss_mlp": 0.40820312, + "step": 2941, + "time_per_iteration": 2.6476027965545654 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047072, + "balance_loss_mlp": 1.0061357, + "epoch": 0.5659869180454021, + "flos": 669332698368.0, + "grad_norm": 0.03636978251075169, + "language_loss": 0.83073509, + "learning_rate": 0.0004178159203403179, + "loss": 0.84120584, + "num_input_tokens_seen": 245359040, + "router_z_loss_mlp": 0.40942383, + "step": 2942, + "time_per_iteration": 2.835840940475464 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049418, + "balance_loss_mlp": 1.00862479, + "epoch": 0.5661792997306656, + "flos": 500949686016.0, + "grad_norm": 0.030415094414242012, + "language_loss": 0.8213833, + "learning_rate": 0.0004175086312714409, + "loss": 0.83187747, + "num_input_tokens_seen": 245426384, + "router_z_loss_mlp": 0.40795898, + "step": 2943, + "time_per_iteration": 2.6258370876312256 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104981, + "balance_loss_mlp": 1.00911188, + "epoch": 0.5663716814159292, + "flos": 602363849472.0, + "grad_norm": 0.030374801338140925, + "language_loss": 0.84196591, + "learning_rate": 0.00041720137422855366, + "loss": 0.85246402, + "num_input_tokens_seen": 245501216, + "router_z_loss_mlp": 0.40698242, + "step": 2944, + "time_per_iteration": 2.753483772277832 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050206, + "balance_loss_mlp": 1.00948393, + "epoch": 0.5665640631011928, + "flos": 542033270784.0, + "grad_norm": 0.0327328941542846, + "language_loss": 0.79511452, + "learning_rate": 0.00041689414933094383, + "loss": 0.80561656, + "num_input_tokens_seen": 245571600, + "router_z_loss_mlp": 0.40722656, + "step": 2945, + "time_per_iteration": 2.6251614093780518 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047738, + "balance_loss_mlp": 1.00701642, + "epoch": 0.5667564447864564, + "flos": 603062768640.0, + "grad_norm": 0.03650681858880775, + "language_loss": 0.81631696, + "learning_rate": 0.00041658695669788653, + "loss": 0.82679439, + "num_input_tokens_seen": 245645632, + "router_z_loss_mlp": 0.40722656, + "step": 2946, + "time_per_iteration": 2.7196879386901855 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045027, + "balance_loss_mlp": 1.00432932, + "epoch": 0.5669488264717198, + "flos": 660723070464.0, + "grad_norm": 0.039783949444703086, + "language_loss": 0.82089484, + "learning_rate": 0.00041627979644864453, + "loss": 0.83134508, + "num_input_tokens_seen": 245715776, + "router_z_loss_mlp": 0.40698242, + "step": 2947, + "time_per_iteration": 2.8414080142974854 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043086, + "balance_loss_mlp": 1.00243521, + "epoch": 0.5671412081569834, + "flos": 486383210496.0, + "grad_norm": 0.029571262892964766, + "language_loss": 0.81883216, + "learning_rate": 0.0004159726687024683, + "loss": 0.82926297, + "num_input_tokens_seen": 245785328, + "router_z_loss_mlp": 0.40649414, + "step": 2948, + "time_per_iteration": 2.6365981101989746 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043624, + "balance_loss_mlp": 1.0029496, + "epoch": 0.567333589842247, + "flos": 731061115392.0, + "grad_norm": 0.03568675680792695, + "language_loss": 0.79577011, + "learning_rate": 0.00041566557357859506, + "loss": 0.80620635, + "num_input_tokens_seen": 245858000, + "router_z_loss_mlp": 0.40673828, + "step": 2949, + "time_per_iteration": 2.8660199642181396 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046952, + "balance_loss_mlp": 1.00618231, + "epoch": 0.5675259715275106, + "flos": 970559826432.0, + "grad_norm": 0.03148848509964497, + "language_loss": 0.79963183, + "learning_rate": 0.0004153585111962502, + "loss": 0.81010127, + "num_input_tokens_seen": 245950640, + "router_z_loss_mlp": 0.40771484, + "step": 2950, + "time_per_iteration": 3.284973382949829 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049042, + "balance_loss_mlp": 1.00824845, + "epoch": 0.5677183532127742, + "flos": 566214494976.0, + "grad_norm": 0.035222224981726044, + "language_loss": 0.84893769, + "learning_rate": 0.0004150514816746453, + "loss": 0.85942811, + "num_input_tokens_seen": 246019568, + "router_z_loss_mlp": 0.40795898, + "step": 2951, + "time_per_iteration": 2.688965082168579 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053398, + "balance_loss_mlp": 1.0126282, + "epoch": 0.5679107348980377, + "flos": 552746459136.0, + "grad_norm": 0.03211470229094595, + "language_loss": 0.86231828, + "learning_rate": 0.0004147444851329802, + "loss": 0.87285221, + "num_input_tokens_seen": 246089520, + "router_z_loss_mlp": 0.40771484, + "step": 2952, + "time_per_iteration": 2.654975175857544 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050037, + "balance_loss_mlp": 1.00929093, + "epoch": 0.5681031165833013, + "flos": 820841540352.0, + "grad_norm": 0.031520082579240216, + "language_loss": 0.86395264, + "learning_rate": 0.00041443752169044126, + "loss": 0.87445295, + "num_input_tokens_seen": 246165920, + "router_z_loss_mlp": 0.4074707, + "step": 2953, + "time_per_iteration": 2.9978690147399902 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044738, + "balance_loss_mlp": 1.00384951, + "epoch": 0.5682954982685648, + "flos": 619146646272.0, + "grad_norm": 0.031195671435834585, + "language_loss": 0.85214126, + "learning_rate": 0.0004141305914662025, + "loss": 0.86258864, + "num_input_tokens_seen": 246238672, + "router_z_loss_mlp": 0.40893555, + "step": 2954, + "time_per_iteration": 2.7177786827087402 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052939, + "balance_loss_mlp": 1.01214588, + "epoch": 0.5684878799538284, + "flos": 649252637184.0, + "grad_norm": 0.03230481359903608, + "language_loss": 0.81020069, + "learning_rate": 0.0004138236945794246, + "loss": 0.82073009, + "num_input_tokens_seen": 246320208, + "router_z_loss_mlp": 0.40795898, + "step": 2955, + "time_per_iteration": 2.8862104415893555 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051516, + "balance_loss_mlp": 1.01065099, + "epoch": 0.5686802616390919, + "flos": 807354062592.0, + "grad_norm": 0.038353041221636526, + "language_loss": 0.84374332, + "learning_rate": 0.00041351683114925576, + "loss": 0.85425854, + "num_input_tokens_seen": 246406464, + "router_z_loss_mlp": 0.40869141, + "step": 2956, + "time_per_iteration": 3.0500295162200928 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052157, + "balance_loss_mlp": 1.01126814, + "epoch": 0.5688726433243555, + "flos": 548176756224.0, + "grad_norm": 0.03189027766628176, + "language_loss": 0.87115657, + "learning_rate": 0.0004132100012948308, + "loss": 0.8816781, + "num_input_tokens_seen": 246477456, + "router_z_loss_mlp": 0.40893555, + "step": 2957, + "time_per_iteration": 2.6317861080169678 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104687, + "balance_loss_mlp": 1.00593376, + "epoch": 0.5690650250096191, + "flos": 487546778880.0, + "grad_norm": 0.03605588885155363, + "language_loss": 0.84833193, + "learning_rate": 0.00041290320513527145, + "loss": 0.85880065, + "num_input_tokens_seen": 246541744, + "router_z_loss_mlp": 0.40942383, + "step": 2958, + "time_per_iteration": 2.567070960998535 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010482, + "balance_loss_mlp": 1.00733471, + "epoch": 0.5692574066948827, + "flos": 578555900928.0, + "grad_norm": 0.030752617047449367, + "language_loss": 0.85344827, + "learning_rate": 0.0004125964427896867, + "loss": 0.86393028, + "num_input_tokens_seen": 246611440, + "router_z_loss_mlp": 0.40869141, + "step": 2959, + "time_per_iteration": 2.672534704208374 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047545, + "balance_loss_mlp": 1.00663245, + "epoch": 0.5694497883801463, + "flos": 455220576000.0, + "grad_norm": 0.04229544295686443, + "language_loss": 0.79680836, + "learning_rate": 0.0004122897143771723, + "loss": 0.80728376, + "num_input_tokens_seen": 246676496, + "router_z_loss_mlp": 0.40917969, + "step": 2960, + "time_per_iteration": 2.545262575149536 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046303, + "balance_loss_mlp": 1.00534308, + "epoch": 0.5696421700654097, + "flos": 560583290880.0, + "grad_norm": 0.03127363894209499, + "language_loss": 0.82077289, + "learning_rate": 0.0004119830200168109, + "loss": 0.83123589, + "num_input_tokens_seen": 246746464, + "router_z_loss_mlp": 0.40966797, + "step": 2961, + "time_per_iteration": 2.663581609725952 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045878, + "balance_loss_mlp": 1.00510836, + "epoch": 0.5698345517506733, + "flos": 466502426112.0, + "grad_norm": 0.0350478630821908, + "language_loss": 0.89062726, + "learning_rate": 0.0004116763598276714, + "loss": 0.90108603, + "num_input_tokens_seen": 246811808, + "router_z_loss_mlp": 0.40771484, + "step": 2962, + "time_per_iteration": 2.521552801132202 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047207, + "balance_loss_mlp": 1.00641382, + "epoch": 0.5700269334359369, + "flos": 607192121856.0, + "grad_norm": 0.031424704719117534, + "language_loss": 0.81706619, + "learning_rate": 0.00041136973392881017, + "loss": 0.82753831, + "num_input_tokens_seen": 246890432, + "router_z_loss_mlp": 0.40795898, + "step": 2963, + "time_per_iteration": 2.91904878616333 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043706, + "balance_loss_mlp": 1.00296056, + "epoch": 0.5702193151212005, + "flos": 563857222656.0, + "grad_norm": 0.03326860309508315, + "language_loss": 0.82831907, + "learning_rate": 0.00041106314243926983, + "loss": 0.83875614, + "num_input_tokens_seen": 246959616, + "router_z_loss_mlp": 0.4074707, + "step": 2964, + "time_per_iteration": 2.7399420738220215 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044055, + "balance_loss_mlp": 1.00340486, + "epoch": 0.570411696806464, + "flos": 524310481920.0, + "grad_norm": 0.03332690132244082, + "language_loss": 0.8800739, + "learning_rate": 0.0004107565854780798, + "loss": 0.89051443, + "num_input_tokens_seen": 247030656, + "router_z_loss_mlp": 0.40649414, + "step": 2965, + "time_per_iteration": 2.6200034618377686 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046373, + "balance_loss_mlp": 1.00565064, + "epoch": 0.5706040784917276, + "flos": 719473063680.0, + "grad_norm": 0.03436086388372073, + "language_loss": 0.81524932, + "learning_rate": 0.000410450063164256, + "loss": 0.82571304, + "num_input_tokens_seen": 247105872, + "router_z_loss_mlp": 0.40722656, + "step": 2966, + "time_per_iteration": 2.8336212635040283 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048641, + "balance_loss_mlp": 1.00787103, + "epoch": 0.5707964601769911, + "flos": 477671515392.0, + "grad_norm": 0.03782244517116874, + "language_loss": 0.82540762, + "learning_rate": 0.00041014357561680115, + "loss": 0.83589399, + "num_input_tokens_seen": 247170448, + "router_z_loss_mlp": 0.40771484, + "step": 2967, + "time_per_iteration": 2.5143654346466064 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047818, + "balance_loss_mlp": 1.00714386, + "epoch": 0.5709888418622547, + "flos": 581217429504.0, + "grad_norm": 0.030421169355448613, + "language_loss": 0.86193347, + "learning_rate": 0.0004098371229547039, + "loss": 0.87241161, + "num_input_tokens_seen": 247240400, + "router_z_loss_mlp": 0.40673828, + "step": 2968, + "time_per_iteration": 2.6610617637634277 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01057983, + "balance_loss_mlp": 1.01869202, + "epoch": 0.5711812235475183, + "flos": 1583195536128.0, + "grad_norm": 0.0076189717983582966, + "language_loss": 0.80010808, + "learning_rate": 0.0004095307052969399, + "loss": 0.8106879, + "num_input_tokens_seen": 247469136, + "router_z_loss_mlp": 0.39257812, + "step": 2969, + "time_per_iteration": 4.76263952255249 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048674, + "balance_loss_mlp": 1.00790465, + "epoch": 0.5713736052327818, + "flos": 469498346496.0, + "grad_norm": 0.03484927048715074, + "language_loss": 0.80634308, + "learning_rate": 0.00040922432276247107, + "loss": 0.81682986, + "num_input_tokens_seen": 247537712, + "router_z_loss_mlp": 0.40771484, + "step": 2970, + "time_per_iteration": 2.5514628887176514 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046224, + "balance_loss_mlp": 1.0054065, + "epoch": 0.5715659869180454, + "flos": 538755448320.0, + "grad_norm": 0.029079861926461517, + "language_loss": 0.84918243, + "learning_rate": 0.0004089179754702457, + "loss": 0.85964465, + "num_input_tokens_seen": 247613872, + "router_z_loss_mlp": 0.40820312, + "step": 2971, + "time_per_iteration": 2.749539613723755 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044804, + "balance_loss_mlp": 1.00396252, + "epoch": 0.571758368603309, + "flos": 657251807232.0, + "grad_norm": 0.03418066993480882, + "language_loss": 0.80556142, + "learning_rate": 0.00040861166353919843, + "loss": 0.81600946, + "num_input_tokens_seen": 247686064, + "router_z_loss_mlp": 0.40844727, + "step": 2972, + "time_per_iteration": 2.814680814743042 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052102, + "balance_loss_mlp": 1.011356, + "epoch": 0.5719507502885726, + "flos": 669100373760.0, + "grad_norm": 0.031053974574008693, + "language_loss": 0.82602715, + "learning_rate": 0.00040830538708824983, + "loss": 0.83654815, + "num_input_tokens_seen": 247760384, + "router_z_loss_mlp": 0.4074707, + "step": 2973, + "time_per_iteration": 2.904085636138916 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050783, + "balance_loss_mlp": 1.01018071, + "epoch": 0.572143131973836, + "flos": 477280743168.0, + "grad_norm": 0.03419925971016847, + "language_loss": 0.82092619, + "learning_rate": 0.000407999146236307, + "loss": 0.83143401, + "num_input_tokens_seen": 247824768, + "router_z_loss_mlp": 0.40600586, + "step": 2974, + "time_per_iteration": 2.549262046813965 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051203, + "balance_loss_mlp": 1.01062381, + "epoch": 0.5723355136590996, + "flos": 540535310592.0, + "grad_norm": 0.03597856382327793, + "language_loss": 0.83747095, + "learning_rate": 0.0004076929411022634, + "loss": 0.847983, + "num_input_tokens_seen": 247894448, + "router_z_loss_mlp": 0.40576172, + "step": 2975, + "time_per_iteration": 2.602869987487793 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053058, + "balance_loss_mlp": 1.01235974, + "epoch": 0.5725278953443632, + "flos": 825650370816.0, + "grad_norm": 0.037415312483521146, + "language_loss": 0.8006742, + "learning_rate": 0.0004073867718049982, + "loss": 0.81120479, + "num_input_tokens_seen": 247976432, + "router_z_loss_mlp": 0.40698242, + "step": 2976, + "time_per_iteration": 3.139498472213745 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050411, + "balance_loss_mlp": 1.00966477, + "epoch": 0.5727202770296268, + "flos": 588570170112.0, + "grad_norm": 0.037681082671355684, + "language_loss": 0.83124882, + "learning_rate": 0.00040708063846337704, + "loss": 0.84175301, + "num_input_tokens_seen": 248048800, + "router_z_loss_mlp": 0.4074707, + "step": 2977, + "time_per_iteration": 2.7134242057800293 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050315, + "balance_loss_mlp": 1.00937819, + "epoch": 0.5729126587148904, + "flos": 447941712384.0, + "grad_norm": 0.03249864108633733, + "language_loss": 0.81268066, + "learning_rate": 0.00040677454119625143, + "loss": 0.82318383, + "num_input_tokens_seen": 248116496, + "router_z_loss_mlp": 0.40942383, + "step": 2978, + "time_per_iteration": 2.5775671005249023 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049596, + "balance_loss_mlp": 1.00870752, + "epoch": 0.5731050404001539, + "flos": 520467888384.0, + "grad_norm": 0.034012599703189976, + "language_loss": 0.83670664, + "learning_rate": 0.0004064684801224587, + "loss": 0.84720254, + "num_input_tokens_seen": 248184960, + "router_z_loss_mlp": 0.40893555, + "step": 2979, + "time_per_iteration": 2.6424074172973633 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047576, + "balance_loss_mlp": 1.00675905, + "epoch": 0.5732974220854175, + "flos": 505771155456.0, + "grad_norm": 0.032486782592384814, + "language_loss": 0.80872238, + "learning_rate": 0.00040616245536082224, + "loss": 0.81919813, + "num_input_tokens_seen": 248252208, + "router_z_loss_mlp": 0.40820312, + "step": 2980, + "time_per_iteration": 2.57401704788208 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050822, + "balance_loss_mlp": 1.01000464, + "epoch": 0.573489803770681, + "flos": 593678399232.0, + "grad_norm": 0.028956426653120197, + "language_loss": 0.82143462, + "learning_rate": 0.00040585646703015165, + "loss": 0.8319428, + "num_input_tokens_seen": 248333312, + "router_z_loss_mlp": 0.40820312, + "step": 2981, + "time_per_iteration": 2.828683614730835 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050941, + "balance_loss_mlp": 1.01010036, + "epoch": 0.5736821854559446, + "flos": 490870288128.0, + "grad_norm": 0.04412597729133787, + "language_loss": 0.78605878, + "learning_rate": 0.0004055505152492419, + "loss": 0.79656816, + "num_input_tokens_seen": 248403808, + "router_z_loss_mlp": 0.40844727, + "step": 2982, + "time_per_iteration": 2.640928268432617 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048392, + "balance_loss_mlp": 1.00747919, + "epoch": 0.5738745671412081, + "flos": 459202175232.0, + "grad_norm": 0.034256342510568284, + "language_loss": 0.74769032, + "learning_rate": 0.00040524460013687425, + "loss": 0.7581743, + "num_input_tokens_seen": 248477184, + "router_z_loss_mlp": 0.40917969, + "step": 2983, + "time_per_iteration": 2.7067794799804688 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105312, + "balance_loss_mlp": 1.0123024, + "epoch": 0.5740669488264717, + "flos": 581621807616.0, + "grad_norm": 0.029467935021435916, + "language_loss": 0.81554836, + "learning_rate": 0.0004049387218118155, + "loss": 0.82607955, + "num_input_tokens_seen": 248565552, + "router_z_loss_mlp": 0.40820312, + "step": 2984, + "time_per_iteration": 2.9581944942474365 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045523, + "balance_loss_mlp": 1.00468242, + "epoch": 0.5742593305117353, + "flos": 525574172160.0, + "grad_norm": 0.03631391131249333, + "language_loss": 0.85729742, + "learning_rate": 0.00040463288039281777, + "loss": 0.86775261, + "num_input_tokens_seen": 248635456, + "router_z_loss_mlp": 0.40844727, + "step": 2985, + "time_per_iteration": 2.7224113941192627 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01056683, + "balance_loss_mlp": 1.01729584, + "epoch": 0.5744517121969989, + "flos": 1557269442816.0, + "grad_norm": 0.010841110534864203, + "language_loss": 0.77876419, + "learning_rate": 0.0004043270759986194, + "loss": 0.78933102, + "num_input_tokens_seen": 248870160, + "router_z_loss_mlp": 0.39355469, + "step": 2986, + "time_per_iteration": 5.064981698989868 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051641, + "balance_loss_mlp": 1.01089525, + "epoch": 0.5746440938822625, + "flos": 753203907840.0, + "grad_norm": 0.045288596232844924, + "language_loss": 0.82885808, + "learning_rate": 0.0004040213087479444, + "loss": 0.83937448, + "num_input_tokens_seen": 248946960, + "router_z_loss_mlp": 0.4074707, + "step": 2987, + "time_per_iteration": 2.98020601272583 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043481, + "balance_loss_mlp": 1.00266409, + "epoch": 0.5748364755675259, + "flos": 502857860352.0, + "grad_norm": 0.036149920431262125, + "language_loss": 0.85748988, + "learning_rate": 0.0004037155787595018, + "loss": 0.86792469, + "num_input_tokens_seen": 249014128, + "router_z_loss_mlp": 0.40820312, + "step": 2988, + "time_per_iteration": 2.5745627880096436 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051033, + "balance_loss_mlp": 1.01026356, + "epoch": 0.5750288572527895, + "flos": 505198603008.0, + "grad_norm": 0.03371383384616788, + "language_loss": 0.81460357, + "learning_rate": 0.000403409886151987, + "loss": 0.82511389, + "num_input_tokens_seen": 249090016, + "router_z_loss_mlp": 0.40771484, + "step": 2989, + "time_per_iteration": 2.9434561729431152 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045067, + "balance_loss_mlp": 1.00558472, + "epoch": 0.5752212389380531, + "flos": 1544678215680.0, + "grad_norm": 0.006920775411585041, + "language_loss": 0.81999105, + "learning_rate": 0.0004031042310440799, + "loss": 0.83044171, + "num_input_tokens_seen": 249305552, + "router_z_loss_mlp": 0.39453125, + "step": 2990, + "time_per_iteration": 4.784885406494141 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104316, + "balance_loss_mlp": 1.00367737, + "epoch": 0.5754136206233167, + "flos": 1570674295296.0, + "grad_norm": 0.003743957088283973, + "language_loss": 0.781986, + "learning_rate": 0.00040279861355444656, + "loss": 0.79241765, + "num_input_tokens_seen": 249523408, + "router_z_loss_mlp": 0.39453125, + "step": 2991, + "time_per_iteration": 4.776461362838745 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049264, + "balance_loss_mlp": 1.00842321, + "epoch": 0.5756060023085803, + "flos": 799562917632.0, + "grad_norm": 0.03045005809397815, + "language_loss": 0.77561808, + "learning_rate": 0.00040249303380173807, + "loss": 0.78611076, + "num_input_tokens_seen": 249616624, + "router_z_loss_mlp": 0.40844727, + "step": 2992, + "time_per_iteration": 3.0843074321746826 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104555, + "balance_loss_mlp": 1.00451803, + "epoch": 0.5757983839938438, + "flos": 589034819328.0, + "grad_norm": 0.034529184723129894, + "language_loss": 0.79738832, + "learning_rate": 0.00040218749190459126, + "loss": 0.8078438, + "num_input_tokens_seen": 249689936, + "router_z_loss_mlp": 0.41040039, + "step": 2993, + "time_per_iteration": 2.7403366565704346 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045195, + "balance_loss_mlp": 1.00428283, + "epoch": 0.5759907656791073, + "flos": 517852046592.0, + "grad_norm": 0.035278528612120996, + "language_loss": 0.82955313, + "learning_rate": 0.00040188198798162775, + "loss": 0.84000504, + "num_input_tokens_seen": 249759984, + "router_z_loss_mlp": 0.40917969, + "step": 2994, + "time_per_iteration": 2.6673707962036133 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048075, + "balance_loss_mlp": 1.00718617, + "epoch": 0.5761831473643709, + "flos": 588290213376.0, + "grad_norm": 0.029287821677584636, + "language_loss": 0.85980493, + "learning_rate": 0.000401576522151455, + "loss": 0.87028569, + "num_input_tokens_seen": 249837888, + "router_z_loss_mlp": 0.40893555, + "step": 2995, + "time_per_iteration": 2.788686513900757 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049047, + "balance_loss_mlp": 1.00815868, + "epoch": 0.5763755290496345, + "flos": 545009749248.0, + "grad_norm": 0.03018415670660867, + "language_loss": 0.8281709, + "learning_rate": 0.0004012710945326651, + "loss": 0.83866143, + "num_input_tokens_seen": 249913584, + "router_z_loss_mlp": 0.40893555, + "step": 2996, + "time_per_iteration": 2.7784581184387207 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047721, + "balance_loss_mlp": 1.00685585, + "epoch": 0.576567910734898, + "flos": 627428685312.0, + "grad_norm": 0.030965553916741433, + "language_loss": 0.81781155, + "learning_rate": 0.0004009657052438355, + "loss": 0.82828873, + "num_input_tokens_seen": 249992144, + "router_z_loss_mlp": 0.40869141, + "step": 2997, + "time_per_iteration": 2.787832498550415 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046757, + "balance_loss_mlp": 1.00593948, + "epoch": 0.5767602924201616, + "flos": 539278423296.0, + "grad_norm": 0.0362963808148575, + "language_loss": 0.86264056, + "learning_rate": 0.00040066035440352904, + "loss": 0.87310815, + "num_input_tokens_seen": 250060736, + "router_z_loss_mlp": 0.40820312, + "step": 2998, + "time_per_iteration": 2.6896724700927734 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045353, + "balance_loss_mlp": 1.00558472, + "epoch": 0.5769526741054252, + "flos": 1563026046720.0, + "grad_norm": 0.005169215201186531, + "language_loss": 0.79293132, + "learning_rate": 0.0004003550421302934, + "loss": 0.8033849, + "num_input_tokens_seen": 250296864, + "router_z_loss_mlp": 0.39746094, + "step": 2999, + "time_per_iteration": 4.891216039657593 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043976, + "balance_loss_mlp": 1.00318265, + "epoch": 0.5771450557906888, + "flos": 469172702976.0, + "grad_norm": 0.037596514401195116, + "language_loss": 0.7668246, + "learning_rate": 0.00040004976854266145, + "loss": 0.77726436, + "num_input_tokens_seen": 250362528, + "router_z_loss_mlp": 0.40795898, + "step": 3000, + "time_per_iteration": 2.51895809173584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045702, + "balance_loss_mlp": 1.00478971, + "epoch": 0.5773374374759523, + "flos": 575633857536.0, + "grad_norm": 0.03248080927364981, + "language_loss": 0.81750363, + "learning_rate": 0.0003997445337591505, + "loss": 0.82796073, + "num_input_tokens_seen": 250432768, + "router_z_loss_mlp": 0.40917969, + "step": 3001, + "time_per_iteration": 2.692239999771118 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048668, + "balance_loss_mlp": 1.0079695, + "epoch": 0.5775298191612158, + "flos": 529505227008.0, + "grad_norm": 0.031913043384180086, + "language_loss": 0.74606609, + "learning_rate": 0.0003994393378982635, + "loss": 0.75655282, + "num_input_tokens_seen": 250501504, + "router_z_loss_mlp": 0.40698242, + "step": 3002, + "time_per_iteration": 2.665146589279175 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053272, + "balance_loss_mlp": 1.01369476, + "epoch": 0.5777222008464794, + "flos": 1306899095808.0, + "grad_norm": 0.010106387724362367, + "language_loss": 0.79538, + "learning_rate": 0.00039913418107848786, + "loss": 0.80591273, + "num_input_tokens_seen": 250733632, + "router_z_loss_mlp": 0.39550781, + "step": 3003, + "time_per_iteration": 4.803764581680298 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104489, + "balance_loss_mlp": 1.00409698, + "epoch": 0.577914582531743, + "flos": 604793053440.0, + "grad_norm": 0.0386937293491606, + "language_loss": 0.88557941, + "learning_rate": 0.0003988290634182961, + "loss": 0.89602828, + "num_input_tokens_seen": 250809152, + "router_z_loss_mlp": 0.40795898, + "step": 3004, + "time_per_iteration": 2.7506465911865234 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050043, + "balance_loss_mlp": 1.00943995, + "epoch": 0.5781069642170066, + "flos": 487833538560.0, + "grad_norm": 0.034765884683499934, + "language_loss": 0.81038988, + "learning_rate": 0.0003985239850361453, + "loss": 0.82089031, + "num_input_tokens_seen": 250879152, + "router_z_loss_mlp": 0.40600586, + "step": 3005, + "time_per_iteration": 2.5988621711730957 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047258, + "balance_loss_mlp": 1.00653589, + "epoch": 0.5782993459022701, + "flos": 507414924288.0, + "grad_norm": 0.036479253397917216, + "language_loss": 0.85073388, + "learning_rate": 0.0003982189460504777, + "loss": 0.86120641, + "num_input_tokens_seen": 250949904, + "router_z_loss_mlp": 0.40722656, + "step": 3006, + "time_per_iteration": 2.694517135620117 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104981, + "balance_loss_mlp": 1.00913548, + "epoch": 0.5784917275875336, + "flos": 603295093248.0, + "grad_norm": 0.03899121610040523, + "language_loss": 0.79739761, + "learning_rate": 0.00039791394657971935, + "loss": 0.80789566, + "num_input_tokens_seen": 251020976, + "router_z_loss_mlp": 0.40673828, + "step": 3007, + "time_per_iteration": 2.694913387298584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044533, + "balance_loss_mlp": 1.00376368, + "epoch": 0.5786841092727972, + "flos": 522588945408.0, + "grad_norm": 0.03653808704233678, + "language_loss": 0.84952617, + "learning_rate": 0.00039760898674228205, + "loss": 0.85997152, + "num_input_tokens_seen": 251093280, + "router_z_loss_mlp": 0.40771484, + "step": 3008, + "time_per_iteration": 2.6486122608184814 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045443, + "balance_loss_mlp": 1.00476897, + "epoch": 0.5788764909580608, + "flos": 768836742144.0, + "grad_norm": 0.02798603221606654, + "language_loss": 0.81355041, + "learning_rate": 0.0003973040666565613, + "loss": 0.82400489, + "num_input_tokens_seen": 251181376, + "router_z_loss_mlp": 0.40673828, + "step": 3009, + "time_per_iteration": 3.029721975326538 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046651, + "balance_loss_mlp": 1.00590491, + "epoch": 0.5790688726433244, + "flos": 600332220672.0, + "grad_norm": 0.03710521046969438, + "language_loss": 0.82796824, + "learning_rate": 0.000396999186440938, + "loss": 0.8384347, + "num_input_tokens_seen": 251256176, + "router_z_loss_mlp": 0.4074707, + "step": 3010, + "time_per_iteration": 2.866637945175171 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048008, + "balance_loss_mlp": 1.00711966, + "epoch": 0.5792612543285879, + "flos": 524106347520.0, + "grad_norm": 0.03822457095680595, + "language_loss": 0.85752803, + "learning_rate": 0.000396694346213777, + "loss": 0.86800808, + "num_input_tokens_seen": 251325344, + "router_z_loss_mlp": 0.40893555, + "step": 3011, + "time_per_iteration": 2.6125171184539795 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045119, + "balance_loss_mlp": 1.00430202, + "epoch": 0.5794536360138515, + "flos": 878080934400.0, + "grad_norm": 0.030461633114119882, + "language_loss": 0.8396455, + "learning_rate": 0.0003963895460934276, + "loss": 0.8500967, + "num_input_tokens_seen": 251406656, + "router_z_loss_mlp": 0.40820312, + "step": 3012, + "time_per_iteration": 3.1341123580932617 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047321, + "balance_loss_mlp": 1.00631309, + "epoch": 0.5796460176991151, + "flos": 402299118336.0, + "grad_norm": 0.04162907217084141, + "language_loss": 0.85323715, + "learning_rate": 0.00039608478619822376, + "loss": 0.86371034, + "num_input_tokens_seen": 251467760, + "router_z_loss_mlp": 0.41015625, + "step": 3013, + "time_per_iteration": 2.45570969581604 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045447, + "balance_loss_mlp": 1.00448704, + "epoch": 0.5798383993843786, + "flos": 619676424192.0, + "grad_norm": 0.02973237056850944, + "language_loss": 0.8328954, + "learning_rate": 0.00039578006664648394, + "loss": 0.84334981, + "num_input_tokens_seen": 251542272, + "router_z_loss_mlp": 0.40966797, + "step": 3014, + "time_per_iteration": 2.796370506286621 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044429, + "balance_loss_mlp": 1.00351644, + "epoch": 0.5800307810696421, + "flos": 845793615360.0, + "grad_norm": 0.037256106488294125, + "language_loss": 0.81995672, + "learning_rate": 0.0003954753875565105, + "loss": 0.83040106, + "num_input_tokens_seen": 251625584, + "router_z_loss_mlp": 0.40917969, + "step": 3015, + "time_per_iteration": 3.0796241760253906 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045336, + "balance_loss_mlp": 1.00442326, + "epoch": 0.5802231627549057, + "flos": 570365235456.0, + "grad_norm": 0.0302253929683373, + "language_loss": 0.82961631, + "learning_rate": 0.00039517074904659057, + "loss": 0.84006965, + "num_input_tokens_seen": 251696704, + "router_z_loss_mlp": 0.40917969, + "step": 3016, + "time_per_iteration": 2.6984057426452637 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105015, + "balance_loss_mlp": 1.00921345, + "epoch": 0.5804155444401693, + "flos": 661663062528.0, + "grad_norm": 0.033398230079863866, + "language_loss": 0.85268873, + "learning_rate": 0.00039486615123499535, + "loss": 0.86319029, + "num_input_tokens_seen": 251774784, + "router_z_loss_mlp": 0.40942383, + "step": 3017, + "time_per_iteration": 2.8348796367645264 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051377, + "balance_loss_mlp": 1.01022601, + "epoch": 0.5806079261254329, + "flos": 515058315264.0, + "grad_norm": 0.030637451118741787, + "language_loss": 0.85653043, + "learning_rate": 0.00039456159423997996, + "loss": 0.86704421, + "num_input_tokens_seen": 251844768, + "router_z_loss_mlp": 0.41162109, + "step": 3018, + "time_per_iteration": 2.6296215057373047 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048366, + "balance_loss_mlp": 1.00740576, + "epoch": 0.5808003078106965, + "flos": 529718109696.0, + "grad_norm": 0.03062870911456177, + "language_loss": 0.90210342, + "learning_rate": 0.00039425707817978406, + "loss": 0.91258705, + "num_input_tokens_seen": 251912736, + "router_z_loss_mlp": 0.40966797, + "step": 3019, + "time_per_iteration": 2.631979465484619 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048286, + "balance_loss_mlp": 1.00720644, + "epoch": 0.58099268949596, + "flos": 477997158912.0, + "grad_norm": 0.03679030272618613, + "language_loss": 0.84110886, + "learning_rate": 0.00039395260317263124, + "loss": 0.85159171, + "num_input_tokens_seen": 251979328, + "router_z_loss_mlp": 0.41088867, + "step": 3020, + "time_per_iteration": 2.584413528442383 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050357, + "balance_loss_mlp": 1.00930095, + "epoch": 0.5811850711812235, + "flos": 518688026112.0, + "grad_norm": 0.03473628129951431, + "language_loss": 0.85378569, + "learning_rate": 0.0003936481693367291, + "loss": 0.86428928, + "num_input_tokens_seen": 252050928, + "router_z_loss_mlp": 0.41064453, + "step": 3021, + "time_per_iteration": 2.6612508296966553 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049465, + "balance_loss_mlp": 1.00833774, + "epoch": 0.5813774528664871, + "flos": 617627298816.0, + "grad_norm": 0.037803518868136904, + "language_loss": 0.88371962, + "learning_rate": 0.0003933437767902697, + "loss": 0.89421427, + "num_input_tokens_seen": 252126496, + "router_z_loss_mlp": 0.41137695, + "step": 3022, + "time_per_iteration": 2.7910103797912598 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045958, + "balance_loss_mlp": 1.00499725, + "epoch": 0.5815698345517507, + "flos": 568604815104.0, + "grad_norm": 0.03314052138705104, + "language_loss": 0.78534555, + "learning_rate": 0.00039303942565142825, + "loss": 0.7958051, + "num_input_tokens_seen": 252203008, + "router_z_loss_mlp": 0.40966797, + "step": 3023, + "time_per_iteration": 2.7066261768341064 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046075, + "balance_loss_mlp": 1.00525796, + "epoch": 0.5817622162370142, + "flos": 564304375296.0, + "grad_norm": 0.034500169077956666, + "language_loss": 0.76946682, + "learning_rate": 0.0003927351160383644, + "loss": 0.77992761, + "num_input_tokens_seen": 252283440, + "router_z_loss_mlp": 0.40820312, + "step": 3024, + "time_per_iteration": 2.785215377807617 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044544, + "balance_loss_mlp": 1.00370252, + "epoch": 0.5819545979222778, + "flos": 460154806272.0, + "grad_norm": 0.03482271460519531, + "language_loss": 0.78468955, + "learning_rate": 0.000392430848069222, + "loss": 0.79513502, + "num_input_tokens_seen": 252351760, + "router_z_loss_mlp": 0.40844727, + "step": 3025, + "time_per_iteration": 2.5435454845428467 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104326, + "balance_loss_mlp": 1.00244236, + "epoch": 0.5821469796075414, + "flos": 542517361920.0, + "grad_norm": 0.03539348008973476, + "language_loss": 0.83090204, + "learning_rate": 0.00039212662186212795, + "loss": 0.8413347, + "num_input_tokens_seen": 252418480, + "router_z_loss_mlp": 0.40820312, + "step": 3026, + "time_per_iteration": 2.6203463077545166 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046114, + "balance_loss_mlp": 1.00534403, + "epoch": 0.582339361292805, + "flos": 553341365760.0, + "grad_norm": 0.030591419392928903, + "language_loss": 0.77452922, + "learning_rate": 0.0003918224375351934, + "loss": 0.78499031, + "num_input_tokens_seen": 252493712, + "router_z_loss_mlp": 0.40771484, + "step": 3027, + "time_per_iteration": 2.700643301010132 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047214, + "balance_loss_mlp": 1.00646877, + "epoch": 0.5825317429780685, + "flos": 497448287232.0, + "grad_norm": 0.03355698207676345, + "language_loss": 0.79253477, + "learning_rate": 0.0003915182952065135, + "loss": 0.80300689, + "num_input_tokens_seen": 252566096, + "router_z_loss_mlp": 0.4074707, + "step": 3028, + "time_per_iteration": 2.693223714828491 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043151, + "balance_loss_mlp": 1.00247645, + "epoch": 0.582724124663332, + "flos": 565255060992.0, + "grad_norm": 0.03374091506860629, + "language_loss": 0.88055015, + "learning_rate": 0.0003912141949941664, + "loss": 0.89098167, + "num_input_tokens_seen": 252639424, + "router_z_loss_mlp": 0.40673828, + "step": 3029, + "time_per_iteration": 2.674584150314331 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043287, + "balance_loss_mlp": 1.00249338, + "epoch": 0.5829165063485956, + "flos": 493112854272.0, + "grad_norm": 0.039605660090179254, + "language_loss": 0.83319384, + "learning_rate": 0.0003909101370162143, + "loss": 0.84362668, + "num_input_tokens_seen": 252706672, + "router_z_loss_mlp": 0.40795898, + "step": 3030, + "time_per_iteration": 2.592111587524414 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046574, + "balance_loss_mlp": 1.00718689, + "epoch": 0.5831088880338592, + "flos": 1531879941888.0, + "grad_norm": 0.006346134957791291, + "language_loss": 0.72433889, + "learning_rate": 0.00039060612139070326, + "loss": 0.73480463, + "num_input_tokens_seen": 252932464, + "router_z_loss_mlp": 0.39355469, + "step": 3031, + "time_per_iteration": 4.929339170455933 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047591, + "balance_loss_mlp": 1.00686908, + "epoch": 0.5833012697191228, + "flos": 619209829632.0, + "grad_norm": 0.03163493287885039, + "language_loss": 0.83241516, + "learning_rate": 0.0003903021482356622, + "loss": 0.8428911, + "num_input_tokens_seen": 253011920, + "router_z_loss_mlp": 0.40722656, + "step": 3032, + "time_per_iteration": 2.7828269004821777 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045762, + "balance_loss_mlp": 1.00508761, + "epoch": 0.5834936514043862, + "flos": 769294588416.0, + "grad_norm": 0.028764675594544035, + "language_loss": 0.83318806, + "learning_rate": 0.00038999821766910465, + "loss": 0.84364575, + "num_input_tokens_seen": 253091552, + "router_z_loss_mlp": 0.40673828, + "step": 3033, + "time_per_iteration": 2.976440906524658 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046889, + "balance_loss_mlp": 1.00616705, + "epoch": 0.5836860330896498, + "flos": 459316881408.0, + "grad_norm": 0.03570453873198092, + "language_loss": 0.86074644, + "learning_rate": 0.00038969432980902606, + "loss": 0.87121534, + "num_input_tokens_seen": 253158608, + "router_z_loss_mlp": 0.40722656, + "step": 3034, + "time_per_iteration": 2.5605523586273193 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049232, + "balance_loss_mlp": 1.00975037, + "epoch": 0.5838784147749134, + "flos": 1364198760960.0, + "grad_norm": 0.006741388763220325, + "language_loss": 0.79784501, + "learning_rate": 0.0003893904847734068, + "loss": 0.80833733, + "num_input_tokens_seen": 253381184, + "router_z_loss_mlp": 0.39453125, + "step": 3035, + "time_per_iteration": 4.870011329650879 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046432, + "balance_loss_mlp": 1.00566232, + "epoch": 0.584070796460177, + "flos": 568289865216.0, + "grad_norm": 0.0320953374409888, + "language_loss": 0.82746142, + "learning_rate": 0.00038908668268020953, + "loss": 0.83792579, + "num_input_tokens_seen": 253452880, + "router_z_loss_mlp": 0.40771484, + "step": 3036, + "time_per_iteration": 2.6482043266296387 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046786, + "balance_loss_mlp": 1.00582528, + "epoch": 0.5842631781454406, + "flos": 612666823680.0, + "grad_norm": 0.032158289179941596, + "language_loss": 0.85682309, + "learning_rate": 0.00038878292364738097, + "loss": 0.86729091, + "num_input_tokens_seen": 253530000, + "router_z_loss_mlp": 0.40966797, + "step": 3037, + "time_per_iteration": 2.7571158409118652 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104852, + "balance_loss_mlp": 1.00758314, + "epoch": 0.5844555598307041, + "flos": 464333736960.0, + "grad_norm": 0.037716829310632, + "language_loss": 0.87422657, + "learning_rate": 0.0003884792077928508, + "loss": 0.88471174, + "num_input_tokens_seen": 253593504, + "router_z_loss_mlp": 0.40942383, + "step": 3038, + "time_per_iteration": 2.5060815811157227 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046212, + "balance_loss_mlp": 1.00522828, + "epoch": 0.5846479415159677, + "flos": 411058445568.0, + "grad_norm": 0.036592459093467214, + "language_loss": 0.77285695, + "learning_rate": 0.0003881755352345322, + "loss": 0.78331912, + "num_input_tokens_seen": 253657904, + "router_z_loss_mlp": 0.40991211, + "step": 3039, + "time_per_iteration": 2.558833360671997 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049516, + "balance_loss_mlp": 1.0084126, + "epoch": 0.5848403232012312, + "flos": 492266181120.0, + "grad_norm": 0.028436591435814704, + "language_loss": 0.87703776, + "learning_rate": 0.0003878719060903207, + "loss": 0.88753295, + "num_input_tokens_seen": 253725280, + "router_z_loss_mlp": 0.41113281, + "step": 3040, + "time_per_iteration": 2.563680410385132 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048574, + "balance_loss_mlp": 1.0073278, + "epoch": 0.5850327048864948, + "flos": 585509121024.0, + "grad_norm": 0.03942000109029475, + "language_loss": 0.8397156, + "learning_rate": 0.0003875683204780961, + "loss": 0.85020131, + "num_input_tokens_seen": 253795040, + "router_z_loss_mlp": 0.41259766, + "step": 3041, + "time_per_iteration": 2.707235336303711 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046188, + "balance_loss_mlp": 1.00506115, + "epoch": 0.5852250865717584, + "flos": 652719042816.0, + "grad_norm": 0.03661913957485838, + "language_loss": 0.85946143, + "learning_rate": 0.00038726477851572043, + "loss": 0.86992323, + "num_input_tokens_seen": 253866384, + "router_z_loss_mlp": 0.41137695, + "step": 3042, + "time_per_iteration": 2.7779452800750732 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048593, + "balance_loss_mlp": 1.00753701, + "epoch": 0.5854174682570219, + "flos": 535620522240.0, + "grad_norm": 0.03519010087747146, + "language_loss": 0.80754662, + "learning_rate": 0.0003869612803210395, + "loss": 0.81803256, + "num_input_tokens_seen": 253935712, + "router_z_loss_mlp": 0.41064453, + "step": 3043, + "time_per_iteration": 2.64778733253479 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051479, + "balance_loss_mlp": 1.01044726, + "epoch": 0.5856098499422855, + "flos": 510759820800.0, + "grad_norm": 0.03494290194274924, + "language_loss": 0.83645654, + "learning_rate": 0.0003866578260118817, + "loss": 0.84697139, + "num_input_tokens_seen": 254003152, + "router_z_loss_mlp": 0.41040039, + "step": 3044, + "time_per_iteration": 2.596379041671753 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049295, + "balance_loss_mlp": 1.00828719, + "epoch": 0.5858022316275491, + "flos": 594993612288.0, + "grad_norm": 0.03849486234726574, + "language_loss": 0.83826196, + "learning_rate": 0.0003863544157060581, + "loss": 0.84875488, + "num_input_tokens_seen": 254072816, + "router_z_loss_mlp": 0.41015625, + "step": 3045, + "time_per_iteration": 2.6666998863220215 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049733, + "balance_loss_mlp": 1.0086298, + "epoch": 0.5859946133128127, + "flos": 560318885376.0, + "grad_norm": 0.02876341489298987, + "language_loss": 0.82639688, + "learning_rate": 0.0003860510495213634, + "loss": 0.83689421, + "num_input_tokens_seen": 254152800, + "router_z_loss_mlp": 0.41113281, + "step": 3046, + "time_per_iteration": 2.865504264831543 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049402, + "balance_loss_mlp": 1.00827503, + "epoch": 0.5861869949980761, + "flos": 554756700672.0, + "grad_norm": 0.0396946944562825, + "language_loss": 0.78689963, + "learning_rate": 0.0003857477275755746, + "loss": 0.79739368, + "num_input_tokens_seen": 254224384, + "router_z_loss_mlp": 0.41137695, + "step": 3047, + "time_per_iteration": 2.624819278717041 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049982, + "balance_loss_mlp": 1.00887823, + "epoch": 0.5863793766833397, + "flos": 720055331328.0, + "grad_norm": 0.02972376125592825, + "language_loss": 0.84339547, + "learning_rate": 0.00038544444998645167, + "loss": 0.85389531, + "num_input_tokens_seen": 254310960, + "router_z_loss_mlp": 0.41113281, + "step": 3048, + "time_per_iteration": 2.990790367126465 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048538, + "balance_loss_mlp": 1.00750625, + "epoch": 0.5865717583686033, + "flos": 473286504960.0, + "grad_norm": 0.034605288898392046, + "language_loss": 0.82032233, + "learning_rate": 0.00038514121687173767, + "loss": 0.83080769, + "num_input_tokens_seen": 254378336, + "router_z_loss_mlp": 0.41040039, + "step": 3049, + "time_per_iteration": 2.596529960632324 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049884, + "balance_loss_mlp": 1.0088284, + "epoch": 0.5867641400538669, + "flos": 814847754240.0, + "grad_norm": 0.03903750410866887, + "language_loss": 0.82380903, + "learning_rate": 0.00038483802834915807, + "loss": 0.83430791, + "num_input_tokens_seen": 254454352, + "router_z_loss_mlp": 0.41064453, + "step": 3050, + "time_per_iteration": 2.9996161460876465 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045889, + "balance_loss_mlp": 1.00480914, + "epoch": 0.5869565217391305, + "flos": 487518588672.0, + "grad_norm": 0.0350404565928551, + "language_loss": 0.79904723, + "learning_rate": 0.00038453488453642074, + "loss": 0.80950606, + "num_input_tokens_seen": 254526352, + "router_z_loss_mlp": 0.41088867, + "step": 3051, + "time_per_iteration": 2.7099759578704834 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047395, + "balance_loss_mlp": 1.00626779, + "epoch": 0.587148903424394, + "flos": 570512989440.0, + "grad_norm": 0.03324549798167153, + "language_loss": 0.8786602, + "learning_rate": 0.00038423178555121697, + "loss": 0.88913417, + "num_input_tokens_seen": 254598720, + "router_z_loss_mlp": 0.41137695, + "step": 3052, + "time_per_iteration": 2.684868097305298 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044815, + "balance_loss_mlp": 1.00359285, + "epoch": 0.5873412851096576, + "flos": 748695442944.0, + "grad_norm": 0.0344494509074348, + "language_loss": 0.86014688, + "learning_rate": 0.00038392873151121994, + "loss": 0.87059504, + "num_input_tokens_seen": 254683664, + "router_z_loss_mlp": 0.41235352, + "step": 3053, + "time_per_iteration": 3.073838949203491 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042019, + "balance_loss_mlp": 1.00079656, + "epoch": 0.5875336667949211, + "flos": 529188331776.0, + "grad_norm": 0.03507235034672983, + "language_loss": 0.83636832, + "learning_rate": 0.0003836257225340859, + "loss": 0.84678853, + "num_input_tokens_seen": 254754688, + "router_z_loss_mlp": 0.41235352, + "step": 3054, + "time_per_iteration": 2.6333680152893066 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104176, + "balance_loss_mlp": 1.000633, + "epoch": 0.5877260484801847, + "flos": 825641622528.0, + "grad_norm": 0.032727897026981576, + "language_loss": 0.82534069, + "learning_rate": 0.00038332275873745336, + "loss": 0.83575833, + "num_input_tokens_seen": 254838976, + "router_z_loss_mlp": 0.41137695, + "step": 3055, + "time_per_iteration": 3.051757335662842 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044126, + "balance_loss_mlp": 1.00292683, + "epoch": 0.5879184301654482, + "flos": 592694665728.0, + "grad_norm": 0.030899230424817493, + "language_loss": 0.83323562, + "learning_rate": 0.0003830198402389431, + "loss": 0.84367692, + "num_input_tokens_seen": 254912912, + "router_z_loss_mlp": 0.41210938, + "step": 3056, + "time_per_iteration": 2.6873278617858887 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043037, + "balance_loss_mlp": 1.00317383, + "epoch": 0.5881108118507118, + "flos": 1549226531328.0, + "grad_norm": 0.008859615514711313, + "language_loss": 0.77348936, + "learning_rate": 0.0003827169671561585, + "loss": 0.78391969, + "num_input_tokens_seen": 255151488, + "router_z_loss_mlp": 0.3984375, + "step": 3057, + "time_per_iteration": 5.044417142868042 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045675, + "balance_loss_mlp": 1.00461972, + "epoch": 0.5883031935359754, + "flos": 490599079680.0, + "grad_norm": 0.03687508634060279, + "language_loss": 0.83287209, + "learning_rate": 0.0003824141396066855, + "loss": 0.84332883, + "num_input_tokens_seen": 255218896, + "router_z_loss_mlp": 0.41064453, + "step": 3058, + "time_per_iteration": 2.57017183303833 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045783, + "balance_loss_mlp": 1.00458455, + "epoch": 0.588495575221239, + "flos": 583981025280.0, + "grad_norm": 0.03543871049956236, + "language_loss": 0.83470112, + "learning_rate": 0.000382111357708092, + "loss": 0.84515893, + "num_input_tokens_seen": 255287408, + "router_z_loss_mlp": 0.41210938, + "step": 3059, + "time_per_iteration": 2.710636615753174 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046053, + "balance_loss_mlp": 1.00492609, + "epoch": 0.5886879569065026, + "flos": 662240472576.0, + "grad_norm": 0.03467029745908185, + "language_loss": 0.84034348, + "learning_rate": 0.00038180862157792864, + "loss": 0.85080403, + "num_input_tokens_seen": 255358432, + "router_z_loss_mlp": 0.41137695, + "step": 3060, + "time_per_iteration": 2.765730619430542 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045299, + "balance_loss_mlp": 1.00429142, + "epoch": 0.588880338591766, + "flos": 563720162304.0, + "grad_norm": 0.034528332603885874, + "language_loss": 0.82661986, + "learning_rate": 0.0003815059313337279, + "loss": 0.83707285, + "num_input_tokens_seen": 255425744, + "router_z_loss_mlp": 0.41015625, + "step": 3061, + "time_per_iteration": 2.6512649059295654 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044327, + "balance_loss_mlp": 1.00339055, + "epoch": 0.5890727202770296, + "flos": 555853195008.0, + "grad_norm": 0.028645191608940447, + "language_loss": 0.78527474, + "learning_rate": 0.00038120328709300436, + "loss": 0.79571807, + "num_input_tokens_seen": 255505808, + "router_z_loss_mlp": 0.40942383, + "step": 3062, + "time_per_iteration": 2.839588165283203 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044224, + "balance_loss_mlp": 1.00321651, + "epoch": 0.5892651019622932, + "flos": 656702587392.0, + "grad_norm": 0.03868775593308096, + "language_loss": 0.83858323, + "learning_rate": 0.0003809006889732549, + "loss": 0.84902555, + "num_input_tokens_seen": 255580160, + "router_z_loss_mlp": 0.41015625, + "step": 3063, + "time_per_iteration": 2.80668306350708 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044985, + "balance_loss_mlp": 1.00395334, + "epoch": 0.5894574836475568, + "flos": 454132829952.0, + "grad_norm": 0.034675820144419535, + "language_loss": 0.8846643, + "learning_rate": 0.0003805981370919589, + "loss": 0.89511412, + "num_input_tokens_seen": 255644016, + "router_z_loss_mlp": 0.41040039, + "step": 3064, + "time_per_iteration": 2.4926044940948486 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046275, + "balance_loss_mlp": 1.00509965, + "epoch": 0.5896498653328203, + "flos": 520112109312.0, + "grad_norm": 0.03109338069781882, + "language_loss": 0.843858, + "learning_rate": 0.0003802956315665771, + "loss": 0.85432076, + "num_input_tokens_seen": 255718192, + "router_z_loss_mlp": 0.41186523, + "step": 3065, + "time_per_iteration": 2.6821701526641846 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046507, + "balance_loss_mlp": 1.00530875, + "epoch": 0.5898422470180839, + "flos": 550084930560.0, + "grad_norm": 0.039548358411626815, + "language_loss": 0.82298601, + "learning_rate": 0.0003799931725145529, + "loss": 0.83345109, + "num_input_tokens_seen": 255787696, + "router_z_loss_mlp": 0.41210938, + "step": 3066, + "time_per_iteration": 2.6161272525787354 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046312, + "balance_loss_mlp": 1.00532758, + "epoch": 0.5900346287033474, + "flos": 525380731392.0, + "grad_norm": 0.034195441532662435, + "language_loss": 0.86171907, + "learning_rate": 0.00037969076005331083, + "loss": 0.87218219, + "num_input_tokens_seen": 255862992, + "router_z_loss_mlp": 0.40991211, + "step": 3067, + "time_per_iteration": 2.769503116607666 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046067, + "balance_loss_mlp": 1.00515461, + "epoch": 0.590227010388611, + "flos": 568215988224.0, + "grad_norm": 0.03443045458348014, + "language_loss": 0.88715112, + "learning_rate": 0.00037938839430025817, + "loss": 0.8976118, + "num_input_tokens_seen": 255931872, + "router_z_loss_mlp": 0.40917969, + "step": 3068, + "time_per_iteration": 2.626838207244873 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046372, + "balance_loss_mlp": 1.00557816, + "epoch": 0.5904193920738746, + "flos": 584456368128.0, + "grad_norm": 0.03106221395948033, + "language_loss": 0.86157519, + "learning_rate": 0.0003790860753727835, + "loss": 0.8720389, + "num_input_tokens_seen": 256004656, + "router_z_loss_mlp": 0.40795898, + "step": 3069, + "time_per_iteration": 2.825906991958618 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044904, + "balance_loss_mlp": 1.0041821, + "epoch": 0.5906117737591381, + "flos": 530797107456.0, + "grad_norm": 0.033655572520404166, + "language_loss": 0.83318973, + "learning_rate": 0.00037878380338825766, + "loss": 0.84363884, + "num_input_tokens_seen": 256076944, + "router_z_loss_mlp": 0.40722656, + "step": 3070, + "time_per_iteration": 2.6605753898620605 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043416, + "balance_loss_mlp": 1.00264668, + "epoch": 0.5908041554444017, + "flos": 685516697856.0, + "grad_norm": 0.032255816781200916, + "language_loss": 0.81519401, + "learning_rate": 0.00037848157846403287, + "loss": 0.82562816, + "num_input_tokens_seen": 256154768, + "router_z_loss_mlp": 0.40771484, + "step": 3071, + "time_per_iteration": 2.8913676738739014 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047866, + "balance_loss_mlp": 1.00712073, + "epoch": 0.5909965371296653, + "flos": 551133792768.0, + "grad_norm": 0.033304308768315895, + "language_loss": 0.83666503, + "learning_rate": 0.0003781794007174435, + "loss": 0.84714377, + "num_input_tokens_seen": 256230896, + "router_z_loss_mlp": 0.4074707, + "step": 3072, + "time_per_iteration": 2.7170376777648926 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044559, + "balance_loss_mlp": 1.00498199, + "epoch": 0.5911889188149289, + "flos": 1495645038336.0, + "grad_norm": 0.0062576164066865435, + "language_loss": 0.74074531, + "learning_rate": 0.0003778772702658051, + "loss": 0.7511909, + "num_input_tokens_seen": 256462336, + "router_z_loss_mlp": 0.39550781, + "step": 3073, + "time_per_iteration": 4.848031282424927 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053183, + "balance_loss_mlp": 1.01248538, + "epoch": 0.5913813005001923, + "flos": 488886291456.0, + "grad_norm": 0.03164327687157731, + "language_loss": 0.81542623, + "learning_rate": 0.0003775751872264152, + "loss": 0.82595801, + "num_input_tokens_seen": 256539376, + "router_z_loss_mlp": 0.40698242, + "step": 3074, + "time_per_iteration": 2.7835612297058105 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047821, + "balance_loss_mlp": 1.00721872, + "epoch": 0.5915736821854559, + "flos": 574522778880.0, + "grad_norm": 0.03137518576611995, + "language_loss": 0.87806273, + "learning_rate": 0.0003772731517165527, + "loss": 0.88854092, + "num_input_tokens_seen": 256617728, + "router_z_loss_mlp": 0.40600586, + "step": 3075, + "time_per_iteration": 2.7984819412231445 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045123, + "balance_loss_mlp": 1.00451982, + "epoch": 0.5917660638707195, + "flos": 790861916160.0, + "grad_norm": 0.03467745447845496, + "language_loss": 0.83953345, + "learning_rate": 0.0003769711638534784, + "loss": 0.84998471, + "num_input_tokens_seen": 256696032, + "router_z_loss_mlp": 0.40600586, + "step": 3076, + "time_per_iteration": 2.9498283863067627 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045943, + "balance_loss_mlp": 1.0053643, + "epoch": 0.5919584455559831, + "flos": 529756993536.0, + "grad_norm": 0.038274807826461636, + "language_loss": 0.7910676, + "learning_rate": 0.00037666922375443446, + "loss": 0.80152702, + "num_input_tokens_seen": 256767360, + "router_z_loss_mlp": 0.40576172, + "step": 3077, + "time_per_iteration": 2.595907211303711 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043771, + "balance_loss_mlp": 1.00312054, + "epoch": 0.5921508272412467, + "flos": 561753662208.0, + "grad_norm": 0.037448898185008676, + "language_loss": 0.82402956, + "learning_rate": 0.00037636733153664396, + "loss": 0.83446729, + "num_input_tokens_seen": 256844848, + "router_z_loss_mlp": 0.40649414, + "step": 3078, + "time_per_iteration": 2.8082337379455566 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050449, + "balance_loss_mlp": 1.00984669, + "epoch": 0.5923432089265102, + "flos": 564334510848.0, + "grad_norm": 0.04535413457726027, + "language_loss": 0.80388999, + "learning_rate": 0.0003760654873173124, + "loss": 0.81439447, + "num_input_tokens_seen": 256916688, + "router_z_loss_mlp": 0.40600586, + "step": 3079, + "time_per_iteration": 2.6586430072784424 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048929, + "balance_loss_mlp": 1.00832665, + "epoch": 0.5925355906117737, + "flos": 496751313408.0, + "grad_norm": 0.032303837876808815, + "language_loss": 0.82224989, + "learning_rate": 0.00037576369121362566, + "loss": 0.83273923, + "num_input_tokens_seen": 256985520, + "router_z_loss_mlp": 0.40600586, + "step": 3080, + "time_per_iteration": 2.5874335765838623 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049072, + "balance_loss_mlp": 1.00846922, + "epoch": 0.5927279722970373, + "flos": 567493736448.0, + "grad_norm": 0.03169427730059961, + "language_loss": 0.82085633, + "learning_rate": 0.0003754619433427516, + "loss": 0.83134699, + "num_input_tokens_seen": 257067552, + "router_z_loss_mlp": 0.40600586, + "step": 3081, + "time_per_iteration": 2.9037671089172363 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044705, + "balance_loss_mlp": 1.00400662, + "epoch": 0.5929203539823009, + "flos": 668160381696.0, + "grad_norm": 0.04430970694991959, + "language_loss": 0.78507918, + "learning_rate": 0.0003751602438218392, + "loss": 0.79552627, + "num_input_tokens_seen": 257138896, + "router_z_loss_mlp": 0.40698242, + "step": 3082, + "time_per_iteration": 2.77486252784729 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042632, + "balance_loss_mlp": 1.00195801, + "epoch": 0.5931127356675644, + "flos": 556786384128.0, + "grad_norm": 0.03446517582568327, + "language_loss": 0.84122735, + "learning_rate": 0.0003748585927680186, + "loss": 0.8516537, + "num_input_tokens_seen": 257210592, + "router_z_loss_mlp": 0.40673828, + "step": 3083, + "time_per_iteration": 2.6401243209838867 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047685, + "balance_loss_mlp": 1.00698733, + "epoch": 0.593305117352828, + "flos": 536243619072.0, + "grad_norm": 0.03379156982252967, + "language_loss": 0.83284605, + "learning_rate": 0.00037455699029840086, + "loss": 0.84332293, + "num_input_tokens_seen": 257276208, + "router_z_loss_mlp": 0.40698242, + "step": 3084, + "time_per_iteration": 2.6359477043151855 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047934, + "balance_loss_mlp": 1.00723624, + "epoch": 0.5934974990380916, + "flos": 595058740992.0, + "grad_norm": 0.03375272766067447, + "language_loss": 0.84866869, + "learning_rate": 0.0003742554365300787, + "loss": 0.85914803, + "num_input_tokens_seen": 257351920, + "router_z_loss_mlp": 0.40698242, + "step": 3085, + "time_per_iteration": 2.7629523277282715 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047831, + "balance_loss_mlp": 1.00727594, + "epoch": 0.5936898807233552, + "flos": 714015858432.0, + "grad_norm": 0.08464198739198994, + "language_loss": 0.79301089, + "learning_rate": 0.0003739539315801255, + "loss": 0.80348921, + "num_input_tokens_seen": 257430016, + "router_z_loss_mlp": 0.40551758, + "step": 3086, + "time_per_iteration": 2.9152019023895264 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105055, + "balance_loss_mlp": 1.01004303, + "epoch": 0.5938822624086187, + "flos": 392749498368.0, + "grad_norm": 0.03659508144201786, + "language_loss": 0.92428821, + "learning_rate": 0.000373652475565596, + "loss": 0.93479371, + "num_input_tokens_seen": 257492224, + "router_z_loss_mlp": 0.4050293, + "step": 3087, + "time_per_iteration": 2.4702134132385254 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050448, + "balance_loss_mlp": 1.00982189, + "epoch": 0.5940746440938822, + "flos": 481336219392.0, + "grad_norm": 0.034289442552625136, + "language_loss": 0.81692433, + "learning_rate": 0.00037335106860352587, + "loss": 0.82742882, + "num_input_tokens_seen": 257567824, + "router_z_loss_mlp": 0.40625, + "step": 3088, + "time_per_iteration": 2.675694704055786 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043994, + "balance_loss_mlp": 1.00322449, + "epoch": 0.5942670257791458, + "flos": 484307840256.0, + "grad_norm": 0.03351872550432346, + "language_loss": 0.8348605, + "learning_rate": 0.00037304971081093146, + "loss": 0.84530044, + "num_input_tokens_seen": 257635488, + "router_z_loss_mlp": 0.40771484, + "step": 3089, + "time_per_iteration": 2.5974292755126953 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042488, + "balance_loss_mlp": 1.00181389, + "epoch": 0.5944594074644094, + "flos": 549058422528.0, + "grad_norm": 0.03144984032595776, + "language_loss": 0.81257939, + "learning_rate": 0.00037274840230481024, + "loss": 0.82300425, + "num_input_tokens_seen": 257709552, + "router_z_loss_mlp": 0.40673828, + "step": 3090, + "time_per_iteration": 2.7465951442718506 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104328, + "balance_loss_mlp": 1.00262976, + "epoch": 0.594651789149673, + "flos": 450129843456.0, + "grad_norm": 0.0354227551067568, + "language_loss": 0.79578584, + "learning_rate": 0.00037244714320214077, + "loss": 0.80621862, + "num_input_tokens_seen": 257775520, + "router_z_loss_mlp": 0.40649414, + "step": 3091, + "time_per_iteration": 2.532076597213745 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045549, + "balance_loss_mlp": 1.00489831, + "epoch": 0.5948441708349365, + "flos": 597466557696.0, + "grad_norm": 0.033875543124705955, + "language_loss": 0.83456963, + "learning_rate": 0.000372145933619882, + "loss": 0.84502512, + "num_input_tokens_seen": 257858560, + "router_z_loss_mlp": 0.40649414, + "step": 3092, + "time_per_iteration": 2.888296127319336 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045702, + "balance_loss_mlp": 1.00502765, + "epoch": 0.5950365525202, + "flos": 549581397504.0, + "grad_norm": 0.03918584024885415, + "language_loss": 0.83476591, + "learning_rate": 0.000371844773674974, + "loss": 0.84522295, + "num_input_tokens_seen": 257928048, + "router_z_loss_mlp": 0.40673828, + "step": 3093, + "time_per_iteration": 2.641191244125366 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042191, + "balance_loss_mlp": 1.00146902, + "epoch": 0.5952289342054636, + "flos": 655964784384.0, + "grad_norm": 0.03345437818943746, + "language_loss": 0.82307684, + "learning_rate": 0.0003715436634843375, + "loss": 0.83349872, + "num_input_tokens_seen": 258003088, + "router_z_loss_mlp": 0.40722656, + "step": 3094, + "time_per_iteration": 2.8391387462615967 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042413, + "balance_loss_mlp": 1.00185752, + "epoch": 0.5954213158907272, + "flos": 604604470272.0, + "grad_norm": 0.028714859262846556, + "language_loss": 0.8123939, + "learning_rate": 0.00037124260316487355, + "loss": 0.82281804, + "num_input_tokens_seen": 258084880, + "router_z_loss_mlp": 0.40551758, + "step": 3095, + "time_per_iteration": 2.8300905227661133 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048221, + "balance_loss_mlp": 1.00742722, + "epoch": 0.5956136975759908, + "flos": 487268767488.0, + "grad_norm": 0.03390156256560374, + "language_loss": 0.89901024, + "learning_rate": 0.0003709415928334643, + "loss": 0.90949249, + "num_input_tokens_seen": 258152032, + "router_z_loss_mlp": 0.40795898, + "step": 3096, + "time_per_iteration": 2.594320297241211 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104465, + "balance_loss_mlp": 1.00376081, + "epoch": 0.5958060792612543, + "flos": 660041647872.0, + "grad_norm": 0.036547009459556086, + "language_loss": 0.8143428, + "learning_rate": 0.00037064063260697233, + "loss": 0.82478929, + "num_input_tokens_seen": 258228896, + "router_z_loss_mlp": 0.40893555, + "step": 3097, + "time_per_iteration": 2.853452205657959 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044795, + "balance_loss_mlp": 1.00397766, + "epoch": 0.5959984609465179, + "flos": 724996364544.0, + "grad_norm": 0.03336502037481855, + "language_loss": 0.78911316, + "learning_rate": 0.0003703397226022407, + "loss": 0.79956114, + "num_input_tokens_seen": 258311152, + "router_z_loss_mlp": 0.40820312, + "step": 3098, + "time_per_iteration": 3.0299534797668457 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050957, + "balance_loss_mlp": 1.01147461, + "epoch": 0.5961908426317815, + "flos": 1523221703424.0, + "grad_norm": 0.010872658804754508, + "language_loss": 0.75499874, + "learning_rate": 0.00037003886293609335, + "loss": 0.76550829, + "num_input_tokens_seen": 258540656, + "router_z_loss_mlp": 0.39453125, + "step": 3099, + "time_per_iteration": 4.950707674026489 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044673, + "balance_loss_mlp": 1.00387943, + "epoch": 0.596383224317045, + "flos": 533647219200.0, + "grad_norm": 0.033784299285581076, + "language_loss": 0.84084308, + "learning_rate": 0.0003697380537253339, + "loss": 0.85128987, + "num_input_tokens_seen": 258608960, + "router_z_loss_mlp": 0.40795898, + "step": 3100, + "time_per_iteration": 2.6651411056518555 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044743, + "balance_loss_mlp": 1.00394928, + "epoch": 0.5965756060023086, + "flos": 592367076864.0, + "grad_norm": 0.032025449945388196, + "language_loss": 0.82004619, + "learning_rate": 0.0003694372950867471, + "loss": 0.83049357, + "num_input_tokens_seen": 258684304, + "router_z_loss_mlp": 0.40795898, + "step": 3101, + "time_per_iteration": 2.7825992107391357 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044062, + "balance_loss_mlp": 1.00341129, + "epoch": 0.5967679876875721, + "flos": 863470717440.0, + "grad_norm": 0.0338522286072748, + "language_loss": 0.78029126, + "learning_rate": 0.0003691365871370976, + "loss": 0.79073191, + "num_input_tokens_seen": 258769472, + "router_z_loss_mlp": 0.40649414, + "step": 3102, + "time_per_iteration": 3.0174319744110107 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044179, + "balance_loss_mlp": 1.00340927, + "epoch": 0.5969603693728357, + "flos": 554878209792.0, + "grad_norm": 0.03201933469342105, + "language_loss": 0.85875535, + "learning_rate": 0.00036883592999313093, + "loss": 0.86919713, + "num_input_tokens_seen": 258841696, + "router_z_loss_mlp": 0.40771484, + "step": 3103, + "time_per_iteration": 2.683260679244995 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043823, + "balance_loss_mlp": 1.00314891, + "epoch": 0.5971527510580993, + "flos": 719937712896.0, + "grad_norm": 0.039464615758245, + "language_loss": 0.79932439, + "learning_rate": 0.0003685353237715722, + "loss": 0.80976272, + "num_input_tokens_seen": 258915616, + "router_z_loss_mlp": 0.40673828, + "step": 3104, + "time_per_iteration": 2.8593432903289795 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043868, + "balance_loss_mlp": 1.00312221, + "epoch": 0.5973451327433629, + "flos": 648863810304.0, + "grad_norm": 0.031062495288944163, + "language_loss": 0.82383978, + "learning_rate": 0.0003682347685891274, + "loss": 0.83427846, + "num_input_tokens_seen": 258994080, + "router_z_loss_mlp": 0.4074707, + "step": 3105, + "time_per_iteration": 2.840812921524048 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045744, + "balance_loss_mlp": 1.00504565, + "epoch": 0.5975375144286263, + "flos": 723090135552.0, + "grad_norm": 0.03430317325592521, + "language_loss": 0.81334996, + "learning_rate": 0.0003679342645624822, + "loss": 0.82380736, + "num_input_tokens_seen": 259075968, + "router_z_loss_mlp": 0.40698242, + "step": 3106, + "time_per_iteration": 2.961186408996582 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045947, + "balance_loss_mlp": 1.00520086, + "epoch": 0.5977298961138899, + "flos": 752344595712.0, + "grad_norm": 0.03201923744385334, + "language_loss": 0.82261443, + "learning_rate": 0.0003676338118083025, + "loss": 0.83307385, + "num_input_tokens_seen": 259162512, + "router_z_loss_mlp": 0.4074707, + "step": 3107, + "time_per_iteration": 2.9809908866882324 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105353, + "balance_loss_mlp": 1.01264107, + "epoch": 0.5979222777991535, + "flos": 531999559680.0, + "grad_norm": 0.03643788911431517, + "language_loss": 0.79681456, + "learning_rate": 0.0003673334104432347, + "loss": 0.8073498, + "num_input_tokens_seen": 259228752, + "router_z_loss_mlp": 0.40893555, + "step": 3108, + "time_per_iteration": 2.5879976749420166 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052396, + "balance_loss_mlp": 1.01157844, + "epoch": 0.5981146594844171, + "flos": 622915362816.0, + "grad_norm": 0.031178647905512342, + "language_loss": 0.84073299, + "learning_rate": 0.0003670330605839048, + "loss": 0.85125697, + "num_input_tokens_seen": 259303440, + "router_z_loss_mlp": 0.40820312, + "step": 3109, + "time_per_iteration": 2.843069314956665 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049611, + "balance_loss_mlp": 1.00877023, + "epoch": 0.5983070411696807, + "flos": 604710428160.0, + "grad_norm": 0.03611015998230635, + "language_loss": 0.77344596, + "learning_rate": 0.0003667327623469191, + "loss": 0.7839421, + "num_input_tokens_seen": 259378752, + "router_z_loss_mlp": 0.40844727, + "step": 3110, + "time_per_iteration": 2.7326698303222656 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045229, + "balance_loss_mlp": 1.00438774, + "epoch": 0.5984994228549442, + "flos": 634670610432.0, + "grad_norm": 0.03877534508876671, + "language_loss": 0.78326917, + "learning_rate": 0.00036643251584886333, + "loss": 0.79372144, + "num_input_tokens_seen": 259454336, + "router_z_loss_mlp": 0.40844727, + "step": 3111, + "time_per_iteration": 2.784482717514038 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105463, + "balance_loss_mlp": 1.01369393, + "epoch": 0.5986918045402078, + "flos": 526294478592.0, + "grad_norm": 0.03280596002015671, + "language_loss": 0.82781613, + "learning_rate": 0.00036613232120630393, + "loss": 0.83836246, + "num_input_tokens_seen": 259518960, + "router_z_loss_mlp": 0.40942383, + "step": 3112, + "time_per_iteration": 2.5862860679626465 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105133, + "balance_loss_mlp": 1.0103699, + "epoch": 0.5988841862254713, + "flos": 484140644352.0, + "grad_norm": 0.03859230842611924, + "language_loss": 0.80514455, + "learning_rate": 0.00036583217853578643, + "loss": 0.81565785, + "num_input_tokens_seen": 259584352, + "router_z_loss_mlp": 0.40966797, + "step": 3113, + "time_per_iteration": 2.565713405609131 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048995, + "balance_loss_mlp": 1.00805807, + "epoch": 0.5990765679107349, + "flos": 1142123451648.0, + "grad_norm": 0.034390898471739054, + "language_loss": 0.77730286, + "learning_rate": 0.000365532087953837, + "loss": 0.78779286, + "num_input_tokens_seen": 259693152, + "router_z_loss_mlp": 0.40942383, + "step": 3114, + "time_per_iteration": 3.646124839782715 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049853, + "balance_loss_mlp": 1.00889242, + "epoch": 0.5992689495959984, + "flos": 518019242496.0, + "grad_norm": 0.033850887819700186, + "language_loss": 0.89597213, + "learning_rate": 0.00036523204957696065, + "loss": 0.90647066, + "num_input_tokens_seen": 259762048, + "router_z_loss_mlp": 0.40966797, + "step": 3115, + "time_per_iteration": 2.594458818435669 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050487, + "balance_loss_mlp": 1.00952673, + "epoch": 0.599461331281262, + "flos": 745942540800.0, + "grad_norm": 0.044244117222237124, + "language_loss": 0.81526911, + "learning_rate": 0.00036493206352164324, + "loss": 0.82577395, + "num_input_tokens_seen": 259843184, + "router_z_loss_mlp": 0.40966797, + "step": 3116, + "time_per_iteration": 2.9088714122772217 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046863, + "balance_loss_mlp": 1.0058552, + "epoch": 0.5996537129665256, + "flos": 593484958464.0, + "grad_norm": 0.034019953192927346, + "language_loss": 0.85863578, + "learning_rate": 0.000364632129904349, + "loss": 0.8691045, + "num_input_tokens_seen": 259912720, + "router_z_loss_mlp": 0.41015625, + "step": 3117, + "time_per_iteration": 2.7059812545776367 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055187, + "balance_loss_mlp": 1.01415479, + "epoch": 0.5998460946517892, + "flos": 560116696320.0, + "grad_norm": 0.0363455836603733, + "language_loss": 0.78243721, + "learning_rate": 0.00036433224884152283, + "loss": 0.79298902, + "num_input_tokens_seen": 259985472, + "router_z_loss_mlp": 0.41040039, + "step": 3118, + "time_per_iteration": 2.7368576526641846 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049846, + "balance_loss_mlp": 1.00879073, + "epoch": 0.6000384763370528, + "flos": 485536537344.0, + "grad_norm": 0.037553840644260136, + "language_loss": 0.78583586, + "learning_rate": 0.00036403242044958875, + "loss": 0.79633433, + "num_input_tokens_seen": 260050336, + "router_z_loss_mlp": 0.41064453, + "step": 3119, + "time_per_iteration": 2.5575714111328125 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105366, + "balance_loss_mlp": 1.01267588, + "epoch": 0.6002308580223162, + "flos": 597878717184.0, + "grad_norm": 0.03820222884564333, + "language_loss": 0.91700655, + "learning_rate": 0.0003637326448449507, + "loss": 0.9275431, + "num_input_tokens_seen": 260120304, + "router_z_loss_mlp": 0.40991211, + "step": 3120, + "time_per_iteration": 2.742879629135132 + } + ], + "logging_steps": 1.0, + "max_steps": 5198, + "num_input_tokens_seen": 260120304, + "num_train_epochs": 1, + "save_steps": 1040, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 7092513394130944.0, + "train_batch_size": 16, + "trial_name": null, + "trial_params": null +} diff --git a/sft_pretrain/Full_xmoe/checkpoint-3120/training_args.bin b/sft_pretrain/Full_xmoe/checkpoint-3120/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..e62437ed6fbf4cf3ea22fcfae3749bb9df2d0109 --- /dev/null +++ b/sft_pretrain/Full_xmoe/checkpoint-3120/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4144fbe7f1cf435dbbf0ef9621414cb3e97a5ff4a560571b878000caf2931b07 +size 7992 diff --git a/sft_pretrain/Full_xmoe/checkpoint-3120/zero_to_fp32.py b/sft_pretrain/Full_xmoe/checkpoint-3120/zero_to_fp32.py new file mode 100644 index 0000000000000000000000000000000000000000..24cc342e78d1a006c782b3a4cd68d9ce786d8fd8 --- /dev/null +++ b/sft_pretrain/Full_xmoe/checkpoint-3120/zero_to_fp32.py @@ -0,0 +1,604 @@ +#!/usr/bin/env python + +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +# This script extracts fp32 consolidated weights from a zero 1, 2 and 3 DeepSpeed checkpoints. It gets +# copied into the top level checkpoint dir, so the user can easily do the conversion at any point in +# the future. Once extracted, the weights don't require DeepSpeed and can be used in any +# application. +# +# example: python zero_to_fp32.py . pytorch_model.bin + +import argparse +import torch +import glob +import math +import os +import re +from collections import OrderedDict +from dataclasses import dataclass + +# while this script doesn't use deepspeed to recover data, since the checkpoints are pickled with +# DeepSpeed data structures it has to be available in the current python environment. +from deepspeed.utils import logger +from deepspeed.checkpoint.constants import (DS_VERSION, OPTIMIZER_STATE_DICT, SINGLE_PARTITION_OF_FP32_GROUPS, + FP32_FLAT_GROUPS, ZERO_STAGE, PARTITION_COUNT, PARAM_SHAPES, BUFFER_NAMES, + FROZEN_PARAM_SHAPES, FROZEN_PARAM_FRAGMENTS) + + +@dataclass +class zero_model_state: + buffers: dict() + param_shapes: dict() + shared_params: list + ds_version: int + frozen_param_shapes: dict() + frozen_param_fragments: dict() + + +debug = 0 + +# load to cpu +device = torch.device('cpu') + + +def atoi(text): + return int(text) if text.isdigit() else text + + +def natural_keys(text): + ''' + alist.sort(key=natural_keys) sorts in human order + http://nedbatchelder.com/blog/200712/human_sorting.html + (See Toothy's implementation in the comments) + ''' + return [atoi(c) for c in re.split(r'(\d+)', text)] + + +def get_model_state_file(checkpoint_dir, zero_stage): + if not os.path.isdir(checkpoint_dir): + raise FileNotFoundError(f"Directory '{checkpoint_dir}' doesn't exist") + + # there should be only one file + if zero_stage <= 2: + file = os.path.join(checkpoint_dir, "mp_rank_00_model_states.pt") + elif zero_stage == 3: + file = os.path.join(checkpoint_dir, "zero_pp_rank_0_mp_rank_00_model_states.pt") + + if not os.path.exists(file): + raise FileNotFoundError(f"can't find model states file at '{file}'") + + return file + + +def get_checkpoint_files(checkpoint_dir, glob_pattern): + # XXX: need to test that this simple glob rule works for multi-node setup too + ckpt_files = sorted(glob.glob(os.path.join(checkpoint_dir, glob_pattern)), key=natural_keys) + + if len(ckpt_files) == 0: + raise FileNotFoundError(f"can't find {glob_pattern} files in directory '{checkpoint_dir}'") + + return ckpt_files + + +def get_optim_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_optim_states.pt") + + +def get_model_state_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_model_states.pt") + + +def parse_model_states(files): + zero_model_states = [] + for file in files: + state_dict = torch.load(file, map_location=device) + + if BUFFER_NAMES not in state_dict: + raise ValueError(f"{file} is not a model state checkpoint") + buffer_names = state_dict[BUFFER_NAMES] + if debug: + print("Found buffers:", buffer_names) + + # recover just the buffers while restoring them to fp32 if they were saved in fp16 + buffers = {k: v.float() for k, v in state_dict["module"].items() if k in buffer_names} + param_shapes = state_dict[PARAM_SHAPES] + + # collect parameters that are included in param_shapes + param_names = [] + for s in param_shapes: + for name in s.keys(): + param_names.append(name) + + # update with frozen parameters + frozen_param_shapes = state_dict.get(FROZEN_PARAM_SHAPES, None) + if frozen_param_shapes is not None: + if debug: + print(f"Found frozen_param_shapes: {frozen_param_shapes}") + param_names += list(frozen_param_shapes.keys()) + + # handle shared params + shared_params = [[k, v] for k, v in state_dict["shared_params"].items()] + + ds_version = state_dict.get(DS_VERSION, None) + + frozen_param_fragments = state_dict.get(FROZEN_PARAM_FRAGMENTS, None) + + z_model_state = zero_model_state(buffers=buffers, + param_shapes=param_shapes, + shared_params=shared_params, + ds_version=ds_version, + frozen_param_shapes=frozen_param_shapes, + frozen_param_fragments=frozen_param_fragments) + zero_model_states.append(z_model_state) + + return zero_model_states + + +def parse_optim_states(files, ds_checkpoint_dir): + + total_files = len(files) + state_dicts = [] + for f in files: + state_dict = torch.load(f, map_location=device) + # immediately discard the potentially huge 2 optimizer states as we only care for fp32 master weights + # and also handle the case where it was already removed by another helper script + state_dict["optimizer_state_dict"].pop("optimizer_state_dict", None) + state_dicts.append(state_dict) + + if not ZERO_STAGE in state_dicts[0][OPTIMIZER_STATE_DICT]: + raise ValueError(f"{files[0]} is not a zero checkpoint") + zero_stage = state_dicts[0][OPTIMIZER_STATE_DICT][ZERO_STAGE] + world_size = state_dicts[0][OPTIMIZER_STATE_DICT][PARTITION_COUNT] + + # For ZeRO-2 each param group can have different partition_count as data parallelism for expert + # parameters can be different from data parallelism for non-expert parameters. So we can just + # use the max of the partition_count to get the dp world_size. + + if type(world_size) is list: + world_size = max(world_size) + + if world_size != total_files: + raise ValueError( + f"Expected {world_size} of '*_optim_states.pt' under '{ds_checkpoint_dir}' but found {total_files} files. " + "Possibly due to an overwrite of an old checkpoint, or a checkpoint didn't get saved by one or more processes." + ) + + # the groups are named differently in each stage + if zero_stage <= 2: + fp32_groups_key = SINGLE_PARTITION_OF_FP32_GROUPS + elif zero_stage == 3: + fp32_groups_key = FP32_FLAT_GROUPS + else: + raise ValueError(f"unknown zero stage {zero_stage}") + + if zero_stage <= 2: + fp32_flat_groups = [state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key] for i in range(len(state_dicts))] + elif zero_stage == 3: + # if there is more than one param group, there will be multiple flattened tensors - one + # flattened tensor per group - for simplicity merge them into a single tensor + # + # XXX: could make the script more memory efficient for when there are multiple groups - it + # will require matching the sub-lists of param_shapes for each param group flattened tensor + + fp32_flat_groups = [ + torch.cat(state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key], 0) for i in range(len(state_dicts)) + ] + + return zero_stage, world_size, fp32_flat_groups + + +def _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters): + """ + Returns fp32 state_dict reconstructed from ds checkpoint + + Args: + - ``ds_checkpoint_dir``: path to the deepspeed checkpoint folder (where the optimizer files are) + + """ + print(f"Processing zero checkpoint '{ds_checkpoint_dir}'") + + optim_files = get_optim_files(ds_checkpoint_dir) + zero_stage, world_size, fp32_flat_groups = parse_optim_states(optim_files, ds_checkpoint_dir) + print(f"Detected checkpoint of type zero stage {zero_stage}, world_size: {world_size}") + + model_files = get_model_state_files(ds_checkpoint_dir) + + zero_model_states = parse_model_states(model_files) + print(f'Parsing checkpoint created by deepspeed=={zero_model_states[0].ds_version}') + + if zero_stage <= 2: + return _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + elif zero_stage == 3: + return _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + + +def _zero2_merge_frozen_params(state_dict, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + frozen_param_fragments = zero_model_states[0].frozen_param_fragments + + if debug: + num_elem = sum(s.numel() for s in frozen_param_shapes.values()) + print(f'rank 0: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + wanted_params = len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in frozen_param_fragments.values()]) + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + state_dict[name] = frozen_param_fragments[name] + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +def _has_callable(obj, fn): + attr = getattr(obj, fn, None) + return callable(attr) + + +def _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + + # Reconstruction protocol: + # + # XXX: document this + + if debug: + for i in range(world_size): + for j in range(len(fp32_flat_groups[0])): + print(f"{FP32_FLAT_GROUPS}[{i}][{j}].shape={fp32_flat_groups[i][j].shape}") + + # XXX: memory usage doubles here (zero2) + num_param_groups = len(fp32_flat_groups[0]) + merged_single_partition_of_fp32_groups = [] + for i in range(num_param_groups): + merged_partitions = [sd[i] for sd in fp32_flat_groups] + full_single_fp32_vector = torch.cat(merged_partitions, 0) + merged_single_partition_of_fp32_groups.append(full_single_fp32_vector) + avail_numel = sum( + [full_single_fp32_vector.numel() for full_single_fp32_vector in merged_single_partition_of_fp32_groups]) + + if debug: + wanted_params = sum([len(shapes) for shapes in param_shapes]) + wanted_numel = sum([sum(shape.numel() for shape in shapes.values()) for shapes in param_shapes]) + # not asserting if there is a mismatch due to possible padding + print(f"Have {avail_numel} numels to process.") + print(f"Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + total_numel = 0 + total_params = 0 + for shapes, full_single_fp32_vector in zip(param_shapes, merged_single_partition_of_fp32_groups): + offset = 0 + avail_numel = full_single_fp32_vector.numel() + for name, shape in shapes.items(): + + unpartitioned_numel = shape.numel() if _has_callable(shape, 'numel') else math.prod(shape) + total_numel += unpartitioned_numel + total_params += 1 + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + state_dict[name] = full_single_fp32_vector.narrow(0, offset, unpartitioned_numel).view(shape) + offset += unpartitioned_numel + + # Z2 started to align to 2*world_size to improve nccl performance. Therefore both offset and + # avail_numel can differ by anywhere between 0..2*world_size. Due to two unrelated complex + # paddings performed in the code it's almost impossible to predict the exact numbers w/o the + # live optimizer object, so we are checking that the numbers are within the right range + align_to = 2 * world_size + + def zero2_align(x): + return align_to * math.ceil(x / align_to) + + if debug: + print(f"original offset={offset}, avail_numel={avail_numel}") + + offset = zero2_align(offset) + avail_numel = zero2_align(avail_numel) + + if debug: + print(f"aligned offset={offset}, avail_numel={avail_numel}") + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero2_merge_frozen_params(state_dict, zero_model_states) + + _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def zero3_partitioned_param_info(unpartitioned_numel, world_size): + remainder = unpartitioned_numel % world_size + padding_numel = (world_size - remainder) if remainder else 0 + partitioned_numel = math.ceil(unpartitioned_numel / world_size) + return partitioned_numel, padding_numel + + +def _zero3_merge_frozen_params(state_dict, world_size, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + if debug: + for i in range(world_size): + num_elem = sum(s.numel() for s in zero_model_states[i].frozen_param_fragments.values()) + print(f'rank {i}: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + wanted_params = len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in zero_model_states[0].frozen_param_fragments.values()]) * world_size + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in zero_model_states[0].frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + param_frags = tuple(model_state.frozen_param_fragments[name] for model_state in zero_model_states) + state_dict[name] = torch.cat(param_frags, 0).narrow(0, 0, unpartitioned_numel).view(shape) + + partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Frozen params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +def _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + avail_numel = fp32_flat_groups[0].numel() * world_size + # Reconstruction protocol: For zero3 we need to zip the partitions together at boundary of each + # param, re-consolidating each param, while dealing with padding if any + + # merge list of dicts, preserving order + param_shapes = {k: v for d in param_shapes for k, v in d.items()} + + if debug: + for i in range(world_size): + print(f"{FP32_FLAT_GROUPS}[{i}].shape={fp32_flat_groups[i].shape}") + + wanted_params = len(param_shapes) + wanted_numel = sum(shape.numel() for shape in param_shapes.values()) + # not asserting if there is a mismatch due to possible padding + avail_numel = fp32_flat_groups[0].numel() * world_size + print(f"Trainable params: Have {avail_numel} numels to process.") + print(f"Trainable params: Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + offset = 0 + total_numel = 0 + total_params = 0 + for name, shape in param_shapes.items(): + + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + total_params += 1 + + partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Trainable params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + # XXX: memory usage doubles here + state_dict[name] = torch.cat( + tuple(fp32_flat_groups[i].narrow(0, offset, partitioned_numel) for i in range(world_size)), + 0).narrow(0, 0, unpartitioned_numel).view(shape) + offset += partitioned_numel + + offset *= world_size + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed Trainable fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero3_merge_frozen_params(state_dict, world_size, zero_model_states) + + _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag=None, exclude_frozen_parameters=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated state_dict that can be loaded with + ``load_state_dict()`` and used for training without DeepSpeed or shared with others, for example + via a model hub. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in 'latest' file. e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + + Returns: + - pytorch ``state_dict`` + + Note: this approach may not work if your application doesn't have sufficient free CPU memory and + you may need to use the offline approach using the ``zero_to_fp32.py`` script that is saved with + the checkpoint. + + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint + # do the training and checkpoint saving + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir) # already on cpu + model = model.cpu() # move to cpu + model.load_state_dict(state_dict) + # submit to model hub or save the model to share with others + + In this example the ``model`` will no longer be usable in the deepspeed context of the same + application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + If you want it all done for you, use ``load_state_dict_from_zero_checkpoint`` instead. + + """ + if tag is None: + latest_path = os.path.join(checkpoint_dir, 'latest') + if os.path.isfile(latest_path): + with open(latest_path, 'r') as fd: + tag = fd.read().strip() + else: + raise ValueError(f"Unable to find 'latest' file at {latest_path}") + + ds_checkpoint_dir = os.path.join(checkpoint_dir, tag) + + if not os.path.isdir(ds_checkpoint_dir): + raise FileNotFoundError(f"Directory '{ds_checkpoint_dir}' doesn't exist") + + return _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters) + + +def convert_zero_checkpoint_to_fp32_state_dict(checkpoint_dir, output_file, tag=None, exclude_frozen_parameters=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` file that can be + loaded with ``torch.load(file)`` + ``load_state_dict()`` and used for training without DeepSpeed. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``output_file``: path to the pytorch fp32 state_dict output file (e.g. path/pytorch_model.bin) + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + """ + + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag, exclude_frozen_parameters) + print(f"Saving fp32 state dict to {output_file}") + torch.save(state_dict, output_file) + + +def load_state_dict_from_zero_checkpoint(model, checkpoint_dir, tag=None): + """ + 1. Put the provided model to cpu + 2. Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` + 3. Load it into the provided model + + Args: + - ``model``: the model object to update + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + + Returns: + - ``model`: modified model + + Make sure you have plenty of CPU memory available before you call this function. If you don't + have enough use the ``zero_to_fp32.py`` utility to do the conversion. You will find it + conveniently placed for you in the checkpoint folder. + + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import load_state_dict_from_zero_checkpoint + model = load_state_dict_from_zero_checkpoint(trainer.model, checkpoint_dir) + # submit to model hub or save the model to share with others + + Note, that once this was run, the ``model`` will no longer be usable in the deepspeed context + of the same application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + """ + logger.info(f"Extracting fp32 weights") + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag) + + logger.info(f"Overwriting model with fp32 weights") + model = model.cpu() + model.load_state_dict(state_dict, strict=False) + + return model + + +if __name__ == "__main__": + + parser = argparse.ArgumentParser() + parser.add_argument("checkpoint_dir", + type=str, + help="path to the desired checkpoint folder, e.g., path/checkpoint-12") + parser.add_argument( + "output_file", + type=str, + help="path to the pytorch fp32 state_dict output file (e.g. path/checkpoint-12/pytorch_model.bin)") + parser.add_argument("-t", + "--tag", + type=str, + default=None, + help="checkpoint tag used as a unique identifier for checkpoint. e.g., global_step1") + parser.add_argument("--exclude_frozen_parameters", action='store_true', help="exclude frozen parameters") + parser.add_argument("-d", "--debug", action='store_true', help="enable debug") + args = parser.parse_args() + + debug = args.debug + + convert_zero_checkpoint_to_fp32_state_dict(args.checkpoint_dir, + args.output_file, + tag=args.tag, + exclude_frozen_parameters=args.exclude_frozen_parameters) diff --git a/sft_pretrain/Full_xmoe/checkpoint-4160/added_tokens.json b/sft_pretrain/Full_xmoe/checkpoint-4160/added_tokens.json new file mode 100644 index 0000000000000000000000000000000000000000..c9d3d3a1b74d87e381e471f7b33784015d2dc0ea --- /dev/null +++ b/sft_pretrain/Full_xmoe/checkpoint-4160/added_tokens.json @@ -0,0 +1,13 @@ +{ + "<|assistant|>": 32001, + "<|endoftext|>": 32000, + "<|end|>": 32007, + "<|placeholder1|>": 32002, + "<|placeholder2|>": 32003, + "<|placeholder3|>": 32004, + "<|placeholder4|>": 32005, + "<|placeholder5|>": 32008, + "<|placeholder6|>": 32009, + "<|system|>": 32006, + "<|user|>": 32010 +} diff --git a/sft_pretrain/Full_xmoe/checkpoint-4160/config.json b/sft_pretrain/Full_xmoe/checkpoint-4160/config.json new file mode 100644 index 0000000000000000000000000000000000000000..5ed860286ec8c9b3f17e5234326d2ed728ca6a65 --- /dev/null +++ b/sft_pretrain/Full_xmoe/checkpoint-4160/config.json @@ -0,0 +1,200 @@ +{ + "_name_or_path": "/cm/archive/namnv78/checkpoints/phi35-siglip224/pft", + "architectures": [ + "LlavaPhiForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "auto_map": { + "AutoConfig": "configuration_phi3.Phi3Config", + "AutoModelForCausalLM": "modeling_phi3.Phi3ForCausalLM" + }, + "bal_comp_loss_coef": 0.01, + "balance_loss_coef": 0.01, + "bos_token_id": 1, + "clip_smoe": false, + "diversity_loss_coef": 0.01, + "dropout": false, + "e_loss_coef": 0.001, + "embd_pdrop": 0.0, + "entropy_advance_loss": false, + "eos_token_id": 32000, + "freeze_backbone": false, + "freeze_mm_mlp_adapter": false, + "hidden_act": "silu", + "hidden_size": 3072, + "hybrid": false, + "image_aspect_ratio": "pad", + "init_weight": true, + "initializer_range": 0.02, + "intermediate_size": 8192, + "is_cosine": false, + "is_norm_weight": false, + "local_rank": 0, + "loss1": "balanceloss", + "loss2": "zloss", + "luna": false, + "max_compete_in_iter": 3, + "max_position_embeddings": 131072, + "mlp_smoe": true, + "mm_hidden_size": 1152, + "mm_patch_merge_type": "flat", + "mm_projector_lr": null, + "mm_projector_type": "moe", + "mm_use_im_patch_token": false, + "mm_use_im_start_end": false, + "mm_vision_select_feature": "patch", + "mm_vision_select_layer": -2, + "mm_vision_tower": "google/siglip-so400m-patch14-224", + "model_name_or_path": "/cm/archive/namnv78/checkpoints/phi35-siglip224/pft", + "model_type": "llava_phi", + "moe_name": "xmoe", + "norm_softmax": false, + "normalization": false, + "num_attention_heads": 32, + "num_experts": 8, + "num_hidden_layers": 32, + "num_key_value_heads": 32, + "num_layers": 3, + "num_selected": 4, + "number_of_previous_tokens": 2, + "original_max_position_embeddings": 4096, + "pad_token_id": 32000, + "pretrain_mm_mlp_adapter": null, + "rate_compete": 0.2, + "rate_flip": 0.05, + "resid_pdrop": 0.0, + "rms_norm_eps": 1e-05, + "rope_scaling": { + "long_factor": [ + 1.0800000429153442, + 1.1100000143051147, + 1.1399999856948853, + 1.340000033378601, + 1.5899999141693115, + 1.600000023841858, + 1.6200000047683716, + 2.620000123977661, + 3.2300000190734863, + 3.2300000190734863, + 4.789999961853027, + 7.400000095367432, + 7.700000286102295, + 9.09000015258789, + 12.199999809265137, + 17.670000076293945, + 24.46000099182129, + 28.57000160217285, + 30.420001983642578, + 30.840002059936523, + 32.590003967285156, + 32.93000411987305, + 42.320003509521484, + 44.96000289916992, + 50.340003967285156, + 50.45000457763672, + 57.55000305175781, + 57.93000411987305, + 58.21000289916992, + 60.1400032043457, + 62.61000442504883, + 62.62000274658203, + 62.71000289916992, + 63.1400032043457, + 63.1400032043457, + 63.77000427246094, + 63.93000411987305, + 63.96000289916992, + 63.970001220703125, + 64.02999877929688, + 64.06999969482422, + 64.08000183105469, + 64.12000274658203, + 64.41000366210938, + 64.4800033569336, + 64.51000213623047, + 64.52999877929688, + 64.83999633789062 + ], + "short_factor": [ + 1.0, + 1.0199999809265137, + 1.0299999713897705, + 1.0299999713897705, + 1.0499999523162842, + 1.0499999523162842, + 1.0499999523162842, + 1.0499999523162842, + 1.0499999523162842, + 1.0699999332427979, + 1.0999999046325684, + 1.1099998950958252, + 1.1599998474121094, + 1.1599998474121094, + 1.1699998378753662, + 1.2899998426437378, + 1.339999794960022, + 1.679999828338623, + 1.7899998426437378, + 1.8199998140335083, + 1.8499997854232788, + 1.8799997568130493, + 1.9099997282028198, + 1.9399996995925903, + 1.9899996519088745, + 2.0199997425079346, + 2.0199997425079346, + 2.0199997425079346, + 2.0199997425079346, + 2.0199997425079346, + 2.0199997425079346, + 2.0299997329711914, + 2.0299997329711914, + 2.0299997329711914, + 2.0299997329711914, + 2.0299997329711914, + 2.0299997329711914, + 2.0299997329711914, + 2.0299997329711914, + 2.0299997329711914, + 2.0799996852874756, + 2.0899996757507324, + 2.189999580383301, + 2.2199995517730713, + 2.5899994373321533, + 2.729999542236328, + 2.749999523162842, + 2.8399994373321533 + ], + "type": "longrope" + }, + "rope_theta": 10000.0, + "router_loss_coef": 0.01, + "router_theta": 0.1, + "router_z_loss_coef": 0.001, + "scales": [ + 1, + 3 + ], + "sliding_window": 262144, + "sparse_upcycling": false, + "strategy_train": "base", + "tie_word_embeddings": false, + "tokenizer_model_max_length": 2048, + "tokenizer_padding_side": "right", + "topk_max": 2, + "topk_min": 1, + "torch_dtype": "bfloat16", + "training": true, + "transformers_version": "4.43.0", + "tune_mm_mlp_adapter": false, + "unit_test": true, + "use_cache": false, + "use_mm_proj": true, + "use_old": false, + "version": "phi35", + "vision_tower": "google/siglip-so400m-patch14-224", + "vision_tower_dir": "/cm/archive/namnv78/checkpoints/phi35-siglip224/pft/clip.bin", + "vocab_size": 32064, + "warm_up": 0.05 +} diff --git a/sft_pretrain/Full_xmoe/checkpoint-4160/generation_config.json b/sft_pretrain/Full_xmoe/checkpoint-4160/generation_config.json new file mode 100644 index 0000000000000000000000000000000000000000..dad5c4578f0dc5969b38755d095fc30c368bb54a --- /dev/null +++ b/sft_pretrain/Full_xmoe/checkpoint-4160/generation_config.json @@ -0,0 +1,12 @@ +{ + "_from_model_config": true, + "bos_token_id": 1, + "do_sample": true, + "eos_token_id": [ + 32007, + 32001, + 32000 + ], + "pad_token_id": 32000, + "transformers_version": "4.43.0" +} diff --git a/sft_pretrain/Full_xmoe/checkpoint-4160/global_step4160/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt b/sft_pretrain/Full_xmoe/checkpoint-4160/global_step4160/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..2a884648c5a3cde8069645183d3bed3773e825b8 --- /dev/null +++ b/sft_pretrain/Full_xmoe/checkpoint-4160/global_step4160/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4780225e0661331cae5ec8e5fde5924123e75bdd6a90f204770662ab6dc30210 +size 396609872 diff --git a/sft_pretrain/Full_xmoe/checkpoint-4160/global_step4160/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt b/sft_pretrain/Full_xmoe/checkpoint-4160/global_step4160/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..a5aef7c7e360194387bf31a67457af8bf6479be1 --- /dev/null +++ b/sft_pretrain/Full_xmoe/checkpoint-4160/global_step4160/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:533dd5933dc1d0a83f54d864887d799fc605ca67995a7829db4c31389b105a9b +size 396609872 diff --git a/sft_pretrain/Full_xmoe/checkpoint-4160/global_step4160/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt b/sft_pretrain/Full_xmoe/checkpoint-4160/global_step4160/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..724eac43256b8d66824d2e8851b598e5f59bbd6c --- /dev/null +++ b/sft_pretrain/Full_xmoe/checkpoint-4160/global_step4160/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:221b9305eb987108b90503b186a1318ca52bd1626744e3d302467698e475fa4d +size 396609872 diff --git a/sft_pretrain/Full_xmoe/checkpoint-4160/global_step4160/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt b/sft_pretrain/Full_xmoe/checkpoint-4160/global_step4160/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..7e6162e6ef4bf8d7bf704d6e80dead2aa03b93b2 --- /dev/null +++ b/sft_pretrain/Full_xmoe/checkpoint-4160/global_step4160/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e985578d4498a540723bf497201a0617e330c12a5e83e41141790c6032d6bfe5 +size 396609872 diff --git a/sft_pretrain/Full_xmoe/checkpoint-4160/global_step4160/zero_pp_rank_0_mp_rank_00_model_states.pt b/sft_pretrain/Full_xmoe/checkpoint-4160/global_step4160/zero_pp_rank_0_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..243a595890d55fa694d70aaaf423e83a3c748a58 --- /dev/null +++ b/sft_pretrain/Full_xmoe/checkpoint-4160/global_step4160/zero_pp_rank_0_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:222fa0c52f7eb5aa16f3d0e341d88fd3b99401dcbe97b25271db5d81605494fe +size 2117322914 diff --git a/sft_pretrain/Full_xmoe/checkpoint-4160/global_step4160/zero_pp_rank_1_mp_rank_00_model_states.pt b/sft_pretrain/Full_xmoe/checkpoint-4160/global_step4160/zero_pp_rank_1_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..c78b51848a62082743b1073c6b9513ada36ad3e7 --- /dev/null +++ b/sft_pretrain/Full_xmoe/checkpoint-4160/global_step4160/zero_pp_rank_1_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:aff7ddfe65c2e66eb7ee2035cd60a39bb28e9e46b9f821b75748c0d3e3aeafca +size 2117322914 diff --git a/sft_pretrain/Full_xmoe/checkpoint-4160/global_step4160/zero_pp_rank_2_mp_rank_00_model_states.pt b/sft_pretrain/Full_xmoe/checkpoint-4160/global_step4160/zero_pp_rank_2_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..b510057809d17b49d19df3146afd9ad2d173ef1d --- /dev/null +++ b/sft_pretrain/Full_xmoe/checkpoint-4160/global_step4160/zero_pp_rank_2_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fbf1eb6f881a633aa02af9b67e587e29e16b24189e5a9548f95c66c8ea141524 +size 2117322914 diff --git a/sft_pretrain/Full_xmoe/checkpoint-4160/global_step4160/zero_pp_rank_3_mp_rank_00_model_states.pt b/sft_pretrain/Full_xmoe/checkpoint-4160/global_step4160/zero_pp_rank_3_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..45e872ef33f45ea043f138205657d4a4053fed59 --- /dev/null +++ b/sft_pretrain/Full_xmoe/checkpoint-4160/global_step4160/zero_pp_rank_3_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:da88b1d914296a38763fe06ce2ed4269efd83d61ad4b102b53de1d5a8854d528 +size 2117322914 diff --git a/sft_pretrain/Full_xmoe/checkpoint-4160/latest b/sft_pretrain/Full_xmoe/checkpoint-4160/latest new file mode 100644 index 0000000000000000000000000000000000000000..ae01dfd535e9ee314b565695c1d61230ecf4c494 --- /dev/null +++ b/sft_pretrain/Full_xmoe/checkpoint-4160/latest @@ -0,0 +1 @@ +global_step4160 \ No newline at end of file diff --git a/sft_pretrain/Full_xmoe/checkpoint-4160/model-00001-of-00002.safetensors b/sft_pretrain/Full_xmoe/checkpoint-4160/model-00001-of-00002.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..29d76f5d80605301aab2bba59b53a5e2582094c4 --- /dev/null +++ b/sft_pretrain/Full_xmoe/checkpoint-4160/model-00001-of-00002.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fe6c4f6ef38e8993629091331e0bbf23484cc88bdfd038f0dd17b6ec2800d855 +size 4972489328 diff --git a/sft_pretrain/Full_xmoe/checkpoint-4160/model-00002-of-00002.safetensors b/sft_pretrain/Full_xmoe/checkpoint-4160/model-00002-of-00002.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..533f59273baee816eee4665d4ed672b21e91f223 --- /dev/null +++ b/sft_pretrain/Full_xmoe/checkpoint-4160/model-00002-of-00002.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8aa017dd3c3eb906f49d109f05f90acbf0b1a04209fe7f9ecdeb4dfd3eb9f97a +size 3759044016 diff --git a/sft_pretrain/Full_xmoe/checkpoint-4160/model.safetensors.index.json b/sft_pretrain/Full_xmoe/checkpoint-4160/model.safetensors.index.json new file mode 100644 index 0000000000000000000000000000000000000000..507806fb086ee2ffdb4c1df263574fc5a7cfa513 --- /dev/null +++ b/sft_pretrain/Full_xmoe/checkpoint-4160/model.safetensors.index.json @@ -0,0 +1,675 @@ +{ + "metadata": { + "total_size": 8731443248 + }, + "weight_map": { + "lm_head.weight": "model-00002-of-00002.safetensors", + "model.embed_tokens.weight": "model-00001-of-00002.safetensors", + "model.layers.0.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.0.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.0.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.0.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.0.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.0.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.1.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.1.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.1.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.1.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.1.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.1.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.10.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.10.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.10.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.10.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.10.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.10.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.11.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.11.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.11.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.11.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.11.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.11.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.12.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.12.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.12.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.12.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.12.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.12.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.13.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.13.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.13.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.13.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.13.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.13.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.14.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.14.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.14.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.14.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.14.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.14.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.15.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.15.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.15.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.15.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.15.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.15.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.16.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.16.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.16.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.16.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.16.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.16.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.17.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.17.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.17.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.17.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.17.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.17.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.18.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.18.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.18.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.18.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.18.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.18.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.19.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.19.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.19.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.19.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.19.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.19.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.2.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.2.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.2.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.2.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.2.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.2.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.20.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.20.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.20.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.20.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.20.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.20.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.21.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.21.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.21.mlp.gate_up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.21.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.21.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.21.self_attn.qkv_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.22.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.22.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.22.mlp.gate_up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.22.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.22.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.22.self_attn.qkv_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.23.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.23.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.23.mlp.gate_up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.23.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.23.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.23.self_attn.qkv_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.24.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.24.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.24.mlp.gate_up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.24.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.24.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.24.self_attn.qkv_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.25.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.25.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.25.mlp.gate_up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.25.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.25.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.25.self_attn.qkv_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.26.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.26.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.26.mlp.gate_up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.26.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.26.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.26.self_attn.qkv_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.27.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.27.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.27.mlp.gate_up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.27.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.27.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.27.self_attn.qkv_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.28.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.28.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.28.mlp.gate_up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.28.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.28.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.28.self_attn.qkv_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.29.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.29.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.29.mlp.gate_up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.29.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.29.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.29.self_attn.qkv_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.3.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.3.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.3.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.3.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.3.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.3.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.30.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.30.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.30.mlp.gate_up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.30.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.30.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.30.self_attn.qkv_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.31.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.31.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.31.mlp.gate_up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.31.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.31.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.31.self_attn.qkv_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.4.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.4.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.4.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.4.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.4.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.4.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.5.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.5.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.5.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.5.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.5.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.5.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.6.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.6.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.6.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.6.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.6.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.6.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.7.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.7.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.7.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.7.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.7.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.7.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.8.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.8.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.8.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.8.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.8.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.8.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.9.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.9.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.9.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.9.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.9.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.9.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.mm_projector.layer_norm.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.layer_norm.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.expert_embeddings": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.0.0.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.0.0.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.0.2.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.0.2.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.1.0.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.1.0.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.1.2.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.1.2.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.2.0.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.2.0.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.2.2.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.2.2.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.3.0.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.3.0.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.3.2.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.3.2.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.4.0.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.4.0.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.4.2.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.4.2.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.5.0.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.5.0.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.5.2.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.5.2.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.6.0.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.6.0.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.6.2.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.6.2.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.7.0.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.7.0.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.7.2.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.7.2.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.gate.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.gate.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.inp_reduction.weight": "model-00002-of-00002.safetensors", + "model.norm.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.embeddings.patch_embedding.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.embeddings.patch_embedding.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.embeddings.position_embedding.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.self_attn.v_proj.weight": "model-00002-of-00002.safetensors" + } +} diff --git a/sft_pretrain/Full_xmoe/checkpoint-4160/rng_state_0.pth b/sft_pretrain/Full_xmoe/checkpoint-4160/rng_state_0.pth new file mode 100644 index 0000000000000000000000000000000000000000..ef4849062bcdc8ffd2246c07673ba196a8d61a6d --- /dev/null +++ b/sft_pretrain/Full_xmoe/checkpoint-4160/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fae2114fffe9b1eea30e28bbdb4ce59046b0079ea5b8dc4682079f609d49d787 +size 14960 diff --git a/sft_pretrain/Full_xmoe/checkpoint-4160/rng_state_1.pth b/sft_pretrain/Full_xmoe/checkpoint-4160/rng_state_1.pth new file mode 100644 index 0000000000000000000000000000000000000000..2fcb2b640bc236c26aa841680d34a91240247970 --- /dev/null +++ b/sft_pretrain/Full_xmoe/checkpoint-4160/rng_state_1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d4ff5f3a53530ac868291e2667c8f824bfa1f4fa1ce880df8223a7165ef38e11 +size 14960 diff --git a/sft_pretrain/Full_xmoe/checkpoint-4160/rng_state_2.pth b/sft_pretrain/Full_xmoe/checkpoint-4160/rng_state_2.pth new file mode 100644 index 0000000000000000000000000000000000000000..00c3f989de00e6d58ca7345ae6f65fee0afcbdcd --- /dev/null +++ b/sft_pretrain/Full_xmoe/checkpoint-4160/rng_state_2.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:91f80a7779b0034e70106ba6cb0e3e686052334c20ce54453ee3977cc0219d15 +size 14960 diff --git a/sft_pretrain/Full_xmoe/checkpoint-4160/rng_state_3.pth b/sft_pretrain/Full_xmoe/checkpoint-4160/rng_state_3.pth new file mode 100644 index 0000000000000000000000000000000000000000..f289913854ee3fa52a86e282421da07d85b8a4c4 --- /dev/null +++ b/sft_pretrain/Full_xmoe/checkpoint-4160/rng_state_3.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ece3bc0d0e16c43ef245cc787cbd0d63d08d460f489c4cd52adf6501b9281a18 +size 14960 diff --git a/sft_pretrain/Full_xmoe/checkpoint-4160/special_tokens_map.json b/sft_pretrain/Full_xmoe/checkpoint-4160/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..3e4d5a5bc1cb51753cc9ae0305ece0da60052b10 --- /dev/null +++ b/sft_pretrain/Full_xmoe/checkpoint-4160/special_tokens_map.json @@ -0,0 +1,24 @@ +{ + "bos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "<|endoftext|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": "", + "unk_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } +} diff --git a/sft_pretrain/Full_xmoe/checkpoint-4160/tokenizer.model b/sft_pretrain/Full_xmoe/checkpoint-4160/tokenizer.model new file mode 100644 index 0000000000000000000000000000000000000000..6c00c742ce03c627d6cd5b795984876fa49fa899 --- /dev/null +++ b/sft_pretrain/Full_xmoe/checkpoint-4160/tokenizer.model @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9e556afd44213b6bd1be2b850ebbbd98f5481437a8021afaf58ee7fb1818d347 +size 499723 diff --git a/sft_pretrain/Full_xmoe/checkpoint-4160/tokenizer_config.json b/sft_pretrain/Full_xmoe/checkpoint-4160/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..d579bb0b91b24b214ea3c2e487e27a65017cdc4a --- /dev/null +++ b/sft_pretrain/Full_xmoe/checkpoint-4160/tokenizer_config.json @@ -0,0 +1,132 @@ +{ + "add_bos_token": false, + "add_eos_token": false, + "add_prefix_space": true, + "added_tokens_decoder": { + "0": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "1": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "2": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": false + }, + "32000": { + "content": "<|endoftext|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "32001": { + "content": "<|assistant|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "32002": { + "content": "<|placeholder1|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "32003": { + "content": "<|placeholder2|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "32004": { + "content": "<|placeholder3|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "32005": { + "content": "<|placeholder4|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "32006": { + "content": "<|system|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "32007": { + "content": "<|end|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "32008": { + "content": "<|placeholder5|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "32009": { + "content": "<|placeholder6|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "32010": { + "content": "<|user|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + } + }, + "bos_token": "", + "chat_template": "{% for message in messages %}{% if message['role'] == 'system' and message['content'] %}{{'<|system|>\n' + message['content'] + '<|end|>\n'}}{% elif message['role'] == 'user' %}{{'<|user|>\n' + message['content'] + '<|end|>\n'}}{% elif message['role'] == 'assistant' %}{{'<|assistant|>\n' + message['content'] + '<|end|>\n'}}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ '<|assistant|>\n' }}{% else %}{{ eos_token }}{% endif %}", + "clean_up_tokenization_spaces": false, + "eos_token": "<|endoftext|>", + "legacy": false, + "model_max_length": 2048, + "pad_token": "", + "padding_side": "right", + "sp_model_kwargs": {}, + "spaces_between_special_tokens": false, + "tokenizer_class": "LlamaTokenizer", + "unk_token": "", + "use_default_system_prompt": false +} diff --git a/sft_pretrain/Full_xmoe/checkpoint-4160/trainer_state.json b/sft_pretrain/Full_xmoe/checkpoint-4160/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..da4378d926caf1b225504ad134cc2a93c41b7b3f --- /dev/null +++ b/sft_pretrain/Full_xmoe/checkpoint-4160/trainer_state.json @@ -0,0 +1,62433 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.8003078106964217, + "eval_steps": 500, + "global_step": 4160, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0334678, + "balance_loss_mlp": 2.48847342, + "epoch": 0.00019238168526356292, + "flos": 471022563072.0, + "grad_norm": 15.010934477254423, + "language_loss": 2.91277003, + "learning_rate": 0.0, + "loss": 1.95375419, + "num_input_tokens_seen": 67104, + "router_z_loss_mlp": 8.6015625, + "step": 1, + "time_per_iteration": 23.313215732574463 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.03608113, + "balance_loss_mlp": 3.00043201, + "epoch": 0.00038476337052712584, + "flos": 505538830848.0, + "grad_norm": 25.821694542927546, + "language_loss": 10.7459116, + "learning_rate": 0.00013726078121135892, + "loss": 10.78199196, + "num_input_tokens_seen": 134080, + "router_z_loss_mlp": 6.06640625, + "step": 2, + "time_per_iteration": 2.6342098712921143 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.03648002, + "balance_loss_mlp": 3.03803182, + "epoch": 0.0005771450557906887, + "flos": 600334166016.0, + "grad_norm": 27.537763142134942, + "language_loss": 10.88985825, + "learning_rate": 0.00021755319103969496, + "loss": 10.9263401, + "num_input_tokens_seen": 205152, + "router_z_loss_mlp": 6.08984375, + "step": 3, + "time_per_iteration": 2.9129159450531006 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.03639085, + "balance_loss_mlp": 3.03521824, + "epoch": 0.0007695267410542517, + "flos": 581497386240.0, + "grad_norm": 10.719163482624658, + "language_loss": 8.79598808, + "learning_rate": 0.00027452156242271784, + "loss": 8.83237934, + "num_input_tokens_seen": 269664, + "router_z_loss_mlp": 6.02734375, + "step": 4, + "time_per_iteration": 2.72357439994812 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.03604871, + "balance_loss_mlp": 3.01435566, + "epoch": 0.0009619084263178145, + "flos": 487154061312.0, + "grad_norm": 22.68157363884245, + "language_loss": 9.41989708, + "learning_rate": 0.0003187096642208417, + "loss": 9.45594501, + "num_input_tokens_seen": 338560, + "router_z_loss_mlp": 5.8984375, + "step": 5, + "time_per_iteration": 2.6791844367980957 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.03472164, + "balance_loss_mlp": 2.9011035, + "epoch": 0.0011542901115813775, + "flos": 561167503872.0, + "grad_norm": 7.113488232519407, + "language_loss": 9.41725159, + "learning_rate": 0.0003548139722510539, + "loss": 9.45197296, + "num_input_tokens_seen": 410112, + "router_z_loss_mlp": 5.72265625, + "step": 6, + "time_per_iteration": 2.7308623790740967 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.03266853, + "balance_loss_mlp": 2.70799947, + "epoch": 0.0013466717968449403, + "flos": 534951738624.0, + "grad_norm": 3.189932925125429, + "language_loss": 8.01036549, + "learning_rate": 0.00038533972973918044, + "loss": 8.0430336, + "num_input_tokens_seen": 477552, + "router_z_loss_mlp": 5.59765625, + "step": 7, + "time_per_iteration": 2.6907436847686768 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02962571, + "balance_loss_mlp": 2.41211033, + "epoch": 0.0015390534821085034, + "flos": 493334485248.0, + "grad_norm": 5.13822781788523, + "language_loss": 7.84486008, + "learning_rate": 0.0004117823436340768, + "loss": 7.87448597, + "num_input_tokens_seen": 549184, + "router_z_loss_mlp": 5.51171875, + "step": 8, + "time_per_iteration": 2.6274044513702393 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02550478, + "balance_loss_mlp": 2.0114615, + "epoch": 0.0017314351673720662, + "flos": 565776090624.0, + "grad_norm": 3.8232757327488405, + "language_loss": 7.62468719, + "learning_rate": 0.00043510638207938993, + "loss": 7.65019178, + "num_input_tokens_seen": 622880, + "router_z_loss_mlp": 5.39453125, + "step": 9, + "time_per_iteration": 2.7688682079315186 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02337757, + "balance_loss_mlp": 1.81705093, + "epoch": 0.001923816852635629, + "flos": 594509521152.0, + "grad_norm": 3.0012265425900817, + "language_loss": 6.96830463, + "learning_rate": 0.00045597044543220066, + "loss": 6.99168253, + "num_input_tokens_seen": 693584, + "router_z_loss_mlp": 5.20703125, + "step": 10, + "time_per_iteration": 2.736985921859741 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02262083, + "balance_loss_mlp": 1.74290299, + "epoch": 0.002116198537899192, + "flos": 610895709696.0, + "grad_norm": 2.2728267884834983, + "language_loss": 6.92078686, + "learning_rate": 0.00047484428652143135, + "loss": 6.94340801, + "num_input_tokens_seen": 774432, + "router_z_loss_mlp": 5.19140625, + "step": 11, + "time_per_iteration": 2.8857340812683105 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02308547, + "balance_loss_mlp": 1.78135598, + "epoch": 0.002308580223162755, + "flos": 546175262976.0, + "grad_norm": 4.334726148282724, + "language_loss": 6.71077013, + "learning_rate": 0.0004920747534624128, + "loss": 6.73385572, + "num_input_tokens_seen": 844304, + "router_z_loss_mlp": 5.2734375, + "step": 12, + "time_per_iteration": 2.635601282119751 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02317905, + "balance_loss_mlp": 1.79147708, + "epoch": 0.002500961908426318, + "flos": 645924270336.0, + "grad_norm": 3.1568536142119923, + "language_loss": 6.53248501, + "learning_rate": 0.0005079252465375872, + "loss": 6.55566406, + "num_input_tokens_seen": 915104, + "router_z_loss_mlp": 5.265625, + "step": 13, + "time_per_iteration": 2.8112540245056152 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02242807, + "balance_loss_mlp": 1.72019386, + "epoch": 0.0026933435936898806, + "flos": 488849352960.0, + "grad_norm": 7.572425831928954, + "language_loss": 6.47189951, + "learning_rate": 0.0005226005109505393, + "loss": 6.49432755, + "num_input_tokens_seen": 982720, + "router_z_loss_mlp": 5.2265625, + "step": 14, + "time_per_iteration": 2.590078353881836 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02247915, + "balance_loss_mlp": 1.72415757, + "epoch": 0.0028857252789534437, + "flos": 435526429440.0, + "grad_norm": 2.3229781853457747, + "language_loss": 6.01724243, + "learning_rate": 0.0005362628552605367, + "loss": 6.03972149, + "num_input_tokens_seen": 1050528, + "router_z_loss_mlp": 5.23828125, + "step": 15, + "time_per_iteration": 2.636983871459961 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02135688, + "balance_loss_mlp": 1.62108541, + "epoch": 0.0030781069642170067, + "flos": 597841778688.0, + "grad_norm": 4.36506198708269, + "language_loss": 5.46747923, + "learning_rate": 0.0005490431248454357, + "loss": 5.48883629, + "num_input_tokens_seen": 1116512, + "router_z_loss_mlp": 5.14453125, + "step": 16, + "time_per_iteration": 2.6904103755950928 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02173305, + "balance_loss_mlp": 1.67586899, + "epoch": 0.0032704886494805694, + "flos": 1541513154048.0, + "grad_norm": 0.3693165783384919, + "language_loss": 0.75705111, + "learning_rate": 0.0005610483427624225, + "loss": 0.77878416, + "num_input_tokens_seen": 1351216, + "router_z_loss_mlp": 4.96875, + "step": 17, + "time_per_iteration": 6.815098285675049 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01958957, + "balance_loss_mlp": 1.45846832, + "epoch": 0.0034628703347441324, + "flos": 474971102976.0, + "grad_norm": 7.376330921510473, + "language_loss": 3.16160107, + "learning_rate": 0.0005723671632907488, + "loss": 3.18119049, + "num_input_tokens_seen": 1420512, + "router_z_loss_mlp": 5.0, + "step": 18, + "time_per_iteration": 2.7730185985565186 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01974299, + "balance_loss_mlp": 1.48144007, + "epoch": 0.0036552520200076955, + "flos": 449478556416.0, + "grad_norm": 2.0435067055151803, + "language_loss": 1.8205657, + "learning_rate": 0.0005830738490244919, + "loss": 1.84030867, + "num_input_tokens_seen": 1484976, + "router_z_loss_mlp": 4.921875, + "step": 19, + "time_per_iteration": 2.5196421146392822 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02215561, + "balance_loss_mlp": 1.73147547, + "epoch": 0.003847633705271258, + "flos": 637351580928.0, + "grad_norm": 2.199322832792736, + "language_loss": 1.81859815, + "learning_rate": 0.0005932312266435596, + "loss": 1.84075379, + "num_input_tokens_seen": 1557392, + "router_z_loss_mlp": 4.83203125, + "step": 20, + "time_per_iteration": 2.7772061824798584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02397049, + "balance_loss_mlp": 1.91639686, + "epoch": 0.004040015390534821, + "flos": 590591105280.0, + "grad_norm": 2.068137361611091, + "language_loss": 1.81285238, + "learning_rate": 0.0006028929207788754, + "loss": 1.83682299, + "num_input_tokens_seen": 1626064, + "router_z_loss_mlp": 4.796875, + "step": 21, + "time_per_iteration": 2.7197327613830566 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02949394, + "balance_loss_mlp": 2.47560835, + "epoch": 0.004232397075798384, + "flos": 757866929664.0, + "grad_norm": 0.9893066861855494, + "language_loss": 1.43565178, + "learning_rate": 0.0006121050677327902, + "loss": 1.46514571, + "num_input_tokens_seen": 1696528, + "router_z_loss_mlp": 4.7265625, + "step": 22, + "time_per_iteration": 2.8821635246276855 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.04240368, + "balance_loss_mlp": 3.77421188, + "epoch": 0.004424778761061947, + "flos": 527727310080.0, + "grad_norm": 1.6702760591351544, + "language_loss": 1.36044598, + "learning_rate": 0.0006209076479463684, + "loss": 1.40284979, + "num_input_tokens_seen": 1765936, + "router_z_loss_mlp": 4.6484375, + "step": 23, + "time_per_iteration": 2.6194069385528564 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.04405254, + "balance_loss_mlp": 3.93871665, + "epoch": 0.00461716044632551, + "flos": 549218815488.0, + "grad_norm": 1.6356367296774819, + "language_loss": 1.46302319, + "learning_rate": 0.0006293355346737718, + "loss": 1.50707567, + "num_input_tokens_seen": 1841632, + "router_z_loss_mlp": 4.65234375, + "step": 24, + "time_per_iteration": 2.741433620452881 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.03977472, + "balance_loss_mlp": 3.50483179, + "epoch": 0.004809542131589073, + "flos": 568752569088.0, + "grad_norm": 1.079559317914091, + "language_loss": 1.33177948, + "learning_rate": 0.0006374193284416834, + "loss": 1.37155437, + "num_input_tokens_seen": 1920256, + "router_z_loss_mlp": 4.71484375, + "step": 25, + "time_per_iteration": 2.902089834213257 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.03127712, + "balance_loss_mlp": 2.642483, + "epoch": 0.005001923816852636, + "flos": 471584410368.0, + "grad_norm": 0.4847890845471295, + "language_loss": 1.26058078, + "learning_rate": 0.0006451860277489461, + "loss": 1.29185796, + "num_input_tokens_seen": 1986528, + "router_z_loss_mlp": 4.84375, + "step": 26, + "time_per_iteration": 2.6045680046081543 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02733563, + "balance_loss_mlp": 2.23154879, + "epoch": 0.005194305502116198, + "flos": 416381502720.0, + "grad_norm": 0.2845036760864029, + "language_loss": 1.33193052, + "learning_rate": 0.0006526595731190848, + "loss": 1.35926616, + "num_input_tokens_seen": 2048016, + "router_z_loss_mlp": 5.015625, + "step": 27, + "time_per_iteration": 2.4412264823913574 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02759137, + "balance_loss_mlp": 2.2411015, + "epoch": 0.005386687187379761, + "flos": 629996894976.0, + "grad_norm": 0.34713687972437796, + "language_loss": 1.22031224, + "learning_rate": 0.0006598612921618983, + "loss": 1.24790359, + "num_input_tokens_seen": 2127664, + "router_z_loss_mlp": 5.1796875, + "step": 28, + "time_per_iteration": 2.80483078956604 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02575766, + "balance_loss_mlp": 2.05010033, + "epoch": 0.005579068872643324, + "flos": 888021326592.0, + "grad_norm": 0.3062478898066755, + "language_loss": 1.16221631, + "learning_rate": 0.0006668102665011454, + "loss": 1.18797398, + "num_input_tokens_seen": 2213952, + "router_z_loss_mlp": 5.2578125, + "step": 29, + "time_per_iteration": 3.243164300918579 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02507804, + "balance_loss_mlp": 1.97527242, + "epoch": 0.005771450557906887, + "flos": 548658902016.0, + "grad_norm": 0.22276861521731073, + "language_loss": 1.24634933, + "learning_rate": 0.0006735236364718957, + "loss": 1.27142727, + "num_input_tokens_seen": 2284736, + "router_z_loss_mlp": 5.328125, + "step": 30, + "time_per_iteration": 2.7701382637023926 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02465182, + "balance_loss_mlp": 1.93226886, + "epoch": 0.00596383224317045, + "flos": 533069809152.0, + "grad_norm": 0.21102664747409663, + "language_loss": 1.23222375, + "learning_rate": 0.0006800168558381346, + "loss": 1.25687563, + "num_input_tokens_seen": 2354384, + "router_z_loss_mlp": 5.33203125, + "step": 31, + "time_per_iteration": 2.635246515274048 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02445382, + "balance_loss_mlp": 1.91552007, + "epoch": 0.0061562139284340135, + "flos": 590163394560.0, + "grad_norm": 0.21886797396213825, + "language_loss": 1.26610851, + "learning_rate": 0.0006863039060567947, + "loss": 1.29056239, + "num_input_tokens_seen": 2419440, + "router_z_loss_mlp": 5.30078125, + "step": 32, + "time_per_iteration": 2.7791683673858643 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02338603, + "balance_loss_mlp": 1.80950415, + "epoch": 0.006348595613697576, + "flos": 619442154240.0, + "grad_norm": 0.18971916612404452, + "language_loss": 1.17543316, + "learning_rate": 0.0006923974775611263, + "loss": 1.19881916, + "num_input_tokens_seen": 2496368, + "router_z_loss_mlp": 5.29296875, + "step": 33, + "time_per_iteration": 2.836601495742798 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02160521, + "balance_loss_mlp": 1.64134097, + "epoch": 0.006540977298961139, + "flos": 779300109312.0, + "grad_norm": 0.13369632510289112, + "language_loss": 1.13907146, + "learning_rate": 0.0006983091239737814, + "loss": 1.16067672, + "num_input_tokens_seen": 2573280, + "router_z_loss_mlp": 5.19140625, + "step": 34, + "time_per_iteration": 3.021479606628418 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0221033, + "balance_loss_mlp": 1.69649041, + "epoch": 0.006733358984224702, + "flos": 668373264384.0, + "grad_norm": 0.11522706717853448, + "language_loss": 1.11973858, + "learning_rate": 0.0007040493939600222, + "loss": 1.14184177, + "num_input_tokens_seen": 2647248, + "router_z_loss_mlp": 5.13671875, + "step": 35, + "time_per_iteration": 2.9400346279144287 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0227657, + "balance_loss_mlp": 1.76997864, + "epoch": 0.006925740669488265, + "flos": 565496133888.0, + "grad_norm": 0.11143421895921844, + "language_loss": 1.12295914, + "learning_rate": 0.0007096279445021078, + "loss": 1.14572477, + "num_input_tokens_seen": 2720736, + "router_z_loss_mlp": 5.0625, + "step": 36, + "time_per_iteration": 2.698153495788574 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02284885, + "balance_loss_mlp": 1.78668559, + "epoch": 0.007118122354751828, + "flos": 551112405504.0, + "grad_norm": 0.11733654674395574, + "language_loss": 1.1734066, + "learning_rate": 0.0007150536386503726, + "loss": 1.19625545, + "num_input_tokens_seen": 2800336, + "router_z_loss_mlp": 4.9765625, + "step": 37, + "time_per_iteration": 2.8579084873199463 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02268399, + "balance_loss_mlp": 1.77782845, + "epoch": 0.007310504040015391, + "flos": 703814951424.0, + "grad_norm": 0.14208952684155102, + "language_loss": 1.10088778, + "learning_rate": 0.0007203346302358509, + "loss": 1.12357187, + "num_input_tokens_seen": 2883184, + "router_z_loss_mlp": 4.8984375, + "step": 38, + "time_per_iteration": 2.928835391998291 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02220017, + "balance_loss_mlp": 1.73555112, + "epoch": 0.007502885725278953, + "flos": 600501361920.0, + "grad_norm": 0.142042154575746, + "language_loss": 1.15486813, + "learning_rate": 0.000725478437577282, + "loss": 1.17706823, + "num_input_tokens_seen": 2960736, + "router_z_loss_mlp": 4.8359375, + "step": 39, + "time_per_iteration": 2.8706436157226562 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0209897, + "balance_loss_mlp": 1.62251425, + "epoch": 0.007695267410542516, + "flos": 561428018688.0, + "grad_norm": 0.13255726845543458, + "language_loss": 1.10233212, + "learning_rate": 0.0007304920078549186, + "loss": 1.12332189, + "num_input_tokens_seen": 3033472, + "router_z_loss_mlp": 4.75390625, + "step": 40, + "time_per_iteration": 2.6895179748535156 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01939831, + "balance_loss_mlp": 1.46986008, + "epoch": 0.007887649095806078, + "flos": 509231725056.0, + "grad_norm": 0.11166218824526469, + "language_loss": 1.12161303, + "learning_rate": 0.0007353817735343603, + "loss": 1.14101124, + "num_input_tokens_seen": 3107824, + "router_z_loss_mlp": 4.6875, + "step": 41, + "time_per_iteration": 2.709167957305908 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0184399, + "balance_loss_mlp": 1.3778342, + "epoch": 0.008080030781069641, + "flos": 504905040384.0, + "grad_norm": 0.06254207778511488, + "language_loss": 1.07663667, + "learning_rate": 0.0007401537019902344, + "loss": 1.09507656, + "num_input_tokens_seen": 3176528, + "router_z_loss_mlp": 4.6484375, + "step": 42, + "time_per_iteration": 2.5947837829589844 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01789021, + "balance_loss_mlp": 1.32896876, + "epoch": 0.008272412466333205, + "flos": 519106988544.0, + "grad_norm": 0.07012531219711775, + "language_loss": 1.09992051, + "learning_rate": 0.0007448133392900729, + "loss": 1.11781073, + "num_input_tokens_seen": 3254256, + "router_z_loss_mlp": 4.5859375, + "step": 43, + "time_per_iteration": 2.6997878551483154 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01787217, + "balance_loss_mlp": 1.32983518, + "epoch": 0.008464794151596768, + "flos": 609184866816.0, + "grad_norm": 0.09276066699658307, + "language_loss": 1.05755496, + "learning_rate": 0.0007493658489441491, + "loss": 1.07542706, + "num_input_tokens_seen": 3340224, + "router_z_loss_mlp": 4.56640625, + "step": 44, + "time_per_iteration": 2.8852477073669434 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0177156, + "balance_loss_mlp": 1.31913674, + "epoch": 0.00865717583686033, + "flos": 539007214848.0, + "grad_norm": 0.11478380715178954, + "language_loss": 1.09959674, + "learning_rate": 0.0007538160463002316, + "loss": 1.11731243, + "num_input_tokens_seen": 3409216, + "router_z_loss_mlp": 4.53125, + "step": 45, + "time_per_iteration": 2.685568332672119 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01802016, + "balance_loss_mlp": 1.35378933, + "epoch": 0.008849557522123894, + "flos": 509010094080.0, + "grad_norm": 0.14537339285711792, + "language_loss": 1.13533509, + "learning_rate": 0.0007581684291577274, + "loss": 1.15335524, + "num_input_tokens_seen": 3478352, + "router_z_loss_mlp": 4.49609375, + "step": 46, + "time_per_iteration": 2.5798568725585938 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01764716, + "balance_loss_mlp": 1.31915987, + "epoch": 0.009041939207387457, + "flos": 626508135168.0, + "grad_norm": 0.13285081251714825, + "language_loss": 1.15270185, + "learning_rate": 0.0007624272050891776, + "loss": 1.17034888, + "num_input_tokens_seen": 3555616, + "router_z_loss_mlp": 4.46875, + "step": 47, + "time_per_iteration": 2.822632312774658 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0175788, + "balance_loss_mlp": 1.31461263, + "epoch": 0.00923432089265102, + "flos": 550610817792.0, + "grad_norm": 0.11934546954286276, + "language_loss": 1.04916859, + "learning_rate": 0.0007665963158851307, + "loss": 1.06674731, + "num_input_tokens_seen": 3634512, + "router_z_loss_mlp": 4.4453125, + "step": 48, + "time_per_iteration": 2.7924864292144775 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01741735, + "balance_loss_mlp": 1.29846764, + "epoch": 0.009426702577914583, + "flos": 563679333120.0, + "grad_norm": 0.08548395668661983, + "language_loss": 1.13647461, + "learning_rate": 0.0007706794594783609, + "loss": 1.15389204, + "num_input_tokens_seen": 3708480, + "router_z_loss_mlp": 4.4453125, + "step": 49, + "time_per_iteration": 2.734813928604126 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01727457, + "balance_loss_mlp": 1.28838515, + "epoch": 0.009619084263178146, + "flos": 617926697472.0, + "grad_norm": 0.06892583067190382, + "language_loss": 1.12110853, + "learning_rate": 0.0007746801096530423, + "loss": 1.13838315, + "num_input_tokens_seen": 3783472, + "router_z_loss_mlp": 4.40234375, + "step": 50, + "time_per_iteration": 2.7447421550750732 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01719129, + "balance_loss_mlp": 1.28043914, + "epoch": 0.009811465948441709, + "flos": 542489171712.0, + "grad_norm": 0.04778558244894799, + "language_loss": 1.16797209, + "learning_rate": 0.0007786015338021173, + "loss": 1.1851635, + "num_input_tokens_seen": 3851360, + "router_z_loss_mlp": 4.3984375, + "step": 51, + "time_per_iteration": 2.65645694732666 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01722789, + "balance_loss_mlp": 1.28562462, + "epoch": 0.010003847633705272, + "flos": 536977531392.0, + "grad_norm": 0.06217135289779639, + "language_loss": 1.09074998, + "learning_rate": 0.0007824468089603051, + "loss": 1.10797799, + "num_input_tokens_seen": 3923056, + "router_z_loss_mlp": 4.3828125, + "step": 52, + "time_per_iteration": 2.7218713760375977 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01697539, + "balance_loss_mlp": 1.26380801, + "epoch": 0.010196229318968833, + "flos": 910806657792.0, + "grad_norm": 0.04206474108062499, + "language_loss": 1.08130515, + "learning_rate": 0.0007862188363098669, + "loss": 1.09828055, + "num_input_tokens_seen": 4004528, + "router_z_loss_mlp": 4.34765625, + "step": 53, + "time_per_iteration": 3.149973154067993 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01668333, + "balance_loss_mlp": 1.23765349, + "epoch": 0.010388611004232396, + "flos": 586970142720.0, + "grad_norm": 0.050634309517598654, + "language_loss": 1.08688021, + "learning_rate": 0.0007899203543304438, + "loss": 1.10356343, + "num_input_tokens_seen": 4078704, + "router_z_loss_mlp": 4.31640625, + "step": 54, + "time_per_iteration": 2.7033088207244873 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01691162, + "balance_loss_mlp": 1.26315343, + "epoch": 0.01058099268949596, + "flos": 503472208896.0, + "grad_norm": 0.06464656169002964, + "language_loss": 1.22991037, + "learning_rate": 0.0007935539507422731, + "loss": 1.246822, + "num_input_tokens_seen": 4143600, + "router_z_loss_mlp": 4.2890625, + "step": 55, + "time_per_iteration": 2.601745843887329 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.017059, + "balance_loss_mlp": 1.28017938, + "epoch": 0.010773374374759523, + "flos": 545558969088.0, + "grad_norm": 0.06403483907250343, + "language_loss": 1.12561536, + "learning_rate": 0.0007971220733732573, + "loss": 1.14267421, + "num_input_tokens_seen": 4217904, + "router_z_loss_mlp": 4.265625, + "step": 56, + "time_per_iteration": 2.677314281463623 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0169453, + "balance_loss_mlp": 1.27262425, + "epoch": 0.010965756060023086, + "flos": 527286960384.0, + "grad_norm": 0.061369678053330295, + "language_loss": 1.07931721, + "learning_rate": 0.0008006270400641869, + "loss": 1.09626245, + "num_input_tokens_seen": 4293920, + "router_z_loss_mlp": 4.2265625, + "step": 57, + "time_per_iteration": 2.7162468433380127 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01699229, + "balance_loss_mlp": 1.27846837, + "epoch": 0.011158137745286649, + "flos": 578098054656.0, + "grad_norm": 0.06126094216688289, + "language_loss": 1.08923888, + "learning_rate": 0.0008040710477125043, + "loss": 1.10623109, + "num_input_tokens_seen": 4370080, + "router_z_loss_mlp": 4.21484375, + "step": 58, + "time_per_iteration": 2.724116563796997 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01648065, + "balance_loss_mlp": 1.23150039, + "epoch": 0.011350519430550212, + "flos": 530314961664.0, + "grad_norm": 0.059594432794803906, + "language_loss": 1.09501219, + "learning_rate": 0.0008074561805429771, + "loss": 1.11149275, + "num_input_tokens_seen": 4439792, + "router_z_loss_mlp": 4.171875, + "step": 59, + "time_per_iteration": 2.613821268081665 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01628698, + "balance_loss_mlp": 1.21594822, + "epoch": 0.011542901115813775, + "flos": 556971076608.0, + "grad_norm": 0.046387810099464834, + "language_loss": 1.0703913, + "learning_rate": 0.0008107844176832545, + "loss": 1.08667827, + "num_input_tokens_seen": 4510800, + "router_z_loss_mlp": 4.1328125, + "step": 60, + "time_per_iteration": 2.6809566020965576 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01602811, + "balance_loss_mlp": 1.19349384, + "epoch": 0.011735282801077338, + "flos": 573176463360.0, + "grad_norm": 0.036957475185327084, + "language_loss": 1.08104563, + "learning_rate": 0.0008140576401132568, + "loss": 1.09707379, + "num_input_tokens_seen": 4581136, + "router_z_loss_mlp": 4.09765625, + "step": 61, + "time_per_iteration": 2.644085645675659 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01596506, + "balance_loss_mlp": 1.19024038, + "epoch": 0.0119276644863409, + "flos": 616717442304.0, + "grad_norm": 0.034032461682055544, + "language_loss": 1.09685671, + "learning_rate": 0.0008172776370494935, + "loss": 1.11282182, + "num_input_tokens_seen": 4650352, + "router_z_loss_mlp": 4.06640625, + "step": 62, + "time_per_iteration": 2.7589328289031982 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01605764, + "balance_loss_mlp": 1.20255029, + "epoch": 0.012120046171604464, + "flos": 502085064192.0, + "grad_norm": 0.035968497482949544, + "language_loss": 1.17104983, + "learning_rate": 0.0008204461118185703, + "loss": 1.18710756, + "num_input_tokens_seen": 4716336, + "router_z_loss_mlp": 4.03515625, + "step": 63, + "time_per_iteration": 2.594369411468506 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01603311, + "balance_loss_mlp": 1.20353031, + "epoch": 0.012312427856868027, + "flos": 474302319360.0, + "grad_norm": 0.04911792883083492, + "language_loss": 1.06295228, + "learning_rate": 0.0008235646872681536, + "loss": 1.07898545, + "num_input_tokens_seen": 4781648, + "router_z_loss_mlp": 3.99609375, + "step": 64, + "time_per_iteration": 2.5651702880859375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01599528, + "balance_loss_mlp": 1.20279896, + "epoch": 0.012504809542131588, + "flos": 539471864064.0, + "grad_norm": 0.049725750424410776, + "language_loss": 1.06296277, + "learning_rate": 0.0008266349107584288, + "loss": 1.07895803, + "num_input_tokens_seen": 4852320, + "router_z_loss_mlp": 3.95898438, + "step": 65, + "time_per_iteration": 2.6876485347747803 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01596697, + "balance_loss_mlp": 1.20492756, + "epoch": 0.012697191227395151, + "flos": 609857541120.0, + "grad_norm": 0.056540756097456804, + "language_loss": 1.08585978, + "learning_rate": 0.0008296582587724851, + "loss": 1.10182667, + "num_input_tokens_seen": 4922016, + "router_z_loss_mlp": 3.91210938, + "step": 66, + "time_per_iteration": 2.71223783493042 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01587883, + "balance_loss_mlp": 1.19821179, + "epoch": 0.012889572912658714, + "flos": 769398600960.0, + "grad_norm": 0.04465917834699911, + "language_loss": 1.0627861, + "learning_rate": 0.0008326361411800136, + "loss": 1.07866502, + "num_input_tokens_seen": 5000128, + "router_z_loss_mlp": 3.89648438, + "step": 67, + "time_per_iteration": 2.9413115978240967 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01577237, + "balance_loss_mlp": 1.19099891, + "epoch": 0.013081954597922277, + "flos": 535021724928.0, + "grad_norm": 0.05343660826588632, + "language_loss": 1.06744349, + "learning_rate": 0.0008355699051851403, + "loss": 1.08321595, + "num_input_tokens_seen": 5074512, + "router_z_loss_mlp": 3.86132812, + "step": 68, + "time_per_iteration": 2.726212501525879 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0157129, + "balance_loss_mlp": 1.18829489, + "epoch": 0.01327433628318584, + "flos": 574181584128.0, + "grad_norm": 0.041490887209285586, + "language_loss": 1.14052749, + "learning_rate": 0.0008384608389860635, + "loss": 1.15624034, + "num_input_tokens_seen": 5141856, + "router_z_loss_mlp": 3.828125, + "step": 69, + "time_per_iteration": 2.6679208278656006 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0156381, + "balance_loss_mlp": 1.18386579, + "epoch": 0.013466717968449404, + "flos": 498259967232.0, + "grad_norm": 0.03618836919088814, + "language_loss": 1.04182374, + "learning_rate": 0.000841310175171381, + "loss": 1.05746174, + "num_input_tokens_seen": 5209280, + "router_z_loss_mlp": 3.796875, + "step": 70, + "time_per_iteration": 2.6277127265930176 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01563963, + "balance_loss_mlp": 1.18592632, + "epoch": 0.013659099653712967, + "flos": 566622763776.0, + "grad_norm": 0.04320101591589407, + "language_loss": 1.02295327, + "learning_rate": 0.000844119093875517, + "loss": 1.03859293, + "num_input_tokens_seen": 5285424, + "router_z_loss_mlp": 3.77734375, + "step": 71, + "time_per_iteration": 2.7236883640289307 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01558639, + "balance_loss_mlp": 1.18403625, + "epoch": 0.01385148133897653, + "flos": 574943686656.0, + "grad_norm": 0.03416580025853519, + "language_loss": 1.06855714, + "learning_rate": 0.0008468887257134666, + "loss": 1.08414352, + "num_input_tokens_seen": 5358624, + "router_z_loss_mlp": 3.7421875, + "step": 72, + "time_per_iteration": 2.6696412563323975 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01558456, + "balance_loss_mlp": 1.18499684, + "epoch": 0.014043863024240093, + "flos": 577959048960.0, + "grad_norm": 0.037886537215891476, + "language_loss": 1.09368944, + "learning_rate": 0.0008496201545131264, + "loss": 1.10927403, + "num_input_tokens_seen": 5429792, + "router_z_loss_mlp": 3.73046875, + "step": 73, + "time_per_iteration": 2.701594591140747 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01545785, + "balance_loss_mlp": 1.17575896, + "epoch": 0.014236244709503656, + "flos": 940265252352.0, + "grad_norm": 0.04766211184506119, + "language_loss": 1.07240248, + "learning_rate": 0.0008523144198617317, + "loss": 1.08786011, + "num_input_tokens_seen": 5518608, + "router_z_loss_mlp": 3.6953125, + "step": 74, + "time_per_iteration": 3.1882145404815674 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01551426, + "balance_loss_mlp": 1.18387985, + "epoch": 0.014428626394767219, + "flos": 529496478720.0, + "grad_norm": 0.031986864242930464, + "language_loss": 1.06216824, + "learning_rate": 0.0008549725194813783, + "loss": 1.0776825, + "num_input_tokens_seen": 5590576, + "router_z_loss_mlp": 3.66992188, + "step": 75, + "time_per_iteration": 2.666274309158325 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01546662, + "balance_loss_mlp": 1.18102288, + "epoch": 0.014621008080030782, + "flos": 805283549952.0, + "grad_norm": 0.03321604497436844, + "language_loss": 1.05779314, + "learning_rate": 0.0008575954114472099, + "loss": 1.07325983, + "num_input_tokens_seen": 5674224, + "router_z_loss_mlp": 3.65039062, + "step": 76, + "time_per_iteration": 3.1192731857299805 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01547179, + "balance_loss_mlp": 1.18478322, + "epoch": 0.014813389765294343, + "flos": 698357746176.0, + "grad_norm": 0.03477979781895141, + "language_loss": 1.02737951, + "learning_rate": 0.0008601840162606118, + "loss": 1.04285145, + "num_input_tokens_seen": 5757648, + "router_z_loss_mlp": 3.6171875, + "step": 77, + "time_per_iteration": 3.0015783309936523 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01547226, + "balance_loss_mlp": 1.18788171, + "epoch": 0.015005771450557906, + "flos": 598165476864.0, + "grad_norm": 0.032631512960834254, + "language_loss": 1.09477437, + "learning_rate": 0.000862739218788641, + "loss": 1.11024666, + "num_input_tokens_seen": 5837600, + "router_z_loss_mlp": 3.58984375, + "step": 78, + "time_per_iteration": 2.790245771408081 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01536731, + "balance_loss_mlp": 1.18177319, + "epoch": 0.01519815313582147, + "flos": 550493199360.0, + "grad_norm": 0.0308447873241268, + "language_loss": 1.07131243, + "learning_rate": 0.0008652618700799138, + "loss": 1.0866797, + "num_input_tokens_seen": 5907248, + "router_z_loss_mlp": 3.55664062, + "step": 79, + "time_per_iteration": 2.6302430629730225 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01532812, + "balance_loss_mlp": 1.18033433, + "epoch": 0.015390534821085032, + "flos": 431440817664.0, + "grad_norm": 0.04595099678969376, + "language_loss": 1.06556606, + "learning_rate": 0.0008677527890662774, + "loss": 1.08089423, + "num_input_tokens_seen": 5970864, + "router_z_loss_mlp": 3.53125, + "step": 80, + "time_per_iteration": 2.4970459938049316 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01520539, + "balance_loss_mlp": 1.17130363, + "epoch": 0.015582916506348595, + "flos": 525185345280.0, + "grad_norm": 0.030530536654869142, + "language_loss": 1.07461143, + "learning_rate": 0.0008702127641587799, + "loss": 1.08981681, + "num_input_tokens_seen": 6040800, + "router_z_loss_mlp": 3.49804688, + "step": 81, + "time_per_iteration": 2.6258630752563477 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01512144, + "balance_loss_mlp": 1.16500628, + "epoch": 0.015775298191612157, + "flos": 576617591040.0, + "grad_norm": 0.026948447424875538, + "language_loss": 1.02672768, + "learning_rate": 0.0008726425547457192, + "loss": 1.04184914, + "num_input_tokens_seen": 6111840, + "router_z_loss_mlp": 3.4765625, + "step": 82, + "time_per_iteration": 2.7344956398010254 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01517079, + "balance_loss_mlp": 1.17375636, + "epoch": 0.01596767987687572, + "flos": 611440071936.0, + "grad_norm": 0.03479426421062965, + "language_loss": 1.02940345, + "learning_rate": 0.0008750428925998964, + "loss": 1.04457426, + "num_input_tokens_seen": 6183872, + "router_z_loss_mlp": 3.4375, + "step": 83, + "time_per_iteration": 2.738685369491577 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01509349, + "balance_loss_mlp": 1.16850555, + "epoch": 0.016160061562139283, + "flos": 568233484800.0, + "grad_norm": 0.05178756375238081, + "language_loss": 1.08039558, + "learning_rate": 0.0008774144832015932, + "loss": 1.09548914, + "num_input_tokens_seen": 6255760, + "router_z_loss_mlp": 3.41210938, + "step": 84, + "time_per_iteration": 2.6948299407958984 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02575775, + "balance_loss_mlp": 2.26144409, + "epoch": 0.016352443247402846, + "flos": 1414502431488.0, + "grad_norm": 0.37456313977874084, + "language_loss": 0.74774313, + "learning_rate": 0.0008797580069832641, + "loss": 0.7735008, + "num_input_tokens_seen": 6472960, + "router_z_loss_mlp": 3.140625, + "step": 85, + "time_per_iteration": 4.596364974975586 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01517697, + "balance_loss_mlp": 1.17895198, + "epoch": 0.01654482493266641, + "flos": 731786279424.0, + "grad_norm": 0.04138572693056026, + "language_loss": 1.03059626, + "learning_rate": 0.0008820741205014318, + "loss": 1.04577315, + "num_input_tokens_seen": 6548912, + "router_z_loss_mlp": 3.390625, + "step": 86, + "time_per_iteration": 2.901047706604004 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01566516, + "balance_loss_mlp": 1.22757995, + "epoch": 0.016737206617929972, + "flos": 537405242112.0, + "grad_norm": 0.0588613682629828, + "language_loss": 1.04849172, + "learning_rate": 0.0008843634575408404, + "loss": 1.06415701, + "num_input_tokens_seen": 6621520, + "router_z_loss_mlp": 3.39257812, + "step": 87, + "time_per_iteration": 2.6739823818206787 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01583525, + "balance_loss_mlp": 1.24497032, + "epoch": 0.016929588303193535, + "flos": 538130406144.0, + "grad_norm": 0.09131872689500015, + "language_loss": 1.06101418, + "learning_rate": 0.0008866266301555082, + "loss": 1.07684946, + "num_input_tokens_seen": 6698432, + "router_z_loss_mlp": 3.38867188, + "step": 88, + "time_per_iteration": 2.741093635559082 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0156783, + "balance_loss_mlp": 1.23118281, + "epoch": 0.017121969988457098, + "flos": 527792438784.0, + "grad_norm": 0.07103005743700296, + "language_loss": 1.07027078, + "learning_rate": 0.0008888642296509615, + "loss": 1.08594918, + "num_input_tokens_seen": 6764336, + "router_z_loss_mlp": 3.36914062, + "step": 89, + "time_per_iteration": 2.622267007827759 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01554346, + "balance_loss_mlp": 1.2196058, + "epoch": 0.01731435167372066, + "flos": 626768649984.0, + "grad_norm": 0.057543283798364535, + "language_loss": 1.11941445, + "learning_rate": 0.0008910768275115906, + "loss": 1.13495779, + "num_input_tokens_seen": 6839392, + "router_z_loss_mlp": 3.34960938, + "step": 90, + "time_per_iteration": 2.778939962387085 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01536545, + "balance_loss_mlp": 1.20409441, + "epoch": 0.017506733358984224, + "flos": 497385103872.0, + "grad_norm": 0.06951140803051024, + "language_loss": 1.07318401, + "learning_rate": 0.0008932649762767675, + "loss": 1.08854938, + "num_input_tokens_seen": 6907344, + "router_z_loss_mlp": 3.32617188, + "step": 91, + "time_per_iteration": 2.5841660499572754 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01529864, + "balance_loss_mlp": 1.20122755, + "epoch": 0.017699115044247787, + "flos": 747218870016.0, + "grad_norm": 0.037985069994816135, + "language_loss": 1.10022223, + "learning_rate": 0.0008954292103690864, + "loss": 1.11552095, + "num_input_tokens_seen": 6982464, + "router_z_loss_mlp": 3.28710938, + "step": 92, + "time_per_iteration": 2.976200580596924 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01525091, + "balance_loss_mlp": 1.19893408, + "epoch": 0.01789149672951135, + "flos": 516521282304.0, + "grad_norm": 0.05507041657686672, + "language_loss": 1.1172272, + "learning_rate": 0.0008975700468778296, + "loss": 1.13247812, + "num_input_tokens_seen": 7049712, + "router_z_loss_mlp": 3.26171875, + "step": 93, + "time_per_iteration": 2.5778274536132812 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01518338, + "balance_loss_mlp": 1.19427943, + "epoch": 0.018083878414774913, + "flos": 587230657536.0, + "grad_norm": 0.047907590915393955, + "language_loss": 1.05762661, + "learning_rate": 0.0008996879863005366, + "loss": 1.07280993, + "num_input_tokens_seen": 7120288, + "router_z_loss_mlp": 3.24023438, + "step": 94, + "time_per_iteration": 2.6827101707458496 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01506508, + "balance_loss_mlp": 1.18664575, + "epoch": 0.018276260100038477, + "flos": 498370782720.0, + "grad_norm": 0.03950158468897577, + "language_loss": 1.05640411, + "learning_rate": 0.0009017835132453337, + "loss": 1.07146931, + "num_input_tokens_seen": 7188896, + "router_z_loss_mlp": 3.19726562, + "step": 95, + "time_per_iteration": 2.5879104137420654 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01488471, + "balance_loss_mlp": 1.17223215, + "epoch": 0.01846864178530204, + "flos": 641233058304.0, + "grad_norm": 0.042611409633865054, + "language_loss": 1.05607677, + "learning_rate": 0.0009038570970964896, + "loss": 1.07096148, + "num_input_tokens_seen": 7259536, + "router_z_loss_mlp": 3.16015625, + "step": 96, + "time_per_iteration": 2.761634349822998 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01487316, + "balance_loss_mlp": 1.17374837, + "epoch": 0.018661023470565603, + "flos": 512667995136.0, + "grad_norm": 0.026597294022958493, + "language_loss": 1.02809072, + "learning_rate": 0.0009059091926454854, + "loss": 1.04296374, + "num_input_tokens_seen": 7326752, + "router_z_loss_mlp": 3.1328125, + "step": 97, + "time_per_iteration": 2.602036952972412 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01487556, + "balance_loss_mlp": 1.17742097, + "epoch": 0.018853405155829166, + "flos": 932697683712.0, + "grad_norm": 0.04097414840704221, + "language_loss": 1.01764143, + "learning_rate": 0.0009079402406897198, + "loss": 1.03251696, + "num_input_tokens_seen": 7417488, + "router_z_loss_mlp": 3.09765625, + "step": 98, + "time_per_iteration": 3.2514705657958984 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01483888, + "balance_loss_mlp": 1.17642295, + "epoch": 0.01904578684109273, + "flos": 577587718656.0, + "grad_norm": 0.027217181555243938, + "language_loss": 1.03385735, + "learning_rate": 0.0009099506686008212, + "loss": 1.04869628, + "num_input_tokens_seen": 7493136, + "router_z_loss_mlp": 3.0703125, + "step": 99, + "time_per_iteration": 2.7867672443389893 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01473245, + "balance_loss_mlp": 1.16883183, + "epoch": 0.019238168526356292, + "flos": 559521789696.0, + "grad_norm": 0.02943095981266107, + "language_loss": 1.06245995, + "learning_rate": 0.0009119408908644013, + "loss": 1.07719231, + "num_input_tokens_seen": 7560896, + "router_z_loss_mlp": 3.0390625, + "step": 100, + "time_per_iteration": 2.718982219696045 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01466855, + "balance_loss_mlp": 1.164922, + "epoch": 0.019430550211619855, + "flos": 725104267776.0, + "grad_norm": 0.035830377247789626, + "language_loss": 1.12020779, + "learning_rate": 0.0009139113095929519, + "loss": 1.13487625, + "num_input_tokens_seen": 7629040, + "router_z_loss_mlp": 3.01367188, + "step": 101, + "time_per_iteration": 2.9023444652557373 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0146708, + "balance_loss_mlp": 1.16781712, + "epoch": 0.019622931896883418, + "flos": 500456846592.0, + "grad_norm": 0.031534744220975436, + "language_loss": 1.0658195, + "learning_rate": 0.0009158623150134762, + "loss": 1.08049035, + "num_input_tokens_seen": 7694256, + "router_z_loss_mlp": 2.98632812, + "step": 102, + "time_per_iteration": 2.5731325149536133 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01479653, + "balance_loss_mlp": 1.1828692, + "epoch": 0.01981531358214698, + "flos": 510282532608.0, + "grad_norm": 0.0334583858191085, + "language_loss": 1.05968487, + "learning_rate": 0.000917794285931332, + "loss": 1.07448149, + "num_input_tokens_seen": 7762256, + "router_z_loss_mlp": 2.9609375, + "step": 103, + "time_per_iteration": 2.656132221221924 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01477107, + "balance_loss_mlp": 1.18184972, + "epoch": 0.020007695267410544, + "flos": 522393559296.0, + "grad_norm": 0.033386157220771755, + "language_loss": 0.97816026, + "learning_rate": 0.0009197075901716639, + "loss": 0.99293131, + "num_input_tokens_seen": 7834400, + "router_z_loss_mlp": 2.9453125, + "step": 104, + "time_per_iteration": 2.7207133769989014 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01472947, + "balance_loss_mlp": 1.1811223, + "epoch": 0.020200076952674107, + "flos": 534444314880.0, + "grad_norm": 0.03432724584635873, + "language_loss": 1.08410704, + "learning_rate": 0.0009216025849997171, + "loss": 1.09883642, + "num_input_tokens_seen": 7911184, + "router_z_loss_mlp": 2.92382812, + "step": 105, + "time_per_iteration": 2.783440113067627 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01461838, + "balance_loss_mlp": 1.17115784, + "epoch": 0.020392458637937667, + "flos": 686083414272.0, + "grad_norm": 0.04360543496830388, + "language_loss": 1.02907205, + "learning_rate": 0.0009234796175212258, + "loss": 1.04369044, + "num_input_tokens_seen": 7985280, + "router_z_loss_mlp": 2.9140625, + "step": 106, + "time_per_iteration": 2.914760112762451 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01450941, + "balance_loss_mlp": 1.1615957, + "epoch": 0.02058484032320123, + "flos": 703415430912.0, + "grad_norm": 0.03266429542390293, + "language_loss": 1.06572628, + "learning_rate": 0.000925339025064007, + "loss": 1.08023572, + "num_input_tokens_seen": 8068320, + "router_z_loss_mlp": 2.90039062, + "step": 107, + "time_per_iteration": 2.951838254928589 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01453976, + "balance_loss_mlp": 1.16558492, + "epoch": 0.020777222008464793, + "flos": 640328059392.0, + "grad_norm": 0.03192051704400644, + "language_loss": 0.99516582, + "learning_rate": 0.0009271811355418027, + "loss": 1.00970554, + "num_input_tokens_seen": 8148144, + "router_z_loss_mlp": 2.890625, + "step": 108, + "time_per_iteration": 2.897881507873535 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01449031, + "balance_loss_mlp": 1.16159379, + "epoch": 0.020969603693728356, + "flos": 683321763840.0, + "grad_norm": 0.04466737388011785, + "language_loss": 1.06219566, + "learning_rate": 0.0009290062678013548, + "loss": 1.07668602, + "num_input_tokens_seen": 8222256, + "router_z_loss_mlp": 2.88085938, + "step": 109, + "time_per_iteration": 2.8423218727111816 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01430047, + "balance_loss_mlp": 1.14413536, + "epoch": 0.02116198537899192, + "flos": 534420015360.0, + "grad_norm": 0.034258615277409615, + "language_loss": 1.04797208, + "learning_rate": 0.0009308147319536321, + "loss": 1.06227255, + "num_input_tokens_seen": 8292432, + "router_z_loss_mlp": 2.86523438, + "step": 110, + "time_per_iteration": 2.6316323280334473 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01425495, + "balance_loss_mlp": 1.14053667, + "epoch": 0.021354367064255482, + "flos": 718728457728.0, + "grad_norm": 0.048864006828935096, + "language_loss": 1.11352324, + "learning_rate": 0.0009326068296900676, + "loss": 1.12777817, + "num_input_tokens_seen": 8365024, + "router_z_loss_mlp": 2.85546875, + "step": 111, + "time_per_iteration": 2.8313205242156982 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01416698, + "balance_loss_mlp": 1.13269377, + "epoch": 0.021546748749519045, + "flos": 520624390656.0, + "grad_norm": 0.040751650479700946, + "language_loss": 1.01643181, + "learning_rate": 0.0009343828545846161, + "loss": 1.03059864, + "num_input_tokens_seen": 8442448, + "router_z_loss_mlp": 2.84570312, + "step": 112, + "time_per_iteration": 2.7729175090789795 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01401674, + "balance_loss_mlp": 1.11805177, + "epoch": 0.021739130434782608, + "flos": 506161927680.0, + "grad_norm": 0.042106341000359294, + "language_loss": 1.06266427, + "learning_rate": 0.0009361430923823841, + "loss": 1.07668102, + "num_input_tokens_seen": 8508992, + "router_z_loss_mlp": 2.84179688, + "step": 113, + "time_per_iteration": 2.5920841693878174 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01394311, + "balance_loss_mlp": 1.11126053, + "epoch": 0.02193151212004617, + "flos": 464427055872.0, + "grad_norm": 0.07156510336232694, + "language_loss": 1.09574234, + "learning_rate": 0.0009378878212755459, + "loss": 1.10968542, + "num_input_tokens_seen": 8574048, + "router_z_loss_mlp": 2.8359375, + "step": 114, + "time_per_iteration": 2.5213706493377686 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01376382, + "balance_loss_mlp": 1.09371293, + "epoch": 0.022123893805309734, + "flos": 553332617472.0, + "grad_norm": 0.03568103744776456, + "language_loss": 0.9948864, + "learning_rate": 0.0009396173121672103, + "loss": 1.0086503, + "num_input_tokens_seen": 8647808, + "router_z_loss_mlp": 2.83203125, + "step": 115, + "time_per_iteration": 2.654648780822754 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01351182, + "balance_loss_mlp": 1.0677501, + "epoch": 0.022316275490573297, + "flos": 637379771136.0, + "grad_norm": 0.04471438423319615, + "language_loss": 1.05214882, + "learning_rate": 0.0009413318289238633, + "loss": 1.06566072, + "num_input_tokens_seen": 8719760, + "router_z_loss_mlp": 2.83984375, + "step": 116, + "time_per_iteration": 2.7842695713043213 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01311882, + "balance_loss_mlp": 1.0282588, + "epoch": 0.02250865717583686, + "flos": 800316271872.0, + "grad_norm": 0.046340717018109684, + "language_loss": 0.97282118, + "learning_rate": 0.0009430316286169771, + "loss": 0.98593992, + "num_input_tokens_seen": 8798752, + "router_z_loss_mlp": 2.84179688, + "step": 117, + "time_per_iteration": 3.015839099884033 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01377985, + "balance_loss_mlp": 1.09283674, + "epoch": 0.022701038861100423, + "flos": 457063621632.0, + "grad_norm": 0.07808854544893538, + "language_loss": 1.02862036, + "learning_rate": 0.0009447169617543361, + "loss": 1.04240024, + "num_input_tokens_seen": 8866848, + "router_z_loss_mlp": 2.85742188, + "step": 118, + "time_per_iteration": 2.582919120788574 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01371579, + "balance_loss_mlp": 1.08871901, + "epoch": 0.022893420546363986, + "flos": 584187105024.0, + "grad_norm": 0.08661397198668377, + "language_loss": 1.09685123, + "learning_rate": 0.0009463880725016029, + "loss": 1.11056697, + "num_input_tokens_seen": 8935488, + "router_z_loss_mlp": 2.83398438, + "step": 119, + "time_per_iteration": 2.6932969093322754 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01312523, + "balance_loss_mlp": 1.03252411, + "epoch": 0.02308580223162755, + "flos": 562478826240.0, + "grad_norm": 0.04303328442288268, + "language_loss": 1.04977584, + "learning_rate": 0.0009480451988946134, + "loss": 1.06290102, + "num_input_tokens_seen": 9015344, + "router_z_loss_mlp": 2.8046875, + "step": 120, + "time_per_iteration": 2.8070547580718994 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01299284, + "balance_loss_mlp": 1.02252805, + "epoch": 0.023278183916891113, + "flos": 772646287872.0, + "grad_norm": 0.03799067846502037, + "language_loss": 1.05637264, + "learning_rate": 0.0009496885730428627, + "loss": 1.0693655, + "num_input_tokens_seen": 9094672, + "router_z_loss_mlp": 2.77148438, + "step": 121, + "time_per_iteration": 3.014753580093384 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0130842, + "balance_loss_mlp": 1.03376198, + "epoch": 0.023470565602154676, + "flos": 554431057152.0, + "grad_norm": 0.04194740398285866, + "language_loss": 1.04016769, + "learning_rate": 0.0009513184213246156, + "loss": 1.05325174, + "num_input_tokens_seen": 9160608, + "router_z_loss_mlp": 2.75, + "step": 122, + "time_per_iteration": 2.633074998855591 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01316034, + "balance_loss_mlp": 1.04442739, + "epoch": 0.02366294728741824, + "flos": 561167503872.0, + "grad_norm": 0.038872106950025416, + "language_loss": 1.07101583, + "learning_rate": 0.0009529349645740552, + "loss": 1.08417618, + "num_input_tokens_seen": 9228704, + "router_z_loss_mlp": 2.71875, + "step": 123, + "time_per_iteration": 2.6846470832824707 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01320226, + "balance_loss_mlp": 1.05014575, + "epoch": 0.0238553289726818, + "flos": 469517788416.0, + "grad_norm": 0.03403697644067516, + "language_loss": 1.05937934, + "learning_rate": 0.0009545384182608524, + "loss": 1.07258177, + "num_input_tokens_seen": 9294288, + "router_z_loss_mlp": 2.703125, + "step": 124, + "time_per_iteration": 2.5332376956939697 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01326404, + "balance_loss_mlp": 1.05880272, + "epoch": 0.024047710657945365, + "flos": 561104320512.0, + "grad_norm": 0.042208642163400256, + "language_loss": 1.03444421, + "learning_rate": 0.0009561289926625252, + "loss": 1.04770815, + "num_input_tokens_seen": 9368048, + "router_z_loss_mlp": 2.67773438, + "step": 125, + "time_per_iteration": 2.68180251121521 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01324487, + "balance_loss_mlp": 1.05841172, + "epoch": 0.024240092343208928, + "flos": 505771155456.0, + "grad_norm": 0.03944680997458598, + "language_loss": 1.08491933, + "learning_rate": 0.0009577068930299292, + "loss": 1.0981642, + "num_input_tokens_seen": 9434848, + "router_z_loss_mlp": 2.66210938, + "step": 126, + "time_per_iteration": 2.602088689804077 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01323529, + "balance_loss_mlp": 1.05936122, + "epoch": 0.02443247402847249, + "flos": 436753181184.0, + "grad_norm": 0.04017271590188075, + "language_loss": 1.04077768, + "learning_rate": 0.0009592723197462087, + "loss": 1.05401289, + "num_input_tokens_seen": 9504112, + "router_z_loss_mlp": 2.64257812, + "step": 127, + "time_per_iteration": 2.643617630004883 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01318089, + "balance_loss_mlp": 1.05563784, + "epoch": 0.024624855713736054, + "flos": 685069545216.0, + "grad_norm": 0.03549644551725154, + "language_loss": 1.0056293, + "learning_rate": 0.0009608254684795125, + "loss": 1.01881027, + "num_input_tokens_seen": 9590032, + "router_z_loss_mlp": 2.625, + "step": 128, + "time_per_iteration": 2.949061632156372 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01309242, + "balance_loss_mlp": 1.04831672, + "epoch": 0.024817237398999614, + "flos": 526114643712.0, + "grad_norm": 0.03183934804306691, + "language_loss": 1.03377914, + "learning_rate": 0.0009623665303297678, + "loss": 1.04687166, + "num_input_tokens_seen": 9663040, + "router_z_loss_mlp": 2.609375, + "step": 129, + "time_per_iteration": 2.7315783500671387 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0130104, + "balance_loss_mlp": 1.04106867, + "epoch": 0.025009619084263177, + "flos": 656887279872.0, + "grad_norm": 0.038944166016075116, + "language_loss": 1.07603359, + "learning_rate": 0.0009638956919697878, + "loss": 1.08904397, + "num_input_tokens_seen": 9736544, + "router_z_loss_mlp": 2.59960938, + "step": 130, + "time_per_iteration": 2.9588887691497803 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01293161, + "balance_loss_mlp": 1.03395224, + "epoch": 0.02520200076952674, + "flos": 455370275328.0, + "grad_norm": 0.03345888261117193, + "language_loss": 0.99743778, + "learning_rate": 0.0009654131357809714, + "loss": 1.0103693, + "num_input_tokens_seen": 9804656, + "router_z_loss_mlp": 2.59179688, + "step": 131, + "time_per_iteration": 2.5802786350250244 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01296775, + "balance_loss_mlp": 1.03966463, + "epoch": 0.025394382454790303, + "flos": 841269599232.0, + "grad_norm": 0.04496153180844387, + "language_loss": 1.08517051, + "learning_rate": 0.0009669190399838441, + "loss": 1.09813821, + "num_input_tokens_seen": 9888864, + "router_z_loss_mlp": 2.5703125, + "step": 132, + "time_per_iteration": 3.1034374237060547 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01297684, + "balance_loss_mlp": 1.04190826, + "epoch": 0.025586764140053866, + "flos": 582229353216.0, + "grad_norm": 0.044253016077327914, + "language_loss": 1.0183959, + "learning_rate": 0.0009684135787636724, + "loss": 1.03137255, + "num_input_tokens_seen": 9968208, + "router_z_loss_mlp": 2.55664062, + "step": 133, + "time_per_iteration": 2.8056888580322266 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01284726, + "balance_loss_mlp": 1.03066742, + "epoch": 0.02577914582531743, + "flos": 791678453760.0, + "grad_norm": 0.04023348500073193, + "language_loss": 1.06134284, + "learning_rate": 0.0009698969223913726, + "loss": 1.07419014, + "num_input_tokens_seen": 10049664, + "router_z_loss_mlp": 2.5390625, + "step": 134, + "time_per_iteration": 3.0520598888397217 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01279327, + "balance_loss_mlp": 1.02717578, + "epoch": 0.025971527510580992, + "flos": 596063861760.0, + "grad_norm": 0.02965492003563146, + "language_loss": 1.08660483, + "learning_rate": 0.0009713692373399265, + "loss": 1.09939814, + "num_input_tokens_seen": 10120096, + "router_z_loss_mlp": 2.51953125, + "step": 135, + "time_per_iteration": 2.679379463195801 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01931427, + "balance_loss_mlp": 1.66744995, + "epoch": 0.026163909195844555, + "flos": 1581077391360.0, + "grad_norm": 0.18396358569787127, + "language_loss": 0.79456228, + "learning_rate": 0.0009728306863964993, + "loss": 0.81387651, + "num_input_tokens_seen": 10348976, + "router_z_loss_mlp": 2.640625, + "step": 136, + "time_per_iteration": 5.69318151473999 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01580238, + "balance_loss_mlp": 1.32083893, + "epoch": 0.026356290881108118, + "flos": 1505163555840.0, + "grad_norm": 0.11058621392355464, + "language_loss": 0.77811038, + "learning_rate": 0.0009742814287704512, + "loss": 0.79391277, + "num_input_tokens_seen": 10576512, + "router_z_loss_mlp": 2.59375, + "step": 137, + "time_per_iteration": 4.930646896362305 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01336039, + "balance_loss_mlp": 1.08846498, + "epoch": 0.02654867256637168, + "flos": 598341421056.0, + "grad_norm": 0.05793494017899448, + "language_loss": 1.01254559, + "learning_rate": 0.0009757216201974225, + "loss": 1.02590609, + "num_input_tokens_seen": 10659168, + "router_z_loss_mlp": 2.47265625, + "step": 138, + "time_per_iteration": 2.8532111644744873 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01376264, + "balance_loss_mlp": 1.13059723, + "epoch": 0.026741054251635244, + "flos": 546136379136.0, + "grad_norm": 0.07027637242601113, + "language_loss": 1.06507492, + "learning_rate": 0.0009771514130396581, + "loss": 1.07883763, + "num_input_tokens_seen": 10731584, + "router_z_loss_mlp": 2.453125, + "step": 139, + "time_per_iteration": 2.742065668106079 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01373402, + "balance_loss_mlp": 1.12792611, + "epoch": 0.026933435936898807, + "flos": 507846525696.0, + "grad_norm": 0.06681977417406691, + "language_loss": 1.06790614, + "learning_rate": 0.00097857095638274, + "loss": 1.08164012, + "num_input_tokens_seen": 10799456, + "router_z_loss_mlp": 2.45117188, + "step": 140, + "time_per_iteration": 2.689812660217285 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01350241, + "balance_loss_mlp": 1.10533786, + "epoch": 0.02712581762216237, + "flos": 742254504192.0, + "grad_norm": 0.04346752833457442, + "language_loss": 0.97943556, + "learning_rate": 0.0009799803961288726, + "loss": 0.99293798, + "num_input_tokens_seen": 10886416, + "router_z_loss_mlp": 2.4453125, + "step": 141, + "time_per_iteration": 3.064852714538574 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01340988, + "balance_loss_mlp": 1.09684777, + "epoch": 0.027318199307425933, + "flos": 849779105280.0, + "grad_norm": 0.04419232462487818, + "language_loss": 1.04253626, + "learning_rate": 0.000981379875086876, + "loss": 1.05594611, + "num_input_tokens_seen": 10966064, + "router_z_loss_mlp": 2.4375, + "step": 142, + "time_per_iteration": 3.049978494644165 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01342845, + "balance_loss_mlp": 1.09870481, + "epoch": 0.027510580992689496, + "flos": 576638978304.0, + "grad_norm": 0.03936283820829166, + "language_loss": 0.99339008, + "learning_rate": 0.0009827695330590185, + "loss": 1.00681853, + "num_input_tokens_seen": 11039712, + "router_z_loss_mlp": 2.4375, + "step": 143, + "time_per_iteration": 2.677050828933716 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01360296, + "balance_loss_mlp": 1.11729932, + "epoch": 0.02770296267795306, + "flos": 773790414336.0, + "grad_norm": 0.036415015399305896, + "language_loss": 0.98794824, + "learning_rate": 0.0009841495069248256, + "loss": 1.00155115, + "num_input_tokens_seen": 11123984, + "router_z_loss_mlp": 2.42578125, + "step": 144, + "time_per_iteration": 2.9983932971954346 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01369915, + "balance_loss_mlp": 1.12768197, + "epoch": 0.027895344363216622, + "flos": 570449806080.0, + "grad_norm": 0.04357781303470995, + "language_loss": 0.98341697, + "learning_rate": 0.0009855199307219871, + "loss": 0.99711609, + "num_input_tokens_seen": 11192864, + "router_z_loss_mlp": 2.41796875, + "step": 145, + "time_per_iteration": 2.6622605323791504 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0136275, + "balance_loss_mlp": 1.12261522, + "epoch": 0.028087726048480186, + "flos": 548409080832.0, + "grad_norm": 0.032618269384273584, + "language_loss": 1.00131154, + "learning_rate": 0.0009868809357244854, + "loss": 1.01493907, + "num_input_tokens_seen": 11261760, + "router_z_loss_mlp": 2.39648438, + "step": 146, + "time_per_iteration": 2.7002813816070557 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01347166, + "balance_loss_mlp": 1.10836601, + "epoch": 0.02828010773374375, + "flos": 525873570816.0, + "grad_norm": 0.032542426789695725, + "language_loss": 1.04416764, + "learning_rate": 0.0009882326505180556, + "loss": 1.05763924, + "num_input_tokens_seen": 11334736, + "router_z_loss_mlp": 2.3828125, + "step": 147, + "time_per_iteration": 2.710149049758911 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01334853, + "balance_loss_mlp": 1.09815085, + "epoch": 0.02847248941900731, + "flos": 773772917760.0, + "grad_norm": 0.045451062042893155, + "language_loss": 1.02790403, + "learning_rate": 0.0009895752010730906, + "loss": 1.04125249, + "num_input_tokens_seen": 11409872, + "router_z_loss_mlp": 2.36132812, + "step": 148, + "time_per_iteration": 2.965888261795044 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01328294, + "balance_loss_mlp": 1.0936898, + "epoch": 0.028664871104270875, + "flos": 535470822912.0, + "grad_norm": 0.03549847888949514, + "language_loss": 1.08720016, + "learning_rate": 0.0009909087108150867, + "loss": 1.10048318, + "num_input_tokens_seen": 11481024, + "router_z_loss_mlp": 2.33984375, + "step": 149, + "time_per_iteration": 2.759585380554199 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01328431, + "balance_loss_mlp": 1.09649718, + "epoch": 0.028857252789534438, + "flos": 368605212672.0, + "grad_norm": 0.04584721914032896, + "language_loss": 1.09262538, + "learning_rate": 0.0009922333006927371, + "loss": 1.10590982, + "num_input_tokens_seen": 11544240, + "router_z_loss_mlp": 2.3125, + "step": 150, + "time_per_iteration": 2.5677716732025146 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0132956, + "balance_loss_mlp": 1.09896171, + "epoch": 0.029049634474798, + "flos": 516484343808.0, + "grad_norm": 0.054837011337671125, + "language_loss": 1.02855873, + "learning_rate": 0.0009935490892437632, + "loss": 1.04185438, + "num_input_tokens_seen": 11610416, + "router_z_loss_mlp": 2.29882812, + "step": 151, + "time_per_iteration": 2.5842795372009277 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01323589, + "balance_loss_mlp": 1.09623301, + "epoch": 0.029242016160061564, + "flos": 589349769216.0, + "grad_norm": 0.041624099188269474, + "language_loss": 1.01284385, + "learning_rate": 0.0009948561926585687, + "loss": 1.02607965, + "num_input_tokens_seen": 11687488, + "router_z_loss_mlp": 2.2734375, + "step": 152, + "time_per_iteration": 2.7717602252960205 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01309484, + "balance_loss_mlp": 1.08422625, + "epoch": 0.029434397845325123, + "flos": 553137231360.0, + "grad_norm": 0.04242067063834005, + "language_loss": 1.0541966, + "learning_rate": 0.0009961547248418122, + "loss": 1.0672915, + "num_input_tokens_seen": 11754576, + "router_z_loss_mlp": 2.25976562, + "step": 153, + "time_per_iteration": 2.6492583751678467 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01303287, + "balance_loss_mlp": 1.07898307, + "epoch": 0.029626779530588686, + "flos": 604608360960.0, + "grad_norm": 0.03242941124289258, + "language_loss": 1.02145946, + "learning_rate": 0.0009974447974719707, + "loss": 1.03449237, + "num_input_tokens_seen": 11831360, + "router_z_loss_mlp": 2.25, + "step": 154, + "time_per_iteration": 2.7111871242523193 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01303637, + "balance_loss_mlp": 1.08181214, + "epoch": 0.02981916121585225, + "flos": 622218388992.0, + "grad_norm": 0.03743420896054, + "language_loss": 1.03581393, + "learning_rate": 0.0009987265200589763, + "loss": 1.0488503, + "num_input_tokens_seen": 11902192, + "router_z_loss_mlp": 2.22460938, + "step": 155, + "time_per_iteration": 2.7590832710266113 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01281243, + "balance_loss_mlp": 1.06151628, + "epoch": 0.030011542901115813, + "flos": 662881065984.0, + "grad_norm": 0.03665146617631418, + "language_loss": 1.03448439, + "learning_rate": 0.001, + "loss": 1.04729688, + "num_input_tokens_seen": 11979088, + "router_z_loss_mlp": 2.203125, + "step": 156, + "time_per_iteration": 2.868732452392578 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01262046, + "balance_loss_mlp": 1.04441714, + "epoch": 0.030203924586379376, + "flos": 652819164672.0, + "grad_norm": 0.048414208125286275, + "language_loss": 1.0101347, + "learning_rate": 0.0009999999029413921, + "loss": 1.02275515, + "num_input_tokens_seen": 12059200, + "router_z_loss_mlp": 2.18164062, + "step": 157, + "time_per_iteration": 2.8458704948425293 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01249467, + "balance_loss_mlp": 1.03393674, + "epoch": 0.03039630627164294, + "flos": 532444766976.0, + "grad_norm": 0.038165698108555156, + "language_loss": 1.02398324, + "learning_rate": 0.0009999996117656068, + "loss": 1.03647804, + "num_input_tokens_seen": 12134944, + "router_z_loss_mlp": 2.16015625, + "step": 158, + "time_per_iteration": 2.7255747318267822 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01250196, + "balance_loss_mlp": 1.03657281, + "epoch": 0.030588687956906502, + "flos": 587295786240.0, + "grad_norm": 0.04636715302465643, + "language_loss": 0.95869231, + "learning_rate": 0.0009999991264727564, + "loss": 0.97119427, + "num_input_tokens_seen": 12207936, + "router_z_loss_mlp": 2.140625, + "step": 159, + "time_per_iteration": 2.7805936336517334 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0126418, + "balance_loss_mlp": 1.05284619, + "epoch": 0.030781069642170065, + "flos": 514287464448.0, + "grad_norm": 0.055354258548617474, + "language_loss": 1.07316554, + "learning_rate": 0.0009999984470630296, + "loss": 1.08580732, + "num_input_tokens_seen": 12273200, + "router_z_loss_mlp": 2.1171875, + "step": 160, + "time_per_iteration": 2.6011087894439697 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01284107, + "balance_loss_mlp": 1.07372677, + "epoch": 0.030973451327433628, + "flos": 719560546560.0, + "grad_norm": 0.03499871632601644, + "language_loss": 0.95530587, + "learning_rate": 0.0009999975735366902, + "loss": 0.96814692, + "num_input_tokens_seen": 12359600, + "router_z_loss_mlp": 2.10742188, + "step": 161, + "time_per_iteration": 3.083415985107422 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01283442, + "balance_loss_mlp": 1.07439709, + "epoch": 0.03116583301269719, + "flos": 1111615994880.0, + "grad_norm": 0.03722431710536786, + "language_loss": 0.96960843, + "learning_rate": 0.0009999965058940775, + "loss": 0.9824428, + "num_input_tokens_seen": 12443936, + "router_z_loss_mlp": 2.09375, + "step": 162, + "time_per_iteration": 3.5389657020568848 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01264072, + "balance_loss_mlp": 1.05655301, + "epoch": 0.031358214697960754, + "flos": 451833883392.0, + "grad_norm": 0.04231417263227255, + "language_loss": 1.04135799, + "learning_rate": 0.0009999952441356057, + "loss": 1.05399871, + "num_input_tokens_seen": 12507488, + "router_z_loss_mlp": 2.078125, + "step": 163, + "time_per_iteration": 2.5445146560668945 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01239952, + "balance_loss_mlp": 1.03357697, + "epoch": 0.031550596383224314, + "flos": 1257087309312.0, + "grad_norm": 0.03293922474511325, + "language_loss": 1.04807603, + "learning_rate": 0.000999993788261765, + "loss": 1.06047547, + "num_input_tokens_seen": 12594096, + "router_z_loss_mlp": 2.06640625, + "step": 164, + "time_per_iteration": 3.603273391723633 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01233685, + "balance_loss_mlp": 1.02769136, + "epoch": 0.03174297806848788, + "flos": 669323950080.0, + "grad_norm": 0.03785089383184646, + "language_loss": 1.05591631, + "learning_rate": 0.00099999213827312, + "loss": 1.06825328, + "num_input_tokens_seen": 12669424, + "router_z_loss_mlp": 2.0625, + "step": 165, + "time_per_iteration": 2.822242498397827 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01237294, + "balance_loss_mlp": 1.03206336, + "epoch": 0.03193535975375144, + "flos": 552364435200.0, + "grad_norm": 0.03413051380570177, + "language_loss": 1.00392842, + "learning_rate": 0.000999990294170312, + "loss": 1.01630139, + "num_input_tokens_seen": 12740080, + "router_z_loss_mlp": 2.0546875, + "step": 166, + "time_per_iteration": 2.6473989486694336 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0124218, + "balance_loss_mlp": 1.03790259, + "epoch": 0.032127741439015006, + "flos": 544740486144.0, + "grad_norm": 0.02951320831702663, + "language_loss": 1.04371905, + "learning_rate": 0.0009999882559540566, + "loss": 1.0561409, + "num_input_tokens_seen": 12810576, + "router_z_loss_mlp": 2.04492188, + "step": 167, + "time_per_iteration": 2.654994487762451 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01249753, + "balance_loss_mlp": 1.04661989, + "epoch": 0.032320123124278566, + "flos": 549514323456.0, + "grad_norm": 0.03217165834370848, + "language_loss": 1.01348543, + "learning_rate": 0.000999986023625145, + "loss": 1.02598298, + "num_input_tokens_seen": 12887904, + "router_z_loss_mlp": 2.03320312, + "step": 168, + "time_per_iteration": 2.759324550628662 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01736656, + "balance_loss_mlp": 1.53829193, + "epoch": 0.03251250480954213, + "flos": 1308817963776.0, + "grad_norm": 0.15145695156494207, + "language_loss": 0.78924417, + "learning_rate": 0.0009999835971844441, + "loss": 0.8066107, + "num_input_tokens_seen": 13107344, + "router_z_loss_mlp": 1.9765625, + "step": 169, + "time_per_iteration": 4.9954283237457275 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0125768, + "balance_loss_mlp": 1.05588245, + "epoch": 0.03270488649480569, + "flos": 562202760192.0, + "grad_norm": 0.04037677915440104, + "language_loss": 1.01481748, + "learning_rate": 0.0009999809766328958, + "loss": 1.02739429, + "num_input_tokens_seen": 13175552, + "router_z_loss_mlp": 2.01953125, + "step": 170, + "time_per_iteration": 2.6656970977783203 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01250876, + "balance_loss_mlp": 1.0494597, + "epoch": 0.03289726818006926, + "flos": 483339657984.0, + "grad_norm": 0.04232720535630845, + "language_loss": 1.03883123, + "learning_rate": 0.0009999781619715177, + "loss": 1.0513401, + "num_input_tokens_seen": 13242384, + "router_z_loss_mlp": 2.015625, + "step": 171, + "time_per_iteration": 2.5408902168273926 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01238141, + "balance_loss_mlp": 1.03786898, + "epoch": 0.03308964986533282, + "flos": 675821269248.0, + "grad_norm": 0.04278552863969592, + "language_loss": 1.04043615, + "learning_rate": 0.000999975153201402, + "loss": 1.05281758, + "num_input_tokens_seen": 13316160, + "router_z_loss_mlp": 2.00390625, + "step": 172, + "time_per_iteration": 2.85229754447937 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01233367, + "balance_loss_mlp": 1.03385854, + "epoch": 0.033282031550596385, + "flos": 610341632256.0, + "grad_norm": 0.04144744195910536, + "language_loss": 1.01965618, + "learning_rate": 0.0009999719503237174, + "loss": 1.03198993, + "num_input_tokens_seen": 13387664, + "router_z_loss_mlp": 1.9921875, + "step": 173, + "time_per_iteration": 2.7612979412078857 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01234993, + "balance_loss_mlp": 1.03739214, + "epoch": 0.033474413235859944, + "flos": 468996758784.0, + "grad_norm": 0.06741318195929925, + "language_loss": 1.10547054, + "learning_rate": 0.0009999685533397073, + "loss": 1.1178205, + "num_input_tokens_seen": 13454528, + "router_z_loss_mlp": 1.97265625, + "step": 174, + "time_per_iteration": 2.5750949382781982 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01246896, + "balance_loss_mlp": 1.05101097, + "epoch": 0.03366679492112351, + "flos": 580715841792.0, + "grad_norm": 0.0354258140398677, + "language_loss": 1.02665091, + "learning_rate": 0.00099996496225069, + "loss": 1.03911996, + "num_input_tokens_seen": 13522528, + "router_z_loss_mlp": 1.95605469, + "step": 175, + "time_per_iteration": 2.6886191368103027 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0124614, + "balance_loss_mlp": 1.05168545, + "epoch": 0.03385917660638707, + "flos": 638886479616.0, + "grad_norm": 0.036851717024697625, + "language_loss": 1.04551578, + "learning_rate": 0.0009999611770580604, + "loss": 1.0579772, + "num_input_tokens_seen": 13601120, + "router_z_loss_mlp": 1.94433594, + "step": 176, + "time_per_iteration": 2.8528547286987305 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01227252, + "balance_loss_mlp": 1.03422809, + "epoch": 0.03405155829165064, + "flos": 442740164352.0, + "grad_norm": 0.05003520598604069, + "language_loss": 1.03819132, + "learning_rate": 0.0009999571977632876, + "loss": 1.0504638, + "num_input_tokens_seen": 13666384, + "router_z_loss_mlp": 1.9296875, + "step": 177, + "time_per_iteration": 2.6220269203186035 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01224145, + "balance_loss_mlp": 1.03188384, + "epoch": 0.034243939976914196, + "flos": 467275222272.0, + "grad_norm": 0.0554689754659714, + "language_loss": 1.0658946, + "learning_rate": 0.0009999530243679166, + "loss": 1.07813609, + "num_input_tokens_seen": 13733968, + "router_z_loss_mlp": 1.921875, + "step": 178, + "time_per_iteration": 2.5593671798706055 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01235914, + "balance_loss_mlp": 1.04479802, + "epoch": 0.03443632166217776, + "flos": 780713498880.0, + "grad_norm": 0.03675993055709111, + "language_loss": 1.01102996, + "learning_rate": 0.0009999486568735675, + "loss": 1.02338898, + "num_input_tokens_seen": 13818960, + "router_z_loss_mlp": 1.91015625, + "step": 179, + "time_per_iteration": 3.083312749862671 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01235549, + "balance_loss_mlp": 1.04548192, + "epoch": 0.03462870334744132, + "flos": 1265760120576.0, + "grad_norm": 0.04656515886260978, + "language_loss": 1.01660061, + "learning_rate": 0.0009999440952819362, + "loss": 1.02895617, + "num_input_tokens_seen": 13912448, + "router_z_loss_mlp": 1.89941406, + "step": 180, + "time_per_iteration": 3.691354513168335 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01231777, + "balance_loss_mlp": 1.04390287, + "epoch": 0.03482108503270489, + "flos": 608303200512.0, + "grad_norm": 0.04339398829325753, + "language_loss": 1.02140999, + "learning_rate": 0.0009999393395947935, + "loss": 1.03372765, + "num_input_tokens_seen": 13990752, + "router_z_loss_mlp": 1.87695312, + "step": 181, + "time_per_iteration": 2.8826780319213867 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01222143, + "balance_loss_mlp": 1.03617644, + "epoch": 0.03501346671796845, + "flos": 539315361792.0, + "grad_norm": 0.033650569268787865, + "language_loss": 1.05363226, + "learning_rate": 0.0009999343898139858, + "loss": 1.06585371, + "num_input_tokens_seen": 14058608, + "router_z_loss_mlp": 1.85742188, + "step": 182, + "time_per_iteration": 2.6785037517547607 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01217643, + "balance_loss_mlp": 1.03329813, + "epoch": 0.035205848403232015, + "flos": 519499706112.0, + "grad_norm": 0.04889617812287003, + "language_loss": 1.03914642, + "learning_rate": 0.0009999292459414348, + "loss": 1.05132294, + "num_input_tokens_seen": 14126656, + "router_z_loss_mlp": 1.84082031, + "step": 183, + "time_per_iteration": 2.648263931274414 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01223183, + "balance_loss_mlp": 1.04103076, + "epoch": 0.035398230088495575, + "flos": 473334137088.0, + "grad_norm": 0.03546540132303448, + "language_loss": 1.08284354, + "learning_rate": 0.0009999239079791374, + "loss": 1.09507537, + "num_input_tokens_seen": 14195840, + "router_z_loss_mlp": 1.81835938, + "step": 184, + "time_per_iteration": 2.6003947257995605 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01229231, + "balance_loss_mlp": 1.04908144, + "epoch": 0.03559061177375914, + "flos": 513095705856.0, + "grad_norm": 0.03580873522044792, + "language_loss": 1.00877666, + "learning_rate": 0.0009999183759291659, + "loss": 1.02106905, + "num_input_tokens_seen": 14269936, + "router_z_loss_mlp": 1.79785156, + "step": 185, + "time_per_iteration": 2.7518959045410156 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01229953, + "balance_loss_mlp": 1.05161583, + "epoch": 0.0357829934590227, + "flos": 478350992640.0, + "grad_norm": 0.05401643684385997, + "language_loss": 1.03586912, + "learning_rate": 0.0009999126497936682, + "loss": 1.04816866, + "num_input_tokens_seen": 14334848, + "router_z_loss_mlp": 1.78710938, + "step": 186, + "time_per_iteration": 2.565373659133911 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01218003, + "balance_loss_mlp": 1.04052448, + "epoch": 0.03597537514428627, + "flos": 645885386496.0, + "grad_norm": 0.027605248849540943, + "language_loss": 1.06344712, + "learning_rate": 0.0009999067295748676, + "loss": 1.07562721, + "num_input_tokens_seen": 14407888, + "router_z_loss_mlp": 1.77832031, + "step": 187, + "time_per_iteration": 2.862023115158081 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01208675, + "balance_loss_mlp": 1.03167319, + "epoch": 0.03616775682954983, + "flos": 582270182400.0, + "grad_norm": 0.041753828035088196, + "language_loss": 1.04174721, + "learning_rate": 0.000999900615275062, + "loss": 1.05383396, + "num_input_tokens_seen": 14479072, + "router_z_loss_mlp": 1.7734375, + "step": 188, + "time_per_iteration": 2.7248780727386475 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01206757, + "balance_loss_mlp": 1.02994609, + "epoch": 0.03636013851481339, + "flos": 383265007104.0, + "grad_norm": 0.05119808239604003, + "language_loss": 1.10189009, + "learning_rate": 0.0009998943068966256, + "loss": 1.11395764, + "num_input_tokens_seen": 14540944, + "router_z_loss_mlp": 1.77148438, + "step": 189, + "time_per_iteration": 2.487445592880249 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01216253, + "balance_loss_mlp": 1.04010975, + "epoch": 0.03655252020007695, + "flos": 584308614144.0, + "grad_norm": 0.029643950017142998, + "language_loss": 1.04644084, + "learning_rate": 0.0009998878044420072, + "loss": 1.05860329, + "num_input_tokens_seen": 14611392, + "router_z_loss_mlp": 1.76464844, + "step": 190, + "time_per_iteration": 2.736809015274048 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.012186, + "balance_loss_mlp": 1.04321897, + "epoch": 0.03674490188534051, + "flos": 472598279424.0, + "grad_norm": 0.03987592529636011, + "language_loss": 1.00565469, + "learning_rate": 0.0009998811079137318, + "loss": 1.01784062, + "num_input_tokens_seen": 14679776, + "router_z_loss_mlp": 1.75683594, + "step": 191, + "time_per_iteration": 2.6006946563720703 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01214791, + "balance_loss_mlp": 1.04017353, + "epoch": 0.03693728357060408, + "flos": 529411908096.0, + "grad_norm": 0.03601320862003297, + "language_loss": 1.01597381, + "learning_rate": 0.0009998742173143987, + "loss": 1.02812171, + "num_input_tokens_seen": 14749712, + "router_z_loss_mlp": 1.74902344, + "step": 192, + "time_per_iteration": 2.6246893405914307 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01200861, + "balance_loss_mlp": 1.02719736, + "epoch": 0.03712966525586764, + "flos": 800346407424.0, + "grad_norm": 0.02962706666311765, + "language_loss": 1.0204885, + "learning_rate": 0.0009998671326466833, + "loss": 1.03249693, + "num_input_tokens_seen": 14827136, + "router_z_loss_mlp": 1.73925781, + "step": 193, + "time_per_iteration": 2.9852418899536133 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01194118, + "balance_loss_mlp": 1.02121651, + "epoch": 0.037322046941131205, + "flos": 831359342592.0, + "grad_norm": 0.049736474928026, + "language_loss": 1.0340569, + "learning_rate": 0.0009998598539133362, + "loss": 1.04599798, + "num_input_tokens_seen": 14902880, + "router_z_loss_mlp": 1.73144531, + "step": 194, + "time_per_iteration": 3.0510568618774414 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01194861, + "balance_loss_mlp": 1.02339077, + "epoch": 0.037514428626394765, + "flos": 438589423872.0, + "grad_norm": 0.030819097200883293, + "language_loss": 1.03682184, + "learning_rate": 0.0009998523811171828, + "loss": 1.04877055, + "num_input_tokens_seen": 14967264, + "router_z_loss_mlp": 1.71679688, + "step": 195, + "time_per_iteration": 2.5203936100006104 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01197718, + "balance_loss_mlp": 1.0269146, + "epoch": 0.03770681031165833, + "flos": 512639804928.0, + "grad_norm": 0.031890398221933944, + "language_loss": 1.04342675, + "learning_rate": 0.0009998447142611248, + "loss": 1.05540395, + "num_input_tokens_seen": 15039104, + "router_z_loss_mlp": 1.70996094, + "step": 196, + "time_per_iteration": 2.659193754196167 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01193737, + "balance_loss_mlp": 1.02341044, + "epoch": 0.03789919199692189, + "flos": 808843274496.0, + "grad_norm": 0.030368823498634023, + "language_loss": 0.97672093, + "learning_rate": 0.0009998368533481387, + "loss": 0.98865831, + "num_input_tokens_seen": 15124864, + "router_z_loss_mlp": 1.70507812, + "step": 197, + "time_per_iteration": 3.031437397003174 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01185957, + "balance_loss_mlp": 1.01677489, + "epoch": 0.03809157368218546, + "flos": 691792386048.0, + "grad_norm": 0.027429804092446938, + "language_loss": 1.00742936, + "learning_rate": 0.0009998287983812762, + "loss": 1.01928902, + "num_input_tokens_seen": 15199680, + "router_z_loss_mlp": 1.69335938, + "step": 198, + "time_per_iteration": 2.8533172607421875 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01186443, + "balance_loss_mlp": 1.01764262, + "epoch": 0.03828395536744902, + "flos": 519004921344.0, + "grad_norm": 0.029672573654994608, + "language_loss": 1.06761527, + "learning_rate": 0.0009998205493636646, + "loss": 1.07947969, + "num_input_tokens_seen": 15270176, + "router_z_loss_mlp": 1.68945312, + "step": 199, + "time_per_iteration": 2.6512415409088135 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01190294, + "balance_loss_mlp": 1.02197027, + "epoch": 0.038476337052712584, + "flos": 582763021824.0, + "grad_norm": 0.03300049351517658, + "language_loss": 0.99112457, + "learning_rate": 0.0009998121062985063, + "loss": 1.00302756, + "num_input_tokens_seen": 15343168, + "router_z_loss_mlp": 1.68457031, + "step": 200, + "time_per_iteration": 2.6979846954345703 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01187054, + "balance_loss_mlp": 1.01996994, + "epoch": 0.03866871873797614, + "flos": 578273998848.0, + "grad_norm": 0.03164459486115397, + "language_loss": 1.0110172, + "learning_rate": 0.0009998034691890794, + "loss": 1.02288771, + "num_input_tokens_seen": 15417328, + "router_z_loss_mlp": 1.671875, + "step": 201, + "time_per_iteration": 2.80670166015625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01183327, + "balance_loss_mlp": 1.01672018, + "epoch": 0.03886110042323971, + "flos": 541772755968.0, + "grad_norm": 0.032663388617215364, + "language_loss": 1.05587053, + "learning_rate": 0.0009997946380387369, + "loss": 1.06770372, + "num_input_tokens_seen": 15489488, + "router_z_loss_mlp": 1.66699219, + "step": 202, + "time_per_iteration": 2.6591310501098633 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01179406, + "balance_loss_mlp": 1.01394379, + "epoch": 0.03905348210850327, + "flos": 719240739072.0, + "grad_norm": 0.030305493428663434, + "language_loss": 1.08528447, + "learning_rate": 0.0009997856128509076, + "loss": 1.09707844, + "num_input_tokens_seen": 15558944, + "router_z_loss_mlp": 1.65527344, + "step": 203, + "time_per_iteration": 2.9006340503692627 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01181527, + "balance_loss_mlp": 1.01720893, + "epoch": 0.039245863793766836, + "flos": 428397265152.0, + "grad_norm": 0.03189317300504765, + "language_loss": 1.03375864, + "learning_rate": 0.0009997763936290952, + "loss": 1.04557395, + "num_input_tokens_seen": 15625024, + "router_z_loss_mlp": 1.64355469, + "step": 204, + "time_per_iteration": 2.5836358070373535 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01178747, + "balance_loss_mlp": 1.01538289, + "epoch": 0.039438245479030395, + "flos": 664270156032.0, + "grad_norm": 0.033629424624266296, + "language_loss": 1.0866276, + "learning_rate": 0.0009997669803768789, + "loss": 1.09841514, + "num_input_tokens_seen": 15697120, + "router_z_loss_mlp": 1.63378906, + "step": 205, + "time_per_iteration": 2.7809464931488037 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01180514, + "balance_loss_mlp": 1.01791251, + "epoch": 0.03963062716429396, + "flos": 636496159488.0, + "grad_norm": 0.025840840316256445, + "language_loss": 1.03755617, + "learning_rate": 0.0009997573730979134, + "loss": 1.04936123, + "num_input_tokens_seen": 15768752, + "router_z_loss_mlp": 1.62597656, + "step": 206, + "time_per_iteration": 2.7759904861450195 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01207138, + "balance_loss_mlp": 1.04272461, + "epoch": 0.03982300884955752, + "flos": 1421589799680.0, + "grad_norm": 0.03078548913711826, + "language_loss": 0.79193199, + "learning_rate": 0.0009997475717959284, + "loss": 0.80400336, + "num_input_tokens_seen": 15980624, + "router_z_loss_mlp": 1.64453125, + "step": 207, + "time_per_iteration": 4.6622114181518555 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01177297, + "balance_loss_mlp": 1.0162214, + "epoch": 0.04001539053482109, + "flos": 690520914432.0, + "grad_norm": 0.03233621027438014, + "language_loss": 1.02104092, + "learning_rate": 0.0009997375764747294, + "loss": 1.03281379, + "num_input_tokens_seen": 16067232, + "router_z_loss_mlp": 1.61035156, + "step": 208, + "time_per_iteration": 2.9808952808380127 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01181785, + "balance_loss_mlp": 1.02156758, + "epoch": 0.04020777222008465, + "flos": 534752461824.0, + "grad_norm": 0.037334696417832054, + "language_loss": 0.99876916, + "learning_rate": 0.0009997273871381967, + "loss": 1.01058698, + "num_input_tokens_seen": 16139808, + "router_z_loss_mlp": 1.6015625, + "step": 209, + "time_per_iteration": 2.6938650608062744 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01183132, + "balance_loss_mlp": 1.02396429, + "epoch": 0.040400153905348214, + "flos": 568997532672.0, + "grad_norm": 0.03228633343407045, + "language_loss": 1.04497194, + "learning_rate": 0.0009997170037902862, + "loss": 1.05680323, + "num_input_tokens_seen": 16210848, + "router_z_loss_mlp": 1.59082031, + "step": 210, + "time_per_iteration": 2.722900629043579 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01189763, + "balance_loss_mlp": 1.03145349, + "epoch": 0.040592535590611774, + "flos": 714679784448.0, + "grad_norm": 0.026587079094436805, + "language_loss": 1.0723207, + "learning_rate": 0.0009997064264350292, + "loss": 1.08421838, + "num_input_tokens_seen": 16283984, + "router_z_loss_mlp": 1.58203125, + "step": 211, + "time_per_iteration": 2.8636813163757324 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01186902, + "balance_loss_mlp": 1.02954614, + "epoch": 0.04078491727587533, + "flos": 579207187968.0, + "grad_norm": 0.028855359605628288, + "language_loss": 1.01311755, + "learning_rate": 0.0009996956550765317, + "loss": 1.02498662, + "num_input_tokens_seen": 16353904, + "router_z_loss_mlp": 1.57226562, + "step": 212, + "time_per_iteration": 2.6752002239227295 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01183355, + "balance_loss_mlp": 1.0270474, + "epoch": 0.0409772989611389, + "flos": 553369555968.0, + "grad_norm": 0.03615073574048419, + "language_loss": 0.96463609, + "learning_rate": 0.0009996846897189762, + "loss": 0.97646964, + "num_input_tokens_seen": 16425488, + "router_z_loss_mlp": 1.56152344, + "step": 213, + "time_per_iteration": 2.618417501449585 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01180441, + "balance_loss_mlp": 1.02470577, + "epoch": 0.04116968064640246, + "flos": 556764996864.0, + "grad_norm": 0.04473264124517712, + "language_loss": 1.02233624, + "learning_rate": 0.0009996735303666193, + "loss": 1.03414059, + "num_input_tokens_seen": 16498016, + "router_z_loss_mlp": 1.55566406, + "step": 214, + "time_per_iteration": 2.7398550510406494 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0118203, + "balance_loss_mlp": 1.026963, + "epoch": 0.041362062331666026, + "flos": 579652395264.0, + "grad_norm": 0.027182691243245845, + "language_loss": 1.04435229, + "learning_rate": 0.0009996621770237937, + "loss": 1.05617261, + "num_input_tokens_seen": 16573744, + "router_z_loss_mlp": 1.54882812, + "step": 215, + "time_per_iteration": 2.7773804664611816 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01182535, + "balance_loss_mlp": 1.02775347, + "epoch": 0.041554444016929586, + "flos": 612701816832.0, + "grad_norm": 0.028683660550217302, + "language_loss": 1.00582075, + "learning_rate": 0.0009996506296949073, + "loss": 1.01764607, + "num_input_tokens_seen": 16655344, + "router_z_loss_mlp": 1.54589844, + "step": 216, + "time_per_iteration": 2.877587080001831 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01180569, + "balance_loss_mlp": 1.02607429, + "epoch": 0.04174682570219315, + "flos": 529151393280.0, + "grad_norm": 0.031901868987761664, + "language_loss": 1.00452459, + "learning_rate": 0.0009996388883844428, + "loss": 1.01633024, + "num_input_tokens_seen": 16726480, + "router_z_loss_mlp": 1.54296875, + "step": 217, + "time_per_iteration": 2.6346311569213867 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01173664, + "balance_loss_mlp": 1.02002692, + "epoch": 0.04193920738745671, + "flos": 512500799232.0, + "grad_norm": 0.02715845750356807, + "language_loss": 1.03465486, + "learning_rate": 0.0009996269530969588, + "loss": 1.04639161, + "num_input_tokens_seen": 16792112, + "router_z_loss_mlp": 1.53417969, + "step": 218, + "time_per_iteration": 2.6205921173095703 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01170474, + "balance_loss_mlp": 1.0176959, + "epoch": 0.04213158907272028, + "flos": 572553366528.0, + "grad_norm": 0.03606301207395498, + "language_loss": 1.04169452, + "learning_rate": 0.0009996148238370888, + "loss": 1.05339921, + "num_input_tokens_seen": 16862960, + "router_z_loss_mlp": 1.52539062, + "step": 219, + "time_per_iteration": 2.8047173023223877 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01169557, + "balance_loss_mlp": 1.01725543, + "epoch": 0.04232397075798384, + "flos": 965905552896.0, + "grad_norm": 0.026524392964530758, + "language_loss": 0.99111861, + "learning_rate": 0.0009996025006095421, + "loss": 1.00281417, + "num_input_tokens_seen": 16950416, + "router_z_loss_mlp": 1.52050781, + "step": 220, + "time_per_iteration": 3.315859317779541 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147995, + "balance_loss_mlp": 0.99693298, + "epoch": 0.042516352443247404, + "flos": 1472733340416.0, + "grad_norm": 0.01509407607306266, + "language_loss": 0.77783144, + "learning_rate": 0.0009995899834191028, + "loss": 0.78931135, + "num_input_tokens_seen": 17180944, + "router_z_loss_mlp": 1.5078125, + "step": 221, + "time_per_iteration": 5.540910243988037 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01166927, + "balance_loss_mlp": 1.0164367, + "epoch": 0.042708734128510964, + "flos": 655892852736.0, + "grad_norm": 0.029367950869880366, + "language_loss": 0.99126619, + "learning_rate": 0.0009995772722706307, + "loss": 1.00293541, + "num_input_tokens_seen": 17257792, + "router_z_loss_mlp": 1.50195312, + "step": 222, + "time_per_iteration": 2.901489019393921 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01167445, + "balance_loss_mlp": 1.01705015, + "epoch": 0.04290111581377453, + "flos": 432734643456.0, + "grad_norm": 0.04040999725558835, + "language_loss": 1.13508129, + "learning_rate": 0.0009995643671690604, + "loss": 1.1467557, + "num_input_tokens_seen": 17320288, + "router_z_loss_mlp": 1.50097656, + "step": 223, + "time_per_iteration": 2.5576720237731934 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01168153, + "balance_loss_mlp": 1.01823533, + "epoch": 0.04309349749903809, + "flos": 645867889920.0, + "grad_norm": 0.02824445481068148, + "language_loss": 1.00763512, + "learning_rate": 0.0009995512681194023, + "loss": 1.01931667, + "num_input_tokens_seen": 17396672, + "router_z_loss_mlp": 1.49609375, + "step": 224, + "time_per_iteration": 2.9571568965911865 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01167559, + "balance_loss_mlp": 1.01840472, + "epoch": 0.04328587918430166, + "flos": 832897153536.0, + "grad_norm": 0.025764365733734692, + "language_loss": 0.98235118, + "learning_rate": 0.0009995379751267417, + "loss": 0.99402678, + "num_input_tokens_seen": 17488096, + "router_z_loss_mlp": 1.48828125, + "step": 225, + "time_per_iteration": 3.2627484798431396 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01166832, + "balance_loss_mlp": 1.01824963, + "epoch": 0.043478260869565216, + "flos": 526116589056.0, + "grad_norm": 0.03531387708455554, + "language_loss": 1.00006318, + "learning_rate": 0.0009995244881962398, + "loss": 1.01173151, + "num_input_tokens_seen": 17557632, + "router_z_loss_mlp": 1.48242188, + "step": 226, + "time_per_iteration": 2.624209403991699 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01170136, + "balance_loss_mlp": 1.02212548, + "epoch": 0.04367064255482878, + "flos": 440413027584.0, + "grad_norm": 0.039279482080902435, + "language_loss": 1.01293874, + "learning_rate": 0.0009995108073331323, + "loss": 1.02464008, + "num_input_tokens_seen": 17626672, + "router_z_loss_mlp": 1.4765625, + "step": 227, + "time_per_iteration": 2.6042520999908447 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01164096, + "balance_loss_mlp": 1.01742136, + "epoch": 0.04386302424009234, + "flos": 508467677184.0, + "grad_norm": 0.03801127181345805, + "language_loss": 1.03535032, + "learning_rate": 0.0009994969325427309, + "loss": 1.04699123, + "num_input_tokens_seen": 17698624, + "router_z_loss_mlp": 1.46582031, + "step": 228, + "time_per_iteration": 2.6691603660583496 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01163607, + "balance_loss_mlp": 1.01769507, + "epoch": 0.04405540592535591, + "flos": 541744565760.0, + "grad_norm": 0.03512041362752814, + "language_loss": 1.00143218, + "learning_rate": 0.0009994828638304218, + "loss": 1.0130682, + "num_input_tokens_seen": 17767760, + "router_z_loss_mlp": 1.46191406, + "step": 229, + "time_per_iteration": 2.627833366394043 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01164617, + "balance_loss_mlp": 1.01927722, + "epoch": 0.04424778761061947, + "flos": 447309867264.0, + "grad_norm": 0.03576658395893793, + "language_loss": 1.06260157, + "learning_rate": 0.0009994686012016675, + "loss": 1.07424784, + "num_input_tokens_seen": 17833664, + "router_z_loss_mlp": 1.45703125, + "step": 230, + "time_per_iteration": 2.515491247177124 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01159156, + "balance_loss_mlp": 1.01448417, + "epoch": 0.044440169295883035, + "flos": 701982599424.0, + "grad_norm": 0.03592315304636455, + "language_loss": 1.05298328, + "learning_rate": 0.000999454144662005, + "loss": 1.06457496, + "num_input_tokens_seen": 17908880, + "router_z_loss_mlp": 1.45019531, + "step": 231, + "time_per_iteration": 2.918896436691284 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01156937, + "balance_loss_mlp": 1.01274192, + "epoch": 0.044632550981146595, + "flos": 589427536896.0, + "grad_norm": 0.032106980286660924, + "language_loss": 0.996499, + "learning_rate": 0.0009994394942170468, + "loss": 1.00806844, + "num_input_tokens_seen": 17978208, + "router_z_loss_mlp": 1.4453125, + "step": 232, + "time_per_iteration": 2.700378179550171 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01169343, + "balance_loss_mlp": 1.02524316, + "epoch": 0.04482493266641016, + "flos": 555855140352.0, + "grad_norm": 0.03061962333593277, + "language_loss": 0.97402102, + "learning_rate": 0.0009994246498724808, + "loss": 0.9857145, + "num_input_tokens_seen": 18049296, + "router_z_loss_mlp": 1.44433594, + "step": 233, + "time_per_iteration": 2.692657232284546 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01171534, + "balance_loss_mlp": 1.02848291, + "epoch": 0.04501731435167372, + "flos": 724070956800.0, + "grad_norm": 0.03598428268947968, + "language_loss": 1.00358808, + "learning_rate": 0.00099940961163407, + "loss": 1.01530337, + "num_input_tokens_seen": 18123296, + "router_z_loss_mlp": 1.43359375, + "step": 234, + "time_per_iteration": 2.8496198654174805 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01167121, + "balance_loss_mlp": 1.02473748, + "epoch": 0.04520969603693728, + "flos": 512798252544.0, + "grad_norm": 0.03236637347420306, + "language_loss": 1.0231185, + "learning_rate": 0.0009993943795076528, + "loss": 1.03478956, + "num_input_tokens_seen": 18192784, + "router_z_loss_mlp": 1.42675781, + "step": 235, + "time_per_iteration": 2.6304001808166504 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01157951, + "balance_loss_mlp": 1.01623452, + "epoch": 0.04540207772220085, + "flos": 365878555392.0, + "grad_norm": 0.04557463461025321, + "language_loss": 1.04854226, + "learning_rate": 0.0009993789534991427, + "loss": 1.06012177, + "num_input_tokens_seen": 18254064, + "router_z_loss_mlp": 1.41992188, + "step": 236, + "time_per_iteration": 2.500347852706909 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01156422, + "balance_loss_mlp": 1.01613641, + "epoch": 0.045594459407464406, + "flos": 523724323584.0, + "grad_norm": 0.028810086143122388, + "language_loss": 0.99360317, + "learning_rate": 0.0009993633336145287, + "loss": 1.00516737, + "num_input_tokens_seen": 18325728, + "router_z_loss_mlp": 1.40527344, + "step": 237, + "time_per_iteration": 2.6991968154907227 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01156358, + "balance_loss_mlp": 1.01664495, + "epoch": 0.04578684109272797, + "flos": 673116966144.0, + "grad_norm": 0.036851747197037266, + "language_loss": 1.03695393, + "learning_rate": 0.0009993475198598752, + "loss": 1.04851758, + "num_input_tokens_seen": 18408608, + "router_z_loss_mlp": 1.39941406, + "step": 238, + "time_per_iteration": 3.0150160789489746 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01160083, + "balance_loss_mlp": 1.02084696, + "epoch": 0.04597922277799153, + "flos": 542621374464.0, + "grad_norm": 0.03967898438127139, + "language_loss": 1.00323462, + "learning_rate": 0.0009993315122413212, + "loss": 1.01483548, + "num_input_tokens_seen": 18471920, + "router_z_loss_mlp": 1.39453125, + "step": 239, + "time_per_iteration": 2.6226179599761963 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0115528, + "balance_loss_mlp": 1.01690221, + "epoch": 0.0461716044632551, + "flos": 459994413312.0, + "grad_norm": 0.029756199222484733, + "language_loss": 1.00536144, + "learning_rate": 0.0009993153107650818, + "loss": 1.01691425, + "num_input_tokens_seen": 18540496, + "router_z_loss_mlp": 1.38574219, + "step": 240, + "time_per_iteration": 2.635673999786377 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01154071, + "balance_loss_mlp": 1.01607406, + "epoch": 0.04636398614851866, + "flos": 456171261696.0, + "grad_norm": 0.03103837756937707, + "language_loss": 0.99882519, + "learning_rate": 0.0009992989154374468, + "loss": 1.01036584, + "num_input_tokens_seen": 18606944, + "router_z_loss_mlp": 1.38183594, + "step": 241, + "time_per_iteration": 2.5449135303497314 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0115588, + "balance_loss_mlp": 1.01836014, + "epoch": 0.046556367833782225, + "flos": 557902320384.0, + "grad_norm": 0.06487144756994469, + "language_loss": 1.0686537, + "learning_rate": 0.0009992823262647817, + "loss": 1.08021247, + "num_input_tokens_seen": 18679520, + "router_z_loss_mlp": 1.37695312, + "step": 242, + "time_per_iteration": 2.705120325088501 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011561, + "balance_loss_mlp": 1.01905739, + "epoch": 0.046748749519045785, + "flos": 594088613376.0, + "grad_norm": 0.03633512017688626, + "language_loss": 1.00915635, + "learning_rate": 0.0009992655432535264, + "loss": 1.02071738, + "num_input_tokens_seen": 18756656, + "router_z_loss_mlp": 1.37207031, + "step": 243, + "time_per_iteration": 2.8158721923828125 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01160044, + "balance_loss_mlp": 1.02347767, + "epoch": 0.04694113120430935, + "flos": 570942645504.0, + "grad_norm": 0.036353271768507285, + "language_loss": 1.01172018, + "learning_rate": 0.0009992485664101973, + "loss": 1.02332067, + "num_input_tokens_seen": 18829792, + "router_z_loss_mlp": 1.3671875, + "step": 244, + "time_per_iteration": 2.723409414291382 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01156505, + "balance_loss_mlp": 1.0207969, + "epoch": 0.04713351288957291, + "flos": 865246689024.0, + "grad_norm": 0.05316255083066814, + "language_loss": 1.03417325, + "learning_rate": 0.000999231395741385, + "loss": 1.04573822, + "num_input_tokens_seen": 18906864, + "router_z_loss_mlp": 1.35839844, + "step": 245, + "time_per_iteration": 3.1441562175750732 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01155254, + "balance_loss_mlp": 1.02011812, + "epoch": 0.04732589457483648, + "flos": 538236364032.0, + "grad_norm": 0.039550829703112036, + "language_loss": 1.01375949, + "learning_rate": 0.0009992140312537557, + "loss": 1.02531195, + "num_input_tokens_seen": 18973632, + "router_z_loss_mlp": 1.35253906, + "step": 246, + "time_per_iteration": 2.6407320499420166 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01158298, + "balance_loss_mlp": 1.02402055, + "epoch": 0.04751827626010004, + "flos": 763272612096.0, + "grad_norm": 0.029332271702031103, + "language_loss": 0.96132767, + "learning_rate": 0.000999196472954051, + "loss": 0.97291064, + "num_input_tokens_seen": 19052944, + "router_z_loss_mlp": 1.34375, + "step": 247, + "time_per_iteration": 2.9791386127471924 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0115741, + "balance_loss_mlp": 1.02313232, + "epoch": 0.0477106579453636, + "flos": 1583128462080.0, + "grad_norm": 0.019406803026512872, + "language_loss": 0.79424852, + "learning_rate": 0.0009991787208490878, + "loss": 0.80582267, + "num_input_tokens_seen": 19286288, + "router_z_loss_mlp": 1.34375, + "step": 248, + "time_per_iteration": 5.547277927398682 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0115733, + "balance_loss_mlp": 1.02457833, + "epoch": 0.04790303963062716, + "flos": 458693784576.0, + "grad_norm": 0.04949407998464004, + "language_loss": 1.04053593, + "learning_rate": 0.0009991607749457578, + "loss": 1.05210924, + "num_input_tokens_seen": 19349296, + "router_z_loss_mlp": 1.328125, + "step": 249, + "time_per_iteration": 2.610372304916382 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01158188, + "balance_loss_mlp": 1.02629459, + "epoch": 0.04809542131589073, + "flos": 783787186944.0, + "grad_norm": 0.03428496832179458, + "language_loss": 1.01565814, + "learning_rate": 0.0009991426352510286, + "loss": 1.02723992, + "num_input_tokens_seen": 19428416, + "router_z_loss_mlp": 1.31933594, + "step": 250, + "time_per_iteration": 2.9723451137542725 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01158477, + "balance_loss_mlp": 1.0272516, + "epoch": 0.04828780300115429, + "flos": 560322776064.0, + "grad_norm": 0.03370153589925739, + "language_loss": 1.02967048, + "learning_rate": 0.0009991243017719422, + "loss": 1.04125512, + "num_input_tokens_seen": 19498688, + "router_z_loss_mlp": 1.3125, + "step": 251, + "time_per_iteration": 2.691317319869995 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0115263, + "balance_loss_mlp": 1.02149975, + "epoch": 0.048480184686417856, + "flos": 502922989056.0, + "grad_norm": 0.033537523086657674, + "language_loss": 0.98110956, + "learning_rate": 0.0009991057745156165, + "loss": 0.99263585, + "num_input_tokens_seen": 19567568, + "router_z_loss_mlp": 1.31152344, + "step": 252, + "time_per_iteration": 2.615726947784424 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01126877, + "balance_loss_mlp": 0.99641418, + "epoch": 0.048672566371681415, + "flos": 1539471810048.0, + "grad_norm": 0.00943295316075806, + "language_loss": 0.81910986, + "learning_rate": 0.0009990870534892446, + "loss": 0.83037865, + "num_input_tokens_seen": 19796368, + "router_z_loss_mlp": 1.3046875, + "step": 253, + "time_per_iteration": 5.119662523269653 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01155145, + "balance_loss_mlp": 1.02439594, + "epoch": 0.04886494805694498, + "flos": 538952779776.0, + "grad_norm": 0.04101934284448647, + "language_loss": 1.06555986, + "learning_rate": 0.0009990681387000943, + "loss": 1.07711136, + "num_input_tokens_seen": 19870480, + "router_z_loss_mlp": 1.30761719, + "step": 254, + "time_per_iteration": 2.7494144439697266 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153346, + "balance_loss_mlp": 1.02316916, + "epoch": 0.04905732974220854, + "flos": 681485521152.0, + "grad_norm": 0.029284228955777224, + "language_loss": 1.01195645, + "learning_rate": 0.0009990490301555093, + "loss": 1.02348995, + "num_input_tokens_seen": 19956288, + "router_z_loss_mlp": 1.30175781, + "step": 255, + "time_per_iteration": 2.9595844745635986 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113356, + "balance_loss_mlp": 1.00462341, + "epoch": 0.04924971142747211, + "flos": 1424277573120.0, + "grad_norm": 0.011666997955433429, + "language_loss": 0.79215157, + "learning_rate": 0.0009990297278629078, + "loss": 0.80348712, + "num_input_tokens_seen": 20180080, + "router_z_loss_mlp": 1.2890625, + "step": 256, + "time_per_iteration": 4.918023347854614 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01126785, + "balance_loss_mlp": 0.99822998, + "epoch": 0.04944209311273567, + "flos": 1561239381504.0, + "grad_norm": 0.006197531934497474, + "language_loss": 0.79242742, + "learning_rate": 0.000999010231829784, + "loss": 0.80369532, + "num_input_tokens_seen": 20413456, + "router_z_loss_mlp": 1.28515625, + "step": 257, + "time_per_iteration": 4.996341228485107 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01127556, + "balance_loss_mlp": 0.99976349, + "epoch": 0.04963447479799923, + "flos": 1574173748736.0, + "grad_norm": 0.01126324229515774, + "language_loss": 0.69975883, + "learning_rate": 0.0009989905420637066, + "loss": 0.71103442, + "num_input_tokens_seen": 20644736, + "router_z_loss_mlp": 1.27734375, + "step": 258, + "time_per_iteration": 4.951507329940796 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01167552, + "balance_loss_mlp": 1.03966403, + "epoch": 0.049826856483262794, + "flos": 626499386880.0, + "grad_norm": 0.07394024090910019, + "language_loss": 0.96613419, + "learning_rate": 0.0009989706585723202, + "loss": 0.97780967, + "num_input_tokens_seen": 20719040, + "router_z_loss_mlp": 1.27832031, + "step": 259, + "time_per_iteration": 2.819796085357666 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01158437, + "balance_loss_mlp": 1.03073978, + "epoch": 0.05001923816852635, + "flos": 505156806912.0, + "grad_norm": 0.042054435700702504, + "language_loss": 1.02184892, + "learning_rate": 0.0009989505813633442, + "loss": 1.0334332, + "num_input_tokens_seen": 20789376, + "router_z_loss_mlp": 1.27636719, + "step": 260, + "time_per_iteration": 2.671597719192505 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149384, + "balance_loss_mlp": 1.02206886, + "epoch": 0.05021161985378992, + "flos": 588468102912.0, + "grad_norm": 0.05343186989039486, + "language_loss": 1.02308297, + "learning_rate": 0.000998930310444573, + "loss": 1.03457689, + "num_input_tokens_seen": 20857856, + "router_z_loss_mlp": 1.27246094, + "step": 261, + "time_per_iteration": 2.7573728561401367 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145576, + "balance_loss_mlp": 1.01883233, + "epoch": 0.05040400153905348, + "flos": 634403292672.0, + "grad_norm": 0.052960623500171895, + "language_loss": 1.00806391, + "learning_rate": 0.0009989098458238765, + "loss": 1.01951981, + "num_input_tokens_seen": 20931232, + "router_z_loss_mlp": 1.26660156, + "step": 262, + "time_per_iteration": 2.7937912940979004 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146699, + "balance_loss_mlp": 1.02033675, + "epoch": 0.050596383224317046, + "flos": 554809190400.0, + "grad_norm": 0.04531187332347281, + "language_loss": 0.99888676, + "learning_rate": 0.0009988891875091998, + "loss": 1.0103538, + "num_input_tokens_seen": 21012672, + "router_z_loss_mlp": 1.26269531, + "step": 263, + "time_per_iteration": 2.811218500137329 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145189, + "balance_loss_mlp": 1.01939976, + "epoch": 0.050788764909580605, + "flos": 550762462464.0, + "grad_norm": 0.03965392167411722, + "language_loss": 0.94696999, + "learning_rate": 0.0009988683355085636, + "loss": 0.95842183, + "num_input_tokens_seen": 21088592, + "router_z_loss_mlp": 1.25683594, + "step": 264, + "time_per_iteration": 2.7378242015838623 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141586, + "balance_loss_mlp": 1.01617777, + "epoch": 0.05098114659484417, + "flos": 606345448704.0, + "grad_norm": 0.024717188615823983, + "language_loss": 1.02827787, + "learning_rate": 0.000998847289830063, + "loss": 1.03969371, + "num_input_tokens_seen": 21169840, + "router_z_loss_mlp": 1.25292969, + "step": 265, + "time_per_iteration": 2.8625917434692383 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142152, + "balance_loss_mlp": 1.01693416, + "epoch": 0.05117352828010773, + "flos": 439473035520.0, + "grad_norm": 0.036783183293041616, + "language_loss": 0.96527213, + "learning_rate": 0.0009988260504818682, + "loss": 0.97669363, + "num_input_tokens_seen": 21236144, + "router_z_loss_mlp": 1.25097656, + "step": 266, + "time_per_iteration": 2.5658230781555176 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138539, + "balance_loss_mlp": 1.0135119, + "epoch": 0.0513659099653713, + "flos": 506031670272.0, + "grad_norm": 0.04116504124695153, + "language_loss": 1.03285778, + "learning_rate": 0.000998804617472226, + "loss": 1.0442431, + "num_input_tokens_seen": 21304864, + "router_z_loss_mlp": 1.24902344, + "step": 267, + "time_per_iteration": 2.63395094871521 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138256, + "balance_loss_mlp": 1.01418352, + "epoch": 0.05155829165063486, + "flos": 696715922688.0, + "grad_norm": 0.034853618125567455, + "language_loss": 0.98327756, + "learning_rate": 0.0009987829908094568, + "loss": 0.9946602, + "num_input_tokens_seen": 21377504, + "router_z_loss_mlp": 1.23925781, + "step": 268, + "time_per_iteration": 2.8239262104034424 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136912, + "balance_loss_mlp": 1.01331627, + "epoch": 0.051750673335898424, + "flos": 1350302059008.0, + "grad_norm": 0.042488112993129025, + "language_loss": 1.04893267, + "learning_rate": 0.0009987611705019569, + "loss": 1.0603019, + "num_input_tokens_seen": 21463840, + "router_z_loss_mlp": 1.234375, + "step": 269, + "time_per_iteration": 4.33854079246521 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137181, + "balance_loss_mlp": 1.01387095, + "epoch": 0.051943055021161984, + "flos": 490590331392.0, + "grad_norm": 0.037116049987967636, + "language_loss": 1.03026497, + "learning_rate": 0.0009987391565581978, + "loss": 1.04163671, + "num_input_tokens_seen": 21531184, + "router_z_loss_mlp": 1.23144531, + "step": 270, + "time_per_iteration": 2.609722852706909 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136969, + "balance_loss_mlp": 1.01365864, + "epoch": 0.05213543670642555, + "flos": 546880985088.0, + "grad_norm": 0.03927026934880779, + "language_loss": 0.95517516, + "learning_rate": 0.000998716948986726, + "loss": 0.96654487, + "num_input_tokens_seen": 21612224, + "router_z_loss_mlp": 1.23144531, + "step": 271, + "time_per_iteration": 2.797673225402832 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137765, + "balance_loss_mlp": 1.01512277, + "epoch": 0.05232781839168911, + "flos": 604673489664.0, + "grad_norm": 0.04118655717732696, + "language_loss": 0.97937191, + "learning_rate": 0.0009986945477961633, + "loss": 0.9907496, + "num_input_tokens_seen": 21681024, + "router_z_loss_mlp": 1.22460938, + "step": 272, + "time_per_iteration": 2.6988775730133057 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135438, + "balance_loss_mlp": 1.01336777, + "epoch": 0.052520200076952676, + "flos": 539656556544.0, + "grad_norm": 0.027940819886650203, + "language_loss": 1.02222085, + "learning_rate": 0.0009986719529952066, + "loss": 1.0335753, + "num_input_tokens_seen": 21761616, + "router_z_loss_mlp": 1.21875, + "step": 273, + "time_per_iteration": 2.9503016471862793 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01133251, + "balance_loss_mlp": 1.01175284, + "epoch": 0.052712581762216236, + "flos": 464333736960.0, + "grad_norm": 0.036678205813438995, + "language_loss": 1.02377117, + "learning_rate": 0.000998649164592628, + "loss": 1.0351038, + "num_input_tokens_seen": 21828416, + "router_z_loss_mlp": 1.21289062, + "step": 274, + "time_per_iteration": 2.575183868408203 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134193, + "balance_loss_mlp": 1.01279056, + "epoch": 0.0529049634474798, + "flos": 549106054656.0, + "grad_norm": 0.029580362230619023, + "language_loss": 1.00386071, + "learning_rate": 0.0009986261825972748, + "loss": 1.01520276, + "num_input_tokens_seen": 21901600, + "router_z_loss_mlp": 1.21191406, + "step": 275, + "time_per_iteration": 2.781388521194458 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136428, + "balance_loss_mlp": 1.01578796, + "epoch": 0.05309734513274336, + "flos": 619201081344.0, + "grad_norm": 0.028327187192750843, + "language_loss": 1.01742268, + "learning_rate": 0.000998603007018069, + "loss": 1.0287869, + "num_input_tokens_seen": 21979312, + "router_z_loss_mlp": 1.20410156, + "step": 276, + "time_per_iteration": 2.8231008052825928 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137197, + "balance_loss_mlp": 1.01665294, + "epoch": 0.05328972681800693, + "flos": 606618602496.0, + "grad_norm": 0.02408735734832513, + "language_loss": 1.00149679, + "learning_rate": 0.0009985796378640089, + "loss": 1.01286888, + "num_input_tokens_seen": 22053776, + "router_z_loss_mlp": 1.203125, + "step": 277, + "time_per_iteration": 2.721719264984131 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136169, + "balance_loss_mlp": 1.01610124, + "epoch": 0.05348210850327049, + "flos": 605731100160.0, + "grad_norm": 0.0319931943489141, + "language_loss": 0.99697894, + "learning_rate": 0.0009985560751441665, + "loss": 1.0083406, + "num_input_tokens_seen": 22134304, + "router_z_loss_mlp": 1.19824219, + "step": 278, + "time_per_iteration": 2.835160255432129 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01133809, + "balance_loss_mlp": 1.01412332, + "epoch": 0.053674490188534055, + "flos": 631998388224.0, + "grad_norm": 0.030840524384760076, + "language_loss": 1.0228467, + "learning_rate": 0.00099853231886769, + "loss": 1.03418469, + "num_input_tokens_seen": 22212896, + "router_z_loss_mlp": 1.19433594, + "step": 279, + "time_per_iteration": 2.8541102409362793 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01131641, + "balance_loss_mlp": 1.01243138, + "epoch": 0.053866871873797614, + "flos": 480174596352.0, + "grad_norm": 0.030057370429500904, + "language_loss": 1.01521945, + "learning_rate": 0.0009985083690438024, + "loss": 1.02653599, + "num_input_tokens_seen": 22287216, + "router_z_loss_mlp": 1.18945312, + "step": 280, + "time_per_iteration": 2.778996706008911 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01133594, + "balance_loss_mlp": 1.01514757, + "epoch": 0.054059253559061174, + "flos": 789490322688.0, + "grad_norm": 0.030570218765999514, + "language_loss": 0.92515564, + "learning_rate": 0.0009984842256818016, + "loss": 0.93649161, + "num_input_tokens_seen": 22370864, + "router_z_loss_mlp": 1.18164062, + "step": 281, + "time_per_iteration": 3.113694429397583 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137735, + "balance_loss_mlp": 1.01928854, + "epoch": 0.05425163524432474, + "flos": 629506000896.0, + "grad_norm": 0.043548376252248826, + "language_loss": 1.03102541, + "learning_rate": 0.0009984598887910613, + "loss": 1.04240274, + "num_input_tokens_seen": 22440080, + "router_z_loss_mlp": 1.18164062, + "step": 282, + "time_per_iteration": 2.8303444385528564 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01132534, + "balance_loss_mlp": 1.01504183, + "epoch": 0.0544440169295883, + "flos": 616993508352.0, + "grad_norm": 0.05077708884656826, + "language_loss": 0.98823464, + "learning_rate": 0.0009984353583810297, + "loss": 0.99956, + "num_input_tokens_seen": 22517936, + "router_z_loss_mlp": 1.171875, + "step": 283, + "time_per_iteration": 2.835850954055786 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01129981, + "balance_loss_mlp": 1.01315546, + "epoch": 0.05463639861485187, + "flos": 648930884352.0, + "grad_norm": 0.03524270200319673, + "language_loss": 1.0117259, + "learning_rate": 0.0009984106344612302, + "loss": 1.02302563, + "num_input_tokens_seen": 22590480, + "router_z_loss_mlp": 1.16503906, + "step": 284, + "time_per_iteration": 2.760528564453125 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01129453, + "balance_loss_mlp": 1.01319993, + "epoch": 0.054828780300115426, + "flos": 798585987072.0, + "grad_norm": 0.03078454247465455, + "language_loss": 0.96210134, + "learning_rate": 0.0009983857170412615, + "loss": 0.97339588, + "num_input_tokens_seen": 22668144, + "router_z_loss_mlp": 1.15917969, + "step": 285, + "time_per_iteration": 2.9911587238311768 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01131741, + "balance_loss_mlp": 1.01567924, + "epoch": 0.05502116198537899, + "flos": 550799400960.0, + "grad_norm": 0.028192528419898312, + "language_loss": 0.95645988, + "learning_rate": 0.000998360606130798, + "loss": 0.96777725, + "num_input_tokens_seen": 22749648, + "router_z_loss_mlp": 1.15722656, + "step": 286, + "time_per_iteration": 2.8603405952453613 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01119957, + "balance_loss_mlp": 1.00475311, + "epoch": 0.05521354367064255, + "flos": 1410909659136.0, + "grad_norm": 0.016802553847575376, + "language_loss": 0.69073117, + "learning_rate": 0.0009983353017395877, + "loss": 0.70193076, + "num_input_tokens_seen": 22982752, + "router_z_loss_mlp": 1.1484375, + "step": 287, + "time_per_iteration": 4.872994899749756 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139216, + "balance_loss_mlp": 1.02372622, + "epoch": 0.05540592535590612, + "flos": 646612495872.0, + "grad_norm": 0.03160477576624613, + "language_loss": 1.01500821, + "learning_rate": 0.0009983098038774552, + "loss": 1.02640033, + "num_input_tokens_seen": 23053584, + "router_z_loss_mlp": 1.15136719, + "step": 288, + "time_per_iteration": 2.7645044326782227 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01119652, + "balance_loss_mlp": 1.00521088, + "epoch": 0.05559830704116968, + "flos": 1514318512896.0, + "grad_norm": 0.011772143096286682, + "language_loss": 0.78170228, + "learning_rate": 0.0009982841125542993, + "loss": 0.79289877, + "num_input_tokens_seen": 23280256, + "router_z_loss_mlp": 1.140625, + "step": 289, + "time_per_iteration": 4.783201456069946 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150059, + "balance_loss_mlp": 1.03542745, + "epoch": 0.055790688726433245, + "flos": 509335737600.0, + "grad_norm": 0.037615798403722346, + "language_loss": 1.00063777, + "learning_rate": 0.0009982582277800948, + "loss": 1.01213825, + "num_input_tokens_seen": 23345760, + "router_z_loss_mlp": 1.14257812, + "step": 290, + "time_per_iteration": 2.5825588703155518 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142418, + "balance_loss_mlp": 1.02873969, + "epoch": 0.055983070411696804, + "flos": 659075410944.0, + "grad_norm": 0.03490310528255379, + "language_loss": 1.06654799, + "learning_rate": 0.0009982321495648908, + "loss": 1.07797217, + "num_input_tokens_seen": 23420720, + "router_z_loss_mlp": 1.13671875, + "step": 291, + "time_per_iteration": 2.8099231719970703 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137522, + "balance_loss_mlp": 1.02470279, + "epoch": 0.05617545209696037, + "flos": 588476851200.0, + "grad_norm": 0.035465642673631545, + "language_loss": 0.97683877, + "learning_rate": 0.0009982058779188115, + "loss": 0.98821402, + "num_input_tokens_seen": 23492576, + "router_z_loss_mlp": 1.13183594, + "step": 292, + "time_per_iteration": 2.7125580310821533 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136096, + "balance_loss_mlp": 1.02384841, + "epoch": 0.05636783378222393, + "flos": 612788332800.0, + "grad_norm": 0.032210362870472055, + "language_loss": 1.05647731, + "learning_rate": 0.0009981794128520567, + "loss": 1.06783831, + "num_input_tokens_seen": 23569824, + "router_z_loss_mlp": 1.12597656, + "step": 293, + "time_per_iteration": 2.7916390895843506 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135514, + "balance_loss_mlp": 1.0241251, + "epoch": 0.0565602154674875, + "flos": 669424071936.0, + "grad_norm": 0.03595229916115603, + "language_loss": 1.02550793, + "learning_rate": 0.000998152754374901, + "loss": 1.03686309, + "num_input_tokens_seen": 23649984, + "router_z_loss_mlp": 1.1171875, + "step": 294, + "time_per_iteration": 2.8770558834075928 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134115, + "balance_loss_mlp": 1.0227263, + "epoch": 0.05675259715275106, + "flos": 618365101824.0, + "grad_norm": 0.028486588423889302, + "language_loss": 0.98274708, + "learning_rate": 0.0009981259024976943, + "loss": 0.99408829, + "num_input_tokens_seen": 23722032, + "router_z_loss_mlp": 1.1171875, + "step": 295, + "time_per_iteration": 2.729853630065918 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01133246, + "balance_loss_mlp": 1.02204788, + "epoch": 0.05694497883801462, + "flos": 753154330368.0, + "grad_norm": 0.04188437456637708, + "language_loss": 0.968624, + "learning_rate": 0.0009980988572308612, + "loss": 0.97995651, + "num_input_tokens_seen": 23797376, + "router_z_loss_mlp": 1.11523438, + "step": 296, + "time_per_iteration": 3.0135345458984375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01132496, + "balance_loss_mlp": 1.02187026, + "epoch": 0.05713736052327818, + "flos": 713382067968.0, + "grad_norm": 0.0305883196599643, + "language_loss": 0.9903996, + "learning_rate": 0.0009980716185849015, + "loss": 1.0017246, + "num_input_tokens_seen": 23880496, + "router_z_loss_mlp": 1.109375, + "step": 297, + "time_per_iteration": 2.9962668418884277 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01129278, + "balance_loss_mlp": 1.01865172, + "epoch": 0.05732974220854175, + "flos": 469936750848.0, + "grad_norm": 0.029025981508343963, + "language_loss": 0.95620793, + "learning_rate": 0.0009980441865703904, + "loss": 0.96750069, + "num_input_tokens_seen": 23950016, + "router_z_loss_mlp": 1.109375, + "step": 298, + "time_per_iteration": 2.67486572265625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01126421, + "balance_loss_mlp": 1.0163666, + "epoch": 0.05752212389380531, + "flos": 602541739008.0, + "grad_norm": 0.028406065642448373, + "language_loss": 1.04190016, + "learning_rate": 0.000998016561197978, + "loss": 1.05316436, + "num_input_tokens_seen": 24020064, + "router_z_loss_mlp": 1.10351562, + "step": 299, + "time_per_iteration": 2.7435965538024902 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01127499, + "balance_loss_mlp": 1.01773107, + "epoch": 0.057714505579068875, + "flos": 679950622464.0, + "grad_norm": 0.02999406165417261, + "language_loss": 0.957955, + "learning_rate": 0.0009979887424783895, + "loss": 0.96922994, + "num_input_tokens_seen": 24095360, + "router_z_loss_mlp": 1.10058594, + "step": 300, + "time_per_iteration": 2.868412494659424 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01127678, + "balance_loss_mlp": 1.01800561, + "epoch": 0.057906887264332435, + "flos": 597012602112.0, + "grad_norm": 0.033381964405594114, + "language_loss": 0.95279002, + "learning_rate": 0.0009979607304224248, + "loss": 0.96406674, + "num_input_tokens_seen": 24164608, + "router_z_loss_mlp": 1.09960938, + "step": 301, + "time_per_iteration": 2.7196099758148193 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01127179, + "balance_loss_mlp": 1.01760185, + "epoch": 0.058099268949596, + "flos": 553165421568.0, + "grad_norm": 0.029428698202492602, + "language_loss": 1.02305853, + "learning_rate": 0.000997932525040959, + "loss": 1.03433037, + "num_input_tokens_seen": 24233840, + "router_z_loss_mlp": 1.09863281, + "step": 302, + "time_per_iteration": 2.645131826400757 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01126073, + "balance_loss_mlp": 1.0166868, + "epoch": 0.05829165063485956, + "flos": 509231725056.0, + "grad_norm": 0.033454482596205204, + "language_loss": 1.04832363, + "learning_rate": 0.000997904126344943, + "loss": 1.05958426, + "num_input_tokens_seen": 24302928, + "router_z_loss_mlp": 1.09667969, + "step": 303, + "time_per_iteration": 2.60955810546875 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125584, + "balance_loss_mlp": 1.0157212, + "epoch": 0.05848403232012313, + "flos": 616363608576.0, + "grad_norm": 0.0319979050325151, + "language_loss": 1.00779867, + "learning_rate": 0.0009978755343454018, + "loss": 1.01905453, + "num_input_tokens_seen": 24377024, + "router_z_loss_mlp": 1.1015625, + "step": 304, + "time_per_iteration": 2.733825206756592 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124254, + "balance_loss_mlp": 1.01467645, + "epoch": 0.05867641400538669, + "flos": 501079943424.0, + "grad_norm": 0.03385536533959698, + "language_loss": 1.01509869, + "learning_rate": 0.0009978467490534355, + "loss": 1.0263412, + "num_input_tokens_seen": 24442736, + "router_z_loss_mlp": 1.09863281, + "step": 305, + "time_per_iteration": 2.6263206005096436 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121932, + "balance_loss_mlp": 1.01292717, + "epoch": 0.05886879569065025, + "flos": 532379638272.0, + "grad_norm": 0.03088897761094542, + "language_loss": 0.98605353, + "learning_rate": 0.00099781777048022, + "loss": 0.99727285, + "num_input_tokens_seen": 24514800, + "router_z_loss_mlp": 1.09277344, + "step": 306, + "time_per_iteration": 2.7351841926574707 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122107, + "balance_loss_mlp": 1.01329267, + "epoch": 0.05906117737591381, + "flos": 490041111552.0, + "grad_norm": 0.034758856969872284, + "language_loss": 0.99957371, + "learning_rate": 0.0009977885986370057, + "loss": 1.01079476, + "num_input_tokens_seen": 24581648, + "router_z_loss_mlp": 1.09082031, + "step": 307, + "time_per_iteration": 2.566316843032837 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01120423, + "balance_loss_mlp": 1.01199007, + "epoch": 0.05925355906117737, + "flos": 592710216960.0, + "grad_norm": 0.0408216139096099, + "language_loss": 0.95604599, + "learning_rate": 0.000997759233535118, + "loss": 0.96725023, + "num_input_tokens_seen": 24658864, + "router_z_loss_mlp": 1.08691406, + "step": 308, + "time_per_iteration": 2.781667470932007 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01119623, + "balance_loss_mlp": 1.01147592, + "epoch": 0.05944594074644094, + "flos": 564788466432.0, + "grad_norm": 0.03543125546238922, + "language_loss": 1.01945186, + "learning_rate": 0.0009977296751859576, + "loss": 1.03064811, + "num_input_tokens_seen": 24735808, + "router_z_loss_mlp": 1.08398438, + "step": 309, + "time_per_iteration": 2.778700828552246 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121487, + "balance_loss_mlp": 1.0137223, + "epoch": 0.0596383224317045, + "flos": 539808201216.0, + "grad_norm": 0.03208598270087784, + "language_loss": 1.03591859, + "learning_rate": 0.0009976999236009998, + "loss": 1.04713345, + "num_input_tokens_seen": 24807744, + "router_z_loss_mlp": 1.08007812, + "step": 310, + "time_per_iteration": 2.790116786956787 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121449, + "balance_loss_mlp": 1.01387453, + "epoch": 0.059830704116968066, + "flos": 562053060864.0, + "grad_norm": 0.03260901983169028, + "language_loss": 1.05564129, + "learning_rate": 0.0009976699787917955, + "loss": 1.06685579, + "num_input_tokens_seen": 24876640, + "router_z_loss_mlp": 1.078125, + "step": 311, + "time_per_iteration": 2.6586148738861084 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108932, + "balance_loss_mlp": 1.00326538, + "epoch": 0.060023085802231625, + "flos": 1574050294272.0, + "grad_norm": 0.018314702584398344, + "language_loss": 0.73442996, + "learning_rate": 0.00099763984076997, + "loss": 0.74551928, + "num_input_tokens_seen": 25110864, + "router_z_loss_mlp": 1.05859375, + "step": 312, + "time_per_iteration": 4.943182945251465 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01128012, + "balance_loss_mlp": 1.02101004, + "epoch": 0.06021546748749519, + "flos": 483628363008.0, + "grad_norm": 0.04396023920554742, + "language_loss": 0.97026515, + "learning_rate": 0.0009976095095472243, + "loss": 0.98154521, + "num_input_tokens_seen": 25179328, + "router_z_loss_mlp": 1.07226562, + "step": 313, + "time_per_iteration": 2.619016408920288 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01131165, + "balance_loss_mlp": 1.02425838, + "epoch": 0.06040784917275875, + "flos": 621424205568.0, + "grad_norm": 0.03687701456451143, + "language_loss": 0.97965562, + "learning_rate": 0.0009975789851353334, + "loss": 0.99096727, + "num_input_tokens_seen": 25254128, + "router_z_loss_mlp": 1.07128906, + "step": 314, + "time_per_iteration": 2.8331894874572754 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125758, + "balance_loss_mlp": 1.01980519, + "epoch": 0.06060023085802232, + "flos": 484603348224.0, + "grad_norm": 0.029408756794299912, + "language_loss": 1.00726843, + "learning_rate": 0.0009975482675461487, + "loss": 1.01852608, + "num_input_tokens_seen": 25324624, + "router_z_loss_mlp": 1.06152344, + "step": 315, + "time_per_iteration": 2.659079074859619 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125971, + "balance_loss_mlp": 1.02001762, + "epoch": 0.06079261254328588, + "flos": 582986598144.0, + "grad_norm": 0.027344501346145803, + "language_loss": 0.98408186, + "learning_rate": 0.0009975173567915952, + "loss": 0.99534154, + "num_input_tokens_seen": 25393648, + "router_z_loss_mlp": 1.06152344, + "step": 316, + "time_per_iteration": 2.6947872638702393 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01123736, + "balance_loss_mlp": 1.01873684, + "epoch": 0.060984994228549444, + "flos": 689009348352.0, + "grad_norm": 0.03553374767777348, + "language_loss": 0.92618632, + "learning_rate": 0.000997486252883674, + "loss": 0.93742371, + "num_input_tokens_seen": 25469152, + "router_z_loss_mlp": 1.05175781, + "step": 317, + "time_per_iteration": 2.8523428440093994 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01123139, + "balance_loss_mlp": 1.01861632, + "epoch": 0.061177375913813004, + "flos": 1316749104384.0, + "grad_norm": 0.03506621320439297, + "language_loss": 0.97693729, + "learning_rate": 0.0009974549558344602, + "loss": 0.98816866, + "num_input_tokens_seen": 25560944, + "router_z_loss_mlp": 1.046875, + "step": 318, + "time_per_iteration": 3.705524206161499 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121913, + "balance_loss_mlp": 1.01805806, + "epoch": 0.06136975759907657, + "flos": 575401532928.0, + "grad_norm": 0.03493031867187039, + "language_loss": 1.07333064, + "learning_rate": 0.000997423465656105, + "loss": 1.08454978, + "num_input_tokens_seen": 25631424, + "router_z_loss_mlp": 1.04003906, + "step": 319, + "time_per_iteration": 2.75838565826416 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01119126, + "balance_loss_mlp": 1.01546133, + "epoch": 0.06156213928434013, + "flos": 528565234944.0, + "grad_norm": 0.037170039701900144, + "language_loss": 1.04350638, + "learning_rate": 0.0009973917823608335, + "loss": 1.05469775, + "num_input_tokens_seen": 25698176, + "router_z_loss_mlp": 1.03808594, + "step": 320, + "time_per_iteration": 2.6494460105895996 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117676, + "balance_loss_mlp": 1.01458335, + "epoch": 0.061754520969603696, + "flos": 496590920448.0, + "grad_norm": 0.030464742512101767, + "language_loss": 0.98981547, + "learning_rate": 0.0009973599059609462, + "loss": 1.00099218, + "num_input_tokens_seen": 25773472, + "router_z_loss_mlp": 1.03222656, + "step": 321, + "time_per_iteration": 2.7119081020355225 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116635, + "balance_loss_mlp": 1.01344728, + "epoch": 0.061946902654867256, + "flos": 441044872704.0, + "grad_norm": 0.031106795532346753, + "language_loss": 0.97035432, + "learning_rate": 0.000997327836468819, + "loss": 0.98152065, + "num_input_tokens_seen": 25841088, + "router_z_loss_mlp": 1.03320312, + "step": 322, + "time_per_iteration": 2.641977071762085 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121262, + "balance_loss_mlp": 1.01836073, + "epoch": 0.06213928434013082, + "flos": 600043515648.0, + "grad_norm": 0.031546338171402045, + "language_loss": 1.00120687, + "learning_rate": 0.000997295573896902, + "loss": 1.01241946, + "num_input_tokens_seen": 25919424, + "router_z_loss_mlp": 1.03027344, + "step": 323, + "time_per_iteration": 2.825425624847412 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113502, + "balance_loss_mlp": 1.01126862, + "epoch": 0.06233166602539438, + "flos": 1453116961536.0, + "grad_norm": 0.009515746361157745, + "language_loss": 0.8119604, + "learning_rate": 0.000997263118257721, + "loss": 0.82309544, + "num_input_tokens_seen": 26135504, + "router_z_loss_mlp": 1.0234375, + "step": 324, + "time_per_iteration": 4.7325074672698975 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108894, + "balance_loss_mlp": 1.0074234, + "epoch": 0.06252404771065795, + "flos": 1466631651072.0, + "grad_norm": 0.010337204897298672, + "language_loss": 0.78571939, + "learning_rate": 0.0009972304695638763, + "loss": 0.79680836, + "num_input_tokens_seen": 26358880, + "router_z_loss_mlp": 1.015625, + "step": 325, + "time_per_iteration": 4.845058917999268 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01131262, + "balance_loss_mlp": 1.02950513, + "epoch": 0.06271642939592151, + "flos": 465236790528.0, + "grad_norm": 0.04479189972062717, + "language_loss": 0.94122899, + "learning_rate": 0.000997197627828043, + "loss": 0.95254159, + "num_input_tokens_seen": 26425888, + "router_z_loss_mlp": 1.01855469, + "step": 326, + "time_per_iteration": 2.531477689743042 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136139, + "balance_loss_mlp": 1.03466833, + "epoch": 0.06290881108118507, + "flos": 533432391168.0, + "grad_norm": 0.03210871152906133, + "language_loss": 0.89633012, + "learning_rate": 0.0009971645930629716, + "loss": 0.9076916, + "num_input_tokens_seen": 26500656, + "router_z_loss_mlp": 1.015625, + "step": 327, + "time_per_iteration": 2.766155481338501 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01131438, + "balance_loss_mlp": 1.0305388, + "epoch": 0.06310119276644863, + "flos": 674768516352.0, + "grad_norm": 0.03217671154768682, + "language_loss": 1.03418863, + "learning_rate": 0.0009971313652814872, + "loss": 1.0455029, + "num_input_tokens_seen": 26577408, + "router_z_loss_mlp": 1.00976562, + "step": 328, + "time_per_iteration": 2.818718433380127 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125209, + "balance_loss_mlp": 1.02440596, + "epoch": 0.0632935744517122, + "flos": 772051381248.0, + "grad_norm": 0.03902843256426295, + "language_loss": 1.00692391, + "learning_rate": 0.0009970979444964903, + "loss": 1.01817608, + "num_input_tokens_seen": 26652048, + "router_z_loss_mlp": 1.00878906, + "step": 329, + "time_per_iteration": 2.9847218990325928 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01119216, + "balance_loss_mlp": 1.01869905, + "epoch": 0.06348595613697576, + "flos": 562975556352.0, + "grad_norm": 0.040034835413812295, + "language_loss": 1.01797342, + "learning_rate": 0.0009970643307209556, + "loss": 1.02916563, + "num_input_tokens_seen": 26728192, + "router_z_loss_mlp": 1.00585938, + "step": 330, + "time_per_iteration": 2.817711353302002 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112644, + "balance_loss_mlp": 1.01250839, + "epoch": 0.06367833782223932, + "flos": 677384358144.0, + "grad_norm": 0.031424074947949916, + "language_loss": 0.98358697, + "learning_rate": 0.0009970305239679334, + "loss": 0.99471337, + "num_input_tokens_seen": 26798016, + "router_z_loss_mlp": 1.00195312, + "step": 331, + "time_per_iteration": 2.8216280937194824 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011128, + "balance_loss_mlp": 1.01247358, + "epoch": 0.06387071950750288, + "flos": 496349847552.0, + "grad_norm": 0.04016029313197435, + "language_loss": 1.03082633, + "learning_rate": 0.0009969965242505483, + "loss": 1.04195428, + "num_input_tokens_seen": 26867536, + "router_z_loss_mlp": 1.00390625, + "step": 332, + "time_per_iteration": 2.631326675415039 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113411, + "balance_loss_mlp": 1.01317954, + "epoch": 0.06406310119276645, + "flos": 534557075712.0, + "grad_norm": 0.03761595064373852, + "language_loss": 0.99054992, + "learning_rate": 0.0009969623315820007, + "loss": 1.00168395, + "num_input_tokens_seen": 26941216, + "router_z_loss_mlp": 1.00292969, + "step": 333, + "time_per_iteration": 2.6700048446655273 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113642, + "balance_loss_mlp": 1.01369655, + "epoch": 0.06425548287803001, + "flos": 457165688832.0, + "grad_norm": 0.0356255093132357, + "language_loss": 0.99075055, + "learning_rate": 0.000996927945975565, + "loss": 1.00188696, + "num_input_tokens_seen": 27006560, + "router_z_loss_mlp": 0.99951172, + "step": 334, + "time_per_iteration": 2.567225933074951 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112774, + "balance_loss_mlp": 1.01282871, + "epoch": 0.06444786456329357, + "flos": 561123762432.0, + "grad_norm": 0.034265188200332725, + "language_loss": 0.96451521, + "learning_rate": 0.0009968933674445906, + "loss": 0.97564298, + "num_input_tokens_seen": 27076400, + "router_z_loss_mlp": 0.99951172, + "step": 335, + "time_per_iteration": 2.6834452152252197 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110954, + "balance_loss_mlp": 1.01100898, + "epoch": 0.06464024624855713, + "flos": 667357449984.0, + "grad_norm": 0.026754476738251005, + "language_loss": 0.980811, + "learning_rate": 0.0009968585960025028, + "loss": 0.99192053, + "num_input_tokens_seen": 27158672, + "router_z_loss_mlp": 0.99853516, + "step": 336, + "time_per_iteration": 2.9675402641296387 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112488, + "balance_loss_mlp": 1.01368713, + "epoch": 0.0648326279338207, + "flos": 1524558303744.0, + "grad_norm": 0.027483244216433014, + "language_loss": 0.77653188, + "learning_rate": 0.0009968236316628006, + "loss": 0.78765678, + "num_input_tokens_seen": 27380592, + "router_z_loss_mlp": 0.98632812, + "step": 337, + "time_per_iteration": 4.80242133140564 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115066, + "balance_loss_mlp": 1.01540756, + "epoch": 0.06502500961908426, + "flos": 1145216581632.0, + "grad_norm": 0.03509421691107687, + "language_loss": 0.96500707, + "learning_rate": 0.0009967884744390583, + "loss": 0.97615772, + "num_input_tokens_seen": 27469984, + "router_z_loss_mlp": 0.99414062, + "step": 338, + "time_per_iteration": 3.517488479614258 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118827, + "balance_loss_mlp": 1.01945412, + "epoch": 0.06521739130434782, + "flos": 583694265600.0, + "grad_norm": 0.03507378265000135, + "language_loss": 0.97375119, + "learning_rate": 0.0009967531243449256, + "loss": 0.98493946, + "num_input_tokens_seen": 27543904, + "router_z_loss_mlp": 0.9921875, + "step": 339, + "time_per_iteration": 2.713430404663086 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01119012, + "balance_loss_mlp": 1.02002037, + "epoch": 0.06540977298961138, + "flos": 498659487744.0, + "grad_norm": 0.03215705196534619, + "language_loss": 1.04762673, + "learning_rate": 0.000996717581394126, + "loss": 1.05881691, + "num_input_tokens_seen": 27609888, + "router_z_loss_mlp": 0.98876953, + "step": 340, + "time_per_iteration": 2.5391135215759277 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116775, + "balance_loss_mlp": 1.01787901, + "epoch": 0.06560215467487496, + "flos": 543904506624.0, + "grad_norm": 0.030763143460584817, + "language_loss": 1.05044627, + "learning_rate": 0.000996681845600459, + "loss": 1.06161404, + "num_input_tokens_seen": 27683936, + "router_z_loss_mlp": 0.98632812, + "step": 341, + "time_per_iteration": 2.670804262161255 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118249, + "balance_loss_mlp": 1.01963949, + "epoch": 0.06579453636013852, + "flos": 414351819264.0, + "grad_norm": 0.040583240554979534, + "language_loss": 0.9744029, + "learning_rate": 0.0009966459169777982, + "loss": 0.98558539, + "num_input_tokens_seen": 27747840, + "router_z_loss_mlp": 0.98388672, + "step": 342, + "time_per_iteration": 2.5040364265441895 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115105, + "balance_loss_mlp": 1.01706719, + "epoch": 0.06598691804540208, + "flos": 561681730560.0, + "grad_norm": 0.04164342519277061, + "language_loss": 1.05655766, + "learning_rate": 0.0009966097955400924, + "loss": 1.06770873, + "num_input_tokens_seen": 27819728, + "router_z_loss_mlp": 0.97949219, + "step": 343, + "time_per_iteration": 2.666548728942871 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112413, + "balance_loss_mlp": 1.01532912, + "epoch": 0.06617929973066564, + "flos": 573302830080.0, + "grad_norm": 0.03386977599556249, + "language_loss": 0.99970496, + "learning_rate": 0.0009965734813013652, + "loss": 1.01082909, + "num_input_tokens_seen": 27893536, + "router_z_loss_mlp": 0.97070312, + "step": 344, + "time_per_iteration": 2.8448328971862793 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109743, + "balance_loss_mlp": 1.01261127, + "epoch": 0.06637168141592921, + "flos": 491465194752.0, + "grad_norm": 0.03376822413453626, + "language_loss": 1.02026749, + "learning_rate": 0.0009965369742757151, + "loss": 1.03136492, + "num_input_tokens_seen": 27960976, + "router_z_loss_mlp": 0.97119141, + "step": 345, + "time_per_iteration": 2.568521738052368 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108276, + "balance_loss_mlp": 1.01176453, + "epoch": 0.06656406310119277, + "flos": 1081039518720.0, + "grad_norm": 0.03449730062562062, + "language_loss": 0.98245382, + "learning_rate": 0.0009965002744773152, + "loss": 0.99353665, + "num_input_tokens_seen": 28050864, + "router_z_loss_mlp": 0.96484375, + "step": 346, + "time_per_iteration": 3.501471519470215 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109602, + "balance_loss_mlp": 1.01347148, + "epoch": 0.06675644478645633, + "flos": 514723923456.0, + "grad_norm": 0.029121068034632647, + "language_loss": 0.95998263, + "learning_rate": 0.0009964633819204139, + "loss": 0.97107863, + "num_input_tokens_seen": 28122448, + "router_z_loss_mlp": 0.9609375, + "step": 347, + "time_per_iteration": 2.6675100326538086 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093636, + "balance_loss_mlp": 0.9986496, + "epoch": 0.06694882647171989, + "flos": 1450537079808.0, + "grad_norm": 0.008592618933675954, + "language_loss": 0.81801116, + "learning_rate": 0.0009964262966193338, + "loss": 0.82894754, + "num_input_tokens_seen": 28350352, + "router_z_loss_mlp": 0.94921875, + "step": 348, + "time_per_iteration": 4.92915415763855 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093124, + "balance_loss_mlp": 0.99832916, + "epoch": 0.06714120815698346, + "flos": 1555400152320.0, + "grad_norm": 0.006174818833869298, + "language_loss": 0.75153887, + "learning_rate": 0.000996389018588473, + "loss": 0.76247013, + "num_input_tokens_seen": 28585584, + "router_z_loss_mlp": 0.94726562, + "step": 349, + "time_per_iteration": 4.8783159255981445 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112042, + "balance_loss_mlp": 1.01719952, + "epoch": 0.06733358984224702, + "flos": 881617326336.0, + "grad_norm": 0.039044792628629706, + "language_loss": 0.95966816, + "learning_rate": 0.000996351547842304, + "loss": 0.97078854, + "num_input_tokens_seen": 28672512, + "router_z_loss_mlp": 0.94775391, + "step": 350, + "time_per_iteration": 3.151158094406128 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106972, + "balance_loss_mlp": 1.01222503, + "epoch": 0.06752597152751058, + "flos": 519918668544.0, + "grad_norm": 0.04011951728876299, + "language_loss": 0.94198334, + "learning_rate": 0.0009963138843953744, + "loss": 0.953053, + "num_input_tokens_seen": 28741520, + "router_z_loss_mlp": 0.94677734, + "step": 351, + "time_per_iteration": 2.6077194213867188 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111271, + "balance_loss_mlp": 1.01661849, + "epoch": 0.06771835321277414, + "flos": 540883308288.0, + "grad_norm": 0.02897454745239974, + "language_loss": 0.98297268, + "learning_rate": 0.000996276028262306, + "loss": 0.99408543, + "num_input_tokens_seen": 28814912, + "router_z_loss_mlp": 0.94580078, + "step": 352, + "time_per_iteration": 2.8440346717834473 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115128, + "balance_loss_mlp": 1.02052331, + "epoch": 0.0679107348980377, + "flos": 461615827968.0, + "grad_norm": 0.03358261828070724, + "language_loss": 1.05270672, + "learning_rate": 0.0009962379794577964, + "loss": 1.06385791, + "num_input_tokens_seen": 28882192, + "router_z_loss_mlp": 0.9453125, + "step": 353, + "time_per_iteration": 2.6153147220611572 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115897, + "balance_loss_mlp": 1.02129257, + "epoch": 0.06810311658330127, + "flos": 637208684544.0, + "grad_norm": 0.03193767698980152, + "language_loss": 0.94629884, + "learning_rate": 0.000996199737996617, + "loss": 0.95745778, + "num_input_tokens_seen": 28968576, + "router_z_loss_mlp": 0.9453125, + "step": 354, + "time_per_iteration": 2.9557363986968994 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01114833, + "balance_loss_mlp": 1.0208956, + "epoch": 0.06829549826856483, + "flos": 465627562752.0, + "grad_norm": 0.034421374529713736, + "language_loss": 1.03816652, + "learning_rate": 0.0009961613038936149, + "loss": 1.04931474, + "num_input_tokens_seen": 29036160, + "router_z_loss_mlp": 0.93847656, + "step": 355, + "time_per_iteration": 2.583648204803467 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112591, + "balance_loss_mlp": 1.01879704, + "epoch": 0.06848787995382839, + "flos": 635897362176.0, + "grad_norm": 0.027271592740405557, + "language_loss": 0.95725697, + "learning_rate": 0.000996122677163711, + "loss": 0.96838284, + "num_input_tokens_seen": 29112048, + "router_z_loss_mlp": 0.93701172, + "step": 356, + "time_per_iteration": 2.7997536659240723 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113642, + "balance_loss_mlp": 1.02022934, + "epoch": 0.06868026163909195, + "flos": 807781773312.0, + "grad_norm": 0.036098266403844226, + "language_loss": 1.02058005, + "learning_rate": 0.000996083857821902, + "loss": 1.03171647, + "num_input_tokens_seen": 29190960, + "router_z_loss_mlp": 0.93310547, + "step": 357, + "time_per_iteration": 3.0117554664611816 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113245, + "balance_loss_mlp": 1.01978505, + "epoch": 0.06887264332435553, + "flos": 440152512768.0, + "grad_norm": 0.03587140172627376, + "language_loss": 1.00045025, + "learning_rate": 0.0009960448458832588, + "loss": 1.01158273, + "num_input_tokens_seen": 29262832, + "router_z_loss_mlp": 0.93359375, + "step": 358, + "time_per_iteration": 2.6948373317718506 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110172, + "balance_loss_mlp": 1.01714087, + "epoch": 0.06906502500961909, + "flos": 485786358528.0, + "grad_norm": 0.028895953236024122, + "language_loss": 0.99980301, + "learning_rate": 0.000996005641362927, + "loss": 1.01090467, + "num_input_tokens_seen": 29329552, + "router_z_loss_mlp": 0.92919922, + "step": 359, + "time_per_iteration": 2.600889205932617 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110333, + "balance_loss_mlp": 1.01715922, + "epoch": 0.06925740669488265, + "flos": 734886212352.0, + "grad_norm": 0.03093408458560108, + "language_loss": 1.02453041, + "learning_rate": 0.0009959662442761274, + "loss": 1.0356338, + "num_input_tokens_seen": 29410784, + "router_z_loss_mlp": 0.93066406, + "step": 360, + "time_per_iteration": 2.9324746131896973 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107676, + "balance_loss_mlp": 1.01445436, + "epoch": 0.0694497883801462, + "flos": 553571745024.0, + "grad_norm": 0.03028505188811882, + "language_loss": 0.95860314, + "learning_rate": 0.000995926654638155, + "loss": 0.96967983, + "num_input_tokens_seen": 29486992, + "router_z_loss_mlp": 0.93115234, + "step": 361, + "time_per_iteration": 2.8280868530273438 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104746, + "balance_loss_mlp": 1.01157248, + "epoch": 0.06964217006540978, + "flos": 679244900352.0, + "grad_norm": 0.03450824772288923, + "language_loss": 0.98644811, + "learning_rate": 0.00099588687246438, + "loss": 0.99749553, + "num_input_tokens_seen": 29557232, + "router_z_loss_mlp": 0.93066406, + "step": 362, + "time_per_iteration": 2.8108932971954346 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108438, + "balance_loss_mlp": 1.01535928, + "epoch": 0.06983455175067334, + "flos": 525261167616.0, + "grad_norm": 0.03621302361184023, + "language_loss": 1.06105995, + "learning_rate": 0.0009958468977702471, + "loss": 1.07214439, + "num_input_tokens_seen": 29625344, + "router_z_loss_mlp": 0.9296875, + "step": 363, + "time_per_iteration": 2.6087372303009033 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135422, + "balance_loss_mlp": 1.04272461, + "epoch": 0.0700269334359369, + "flos": 1580176283136.0, + "grad_norm": 0.03651647631774479, + "language_loss": 0.79734707, + "learning_rate": 0.0009958067305712761, + "loss": 0.80870128, + "num_input_tokens_seen": 29843664, + "router_z_loss_mlp": 0.92578125, + "step": 364, + "time_per_iteration": 4.806072235107422 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104861, + "balance_loss_mlp": 1.01254511, + "epoch": 0.07021931512120046, + "flos": 1014858050304.0, + "grad_norm": 0.04058448706036458, + "language_loss": 0.94071019, + "learning_rate": 0.0009957663708830612, + "loss": 0.9517588, + "num_input_tokens_seen": 29927152, + "router_z_loss_mlp": 0.921875, + "step": 365, + "time_per_iteration": 3.30859637260437 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110656, + "balance_loss_mlp": 1.01862633, + "epoch": 0.07041169680646403, + "flos": 824432367360.0, + "grad_norm": 0.04186203278400794, + "language_loss": 0.98041129, + "learning_rate": 0.0009957258187212714, + "loss": 0.9915179, + "num_input_tokens_seen": 30004928, + "router_z_loss_mlp": 0.91894531, + "step": 366, + "time_per_iteration": 3.00058913230896 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097015, + "balance_loss_mlp": 1.00565338, + "epoch": 0.07060407849172759, + "flos": 1417293250560.0, + "grad_norm": 0.011820269564466843, + "language_loss": 0.79194862, + "learning_rate": 0.0009956850741016502, + "loss": 0.80291873, + "num_input_tokens_seen": 30230256, + "router_z_loss_mlp": 0.91210938, + "step": 367, + "time_per_iteration": 4.794500827789307 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113703, + "balance_loss_mlp": 1.02186394, + "epoch": 0.07079646017699115, + "flos": 513942379008.0, + "grad_norm": 0.041641563183133855, + "language_loss": 0.94691038, + "learning_rate": 0.0009956441370400167, + "loss": 0.95804739, + "num_input_tokens_seen": 30301200, + "router_z_loss_mlp": 0.91699219, + "step": 368, + "time_per_iteration": 2.63948917388916 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0111577, + "balance_loss_mlp": 1.02436066, + "epoch": 0.07098884186225471, + "flos": 541549179648.0, + "grad_norm": 0.03426405251061256, + "language_loss": 1.00885093, + "learning_rate": 0.0009956030075522636, + "loss": 1.02000868, + "num_input_tokens_seen": 30377024, + "router_z_loss_mlp": 0.91259766, + "step": 369, + "time_per_iteration": 2.74157452583313 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107449, + "balance_loss_mlp": 1.01613438, + "epoch": 0.07118122354751828, + "flos": 549739845120.0, + "grad_norm": 0.030296400642036637, + "language_loss": 1.0031743, + "learning_rate": 0.0009955616856543587, + "loss": 1.01424885, + "num_input_tokens_seen": 30448896, + "router_z_loss_mlp": 0.91162109, + "step": 370, + "time_per_iteration": 2.6210479736328125 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105786, + "balance_loss_mlp": 1.01475775, + "epoch": 0.07137360523278184, + "flos": 622077437952.0, + "grad_norm": 0.029509682347833893, + "language_loss": 0.92550498, + "learning_rate": 0.0009955201713623448, + "loss": 0.93656284, + "num_input_tokens_seen": 30523584, + "router_z_loss_mlp": 0.90869141, + "step": 371, + "time_per_iteration": 2.757277011871338 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092491, + "balance_loss_mlp": 1.00284576, + "epoch": 0.0715659869180454, + "flos": 1505976202752.0, + "grad_norm": 0.005566886599578838, + "language_loss": 0.76672721, + "learning_rate": 0.000995478464692339, + "loss": 0.77765214, + "num_input_tokens_seen": 30757920, + "router_z_loss_mlp": 0.89648438, + "step": 372, + "time_per_iteration": 4.947838306427002 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01126764, + "balance_loss_mlp": 1.0361172, + "epoch": 0.07175836860330896, + "flos": 496482050304.0, + "grad_norm": 0.040308561934975694, + "language_loss": 1.05629396, + "learning_rate": 0.0009954365656605333, + "loss": 1.06756163, + "num_input_tokens_seen": 30824960, + "router_z_loss_mlp": 0.90478516, + "step": 373, + "time_per_iteration": 2.5537302494049072 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124141, + "balance_loss_mlp": 1.03416181, + "epoch": 0.07195075028857253, + "flos": 787082505984.0, + "grad_norm": 0.034789914575730614, + "language_loss": 0.98912442, + "learning_rate": 0.0009953944742831947, + "loss": 1.00036585, + "num_input_tokens_seen": 30902224, + "router_z_loss_mlp": 0.89892578, + "step": 374, + "time_per_iteration": 2.976074695587158 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106044, + "balance_loss_mlp": 1.01678061, + "epoch": 0.0721431319738361, + "flos": 594347182848.0, + "grad_norm": 0.029628456658550576, + "language_loss": 1.02558136, + "learning_rate": 0.0009953521905766642, + "loss": 1.03664172, + "num_input_tokens_seen": 30984784, + "router_z_loss_mlp": 0.89404297, + "step": 375, + "time_per_iteration": 2.9556005001068115 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101275, + "balance_loss_mlp": 1.01234496, + "epoch": 0.07233551365909965, + "flos": 549329630976.0, + "grad_norm": 0.034208323574026145, + "language_loss": 1.01073325, + "learning_rate": 0.0009953097145573577, + "loss": 1.02174592, + "num_input_tokens_seen": 31055376, + "router_z_loss_mlp": 0.89111328, + "step": 376, + "time_per_iteration": 2.6449482440948486 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106433, + "balance_loss_mlp": 1.01759815, + "epoch": 0.07252789534436321, + "flos": 959169106176.0, + "grad_norm": 0.031040198427254525, + "language_loss": 0.98588479, + "learning_rate": 0.000995267046241766, + "loss": 0.99694908, + "num_input_tokens_seen": 31144944, + "router_z_loss_mlp": 0.89013672, + "step": 377, + "time_per_iteration": 3.2564361095428467 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106989, + "balance_loss_mlp": 1.01877415, + "epoch": 0.07272027702962677, + "flos": 508656260352.0, + "grad_norm": 0.029229214223645432, + "language_loss": 0.98238575, + "learning_rate": 0.0009952241856464547, + "loss": 0.99345565, + "num_input_tokens_seen": 31213392, + "router_z_loss_mlp": 0.88378906, + "step": 378, + "time_per_iteration": 2.5843191146850586 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108111, + "balance_loss_mlp": 1.02013505, + "epoch": 0.07291265871489035, + "flos": 613552380672.0, + "grad_norm": 0.03194005050639913, + "language_loss": 1.05557346, + "learning_rate": 0.0009951811327880632, + "loss": 1.06665444, + "num_input_tokens_seen": 31289840, + "router_z_loss_mlp": 0.88134766, + "step": 379, + "time_per_iteration": 2.727449655532837 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107323, + "balance_loss_mlp": 1.01934636, + "epoch": 0.0731050404001539, + "flos": 496742565120.0, + "grad_norm": 0.03092115392183015, + "language_loss": 0.98400533, + "learning_rate": 0.0009951378876833063, + "loss": 0.99507862, + "num_input_tokens_seen": 31357600, + "router_z_loss_mlp": 0.88134766, + "step": 380, + "time_per_iteration": 2.5320205688476562 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101258, + "balance_loss_mlp": 1.01332915, + "epoch": 0.07329742208541747, + "flos": 641130991104.0, + "grad_norm": 0.032065094183830696, + "language_loss": 1.04703462, + "learning_rate": 0.0009950944503489736, + "loss": 1.05804706, + "num_input_tokens_seen": 31428896, + "router_z_loss_mlp": 0.88085938, + "step": 381, + "time_per_iteration": 2.7422876358032227 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102134, + "balance_loss_mlp": 1.01453876, + "epoch": 0.07348980377068103, + "flos": 817741607424.0, + "grad_norm": 0.030510114485064205, + "language_loss": 0.99112171, + "learning_rate": 0.0009950508208019285, + "loss": 1.00214303, + "num_input_tokens_seen": 31507424, + "router_z_loss_mlp": 0.87744141, + "step": 382, + "time_per_iteration": 3.046475410461426 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101212, + "balance_loss_mlp": 1.01323569, + "epoch": 0.0736821854559446, + "flos": 509670129408.0, + "grad_norm": 0.035756321159612754, + "language_loss": 1.03789318, + "learning_rate": 0.0009950069990591096, + "loss": 1.04890537, + "num_input_tokens_seen": 31576768, + "router_z_loss_mlp": 0.88134766, + "step": 383, + "time_per_iteration": 2.620088577270508 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113144, + "balance_loss_mlp": 1.02674103, + "epoch": 0.07387456714120816, + "flos": 1558050987264.0, + "grad_norm": 0.043940663043905655, + "language_loss": 0.76401371, + "learning_rate": 0.0009949629851375302, + "loss": 0.77514511, + "num_input_tokens_seen": 31797312, + "router_z_loss_mlp": 0.86523438, + "step": 384, + "time_per_iteration": 4.87653374671936 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121594, + "balance_loss_mlp": 1.03299809, + "epoch": 0.07406694882647172, + "flos": 526644421632.0, + "grad_norm": 0.039102279996233, + "language_loss": 0.96614265, + "learning_rate": 0.0009949187790542777, + "loss": 0.97735858, + "num_input_tokens_seen": 31869568, + "router_z_loss_mlp": 0.88769531, + "step": 385, + "time_per_iteration": 2.734100580215454 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112471, + "balance_loss_mlp": 1.03625691, + "epoch": 0.07425933051173528, + "flos": 498824738304.0, + "grad_norm": 0.03701278047407747, + "language_loss": 0.92462552, + "learning_rate": 0.0009948743808265148, + "loss": 0.93587261, + "num_input_tokens_seen": 31941712, + "router_z_loss_mlp": 0.88623047, + "step": 386, + "time_per_iteration": 2.7154581546783447 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125455, + "balance_loss_mlp": 1.03704965, + "epoch": 0.07445171219699885, + "flos": 506057915136.0, + "grad_norm": 0.06663512882119103, + "language_loss": 1.02268195, + "learning_rate": 0.0009948297904714782, + "loss": 1.0339365, + "num_input_tokens_seen": 32015232, + "router_z_loss_mlp": 0.88574219, + "step": 387, + "time_per_iteration": 2.68532133102417 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112575, + "balance_loss_mlp": 1.03777313, + "epoch": 0.07464409388226241, + "flos": 555117337344.0, + "grad_norm": 0.036483324457394946, + "language_loss": 0.94151849, + "learning_rate": 0.0009947850080064796, + "loss": 0.95277596, + "num_input_tokens_seen": 32094640, + "router_z_loss_mlp": 0.88134766, + "step": 388, + "time_per_iteration": 2.789128303527832 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121204, + "balance_loss_mlp": 1.03370392, + "epoch": 0.07483647556752597, + "flos": 778275546624.0, + "grad_norm": 0.0421926900222792, + "language_loss": 0.99476451, + "learning_rate": 0.0009947400334489047, + "loss": 1.00597644, + "num_input_tokens_seen": 32176640, + "router_z_loss_mlp": 0.87646484, + "step": 389, + "time_per_iteration": 2.9937496185302734 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011085, + "balance_loss_mlp": 1.02133441, + "epoch": 0.07502885725278953, + "flos": 613682638080.0, + "grad_norm": 0.0417493031738284, + "language_loss": 0.90741575, + "learning_rate": 0.0009946948668162145, + "loss": 0.91850078, + "num_input_tokens_seen": 32246704, + "router_z_loss_mlp": 0.87304688, + "step": 390, + "time_per_iteration": 2.7264010906219482 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101473, + "balance_loss_mlp": 1.01502275, + "epoch": 0.0752212389380531, + "flos": 689856021504.0, + "grad_norm": 0.03330838563423677, + "language_loss": 0.95001, + "learning_rate": 0.0009946495081259441, + "loss": 0.9610247, + "num_input_tokens_seen": 32320032, + "router_z_loss_mlp": 0.86572266, + "step": 391, + "time_per_iteration": 2.832472085952759 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097898, + "balance_loss_mlp": 1.01182938, + "epoch": 0.07541362062331666, + "flos": 767052022272.0, + "grad_norm": 0.03859494705227578, + "language_loss": 0.99014449, + "learning_rate": 0.0009946039573957035, + "loss": 1.00112355, + "num_input_tokens_seen": 32398144, + "router_z_loss_mlp": 0.86181641, + "step": 392, + "time_per_iteration": 2.925933361053467 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101692, + "balance_loss_mlp": 1.01576602, + "epoch": 0.07560600230858022, + "flos": 589909682688.0, + "grad_norm": 0.039112379024015986, + "language_loss": 0.95485294, + "learning_rate": 0.000994558214643177, + "loss": 0.9658699, + "num_input_tokens_seen": 32471984, + "router_z_loss_mlp": 0.86035156, + "step": 393, + "time_per_iteration": 2.763448476791382 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095538, + "balance_loss_mlp": 1.00961244, + "epoch": 0.07579838399384378, + "flos": 751146034176.0, + "grad_norm": 0.03818992224284351, + "language_loss": 0.96862066, + "learning_rate": 0.000994512279886123, + "loss": 0.97957599, + "num_input_tokens_seen": 32550176, + "router_z_loss_mlp": 0.86035156, + "step": 394, + "time_per_iteration": 3.143615245819092 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101397, + "balance_loss_mlp": 1.01561391, + "epoch": 0.07599076567910736, + "flos": 524551554816.0, + "grad_norm": 0.030240351127206026, + "language_loss": 0.96659988, + "learning_rate": 0.0009944661531423758, + "loss": 0.97761387, + "num_input_tokens_seen": 32620768, + "router_z_loss_mlp": 0.85888672, + "step": 395, + "time_per_iteration": 2.6748764514923096 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107513, + "balance_loss_mlp": 1.02206361, + "epoch": 0.07618314736437092, + "flos": 552186545664.0, + "grad_norm": 0.03358451790414236, + "language_loss": 0.95614338, + "learning_rate": 0.000994419834429843, + "loss": 0.96721858, + "num_input_tokens_seen": 32693472, + "router_z_loss_mlp": 0.85546875, + "step": 396, + "time_per_iteration": 2.6525089740753174 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105372, + "balance_loss_mlp": 1.01987493, + "epoch": 0.07637552904963447, + "flos": 699433831680.0, + "grad_norm": 0.04315212632526892, + "language_loss": 1.00552011, + "learning_rate": 0.0009943733237665069, + "loss": 1.01657379, + "num_input_tokens_seen": 32764976, + "router_z_loss_mlp": 0.85595703, + "step": 397, + "time_per_iteration": 2.8678157329559326 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097353, + "balance_loss_mlp": 1.01218963, + "epoch": 0.07656791073489803, + "flos": 580636128768.0, + "grad_norm": 0.029538416941692198, + "language_loss": 0.99224108, + "learning_rate": 0.0009943266211704248, + "loss": 1.0032146, + "num_input_tokens_seen": 32853104, + "router_z_loss_mlp": 0.85253906, + "step": 398, + "time_per_iteration": 3.0023248195648193 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099387, + "balance_loss_mlp": 1.01460528, + "epoch": 0.0767602924201616, + "flos": 418037910528.0, + "grad_norm": 0.03167845871290285, + "language_loss": 1.01143491, + "learning_rate": 0.000994279726659728, + "loss": 1.02242875, + "num_input_tokens_seen": 32919376, + "router_z_loss_mlp": 0.84863281, + "step": 399, + "time_per_iteration": 2.527693271636963 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107007, + "balance_loss_mlp": 1.02246368, + "epoch": 0.07695267410542517, + "flos": 483888877824.0, + "grad_norm": 0.03414294034973106, + "language_loss": 0.9968133, + "learning_rate": 0.0009942326402526231, + "loss": 1.00788331, + "num_input_tokens_seen": 32988064, + "router_z_loss_mlp": 0.84619141, + "step": 400, + "time_per_iteration": 2.5610573291778564 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112837, + "balance_loss_mlp": 1.02848434, + "epoch": 0.07714505579068873, + "flos": 532027749888.0, + "grad_norm": 0.030264499227930883, + "language_loss": 0.97403878, + "learning_rate": 0.0009941853619673902, + "loss": 0.98516715, + "num_input_tokens_seen": 33059024, + "router_z_loss_mlp": 0.84423828, + "step": 401, + "time_per_iteration": 2.680175542831421 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107236, + "balance_loss_mlp": 1.02302694, + "epoch": 0.07733743747595229, + "flos": 806440315392.0, + "grad_norm": 0.03979329481069023, + "language_loss": 1.01160502, + "learning_rate": 0.0009941378918223844, + "loss": 1.02267742, + "num_input_tokens_seen": 33137712, + "router_z_loss_mlp": 0.84277344, + "step": 402, + "time_per_iteration": 3.0908427238464355 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098686, + "balance_loss_mlp": 1.01447606, + "epoch": 0.07752981916121585, + "flos": 623614281984.0, + "grad_norm": 0.03310929598543939, + "language_loss": 0.93567806, + "learning_rate": 0.0009940902298360354, + "loss": 0.94666493, + "num_input_tokens_seen": 33211296, + "router_z_loss_mlp": 0.84277344, + "step": 403, + "time_per_iteration": 2.7569308280944824 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094976, + "balance_loss_mlp": 1.01048076, + "epoch": 0.07772220084647942, + "flos": 729543713280.0, + "grad_norm": 0.03955766616265138, + "language_loss": 1.03173304, + "learning_rate": 0.0009940423760268473, + "loss": 1.04268289, + "num_input_tokens_seen": 33283632, + "router_z_loss_mlp": 0.84570312, + "step": 404, + "time_per_iteration": 2.8456103801727295 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098252, + "balance_loss_mlp": 1.01375628, + "epoch": 0.07791458253174298, + "flos": 556469488896.0, + "grad_norm": 0.042207617679060144, + "language_loss": 0.96929657, + "learning_rate": 0.0009939943304133982, + "loss": 0.98027909, + "num_input_tokens_seen": 33350704, + "router_z_loss_mlp": 0.84570312, + "step": 405, + "time_per_iteration": 2.615145444869995 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104796, + "balance_loss_mlp": 1.02044404, + "epoch": 0.07810696421700654, + "flos": 554235671040.0, + "grad_norm": 0.04104566792755741, + "language_loss": 1.03659868, + "learning_rate": 0.0009939460930143416, + "loss": 1.04764676, + "num_input_tokens_seen": 33416272, + "router_z_loss_mlp": 0.84423828, + "step": 406, + "time_per_iteration": 2.6304614543914795 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110157, + "balance_loss_mlp": 1.01745594, + "epoch": 0.0782993459022701, + "flos": 651879172608.0, + "grad_norm": 0.0317151282671847, + "language_loss": 0.97752666, + "learning_rate": 0.0009938976638484043, + "loss": 0.98854232, + "num_input_tokens_seen": 33501824, + "router_z_loss_mlp": 0.84179688, + "step": 407, + "time_per_iteration": 2.9032115936279297 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109564, + "balance_loss_mlp": 1.01205039, + "epoch": 0.07849172758753367, + "flos": 497161527552.0, + "grad_norm": 0.04013855375776475, + "language_loss": 0.97246277, + "learning_rate": 0.0009938490429343887, + "loss": 0.98341918, + "num_input_tokens_seen": 33571456, + "router_z_loss_mlp": 0.83642578, + "step": 408, + "time_per_iteration": 2.5688796043395996 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095618, + "balance_loss_mlp": 1.01236188, + "epoch": 0.07868410927279723, + "flos": 579076930560.0, + "grad_norm": 0.0397915036848884, + "language_loss": 0.97571141, + "learning_rate": 0.0009938002302911709, + "loss": 0.98666751, + "num_input_tokens_seen": 33646320, + "router_z_loss_mlp": 0.83300781, + "step": 409, + "time_per_iteration": 2.75036883354187 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096533, + "balance_loss_mlp": 1.01365864, + "epoch": 0.07887649095806079, + "flos": 524067463680.0, + "grad_norm": 0.03678821175613874, + "language_loss": 1.00230122, + "learning_rate": 0.0009937512259377015, + "loss": 1.01326644, + "num_input_tokens_seen": 33717664, + "router_z_loss_mlp": 0.82910156, + "step": 410, + "time_per_iteration": 2.6584975719451904 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110197, + "balance_loss_mlp": 1.01938236, + "epoch": 0.07906887264332435, + "flos": 558438901248.0, + "grad_norm": 0.04956969404692801, + "language_loss": 0.989124, + "learning_rate": 0.000993702029893006, + "loss": 1.00014377, + "num_input_tokens_seen": 33794720, + "router_z_loss_mlp": 0.82617188, + "step": 411, + "time_per_iteration": 2.7666263580322266 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102247, + "balance_loss_mlp": 1.0196116, + "epoch": 0.07926125432858792, + "flos": 823364063232.0, + "grad_norm": 0.03322797228086769, + "language_loss": 0.99091381, + "learning_rate": 0.0009936526421761838, + "loss": 1.00193632, + "num_input_tokens_seen": 33868304, + "router_z_loss_mlp": 0.82666016, + "step": 412, + "time_per_iteration": 3.0222113132476807 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102099, + "balance_loss_mlp": 1.01955855, + "epoch": 0.07945363601385148, + "flos": 563394518784.0, + "grad_norm": 0.04210923401756456, + "language_loss": 1.01423764, + "learning_rate": 0.000993603062806409, + "loss": 1.02525866, + "num_input_tokens_seen": 33937424, + "router_z_loss_mlp": 0.82568359, + "step": 413, + "time_per_iteration": 2.713226079940796 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100317, + "balance_loss_mlp": 1.0176332, + "epoch": 0.07964601769911504, + "flos": 518885357568.0, + "grad_norm": 0.041362228888401006, + "language_loss": 1.04903626, + "learning_rate": 0.0009935532918029298, + "loss": 1.06003952, + "num_input_tokens_seen": 34003984, + "router_z_loss_mlp": 0.82714844, + "step": 414, + "time_per_iteration": 2.59602689743042 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095709, + "balance_loss_mlp": 1.01326394, + "epoch": 0.0798383993843786, + "flos": 540301040640.0, + "grad_norm": 0.030384950019726516, + "language_loss": 0.97377884, + "learning_rate": 0.0009935033291850694, + "loss": 0.98473597, + "num_input_tokens_seen": 34072400, + "router_z_loss_mlp": 0.82470703, + "step": 415, + "time_per_iteration": 2.6417808532714844 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094851, + "balance_loss_mlp": 1.013026, + "epoch": 0.08003078106964218, + "flos": 486122695680.0, + "grad_norm": 0.03579523867672845, + "language_loss": 1.00004411, + "learning_rate": 0.0009934531749722247, + "loss": 1.01099253, + "num_input_tokens_seen": 34142448, + "router_z_loss_mlp": 0.81835938, + "step": 416, + "time_per_iteration": 2.593029737472534 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095566, + "balance_loss_mlp": 1.01383638, + "epoch": 0.08022316275490574, + "flos": 519276129792.0, + "grad_norm": 0.0354518245662521, + "language_loss": 0.98370755, + "learning_rate": 0.0009934028291838672, + "loss": 0.99466318, + "num_input_tokens_seen": 34214080, + "router_z_loss_mlp": 0.81738281, + "step": 417, + "time_per_iteration": 2.7351250648498535 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096643, + "balance_loss_mlp": 1.01496112, + "epoch": 0.0804155444401693, + "flos": 495047273472.0, + "grad_norm": 0.032920982329526526, + "language_loss": 0.93668723, + "learning_rate": 0.0009933522918395433, + "loss": 0.94765365, + "num_input_tokens_seen": 34288448, + "router_z_loss_mlp": 0.81689453, + "step": 418, + "time_per_iteration": 2.6427221298217773 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01114799, + "balance_loss_mlp": 1.03316498, + "epoch": 0.08060792612543285, + "flos": 1584856801536.0, + "grad_norm": 0.029973653623271358, + "language_loss": 0.782511, + "learning_rate": 0.0009933015629588731, + "loss": 0.79365897, + "num_input_tokens_seen": 34521632, + "router_z_loss_mlp": 0.81640625, + "step": 419, + "time_per_iteration": 4.8632917404174805 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096521, + "balance_loss_mlp": 1.01569724, + "epoch": 0.08080030781069643, + "flos": 526359607296.0, + "grad_norm": 0.04163447523548115, + "language_loss": 1.12134457, + "learning_rate": 0.000993250642561551, + "loss": 1.13230991, + "num_input_tokens_seen": 34590080, + "router_z_loss_mlp": 0.80810547, + "step": 420, + "time_per_iteration": 2.608396053314209 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109578, + "balance_loss_mlp": 1.01505113, + "epoch": 0.08099268949595999, + "flos": 547757793792.0, + "grad_norm": 0.04746808509414602, + "language_loss": 0.97398257, + "learning_rate": 0.0009931995306673466, + "loss": 0.98494035, + "num_input_tokens_seen": 34660512, + "router_z_loss_mlp": 0.80712891, + "step": 421, + "time_per_iteration": 2.7215850353240967 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097341, + "balance_loss_mlp": 1.01670778, + "epoch": 0.08118507118122355, + "flos": 511374169344.0, + "grad_norm": 0.04020038552675014, + "language_loss": 1.02514148, + "learning_rate": 0.000993148227296103, + "loss": 1.03611493, + "num_input_tokens_seen": 34732016, + "router_z_loss_mlp": 0.80615234, + "step": 422, + "time_per_iteration": 2.625366449356079 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010968, + "balance_loss_mlp": 1.01607168, + "epoch": 0.08137745286648711, + "flos": 722002389504.0, + "grad_norm": 0.03556088777041087, + "language_loss": 0.90137196, + "learning_rate": 0.000993096732467738, + "loss": 0.91233999, + "num_input_tokens_seen": 34810416, + "router_z_loss_mlp": 0.80712891, + "step": 423, + "time_per_iteration": 2.9795689582824707 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092343, + "balance_loss_mlp": 1.0118531, + "epoch": 0.08156983455175067, + "flos": 680818682880.0, + "grad_norm": 0.04422604915428747, + "language_loss": 0.99073571, + "learning_rate": 0.0009930450462022435, + "loss": 1.00165915, + "num_input_tokens_seen": 34879504, + "router_z_loss_mlp": 0.8046875, + "step": 424, + "time_per_iteration": 2.879889726638794 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087181, + "balance_loss_mlp": 1.00783539, + "epoch": 0.08176221623701424, + "flos": 1456591137024.0, + "grad_norm": 0.006453860192715822, + "language_loss": 0.79189807, + "learning_rate": 0.0009929931685196862, + "loss": 0.8027699, + "num_input_tokens_seen": 35111584, + "router_z_loss_mlp": 0.79296875, + "step": 425, + "time_per_iteration": 4.908784627914429 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095957, + "balance_loss_mlp": 1.01541877, + "epoch": 0.0819545979222778, + "flos": 1558885044480.0, + "grad_norm": 0.04271462185638088, + "language_loss": 0.96659774, + "learning_rate": 0.0009929410994402065, + "loss": 0.9775573, + "num_input_tokens_seen": 35205664, + "router_z_loss_mlp": 0.80517578, + "step": 426, + "time_per_iteration": 3.7266876697540283 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100573, + "balance_loss_mlp": 1.02013052, + "epoch": 0.08214697960754136, + "flos": 513801427968.0, + "grad_norm": 0.040597463537132866, + "language_loss": 1.00489211, + "learning_rate": 0.0009928888389840196, + "loss": 1.01589799, + "num_input_tokens_seen": 35280144, + "router_z_loss_mlp": 0.80419922, + "step": 427, + "time_per_iteration": 2.695010185241699 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098577, + "balance_loss_mlp": 1.01822996, + "epoch": 0.08233936129280492, + "flos": 596222309376.0, + "grad_norm": 0.03622779747664415, + "language_loss": 1.02622843, + "learning_rate": 0.0009928363871714147, + "loss": 1.03721428, + "num_input_tokens_seen": 35344768, + "router_z_loss_mlp": 0.80322266, + "step": 428, + "time_per_iteration": 2.66733455657959 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097324, + "balance_loss_mlp": 1.01721525, + "epoch": 0.08253174297806849, + "flos": 573165769728.0, + "grad_norm": 0.028981657602537042, + "language_loss": 0.97141832, + "learning_rate": 0.0009927837440227556, + "loss": 0.98239154, + "num_input_tokens_seen": 35425536, + "router_z_loss_mlp": 0.80078125, + "step": 429, + "time_per_iteration": 2.8499114513397217 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093938, + "balance_loss_mlp": 1.01392436, + "epoch": 0.08272412466333205, + "flos": 624643702272.0, + "grad_norm": 0.031878488957356683, + "language_loss": 0.91184896, + "learning_rate": 0.0009927309095584798, + "loss": 0.92278832, + "num_input_tokens_seen": 35515440, + "router_z_loss_mlp": 0.79980469, + "step": 430, + "time_per_iteration": 3.020768165588379 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097624, + "balance_loss_mlp": 1.01756275, + "epoch": 0.08291650634859561, + "flos": 514995131904.0, + "grad_norm": 0.040558959270141796, + "language_loss": 1.03523278, + "learning_rate": 0.0009926778837991, + "loss": 1.0462091, + "num_input_tokens_seen": 35580192, + "router_z_loss_mlp": 0.80029297, + "step": 431, + "time_per_iteration": 2.609189033508301 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101176, + "balance_loss_mlp": 1.02125835, + "epoch": 0.08310888803385917, + "flos": 668542405632.0, + "grad_norm": 0.035092839201242565, + "language_loss": 1.01323938, + "learning_rate": 0.000992624666765202, + "loss": 1.0242511, + "num_input_tokens_seen": 35649472, + "router_z_loss_mlp": 0.79882812, + "step": 432, + "time_per_iteration": 2.817399501800537 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101699, + "balance_loss_mlp": 1.02154219, + "epoch": 0.08330126971912274, + "flos": 584491361280.0, + "grad_norm": 0.0354530922421884, + "language_loss": 0.98992586, + "learning_rate": 0.000992571258477447, + "loss": 1.00094295, + "num_input_tokens_seen": 35722848, + "router_z_loss_mlp": 0.80126953, + "step": 433, + "time_per_iteration": 2.777506113052368 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010961, + "balance_loss_mlp": 1.0161345, + "epoch": 0.0834936514043863, + "flos": 562498268160.0, + "grad_norm": 0.03167346665720251, + "language_loss": 0.92772877, + "learning_rate": 0.0009925176589565695, + "loss": 0.93868983, + "num_input_tokens_seen": 35800944, + "router_z_loss_mlp": 0.79931641, + "step": 434, + "time_per_iteration": 2.801501512527466 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093857, + "balance_loss_mlp": 1.01398647, + "epoch": 0.08368603308964986, + "flos": 495513868032.0, + "grad_norm": 0.03411426988917409, + "language_loss": 1.03318536, + "learning_rate": 0.0009924638682233791, + "loss": 1.04412401, + "num_input_tokens_seen": 35866288, + "router_z_loss_mlp": 0.79833984, + "step": 435, + "time_per_iteration": 2.573282241821289 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092567, + "balance_loss_mlp": 1.01512909, + "epoch": 0.08387841477491342, + "flos": 1391811397632.0, + "grad_norm": 0.030642245427906535, + "language_loss": 0.79564589, + "learning_rate": 0.0009924098862987589, + "loss": 0.8065716, + "num_input_tokens_seen": 36083040, + "router_z_loss_mlp": 0.7734375, + "step": 436, + "time_per_iteration": 4.596274375915527 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099407, + "balance_loss_mlp": 1.02006125, + "epoch": 0.084070796460177, + "flos": 800355155712.0, + "grad_norm": 0.040681894877429646, + "language_loss": 0.92768085, + "learning_rate": 0.0009923557132036668, + "loss": 0.93867493, + "num_input_tokens_seen": 36158816, + "router_z_loss_mlp": 0.79296875, + "step": 437, + "time_per_iteration": 3.0366878509521484 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110232, + "balance_loss_mlp": 1.02364242, + "epoch": 0.08426317814544056, + "flos": 560097254400.0, + "grad_norm": 0.034275916488964116, + "language_loss": 0.96774155, + "learning_rate": 0.0009923013489591345, + "loss": 0.97876477, + "num_input_tokens_seen": 36236432, + "router_z_loss_mlp": 0.78613281, + "step": 438, + "time_per_iteration": 2.8060851097106934 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100668, + "balance_loss_mlp": 1.0219903, + "epoch": 0.08445555983070412, + "flos": 811884881664.0, + "grad_norm": 0.035250716051411925, + "language_loss": 0.95655745, + "learning_rate": 0.0009922467935862681, + "loss": 0.96756417, + "num_input_tokens_seen": 36327952, + "router_z_loss_mlp": 0.78613281, + "step": 439, + "time_per_iteration": 3.116757869720459 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098598, + "balance_loss_mlp": 1.0204916, + "epoch": 0.08464794151596768, + "flos": 511170034944.0, + "grad_norm": 0.03561138790794706, + "language_loss": 0.98418635, + "learning_rate": 0.0009921920471062478, + "loss": 0.99517238, + "num_input_tokens_seen": 36394896, + "router_z_loss_mlp": 0.78027344, + "step": 440, + "time_per_iteration": 2.6008944511413574 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093389, + "balance_loss_mlp": 1.01561701, + "epoch": 0.08484032320123125, + "flos": 557474609664.0, + "grad_norm": 0.02914226137027636, + "language_loss": 0.96590662, + "learning_rate": 0.0009921371095403281, + "loss": 0.97684056, + "num_input_tokens_seen": 36464656, + "router_z_loss_mlp": 0.77685547, + "step": 441, + "time_per_iteration": 2.638679265975952 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094838, + "balance_loss_mlp": 1.01697087, + "epoch": 0.08503270488649481, + "flos": 528361100544.0, + "grad_norm": 0.02987504029564206, + "language_loss": 0.99685514, + "learning_rate": 0.0009920819809098379, + "loss": 1.00780344, + "num_input_tokens_seen": 36532208, + "router_z_loss_mlp": 0.77783203, + "step": 442, + "time_per_iteration": 2.5915398597717285 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089542, + "balance_loss_mlp": 1.01172209, + "epoch": 0.08522508657175837, + "flos": 615386678016.0, + "grad_norm": 0.03983619354546574, + "language_loss": 0.95535469, + "learning_rate": 0.0009920266612361798, + "loss": 0.96625006, + "num_input_tokens_seen": 36607360, + "router_z_loss_mlp": 0.77734375, + "step": 443, + "time_per_iteration": 2.724025249481201 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091681, + "balance_loss_mlp": 1.01371801, + "epoch": 0.08541746825702193, + "flos": 620987746560.0, + "grad_norm": 0.032808156584867194, + "language_loss": 0.9504559, + "learning_rate": 0.0009919711505408308, + "loss": 0.96137273, + "num_input_tokens_seen": 36680688, + "router_z_loss_mlp": 0.77880859, + "step": 444, + "time_per_iteration": 2.780973434448242 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087177, + "balance_loss_mlp": 1.00926137, + "epoch": 0.08560984994228549, + "flos": 483888877824.0, + "grad_norm": 0.03232110076143325, + "language_loss": 0.92813373, + "learning_rate": 0.000991915448845342, + "loss": 0.93900549, + "num_input_tokens_seen": 36746288, + "router_z_loss_mlp": 0.77832031, + "step": 445, + "time_per_iteration": 2.6011459827423096 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090069, + "balance_loss_mlp": 1.01243973, + "epoch": 0.08580223162754906, + "flos": 518177690112.0, + "grad_norm": 0.03377956208163177, + "language_loss": 1.02285504, + "learning_rate": 0.000991859556171339, + "loss": 1.03375578, + "num_input_tokens_seen": 36812528, + "router_z_loss_mlp": 0.77539062, + "step": 446, + "time_per_iteration": 2.606220006942749 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088539, + "balance_loss_mlp": 1.01086187, + "epoch": 0.08599461331281262, + "flos": 532520589312.0, + "grad_norm": 0.037753212584348855, + "language_loss": 1.04541254, + "learning_rate": 0.000991803472540521, + "loss": 1.0562979, + "num_input_tokens_seen": 36879248, + "router_z_loss_mlp": 0.77587891, + "step": 447, + "time_per_iteration": 2.625401735305786 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088712, + "balance_loss_mlp": 1.01113105, + "epoch": 0.08618699499807618, + "flos": 791634712320.0, + "grad_norm": 0.030920782852134367, + "language_loss": 0.98781657, + "learning_rate": 0.0009917471979746615, + "loss": 0.99870372, + "num_input_tokens_seen": 36951376, + "router_z_loss_mlp": 0.77490234, + "step": 448, + "time_per_iteration": 3.0066978931427 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089961, + "balance_loss_mlp": 1.01195049, + "epoch": 0.08637937668333974, + "flos": 567115603200.0, + "grad_norm": 0.03238149886931097, + "language_loss": 0.98317528, + "learning_rate": 0.0009916907324956086, + "loss": 0.99407488, + "num_input_tokens_seen": 37025936, + "router_z_loss_mlp": 0.77929688, + "step": 449, + "time_per_iteration": 2.7561135292053223 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091057, + "balance_loss_mlp": 1.01333201, + "epoch": 0.08657175836860331, + "flos": 446118108672.0, + "grad_norm": 0.029046506526173844, + "language_loss": 0.94927382, + "learning_rate": 0.0009916340761252837, + "loss": 0.96018445, + "num_input_tokens_seen": 37095872, + "router_z_loss_mlp": 0.77636719, + "step": 450, + "time_per_iteration": 2.6452889442443848 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089525, + "balance_loss_mlp": 1.01222932, + "epoch": 0.08676414005386687, + "flos": 845589480960.0, + "grad_norm": 0.032144406787761336, + "language_loss": 0.91630232, + "learning_rate": 0.0009915772288856832, + "loss": 0.92719758, + "num_input_tokens_seen": 37179072, + "router_z_loss_mlp": 0.77197266, + "step": 451, + "time_per_iteration": 3.0991322994232178 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108797, + "balance_loss_mlp": 1.01086605, + "epoch": 0.08695652173913043, + "flos": 604484906496.0, + "grad_norm": 0.025568476728402203, + "language_loss": 0.93134868, + "learning_rate": 0.000991520190798877, + "loss": 0.94222844, + "num_input_tokens_seen": 37260288, + "router_z_loss_mlp": 0.77001953, + "step": 452, + "time_per_iteration": 2.833534002304077 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093662, + "balance_loss_mlp": 1.01660514, + "epoch": 0.08714890342439399, + "flos": 732001107456.0, + "grad_norm": 0.03795734255344977, + "language_loss": 1.02428043, + "learning_rate": 0.0009914629618870089, + "loss": 1.03521705, + "num_input_tokens_seen": 37331136, + "router_z_loss_mlp": 0.76953125, + "step": 453, + "time_per_iteration": 2.9043643474578857 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098724, + "balance_loss_mlp": 1.02319336, + "epoch": 0.08734128510965757, + "flos": 1485456770304.0, + "grad_norm": 0.019964198948139205, + "language_loss": 0.78675872, + "learning_rate": 0.0009914055421722976, + "loss": 0.79774594, + "num_input_tokens_seen": 37559040, + "router_z_loss_mlp": 0.75390625, + "step": 454, + "time_per_iteration": 2.093019723892212 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087936, + "balance_loss_mlp": 1.01278687, + "epoch": 0.08753366679492113, + "flos": 1526269146624.0, + "grad_norm": 0.012226751630218, + "language_loss": 0.81427962, + "learning_rate": 0.0009913479316770353, + "loss": 0.82515901, + "num_input_tokens_seen": 37785136, + "router_z_loss_mlp": 0.75, + "step": 455, + "time_per_iteration": 4.905871391296387 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091043, + "balance_loss_mlp": 1.01379561, + "epoch": 0.08772604848018468, + "flos": 722525364480.0, + "grad_norm": 0.044152825797527884, + "language_loss": 0.95217329, + "learning_rate": 0.0009912901304235883, + "loss": 0.96308374, + "num_input_tokens_seen": 37858832, + "router_z_loss_mlp": 0.77148438, + "step": 456, + "time_per_iteration": 2.850330352783203 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090876, + "balance_loss_mlp": 1.01396191, + "epoch": 0.08791843016544824, + "flos": 709467542784.0, + "grad_norm": 0.038854584599924205, + "language_loss": 0.92178857, + "learning_rate": 0.000991232138434397, + "loss": 0.9326973, + "num_input_tokens_seen": 37931856, + "router_z_loss_mlp": 0.76806641, + "step": 457, + "time_per_iteration": 2.868957757949829 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091399, + "balance_loss_mlp": 1.01477098, + "epoch": 0.08811081185071182, + "flos": 474022362624.0, + "grad_norm": 0.04035146689108268, + "language_loss": 0.99321103, + "learning_rate": 0.000991173955731976, + "loss": 1.00412512, + "num_input_tokens_seen": 38002432, + "router_z_loss_mlp": 0.76513672, + "step": 458, + "time_per_iteration": 2.6747970581054688 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089272, + "balance_loss_mlp": 1.01288271, + "epoch": 0.08830319353597538, + "flos": 686315738880.0, + "grad_norm": 0.033089720334054364, + "language_loss": 1.03213239, + "learning_rate": 0.0009911155823389137, + "loss": 1.04302514, + "num_input_tokens_seen": 38081648, + "router_z_loss_mlp": 0.76269531, + "step": 459, + "time_per_iteration": 2.9462268352508545 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085442, + "balance_loss_mlp": 1.00881398, + "epoch": 0.08849557522123894, + "flos": 574609294848.0, + "grad_norm": 0.035557366742091014, + "language_loss": 0.99025905, + "learning_rate": 0.000991057018277873, + "loss": 1.00111353, + "num_input_tokens_seen": 38153424, + "router_z_loss_mlp": 0.76513672, + "step": 460, + "time_per_iteration": 2.6903369426727295 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086551, + "balance_loss_mlp": 1.00968456, + "epoch": 0.0886879569065025, + "flos": 565628336640.0, + "grad_norm": 0.039664118418905284, + "language_loss": 1.00002789, + "learning_rate": 0.0009909982635715898, + "loss": 1.01089334, + "num_input_tokens_seen": 38223008, + "router_z_loss_mlp": 0.76757812, + "step": 461, + "time_per_iteration": 2.620046615600586 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010895, + "balance_loss_mlp": 1.0128243, + "epoch": 0.08888033859176607, + "flos": 564957607680.0, + "grad_norm": 0.03231802322071402, + "language_loss": 0.98670942, + "learning_rate": 0.0009909393182428751, + "loss": 0.99760437, + "num_input_tokens_seen": 38294592, + "router_z_loss_mlp": 0.765625, + "step": 462, + "time_per_iteration": 2.6466307640075684 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090991, + "balance_loss_mlp": 1.01412475, + "epoch": 0.08907272027702963, + "flos": 466743499008.0, + "grad_norm": 0.03344290639259395, + "language_loss": 0.93214953, + "learning_rate": 0.000990880182314614, + "loss": 0.94305944, + "num_input_tokens_seen": 38365792, + "router_z_loss_mlp": 0.76757812, + "step": 463, + "time_per_iteration": 2.6666839122772217 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086555, + "balance_loss_mlp": 1.0100224, + "epoch": 0.08926510196229319, + "flos": 682844475648.0, + "grad_norm": 0.03261982194681884, + "language_loss": 0.93093467, + "learning_rate": 0.0009908208558097643, + "loss": 0.94180012, + "num_input_tokens_seen": 38447776, + "router_z_loss_mlp": 0.76416016, + "step": 464, + "time_per_iteration": 2.9068925380706787 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089482, + "balance_loss_mlp": 1.01323605, + "epoch": 0.08945748364755675, + "flos": 597822336768.0, + "grad_norm": 0.03309433671244878, + "language_loss": 0.95414662, + "learning_rate": 0.000990761338751359, + "loss": 0.9650414, + "num_input_tokens_seen": 38521632, + "router_z_loss_mlp": 0.76123047, + "step": 465, + "time_per_iteration": 2.774606227874756 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079613, + "balance_loss_mlp": 1.00732422, + "epoch": 0.08964986533282032, + "flos": 1589343879168.0, + "grad_norm": 0.03434681355524106, + "language_loss": 0.73659623, + "learning_rate": 0.0009907016311625045, + "loss": 0.74739242, + "num_input_tokens_seen": 38760528, + "router_z_loss_mlp": 0.72460938, + "step": 466, + "time_per_iteration": 4.996358394622803 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092523, + "balance_loss_mlp": 1.01646745, + "epoch": 0.08984224701808388, + "flos": 534550272768.0, + "grad_norm": 0.03379784984504044, + "language_loss": 0.98391378, + "learning_rate": 0.0009906417330663815, + "loss": 0.99483901, + "num_input_tokens_seen": 38827200, + "router_z_loss_mlp": 0.75927734, + "step": 467, + "time_per_iteration": 2.6774964332580566 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092653, + "balance_loss_mlp": 1.01678836, + "epoch": 0.09003462870334744, + "flos": 479850898176.0, + "grad_norm": 0.04271038491910547, + "language_loss": 0.94838965, + "learning_rate": 0.0009905816444862442, + "loss": 0.95931625, + "num_input_tokens_seen": 38891984, + "router_z_loss_mlp": 0.75732422, + "step": 468, + "time_per_iteration": 2.6558451652526855 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092919, + "balance_loss_mlp": 1.01691103, + "epoch": 0.090227010388611, + "flos": 654903283200.0, + "grad_norm": 0.031716132767048565, + "language_loss": 0.92225289, + "learning_rate": 0.0009905213654454216, + "loss": 0.933182, + "num_input_tokens_seen": 38977136, + "router_z_loss_mlp": 0.75878906, + "step": 469, + "time_per_iteration": 2.9322757720947266 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093686, + "balance_loss_mlp": 1.01796389, + "epoch": 0.09041939207387456, + "flos": 619359528960.0, + "grad_norm": 0.03474651138537023, + "language_loss": 1.00819349, + "learning_rate": 0.0009904608959673158, + "loss": 1.01913023, + "num_input_tokens_seen": 39052224, + "router_z_loss_mlp": 0.75585938, + "step": 470, + "time_per_iteration": 2.7938003540039062 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091787, + "balance_loss_mlp": 1.01620793, + "epoch": 0.09061177375913813, + "flos": 455296398336.0, + "grad_norm": 0.04023106246537731, + "language_loss": 1.00852847, + "learning_rate": 0.000990400236075403, + "loss": 1.01944637, + "num_input_tokens_seen": 39116832, + "router_z_loss_mlp": 0.75439453, + "step": 471, + "time_per_iteration": 2.5231049060821533 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085542, + "balance_loss_mlp": 1.01024961, + "epoch": 0.0908041554444017, + "flos": 545309147904.0, + "grad_norm": 0.036372029021066864, + "language_loss": 0.97571105, + "learning_rate": 0.0009903393857932338, + "loss": 0.98656648, + "num_input_tokens_seen": 39190528, + "router_z_loss_mlp": 0.75146484, + "step": 472, + "time_per_iteration": 2.700449228286743 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082684, + "balance_loss_mlp": 1.00786841, + "epoch": 0.09099653712966525, + "flos": 565467943680.0, + "grad_norm": 0.03263919317425628, + "language_loss": 0.95124531, + "learning_rate": 0.0009902783451444317, + "loss": 0.96207213, + "num_input_tokens_seen": 39263168, + "router_z_loss_mlp": 0.74658203, + "step": 473, + "time_per_iteration": 2.7006537914276123 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081251, + "balance_loss_mlp": 1.00667381, + "epoch": 0.09118891881492881, + "flos": 475502826240.0, + "grad_norm": 0.036465550100162274, + "language_loss": 0.98778975, + "learning_rate": 0.0009902171141526956, + "loss": 0.99860233, + "num_input_tokens_seen": 39330784, + "router_z_loss_mlp": 0.74414062, + "step": 474, + "time_per_iteration": 2.565852403640747 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081522, + "balance_loss_mlp": 1.00732613, + "epoch": 0.09138130050019239, + "flos": 546991800576.0, + "grad_norm": 0.03189281102051162, + "language_loss": 0.86324012, + "learning_rate": 0.000990155692841797, + "loss": 0.87405533, + "num_input_tokens_seen": 39417472, + "router_z_loss_mlp": 0.74023438, + "step": 475, + "time_per_iteration": 2.9694621562957764 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081909, + "balance_loss_mlp": 1.0079515, + "epoch": 0.09157368218545595, + "flos": 733974410496.0, + "grad_norm": 0.03574286330183218, + "language_loss": 0.98287529, + "learning_rate": 0.0009900940812355818, + "loss": 0.99369442, + "num_input_tokens_seen": 39488656, + "router_z_loss_mlp": 0.73779297, + "step": 476, + "time_per_iteration": 2.8549702167510986 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082166, + "balance_loss_mlp": 1.00835192, + "epoch": 0.0917660638707195, + "flos": 612073862400.0, + "grad_norm": 0.03800316101532587, + "language_loss": 0.95275486, + "learning_rate": 0.00099003227935797, + "loss": 0.96357656, + "num_input_tokens_seen": 39558224, + "router_z_loss_mlp": 0.73632812, + "step": 477, + "time_per_iteration": 2.709808349609375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084092, + "balance_loss_mlp": 1.01051593, + "epoch": 0.09195844555598306, + "flos": 657019482624.0, + "grad_norm": 0.03875864993538346, + "language_loss": 0.99037415, + "learning_rate": 0.000989970287232955, + "loss": 1.0012151, + "num_input_tokens_seen": 39629856, + "router_z_loss_mlp": 0.73486328, + "step": 478, + "time_per_iteration": 2.7670538425445557 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085912, + "balance_loss_mlp": 1.01252699, + "epoch": 0.09215082724124664, + "flos": 477541257984.0, + "grad_norm": 0.03367109557456403, + "language_loss": 0.95731258, + "learning_rate": 0.0009899081048846043, + "loss": 0.96817166, + "num_input_tokens_seen": 39695984, + "router_z_loss_mlp": 0.73339844, + "step": 479, + "time_per_iteration": 2.588352918624878 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085141, + "balance_loss_mlp": 1.01208997, + "epoch": 0.0923432089265102, + "flos": 525326296320.0, + "grad_norm": 0.0462740033589213, + "language_loss": 1.00606585, + "learning_rate": 0.0009898457323370593, + "loss": 1.01691723, + "num_input_tokens_seen": 39760256, + "router_z_loss_mlp": 0.73046875, + "step": 480, + "time_per_iteration": 2.5808160305023193 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082132, + "balance_loss_mlp": 1.00936687, + "epoch": 0.09253559061177376, + "flos": 546639912192.0, + "grad_norm": 0.03676160983227949, + "language_loss": 0.9798522, + "learning_rate": 0.000989783169614535, + "loss": 0.99067354, + "num_input_tokens_seen": 39827984, + "router_z_loss_mlp": 0.72900391, + "step": 481, + "time_per_iteration": 2.624483108520508 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097145, + "balance_loss_mlp": 1.02485657, + "epoch": 0.09272797229703732, + "flos": 1541337209856.0, + "grad_norm": 0.023489610904585654, + "language_loss": 0.78752756, + "learning_rate": 0.0009897204167413206, + "loss": 0.79849905, + "num_input_tokens_seen": 40056688, + "router_z_loss_mlp": 0.72460938, + "step": 482, + "time_per_iteration": 4.897305965423584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085543, + "balance_loss_mlp": 1.01330173, + "epoch": 0.09292035398230089, + "flos": 691065276672.0, + "grad_norm": 0.04252493421314706, + "language_loss": 0.95552129, + "learning_rate": 0.000989657473741779, + "loss": 0.96637678, + "num_input_tokens_seen": 40133120, + "router_z_loss_mlp": 0.72412109, + "step": 483, + "time_per_iteration": 2.8165738582611084 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084133, + "balance_loss_mlp": 1.01184416, + "epoch": 0.09311273566756445, + "flos": 510823004160.0, + "grad_norm": 0.03895509426778844, + "language_loss": 0.97422099, + "learning_rate": 0.0009895943406403465, + "loss": 0.98506236, + "num_input_tokens_seen": 40206464, + "router_z_loss_mlp": 0.72460938, + "step": 484, + "time_per_iteration": 2.7523326873779297 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086134, + "balance_loss_mlp": 1.01384509, + "epoch": 0.09330511735282801, + "flos": 660584064768.0, + "grad_norm": 0.04754513437429821, + "language_loss": 0.90526009, + "learning_rate": 0.0009895310174615338, + "loss": 0.91612148, + "num_input_tokens_seen": 40277744, + "router_z_loss_mlp": 0.72460938, + "step": 485, + "time_per_iteration": 2.843790292739868 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070969, + "balance_loss_mlp": 0.99982452, + "epoch": 0.09349749903809157, + "flos": 1456024420608.0, + "grad_norm": 0.007982392205281765, + "language_loss": 0.75718516, + "learning_rate": 0.0009894675042299251, + "loss": 0.76789486, + "num_input_tokens_seen": 40503664, + "router_z_loss_mlp": 0.71289062, + "step": 486, + "time_per_iteration": 4.649716138839722 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080781, + "balance_loss_mlp": 1.00877845, + "epoch": 0.09368988072335514, + "flos": 521900719872.0, + "grad_norm": 0.0379904908867083, + "language_loss": 0.94096279, + "learning_rate": 0.0009894038009701782, + "loss": 0.95177054, + "num_input_tokens_seen": 40571376, + "router_z_loss_mlp": 0.72167969, + "step": 487, + "time_per_iteration": 2.615767002105713 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085039, + "balance_loss_mlp": 1.012941, + "epoch": 0.0938822624086187, + "flos": 498752806656.0, + "grad_norm": 0.041516659048387576, + "language_loss": 0.97017074, + "learning_rate": 0.0009893399077070253, + "loss": 0.98102111, + "num_input_tokens_seen": 40638096, + "router_z_loss_mlp": 0.72265625, + "step": 488, + "time_per_iteration": 2.592867612838745 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090191, + "balance_loss_mlp": 1.01828361, + "epoch": 0.09407464409388226, + "flos": 534224629248.0, + "grad_norm": 0.031087819309936707, + "language_loss": 0.91152203, + "learning_rate": 0.0009892758244652718, + "loss": 0.92242396, + "num_input_tokens_seen": 40710992, + "router_z_loss_mlp": 0.72070312, + "step": 489, + "time_per_iteration": 2.702681541442871 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080571, + "balance_loss_mlp": 1.00852132, + "epoch": 0.09426702577914582, + "flos": 587091651840.0, + "grad_norm": 0.037758062155454256, + "language_loss": 0.98290044, + "learning_rate": 0.0009892115512697968, + "loss": 0.99370617, + "num_input_tokens_seen": 40778896, + "router_z_loss_mlp": 0.72216797, + "step": 490, + "time_per_iteration": 2.7222015857696533 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088543, + "balance_loss_mlp": 1.01649261, + "epoch": 0.0944594074644094, + "flos": 504464690688.0, + "grad_norm": 0.03400132145466818, + "language_loss": 0.98617911, + "learning_rate": 0.0009891470881455537, + "loss": 0.99706453, + "num_input_tokens_seen": 40853376, + "router_z_loss_mlp": 0.72216797, + "step": 491, + "time_per_iteration": 2.6978650093078613 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087839, + "balance_loss_mlp": 1.01626599, + "epoch": 0.09465178914967295, + "flos": 572114962176.0, + "grad_norm": 0.03537229102294209, + "language_loss": 0.97051454, + "learning_rate": 0.0009890824351175692, + "loss": 0.98139298, + "num_input_tokens_seen": 40923776, + "router_z_loss_mlp": 0.71728516, + "step": 492, + "time_per_iteration": 2.7183802127838135 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087578, + "balance_loss_mlp": 1.01590919, + "epoch": 0.09484417083493651, + "flos": 550419322368.0, + "grad_norm": 0.028677449722299516, + "language_loss": 1.00688422, + "learning_rate": 0.0009890175922109435, + "loss": 1.01776004, + "num_input_tokens_seen": 40996848, + "router_z_loss_mlp": 0.71826172, + "step": 493, + "time_per_iteration": 2.680469512939453 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082456, + "balance_loss_mlp": 1.01088285, + "epoch": 0.09503655252020007, + "flos": 825272237568.0, + "grad_norm": 0.03488638846892438, + "language_loss": 0.98808897, + "learning_rate": 0.0009889525594508513, + "loss": 0.99891359, + "num_input_tokens_seen": 41071280, + "router_z_loss_mlp": 0.71728516, + "step": 494, + "time_per_iteration": 2.983400344848633 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083066, + "balance_loss_mlp": 1.01154041, + "epoch": 0.09522893420546363, + "flos": 405518615040.0, + "grad_norm": 0.028649644857800794, + "language_loss": 0.9245472, + "learning_rate": 0.0009888873368625404, + "loss": 0.93537784, + "num_input_tokens_seen": 41136304, + "router_z_loss_mlp": 0.71679688, + "step": 495, + "time_per_iteration": 2.497526168823242 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108369, + "balance_loss_mlp": 1.01206875, + "epoch": 0.0954213158907272, + "flos": 692257035264.0, + "grad_norm": 0.03396045626839725, + "language_loss": 0.96602595, + "learning_rate": 0.0009888219244713326, + "loss": 0.97686291, + "num_input_tokens_seen": 41212384, + "router_z_loss_mlp": 0.71777344, + "step": 496, + "time_per_iteration": 2.8588504791259766 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108171, + "balance_loss_mlp": 1.01018417, + "epoch": 0.09561369757599077, + "flos": 520075170816.0, + "grad_norm": 0.039869543083186736, + "language_loss": 0.97707164, + "learning_rate": 0.0009887563223026229, + "loss": 0.98788875, + "num_input_tokens_seen": 41282528, + "router_z_loss_mlp": 0.71679688, + "step": 497, + "time_per_iteration": 2.6856894493103027 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075874, + "balance_loss_mlp": 1.00644684, + "epoch": 0.09580607926125433, + "flos": 1388784363264.0, + "grad_norm": 0.01625235818526382, + "language_loss": 0.7906816, + "learning_rate": 0.0009886905303818805, + "loss": 0.80144036, + "num_input_tokens_seen": 41512256, + "router_z_loss_mlp": 0.6953125, + "step": 498, + "time_per_iteration": 4.882593393325806 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086748, + "balance_loss_mlp": 1.0150795, + "epoch": 0.09599846094651789, + "flos": 718826634240.0, + "grad_norm": 0.03326061844711544, + "language_loss": 0.95632416, + "learning_rate": 0.0009886245487346482, + "loss": 0.9671917, + "num_input_tokens_seen": 41596816, + "router_z_loss_mlp": 0.71826172, + "step": 499, + "time_per_iteration": 3.0426785945892334 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087163, + "balance_loss_mlp": 1.01568544, + "epoch": 0.09619084263178146, + "flos": 386894717952.0, + "grad_norm": 0.04298067648683731, + "language_loss": 0.98954022, + "learning_rate": 0.0009885583773865422, + "loss": 1.00041187, + "num_input_tokens_seen": 41658544, + "router_z_loss_mlp": 0.71630859, + "step": 500, + "time_per_iteration": 2.452941417694092 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086205, + "balance_loss_mlp": 1.01467967, + "epoch": 0.09638322431704502, + "flos": 535173369600.0, + "grad_norm": 0.04172266818012015, + "language_loss": 0.95971203, + "learning_rate": 0.0009884920163632524, + "loss": 0.97057414, + "num_input_tokens_seen": 41730736, + "router_z_loss_mlp": 0.71679688, + "step": 501, + "time_per_iteration": 2.657940626144409 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080406, + "balance_loss_mlp": 1.00911927, + "epoch": 0.09657560600230858, + "flos": 501657353472.0, + "grad_norm": 0.041437287127294276, + "language_loss": 0.9960922, + "learning_rate": 0.000988425465690543, + "loss": 1.00689626, + "num_input_tokens_seen": 41797824, + "router_z_loss_mlp": 0.71435547, + "step": 502, + "time_per_iteration": 2.5540428161621094 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077905, + "balance_loss_mlp": 1.00642741, + "epoch": 0.09676798768757214, + "flos": 530332458240.0, + "grad_norm": 0.03187665411612151, + "language_loss": 0.96807587, + "learning_rate": 0.0009883587253942505, + "loss": 0.97885495, + "num_input_tokens_seen": 41875520, + "router_z_loss_mlp": 0.71630859, + "step": 503, + "time_per_iteration": 2.7744338512420654 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086901, + "balance_loss_mlp": 1.01542282, + "epoch": 0.09696036937283571, + "flos": 464557313280.0, + "grad_norm": 0.038653015311582224, + "language_loss": 1.0234406, + "learning_rate": 0.0009882917955002862, + "loss": 1.03430974, + "num_input_tokens_seen": 41942224, + "router_z_loss_mlp": 0.71630859, + "step": 504, + "time_per_iteration": 2.500669479370117 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081799, + "balance_loss_mlp": 1.01074982, + "epoch": 0.09715275105809927, + "flos": 536011294464.0, + "grad_norm": 0.035792041916504785, + "language_loss": 0.94188601, + "learning_rate": 0.0009882246760346343, + "loss": 0.95270395, + "num_input_tokens_seen": 42007552, + "router_z_loss_mlp": 0.71191406, + "step": 505, + "time_per_iteration": 2.6442148685455322 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077575, + "balance_loss_mlp": 1.00652647, + "epoch": 0.09734513274336283, + "flos": 455882556672.0, + "grad_norm": 0.04461237962136338, + "language_loss": 1.00418711, + "learning_rate": 0.0009881573670233533, + "loss": 1.01496279, + "num_input_tokens_seen": 42071760, + "router_z_loss_mlp": 0.71191406, + "step": 506, + "time_per_iteration": 2.5102410316467285 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075502, + "balance_loss_mlp": 1.00450063, + "epoch": 0.09753751442862639, + "flos": 509828577024.0, + "grad_norm": 0.03506590591484262, + "language_loss": 0.93374205, + "learning_rate": 0.0009880898684925747, + "loss": 0.94449711, + "num_input_tokens_seen": 42140688, + "router_z_loss_mlp": 0.71142578, + "step": 507, + "time_per_iteration": 2.652381658554077 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077009, + "balance_loss_mlp": 1.00624609, + "epoch": 0.09772989611388996, + "flos": 485247832320.0, + "grad_norm": 0.03501422949918711, + "language_loss": 0.92606336, + "learning_rate": 0.0009880221804685037, + "loss": 0.9368335, + "num_input_tokens_seen": 42208544, + "router_z_loss_mlp": 0.70898438, + "step": 508, + "time_per_iteration": 2.5481274127960205 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073608, + "balance_loss_mlp": 1.00456238, + "epoch": 0.09792227779915352, + "flos": 1569319231488.0, + "grad_norm": 0.011873284077886747, + "language_loss": 0.79344422, + "learning_rate": 0.000987954302977419, + "loss": 0.80418032, + "num_input_tokens_seen": 42426624, + "router_z_loss_mlp": 0.69140625, + "step": 509, + "time_per_iteration": 4.725191354751587 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076044, + "balance_loss_mlp": 1.00590122, + "epoch": 0.09811465948441708, + "flos": 588915255552.0, + "grad_norm": 0.04172960474096109, + "language_loss": 0.98818666, + "learning_rate": 0.0009878862360456733, + "loss": 0.99894708, + "num_input_tokens_seen": 42494592, + "router_z_loss_mlp": 0.70263672, + "step": 510, + "time_per_iteration": 2.7094569206237793 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078332, + "balance_loss_mlp": 1.00828481, + "epoch": 0.09830704116968064, + "flos": 614129790720.0, + "grad_norm": 0.037035801977756785, + "language_loss": 0.90851068, + "learning_rate": 0.0009878179796996922, + "loss": 0.919294, + "num_input_tokens_seen": 42564944, + "router_z_loss_mlp": 0.70166016, + "step": 511, + "time_per_iteration": 2.6973366737365723 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079637, + "balance_loss_mlp": 1.00973296, + "epoch": 0.09849942285494422, + "flos": 539936513280.0, + "grad_norm": 0.0318668020933778, + "language_loss": 0.94484478, + "learning_rate": 0.0009877495339659754, + "loss": 0.95564115, + "num_input_tokens_seen": 42645616, + "router_z_loss_mlp": 0.70019531, + "step": 512, + "time_per_iteration": 2.7476089000701904 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083598, + "balance_loss_mlp": 1.0137887, + "epoch": 0.09869180454020778, + "flos": 621604040448.0, + "grad_norm": 0.03763698097825182, + "language_loss": 0.89467418, + "learning_rate": 0.000987680898871096, + "loss": 0.90551007, + "num_input_tokens_seen": 42713632, + "router_z_loss_mlp": 0.69921875, + "step": 513, + "time_per_iteration": 2.7254321575164795 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083583, + "balance_loss_mlp": 1.01382184, + "epoch": 0.09888418622547133, + "flos": 813061089024.0, + "grad_norm": 0.049179676158016515, + "language_loss": 0.91816097, + "learning_rate": 0.0009876120744417, + "loss": 0.9289968, + "num_input_tokens_seen": 42789088, + "router_z_loss_mlp": 0.69873047, + "step": 514, + "time_per_iteration": 2.9596974849700928 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083293, + "balance_loss_mlp": 1.01357901, + "epoch": 0.0990765679107349, + "flos": 536857967616.0, + "grad_norm": 0.03966041946019195, + "language_loss": 0.99294269, + "learning_rate": 0.0009875430607045078, + "loss": 1.0037756, + "num_input_tokens_seen": 42861168, + "router_z_loss_mlp": 0.69824219, + "step": 515, + "time_per_iteration": 2.7065181732177734 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083779, + "balance_loss_mlp": 1.01439941, + "epoch": 0.09926894959599845, + "flos": 588971635968.0, + "grad_norm": 0.037836000479060286, + "language_loss": 0.94664383, + "learning_rate": 0.000987473857686313, + "loss": 0.95748156, + "num_input_tokens_seen": 42934112, + "router_z_loss_mlp": 0.69482422, + "step": 516, + "time_per_iteration": 2.712947130203247 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085113, + "balance_loss_mlp": 1.01582849, + "epoch": 0.09946133128126203, + "flos": 642387878400.0, + "grad_norm": 0.04191957443387863, + "language_loss": 0.98466003, + "learning_rate": 0.0009874044654139824, + "loss": 0.99551111, + "num_input_tokens_seen": 43005248, + "router_z_loss_mlp": 0.69384766, + "step": 517, + "time_per_iteration": 2.7391469478607178 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081227, + "balance_loss_mlp": 1.01194227, + "epoch": 0.09965371296652559, + "flos": 466726002432.0, + "grad_norm": 0.049265237591549625, + "language_loss": 0.97911566, + "learning_rate": 0.0009873348839144563, + "loss": 0.98992795, + "num_input_tokens_seen": 43070576, + "router_z_loss_mlp": 0.69384766, + "step": 518, + "time_per_iteration": 2.5496554374694824 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081078, + "balance_loss_mlp": 1.01198411, + "epoch": 0.09984609465178915, + "flos": 484559606784.0, + "grad_norm": 0.04039588305244337, + "language_loss": 0.99084902, + "learning_rate": 0.000987265113214749, + "loss": 1.00165975, + "num_input_tokens_seen": 43138048, + "router_z_loss_mlp": 0.69189453, + "step": 519, + "time_per_iteration": 2.592350721359253 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081099, + "balance_loss_mlp": 1.01200545, + "epoch": 0.1000384763370527, + "flos": 570095972352.0, + "grad_norm": 0.04690738730083641, + "language_loss": 1.01784182, + "learning_rate": 0.0009871951533419476, + "loss": 1.02865279, + "num_input_tokens_seen": 43207600, + "router_z_loss_mlp": 0.69189453, + "step": 520, + "time_per_iteration": 2.699725866317749 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077691, + "balance_loss_mlp": 1.00854921, + "epoch": 0.10023085802231628, + "flos": 546926671872.0, + "grad_norm": 0.03422053119670882, + "language_loss": 0.91227025, + "learning_rate": 0.0009871250043232132, + "loss": 0.92304718, + "num_input_tokens_seen": 43285104, + "router_z_loss_mlp": 0.69238281, + "step": 521, + "time_per_iteration": 2.74124813079834 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078273, + "balance_loss_mlp": 1.00913203, + "epoch": 0.10042323970757984, + "flos": 504440391168.0, + "grad_norm": 0.0407416967929008, + "language_loss": 0.91114902, + "learning_rate": 0.0009870546661857797, + "loss": 0.92193174, + "num_input_tokens_seen": 43353312, + "router_z_loss_mlp": 0.69238281, + "step": 522, + "time_per_iteration": 2.6524126529693604 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080712, + "balance_loss_mlp": 1.01199949, + "epoch": 0.1006156213928434, + "flos": 771725737728.0, + "grad_norm": 0.04764395650012834, + "language_loss": 1.0071038, + "learning_rate": 0.0009869841389569553, + "loss": 1.01791096, + "num_input_tokens_seen": 43427680, + "router_z_loss_mlp": 0.68798828, + "step": 523, + "time_per_iteration": 2.9797816276550293 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081234, + "balance_loss_mlp": 1.01237857, + "epoch": 0.10080800307810696, + "flos": 491009293824.0, + "grad_norm": 0.04526617857315469, + "language_loss": 0.93126583, + "learning_rate": 0.0009869134226641206, + "loss": 0.94207817, + "num_input_tokens_seen": 43495200, + "router_z_loss_mlp": 0.68945312, + "step": 524, + "time_per_iteration": 2.624396562576294 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079355, + "balance_loss_mlp": 1.01064241, + "epoch": 0.10100038476337053, + "flos": 455713415424.0, + "grad_norm": 0.04976961118682096, + "language_loss": 0.93662071, + "learning_rate": 0.0009868425173347303, + "loss": 0.94741422, + "num_input_tokens_seen": 43566256, + "router_z_loss_mlp": 0.68798828, + "step": 525, + "time_per_iteration": 2.659106731414795 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077138, + "balance_loss_mlp": 1.00809169, + "epoch": 0.10119276644863409, + "flos": 557574731520.0, + "grad_norm": 0.04197638521891018, + "language_loss": 0.9924143, + "learning_rate": 0.0009867714229963125, + "loss": 1.00318575, + "num_input_tokens_seen": 43639696, + "router_z_loss_mlp": 0.69140625, + "step": 526, + "time_per_iteration": 2.7414495944976807 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080772, + "balance_loss_mlp": 1.01201165, + "epoch": 0.10138514813389765, + "flos": 517220201472.0, + "grad_norm": 0.044929109849797505, + "language_loss": 0.96641302, + "learning_rate": 0.000986700139676468, + "loss": 0.97722065, + "num_input_tokens_seen": 43703872, + "router_z_loss_mlp": 0.68847656, + "step": 527, + "time_per_iteration": 2.620313882827759 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083057, + "balance_loss_mlp": 1.01405847, + "epoch": 0.10157752981916121, + "flos": 501564034560.0, + "grad_norm": 0.03558874762709202, + "language_loss": 0.9424324, + "learning_rate": 0.0009866286674028717, + "loss": 0.95326293, + "num_input_tokens_seen": 43774416, + "router_z_loss_mlp": 0.69091797, + "step": 528, + "time_per_iteration": 2.632835865020752 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082511, + "balance_loss_mlp": 1.01379848, + "epoch": 0.10176991150442478, + "flos": 658094589696.0, + "grad_norm": 0.042026744727430246, + "language_loss": 0.91470444, + "learning_rate": 0.0009865570062032717, + "loss": 0.9255296, + "num_input_tokens_seen": 43853376, + "router_z_loss_mlp": 0.68798828, + "step": 529, + "time_per_iteration": 2.9185874462127686 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084193, + "balance_loss_mlp": 1.01519477, + "epoch": 0.10196229318968834, + "flos": 574403215104.0, + "grad_norm": 0.031693910674612406, + "language_loss": 0.95307148, + "learning_rate": 0.0009864851561054893, + "loss": 0.96391344, + "num_input_tokens_seen": 43929632, + "router_z_loss_mlp": 0.69091797, + "step": 530, + "time_per_iteration": 2.7826597690582275 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086656, + "balance_loss_mlp": 1.01765728, + "epoch": 0.1021546748749519, + "flos": 519256687872.0, + "grad_norm": 0.0418084670656813, + "language_loss": 0.94574928, + "learning_rate": 0.0009864131171374191, + "loss": 0.95661592, + "num_input_tokens_seen": 44002144, + "router_z_loss_mlp": 0.69091797, + "step": 531, + "time_per_iteration": 2.67000150680542 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088238, + "balance_loss_mlp": 1.01919198, + "epoch": 0.10234705656021546, + "flos": 610954035456.0, + "grad_norm": 0.03906444640078033, + "language_loss": 0.94287467, + "learning_rate": 0.0009863408893270292, + "loss": 0.95375705, + "num_input_tokens_seen": 44078272, + "router_z_loss_mlp": 0.69140625, + "step": 532, + "time_per_iteration": 2.7893166542053223 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089057, + "balance_loss_mlp": 1.02029741, + "epoch": 0.10253943824547904, + "flos": 602913069312.0, + "grad_norm": 0.046708965243717, + "language_loss": 0.90346718, + "learning_rate": 0.0009862684727023605, + "loss": 0.91435778, + "num_input_tokens_seen": 44152304, + "router_z_loss_mlp": 0.68847656, + "step": 533, + "time_per_iteration": 2.7212483882904053 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079068, + "balance_loss_mlp": 1.0105468, + "epoch": 0.1027318199307426, + "flos": 664157395200.0, + "grad_norm": 0.04923575085492922, + "language_loss": 0.9286049, + "learning_rate": 0.0009861958672915283, + "loss": 0.93939555, + "num_input_tokens_seen": 44226720, + "router_z_loss_mlp": 0.68603516, + "step": 534, + "time_per_iteration": 2.8216443061828613 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080998, + "balance_loss_mlp": 1.01271474, + "epoch": 0.10292420161600616, + "flos": 684531019008.0, + "grad_norm": 0.03566434899904423, + "language_loss": 0.91122925, + "learning_rate": 0.0009861230731227201, + "loss": 0.92203927, + "num_input_tokens_seen": 44303600, + "router_z_loss_mlp": 0.68359375, + "step": 535, + "time_per_iteration": 2.8432843685150146 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082908, + "balance_loss_mlp": 1.01514912, + "epoch": 0.10311658330126972, + "flos": 491269808640.0, + "grad_norm": 0.04656876258351904, + "language_loss": 0.9494285, + "learning_rate": 0.0009860500902241973, + "loss": 0.96025753, + "num_input_tokens_seen": 44370960, + "router_z_loss_mlp": 0.67822266, + "step": 536, + "time_per_iteration": 2.601234197616577 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085787, + "balance_loss_mlp": 1.01831496, + "epoch": 0.10330896498653329, + "flos": 432687011328.0, + "grad_norm": 0.046264109011482965, + "language_loss": 0.99409795, + "learning_rate": 0.0009859769186242942, + "loss": 1.00495577, + "num_input_tokens_seen": 44435584, + "router_z_loss_mlp": 0.67529297, + "step": 537, + "time_per_iteration": 2.527156114578247 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079891, + "balance_loss_mlp": 1.01265681, + "epoch": 0.10350134667179685, + "flos": 550642898688.0, + "grad_norm": 0.04274411195548745, + "language_loss": 0.92667055, + "learning_rate": 0.0009859035583514187, + "loss": 0.93746948, + "num_input_tokens_seen": 44505456, + "router_z_loss_mlp": 0.67285156, + "step": 538, + "time_per_iteration": 2.6489107608795166 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082236, + "balance_loss_mlp": 1.01505005, + "epoch": 0.10369372835706041, + "flos": 641827964928.0, + "grad_norm": 0.04978782417937993, + "language_loss": 0.95941103, + "learning_rate": 0.0009858300094340517, + "loss": 0.97023344, + "num_input_tokens_seen": 44580208, + "router_z_loss_mlp": 0.67236328, + "step": 539, + "time_per_iteration": 2.8078534603118896 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107826, + "balance_loss_mlp": 1.01102614, + "epoch": 0.10388611004232397, + "flos": 522766834944.0, + "grad_norm": 0.04233995967203171, + "language_loss": 0.8846426, + "learning_rate": 0.0009857562719007473, + "loss": 0.8954252, + "num_input_tokens_seen": 44646576, + "router_z_loss_mlp": 0.67285156, + "step": 540, + "time_per_iteration": 2.605253219604492 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108211, + "balance_loss_mlp": 1.01487637, + "epoch": 0.10407849172758753, + "flos": 703741074432.0, + "grad_norm": 0.04489314852578161, + "language_loss": 0.9024663, + "learning_rate": 0.0009856823457801331, + "loss": 0.91328734, + "num_input_tokens_seen": 44726752, + "router_z_loss_mlp": 0.67285156, + "step": 541, + "time_per_iteration": 2.8836264610290527 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074756, + "balance_loss_mlp": 1.00737894, + "epoch": 0.1042708734128511, + "flos": 503945606400.0, + "grad_norm": 0.04545070943505171, + "language_loss": 0.97841358, + "learning_rate": 0.00098560823110091, + "loss": 0.98916113, + "num_input_tokens_seen": 44795824, + "router_z_loss_mlp": 0.67431641, + "step": 542, + "time_per_iteration": 2.629241466522217 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078174, + "balance_loss_mlp": 1.01084471, + "epoch": 0.10446325509811466, + "flos": 486641779968.0, + "grad_norm": 0.04151430298304091, + "language_loss": 0.974545, + "learning_rate": 0.000985533927891851, + "loss": 0.98532677, + "num_input_tokens_seen": 44868496, + "router_z_loss_mlp": 0.67382812, + "step": 543, + "time_per_iteration": 2.712714195251465 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078049, + "balance_loss_mlp": 1.01110125, + "epoch": 0.10465563678337822, + "flos": 569713948416.0, + "grad_norm": 0.043537531534841835, + "language_loss": 0.9559319, + "learning_rate": 0.0009854594361818044, + "loss": 0.96671236, + "num_input_tokens_seen": 44939888, + "router_z_loss_mlp": 0.66992188, + "step": 544, + "time_per_iteration": 2.66324520111084 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075343, + "balance_loss_mlp": 1.00806153, + "epoch": 0.10484801846864178, + "flos": 627243992832.0, + "grad_norm": 0.042858245855360314, + "language_loss": 0.94459403, + "learning_rate": 0.0009853847559996897, + "loss": 0.95534742, + "num_input_tokens_seen": 45012720, + "router_z_loss_mlp": 0.67333984, + "step": 545, + "time_per_iteration": 2.749379873275757 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074542, + "balance_loss_mlp": 1.00697374, + "epoch": 0.10504040015390535, + "flos": 744813965568.0, + "grad_norm": 0.04113973833070077, + "language_loss": 0.93940508, + "learning_rate": 0.0009853098873745, + "loss": 0.95015049, + "num_input_tokens_seen": 45093744, + "router_z_loss_mlp": 0.67626953, + "step": 546, + "time_per_iteration": 3.0356035232543945 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082094, + "balance_loss_mlp": 1.01457405, + "epoch": 0.10523278183916891, + "flos": 587843060736.0, + "grad_norm": 0.04039468180414331, + "language_loss": 0.92498314, + "learning_rate": 0.0009852348303353027, + "loss": 0.93580401, + "num_input_tokens_seen": 45172784, + "router_z_loss_mlp": 0.67578125, + "step": 547, + "time_per_iteration": 2.787853479385376 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080495, + "balance_loss_mlp": 1.01283157, + "epoch": 0.10542516352443247, + "flos": 871147156224.0, + "grad_norm": 0.04319215205461418, + "language_loss": 0.86143011, + "learning_rate": 0.000985159584911237, + "loss": 0.872235, + "num_input_tokens_seen": 45255600, + "router_z_loss_mlp": 0.67724609, + "step": 548, + "time_per_iteration": 3.103173017501831 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077016, + "balance_loss_mlp": 1.00949633, + "epoch": 0.10561754520969603, + "flos": 506413694208.0, + "grad_norm": 0.04405333210851084, + "language_loss": 0.94064271, + "learning_rate": 0.0009850841511315162, + "loss": 0.95141286, + "num_input_tokens_seen": 45325072, + "router_z_loss_mlp": 0.67578125, + "step": 549, + "time_per_iteration": 2.647629737854004 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107948, + "balance_loss_mlp": 1.01176953, + "epoch": 0.1058099268949596, + "flos": 561148061952.0, + "grad_norm": 0.03728506713954383, + "language_loss": 0.9326818, + "learning_rate": 0.0009850085290254256, + "loss": 0.94347662, + "num_input_tokens_seen": 45401440, + "router_z_loss_mlp": 0.67773438, + "step": 550, + "time_per_iteration": 2.7680838108062744 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081585, + "balance_loss_mlp": 1.01411295, + "epoch": 0.10600230858022316, + "flos": 563160248832.0, + "grad_norm": 0.031635589688873186, + "language_loss": 0.90350562, + "learning_rate": 0.0009849327186223246, + "loss": 0.91432148, + "num_input_tokens_seen": 45479264, + "router_z_loss_mlp": 0.67529297, + "step": 551, + "time_per_iteration": 2.7540531158447266 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077249, + "balance_loss_mlp": 1.01001453, + "epoch": 0.10619469026548672, + "flos": 495318481920.0, + "grad_norm": 0.03875875468173829, + "language_loss": 0.97612774, + "learning_rate": 0.000984856719951646, + "loss": 0.98690015, + "num_input_tokens_seen": 45547328, + "router_z_loss_mlp": 0.67285156, + "step": 552, + "time_per_iteration": 2.5471906661987305 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080098, + "balance_loss_mlp": 1.01300704, + "epoch": 0.10638707195075028, + "flos": 677465038080.0, + "grad_norm": 0.04041077275123314, + "language_loss": 0.94560456, + "learning_rate": 0.0009847805330428943, + "loss": 0.95640558, + "num_input_tokens_seen": 45631152, + "router_z_loss_mlp": 0.67138672, + "step": 553, + "time_per_iteration": 2.879901647567749 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081173, + "balance_loss_mlp": 1.01398706, + "epoch": 0.10657945363601386, + "flos": 489035990784.0, + "grad_norm": 0.051524237529684984, + "language_loss": 0.97161597, + "learning_rate": 0.0009847041579256481, + "loss": 0.98242772, + "num_input_tokens_seen": 45698208, + "router_z_loss_mlp": 0.67236328, + "step": 554, + "time_per_iteration": 2.5838425159454346 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076637, + "balance_loss_mlp": 1.00997543, + "epoch": 0.10677183532127742, + "flos": 483971503104.0, + "grad_norm": 0.03890900728724459, + "language_loss": 0.96058643, + "learning_rate": 0.0009846275946295592, + "loss": 0.97135282, + "num_input_tokens_seen": 45766640, + "router_z_loss_mlp": 0.66699219, + "step": 555, + "time_per_iteration": 2.619490623474121 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074794, + "balance_loss_mlp": 1.00813222, + "epoch": 0.10696421700654098, + "flos": 657582308352.0, + "grad_norm": 0.03350037319549477, + "language_loss": 0.89189553, + "learning_rate": 0.0009845508431843518, + "loss": 0.9026435, + "num_input_tokens_seen": 45851408, + "router_z_loss_mlp": 0.66699219, + "step": 556, + "time_per_iteration": 3.0074055194854736 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075663, + "balance_loss_mlp": 1.00895333, + "epoch": 0.10715659869180454, + "flos": 568793398272.0, + "grad_norm": 0.03867425342149035, + "language_loss": 0.90383601, + "learning_rate": 0.0009844739036198233, + "loss": 0.91459262, + "num_input_tokens_seen": 45919824, + "router_z_loss_mlp": 0.66748047, + "step": 557, + "time_per_iteration": 2.719309091567993 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073849, + "balance_loss_mlp": 1.00756896, + "epoch": 0.10734898037706811, + "flos": 541744565760.0, + "grad_norm": 0.03845092177051005, + "language_loss": 0.97656357, + "learning_rate": 0.0009843967759658448, + "loss": 0.98730206, + "num_input_tokens_seen": 45991024, + "router_z_loss_mlp": 0.66308594, + "step": 558, + "time_per_iteration": 2.679964065551758 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077583, + "balance_loss_mlp": 1.01311493, + "epoch": 0.10754136206233167, + "flos": 1479734192640.0, + "grad_norm": 0.013283033162601723, + "language_loss": 0.72767758, + "learning_rate": 0.0009843194602523592, + "loss": 0.73845339, + "num_input_tokens_seen": 46212736, + "router_z_loss_mlp": 0.64453125, + "step": 559, + "time_per_iteration": 4.837440729141235 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107582, + "balance_loss_mlp": 1.00977802, + "epoch": 0.10773374374759523, + "flos": 513412601088.0, + "grad_norm": 0.03702065367467253, + "language_loss": 0.97501957, + "learning_rate": 0.000984241956509384, + "loss": 0.98577774, + "num_input_tokens_seen": 46283920, + "router_z_loss_mlp": 0.66064453, + "step": 560, + "time_per_iteration": 2.6579978466033936 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079218, + "balance_loss_mlp": 1.01312864, + "epoch": 0.10792612543285879, + "flos": 497478422784.0, + "grad_norm": 0.05173888564395698, + "language_loss": 0.9404971, + "learning_rate": 0.0009841642647670078, + "loss": 0.9512893, + "num_input_tokens_seen": 46349664, + "router_z_loss_mlp": 0.66113281, + "step": 561, + "time_per_iteration": 2.557605743408203 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080143, + "balance_loss_mlp": 1.01429176, + "epoch": 0.10811850711812235, + "flos": 736838128128.0, + "grad_norm": 0.0493873548723288, + "language_loss": 0.88547891, + "learning_rate": 0.0009840863850553944, + "loss": 0.89628035, + "num_input_tokens_seen": 46432688, + "router_z_loss_mlp": 0.65869141, + "step": 562, + "time_per_iteration": 2.949580669403076 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077668, + "balance_loss_mlp": 1.0115304, + "epoch": 0.10831088880338592, + "flos": 612677517312.0, + "grad_norm": 0.04173462884607535, + "language_loss": 0.94150907, + "learning_rate": 0.0009840083174047782, + "loss": 0.95228577, + "num_input_tokens_seen": 46507216, + "router_z_loss_mlp": 0.66162109, + "step": 563, + "time_per_iteration": 2.733344078063965 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081559, + "balance_loss_mlp": 1.01561248, + "epoch": 0.10850327048864948, + "flos": 557498909184.0, + "grad_norm": 0.034100755270258146, + "language_loss": 0.88515103, + "learning_rate": 0.0009839300618454685, + "loss": 0.89596659, + "num_input_tokens_seen": 46590464, + "router_z_loss_mlp": 0.65966797, + "step": 564, + "time_per_iteration": 2.8846256732940674 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080528, + "balance_loss_mlp": 1.0148201, + "epoch": 0.10869565217391304, + "flos": 604437274368.0, + "grad_norm": 0.036735298053950545, + "language_loss": 0.93941957, + "learning_rate": 0.0009838516184078466, + "loss": 0.95022488, + "num_input_tokens_seen": 46666240, + "router_z_loss_mlp": 0.65722656, + "step": 565, + "time_per_iteration": 2.813284158706665 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078727, + "balance_loss_mlp": 1.01297164, + "epoch": 0.1088880338591766, + "flos": 527206280448.0, + "grad_norm": 0.040314305725270186, + "language_loss": 0.91096556, + "learning_rate": 0.0009837729871223669, + "loss": 0.92175281, + "num_input_tokens_seen": 46734288, + "router_z_loss_mlp": 0.65771484, + "step": 566, + "time_per_iteration": 2.651611089706421 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078801, + "balance_loss_mlp": 1.01318836, + "epoch": 0.10908041554444017, + "flos": 621417402624.0, + "grad_norm": 0.042325065837349046, + "language_loss": 0.91458869, + "learning_rate": 0.0009836941680195568, + "loss": 0.92537665, + "num_input_tokens_seen": 46809920, + "router_z_loss_mlp": 0.65625, + "step": 567, + "time_per_iteration": 2.8296427726745605 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081032, + "balance_loss_mlp": 1.01508534, + "epoch": 0.10927279722970373, + "flos": 899674507008.0, + "grad_norm": 0.04990856516123606, + "language_loss": 0.87414277, + "learning_rate": 0.0009836151611300166, + "loss": 0.88495302, + "num_input_tokens_seen": 46889984, + "router_z_loss_mlp": 0.65966797, + "step": 568, + "time_per_iteration": 3.2401816844940186 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107825, + "balance_loss_mlp": 1.01206517, + "epoch": 0.10946517891496729, + "flos": 529700613120.0, + "grad_norm": 0.0427731854110213, + "language_loss": 0.96863574, + "learning_rate": 0.0009835359664844194, + "loss": 0.97941828, + "num_input_tokens_seen": 46959536, + "router_z_loss_mlp": 0.66210938, + "step": 569, + "time_per_iteration": 2.6190173625946045 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064438, + "balance_loss_mlp": 1.00092316, + "epoch": 0.10965756060023085, + "flos": 1563994228992.0, + "grad_norm": 0.005811935039235345, + "language_loss": 0.81036806, + "learning_rate": 0.0009834565841135114, + "loss": 0.8210125, + "num_input_tokens_seen": 47196960, + "router_z_loss_mlp": 0.63476562, + "step": 570, + "time_per_iteration": 4.957117795944214 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080699, + "balance_loss_mlp": 1.0151341, + "epoch": 0.10984994228549443, + "flos": 514100826624.0, + "grad_norm": 0.04369440603786518, + "language_loss": 0.94858396, + "learning_rate": 0.0009833770140481118, + "loss": 0.95939088, + "num_input_tokens_seen": 47266560, + "router_z_loss_mlp": 0.65576172, + "step": 571, + "time_per_iteration": 2.6529860496520996 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086256, + "balance_loss_mlp": 1.02059519, + "epoch": 0.11004232397075799, + "flos": 956275252992.0, + "grad_norm": 0.04378732511153692, + "language_loss": 0.85010409, + "learning_rate": 0.000983297256319112, + "loss": 0.86096668, + "num_input_tokens_seen": 47348512, + "router_z_loss_mlp": 0.65673828, + "step": 572, + "time_per_iteration": 3.2036497592926025 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080603, + "balance_loss_mlp": 1.01499045, + "epoch": 0.11023470565602154, + "flos": 489229431552.0, + "grad_norm": 0.043497603291787354, + "language_loss": 0.89141667, + "learning_rate": 0.000983217310957477, + "loss": 0.90222269, + "num_input_tokens_seen": 47425392, + "router_z_loss_mlp": 0.65625, + "step": 573, + "time_per_iteration": 2.7763278484344482 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078757, + "balance_loss_mlp": 1.01333535, + "epoch": 0.1104270873412851, + "flos": 656991292416.0, + "grad_norm": 0.04901418812727031, + "language_loss": 0.9269613, + "learning_rate": 0.000983137177994244, + "loss": 0.93774891, + "num_input_tokens_seen": 47502336, + "router_z_loss_mlp": 0.65429688, + "step": 574, + "time_per_iteration": 2.8529646396636963 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080019, + "balance_loss_mlp": 1.01474011, + "epoch": 0.11061946902654868, + "flos": 724748488704.0, + "grad_norm": 0.03457948694206611, + "language_loss": 0.87449324, + "learning_rate": 0.0009830568574605235, + "loss": 0.88529336, + "num_input_tokens_seen": 47583552, + "router_z_loss_mlp": 0.65283203, + "step": 575, + "time_per_iteration": 2.94710373878479 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010791, + "balance_loss_mlp": 1.01367807, + "epoch": 0.11081185071181224, + "flos": 836869037568.0, + "grad_norm": 0.04085001299476677, + "language_loss": 0.90086508, + "learning_rate": 0.0009829763493874992, + "loss": 0.91165602, + "num_input_tokens_seen": 47663440, + "router_z_loss_mlp": 0.65429688, + "step": 576, + "time_per_iteration": 3.0296730995178223 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107807, + "balance_loss_mlp": 1.01283884, + "epoch": 0.1110042323970758, + "flos": 610283306496.0, + "grad_norm": 0.03775485835018356, + "language_loss": 0.95256275, + "learning_rate": 0.0009828956538064264, + "loss": 0.9633435, + "num_input_tokens_seen": 47741920, + "router_z_loss_mlp": 0.65234375, + "step": 577, + "time_per_iteration": 2.7944416999816895 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073929, + "balance_loss_mlp": 1.00893569, + "epoch": 0.11119661408233936, + "flos": 597040792320.0, + "grad_norm": 0.04378674390965236, + "language_loss": 0.93033826, + "learning_rate": 0.0009828147707486344, + "loss": 0.94107759, + "num_input_tokens_seen": 47815136, + "router_z_loss_mlp": 0.64990234, + "step": 578, + "time_per_iteration": 2.7034592628479004 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075842, + "balance_loss_mlp": 1.01099229, + "epoch": 0.11138899576760293, + "flos": 556888451328.0, + "grad_norm": 0.05042820660432219, + "language_loss": 0.89312434, + "learning_rate": 0.0009827337002455245, + "loss": 0.90388274, + "num_input_tokens_seen": 47881360, + "router_z_loss_mlp": 0.6484375, + "step": 579, + "time_per_iteration": 2.6187195777893066 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074948, + "balance_loss_mlp": 1.01057482, + "epoch": 0.11158137745286649, + "flos": 691063331328.0, + "grad_norm": 0.03501309245374513, + "language_loss": 0.89977694, + "learning_rate": 0.0009826524423285712, + "loss": 0.91052639, + "num_input_tokens_seen": 47962720, + "router_z_loss_mlp": 0.64355469, + "step": 580, + "time_per_iteration": 2.9009909629821777 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079601, + "balance_loss_mlp": 1.0148946, + "epoch": 0.11177375913813005, + "flos": 764307868416.0, + "grad_norm": 0.04023884017549449, + "language_loss": 0.91280103, + "learning_rate": 0.0009825709970293218, + "loss": 0.92359698, + "num_input_tokens_seen": 48035472, + "router_z_loss_mlp": 0.64697266, + "step": 581, + "time_per_iteration": 2.9111618995666504 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074116, + "balance_loss_mlp": 1.0095998, + "epoch": 0.11196614082339361, + "flos": 808031594496.0, + "grad_norm": 0.038028140255108665, + "language_loss": 0.97163212, + "learning_rate": 0.0009824893643793956, + "loss": 0.98237336, + "num_input_tokens_seen": 48116944, + "router_z_loss_mlp": 0.64501953, + "step": 582, + "time_per_iteration": 3.0907368659973145 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072862, + "balance_loss_mlp": 1.00796497, + "epoch": 0.11215852250865718, + "flos": 559725924096.0, + "grad_norm": 0.04580369165919148, + "language_loss": 0.90464842, + "learning_rate": 0.0009824075444104857, + "loss": 0.91537702, + "num_input_tokens_seen": 48187808, + "router_z_loss_mlp": 0.64892578, + "step": 583, + "time_per_iteration": 2.7276525497436523 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107742, + "balance_loss_mlp": 1.01285601, + "epoch": 0.11235090419392074, + "flos": 514576169472.0, + "grad_norm": 0.03926612419770205, + "language_loss": 0.95381963, + "learning_rate": 0.000982325537154357, + "loss": 0.96459383, + "num_input_tokens_seen": 48254464, + "router_z_loss_mlp": 0.64550781, + "step": 584, + "time_per_iteration": 2.6261777877807617 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074375, + "balance_loss_mlp": 1.0100019, + "epoch": 0.1125432858791843, + "flos": 492433377024.0, + "grad_norm": 0.043221505898455144, + "language_loss": 0.96143711, + "learning_rate": 0.0009822433426428484, + "loss": 0.97218084, + "num_input_tokens_seen": 48318784, + "router_z_loss_mlp": 0.64355469, + "step": 585, + "time_per_iteration": 2.5630125999450684 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075075, + "balance_loss_mlp": 1.01079714, + "epoch": 0.11273566756444786, + "flos": 511728003072.0, + "grad_norm": 0.04466131563000304, + "language_loss": 0.88984096, + "learning_rate": 0.0009821609609078697, + "loss": 0.90059173, + "num_input_tokens_seen": 48389248, + "router_z_loss_mlp": 0.64257812, + "step": 586, + "time_per_iteration": 2.649122953414917 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075103, + "balance_loss_mlp": 1.01077783, + "epoch": 0.11292804924971142, + "flos": 623640526848.0, + "grad_norm": 0.03579172726266892, + "language_loss": 0.91595018, + "learning_rate": 0.0009820783919814045, + "loss": 0.92670119, + "num_input_tokens_seen": 48463312, + "router_z_loss_mlp": 0.64306641, + "step": 587, + "time_per_iteration": 2.7977845668792725 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072627, + "balance_loss_mlp": 1.00830126, + "epoch": 0.113120430934975, + "flos": 479039218176.0, + "grad_norm": 0.04738669495581529, + "language_loss": 0.85574889, + "learning_rate": 0.0009819956358955095, + "loss": 0.86647511, + "num_input_tokens_seen": 48531856, + "router_z_loss_mlp": 0.64306641, + "step": 588, + "time_per_iteration": 2.59133243560791 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076686, + "balance_loss_mlp": 1.01245642, + "epoch": 0.11331281262023855, + "flos": 467991638016.0, + "grad_norm": 0.048752038127388646, + "language_loss": 0.86982751, + "learning_rate": 0.0009819126926823127, + "loss": 0.88059437, + "num_input_tokens_seen": 48596640, + "router_z_loss_mlp": 0.64208984, + "step": 589, + "time_per_iteration": 2.511939764022827 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075971, + "balance_loss_mlp": 1.01174104, + "epoch": 0.11350519430550211, + "flos": 651611854848.0, + "grad_norm": 0.04204370934342767, + "language_loss": 0.89311969, + "learning_rate": 0.000981829562374016, + "loss": 0.9038794, + "num_input_tokens_seen": 48669648, + "router_z_loss_mlp": 0.64208984, + "step": 590, + "time_per_iteration": 2.798734426498413 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107506, + "balance_loss_mlp": 1.01111591, + "epoch": 0.11369757599076567, + "flos": 558861754368.0, + "grad_norm": 0.04723710161718091, + "language_loss": 0.99783856, + "learning_rate": 0.0009817462450028933, + "loss": 1.00858927, + "num_input_tokens_seen": 48737392, + "router_z_loss_mlp": 0.63916016, + "step": 591, + "time_per_iteration": 2.717622756958008 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076867, + "balance_loss_mlp": 1.01316178, + "epoch": 0.11388995767602925, + "flos": 572306457600.0, + "grad_norm": 0.041300229846526024, + "language_loss": 0.87103492, + "learning_rate": 0.0009816627406012916, + "loss": 0.88180363, + "num_input_tokens_seen": 48817136, + "router_z_loss_mlp": 0.63671875, + "step": 592, + "time_per_iteration": 2.783677339553833 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077614, + "balance_loss_mlp": 1.01376593, + "epoch": 0.1140823393612928, + "flos": 741744168192.0, + "grad_norm": 0.04574882804976793, + "language_loss": 0.87044728, + "learning_rate": 0.0009815790492016295, + "loss": 0.88122344, + "num_input_tokens_seen": 48895808, + "router_z_loss_mlp": 0.63818359, + "step": 593, + "time_per_iteration": 2.920262336730957 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079358, + "balance_loss_mlp": 1.01560438, + "epoch": 0.11427472104655637, + "flos": 700252314624.0, + "grad_norm": 0.042792726491020304, + "language_loss": 0.89086539, + "learning_rate": 0.0009814951708363993, + "loss": 0.90165901, + "num_input_tokens_seen": 48967456, + "router_z_loss_mlp": 0.63720703, + "step": 594, + "time_per_iteration": 2.8244025707244873 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069794, + "balance_loss_mlp": 1.00799561, + "epoch": 0.11446710273181993, + "flos": 1480355344128.0, + "grad_norm": 0.0135056408383676, + "language_loss": 0.77990985, + "learning_rate": 0.0009814111055381654, + "loss": 0.79060781, + "num_input_tokens_seen": 49193152, + "router_z_loss_mlp": 0.6171875, + "step": 595, + "time_per_iteration": 4.779642105102539 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075224, + "balance_loss_mlp": 1.01189995, + "epoch": 0.1146594844170835, + "flos": 495913388544.0, + "grad_norm": 0.038757735955663945, + "language_loss": 0.90035105, + "learning_rate": 0.0009813268533395648, + "loss": 0.91110331, + "num_input_tokens_seen": 49260960, + "router_z_loss_mlp": 0.6328125, + "step": 596, + "time_per_iteration": 2.5933825969696045 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082133, + "balance_loss_mlp": 1.01895213, + "epoch": 0.11485186610234706, + "flos": 475791531264.0, + "grad_norm": 0.0538004660752225, + "language_loss": 0.90474582, + "learning_rate": 0.0009812424142733073, + "loss": 0.9155671, + "num_input_tokens_seen": 49327616, + "router_z_loss_mlp": 0.63134766, + "step": 597, + "time_per_iteration": 2.528027296066284 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073976, + "balance_loss_mlp": 1.01089013, + "epoch": 0.11504424778761062, + "flos": 732620313600.0, + "grad_norm": 0.03283482462688361, + "language_loss": 0.87953097, + "learning_rate": 0.000981157788372175, + "loss": 0.89027071, + "num_input_tokens_seen": 49412864, + "router_z_loss_mlp": 0.63037109, + "step": 598, + "time_per_iteration": 3.008469343185425 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074784, + "balance_loss_mlp": 1.01160276, + "epoch": 0.11523662947287418, + "flos": 546963610368.0, + "grad_norm": 0.037424804687157906, + "language_loss": 0.91041148, + "learning_rate": 0.0009810729756690223, + "loss": 0.92115927, + "num_input_tokens_seen": 49483584, + "router_z_loss_mlp": 0.63134766, + "step": 599, + "time_per_iteration": 2.75840163230896 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077312, + "balance_loss_mlp": 1.01408339, + "epoch": 0.11542901115813775, + "flos": 776388759552.0, + "grad_norm": 0.04126969924944996, + "language_loss": 0.9391377, + "learning_rate": 0.0009809879761967766, + "loss": 0.94991082, + "num_input_tokens_seen": 49563568, + "router_z_loss_mlp": 0.63183594, + "step": 600, + "time_per_iteration": 2.9511778354644775 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081843, + "balance_loss_mlp": 1.01828074, + "epoch": 0.11562139284340131, + "flos": 732213990144.0, + "grad_norm": 0.05544181306164312, + "language_loss": 0.88981479, + "learning_rate": 0.0009809027899884378, + "loss": 0.90063322, + "num_input_tokens_seen": 49640800, + "router_z_loss_mlp": 0.63525391, + "step": 601, + "time_per_iteration": 2.888591766357422 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076176, + "balance_loss_mlp": 1.01256609, + "epoch": 0.11581377452866487, + "flos": 537040714752.0, + "grad_norm": 0.03483284203155477, + "language_loss": 0.90335476, + "learning_rate": 0.0009808174170770779, + "loss": 0.9141165, + "num_input_tokens_seen": 49721872, + "router_z_loss_mlp": 0.63574219, + "step": 602, + "time_per_iteration": 2.7933802604675293 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073479, + "balance_loss_mlp": 1.01263428, + "epoch": 0.11600615621392843, + "flos": 1559214555648.0, + "grad_norm": 0.012041981792172347, + "language_loss": 0.84898245, + "learning_rate": 0.0009807318574958418, + "loss": 0.85971725, + "num_input_tokens_seen": 49951472, + "router_z_loss_mlp": 0.60742188, + "step": 603, + "time_per_iteration": 4.875667572021484 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079811, + "balance_loss_mlp": 1.01658237, + "epoch": 0.116198537899192, + "flos": 538468688640.0, + "grad_norm": 0.046063141341509364, + "language_loss": 0.95944118, + "learning_rate": 0.0009806461112779462, + "loss": 0.97023928, + "num_input_tokens_seen": 50021136, + "router_z_loss_mlp": 0.63183594, + "step": 604, + "time_per_iteration": 2.708552360534668 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077772, + "balance_loss_mlp": 1.01444781, + "epoch": 0.11639091958445556, + "flos": 455137950720.0, + "grad_norm": 0.05737724930332189, + "language_loss": 0.90764457, + "learning_rate": 0.0009805601784566814, + "loss": 0.91842222, + "num_input_tokens_seen": 50083888, + "router_z_loss_mlp": 0.6328125, + "step": 605, + "time_per_iteration": 2.545696496963501 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076475, + "balance_loss_mlp": 1.01329422, + "epoch": 0.11658330126971912, + "flos": 556152593664.0, + "grad_norm": 0.04016687987230144, + "language_loss": 0.97276044, + "learning_rate": 0.0009804740590654089, + "loss": 0.98352522, + "num_input_tokens_seen": 50151744, + "router_z_loss_mlp": 0.63134766, + "step": 606, + "time_per_iteration": 2.6464574337005615 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077132, + "balance_loss_mlp": 1.01399851, + "epoch": 0.11677568295498268, + "flos": 717601827840.0, + "grad_norm": 0.0453344941203476, + "language_loss": 0.91881627, + "learning_rate": 0.0009803877531375635, + "loss": 0.9295876, + "num_input_tokens_seen": 50221248, + "router_z_loss_mlp": 0.63085938, + "step": 607, + "time_per_iteration": 2.8467392921447754 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074228, + "balance_loss_mlp": 1.0111903, + "epoch": 0.11696806464024626, + "flos": 610899600384.0, + "grad_norm": 0.04469679718872237, + "language_loss": 0.92976171, + "learning_rate": 0.0009803012607066523, + "loss": 0.94050401, + "num_input_tokens_seen": 50293792, + "router_z_loss_mlp": 0.62988281, + "step": 608, + "time_per_iteration": 2.7587811946868896 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073203, + "balance_loss_mlp": 1.01030838, + "epoch": 0.11716044632550981, + "flos": 521416628736.0, + "grad_norm": 0.04044307397502579, + "language_loss": 0.91207683, + "learning_rate": 0.0009802145818062543, + "loss": 0.92280889, + "num_input_tokens_seen": 50367760, + "router_z_loss_mlp": 0.62841797, + "step": 609, + "time_per_iteration": 2.7623538970947266 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107199, + "balance_loss_mlp": 1.00919068, + "epoch": 0.11735282801077337, + "flos": 508489064448.0, + "grad_norm": 0.04251091083777229, + "language_loss": 0.93763256, + "learning_rate": 0.0009801277164700212, + "loss": 0.9483524, + "num_input_tokens_seen": 50435664, + "router_z_loss_mlp": 0.62744141, + "step": 610, + "time_per_iteration": 2.6250369548797607 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079805, + "balance_loss_mlp": 1.0171963, + "epoch": 0.11754520969603693, + "flos": 687837031680.0, + "grad_norm": 0.044835447829723894, + "language_loss": 0.91796255, + "learning_rate": 0.0009800406647316776, + "loss": 0.92876053, + "num_input_tokens_seen": 50514144, + "router_z_loss_mlp": 0.62548828, + "step": 611, + "time_per_iteration": 2.81438946723938 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01058731, + "balance_loss_mlp": 0.99807739, + "epoch": 0.1177375913813005, + "flos": 1545759158784.0, + "grad_norm": 0.00493114536612535, + "language_loss": 0.76914459, + "learning_rate": 0.0009799534266250196, + "loss": 0.77973187, + "num_input_tokens_seen": 50738448, + "router_z_loss_mlp": 0.60546875, + "step": 612, + "time_per_iteration": 4.795796871185303 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073024, + "balance_loss_mlp": 1.01008153, + "epoch": 0.11792997306656407, + "flos": 521538137856.0, + "grad_norm": 0.049162221556570344, + "language_loss": 0.91035461, + "learning_rate": 0.000979866002183916, + "loss": 0.92108488, + "num_input_tokens_seen": 50809328, + "router_z_loss_mlp": 0.62890625, + "step": 613, + "time_per_iteration": 2.6470768451690674 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071385, + "balance_loss_mlp": 1.00820458, + "epoch": 0.11812235475182763, + "flos": 667489652736.0, + "grad_norm": 0.0453482214384289, + "language_loss": 0.92239928, + "learning_rate": 0.0009797783914423082, + "loss": 0.93311322, + "num_input_tokens_seen": 50887728, + "router_z_loss_mlp": 0.63134766, + "step": 614, + "time_per_iteration": 2.8020856380462646 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107539, + "balance_loss_mlp": 1.01220894, + "epoch": 0.11831473643709119, + "flos": 622505148672.0, + "grad_norm": 0.04034391423157231, + "language_loss": 0.86097217, + "learning_rate": 0.0009796905944342094, + "loss": 0.87172604, + "num_input_tokens_seen": 50966160, + "router_z_loss_mlp": 0.63134766, + "step": 615, + "time_per_iteration": 2.839617967605591 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079046, + "balance_loss_mlp": 1.0160079, + "epoch": 0.11850711812235475, + "flos": 457695466752.0, + "grad_norm": 0.03330066749319758, + "language_loss": 0.89949274, + "learning_rate": 0.0009796026111937057, + "loss": 0.91028321, + "num_input_tokens_seen": 51035712, + "router_z_loss_mlp": 0.62988281, + "step": 616, + "time_per_iteration": 2.6211540699005127 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077601, + "balance_loss_mlp": 1.0150882, + "epoch": 0.11869949980761832, + "flos": 514928057856.0, + "grad_norm": 0.034464018290856886, + "language_loss": 0.90251315, + "learning_rate": 0.0009795144417549552, + "loss": 0.91328913, + "num_input_tokens_seen": 51108656, + "router_z_loss_mlp": 0.62451172, + "step": 617, + "time_per_iteration": 2.6946897506713867 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080332, + "balance_loss_mlp": 1.01815259, + "epoch": 0.11889188149288188, + "flos": 536157103104.0, + "grad_norm": 0.035314864293198016, + "language_loss": 0.91583192, + "learning_rate": 0.0009794260861521883, + "loss": 0.92663527, + "num_input_tokens_seen": 51185552, + "router_z_loss_mlp": 0.62109375, + "step": 618, + "time_per_iteration": 2.77822208404541 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081689, + "balance_loss_mlp": 1.01979554, + "epoch": 0.11908426317814544, + "flos": 499645166592.0, + "grad_norm": 0.042334404758790994, + "language_loss": 0.88659471, + "learning_rate": 0.0009793375444197075, + "loss": 0.89741158, + "num_input_tokens_seen": 51255808, + "router_z_loss_mlp": 0.61816406, + "step": 619, + "time_per_iteration": 2.6199400424957275 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086152, + "balance_loss_mlp": 1.02416277, + "epoch": 0.119276644863409, + "flos": 661068155904.0, + "grad_norm": 0.043937618111938345, + "language_loss": 0.86906028, + "learning_rate": 0.000979248816591888, + "loss": 0.87992179, + "num_input_tokens_seen": 51329408, + "router_z_loss_mlp": 0.61914062, + "step": 620, + "time_per_iteration": 2.789858341217041 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081653, + "balance_loss_mlp": 1.01947308, + "epoch": 0.11946902654867257, + "flos": 760153237248.0, + "grad_norm": 0.04701199265522289, + "language_loss": 0.87992656, + "learning_rate": 0.0009791599027031766, + "loss": 0.89074314, + "num_input_tokens_seen": 51408784, + "router_z_loss_mlp": 0.62109375, + "step": 621, + "time_per_iteration": 3.026487350463867 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074317, + "balance_loss_mlp": 1.01223314, + "epoch": 0.11966140823393613, + "flos": 682214575872.0, + "grad_norm": 0.0506686420393155, + "language_loss": 0.88143325, + "learning_rate": 0.0009790708027880932, + "loss": 0.89217639, + "num_input_tokens_seen": 51482592, + "router_z_loss_mlp": 0.62011719, + "step": 622, + "time_per_iteration": 2.8321774005889893 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081512, + "balance_loss_mlp": 1.02143097, + "epoch": 0.11985378991919969, + "flos": 1454300938752.0, + "grad_norm": 0.023212611497014573, + "language_loss": 0.77427292, + "learning_rate": 0.0009789815168812293, + "loss": 0.78508806, + "num_input_tokens_seen": 51712240, + "router_z_loss_mlp": 0.59960938, + "step": 623, + "time_per_iteration": 4.862462759017944 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071261, + "balance_loss_mlp": 1.00936747, + "epoch": 0.12004617160446325, + "flos": 528899626752.0, + "grad_norm": 0.04437858339694968, + "language_loss": 0.95209736, + "learning_rate": 0.0009788920450172487, + "loss": 0.96280998, + "num_input_tokens_seen": 51781440, + "router_z_loss_mlp": 0.61816406, + "step": 624, + "time_per_iteration": 2.630764961242676 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078232, + "balance_loss_mlp": 1.01619518, + "epoch": 0.12023855328972682, + "flos": 475177182720.0, + "grad_norm": 0.048047229360432486, + "language_loss": 0.92430472, + "learning_rate": 0.0009788023872308875, + "loss": 0.93508708, + "num_input_tokens_seen": 51845424, + "router_z_loss_mlp": 0.61962891, + "step": 625, + "time_per_iteration": 2.5534780025482178 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076523, + "balance_loss_mlp": 1.01682281, + "epoch": 0.12043093497499038, + "flos": 1535054718720.0, + "grad_norm": 0.022021305117703366, + "language_loss": 0.75428998, + "learning_rate": 0.0009787125435569539, + "loss": 0.7650553, + "num_input_tokens_seen": 52076496, + "router_z_loss_mlp": 0.59570312, + "step": 626, + "time_per_iteration": 4.738527536392212 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108475, + "balance_loss_mlp": 1.023, + "epoch": 0.12062331666025394, + "flos": 540915389184.0, + "grad_norm": 0.04663901515177362, + "language_loss": 0.9603011, + "learning_rate": 0.0009786225140303285, + "loss": 0.97114861, + "num_input_tokens_seen": 52143072, + "router_z_loss_mlp": 0.61669922, + "step": 627, + "time_per_iteration": 2.634160280227661 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085928, + "balance_loss_mlp": 1.02427304, + "epoch": 0.1208156983455175, + "flos": 513000441600.0, + "grad_norm": 0.042540459475059536, + "language_loss": 0.94019556, + "learning_rate": 0.0009785322986859634, + "loss": 0.95105481, + "num_input_tokens_seen": 52211888, + "router_z_loss_mlp": 0.61572266, + "step": 628, + "time_per_iteration": 2.681070327758789 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078772, + "balance_loss_mlp": 1.01725972, + "epoch": 0.12100808003078108, + "flos": 597590012160.0, + "grad_norm": 0.03866803919075334, + "language_loss": 0.94614279, + "learning_rate": 0.0009784418975588838, + "loss": 0.95693052, + "num_input_tokens_seen": 52283696, + "router_z_loss_mlp": 0.61425781, + "step": 629, + "time_per_iteration": 2.7337839603424072 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073027, + "balance_loss_mlp": 1.01132393, + "epoch": 0.12120046171604464, + "flos": 524067463680.0, + "grad_norm": 0.03279843121618067, + "language_loss": 0.94581258, + "learning_rate": 0.0009783513106841862, + "loss": 0.95654285, + "num_input_tokens_seen": 52358624, + "router_z_loss_mlp": 0.61621094, + "step": 630, + "time_per_iteration": 2.702615737915039 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080086, + "balance_loss_mlp": 1.01981354, + "epoch": 0.1213928434013082, + "flos": 1557910036224.0, + "grad_norm": 0.01502333088768157, + "language_loss": 0.76732707, + "learning_rate": 0.00097826053809704, + "loss": 0.77812791, + "num_input_tokens_seen": 52591248, + "router_z_loss_mlp": 0.6015625, + "step": 631, + "time_per_iteration": 4.998409032821655 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080339, + "balance_loss_mlp": 1.01835024, + "epoch": 0.12158522508657175, + "flos": 496388731392.0, + "grad_norm": 0.04174070683076465, + "language_loss": 0.89320499, + "learning_rate": 0.0009781695798326854, + "loss": 0.90400839, + "num_input_tokens_seen": 52659920, + "router_z_loss_mlp": 0.61914062, + "step": 632, + "time_per_iteration": 2.5908379554748535 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079195, + "balance_loss_mlp": 1.01744485, + "epoch": 0.12177760677183531, + "flos": 476590572288.0, + "grad_norm": 0.04165368210868703, + "language_loss": 0.89744723, + "learning_rate": 0.0009780784359264365, + "loss": 0.90823919, + "num_input_tokens_seen": 52728832, + "router_z_loss_mlp": 0.61669922, + "step": 633, + "time_per_iteration": 2.689202070236206 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073334, + "balance_loss_mlp": 1.01382446, + "epoch": 0.12196998845709889, + "flos": 1471787512320.0, + "grad_norm": 0.011333314510513573, + "language_loss": 0.74188697, + "learning_rate": 0.0009779871064136778, + "loss": 0.75262028, + "num_input_tokens_seen": 52949776, + "router_z_loss_mlp": 0.59375, + "step": 634, + "time_per_iteration": 4.762145757675171 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073337, + "balance_loss_mlp": 1.01187229, + "epoch": 0.12216237014236245, + "flos": 587749741824.0, + "grad_norm": 0.03178889939160208, + "language_loss": 0.88649213, + "learning_rate": 0.000977895591329867, + "loss": 0.8972255, + "num_input_tokens_seen": 53027184, + "router_z_loss_mlp": 0.61376953, + "step": 635, + "time_per_iteration": 2.7996504306793213 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075051, + "balance_loss_mlp": 1.01372933, + "epoch": 0.12235475182762601, + "flos": 599107414272.0, + "grad_norm": 0.038321985001081305, + "language_loss": 0.88459468, + "learning_rate": 0.000977803890710533, + "loss": 0.89534515, + "num_input_tokens_seen": 53101072, + "router_z_loss_mlp": 0.61230469, + "step": 636, + "time_per_iteration": 2.7200405597686768 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072884, + "balance_loss_mlp": 1.0117538, + "epoch": 0.12254713351288957, + "flos": 498761554944.0, + "grad_norm": 0.03313527469264444, + "language_loss": 0.94808865, + "learning_rate": 0.0009777120045912774, + "loss": 0.95881748, + "num_input_tokens_seen": 53172992, + "router_z_loss_mlp": 0.61035156, + "step": 637, + "time_per_iteration": 2.6253507137298584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072019, + "balance_loss_mlp": 1.01084125, + "epoch": 0.12273951519815314, + "flos": 606981184512.0, + "grad_norm": 0.04065251745031248, + "language_loss": 0.91558111, + "learning_rate": 0.0009776199330077736, + "loss": 0.92630136, + "num_input_tokens_seen": 53248256, + "router_z_loss_mlp": 0.61083984, + "step": 638, + "time_per_iteration": 2.724416732788086 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069779, + "balance_loss_mlp": 1.0086484, + "epoch": 0.1229318968834167, + "flos": 598985905152.0, + "grad_norm": 0.04427923240085457, + "language_loss": 0.94062102, + "learning_rate": 0.0009775276759957667, + "loss": 0.9513188, + "num_input_tokens_seen": 53318960, + "router_z_loss_mlp": 0.61035156, + "step": 639, + "time_per_iteration": 2.756307601928711 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070032, + "balance_loss_mlp": 1.00851989, + "epoch": 0.12312427856868026, + "flos": 679589985792.0, + "grad_norm": 0.04435656949952303, + "language_loss": 0.91938198, + "learning_rate": 0.0009774352335910745, + "loss": 0.93008226, + "num_input_tokens_seen": 53389120, + "router_z_loss_mlp": 0.61425781, + "step": 640, + "time_per_iteration": 2.8135974407196045 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072128, + "balance_loss_mlp": 1.01095021, + "epoch": 0.12331666025394382, + "flos": 610044178944.0, + "grad_norm": 0.03352322480141845, + "language_loss": 0.95842457, + "learning_rate": 0.000977342605829586, + "loss": 0.96914589, + "num_input_tokens_seen": 53459056, + "router_z_loss_mlp": 0.61083984, + "step": 641, + "time_per_iteration": 2.734373092651367 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107341, + "balance_loss_mlp": 1.01208854, + "epoch": 0.12350904193920739, + "flos": 763841273856.0, + "grad_norm": 0.04166007448412618, + "language_loss": 0.87458932, + "learning_rate": 0.0009772497927472623, + "loss": 0.88532341, + "num_input_tokens_seen": 53541552, + "router_z_loss_mlp": 0.61230469, + "step": 642, + "time_per_iteration": 3.069495677947998 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107346, + "balance_loss_mlp": 1.01199543, + "epoch": 0.12370142362447095, + "flos": 542050767360.0, + "grad_norm": 0.04189965725350253, + "language_loss": 0.86664522, + "learning_rate": 0.0009771567943801368, + "loss": 0.87737978, + "num_input_tokens_seen": 53611520, + "router_z_loss_mlp": 0.61376953, + "step": 643, + "time_per_iteration": 2.6783955097198486 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071185, + "balance_loss_mlp": 1.01000655, + "epoch": 0.12389380530973451, + "flos": 549253808640.0, + "grad_norm": 0.03907898995026106, + "language_loss": 0.90534973, + "learning_rate": 0.0009770636107643152, + "loss": 0.91606158, + "num_input_tokens_seen": 53683888, + "router_z_loss_mlp": 0.61083984, + "step": 644, + "time_per_iteration": 2.7792532444000244 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107442, + "balance_loss_mlp": 1.01343274, + "epoch": 0.12408618699499807, + "flos": 541353793536.0, + "grad_norm": 0.03775088580197231, + "language_loss": 0.89077818, + "learning_rate": 0.0009769702419359738, + "loss": 0.9015224, + "num_input_tokens_seen": 53751888, + "router_z_loss_mlp": 0.60888672, + "step": 645, + "time_per_iteration": 2.6660075187683105 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071725, + "balance_loss_mlp": 1.01083338, + "epoch": 0.12427856868026164, + "flos": 747160544256.0, + "grad_norm": 0.03491310842571494, + "language_loss": 0.90435565, + "learning_rate": 0.000976876687931362, + "loss": 0.91507292, + "num_input_tokens_seen": 53827648, + "router_z_loss_mlp": 0.60791016, + "step": 646, + "time_per_iteration": 3.028578758239746 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074215, + "balance_loss_mlp": 1.01332271, + "epoch": 0.1244709503655252, + "flos": 534745658880.0, + "grad_norm": 0.04739554944994068, + "language_loss": 0.86433625, + "learning_rate": 0.0009767829487868005, + "loss": 0.87507832, + "num_input_tokens_seen": 53896400, + "router_z_loss_mlp": 0.60791016, + "step": 647, + "time_per_iteration": 2.6323471069335938 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075713, + "balance_loss_mlp": 1.01472592, + "epoch": 0.12466333205078876, + "flos": 509112161280.0, + "grad_norm": 0.0390766896094967, + "language_loss": 0.89632404, + "learning_rate": 0.000976689024538682, + "loss": 0.90708113, + "num_input_tokens_seen": 53965904, + "router_z_loss_mlp": 0.60888672, + "step": 648, + "time_per_iteration": 2.6233997344970703 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069043, + "balance_loss_mlp": 1.00819838, + "epoch": 0.12485571373605232, + "flos": 682640341248.0, + "grad_norm": 0.04106035596266842, + "language_loss": 0.87981439, + "learning_rate": 0.0009765949152234716, + "loss": 0.89050484, + "num_input_tokens_seen": 54049792, + "router_z_loss_mlp": 0.60742188, + "step": 649, + "time_per_iteration": 2.9135711193084717 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064964, + "balance_loss_mlp": 1.00659943, + "epoch": 0.1250480954213159, + "flos": 1333201377024.0, + "grad_norm": 0.013063081234142807, + "language_loss": 0.78686082, + "learning_rate": 0.0009765006208777055, + "loss": 0.79751045, + "num_input_tokens_seen": 54262432, + "router_z_loss_mlp": 0.58203125, + "step": 650, + "time_per_iteration": 4.696362495422363 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069932, + "balance_loss_mlp": 1.0093261, + "epoch": 0.12524047710657946, + "flos": 940198178304.0, + "grad_norm": 0.03723688894295025, + "language_loss": 0.82869852, + "learning_rate": 0.0009764061415379919, + "loss": 0.83939779, + "num_input_tokens_seen": 54351568, + "router_z_loss_mlp": 0.60498047, + "step": 651, + "time_per_iteration": 3.287029504776001 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071729, + "balance_loss_mlp": 1.01078951, + "epoch": 0.12543285879184302, + "flos": 514901812992.0, + "grad_norm": 0.03842788822410913, + "language_loss": 0.90123397, + "learning_rate": 0.0009763114772410109, + "loss": 0.91195124, + "num_input_tokens_seen": 54418944, + "router_z_loss_mlp": 0.60839844, + "step": 652, + "time_per_iteration": 2.5726470947265625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071215, + "balance_loss_mlp": 1.01075244, + "epoch": 0.12562524047710658, + "flos": 719684001024.0, + "grad_norm": 0.03790395950388449, + "language_loss": 0.88320071, + "learning_rate": 0.0009762166280235146, + "loss": 0.89391285, + "num_input_tokens_seen": 54495312, + "router_z_loss_mlp": 0.60351562, + "step": 653, + "time_per_iteration": 2.9728682041168213 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073992, + "balance_loss_mlp": 1.01372027, + "epoch": 0.12581762216237014, + "flos": 564799160064.0, + "grad_norm": 0.039966468352906216, + "language_loss": 0.88308495, + "learning_rate": 0.0009761215939223267, + "loss": 0.89382488, + "num_input_tokens_seen": 54566832, + "router_z_loss_mlp": 0.6015625, + "step": 654, + "time_per_iteration": 2.7552366256713867 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071616, + "balance_loss_mlp": 1.01100981, + "epoch": 0.1260100038476337, + "flos": 482901253632.0, + "grad_norm": 0.045851790315233704, + "language_loss": 0.87049586, + "learning_rate": 0.0009760263749743428, + "loss": 0.88121206, + "num_input_tokens_seen": 54632128, + "router_z_loss_mlp": 0.60498047, + "step": 655, + "time_per_iteration": 2.5859339237213135 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073482, + "balance_loss_mlp": 1.01301908, + "epoch": 0.12620238553289725, + "flos": 576702161664.0, + "grad_norm": 0.03680601760412016, + "language_loss": 0.91127861, + "learning_rate": 0.0009759309712165299, + "loss": 0.9220134, + "num_input_tokens_seen": 54707600, + "router_z_loss_mlp": 0.60351562, + "step": 656, + "time_per_iteration": 2.7411043643951416 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069023, + "balance_loss_mlp": 1.00841653, + "epoch": 0.12639476721816084, + "flos": 532186197504.0, + "grad_norm": 0.050748048847022796, + "language_loss": 0.94208288, + "learning_rate": 0.0009758353826859272, + "loss": 0.95277309, + "num_input_tokens_seen": 54776704, + "router_z_loss_mlp": 0.60498047, + "step": 657, + "time_per_iteration": 2.5851681232452393 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071924, + "balance_loss_mlp": 1.01117456, + "epoch": 0.1265871489034244, + "flos": 691232472576.0, + "grad_norm": 0.04052834214006204, + "language_loss": 0.90056133, + "learning_rate": 0.0009757396094196456, + "loss": 0.91128063, + "num_input_tokens_seen": 54851744, + "router_z_loss_mlp": 0.60644531, + "step": 658, + "time_per_iteration": 2.9119739532470703 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071395, + "balance_loss_mlp": 1.01083672, + "epoch": 0.12677953058868796, + "flos": 538243166976.0, + "grad_norm": 0.03305987481805703, + "language_loss": 0.85138786, + "learning_rate": 0.0009756436514548673, + "loss": 0.86210179, + "num_input_tokens_seen": 54932576, + "router_z_loss_mlp": 0.60449219, + "step": 659, + "time_per_iteration": 2.8146860599517822 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070631, + "balance_loss_mlp": 1.01021552, + "epoch": 0.12697191227395152, + "flos": 520120857600.0, + "grad_norm": 0.03322369158928612, + "language_loss": 0.89052176, + "learning_rate": 0.0009755475088288466, + "loss": 0.90122807, + "num_input_tokens_seen": 55007296, + "router_z_loss_mlp": 0.60302734, + "step": 660, + "time_per_iteration": 2.7092652320861816 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070351, + "balance_loss_mlp": 1.01007843, + "epoch": 0.12716429395921508, + "flos": 567666768384.0, + "grad_norm": 0.0427017471912124, + "language_loss": 0.91535795, + "learning_rate": 0.0009754511815789095, + "loss": 0.92606151, + "num_input_tokens_seen": 55079312, + "router_z_loss_mlp": 0.6015625, + "step": 661, + "time_per_iteration": 2.790198564529419 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068315, + "balance_loss_mlp": 1.00809085, + "epoch": 0.12735667564447864, + "flos": 515142885888.0, + "grad_norm": 0.0409493229321676, + "language_loss": 0.8685838, + "learning_rate": 0.0009753546697424533, + "loss": 0.87926698, + "num_input_tokens_seen": 55151824, + "router_z_loss_mlp": 0.60107422, + "step": 662, + "time_per_iteration": 2.6784565448760986 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070378, + "balance_loss_mlp": 1.01020074, + "epoch": 0.1275490573297422, + "flos": 542321975808.0, + "grad_norm": 0.039351291895580044, + "language_loss": 0.91270494, + "learning_rate": 0.0009752579733569475, + "loss": 0.92340875, + "num_input_tokens_seen": 55224368, + "router_z_loss_mlp": 0.60058594, + "step": 663, + "time_per_iteration": 2.679379940032959 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071762, + "balance_loss_mlp": 1.01358795, + "epoch": 0.12774143901500576, + "flos": 1562027728896.0, + "grad_norm": 0.016936801864205438, + "language_loss": 0.74881387, + "learning_rate": 0.0009751610924599328, + "loss": 0.7595315, + "num_input_tokens_seen": 55453584, + "router_z_loss_mlp": 0.58007812, + "step": 664, + "time_per_iteration": 4.936127424240112 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070961, + "balance_loss_mlp": 1.01092672, + "epoch": 0.12793382070026935, + "flos": 614874396672.0, + "grad_norm": 0.047422479810277696, + "language_loss": 0.90634137, + "learning_rate": 0.0009750640270890217, + "loss": 0.91705096, + "num_input_tokens_seen": 55528000, + "router_z_loss_mlp": 0.59912109, + "step": 665, + "time_per_iteration": 2.712202548980713 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073503, + "balance_loss_mlp": 1.01361179, + "epoch": 0.1281262023855329, + "flos": 709118566656.0, + "grad_norm": 0.04721256261198653, + "language_loss": 0.97348696, + "learning_rate": 0.0009749667772818983, + "loss": 0.98422199, + "num_input_tokens_seen": 55612416, + "router_z_loss_mlp": 0.59765625, + "step": 666, + "time_per_iteration": 2.959563732147217 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065521, + "balance_loss_mlp": 1.00791931, + "epoch": 0.12831858407079647, + "flos": 1428185295360.0, + "grad_norm": 0.00958948420866419, + "language_loss": 0.76935941, + "learning_rate": 0.0009748693430763185, + "loss": 0.78001463, + "num_input_tokens_seen": 55843664, + "router_z_loss_mlp": 0.57421875, + "step": 667, + "time_per_iteration": 4.823887825012207 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071916, + "balance_loss_mlp": 1.01259768, + "epoch": 0.12851096575606002, + "flos": 450019027968.0, + "grad_norm": 0.04331482152431362, + "language_loss": 0.96237415, + "learning_rate": 0.0009747717245101093, + "loss": 0.97309327, + "num_input_tokens_seen": 55909072, + "router_z_loss_mlp": 0.59179688, + "step": 668, + "time_per_iteration": 2.5234646797180176 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071255, + "balance_loss_mlp": 1.01193655, + "epoch": 0.12870334744132358, + "flos": 480910454016.0, + "grad_norm": 0.040015395826151615, + "language_loss": 0.86231172, + "learning_rate": 0.00097467392162117, + "loss": 0.87302423, + "num_input_tokens_seen": 55978544, + "router_z_loss_mlp": 0.59179688, + "step": 669, + "time_per_iteration": 2.620121717453003 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073025, + "balance_loss_mlp": 1.01342034, + "epoch": 0.12889572912658714, + "flos": 640152115200.0, + "grad_norm": 0.03307407171369126, + "language_loss": 0.91950834, + "learning_rate": 0.0009745759344474708, + "loss": 0.9302386, + "num_input_tokens_seen": 56054144, + "router_z_loss_mlp": 0.59472656, + "step": 670, + "time_per_iteration": 2.834406852722168 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070894, + "balance_loss_mlp": 1.01114607, + "epoch": 0.1290881108118507, + "flos": 510955206912.0, + "grad_norm": 0.03904079329345599, + "language_loss": 0.90752548, + "learning_rate": 0.0009744777630270536, + "loss": 0.91823441, + "num_input_tokens_seen": 56120960, + "router_z_loss_mlp": 0.59619141, + "step": 671, + "time_per_iteration": 2.5841259956359863 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069609, + "balance_loss_mlp": 1.00967062, + "epoch": 0.12928049249711426, + "flos": 672291680256.0, + "grad_norm": 0.0427916369984872, + "language_loss": 0.94394779, + "learning_rate": 0.000974379407398032, + "loss": 0.95464385, + "num_input_tokens_seen": 56202560, + "router_z_loss_mlp": 0.59814453, + "step": 672, + "time_per_iteration": 2.8698208332061768 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072071, + "balance_loss_mlp": 1.0120368, + "epoch": 0.12947287418237785, + "flos": 795000017664.0, + "grad_norm": 0.03399258645873994, + "language_loss": 0.83039552, + "learning_rate": 0.0009742808675985913, + "loss": 0.84111625, + "num_input_tokens_seen": 56289456, + "router_z_loss_mlp": 0.59912109, + "step": 673, + "time_per_iteration": 3.1018688678741455 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067279, + "balance_loss_mlp": 1.00729334, + "epoch": 0.1296652558676414, + "flos": 486448339200.0, + "grad_norm": 0.039807509100232605, + "language_loss": 0.91899526, + "learning_rate": 0.0009741821436669876, + "loss": 0.92966807, + "num_input_tokens_seen": 56354480, + "router_z_loss_mlp": 0.59863281, + "step": 674, + "time_per_iteration": 2.6348536014556885 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068278, + "balance_loss_mlp": 1.00853038, + "epoch": 0.12985763755290497, + "flos": 454393344768.0, + "grad_norm": 0.044170807310258554, + "language_loss": 0.93403888, + "learning_rate": 0.0009740832356415492, + "loss": 0.9447217, + "num_input_tokens_seen": 56418944, + "router_z_loss_mlp": 0.59619141, + "step": 675, + "time_per_iteration": 2.483262538909912 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072639, + "balance_loss_mlp": 1.01265311, + "epoch": 0.13005001923816853, + "flos": 826435805952.0, + "grad_norm": 0.043859966784303914, + "language_loss": 0.89693773, + "learning_rate": 0.0009739841435606756, + "loss": 0.90766412, + "num_input_tokens_seen": 56492368, + "router_z_loss_mlp": 0.59863281, + "step": 676, + "time_per_iteration": 2.992385149002075 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066961, + "balance_loss_mlp": 1.00726056, + "epoch": 0.1302424009234321, + "flos": 532481705472.0, + "grad_norm": 0.03559705023164985, + "language_loss": 0.91210669, + "learning_rate": 0.0009738848674628377, + "loss": 0.92277622, + "num_input_tokens_seen": 56568128, + "router_z_loss_mlp": 0.59570312, + "step": 677, + "time_per_iteration": 2.766364574432373 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106721, + "balance_loss_mlp": 1.00765288, + "epoch": 0.13043478260869565, + "flos": 526917575424.0, + "grad_norm": 0.03838556287658105, + "language_loss": 0.90382779, + "learning_rate": 0.000973785407386578, + "loss": 0.91449988, + "num_input_tokens_seen": 56646448, + "router_z_loss_mlp": 0.59423828, + "step": 678, + "time_per_iteration": 2.772854804992676 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070658, + "balance_loss_mlp": 1.01076782, + "epoch": 0.1306271642939592, + "flos": 627417991680.0, + "grad_norm": 0.03509098765963207, + "language_loss": 0.88142246, + "learning_rate": 0.0009736857633705103, + "loss": 0.89212906, + "num_input_tokens_seen": 56732080, + "router_z_loss_mlp": 0.59765625, + "step": 679, + "time_per_iteration": 2.851567268371582 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075866, + "balance_loss_mlp": 1.01602292, + "epoch": 0.13081954597922277, + "flos": 551841460224.0, + "grad_norm": 0.03859467755451503, + "language_loss": 0.94306064, + "learning_rate": 0.0009735859354533196, + "loss": 0.95381933, + "num_input_tokens_seen": 56804432, + "router_z_loss_mlp": 0.59716797, + "step": 680, + "time_per_iteration": 2.6908183097839355 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070228, + "balance_loss_mlp": 1.01038456, + "epoch": 0.13101192766448633, + "flos": 537956407296.0, + "grad_norm": 0.04695623305024525, + "language_loss": 0.92768431, + "learning_rate": 0.0009734859236737628, + "loss": 0.93838656, + "num_input_tokens_seen": 56872512, + "router_z_loss_mlp": 0.59716797, + "step": 681, + "time_per_iteration": 2.618556261062622 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065875, + "balance_loss_mlp": 1.00631785, + "epoch": 0.13120430934974991, + "flos": 504514268160.0, + "grad_norm": 0.03771498494962771, + "language_loss": 0.94425803, + "learning_rate": 0.0009733857280706678, + "loss": 0.95491678, + "num_input_tokens_seen": 56940928, + "router_z_loss_mlp": 0.59423828, + "step": 682, + "time_per_iteration": 2.607445240020752 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068204, + "balance_loss_mlp": 1.00883758, + "epoch": 0.13139669103501347, + "flos": 615423616512.0, + "grad_norm": 0.040497909024236244, + "language_loss": 0.85748106, + "learning_rate": 0.000973285348682934, + "loss": 0.86816311, + "num_input_tokens_seen": 57012736, + "router_z_loss_mlp": 0.59228516, + "step": 683, + "time_per_iteration": 2.749258518218994 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064545, + "balance_loss_mlp": 1.00846863, + "epoch": 0.13158907272027703, + "flos": 1488218420736.0, + "grad_norm": 0.017735586482065788, + "language_loss": 0.77898371, + "learning_rate": 0.0009731847855495323, + "loss": 0.78962922, + "num_input_tokens_seen": 57243136, + "router_z_loss_mlp": 0.5625, + "step": 684, + "time_per_iteration": 4.792337894439697 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069614, + "balance_loss_mlp": 1.01053405, + "epoch": 0.1317814544055406, + "flos": 987119046912.0, + "grad_norm": 0.04121230716493085, + "language_loss": 0.86815995, + "learning_rate": 0.0009730840387095046, + "loss": 0.87885606, + "num_input_tokens_seen": 57336160, + "router_z_loss_mlp": 0.58935547, + "step": 685, + "time_per_iteration": 3.324737071990967 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068902, + "balance_loss_mlp": 1.00972676, + "epoch": 0.13197383609080415, + "flos": 612629885184.0, + "grad_norm": 0.03769323902360627, + "language_loss": 0.91733027, + "learning_rate": 0.0009729831082019642, + "loss": 0.92801929, + "num_input_tokens_seen": 57418976, + "router_z_loss_mlp": 0.59033203, + "step": 686, + "time_per_iteration": 2.883368968963623 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069054, + "balance_loss_mlp": 1.0096879, + "epoch": 0.1321662177760677, + "flos": 495555664128.0, + "grad_norm": 0.03344682577786829, + "language_loss": 0.90060174, + "learning_rate": 0.0009728819940660958, + "loss": 0.91129231, + "num_input_tokens_seen": 57490288, + "router_z_loss_mlp": 0.59228516, + "step": 687, + "time_per_iteration": 2.7771294116973877 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069131, + "balance_loss_mlp": 1.00971675, + "epoch": 0.13235859946133127, + "flos": 496844632320.0, + "grad_norm": 0.041743180753116546, + "language_loss": 0.8673048, + "learning_rate": 0.0009727806963411557, + "loss": 0.87799615, + "num_input_tokens_seen": 57556064, + "router_z_loss_mlp": 0.59277344, + "step": 688, + "time_per_iteration": 2.5879924297332764 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069414, + "balance_loss_mlp": 1.00971425, + "epoch": 0.13255098114659483, + "flos": 512768116992.0, + "grad_norm": 0.035278095584539565, + "language_loss": 0.88457793, + "learning_rate": 0.000972679215066471, + "loss": 0.89527214, + "num_input_tokens_seen": 57627248, + "router_z_loss_mlp": 0.59570312, + "step": 689, + "time_per_iteration": 2.6660075187683105 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067632, + "balance_loss_mlp": 1.00826621, + "epoch": 0.13274336283185842, + "flos": 548400332544.0, + "grad_norm": 0.043703661342582356, + "language_loss": 1.0036962, + "learning_rate": 0.0009725775502814401, + "loss": 1.01437247, + "num_input_tokens_seen": 57694832, + "router_z_loss_mlp": 0.59228516, + "step": 690, + "time_per_iteration": 2.580975294113159 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072547, + "balance_loss_mlp": 1.01313293, + "epoch": 0.13293574451712198, + "flos": 642003909120.0, + "grad_norm": 0.041755939912029, + "language_loss": 0.86554468, + "learning_rate": 0.0009724757020255327, + "loss": 0.87627012, + "num_input_tokens_seen": 57771776, + "router_z_loss_mlp": 0.59277344, + "step": 691, + "time_per_iteration": 2.895805835723877 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074323, + "balance_loss_mlp": 1.01533794, + "epoch": 0.13312812620238554, + "flos": 492470315520.0, + "grad_norm": 0.04584738151589033, + "language_loss": 0.8907311, + "learning_rate": 0.0009723736703382902, + "loss": 0.90147436, + "num_input_tokens_seen": 57836272, + "router_z_loss_mlp": 0.58837891, + "step": 692, + "time_per_iteration": 2.593621253967285 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073259, + "balance_loss_mlp": 1.01427472, + "epoch": 0.1333205078876491, + "flos": 509950086144.0, + "grad_norm": 0.042207641511909956, + "language_loss": 0.84734881, + "learning_rate": 0.0009722714552593244, + "loss": 0.85808134, + "num_input_tokens_seen": 57907232, + "router_z_loss_mlp": 0.58837891, + "step": 693, + "time_per_iteration": 2.6628286838531494 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069094, + "balance_loss_mlp": 1.01010931, + "epoch": 0.13351288957291266, + "flos": 419592251136.0, + "grad_norm": 0.04342856140262568, + "language_loss": 0.95545483, + "learning_rate": 0.000972169056828319, + "loss": 0.96614575, + "num_input_tokens_seen": 57969808, + "router_z_loss_mlp": 0.58837891, + "step": 694, + "time_per_iteration": 2.491511821746826 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068144, + "balance_loss_mlp": 1.00896847, + "epoch": 0.13370527125817622, + "flos": 617051834112.0, + "grad_norm": 0.03328111889388194, + "language_loss": 0.87929142, + "learning_rate": 0.0009720664750850283, + "loss": 0.88997287, + "num_input_tokens_seen": 58042944, + "router_z_loss_mlp": 0.59033203, + "step": 695, + "time_per_iteration": 2.802238941192627 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066112, + "balance_loss_mlp": 1.00693631, + "epoch": 0.13389765294343978, + "flos": 627170115840.0, + "grad_norm": 0.04111883948503256, + "language_loss": 0.94899035, + "learning_rate": 0.0009719637100692784, + "loss": 0.95965147, + "num_input_tokens_seen": 58116080, + "router_z_loss_mlp": 0.59033203, + "step": 696, + "time_per_iteration": 2.752716541290283 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066724, + "balance_loss_mlp": 1.00764382, + "epoch": 0.13409003462870334, + "flos": 610897655040.0, + "grad_norm": 0.03903466400724949, + "language_loss": 0.84625083, + "learning_rate": 0.0009718607618209661, + "loss": 0.85691804, + "num_input_tokens_seen": 58197616, + "router_z_loss_mlp": 0.58935547, + "step": 697, + "time_per_iteration": 2.8612687587738037 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067901, + "balance_loss_mlp": 1.00915492, + "epoch": 0.13428241631396692, + "flos": 685088987136.0, + "grad_norm": 0.03548160791415639, + "language_loss": 0.8885181, + "learning_rate": 0.0009717576303800595, + "loss": 0.89919716, + "num_input_tokens_seen": 58280480, + "router_z_loss_mlp": 0.5859375, + "step": 698, + "time_per_iteration": 3.046081304550171 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067451, + "balance_loss_mlp": 1.00870502, + "epoch": 0.13447479799923048, + "flos": 509819828736.0, + "grad_norm": 0.04099621387271608, + "language_loss": 0.8689754, + "learning_rate": 0.0009716543157865975, + "loss": 0.87964994, + "num_input_tokens_seen": 58352464, + "router_z_loss_mlp": 0.5859375, + "step": 699, + "time_per_iteration": 2.7116739749908447 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067445, + "balance_loss_mlp": 1.00893724, + "epoch": 0.13466717968449404, + "flos": 899060158464.0, + "grad_norm": 0.03800712734159662, + "language_loss": 0.8517018, + "learning_rate": 0.0009715508180806907, + "loss": 0.86237621, + "num_input_tokens_seen": 58437216, + "router_z_loss_mlp": 0.58349609, + "step": 700, + "time_per_iteration": 3.184324026107788 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066328, + "balance_loss_mlp": 1.00777256, + "epoch": 0.1348595613697576, + "flos": 991695552768.0, + "grad_norm": 0.036541360765650906, + "language_loss": 0.91219282, + "learning_rate": 0.0009714471373025202, + "loss": 0.92285609, + "num_input_tokens_seen": 58533152, + "router_z_loss_mlp": 0.58398438, + "step": 701, + "time_per_iteration": 3.4654104709625244 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064657, + "balance_loss_mlp": 1.0059582, + "epoch": 0.13505194305502116, + "flos": 488812414464.0, + "grad_norm": 0.038284394577449095, + "language_loss": 0.90020943, + "learning_rate": 0.0009713432734923386, + "loss": 0.91085601, + "num_input_tokens_seen": 58601376, + "router_z_loss_mlp": 0.58544922, + "step": 702, + "time_per_iteration": 2.6416144371032715 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067313, + "balance_loss_mlp": 1.00842357, + "epoch": 0.13524432474028472, + "flos": 614520562944.0, + "grad_norm": 0.03635122731697363, + "language_loss": 0.87970936, + "learning_rate": 0.0009712392266904696, + "loss": 0.89038247, + "num_input_tokens_seen": 58676608, + "router_z_loss_mlp": 0.58740234, + "step": 703, + "time_per_iteration": 2.73490309715271 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066615, + "balance_loss_mlp": 1.00782144, + "epoch": 0.13543670642554828, + "flos": 906275838720.0, + "grad_norm": 0.040994558071305906, + "language_loss": 0.86788869, + "learning_rate": 0.0009711349969373076, + "loss": 0.87855482, + "num_input_tokens_seen": 58759264, + "router_z_loss_mlp": 0.58642578, + "step": 704, + "time_per_iteration": 3.1667368412017822 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066356, + "balance_loss_mlp": 1.00765777, + "epoch": 0.13562908811081184, + "flos": 551748141312.0, + "grad_norm": 0.040707128775991024, + "language_loss": 0.81448901, + "learning_rate": 0.0009710305842733178, + "loss": 0.82515258, + "num_input_tokens_seen": 58834800, + "router_z_loss_mlp": 0.58544922, + "step": 705, + "time_per_iteration": 2.7456798553466797 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064755, + "balance_loss_mlp": 1.00648558, + "epoch": 0.1358214697960754, + "flos": 509038284288.0, + "grad_norm": 0.04235852839756889, + "language_loss": 0.91048527, + "learning_rate": 0.0009709259887390373, + "loss": 0.9211328, + "num_input_tokens_seen": 58901712, + "router_z_loss_mlp": 0.58105469, + "step": 706, + "time_per_iteration": 2.614645481109619 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067613, + "balance_loss_mlp": 1.0098201, + "epoch": 0.136013851481339, + "flos": 529924189440.0, + "grad_norm": 0.045207837368539144, + "language_loss": 0.92539275, + "learning_rate": 0.0009708212103750737, + "loss": 0.93606889, + "num_input_tokens_seen": 58967824, + "router_z_loss_mlp": 0.57617188, + "step": 707, + "time_per_iteration": 2.5839250087738037 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073243, + "balance_loss_mlp": 1.01525927, + "epoch": 0.13620623316660255, + "flos": 660321604608.0, + "grad_norm": 0.04139663244511697, + "language_loss": 0.88690269, + "learning_rate": 0.0009707162492221051, + "loss": 0.8976351, + "num_input_tokens_seen": 59045040, + "router_z_loss_mlp": 0.578125, + "step": 708, + "time_per_iteration": 2.8753738403320312 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106855, + "balance_loss_mlp": 1.01051939, + "epoch": 0.1363986148518661, + "flos": 673083918336.0, + "grad_norm": 0.04870142688483653, + "language_loss": 0.89226341, + "learning_rate": 0.0009706111053208815, + "loss": 0.90294898, + "num_input_tokens_seen": 59117216, + "router_z_loss_mlp": 0.57861328, + "step": 709, + "time_per_iteration": 2.792555570602417 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065256, + "balance_loss_mlp": 1.0069865, + "epoch": 0.13659099653712967, + "flos": 474004866048.0, + "grad_norm": 0.041589756065930725, + "language_loss": 0.87875092, + "learning_rate": 0.0009705057787122232, + "loss": 0.88940346, + "num_input_tokens_seen": 59183056, + "router_z_loss_mlp": 0.58105469, + "step": 710, + "time_per_iteration": 2.5474488735198975 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106841, + "balance_loss_mlp": 1.00980711, + "epoch": 0.13678337822239323, + "flos": 453648738816.0, + "grad_norm": 0.03947638411835938, + "language_loss": 0.92397159, + "learning_rate": 0.0009704002694370216, + "loss": 0.93465567, + "num_input_tokens_seen": 59247312, + "router_z_loss_mlp": 0.58447266, + "step": 711, + "time_per_iteration": 2.5812153816223145 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107162, + "balance_loss_mlp": 1.01306474, + "epoch": 0.13697575990765679, + "flos": 520626336000.0, + "grad_norm": 0.04103000756090051, + "language_loss": 0.88202429, + "learning_rate": 0.0009702945775362388, + "loss": 0.89274049, + "num_input_tokens_seen": 59317968, + "router_z_loss_mlp": 0.58398438, + "step": 712, + "time_per_iteration": 2.6084940433502197 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067699, + "balance_loss_mlp": 1.00914371, + "epoch": 0.13716814159292035, + "flos": 481366354944.0, + "grad_norm": 0.04017855754763819, + "language_loss": 0.88458985, + "learning_rate": 0.0009701887030509086, + "loss": 0.89526689, + "num_input_tokens_seen": 59387936, + "router_z_loss_mlp": 0.58398438, + "step": 713, + "time_per_iteration": 2.6361663341522217 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072477, + "balance_loss_mlp": 1.01425505, + "epoch": 0.1373605232781839, + "flos": 546750727680.0, + "grad_norm": 0.04169009137316196, + "language_loss": 0.92536753, + "learning_rate": 0.0009700826460221346, + "loss": 0.93609238, + "num_input_tokens_seen": 59460624, + "router_z_loss_mlp": 0.58056641, + "step": 714, + "time_per_iteration": 2.6997907161712646 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068882, + "balance_loss_mlp": 1.01080275, + "epoch": 0.1375529049634475, + "flos": 710071197696.0, + "grad_norm": 0.042053375460334, + "language_loss": 0.94210052, + "learning_rate": 0.0009699764064910921, + "loss": 0.95278937, + "num_input_tokens_seen": 59536752, + "router_z_loss_mlp": 0.57910156, + "step": 715, + "time_per_iteration": 2.870835542678833 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069166, + "balance_loss_mlp": 1.01099169, + "epoch": 0.13774528664871105, + "flos": 487677036288.0, + "grad_norm": 0.04018028408764831, + "language_loss": 0.88572168, + "learning_rate": 0.0009698699844990268, + "loss": 0.89641333, + "num_input_tokens_seen": 59608128, + "router_z_loss_mlp": 0.58007812, + "step": 716, + "time_per_iteration": 2.6557233333587646 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106644, + "balance_loss_mlp": 1.00817037, + "epoch": 0.1379376683339746, + "flos": 681459276288.0, + "grad_norm": 0.03631196674856893, + "language_loss": 0.89737439, + "learning_rate": 0.0009697633800872555, + "loss": 0.90803885, + "num_input_tokens_seen": 59685120, + "router_z_loss_mlp": 0.58105469, + "step": 717, + "time_per_iteration": 2.9236202239990234 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068394, + "balance_loss_mlp": 1.00998127, + "epoch": 0.13813005001923817, + "flos": 612226473984.0, + "grad_norm": 0.040527486313319094, + "language_loss": 0.9214747, + "learning_rate": 0.0009696565932971655, + "loss": 0.93215865, + "num_input_tokens_seen": 59763376, + "router_z_loss_mlp": 0.58251953, + "step": 718, + "time_per_iteration": 2.8931636810302734 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072264, + "balance_loss_mlp": 1.01394677, + "epoch": 0.13832243170450173, + "flos": 589927179264.0, + "grad_norm": 0.042228364331249636, + "language_loss": 0.91184157, + "learning_rate": 0.0009695496241702153, + "loss": 0.92256421, + "num_input_tokens_seen": 59836800, + "router_z_loss_mlp": 0.58154297, + "step": 719, + "time_per_iteration": 2.8006720542907715 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010673, + "balance_loss_mlp": 1.00917327, + "epoch": 0.1385148133897653, + "flos": 701320618752.0, + "grad_norm": 0.04012183054192491, + "language_loss": 0.87174737, + "learning_rate": 0.0009694424727479339, + "loss": 0.88242036, + "num_input_tokens_seen": 59914720, + "router_z_loss_mlp": 0.57958984, + "step": 720, + "time_per_iteration": 2.9363977909088135 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066459, + "balance_loss_mlp": 1.0081414, + "epoch": 0.13870719507502885, + "flos": 599367929088.0, + "grad_norm": 0.04032336097495746, + "language_loss": 0.90803999, + "learning_rate": 0.0009693351390719213, + "loss": 0.91870457, + "num_input_tokens_seen": 59984544, + "router_z_loss_mlp": 0.58154297, + "step": 721, + "time_per_iteration": 2.7786271572113037 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070816, + "balance_loss_mlp": 1.01273775, + "epoch": 0.1388995767602924, + "flos": 587749741824.0, + "grad_norm": 0.04179929290372652, + "language_loss": 0.92465305, + "learning_rate": 0.000969227623183848, + "loss": 0.93536115, + "num_input_tokens_seen": 60057056, + "router_z_loss_mlp": 0.57910156, + "step": 722, + "time_per_iteration": 2.777453660964966 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066594, + "balance_loss_mlp": 1.00870621, + "epoch": 0.139091958445556, + "flos": 652363263744.0, + "grad_norm": 0.041578114374578125, + "language_loss": 0.92603219, + "learning_rate": 0.0009691199251254554, + "loss": 0.9366982, + "num_input_tokens_seen": 60133232, + "router_z_loss_mlp": 0.57714844, + "step": 723, + "time_per_iteration": 2.813610553741455 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063653, + "balance_loss_mlp": 1.00586045, + "epoch": 0.13928434013081956, + "flos": 576906296064.0, + "grad_norm": 0.03663552971403626, + "language_loss": 0.88541949, + "learning_rate": 0.0009690120449385555, + "loss": 0.89605606, + "num_input_tokens_seen": 60207104, + "router_z_loss_mlp": 0.57617188, + "step": 724, + "time_per_iteration": 2.7604424953460693 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063903, + "balance_loss_mlp": 1.00582433, + "epoch": 0.13947672181608312, + "flos": 564315068928.0, + "grad_norm": 0.034271197388489986, + "language_loss": 0.93926299, + "learning_rate": 0.0009689039826650312, + "loss": 0.94990206, + "num_input_tokens_seen": 60277920, + "router_z_loss_mlp": 0.57910156, + "step": 725, + "time_per_iteration": 2.7856695652008057 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095871, + "balance_loss_mlp": 1.03941345, + "epoch": 0.13966910350134668, + "flos": 1524951988224.0, + "grad_norm": 0.03128450212810151, + "language_loss": 0.76523066, + "learning_rate": 0.000968795738346836, + "loss": 0.77618933, + "num_input_tokens_seen": 60494224, + "router_z_loss_mlp": 0.56640625, + "step": 726, + "time_per_iteration": 4.903306245803833 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067061, + "balance_loss_mlp": 1.00902975, + "epoch": 0.13986148518661023, + "flos": 500856367104.0, + "grad_norm": 0.052764167671210026, + "language_loss": 0.89172196, + "learning_rate": 0.0009686873120259941, + "loss": 0.90239263, + "num_input_tokens_seen": 60562176, + "router_z_loss_mlp": 0.57861328, + "step": 727, + "time_per_iteration": 2.6450552940368652 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072692, + "balance_loss_mlp": 1.01518559, + "epoch": 0.1400538668718738, + "flos": 599850074880.0, + "grad_norm": 0.036488800736072635, + "language_loss": 0.88047451, + "learning_rate": 0.0009685787037446004, + "loss": 0.89120144, + "num_input_tokens_seen": 60631472, + "router_z_loss_mlp": 0.57324219, + "step": 728, + "time_per_iteration": 2.763434648513794 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072512, + "balance_loss_mlp": 1.01481462, + "epoch": 0.14024624855713735, + "flos": 595169556480.0, + "grad_norm": 0.047561697925478, + "language_loss": 0.88858587, + "learning_rate": 0.0009684699135448201, + "loss": 0.89931101, + "num_input_tokens_seen": 60703488, + "router_z_loss_mlp": 0.57519531, + "step": 729, + "time_per_iteration": 2.745037078857422 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067277, + "balance_loss_mlp": 1.00962722, + "epoch": 0.1404386302424009, + "flos": 507586010880.0, + "grad_norm": 0.03094406590189725, + "language_loss": 0.9291476, + "learning_rate": 0.0009683609414688895, + "loss": 0.93982029, + "num_input_tokens_seen": 60773936, + "router_z_loss_mlp": 0.57470703, + "step": 730, + "time_per_iteration": 2.7384650707244873 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068471, + "balance_loss_mlp": 1.01039195, + "epoch": 0.14063101192766447, + "flos": 574515975936.0, + "grad_norm": 0.037780385553924656, + "language_loss": 0.87345785, + "learning_rate": 0.0009682517875591154, + "loss": 0.88414258, + "num_input_tokens_seen": 60851120, + "router_z_loss_mlp": 0.57910156, + "step": 731, + "time_per_iteration": 2.752572536468506 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071345, + "balance_loss_mlp": 1.0129801, + "epoch": 0.14082339361292806, + "flos": 565765396992.0, + "grad_norm": 0.03832964150159033, + "language_loss": 0.87666118, + "learning_rate": 0.0009681424518578749, + "loss": 0.88737464, + "num_input_tokens_seen": 60924896, + "router_z_loss_mlp": 0.58203125, + "step": 732, + "time_per_iteration": 2.7323830127716064 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068462, + "balance_loss_mlp": 1.01028764, + "epoch": 0.14101577529819162, + "flos": 464583558144.0, + "grad_norm": 0.035957988569031644, + "language_loss": 0.88670099, + "learning_rate": 0.000968032934407616, + "loss": 0.8973856, + "num_input_tokens_seen": 60996016, + "router_z_loss_mlp": 0.58007812, + "step": 733, + "time_per_iteration": 2.6479005813598633 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064525, + "balance_loss_mlp": 1.00644577, + "epoch": 0.14120815698345518, + "flos": 597262423296.0, + "grad_norm": 0.039547782577588224, + "language_loss": 0.82413781, + "learning_rate": 0.0009679232352508571, + "loss": 0.83478296, + "num_input_tokens_seen": 61072016, + "router_z_loss_mlp": 0.57910156, + "step": 734, + "time_per_iteration": 2.7924795150756836 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063696, + "balance_loss_mlp": 1.00599897, + "epoch": 0.14140053866871874, + "flos": 536232925440.0, + "grad_norm": 0.03854566850595878, + "language_loss": 0.82520735, + "learning_rate": 0.0009678133544301871, + "loss": 0.83584428, + "num_input_tokens_seen": 61144528, + "router_z_loss_mlp": 0.57519531, + "step": 735, + "time_per_iteration": 2.658731698989868 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062396, + "balance_loss_mlp": 1.00498438, + "epoch": 0.1415929203539823, + "flos": 521277623040.0, + "grad_norm": 0.0297517777524564, + "language_loss": 0.92917788, + "learning_rate": 0.0009677032919882658, + "loss": 0.93980187, + "num_input_tokens_seen": 61216960, + "router_z_loss_mlp": 0.57226562, + "step": 736, + "time_per_iteration": 2.661276340484619 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068415, + "balance_loss_mlp": 1.0113374, + "epoch": 0.14178530203924586, + "flos": 483302719488.0, + "grad_norm": 0.041037110936195734, + "language_loss": 0.92867804, + "learning_rate": 0.000967593047967823, + "loss": 0.93936217, + "num_input_tokens_seen": 61281312, + "router_z_loss_mlp": 0.56982422, + "step": 737, + "time_per_iteration": 2.52840256690979 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068115, + "balance_loss_mlp": 1.01056099, + "epoch": 0.14197768372450942, + "flos": 677840259072.0, + "grad_norm": 0.04254557939420697, + "language_loss": 0.88126308, + "learning_rate": 0.0009674826224116593, + "loss": 0.89194429, + "num_input_tokens_seen": 61355888, + "router_z_loss_mlp": 0.57373047, + "step": 738, + "time_per_iteration": 2.858147144317627 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074304, + "balance_loss_mlp": 1.0167979, + "epoch": 0.14217006540977298, + "flos": 446992972032.0, + "grad_norm": 0.045930563119643074, + "language_loss": 0.87994051, + "learning_rate": 0.0009673720153626455, + "loss": 0.89068353, + "num_input_tokens_seen": 61424288, + "router_z_loss_mlp": 0.57324219, + "step": 739, + "time_per_iteration": 2.664236545562744 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069407, + "balance_loss_mlp": 1.01199603, + "epoch": 0.14236244709503657, + "flos": 497478422784.0, + "grad_norm": 0.040566684483093814, + "language_loss": 0.88105047, + "learning_rate": 0.0009672612268637235, + "loss": 0.89174449, + "num_input_tokens_seen": 61493344, + "router_z_loss_mlp": 0.57226562, + "step": 740, + "time_per_iteration": 2.634126901626587 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069933, + "balance_loss_mlp": 1.01304626, + "epoch": 0.14255482878030012, + "flos": 649480104192.0, + "grad_norm": 0.05086050125917657, + "language_loss": 0.85906518, + "learning_rate": 0.0009671502569579048, + "loss": 0.86976457, + "num_input_tokens_seen": 61565216, + "router_z_loss_mlp": 0.56884766, + "step": 741, + "time_per_iteration": 2.7642107009887695 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071964, + "balance_loss_mlp": 1.01564944, + "epoch": 0.14274721046556368, + "flos": 537274984704.0, + "grad_norm": 0.037356444744632025, + "language_loss": 0.90824854, + "learning_rate": 0.0009670391056882719, + "loss": 0.91896814, + "num_input_tokens_seen": 61640928, + "router_z_loss_mlp": 0.56445312, + "step": 742, + "time_per_iteration": 2.7307372093200684 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069292, + "balance_loss_mlp": 1.01288199, + "epoch": 0.14293959215082724, + "flos": 958584893184.0, + "grad_norm": 0.03744948002603285, + "language_loss": 0.89976203, + "learning_rate": 0.0009669277730979776, + "loss": 0.91045499, + "num_input_tokens_seen": 61717552, + "router_z_loss_mlp": 0.56494141, + "step": 743, + "time_per_iteration": 3.2251601219177246 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068321, + "balance_loss_mlp": 1.01162553, + "epoch": 0.1431319738360908, + "flos": 694386840576.0, + "grad_norm": 0.037398516399228816, + "language_loss": 0.86562485, + "learning_rate": 0.0009668162592302449, + "loss": 0.87630802, + "num_input_tokens_seen": 61800016, + "router_z_loss_mlp": 0.56738281, + "step": 744, + "time_per_iteration": 2.924435615539551 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067141, + "balance_loss_mlp": 1.01015854, + "epoch": 0.14332435552135436, + "flos": 566503200000.0, + "grad_norm": 0.037819132294000864, + "language_loss": 0.86981773, + "learning_rate": 0.0009667045641283676, + "loss": 0.88048917, + "num_input_tokens_seen": 61865904, + "router_z_loss_mlp": 0.56933594, + "step": 745, + "time_per_iteration": 2.6744887828826904 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071811, + "balance_loss_mlp": 1.01540148, + "epoch": 0.14351673720661792, + "flos": 739696988160.0, + "grad_norm": 0.042480690817339954, + "language_loss": 0.96115947, + "learning_rate": 0.0009665926878357092, + "loss": 0.97187757, + "num_input_tokens_seen": 61945728, + "router_z_loss_mlp": 0.56591797, + "step": 746, + "time_per_iteration": 2.9137520790100098 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069908, + "balance_loss_mlp": 1.0134027, + "epoch": 0.14370911889188148, + "flos": 550352248320.0, + "grad_norm": 0.037361960218361134, + "language_loss": 0.92219329, + "learning_rate": 0.0009664806303957043, + "loss": 0.93289238, + "num_input_tokens_seen": 62016288, + "router_z_loss_mlp": 0.56542969, + "step": 747, + "time_per_iteration": 2.7734382152557373 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010709, + "balance_loss_mlp": 1.01453757, + "epoch": 0.14390150057714507, + "flos": 591590390016.0, + "grad_norm": 0.040803275102161134, + "language_loss": 0.88578373, + "learning_rate": 0.0009663683918518571, + "loss": 0.89649272, + "num_input_tokens_seen": 62097904, + "router_z_loss_mlp": 0.56542969, + "step": 748, + "time_per_iteration": 2.93782114982605 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106688, + "balance_loss_mlp": 1.0103749, + "epoch": 0.14409388226240863, + "flos": 592145445888.0, + "grad_norm": 0.040391516566669984, + "language_loss": 0.87085271, + "learning_rate": 0.0009662559722477428, + "loss": 0.88152146, + "num_input_tokens_seen": 62166736, + "router_z_loss_mlp": 0.56640625, + "step": 749, + "time_per_iteration": 2.696570873260498 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140549, + "balance_loss_mlp": 1.08542633, + "epoch": 0.1442862639476722, + "flos": 1514657762304.0, + "grad_norm": 0.043557664449290004, + "language_loss": 0.7616297, + "learning_rate": 0.0009661433716270062, + "loss": 0.77303517, + "num_input_tokens_seen": 62402512, + "router_z_loss_mlp": 0.55273438, + "step": 750, + "time_per_iteration": 5.024984836578369 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106928, + "balance_loss_mlp": 1.01263177, + "epoch": 0.14447864563293575, + "flos": 497856556032.0, + "grad_norm": 0.03544029116038115, + "language_loss": 0.90697813, + "learning_rate": 0.0009660305900333632, + "loss": 0.91767091, + "num_input_tokens_seen": 62473408, + "router_z_loss_mlp": 0.56738281, + "step": 751, + "time_per_iteration": 2.678037166595459 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078207, + "balance_loss_mlp": 1.02165437, + "epoch": 0.1446710273181993, + "flos": 590795239680.0, + "grad_norm": 0.04141635113788076, + "language_loss": 0.83649188, + "learning_rate": 0.0009659176275105992, + "loss": 0.84727395, + "num_input_tokens_seen": 62547440, + "router_z_loss_mlp": 0.56640625, + "step": 752, + "time_per_iteration": 2.714871883392334 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076963, + "balance_loss_mlp": 1.02074409, + "epoch": 0.14486340900346287, + "flos": 587013884160.0, + "grad_norm": 0.03637909883196532, + "language_loss": 0.87195009, + "learning_rate": 0.0009658044841025701, + "loss": 0.88271976, + "num_input_tokens_seen": 62620224, + "router_z_loss_mlp": 0.56396484, + "step": 753, + "time_per_iteration": 2.7753467559814453 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075522, + "balance_loss_mlp": 1.01901722, + "epoch": 0.14505579068872643, + "flos": 505741019904.0, + "grad_norm": 0.041255413340114844, + "language_loss": 0.82866222, + "learning_rate": 0.0009656911598532021, + "loss": 0.83941746, + "num_input_tokens_seen": 62690464, + "router_z_loss_mlp": 0.56591797, + "step": 754, + "time_per_iteration": 2.657831907272339 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077419, + "balance_loss_mlp": 1.02119958, + "epoch": 0.14524817237399, + "flos": 487816041984.0, + "grad_norm": 0.03637506550278126, + "language_loss": 0.9138847, + "learning_rate": 0.0009655776548064917, + "loss": 0.92465889, + "num_input_tokens_seen": 62762240, + "router_z_loss_mlp": 0.56347656, + "step": 755, + "time_per_iteration": 2.6499805450439453 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070806, + "balance_loss_mlp": 1.01477778, + "epoch": 0.14544055405925355, + "flos": 729450394368.0, + "grad_norm": 0.037726189244012505, + "language_loss": 0.89799821, + "learning_rate": 0.0009654639690065054, + "loss": 0.90870631, + "num_input_tokens_seen": 62839760, + "router_z_loss_mlp": 0.56201172, + "step": 756, + "time_per_iteration": 2.913638114929199 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070686, + "balance_loss_mlp": 1.01461017, + "epoch": 0.14563293574451713, + "flos": 594787532544.0, + "grad_norm": 0.03772784195488967, + "language_loss": 0.8914414, + "learning_rate": 0.00096535010249738, + "loss": 0.90214825, + "num_input_tokens_seen": 62910336, + "router_z_loss_mlp": 0.5625, + "step": 757, + "time_per_iteration": 2.721640110015869 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067675, + "balance_loss_mlp": 1.01121712, + "epoch": 0.1458253174297807, + "flos": 561623404800.0, + "grad_norm": 0.04410713855467511, + "language_loss": 0.84106696, + "learning_rate": 0.0009652360553233224, + "loss": 0.8517437, + "num_input_tokens_seen": 62988160, + "router_z_loss_mlp": 0.56591797, + "step": 758, + "time_per_iteration": 2.771986484527588 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080421, + "balance_loss_mlp": 1.02625275, + "epoch": 0.14601769911504425, + "flos": 1561189804032.0, + "grad_norm": 0.021986445825835567, + "language_loss": 0.73773748, + "learning_rate": 0.0009651218275286093, + "loss": 0.74854165, + "num_input_tokens_seen": 63224704, + "router_z_loss_mlp": 0.54296875, + "step": 759, + "time_per_iteration": 4.951657056808472 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064246, + "balance_loss_mlp": 1.00712132, + "epoch": 0.1462100808003078, + "flos": 867823646976.0, + "grad_norm": 0.03532102179266325, + "language_loss": 0.82350075, + "learning_rate": 0.0009650074191575883, + "loss": 0.83414322, + "num_input_tokens_seen": 63312400, + "router_z_loss_mlp": 0.56982422, + "step": 760, + "time_per_iteration": 3.2275402545928955 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078155, + "balance_loss_mlp": 1.02083874, + "epoch": 0.14640246248557137, + "flos": 524030525184.0, + "grad_norm": 0.0394901057776484, + "language_loss": 0.87295806, + "learning_rate": 0.0009648928302546766, + "loss": 0.88373965, + "num_input_tokens_seen": 63387792, + "router_z_loss_mlp": 0.57177734, + "step": 761, + "time_per_iteration": 2.6739044189453125 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108061, + "balance_loss_mlp": 1.02319896, + "epoch": 0.14659484417083493, + "flos": 1032242556672.0, + "grad_norm": 0.0381114836464334, + "language_loss": 0.86423808, + "learning_rate": 0.0009647780608643613, + "loss": 0.87504417, + "num_input_tokens_seen": 63475632, + "router_z_loss_mlp": 0.57226562, + "step": 762, + "time_per_iteration": 3.355055332183838 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084881, + "balance_loss_mlp": 1.02742219, + "epoch": 0.1467872258560985, + "flos": 501657353472.0, + "grad_norm": 0.04884269069306727, + "language_loss": 0.89483184, + "learning_rate": 0.0009646631110312001, + "loss": 0.90568066, + "num_input_tokens_seen": 63546080, + "router_z_loss_mlp": 0.57275391, + "step": 763, + "time_per_iteration": 2.638404607772827 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074108, + "balance_loss_mlp": 1.01683939, + "epoch": 0.14697960754136205, + "flos": 548936913408.0, + "grad_norm": 0.030517371118051684, + "language_loss": 0.89587164, + "learning_rate": 0.0009645479807998203, + "loss": 0.90661263, + "num_input_tokens_seen": 63622464, + "router_z_loss_mlp": 0.57128906, + "step": 764, + "time_per_iteration": 2.7784340381622314 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066881, + "balance_loss_mlp": 1.0099467, + "epoch": 0.14717198922662564, + "flos": 518902854144.0, + "grad_norm": 0.03321738346858149, + "language_loss": 0.93693149, + "learning_rate": 0.0009644326702149196, + "loss": 0.94760031, + "num_input_tokens_seen": 63694736, + "router_z_loss_mlp": 0.56884766, + "step": 765, + "time_per_iteration": 2.712148904800415 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066407, + "balance_loss_mlp": 1.009902, + "epoch": 0.1473643709118892, + "flos": 733484483328.0, + "grad_norm": 0.042813367444357694, + "language_loss": 0.86227441, + "learning_rate": 0.0009643171793212653, + "loss": 0.87293845, + "num_input_tokens_seen": 63779072, + "router_z_loss_mlp": 0.56591797, + "step": 766, + "time_per_iteration": 3.0350003242492676 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069694, + "balance_loss_mlp": 1.01357007, + "epoch": 0.14755675259715276, + "flos": 621669169152.0, + "grad_norm": 0.04397904632105779, + "language_loss": 0.90884185, + "learning_rate": 0.0009642015081636952, + "loss": 0.91953874, + "num_input_tokens_seen": 63847472, + "router_z_loss_mlp": 0.56298828, + "step": 767, + "time_per_iteration": 2.6967811584472656 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067551, + "balance_loss_mlp": 1.01185656, + "epoch": 0.14774913428241632, + "flos": 453173395968.0, + "grad_norm": 0.040409537343205924, + "language_loss": 0.89756525, + "learning_rate": 0.0009640856567871166, + "loss": 0.90824074, + "num_input_tokens_seen": 63912496, + "router_z_loss_mlp": 0.55859375, + "step": 768, + "time_per_iteration": 2.5016207695007324 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063728, + "balance_loss_mlp": 1.00803363, + "epoch": 0.14794151596767988, + "flos": 838655702784.0, + "grad_norm": 0.03518214363191685, + "language_loss": 0.90024096, + "learning_rate": 0.0009639696252365072, + "loss": 0.91087824, + "num_input_tokens_seen": 63990832, + "router_z_loss_mlp": 0.55859375, + "step": 769, + "time_per_iteration": 3.0535316467285156 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064247, + "balance_loss_mlp": 1.00874364, + "epoch": 0.14813389765294344, + "flos": 687405430272.0, + "grad_norm": 0.03578436651039587, + "language_loss": 0.83073497, + "learning_rate": 0.0009638534135569144, + "loss": 0.8413775, + "num_input_tokens_seen": 64067552, + "router_z_loss_mlp": 0.55664062, + "step": 770, + "time_per_iteration": 2.8983683586120605 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065854, + "balance_loss_mlp": 1.01039767, + "epoch": 0.148326279338207, + "flos": 510944513280.0, + "grad_norm": 0.03931230706380594, + "language_loss": 0.91550887, + "learning_rate": 0.0009637370217934554, + "loss": 0.92616743, + "num_input_tokens_seen": 64140336, + "router_z_loss_mlp": 0.55615234, + "step": 771, + "time_per_iteration": 2.6311967372894287 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061556, + "balance_loss_mlp": 1.00590932, + "epoch": 0.14851866102347056, + "flos": 589332272640.0, + "grad_norm": 0.03214719611667013, + "language_loss": 0.8436957, + "learning_rate": 0.0009636204499913175, + "loss": 0.85431123, + "num_input_tokens_seen": 64223472, + "router_z_loss_mlp": 0.55810547, + "step": 772, + "time_per_iteration": 2.8748695850372314 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066056, + "balance_loss_mlp": 1.01069546, + "epoch": 0.14871104270873411, + "flos": 692248286976.0, + "grad_norm": 0.034034874980260935, + "language_loss": 0.89455193, + "learning_rate": 0.0009635036981957581, + "loss": 0.9052124, + "num_input_tokens_seen": 64299872, + "router_z_loss_mlp": 0.55517578, + "step": 773, + "time_per_iteration": 2.8526012897491455 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063541, + "balance_loss_mlp": 1.00789392, + "epoch": 0.1489034243939977, + "flos": 656283624960.0, + "grad_norm": 0.03841304714783139, + "language_loss": 0.91971016, + "learning_rate": 0.0009633867664521043, + "loss": 0.93034559, + "num_input_tokens_seen": 64377152, + "router_z_loss_mlp": 0.55810547, + "step": 774, + "time_per_iteration": 2.823320150375366 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063492, + "balance_loss_mlp": 1.00736797, + "epoch": 0.14909580607926126, + "flos": 476796652032.0, + "grad_norm": 0.0404919947218097, + "language_loss": 0.88328946, + "learning_rate": 0.0009632696548057527, + "loss": 0.89392436, + "num_input_tokens_seen": 64443008, + "router_z_loss_mlp": 0.56298828, + "step": 775, + "time_per_iteration": 2.5567190647125244 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072265, + "balance_loss_mlp": 1.01609385, + "epoch": 0.14928818776452482, + "flos": 612284799744.0, + "grad_norm": 0.03821441574416946, + "language_loss": 0.86270714, + "learning_rate": 0.0009631523633021704, + "loss": 0.87342978, + "num_input_tokens_seen": 64519776, + "router_z_loss_mlp": 0.56347656, + "step": 776, + "time_per_iteration": 2.783348321914673 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068042, + "balance_loss_mlp": 1.01187015, + "epoch": 0.14948056944978838, + "flos": 562917230592.0, + "grad_norm": 0.039790220133906304, + "language_loss": 0.90072912, + "learning_rate": 0.0009630348919868936, + "loss": 0.9114095, + "num_input_tokens_seen": 64593712, + "router_z_loss_mlp": 0.56347656, + "step": 777, + "time_per_iteration": 2.7115018367767334 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073438, + "balance_loss_mlp": 1.01736236, + "epoch": 0.14967295113505194, + "flos": 450112346880.0, + "grad_norm": 0.044777999480791836, + "language_loss": 0.82363755, + "learning_rate": 0.0009629172409055293, + "loss": 0.83437192, + "num_input_tokens_seen": 64658448, + "router_z_loss_mlp": 0.5625, + "step": 778, + "time_per_iteration": 2.578178882598877 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079079, + "balance_loss_mlp": 1.02319324, + "epoch": 0.1498653328203155, + "flos": 572429912064.0, + "grad_norm": 0.03699200582710457, + "language_loss": 0.8876617, + "learning_rate": 0.0009627994101037531, + "loss": 0.89845246, + "num_input_tokens_seen": 64734144, + "router_z_loss_mlp": 0.56054688, + "step": 779, + "time_per_iteration": 2.7733986377716064 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107155, + "balance_loss_mlp": 1.01556909, + "epoch": 0.15005771450557906, + "flos": 632408602368.0, + "grad_norm": 0.04036301028093645, + "language_loss": 0.90477651, + "learning_rate": 0.0009626813996273114, + "loss": 0.91549194, + "num_input_tokens_seen": 64813456, + "router_z_loss_mlp": 0.56152344, + "step": 780, + "time_per_iteration": 2.8476834297180176 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064638, + "balance_loss_mlp": 1.00884771, + "epoch": 0.15025009619084262, + "flos": 579166358784.0, + "grad_norm": 0.036574622666600026, + "language_loss": 0.89819682, + "learning_rate": 0.0009625632095220198, + "loss": 0.90884316, + "num_input_tokens_seen": 64896816, + "router_z_loss_mlp": 0.55957031, + "step": 781, + "time_per_iteration": 2.8279531002044678 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065406, + "balance_loss_mlp": 1.00961614, + "epoch": 0.1504424778761062, + "flos": 484857060096.0, + "grad_norm": 0.04416373966784989, + "language_loss": 0.8858574, + "learning_rate": 0.0009624448398337637, + "loss": 0.89651144, + "num_input_tokens_seen": 64964176, + "router_z_loss_mlp": 0.55957031, + "step": 782, + "time_per_iteration": 2.512742280960083 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062842, + "balance_loss_mlp": 1.0075767, + "epoch": 0.15063485956136977, + "flos": 763895708928.0, + "grad_norm": 0.03630111779859241, + "language_loss": 0.90811443, + "learning_rate": 0.0009623262906084984, + "loss": 0.9187429, + "num_input_tokens_seen": 65042592, + "router_z_loss_mlp": 0.55419922, + "step": 783, + "time_per_iteration": 3.0409936904907227 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066453, + "balance_loss_mlp": 1.01156867, + "epoch": 0.15082724124663333, + "flos": 498676984320.0, + "grad_norm": 0.03758683048429116, + "language_loss": 0.91324949, + "learning_rate": 0.0009622075618922486, + "loss": 0.92391407, + "num_input_tokens_seen": 65114576, + "router_z_loss_mlp": 0.55029297, + "step": 784, + "time_per_iteration": 2.716580629348755 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066623, + "balance_loss_mlp": 1.01188219, + "epoch": 0.15101962293189689, + "flos": 510722882304.0, + "grad_norm": 0.0361748672236624, + "language_loss": 0.88713133, + "learning_rate": 0.0009620886537311091, + "loss": 0.89779752, + "num_input_tokens_seen": 65186640, + "router_z_loss_mlp": 0.54882812, + "step": 785, + "time_per_iteration": 2.7197515964508057 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065435, + "balance_loss_mlp": 1.01069367, + "epoch": 0.15121200461716044, + "flos": 458702532864.0, + "grad_norm": 0.0476660620131034, + "language_loss": 0.86751854, + "learning_rate": 0.000961969566171244, + "loss": 0.87817287, + "num_input_tokens_seen": 65252112, + "router_z_loss_mlp": 0.54882812, + "step": 786, + "time_per_iteration": 2.519826650619507 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063397, + "balance_loss_mlp": 1.00865602, + "epoch": 0.151404386302424, + "flos": 539017908480.0, + "grad_norm": 0.0401982478312821, + "language_loss": 0.91594857, + "learning_rate": 0.0009618502992588873, + "loss": 0.92658257, + "num_input_tokens_seen": 65318912, + "router_z_loss_mlp": 0.54882812, + "step": 787, + "time_per_iteration": 2.6427645683288574 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076455, + "balance_loss_mlp": 1.02133262, + "epoch": 0.15159676798768756, + "flos": 689617860864.0, + "grad_norm": 0.04258050045209434, + "language_loss": 0.8916502, + "learning_rate": 0.0009617308530403424, + "loss": 0.9024148, + "num_input_tokens_seen": 65395424, + "router_z_loss_mlp": 0.55273438, + "step": 788, + "time_per_iteration": 3.0662577152252197 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106461, + "balance_loss_mlp": 1.00958323, + "epoch": 0.15178914967295112, + "flos": 546433832448.0, + "grad_norm": 0.03354297731817266, + "language_loss": 0.88695067, + "learning_rate": 0.0009616112275619825, + "loss": 0.89759684, + "num_input_tokens_seen": 65470480, + "router_z_loss_mlp": 0.55175781, + "step": 789, + "time_per_iteration": 2.7230606079101562 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065845, + "balance_loss_mlp": 1.01081765, + "epoch": 0.1519815313582147, + "flos": 512815749120.0, + "grad_norm": 0.03087624340708216, + "language_loss": 0.85391772, + "learning_rate": 0.0009614914228702503, + "loss": 0.86457616, + "num_input_tokens_seen": 65544720, + "router_z_loss_mlp": 0.55175781, + "step": 790, + "time_per_iteration": 2.6690316200256348 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075082, + "balance_loss_mlp": 1.02024603, + "epoch": 0.15217391304347827, + "flos": 685458372096.0, + "grad_norm": 0.03877155611381102, + "language_loss": 0.90952718, + "learning_rate": 0.0009613714390116581, + "loss": 0.92027801, + "num_input_tokens_seen": 65627872, + "router_z_loss_mlp": 0.54980469, + "step": 791, + "time_per_iteration": 3.006898880004883 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069036, + "balance_loss_mlp": 1.01396108, + "epoch": 0.15236629472874183, + "flos": 645446982144.0, + "grad_norm": 0.03750254169389994, + "language_loss": 0.87660968, + "learning_rate": 0.0009612512760327879, + "loss": 0.88730001, + "num_input_tokens_seen": 65705264, + "router_z_loss_mlp": 0.55224609, + "step": 792, + "time_per_iteration": 2.858262062072754 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068429, + "balance_loss_mlp": 1.01297235, + "epoch": 0.1525586764140054, + "flos": 413765660928.0, + "grad_norm": 0.044925092089749936, + "language_loss": 0.86468709, + "learning_rate": 0.0009611309339802909, + "loss": 0.87537134, + "num_input_tokens_seen": 65768592, + "router_z_loss_mlp": 0.55615234, + "step": 793, + "time_per_iteration": 2.498229742050171 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070738, + "balance_loss_mlp": 1.01485312, + "epoch": 0.15275105809926895, + "flos": 804234687744.0, + "grad_norm": 0.03634630877191588, + "language_loss": 0.85518378, + "learning_rate": 0.0009610104129008881, + "loss": 0.8658911, + "num_input_tokens_seen": 65852432, + "router_z_loss_mlp": 0.56054688, + "step": 794, + "time_per_iteration": 3.119896173477173 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064568, + "balance_loss_mlp": 1.0088737, + "epoch": 0.1529434397845325, + "flos": 613543632384.0, + "grad_norm": 0.039196324818253456, + "language_loss": 0.89691782, + "learning_rate": 0.0009608897128413701, + "loss": 0.90756351, + "num_input_tokens_seen": 65927904, + "router_z_loss_mlp": 0.55859375, + "step": 795, + "time_per_iteration": 2.7244484424591064 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065096, + "balance_loss_mlp": 1.00949657, + "epoch": 0.15313582146979607, + "flos": 616472478720.0, + "grad_norm": 0.031652256183926086, + "language_loss": 0.86697376, + "learning_rate": 0.0009607688338485965, + "loss": 0.87762469, + "num_input_tokens_seen": 66006800, + "router_z_loss_mlp": 0.55761719, + "step": 796, + "time_per_iteration": 2.859959363937378 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106534, + "balance_loss_mlp": 1.00997913, + "epoch": 0.15332820315505963, + "flos": 794993214720.0, + "grad_norm": 0.036135713167076366, + "language_loss": 0.91464871, + "learning_rate": 0.0009606477759694969, + "loss": 0.92530215, + "num_input_tokens_seen": 66088608, + "router_z_loss_mlp": 0.55517578, + "step": 797, + "time_per_iteration": 3.0383169651031494 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063804, + "balance_loss_mlp": 1.00806129, + "epoch": 0.1535205848403232, + "flos": 551257247232.0, + "grad_norm": 0.04267360012583918, + "language_loss": 0.89290035, + "learning_rate": 0.0009605265392510703, + "loss": 0.90353841, + "num_input_tokens_seen": 66153616, + "router_z_loss_mlp": 0.55908203, + "step": 798, + "time_per_iteration": 2.642423152923584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063071, + "balance_loss_mlp": 1.00732899, + "epoch": 0.15371296652558677, + "flos": 536979476736.0, + "grad_norm": 0.03662373873498648, + "language_loss": 0.93232477, + "learning_rate": 0.0009604051237403846, + "loss": 0.94295549, + "num_input_tokens_seen": 66219472, + "router_z_loss_mlp": 0.55908203, + "step": 799, + "time_per_iteration": 2.6661648750305176 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062966, + "balance_loss_mlp": 1.00693774, + "epoch": 0.15390534821085033, + "flos": 396090504192.0, + "grad_norm": 0.042222005302764924, + "language_loss": 0.87381375, + "learning_rate": 0.0009602835294845776, + "loss": 0.8844434, + "num_input_tokens_seen": 66281456, + "router_z_loss_mlp": 0.56201172, + "step": 800, + "time_per_iteration": 2.4529898166656494 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060851, + "balance_loss_mlp": 1.00520432, + "epoch": 0.1540977298961139, + "flos": 536886157824.0, + "grad_norm": 0.03888031973735598, + "language_loss": 0.91938102, + "learning_rate": 0.0009601617565308565, + "loss": 0.92998952, + "num_input_tokens_seen": 66348160, + "router_z_loss_mlp": 0.55810547, + "step": 801, + "time_per_iteration": 2.6380698680877686 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064031, + "balance_loss_mlp": 1.0085746, + "epoch": 0.15429011158137745, + "flos": 725091628800.0, + "grad_norm": 0.03523983772327724, + "language_loss": 0.87975162, + "learning_rate": 0.0009600398049264977, + "loss": 0.89039195, + "num_input_tokens_seen": 66430576, + "router_z_loss_mlp": 0.55615234, + "step": 802, + "time_per_iteration": 2.9610986709594727 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064973, + "balance_loss_mlp": 1.00970769, + "epoch": 0.154482493266641, + "flos": 621749849088.0, + "grad_norm": 0.04424510077845192, + "language_loss": 0.93353879, + "learning_rate": 0.0009599176747188469, + "loss": 0.94418848, + "num_input_tokens_seen": 66506480, + "router_z_loss_mlp": 0.55419922, + "step": 803, + "time_per_iteration": 2.883296251296997 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065036, + "balance_loss_mlp": 1.00981843, + "epoch": 0.15467487495190457, + "flos": 526720243968.0, + "grad_norm": 0.03833070581853241, + "language_loss": 0.84471631, + "learning_rate": 0.0009597953659553196, + "loss": 0.85536671, + "num_input_tokens_seen": 66577680, + "router_z_loss_mlp": 0.55371094, + "step": 804, + "time_per_iteration": 2.7128705978393555 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062393, + "balance_loss_mlp": 1.00712788, + "epoch": 0.15486725663716813, + "flos": 528760621056.0, + "grad_norm": 0.03896986919959599, + "language_loss": 0.90159577, + "learning_rate": 0.0009596728786833997, + "loss": 0.9122197, + "num_input_tokens_seen": 66648496, + "router_z_loss_mlp": 0.55419922, + "step": 805, + "time_per_iteration": 2.605398178100586 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062158, + "balance_loss_mlp": 1.00684452, + "epoch": 0.1550596383224317, + "flos": 1050280295424.0, + "grad_norm": 0.039312204875199507, + "language_loss": 0.90827858, + "learning_rate": 0.0009595502129506415, + "loss": 0.91890013, + "num_input_tokens_seen": 66735216, + "router_z_loss_mlp": 0.5546875, + "step": 806, + "time_per_iteration": 3.355556011199951 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062439, + "balance_loss_mlp": 1.00736439, + "epoch": 0.15525202000769528, + "flos": 614837458176.0, + "grad_norm": 0.03934214137038287, + "language_loss": 0.83726299, + "learning_rate": 0.0009594273688046678, + "loss": 0.8478874, + "num_input_tokens_seen": 66810672, + "router_z_loss_mlp": 0.55224609, + "step": 807, + "time_per_iteration": 2.765700101852417 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062118, + "balance_loss_mlp": 1.00728118, + "epoch": 0.15544440169295884, + "flos": 534103120128.0, + "grad_norm": 0.042258492962953934, + "language_loss": 0.86714661, + "learning_rate": 0.000959304346293171, + "loss": 0.8777678, + "num_input_tokens_seen": 66879824, + "router_z_loss_mlp": 0.54980469, + "step": 808, + "time_per_iteration": 2.6490986347198486 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064119, + "balance_loss_mlp": 1.00928247, + "epoch": 0.1556367833782224, + "flos": 645887331840.0, + "grad_norm": 0.047675746935091516, + "language_loss": 0.89139616, + "learning_rate": 0.0009591811454639125, + "loss": 0.90203738, + "num_input_tokens_seen": 66949424, + "router_z_loss_mlp": 0.54980469, + "step": 809, + "time_per_iteration": 2.7880568504333496 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059631, + "balance_loss_mlp": 1.00469911, + "epoch": 0.15582916506348596, + "flos": 544953368832.0, + "grad_norm": 0.05205155355433054, + "language_loss": 0.89500809, + "learning_rate": 0.0009590577663647234, + "loss": 0.90560436, + "num_input_tokens_seen": 67024000, + "router_z_loss_mlp": 0.55078125, + "step": 810, + "time_per_iteration": 2.743067741394043 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061907, + "balance_loss_mlp": 1.0068804, + "epoch": 0.15602154674874952, + "flos": 581215484160.0, + "grad_norm": 0.039153260843753375, + "language_loss": 0.87186325, + "learning_rate": 0.0009589342090435036, + "loss": 0.88248235, + "num_input_tokens_seen": 67100672, + "router_z_loss_mlp": 0.55175781, + "step": 811, + "time_per_iteration": 2.806425094604492 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106086, + "balance_loss_mlp": 1.00607169, + "epoch": 0.15621392843401308, + "flos": 536317496064.0, + "grad_norm": 0.04937652455074429, + "language_loss": 0.88453877, + "learning_rate": 0.0009588104735482223, + "loss": 0.89514732, + "num_input_tokens_seen": 67171584, + "router_z_loss_mlp": 0.54931641, + "step": 812, + "time_per_iteration": 2.647728204727173 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060077, + "balance_loss_mlp": 1.00538397, + "epoch": 0.15640631011927664, + "flos": 551982411264.0, + "grad_norm": 0.04402679292728805, + "language_loss": 0.85281312, + "learning_rate": 0.0009586865599269177, + "loss": 0.86341381, + "num_input_tokens_seen": 67240640, + "router_z_loss_mlp": 0.54833984, + "step": 813, + "time_per_iteration": 2.642218828201294 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061354, + "balance_loss_mlp": 1.0069474, + "epoch": 0.1565986918045402, + "flos": 638636658432.0, + "grad_norm": 0.0415768255708782, + "language_loss": 0.89702487, + "learning_rate": 0.0009585624682276977, + "loss": 0.90763843, + "num_input_tokens_seen": 67312976, + "router_z_loss_mlp": 0.54541016, + "step": 814, + "time_per_iteration": 2.7770931720733643 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01058453, + "balance_loss_mlp": 1.00366414, + "epoch": 0.15679107348980378, + "flos": 491782089984.0, + "grad_norm": 0.039213144049943555, + "language_loss": 0.88436091, + "learning_rate": 0.0009584381984987386, + "loss": 0.89494538, + "num_input_tokens_seen": 67378528, + "router_z_loss_mlp": 0.54931641, + "step": 815, + "time_per_iteration": 2.617560386657715 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061581, + "balance_loss_mlp": 1.00655353, + "epoch": 0.15698345517506734, + "flos": 531003187200.0, + "grad_norm": 0.030486806446719653, + "language_loss": 0.91117728, + "learning_rate": 0.0009583137507882864, + "loss": 0.92179304, + "num_input_tokens_seen": 67449728, + "router_z_loss_mlp": 0.55175781, + "step": 816, + "time_per_iteration": 2.6757051944732666 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060617, + "balance_loss_mlp": 1.00568497, + "epoch": 0.1571758368603309, + "flos": 547078316544.0, + "grad_norm": 0.03910336486934304, + "language_loss": 0.82217371, + "learning_rate": 0.000958189125144656, + "loss": 0.83277988, + "num_input_tokens_seen": 67520512, + "router_z_loss_mlp": 0.55078125, + "step": 817, + "time_per_iteration": 2.7065701484680176 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061392, + "balance_loss_mlp": 1.00655591, + "epoch": 0.15736821854559446, + "flos": 566744272896.0, + "grad_norm": 0.03730967846547413, + "language_loss": 0.89150202, + "learning_rate": 0.0009580643216162313, + "loss": 0.90211594, + "num_input_tokens_seen": 67592464, + "router_z_loss_mlp": 0.54980469, + "step": 818, + "time_per_iteration": 2.6849937438964844 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106252, + "balance_loss_mlp": 1.00792253, + "epoch": 0.15756060023085802, + "flos": 501954806784.0, + "grad_norm": 0.041127076818974775, + "language_loss": 0.80838168, + "learning_rate": 0.0009579393402514652, + "loss": 0.81900686, + "num_input_tokens_seen": 67658928, + "router_z_loss_mlp": 0.54736328, + "step": 819, + "time_per_iteration": 2.615342378616333 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060056, + "balance_loss_mlp": 1.00560164, + "epoch": 0.15775298191612158, + "flos": 520272502272.0, + "grad_norm": 0.037825026421493144, + "language_loss": 0.91941106, + "learning_rate": 0.0009578141810988801, + "loss": 0.93001157, + "num_input_tokens_seen": 67727936, + "router_z_loss_mlp": 0.54589844, + "step": 820, + "time_per_iteration": 2.6530544757843018 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061362, + "balance_loss_mlp": 1.00666904, + "epoch": 0.15794536360138514, + "flos": 467088584448.0, + "grad_norm": 0.039348813654249644, + "language_loss": 0.92238629, + "learning_rate": 0.0009576888442070668, + "loss": 0.93299985, + "num_input_tokens_seen": 67795488, + "router_z_loss_mlp": 0.54833984, + "step": 821, + "time_per_iteration": 2.5978658199310303 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062641, + "balance_loss_mlp": 1.00809062, + "epoch": 0.1581377452866487, + "flos": 518168941824.0, + "grad_norm": 0.03790806580601569, + "language_loss": 0.93657464, + "learning_rate": 0.0009575633296246854, + "loss": 0.94720107, + "num_input_tokens_seen": 67858896, + "router_z_loss_mlp": 0.546875, + "step": 822, + "time_per_iteration": 2.582139492034912 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061663, + "balance_loss_mlp": 1.00711334, + "epoch": 0.15833012697191226, + "flos": 550838284800.0, + "grad_norm": 0.03604802690546967, + "language_loss": 0.84146446, + "learning_rate": 0.0009574376374004652, + "loss": 0.85208106, + "num_input_tokens_seen": 67924864, + "router_z_loss_mlp": 0.546875, + "step": 823, + "time_per_iteration": 2.6182329654693604 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061584, + "balance_loss_mlp": 1.00703347, + "epoch": 0.15852250865717585, + "flos": 488467329024.0, + "grad_norm": 0.0382059884648543, + "language_loss": 0.82121176, + "learning_rate": 0.000957311767583204, + "loss": 0.83182758, + "num_input_tokens_seen": 67992912, + "router_z_loss_mlp": 0.546875, + "step": 824, + "time_per_iteration": 2.584266185760498 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01057571, + "balance_loss_mlp": 1.00531006, + "epoch": 0.1587148903424394, + "flos": 1312699441152.0, + "grad_norm": 0.00659207066158758, + "language_loss": 0.8207159, + "learning_rate": 0.0009571857202217691, + "loss": 0.83129162, + "num_input_tokens_seen": 68207408, + "router_z_loss_mlp": 0.5234375, + "step": 825, + "time_per_iteration": 4.734830856323242 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064159, + "balance_loss_mlp": 1.00965643, + "epoch": 0.15890727202770297, + "flos": 467833190400.0, + "grad_norm": 0.04624650490850591, + "language_loss": 0.92764026, + "learning_rate": 0.0009570594953650961, + "loss": 0.93828189, + "num_input_tokens_seen": 68270864, + "router_z_loss_mlp": 0.54638672, + "step": 826, + "time_per_iteration": 2.5117454528808594 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106388, + "balance_loss_mlp": 1.00937772, + "epoch": 0.15909965371296653, + "flos": 778607993088.0, + "grad_norm": 0.03976637787958364, + "language_loss": 0.81327987, + "learning_rate": 0.00095693309306219, + "loss": 0.8239187, + "num_input_tokens_seen": 68355408, + "router_z_loss_mlp": 0.54638672, + "step": 827, + "time_per_iteration": 3.1954681873321533 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060739, + "balance_loss_mlp": 1.00599849, + "epoch": 0.1592920353982301, + "flos": 1079964411648.0, + "grad_norm": 0.038150784713437476, + "language_loss": 0.89750922, + "learning_rate": 0.0009568065133621244, + "loss": 0.90811658, + "num_input_tokens_seen": 68437072, + "router_z_loss_mlp": 0.54882812, + "step": 828, + "time_per_iteration": 3.3355016708374023 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060884, + "balance_loss_mlp": 1.00642896, + "epoch": 0.15948441708349365, + "flos": 726890932992.0, + "grad_norm": 0.03986186218144037, + "language_loss": 0.85834098, + "learning_rate": 0.0009566797563140422, + "loss": 0.86894989, + "num_input_tokens_seen": 68511696, + "router_z_loss_mlp": 0.54589844, + "step": 829, + "time_per_iteration": 2.873845100402832 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059272, + "balance_loss_mlp": 1.00519884, + "epoch": 0.1596767987687572, + "flos": 580076215296.0, + "grad_norm": 0.03433333328837374, + "language_loss": 0.89395094, + "learning_rate": 0.0009565528219671547, + "loss": 0.90454364, + "num_input_tokens_seen": 68587488, + "router_z_loss_mlp": 0.54199219, + "step": 830, + "time_per_iteration": 2.9566032886505127 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063846, + "balance_loss_mlp": 1.00991619, + "epoch": 0.15986918045402077, + "flos": 530026256640.0, + "grad_norm": 0.037800776955081314, + "language_loss": 0.86586118, + "learning_rate": 0.0009564257103707418, + "loss": 0.87649965, + "num_input_tokens_seen": 68655760, + "router_z_loss_mlp": 0.54052734, + "step": 831, + "time_per_iteration": 2.6305205821990967 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062726, + "balance_loss_mlp": 1.00870061, + "epoch": 0.16006156213928435, + "flos": 575670796032.0, + "grad_norm": 0.04196239075383403, + "language_loss": 0.92502224, + "learning_rate": 0.0009562984215741533, + "loss": 0.93564951, + "num_input_tokens_seen": 68724560, + "router_z_loss_mlp": 0.54150391, + "step": 832, + "time_per_iteration": 2.6781210899353027 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061607, + "balance_loss_mlp": 1.00743783, + "epoch": 0.1602539438245479, + "flos": 516675839232.0, + "grad_norm": 0.039654673227061156, + "language_loss": 0.83729708, + "learning_rate": 0.0009561709556268065, + "loss": 0.84791321, + "num_input_tokens_seen": 68795440, + "router_z_loss_mlp": 0.54296875, + "step": 833, + "time_per_iteration": 2.732191801071167 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064816, + "balance_loss_mlp": 1.01021826, + "epoch": 0.16044632550981147, + "flos": 622162008576.0, + "grad_norm": 0.03600956841171521, + "language_loss": 0.95349514, + "learning_rate": 0.0009560433125781884, + "loss": 0.96414334, + "num_input_tokens_seen": 68868176, + "router_z_loss_mlp": 0.54736328, + "step": 834, + "time_per_iteration": 4.227160215377808 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063475, + "balance_loss_mlp": 1.008973, + "epoch": 0.16063870719507503, + "flos": 562128883200.0, + "grad_norm": 0.03652136008848007, + "language_loss": 0.94107795, + "learning_rate": 0.0009559154924778544, + "loss": 0.95171273, + "num_input_tokens_seen": 68939616, + "router_z_loss_mlp": 0.54638672, + "step": 835, + "time_per_iteration": 2.7238283157348633 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066589, + "balance_loss_mlp": 1.01251614, + "epoch": 0.1608310888803386, + "flos": 806561824512.0, + "grad_norm": 0.044196177378580975, + "language_loss": 0.86185992, + "learning_rate": 0.0009557874953754284, + "loss": 0.87252581, + "num_input_tokens_seen": 69016192, + "router_z_loss_mlp": 0.54199219, + "step": 836, + "time_per_iteration": 3.03965425491333 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063553, + "balance_loss_mlp": 1.00943184, + "epoch": 0.16102347056560215, + "flos": 601695065856.0, + "grad_norm": 0.04086380423696876, + "language_loss": 0.84961462, + "learning_rate": 0.0009556593213206038, + "loss": 0.86025023, + "num_input_tokens_seen": 69089360, + "router_z_loss_mlp": 0.54248047, + "step": 837, + "time_per_iteration": 2.714165687561035 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063546, + "balance_loss_mlp": 1.0095681, + "epoch": 0.1612158522508657, + "flos": 554615749632.0, + "grad_norm": 0.03942211179170501, + "language_loss": 0.88284755, + "learning_rate": 0.0009555309703631414, + "loss": 0.89348304, + "num_input_tokens_seen": 69161952, + "router_z_loss_mlp": 0.54101562, + "step": 838, + "time_per_iteration": 2.6616575717926025 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061318, + "balance_loss_mlp": 1.00729215, + "epoch": 0.16140823393612927, + "flos": 557018708736.0, + "grad_norm": 0.03970121061853926, + "language_loss": 0.88476837, + "learning_rate": 0.0009554024425528722, + "loss": 0.89538157, + "num_input_tokens_seen": 69232432, + "router_z_loss_mlp": 0.54150391, + "step": 839, + "time_per_iteration": 2.6778693199157715 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061761, + "balance_loss_mlp": 1.00792611, + "epoch": 0.16160061562139286, + "flos": 544909627392.0, + "grad_norm": 0.03616953348933095, + "language_loss": 0.90216744, + "learning_rate": 0.0009552737379396948, + "loss": 0.91278505, + "num_input_tokens_seen": 69297696, + "router_z_loss_mlp": 0.53955078, + "step": 840, + "time_per_iteration": 2.6190080642700195 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060215, + "balance_loss_mlp": 1.00638056, + "epoch": 0.16179299730665642, + "flos": 605007881472.0, + "grad_norm": 0.03485432207779616, + "language_loss": 0.88917094, + "learning_rate": 0.0009551448565735767, + "loss": 0.89977312, + "num_input_tokens_seen": 69373888, + "router_z_loss_mlp": 0.53955078, + "step": 841, + "time_per_iteration": 2.771730422973633 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059839, + "balance_loss_mlp": 1.00624251, + "epoch": 0.16198537899191998, + "flos": 788552275968.0, + "grad_norm": 0.040424272174261144, + "language_loss": 0.855564, + "learning_rate": 0.0009550157985045543, + "loss": 0.86616236, + "num_input_tokens_seen": 69449984, + "router_z_loss_mlp": 0.53710938, + "step": 842, + "time_per_iteration": 3.014448642730713 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063416, + "balance_loss_mlp": 1.00986671, + "epoch": 0.16217776067718354, + "flos": 520830470400.0, + "grad_norm": 0.03210449059239548, + "language_loss": 0.9010545, + "learning_rate": 0.0009548865637827321, + "loss": 0.91168869, + "num_input_tokens_seen": 69522736, + "router_z_loss_mlp": 0.53662109, + "step": 843, + "time_per_iteration": 2.663733959197998 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060664, + "balance_loss_mlp": 1.00725794, + "epoch": 0.1623701423624471, + "flos": 506255246592.0, + "grad_norm": 0.04236042945807781, + "language_loss": 0.91279781, + "learning_rate": 0.0009547571524582838, + "loss": 0.92340446, + "num_input_tokens_seen": 69587184, + "router_z_loss_mlp": 0.53515625, + "step": 844, + "time_per_iteration": 2.5841143131256104 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061607, + "balance_loss_mlp": 1.00848722, + "epoch": 0.16256252404771065, + "flos": 498157900032.0, + "grad_norm": 0.043042899099755685, + "language_loss": 0.93573415, + "learning_rate": 0.0009546275645814512, + "loss": 0.94635028, + "num_input_tokens_seen": 69656560, + "router_z_loss_mlp": 0.53222656, + "step": 845, + "time_per_iteration": 2.601743221282959 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064884, + "balance_loss_mlp": 1.01152599, + "epoch": 0.16275490573297421, + "flos": 503287516416.0, + "grad_norm": 0.046422900850994125, + "language_loss": 0.90658545, + "learning_rate": 0.0009544978002025446, + "loss": 0.9172343, + "num_input_tokens_seen": 69723872, + "router_z_loss_mlp": 0.53466797, + "step": 846, + "time_per_iteration": 2.582463502883911 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062884, + "balance_loss_mlp": 1.00957346, + "epoch": 0.16294728741823777, + "flos": 508354916352.0, + "grad_norm": 0.03474620131823351, + "language_loss": 0.88017273, + "learning_rate": 0.0009543678593719434, + "loss": 0.89080155, + "num_input_tokens_seen": 69795504, + "router_z_loss_mlp": 0.53417969, + "step": 847, + "time_per_iteration": 2.7039546966552734 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067088, + "balance_loss_mlp": 1.01334834, + "epoch": 0.16313966910350133, + "flos": 510757875456.0, + "grad_norm": 0.031134263506057067, + "language_loss": 0.88570058, + "learning_rate": 0.0009542377421400945, + "loss": 0.89637142, + "num_input_tokens_seen": 69873408, + "router_z_loss_mlp": 0.53857422, + "step": 848, + "time_per_iteration": 2.79311203956604 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061983, + "balance_loss_mlp": 1.00810015, + "epoch": 0.16333205078876492, + "flos": 545057381376.0, + "grad_norm": 0.03805815068737175, + "language_loss": 0.84448338, + "learning_rate": 0.0009541074485575145, + "loss": 0.85510319, + "num_input_tokens_seen": 69944112, + "router_z_loss_mlp": 0.54003906, + "step": 849, + "time_per_iteration": 2.714644193649292 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106829, + "balance_loss_mlp": 1.01450312, + "epoch": 0.16352443247402848, + "flos": 508712640768.0, + "grad_norm": 0.03447226436126556, + "language_loss": 0.93184924, + "learning_rate": 0.0009539769786747874, + "loss": 0.94253218, + "num_input_tokens_seen": 70012288, + "router_z_loss_mlp": 0.5390625, + "step": 850, + "time_per_iteration": 2.5857110023498535 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070929, + "balance_loss_mlp": 1.01709449, + "epoch": 0.16371681415929204, + "flos": 543223084032.0, + "grad_norm": 0.036141614394747515, + "language_loss": 0.82550752, + "learning_rate": 0.0009538463325425665, + "loss": 0.83621687, + "num_input_tokens_seen": 70086560, + "router_z_loss_mlp": 0.53955078, + "step": 851, + "time_per_iteration": 2.7186405658721924 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066598, + "balance_loss_mlp": 1.01242912, + "epoch": 0.1639091958445556, + "flos": 521761714176.0, + "grad_norm": 0.03784697093976771, + "language_loss": 0.87203169, + "learning_rate": 0.0009537155102115728, + "loss": 0.8826977, + "num_input_tokens_seen": 70153968, + "router_z_loss_mlp": 0.54296875, + "step": 852, + "time_per_iteration": 2.5761775970458984 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061784, + "balance_loss_mlp": 1.00771022, + "epoch": 0.16410157752981916, + "flos": 548482957824.0, + "grad_norm": 0.03731294741121226, + "language_loss": 0.85278255, + "learning_rate": 0.0009535845117325961, + "loss": 0.8634004, + "num_input_tokens_seen": 70222496, + "router_z_loss_mlp": 0.54199219, + "step": 853, + "time_per_iteration": 2.6968846321105957 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065026, + "balance_loss_mlp": 1.01085758, + "epoch": 0.16429395921508272, + "flos": 584026712064.0, + "grad_norm": 0.031860977478103375, + "language_loss": 0.9423098, + "learning_rate": 0.0009534533371564946, + "loss": 0.95296007, + "num_input_tokens_seen": 70301680, + "router_z_loss_mlp": 0.54296875, + "step": 854, + "time_per_iteration": 2.7640349864959717 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106098, + "balance_loss_mlp": 1.00709713, + "epoch": 0.16448634090034628, + "flos": 531962621184.0, + "grad_norm": 0.03950290113288642, + "language_loss": 0.89868152, + "learning_rate": 0.0009533219865341949, + "loss": 0.90929133, + "num_input_tokens_seen": 70371152, + "router_z_loss_mlp": 0.54003906, + "step": 855, + "time_per_iteration": 2.6025009155273438 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060489, + "balance_loss_mlp": 1.00693989, + "epoch": 0.16467872258560984, + "flos": 492961209600.0, + "grad_norm": 0.03645156199748424, + "language_loss": 0.87602645, + "learning_rate": 0.0009531904599166916, + "loss": 0.88663131, + "num_input_tokens_seen": 70440832, + "router_z_loss_mlp": 0.53662109, + "step": 856, + "time_per_iteration": 2.656604290008545 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060758, + "balance_loss_mlp": 1.00730467, + "epoch": 0.16487110427087343, + "flos": 507260367360.0, + "grad_norm": 0.04426557796634758, + "language_loss": 0.86560714, + "learning_rate": 0.0009530587573550478, + "loss": 0.87621474, + "num_input_tokens_seen": 70507424, + "router_z_loss_mlp": 0.53564453, + "step": 857, + "time_per_iteration": 2.610445261001587 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01056503, + "balance_loss_mlp": 1.00538635, + "epoch": 0.16506348595613698, + "flos": 1436111555328.0, + "grad_norm": 0.010874217326465607, + "language_loss": 0.74319386, + "learning_rate": 0.0009529268789003953, + "loss": 0.75375891, + "num_input_tokens_seen": 70742320, + "router_z_loss_mlp": 0.51171875, + "step": 858, + "time_per_iteration": 4.991516590118408 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060779, + "balance_loss_mlp": 1.00718212, + "epoch": 0.16525586764140054, + "flos": 478090477824.0, + "grad_norm": 0.04454190836652637, + "language_loss": 0.91544032, + "learning_rate": 0.0009527948246039337, + "loss": 0.9260481, + "num_input_tokens_seen": 70808400, + "router_z_loss_mlp": 0.53710938, + "step": 859, + "time_per_iteration": 2.538290500640869 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01058142, + "balance_loss_mlp": 1.00425971, + "epoch": 0.1654482493266641, + "flos": 882541767168.0, + "grad_norm": 0.03991834039284953, + "language_loss": 0.88867122, + "learning_rate": 0.000952662594516931, + "loss": 0.89925265, + "num_input_tokens_seen": 70886192, + "router_z_loss_mlp": 0.54003906, + "step": 860, + "time_per_iteration": 3.083786964416504 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065202, + "balance_loss_mlp": 1.01122451, + "epoch": 0.16564063101192766, + "flos": 628106217216.0, + "grad_norm": 0.03630731527649873, + "language_loss": 0.87934124, + "learning_rate": 0.0009525301886907234, + "loss": 0.88999331, + "num_input_tokens_seen": 70964816, + "router_z_loss_mlp": 0.54101562, + "step": 861, + "time_per_iteration": 2.8606412410736084 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062775, + "balance_loss_mlp": 1.00884438, + "epoch": 0.16583301269719122, + "flos": 562593532416.0, + "grad_norm": 0.03632506699489255, + "language_loss": 0.8885988, + "learning_rate": 0.0009523976071767155, + "loss": 0.89922649, + "num_input_tokens_seen": 71037456, + "router_z_loss_mlp": 0.54052734, + "step": 862, + "time_per_iteration": 2.651202440261841 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062989, + "balance_loss_mlp": 1.0094403, + "epoch": 0.16602539438245478, + "flos": 568984893696.0, + "grad_norm": 0.03883194498572106, + "language_loss": 0.88789731, + "learning_rate": 0.00095226485002638, + "loss": 0.8985272, + "num_input_tokens_seen": 71111872, + "router_z_loss_mlp": 0.53662109, + "step": 863, + "time_per_iteration": 2.798125982284546 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063787, + "balance_loss_mlp": 1.01019073, + "epoch": 0.16621777606771834, + "flos": 576022684416.0, + "grad_norm": 0.03638934937563812, + "language_loss": 0.89892161, + "learning_rate": 0.0009521319172912576, + "loss": 0.90955949, + "num_input_tokens_seen": 71187808, + "router_z_loss_mlp": 0.53710938, + "step": 864, + "time_per_iteration": 4.098716974258423 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105997, + "balance_loss_mlp": 1.00632548, + "epoch": 0.16641015775298193, + "flos": 515598786816.0, + "grad_norm": 0.037169751839881825, + "language_loss": 0.96108532, + "learning_rate": 0.0009519988090229579, + "loss": 0.97168505, + "num_input_tokens_seen": 71261728, + "router_z_loss_mlp": 0.53759766, + "step": 865, + "time_per_iteration": 2.659381628036499 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068447, + "balance_loss_mlp": 1.01489806, + "epoch": 0.1666025394382455, + "flos": 622850234112.0, + "grad_norm": 0.04388029559541895, + "language_loss": 0.88811028, + "learning_rate": 0.0009518655252731576, + "loss": 0.89879477, + "num_input_tokens_seen": 71338352, + "router_z_loss_mlp": 0.53662109, + "step": 866, + "time_per_iteration": 2.738511323928833 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061925, + "balance_loss_mlp": 1.00880551, + "epoch": 0.16679492112350905, + "flos": 549933285888.0, + "grad_norm": 0.03352631932153436, + "language_loss": 0.91113746, + "learning_rate": 0.0009517320660936022, + "loss": 0.92175674, + "num_input_tokens_seen": 71416544, + "router_z_loss_mlp": 0.53222656, + "step": 867, + "time_per_iteration": 2.7755699157714844 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066316, + "balance_loss_mlp": 1.01343453, + "epoch": 0.1669873028087726, + "flos": 666866555904.0, + "grad_norm": 0.04051359913494383, + "language_loss": 0.84396493, + "learning_rate": 0.0009515984315361051, + "loss": 0.85462809, + "num_input_tokens_seen": 71494080, + "router_z_loss_mlp": 0.52978516, + "step": 868, + "time_per_iteration": 2.8502533435821533 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062608, + "balance_loss_mlp": 1.00944042, + "epoch": 0.16717968449403617, + "flos": 539604066816.0, + "grad_norm": 0.03969494402961726, + "language_loss": 0.88029611, + "learning_rate": 0.000951464621652548, + "loss": 0.89092225, + "num_input_tokens_seen": 71562672, + "router_z_loss_mlp": 0.53271484, + "step": 869, + "time_per_iteration": 2.6079800128936768 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065881, + "balance_loss_mlp": 1.01233244, + "epoch": 0.16737206617929973, + "flos": 531279253248.0, + "grad_norm": 0.03349656106003216, + "language_loss": 0.7990135, + "learning_rate": 0.0009513306364948804, + "loss": 0.80967236, + "num_input_tokens_seen": 71641904, + "router_z_loss_mlp": 0.53662109, + "step": 870, + "time_per_iteration": 2.824232578277588 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106371, + "balance_loss_mlp": 1.00987494, + "epoch": 0.1675644478645633, + "flos": 481757127168.0, + "grad_norm": 0.04264569815750397, + "language_loss": 0.90229708, + "learning_rate": 0.0009511964761151197, + "loss": 0.91293418, + "num_input_tokens_seen": 71709616, + "router_z_loss_mlp": 0.53955078, + "step": 871, + "time_per_iteration": 2.6326816082000732 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106642, + "balance_loss_mlp": 1.01344323, + "epoch": 0.16775682954982685, + "flos": 495542058240.0, + "grad_norm": 0.04000245460937008, + "language_loss": 0.91825569, + "learning_rate": 0.0009510621405653521, + "loss": 0.92891991, + "num_input_tokens_seen": 71776592, + "router_z_loss_mlp": 0.53076172, + "step": 872, + "time_per_iteration": 2.5802783966064453 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074346, + "balance_loss_mlp": 1.02151191, + "epoch": 0.1679492112350904, + "flos": 753406096896.0, + "grad_norm": 0.04130745072346603, + "language_loss": 0.85908926, + "learning_rate": 0.0009509276298977309, + "loss": 0.86983275, + "num_input_tokens_seen": 71856352, + "router_z_loss_mlp": 0.52929688, + "step": 873, + "time_per_iteration": 2.9676413536071777 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069963, + "balance_loss_mlp": 1.01689136, + "epoch": 0.168141592920354, + "flos": 1137733583616.0, + "grad_norm": 0.036676349776393134, + "language_loss": 0.82925022, + "learning_rate": 0.0009507929441644778, + "loss": 0.83994985, + "num_input_tokens_seen": 71948480, + "router_z_loss_mlp": 0.53173828, + "step": 874, + "time_per_iteration": 3.5441927909851074 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062646, + "balance_loss_mlp": 1.00924039, + "epoch": 0.16833397460561755, + "flos": 633554674176.0, + "grad_norm": 0.03715311549034911, + "language_loss": 0.86810201, + "learning_rate": 0.0009506580834178826, + "loss": 0.87872851, + "num_input_tokens_seen": 72019200, + "router_z_loss_mlp": 0.53515625, + "step": 875, + "time_per_iteration": 2.767840623855591 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106879, + "balance_loss_mlp": 1.01524162, + "epoch": 0.1685263562908811, + "flos": 542543606784.0, + "grad_norm": 0.041322978640758234, + "language_loss": 0.92533737, + "learning_rate": 0.0009505230477103028, + "loss": 0.93602526, + "num_input_tokens_seen": 72088672, + "router_z_loss_mlp": 0.53662109, + "step": 876, + "time_per_iteration": 2.68626070022583 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064685, + "balance_loss_mlp": 1.01151776, + "epoch": 0.16871873797614467, + "flos": 620486158848.0, + "grad_norm": 0.04979097271806245, + "language_loss": 0.82312369, + "learning_rate": 0.0009503878370941641, + "loss": 0.83377057, + "num_input_tokens_seen": 72159952, + "router_z_loss_mlp": 0.53271484, + "step": 877, + "time_per_iteration": 2.738828182220459 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067432, + "balance_loss_mlp": 1.01455081, + "epoch": 0.16891111966140823, + "flos": 607456527360.0, + "grad_norm": 0.048240798926105125, + "language_loss": 0.90597415, + "learning_rate": 0.0009502524516219595, + "loss": 0.91664839, + "num_input_tokens_seen": 72231648, + "router_z_loss_mlp": 0.52978516, + "step": 878, + "time_per_iteration": 2.7533464431762695 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065414, + "balance_loss_mlp": 1.01234174, + "epoch": 0.1691035013466718, + "flos": 553406494464.0, + "grad_norm": 0.04285435284136928, + "language_loss": 0.91275579, + "learning_rate": 0.0009501168913462506, + "loss": 0.92340994, + "num_input_tokens_seen": 72298608, + "router_z_loss_mlp": 0.53173828, + "step": 879, + "time_per_iteration": 2.6498849391937256 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106115, + "balance_loss_mlp": 1.00946045, + "epoch": 0.16929588303193535, + "flos": 1479308427264.0, + "grad_norm": 0.010969186313753012, + "language_loss": 0.79121923, + "learning_rate": 0.0009499811563196665, + "loss": 0.80183077, + "num_input_tokens_seen": 72525312, + "router_z_loss_mlp": 0.51757812, + "step": 880, + "time_per_iteration": 4.8127734661102295 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065784, + "balance_loss_mlp": 1.01228285, + "epoch": 0.1694882647171989, + "flos": 927848024064.0, + "grad_norm": 0.04254449001590413, + "language_loss": 0.86211771, + "learning_rate": 0.0009498452465949042, + "loss": 0.87277561, + "num_input_tokens_seen": 72612976, + "router_z_loss_mlp": 0.53613281, + "step": 881, + "time_per_iteration": 3.242352247238159 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059657, + "balance_loss_mlp": 1.00668061, + "epoch": 0.1696806464024625, + "flos": 547152193536.0, + "grad_norm": 0.03842920637304405, + "language_loss": 0.92758489, + "learning_rate": 0.0009497091622247285, + "loss": 0.93818152, + "num_input_tokens_seen": 72686800, + "router_z_loss_mlp": 0.53076172, + "step": 882, + "time_per_iteration": 2.7538321018218994 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066328, + "balance_loss_mlp": 1.01363766, + "epoch": 0.16987302808772606, + "flos": 530295519744.0, + "grad_norm": 0.04346709327253658, + "language_loss": 0.94739175, + "learning_rate": 0.0009495729032619723, + "loss": 0.95805502, + "num_input_tokens_seen": 72759360, + "router_z_loss_mlp": 0.52783203, + "step": 883, + "time_per_iteration": 2.681851863861084 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061141, + "balance_loss_mlp": 1.00830746, + "epoch": 0.17006540977298962, + "flos": 756479784960.0, + "grad_norm": 0.03707996109728333, + "language_loss": 0.85065424, + "learning_rate": 0.0009494364697595354, + "loss": 0.86126566, + "num_input_tokens_seen": 72831424, + "router_z_loss_mlp": 0.52929688, + "step": 884, + "time_per_iteration": 2.886613607406616 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01058078, + "balance_loss_mlp": 1.00495851, + "epoch": 0.17025779145825318, + "flos": 559875623424.0, + "grad_norm": 0.04262534374301406, + "language_loss": 0.90753883, + "learning_rate": 0.0009492998617703867, + "loss": 0.91811961, + "num_input_tokens_seen": 72901536, + "router_z_loss_mlp": 0.53222656, + "step": 885, + "time_per_iteration": 2.7197954654693604 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069962, + "balance_loss_mlp": 1.01684284, + "epoch": 0.17045017314351674, + "flos": 513217214976.0, + "grad_norm": 0.04472607646913617, + "language_loss": 0.89151132, + "learning_rate": 0.0009491630793475619, + "loss": 0.90221095, + "num_input_tokens_seen": 72970480, + "router_z_loss_mlp": 0.53222656, + "step": 886, + "time_per_iteration": 2.6023643016815186 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059759, + "balance_loss_mlp": 1.00706899, + "epoch": 0.1706425548287803, + "flos": 510013269504.0, + "grad_norm": 0.03690999998020265, + "language_loss": 0.86250949, + "learning_rate": 0.0009490261225441643, + "loss": 0.87310708, + "num_input_tokens_seen": 73053376, + "router_z_loss_mlp": 0.52783203, + "step": 887, + "time_per_iteration": 2.8811516761779785 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070816, + "balance_loss_mlp": 1.01845872, + "epoch": 0.17083493651404386, + "flos": 718715818752.0, + "grad_norm": 0.037520519160069404, + "language_loss": 0.91723603, + "learning_rate": 0.0009488889914133656, + "loss": 0.92794418, + "num_input_tokens_seen": 73136032, + "router_z_loss_mlp": 0.52441406, + "step": 888, + "time_per_iteration": 2.983920097351074 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067559, + "balance_loss_mlp": 1.01515496, + "epoch": 0.17102731819930742, + "flos": 560201266944.0, + "grad_norm": 0.034570155262309, + "language_loss": 0.90050644, + "learning_rate": 0.0009487516860084047, + "loss": 0.91118205, + "num_input_tokens_seen": 73208544, + "router_z_loss_mlp": 0.52490234, + "step": 889, + "time_per_iteration": 2.739945888519287 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061028, + "balance_loss_mlp": 1.0078603, + "epoch": 0.17121969988457098, + "flos": 495765634560.0, + "grad_norm": 0.04354558177795279, + "language_loss": 0.9033885, + "learning_rate": 0.0009486142063825884, + "loss": 0.91399872, + "num_input_tokens_seen": 73274336, + "router_z_loss_mlp": 0.53271484, + "step": 890, + "time_per_iteration": 2.541325569152832 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107373, + "balance_loss_mlp": 1.02223206, + "epoch": 0.17141208156983456, + "flos": 1552108723968.0, + "grad_norm": 0.01766408052426257, + "language_loss": 0.72426212, + "learning_rate": 0.0009484765525892909, + "loss": 0.73499948, + "num_input_tokens_seen": 73506320, + "router_z_loss_mlp": 0.515625, + "step": 891, + "time_per_iteration": 4.968579053878784 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01058616, + "balance_loss_mlp": 1.00568736, + "epoch": 0.17160446325509812, + "flos": 620700986880.0, + "grad_norm": 0.037544702591063864, + "language_loss": 0.91210532, + "learning_rate": 0.0009483387246819542, + "loss": 0.92269152, + "num_input_tokens_seen": 73578048, + "router_z_loss_mlp": 0.53027344, + "step": 892, + "time_per_iteration": 2.7970938682556152 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071655, + "balance_loss_mlp": 1.0209198, + "epoch": 0.17179684494036168, + "flos": 1384695839232.0, + "grad_norm": 0.01601076320839161, + "language_loss": 0.82285583, + "learning_rate": 0.0009482007227140877, + "loss": 0.83357239, + "num_input_tokens_seen": 73798640, + "router_z_loss_mlp": 0.5078125, + "step": 893, + "time_per_iteration": 4.629605054855347 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066274, + "balance_loss_mlp": 1.01386988, + "epoch": 0.17198922662562524, + "flos": 493642632192.0, + "grad_norm": 0.03763004911158334, + "language_loss": 0.90241146, + "learning_rate": 0.0009480625467392688, + "loss": 0.91307414, + "num_input_tokens_seen": 73867328, + "router_z_loss_mlp": 0.52490234, + "step": 894, + "time_per_iteration": 2.6142358779907227 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068771, + "balance_loss_mlp": 1.01822662, + "epoch": 0.1721816083108888, + "flos": 1461488428800.0, + "grad_norm": 0.016749035753296605, + "language_loss": 0.77994668, + "learning_rate": 0.0009479241968111421, + "loss": 0.79063439, + "num_input_tokens_seen": 74093376, + "router_z_loss_mlp": 0.50585938, + "step": 895, + "time_per_iteration": 4.811494827270508 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112065, + "balance_loss_mlp": 1.06719661, + "epoch": 0.17237398999615236, + "flos": 529205828352.0, + "grad_norm": 0.05241044192650153, + "language_loss": 0.88738441, + "learning_rate": 0.0009477856729834196, + "loss": 0.89859092, + "num_input_tokens_seen": 74169136, + "router_z_loss_mlp": 0.53564453, + "step": 896, + "time_per_iteration": 2.7389612197875977 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066584, + "balance_loss_mlp": 1.01446557, + "epoch": 0.17256637168141592, + "flos": 605027323392.0, + "grad_norm": 0.03860455021635393, + "language_loss": 0.90989411, + "learning_rate": 0.0009476469753098809, + "loss": 0.92055988, + "num_input_tokens_seen": 74236912, + "router_z_loss_mlp": 0.52197266, + "step": 897, + "time_per_iteration": 2.7175238132476807 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077196, + "balance_loss_mlp": 1.02507758, + "epoch": 0.17275875336667948, + "flos": 510694692096.0, + "grad_norm": 0.040412661310783936, + "language_loss": 0.88453948, + "learning_rate": 0.0009475081038443738, + "loss": 0.89531147, + "num_input_tokens_seen": 74305968, + "router_z_loss_mlp": 0.52197266, + "step": 898, + "time_per_iteration": 2.6398110389709473 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079459, + "balance_loss_mlp": 1.02753115, + "epoch": 0.17295113505194307, + "flos": 666502028544.0, + "grad_norm": 0.045107808798334564, + "language_loss": 0.87902451, + "learning_rate": 0.0009473690586408124, + "loss": 0.88981915, + "num_input_tokens_seen": 74384144, + "router_z_loss_mlp": 0.52001953, + "step": 899, + "time_per_iteration": 2.817730665206909 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071678, + "balance_loss_mlp": 1.01965487, + "epoch": 0.17314351673720663, + "flos": 556432550400.0, + "grad_norm": 0.03870851432877784, + "language_loss": 0.87576568, + "learning_rate": 0.0009472298397531792, + "loss": 0.88648236, + "num_input_tokens_seen": 74455040, + "router_z_loss_mlp": 0.52099609, + "step": 900, + "time_per_iteration": 2.6932764053344727 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061802, + "balance_loss_mlp": 1.00892079, + "epoch": 0.17333589842247019, + "flos": 504607587072.0, + "grad_norm": 0.03631909976073519, + "language_loss": 0.87174571, + "learning_rate": 0.0009470904472355235, + "loss": 0.88236374, + "num_input_tokens_seen": 74525248, + "router_z_loss_mlp": 0.52978516, + "step": 901, + "time_per_iteration": 2.669405460357666 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099242, + "balance_loss_mlp": 1.04593205, + "epoch": 0.17352828010773375, + "flos": 557351155200.0, + "grad_norm": 0.04839261993488341, + "language_loss": 0.80976391, + "learning_rate": 0.0009469508811419626, + "loss": 0.82075632, + "num_input_tokens_seen": 74597328, + "router_z_loss_mlp": 0.53417969, + "step": 902, + "time_per_iteration": 2.7412211894989014 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083992, + "balance_loss_mlp": 1.033638, + "epoch": 0.1737206617929973, + "flos": 1557794363136.0, + "grad_norm": 0.02136399149953286, + "language_loss": 0.7161383, + "learning_rate": 0.0009468111415266806, + "loss": 0.72697818, + "num_input_tokens_seen": 74819664, + "router_z_loss_mlp": 0.50390625, + "step": 903, + "time_per_iteration": 4.800720930099487 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075494, + "balance_loss_mlp": 1.02318478, + "epoch": 0.17391304347826086, + "flos": 517756782336.0, + "grad_norm": 0.04178806719411302, + "language_loss": 0.85797513, + "learning_rate": 0.0009466712284439292, + "loss": 0.86873007, + "num_input_tokens_seen": 74896224, + "router_z_loss_mlp": 0.52392578, + "step": 904, + "time_per_iteration": 2.7409780025482178 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076244, + "balance_loss_mlp": 1.02360141, + "epoch": 0.17410542516352442, + "flos": 542161582848.0, + "grad_norm": 0.043268311729831165, + "language_loss": 0.90273786, + "learning_rate": 0.0009465311419480276, + "loss": 0.91350031, + "num_input_tokens_seen": 74966560, + "router_z_loss_mlp": 0.52734375, + "step": 905, + "time_per_iteration": 2.7310986518859863 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068245, + "balance_loss_mlp": 1.01526833, + "epoch": 0.17429780684878798, + "flos": 625082106624.0, + "grad_norm": 0.0375699532684124, + "language_loss": 0.89484948, + "learning_rate": 0.0009463908820933622, + "loss": 0.905532, + "num_input_tokens_seen": 75045248, + "router_z_loss_mlp": 0.53076172, + "step": 906, + "time_per_iteration": 2.8575551509857178 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086696, + "balance_loss_mlp": 1.03281319, + "epoch": 0.17449018853405157, + "flos": 576849915648.0, + "grad_norm": 0.04286783530345041, + "language_loss": 0.83513701, + "learning_rate": 0.0009462504489343868, + "loss": 0.84600401, + "num_input_tokens_seen": 75123952, + "router_z_loss_mlp": 0.54003906, + "step": 907, + "time_per_iteration": 2.83085036277771 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066078, + "balance_loss_mlp": 1.0128628, + "epoch": 0.17468257021931513, + "flos": 534773849088.0, + "grad_norm": 0.0408315501053547, + "language_loss": 0.90177906, + "learning_rate": 0.0009461098425256222, + "loss": 0.91243982, + "num_input_tokens_seen": 75191728, + "router_z_loss_mlp": 0.53320312, + "step": 908, + "time_per_iteration": 2.6000654697418213 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075411, + "balance_loss_mlp": 1.02257717, + "epoch": 0.1748749519045787, + "flos": 541809694464.0, + "grad_norm": 0.0381088809784924, + "language_loss": 0.87053907, + "learning_rate": 0.0009459690629216567, + "loss": 0.88129318, + "num_input_tokens_seen": 75262224, + "router_z_loss_mlp": 0.52929688, + "step": 909, + "time_per_iteration": 2.622178316116333 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080252, + "balance_loss_mlp": 1.02770495, + "epoch": 0.17506733358984225, + "flos": 499627670016.0, + "grad_norm": 0.039096197570908604, + "language_loss": 0.88898331, + "learning_rate": 0.0009458281101771457, + "loss": 0.89978582, + "num_input_tokens_seen": 75329760, + "router_z_loss_mlp": 0.52636719, + "step": 910, + "time_per_iteration": 2.5964770317077637 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064818, + "balance_loss_mlp": 1.01217556, + "epoch": 0.1752597152751058, + "flos": 624133366272.0, + "grad_norm": 0.035444142957055544, + "language_loss": 0.83730716, + "learning_rate": 0.0009456869843468122, + "loss": 0.84795535, + "num_input_tokens_seen": 75407920, + "router_z_loss_mlp": 0.52734375, + "step": 911, + "time_per_iteration": 2.834584951400757 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059336, + "balance_loss_mlp": 1.00650251, + "epoch": 0.17545209696036937, + "flos": 521994038784.0, + "grad_norm": 0.04587594362499167, + "language_loss": 0.79429859, + "learning_rate": 0.0009455456854854459, + "loss": 0.80489194, + "num_input_tokens_seen": 75476752, + "router_z_loss_mlp": 0.52929688, + "step": 912, + "time_per_iteration": 2.627058744430542 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107498, + "balance_loss_mlp": 1.0219084, + "epoch": 0.17564447864563293, + "flos": 462946592256.0, + "grad_norm": 0.044462507375804226, + "language_loss": 0.85522115, + "learning_rate": 0.0009454042136479039, + "loss": 0.86597091, + "num_input_tokens_seen": 75542944, + "router_z_loss_mlp": 0.53173828, + "step": 913, + "time_per_iteration": 2.562453031539917 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106477, + "balance_loss_mlp": 1.01250815, + "epoch": 0.1758368603308965, + "flos": 481618121472.0, + "grad_norm": 0.03599423435064716, + "language_loss": 0.84144086, + "learning_rate": 0.0009452625688891103, + "loss": 0.85208857, + "num_input_tokens_seen": 75609840, + "router_z_loss_mlp": 0.5234375, + "step": 914, + "time_per_iteration": 2.6025402545928955 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063728, + "balance_loss_mlp": 1.0137558, + "epoch": 0.17602924201616005, + "flos": 1482087574272.0, + "grad_norm": 0.013260252544834742, + "language_loss": 0.78734738, + "learning_rate": 0.0009451207512640567, + "loss": 0.79798466, + "num_input_tokens_seen": 75819312, + "router_z_loss_mlp": 0.49902344, + "step": 915, + "time_per_iteration": 4.572151184082031 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107428, + "balance_loss_mlp": 1.0219233, + "epoch": 0.17622162370142364, + "flos": 603471037440.0, + "grad_norm": 0.044830704586910027, + "language_loss": 0.94022703, + "learning_rate": 0.0009449787608278015, + "loss": 0.95096982, + "num_input_tokens_seen": 75893984, + "router_z_loss_mlp": 0.52441406, + "step": 916, + "time_per_iteration": 2.731264114379883 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062837, + "balance_loss_mlp": 1.0104804, + "epoch": 0.1764140053866872, + "flos": 443606279424.0, + "grad_norm": 0.0370205772569368, + "language_loss": 0.92972034, + "learning_rate": 0.0009448365976354704, + "loss": 0.94034874, + "num_input_tokens_seen": 75958944, + "router_z_loss_mlp": 0.52441406, + "step": 917, + "time_per_iteration": 2.478041648864746 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073342, + "balance_loss_mlp": 1.0204134, + "epoch": 0.17660638707195075, + "flos": 501592224768.0, + "grad_norm": 0.047363321454448416, + "language_loss": 0.907022, + "learning_rate": 0.0009446942617422558, + "loss": 0.91775542, + "num_input_tokens_seen": 76024240, + "router_z_loss_mlp": 0.53027344, + "step": 918, + "time_per_iteration": 2.5698564052581787 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060825, + "balance_loss_mlp": 1.00789583, + "epoch": 0.17679876875721431, + "flos": 539984145408.0, + "grad_norm": 0.03732253291641402, + "language_loss": 0.86447889, + "learning_rate": 0.0009445517532034176, + "loss": 0.87508708, + "num_input_tokens_seen": 76095264, + "router_z_loss_mlp": 0.53027344, + "step": 919, + "time_per_iteration": 2.6916563510894775 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062144, + "balance_loss_mlp": 1.00926292, + "epoch": 0.17699115044247787, + "flos": 498715868160.0, + "grad_norm": 0.04444616550081301, + "language_loss": 0.8994987, + "learning_rate": 0.0009444090720742824, + "loss": 0.91012013, + "num_input_tokens_seen": 76163520, + "router_z_loss_mlp": 0.52978516, + "step": 920, + "time_per_iteration": 2.5798380374908447 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069798, + "balance_loss_mlp": 1.01706016, + "epoch": 0.17718353212774143, + "flos": 663916322304.0, + "grad_norm": 0.04662040468857239, + "language_loss": 0.89399016, + "learning_rate": 0.0009442662184102439, + "loss": 0.90468818, + "num_input_tokens_seen": 76233760, + "router_z_loss_mlp": 0.52832031, + "step": 921, + "time_per_iteration": 2.755929708480835 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064095, + "balance_loss_mlp": 1.01164341, + "epoch": 0.177375913813005, + "flos": 583848822528.0, + "grad_norm": 0.03479566109485236, + "language_loss": 0.88455689, + "learning_rate": 0.000944123192266763, + "loss": 0.89519787, + "num_input_tokens_seen": 76310704, + "router_z_loss_mlp": 0.52539062, + "step": 922, + "time_per_iteration": 2.8776824474334717 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062973, + "balance_loss_mlp": 1.00980616, + "epoch": 0.17756829549826855, + "flos": 553684505856.0, + "grad_norm": 0.036018663808135676, + "language_loss": 0.84559548, + "learning_rate": 0.0009439799936993671, + "loss": 0.85622525, + "num_input_tokens_seen": 76386992, + "router_z_loss_mlp": 0.53271484, + "step": 923, + "time_per_iteration": 2.708897113800049 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063296, + "balance_loss_mlp": 1.01041508, + "epoch": 0.17776067718353214, + "flos": 557372542464.0, + "grad_norm": 0.06706828820902193, + "language_loss": 0.89721078, + "learning_rate": 0.0009438366227636511, + "loss": 0.90784371, + "num_input_tokens_seen": 76453328, + "router_z_loss_mlp": 0.52978516, + "step": 924, + "time_per_iteration": 2.6524295806884766 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062711, + "balance_loss_mlp": 1.01035416, + "epoch": 0.1779530588687957, + "flos": 659652820992.0, + "grad_norm": 0.03503923634288643, + "language_loss": 0.87549317, + "learning_rate": 0.0009436930795152763, + "loss": 0.8861202, + "num_input_tokens_seen": 76529040, + "router_z_loss_mlp": 0.52441406, + "step": 925, + "time_per_iteration": 2.8627374172210693 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070161, + "balance_loss_mlp": 1.01823378, + "epoch": 0.17814544055405926, + "flos": 645672503808.0, + "grad_norm": 0.03989967380061369, + "language_loss": 0.87815237, + "learning_rate": 0.0009435493640099713, + "loss": 0.88885403, + "num_input_tokens_seen": 76604080, + "router_z_loss_mlp": 0.52001953, + "step": 926, + "time_per_iteration": 2.7886180877685547 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065048, + "balance_loss_mlp": 1.01283479, + "epoch": 0.17833782223932282, + "flos": 461885091072.0, + "grad_norm": 0.040977111340993126, + "language_loss": 0.85709256, + "learning_rate": 0.0009434054763035314, + "loss": 0.86774307, + "num_input_tokens_seen": 76674096, + "router_z_loss_mlp": 0.52294922, + "step": 927, + "time_per_iteration": 2.635576009750366 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010578, + "balance_loss_mlp": 1.00520515, + "epoch": 0.17853020392458638, + "flos": 760854101760.0, + "grad_norm": 0.029435711646972902, + "language_loss": 0.86359227, + "learning_rate": 0.0009432614164518185, + "loss": 0.8741703, + "num_input_tokens_seen": 76752144, + "router_z_loss_mlp": 0.52685547, + "step": 928, + "time_per_iteration": 2.945253849029541 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074963, + "balance_loss_mlp": 1.02203369, + "epoch": 0.17872258560984994, + "flos": 784056450048.0, + "grad_norm": 0.039066121455708196, + "language_loss": 0.84876156, + "learning_rate": 0.000943117184510762, + "loss": 0.85951114, + "num_input_tokens_seen": 76830240, + "router_z_loss_mlp": 0.53027344, + "step": 929, + "time_per_iteration": 3.0016870498657227 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092369, + "balance_loss_mlp": 1.04201508, + "epoch": 0.1789149672951135, + "flos": 1463034021120.0, + "grad_norm": 0.03241390760866092, + "language_loss": 0.78789961, + "learning_rate": 0.0009429727805363575, + "loss": 0.79882336, + "num_input_tokens_seen": 77062464, + "router_z_loss_mlp": 0.50390625, + "step": 930, + "time_per_iteration": 5.0408923625946045 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091695, + "balance_loss_mlp": 1.04005396, + "epoch": 0.17910734898037706, + "flos": 504931285248.0, + "grad_norm": 0.037670754636037675, + "language_loss": 0.90276599, + "learning_rate": 0.0009428282045846674, + "loss": 0.91368294, + "num_input_tokens_seen": 77136672, + "router_z_loss_mlp": 0.51708984, + "step": 931, + "time_per_iteration": 2.699357509613037 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093914, + "balance_loss_mlp": 1.04260671, + "epoch": 0.17929973066564064, + "flos": 747670880256.0, + "grad_norm": 0.03557447538434831, + "language_loss": 0.91468316, + "learning_rate": 0.0009426834567118214, + "loss": 0.92562228, + "num_input_tokens_seen": 77227040, + "router_z_loss_mlp": 0.51367188, + "step": 932, + "time_per_iteration": 3.0888116359710693 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095199, + "balance_loss_mlp": 1.04370034, + "epoch": 0.1794921123509042, + "flos": 714573826560.0, + "grad_norm": 0.03713873812168088, + "language_loss": 0.82311261, + "learning_rate": 0.0009425385369740155, + "loss": 0.8340646, + "num_input_tokens_seen": 77319392, + "router_z_loss_mlp": 0.515625, + "step": 933, + "time_per_iteration": 3.0156304836273193 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109606, + "balance_loss_mlp": 1.04465711, + "epoch": 0.17968449403616776, + "flos": 634362463488.0, + "grad_norm": 0.04581160448205157, + "language_loss": 0.89044029, + "learning_rate": 0.0009423934454275125, + "loss": 0.90140092, + "num_input_tokens_seen": 77394688, + "router_z_loss_mlp": 0.51464844, + "step": 934, + "time_per_iteration": 2.8524558544158936 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095874, + "balance_loss_mlp": 1.04428041, + "epoch": 0.17987687572143132, + "flos": 537378997248.0, + "grad_norm": 0.045982575553228676, + "language_loss": 0.93734717, + "learning_rate": 0.0009422481821286418, + "loss": 0.94830596, + "num_input_tokens_seen": 77468288, + "router_z_loss_mlp": 0.51660156, + "step": 935, + "time_per_iteration": 2.7354249954223633 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096416, + "balance_loss_mlp": 1.0448221, + "epoch": 0.18006925740669488, + "flos": 539119975680.0, + "grad_norm": 0.04748543050697339, + "language_loss": 0.89948702, + "learning_rate": 0.0009421027471337998, + "loss": 0.91045117, + "num_input_tokens_seen": 77535840, + "router_z_loss_mlp": 0.51660156, + "step": 936, + "time_per_iteration": 2.660287380218506 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095184, + "balance_loss_mlp": 1.04363835, + "epoch": 0.18026163909195844, + "flos": 540535310592.0, + "grad_norm": 0.04911488628490749, + "language_loss": 0.84066534, + "learning_rate": 0.0009419571404994493, + "loss": 0.8516171, + "num_input_tokens_seen": 77604000, + "router_z_loss_mlp": 0.51611328, + "step": 937, + "time_per_iteration": 2.624769687652588 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090343, + "balance_loss_mlp": 1.03865409, + "epoch": 0.180454020777222, + "flos": 501683598336.0, + "grad_norm": 0.0468107226861285, + "language_loss": 0.92304778, + "learning_rate": 0.00094181136228212, + "loss": 0.9339512, + "num_input_tokens_seen": 77671488, + "router_z_loss_mlp": 0.51757812, + "step": 938, + "time_per_iteration": 2.6784133911132812 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092284, + "balance_loss_mlp": 1.04069054, + "epoch": 0.18064640246248556, + "flos": 500007748608.0, + "grad_norm": 0.039466745711782485, + "language_loss": 0.87082231, + "learning_rate": 0.0009416654125384077, + "loss": 0.8817451, + "num_input_tokens_seen": 77746240, + "router_z_loss_mlp": 0.51660156, + "step": 939, + "time_per_iteration": 2.7231576442718506 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081085, + "balance_loss_mlp": 1.03034973, + "epoch": 0.18083878414774912, + "flos": 1522293383424.0, + "grad_norm": 0.016406546431804496, + "language_loss": 0.79772377, + "learning_rate": 0.0009415192913249752, + "loss": 0.80853462, + "num_input_tokens_seen": 77966080, + "router_z_loss_mlp": 0.5078125, + "step": 940, + "time_per_iteration": 4.919930934906006 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01329403, + "balance_loss_mlp": 1.27490067, + "epoch": 0.1810311658330127, + "flos": 728666904576.0, + "grad_norm": 0.12503564718566265, + "language_loss": 0.85519916, + "learning_rate": 0.000941372998698552, + "loss": 0.8684932, + "num_input_tokens_seen": 78049200, + "router_z_loss_mlp": 0.54638672, + "step": 941, + "time_per_iteration": 2.9731380939483643 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093385, + "balance_loss_mlp": 1.04121876, + "epoch": 0.18122354751827627, + "flos": 566045353728.0, + "grad_norm": 0.05253753965114479, + "language_loss": 0.83319217, + "learning_rate": 0.0009412265347159336, + "loss": 0.84412599, + "num_input_tokens_seen": 78122752, + "router_z_loss_mlp": 0.52246094, + "step": 942, + "time_per_iteration": 2.7150988578796387 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103842, + "balance_loss_mlp": 1.05162799, + "epoch": 0.18141592920353983, + "flos": 520318189056.0, + "grad_norm": 0.046885904923641086, + "language_loss": 0.86687338, + "learning_rate": 0.0009410798994339829, + "loss": 0.87791175, + "num_input_tokens_seen": 78194064, + "router_z_loss_mlp": 0.52294922, + "step": 943, + "time_per_iteration": 2.598576545715332 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111341, + "balance_loss_mlp": 1.05831623, + "epoch": 0.1816083108888034, + "flos": 513477729792.0, + "grad_norm": 0.04639702407841738, + "language_loss": 0.8991158, + "learning_rate": 0.000940933092909628, + "loss": 0.91022921, + "num_input_tokens_seen": 78262048, + "router_z_loss_mlp": 0.53125, + "step": 944, + "time_per_iteration": 2.611694574356079 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104212, + "balance_loss_mlp": 1.05109203, + "epoch": 0.18180069257406695, + "flos": 493373369088.0, + "grad_norm": 0.04493061679832577, + "language_loss": 0.85416293, + "learning_rate": 0.0009407861151998649, + "loss": 0.86520505, + "num_input_tokens_seen": 78330624, + "router_z_loss_mlp": 0.53222656, + "step": 945, + "time_per_iteration": 2.5710983276367188 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110023, + "balance_loss_mlp": 1.04692006, + "epoch": 0.1819930742593305, + "flos": 571231350528.0, + "grad_norm": 0.04259629183686275, + "language_loss": 0.87787771, + "learning_rate": 0.0009406389663617552, + "loss": 0.88888001, + "num_input_tokens_seen": 78400672, + "router_z_loss_mlp": 0.53417969, + "step": 946, + "time_per_iteration": 2.6741456985473633 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100116, + "balance_loss_mlp": 1.04661465, + "epoch": 0.18218545594459407, + "flos": 607111441920.0, + "grad_norm": 0.04866460503106345, + "language_loss": 0.87927794, + "learning_rate": 0.000940491646452427, + "loss": 0.89027911, + "num_input_tokens_seen": 78467952, + "router_z_loss_mlp": 0.53613281, + "step": 947, + "time_per_iteration": 2.718358278274536 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101327, + "balance_loss_mlp": 1.04753995, + "epoch": 0.18237783762985763, + "flos": 549739845120.0, + "grad_norm": 0.042994543525894185, + "language_loss": 0.92601323, + "learning_rate": 0.000940344155529075, + "loss": 0.93702656, + "num_input_tokens_seen": 78538928, + "router_z_loss_mlp": 0.5390625, + "step": 948, + "time_per_iteration": 2.624303102493286 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097087, + "balance_loss_mlp": 1.04325247, + "epoch": 0.1825702193151212, + "flos": 451675435776.0, + "grad_norm": 0.046415524987670945, + "language_loss": 0.89178842, + "learning_rate": 0.0009401964936489605, + "loss": 0.90275931, + "num_input_tokens_seen": 78602144, + "router_z_loss_mlp": 0.53955078, + "step": 949, + "time_per_iteration": 2.5104119777679443 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088983, + "balance_loss_mlp": 1.03524303, + "epoch": 0.18276260100038477, + "flos": 590385025536.0, + "grad_norm": 0.0430347708706334, + "language_loss": 0.86972219, + "learning_rate": 0.0009400486608694108, + "loss": 0.88061202, + "num_input_tokens_seen": 78673152, + "router_z_loss_mlp": 0.53857422, + "step": 950, + "time_per_iteration": 2.744044065475464 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085663, + "balance_loss_mlp": 1.03154159, + "epoch": 0.18295498268564833, + "flos": 788710723584.0, + "grad_norm": 0.040810758702646055, + "language_loss": 0.88588369, + "learning_rate": 0.0009399006572478195, + "loss": 0.89674032, + "num_input_tokens_seen": 78753872, + "router_z_loss_mlp": 0.54248047, + "step": 951, + "time_per_iteration": 3.0828475952148438 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079439, + "balance_loss_mlp": 1.02493632, + "epoch": 0.1831473643709119, + "flos": 579226629888.0, + "grad_norm": 0.03747434947067488, + "language_loss": 0.92113942, + "learning_rate": 0.0009397524828416468, + "loss": 0.93193376, + "num_input_tokens_seen": 78822640, + "router_z_loss_mlp": 0.54638672, + "step": 952, + "time_per_iteration": 2.6881086826324463 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089843, + "balance_loss_mlp": 1.03405273, + "epoch": 0.18333974605617545, + "flos": 567964221696.0, + "grad_norm": 0.0419825959367211, + "language_loss": 0.97306633, + "learning_rate": 0.0009396041377084192, + "loss": 0.9839648, + "num_input_tokens_seen": 78893792, + "router_z_loss_mlp": 0.55957031, + "step": 953, + "time_per_iteration": 2.673654556274414 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097804, + "balance_loss_mlp": 1.04191864, + "epoch": 0.183532127741439, + "flos": 528070450176.0, + "grad_norm": 0.04203850234568462, + "language_loss": 0.89016271, + "learning_rate": 0.0009394556219057295, + "loss": 0.90114069, + "num_input_tokens_seen": 78964752, + "router_z_loss_mlp": 0.56054688, + "step": 954, + "time_per_iteration": 2.7255043983459473 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107099, + "balance_loss_mlp": 1.01610565, + "epoch": 0.18372450942670257, + "flos": 595644899328.0, + "grad_norm": 0.03789415730727427, + "language_loss": 0.84751296, + "learning_rate": 0.0009393069354912362, + "loss": 0.85822284, + "num_input_tokens_seen": 79034400, + "router_z_loss_mlp": 0.55029297, + "step": 955, + "time_per_iteration": 2.7474210262298584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084139, + "balance_loss_mlp": 1.02963698, + "epoch": 0.18391689111196613, + "flos": 646284907008.0, + "grad_norm": 0.04389714766773939, + "language_loss": 0.83882308, + "learning_rate": 0.0009391580785226649, + "loss": 0.84966445, + "num_input_tokens_seen": 79109488, + "router_z_loss_mlp": 0.54638672, + "step": 956, + "time_per_iteration": 2.844409465789795 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081024, + "balance_loss_mlp": 1.02990723, + "epoch": 0.18410927279722972, + "flos": 1460394846720.0, + "grad_norm": 0.013082177800516761, + "language_loss": 0.79340446, + "learning_rate": 0.0009390090510578067, + "loss": 0.80421472, + "num_input_tokens_seen": 79327712, + "router_z_loss_mlp": 0.51171875, + "step": 957, + "time_per_iteration": 4.792405843734741 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084483, + "balance_loss_mlp": 1.030267, + "epoch": 0.18430165448249328, + "flos": 660004709376.0, + "grad_norm": 0.04089111102732722, + "language_loss": 0.88231802, + "learning_rate": 0.0009388598531545196, + "loss": 0.89316285, + "num_input_tokens_seen": 79401504, + "router_z_loss_mlp": 0.54345703, + "step": 958, + "time_per_iteration": 2.900062084197998 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084489, + "balance_loss_mlp": 1.03017747, + "epoch": 0.18449403616775684, + "flos": 518950486272.0, + "grad_norm": 0.045948437313162956, + "language_loss": 0.87467843, + "learning_rate": 0.000938710484870727, + "loss": 0.88552332, + "num_input_tokens_seen": 79466688, + "router_z_loss_mlp": 0.54443359, + "step": 959, + "time_per_iteration": 2.5785140991210938 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085738, + "balance_loss_mlp": 1.031569, + "epoch": 0.1846864178530204, + "flos": 553825456896.0, + "grad_norm": 0.04362127254920589, + "language_loss": 0.87369549, + "learning_rate": 0.0009385609462644189, + "loss": 0.88455284, + "num_input_tokens_seen": 79540288, + "router_z_loss_mlp": 0.54296875, + "step": 960, + "time_per_iteration": 2.686221122741699 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082294, + "balance_loss_mlp": 1.02774417, + "epoch": 0.18487879953828396, + "flos": 467116774656.0, + "grad_norm": 0.04468558895083242, + "language_loss": 0.86931455, + "learning_rate": 0.0009384112373936514, + "loss": 0.88013744, + "num_input_tokens_seen": 79611872, + "router_z_loss_mlp": 0.546875, + "step": 961, + "time_per_iteration": 2.633582830429077 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064427, + "balance_loss_mlp": 1.00935197, + "epoch": 0.18507118122354752, + "flos": 649684238592.0, + "grad_norm": 0.03687654302408078, + "language_loss": 0.9259429, + "learning_rate": 0.0009382613583165467, + "loss": 0.93658715, + "num_input_tokens_seen": 79689504, + "router_z_loss_mlp": 0.55224609, + "step": 962, + "time_per_iteration": 2.7910635471343994 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01458915, + "balance_loss_mlp": 1.40078855, + "epoch": 0.18526356290881107, + "flos": 627923470080.0, + "grad_norm": 0.09306974449566385, + "language_loss": 0.90611041, + "learning_rate": 0.0009381113090912928, + "loss": 0.92069954, + "num_input_tokens_seen": 79759264, + "router_z_loss_mlp": 0.57958984, + "step": 963, + "time_per_iteration": 2.7445125579833984 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078714, + "balance_loss_mlp": 1.02464056, + "epoch": 0.18545594459407463, + "flos": 433646445312.0, + "grad_norm": 0.04076594680163087, + "language_loss": 0.91471934, + "learning_rate": 0.000937961089776144, + "loss": 0.92550647, + "num_input_tokens_seen": 79824464, + "router_z_loss_mlp": 0.54199219, + "step": 964, + "time_per_iteration": 2.5835955142974854 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089429, + "balance_loss_mlp": 1.03607059, + "epoch": 0.1856483262793382, + "flos": 750427673088.0, + "grad_norm": 0.041116434601540804, + "language_loss": 0.8449949, + "learning_rate": 0.0009378107004294208, + "loss": 0.8558892, + "num_input_tokens_seen": 79907152, + "router_z_loss_mlp": 0.53466797, + "step": 965, + "time_per_iteration": 2.9773664474487305 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090833, + "balance_loss_mlp": 1.03790379, + "epoch": 0.18584070796460178, + "flos": 531402707712.0, + "grad_norm": 0.04029010126422192, + "language_loss": 0.93043375, + "learning_rate": 0.0009376601411095096, + "loss": 0.94134206, + "num_input_tokens_seen": 79976944, + "router_z_loss_mlp": 0.53027344, + "step": 966, + "time_per_iteration": 2.6703643798828125 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088702, + "balance_loss_mlp": 1.03639269, + "epoch": 0.18603308964986534, + "flos": 484084263936.0, + "grad_norm": 0.03934020689435504, + "language_loss": 0.87718618, + "learning_rate": 0.0009375094118748622, + "loss": 0.88807321, + "num_input_tokens_seen": 80042112, + "router_z_loss_mlp": 0.52392578, + "step": 967, + "time_per_iteration": 2.5719969272613525 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091813, + "balance_loss_mlp": 1.03974187, + "epoch": 0.1862254713351289, + "flos": 802682292480.0, + "grad_norm": 0.042176858736630414, + "language_loss": 0.92643285, + "learning_rate": 0.0009373585127839976, + "loss": 0.93735105, + "num_input_tokens_seen": 80118896, + "router_z_loss_mlp": 0.52148438, + "step": 968, + "time_per_iteration": 2.956153392791748 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096332, + "balance_loss_mlp": 1.04483318, + "epoch": 0.18641785302039246, + "flos": 479290984704.0, + "grad_norm": 0.04307464179422831, + "language_loss": 0.92206955, + "learning_rate": 0.0009372074438954994, + "loss": 0.93303293, + "num_input_tokens_seen": 80183360, + "router_z_loss_mlp": 0.515625, + "step": 969, + "time_per_iteration": 2.512662410736084 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092255, + "balance_loss_mlp": 1.04085171, + "epoch": 0.18661023470565602, + "flos": 389779822848.0, + "grad_norm": 0.044792080488554424, + "language_loss": 0.93312657, + "learning_rate": 0.0009370562052680181, + "loss": 0.94404912, + "num_input_tokens_seen": 80247024, + "router_z_loss_mlp": 0.51464844, + "step": 970, + "time_per_iteration": 2.4642274379730225 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109029, + "balance_loss_mlp": 1.03926873, + "epoch": 0.18680261639091958, + "flos": 565776090624.0, + "grad_norm": 0.03666794569701081, + "language_loss": 0.90593827, + "learning_rate": 0.0009369047969602695, + "loss": 0.91684115, + "num_input_tokens_seen": 80318256, + "router_z_loss_mlp": 0.51074219, + "step": 971, + "time_per_iteration": 2.6925313472747803 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090519, + "balance_loss_mlp": 1.03968859, + "epoch": 0.18699499807618314, + "flos": 480230976768.0, + "grad_norm": 0.04959033368050126, + "language_loss": 0.88274431, + "learning_rate": 0.0009367532190310357, + "loss": 0.89364946, + "num_input_tokens_seen": 80384848, + "router_z_loss_mlp": 0.50878906, + "step": 972, + "time_per_iteration": 2.5632824897766113 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095286, + "balance_loss_mlp": 1.04464579, + "epoch": 0.1871873797614467, + "flos": 554328989952.0, + "grad_norm": 0.047101191533600484, + "language_loss": 0.90956879, + "learning_rate": 0.0009366014715391644, + "loss": 0.92052168, + "num_input_tokens_seen": 80453088, + "router_z_loss_mlp": 0.50683594, + "step": 973, + "time_per_iteration": 2.6131792068481445 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087331, + "balance_loss_mlp": 1.03669059, + "epoch": 0.18737976144671029, + "flos": 553953768960.0, + "grad_norm": 0.03277863870695053, + "language_loss": 0.85193431, + "learning_rate": 0.0009364495545435693, + "loss": 0.86280763, + "num_input_tokens_seen": 80528608, + "router_z_loss_mlp": 0.50683594, + "step": 974, + "time_per_iteration": 2.768160820007324 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077828, + "balance_loss_mlp": 1.02647221, + "epoch": 0.18757214313197385, + "flos": 503248632576.0, + "grad_norm": 0.03709252074476072, + "language_loss": 0.90046728, + "learning_rate": 0.0009362974681032297, + "loss": 0.91124547, + "num_input_tokens_seen": 80599600, + "router_z_loss_mlp": 0.51416016, + "step": 975, + "time_per_iteration": 2.596752405166626 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01358762, + "balance_loss_mlp": 1.30464137, + "epoch": 0.1877645248172374, + "flos": 676292721408.0, + "grad_norm": 0.11355211768831018, + "language_loss": 0.89691889, + "learning_rate": 0.0009361452122771907, + "loss": 0.91050649, + "num_input_tokens_seen": 80677264, + "router_z_loss_mlp": 0.54248047, + "step": 976, + "time_per_iteration": 2.841670036315918 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087707, + "balance_loss_mlp": 1.03649426, + "epoch": 0.18795690650250096, + "flos": 405863700480.0, + "grad_norm": 0.05182073733860081, + "language_loss": 0.85757113, + "learning_rate": 0.0009359927871245635, + "loss": 0.86844826, + "num_input_tokens_seen": 80739776, + "router_z_loss_mlp": 0.51269531, + "step": 977, + "time_per_iteration": 2.4593758583068848 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110124, + "balance_loss_mlp": 1.04988456, + "epoch": 0.18814928818776452, + "flos": 639064369152.0, + "grad_norm": 0.04599902588150218, + "language_loss": 0.8843354, + "learning_rate": 0.0009358401927045246, + "loss": 0.89534783, + "num_input_tokens_seen": 80815200, + "router_z_loss_mlp": 0.51416016, + "step": 978, + "time_per_iteration": 2.8043553829193115 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103516, + "balance_loss_mlp": 1.05197036, + "epoch": 0.18834166987302808, + "flos": 1140117100800.0, + "grad_norm": 0.05109113713971293, + "language_loss": 0.89583617, + "learning_rate": 0.0009356874290763166, + "loss": 0.90687132, + "num_input_tokens_seen": 80905024, + "router_z_loss_mlp": 0.51611328, + "step": 979, + "time_per_iteration": 3.4783685207366943 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105346, + "balance_loss_mlp": 1.0536567, + "epoch": 0.18853405155829164, + "flos": 505816842240.0, + "grad_norm": 0.03906189308485337, + "language_loss": 0.90395761, + "learning_rate": 0.0009355344962992474, + "loss": 0.91501105, + "num_input_tokens_seen": 80976704, + "router_z_loss_mlp": 0.51757812, + "step": 980, + "time_per_iteration": 2.6457359790802 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103049, + "balance_loss_mlp": 1.05116904, + "epoch": 0.1887264332435552, + "flos": 609371504640.0, + "grad_norm": 0.038270487176229884, + "language_loss": 0.89782834, + "learning_rate": 0.0009353813944326908, + "loss": 0.9088589, + "num_input_tokens_seen": 81057152, + "router_z_loss_mlp": 0.51953125, + "step": 981, + "time_per_iteration": 2.923243761062622 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102538, + "balance_loss_mlp": 1.05070543, + "epoch": 0.1889188149288188, + "flos": 553593132288.0, + "grad_norm": 0.04212053297292714, + "language_loss": 0.84181225, + "learning_rate": 0.0009352281235360863, + "loss": 0.85283768, + "num_input_tokens_seen": 81131520, + "router_z_loss_mlp": 0.51904297, + "step": 982, + "time_per_iteration": 2.674790620803833 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103043, + "balance_loss_mlp": 1.05135345, + "epoch": 0.18911119661408235, + "flos": 419470742016.0, + "grad_norm": 0.03892833341753514, + "language_loss": 0.86323905, + "learning_rate": 0.0009350746836689389, + "loss": 0.87426949, + "num_input_tokens_seen": 81195952, + "router_z_loss_mlp": 0.51757812, + "step": 983, + "time_per_iteration": 2.5294649600982666 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103523, + "balance_loss_mlp": 1.05335999, + "epoch": 0.1893035782993459, + "flos": 1485320676864.0, + "grad_norm": 0.016207020064155576, + "language_loss": 0.81439221, + "learning_rate": 0.0009349210748908193, + "loss": 0.82542741, + "num_input_tokens_seen": 81427312, + "router_z_loss_mlp": 0.50195312, + "step": 984, + "time_per_iteration": 5.031845569610596 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094201, + "balance_loss_mlp": 1.04227316, + "epoch": 0.18949595998460947, + "flos": 509457246720.0, + "grad_norm": 0.045438139941342374, + "language_loss": 0.84563899, + "learning_rate": 0.0009347672972613634, + "loss": 0.85658097, + "num_input_tokens_seen": 81494256, + "router_z_loss_mlp": 0.52001953, + "step": 985, + "time_per_iteration": 2.6333274841308594 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090244, + "balance_loss_mlp": 1.0384593, + "epoch": 0.18968834166987303, + "flos": 532193000448.0, + "grad_norm": 0.03993027053802703, + "language_loss": 0.8704083, + "learning_rate": 0.0009346133508402735, + "loss": 0.8813107, + "num_input_tokens_seen": 81569312, + "router_z_loss_mlp": 0.51855469, + "step": 986, + "time_per_iteration": 2.751340389251709 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089761, + "balance_loss_mlp": 1.03797686, + "epoch": 0.1898807233551366, + "flos": 500754299904.0, + "grad_norm": 0.04595906606263721, + "language_loss": 0.85852754, + "learning_rate": 0.0009344592356873166, + "loss": 0.86942512, + "num_input_tokens_seen": 81637024, + "router_z_loss_mlp": 0.51855469, + "step": 987, + "time_per_iteration": 2.6785645484924316 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084349, + "balance_loss_mlp": 1.03223073, + "epoch": 0.19007310504040015, + "flos": 603360221952.0, + "grad_norm": 0.042275439246703725, + "language_loss": 0.79788595, + "learning_rate": 0.0009343049518623255, + "loss": 0.80872947, + "num_input_tokens_seen": 81709488, + "router_z_loss_mlp": 0.52197266, + "step": 988, + "time_per_iteration": 2.709439516067505 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01365061, + "balance_loss_mlp": 1.30979574, + "epoch": 0.1902654867256637, + "flos": 602765315328.0, + "grad_norm": 0.1049262798815586, + "language_loss": 0.8386007, + "learning_rate": 0.0009341504994251985, + "loss": 0.85225129, + "num_input_tokens_seen": 81787152, + "router_z_loss_mlp": 0.55419922, + "step": 989, + "time_per_iteration": 2.925954818725586 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089684, + "balance_loss_mlp": 1.03952026, + "epoch": 0.19045786841092727, + "flos": 1579234345728.0, + "grad_norm": 0.01847097645999908, + "language_loss": 0.73520499, + "learning_rate": 0.0009339958784358994, + "loss": 0.74610186, + "num_input_tokens_seen": 82030608, + "router_z_loss_mlp": 0.50195312, + "step": 990, + "time_per_iteration": 5.025054216384888 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101957, + "balance_loss_mlp": 1.04845631, + "epoch": 0.19065025009619085, + "flos": 683055412992.0, + "grad_norm": 0.039739471389523856, + "language_loss": 0.8281374, + "learning_rate": 0.0009338410889544574, + "loss": 0.83915699, + "num_input_tokens_seen": 82119872, + "router_z_loss_mlp": 0.53613281, + "step": 991, + "time_per_iteration": 3.0653748512268066 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112616, + "balance_loss_mlp": 1.05868626, + "epoch": 0.1908426317814544, + "flos": 603442847232.0, + "grad_norm": 0.04383499470371995, + "language_loss": 0.89543211, + "learning_rate": 0.000933686131040967, + "loss": 0.90655828, + "num_input_tokens_seen": 82195552, + "router_z_loss_mlp": 0.54052734, + "step": 992, + "time_per_iteration": 2.7901530265808105 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106006, + "balance_loss_mlp": 1.0517416, + "epoch": 0.19103501346671797, + "flos": 587434791936.0, + "grad_norm": 0.04122735235002176, + "language_loss": 0.92173266, + "learning_rate": 0.0009335310047555883, + "loss": 0.93279278, + "num_input_tokens_seen": 82267040, + "router_z_loss_mlp": 0.54394531, + "step": 993, + "time_per_iteration": 2.7153608798980713 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097163, + "balance_loss_mlp": 1.04285157, + "epoch": 0.19122739515198153, + "flos": 546835298304.0, + "grad_norm": 0.04052898350535971, + "language_loss": 0.89637405, + "learning_rate": 0.0009333757101585467, + "loss": 0.90734565, + "num_input_tokens_seen": 82337680, + "router_z_loss_mlp": 0.54443359, + "step": 994, + "time_per_iteration": 2.6286795139312744 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091826, + "balance_loss_mlp": 1.03732359, + "epoch": 0.1914197768372451, + "flos": 522550061568.0, + "grad_norm": 0.03850908176124289, + "language_loss": 0.94694555, + "learning_rate": 0.0009332202473101329, + "loss": 0.95786381, + "num_input_tokens_seen": 82409600, + "router_z_loss_mlp": 0.54638672, + "step": 995, + "time_per_iteration": 2.649850368499756 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072176, + "balance_loss_mlp": 1.01714945, + "epoch": 0.19161215852250865, + "flos": 612388812288.0, + "grad_norm": 0.03654296504823072, + "language_loss": 0.83743644, + "learning_rate": 0.0009330646162707028, + "loss": 0.84815824, + "num_input_tokens_seen": 82480288, + "router_z_loss_mlp": 0.55175781, + "step": 996, + "time_per_iteration": 2.7329981327056885 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059087, + "balance_loss_mlp": 1.0033443, + "epoch": 0.1918045402077722, + "flos": 848183935488.0, + "grad_norm": 0.03315860340701524, + "language_loss": 0.85236025, + "learning_rate": 0.0009329088171006779, + "loss": 0.8629511, + "num_input_tokens_seen": 82568960, + "router_z_loss_mlp": 0.55908203, + "step": 997, + "time_per_iteration": 3.135049343109131 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01290698, + "balance_loss_mlp": 1.2330482, + "epoch": 0.19199692189303577, + "flos": 466893198336.0, + "grad_norm": 0.06463762674453556, + "language_loss": 0.86239529, + "learning_rate": 0.0009327528498605446, + "loss": 0.87530231, + "num_input_tokens_seen": 82634128, + "router_z_loss_mlp": 0.57470703, + "step": 998, + "time_per_iteration": 2.5807580947875977 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072398, + "balance_loss_mlp": 1.01727533, + "epoch": 0.19218930357829936, + "flos": 532613908224.0, + "grad_norm": 0.04280698068802137, + "language_loss": 0.90856296, + "learning_rate": 0.0009325967146108548, + "loss": 0.91928697, + "num_input_tokens_seen": 82707472, + "router_z_loss_mlp": 0.55273438, + "step": 999, + "time_per_iteration": 2.637840986251831 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086346, + "balance_loss_mlp": 1.03217781, + "epoch": 0.19238168526356292, + "flos": 602728376832.0, + "grad_norm": 0.04847652630230049, + "language_loss": 0.88902158, + "learning_rate": 0.0009324404114122258, + "loss": 0.89988506, + "num_input_tokens_seen": 82775232, + "router_z_loss_mlp": 0.54296875, + "step": 1000, + "time_per_iteration": 4.1391942501068115 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090902, + "balance_loss_mlp": 1.03701913, + "epoch": 0.19257406694882648, + "flos": 573155076096.0, + "grad_norm": 0.04193719314851312, + "language_loss": 0.88362414, + "learning_rate": 0.0009322839403253397, + "loss": 0.89453316, + "num_input_tokens_seen": 82850032, + "router_z_loss_mlp": 0.54003906, + "step": 1001, + "time_per_iteration": 2.8266265392303467 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087834, + "balance_loss_mlp": 1.03395164, + "epoch": 0.19276644863409004, + "flos": 803157635328.0, + "grad_norm": 0.04353601683576214, + "language_loss": 0.85235333, + "learning_rate": 0.0009321273014109439, + "loss": 0.86323166, + "num_input_tokens_seen": 82926080, + "router_z_loss_mlp": 0.54003906, + "step": 1002, + "time_per_iteration": 2.9539175033569336 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094525, + "balance_loss_mlp": 1.04068995, + "epoch": 0.1929588303193536, + "flos": 564480319488.0, + "grad_norm": 0.03718563884895513, + "language_loss": 0.86078906, + "learning_rate": 0.0009319704947298513, + "loss": 0.87173432, + "num_input_tokens_seen": 83005200, + "router_z_loss_mlp": 0.53955078, + "step": 1003, + "time_per_iteration": 2.8760387897491455 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091693, + "balance_loss_mlp": 1.0380007, + "epoch": 0.19315121200461716, + "flos": 627988598784.0, + "grad_norm": 0.03744955738150477, + "language_loss": 0.89579475, + "learning_rate": 0.0009318135203429393, + "loss": 0.9067117, + "num_input_tokens_seen": 83077280, + "router_z_loss_mlp": 0.53808594, + "step": 1004, + "time_per_iteration": 2.7069175243377686 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094654, + "balance_loss_mlp": 1.04058087, + "epoch": 0.19334359368988072, + "flos": 518584013568.0, + "grad_norm": 0.03742742378220975, + "language_loss": 0.89228511, + "learning_rate": 0.0009316563783111511, + "loss": 0.90323162, + "num_input_tokens_seen": 83145456, + "router_z_loss_mlp": 0.54199219, + "step": 1005, + "time_per_iteration": 2.7024500370025635 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090205, + "balance_loss_mlp": 1.03598833, + "epoch": 0.19353597537514428, + "flos": 695400709632.0, + "grad_norm": 0.036019255491177425, + "language_loss": 0.83731771, + "learning_rate": 0.0009314990686954943, + "loss": 0.84821975, + "num_input_tokens_seen": 83225392, + "router_z_loss_mlp": 0.54345703, + "step": 1006, + "time_per_iteration": 2.901319980621338 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092974, + "balance_loss_mlp": 1.03866184, + "epoch": 0.19372835706040784, + "flos": 1212200981760.0, + "grad_norm": 0.03507497873235563, + "language_loss": 0.82359284, + "learning_rate": 0.000931341591557042, + "loss": 0.8345226, + "num_input_tokens_seen": 83331296, + "router_z_loss_mlp": 0.54443359, + "step": 1007, + "time_per_iteration": 3.70509672164917 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088892, + "balance_loss_mlp": 1.03467596, + "epoch": 0.19392073874567142, + "flos": 521685891840.0, + "grad_norm": 0.04354230775215961, + "language_loss": 0.88703787, + "learning_rate": 0.0009311839469569325, + "loss": 0.89792681, + "num_input_tokens_seen": 83399952, + "router_z_loss_mlp": 0.54345703, + "step": 1008, + "time_per_iteration": 2.632070302963257 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088222, + "balance_loss_mlp": 1.03386211, + "epoch": 0.19411312043093498, + "flos": 589911628032.0, + "grad_norm": 0.044503426382111445, + "language_loss": 0.88821465, + "learning_rate": 0.0009310261349563687, + "loss": 0.89909685, + "num_input_tokens_seen": 83468384, + "router_z_loss_mlp": 0.54492188, + "step": 1009, + "time_per_iteration": 2.7138211727142334 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061835, + "balance_loss_mlp": 1.0067606, + "epoch": 0.19430550211619854, + "flos": 580572945408.0, + "grad_norm": 0.029375689409949213, + "language_loss": 0.86173785, + "learning_rate": 0.0009308681556166186, + "loss": 0.87235624, + "num_input_tokens_seen": 83547952, + "router_z_loss_mlp": 0.55224609, + "step": 1010, + "time_per_iteration": 2.834946870803833 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.05162705, + "balance_loss_mlp": 5.08607721, + "epoch": 0.1944978838014621, + "flos": 622246579200.0, + "grad_norm": 0.2884784307389343, + "language_loss": 0.88793403, + "learning_rate": 0.0009307100089990152, + "loss": 0.93956107, + "num_input_tokens_seen": 83615712, + "router_z_loss_mlp": 0.76513672, + "step": 1011, + "time_per_iteration": 2.705335855484009 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094303, + "balance_loss_mlp": 1.04189909, + "epoch": 0.19469026548672566, + "flos": 599815081728.0, + "grad_norm": 0.04633555371791679, + "language_loss": 0.85740912, + "learning_rate": 0.0009305516951649568, + "loss": 0.86835217, + "num_input_tokens_seen": 83687296, + "router_z_loss_mlp": 0.52490234, + "step": 1012, + "time_per_iteration": 2.7048773765563965 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01164213, + "balance_loss_mlp": 1.11281013, + "epoch": 0.19488264717198922, + "flos": 553248046848.0, + "grad_norm": 0.04991787894778298, + "language_loss": 0.87912452, + "learning_rate": 0.0009303932141759057, + "loss": 0.89076668, + "num_input_tokens_seen": 83763168, + "router_z_loss_mlp": 0.51464844, + "step": 1013, + "time_per_iteration": 2.8072102069854736 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01211245, + "balance_loss_mlp": 1.15984225, + "epoch": 0.19507502885725278, + "flos": 667313708544.0, + "grad_norm": 0.06529111316537192, + "language_loss": 0.85445917, + "learning_rate": 0.0009302345660933902, + "loss": 0.86657166, + "num_input_tokens_seen": 83837312, + "router_z_loss_mlp": 0.51464844, + "step": 1014, + "time_per_iteration": 2.7895615100860596 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01244014, + "balance_loss_mlp": 1.19265878, + "epoch": 0.19526741054251634, + "flos": 672328618752.0, + "grad_norm": 0.06071591874537116, + "language_loss": 0.86587232, + "learning_rate": 0.0009300757509790026, + "loss": 0.87831247, + "num_input_tokens_seen": 83917120, + "router_z_loss_mlp": 0.51416016, + "step": 1015, + "time_per_iteration": 2.8867006301879883 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.012313, + "balance_loss_mlp": 1.18008745, + "epoch": 0.19545979222777993, + "flos": 448147792128.0, + "grad_norm": 0.057262662434688416, + "language_loss": 0.91914976, + "learning_rate": 0.0009299167688944005, + "loss": 0.93146276, + "num_input_tokens_seen": 83982992, + "router_z_loss_mlp": 0.51269531, + "step": 1016, + "time_per_iteration": 2.526421546936035 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01226901, + "balance_loss_mlp": 1.17568827, + "epoch": 0.1956521739130435, + "flos": 570169849344.0, + "grad_norm": 0.05343522997619492, + "language_loss": 0.87454194, + "learning_rate": 0.0009297576199013063, + "loss": 0.8868109, + "num_input_tokens_seen": 84057296, + "router_z_loss_mlp": 0.51269531, + "step": 1017, + "time_per_iteration": 2.7184784412384033 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.012071, + "balance_loss_mlp": 1.15884399, + "epoch": 0.19584455559830705, + "flos": 1458883280640.0, + "grad_norm": 0.03399393552013433, + "language_loss": 0.73002136, + "learning_rate": 0.0009295983040615071, + "loss": 0.74209231, + "num_input_tokens_seen": 84292640, + "router_z_loss_mlp": 0.48242188, + "step": 1018, + "time_per_iteration": 4.916393756866455 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01159874, + "balance_loss_mlp": 1.11199951, + "epoch": 0.1960369372835706, + "flos": 1594484189184.0, + "grad_norm": 0.02523442502037962, + "language_loss": 0.79426301, + "learning_rate": 0.0009294388214368547, + "loss": 0.80586171, + "num_input_tokens_seen": 84524448, + "router_z_loss_mlp": 0.47851562, + "step": 1019, + "time_per_iteration": 5.5991902351379395 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01163202, + "balance_loss_mlp": 1.11241901, + "epoch": 0.19622931896883417, + "flos": 617254023168.0, + "grad_norm": 0.06792637193668423, + "language_loss": 0.88615566, + "learning_rate": 0.0009292791720892659, + "loss": 0.89778763, + "num_input_tokens_seen": 84600208, + "router_z_loss_mlp": 0.50830078, + "step": 1020, + "time_per_iteration": 2.8419806957244873 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01132702, + "balance_loss_mlp": 1.08191884, + "epoch": 0.19642170065409773, + "flos": 467208148224.0, + "grad_norm": 0.044541966790476714, + "language_loss": 0.90245676, + "learning_rate": 0.0009291193560807218, + "loss": 0.91378373, + "num_input_tokens_seen": 84668032, + "router_z_loss_mlp": 0.50830078, + "step": 1021, + "time_per_iteration": 2.60357403755188 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0111942, + "balance_loss_mlp": 1.06858945, + "epoch": 0.19661408233936128, + "flos": 516288957696.0, + "grad_norm": 0.03957164107654416, + "language_loss": 0.88134921, + "learning_rate": 0.0009289593734732688, + "loss": 0.89254344, + "num_input_tokens_seen": 84738176, + "router_z_loss_mlp": 0.50878906, + "step": 1022, + "time_per_iteration": 2.6077988147735596 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115639, + "balance_loss_mlp": 1.06461763, + "epoch": 0.19680646402462484, + "flos": 393494104320.0, + "grad_norm": 0.03618938319364158, + "language_loss": 0.94921708, + "learning_rate": 0.0009287992243290175, + "loss": 0.96037352, + "num_input_tokens_seen": 84799936, + "router_z_loss_mlp": 0.51074219, + "step": 1023, + "time_per_iteration": 2.486910820007324 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104277, + "balance_loss_mlp": 1.05263603, + "epoch": 0.19699884570988843, + "flos": 627624071424.0, + "grad_norm": 0.04088238638674664, + "language_loss": 0.91379654, + "learning_rate": 0.0009286389087101435, + "loss": 0.92483938, + "num_input_tokens_seen": 84877216, + "router_z_loss_mlp": 0.51708984, + "step": 1024, + "time_per_iteration": 2.7762300968170166 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083626, + "balance_loss_mlp": 1.03126919, + "epoch": 0.197191227395152, + "flos": 559074637056.0, + "grad_norm": 0.038177798611856564, + "language_loss": 0.89866579, + "learning_rate": 0.0009284784266788864, + "loss": 0.90950203, + "num_input_tokens_seen": 84952464, + "router_z_loss_mlp": 0.52441406, + "step": 1025, + "time_per_iteration": 2.7595441341400146 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105402, + "balance_loss_mlp": 1.05275905, + "epoch": 0.19738360908041555, + "flos": 666250262016.0, + "grad_norm": 0.08120700653890094, + "language_loss": 0.93505025, + "learning_rate": 0.0009283177782975512, + "loss": 0.94610423, + "num_input_tokens_seen": 85031488, + "router_z_loss_mlp": 0.52734375, + "step": 1026, + "time_per_iteration": 2.9439735412597656 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01158523, + "balance_loss_mlp": 1.10511732, + "epoch": 0.1975759907656791, + "flos": 523511440896.0, + "grad_norm": 0.05175943009769999, + "language_loss": 0.89213437, + "learning_rate": 0.000928156963628507, + "loss": 0.9037196, + "num_input_tokens_seen": 85098384, + "router_z_loss_mlp": 0.53515625, + "step": 1027, + "time_per_iteration": 2.5648727416992188 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124606, + "balance_loss_mlp": 1.0717721, + "epoch": 0.19776837245094267, + "flos": 463485118464.0, + "grad_norm": 0.0380471847687272, + "language_loss": 0.89530945, + "learning_rate": 0.0009279959827341877, + "loss": 0.90655547, + "num_input_tokens_seen": 85172944, + "router_z_loss_mlp": 0.52929688, + "step": 1028, + "time_per_iteration": 2.7482099533081055 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01114747, + "balance_loss_mlp": 1.0622474, + "epoch": 0.19796075413620623, + "flos": 504058367232.0, + "grad_norm": 0.038077776452832945, + "language_loss": 0.88821751, + "learning_rate": 0.0009278348356770915, + "loss": 0.89936495, + "num_input_tokens_seen": 85241632, + "router_z_loss_mlp": 0.52587891, + "step": 1029, + "time_per_iteration": 2.5559866428375244 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125843, + "balance_loss_mlp": 1.07362974, + "epoch": 0.1981531358214698, + "flos": 508571689728.0, + "grad_norm": 0.03906482091144459, + "language_loss": 0.87010926, + "learning_rate": 0.0009276735225197814, + "loss": 0.88136768, + "num_input_tokens_seen": 85308992, + "router_z_loss_mlp": 0.52294922, + "step": 1030, + "time_per_iteration": 2.598353862762451 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116206, + "balance_loss_mlp": 1.06418335, + "epoch": 0.19834551750673335, + "flos": 532640153088.0, + "grad_norm": 0.039761606091750314, + "language_loss": 0.8715511, + "learning_rate": 0.0009275120433248847, + "loss": 0.88271314, + "num_input_tokens_seen": 85381936, + "router_z_loss_mlp": 0.52099609, + "step": 1031, + "time_per_iteration": 2.691051483154297 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105688, + "balance_loss_mlp": 1.05414224, + "epoch": 0.1985378991919969, + "flos": 776971027200.0, + "grad_norm": 0.03650424605094363, + "language_loss": 0.87217546, + "learning_rate": 0.0009273503981550931, + "loss": 0.88323236, + "num_input_tokens_seen": 85474352, + "router_z_loss_mlp": 0.51611328, + "step": 1032, + "time_per_iteration": 3.05829119682312 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094626, + "balance_loss_mlp": 1.04336572, + "epoch": 0.1987302808772605, + "flos": 435192037632.0, + "grad_norm": 0.04492232470085823, + "language_loss": 0.88675368, + "learning_rate": 0.0009271885870731626, + "loss": 0.89769995, + "num_input_tokens_seen": 85538416, + "router_z_loss_mlp": 0.51318359, + "step": 1033, + "time_per_iteration": 2.5097644329071045 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091384, + "balance_loss_mlp": 1.04036272, + "epoch": 0.19892266256252406, + "flos": 554654633472.0, + "grad_norm": 0.041410721104386976, + "language_loss": 0.89478087, + "learning_rate": 0.0009270266101419143, + "loss": 0.90569472, + "num_input_tokens_seen": 85604416, + "router_z_loss_mlp": 0.51074219, + "step": 1034, + "time_per_iteration": 2.6359710693359375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091336, + "balance_loss_mlp": 1.04026711, + "epoch": 0.19911504424778761, + "flos": 550949100288.0, + "grad_norm": 0.034987230226667505, + "language_loss": 0.86329561, + "learning_rate": 0.0009268644674242328, + "loss": 0.87420899, + "num_input_tokens_seen": 85677008, + "router_z_loss_mlp": 0.51123047, + "step": 1035, + "time_per_iteration": 2.679041624069214 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091574, + "balance_loss_mlp": 1.04045713, + "epoch": 0.19930742593305117, + "flos": 519313068288.0, + "grad_norm": 0.035495194235479824, + "language_loss": 0.81977046, + "learning_rate": 0.0009267021589830678, + "loss": 0.83068615, + "num_input_tokens_seen": 85745200, + "router_z_loss_mlp": 0.51171875, + "step": 1036, + "time_per_iteration": 2.6109251976013184 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01330025, + "balance_loss_mlp": 1.27871704, + "epoch": 0.19949980761831473, + "flos": 1512640717824.0, + "grad_norm": 0.0530000786951376, + "language_loss": 0.77627081, + "learning_rate": 0.0009265396848814328, + "loss": 0.78957105, + "num_input_tokens_seen": 85980608, + "router_z_loss_mlp": 0.51367188, + "step": 1037, + "time_per_iteration": 5.041083097457886 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097572, + "balance_loss_mlp": 1.04635978, + "epoch": 0.1996921893035783, + "flos": 699440634624.0, + "grad_norm": 0.03827221066614039, + "language_loss": 0.93735194, + "learning_rate": 0.000926377045182406, + "loss": 0.94832766, + "num_input_tokens_seen": 86055952, + "router_z_loss_mlp": 0.51269531, + "step": 1038, + "time_per_iteration": 2.921194314956665 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106072, + "balance_loss_mlp": 1.05443072, + "epoch": 0.19988457098884185, + "flos": 728395696128.0, + "grad_norm": 0.0388450926907903, + "language_loss": 0.89164472, + "learning_rate": 0.0009262142399491296, + "loss": 0.90270543, + "num_input_tokens_seen": 86145536, + "router_z_loss_mlp": 0.51708984, + "step": 1039, + "time_per_iteration": 3.0543293952941895 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102477, + "balance_loss_mlp": 1.05093122, + "epoch": 0.2000769526741054, + "flos": 561625350144.0, + "grad_norm": 0.04341407711707897, + "language_loss": 0.8911137, + "learning_rate": 0.0009260512692448105, + "loss": 0.90213847, + "num_input_tokens_seen": 86214480, + "router_z_loss_mlp": 0.51611328, + "step": 1040, + "time_per_iteration": 2.6906111240386963 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097091, + "balance_loss_mlp": 1.04549766, + "epoch": 0.200269334359369, + "flos": 573165769728.0, + "grad_norm": 0.03433464693573298, + "language_loss": 0.85109496, + "learning_rate": 0.000925888133132719, + "loss": 0.86206591, + "num_input_tokens_seen": 86289824, + "router_z_loss_mlp": 0.51660156, + "step": 1041, + "time_per_iteration": 2.77327561378479 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112812, + "balance_loss_mlp": 1.06465149, + "epoch": 0.20046171604463256, + "flos": 1489155500544.0, + "grad_norm": 0.023433110981570023, + "language_loss": 0.79610431, + "learning_rate": 0.0009257248316761906, + "loss": 0.8072325, + "num_input_tokens_seen": 86516384, + "router_z_loss_mlp": 0.48144531, + "step": 1042, + "time_per_iteration": 4.926042318344116 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116989, + "balance_loss_mlp": 1.06525254, + "epoch": 0.20065409772989612, + "flos": 497578544640.0, + "grad_norm": 0.04254485219096875, + "language_loss": 0.82304472, + "learning_rate": 0.0009255613649386244, + "loss": 0.83421457, + "num_input_tokens_seen": 86587296, + "router_z_loss_mlp": 0.51806641, + "step": 1043, + "time_per_iteration": 2.6593456268310547 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0111366, + "balance_loss_mlp": 1.06144655, + "epoch": 0.20084647941515968, + "flos": 580464075264.0, + "grad_norm": 0.040062947145422745, + "language_loss": 0.79980814, + "learning_rate": 0.0009253977329834838, + "loss": 0.81094474, + "num_input_tokens_seen": 86662656, + "router_z_loss_mlp": 0.52294922, + "step": 1044, + "time_per_iteration": 2.765777111053467 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110762, + "balance_loss_mlp": 1.0584054, + "epoch": 0.20103886110042324, + "flos": 643288986624.0, + "grad_norm": 0.040441822708095716, + "language_loss": 0.87291706, + "learning_rate": 0.0009252339358742965, + "loss": 0.88402474, + "num_input_tokens_seen": 86734704, + "router_z_loss_mlp": 0.52441406, + "step": 1045, + "time_per_iteration": 2.825388193130493 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105894, + "balance_loss_mlp": 1.05353701, + "epoch": 0.2012312427856868, + "flos": 442970543616.0, + "grad_norm": 0.03567593499019723, + "language_loss": 0.84250462, + "learning_rate": 0.000925069973674654, + "loss": 0.85356355, + "num_input_tokens_seen": 86806512, + "router_z_loss_mlp": 0.52441406, + "step": 1046, + "time_per_iteration": 2.609393358230591 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103182, + "balance_loss_mlp": 1.05082524, + "epoch": 0.20142362447095036, + "flos": 555473116416.0, + "grad_norm": 0.03147198417726023, + "language_loss": 0.89562172, + "learning_rate": 0.000924905846448212, + "loss": 0.90665352, + "num_input_tokens_seen": 86883440, + "router_z_loss_mlp": 0.52441406, + "step": 1047, + "time_per_iteration": 2.7771337032318115 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108941, + "balance_loss_mlp": 1.0364331, + "epoch": 0.20161600615621392, + "flos": 671555822592.0, + "grad_norm": 0.0352448826174341, + "language_loss": 0.86282432, + "learning_rate": 0.0009247415542586906, + "loss": 0.87371844, + "num_input_tokens_seen": 86960208, + "router_z_loss_mlp": 0.53076172, + "step": 1048, + "time_per_iteration": 2.8992083072662354 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089273, + "balance_loss_mlp": 1.03624833, + "epoch": 0.2018083878414775, + "flos": 574307950848.0, + "grad_norm": 0.02930747529675645, + "language_loss": 0.83574796, + "learning_rate": 0.0009245770971698735, + "loss": 0.84664071, + "num_input_tokens_seen": 87044144, + "router_z_loss_mlp": 0.53125, + "step": 1049, + "time_per_iteration": 2.890824317932129 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092759, + "balance_loss_mlp": 1.03992498, + "epoch": 0.20200076952674106, + "flos": 426795292416.0, + "grad_norm": 0.03785140598382088, + "language_loss": 0.89288604, + "learning_rate": 0.0009244124752456087, + "loss": 0.9038136, + "num_input_tokens_seen": 87109136, + "router_z_loss_mlp": 0.52929688, + "step": 1050, + "time_per_iteration": 2.5022785663604736 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078262, + "balance_loss_mlp": 1.02566695, + "epoch": 0.20219315121200462, + "flos": 537685198848.0, + "grad_norm": 0.03140637951028952, + "language_loss": 0.86254251, + "learning_rate": 0.0009242476885498081, + "loss": 0.87332511, + "num_input_tokens_seen": 87184320, + "router_z_loss_mlp": 0.52685547, + "step": 1051, + "time_per_iteration": 2.732915163040161 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080259, + "balance_loss_mlp": 1.02771127, + "epoch": 0.20238553289726818, + "flos": 478835083776.0, + "grad_norm": 0.042472274730814934, + "language_loss": 0.82148528, + "learning_rate": 0.0009240827371464474, + "loss": 0.83228779, + "num_input_tokens_seen": 87248224, + "router_z_loss_mlp": 0.52636719, + "step": 1052, + "time_per_iteration": 2.577660322189331 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076456, + "balance_loss_mlp": 1.02448094, + "epoch": 0.20257791458253174, + "flos": 1153847596800.0, + "grad_norm": 0.038862673250338535, + "language_loss": 0.85609984, + "learning_rate": 0.0009239176210995666, + "loss": 0.86686444, + "num_input_tokens_seen": 87333088, + "router_z_loss_mlp": 0.52050781, + "step": 1053, + "time_per_iteration": 3.517408609390259 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076589, + "balance_loss_mlp": 1.02485228, + "epoch": 0.2027702962677953, + "flos": 668149688064.0, + "grad_norm": 0.03591644261584591, + "language_loss": 0.94691521, + "learning_rate": 0.0009237523404732695, + "loss": 0.95768112, + "num_input_tokens_seen": 87413840, + "router_z_loss_mlp": 0.51806641, + "step": 1054, + "time_per_iteration": 2.9073944091796875 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010707, + "balance_loss_mlp": 1.01934481, + "epoch": 0.20296267795305886, + "flos": 642453007104.0, + "grad_norm": 0.03829830750428097, + "language_loss": 0.85043323, + "learning_rate": 0.0009235868953317235, + "loss": 0.86114025, + "num_input_tokens_seen": 87487168, + "router_z_loss_mlp": 0.51416016, + "step": 1055, + "time_per_iteration": 2.8769731521606445 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063995, + "balance_loss_mlp": 1.01249659, + "epoch": 0.20315505963832242, + "flos": 932130967296.0, + "grad_norm": 0.03371739794492534, + "language_loss": 0.86243355, + "learning_rate": 0.0009234212857391602, + "loss": 0.87307346, + "num_input_tokens_seen": 87573184, + "router_z_loss_mlp": 0.515625, + "step": 1056, + "time_per_iteration": 3.1701345443725586 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062013, + "balance_loss_mlp": 1.01075327, + "epoch": 0.20334744132358598, + "flos": 563288560896.0, + "grad_norm": 0.028023058598955305, + "language_loss": 0.9034453, + "learning_rate": 0.000923255511759875, + "loss": 0.91406548, + "num_input_tokens_seen": 87651968, + "router_z_loss_mlp": 0.51318359, + "step": 1057, + "time_per_iteration": 2.8186585903167725 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105938, + "balance_loss_mlp": 1.00840592, + "epoch": 0.20353982300884957, + "flos": 645429485568.0, + "grad_norm": 0.03599363132321351, + "language_loss": 0.85699975, + "learning_rate": 0.000923089573458227, + "loss": 0.86759359, + "num_input_tokens_seen": 87727792, + "router_z_loss_mlp": 0.51025391, + "step": 1058, + "time_per_iteration": 2.829428195953369 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063312, + "balance_loss_mlp": 1.01248097, + "epoch": 0.20373220469411313, + "flos": 652706403840.0, + "grad_norm": 0.03721325608628497, + "language_loss": 0.84890962, + "learning_rate": 0.0009229234708986392, + "loss": 0.85954273, + "num_input_tokens_seen": 87806048, + "router_z_loss_mlp": 0.50878906, + "step": 1059, + "time_per_iteration": 2.9125583171844482 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01119614, + "balance_loss_mlp": 1.06964111, + "epoch": 0.2039245863793767, + "flos": 1440399367680.0, + "grad_norm": 0.026200157549973457, + "language_loss": 0.81666899, + "learning_rate": 0.0009227572041455982, + "loss": 0.82786512, + "num_input_tokens_seen": 88018160, + "router_z_loss_mlp": 0.49902344, + "step": 1060, + "time_per_iteration": 4.70502233505249 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105542, + "balance_loss_mlp": 1.00468493, + "epoch": 0.20411696806464025, + "flos": 598128538368.0, + "grad_norm": 0.03644056871626998, + "language_loss": 0.85909504, + "learning_rate": 0.0009225907732636548, + "loss": 0.86964923, + "num_input_tokens_seen": 88090864, + "router_z_loss_mlp": 0.5078125, + "step": 1061, + "time_per_iteration": 2.7681198120117188 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01057413, + "balance_loss_mlp": 1.00672543, + "epoch": 0.2043093497499038, + "flos": 574897999872.0, + "grad_norm": 0.03243635340085092, + "language_loss": 0.87862682, + "learning_rate": 0.0009224241783174227, + "loss": 0.88920105, + "num_input_tokens_seen": 88161360, + "router_z_loss_mlp": 0.50732422, + "step": 1062, + "time_per_iteration": 2.682659864425659 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01058744, + "balance_loss_mlp": 1.00819898, + "epoch": 0.20450173143516737, + "flos": 631524990720.0, + "grad_norm": 0.033151959510572516, + "language_loss": 0.86810422, + "learning_rate": 0.0009222574193715802, + "loss": 0.87869167, + "num_input_tokens_seen": 88234960, + "router_z_loss_mlp": 0.50585938, + "step": 1063, + "time_per_iteration": 2.7470076084136963 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01057209, + "balance_loss_mlp": 1.00656855, + "epoch": 0.20469411312043093, + "flos": 575147821056.0, + "grad_norm": 0.03442752078644266, + "language_loss": 0.86910367, + "learning_rate": 0.000922090496490869, + "loss": 0.87967575, + "num_input_tokens_seen": 88308176, + "router_z_loss_mlp": 0.50683594, + "step": 1064, + "time_per_iteration": 2.789161443710327 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055147, + "balance_loss_mlp": 1.00465047, + "epoch": 0.20488649480569449, + "flos": 638280879360.0, + "grad_norm": 0.029149473365885022, + "language_loss": 0.90671569, + "learning_rate": 0.0009219234097400937, + "loss": 0.91726714, + "num_input_tokens_seen": 88386768, + "router_z_loss_mlp": 0.50537109, + "step": 1065, + "time_per_iteration": 2.8469130992889404 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055191, + "balance_loss_mlp": 1.00483656, + "epoch": 0.20507887649095807, + "flos": 977439169536.0, + "grad_norm": 0.03225683406068631, + "language_loss": 0.83590472, + "learning_rate": 0.0009217561591841237, + "loss": 0.84645659, + "num_input_tokens_seen": 88476576, + "router_z_loss_mlp": 0.50390625, + "step": 1066, + "time_per_iteration": 3.331498622894287 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105396, + "balance_loss_mlp": 1.00332034, + "epoch": 0.20527125817622163, + "flos": 487156006656.0, + "grad_norm": 0.037421781664849635, + "language_loss": 0.81758374, + "learning_rate": 0.0009215887448878913, + "loss": 0.82812333, + "num_input_tokens_seen": 88541968, + "router_z_loss_mlp": 0.50683594, + "step": 1067, + "time_per_iteration": 2.5782346725463867 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054496, + "balance_loss_mlp": 1.00414193, + "epoch": 0.2054636398614852, + "flos": 528211401216.0, + "grad_norm": 0.031680985043262715, + "language_loss": 0.86063826, + "learning_rate": 0.0009214211669163922, + "loss": 0.87118322, + "num_input_tokens_seen": 88615296, + "router_z_loss_mlp": 0.50390625, + "step": 1068, + "time_per_iteration": 2.689772129058838 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054583, + "balance_loss_mlp": 1.00403798, + "epoch": 0.20565602154674875, + "flos": 559324458240.0, + "grad_norm": 0.03119808154519671, + "language_loss": 0.94868428, + "learning_rate": 0.0009212534253346862, + "loss": 0.95923012, + "num_input_tokens_seen": 88691584, + "router_z_loss_mlp": 0.50585938, + "step": 1069, + "time_per_iteration": 2.760840654373169 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060819, + "balance_loss_mlp": 1.01027393, + "epoch": 0.2058484032320123, + "flos": 505221935616.0, + "grad_norm": 0.042999288209875815, + "language_loss": 0.85068119, + "learning_rate": 0.0009210855202078964, + "loss": 0.86128938, + "num_input_tokens_seen": 88756592, + "router_z_loss_mlp": 0.50585938, + "step": 1070, + "time_per_iteration": 2.6273016929626465 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01057609, + "balance_loss_mlp": 1.00687337, + "epoch": 0.20604078491727587, + "flos": 434047911168.0, + "grad_norm": 0.03672139626538296, + "language_loss": 0.88035965, + "learning_rate": 0.0009209174516012091, + "loss": 0.89093566, + "num_input_tokens_seen": 88820928, + "router_z_loss_mlp": 0.5078125, + "step": 1071, + "time_per_iteration": 2.5263099670410156 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055827, + "balance_loss_mlp": 1.0049957, + "epoch": 0.20623316660253943, + "flos": 609875037696.0, + "grad_norm": 0.03118890610347894, + "language_loss": 0.89938867, + "learning_rate": 0.0009207492195798747, + "loss": 0.90994692, + "num_input_tokens_seen": 88895440, + "router_z_loss_mlp": 0.50878906, + "step": 1072, + "time_per_iteration": 2.773094654083252 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059091, + "balance_loss_mlp": 1.00816524, + "epoch": 0.206425548287803, + "flos": 481394545152.0, + "grad_norm": 0.034846135669383375, + "language_loss": 0.85408926, + "learning_rate": 0.0009205808242092061, + "loss": 0.86468017, + "num_input_tokens_seen": 88964400, + "router_z_loss_mlp": 0.50976562, + "step": 1073, + "time_per_iteration": 2.6704161167144775 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061083, + "balance_loss_mlp": 1.01025188, + "epoch": 0.20661792997306658, + "flos": 951124249344.0, + "grad_norm": 0.036438983488896924, + "language_loss": 0.83303434, + "learning_rate": 0.0009204122655545808, + "loss": 0.84364516, + "num_input_tokens_seen": 89049600, + "router_z_loss_mlp": 0.50878906, + "step": 1074, + "time_per_iteration": 3.3605480194091797 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059315, + "balance_loss_mlp": 1.00857949, + "epoch": 0.20681031165833014, + "flos": 604617109248.0, + "grad_norm": 0.03238632395719984, + "language_loss": 0.81744164, + "learning_rate": 0.0009202435436814388, + "loss": 0.82803476, + "num_input_tokens_seen": 89119024, + "router_z_loss_mlp": 0.5078125, + "step": 1075, + "time_per_iteration": 2.6966288089752197 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106261, + "balance_loss_mlp": 1.01163661, + "epoch": 0.2070026933435937, + "flos": 710266583808.0, + "grad_norm": 0.03297439165012413, + "language_loss": 0.90137285, + "learning_rate": 0.0009200746586552836, + "loss": 0.91199899, + "num_input_tokens_seen": 89197344, + "router_z_loss_mlp": 0.51025391, + "step": 1076, + "time_per_iteration": 2.919851779937744 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01057537, + "balance_loss_mlp": 1.00675428, + "epoch": 0.20719507502885726, + "flos": 831255330048.0, + "grad_norm": 0.031928056401627374, + "language_loss": 0.84964621, + "learning_rate": 0.0009199056105416825, + "loss": 0.86022151, + "num_input_tokens_seen": 89280464, + "router_z_loss_mlp": 0.50830078, + "step": 1077, + "time_per_iteration": 3.0944886207580566 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059646, + "balance_loss_mlp": 1.00881469, + "epoch": 0.20738745671412082, + "flos": 639500828160.0, + "grad_norm": 0.033227407694906064, + "language_loss": 0.87196565, + "learning_rate": 0.0009197363994062654, + "loss": 0.88256204, + "num_input_tokens_seen": 89353344, + "router_z_loss_mlp": 0.50878906, + "step": 1078, + "time_per_iteration": 2.8505265712738037 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059879, + "balance_loss_mlp": 1.00933433, + "epoch": 0.20757983839938438, + "flos": 686984522496.0, + "grad_norm": 0.03258152966614613, + "language_loss": 0.84972161, + "learning_rate": 0.0009195670253147262, + "loss": 0.86032039, + "num_input_tokens_seen": 89439328, + "router_z_loss_mlp": 0.50585938, + "step": 1079, + "time_per_iteration": 3.0077526569366455 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064209, + "balance_loss_mlp": 1.01375961, + "epoch": 0.20777222008464794, + "flos": 520318189056.0, + "grad_norm": 0.03575722766779635, + "language_loss": 0.83075011, + "learning_rate": 0.0009193974883328216, + "loss": 0.84139216, + "num_input_tokens_seen": 89510160, + "router_z_loss_mlp": 0.50488281, + "step": 1080, + "time_per_iteration": 2.6277496814727783 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062434, + "balance_loss_mlp": 1.01212776, + "epoch": 0.2079646017699115, + "flos": 512470663680.0, + "grad_norm": 0.03316952161345372, + "language_loss": 0.87936002, + "learning_rate": 0.0009192277885263718, + "loss": 0.88998437, + "num_input_tokens_seen": 89582960, + "router_z_loss_mlp": 0.50341797, + "step": 1081, + "time_per_iteration": 2.6486003398895264 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01056126, + "balance_loss_mlp": 1.00596321, + "epoch": 0.20815698345517505, + "flos": 933468534528.0, + "grad_norm": 0.031694408237267754, + "language_loss": 0.87043977, + "learning_rate": 0.0009190579259612602, + "loss": 0.881001, + "num_input_tokens_seen": 89675488, + "router_z_loss_mlp": 0.50195312, + "step": 1082, + "time_per_iteration": 3.280133008956909 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062428, + "balance_loss_mlp": 1.01202655, + "epoch": 0.20834936514043864, + "flos": 633554674176.0, + "grad_norm": 0.03367407497844021, + "language_loss": 0.87446159, + "learning_rate": 0.000918887900703433, + "loss": 0.88508588, + "num_input_tokens_seen": 89747872, + "router_z_loss_mlp": 0.50439453, + "step": 1083, + "time_per_iteration": 2.7914657592773438 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060649, + "balance_loss_mlp": 1.01024699, + "epoch": 0.2085417468257022, + "flos": 395243831040.0, + "grad_norm": 0.03354838448754016, + "language_loss": 0.91036344, + "learning_rate": 0.0009187177128188999, + "loss": 0.92096996, + "num_input_tokens_seen": 89810176, + "router_z_loss_mlp": 0.50439453, + "step": 1084, + "time_per_iteration": 2.4803311824798584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107357, + "balance_loss_mlp": 1.02455139, + "epoch": 0.20873412851096576, + "flos": 1405197775104.0, + "grad_norm": 0.012085868941934568, + "language_loss": 0.77156538, + "learning_rate": 0.0009185473623737339, + "loss": 0.78230107, + "num_input_tokens_seen": 90038432, + "router_z_loss_mlp": 0.48925781, + "step": 1085, + "time_per_iteration": 4.883121728897095 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055927, + "balance_loss_mlp": 1.00562024, + "epoch": 0.20892651019622932, + "flos": 448762140672.0, + "grad_norm": 0.03493036575467998, + "language_loss": 0.8691588, + "learning_rate": 0.000918376849434071, + "loss": 0.87971807, + "num_input_tokens_seen": 90101568, + "router_z_loss_mlp": 0.50317383, + "step": 1086, + "time_per_iteration": 2.537820816040039 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065129, + "balance_loss_mlp": 1.01444149, + "epoch": 0.20911889188149288, + "flos": 494081036544.0, + "grad_norm": 0.040745363066357655, + "language_loss": 0.91673005, + "learning_rate": 0.0009182061740661098, + "loss": 0.9273814, + "num_input_tokens_seen": 90169344, + "router_z_loss_mlp": 0.50732422, + "step": 1087, + "time_per_iteration": 2.5920886993408203 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01056749, + "balance_loss_mlp": 1.00615633, + "epoch": 0.20931127356675644, + "flos": 842750062848.0, + "grad_norm": 0.02822254108426211, + "language_loss": 0.85810733, + "learning_rate": 0.0009180353363361127, + "loss": 0.86867487, + "num_input_tokens_seen": 90252416, + "router_z_loss_mlp": 0.50634766, + "step": 1088, + "time_per_iteration": 3.1376798152923584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060338, + "balance_loss_mlp": 1.00979316, + "epoch": 0.20950365525202, + "flos": 758525019648.0, + "grad_norm": 0.03922038165748564, + "language_loss": 0.83160806, + "learning_rate": 0.0009178643363104044, + "loss": 0.84221143, + "num_input_tokens_seen": 90337952, + "router_z_loss_mlp": 0.50585938, + "step": 1089, + "time_per_iteration": 3.124352216720581 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059844, + "balance_loss_mlp": 1.00939417, + "epoch": 0.20969603693728356, + "flos": 473492584704.0, + "grad_norm": 0.04272734591158297, + "language_loss": 0.920385, + "learning_rate": 0.0009176931740553735, + "loss": 0.93098342, + "num_input_tokens_seen": 90401488, + "router_z_loss_mlp": 0.50488281, + "step": 1090, + "time_per_iteration": 2.556528091430664 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067623, + "balance_loss_mlp": 1.01731646, + "epoch": 0.20988841862254715, + "flos": 978628982784.0, + "grad_norm": 0.03590255199570226, + "language_loss": 0.83530974, + "learning_rate": 0.0009175218496374708, + "loss": 0.84598601, + "num_input_tokens_seen": 90486144, + "router_z_loss_mlp": 0.50341797, + "step": 1091, + "time_per_iteration": 3.328984260559082 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059931, + "balance_loss_mlp": 1.00976801, + "epoch": 0.2100808003078107, + "flos": 1094819592192.0, + "grad_norm": 0.03766723451938342, + "language_loss": 0.86626744, + "learning_rate": 0.0009173503631232103, + "loss": 0.87686676, + "num_input_tokens_seen": 90571504, + "router_z_loss_mlp": 0.50170898, + "step": 1092, + "time_per_iteration": 3.4216480255126953 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01058636, + "balance_loss_mlp": 1.00832939, + "epoch": 0.21027318199307427, + "flos": 1014560596992.0, + "grad_norm": 0.047058286401960234, + "language_loss": 0.82703817, + "learning_rate": 0.0009171787145791691, + "loss": 0.83762449, + "num_input_tokens_seen": 90646016, + "router_z_loss_mlp": 0.50341797, + "step": 1093, + "time_per_iteration": 3.2454655170440674 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059608, + "balance_loss_mlp": 1.00911129, + "epoch": 0.21046556367833782, + "flos": 522413001216.0, + "grad_norm": 0.043211200123957835, + "language_loss": 0.80955076, + "learning_rate": 0.000917006904071987, + "loss": 0.8201468, + "num_input_tokens_seen": 90713440, + "router_z_loss_mlp": 0.50537109, + "step": 1094, + "time_per_iteration": 2.6560592651367188 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061033, + "balance_loss_mlp": 1.01053584, + "epoch": 0.21065794536360138, + "flos": 604840685568.0, + "grad_norm": 0.03488627405352903, + "language_loss": 0.87964189, + "learning_rate": 0.0009168349316683669, + "loss": 0.89025223, + "num_input_tokens_seen": 90788208, + "router_z_loss_mlp": 0.50537109, + "step": 1095, + "time_per_iteration": 2.794358253479004 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106293, + "balance_loss_mlp": 1.01243329, + "epoch": 0.21085032704886494, + "flos": 604558783488.0, + "grad_norm": 0.031199931973452354, + "language_loss": 0.82918072, + "learning_rate": 0.0009166627974350741, + "loss": 0.83981001, + "num_input_tokens_seen": 90873776, + "router_z_loss_mlp": 0.50537109, + "step": 1096, + "time_per_iteration": 2.89837384223938 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062823, + "balance_loss_mlp": 1.01242077, + "epoch": 0.2110427087341285, + "flos": 638832044544.0, + "grad_norm": 0.03623978918327459, + "language_loss": 0.90394479, + "learning_rate": 0.0009164905014389373, + "loss": 0.91457301, + "num_input_tokens_seen": 90945872, + "router_z_loss_mlp": 0.50439453, + "step": 1097, + "time_per_iteration": 2.79203462600708 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055619, + "balance_loss_mlp": 1.00559878, + "epoch": 0.21123509041939206, + "flos": 523930403328.0, + "grad_norm": 0.03351990521185014, + "language_loss": 0.87381279, + "learning_rate": 0.0009163180437468476, + "loss": 0.88436902, + "num_input_tokens_seen": 91016224, + "router_z_loss_mlp": 0.50024414, + "step": 1098, + "time_per_iteration": 2.6110002994537354 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01056208, + "balance_loss_mlp": 1.00647402, + "epoch": 0.21142747210465565, + "flos": 452194520064.0, + "grad_norm": 0.03619268995909484, + "language_loss": 0.86631316, + "learning_rate": 0.000916145424425759, + "loss": 0.87687522, + "num_input_tokens_seen": 91086752, + "router_z_loss_mlp": 0.49658203, + "step": 1099, + "time_per_iteration": 2.67106294631958 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060107, + "balance_loss_mlp": 1.01027727, + "epoch": 0.2116198537899192, + "flos": 877626978816.0, + "grad_norm": 0.042483916895571405, + "language_loss": 0.91832745, + "learning_rate": 0.0009159726435426885, + "loss": 0.92892849, + "num_input_tokens_seen": 91162960, + "router_z_loss_mlp": 0.49780273, + "step": 1100, + "time_per_iteration": 3.095250129699707 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052771, + "balance_loss_mlp": 1.00275087, + "epoch": 0.21181223547518277, + "flos": 524675009280.0, + "grad_norm": 0.035590136232614346, + "language_loss": 0.91126454, + "learning_rate": 0.0009157997011647154, + "loss": 0.92179227, + "num_input_tokens_seen": 91229840, + "router_z_loss_mlp": 0.49926758, + "step": 1101, + "time_per_iteration": 2.61954665184021 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01056268, + "balance_loss_mlp": 1.00615227, + "epoch": 0.21200461716044633, + "flos": 573426284544.0, + "grad_norm": 0.03167271765745466, + "language_loss": 0.86759949, + "learning_rate": 0.0009156265973589817, + "loss": 0.87816215, + "num_input_tokens_seen": 91307936, + "router_z_loss_mlp": 0.50146484, + "step": 1102, + "time_per_iteration": 2.7851946353912354 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053549, + "balance_loss_mlp": 1.00348067, + "epoch": 0.2121969988457099, + "flos": 546175262976.0, + "grad_norm": 0.033324702660241096, + "language_loss": 0.90598941, + "learning_rate": 0.0009154533321926926, + "loss": 0.91652489, + "num_input_tokens_seen": 91372848, + "router_z_loss_mlp": 0.50073242, + "step": 1103, + "time_per_iteration": 2.658358573913574 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01056655, + "balance_loss_mlp": 1.00663483, + "epoch": 0.21238938053097345, + "flos": 845355211008.0, + "grad_norm": 0.03290940631262569, + "language_loss": 0.88234645, + "learning_rate": 0.0009152799057331156, + "loss": 0.89291298, + "num_input_tokens_seen": 91452768, + "router_z_loss_mlp": 0.50024414, + "step": 1104, + "time_per_iteration": 3.1174561977386475 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01056205, + "balance_loss_mlp": 1.00623202, + "epoch": 0.212581762216237, + "flos": 447142671360.0, + "grad_norm": 0.035279899791186564, + "language_loss": 0.91767001, + "learning_rate": 0.0009151063180475805, + "loss": 0.92823207, + "num_input_tokens_seen": 91519888, + "router_z_loss_mlp": 0.5, + "step": 1105, + "time_per_iteration": 2.538922071456909 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054823, + "balance_loss_mlp": 1.00489795, + "epoch": 0.21277414390150057, + "flos": 515385904128.0, + "grad_norm": 0.03737857831356842, + "language_loss": 0.85410213, + "learning_rate": 0.0009149325692034803, + "loss": 0.86465037, + "num_input_tokens_seen": 91585744, + "router_z_loss_mlp": 0.49853516, + "step": 1106, + "time_per_iteration": 2.588087558746338 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055756, + "balance_loss_mlp": 1.00788116, + "epoch": 0.21296652558676413, + "flos": 1488514907136.0, + "grad_norm": 0.005769411809131762, + "language_loss": 0.79203427, + "learning_rate": 0.0009147586592682702, + "loss": 0.80259192, + "num_input_tokens_seen": 91805840, + "router_z_loss_mlp": 0.47851562, + "step": 1107, + "time_per_iteration": 4.901995658874512 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055609, + "balance_loss_mlp": 1.00596976, + "epoch": 0.21315890727202771, + "flos": 847451968512.0, + "grad_norm": 0.03679321288402367, + "language_loss": 0.87994891, + "learning_rate": 0.0009145845883094678, + "loss": 0.89050496, + "num_input_tokens_seen": 91885936, + "router_z_loss_mlp": 0.49584961, + "step": 1108, + "time_per_iteration": 3.034179925918579 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01057835, + "balance_loss_mlp": 1.00833917, + "epoch": 0.21335128895729127, + "flos": 630556808448.0, + "grad_norm": 0.040833312538100186, + "language_loss": 0.86006308, + "learning_rate": 0.000914410356394654, + "loss": 0.87064135, + "num_input_tokens_seen": 91959888, + "router_z_loss_mlp": 0.49438477, + "step": 1109, + "time_per_iteration": 2.793839931488037 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01058379, + "balance_loss_mlp": 1.00878823, + "epoch": 0.21354367064255483, + "flos": 712285573632.0, + "grad_norm": 0.029526159769499145, + "language_loss": 0.85111213, + "learning_rate": 0.0009142359635914709, + "loss": 0.86169595, + "num_input_tokens_seen": 92043728, + "router_z_loss_mlp": 0.49560547, + "step": 1110, + "time_per_iteration": 3.0403430461883545 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063298, + "balance_loss_mlp": 1.01375508, + "epoch": 0.2137360523278184, + "flos": 457211375616.0, + "grad_norm": 0.03547311640481051, + "language_loss": 0.85051197, + "learning_rate": 0.0009140614099676245, + "loss": 0.8611449, + "num_input_tokens_seen": 92114096, + "router_z_loss_mlp": 0.49414062, + "step": 1111, + "time_per_iteration": 2.6027371883392334 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054693, + "balance_loss_mlp": 1.00495887, + "epoch": 0.21392843401308195, + "flos": 667266076416.0, + "grad_norm": 0.03139007596896344, + "language_loss": 0.8342849, + "learning_rate": 0.0009138866955908821, + "loss": 0.84483182, + "num_input_tokens_seen": 92193552, + "router_z_loss_mlp": 0.49658203, + "step": 1112, + "time_per_iteration": 2.924180269241333 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055191, + "balance_loss_mlp": 1.00517082, + "epoch": 0.2141208156983455, + "flos": 750362544384.0, + "grad_norm": 0.03405304612319473, + "language_loss": 0.81477892, + "learning_rate": 0.0009137118205290738, + "loss": 0.82533085, + "num_input_tokens_seen": 92279248, + "router_z_loss_mlp": 0.49951172, + "step": 1113, + "time_per_iteration": 2.956289768218994 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01057131, + "balance_loss_mlp": 1.00711048, + "epoch": 0.21431319738360907, + "flos": 420011213568.0, + "grad_norm": 0.037812047895131755, + "language_loss": 0.90930229, + "learning_rate": 0.0009135367848500924, + "loss": 0.9198736, + "num_input_tokens_seen": 92344064, + "router_z_loss_mlp": 0.49975586, + "step": 1114, + "time_per_iteration": 2.5228912830352783 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106081, + "balance_loss_mlp": 1.01079023, + "epoch": 0.21450557906887263, + "flos": 610239565056.0, + "grad_norm": 0.04455846969282107, + "language_loss": 0.87261575, + "learning_rate": 0.0009133615886218927, + "loss": 0.88322389, + "num_input_tokens_seen": 92410544, + "router_z_loss_mlp": 0.5, + "step": 1115, + "time_per_iteration": 2.7146785259246826 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105379, + "balance_loss_mlp": 1.00367427, + "epoch": 0.21469796075413622, + "flos": 562975556352.0, + "grad_norm": 0.04025415931658291, + "language_loss": 0.88754129, + "learning_rate": 0.0009131862319124917, + "loss": 0.89807916, + "num_input_tokens_seen": 92480272, + "router_z_loss_mlp": 0.50097656, + "step": 1116, + "time_per_iteration": 2.702315092086792 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01058084, + "balance_loss_mlp": 1.0081588, + "epoch": 0.21489034243939978, + "flos": 595738218240.0, + "grad_norm": 0.036347556106983744, + "language_loss": 0.84819156, + "learning_rate": 0.0009130107147899691, + "loss": 0.8587724, + "num_input_tokens_seen": 92555584, + "router_z_loss_mlp": 0.49902344, + "step": 1117, + "time_per_iteration": 2.705153226852417 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055436, + "balance_loss_mlp": 1.00555849, + "epoch": 0.21508272412466334, + "flos": 442850979840.0, + "grad_norm": 0.032390780355026266, + "language_loss": 0.85796201, + "learning_rate": 0.0009128350373224665, + "loss": 0.86851633, + "num_input_tokens_seen": 92623136, + "router_z_loss_mlp": 0.49804688, + "step": 1118, + "time_per_iteration": 2.5689737796783447 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055817, + "balance_loss_mlp": 1.00775146, + "epoch": 0.2152751058099269, + "flos": 1499234898432.0, + "grad_norm": 0.005802610423144338, + "language_loss": 0.81456429, + "learning_rate": 0.0009126591995781883, + "loss": 0.82512248, + "num_input_tokens_seen": 92842608, + "router_z_loss_mlp": 0.48046875, + "step": 1119, + "time_per_iteration": 4.659603834152222 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054629, + "balance_loss_mlp": 1.00475144, + "epoch": 0.21546748749519046, + "flos": 494992838400.0, + "grad_norm": 0.03550503890551413, + "language_loss": 0.86117166, + "learning_rate": 0.0009124832016254005, + "loss": 0.87171793, + "num_input_tokens_seen": 92912960, + "router_z_loss_mlp": 0.4987793, + "step": 1120, + "time_per_iteration": 2.6080243587493896 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054572, + "balance_loss_mlp": 1.00450444, + "epoch": 0.21565986918045402, + "flos": 635695173120.0, + "grad_norm": 0.03761657282592244, + "language_loss": 0.88987935, + "learning_rate": 0.0009123070435324316, + "loss": 0.90042508, + "num_input_tokens_seen": 92982272, + "router_z_loss_mlp": 0.50097656, + "step": 1121, + "time_per_iteration": 2.8451340198516846 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062664, + "balance_loss_mlp": 1.01450348, + "epoch": 0.21585225086571758, + "flos": 1586801914368.0, + "grad_norm": 0.011675507285583616, + "language_loss": 0.77875781, + "learning_rate": 0.0009121307253676722, + "loss": 0.78938448, + "num_input_tokens_seen": 93218752, + "router_z_loss_mlp": 0.48144531, + "step": 1122, + "time_per_iteration": 5.018117666244507 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055198, + "balance_loss_mlp": 1.00541639, + "epoch": 0.21604463255098114, + "flos": 685323257088.0, + "grad_norm": 0.03443856201457266, + "language_loss": 0.87021005, + "learning_rate": 0.0009119542471995752, + "loss": 0.8807621, + "num_input_tokens_seen": 93293968, + "router_z_loss_mlp": 0.49682617, + "step": 1123, + "time_per_iteration": 2.8631908893585205 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01057142, + "balance_loss_mlp": 1.00755107, + "epoch": 0.2162370142362447, + "flos": 782308668672.0, + "grad_norm": 0.034966150945184314, + "language_loss": 0.82536203, + "learning_rate": 0.0009117776090966554, + "loss": 0.83593345, + "num_input_tokens_seen": 93367088, + "router_z_loss_mlp": 0.49511719, + "step": 1124, + "time_per_iteration": 2.9458060264587402 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01058319, + "balance_loss_mlp": 1.00877571, + "epoch": 0.21642939592150828, + "flos": 1003762838016.0, + "grad_norm": 0.03795033166932298, + "language_loss": 0.87775326, + "learning_rate": 0.0009116008111274899, + "loss": 0.88833648, + "num_input_tokens_seen": 93452944, + "router_z_loss_mlp": 0.49511719, + "step": 1125, + "time_per_iteration": 3.2748866081237793 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053055, + "balance_loss_mlp": 1.00556183, + "epoch": 0.21662177760677184, + "flos": 1485764917248.0, + "grad_norm": 0.008195913283110022, + "language_loss": 0.79106927, + "learning_rate": 0.0009114238533607176, + "loss": 0.8015998, + "num_input_tokens_seen": 93677328, + "router_z_loss_mlp": 0.47460938, + "step": 1126, + "time_per_iteration": 4.803825616836548 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105355, + "balance_loss_mlp": 1.00391161, + "epoch": 0.2168141592920354, + "flos": 888861196800.0, + "grad_norm": 0.03626284425770287, + "language_loss": 0.85553163, + "learning_rate": 0.0009112467358650396, + "loss": 0.86606717, + "num_input_tokens_seen": 93756848, + "router_z_loss_mlp": 0.49609375, + "step": 1127, + "time_per_iteration": 3.155856132507324 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01057313, + "balance_loss_mlp": 1.00753081, + "epoch": 0.21700654097729896, + "flos": 547085119488.0, + "grad_norm": 0.03272511127748384, + "language_loss": 0.87140059, + "learning_rate": 0.0009110694587092192, + "loss": 0.88197374, + "num_input_tokens_seen": 93834704, + "router_z_loss_mlp": 0.49682617, + "step": 1128, + "time_per_iteration": 2.7438507080078125 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01057646, + "balance_loss_mlp": 1.00781655, + "epoch": 0.21719892266256252, + "flos": 510536244480.0, + "grad_norm": 0.0385378102776186, + "language_loss": 0.81826651, + "learning_rate": 0.0009108920219620815, + "loss": 0.82884294, + "num_input_tokens_seen": 93904448, + "router_z_loss_mlp": 0.49829102, + "step": 1129, + "time_per_iteration": 2.6256754398345947 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105661, + "balance_loss_mlp": 1.00682795, + "epoch": 0.21739130434782608, + "flos": 544462474752.0, + "grad_norm": 0.03288593298355655, + "language_loss": 0.9021399, + "learning_rate": 0.0009107144256925133, + "loss": 0.91270602, + "num_input_tokens_seen": 93979312, + "router_z_loss_mlp": 0.49707031, + "step": 1130, + "time_per_iteration": 2.665764808654785 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055495, + "balance_loss_mlp": 1.00566518, + "epoch": 0.21758368603308964, + "flos": 617983077888.0, + "grad_norm": 0.04004849400109536, + "language_loss": 0.83221352, + "learning_rate": 0.0009105366699694638, + "loss": 0.84276843, + "num_input_tokens_seen": 94052032, + "router_z_loss_mlp": 0.49755859, + "step": 1131, + "time_per_iteration": 2.7092785835266113 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055334, + "balance_loss_mlp": 1.0055995, + "epoch": 0.2177760677183532, + "flos": 636335766528.0, + "grad_norm": 0.03327692114185805, + "language_loss": 0.82139939, + "learning_rate": 0.0009103587548619439, + "loss": 0.83195269, + "num_input_tokens_seen": 94124944, + "router_z_loss_mlp": 0.49658203, + "step": 1132, + "time_per_iteration": 2.833617925643921 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055546, + "balance_loss_mlp": 1.00585985, + "epoch": 0.2179684494036168, + "flos": 533597641728.0, + "grad_norm": 0.036557340203022134, + "language_loss": 0.8721149, + "learning_rate": 0.0009101806804390261, + "loss": 0.8826704, + "num_input_tokens_seen": 94200384, + "router_z_loss_mlp": 0.49609375, + "step": 1133, + "time_per_iteration": 2.7880306243896484 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054716, + "balance_loss_mlp": 1.0050298, + "epoch": 0.21816083108888035, + "flos": 476182303488.0, + "grad_norm": 0.03701280834454915, + "language_loss": 0.917292, + "learning_rate": 0.0009100024467698453, + "loss": 0.92783916, + "num_input_tokens_seen": 94266992, + "router_z_loss_mlp": 0.49560547, + "step": 1134, + "time_per_iteration": 2.592986822128296 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054821, + "balance_loss_mlp": 1.00513422, + "epoch": 0.2183532127741439, + "flos": 578547152640.0, + "grad_norm": 0.04183992577645213, + "language_loss": 0.83309305, + "learning_rate": 0.0009098240539235981, + "loss": 0.84364122, + "num_input_tokens_seen": 94334304, + "router_z_loss_mlp": 0.49658203, + "step": 1135, + "time_per_iteration": 2.693387269973755 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055318, + "balance_loss_mlp": 1.00558341, + "epoch": 0.21854559445940747, + "flos": 595280371968.0, + "grad_norm": 0.03379290176549673, + "language_loss": 0.88387418, + "learning_rate": 0.0009096455019695423, + "loss": 0.89442736, + "num_input_tokens_seen": 94413296, + "router_z_loss_mlp": 0.49609375, + "step": 1136, + "time_per_iteration": 2.781304359436035 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059818, + "balance_loss_mlp": 1.0098455, + "epoch": 0.21873797614467103, + "flos": 409549791744.0, + "grad_norm": 0.03874067782032871, + "language_loss": 0.90736896, + "learning_rate": 0.000909466790976998, + "loss": 0.91796714, + "num_input_tokens_seen": 94475840, + "router_z_loss_mlp": 0.49951172, + "step": 1137, + "time_per_iteration": 2.4837231636047363 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055988, + "balance_loss_mlp": 1.00620675, + "epoch": 0.21893035782993459, + "flos": 895655969280.0, + "grad_norm": 0.03281311030157744, + "language_loss": 0.83296013, + "learning_rate": 0.0009092879210153473, + "loss": 0.84352005, + "num_input_tokens_seen": 94555184, + "router_z_loss_mlp": 0.49682617, + "step": 1138, + "time_per_iteration": 3.156329870223999 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01058359, + "balance_loss_mlp": 1.00862455, + "epoch": 0.21912273951519814, + "flos": 468569048064.0, + "grad_norm": 0.03332829582894704, + "language_loss": 0.89480728, + "learning_rate": 0.0009091088921540333, + "loss": 0.90539086, + "num_input_tokens_seen": 94622656, + "router_z_loss_mlp": 0.49731445, + "step": 1139, + "time_per_iteration": 2.5444674491882324 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060322, + "balance_loss_mlp": 1.01197052, + "epoch": 0.2193151212004617, + "flos": 1535180118528.0, + "grad_norm": 0.009447727830516332, + "language_loss": 0.75508678, + "learning_rate": 0.0009089297044625615, + "loss": 0.76569003, + "num_input_tokens_seen": 94856496, + "router_z_loss_mlp": 0.48339844, + "step": 1140, + "time_per_iteration": 4.993603944778442 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105127, + "balance_loss_mlp": 1.00158358, + "epoch": 0.2195075028857253, + "flos": 592275703296.0, + "grad_norm": 0.039648398816974934, + "language_loss": 0.85201681, + "learning_rate": 0.0009087503580104985, + "loss": 0.86252946, + "num_input_tokens_seen": 94926880, + "router_z_loss_mlp": 0.49560547, + "step": 1141, + "time_per_iteration": 2.6736245155334473 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053862, + "balance_loss_mlp": 1.00436676, + "epoch": 0.21969988457098885, + "flos": 637518776832.0, + "grad_norm": 0.03678403810630545, + "language_loss": 0.8005864, + "learning_rate": 0.0009085708528674728, + "loss": 0.81112504, + "num_input_tokens_seen": 95000528, + "router_z_loss_mlp": 0.49414062, + "step": 1142, + "time_per_iteration": 2.799607038497925 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053259, + "balance_loss_mlp": 1.00362051, + "epoch": 0.2198922662562524, + "flos": 913860903936.0, + "grad_norm": 0.040969430424554455, + "language_loss": 0.86853033, + "learning_rate": 0.0009083911891031745, + "loss": 0.87906301, + "num_input_tokens_seen": 95081040, + "router_z_loss_mlp": 0.49487305, + "step": 1143, + "time_per_iteration": 3.1043601036071777 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010518, + "balance_loss_mlp": 1.00235164, + "epoch": 0.22008464794151597, + "flos": 824495550720.0, + "grad_norm": 0.03475506353694162, + "language_loss": 0.91937912, + "learning_rate": 0.0009082113667873553, + "loss": 0.92989707, + "num_input_tokens_seen": 95167328, + "router_z_loss_mlp": 0.4934082, + "step": 1144, + "time_per_iteration": 3.114678144454956 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055626, + "balance_loss_mlp": 1.00636888, + "epoch": 0.22027702962677953, + "flos": 460619455488.0, + "grad_norm": 0.047183367988671336, + "language_loss": 0.91319406, + "learning_rate": 0.0009080313859898283, + "loss": 0.92375034, + "num_input_tokens_seen": 95230304, + "router_z_loss_mlp": 0.49145508, + "step": 1145, + "time_per_iteration": 2.529627799987793 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01058128, + "balance_loss_mlp": 1.00877535, + "epoch": 0.2204694113120431, + "flos": 532288264704.0, + "grad_norm": 0.034289556826903954, + "language_loss": 0.91988164, + "learning_rate": 0.0009078512467804684, + "loss": 0.93046296, + "num_input_tokens_seen": 95299520, + "router_z_loss_mlp": 0.49243164, + "step": 1146, + "time_per_iteration": 2.692556381225586 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01056493, + "balance_loss_mlp": 1.00737858, + "epoch": 0.22066179299730665, + "flos": 523687385088.0, + "grad_norm": 0.03628724645244133, + "language_loss": 0.91349947, + "learning_rate": 0.0009076709492292119, + "loss": 0.9240644, + "num_input_tokens_seen": 95368912, + "router_z_loss_mlp": 0.49023438, + "step": 1147, + "time_per_iteration": 2.6262857913970947 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01056943, + "balance_loss_mlp": 1.00799513, + "epoch": 0.2208541746825702, + "flos": 547506027264.0, + "grad_norm": 0.0383258843164557, + "language_loss": 0.89899343, + "learning_rate": 0.0009074904934060562, + "loss": 0.90956283, + "num_input_tokens_seen": 95440800, + "router_z_loss_mlp": 0.48901367, + "step": 1148, + "time_per_iteration": 2.710716962814331 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054195, + "balance_loss_mlp": 1.00498509, + "epoch": 0.22104655636783377, + "flos": 710060504064.0, + "grad_norm": 0.034028934421108444, + "language_loss": 0.85814822, + "learning_rate": 0.0009073098793810607, + "loss": 0.86869013, + "num_input_tokens_seen": 95519904, + "router_z_loss_mlp": 0.4909668, + "step": 1149, + "time_per_iteration": 2.986891269683838 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01056627, + "balance_loss_mlp": 1.00758433, + "epoch": 0.22123893805309736, + "flos": 585965021952.0, + "grad_norm": 0.03641392016248804, + "language_loss": 0.88886124, + "learning_rate": 0.000907129107224346, + "loss": 0.89942753, + "num_input_tokens_seen": 95591568, + "router_z_loss_mlp": 0.48999023, + "step": 1150, + "time_per_iteration": 2.7348337173461914 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055224, + "balance_loss_mlp": 1.00601482, + "epoch": 0.22143131973836092, + "flos": 493251859968.0, + "grad_norm": 0.02984339906163832, + "language_loss": 0.89448893, + "learning_rate": 0.0009069481770060939, + "loss": 0.90504116, + "num_input_tokens_seen": 95664480, + "router_z_loss_mlp": 0.49121094, + "step": 1151, + "time_per_iteration": 2.688180685043335 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055578, + "balance_loss_mlp": 1.00593948, + "epoch": 0.22162370142362448, + "flos": 1081469174784.0, + "grad_norm": 0.034516826316188534, + "language_loss": 0.8487525, + "learning_rate": 0.000906767088796548, + "loss": 0.85930824, + "num_input_tokens_seen": 95754400, + "router_z_loss_mlp": 0.49584961, + "step": 1152, + "time_per_iteration": 3.4747724533081055 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01057959, + "balance_loss_mlp": 1.00841522, + "epoch": 0.22181608310888803, + "flos": 493512374784.0, + "grad_norm": 0.03114695536209251, + "language_loss": 0.87880313, + "learning_rate": 0.0009065858426660127, + "loss": 0.88938272, + "num_input_tokens_seen": 95826944, + "router_z_loss_mlp": 0.49462891, + "step": 1153, + "time_per_iteration": 2.6112635135650635 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060109, + "balance_loss_mlp": 1.0103749, + "epoch": 0.2220084647941516, + "flos": 725325898752.0, + "grad_norm": 0.04119971901255946, + "language_loss": 0.85662532, + "learning_rate": 0.0009064044386848543, + "loss": 0.86722642, + "num_input_tokens_seen": 95902688, + "router_z_loss_mlp": 0.49658203, + "step": 1154, + "time_per_iteration": 2.893120288848877 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105547, + "balance_loss_mlp": 1.00564086, + "epoch": 0.22220084647941515, + "flos": 490245245952.0, + "grad_norm": 0.04012578927121656, + "language_loss": 0.89651787, + "learning_rate": 0.0009062228769234997, + "loss": 0.9070726, + "num_input_tokens_seen": 95969952, + "router_z_loss_mlp": 0.49731445, + "step": 1155, + "time_per_iteration": 2.544904947280884 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053369, + "balance_loss_mlp": 1.00344408, + "epoch": 0.2223932281646787, + "flos": 537296371968.0, + "grad_norm": 0.03814815821860503, + "language_loss": 0.82016486, + "learning_rate": 0.0009060411574524376, + "loss": 0.83069855, + "num_input_tokens_seen": 96037344, + "router_z_loss_mlp": 0.49804688, + "step": 1156, + "time_per_iteration": 2.6412572860717773 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01056148, + "balance_loss_mlp": 1.00660419, + "epoch": 0.22258560984994227, + "flos": 932968892160.0, + "grad_norm": 0.0415511709861084, + "language_loss": 0.88770878, + "learning_rate": 0.0009058592803422178, + "loss": 0.89827025, + "num_input_tokens_seen": 96115616, + "router_z_loss_mlp": 0.49462891, + "step": 1157, + "time_per_iteration": 4.623233079910278 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055229, + "balance_loss_mlp": 1.00792694, + "epoch": 0.22277799153520586, + "flos": 1202397638400.0, + "grad_norm": 0.007067436666665483, + "language_loss": 0.78710288, + "learning_rate": 0.0009056772456634512, + "loss": 0.79765517, + "num_input_tokens_seen": 96333600, + "router_z_loss_mlp": 0.47265625, + "step": 1158, + "time_per_iteration": 4.805820465087891 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053661, + "balance_loss_mlp": 1.00397491, + "epoch": 0.22297037322046942, + "flos": 502317388800.0, + "grad_norm": 0.032485949168455416, + "language_loss": 0.91067338, + "learning_rate": 0.00090549505348681, + "loss": 0.92121005, + "num_input_tokens_seen": 96402544, + "router_z_loss_mlp": 0.49633789, + "step": 1159, + "time_per_iteration": 2.5877561569213867 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054105, + "balance_loss_mlp": 1.00427544, + "epoch": 0.22316275490573298, + "flos": 754113764352.0, + "grad_norm": 0.0354615562345569, + "language_loss": 0.84617937, + "learning_rate": 0.0009053127038830275, + "loss": 0.85672045, + "num_input_tokens_seen": 96487600, + "router_z_loss_mlp": 0.49731445, + "step": 1160, + "time_per_iteration": 3.0164098739624023 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01057554, + "balance_loss_mlp": 1.00777233, + "epoch": 0.22335513659099654, + "flos": 515804866560.0, + "grad_norm": 0.03692799991821936, + "language_loss": 0.87995219, + "learning_rate": 0.000905130196922898, + "loss": 0.89052767, + "num_input_tokens_seen": 96554912, + "router_z_loss_mlp": 0.49682617, + "step": 1161, + "time_per_iteration": 2.603769063949585 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01058076, + "balance_loss_mlp": 1.00848484, + "epoch": 0.2235475182762601, + "flos": 485508347136.0, + "grad_norm": 0.031071089964746976, + "language_loss": 0.8758713, + "learning_rate": 0.0009049475326772769, + "loss": 0.88645208, + "num_input_tokens_seen": 96624192, + "router_z_loss_mlp": 0.49511719, + "step": 1162, + "time_per_iteration": 2.6613070964813232 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052796, + "balance_loss_mlp": 1.00334835, + "epoch": 0.22373989996152366, + "flos": 471068238336.0, + "grad_norm": 0.03308636607962537, + "language_loss": 0.83887613, + "learning_rate": 0.0009047647112170811, + "loss": 0.84940416, + "num_input_tokens_seen": 96701040, + "router_z_loss_mlp": 0.49389648, + "step": 1163, + "time_per_iteration": 2.8056106567382812 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105253, + "balance_loss_mlp": 1.00322485, + "epoch": 0.22393228164678722, + "flos": 1273019542272.0, + "grad_norm": 0.035987441954907426, + "language_loss": 0.88180983, + "learning_rate": 0.0009045817326132876, + "loss": 0.89233518, + "num_input_tokens_seen": 96791200, + "router_z_loss_mlp": 0.49243164, + "step": 1164, + "time_per_iteration": 3.7020320892333984 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055012, + "balance_loss_mlp": 1.00575495, + "epoch": 0.22412466333205078, + "flos": 597468503040.0, + "grad_norm": 0.03371692057767332, + "language_loss": 0.84342653, + "learning_rate": 0.0009043985969369357, + "loss": 0.85397661, + "num_input_tokens_seen": 96869360, + "router_z_loss_mlp": 0.49145508, + "step": 1165, + "time_per_iteration": 2.8581626415252686 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052347, + "balance_loss_mlp": 1.00299454, + "epoch": 0.22431704501731436, + "flos": 609632019456.0, + "grad_norm": 0.03010954873673584, + "language_loss": 0.84869868, + "learning_rate": 0.0009042153042591245, + "loss": 0.85922217, + "num_input_tokens_seen": 96945840, + "router_z_loss_mlp": 0.49243164, + "step": 1166, + "time_per_iteration": 2.810300827026367 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054301, + "balance_loss_mlp": 1.0050199, + "epoch": 0.22450942670257792, + "flos": 908108190720.0, + "grad_norm": 0.030118647676053625, + "language_loss": 0.86120874, + "learning_rate": 0.0009040318546510146, + "loss": 0.87175173, + "num_input_tokens_seen": 97029296, + "router_z_loss_mlp": 0.49169922, + "step": 1167, + "time_per_iteration": 3.129802942276001 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01057032, + "balance_loss_mlp": 1.00791764, + "epoch": 0.22470180838784148, + "flos": 566381690880.0, + "grad_norm": 0.035718478093575166, + "language_loss": 0.85780692, + "learning_rate": 0.0009038482481838275, + "loss": 0.86837721, + "num_input_tokens_seen": 97097776, + "router_z_loss_mlp": 0.49047852, + "step": 1168, + "time_per_iteration": 2.674471855163574 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010578, + "balance_loss_mlp": 1.00880456, + "epoch": 0.22489419007310504, + "flos": 835918351872.0, + "grad_norm": 0.03078757560697398, + "language_loss": 0.88093269, + "learning_rate": 0.0009036644849288455, + "loss": 0.89151073, + "num_input_tokens_seen": 97181424, + "router_z_loss_mlp": 0.48925781, + "step": 1169, + "time_per_iteration": 3.126168727874756 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052146, + "balance_loss_mlp": 1.00331759, + "epoch": 0.2250865717583686, + "flos": 582139924992.0, + "grad_norm": 0.03503818002335677, + "language_loss": 0.86431491, + "learning_rate": 0.0009034805649574118, + "loss": 0.87483639, + "num_input_tokens_seen": 97252128, + "router_z_loss_mlp": 0.48779297, + "step": 1170, + "time_per_iteration": 2.6982839107513428 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01056949, + "balance_loss_mlp": 1.0084312, + "epoch": 0.22527895344363216, + "flos": 601671733248.0, + "grad_norm": 0.031992933731526396, + "language_loss": 0.85811341, + "learning_rate": 0.0009032964883409308, + "loss": 0.86868292, + "num_input_tokens_seen": 97326640, + "router_z_loss_mlp": 0.48510742, + "step": 1171, + "time_per_iteration": 2.9468932151794434 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055088, + "balance_loss_mlp": 1.00826263, + "epoch": 0.22547133512889572, + "flos": 1443734537472.0, + "grad_norm": 0.010800983830845337, + "language_loss": 0.73050535, + "learning_rate": 0.000903112255150867, + "loss": 0.7410562, + "num_input_tokens_seen": 97553952, + "router_z_loss_mlp": 0.46777344, + "step": 1172, + "time_per_iteration": 5.044191360473633 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105371, + "balance_loss_mlp": 1.0051204, + "epoch": 0.22566371681415928, + "flos": 491586703872.0, + "grad_norm": 0.034976527569036825, + "language_loss": 0.88142014, + "learning_rate": 0.0009029278654587462, + "loss": 0.89195722, + "num_input_tokens_seen": 97623584, + "router_z_loss_mlp": 0.48583984, + "step": 1173, + "time_per_iteration": 2.5891120433807373 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105611, + "balance_loss_mlp": 1.00749624, + "epoch": 0.22585609849942284, + "flos": 605752487424.0, + "grad_norm": 0.03629905495680353, + "language_loss": 0.82793885, + "learning_rate": 0.0009027433193361548, + "loss": 0.83850002, + "num_input_tokens_seen": 97695952, + "router_z_loss_mlp": 0.48583984, + "step": 1174, + "time_per_iteration": 2.707061290740967 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105926, + "balance_loss_mlp": 1.01064646, + "epoch": 0.22604848018468643, + "flos": 636728484096.0, + "grad_norm": 0.035409171913978986, + "language_loss": 0.87780964, + "learning_rate": 0.00090255861685474, + "loss": 0.88840234, + "num_input_tokens_seen": 97764544, + "router_z_loss_mlp": 0.48608398, + "step": 1175, + "time_per_iteration": 2.7910189628601074 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01056162, + "balance_loss_mlp": 1.00752461, + "epoch": 0.22624086186995, + "flos": 480845325312.0, + "grad_norm": 0.040136392489239156, + "language_loss": 0.91905487, + "learning_rate": 0.0009023737580862095, + "loss": 0.92961645, + "num_input_tokens_seen": 97830976, + "router_z_loss_mlp": 0.48632812, + "step": 1176, + "time_per_iteration": 2.5489909648895264 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054427, + "balance_loss_mlp": 1.00600469, + "epoch": 0.22643324355521355, + "flos": 496807693824.0, + "grad_norm": 0.032828642541270554, + "language_loss": 0.83966863, + "learning_rate": 0.0009021887431023321, + "loss": 0.85021293, + "num_input_tokens_seen": 97898800, + "router_z_loss_mlp": 0.48413086, + "step": 1177, + "time_per_iteration": 2.679046392440796 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060571, + "balance_loss_mlp": 1.01224387, + "epoch": 0.2266256252404771, + "flos": 562684905984.0, + "grad_norm": 0.03431341234676521, + "language_loss": 0.8836711, + "learning_rate": 0.0009020035719749369, + "loss": 0.89427686, + "num_input_tokens_seen": 97974112, + "router_z_loss_mlp": 0.4831543, + "step": 1178, + "time_per_iteration": 2.777273416519165 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053755, + "balance_loss_mlp": 1.00516534, + "epoch": 0.22681800692574067, + "flos": 581033703936.0, + "grad_norm": 0.0422995660898389, + "language_loss": 0.78512251, + "learning_rate": 0.0009018182447759136, + "loss": 0.79566014, + "num_input_tokens_seen": 98056640, + "router_z_loss_mlp": 0.48583984, + "step": 1179, + "time_per_iteration": 2.9779903888702393 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105363, + "balance_loss_mlp": 1.00508785, + "epoch": 0.22701038861100423, + "flos": 741466156800.0, + "grad_norm": 0.03672617722264385, + "language_loss": 0.80683887, + "learning_rate": 0.0009016327615772126, + "loss": 0.81737518, + "num_input_tokens_seen": 98135952, + "router_z_loss_mlp": 0.48535156, + "step": 1180, + "time_per_iteration": 2.953355312347412 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054935, + "balance_loss_mlp": 1.00636911, + "epoch": 0.2272027702962678, + "flos": 578306079744.0, + "grad_norm": 0.03924605706365315, + "language_loss": 0.88551408, + "learning_rate": 0.0009014471224508451, + "loss": 0.89606345, + "num_input_tokens_seen": 98204288, + "router_z_loss_mlp": 0.4855957, + "step": 1181, + "time_per_iteration": 2.7092630863189697 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01056081, + "balance_loss_mlp": 1.00744355, + "epoch": 0.22739515198153135, + "flos": 545291651328.0, + "grad_norm": 0.04038062834310644, + "language_loss": 0.83949769, + "learning_rate": 0.0009012613274688823, + "loss": 0.85005856, + "num_input_tokens_seen": 98269856, + "router_z_loss_mlp": 0.48632812, + "step": 1182, + "time_per_iteration": 2.642143964767456 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055555, + "balance_loss_mlp": 1.00689363, + "epoch": 0.22758753366679493, + "flos": 441092504832.0, + "grad_norm": 0.03566258536478163, + "language_loss": 0.88506091, + "learning_rate": 0.0009010753767034565, + "loss": 0.89561647, + "num_input_tokens_seen": 98335632, + "router_z_loss_mlp": 0.48632812, + "step": 1183, + "time_per_iteration": 2.599167585372925 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053804, + "balance_loss_mlp": 1.00526154, + "epoch": 0.2277799153520585, + "flos": 730824900096.0, + "grad_norm": 0.03354089847275564, + "language_loss": 0.79992342, + "learning_rate": 0.0009008892702267599, + "loss": 0.81046152, + "num_input_tokens_seen": 98420592, + "router_z_loss_mlp": 0.48535156, + "step": 1184, + "time_per_iteration": 2.9798924922943115 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01057068, + "balance_loss_mlp": 1.00855029, + "epoch": 0.22797229703732205, + "flos": 527913947904.0, + "grad_norm": 0.04184098346005727, + "language_loss": 0.89975739, + "learning_rate": 0.0009007030081110457, + "loss": 0.91032803, + "num_input_tokens_seen": 98488096, + "router_z_loss_mlp": 0.48510742, + "step": 1185, + "time_per_iteration": 2.6349968910217285 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01057531, + "balance_loss_mlp": 1.00910807, + "epoch": 0.2281646787225856, + "flos": 536521630464.0, + "grad_norm": 0.03583751901003141, + "language_loss": 0.85487026, + "learning_rate": 0.000900516590428627, + "loss": 0.86544555, + "num_input_tokens_seen": 98561664, + "router_z_loss_mlp": 0.48413086, + "step": 1186, + "time_per_iteration": 2.669015407562256 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054531, + "balance_loss_mlp": 1.00596476, + "epoch": 0.22835706040784917, + "flos": 542478478080.0, + "grad_norm": 0.03191556588332838, + "language_loss": 0.9033947, + "learning_rate": 0.0009003300172518778, + "loss": 0.91394001, + "num_input_tokens_seen": 98634336, + "router_z_loss_mlp": 0.4855957, + "step": 1187, + "time_per_iteration": 2.7164688110351562 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01056755, + "balance_loss_mlp": 1.00804579, + "epoch": 0.22854944209311273, + "flos": 792006042624.0, + "grad_norm": 0.0322044633529041, + "language_loss": 0.85374159, + "learning_rate": 0.0009001432886532321, + "loss": 0.86430913, + "num_input_tokens_seen": 98709600, + "router_z_loss_mlp": 0.48681641, + "step": 1188, + "time_per_iteration": 2.9621965885162354 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054799, + "balance_loss_mlp": 1.00568485, + "epoch": 0.2287418237783763, + "flos": 470216707584.0, + "grad_norm": 0.03536870053258389, + "language_loss": 0.87358034, + "learning_rate": 0.0008999564047051843, + "loss": 0.88412833, + "num_input_tokens_seen": 98775024, + "router_z_loss_mlp": 0.49047852, + "step": 1189, + "time_per_iteration": 2.5233154296875 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01058932, + "balance_loss_mlp": 1.01003218, + "epoch": 0.22893420546363985, + "flos": 469005507072.0, + "grad_norm": 0.030491923293758834, + "language_loss": 0.8554523, + "learning_rate": 0.0008997693654802894, + "loss": 0.86604154, + "num_input_tokens_seen": 98845248, + "router_z_loss_mlp": 0.48852539, + "step": 1190, + "time_per_iteration": 2.6391589641571045 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01058724, + "balance_loss_mlp": 1.00965738, + "epoch": 0.22912658714890344, + "flos": 627402440448.0, + "grad_norm": 0.0331512035559832, + "language_loss": 0.87166977, + "learning_rate": 0.0008995821710511625, + "loss": 0.88225698, + "num_input_tokens_seen": 98913584, + "router_z_loss_mlp": 0.49023438, + "step": 1191, + "time_per_iteration": 2.7549567222595215 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054803, + "balance_loss_mlp": 1.00599909, + "epoch": 0.229318968834167, + "flos": 504021428736.0, + "grad_norm": 0.030936804790582927, + "language_loss": 0.85688579, + "learning_rate": 0.0008993948214904786, + "loss": 0.86743385, + "num_input_tokens_seen": 98978608, + "router_z_loss_mlp": 0.48779297, + "step": 1192, + "time_per_iteration": 2.596224784851074 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061855, + "balance_loss_mlp": 1.01483917, + "epoch": 0.22951135051943056, + "flos": 1377716374272.0, + "grad_norm": 0.008909469382289665, + "language_loss": 0.78422213, + "learning_rate": 0.0008992073168709733, + "loss": 0.79484069, + "num_input_tokens_seen": 99207424, + "router_z_loss_mlp": 0.46972656, + "step": 1193, + "time_per_iteration": 4.853066921234131 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062271, + "balance_loss_mlp": 1.01356232, + "epoch": 0.22970373220469412, + "flos": 645550994688.0, + "grad_norm": 0.0389743097765726, + "language_loss": 0.78935194, + "learning_rate": 0.0008990196572654427, + "loss": 0.79997468, + "num_input_tokens_seen": 99290592, + "router_z_loss_mlp": 0.48681641, + "step": 1194, + "time_per_iteration": 2.8869853019714355 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01056276, + "balance_loss_mlp": 1.00771046, + "epoch": 0.22989611388995768, + "flos": 501273384192.0, + "grad_norm": 0.02988304738122761, + "language_loss": 0.88486552, + "learning_rate": 0.0008988318427467426, + "loss": 0.8954283, + "num_input_tokens_seen": 99366096, + "router_z_loss_mlp": 0.4855957, + "step": 1195, + "time_per_iteration": 2.6931521892547607 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053709, + "balance_loss_mlp": 1.00514269, + "epoch": 0.23008849557522124, + "flos": 1098334596864.0, + "grad_norm": 0.03694163801075408, + "language_loss": 0.87307864, + "learning_rate": 0.0008986438733877887, + "loss": 0.88361579, + "num_input_tokens_seen": 99456768, + "router_z_loss_mlp": 0.4855957, + "step": 1196, + "time_per_iteration": 3.4505865573883057 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053065, + "balance_loss_mlp": 1.00447488, + "epoch": 0.2302808772604848, + "flos": 684993722880.0, + "grad_norm": 0.030674764969734848, + "language_loss": 0.85086071, + "learning_rate": 0.0008984557492615576, + "loss": 0.86139137, + "num_input_tokens_seen": 99539616, + "router_z_loss_mlp": 0.48583984, + "step": 1197, + "time_per_iteration": 2.936891794204712 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01056985, + "balance_loss_mlp": 1.00837183, + "epoch": 0.23047325894574835, + "flos": 529961127936.0, + "grad_norm": 0.03469763625730159, + "language_loss": 0.90249604, + "learning_rate": 0.0008982674704410854, + "loss": 0.91306591, + "num_input_tokens_seen": 99612064, + "router_z_loss_mlp": 0.48608398, + "step": 1198, + "time_per_iteration": 2.6928677558898926 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055103, + "balance_loss_mlp": 1.00653744, + "epoch": 0.23066564063101191, + "flos": 684127607808.0, + "grad_norm": 0.03582939263118032, + "language_loss": 0.78263444, + "learning_rate": 0.0008980790369994682, + "loss": 0.79318547, + "num_input_tokens_seen": 99691040, + "router_z_loss_mlp": 0.4855957, + "step": 1199, + "time_per_iteration": 2.941063642501831 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105528, + "balance_loss_mlp": 1.00692904, + "epoch": 0.2308580223162755, + "flos": 559632605184.0, + "grad_norm": 0.03400437188822284, + "language_loss": 0.87868834, + "learning_rate": 0.000897890449009863, + "loss": 0.88924116, + "num_input_tokens_seen": 99762016, + "router_z_loss_mlp": 0.48339844, + "step": 1200, + "time_per_iteration": 2.6677346229553223 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01058779, + "balance_loss_mlp": 1.01061893, + "epoch": 0.23105040400153906, + "flos": 556730003712.0, + "grad_norm": 0.030515141355108834, + "language_loss": 0.90571141, + "learning_rate": 0.0008977017065454853, + "loss": 0.91629916, + "num_input_tokens_seen": 99835552, + "router_z_loss_mlp": 0.48144531, + "step": 1201, + "time_per_iteration": 2.7204995155334473 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053158, + "balance_loss_mlp": 1.00506902, + "epoch": 0.23124278568680262, + "flos": 706050714624.0, + "grad_norm": 0.034769733982414605, + "language_loss": 0.81452352, + "learning_rate": 0.0008975128096796121, + "loss": 0.82505512, + "num_input_tokens_seen": 99910784, + "router_z_loss_mlp": 0.48071289, + "step": 1202, + "time_per_iteration": 2.861058473587036 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105345, + "balance_loss_mlp": 1.00517035, + "epoch": 0.23143516737206618, + "flos": 613969397760.0, + "grad_norm": 0.03845725381901349, + "language_loss": 0.86815399, + "learning_rate": 0.0008973237584855794, + "loss": 0.87868845, + "num_input_tokens_seen": 99991120, + "router_z_loss_mlp": 0.48266602, + "step": 1203, + "time_per_iteration": 2.907670021057129 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055623, + "balance_loss_mlp": 1.00715244, + "epoch": 0.23162754905732974, + "flos": 390096718080.0, + "grad_norm": 0.03680581416715809, + "language_loss": 0.82972479, + "learning_rate": 0.0008971345530367832, + "loss": 0.84028101, + "num_input_tokens_seen": 100053888, + "router_z_loss_mlp": 0.48461914, + "step": 1204, + "time_per_iteration": 2.4500131607055664 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050803, + "balance_loss_mlp": 1.00190353, + "epoch": 0.2318199307425933, + "flos": 668970116352.0, + "grad_norm": 0.03636020946200237, + "language_loss": 0.86001658, + "learning_rate": 0.0008969451934066799, + "loss": 0.87052464, + "num_input_tokens_seen": 100124176, + "router_z_loss_mlp": 0.48828125, + "step": 1205, + "time_per_iteration": 2.786860704421997 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054481, + "balance_loss_mlp": 1.00558126, + "epoch": 0.23201231242785686, + "flos": 667628658432.0, + "grad_norm": 0.042825772722853955, + "language_loss": 0.80798173, + "learning_rate": 0.0008967556796687854, + "loss": 0.81852657, + "num_input_tokens_seen": 100205296, + "router_z_loss_mlp": 0.48852539, + "step": 1206, + "time_per_iteration": 2.9043900966644287 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106058, + "balance_loss_mlp": 1.01153755, + "epoch": 0.23220469411312042, + "flos": 750095226624.0, + "grad_norm": 0.036226897286377145, + "language_loss": 0.84918714, + "learning_rate": 0.0008965660118966752, + "loss": 0.85979295, + "num_input_tokens_seen": 100279440, + "router_z_loss_mlp": 0.48974609, + "step": 1207, + "time_per_iteration": 2.8989100456237793 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054609, + "balance_loss_mlp": 1.00597119, + "epoch": 0.232397075798384, + "flos": 668262448896.0, + "grad_norm": 0.03230217319227319, + "language_loss": 0.90859735, + "learning_rate": 0.0008963761901639851, + "loss": 0.91914344, + "num_input_tokens_seen": 100354512, + "router_z_loss_mlp": 0.48632812, + "step": 1208, + "time_per_iteration": 2.801715612411499 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050654, + "balance_loss_mlp": 1.00204051, + "epoch": 0.23258945748364757, + "flos": 611346753024.0, + "grad_norm": 0.038379048380249, + "language_loss": 0.83753544, + "learning_rate": 0.0008961862145444103, + "loss": 0.84804195, + "num_input_tokens_seen": 100426848, + "router_z_loss_mlp": 0.48608398, + "step": 1209, + "time_per_iteration": 2.6739237308502197 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105504, + "balance_loss_mlp": 1.00656986, + "epoch": 0.23278183916891113, + "flos": 490672956672.0, + "grad_norm": 0.04093378826068356, + "language_loss": 0.86382735, + "learning_rate": 0.0008959960851117059, + "loss": 0.87437773, + "num_input_tokens_seen": 100496176, + "router_z_loss_mlp": 0.48461914, + "step": 1210, + "time_per_iteration": 2.635650634765625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01056695, + "balance_loss_mlp": 1.00808144, + "epoch": 0.23297422085417469, + "flos": 512674798080.0, + "grad_norm": 0.0354403494585401, + "language_loss": 0.84509313, + "learning_rate": 0.0008958058019396868, + "loss": 0.85566002, + "num_input_tokens_seen": 100575072, + "router_z_loss_mlp": 0.48608398, + "step": 1211, + "time_per_iteration": 2.788318157196045 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105326, + "balance_loss_mlp": 1.00462246, + "epoch": 0.23316660253943824, + "flos": 547532272128.0, + "grad_norm": 0.03263062148431384, + "language_loss": 0.87462825, + "learning_rate": 0.0008956153651022274, + "loss": 0.8851608, + "num_input_tokens_seen": 100648304, + "router_z_loss_mlp": 0.48608398, + "step": 1212, + "time_per_iteration": 2.725313901901245 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105588, + "balance_loss_mlp": 1.00709951, + "epoch": 0.2333589842247018, + "flos": 511289598720.0, + "grad_norm": 0.03371055024816449, + "language_loss": 0.84886169, + "learning_rate": 0.0008954247746732618, + "loss": 0.85942048, + "num_input_tokens_seen": 100717616, + "router_z_loss_mlp": 0.48754883, + "step": 1213, + "time_per_iteration": 2.592165470123291 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01057864, + "balance_loss_mlp": 1.00894058, + "epoch": 0.23355136590996536, + "flos": 664407216384.0, + "grad_norm": 0.030798488974581865, + "language_loss": 0.9124192, + "learning_rate": 0.0008952340307267837, + "loss": 0.92299783, + "num_input_tokens_seen": 100797056, + "router_z_loss_mlp": 0.48876953, + "step": 1214, + "time_per_iteration": 2.887542724609375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051914, + "balance_loss_mlp": 1.00332439, + "epoch": 0.23374374759522892, + "flos": 509465995008.0, + "grad_norm": 0.038631928770240895, + "language_loss": 0.8442086, + "learning_rate": 0.0008950431333368468, + "loss": 0.85472775, + "num_input_tokens_seen": 100863632, + "router_z_loss_mlp": 0.48583984, + "step": 1215, + "time_per_iteration": 2.5713701248168945 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051571, + "balance_loss_mlp": 1.00283849, + "epoch": 0.2339361292804925, + "flos": 1296429915648.0, + "grad_norm": 0.03446682830311694, + "language_loss": 0.8584398, + "learning_rate": 0.0008948520825775634, + "loss": 0.86895549, + "num_input_tokens_seen": 100950272, + "router_z_loss_mlp": 0.48706055, + "step": 1216, + "time_per_iteration": 3.631596565246582 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054055, + "balance_loss_mlp": 1.00541723, + "epoch": 0.23412851096575607, + "flos": 707177344512.0, + "grad_norm": 0.031791306217448204, + "language_loss": 0.84468639, + "learning_rate": 0.0008946608785231067, + "loss": 0.85522687, + "num_input_tokens_seen": 101031008, + "router_z_loss_mlp": 0.48632812, + "step": 1217, + "time_per_iteration": 2.878099203109741 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053689, + "balance_loss_mlp": 1.00517046, + "epoch": 0.23432089265101963, + "flos": 439175582208.0, + "grad_norm": 0.03486793229645632, + "language_loss": 0.85493773, + "learning_rate": 0.0008944695212477084, + "loss": 0.86547458, + "num_input_tokens_seen": 101094688, + "router_z_loss_mlp": 0.48510742, + "step": 1218, + "time_per_iteration": 2.5141704082489014 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053338, + "balance_loss_mlp": 1.00498641, + "epoch": 0.2345132743362832, + "flos": 481915574784.0, + "grad_norm": 0.03047714423600347, + "language_loss": 0.87145793, + "learning_rate": 0.0008942780108256599, + "loss": 0.88199133, + "num_input_tokens_seen": 101163744, + "router_z_loss_mlp": 0.48339844, + "step": 1219, + "time_per_iteration": 2.6020901203155518 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050373, + "balance_loss_mlp": 1.00180733, + "epoch": 0.23470565602154675, + "flos": 412341577728.0, + "grad_norm": 0.03328064907126118, + "language_loss": 0.87382472, + "learning_rate": 0.0008940863473313121, + "loss": 0.88432848, + "num_input_tokens_seen": 101226480, + "router_z_loss_mlp": 0.4855957, + "step": 1220, + "time_per_iteration": 2.4561610221862793 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053637, + "balance_loss_mlp": 1.00483322, + "epoch": 0.2348980377068103, + "flos": 546500906496.0, + "grad_norm": 0.04239569524538178, + "language_loss": 0.88751769, + "learning_rate": 0.0008938945308390756, + "loss": 0.89805412, + "num_input_tokens_seen": 101291824, + "router_z_loss_mlp": 0.48779297, + "step": 1221, + "time_per_iteration": 2.657763719558716 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01057653, + "balance_loss_mlp": 1.00906336, + "epoch": 0.23509041939207387, + "flos": 576843112704.0, + "grad_norm": 0.04482007629740174, + "language_loss": 0.88039029, + "learning_rate": 0.00089370256142342, + "loss": 0.89096677, + "num_input_tokens_seen": 101367216, + "router_z_loss_mlp": 0.48583984, + "step": 1222, + "time_per_iteration": 2.7348928451538086 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054727, + "balance_loss_mlp": 1.00616074, + "epoch": 0.23528280107733743, + "flos": 589948566528.0, + "grad_norm": 0.030112791330182954, + "language_loss": 0.85687798, + "learning_rate": 0.0008935104391588746, + "loss": 0.86742526, + "num_input_tokens_seen": 101438992, + "router_z_loss_mlp": 0.4855957, + "step": 1223, + "time_per_iteration": 2.7620511054992676 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052122, + "balance_loss_mlp": 1.00350857, + "epoch": 0.235475182762601, + "flos": 824858132736.0, + "grad_norm": 0.028710207733723417, + "language_loss": 0.83630896, + "learning_rate": 0.0008933181641200276, + "loss": 0.84683019, + "num_input_tokens_seen": 101534464, + "router_z_loss_mlp": 0.48608398, + "step": 1224, + "time_per_iteration": 3.1587913036346436 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053568, + "balance_loss_mlp": 1.00531197, + "epoch": 0.23566756444786457, + "flos": 681367902720.0, + "grad_norm": 0.03430983930689064, + "language_loss": 0.86561936, + "learning_rate": 0.0008931257363815271, + "loss": 0.87615514, + "num_input_tokens_seen": 101616496, + "router_z_loss_mlp": 0.48242188, + "step": 1225, + "time_per_iteration": 2.9277396202087402 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01056611, + "balance_loss_mlp": 1.00849795, + "epoch": 0.23585994613312813, + "flos": 703135474176.0, + "grad_norm": 0.029906055234585397, + "language_loss": 0.90256047, + "learning_rate": 0.0008929331560180798, + "loss": 0.91312659, + "num_input_tokens_seen": 101694496, + "router_z_loss_mlp": 0.48095703, + "step": 1226, + "time_per_iteration": 2.911451578140259 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055064, + "balance_loss_mlp": 1.00676012, + "epoch": 0.2360523278183917, + "flos": 525196038912.0, + "grad_norm": 0.030679819106685022, + "language_loss": 0.9186613, + "learning_rate": 0.0008927404231044525, + "loss": 0.92921197, + "num_input_tokens_seen": 101766160, + "router_z_loss_mlp": 0.48291016, + "step": 1227, + "time_per_iteration": 2.6848785877227783 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055842, + "balance_loss_mlp": 1.00756276, + "epoch": 0.23624470950365525, + "flos": 525443914752.0, + "grad_norm": 0.030207709240370546, + "language_loss": 0.82286787, + "learning_rate": 0.0008925475377154703, + "loss": 0.83342624, + "num_input_tokens_seen": 101844160, + "router_z_loss_mlp": 0.48266602, + "step": 1228, + "time_per_iteration": 2.7278709411621094 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01058098, + "balance_loss_mlp": 1.00974643, + "epoch": 0.2364370911889188, + "flos": 597961342464.0, + "grad_norm": 0.04301213480645635, + "language_loss": 0.82405227, + "learning_rate": 0.0008923544999260183, + "loss": 0.83463323, + "num_input_tokens_seen": 101917968, + "router_z_loss_mlp": 0.48339844, + "step": 1229, + "time_per_iteration": 2.7282724380493164 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055873, + "balance_loss_mlp": 1.00766432, + "epoch": 0.23662947287418237, + "flos": 758173131264.0, + "grad_norm": 0.03660169780759576, + "language_loss": 0.92488217, + "learning_rate": 0.00089216130981104, + "loss": 0.93544096, + "num_input_tokens_seen": 101996880, + "router_z_loss_mlp": 0.48193359, + "step": 1230, + "time_per_iteration": 3.0333714485168457 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051461, + "balance_loss_mlp": 1.00337219, + "epoch": 0.23682185455944593, + "flos": 547208573952.0, + "grad_norm": 0.03138155314794734, + "language_loss": 0.83336782, + "learning_rate": 0.000891967967445539, + "loss": 0.8438825, + "num_input_tokens_seen": 102067936, + "router_z_loss_mlp": 0.48071289, + "step": 1231, + "time_per_iteration": 2.7093472480773926 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053937, + "balance_loss_mlp": 1.00587165, + "epoch": 0.2370142362447095, + "flos": 663523604736.0, + "grad_norm": 0.02795314572038805, + "language_loss": 0.89439881, + "learning_rate": 0.0008917744729045772, + "loss": 0.90493822, + "num_input_tokens_seen": 102147552, + "router_z_loss_mlp": 0.48046875, + "step": 1232, + "time_per_iteration": 2.8760838508605957 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01057809, + "balance_loss_mlp": 1.00974393, + "epoch": 0.23720661792997308, + "flos": 684913042944.0, + "grad_norm": 0.03460859048974857, + "language_loss": 0.8446126, + "learning_rate": 0.0008915808262632757, + "loss": 0.85519075, + "num_input_tokens_seen": 102224480, + "router_z_loss_mlp": 0.48046875, + "step": 1233, + "time_per_iteration": 2.889141321182251 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01058483, + "balance_loss_mlp": 1.01058459, + "epoch": 0.23739899961523664, + "flos": 560023377408.0, + "grad_norm": 0.03296017154749467, + "language_loss": 0.94079709, + "learning_rate": 0.0008913870275968148, + "loss": 0.95138192, + "num_input_tokens_seen": 102297392, + "router_z_loss_mlp": 0.47875977, + "step": 1234, + "time_per_iteration": 2.7432892322540283 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054598, + "balance_loss_mlp": 1.00655627, + "epoch": 0.2375913813005002, + "flos": 891165000960.0, + "grad_norm": 0.03128077017401229, + "language_loss": 0.88428569, + "learning_rate": 0.0008911930769804342, + "loss": 0.89483166, + "num_input_tokens_seen": 102386032, + "router_z_loss_mlp": 0.48022461, + "step": 1235, + "time_per_iteration": 3.261483669281006 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105528, + "balance_loss_mlp": 1.00692844, + "epoch": 0.23778376298576376, + "flos": 642366491136.0, + "grad_norm": 0.029107844015886564, + "language_loss": 0.91850013, + "learning_rate": 0.0008909989744894318, + "loss": 0.92905295, + "num_input_tokens_seen": 102463504, + "router_z_loss_mlp": 0.48339844, + "step": 1236, + "time_per_iteration": 2.8673832416534424 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061412, + "balance_loss_mlp": 1.01287031, + "epoch": 0.23797614467102732, + "flos": 617946139392.0, + "grad_norm": 0.034095811880077646, + "language_loss": 0.82566786, + "learning_rate": 0.0008908047201991649, + "loss": 0.83628196, + "num_input_tokens_seen": 102529632, + "router_z_loss_mlp": 0.48535156, + "step": 1237, + "time_per_iteration": 2.7810442447662354 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053588, + "balance_loss_mlp": 1.00511789, + "epoch": 0.23816852635629088, + "flos": 625464130560.0, + "grad_norm": 0.032663011960307756, + "language_loss": 0.87081301, + "learning_rate": 0.0008906103141850502, + "loss": 0.88134885, + "num_input_tokens_seen": 102610192, + "router_z_loss_mlp": 0.48461914, + "step": 1238, + "time_per_iteration": 2.880305528640747 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052354, + "balance_loss_mlp": 1.00416911, + "epoch": 0.23836090804155444, + "flos": 522441191424.0, + "grad_norm": 0.03474425243888252, + "language_loss": 0.88862967, + "learning_rate": 0.0008904157565225621, + "loss": 0.89915323, + "num_input_tokens_seen": 102681216, + "router_z_loss_mlp": 0.48168945, + "step": 1239, + "time_per_iteration": 2.648766040802002 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052394, + "balance_loss_mlp": 1.00423324, + "epoch": 0.238553289726818, + "flos": 1155855892992.0, + "grad_norm": 0.034399895266541865, + "language_loss": 0.82445645, + "learning_rate": 0.000890221047287235, + "loss": 0.83498037, + "num_input_tokens_seen": 102777184, + "router_z_loss_mlp": 0.48144531, + "step": 1240, + "time_per_iteration": 3.5001280307769775 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055, + "balance_loss_mlp": 1.00703037, + "epoch": 0.23874567141208156, + "flos": 500910802176.0, + "grad_norm": 0.03306053891413694, + "language_loss": 0.91726851, + "learning_rate": 0.0008900261865546615, + "loss": 0.92781848, + "num_input_tokens_seen": 102845744, + "router_z_loss_mlp": 0.47949219, + "step": 1241, + "time_per_iteration": 2.6465680599212646 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052437, + "balance_loss_mlp": 1.00418115, + "epoch": 0.23893805309734514, + "flos": 558050074368.0, + "grad_norm": 0.0354259641755878, + "language_loss": 0.85598528, + "learning_rate": 0.0008898311744004936, + "loss": 0.86650962, + "num_input_tokens_seen": 102918064, + "router_z_loss_mlp": 0.48242188, + "step": 1242, + "time_per_iteration": 2.7268829345703125 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053623, + "balance_loss_mlp": 1.0055337, + "epoch": 0.2391304347826087, + "flos": 550317255168.0, + "grad_norm": 0.0320494810853186, + "language_loss": 0.87574649, + "learning_rate": 0.0008896360109004414, + "loss": 0.88628268, + "num_input_tokens_seen": 102983920, + "router_z_loss_mlp": 0.48071289, + "step": 1243, + "time_per_iteration": 2.6199252605438232 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050337, + "balance_loss_mlp": 1.00222456, + "epoch": 0.23932281646787226, + "flos": 517079250432.0, + "grad_norm": 0.0302458656306059, + "language_loss": 0.85177696, + "learning_rate": 0.0008894406961302742, + "loss": 0.86228031, + "num_input_tokens_seen": 103053328, + "router_z_loss_mlp": 0.48095703, + "step": 1244, + "time_per_iteration": 2.604508876800537 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052152, + "balance_loss_mlp": 1.00411069, + "epoch": 0.23951519815313582, + "flos": 745002548736.0, + "grad_norm": 0.03429303167053761, + "language_loss": 0.84712255, + "learning_rate": 0.0008892452301658201, + "loss": 0.85764414, + "num_input_tokens_seen": 103128208, + "router_z_loss_mlp": 0.48022461, + "step": 1245, + "time_per_iteration": 2.924288272857666 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054436, + "balance_loss_mlp": 1.00651395, + "epoch": 0.23970757983839938, + "flos": 555175663104.0, + "grad_norm": 0.03219666617279603, + "language_loss": 0.84054452, + "learning_rate": 0.0008890496130829653, + "loss": 0.85108888, + "num_input_tokens_seen": 103197392, + "router_z_loss_mlp": 0.47900391, + "step": 1246, + "time_per_iteration": 2.6700189113616943 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052243, + "balance_loss_mlp": 1.00441635, + "epoch": 0.23989996152366294, + "flos": 481618121472.0, + "grad_norm": 0.033578246726411604, + "language_loss": 0.86002076, + "learning_rate": 0.0008888538449576555, + "loss": 0.87054318, + "num_input_tokens_seen": 103265328, + "router_z_loss_mlp": 0.47802734, + "step": 1247, + "time_per_iteration": 2.6269826889038086 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01057022, + "balance_loss_mlp": 1.00886118, + "epoch": 0.2400923432089265, + "flos": 486281143296.0, + "grad_norm": 0.03580496599340432, + "language_loss": 0.83572984, + "learning_rate": 0.0008886579258658944, + "loss": 0.84630001, + "num_input_tokens_seen": 103331632, + "router_z_loss_mlp": 0.48144531, + "step": 1248, + "time_per_iteration": 2.577885389328003 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054529, + "balance_loss_mlp": 1.0065589, + "epoch": 0.24028472489419006, + "flos": 624793401600.0, + "grad_norm": 0.03296142515540601, + "language_loss": 0.85843956, + "learning_rate": 0.0008884618558837446, + "loss": 0.86898482, + "num_input_tokens_seen": 103405408, + "router_z_loss_mlp": 0.47949219, + "step": 1249, + "time_per_iteration": 2.874666929244995 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01056867, + "balance_loss_mlp": 1.00870681, + "epoch": 0.24047710657945365, + "flos": 602809056768.0, + "grad_norm": 0.033943651692576245, + "language_loss": 0.87474859, + "learning_rate": 0.0008882656350873273, + "loss": 0.88531733, + "num_input_tokens_seen": 103487216, + "router_z_loss_mlp": 0.48144531, + "step": 1250, + "time_per_iteration": 2.8647053241729736 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055385, + "balance_loss_mlp": 1.00748658, + "epoch": 0.2406694882647172, + "flos": 843001829376.0, + "grad_norm": 0.04142560607115463, + "language_loss": 0.87984931, + "learning_rate": 0.0008880692635528219, + "loss": 0.89040315, + "num_input_tokens_seen": 103568640, + "router_z_loss_mlp": 0.47875977, + "step": 1251, + "time_per_iteration": 3.0643107891082764 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105149, + "balance_loss_mlp": 1.00352037, + "epoch": 0.24086186994998077, + "flos": 528135578880.0, + "grad_norm": 0.03337559285192523, + "language_loss": 0.90356189, + "learning_rate": 0.0008878727413564669, + "loss": 0.91407681, + "num_input_tokens_seen": 103640784, + "router_z_loss_mlp": 0.47949219, + "step": 1252, + "time_per_iteration": 2.7680115699768066 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053848, + "balance_loss_mlp": 1.00826263, + "epoch": 0.24105425163524433, + "flos": 1341462028800.0, + "grad_norm": 0.009196650126926217, + "language_loss": 0.80135596, + "learning_rate": 0.0008876760685745588, + "loss": 0.81189448, + "num_input_tokens_seen": 103865824, + "router_z_loss_mlp": 0.45507812, + "step": 1253, + "time_per_iteration": 4.858070135116577 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054767, + "balance_loss_mlp": 1.00698781, + "epoch": 0.24124663332050789, + "flos": 615228230400.0, + "grad_norm": 0.036740782431925904, + "language_loss": 0.79496801, + "learning_rate": 0.0008874792452834528, + "loss": 0.80551577, + "num_input_tokens_seen": 103939872, + "router_z_loss_mlp": 0.47753906, + "step": 1254, + "time_per_iteration": 2.756243944168091 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01057417, + "balance_loss_mlp": 1.00954247, + "epoch": 0.24143901500577145, + "flos": 576593291520.0, + "grad_norm": 0.037714132300224086, + "language_loss": 0.87880921, + "learning_rate": 0.0008872822715595626, + "loss": 0.88938332, + "num_input_tokens_seen": 104011120, + "router_z_loss_mlp": 0.47851562, + "step": 1255, + "time_per_iteration": 2.6718733310699463 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01056118, + "balance_loss_mlp": 1.00812411, + "epoch": 0.241631396691035, + "flos": 496147658496.0, + "grad_norm": 0.038695693582970765, + "language_loss": 0.87873089, + "learning_rate": 0.0008870851474793598, + "loss": 0.88929206, + "num_input_tokens_seen": 104077040, + "router_z_loss_mlp": 0.47973633, + "step": 1256, + "time_per_iteration": 2.6313350200653076 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01058027, + "balance_loss_mlp": 1.009866, + "epoch": 0.24182377837629856, + "flos": 637397267712.0, + "grad_norm": 0.03630749648984725, + "language_loss": 0.904266, + "learning_rate": 0.0008868878731193752, + "loss": 0.9148463, + "num_input_tokens_seen": 104150880, + "router_z_loss_mlp": 0.48144531, + "step": 1257, + "time_per_iteration": 2.820671558380127 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052967, + "balance_loss_mlp": 1.00509274, + "epoch": 0.24201616006156215, + "flos": 516350195712.0, + "grad_norm": 0.04098435374075245, + "language_loss": 0.90631104, + "learning_rate": 0.0008866904485561973, + "loss": 0.91684067, + "num_input_tokens_seen": 104223696, + "router_z_loss_mlp": 0.47851562, + "step": 1258, + "time_per_iteration": 2.712970495223999 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053405, + "balance_loss_mlp": 1.0053165, + "epoch": 0.2422085417468257, + "flos": 616379159808.0, + "grad_norm": 0.03199149634406808, + "language_loss": 0.83463258, + "learning_rate": 0.000886492873866473, + "loss": 0.84516662, + "num_input_tokens_seen": 104301728, + "router_z_loss_mlp": 0.48071289, + "step": 1259, + "time_per_iteration": 2.8250985145568848 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051296, + "balance_loss_mlp": 1.00330269, + "epoch": 0.24240092343208927, + "flos": 586913762304.0, + "grad_norm": 0.03973618931504764, + "language_loss": 0.85183978, + "learning_rate": 0.000886295149126908, + "loss": 0.86235273, + "num_input_tokens_seen": 104374480, + "router_z_loss_mlp": 0.47973633, + "step": 1260, + "time_per_iteration": 2.7110049724578857 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051234, + "balance_loss_mlp": 1.00338328, + "epoch": 0.24259330511735283, + "flos": 763572010752.0, + "grad_norm": 0.03275678482299809, + "language_loss": 0.86485362, + "learning_rate": 0.0008860972744142655, + "loss": 0.87536597, + "num_input_tokens_seen": 104452384, + "router_z_loss_mlp": 0.47827148, + "step": 1261, + "time_per_iteration": 2.9053289890289307 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051398, + "balance_loss_mlp": 1.00361907, + "epoch": 0.2427856868026164, + "flos": 628134407424.0, + "grad_norm": 0.03196094686024711, + "language_loss": 0.82455611, + "learning_rate": 0.0008858992498053671, + "loss": 0.83507007, + "num_input_tokens_seen": 104532576, + "router_z_loss_mlp": 0.47753906, + "step": 1262, + "time_per_iteration": 2.8111376762390137 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054321, + "balance_loss_mlp": 1.00797272, + "epoch": 0.24297806848787995, + "flos": 1514922167808.0, + "grad_norm": 0.010120346862694057, + "language_loss": 0.7658875, + "learning_rate": 0.0008857010753770934, + "loss": 0.77643073, + "num_input_tokens_seen": 104765216, + "router_z_loss_mlp": 0.46289062, + "step": 1263, + "time_per_iteration": 4.84857177734375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052202, + "balance_loss_mlp": 1.00420785, + "epoch": 0.2431704501731435, + "flos": 543073384704.0, + "grad_norm": 0.030775668427347653, + "language_loss": 0.83837479, + "learning_rate": 0.0008855027512063817, + "loss": 0.84889686, + "num_input_tokens_seen": 104836912, + "router_z_loss_mlp": 0.47973633, + "step": 1264, + "time_per_iteration": 2.69954252243042 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055608, + "balance_loss_mlp": 1.0077095, + "epoch": 0.24336283185840707, + "flos": 524879143680.0, + "grad_norm": 0.03906981412635217, + "language_loss": 0.86655742, + "learning_rate": 0.0008853042773702292, + "loss": 0.87711346, + "num_input_tokens_seen": 104909280, + "router_z_loss_mlp": 0.47875977, + "step": 1265, + "time_per_iteration": 2.703227996826172 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053016, + "balance_loss_mlp": 1.00530863, + "epoch": 0.24355521354367063, + "flos": 538206228480.0, + "grad_norm": 0.030917867079500824, + "language_loss": 0.88497615, + "learning_rate": 0.0008851056539456896, + "loss": 0.89550632, + "num_input_tokens_seen": 104982560, + "router_z_loss_mlp": 0.47680664, + "step": 1266, + "time_per_iteration": 2.6844840049743652 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054186, + "balance_loss_mlp": 1.00655031, + "epoch": 0.24374759522893422, + "flos": 932109580032.0, + "grad_norm": 0.032880300158599975, + "language_loss": 0.82697207, + "learning_rate": 0.0008849068810098755, + "loss": 0.83751392, + "num_input_tokens_seen": 105075056, + "router_z_loss_mlp": 0.47607422, + "step": 1267, + "time_per_iteration": 3.274641513824463 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055314, + "balance_loss_mlp": 1.00789249, + "epoch": 0.24393997691419778, + "flos": 428685970176.0, + "grad_norm": 0.04273651221625489, + "language_loss": 0.84108871, + "learning_rate": 0.0008847079586399575, + "loss": 0.85164183, + "num_input_tokens_seen": 105137536, + "router_z_loss_mlp": 0.47387695, + "step": 1268, + "time_per_iteration": 2.475217819213867 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01057264, + "balance_loss_mlp": 1.00993788, + "epoch": 0.24413235859946134, + "flos": 579943045632.0, + "grad_norm": 0.03463136192779687, + "language_loss": 0.86878628, + "learning_rate": 0.0008845088869131641, + "loss": 0.87935889, + "num_input_tokens_seen": 105204848, + "router_z_loss_mlp": 0.47290039, + "step": 1269, + "time_per_iteration": 2.676954746246338 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054221, + "balance_loss_mlp": 1.00689447, + "epoch": 0.2443247402847249, + "flos": 530901120000.0, + "grad_norm": 0.04739098518835349, + "language_loss": 0.8972156, + "learning_rate": 0.0008843096659067818, + "loss": 0.90775776, + "num_input_tokens_seen": 105273456, + "router_z_loss_mlp": 0.47290039, + "step": 1270, + "time_per_iteration": 2.6031625270843506 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01056384, + "balance_loss_mlp": 1.00896251, + "epoch": 0.24451712196998845, + "flos": 697625779200.0, + "grad_norm": 0.03005687387855686, + "language_loss": 0.8676796, + "learning_rate": 0.000884110295698155, + "loss": 0.87824345, + "num_input_tokens_seen": 105355488, + "router_z_loss_mlp": 0.47387695, + "step": 1271, + "time_per_iteration": 2.946385145187378 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052146, + "balance_loss_mlp": 1.00460577, + "epoch": 0.24470950365525201, + "flos": 530864181504.0, + "grad_norm": 0.03542850047119753, + "language_loss": 0.86657912, + "learning_rate": 0.0008839107763646861, + "loss": 0.87710059, + "num_input_tokens_seen": 105421568, + "router_z_loss_mlp": 0.47509766, + "step": 1272, + "time_per_iteration": 2.6175343990325928 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01057047, + "balance_loss_mlp": 1.00955379, + "epoch": 0.24490188534051557, + "flos": 492348806400.0, + "grad_norm": 0.04294337139782129, + "language_loss": 0.9099223, + "learning_rate": 0.0008837111079838353, + "loss": 0.92049271, + "num_input_tokens_seen": 105493072, + "router_z_loss_mlp": 0.47460938, + "step": 1273, + "time_per_iteration": 2.699777126312256 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051504, + "balance_loss_mlp": 1.00393975, + "epoch": 0.24509426702577913, + "flos": 475112054016.0, + "grad_norm": 0.03233839715385124, + "language_loss": 0.90686411, + "learning_rate": 0.000883511290633121, + "loss": 0.91737914, + "num_input_tokens_seen": 105559840, + "router_z_loss_mlp": 0.4753418, + "step": 1274, + "time_per_iteration": 2.5347506999969482 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053736, + "balance_loss_mlp": 1.0061239, + "epoch": 0.24528664871104272, + "flos": 551648019456.0, + "grad_norm": 0.029596958484994024, + "language_loss": 0.9283247, + "learning_rate": 0.000883311324390119, + "loss": 0.93886209, + "num_input_tokens_seen": 105634448, + "router_z_loss_mlp": 0.47583008, + "step": 1275, + "time_per_iteration": 2.7105162143707275 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105458, + "balance_loss_mlp": 1.00703931, + "epoch": 0.24547903039630628, + "flos": 827336914176.0, + "grad_norm": 0.04026092464880397, + "language_loss": 0.8227402, + "learning_rate": 0.0008831112093324629, + "loss": 0.83328599, + "num_input_tokens_seen": 105711936, + "router_z_loss_mlp": 0.47509766, + "step": 1276, + "time_per_iteration": 3.0518436431884766 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052804, + "balance_loss_mlp": 1.00523984, + "epoch": 0.24567141208156984, + "flos": 592694665728.0, + "grad_norm": 0.0350541873914122, + "language_loss": 0.89993191, + "learning_rate": 0.0008829109455378444, + "loss": 0.91045994, + "num_input_tokens_seen": 105780240, + "router_z_loss_mlp": 0.4753418, + "step": 1277, + "time_per_iteration": 2.705888032913208 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053461, + "balance_loss_mlp": 1.00606322, + "epoch": 0.2458637937668334, + "flos": 548930110464.0, + "grad_norm": 0.03225743101348484, + "language_loss": 0.87107539, + "learning_rate": 0.000882710533084013, + "loss": 0.88161004, + "num_input_tokens_seen": 105849840, + "router_z_loss_mlp": 0.47363281, + "step": 1278, + "time_per_iteration": 2.6600000858306885 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051675, + "balance_loss_mlp": 1.00418186, + "epoch": 0.24605617545209696, + "flos": 516912054528.0, + "grad_norm": 0.031446449457072034, + "language_loss": 0.89965951, + "learning_rate": 0.0008825099720487755, + "loss": 0.91017628, + "num_input_tokens_seen": 105921488, + "router_z_loss_mlp": 0.47460938, + "step": 1279, + "time_per_iteration": 2.6381545066833496 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059078, + "balance_loss_mlp": 1.01320648, + "epoch": 0.24624855713736052, + "flos": 1515061173504.0, + "grad_norm": 0.006597619453236458, + "language_loss": 0.7526114, + "learning_rate": 0.0008823092625099967, + "loss": 0.76320213, + "num_input_tokens_seen": 106146816, + "router_z_loss_mlp": 0.45800781, + "step": 1280, + "time_per_iteration": 4.836413621902466 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01056755, + "balance_loss_mlp": 1.0109787, + "epoch": 0.24644093882262408, + "flos": 1530749421312.0, + "grad_norm": 0.006438131933853504, + "language_loss": 0.77944112, + "learning_rate": 0.0008821084045455987, + "loss": 0.79000866, + "num_input_tokens_seen": 106361568, + "router_z_loss_mlp": 0.45703125, + "step": 1281, + "time_per_iteration": 4.763012409210205 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055673, + "balance_loss_mlp": 1.00817966, + "epoch": 0.24663332050788764, + "flos": 660349794816.0, + "grad_norm": 0.03366863359794558, + "language_loss": 0.89743239, + "learning_rate": 0.0008819073982335619, + "loss": 0.90798908, + "num_input_tokens_seen": 106435296, + "router_z_loss_mlp": 0.47460938, + "step": 1282, + "time_per_iteration": 2.830066204071045 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051734, + "balance_loss_mlp": 1.00426519, + "epoch": 0.24682570219315123, + "flos": 542806066944.0, + "grad_norm": 0.034270358372240205, + "language_loss": 0.85323066, + "learning_rate": 0.0008817062436519235, + "loss": 0.86374807, + "num_input_tokens_seen": 106507184, + "router_z_loss_mlp": 0.47436523, + "step": 1283, + "time_per_iteration": 2.6451101303100586 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054467, + "balance_loss_mlp": 1.00680709, + "epoch": 0.24701808387841478, + "flos": 441659221248.0, + "grad_norm": 0.03422998600893363, + "language_loss": 0.90367711, + "learning_rate": 0.0008815049408787788, + "loss": 0.91422176, + "num_input_tokens_seen": 106571472, + "router_z_loss_mlp": 0.47631836, + "step": 1284, + "time_per_iteration": 2.5568699836730957 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054482, + "balance_loss_mlp": 1.00672722, + "epoch": 0.24721046556367834, + "flos": 469033697280.0, + "grad_norm": 0.036620952447016124, + "language_loss": 0.86045629, + "learning_rate": 0.0008813034899922805, + "loss": 0.87100112, + "num_input_tokens_seen": 106638368, + "router_z_loss_mlp": 0.47729492, + "step": 1285, + "time_per_iteration": 2.5571885108947754 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052621, + "balance_loss_mlp": 1.00498545, + "epoch": 0.2474028472489419, + "flos": 505408573440.0, + "grad_norm": 0.03938899634346209, + "language_loss": 0.90811062, + "learning_rate": 0.0008811018910706387, + "loss": 0.91863692, + "num_input_tokens_seen": 106705312, + "router_z_loss_mlp": 0.47607422, + "step": 1286, + "time_per_iteration": 2.5542702674865723 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105262, + "balance_loss_mlp": 1.00496054, + "epoch": 0.24759522893420546, + "flos": 480956140800.0, + "grad_norm": 0.04329385189604929, + "language_loss": 0.82886434, + "learning_rate": 0.0008809001441921211, + "loss": 0.83939052, + "num_input_tokens_seen": 106778624, + "router_z_loss_mlp": 0.47631836, + "step": 1287, + "time_per_iteration": 2.7426302433013916 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01056359, + "balance_loss_mlp": 1.00879443, + "epoch": 0.24778761061946902, + "flos": 534754407168.0, + "grad_norm": 0.03495005483538565, + "language_loss": 0.86372733, + "learning_rate": 0.0008806982494350528, + "loss": 0.87429094, + "num_input_tokens_seen": 106847744, + "router_z_loss_mlp": 0.4753418, + "step": 1288, + "time_per_iteration": 2.6200613975524902 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054037, + "balance_loss_mlp": 1.0063771, + "epoch": 0.24797999230473258, + "flos": 560943927552.0, + "grad_norm": 0.028534619779485338, + "language_loss": 0.90820038, + "learning_rate": 0.0008804962068778161, + "loss": 0.91874075, + "num_input_tokens_seen": 106927584, + "router_z_loss_mlp": 0.47631836, + "step": 1289, + "time_per_iteration": 2.8445866107940674 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050603, + "balance_loss_mlp": 1.00287127, + "epoch": 0.24817237398999614, + "flos": 625481627136.0, + "grad_norm": 0.033144052318390974, + "language_loss": 0.81476247, + "learning_rate": 0.0008802940165988511, + "loss": 0.82526851, + "num_input_tokens_seen": 107006656, + "router_z_loss_mlp": 0.47705078, + "step": 1290, + "time_per_iteration": 2.874469518661499 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052663, + "balance_loss_mlp": 1.00500298, + "epoch": 0.2483647556752597, + "flos": 613485306624.0, + "grad_norm": 0.033485904546120666, + "language_loss": 0.88976955, + "learning_rate": 0.000880091678676655, + "loss": 0.90029621, + "num_input_tokens_seen": 107084352, + "router_z_loss_mlp": 0.47631836, + "step": 1291, + "time_per_iteration": 2.8294923305511475 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049235, + "balance_loss_mlp": 1.00159943, + "epoch": 0.2485571373605233, + "flos": 584688692736.0, + "grad_norm": 0.030875088012072577, + "language_loss": 0.89826584, + "learning_rate": 0.0008798891931897821, + "loss": 0.90875816, + "num_input_tokens_seen": 107158368, + "router_z_loss_mlp": 0.47607422, + "step": 1292, + "time_per_iteration": 2.7068471908569336 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050902, + "balance_loss_mlp": 1.00359952, + "epoch": 0.24874951904578685, + "flos": 495737444352.0, + "grad_norm": 0.03670876005724945, + "language_loss": 0.84959131, + "learning_rate": 0.0008796865602168447, + "loss": 0.86010033, + "num_input_tokens_seen": 107224256, + "router_z_loss_mlp": 0.47265625, + "step": 1293, + "time_per_iteration": 2.550218343734741 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052588, + "balance_loss_mlp": 1.00526226, + "epoch": 0.2489419007310504, + "flos": 457174437120.0, + "grad_norm": 0.03243940706171699, + "language_loss": 0.89144397, + "learning_rate": 0.0008794837798365115, + "loss": 0.90196991, + "num_input_tokens_seen": 107292720, + "router_z_loss_mlp": 0.47290039, + "step": 1294, + "time_per_iteration": 2.6271979808807373 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051557, + "balance_loss_mlp": 1.00420678, + "epoch": 0.24913428241631397, + "flos": 486565957632.0, + "grad_norm": 0.03268946967982851, + "language_loss": 0.89255542, + "learning_rate": 0.0008792808521275089, + "loss": 0.90307105, + "num_input_tokens_seen": 107368576, + "router_z_loss_mlp": 0.47314453, + "step": 1295, + "time_per_iteration": 2.733107566833496 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052983, + "balance_loss_mlp": 1.00544262, + "epoch": 0.24932666410157753, + "flos": 519918668544.0, + "grad_norm": 0.031266052737173484, + "language_loss": 0.88015056, + "learning_rate": 0.0008790777771686206, + "loss": 0.89068043, + "num_input_tokens_seen": 107433856, + "router_z_loss_mlp": 0.47509766, + "step": 1296, + "time_per_iteration": 2.5860161781311035 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053628, + "balance_loss_mlp": 1.0059917, + "epoch": 0.2495190457868411, + "flos": 473557713408.0, + "grad_norm": 0.03428757295266267, + "language_loss": 0.86048388, + "learning_rate": 0.0008788745550386872, + "loss": 0.8710202, + "num_input_tokens_seen": 107500944, + "router_z_loss_mlp": 0.47607422, + "step": 1297, + "time_per_iteration": 2.599851608276367 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055496, + "balance_loss_mlp": 1.00776434, + "epoch": 0.24971142747210465, + "flos": 747199428096.0, + "grad_norm": 0.03345883603952397, + "language_loss": 0.80858141, + "learning_rate": 0.0008786711858166063, + "loss": 0.81913638, + "num_input_tokens_seen": 107580000, + "router_z_loss_mlp": 0.47705078, + "step": 1298, + "time_per_iteration": 2.9357736110687256 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055455, + "balance_loss_mlp": 1.00770009, + "epoch": 0.2499038091573682, + "flos": 750903015936.0, + "grad_norm": 0.03503874681650984, + "language_loss": 0.84951854, + "learning_rate": 0.0008784676695813332, + "loss": 0.86007309, + "num_input_tokens_seen": 107660384, + "router_z_loss_mlp": 0.47729492, + "step": 1299, + "time_per_iteration": 2.955172538757324 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055374, + "balance_loss_mlp": 1.00776184, + "epoch": 0.2500961908426318, + "flos": 746344006656.0, + "grad_norm": 0.032686560936085865, + "language_loss": 0.85840905, + "learning_rate": 0.0008782640064118796, + "loss": 0.86896276, + "num_input_tokens_seen": 107736320, + "router_z_loss_mlp": 0.47583008, + "step": 1300, + "time_per_iteration": 2.897998571395874 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055206, + "balance_loss_mlp": 1.00904846, + "epoch": 0.2502885725278953, + "flos": 1420526353152.0, + "grad_norm": 0.0075534145797937526, + "language_loss": 0.7618475, + "learning_rate": 0.0008780601963873149, + "loss": 0.77239954, + "num_input_tokens_seen": 107972608, + "router_z_loss_mlp": 0.4609375, + "step": 1301, + "time_per_iteration": 5.023081541061401 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105668, + "balance_loss_mlp": 1.00904393, + "epoch": 0.2504809542131589, + "flos": 516232577280.0, + "grad_norm": 0.03748206036604932, + "language_loss": 0.87484509, + "learning_rate": 0.0008778562395867648, + "loss": 0.88541192, + "num_input_tokens_seen": 108043312, + "router_z_loss_mlp": 0.47607422, + "step": 1302, + "time_per_iteration": 2.593972682952881 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105251, + "balance_loss_mlp": 1.00477886, + "epoch": 0.25067333589842244, + "flos": 526852446720.0, + "grad_norm": 0.031223058919554587, + "language_loss": 0.84117836, + "learning_rate": 0.0008776521360894127, + "loss": 0.85170352, + "num_input_tokens_seen": 108114144, + "router_z_loss_mlp": 0.47705078, + "step": 1303, + "time_per_iteration": 2.6153149604797363 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069588, + "balance_loss_mlp": 1.02342987, + "epoch": 0.25086571758368603, + "flos": 1477160146944.0, + "grad_norm": 0.014969332736355754, + "language_loss": 0.78962064, + "learning_rate": 0.0008774478859744984, + "loss": 0.80031657, + "num_input_tokens_seen": 108338720, + "router_z_loss_mlp": 0.4609375, + "step": 1304, + "time_per_iteration": 4.792739629745483 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053769, + "balance_loss_mlp": 1.00649047, + "epoch": 0.2510580992689496, + "flos": 529403159808.0, + "grad_norm": 0.03453306909815573, + "language_loss": 0.91369265, + "learning_rate": 0.0008772434893213186, + "loss": 0.92423034, + "num_input_tokens_seen": 108405456, + "router_z_loss_mlp": 0.47241211, + "step": 1305, + "time_per_iteration": 2.581268072128296 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01056214, + "balance_loss_mlp": 1.00919807, + "epoch": 0.25125048095421315, + "flos": 518466395136.0, + "grad_norm": 0.035319884850533015, + "language_loss": 0.84733635, + "learning_rate": 0.0008770389462092276, + "loss": 0.85789847, + "num_input_tokens_seen": 108474368, + "router_z_loss_mlp": 0.46972656, + "step": 1306, + "time_per_iteration": 2.627317428588867 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01056702, + "balance_loss_mlp": 1.00951862, + "epoch": 0.25144286263947674, + "flos": 621675972096.0, + "grad_norm": 0.03558379494917989, + "language_loss": 0.87486076, + "learning_rate": 0.0008768342567176357, + "loss": 0.88542777, + "num_input_tokens_seen": 108548864, + "router_z_loss_mlp": 0.47143555, + "step": 1307, + "time_per_iteration": 2.787318706512451 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052534, + "balance_loss_mlp": 1.00537527, + "epoch": 0.25163524432474027, + "flos": 504866156544.0, + "grad_norm": 0.03616031366836922, + "language_loss": 0.9109531, + "learning_rate": 0.0008766294209260107, + "loss": 0.92147839, + "num_input_tokens_seen": 108623072, + "router_z_loss_mlp": 0.47119141, + "step": 1308, + "time_per_iteration": 2.6384546756744385 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105248, + "balance_loss_mlp": 1.00510657, + "epoch": 0.25182762601000386, + "flos": 510080343552.0, + "grad_norm": 0.03702737725286332, + "language_loss": 0.92033225, + "learning_rate": 0.0008764244389138767, + "loss": 0.93085706, + "num_input_tokens_seen": 108690128, + "router_z_loss_mlp": 0.47338867, + "step": 1309, + "time_per_iteration": 2.5620551109313965 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053435, + "balance_loss_mlp": 1.006037, + "epoch": 0.2520200076952674, + "flos": 635098321152.0, + "grad_norm": 0.03928250470986306, + "language_loss": 0.83104628, + "learning_rate": 0.000876219310760815, + "loss": 0.84158063, + "num_input_tokens_seen": 108770272, + "router_z_loss_mlp": 0.47363281, + "step": 1310, + "time_per_iteration": 2.886335849761963 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053423, + "balance_loss_mlp": 1.00614405, + "epoch": 0.252212389380531, + "flos": 495652873728.0, + "grad_norm": 0.03544669215118347, + "language_loss": 0.82256365, + "learning_rate": 0.0008760140365464631, + "loss": 0.83309782, + "num_input_tokens_seen": 108840592, + "router_z_loss_mlp": 0.47241211, + "step": 1311, + "time_per_iteration": 2.607191801071167 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053107, + "balance_loss_mlp": 1.00592351, + "epoch": 0.2524047710657945, + "flos": 491530323456.0, + "grad_norm": 0.037974131054051216, + "language_loss": 0.87817502, + "learning_rate": 0.0008758086163505156, + "loss": 0.88870609, + "num_input_tokens_seen": 108910064, + "router_z_loss_mlp": 0.47143555, + "step": 1312, + "time_per_iteration": 2.6121339797973633 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052408, + "balance_loss_mlp": 1.00505757, + "epoch": 0.2525971527510581, + "flos": 648613989120.0, + "grad_norm": 0.03226827566126977, + "language_loss": 0.90228277, + "learning_rate": 0.0008756030502527239, + "loss": 0.91280687, + "num_input_tokens_seen": 108986336, + "router_z_loss_mlp": 0.47314453, + "step": 1313, + "time_per_iteration": 2.8256115913391113 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049721, + "balance_loss_mlp": 1.00234711, + "epoch": 0.2527895344363217, + "flos": 570373983744.0, + "grad_norm": 0.0325160066751772, + "language_loss": 0.907884, + "learning_rate": 0.0008753973383328954, + "loss": 0.91838121, + "num_input_tokens_seen": 109059712, + "router_z_loss_mlp": 0.47338867, + "step": 1314, + "time_per_iteration": 2.722231388092041 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051583, + "balance_loss_mlp": 1.00423265, + "epoch": 0.2529819161215852, + "flos": 515069008896.0, + "grad_norm": 0.040482030139478604, + "language_loss": 0.8500945, + "learning_rate": 0.0008751914806708952, + "loss": 0.86061025, + "num_input_tokens_seen": 109127504, + "router_z_loss_mlp": 0.47314453, + "step": 1315, + "time_per_iteration": 2.593076229095459 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051184, + "balance_loss_mlp": 1.00376213, + "epoch": 0.2531742978068488, + "flos": 532351448064.0, + "grad_norm": 0.03414491036051862, + "language_loss": 0.82694548, + "learning_rate": 0.0008749854773466439, + "loss": 0.8374573, + "num_input_tokens_seen": 109198080, + "router_z_loss_mlp": 0.47387695, + "step": 1316, + "time_per_iteration": 2.660116672515869 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054789, + "balance_loss_mlp": 1.00722456, + "epoch": 0.25336667949211233, + "flos": 597748459776.0, + "grad_norm": 0.03206754273868493, + "language_loss": 0.84984171, + "learning_rate": 0.0008747793284401192, + "loss": 0.86038959, + "num_input_tokens_seen": 109268368, + "router_z_loss_mlp": 0.4753418, + "step": 1317, + "time_per_iteration": 2.692183017730713 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105185, + "balance_loss_mlp": 1.00407052, + "epoch": 0.2535590611773759, + "flos": 603256209408.0, + "grad_norm": 0.034288977750124294, + "language_loss": 0.85941386, + "learning_rate": 0.0008745730340313551, + "loss": 0.86993235, + "num_input_tokens_seen": 109344112, + "router_z_loss_mlp": 0.47753906, + "step": 1318, + "time_per_iteration": 2.7932682037353516 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105299, + "balance_loss_mlp": 1.00525868, + "epoch": 0.25375144286263945, + "flos": 496323602688.0, + "grad_norm": 0.035249055653748196, + "language_loss": 0.8522734, + "learning_rate": 0.0008743665942004422, + "loss": 0.86280334, + "num_input_tokens_seen": 109414112, + "router_z_loss_mlp": 0.47705078, + "step": 1319, + "time_per_iteration": 2.6616318225860596 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052413, + "balance_loss_mlp": 1.00465751, + "epoch": 0.25394382454790304, + "flos": 513477729792.0, + "grad_norm": 0.032623992793633046, + "language_loss": 0.93257391, + "learning_rate": 0.0008741600090275277, + "loss": 0.94309807, + "num_input_tokens_seen": 109484336, + "router_z_loss_mlp": 0.47729492, + "step": 1320, + "time_per_iteration": 2.567985773086548 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051086, + "balance_loss_mlp": 1.00333035, + "epoch": 0.25413620623316663, + "flos": 960856616448.0, + "grad_norm": 0.03465281335593922, + "language_loss": 0.8488484, + "learning_rate": 0.0008739532785928151, + "loss": 0.85935926, + "num_input_tokens_seen": 109590128, + "router_z_loss_mlp": 0.47729492, + "step": 1321, + "time_per_iteration": 3.4506430625915527 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054222, + "balance_loss_mlp": 1.00882721, + "epoch": 0.25432858791843016, + "flos": 1580651625984.0, + "grad_norm": 0.01348888133328934, + "language_loss": 0.74893582, + "learning_rate": 0.0008737464029765639, + "loss": 0.75947809, + "num_input_tokens_seen": 109816592, + "router_z_loss_mlp": 0.453125, + "step": 1322, + "time_per_iteration": 4.819811820983887 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055371, + "balance_loss_mlp": 1.00752044, + "epoch": 0.25452096960369375, + "flos": 584894772480.0, + "grad_norm": 0.03690210205672512, + "language_loss": 0.83839363, + "learning_rate": 0.0008735393822590908, + "loss": 0.84894735, + "num_input_tokens_seen": 109890464, + "router_z_loss_mlp": 0.47827148, + "step": 1323, + "time_per_iteration": 2.680769681930542 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069306, + "balance_loss_mlp": 1.02138364, + "epoch": 0.2547133512889573, + "flos": 509641939200.0, + "grad_norm": 0.03795743442729459, + "language_loss": 0.87760162, + "learning_rate": 0.0008733322165207681, + "loss": 0.8882947, + "num_input_tokens_seen": 109963408, + "router_z_loss_mlp": 0.47900391, + "step": 1324, + "time_per_iteration": 2.6391303539276123 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01056249, + "balance_loss_mlp": 1.00856507, + "epoch": 0.25490573297422087, + "flos": 784037008128.0, + "grad_norm": 0.03625483542623235, + "language_loss": 0.83670151, + "learning_rate": 0.0008731249058420247, + "loss": 0.84726399, + "num_input_tokens_seen": 110048800, + "router_z_loss_mlp": 0.4765625, + "step": 1325, + "time_per_iteration": 3.0179827213287354 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062074, + "balance_loss_mlp": 1.01479542, + "epoch": 0.2550981146594844, + "flos": 510953261568.0, + "grad_norm": 0.03728184694741104, + "language_loss": 0.91373062, + "learning_rate": 0.0008729174503033459, + "loss": 0.92435133, + "num_input_tokens_seen": 110118096, + "router_z_loss_mlp": 0.47241211, + "step": 1326, + "time_per_iteration": 2.644351005554199 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059853, + "balance_loss_mlp": 1.01262248, + "epoch": 0.255290496344748, + "flos": 677931632640.0, + "grad_norm": 0.04262364220636159, + "language_loss": 0.83700824, + "learning_rate": 0.0008727098499852728, + "loss": 0.84760678, + "num_input_tokens_seen": 110190160, + "router_z_loss_mlp": 0.47192383, + "step": 1327, + "time_per_iteration": 2.8393821716308594 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059289, + "balance_loss_mlp": 1.01212943, + "epoch": 0.2554828780300115, + "flos": 538985827584.0, + "grad_norm": 0.0346626903619469, + "language_loss": 0.90499496, + "learning_rate": 0.0008725021049684034, + "loss": 0.91558784, + "num_input_tokens_seen": 110268000, + "router_z_loss_mlp": 0.47119141, + "step": 1328, + "time_per_iteration": 2.74480938911438 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052564, + "balance_loss_mlp": 1.00554764, + "epoch": 0.2556752597152751, + "flos": 825624125952.0, + "grad_norm": 0.0321884383853499, + "language_loss": 0.83690739, + "learning_rate": 0.000872294215333391, + "loss": 0.84743297, + "num_input_tokens_seen": 110354816, + "router_z_loss_mlp": 0.46972656, + "step": 1329, + "time_per_iteration": 3.177448034286499 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066156, + "balance_loss_mlp": 1.01880646, + "epoch": 0.2558676414005387, + "flos": 571891385856.0, + "grad_norm": 0.037080167806849716, + "language_loss": 0.84060931, + "learning_rate": 0.0008720861811609457, + "loss": 0.85127091, + "num_input_tokens_seen": 110427968, + "router_z_loss_mlp": 0.47314453, + "step": 1330, + "time_per_iteration": 2.7320711612701416 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054633, + "balance_loss_mlp": 1.00745046, + "epoch": 0.2560600230858022, + "flos": 487748967936.0, + "grad_norm": 0.03498979971426328, + "language_loss": 0.84052318, + "learning_rate": 0.0008718780025318338, + "loss": 0.85106957, + "num_input_tokens_seen": 110501184, + "router_z_loss_mlp": 0.47143555, + "step": 1331, + "time_per_iteration": 2.7297112941741943 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053924, + "balance_loss_mlp": 1.00705111, + "epoch": 0.2562524047710658, + "flos": 514120268544.0, + "grad_norm": 0.03699782349212247, + "language_loss": 0.84697664, + "learning_rate": 0.0008716696795268771, + "loss": 0.85751587, + "num_input_tokens_seen": 110573008, + "router_z_loss_mlp": 0.46826172, + "step": 1332, + "time_per_iteration": 2.6615397930145264 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054318, + "balance_loss_mlp": 1.00756466, + "epoch": 0.25644478645632934, + "flos": 636110244864.0, + "grad_norm": 0.03600089626817585, + "language_loss": 0.85914254, + "learning_rate": 0.0008714612122269538, + "loss": 0.86968577, + "num_input_tokens_seen": 110646704, + "router_z_loss_mlp": 0.46704102, + "step": 1333, + "time_per_iteration": 2.849813938140869 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01056443, + "balance_loss_mlp": 1.00968957, + "epoch": 0.25663716814159293, + "flos": 437545419264.0, + "grad_norm": 0.03932780780666976, + "language_loss": 0.90516675, + "learning_rate": 0.0008712526007129982, + "loss": 0.91573119, + "num_input_tokens_seen": 110712208, + "router_z_loss_mlp": 0.46704102, + "step": 1334, + "time_per_iteration": 2.520730972290039 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053556, + "balance_loss_mlp": 1.00675464, + "epoch": 0.25682954982685646, + "flos": 499243700736.0, + "grad_norm": 0.03395243638019146, + "language_loss": 0.9133085, + "learning_rate": 0.0008710438450660003, + "loss": 0.9238441, + "num_input_tokens_seen": 110783936, + "router_z_loss_mlp": 0.4675293, + "step": 1335, + "time_per_iteration": 2.6936721801757812 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053065, + "balance_loss_mlp": 1.00590599, + "epoch": 0.25702193151212005, + "flos": 458628655872.0, + "grad_norm": 0.038911849114865095, + "language_loss": 0.8791827, + "learning_rate": 0.0008708349453670064, + "loss": 0.88971329, + "num_input_tokens_seen": 110848560, + "router_z_loss_mlp": 0.47119141, + "step": 1336, + "time_per_iteration": 2.520390510559082 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074594, + "balance_loss_mlp": 1.02733934, + "epoch": 0.2572143131973836, + "flos": 599404867584.0, + "grad_norm": 0.03723585257139378, + "language_loss": 0.92015922, + "learning_rate": 0.0008706259016971185, + "loss": 0.93090516, + "num_input_tokens_seen": 110922672, + "router_z_loss_mlp": 0.47216797, + "step": 1337, + "time_per_iteration": 2.792436361312866 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055792, + "balance_loss_mlp": 1.00872791, + "epoch": 0.25740669488264717, + "flos": 699527150592.0, + "grad_norm": 0.04259016947882448, + "language_loss": 0.8355068, + "learning_rate": 0.0008704167141374944, + "loss": 0.84606469, + "num_input_tokens_seen": 110995456, + "router_z_loss_mlp": 0.47021484, + "step": 1338, + "time_per_iteration": 2.806931972503662 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01056758, + "balance_loss_mlp": 1.01014686, + "epoch": 0.25759907656791076, + "flos": 503378889984.0, + "grad_norm": 0.03686560218677495, + "language_loss": 0.88890558, + "learning_rate": 0.0008702073827693482, + "loss": 0.89947319, + "num_input_tokens_seen": 111069568, + "router_z_loss_mlp": 0.46557617, + "step": 1339, + "time_per_iteration": 2.7613115310668945 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01057595, + "balance_loss_mlp": 1.01112759, + "epoch": 0.2577914582531743, + "flos": 775242687744.0, + "grad_norm": 0.03484469931885578, + "language_loss": 0.89865053, + "learning_rate": 0.0008699979076739494, + "loss": 0.90922654, + "num_input_tokens_seen": 111142608, + "router_z_loss_mlp": 0.46411133, + "step": 1340, + "time_per_iteration": 2.9694418907165527 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052163, + "balance_loss_mlp": 1.00552797, + "epoch": 0.2579838399384379, + "flos": 460610707200.0, + "grad_norm": 0.04216529081594553, + "language_loss": 0.89380765, + "learning_rate": 0.0008697882889326234, + "loss": 0.9043293, + "num_input_tokens_seen": 111206336, + "router_z_loss_mlp": 0.46582031, + "step": 1341, + "time_per_iteration": 2.5050456523895264 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051599, + "balance_loss_mlp": 1.00482166, + "epoch": 0.2581762216237014, + "flos": 570263168256.0, + "grad_norm": 0.03742337984590145, + "language_loss": 0.87203884, + "learning_rate": 0.0008695785266267515, + "loss": 0.88255489, + "num_input_tokens_seen": 111276736, + "router_z_loss_mlp": 0.46728516, + "step": 1342, + "time_per_iteration": 2.677072763442993 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01057516, + "balance_loss_mlp": 1.01069069, + "epoch": 0.258368603308965, + "flos": 605387960064.0, + "grad_norm": 0.035138016776099276, + "language_loss": 0.83827055, + "learning_rate": 0.0008693686208377704, + "loss": 0.84884572, + "num_input_tokens_seen": 111353856, + "router_z_loss_mlp": 0.46777344, + "step": 1343, + "time_per_iteration": 2.826026439666748 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054134, + "balance_loss_mlp": 1.0075947, + "epoch": 0.2585609849942285, + "flos": 492487812096.0, + "grad_norm": 0.03194520317053949, + "language_loss": 0.89379156, + "learning_rate": 0.0008691585716471733, + "loss": 0.90433288, + "num_input_tokens_seen": 111424960, + "router_z_loss_mlp": 0.46484375, + "step": 1344, + "time_per_iteration": 2.6379647254943848 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053033, + "balance_loss_mlp": 1.00646937, + "epoch": 0.2587533666794921, + "flos": 641958222336.0, + "grad_norm": 0.03185107281306307, + "language_loss": 0.86602217, + "learning_rate": 0.0008689483791365079, + "loss": 0.87655246, + "num_input_tokens_seen": 111505248, + "router_z_loss_mlp": 0.46508789, + "step": 1345, + "time_per_iteration": 2.8372344970703125 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105515, + "balance_loss_mlp": 1.00868249, + "epoch": 0.2589457483647557, + "flos": 577995987456.0, + "grad_norm": 0.038033594557881883, + "language_loss": 0.90178049, + "learning_rate": 0.0008687380433873786, + "loss": 0.91233194, + "num_input_tokens_seen": 111581936, + "router_z_loss_mlp": 0.46411133, + "step": 1346, + "time_per_iteration": 2.7660248279571533 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105286, + "balance_loss_mlp": 1.00636888, + "epoch": 0.25913813005001923, + "flos": 536467195392.0, + "grad_norm": 0.03823400300780179, + "language_loss": 0.83192778, + "learning_rate": 0.0008685275644814448, + "loss": 0.8424564, + "num_input_tokens_seen": 111651456, + "router_z_loss_mlp": 0.46435547, + "step": 1347, + "time_per_iteration": 2.6657776832580566 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01058039, + "balance_loss_mlp": 1.01118934, + "epoch": 0.2593305117352828, + "flos": 722347474944.0, + "grad_norm": 0.04308500968206218, + "language_loss": 0.85215819, + "learning_rate": 0.0008683169425004216, + "loss": 0.86273861, + "num_input_tokens_seen": 111731712, + "router_z_loss_mlp": 0.46801758, + "step": 1348, + "time_per_iteration": 2.8938682079315186 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067463, + "balance_loss_mlp": 1.02058995, + "epoch": 0.25952289342054635, + "flos": 711356275200.0, + "grad_norm": 0.04420512127692048, + "language_loss": 0.84604859, + "learning_rate": 0.0008681061775260799, + "loss": 0.85672331, + "num_input_tokens_seen": 111800752, + "router_z_loss_mlp": 0.46826172, + "step": 1349, + "time_per_iteration": 2.8803627490997314 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105274, + "balance_loss_mlp": 1.00634348, + "epoch": 0.25971527510580994, + "flos": 456850738944.0, + "grad_norm": 0.03368144531989068, + "language_loss": 0.92376006, + "learning_rate": 0.0008678952696402458, + "loss": 0.93428755, + "num_input_tokens_seen": 111866752, + "router_z_loss_mlp": 0.46337891, + "step": 1350, + "time_per_iteration": 2.5544798374176025 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054358, + "balance_loss_mlp": 1.00824761, + "epoch": 0.25990765679107347, + "flos": 613754569728.0, + "grad_norm": 0.03011764192417466, + "language_loss": 0.87159944, + "learning_rate": 0.000867684218924801, + "loss": 0.88214302, + "num_input_tokens_seen": 111951328, + "router_z_loss_mlp": 0.46044922, + "step": 1351, + "time_per_iteration": 2.856372833251953 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069405, + "balance_loss_mlp": 1.02496338, + "epoch": 0.26010003847633706, + "flos": 1541407196160.0, + "grad_norm": 0.012951365709411706, + "language_loss": 0.78947091, + "learning_rate": 0.0008674730254616827, + "loss": 0.80016494, + "num_input_tokens_seen": 112182272, + "router_z_loss_mlp": 0.4453125, + "step": 1352, + "time_per_iteration": 4.943616628646851 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01058433, + "balance_loss_mlp": 1.01194191, + "epoch": 0.2602924201616006, + "flos": 717545447424.0, + "grad_norm": 0.029832851456929797, + "language_loss": 0.85926312, + "learning_rate": 0.0008672616893328834, + "loss": 0.86984742, + "num_input_tokens_seen": 112261760, + "router_z_loss_mlp": 0.46435547, + "step": 1353, + "time_per_iteration": 2.913235664367676 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01056755, + "balance_loss_mlp": 1.01012051, + "epoch": 0.2604848018468642, + "flos": 644686824960.0, + "grad_norm": 0.03749633937906014, + "language_loss": 0.91143578, + "learning_rate": 0.0008670502106204512, + "loss": 0.92200339, + "num_input_tokens_seen": 112339136, + "router_z_loss_mlp": 0.46582031, + "step": 1354, + "time_per_iteration": 2.821753978729248 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091569, + "balance_loss_mlp": 1.0442189, + "epoch": 0.26067718353212777, + "flos": 518038684416.0, + "grad_norm": 0.04686611644365056, + "language_loss": 0.82400739, + "learning_rate": 0.0008668385894064892, + "loss": 0.83492303, + "num_input_tokens_seen": 112409872, + "router_z_loss_mlp": 0.47314453, + "step": 1355, + "time_per_iteration": 2.642392158508301 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01056006, + "balance_loss_mlp": 1.00925195, + "epoch": 0.2608695652173913, + "flos": 824226287616.0, + "grad_norm": 0.03313451231790272, + "language_loss": 0.89331532, + "learning_rate": 0.0008666268257731562, + "loss": 0.90387547, + "num_input_tokens_seen": 112495616, + "router_z_loss_mlp": 0.46704102, + "step": 1356, + "time_per_iteration": 3.1127805709838867 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060563, + "balance_loss_mlp": 1.01414335, + "epoch": 0.2610619469026549, + "flos": 1009450422528.0, + "grad_norm": 0.04035878870854939, + "language_loss": 0.86687934, + "learning_rate": 0.0008664149198026662, + "loss": 0.87748504, + "num_input_tokens_seen": 112575168, + "router_z_loss_mlp": 0.46362305, + "step": 1357, + "time_per_iteration": 3.2328455448150635 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106616, + "balance_loss_mlp": 1.01971614, + "epoch": 0.2612543285879184, + "flos": 537826149888.0, + "grad_norm": 0.03943672852684058, + "language_loss": 0.8952527, + "learning_rate": 0.0008662028715772883, + "loss": 0.90591431, + "num_input_tokens_seen": 112648480, + "router_z_loss_mlp": 0.46386719, + "step": 1358, + "time_per_iteration": 2.621894359588623 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01058771, + "balance_loss_mlp": 1.01213586, + "epoch": 0.261446710273182, + "flos": 520439698176.0, + "grad_norm": 0.03590038892764462, + "language_loss": 0.86476588, + "learning_rate": 0.0008659906811793467, + "loss": 0.87535357, + "num_input_tokens_seen": 112719856, + "router_z_loss_mlp": 0.46582031, + "step": 1359, + "time_per_iteration": 2.6540629863739014 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054238, + "balance_loss_mlp": 1.00741243, + "epoch": 0.26163909195844554, + "flos": 584399987712.0, + "grad_norm": 0.03384500135634075, + "language_loss": 0.90458202, + "learning_rate": 0.0008657783486912215, + "loss": 0.91512442, + "num_input_tokens_seen": 112795088, + "router_z_loss_mlp": 0.46777344, + "step": 1360, + "time_per_iteration": 2.71598744392395 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063542, + "balance_loss_mlp": 1.01626348, + "epoch": 0.2618314736437091, + "flos": 960369613056.0, + "grad_norm": 0.03695926115068694, + "language_loss": 0.90376949, + "learning_rate": 0.0008655658741953472, + "loss": 0.91440493, + "num_input_tokens_seen": 112879888, + "router_z_loss_mlp": 0.47241211, + "step": 1361, + "time_per_iteration": 3.233081102371216 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061537, + "balance_loss_mlp": 1.01413929, + "epoch": 0.26202385532897265, + "flos": 575903120640.0, + "grad_norm": 0.032102410789184695, + "language_loss": 0.892542, + "learning_rate": 0.0008653532577742136, + "loss": 0.90315735, + "num_input_tokens_seen": 112952208, + "router_z_loss_mlp": 0.47363281, + "step": 1362, + "time_per_iteration": 2.671513319015503 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053634, + "balance_loss_mlp": 1.00673676, + "epoch": 0.26221623701423624, + "flos": 446398065408.0, + "grad_norm": 0.034188430773875136, + "language_loss": 0.88125902, + "learning_rate": 0.0008651404995103659, + "loss": 0.8917954, + "num_input_tokens_seen": 113017472, + "router_z_loss_mlp": 0.46850586, + "step": 1363, + "time_per_iteration": 2.5599000453948975 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064691, + "balance_loss_mlp": 1.01803255, + "epoch": 0.26240861869949983, + "flos": 536755900416.0, + "grad_norm": 0.03309695956224158, + "language_loss": 0.87925225, + "learning_rate": 0.0008649275994864041, + "loss": 0.88989913, + "num_input_tokens_seen": 113090000, + "router_z_loss_mlp": 0.46606445, + "step": 1364, + "time_per_iteration": 2.68673038482666 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061602, + "balance_loss_mlp": 1.01472914, + "epoch": 0.26260100038476336, + "flos": 566488615680.0, + "grad_norm": 0.0327166713474878, + "language_loss": 0.84653741, + "learning_rate": 0.0008647145577849834, + "loss": 0.85715348, + "num_input_tokens_seen": 113169424, + "router_z_loss_mlp": 0.46826172, + "step": 1365, + "time_per_iteration": 2.8294174671173096 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061641, + "balance_loss_mlp": 1.01471996, + "epoch": 0.26279338207002695, + "flos": 614321286144.0, + "grad_norm": 0.027467777319160957, + "language_loss": 0.83391041, + "learning_rate": 0.0008645013744888139, + "loss": 0.84452683, + "num_input_tokens_seen": 113256752, + "router_z_loss_mlp": 0.46875, + "step": 1366, + "time_per_iteration": 2.845019578933716 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059667, + "balance_loss_mlp": 1.01238823, + "epoch": 0.2629857637552905, + "flos": 523945954560.0, + "grad_norm": 0.034051307399065846, + "language_loss": 0.88423878, + "learning_rate": 0.0008642880496806607, + "loss": 0.89483547, + "num_input_tokens_seen": 113330512, + "router_z_loss_mlp": 0.47241211, + "step": 1367, + "time_per_iteration": 2.7665200233459473 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065935, + "balance_loss_mlp": 1.01832283, + "epoch": 0.26317814544055407, + "flos": 535655515392.0, + "grad_norm": 0.03476637042829631, + "language_loss": 0.85672963, + "learning_rate": 0.0008640745834433437, + "loss": 0.86738896, + "num_input_tokens_seen": 113409088, + "router_z_loss_mlp": 0.47583008, + "step": 1368, + "time_per_iteration": 2.7824857234954834 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105712, + "balance_loss_mlp": 1.00967455, + "epoch": 0.2633705271258176, + "flos": 556780548096.0, + "grad_norm": 0.035052832704740904, + "language_loss": 0.8778615, + "learning_rate": 0.000863860975859738, + "loss": 0.88843262, + "num_input_tokens_seen": 113486624, + "router_z_loss_mlp": 0.47412109, + "step": 1369, + "time_per_iteration": 2.938157796859741 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059209, + "balance_loss_mlp": 1.01214516, + "epoch": 0.2635629088110812, + "flos": 553462874880.0, + "grad_norm": 0.04030614296387141, + "language_loss": 0.89190161, + "learning_rate": 0.0008636472270127733, + "loss": 0.90249372, + "num_input_tokens_seen": 113555776, + "router_z_loss_mlp": 0.47021484, + "step": 1370, + "time_per_iteration": 2.6449878215789795 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105746, + "balance_loss_mlp": 1.0106585, + "epoch": 0.2637552904963448, + "flos": 456915867648.0, + "grad_norm": 0.03827203709322554, + "language_loss": 0.91134202, + "learning_rate": 0.0008634333369854345, + "loss": 0.9219166, + "num_input_tokens_seen": 113624208, + "router_z_loss_mlp": 0.4675293, + "step": 1371, + "time_per_iteration": 2.6090121269226074 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053294, + "balance_loss_mlp": 1.00642049, + "epoch": 0.2639476721816083, + "flos": 614260048128.0, + "grad_norm": 0.03299961926418253, + "language_loss": 0.88250023, + "learning_rate": 0.0008632193058607608, + "loss": 0.89303321, + "num_input_tokens_seen": 113698544, + "router_z_loss_mlp": 0.46826172, + "step": 1372, + "time_per_iteration": 2.6980674266815186 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052684, + "balance_loss_mlp": 1.00562024, + "epoch": 0.2641400538668719, + "flos": 573026764032.0, + "grad_norm": 0.03659842444989107, + "language_loss": 0.81553382, + "learning_rate": 0.0008630051337218466, + "loss": 0.82606065, + "num_input_tokens_seen": 113769024, + "router_z_loss_mlp": 0.47021484, + "step": 1373, + "time_per_iteration": 2.6634395122528076 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01056457, + "balance_loss_mlp": 1.00960791, + "epoch": 0.2643324355521354, + "flos": 583340431872.0, + "grad_norm": 0.03511173854729822, + "language_loss": 0.82885635, + "learning_rate": 0.0008627908206518409, + "loss": 0.83942091, + "num_input_tokens_seen": 113836320, + "router_z_loss_mlp": 0.46801758, + "step": 1374, + "time_per_iteration": 2.6550941467285156 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055428, + "balance_loss_mlp": 1.01022339, + "epoch": 0.264524817237399, + "flos": 1548027969792.0, + "grad_norm": 0.005864236448565476, + "language_loss": 0.75151253, + "learning_rate": 0.0008625763667339472, + "loss": 0.76206684, + "num_input_tokens_seen": 114065040, + "router_z_loss_mlp": 0.45117188, + "step": 1375, + "time_per_iteration": 4.995543718338013 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01058447, + "balance_loss_mlp": 1.01197898, + "epoch": 0.26471719892266254, + "flos": 519043805184.0, + "grad_norm": 0.03321674595186757, + "language_loss": 0.92123759, + "learning_rate": 0.0008623617720514241, + "loss": 0.93182206, + "num_input_tokens_seen": 114133488, + "router_z_loss_mlp": 0.46411133, + "step": 1376, + "time_per_iteration": 2.592569351196289 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061616, + "balance_loss_mlp": 1.0151242, + "epoch": 0.26490958060792613, + "flos": 518205880320.0, + "grad_norm": 0.036665073764434085, + "language_loss": 0.85824203, + "learning_rate": 0.0008621470366875848, + "loss": 0.8688581, + "num_input_tokens_seen": 114200704, + "router_z_loss_mlp": 0.46435547, + "step": 1377, + "time_per_iteration": 2.5636963844299316 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054184, + "balance_loss_mlp": 1.00766897, + "epoch": 0.26510196229318966, + "flos": 597683331072.0, + "grad_norm": 0.03396624681403314, + "language_loss": 0.88501984, + "learning_rate": 0.0008619321607257966, + "loss": 0.8955617, + "num_input_tokens_seen": 114272160, + "router_z_loss_mlp": 0.46459961, + "step": 1378, + "time_per_iteration": 2.687581777572632 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01056779, + "balance_loss_mlp": 1.010144, + "epoch": 0.26529434397845325, + "flos": 687053541888.0, + "grad_norm": 0.031207845572821406, + "language_loss": 0.82550275, + "learning_rate": 0.000861717144249482, + "loss": 0.83607054, + "num_input_tokens_seen": 114347904, + "router_z_loss_mlp": 0.46582031, + "step": 1379, + "time_per_iteration": 2.8333678245544434 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054921, + "balance_loss_mlp": 1.00819123, + "epoch": 0.26548672566371684, + "flos": 425260393728.0, + "grad_norm": 0.03047521662480035, + "language_loss": 0.90854567, + "learning_rate": 0.0008615019873421175, + "loss": 0.91909492, + "num_input_tokens_seen": 114409952, + "router_z_loss_mlp": 0.46679688, + "step": 1380, + "time_per_iteration": 2.47892689704895 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051055, + "balance_loss_mlp": 1.00437295, + "epoch": 0.26567910734898037, + "flos": 490850846208.0, + "grad_norm": 0.03515354974137605, + "language_loss": 0.8636173, + "learning_rate": 0.0008612866900872349, + "loss": 0.87412781, + "num_input_tokens_seen": 114474832, + "router_z_loss_mlp": 0.46630859, + "step": 1381, + "time_per_iteration": 2.558497428894043 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055093, + "balance_loss_mlp": 1.00833893, + "epoch": 0.26587148903424396, + "flos": 535229750016.0, + "grad_norm": 0.033124361732310995, + "language_loss": 0.88441265, + "learning_rate": 0.0008610712525684197, + "loss": 0.89496362, + "num_input_tokens_seen": 114545152, + "router_z_loss_mlp": 0.46704102, + "step": 1382, + "time_per_iteration": 2.6567015647888184 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01056513, + "balance_loss_mlp": 1.00997365, + "epoch": 0.2660638707195075, + "flos": 1019056422912.0, + "grad_norm": 0.038309225150243896, + "language_loss": 0.84641987, + "learning_rate": 0.0008608556748693121, + "loss": 0.85698497, + "num_input_tokens_seen": 114626512, + "router_z_loss_mlp": 0.46484375, + "step": 1383, + "time_per_iteration": 3.266127347946167 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054133, + "balance_loss_mlp": 1.00754607, + "epoch": 0.2662562524047711, + "flos": 525063836160.0, + "grad_norm": 0.03266135396779854, + "language_loss": 0.86478686, + "learning_rate": 0.000860639957073607, + "loss": 0.87532818, + "num_input_tokens_seen": 114701008, + "router_z_loss_mlp": 0.46533203, + "step": 1384, + "time_per_iteration": 2.701979398727417 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052082, + "balance_loss_mlp": 1.00542331, + "epoch": 0.2664486340900346, + "flos": 553480371456.0, + "grad_norm": 0.03507018041250785, + "language_loss": 0.88455647, + "learning_rate": 0.0008604240992650534, + "loss": 0.89507735, + "num_input_tokens_seen": 114771984, + "router_z_loss_mlp": 0.46606445, + "step": 1385, + "time_per_iteration": 2.6528589725494385 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051347, + "balance_loss_mlp": 1.00476038, + "epoch": 0.2666410157752982, + "flos": 471209189376.0, + "grad_norm": 0.03349459525563368, + "language_loss": 0.89804894, + "learning_rate": 0.0008602081015274545, + "loss": 0.90856242, + "num_input_tokens_seen": 114844800, + "router_z_loss_mlp": 0.46533203, + "step": 1386, + "time_per_iteration": 2.7359464168548584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053022, + "balance_loss_mlp": 1.00626826, + "epoch": 0.2668333974605617, + "flos": 571016522496.0, + "grad_norm": 0.027882929979452454, + "language_loss": 0.8367793, + "learning_rate": 0.0008599919639446684, + "loss": 0.84730947, + "num_input_tokens_seen": 114918544, + "router_z_loss_mlp": 0.46704102, + "step": 1387, + "time_per_iteration": 2.72188401222229 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052498, + "balance_loss_mlp": 1.00572038, + "epoch": 0.2670257791458253, + "flos": 399896159232.0, + "grad_norm": 0.038277743086958374, + "language_loss": 0.80995691, + "learning_rate": 0.000859775686600607, + "loss": 0.82048184, + "num_input_tokens_seen": 114984272, + "router_z_loss_mlp": 0.46728516, + "step": 1388, + "time_per_iteration": 2.5220229625701904 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051137, + "balance_loss_mlp": 1.00443089, + "epoch": 0.2672181608310889, + "flos": 516892612608.0, + "grad_norm": 0.03738976993969629, + "language_loss": 0.85769641, + "learning_rate": 0.0008595592695792367, + "loss": 0.86820781, + "num_input_tokens_seen": 115054800, + "router_z_loss_mlp": 0.46655273, + "step": 1389, + "time_per_iteration": 2.7041423320770264 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050805, + "balance_loss_mlp": 1.0042417, + "epoch": 0.26741054251635243, + "flos": 508526002944.0, + "grad_norm": 0.03398026188762752, + "language_loss": 0.91414082, + "learning_rate": 0.0008593427129645778, + "loss": 0.92464888, + "num_input_tokens_seen": 115120928, + "router_z_loss_mlp": 0.46508789, + "step": 1390, + "time_per_iteration": 2.563215732574463 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105357, + "balance_loss_mlp": 1.0067687, + "epoch": 0.267602924201616, + "flos": 577809349632.0, + "grad_norm": 0.03481446530036303, + "language_loss": 0.86254311, + "learning_rate": 0.0008591260168407052, + "loss": 0.87307882, + "num_input_tokens_seen": 115196688, + "router_z_loss_mlp": 0.4675293, + "step": 1391, + "time_per_iteration": 2.788869619369507 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051436, + "balance_loss_mlp": 1.00475395, + "epoch": 0.26779530588687955, + "flos": 525000652800.0, + "grad_norm": 0.029176301882166727, + "language_loss": 0.83413607, + "learning_rate": 0.0008589091812917479, + "loss": 0.84465045, + "num_input_tokens_seen": 115264912, + "router_z_loss_mlp": 0.46630859, + "step": 1392, + "time_per_iteration": 2.6471304893493652 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01057177, + "balance_loss_mlp": 1.0103997, + "epoch": 0.26798768757214314, + "flos": 557828443392.0, + "grad_norm": 0.034011915135398356, + "language_loss": 0.85611916, + "learning_rate": 0.0008586922064018887, + "loss": 0.86669087, + "num_input_tokens_seen": 115334672, + "router_z_loss_mlp": 0.46728516, + "step": 1393, + "time_per_iteration": 2.665710926055908 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051541, + "balance_loss_mlp": 1.00488269, + "epoch": 0.2681800692574067, + "flos": 932095974144.0, + "grad_norm": 0.035119979561623306, + "language_loss": 0.89861763, + "learning_rate": 0.0008584750922553651, + "loss": 0.90913308, + "num_input_tokens_seen": 115420032, + "router_z_loss_mlp": 0.46606445, + "step": 1394, + "time_per_iteration": 3.1556007862091064 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054605, + "balance_loss_mlp": 1.00813687, + "epoch": 0.26837245094267026, + "flos": 702318936576.0, + "grad_norm": 0.034220503648090136, + "language_loss": 0.84388494, + "learning_rate": 0.0008582578389364677, + "loss": 0.85443103, + "num_input_tokens_seen": 115492576, + "router_z_loss_mlp": 0.46411133, + "step": 1395, + "time_per_iteration": 2.8831770420074463 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054667, + "balance_loss_mlp": 1.00824666, + "epoch": 0.26856483262793385, + "flos": 594394814976.0, + "grad_norm": 0.030437239966241224, + "language_loss": 0.92446673, + "learning_rate": 0.0008580404465295422, + "loss": 0.93501341, + "num_input_tokens_seen": 115568368, + "router_z_loss_mlp": 0.46362305, + "step": 1396, + "time_per_iteration": 2.823685884475708 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052372, + "balance_loss_mlp": 1.00578523, + "epoch": 0.2687572143131974, + "flos": 715589640960.0, + "grad_norm": 0.035135728363153845, + "language_loss": 0.88714433, + "learning_rate": 0.0008578229151189876, + "loss": 0.89766812, + "num_input_tokens_seen": 115651536, + "router_z_loss_mlp": 0.46533203, + "step": 1397, + "time_per_iteration": 2.9427757263183594 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105508, + "balance_loss_mlp": 1.00858808, + "epoch": 0.26894959599846097, + "flos": 468671115264.0, + "grad_norm": 0.03944499035247069, + "language_loss": 0.82205743, + "learning_rate": 0.0008576052447892573, + "loss": 0.83260822, + "num_input_tokens_seen": 115715696, + "router_z_loss_mlp": 0.46435547, + "step": 1398, + "time_per_iteration": 2.570364475250244 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053764, + "balance_loss_mlp": 1.00712895, + "epoch": 0.2691419776837245, + "flos": 469630549248.0, + "grad_norm": 0.035560759826370754, + "language_loss": 0.87260717, + "learning_rate": 0.000857387435624858, + "loss": 0.88314486, + "num_input_tokens_seen": 115780928, + "router_z_loss_mlp": 0.46582031, + "step": 1399, + "time_per_iteration": 2.5241427421569824 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053425, + "balance_loss_mlp": 1.00698149, + "epoch": 0.2693343593689881, + "flos": 939286376448.0, + "grad_norm": 0.026228750880396605, + "language_loss": 0.88826966, + "learning_rate": 0.0008571694877103513, + "loss": 0.89880389, + "num_input_tokens_seen": 115874432, + "router_z_loss_mlp": 0.46386719, + "step": 1400, + "time_per_iteration": 3.2871432304382324 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049973, + "balance_loss_mlp": 1.00355244, + "epoch": 0.2695267410542516, + "flos": 578795028480.0, + "grad_norm": 0.031687518811048296, + "language_loss": 0.88370931, + "learning_rate": 0.0008569514011303515, + "loss": 0.89420903, + "num_input_tokens_seen": 115956608, + "router_z_loss_mlp": 0.46362305, + "step": 1401, + "time_per_iteration": 2.8385562896728516 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054148, + "balance_loss_mlp": 1.00763249, + "epoch": 0.2697191227395152, + "flos": 557965503744.0, + "grad_norm": 0.03646210542720766, + "language_loss": 0.89149171, + "learning_rate": 0.0008567331759695277, + "loss": 0.90203321, + "num_input_tokens_seen": 116031728, + "router_z_loss_mlp": 0.46459961, + "step": 1402, + "time_per_iteration": 2.73796010017395 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053059, + "balance_loss_mlp": 1.00663948, + "epoch": 0.26991150442477874, + "flos": 530314961664.0, + "grad_norm": 0.03368837159460442, + "language_loss": 0.86897242, + "learning_rate": 0.0008565148123126023, + "loss": 0.87950301, + "num_input_tokens_seen": 116104288, + "router_z_loss_mlp": 0.46362305, + "step": 1403, + "time_per_iteration": 2.654782772064209 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055763, + "balance_loss_mlp": 1.00970042, + "epoch": 0.2701038861100423, + "flos": 533087305728.0, + "grad_norm": 0.02742415368344255, + "language_loss": 0.86797845, + "learning_rate": 0.0008562963102443516, + "loss": 0.87853605, + "num_input_tokens_seen": 116177920, + "router_z_loss_mlp": 0.45996094, + "step": 1404, + "time_per_iteration": 2.6844303607940674 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01057243, + "balance_loss_mlp": 1.01122797, + "epoch": 0.2702962677953059, + "flos": 736505681664.0, + "grad_norm": 0.03794782730472634, + "language_loss": 0.85607296, + "learning_rate": 0.0008560776698496056, + "loss": 0.86664534, + "num_input_tokens_seen": 116251680, + "router_z_loss_mlp": 0.45947266, + "step": 1405, + "time_per_iteration": 2.9016945362091064 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054152, + "balance_loss_mlp": 1.00806534, + "epoch": 0.27048864948056944, + "flos": 576001297152.0, + "grad_norm": 0.03333453941991407, + "language_loss": 0.8661586, + "learning_rate": 0.0008558588912132481, + "loss": 0.8767001, + "num_input_tokens_seen": 116327664, + "router_z_loss_mlp": 0.46020508, + "step": 1406, + "time_per_iteration": 2.8187410831451416 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074676, + "balance_loss_mlp": 1.03042603, + "epoch": 0.27068103116583303, + "flos": 1426912856832.0, + "grad_norm": 0.025019447230712623, + "language_loss": 0.76458991, + "learning_rate": 0.0008556399744202163, + "loss": 0.77533662, + "num_input_tokens_seen": 116555152, + "router_z_loss_mlp": 0.44335938, + "step": 1407, + "time_per_iteration": 4.91855001449585 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059735, + "balance_loss_mlp": 1.01386356, + "epoch": 0.27087341285109656, + "flos": 533032870656.0, + "grad_norm": 0.03180107690871134, + "language_loss": 0.83613265, + "learning_rate": 0.0008554209195555016, + "loss": 0.84672999, + "num_input_tokens_seen": 116626016, + "router_z_loss_mlp": 0.45800781, + "step": 1408, + "time_per_iteration": 2.7004964351654053 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106761, + "balance_loss_mlp": 1.02188134, + "epoch": 0.27106579453636015, + "flos": 582465568512.0, + "grad_norm": 0.03644580883658202, + "language_loss": 0.89378774, + "learning_rate": 0.0008552017267041483, + "loss": 0.90446383, + "num_input_tokens_seen": 116699152, + "router_z_loss_mlp": 0.45654297, + "step": 1409, + "time_per_iteration": 2.7288694381713867 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067551, + "balance_loss_mlp": 1.0219177, + "epoch": 0.2712581762216237, + "flos": 507881518848.0, + "grad_norm": 0.03188220116364099, + "language_loss": 0.84328783, + "learning_rate": 0.0008549823959512549, + "loss": 0.85396332, + "num_input_tokens_seen": 116770912, + "router_z_loss_mlp": 0.45556641, + "step": 1410, + "time_per_iteration": 2.67370343208313 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060246, + "balance_loss_mlp": 1.01435077, + "epoch": 0.27145055790688727, + "flos": 999143557632.0, + "grad_norm": 0.03419744556224296, + "language_loss": 0.87478781, + "learning_rate": 0.0008547629273819728, + "loss": 0.88539028, + "num_input_tokens_seen": 116863088, + "router_z_loss_mlp": 0.45825195, + "step": 1411, + "time_per_iteration": 3.3728370666503906 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01057063, + "balance_loss_mlp": 1.0104996, + "epoch": 0.2716429395921508, + "flos": 547729603584.0, + "grad_norm": 0.037303619224495106, + "language_loss": 0.84070724, + "learning_rate": 0.0008545433210815074, + "loss": 0.85127789, + "num_input_tokens_seen": 116929504, + "router_z_loss_mlp": 0.46508789, + "step": 1412, + "time_per_iteration": 2.6812539100646973 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062187, + "balance_loss_mlp": 1.01536179, + "epoch": 0.2718353212774144, + "flos": 574311841536.0, + "grad_norm": 0.033089137280770606, + "language_loss": 0.8805269, + "learning_rate": 0.0008543235771351176, + "loss": 0.89114881, + "num_input_tokens_seen": 117004064, + "router_z_loss_mlp": 0.46777344, + "step": 1413, + "time_per_iteration": 2.713487148284912 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01056081, + "balance_loss_mlp": 1.00961292, + "epoch": 0.272027702962678, + "flos": 645585987840.0, + "grad_norm": 0.026077025600286987, + "language_loss": 0.85152733, + "learning_rate": 0.0008541036956281154, + "loss": 0.86208814, + "num_input_tokens_seen": 117081328, + "router_z_loss_mlp": 0.46411133, + "step": 1414, + "time_per_iteration": 2.9018056392669678 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062827, + "balance_loss_mlp": 1.01631117, + "epoch": 0.2722200846479415, + "flos": 654996602112.0, + "grad_norm": 0.04047455719590206, + "language_loss": 0.83293629, + "learning_rate": 0.0008538836766458665, + "loss": 0.84356457, + "num_input_tokens_seen": 117156544, + "router_z_loss_mlp": 0.46459961, + "step": 1415, + "time_per_iteration": 2.84184193611145 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106005, + "balance_loss_mlp": 1.01365411, + "epoch": 0.2724124663332051, + "flos": 580779025152.0, + "grad_norm": 0.0390255284508479, + "language_loss": 0.85920322, + "learning_rate": 0.0008536635202737897, + "loss": 0.86980367, + "num_input_tokens_seen": 117230208, + "router_z_loss_mlp": 0.46337891, + "step": 1416, + "time_per_iteration": 2.814687728881836 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059317, + "balance_loss_mlp": 1.01272988, + "epoch": 0.2726048480184686, + "flos": 538468688640.0, + "grad_norm": 0.03678906161491062, + "language_loss": 0.82951486, + "learning_rate": 0.0008534432265973573, + "loss": 0.8401081, + "num_input_tokens_seen": 117298080, + "router_z_loss_mlp": 0.46533203, + "step": 1417, + "time_per_iteration": 2.641660451889038 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01056276, + "balance_loss_mlp": 1.00930703, + "epoch": 0.2727972297037322, + "flos": 997550333184.0, + "grad_norm": 0.4222293446211692, + "language_loss": 0.88806397, + "learning_rate": 0.000853222795702095, + "loss": 0.89862669, + "num_input_tokens_seen": 117396256, + "router_z_loss_mlp": 0.46923828, + "step": 1418, + "time_per_iteration": 3.3743135929107666 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01181433, + "balance_loss_mlp": 1.1334635, + "epoch": 0.27298961138899575, + "flos": 607335018240.0, + "grad_norm": 0.06715989722341878, + "language_loss": 0.84640503, + "learning_rate": 0.0008530022276735813, + "loss": 0.85821939, + "num_input_tokens_seen": 117467936, + "router_z_loss_mlp": 0.47949219, + "step": 1419, + "time_per_iteration": 2.752645254135132 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069458, + "balance_loss_mlp": 1.02225161, + "epoch": 0.27318199307425933, + "flos": 530397586944.0, + "grad_norm": 0.040820608700474346, + "language_loss": 0.87344372, + "learning_rate": 0.0008527815225974489, + "loss": 0.88413835, + "num_input_tokens_seen": 117538256, + "router_z_loss_mlp": 0.47167969, + "step": 1420, + "time_per_iteration": 2.65108585357666 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085616, + "balance_loss_mlp": 1.03852844, + "epoch": 0.2733743747595229, + "flos": 409912373760.0, + "grad_norm": 0.06690132065136703, + "language_loss": 0.92052042, + "learning_rate": 0.0008525606805593829, + "loss": 0.93137658, + "num_input_tokens_seen": 117599488, + "router_z_loss_mlp": 0.47045898, + "step": 1421, + "time_per_iteration": 2.4201173782348633 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081246, + "balance_loss_mlp": 1.03422987, + "epoch": 0.27356675644478645, + "flos": 517228949760.0, + "grad_norm": 0.05290317096475839, + "language_loss": 0.85793996, + "learning_rate": 0.0008523397016451213, + "loss": 0.86875236, + "num_input_tokens_seen": 117664240, + "router_z_loss_mlp": 0.46972656, + "step": 1422, + "time_per_iteration": 2.632446765899658 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080251, + "balance_loss_mlp": 1.03328276, + "epoch": 0.27375913813005004, + "flos": 1054059705600.0, + "grad_norm": 0.039766191828199446, + "language_loss": 0.90321743, + "learning_rate": 0.0008521185859404564, + "loss": 0.91401994, + "num_input_tokens_seen": 117754768, + "router_z_loss_mlp": 0.46923828, + "step": 1423, + "time_per_iteration": 3.381535291671753 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107676, + "balance_loss_mlp": 1.02998257, + "epoch": 0.27395151981531357, + "flos": 626004602112.0, + "grad_norm": 0.042654551092476074, + "language_loss": 0.92207062, + "learning_rate": 0.0008518973335312326, + "loss": 0.9328382, + "num_input_tokens_seen": 117832816, + "router_z_loss_mlp": 0.46728516, + "step": 1424, + "time_per_iteration": 2.787799596786499 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070757, + "balance_loss_mlp": 1.0240984, + "epoch": 0.27414390150057716, + "flos": 551415694848.0, + "grad_norm": 0.04883209929837253, + "language_loss": 0.85839558, + "learning_rate": 0.0008516759445033477, + "loss": 0.86910313, + "num_input_tokens_seen": 117899168, + "router_z_loss_mlp": 0.46606445, + "step": 1425, + "time_per_iteration": 2.6206350326538086 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065621, + "balance_loss_mlp": 1.01881957, + "epoch": 0.2743362831858407, + "flos": 540952327680.0, + "grad_norm": 0.043467714857121094, + "language_loss": 0.87962419, + "learning_rate": 0.0008514544189427526, + "loss": 0.89028037, + "num_input_tokens_seen": 117972384, + "router_z_loss_mlp": 0.4675293, + "step": 1426, + "time_per_iteration": 2.679623603820801 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01058603, + "balance_loss_mlp": 1.0118494, + "epoch": 0.2745286648711043, + "flos": 469545978624.0, + "grad_norm": 0.04158543868721512, + "language_loss": 0.89037859, + "learning_rate": 0.0008512327569354511, + "loss": 0.90096468, + "num_input_tokens_seen": 118039584, + "router_z_loss_mlp": 0.46704102, + "step": 1427, + "time_per_iteration": 2.5345683097839355 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01057646, + "balance_loss_mlp": 1.01036775, + "epoch": 0.2747210465563678, + "flos": 473872663296.0, + "grad_norm": 0.05094281183667316, + "language_loss": 0.85685182, + "learning_rate": 0.0008510109585675001, + "loss": 0.8674283, + "num_input_tokens_seen": 118108352, + "router_z_loss_mlp": 0.47241211, + "step": 1428, + "time_per_iteration": 2.5991017818450928 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076946, + "balance_loss_mlp": 1.03031158, + "epoch": 0.2749134282416314, + "flos": 1318059436800.0, + "grad_norm": 0.019364160619571847, + "language_loss": 0.81153345, + "learning_rate": 0.0008507890239250093, + "loss": 0.82230288, + "num_input_tokens_seen": 118331120, + "router_z_loss_mlp": 0.46582031, + "step": 1429, + "time_per_iteration": 4.724486351013184 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081662, + "balance_loss_mlp": 1.03459787, + "epoch": 0.275105809926895, + "flos": 972533129472.0, + "grad_norm": 0.05143903496013185, + "language_loss": 0.82696635, + "learning_rate": 0.0008505669530941415, + "loss": 0.83778298, + "num_input_tokens_seen": 118415872, + "router_z_loss_mlp": 0.47021484, + "step": 1430, + "time_per_iteration": 3.3173024654388428 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01058611, + "balance_loss_mlp": 1.01231062, + "epoch": 0.2752981916121585, + "flos": 528369848832.0, + "grad_norm": 0.04649662222604448, + "language_loss": 0.87158883, + "learning_rate": 0.000850344746161112, + "loss": 0.88217485, + "num_input_tokens_seen": 118483008, + "router_z_loss_mlp": 0.46240234, + "step": 1431, + "time_per_iteration": 2.635831356048584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065544, + "balance_loss_mlp": 1.01943398, + "epoch": 0.2754905732974221, + "flos": 454599424512.0, + "grad_norm": 0.04970989937431765, + "language_loss": 0.90776384, + "learning_rate": 0.0008501224032121894, + "loss": 0.91841936, + "num_input_tokens_seen": 118545840, + "router_z_loss_mlp": 0.46044922, + "step": 1432, + "time_per_iteration": 2.531921148300171 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069408, + "balance_loss_mlp": 1.02339363, + "epoch": 0.27568295498268564, + "flos": 498509788416.0, + "grad_norm": 0.04336527805629792, + "language_loss": 0.84821916, + "learning_rate": 0.0008498999243336946, + "loss": 0.85891324, + "num_input_tokens_seen": 118615168, + "router_z_loss_mlp": 0.45947266, + "step": 1433, + "time_per_iteration": 2.6142802238464355 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068976, + "balance_loss_mlp": 1.02298498, + "epoch": 0.2758753366679492, + "flos": 609417191424.0, + "grad_norm": 0.03822636329404569, + "language_loss": 0.8997575, + "learning_rate": 0.0008496773096120021, + "loss": 0.91044724, + "num_input_tokens_seen": 118690384, + "router_z_loss_mlp": 0.45922852, + "step": 1434, + "time_per_iteration": 2.788863182067871 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066122, + "balance_loss_mlp": 1.01977372, + "epoch": 0.27606771835321275, + "flos": 741437966592.0, + "grad_norm": 0.04844453313229188, + "language_loss": 0.86675751, + "learning_rate": 0.0008494545591335381, + "loss": 0.87741876, + "num_input_tokens_seen": 118763024, + "router_z_loss_mlp": 0.46289062, + "step": 1435, + "time_per_iteration": 2.8883180618286133 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061614, + "balance_loss_mlp": 1.01516986, + "epoch": 0.27626010003847634, + "flos": 555749182464.0, + "grad_norm": 0.03304758436240527, + "language_loss": 0.88791698, + "learning_rate": 0.0008492316729847823, + "loss": 0.89853311, + "num_input_tokens_seen": 118845536, + "router_z_loss_mlp": 0.46386719, + "step": 1436, + "time_per_iteration": 2.794938087463379 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054312, + "balance_loss_mlp": 1.0072248, + "epoch": 0.2764524817237399, + "flos": 543696481536.0, + "grad_norm": 0.13725655625344893, + "language_loss": 0.82129836, + "learning_rate": 0.0008490086512522664, + "loss": 0.83184153, + "num_input_tokens_seen": 118919008, + "router_z_loss_mlp": 0.47045898, + "step": 1437, + "time_per_iteration": 2.6979260444641113 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062257, + "balance_loss_mlp": 1.01495445, + "epoch": 0.27664486340900346, + "flos": 407129336064.0, + "grad_norm": 0.04115092615815086, + "language_loss": 0.92702913, + "learning_rate": 0.0008487854940225755, + "loss": 0.93765163, + "num_input_tokens_seen": 118981376, + "router_z_loss_mlp": 0.47265625, + "step": 1438, + "time_per_iteration": 2.4361565113067627 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055116, + "balance_loss_mlp": 1.0080049, + "epoch": 0.27683724509426705, + "flos": 523157607168.0, + "grad_norm": 0.06281356926864295, + "language_loss": 0.92480713, + "learning_rate": 0.0008485622013823466, + "loss": 0.93535829, + "num_input_tokens_seen": 119050560, + "router_z_loss_mlp": 0.47070312, + "step": 1439, + "time_per_iteration": 2.588972568511963 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060631, + "balance_loss_mlp": 1.01332879, + "epoch": 0.2770296267795306, + "flos": 536410814976.0, + "grad_norm": 0.048827385499573994, + "language_loss": 0.8582921, + "learning_rate": 0.00084833877341827, + "loss": 0.86889839, + "num_input_tokens_seen": 119121104, + "router_z_loss_mlp": 0.47265625, + "step": 1440, + "time_per_iteration": 2.6215152740478516 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063403, + "balance_loss_mlp": 1.01648188, + "epoch": 0.27722200846479417, + "flos": 488970862080.0, + "grad_norm": 0.04074125375838667, + "language_loss": 0.82920921, + "learning_rate": 0.000848115210217088, + "loss": 0.83984327, + "num_input_tokens_seen": 119187712, + "router_z_loss_mlp": 0.46875, + "step": 1441, + "time_per_iteration": 2.578479290008545 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059768, + "balance_loss_mlp": 1.01244187, + "epoch": 0.2774143901500577, + "flos": 619444099584.0, + "grad_norm": 0.03981713509883016, + "language_loss": 0.84628934, + "learning_rate": 0.0008478915118655952, + "loss": 0.85688698, + "num_input_tokens_seen": 119259264, + "router_z_loss_mlp": 0.47290039, + "step": 1442, + "time_per_iteration": 2.697610855102539 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055568, + "balance_loss_mlp": 1.0080508, + "epoch": 0.2776067718353213, + "flos": 514845432576.0, + "grad_norm": 0.032345577367045, + "language_loss": 0.88479745, + "learning_rate": 0.0008476676784506393, + "loss": 0.89535314, + "num_input_tokens_seen": 119328304, + "router_z_loss_mlp": 0.47485352, + "step": 1443, + "time_per_iteration": 2.6315112113952637 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01056662, + "balance_loss_mlp": 1.00897789, + "epoch": 0.2777991535205848, + "flos": 1006042342656.0, + "grad_norm": 0.04008629757661371, + "language_loss": 0.8412413, + "learning_rate": 0.0008474437100591201, + "loss": 0.85180795, + "num_input_tokens_seen": 119412352, + "router_z_loss_mlp": 0.4765625, + "step": 1444, + "time_per_iteration": 3.3463656902313232 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051562, + "balance_loss_mlp": 1.00371146, + "epoch": 0.2779915352058484, + "flos": 551376811008.0, + "grad_norm": 0.033834103416723965, + "language_loss": 0.87362587, + "learning_rate": 0.0008472196067779898, + "loss": 0.88414145, + "num_input_tokens_seen": 119484464, + "router_z_loss_mlp": 0.47827148, + "step": 1445, + "time_per_iteration": 2.6647677421569824 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054302, + "balance_loss_mlp": 1.00649953, + "epoch": 0.278183916891112, + "flos": 875217216768.0, + "grad_norm": 0.0457526450580795, + "language_loss": 0.87776953, + "learning_rate": 0.0008469953686942531, + "loss": 0.88831258, + "num_input_tokens_seen": 119557280, + "router_z_loss_mlp": 0.4777832, + "step": 1446, + "time_per_iteration": 3.076035261154175 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01056504, + "balance_loss_mlp": 1.00882006, + "epoch": 0.2783762985763755, + "flos": 625196812800.0, + "grad_norm": 0.042452946668595545, + "language_loss": 0.85090148, + "learning_rate": 0.0008467709958949668, + "loss": 0.86146653, + "num_input_tokens_seen": 119631232, + "router_z_loss_mlp": 0.4765625, + "step": 1447, + "time_per_iteration": 2.744459629058838 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01056121, + "balance_loss_mlp": 1.00850928, + "epoch": 0.2785686802616391, + "flos": 582912721152.0, + "grad_norm": 0.04136143865758397, + "language_loss": 0.87796736, + "learning_rate": 0.0008465464884672403, + "loss": 0.88852853, + "num_input_tokens_seen": 119700224, + "router_z_loss_mlp": 0.47583008, + "step": 1448, + "time_per_iteration": 2.6887707710266113 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049992, + "balance_loss_mlp": 1.00235641, + "epoch": 0.27876106194690264, + "flos": 588540034560.0, + "grad_norm": 0.031263057988026755, + "language_loss": 0.87220562, + "learning_rate": 0.0008463218464982348, + "loss": 0.88270551, + "num_input_tokens_seen": 119781376, + "router_z_loss_mlp": 0.47607422, + "step": 1449, + "time_per_iteration": 2.8354454040527344 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050828, + "balance_loss_mlp": 1.00326335, + "epoch": 0.27895344363216623, + "flos": 877431592704.0, + "grad_norm": 0.03730856956989286, + "language_loss": 0.89626968, + "learning_rate": 0.0008460970700751645, + "loss": 0.90677798, + "num_input_tokens_seen": 119856672, + "router_z_loss_mlp": 0.4753418, + "step": 1450, + "time_per_iteration": 3.12705135345459 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062156, + "balance_loss_mlp": 1.01442492, + "epoch": 0.27914582531742976, + "flos": 605036071680.0, + "grad_norm": 0.0379360607610882, + "language_loss": 0.8910991, + "learning_rate": 0.000845872159285295, + "loss": 0.90172064, + "num_input_tokens_seen": 119929008, + "router_z_loss_mlp": 0.47705078, + "step": 1451, + "time_per_iteration": 2.792448043823242 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065174, + "balance_loss_mlp": 1.02025604, + "epoch": 0.27933820700269335, + "flos": 1501133346048.0, + "grad_norm": 0.01376981107013524, + "language_loss": 0.77766848, + "learning_rate": 0.0008456471142159447, + "loss": 0.7883203, + "num_input_tokens_seen": 120164032, + "router_z_loss_mlp": 0.44921875, + "step": 1452, + "time_per_iteration": 4.966037034988403 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01056615, + "balance_loss_mlp": 1.00921774, + "epoch": 0.2795305886879569, + "flos": 1033518885888.0, + "grad_norm": 0.037040263742322534, + "language_loss": 0.87809932, + "learning_rate": 0.0008454219349544836, + "loss": 0.88866544, + "num_input_tokens_seen": 120246784, + "router_z_loss_mlp": 0.47363281, + "step": 1453, + "time_per_iteration": 3.428589344024658 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055627, + "balance_loss_mlp": 1.00851548, + "epoch": 0.27972297037322047, + "flos": 608227378176.0, + "grad_norm": 0.03307542484781365, + "language_loss": 0.83086669, + "learning_rate": 0.000845196621588334, + "loss": 0.84142298, + "num_input_tokens_seen": 120318208, + "router_z_loss_mlp": 0.47070312, + "step": 1454, + "time_per_iteration": 2.7620909214019775 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053868, + "balance_loss_mlp": 1.00661373, + "epoch": 0.27991535205848406, + "flos": 631561929216.0, + "grad_norm": 0.034345141589198824, + "language_loss": 0.77104861, + "learning_rate": 0.0008449711742049706, + "loss": 0.78158724, + "num_input_tokens_seen": 120393248, + "router_z_loss_mlp": 0.47216797, + "step": 1455, + "time_per_iteration": 2.7629852294921875 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01057076, + "balance_loss_mlp": 1.009655, + "epoch": 0.2801077337437476, + "flos": 550354193664.0, + "grad_norm": 0.03843537360044117, + "language_loss": 0.85426688, + "learning_rate": 0.0008447455928919196, + "loss": 0.86483765, + "num_input_tokens_seen": 120461040, + "router_z_loss_mlp": 0.47387695, + "step": 1456, + "time_per_iteration": 2.672311782836914 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054361, + "balance_loss_mlp": 1.00670111, + "epoch": 0.2803001154290112, + "flos": 487742164992.0, + "grad_norm": 0.03308646323695097, + "language_loss": 0.8834334, + "learning_rate": 0.0008445198777367595, + "loss": 0.89397705, + "num_input_tokens_seen": 120530400, + "router_z_loss_mlp": 0.47631836, + "step": 1457, + "time_per_iteration": 2.5908620357513428 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054094, + "balance_loss_mlp": 1.00633848, + "epoch": 0.2804924971142747, + "flos": 523092478464.0, + "grad_norm": 0.036759152060528134, + "language_loss": 0.82140505, + "learning_rate": 0.0008442940288271208, + "loss": 0.8319459, + "num_input_tokens_seen": 120598304, + "router_z_loss_mlp": 0.47729492, + "step": 1458, + "time_per_iteration": 2.6980724334716797 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01057218, + "balance_loss_mlp": 1.00953484, + "epoch": 0.2806848787995383, + "flos": 528850049280.0, + "grad_norm": 0.03179596299998768, + "language_loss": 0.88266242, + "learning_rate": 0.0008440680462506856, + "loss": 0.89323461, + "num_input_tokens_seen": 120675712, + "router_z_loss_mlp": 0.4765625, + "step": 1459, + "time_per_iteration": 2.818169593811035 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01058111, + "balance_loss_mlp": 1.01047492, + "epoch": 0.2808772604848018, + "flos": 486485277696.0, + "grad_norm": 0.030255628698855237, + "language_loss": 0.87626624, + "learning_rate": 0.0008438419300951883, + "loss": 0.88684738, + "num_input_tokens_seen": 120746544, + "router_z_loss_mlp": 0.47607422, + "step": 1460, + "time_per_iteration": 2.644911527633667 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01056062, + "balance_loss_mlp": 1.00825953, + "epoch": 0.2810696421700654, + "flos": 619340087040.0, + "grad_norm": 0.03597967684758823, + "language_loss": 0.87670606, + "learning_rate": 0.0008436156804484148, + "loss": 0.88726676, + "num_input_tokens_seen": 120823520, + "router_z_loss_mlp": 0.4777832, + "step": 1461, + "time_per_iteration": 2.7725627422332764 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054255, + "balance_loss_mlp": 1.00657165, + "epoch": 0.28126202385532895, + "flos": 455687170560.0, + "grad_norm": 0.0394598317615188, + "language_loss": 0.89263237, + "learning_rate": 0.0008433892973982031, + "loss": 0.90317494, + "num_input_tokens_seen": 120889568, + "router_z_loss_mlp": 0.4765625, + "step": 1462, + "time_per_iteration": 2.5091495513916016 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063928, + "balance_loss_mlp": 1.0156002, + "epoch": 0.28145440554059253, + "flos": 531739044864.0, + "grad_norm": 0.041651284680957995, + "language_loss": 0.866346, + "learning_rate": 0.0008431627810324431, + "loss": 0.87698531, + "num_input_tokens_seen": 120958480, + "router_z_loss_mlp": 0.4831543, + "step": 1463, + "time_per_iteration": 2.6705899238586426 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01056697, + "balance_loss_mlp": 1.00872695, + "epoch": 0.2816467872258561, + "flos": 453164647680.0, + "grad_norm": 0.03544245246238935, + "language_loss": 0.81977493, + "learning_rate": 0.000842936131439076, + "loss": 0.83034194, + "num_input_tokens_seen": 121028032, + "router_z_loss_mlp": 0.47949219, + "step": 1464, + "time_per_iteration": 2.610419511795044 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055364, + "balance_loss_mlp": 1.00763226, + "epoch": 0.28183916891111965, + "flos": 473705467392.0, + "grad_norm": 0.034609246408770326, + "language_loss": 0.89094436, + "learning_rate": 0.0008427093487060951, + "loss": 0.90149802, + "num_input_tokens_seen": 121099280, + "router_z_loss_mlp": 0.47705078, + "step": 1465, + "time_per_iteration": 2.72540283203125 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054608, + "balance_loss_mlp": 1.00656629, + "epoch": 0.28203155059638324, + "flos": 558189080064.0, + "grad_norm": 0.02738603689522664, + "language_loss": 0.8552286, + "learning_rate": 0.000842482432921545, + "loss": 0.86577463, + "num_input_tokens_seen": 121180240, + "router_z_loss_mlp": 0.48022461, + "step": 1466, + "time_per_iteration": 2.8388257026672363 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105456, + "balance_loss_mlp": 1.00654304, + "epoch": 0.28222393228164677, + "flos": 417879462912.0, + "grad_norm": 0.03402242241185157, + "language_loss": 0.88381398, + "learning_rate": 0.0008422553841735225, + "loss": 0.89435959, + "num_input_tokens_seen": 121242736, + "router_z_loss_mlp": 0.47998047, + "step": 1467, + "time_per_iteration": 2.495126485824585 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01057213, + "balance_loss_mlp": 1.00917137, + "epoch": 0.28241631396691036, + "flos": 606041192448.0, + "grad_norm": 0.032675143321136885, + "language_loss": 0.86003613, + "learning_rate": 0.0008420282025501757, + "loss": 0.87060827, + "num_input_tokens_seen": 121319248, + "router_z_loss_mlp": 0.48022461, + "step": 1468, + "time_per_iteration": 2.7908880710601807 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052419, + "balance_loss_mlp": 1.00473487, + "epoch": 0.2826086956521739, + "flos": 574051326720.0, + "grad_norm": 0.03300906221563125, + "language_loss": 0.86686498, + "learning_rate": 0.0008418008881397043, + "loss": 0.87738919, + "num_input_tokens_seen": 121392064, + "router_z_loss_mlp": 0.4765625, + "step": 1469, + "time_per_iteration": 2.7646520137786865 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054478, + "balance_loss_mlp": 1.00693762, + "epoch": 0.2828010773374375, + "flos": 844319954688.0, + "grad_norm": 0.03195966631281891, + "language_loss": 0.84124947, + "learning_rate": 0.0008415734410303595, + "loss": 0.85179424, + "num_input_tokens_seen": 121475984, + "router_z_loss_mlp": 0.47509766, + "step": 1470, + "time_per_iteration": 3.1784656047821045 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059132, + "balance_loss_mlp": 1.01151943, + "epoch": 0.28299345902270107, + "flos": 543772303872.0, + "grad_norm": 0.0307788797974712, + "language_loss": 0.91781342, + "learning_rate": 0.0008413458613104444, + "loss": 0.92840481, + "num_input_tokens_seen": 121551024, + "router_z_loss_mlp": 0.47583008, + "step": 1471, + "time_per_iteration": 2.7000675201416016 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01057543, + "balance_loss_mlp": 1.00995505, + "epoch": 0.2831858407079646, + "flos": 572755555584.0, + "grad_norm": 0.03187726406761503, + "language_loss": 0.84024346, + "learning_rate": 0.0008411181490683129, + "loss": 0.85081899, + "num_input_tokens_seen": 121624528, + "router_z_loss_mlp": 0.47558594, + "step": 1472, + "time_per_iteration": 2.7358603477478027 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105744, + "balance_loss_mlp": 1.00958943, + "epoch": 0.2833782223932282, + "flos": 765172038144.0, + "grad_norm": 0.03258814259190176, + "language_loss": 0.83765668, + "learning_rate": 0.0008408903043923707, + "loss": 0.84823108, + "num_input_tokens_seen": 121706736, + "router_z_loss_mlp": 0.47827148, + "step": 1473, + "time_per_iteration": 3.016690492630005 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060961, + "balance_loss_mlp": 1.01291955, + "epoch": 0.2835706040784917, + "flos": 540088157952.0, + "grad_norm": 0.03783140599229066, + "language_loss": 0.82463539, + "learning_rate": 0.0008406623273710754, + "loss": 0.83524501, + "num_input_tokens_seen": 121773008, + "router_z_loss_mlp": 0.48022461, + "step": 1474, + "time_per_iteration": 2.651932954788208 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055359, + "balance_loss_mlp": 1.00736535, + "epoch": 0.2837629857637553, + "flos": 531654474240.0, + "grad_norm": 0.03425671969493541, + "language_loss": 0.84354198, + "learning_rate": 0.0008404342180929351, + "loss": 0.85409558, + "num_input_tokens_seen": 121840016, + "router_z_loss_mlp": 0.47973633, + "step": 1475, + "time_per_iteration": 2.6064491271972656 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105922, + "balance_loss_mlp": 1.01120257, + "epoch": 0.28395536744901884, + "flos": 541110775296.0, + "grad_norm": 0.03564784056716401, + "language_loss": 0.8245163, + "learning_rate": 0.00084020597664651, + "loss": 0.83510846, + "num_input_tokens_seen": 121915008, + "router_z_loss_mlp": 0.47998047, + "step": 1476, + "time_per_iteration": 2.7597527503967285 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01056946, + "balance_loss_mlp": 1.00890458, + "epoch": 0.2841477491342824, + "flos": 574802735616.0, + "grad_norm": 0.037292940254278956, + "language_loss": 0.8496412, + "learning_rate": 0.0008399776031204111, + "loss": 0.86021066, + "num_input_tokens_seen": 121987456, + "router_z_loss_mlp": 0.48022461, + "step": 1477, + "time_per_iteration": 2.759089231491089 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051956, + "balance_loss_mlp": 1.00412941, + "epoch": 0.28434013081954596, + "flos": 573139524864.0, + "grad_norm": 0.03522410712402375, + "language_loss": 0.80955458, + "learning_rate": 0.0008397490976033009, + "loss": 0.8200742, + "num_input_tokens_seen": 122058720, + "router_z_loss_mlp": 0.47802734, + "step": 1478, + "time_per_iteration": 2.6423845291137695 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01056133, + "balance_loss_mlp": 1.0100708, + "epoch": 0.28453251250480954, + "flos": 1556676481536.0, + "grad_norm": 0.010218347035897045, + "language_loss": 0.77879643, + "learning_rate": 0.000839520460183893, + "loss": 0.78935778, + "num_input_tokens_seen": 122285792, + "router_z_loss_mlp": 0.45996094, + "step": 1479, + "time_per_iteration": 4.732174396514893 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053334, + "balance_loss_mlp": 1.0056026, + "epoch": 0.28472489419007313, + "flos": 750427673088.0, + "grad_norm": 0.028762601306014927, + "language_loss": 0.86263019, + "learning_rate": 0.0008392916909509525, + "loss": 0.87316358, + "num_input_tokens_seen": 122366608, + "router_z_loss_mlp": 0.47705078, + "step": 1480, + "time_per_iteration": 3.0842366218566895 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105593, + "balance_loss_mlp": 1.00817478, + "epoch": 0.28491727587533666, + "flos": 491139551232.0, + "grad_norm": 0.03654292068957682, + "language_loss": 0.86134857, + "learning_rate": 0.0008390627899932954, + "loss": 0.87190789, + "num_input_tokens_seen": 122435536, + "router_z_loss_mlp": 0.47729492, + "step": 1481, + "time_per_iteration": 2.615267753601074 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053309, + "balance_loss_mlp": 1.0055064, + "epoch": 0.28510965756060025, + "flos": 730360250880.0, + "grad_norm": 0.03257927187729683, + "language_loss": 0.89633858, + "learning_rate": 0.000838833757399789, + "loss": 0.90687168, + "num_input_tokens_seen": 122515584, + "router_z_loss_mlp": 0.4777832, + "step": 1482, + "time_per_iteration": 2.9428212642669678 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053301, + "balance_loss_mlp": 1.00528359, + "epoch": 0.2853020392458638, + "flos": 552670636800.0, + "grad_norm": 0.036455185890550544, + "language_loss": 0.82055122, + "learning_rate": 0.0008386045932593515, + "loss": 0.83108419, + "num_input_tokens_seen": 122585552, + "router_z_loss_mlp": 0.47998047, + "step": 1483, + "time_per_iteration": 2.724045991897583 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052204, + "balance_loss_mlp": 1.00416255, + "epoch": 0.28549442093112737, + "flos": 756097761024.0, + "grad_norm": 0.02777472605390161, + "language_loss": 0.8718375, + "learning_rate": 0.0008383752976609525, + "loss": 0.8823595, + "num_input_tokens_seen": 122658928, + "router_z_loss_mlp": 0.48022461, + "step": 1484, + "time_per_iteration": 2.929905891418457 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054156, + "balance_loss_mlp": 1.00618601, + "epoch": 0.2856868026163909, + "flos": 539704188672.0, + "grad_norm": 0.028392575187028035, + "language_loss": 0.8111921, + "learning_rate": 0.0008381458706936123, + "loss": 0.82173365, + "num_input_tokens_seen": 122729056, + "router_z_loss_mlp": 0.47949219, + "step": 1485, + "time_per_iteration": 2.717545986175537 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053651, + "balance_loss_mlp": 1.00563323, + "epoch": 0.2858791843016545, + "flos": 584921017344.0, + "grad_norm": 0.03333139148622456, + "language_loss": 0.88664746, + "learning_rate": 0.0008379163124464025, + "loss": 0.8971839, + "num_input_tokens_seen": 122802832, + "router_z_loss_mlp": 0.47998047, + "step": 1486, + "time_per_iteration": 2.7234747409820557 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054605, + "balance_loss_mlp": 1.00685012, + "epoch": 0.286071565986918, + "flos": 646052582400.0, + "grad_norm": 0.03454926432429506, + "language_loss": 0.77946562, + "learning_rate": 0.0008376866230084452, + "loss": 0.79001164, + "num_input_tokens_seen": 122881328, + "router_z_loss_mlp": 0.47729492, + "step": 1487, + "time_per_iteration": 2.856128692626953 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105205, + "balance_loss_mlp": 1.00408018, + "epoch": 0.2862639476721816, + "flos": 492331309824.0, + "grad_norm": 0.034661288064865674, + "language_loss": 0.87705112, + "learning_rate": 0.000837456802468914, + "loss": 0.88757157, + "num_input_tokens_seen": 122949680, + "router_z_loss_mlp": 0.47949219, + "step": 1488, + "time_per_iteration": 2.57454514503479 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054997, + "balance_loss_mlp": 1.00700366, + "epoch": 0.2864563293574452, + "flos": 522745447680.0, + "grad_norm": 0.035472984165373166, + "language_loss": 0.86247557, + "learning_rate": 0.0008372268509170331, + "loss": 0.87302554, + "num_input_tokens_seen": 123024736, + "router_z_loss_mlp": 0.47973633, + "step": 1489, + "time_per_iteration": 2.661430597305298 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105259, + "balance_loss_mlp": 1.00452483, + "epoch": 0.2866487110427087, + "flos": 548257436160.0, + "grad_norm": 0.03357077125927176, + "language_loss": 0.85950172, + "learning_rate": 0.0008369967684420779, + "loss": 0.8700276, + "num_input_tokens_seen": 123097344, + "router_z_loss_mlp": 0.48046875, + "step": 1490, + "time_per_iteration": 2.703200101852417 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052654, + "balance_loss_mlp": 1.0047555, + "epoch": 0.2868410927279723, + "flos": 483218148864.0, + "grad_norm": 0.03511930922286833, + "language_loss": 0.8567192, + "learning_rate": 0.0008367665551333736, + "loss": 0.86724567, + "num_input_tokens_seen": 123166240, + "router_z_loss_mlp": 0.47875977, + "step": 1491, + "time_per_iteration": 2.6027045249938965 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051173, + "balance_loss_mlp": 1.00334597, + "epoch": 0.28703347441323585, + "flos": 726137578752.0, + "grad_norm": 0.03668604763704844, + "language_loss": 0.86648476, + "learning_rate": 0.0008365362110802977, + "loss": 0.87699652, + "num_input_tokens_seen": 123238160, + "router_z_loss_mlp": 0.47802734, + "step": 1492, + "time_per_iteration": 2.872743606567383 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054184, + "balance_loss_mlp": 1.00630987, + "epoch": 0.28722585609849943, + "flos": 636214257408.0, + "grad_norm": 0.0346446819062503, + "language_loss": 0.83264536, + "learning_rate": 0.0008363057363722773, + "loss": 0.84318721, + "num_input_tokens_seen": 123319504, + "router_z_loss_mlp": 0.47851562, + "step": 1493, + "time_per_iteration": 2.830925941467285 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055811, + "balance_loss_mlp": 1.00827014, + "epoch": 0.28741823778376296, + "flos": 511252660224.0, + "grad_norm": 0.03541460771255837, + "language_loss": 0.8481909, + "learning_rate": 0.0008360751310987906, + "loss": 0.85874903, + "num_input_tokens_seen": 123387008, + "router_z_loss_mlp": 0.47509766, + "step": 1494, + "time_per_iteration": 2.6102633476257324 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055778, + "balance_loss_mlp": 1.00840437, + "epoch": 0.28761061946902655, + "flos": 604932059136.0, + "grad_norm": 0.030521465086419404, + "language_loss": 0.86298919, + "learning_rate": 0.0008358443953493666, + "loss": 0.87354696, + "num_input_tokens_seen": 123471056, + "router_z_loss_mlp": 0.47338867, + "step": 1495, + "time_per_iteration": 2.8808648586273193 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053186, + "balance_loss_mlp": 1.00590765, + "epoch": 0.28780300115429014, + "flos": 408060579840.0, + "grad_norm": 0.03760103829607362, + "language_loss": 0.89352167, + "learning_rate": 0.0008356135292135851, + "loss": 0.90405357, + "num_input_tokens_seen": 123535024, + "router_z_loss_mlp": 0.47241211, + "step": 1496, + "time_per_iteration": 2.5025811195373535 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055101, + "balance_loss_mlp": 1.00794196, + "epoch": 0.28799538283955367, + "flos": 375745070592.0, + "grad_norm": 0.04396673202836768, + "language_loss": 0.93575335, + "learning_rate": 0.0008353825327810758, + "loss": 0.94630432, + "num_input_tokens_seen": 123596224, + "router_z_loss_mlp": 0.47119141, + "step": 1497, + "time_per_iteration": 2.4455389976501465 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053362, + "balance_loss_mlp": 1.00601161, + "epoch": 0.28818776452481726, + "flos": 593020309248.0, + "grad_norm": 0.03575929377279749, + "language_loss": 0.82620615, + "learning_rate": 0.00083515140614152, + "loss": 0.83673978, + "num_input_tokens_seen": 123668640, + "router_z_loss_mlp": 0.47314453, + "step": 1498, + "time_per_iteration": 2.7318496704101562 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059657, + "balance_loss_mlp": 1.01204443, + "epoch": 0.2883801462100808, + "flos": 536104613376.0, + "grad_norm": 0.03408677708994041, + "language_loss": 0.8771323, + "learning_rate": 0.0008349201493846485, + "loss": 0.88772887, + "num_input_tokens_seen": 123740816, + "router_z_loss_mlp": 0.47583008, + "step": 1499, + "time_per_iteration": 2.671473503112793 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105332, + "balance_loss_mlp": 1.00606573, + "epoch": 0.2885725278953444, + "flos": 481077649920.0, + "grad_norm": 0.037679681148910335, + "language_loss": 0.90198493, + "learning_rate": 0.0008346887626002432, + "loss": 0.91251814, + "num_input_tokens_seen": 123805968, + "router_z_loss_mlp": 0.47216797, + "step": 1500, + "time_per_iteration": 2.565556287765503 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050278, + "balance_loss_mlp": 1.00290418, + "epoch": 0.2887649095806079, + "flos": 465030710784.0, + "grad_norm": 0.03453406345592784, + "language_loss": 0.87256986, + "learning_rate": 0.000834457245878137, + "loss": 0.88307267, + "num_input_tokens_seen": 123876576, + "router_z_loss_mlp": 0.47338867, + "step": 1501, + "time_per_iteration": 2.6684980392456055 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051416, + "balance_loss_mlp": 1.00411427, + "epoch": 0.2889572912658715, + "flos": 932641303296.0, + "grad_norm": 0.034149555340210275, + "language_loss": 0.82079703, + "learning_rate": 0.000834225599308212, + "loss": 0.83131123, + "num_input_tokens_seen": 123967664, + "router_z_loss_mlp": 0.47265625, + "step": 1502, + "time_per_iteration": 3.2747607231140137 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052615, + "balance_loss_mlp": 1.00526536, + "epoch": 0.28914967295113503, + "flos": 571257595392.0, + "grad_norm": 0.03426641952710734, + "language_loss": 0.85934782, + "learning_rate": 0.0008339938229804016, + "loss": 0.869874, + "num_input_tokens_seen": 124039680, + "router_z_loss_mlp": 0.47314453, + "step": 1503, + "time_per_iteration": 2.7027056217193604 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062946, + "balance_loss_mlp": 1.01783752, + "epoch": 0.2893420546363986, + "flos": 1489874828544.0, + "grad_norm": 0.016861580481692767, + "language_loss": 0.75434822, + "learning_rate": 0.0008337619169846895, + "loss": 0.76497769, + "num_input_tokens_seen": 124278848, + "router_z_loss_mlp": 0.45019531, + "step": 1504, + "time_per_iteration": 4.9503560066223145 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010625, + "balance_loss_mlp": 1.01536465, + "epoch": 0.2895344363216622, + "flos": 471182944512.0, + "grad_norm": 0.04276572481675365, + "language_loss": 0.8589167, + "learning_rate": 0.0008335298814111094, + "loss": 0.86954165, + "num_input_tokens_seen": 124346736, + "router_z_loss_mlp": 0.47094727, + "step": 1505, + "time_per_iteration": 2.548398017883301 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063653, + "balance_loss_mlp": 1.01654112, + "epoch": 0.28972681800692573, + "flos": 649341098496.0, + "grad_norm": 0.03572405467889404, + "language_loss": 0.89211309, + "learning_rate": 0.0008332977163497455, + "loss": 0.90274966, + "num_input_tokens_seen": 124420816, + "router_z_loss_mlp": 0.47070312, + "step": 1506, + "time_per_iteration": 2.786355972290039 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059474, + "balance_loss_mlp": 1.01241064, + "epoch": 0.2899191996921893, + "flos": 573306720768.0, + "grad_norm": 0.03560254091063293, + "language_loss": 0.84471554, + "learning_rate": 0.0008330654218907325, + "loss": 0.85531026, + "num_input_tokens_seen": 124490480, + "router_z_loss_mlp": 0.47021484, + "step": 1507, + "time_per_iteration": 2.706066131591797 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054224, + "balance_loss_mlp": 1.00701702, + "epoch": 0.29011158137745285, + "flos": 662638047744.0, + "grad_norm": 0.03364876986368613, + "language_loss": 0.82771999, + "learning_rate": 0.0008328329981242548, + "loss": 0.8382622, + "num_input_tokens_seen": 124564960, + "router_z_loss_mlp": 0.47167969, + "step": 1508, + "time_per_iteration": 2.9025378227233887 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053376, + "balance_loss_mlp": 1.00607395, + "epoch": 0.29030396306271644, + "flos": 537403296768.0, + "grad_norm": 0.0314370875382877, + "language_loss": 0.88638061, + "learning_rate": 0.0008326004451405475, + "loss": 0.89691436, + "num_input_tokens_seen": 124637424, + "router_z_loss_mlp": 0.47265625, + "step": 1509, + "time_per_iteration": 2.740288496017456 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091124, + "balance_loss_mlp": 1.04370284, + "epoch": 0.29049634474798, + "flos": 512956700160.0, + "grad_norm": 0.04021928954994292, + "language_loss": 0.83711147, + "learning_rate": 0.0008323677630298957, + "loss": 0.84802264, + "num_input_tokens_seen": 124704832, + "router_z_loss_mlp": 0.47387695, + "step": 1510, + "time_per_iteration": 2.5700840950012207 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01056321, + "balance_loss_mlp": 1.00935256, + "epoch": 0.29068872643324356, + "flos": 614983266816.0, + "grad_norm": 0.03498537298994642, + "language_loss": 0.86212677, + "learning_rate": 0.0008321349518826345, + "loss": 0.87268996, + "num_input_tokens_seen": 124779600, + "router_z_loss_mlp": 0.46923828, + "step": 1511, + "time_per_iteration": 2.7968146800994873 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060609, + "balance_loss_mlp": 1.01364064, + "epoch": 0.2908811081185071, + "flos": 547469088768.0, + "grad_norm": 0.03734404843374857, + "language_loss": 0.95525789, + "learning_rate": 0.0008319020117891491, + "loss": 0.96586394, + "num_input_tokens_seen": 124844128, + "router_z_loss_mlp": 0.46923828, + "step": 1512, + "time_per_iteration": 2.646127939224243 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01058015, + "balance_loss_mlp": 1.01107061, + "epoch": 0.2910734898037707, + "flos": 605902186752.0, + "grad_norm": 0.03463533015087841, + "language_loss": 0.88378417, + "learning_rate": 0.0008316689428398751, + "loss": 0.89436436, + "num_input_tokens_seen": 124915376, + "router_z_loss_mlp": 0.46899414, + "step": 1513, + "time_per_iteration": 2.7310631275177 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01056228, + "balance_loss_mlp": 1.00935447, + "epoch": 0.29126587148903427, + "flos": 575836046592.0, + "grad_norm": 0.028150288904366032, + "language_loss": 0.89498413, + "learning_rate": 0.0008314357451252979, + "loss": 0.90554643, + "num_input_tokens_seen": 124995504, + "router_z_loss_mlp": 0.46826172, + "step": 1514, + "time_per_iteration": 2.8262994289398193 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054124, + "balance_loss_mlp": 1.00727487, + "epoch": 0.2914582531742978, + "flos": 572134404096.0, + "grad_norm": 0.05354948204009119, + "language_loss": 0.89001274, + "learning_rate": 0.0008312024187359527, + "loss": 0.90055394, + "num_input_tokens_seen": 125064192, + "router_z_loss_mlp": 0.46801758, + "step": 1515, + "time_per_iteration": 2.717780590057373 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105823, + "balance_loss_mlp": 1.01109469, + "epoch": 0.2916506348595614, + "flos": 732303418368.0, + "grad_norm": 0.032865630858266236, + "language_loss": 0.8831327, + "learning_rate": 0.000830968963762425, + "loss": 0.89371502, + "num_input_tokens_seen": 125150560, + "router_z_loss_mlp": 0.47094727, + "step": 1516, + "time_per_iteration": 3.080526828765869 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051181, + "balance_loss_mlp": 1.00383127, + "epoch": 0.2918430165448249, + "flos": 511467488256.0, + "grad_norm": 0.032871242995291323, + "language_loss": 0.84882748, + "learning_rate": 0.0008307353802953497, + "loss": 0.85933936, + "num_input_tokens_seen": 125219264, + "router_z_loss_mlp": 0.47314453, + "step": 1517, + "time_per_iteration": 2.744476318359375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084976, + "balance_loss_mlp": 1.03726828, + "epoch": 0.2920353982300885, + "flos": 631607616000.0, + "grad_norm": 0.03594729450056152, + "language_loss": 0.86997348, + "learning_rate": 0.0008305016684254125, + "loss": 0.88082325, + "num_input_tokens_seen": 125301904, + "router_z_loss_mlp": 0.47680664, + "step": 1518, + "time_per_iteration": 2.8340506553649902 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047384, + "balance_loss_mlp": 1.00001049, + "epoch": 0.29222777991535204, + "flos": 502671222528.0, + "grad_norm": 0.03192476620539529, + "language_loss": 0.87901479, + "learning_rate": 0.0008302678282433479, + "loss": 0.88948864, + "num_input_tokens_seen": 125367712, + "router_z_loss_mlp": 0.47338867, + "step": 1519, + "time_per_iteration": 2.5783281326293945 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048912, + "balance_loss_mlp": 1.00177681, + "epoch": 0.2924201616006156, + "flos": 487842286848.0, + "grad_norm": 0.03491462978028735, + "language_loss": 0.85667795, + "learning_rate": 0.0008300338598399411, + "loss": 0.86716712, + "num_input_tokens_seen": 125437648, + "router_z_loss_mlp": 0.47094727, + "step": 1520, + "time_per_iteration": 2.6763737201690674 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105218, + "balance_loss_mlp": 1.0049969, + "epoch": 0.2926125432858792, + "flos": 477411000576.0, + "grad_norm": 0.036990289889529016, + "language_loss": 0.957196, + "learning_rate": 0.0008297997633060263, + "loss": 0.96771777, + "num_input_tokens_seen": 125502432, + "router_z_loss_mlp": 0.47143555, + "step": 1521, + "time_per_iteration": 2.5368785858154297 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055222, + "balance_loss_mlp": 1.00799167, + "epoch": 0.29280492497114274, + "flos": 677868449280.0, + "grad_norm": 0.0362418142607002, + "language_loss": 0.86058486, + "learning_rate": 0.0008295655387324883, + "loss": 0.87113714, + "num_input_tokens_seen": 125575424, + "router_z_loss_mlp": 0.47192383, + "step": 1522, + "time_per_iteration": 2.8447062969207764 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055265, + "balance_loss_mlp": 1.0079869, + "epoch": 0.29299730665640633, + "flos": 459345071616.0, + "grad_norm": 0.03782463739456531, + "language_loss": 0.86245579, + "learning_rate": 0.0008293311862102609, + "loss": 0.87300849, + "num_input_tokens_seen": 125639040, + "router_z_loss_mlp": 0.47241211, + "step": 1523, + "time_per_iteration": 2.5397908687591553 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050568, + "balance_loss_mlp": 1.00328994, + "epoch": 0.29318968834166986, + "flos": 447496505088.0, + "grad_norm": 0.03500221637525105, + "language_loss": 0.90103561, + "learning_rate": 0.0008290967058303275, + "loss": 0.91154128, + "num_input_tokens_seen": 125701712, + "router_z_loss_mlp": 0.47241211, + "step": 1524, + "time_per_iteration": 2.4784419536590576 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081064, + "balance_loss_mlp": 1.03349924, + "epoch": 0.29338207002693345, + "flos": 451256473344.0, + "grad_norm": 0.038529021386844775, + "language_loss": 0.87365985, + "learning_rate": 0.0008288620976837219, + "loss": 0.88447046, + "num_input_tokens_seen": 125765088, + "router_z_loss_mlp": 0.4753418, + "step": 1525, + "time_per_iteration": 2.540762424468994 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054383, + "balance_loss_mlp": 1.00684249, + "epoch": 0.293574451712197, + "flos": 503285571072.0, + "grad_norm": 0.03477645959362119, + "language_loss": 0.8372373, + "learning_rate": 0.000828627361861527, + "loss": 0.84778112, + "num_input_tokens_seen": 125831328, + "router_z_loss_mlp": 0.47509766, + "step": 1526, + "time_per_iteration": 2.583862066268921 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01058639, + "balance_loss_mlp": 1.01124167, + "epoch": 0.29376683339746057, + "flos": 697684104960.0, + "grad_norm": 0.03858140978476568, + "language_loss": 0.85503912, + "learning_rate": 0.0008283924984548752, + "loss": 0.8656255, + "num_input_tokens_seen": 125903664, + "router_z_loss_mlp": 0.47363281, + "step": 1527, + "time_per_iteration": 2.848947525024414 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054231, + "balance_loss_mlp": 1.00680923, + "epoch": 0.2939592150827241, + "flos": 479542751232.0, + "grad_norm": 0.03208252397749005, + "language_loss": 0.8577444, + "learning_rate": 0.0008281575075549485, + "loss": 0.86828673, + "num_input_tokens_seen": 125971856, + "router_z_loss_mlp": 0.47387695, + "step": 1528, + "time_per_iteration": 2.6076998710632324 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063099, + "balance_loss_mlp": 1.01703644, + "epoch": 0.2941515967679877, + "flos": 1488389507328.0, + "grad_norm": 0.010941905571601225, + "language_loss": 0.77352691, + "learning_rate": 0.000827922389252979, + "loss": 0.78415793, + "num_input_tokens_seen": 126183968, + "router_z_loss_mlp": 0.45996094, + "step": 1529, + "time_per_iteration": 4.672811508178711 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01175133, + "balance_loss_mlp": 1.12690103, + "epoch": 0.2943439784532513, + "flos": 675400361472.0, + "grad_norm": 0.05299717257038309, + "language_loss": 0.90924174, + "learning_rate": 0.0008276871436402469, + "loss": 0.92099309, + "num_input_tokens_seen": 126254448, + "router_z_loss_mlp": 0.48217773, + "step": 1530, + "time_per_iteration": 2.8220977783203125 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010581, + "balance_loss_mlp": 1.01096439, + "epoch": 0.2945363601385148, + "flos": 577383584256.0, + "grad_norm": 0.03620573442946411, + "language_loss": 0.88955015, + "learning_rate": 0.000827451770808083, + "loss": 0.90013111, + "num_input_tokens_seen": 126328208, + "router_z_loss_mlp": 0.47094727, + "step": 1531, + "time_per_iteration": 2.6981046199798584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01057368, + "balance_loss_mlp": 1.01013768, + "epoch": 0.2947287418237784, + "flos": 481618121472.0, + "grad_norm": 0.03382548660060083, + "language_loss": 0.84345412, + "learning_rate": 0.0008272162708478674, + "loss": 0.85402787, + "num_input_tokens_seen": 126396464, + "router_z_loss_mlp": 0.47192383, + "step": 1532, + "time_per_iteration": 2.5975306034088135 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01058676, + "balance_loss_mlp": 1.01151645, + "epoch": 0.2949211235090419, + "flos": 559261274880.0, + "grad_norm": 0.03154442800865326, + "language_loss": 0.87544608, + "learning_rate": 0.000826980643851029, + "loss": 0.88603282, + "num_input_tokens_seen": 126468960, + "router_z_loss_mlp": 0.47119141, + "step": 1533, + "time_per_iteration": 2.6889007091522217 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063865, + "balance_loss_mlp": 1.01675379, + "epoch": 0.2951135051943055, + "flos": 484857060096.0, + "grad_norm": 0.03876668067992812, + "language_loss": 0.85914761, + "learning_rate": 0.0008267448899090464, + "loss": 0.86978626, + "num_input_tokens_seen": 126536496, + "router_z_loss_mlp": 0.47070312, + "step": 1534, + "time_per_iteration": 2.5630924701690674 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062291, + "balance_loss_mlp": 1.01498842, + "epoch": 0.29530588687956905, + "flos": 551422497792.0, + "grad_norm": 0.034923849251574525, + "language_loss": 0.81812191, + "learning_rate": 0.0008265090091134473, + "loss": 0.82874477, + "num_input_tokens_seen": 126614048, + "router_z_loss_mlp": 0.47265625, + "step": 1535, + "time_per_iteration": 2.8399465084075928 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105968, + "balance_loss_mlp": 1.01235437, + "epoch": 0.29549826856483263, + "flos": 674310670080.0, + "grad_norm": 0.028029616611284485, + "language_loss": 0.80873084, + "learning_rate": 0.0008262730015558088, + "loss": 0.81932771, + "num_input_tokens_seen": 126697248, + "router_z_loss_mlp": 0.47290039, + "step": 1536, + "time_per_iteration": 2.874537944793701 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059174, + "balance_loss_mlp": 1.01151371, + "epoch": 0.29569065025009617, + "flos": 766136329728.0, + "grad_norm": 0.03177117147053012, + "language_loss": 0.82803708, + "learning_rate": 0.0008260368673277574, + "loss": 0.83862883, + "num_input_tokens_seen": 126782496, + "router_z_loss_mlp": 0.47631836, + "step": 1537, + "time_per_iteration": 3.0976641178131104 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053728, + "balance_loss_mlp": 1.00573432, + "epoch": 0.29588303193535975, + "flos": 544831859712.0, + "grad_norm": 0.031452220479770684, + "language_loss": 0.84814745, + "learning_rate": 0.0008258006065209682, + "loss": 0.85868478, + "num_input_tokens_seen": 126857328, + "router_z_loss_mlp": 0.47973633, + "step": 1538, + "time_per_iteration": 2.7704694271087646 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115804, + "balance_loss_mlp": 1.06735778, + "epoch": 0.29607541362062334, + "flos": 598146034944.0, + "grad_norm": 0.04896094729194987, + "language_loss": 0.81966412, + "learning_rate": 0.0008255642192271657, + "loss": 0.83082211, + "num_input_tokens_seen": 126932608, + "router_z_loss_mlp": 0.484375, + "step": 1539, + "time_per_iteration": 2.774122714996338 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059901, + "balance_loss_mlp": 1.01219356, + "epoch": 0.29626779530588687, + "flos": 611038606080.0, + "grad_norm": 0.02837345788652225, + "language_loss": 0.84628069, + "learning_rate": 0.0008253277055381241, + "loss": 0.85687971, + "num_input_tokens_seen": 127008928, + "router_z_loss_mlp": 0.47680664, + "step": 1540, + "time_per_iteration": 2.837587833404541 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061212, + "balance_loss_mlp": 1.01340961, + "epoch": 0.29646017699115046, + "flos": 868959025152.0, + "grad_norm": 0.03662488769273821, + "language_loss": 0.86757702, + "learning_rate": 0.0008250910655456658, + "loss": 0.87818909, + "num_input_tokens_seen": 127097104, + "router_z_loss_mlp": 0.4777832, + "step": 1541, + "time_per_iteration": 3.123687982559204 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010574, + "balance_loss_mlp": 1.00954938, + "epoch": 0.296652558676414, + "flos": 496881570816.0, + "grad_norm": 0.03318095479066229, + "language_loss": 0.84889704, + "learning_rate": 0.0008248542993416625, + "loss": 0.85947102, + "num_input_tokens_seen": 127165264, + "router_z_loss_mlp": 0.47827148, + "step": 1542, + "time_per_iteration": 2.637747049331665 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068583, + "balance_loss_mlp": 1.02082753, + "epoch": 0.2968449403616776, + "flos": 572627243520.0, + "grad_norm": 0.03443634648546435, + "language_loss": 0.84426934, + "learning_rate": 0.0008246174070180352, + "loss": 0.8549552, + "num_input_tokens_seen": 127238992, + "router_z_loss_mlp": 0.47729492, + "step": 1543, + "time_per_iteration": 2.6872684955596924 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062899, + "balance_loss_mlp": 1.01511967, + "epoch": 0.2970373220469411, + "flos": 795651304704.0, + "grad_norm": 0.035080805136432934, + "language_loss": 0.85198414, + "learning_rate": 0.0008243803886667537, + "loss": 0.86261314, + "num_input_tokens_seen": 127328160, + "router_z_loss_mlp": 0.47753906, + "step": 1544, + "time_per_iteration": 3.13710618019104 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069403, + "balance_loss_mlp": 1.02145684, + "epoch": 0.2972297037322047, + "flos": 662249220864.0, + "grad_norm": 0.04094703338464919, + "language_loss": 0.80137819, + "learning_rate": 0.0008241432443798364, + "loss": 0.81207222, + "num_input_tokens_seen": 127407328, + "router_z_loss_mlp": 0.47924805, + "step": 1545, + "time_per_iteration": 2.841092109680176 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061565, + "balance_loss_mlp": 1.0138818, + "epoch": 0.29742208541746823, + "flos": 598232550912.0, + "grad_norm": 0.028624248431763765, + "language_loss": 0.86072361, + "learning_rate": 0.0008239059742493512, + "loss": 0.87133932, + "num_input_tokens_seen": 127477136, + "router_z_loss_mlp": 0.4765625, + "step": 1546, + "time_per_iteration": 2.7034194469451904 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01349258, + "balance_loss_mlp": 1.29957151, + "epoch": 0.2976144671027318, + "flos": 771339823104.0, + "grad_norm": 0.07377893489124947, + "language_loss": 0.88059306, + "learning_rate": 0.0008236685783674142, + "loss": 0.89408565, + "num_input_tokens_seen": 127565680, + "router_z_loss_mlp": 0.49584961, + "step": 1547, + "time_per_iteration": 3.063077688217163 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071266, + "balance_loss_mlp": 1.02510834, + "epoch": 0.2978068487879954, + "flos": 1487914164480.0, + "grad_norm": 0.01225569795264997, + "language_loss": 0.76221192, + "learning_rate": 0.0008234310568261911, + "loss": 0.7729246, + "num_input_tokens_seen": 127791584, + "router_z_loss_mlp": 0.4609375, + "step": 1548, + "time_per_iteration": 4.894561767578125 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073136, + "balance_loss_mlp": 1.02564275, + "epoch": 0.29799923047325894, + "flos": 476330057472.0, + "grad_norm": 0.041178192237982324, + "language_loss": 0.84313369, + "learning_rate": 0.0008231934097178955, + "loss": 0.85386503, + "num_input_tokens_seen": 127860112, + "router_z_loss_mlp": 0.47460938, + "step": 1549, + "time_per_iteration": 2.630146026611328 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081209, + "balance_loss_mlp": 1.03362012, + "epoch": 0.2981916121585225, + "flos": 761169051648.0, + "grad_norm": 0.037198017460407115, + "language_loss": 0.86745787, + "learning_rate": 0.0008229556371347903, + "loss": 0.87826997, + "num_input_tokens_seen": 127938752, + "router_z_loss_mlp": 0.47558594, + "step": 1550, + "time_per_iteration": 2.9614980220794678 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081757, + "balance_loss_mlp": 1.03416848, + "epoch": 0.29838399384378606, + "flos": 876517845504.0, + "grad_norm": 0.043512769843104544, + "language_loss": 0.80808616, + "learning_rate": 0.0008227177391691874, + "loss": 0.81890368, + "num_input_tokens_seen": 128022192, + "router_z_loss_mlp": 0.47558594, + "step": 1551, + "time_per_iteration": 3.11059832572937 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081803, + "balance_loss_mlp": 1.03445339, + "epoch": 0.29857637552904964, + "flos": 580752780288.0, + "grad_norm": 0.039547132323558824, + "language_loss": 0.90871334, + "learning_rate": 0.0008224797159134463, + "loss": 0.91953135, + "num_input_tokens_seen": 128097776, + "router_z_loss_mlp": 0.47314453, + "step": 1552, + "time_per_iteration": 2.7177717685699463 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077357, + "balance_loss_mlp": 1.03026903, + "epoch": 0.2987687572143132, + "flos": 837809029632.0, + "grad_norm": 0.03288289742732326, + "language_loss": 0.84735203, + "learning_rate": 0.0008222415674599765, + "loss": 0.85812569, + "num_input_tokens_seen": 128179888, + "router_z_loss_mlp": 0.47045898, + "step": 1553, + "time_per_iteration": 3.090768814086914 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072084, + "balance_loss_mlp": 1.02513897, + "epoch": 0.29896113889957676, + "flos": 568168356096.0, + "grad_norm": 0.03857517262144223, + "language_loss": 0.8489393, + "learning_rate": 0.0008220032939012349, + "loss": 0.85966009, + "num_input_tokens_seen": 128251152, + "router_z_loss_mlp": 0.46899414, + "step": 1554, + "time_per_iteration": 2.7050375938415527 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072322, + "balance_loss_mlp": 1.02554476, + "epoch": 0.29915352058484035, + "flos": 499836662016.0, + "grad_norm": 0.03341170745827686, + "language_loss": 0.89154899, + "learning_rate": 0.0008217648953297277, + "loss": 0.90227222, + "num_input_tokens_seen": 128327600, + "router_z_loss_mlp": 0.46728516, + "step": 1555, + "time_per_iteration": 2.8296022415161133 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106052, + "balance_loss_mlp": 1.01376653, + "epoch": 0.2993459022701039, + "flos": 593215695360.0, + "grad_norm": 0.042418434687241845, + "language_loss": 0.79395097, + "learning_rate": 0.0008215263718380095, + "loss": 0.80455619, + "num_input_tokens_seen": 128398432, + "router_z_loss_mlp": 0.46704102, + "step": 1556, + "time_per_iteration": 2.683760643005371 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02541041, + "balance_loss_mlp": 2.4871583, + "epoch": 0.29953828395536747, + "flos": 573473916672.0, + "grad_norm": 0.19828678552993478, + "language_loss": 0.85491472, + "learning_rate": 0.0008212877235186833, + "loss": 0.88032514, + "num_input_tokens_seen": 128469696, + "router_z_loss_mlp": 0.54003906, + "step": 1557, + "time_per_iteration": 2.6963422298431396 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086143, + "balance_loss_mlp": 1.0413208, + "epoch": 0.299730665640631, + "flos": 1508086566144.0, + "grad_norm": 0.015049722833054002, + "language_loss": 0.77737558, + "learning_rate": 0.0008210489504644005, + "loss": 0.78823709, + "num_input_tokens_seen": 128698560, + "router_z_loss_mlp": 0.44824219, + "step": 1558, + "time_per_iteration": 4.971554279327393 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098273, + "balance_loss_mlp": 1.05063736, + "epoch": 0.2999230473258946, + "flos": 514808494080.0, + "grad_norm": 0.04814176942398931, + "language_loss": 0.82249933, + "learning_rate": 0.0008208100527678611, + "loss": 0.83348203, + "num_input_tokens_seen": 128765952, + "router_z_loss_mlp": 0.47607422, + "step": 1559, + "time_per_iteration": 2.6210360527038574 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01130027, + "balance_loss_mlp": 1.08127058, + "epoch": 0.3001154290111581, + "flos": 835855168512.0, + "grad_norm": 0.05333171316141313, + "language_loss": 0.80031002, + "learning_rate": 0.0008205710305218135, + "loss": 0.81161028, + "num_input_tokens_seen": 128840048, + "router_z_loss_mlp": 0.48730469, + "step": 1560, + "time_per_iteration": 3.0021140575408936 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01168816, + "balance_loss_mlp": 1.11898673, + "epoch": 0.3003078106964217, + "flos": 557946061824.0, + "grad_norm": 0.05314988858528354, + "language_loss": 0.91578549, + "learning_rate": 0.0008203318838190541, + "loss": 0.92747366, + "num_input_tokens_seen": 128912496, + "router_z_loss_mlp": 0.49707031, + "step": 1561, + "time_per_iteration": 2.7369065284729004 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153064, + "balance_loss_mlp": 1.10247147, + "epoch": 0.30050019238168524, + "flos": 527169341952.0, + "grad_norm": 0.047834322975263, + "language_loss": 0.86778915, + "learning_rate": 0.0008200926127524281, + "loss": 0.87931979, + "num_input_tokens_seen": 128980624, + "router_z_loss_mlp": 0.50634766, + "step": 1562, + "time_per_iteration": 2.6357791423797607 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01157108, + "balance_loss_mlp": 1.10565686, + "epoch": 0.3006925740669488, + "flos": 578937924864.0, + "grad_norm": 0.04357261617021945, + "language_loss": 0.84502149, + "learning_rate": 0.0008198532174148289, + "loss": 0.85659254, + "num_input_tokens_seen": 129050576, + "router_z_loss_mlp": 0.51513672, + "step": 1563, + "time_per_iteration": 2.7241976261138916 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097195, + "balance_loss_mlp": 1.04941559, + "epoch": 0.3008849557522124, + "flos": 1493613409536.0, + "grad_norm": 0.019627167679756308, + "language_loss": 0.8068617, + "learning_rate": 0.0008196136978991977, + "loss": 0.8178336, + "num_input_tokens_seen": 129278880, + "router_z_loss_mlp": 0.47753906, + "step": 1564, + "time_per_iteration": 4.851420879364014 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122708, + "balance_loss_mlp": 1.07035148, + "epoch": 0.30107733743747594, + "flos": 510824949504.0, + "grad_norm": 0.045341503179798265, + "language_loss": 0.90611446, + "learning_rate": 0.0008193740542985244, + "loss": 0.91734147, + "num_input_tokens_seen": 129346560, + "router_z_loss_mlp": 0.52441406, + "step": 1565, + "time_per_iteration": 2.62724232673645 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113673, + "balance_loss_mlp": 1.06098223, + "epoch": 0.30126971912273953, + "flos": 588821936640.0, + "grad_norm": 0.04014967632238747, + "language_loss": 0.87587321, + "learning_rate": 0.0008191342867058467, + "loss": 0.88700998, + "num_input_tokens_seen": 129420448, + "router_z_loss_mlp": 0.52783203, + "step": 1566, + "time_per_iteration": 2.766045570373535 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01133038, + "balance_loss_mlp": 1.07991791, + "epoch": 0.30146210080800306, + "flos": 603221216256.0, + "grad_norm": 0.039455426947262194, + "language_loss": 0.84397018, + "learning_rate": 0.0008188943952142509, + "loss": 0.85530061, + "num_input_tokens_seen": 129494032, + "router_z_loss_mlp": 0.53222656, + "step": 1567, + "time_per_iteration": 2.798323154449463 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113428, + "balance_loss_mlp": 1.06030834, + "epoch": 0.30165448249326665, + "flos": 919287973632.0, + "grad_norm": 0.03836627098538091, + "language_loss": 0.83653766, + "learning_rate": 0.0008186543799168711, + "loss": 0.84767193, + "num_input_tokens_seen": 129569088, + "router_z_loss_mlp": 0.53222656, + "step": 1568, + "time_per_iteration": 3.1216585636138916 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112139, + "balance_loss_mlp": 1.0594008, + "epoch": 0.3018468641785302, + "flos": 778631325696.0, + "grad_norm": 0.037681015369085746, + "language_loss": 0.89441907, + "learning_rate": 0.0008184142409068892, + "loss": 0.90554047, + "num_input_tokens_seen": 129647968, + "router_z_loss_mlp": 0.52832031, + "step": 1569, + "time_per_iteration": 2.9987363815307617 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087962, + "balance_loss_mlp": 1.03584409, + "epoch": 0.30203924586379377, + "flos": 523389931776.0, + "grad_norm": 0.031063886155947292, + "language_loss": 0.87584674, + "learning_rate": 0.000818173978277536, + "loss": 0.88672638, + "num_input_tokens_seen": 129718928, + "router_z_loss_mlp": 0.52197266, + "step": 1570, + "time_per_iteration": 2.657801389694214 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092945, + "balance_loss_mlp": 1.04125619, + "epoch": 0.3022316275490573, + "flos": 525649994496.0, + "grad_norm": 0.03542742618693904, + "language_loss": 0.8460654, + "learning_rate": 0.000817933592122089, + "loss": 0.85699487, + "num_input_tokens_seen": 129790128, + "router_z_loss_mlp": 0.51757812, + "step": 1571, + "time_per_iteration": 2.699676752090454 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094536, + "balance_loss_mlp": 1.04289424, + "epoch": 0.3024240092343209, + "flos": 480873515520.0, + "grad_norm": 0.03710559119511486, + "language_loss": 0.84148443, + "learning_rate": 0.0008176930825338749, + "loss": 0.85242975, + "num_input_tokens_seen": 129857536, + "router_z_loss_mlp": 0.51708984, + "step": 1572, + "time_per_iteration": 2.560293197631836 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085585, + "balance_loss_mlp": 1.03446782, + "epoch": 0.3026163909195845, + "flos": 688431938304.0, + "grad_norm": 0.03769478699711506, + "language_loss": 0.89810324, + "learning_rate": 0.0008174524496062679, + "loss": 0.90895915, + "num_input_tokens_seen": 129931440, + "router_z_loss_mlp": 0.51171875, + "step": 1573, + "time_per_iteration": 2.9185256958007812 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083791, + "balance_loss_mlp": 1.03334129, + "epoch": 0.302808772604848, + "flos": 544087253760.0, + "grad_norm": 0.033203995249134796, + "language_loss": 0.86450267, + "learning_rate": 0.0008172116934326894, + "loss": 0.87534058, + "num_input_tokens_seen": 130005200, + "router_z_loss_mlp": 0.50488281, + "step": 1574, + "time_per_iteration": 2.77254056930542 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107482, + "balance_loss_mlp": 1.02456117, + "epoch": 0.3030011542901116, + "flos": 476052046080.0, + "grad_norm": 0.03232260410081742, + "language_loss": 0.88820696, + "learning_rate": 0.0008169708141066097, + "loss": 0.89895517, + "num_input_tokens_seen": 130069136, + "router_z_loss_mlp": 0.50268555, + "step": 1575, + "time_per_iteration": 2.5428524017333984 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083713, + "balance_loss_mlp": 1.03402615, + "epoch": 0.30319353597537513, + "flos": 482473542912.0, + "grad_norm": 0.035261838486320786, + "language_loss": 0.91478366, + "learning_rate": 0.0008167298117215465, + "loss": 0.92562079, + "num_input_tokens_seen": 130135456, + "router_z_loss_mlp": 0.49536133, + "step": 1576, + "time_per_iteration": 2.5388023853302 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064287, + "balance_loss_mlp": 1.0151732, + "epoch": 0.3033859176606387, + "flos": 706113897984.0, + "grad_norm": 0.033895137386355495, + "language_loss": 0.89157575, + "learning_rate": 0.0008164886863710649, + "loss": 0.90221858, + "num_input_tokens_seen": 130213712, + "router_z_loss_mlp": 0.49047852, + "step": 1577, + "time_per_iteration": 2.9326250553131104 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072249, + "balance_loss_mlp": 1.02363503, + "epoch": 0.30357829934590225, + "flos": 766110084864.0, + "grad_norm": 0.03320904121402137, + "language_loss": 0.87079322, + "learning_rate": 0.0008162474381487783, + "loss": 0.88151574, + "num_input_tokens_seen": 130290928, + "router_z_loss_mlp": 0.48608398, + "step": 1578, + "time_per_iteration": 3.0217320919036865 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069823, + "balance_loss_mlp": 1.02135277, + "epoch": 0.30377068103116583, + "flos": 533449887744.0, + "grad_norm": 0.035817825196195696, + "language_loss": 0.854909, + "learning_rate": 0.0008160060671483475, + "loss": 0.86560726, + "num_input_tokens_seen": 130362672, + "router_z_loss_mlp": 0.48461914, + "step": 1579, + "time_per_iteration": 2.6730797290802 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074874, + "balance_loss_mlp": 1.02647483, + "epoch": 0.3039630627164294, + "flos": 511224470016.0, + "grad_norm": 0.04566645575365512, + "language_loss": 0.84833682, + "learning_rate": 0.0008157645734634809, + "loss": 0.85908556, + "num_input_tokens_seen": 130428848, + "router_z_loss_mlp": 0.48388672, + "step": 1580, + "time_per_iteration": 2.5822741985321045 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01186287, + "balance_loss_mlp": 1.14089203, + "epoch": 0.30415544440169295, + "flos": 1509190841856.0, + "grad_norm": 0.045615209750242004, + "language_loss": 0.76896489, + "learning_rate": 0.000815522957187935, + "loss": 0.78082776, + "num_input_tokens_seen": 130665440, + "router_z_loss_mlp": 0.453125, + "step": 1581, + "time_per_iteration": 4.900806665420532 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01157879, + "balance_loss_mlp": 1.11257935, + "epoch": 0.30434782608695654, + "flos": 1461789772800.0, + "grad_norm": 0.04177274485031814, + "language_loss": 0.73214495, + "learning_rate": 0.0008152812184155132, + "loss": 0.74372375, + "num_input_tokens_seen": 130895248, + "router_z_loss_mlp": 0.45214844, + "step": 1582, + "time_per_iteration": 4.890560150146484 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071245, + "balance_loss_mlp": 1.02329922, + "epoch": 0.3045402077722201, + "flos": 483535044096.0, + "grad_norm": 0.03665669352532136, + "language_loss": 0.84926951, + "learning_rate": 0.000815039357240067, + "loss": 0.85998201, + "num_input_tokens_seen": 130964544, + "router_z_loss_mlp": 0.47924805, + "step": 1583, + "time_per_iteration": 2.655641555786133 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075238, + "balance_loss_mlp": 1.02695799, + "epoch": 0.30473258945748366, + "flos": 544627725312.0, + "grad_norm": 0.03699880598765725, + "language_loss": 0.86035675, + "learning_rate": 0.0008147973737554952, + "loss": 0.87110913, + "num_input_tokens_seen": 131041744, + "router_z_loss_mlp": 0.48266602, + "step": 1584, + "time_per_iteration": 2.8118185997009277 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066041, + "balance_loss_mlp": 1.01754665, + "epoch": 0.3049249711427472, + "flos": 568122669312.0, + "grad_norm": 0.039919187148179, + "language_loss": 0.86646891, + "learning_rate": 0.000814555268055744, + "loss": 0.87712932, + "num_input_tokens_seen": 131108864, + "router_z_loss_mlp": 0.48486328, + "step": 1585, + "time_per_iteration": 2.618649482727051 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067734, + "balance_loss_mlp": 1.01926374, + "epoch": 0.3051173528280108, + "flos": 529290398976.0, + "grad_norm": 0.034961032963054674, + "language_loss": 0.88066852, + "learning_rate": 0.0008143130402348073, + "loss": 0.89134592, + "num_input_tokens_seen": 131181104, + "router_z_loss_mlp": 0.48461914, + "step": 1586, + "time_per_iteration": 2.6645073890686035 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064545, + "balance_loss_mlp": 1.01593137, + "epoch": 0.3053097345132743, + "flos": 587600042496.0, + "grad_norm": 0.03198607314396223, + "language_loss": 0.79707628, + "learning_rate": 0.0008140706903867265, + "loss": 0.80772173, + "num_input_tokens_seen": 131258704, + "router_z_loss_mlp": 0.48608398, + "step": 1587, + "time_per_iteration": 2.772688150405884 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065913, + "balance_loss_mlp": 1.01706147, + "epoch": 0.3055021161985379, + "flos": 608201133312.0, + "grad_norm": 0.03820330265300666, + "language_loss": 0.90882033, + "learning_rate": 0.0008138282186055897, + "loss": 0.91947937, + "num_input_tokens_seen": 131325712, + "router_z_loss_mlp": 0.48803711, + "step": 1588, + "time_per_iteration": 2.6824429035186768 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106751, + "balance_loss_mlp": 1.01851535, + "epoch": 0.3056944978838015, + "flos": 574963128576.0, + "grad_norm": 0.03364087196891663, + "language_loss": 0.83419842, + "learning_rate": 0.0008135856249855331, + "loss": 0.84487349, + "num_input_tokens_seen": 131397568, + "router_z_loss_mlp": 0.48950195, + "step": 1589, + "time_per_iteration": 2.6829729080200195 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065757, + "balance_loss_mlp": 1.0164994, + "epoch": 0.305886879569065, + "flos": 635072076288.0, + "grad_norm": 0.036524553871552005, + "language_loss": 0.90591866, + "learning_rate": 0.0008133429096207398, + "loss": 0.91657621, + "num_input_tokens_seen": 131467632, + "router_z_loss_mlp": 0.4909668, + "step": 1590, + "time_per_iteration": 2.7734742164611816 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135399, + "balance_loss_mlp": 1.08351898, + "epoch": 0.3060792612543286, + "flos": 1372133769216.0, + "grad_norm": 0.023040785082221134, + "language_loss": 0.75312257, + "learning_rate": 0.0008131000726054403, + "loss": 0.76447666, + "num_input_tokens_seen": 131702224, + "router_z_loss_mlp": 0.51953125, + "step": 1591, + "time_per_iteration": 4.964044094085693 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106727, + "balance_loss_mlp": 1.01806068, + "epoch": 0.30627164293959214, + "flos": 519619269888.0, + "grad_norm": 0.029618090290997726, + "language_loss": 0.87174189, + "learning_rate": 0.0008128571140339123, + "loss": 0.88241458, + "num_input_tokens_seen": 131774608, + "router_z_loss_mlp": 0.49121094, + "step": 1592, + "time_per_iteration": 2.6813180446624756 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068394, + "balance_loss_mlp": 1.01942289, + "epoch": 0.3064640246248557, + "flos": 456533843712.0, + "grad_norm": 0.02963099688993501, + "language_loss": 0.87551641, + "learning_rate": 0.0008126140340004805, + "loss": 0.88620031, + "num_input_tokens_seen": 131841216, + "router_z_loss_mlp": 0.48876953, + "step": 1593, + "time_per_iteration": 2.5293447971343994 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064923, + "balance_loss_mlp": 1.01580834, + "epoch": 0.30665640631011926, + "flos": 851609511936.0, + "grad_norm": 0.028917997945976257, + "language_loss": 0.82855684, + "learning_rate": 0.0008123708325995172, + "loss": 0.8392061, + "num_input_tokens_seen": 131937584, + "router_z_loss_mlp": 0.49023438, + "step": 1594, + "time_per_iteration": 3.1976583003997803 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068322, + "balance_loss_mlp": 1.01937473, + "epoch": 0.30684878799538284, + "flos": 759616656384.0, + "grad_norm": 0.02786640270256765, + "language_loss": 0.80270225, + "learning_rate": 0.0008121275099254414, + "loss": 0.81338549, + "num_input_tokens_seen": 132012656, + "router_z_loss_mlp": 0.48901367, + "step": 1595, + "time_per_iteration": 2.9073448181152344 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105895, + "balance_loss_mlp": 1.01069379, + "epoch": 0.3070411696806464, + "flos": 518596652544.0, + "grad_norm": 0.02828411740511225, + "language_loss": 0.89261508, + "learning_rate": 0.0008118840660727194, + "loss": 0.90320462, + "num_input_tokens_seen": 132083728, + "router_z_loss_mlp": 0.48242188, + "step": 1596, + "time_per_iteration": 2.6137096881866455 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105679, + "balance_loss_mlp": 1.00855815, + "epoch": 0.30723355136590996, + "flos": 845791670016.0, + "grad_norm": 0.02807637717187332, + "language_loss": 0.8853125, + "learning_rate": 0.0008116405011358644, + "loss": 0.89588046, + "num_input_tokens_seen": 132170896, + "router_z_loss_mlp": 0.48217773, + "step": 1597, + "time_per_iteration": 3.1528680324554443 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059967, + "balance_loss_mlp": 1.01163971, + "epoch": 0.30742593305117355, + "flos": 467079836160.0, + "grad_norm": 0.032917462624290315, + "language_loss": 0.80716425, + "learning_rate": 0.0008113968152094369, + "loss": 0.81776392, + "num_input_tokens_seen": 132234592, + "router_z_loss_mlp": 0.4831543, + "step": 1598, + "time_per_iteration": 2.5390987396240234 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059252, + "balance_loss_mlp": 1.011235, + "epoch": 0.3076183147364371, + "flos": 687817589760.0, + "grad_norm": 0.03298344899906339, + "language_loss": 0.830042, + "learning_rate": 0.0008111530083880438, + "loss": 0.84063458, + "num_input_tokens_seen": 132314720, + "router_z_loss_mlp": 0.47998047, + "step": 1599, + "time_per_iteration": 2.904327154159546 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059695, + "balance_loss_mlp": 1.01170099, + "epoch": 0.30781069642170067, + "flos": 615180598272.0, + "grad_norm": 0.03364515132561045, + "language_loss": 0.86925042, + "learning_rate": 0.0008109090807663399, + "loss": 0.87984729, + "num_input_tokens_seen": 132388768, + "router_z_loss_mlp": 0.47973633, + "step": 1600, + "time_per_iteration": 2.794553756713867 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059845, + "balance_loss_mlp": 1.01206601, + "epoch": 0.3080030781069642, + "flos": 591509710080.0, + "grad_norm": 0.029450986393402313, + "language_loss": 0.89288217, + "learning_rate": 0.0008106650324390257, + "loss": 0.90348059, + "num_input_tokens_seen": 132472544, + "router_z_loss_mlp": 0.47753906, + "step": 1601, + "time_per_iteration": 2.825118064880371 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055341, + "balance_loss_mlp": 1.00744271, + "epoch": 0.3081954597922278, + "flos": 563691972096.0, + "grad_norm": 0.03217567830931305, + "language_loss": 0.82333392, + "learning_rate": 0.0008104208635008493, + "loss": 0.83388734, + "num_input_tokens_seen": 132541968, + "router_z_loss_mlp": 0.47875977, + "step": 1602, + "time_per_iteration": 2.7727856636047363 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01057631, + "balance_loss_mlp": 1.0099231, + "epoch": 0.3083878414774913, + "flos": 448762140672.0, + "grad_norm": 0.03928010080840531, + "language_loss": 0.82422024, + "learning_rate": 0.0008101765740466058, + "loss": 0.83479655, + "num_input_tokens_seen": 132606976, + "router_z_loss_mlp": 0.47680664, + "step": 1603, + "time_per_iteration": 2.5764591693878174 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106031, + "balance_loss_mlp": 1.01272202, + "epoch": 0.3085802231627549, + "flos": 494545685760.0, + "grad_norm": 0.03880240670965016, + "language_loss": 0.84925759, + "learning_rate": 0.0008099321641711364, + "loss": 0.85986066, + "num_input_tokens_seen": 132677984, + "router_z_loss_mlp": 0.47558594, + "step": 1604, + "time_per_iteration": 2.6562154293060303 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059833, + "balance_loss_mlp": 1.01262641, + "epoch": 0.3087726048480185, + "flos": 488690905344.0, + "grad_norm": 0.030963234073246262, + "language_loss": 0.84138477, + "learning_rate": 0.0008096876339693295, + "loss": 0.85198307, + "num_input_tokens_seen": 132749136, + "router_z_loss_mlp": 0.47167969, + "step": 1605, + "time_per_iteration": 2.6818747520446777 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01057926, + "balance_loss_mlp": 1.01083875, + "epoch": 0.308964986533282, + "flos": 731888346624.0, + "grad_norm": 0.03606871420254603, + "language_loss": 0.82584137, + "learning_rate": 0.0008094429835361206, + "loss": 0.83642066, + "num_input_tokens_seen": 132823824, + "router_z_loss_mlp": 0.47045898, + "step": 1606, + "time_per_iteration": 2.940202236175537 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059317, + "balance_loss_mlp": 1.01211011, + "epoch": 0.3091573682185456, + "flos": 606516535296.0, + "grad_norm": 0.033324674351776856, + "language_loss": 0.86802429, + "learning_rate": 0.0008091982129664908, + "loss": 0.87861747, + "num_input_tokens_seen": 132895936, + "router_z_loss_mlp": 0.47167969, + "step": 1607, + "time_per_iteration": 2.7152366638183594 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055819, + "balance_loss_mlp": 1.00858819, + "epoch": 0.30934974990380915, + "flos": 461307681024.0, + "grad_norm": 0.0316485976101594, + "language_loss": 0.83554763, + "learning_rate": 0.0008089533223554687, + "loss": 0.84610581, + "num_input_tokens_seen": 132968960, + "router_z_loss_mlp": 0.47192383, + "step": 1608, + "time_per_iteration": 2.73236083984375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054105, + "balance_loss_mlp": 1.00692201, + "epoch": 0.30954213158907273, + "flos": 554568117504.0, + "grad_norm": 0.03240022060424308, + "language_loss": 0.85798776, + "learning_rate": 0.0008087083117981294, + "loss": 0.86852884, + "num_input_tokens_seen": 133048448, + "router_z_loss_mlp": 0.47143555, + "step": 1609, + "time_per_iteration": 2.8992979526519775 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052885, + "balance_loss_mlp": 1.00543988, + "epoch": 0.30973451327433627, + "flos": 554114161920.0, + "grad_norm": 0.03509024741452312, + "language_loss": 0.88937026, + "learning_rate": 0.0008084631813895943, + "loss": 0.89989913, + "num_input_tokens_seen": 133121680, + "router_z_loss_mlp": 0.47412109, + "step": 1610, + "time_per_iteration": 2.8113343715667725 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104915, + "balance_loss_mlp": 1.00168037, + "epoch": 0.30992689495959985, + "flos": 566763714816.0, + "grad_norm": 0.03310460584308608, + "language_loss": 0.8446725, + "learning_rate": 0.0008082179312250315, + "loss": 0.85516399, + "num_input_tokens_seen": 133190176, + "router_z_loss_mlp": 0.47436523, + "step": 1611, + "time_per_iteration": 2.6286494731903076 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146973, + "balance_loss_mlp": 1.09509277, + "epoch": 0.3101192766448634, + "flos": 1445562998784.0, + "grad_norm": 0.022501740699277736, + "language_loss": 0.79855847, + "learning_rate": 0.0008079725613996555, + "loss": 0.8100282, + "num_input_tokens_seen": 133420512, + "router_z_loss_mlp": 0.51953125, + "step": 1612, + "time_per_iteration": 4.877255439758301 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01132568, + "balance_loss_mlp": 1.08087921, + "epoch": 0.31031165833012697, + "flos": 1535130541056.0, + "grad_norm": 0.020576462480935535, + "language_loss": 0.76629329, + "learning_rate": 0.0008077270720087273, + "loss": 0.777619, + "num_input_tokens_seen": 133651984, + "router_z_loss_mlp": 0.51757812, + "step": 1613, + "time_per_iteration": 5.064774751663208 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050941, + "balance_loss_mlp": 1.00363839, + "epoch": 0.31050404001539056, + "flos": 993633862656.0, + "grad_norm": 0.03245007970491877, + "language_loss": 0.83116508, + "learning_rate": 0.0008074814631475545, + "loss": 0.84167451, + "num_input_tokens_seen": 133741648, + "router_z_loss_mlp": 0.47265625, + "step": 1614, + "time_per_iteration": 3.322155714035034 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054745, + "balance_loss_mlp": 1.00741875, + "epoch": 0.3106964217006541, + "flos": 446973530112.0, + "grad_norm": 0.03235075185089818, + "language_loss": 0.80034411, + "learning_rate": 0.0008072357349114907, + "loss": 0.81089151, + "num_input_tokens_seen": 133813344, + "router_z_loss_mlp": 0.47290039, + "step": 1615, + "time_per_iteration": 2.699772596359253 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01056016, + "balance_loss_mlp": 1.00880885, + "epoch": 0.3108888033859177, + "flos": 511495678464.0, + "grad_norm": 0.0340106704308988, + "language_loss": 0.89603639, + "learning_rate": 0.0008069898873959363, + "loss": 0.90659654, + "num_input_tokens_seen": 133884192, + "router_z_loss_mlp": 0.47167969, + "step": 1616, + "time_per_iteration": 2.680640459060669 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051359, + "balance_loss_mlp": 1.0043664, + "epoch": 0.3110811850711812, + "flos": 521779210752.0, + "grad_norm": 0.029395602971080924, + "language_loss": 0.86344647, + "learning_rate": 0.0008067439206963375, + "loss": 0.87396008, + "num_input_tokens_seen": 133954848, + "router_z_loss_mlp": 0.46948242, + "step": 1617, + "time_per_iteration": 2.6484971046447754 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055386, + "balance_loss_mlp": 1.00844121, + "epoch": 0.3112735667564448, + "flos": 687731073792.0, + "grad_norm": 0.03406090033110643, + "language_loss": 0.87673247, + "learning_rate": 0.0008064978349081873, + "loss": 0.88728631, + "num_input_tokens_seen": 134031824, + "router_z_loss_mlp": 0.46899414, + "step": 1618, + "time_per_iteration": 2.92702579498291 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01056578, + "balance_loss_mlp": 1.00965679, + "epoch": 0.31146594844170833, + "flos": 534166303488.0, + "grad_norm": 0.030256910717709223, + "language_loss": 0.87292403, + "learning_rate": 0.0008062516301270245, + "loss": 0.88348979, + "num_input_tokens_seen": 134104480, + "router_z_loss_mlp": 0.46875, + "step": 1619, + "time_per_iteration": 2.7301478385925293 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055492, + "balance_loss_mlp": 1.00859511, + "epoch": 0.3116583301269719, + "flos": 680842982400.0, + "grad_norm": 0.027867683897015817, + "language_loss": 0.88937479, + "learning_rate": 0.0008060053064484343, + "loss": 0.89992964, + "num_input_tokens_seen": 134185632, + "router_z_loss_mlp": 0.46850586, + "step": 1620, + "time_per_iteration": 2.947906017303467 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048804, + "balance_loss_mlp": 1.00202632, + "epoch": 0.31185071181223545, + "flos": 587330779392.0, + "grad_norm": 0.03167203134142694, + "language_loss": 0.86095911, + "learning_rate": 0.0008057588639680482, + "loss": 0.87144709, + "num_input_tokens_seen": 134261600, + "router_z_loss_mlp": 0.46728516, + "step": 1621, + "time_per_iteration": 2.7836551666259766 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104944, + "balance_loss_mlp": 1.00282919, + "epoch": 0.31204309349749904, + "flos": 726658608384.0, + "grad_norm": 0.037979301866738396, + "language_loss": 0.83855367, + "learning_rate": 0.0008055123027815434, + "loss": 0.84904802, + "num_input_tokens_seen": 134334368, + "router_z_loss_mlp": 0.46557617, + "step": 1622, + "time_per_iteration": 2.9263358116149902 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051144, + "balance_loss_mlp": 1.00455689, + "epoch": 0.3122354751827626, + "flos": 577895865600.0, + "grad_norm": 0.032507776226150094, + "language_loss": 0.85607505, + "learning_rate": 0.0008052656229846436, + "loss": 0.86658645, + "num_input_tokens_seen": 134403824, + "router_z_loss_mlp": 0.46533203, + "step": 1623, + "time_per_iteration": 2.662386894226074 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051831, + "balance_loss_mlp": 1.00514877, + "epoch": 0.31242785686802615, + "flos": 577029750528.0, + "grad_norm": 0.03513403942618559, + "language_loss": 0.91195071, + "learning_rate": 0.0008050188246731182, + "loss": 0.92246902, + "num_input_tokens_seen": 134471296, + "router_z_loss_mlp": 0.46630859, + "step": 1624, + "time_per_iteration": 2.710176467895508 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052884, + "balance_loss_mlp": 1.00624907, + "epoch": 0.31262023855328974, + "flos": 738197082624.0, + "grad_norm": 0.0324646036152644, + "language_loss": 0.82931978, + "learning_rate": 0.0008047719079427834, + "loss": 0.83984858, + "num_input_tokens_seen": 134551360, + "router_z_loss_mlp": 0.46582031, + "step": 1625, + "time_per_iteration": 2.970287561416626 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082729, + "balance_loss_mlp": 1.03533173, + "epoch": 0.3128126202385533, + "flos": 1562594445312.0, + "grad_norm": 0.01743050972952843, + "language_loss": 0.74351704, + "learning_rate": 0.0008045248728895, + "loss": 0.75434434, + "num_input_tokens_seen": 134761328, + "router_z_loss_mlp": 0.47363281, + "step": 1626, + "time_per_iteration": 4.816533088684082 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053999, + "balance_loss_mlp": 1.0071733, + "epoch": 0.31300500192381686, + "flos": 515943872256.0, + "grad_norm": 0.030770809254638827, + "language_loss": 0.86711371, + "learning_rate": 0.0008042777196091757, + "loss": 0.87765372, + "num_input_tokens_seen": 134833136, + "router_z_loss_mlp": 0.46777344, + "step": 1627, + "time_per_iteration": 2.7191882133483887 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01057531, + "balance_loss_mlp": 1.01072919, + "epoch": 0.3131973836090804, + "flos": 527662181376.0, + "grad_norm": 0.031150181208545357, + "language_loss": 0.82488692, + "learning_rate": 0.0008040304481977643, + "loss": 0.83546221, + "num_input_tokens_seen": 134904352, + "router_z_loss_mlp": 0.4675293, + "step": 1628, + "time_per_iteration": 2.706782579421997 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01057336, + "balance_loss_mlp": 1.01065385, + "epoch": 0.313389765294344, + "flos": 824210736384.0, + "grad_norm": 0.032636383561425994, + "language_loss": 0.87568998, + "learning_rate": 0.0008037830587512649, + "loss": 0.88626337, + "num_input_tokens_seen": 134984880, + "router_z_loss_mlp": 0.46630859, + "step": 1629, + "time_per_iteration": 3.0928542613983154 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054937, + "balance_loss_mlp": 1.00820696, + "epoch": 0.31358214697960757, + "flos": 394703359488.0, + "grad_norm": 0.03241768310332359, + "language_loss": 0.79631239, + "learning_rate": 0.0008035355513657224, + "loss": 0.80686176, + "num_input_tokens_seen": 135047456, + "router_z_loss_mlp": 0.46679688, + "step": 1630, + "time_per_iteration": 2.449666738510132 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054798, + "balance_loss_mlp": 1.00806797, + "epoch": 0.3137745286648711, + "flos": 573098695680.0, + "grad_norm": 0.0293939817515363, + "language_loss": 0.93494189, + "learning_rate": 0.0008032879261372279, + "loss": 0.94548988, + "num_input_tokens_seen": 135124256, + "router_z_loss_mlp": 0.46679688, + "step": 1631, + "time_per_iteration": 2.766951084136963 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068432, + "balance_loss_mlp": 1.02256012, + "epoch": 0.3139669103501347, + "flos": 1501632021504.0, + "grad_norm": 0.011791019456215185, + "language_loss": 0.79635841, + "learning_rate": 0.0008030401831619178, + "loss": 0.80704272, + "num_input_tokens_seen": 135353024, + "router_z_loss_mlp": 0.45800781, + "step": 1632, + "time_per_iteration": 5.585620403289795 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050843, + "balance_loss_mlp": 1.00425589, + "epoch": 0.3141592920353982, + "flos": 526359607296.0, + "grad_norm": 0.030163528949794682, + "language_loss": 0.87607086, + "learning_rate": 0.0008027923225359748, + "loss": 0.88657928, + "num_input_tokens_seen": 135422464, + "router_z_loss_mlp": 0.46533203, + "step": 1633, + "time_per_iteration": 2.607407808303833 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105517, + "balance_loss_mlp": 1.0084641, + "epoch": 0.3143516737206618, + "flos": 594388012032.0, + "grad_norm": 0.030785944321789945, + "language_loss": 0.88644683, + "learning_rate": 0.0008025443443556267, + "loss": 0.89699847, + "num_input_tokens_seen": 135490928, + "router_z_loss_mlp": 0.46655273, + "step": 1634, + "time_per_iteration": 2.704568862915039 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053981, + "balance_loss_mlp": 1.00756085, + "epoch": 0.31454405540592534, + "flos": 649680347904.0, + "grad_norm": 0.028625636333363444, + "language_loss": 0.88813668, + "learning_rate": 0.000802296248717147, + "loss": 0.89867646, + "num_input_tokens_seen": 135576288, + "router_z_loss_mlp": 0.46362305, + "step": 1635, + "time_per_iteration": 2.914228916168213 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051205, + "balance_loss_mlp": 1.00461841, + "epoch": 0.3147364370911889, + "flos": 644070531072.0, + "grad_norm": 0.032412817231273386, + "language_loss": 0.79727387, + "learning_rate": 0.0008020480357168554, + "loss": 0.80778593, + "num_input_tokens_seen": 135652320, + "router_z_loss_mlp": 0.46533203, + "step": 1636, + "time_per_iteration": 2.8196966648101807 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051741, + "balance_loss_mlp": 1.00505865, + "epoch": 0.31492881877645246, + "flos": 472821855744.0, + "grad_norm": 0.028828485286514015, + "language_loss": 0.88662213, + "learning_rate": 0.0008017997054511165, + "loss": 0.89713949, + "num_input_tokens_seen": 135719632, + "router_z_loss_mlp": 0.46630859, + "step": 1637, + "time_per_iteration": 2.6545960903167725 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051567, + "balance_loss_mlp": 1.00486124, + "epoch": 0.31512120046171604, + "flos": 630630685440.0, + "grad_norm": 0.03463883423234526, + "language_loss": 0.86238796, + "learning_rate": 0.0008015512580163407, + "loss": 0.87290359, + "num_input_tokens_seen": 135796544, + "router_z_loss_mlp": 0.46655273, + "step": 1638, + "time_per_iteration": 2.775726795196533 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050883, + "balance_loss_mlp": 1.00429583, + "epoch": 0.31531358214697963, + "flos": 705054342144.0, + "grad_norm": 0.0328972983749375, + "language_loss": 0.81582069, + "learning_rate": 0.0008013026935089838, + "loss": 0.82632947, + "num_input_tokens_seen": 135871344, + "router_z_loss_mlp": 0.46533203, + "step": 1639, + "time_per_iteration": 2.859405040740967 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048793, + "balance_loss_mlp": 1.00182474, + "epoch": 0.31550596383224316, + "flos": 573632364288.0, + "grad_norm": 0.03266078051512415, + "language_loss": 0.84787768, + "learning_rate": 0.0008010540120255472, + "loss": 0.85836554, + "num_input_tokens_seen": 135944320, + "router_z_loss_mlp": 0.46923828, + "step": 1640, + "time_per_iteration": 2.654087781906128 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051523, + "balance_loss_mlp": 1.00457835, + "epoch": 0.31569834551750675, + "flos": 659513815296.0, + "grad_norm": 0.0373471738494659, + "language_loss": 0.87093472, + "learning_rate": 0.0008008052136625774, + "loss": 0.88144994, + "num_input_tokens_seen": 136019456, + "router_z_loss_mlp": 0.46899414, + "step": 1641, + "time_per_iteration": 2.7806570529937744 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054228, + "balance_loss_mlp": 1.00730693, + "epoch": 0.3158907272027703, + "flos": 567404308224.0, + "grad_norm": 0.028103315573088077, + "language_loss": 0.87394774, + "learning_rate": 0.0008005562985166666, + "loss": 0.88449007, + "num_input_tokens_seen": 136091232, + "router_z_loss_mlp": 0.46875, + "step": 1642, + "time_per_iteration": 2.6866798400878906 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053754, + "balance_loss_mlp": 1.00699973, + "epoch": 0.31608310888803387, + "flos": 537973903872.0, + "grad_norm": 0.024374019828786602, + "language_loss": 0.85555339, + "learning_rate": 0.0008003072666844524, + "loss": 0.86609089, + "num_input_tokens_seen": 136165088, + "router_z_loss_mlp": 0.46704102, + "step": 1643, + "time_per_iteration": 2.684518337249756 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055077, + "balance_loss_mlp": 1.00856149, + "epoch": 0.3162754905732974, + "flos": 487640097792.0, + "grad_norm": 0.037314537224785074, + "language_loss": 0.8350842, + "learning_rate": 0.0008000581182626173, + "loss": 0.84563494, + "num_input_tokens_seen": 136230368, + "router_z_loss_mlp": 0.46459961, + "step": 1644, + "time_per_iteration": 2.5574259757995605 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051572, + "balance_loss_mlp": 1.00481844, + "epoch": 0.316467872258561, + "flos": 531096506112.0, + "grad_norm": 0.03327277300757214, + "language_loss": 0.87005818, + "learning_rate": 0.0007998088533478894, + "loss": 0.88057387, + "num_input_tokens_seen": 136302512, + "router_z_loss_mlp": 0.46704102, + "step": 1645, + "time_per_iteration": 2.6987338066101074 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055532, + "balance_loss_mlp": 1.00894499, + "epoch": 0.3166602539438245, + "flos": 444414068736.0, + "grad_norm": 0.040202418156990175, + "language_loss": 0.85042381, + "learning_rate": 0.000799559472037042, + "loss": 0.8609792, + "num_input_tokens_seen": 136368064, + "router_z_loss_mlp": 0.46533203, + "step": 1646, + "time_per_iteration": 2.6219563484191895 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01056055, + "balance_loss_mlp": 1.00958765, + "epoch": 0.3168526356290881, + "flos": 647103389952.0, + "grad_norm": 0.026601574185044653, + "language_loss": 0.8823331, + "learning_rate": 0.0007993099744268932, + "loss": 0.89289367, + "num_input_tokens_seen": 136451520, + "router_z_loss_mlp": 0.46411133, + "step": 1647, + "time_per_iteration": 2.8902037143707275 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054808, + "balance_loss_mlp": 1.00817358, + "epoch": 0.3170450173143517, + "flos": 587258847744.0, + "grad_norm": 0.03281471441230887, + "language_loss": 0.8855083, + "learning_rate": 0.000799060360614307, + "loss": 0.89605635, + "num_input_tokens_seen": 136521184, + "router_z_loss_mlp": 0.46582031, + "step": 1648, + "time_per_iteration": 2.694293975830078 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055827, + "balance_loss_mlp": 1.00945473, + "epoch": 0.3172373989996152, + "flos": 828574359552.0, + "grad_norm": 0.03046931045185914, + "language_loss": 0.84284711, + "learning_rate": 0.0007988106306961917, + "loss": 0.85340536, + "num_input_tokens_seen": 136612592, + "router_z_loss_mlp": 0.46313477, + "step": 1649, + "time_per_iteration": 3.121788501739502 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01058474, + "balance_loss_mlp": 1.01195896, + "epoch": 0.3174297806848788, + "flos": 528434977536.0, + "grad_norm": 0.03563880571664149, + "language_loss": 0.85299373, + "learning_rate": 0.0007985607847695014, + "loss": 0.8635785, + "num_input_tokens_seen": 136684336, + "router_z_loss_mlp": 0.46459961, + "step": 1650, + "time_per_iteration": 2.625356912612915 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047323, + "balance_loss_mlp": 1.00107014, + "epoch": 0.31762216237014235, + "flos": 714482452992.0, + "grad_norm": 0.030498079123472206, + "language_loss": 0.83133662, + "learning_rate": 0.0007983108229312345, + "loss": 0.84180987, + "num_input_tokens_seen": 136766400, + "router_z_loss_mlp": 0.46191406, + "step": 1651, + "time_per_iteration": 2.894109010696411 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049755, + "balance_loss_mlp": 1.00362098, + "epoch": 0.31781454405540593, + "flos": 484800679680.0, + "grad_norm": 0.03387492306443982, + "language_loss": 0.86931884, + "learning_rate": 0.0007980607452784351, + "loss": 0.87981641, + "num_input_tokens_seen": 136834016, + "router_z_loss_mlp": 0.46069336, + "step": 1652, + "time_per_iteration": 2.5593390464782715 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048407, + "balance_loss_mlp": 1.00236845, + "epoch": 0.31800692574066947, + "flos": 549804973824.0, + "grad_norm": 0.04030851184116312, + "language_loss": 0.90997875, + "learning_rate": 0.0007978105519081919, + "loss": 0.92046285, + "num_input_tokens_seen": 136906288, + "router_z_loss_mlp": 0.4597168, + "step": 1653, + "time_per_iteration": 2.683809995651245 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045957, + "balance_loss_mlp": 0.99982309, + "epoch": 0.31819930742593305, + "flos": 517917175296.0, + "grad_norm": 0.033294821801319624, + "language_loss": 0.88831019, + "learning_rate": 0.0007975602429176385, + "loss": 0.89876974, + "num_input_tokens_seen": 136972416, + "router_z_loss_mlp": 0.46069336, + "step": 1654, + "time_per_iteration": 2.5786075592041016 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104835, + "balance_loss_mlp": 1.00238276, + "epoch": 0.31839168911119664, + "flos": 456970302720.0, + "grad_norm": 0.028947480678153642, + "language_loss": 0.82318926, + "learning_rate": 0.0007973098184039536, + "loss": 0.83367276, + "num_input_tokens_seen": 137044576, + "router_z_loss_mlp": 0.45898438, + "step": 1655, + "time_per_iteration": 2.651188611984253 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010453, + "balance_loss_mlp": 0.99921381, + "epoch": 0.3185840707964602, + "flos": 627296482560.0, + "grad_norm": 0.03276090001573999, + "language_loss": 0.8731916, + "learning_rate": 0.0007970592784643602, + "loss": 0.88364458, + "num_input_tokens_seen": 137125120, + "router_z_loss_mlp": 0.46020508, + "step": 1656, + "time_per_iteration": 2.8683595657348633 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045873, + "balance_loss_mlp": 0.99976265, + "epoch": 0.31877645248172376, + "flos": 568541631744.0, + "grad_norm": 0.035945607337745746, + "language_loss": 0.85986471, + "learning_rate": 0.0007968086231961272, + "loss": 0.87032342, + "num_input_tokens_seen": 137195344, + "router_z_loss_mlp": 0.46044922, + "step": 1657, + "time_per_iteration": 2.642733335494995 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047205, + "balance_loss_mlp": 1.00119007, + "epoch": 0.3189688341669873, + "flos": 490553392896.0, + "grad_norm": 0.04377426906704287, + "language_loss": 0.84065533, + "learning_rate": 0.0007965578526965671, + "loss": 0.85112733, + "num_input_tokens_seen": 137261040, + "router_z_loss_mlp": 0.45947266, + "step": 1658, + "time_per_iteration": 2.5638930797576904 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049099, + "balance_loss_mlp": 1.00291717, + "epoch": 0.3191612158522509, + "flos": 577381638912.0, + "grad_norm": 0.02931224295785387, + "language_loss": 0.86766565, + "learning_rate": 0.0007963069670630377, + "loss": 0.87815666, + "num_input_tokens_seen": 137334400, + "router_z_loss_mlp": 0.46118164, + "step": 1659, + "time_per_iteration": 2.7154479026794434 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051177, + "balance_loss_mlp": 1.00506639, + "epoch": 0.3193535975375144, + "flos": 539193852672.0, + "grad_norm": 0.03496177903686506, + "language_loss": 0.88776976, + "learning_rate": 0.0007960559663929416, + "loss": 0.89828151, + "num_input_tokens_seen": 137405344, + "router_z_loss_mlp": 0.46044922, + "step": 1660, + "time_per_iteration": 2.6322021484375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054675, + "balance_loss_mlp": 1.00868368, + "epoch": 0.319545979222778, + "flos": 735628872960.0, + "grad_norm": 0.030221795014758104, + "language_loss": 0.88154632, + "learning_rate": 0.0007958048507837259, + "loss": 0.89209306, + "num_input_tokens_seen": 137486016, + "router_z_loss_mlp": 0.45922852, + "step": 1661, + "time_per_iteration": 2.9221389293670654 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105245, + "balance_loss_mlp": 1.00648332, + "epoch": 0.31973836090804153, + "flos": 765768890112.0, + "grad_norm": 0.037416739988226255, + "language_loss": 0.87668484, + "learning_rate": 0.0007955536203328822, + "loss": 0.88720942, + "num_input_tokens_seen": 137562304, + "router_z_loss_mlp": 0.45898438, + "step": 1662, + "time_per_iteration": 2.9018445014953613 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048244, + "balance_loss_mlp": 1.00184774, + "epoch": 0.3199307425933051, + "flos": 561742968576.0, + "grad_norm": 0.03025687936293395, + "language_loss": 0.84124553, + "learning_rate": 0.0007953022751379469, + "loss": 0.85172796, + "num_input_tokens_seen": 137639248, + "router_z_loss_mlp": 0.46337891, + "step": 1663, + "time_per_iteration": 2.781562566757202 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085701, + "balance_loss_mlp": 1.03906643, + "epoch": 0.3201231242785687, + "flos": 752672184576.0, + "grad_norm": 0.03881407073457837, + "language_loss": 0.82717097, + "learning_rate": 0.000795050815296501, + "loss": 0.83802795, + "num_input_tokens_seen": 137718256, + "router_z_loss_mlp": 0.46582031, + "step": 1664, + "time_per_iteration": 2.9950287342071533 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050837, + "balance_loss_mlp": 1.00446498, + "epoch": 0.32031550596383224, + "flos": 497385103872.0, + "grad_norm": 0.02713287522590179, + "language_loss": 0.93810016, + "learning_rate": 0.0007947992409061695, + "loss": 0.94860852, + "num_input_tokens_seen": 137785216, + "router_z_loss_mlp": 0.46313477, + "step": 1665, + "time_per_iteration": 2.583118438720703 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01056564, + "balance_loss_mlp": 1.01045382, + "epoch": 0.3205078876490958, + "flos": 732875970816.0, + "grad_norm": 0.03263285268561658, + "language_loss": 0.86165506, + "learning_rate": 0.0007945475520646226, + "loss": 0.8722207, + "num_input_tokens_seen": 137863424, + "router_z_loss_mlp": 0.46044922, + "step": 1666, + "time_per_iteration": 2.903190851211548 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059471, + "balance_loss_mlp": 1.01324141, + "epoch": 0.32070026933435936, + "flos": 550475702784.0, + "grad_norm": 0.03801033406135743, + "language_loss": 0.85650241, + "learning_rate": 0.0007942957488695743, + "loss": 0.86709714, + "num_input_tokens_seen": 137930384, + "router_z_loss_mlp": 0.46166992, + "step": 1667, + "time_per_iteration": 2.661292791366577 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059024, + "balance_loss_mlp": 1.01277089, + "epoch": 0.32089265101962294, + "flos": 746685201408.0, + "grad_norm": 0.031638418068872444, + "language_loss": 0.81749988, + "learning_rate": 0.0007940438314187833, + "loss": 0.82809013, + "num_input_tokens_seen": 138017200, + "router_z_loss_mlp": 0.46191406, + "step": 1668, + "time_per_iteration": 3.0293474197387695 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01057511, + "balance_loss_mlp": 1.01144862, + "epoch": 0.3210850327048865, + "flos": 495196972800.0, + "grad_norm": 0.034120041175176606, + "language_loss": 0.81371748, + "learning_rate": 0.0007937917998100529, + "loss": 0.82429266, + "num_input_tokens_seen": 138084048, + "router_z_loss_mlp": 0.45996094, + "step": 1669, + "time_per_iteration": 2.5822434425354004 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.08258255, + "balance_loss_mlp": 8.0, + "epoch": 0.32127741439015006, + "flos": 531673916160.0, + "grad_norm": 0.043058724234977634, + "language_loss": 0.81425405, + "learning_rate": 0.0007935396541412302, + "loss": 0.89683664, + "num_input_tokens_seen": 138153280, + "router_z_loss_mlp": 2.58203125, + "step": 1670, + "time_per_iteration": 2.5968360900878906 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0830899, + "balance_loss_mlp": 8.0, + "epoch": 0.3214697960754136, + "flos": 502224069888.0, + "grad_norm": 0.0363513778225316, + "language_loss": 0.87401152, + "learning_rate": 0.0007932873945102068, + "loss": 0.9571014, + "num_input_tokens_seen": 138222320, + "router_z_loss_mlp": 3.0859375, + "step": 1671, + "time_per_iteration": 2.582617998123169 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.08312805, + "balance_loss_mlp": 8.0, + "epoch": 0.3216621777606772, + "flos": 1386404736768.0, + "grad_norm": 0.003686648730821959, + "language_loss": 0.75761777, + "learning_rate": 0.0007930350210149188, + "loss": 0.84074581, + "num_input_tokens_seen": 138449488, + "router_z_loss_mlp": 3.125, + "step": 1672, + "time_per_iteration": 4.829998970031738 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.08311279, + "balance_loss_mlp": 8.0, + "epoch": 0.32185455944594077, + "flos": 572635991808.0, + "grad_norm": 0.030782594356869853, + "language_loss": 0.88089788, + "learning_rate": 0.0007927825337533461, + "loss": 0.96401072, + "num_input_tokens_seen": 138522496, + "router_z_loss_mlp": 3.109375, + "step": 1673, + "time_per_iteration": 2.6633598804473877 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.08310516, + "balance_loss_mlp": 8.0, + "epoch": 0.3220469411312043, + "flos": 544937817600.0, + "grad_norm": 0.040711103761993876, + "language_loss": 0.86732781, + "learning_rate": 0.0007925299328235131, + "loss": 0.95043296, + "num_input_tokens_seen": 138590096, + "router_z_loss_mlp": 3.1015625, + "step": 1674, + "time_per_iteration": 2.634169578552246 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.08307083, + "balance_loss_mlp": 8.0, + "epoch": 0.3222393228164679, + "flos": 492162168576.0, + "grad_norm": 0.03938689136463286, + "language_loss": 0.86802006, + "learning_rate": 0.000792277218323488, + "loss": 0.95109081, + "num_input_tokens_seen": 138658224, + "router_z_loss_mlp": 3.06640625, + "step": 1675, + "time_per_iteration": 2.5893990993499756 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.08270843, + "balance_loss_mlp": 8.0, + "epoch": 0.3224317045017314, + "flos": 491363127552.0, + "grad_norm": 0.03386575094399551, + "language_loss": 0.86165106, + "learning_rate": 0.0007920243903513833, + "loss": 0.94435954, + "num_input_tokens_seen": 138722864, + "router_z_loss_mlp": 2.7109375, + "step": 1676, + "time_per_iteration": 2.5602426528930664 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02321873, + "balance_loss_mlp": 2.26942062, + "epoch": 0.322624086186995, + "flos": 576871302912.0, + "grad_norm": 0.12910494226103245, + "language_loss": 0.85448408, + "learning_rate": 0.0007917714490053556, + "loss": 0.87770277, + "num_input_tokens_seen": 138791472, + "router_z_loss_mlp": 0.52539062, + "step": 1677, + "time_per_iteration": 2.6558380126953125 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071536, + "balance_loss_mlp": 1.02492559, + "epoch": 0.32281646787225854, + "flos": 630572359680.0, + "grad_norm": 0.04049679721352166, + "language_loss": 0.87627459, + "learning_rate": 0.0007915183943836055, + "loss": 0.88698995, + "num_input_tokens_seen": 138873424, + "router_z_loss_mlp": 0.46557617, + "step": 1678, + "time_per_iteration": 2.898658037185669 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072761, + "balance_loss_mlp": 1.02631712, + "epoch": 0.3230088495575221, + "flos": 782808311040.0, + "grad_norm": 0.04272749105284559, + "language_loss": 0.85738349, + "learning_rate": 0.0007912652265843773, + "loss": 0.86811107, + "num_input_tokens_seen": 138956880, + "router_z_loss_mlp": 0.46386719, + "step": 1679, + "time_per_iteration": 3.049938917160034 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082142, + "balance_loss_mlp": 1.03557873, + "epoch": 0.3232012312427857, + "flos": 537201107712.0, + "grad_norm": 0.04201967602882564, + "language_loss": 0.83624417, + "learning_rate": 0.0007910119457059597, + "loss": 0.84706557, + "num_input_tokens_seen": 139031296, + "router_z_loss_mlp": 0.46508789, + "step": 1680, + "time_per_iteration": 2.7126853466033936 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108585, + "balance_loss_mlp": 1.03895342, + "epoch": 0.32339361292804925, + "flos": 706233461760.0, + "grad_norm": 0.044345030126194285, + "language_loss": 0.81981564, + "learning_rate": 0.0007907585518466849, + "loss": 0.83067411, + "num_input_tokens_seen": 139109776, + "router_z_loss_mlp": 0.46850586, + "step": 1681, + "time_per_iteration": 2.9758992195129395 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088847, + "balance_loss_mlp": 1.0419023, + "epoch": 0.32358599461331283, + "flos": 453257966592.0, + "grad_norm": 0.04210474159896445, + "language_loss": 0.91257876, + "learning_rate": 0.000790505045104929, + "loss": 0.92346722, + "num_input_tokens_seen": 139174736, + "router_z_loss_mlp": 0.46899414, + "step": 1682, + "time_per_iteration": 2.5105395317077637 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090368, + "balance_loss_mlp": 1.04337561, + "epoch": 0.32377837629857636, + "flos": 602092641024.0, + "grad_norm": 0.04465728550727914, + "language_loss": 0.88834655, + "learning_rate": 0.0007902514255791125, + "loss": 0.89925027, + "num_input_tokens_seen": 139252064, + "router_z_loss_mlp": 0.46948242, + "step": 1683, + "time_per_iteration": 2.7610387802124023 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089116, + "balance_loss_mlp": 1.04190934, + "epoch": 0.32397075798383995, + "flos": 808899654912.0, + "grad_norm": 0.04108658803287063, + "language_loss": 0.89801908, + "learning_rate": 0.0007899976933676986, + "loss": 0.90891027, + "num_input_tokens_seen": 139333328, + "router_z_loss_mlp": 0.47167969, + "step": 1684, + "time_per_iteration": 2.963387966156006 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089307, + "balance_loss_mlp": 1.04205263, + "epoch": 0.3241631396691035, + "flos": 602793505536.0, + "grad_norm": 0.046655842402160155, + "language_loss": 0.89137548, + "learning_rate": 0.0007897438485691955, + "loss": 0.90226853, + "num_input_tokens_seen": 139400976, + "router_z_loss_mlp": 0.47216797, + "step": 1685, + "time_per_iteration": 2.675910711288452 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079467, + "balance_loss_mlp": 1.03195012, + "epoch": 0.32435552135436707, + "flos": 475177182720.0, + "grad_norm": 0.045429866607221585, + "language_loss": 0.84063458, + "learning_rate": 0.0007894898912821542, + "loss": 0.85142922, + "num_input_tokens_seen": 139465664, + "router_z_loss_mlp": 0.47485352, + "step": 1686, + "time_per_iteration": 2.530951976776123 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077585, + "balance_loss_mlp": 1.02980566, + "epoch": 0.3245479030396306, + "flos": 539220097536.0, + "grad_norm": 0.03833008440392265, + "language_loss": 0.88029444, + "learning_rate": 0.0007892358216051695, + "loss": 0.89107037, + "num_input_tokens_seen": 139541984, + "router_z_loss_mlp": 0.47753906, + "step": 1687, + "time_per_iteration": 2.7729742527008057 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067846, + "balance_loss_mlp": 1.01963735, + "epoch": 0.3247402847248942, + "flos": 548697785856.0, + "grad_norm": 0.039082280310976325, + "language_loss": 0.93519121, + "learning_rate": 0.0007889816396368803, + "loss": 0.94586968, + "num_input_tokens_seen": 139607408, + "router_z_loss_mlp": 0.48193359, + "step": 1688, + "time_per_iteration": 2.625795602798462 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062474, + "balance_loss_mlp": 1.01371753, + "epoch": 0.3249326664101578, + "flos": 378992757504.0, + "grad_norm": 0.03548852277095179, + "language_loss": 0.86296374, + "learning_rate": 0.0007887273454759687, + "loss": 0.87358844, + "num_input_tokens_seen": 139670000, + "router_z_loss_mlp": 0.48754883, + "step": 1689, + "time_per_iteration": 2.4798507690429688 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070366, + "balance_loss_mlp": 1.02106154, + "epoch": 0.3251250480954213, + "flos": 529123203072.0, + "grad_norm": 0.03304707654173593, + "language_loss": 0.83602285, + "learning_rate": 0.0007884729392211603, + "loss": 0.84672654, + "num_input_tokens_seen": 139739872, + "router_z_loss_mlp": 0.49194336, + "step": 1690, + "time_per_iteration": 2.6475188732147217 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066964, + "balance_loss_mlp": 1.01732576, + "epoch": 0.3253174297806849, + "flos": 450559499520.0, + "grad_norm": 0.03986808198030794, + "language_loss": 0.86860085, + "learning_rate": 0.0007882184209712245, + "loss": 0.87927043, + "num_input_tokens_seen": 139802032, + "router_z_loss_mlp": 0.49609375, + "step": 1691, + "time_per_iteration": 2.5213029384613037 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089592, + "balance_loss_mlp": 1.03961909, + "epoch": 0.32550981146594843, + "flos": 705490801152.0, + "grad_norm": 0.03183986603149819, + "language_loss": 0.86227143, + "learning_rate": 0.000787963790824974, + "loss": 0.8731674, + "num_input_tokens_seen": 139885648, + "router_z_loss_mlp": 0.49975586, + "step": 1692, + "time_per_iteration": 2.9866673946380615 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086614, + "balance_loss_mlp": 1.03654587, + "epoch": 0.325702193151212, + "flos": 393559233024.0, + "grad_norm": 0.035135222587328305, + "language_loss": 0.90092403, + "learning_rate": 0.0007877090488812651, + "loss": 0.91179013, + "num_input_tokens_seen": 139947920, + "router_z_loss_mlp": 0.50073242, + "step": 1693, + "time_per_iteration": 2.443784475326538 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067298, + "balance_loss_mlp": 1.01708698, + "epoch": 0.32589457483647555, + "flos": 578584091136.0, + "grad_norm": 0.03604448220117138, + "language_loss": 0.84406531, + "learning_rate": 0.0007874541952389973, + "loss": 0.85473824, + "num_input_tokens_seen": 140020048, + "router_z_loss_mlp": 0.50219727, + "step": 1694, + "time_per_iteration": 2.6662275791168213 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069205, + "balance_loss_mlp": 1.01918459, + "epoch": 0.32608695652173914, + "flos": 499330216704.0, + "grad_norm": 0.03462929627838828, + "language_loss": 0.87473089, + "learning_rate": 0.0007871992299971136, + "loss": 0.88542295, + "num_input_tokens_seen": 140085600, + "router_z_loss_mlp": 0.50024414, + "step": 1695, + "time_per_iteration": 2.5501420497894287 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068106, + "balance_loss_mlp": 1.01803839, + "epoch": 0.32627933820700267, + "flos": 592301948160.0, + "grad_norm": 0.0349674772808078, + "language_loss": 0.85830671, + "learning_rate": 0.0007869441532546001, + "loss": 0.86898774, + "num_input_tokens_seen": 140155152, + "router_z_loss_mlp": 0.5, + "step": 1696, + "time_per_iteration": 2.7640528678894043 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065186, + "balance_loss_mlp": 1.01550007, + "epoch": 0.32647171989226625, + "flos": 610274558208.0, + "grad_norm": 0.03448959411295718, + "language_loss": 0.80548751, + "learning_rate": 0.0007866889651104867, + "loss": 0.81613934, + "num_input_tokens_seen": 140228560, + "router_z_loss_mlp": 0.49658203, + "step": 1697, + "time_per_iteration": 2.8403704166412354 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106687, + "balance_loss_mlp": 1.01723123, + "epoch": 0.32666410157752984, + "flos": 478190599680.0, + "grad_norm": 0.0393752309547029, + "language_loss": 0.84585583, + "learning_rate": 0.000786433665663846, + "loss": 0.85652447, + "num_input_tokens_seen": 140297952, + "router_z_loss_mlp": 0.49536133, + "step": 1698, + "time_per_iteration": 2.7460434436798096 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065434, + "balance_loss_mlp": 1.01603401, + "epoch": 0.3268564832627934, + "flos": 719694694656.0, + "grad_norm": 0.03598572558720647, + "language_loss": 0.87469888, + "learning_rate": 0.0007861782550137942, + "loss": 0.88535315, + "num_input_tokens_seen": 140373408, + "router_z_loss_mlp": 0.49291992, + "step": 1699, + "time_per_iteration": 2.922189474105835 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062059, + "balance_loss_mlp": 1.01299262, + "epoch": 0.32704886494805696, + "flos": 770106268416.0, + "grad_norm": 0.033319227910548664, + "language_loss": 0.86952895, + "learning_rate": 0.0007859227332594901, + "loss": 0.88014954, + "num_input_tokens_seen": 140451840, + "router_z_loss_mlp": 0.48999023, + "step": 1700, + "time_per_iteration": 2.8891940116882324 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01056582, + "balance_loss_mlp": 1.00782549, + "epoch": 0.3272412466333205, + "flos": 851405377536.0, + "grad_norm": 0.0384838580126543, + "language_loss": 0.85734528, + "learning_rate": 0.0007856671005001365, + "loss": 0.8679111, + "num_input_tokens_seen": 140537696, + "router_z_loss_mlp": 0.48730469, + "step": 1701, + "time_per_iteration": 3.169032573699951 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105609, + "balance_loss_mlp": 1.00728559, + "epoch": 0.3274336283185841, + "flos": 833041995264.0, + "grad_norm": 0.03605284930108709, + "language_loss": 0.82799482, + "learning_rate": 0.0007854113568349787, + "loss": 0.83855575, + "num_input_tokens_seen": 140623536, + "router_z_loss_mlp": 0.48779297, + "step": 1702, + "time_per_iteration": 3.123967170715332 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060179, + "balance_loss_mlp": 1.0117799, + "epoch": 0.3276260100038476, + "flos": 693253407744.0, + "grad_norm": 0.03564674283827795, + "language_loss": 0.81364781, + "learning_rate": 0.0007851555023633052, + "loss": 0.82424963, + "num_input_tokens_seen": 140700688, + "router_z_loss_mlp": 0.48388672, + "step": 1703, + "time_per_iteration": 2.8430581092834473 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059511, + "balance_loss_mlp": 1.01120698, + "epoch": 0.3278183916891112, + "flos": 436978702848.0, + "grad_norm": 0.03514994366577059, + "language_loss": 0.83518881, + "learning_rate": 0.0007848995371844474, + "loss": 0.84578383, + "num_input_tokens_seen": 140765808, + "router_z_loss_mlp": 0.48291016, + "step": 1704, + "time_per_iteration": 2.552917003631592 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01056561, + "balance_loss_mlp": 1.00861514, + "epoch": 0.3280107733743748, + "flos": 462017293824.0, + "grad_norm": 0.03278124420090015, + "language_loss": 0.81157213, + "learning_rate": 0.0007846434613977801, + "loss": 0.82213771, + "num_input_tokens_seen": 140830512, + "router_z_loss_mlp": 0.47924805, + "step": 1705, + "time_per_iteration": 2.496506929397583 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062567, + "balance_loss_mlp": 1.01483595, + "epoch": 0.3282031550596383, + "flos": 680529977856.0, + "grad_norm": 0.03615486988598079, + "language_loss": 0.79136091, + "learning_rate": 0.0007843872751027203, + "loss": 0.80198663, + "num_input_tokens_seen": 140902816, + "router_z_loss_mlp": 0.47705078, + "step": 1706, + "time_per_iteration": 2.8048393726348877 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048523, + "balance_loss_mlp": 1.00088739, + "epoch": 0.3283955367449019, + "flos": 546255942912.0, + "grad_norm": 0.030185021157442368, + "language_loss": 0.879673, + "learning_rate": 0.0007841309783987287, + "loss": 0.89015824, + "num_input_tokens_seen": 140975488, + "router_z_loss_mlp": 0.47607422, + "step": 1707, + "time_per_iteration": 2.7402358055114746 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053102, + "balance_loss_mlp": 1.00553715, + "epoch": 0.32858791843016544, + "flos": 482241218304.0, + "grad_norm": 0.035416956868504886, + "language_loss": 0.89878803, + "learning_rate": 0.0007838745713853084, + "loss": 0.90931904, + "num_input_tokens_seen": 141043248, + "router_z_loss_mlp": 0.4753418, + "step": 1708, + "time_per_iteration": 2.603816270828247 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054825, + "balance_loss_mlp": 1.00752318, + "epoch": 0.328780300115429, + "flos": 567916589568.0, + "grad_norm": 0.03507338685235107, + "language_loss": 0.84775996, + "learning_rate": 0.0007836180541620053, + "loss": 0.8583082, + "num_input_tokens_seen": 141119408, + "router_z_loss_mlp": 0.47265625, + "step": 1709, + "time_per_iteration": 2.7194666862487793 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054445, + "balance_loss_mlp": 1.00730944, + "epoch": 0.32897268180069256, + "flos": 476992038144.0, + "grad_norm": 0.03621825417570051, + "language_loss": 0.86992389, + "learning_rate": 0.0007833614268284082, + "loss": 0.88046837, + "num_input_tokens_seen": 141184112, + "router_z_loss_mlp": 0.47094727, + "step": 1710, + "time_per_iteration": 2.510921001434326 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01057716, + "balance_loss_mlp": 1.01346588, + "epoch": 0.32916506348595614, + "flos": 1580453327616.0, + "grad_norm": 0.014405511351568959, + "language_loss": 0.74109769, + "learning_rate": 0.0007831046894841489, + "loss": 0.75167489, + "num_input_tokens_seen": 141414960, + "router_z_loss_mlp": 0.44335938, + "step": 1711, + "time_per_iteration": 4.875708341598511 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051331, + "balance_loss_mlp": 1.00429153, + "epoch": 0.3293574451712197, + "flos": 483851939328.0, + "grad_norm": 0.03545808379065215, + "language_loss": 0.7916249, + "learning_rate": 0.0007828478422289016, + "loss": 0.80213821, + "num_input_tokens_seen": 141485744, + "router_z_loss_mlp": 0.4699707, + "step": 1712, + "time_per_iteration": 2.583045721054077 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052818, + "balance_loss_mlp": 1.00582564, + "epoch": 0.32954982685648326, + "flos": 623725097472.0, + "grad_norm": 0.0327870747371716, + "language_loss": 0.89787406, + "learning_rate": 0.0007825908851623833, + "loss": 0.9084022, + "num_input_tokens_seen": 141560592, + "router_z_loss_mlp": 0.46948242, + "step": 1713, + "time_per_iteration": 2.824685573577881 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050866, + "balance_loss_mlp": 1.00396931, + "epoch": 0.32974220854174685, + "flos": 546071250432.0, + "grad_norm": 0.03386258255996434, + "language_loss": 0.85659784, + "learning_rate": 0.0007823338183843533, + "loss": 0.8671065, + "num_input_tokens_seen": 141630400, + "router_z_loss_mlp": 0.46850586, + "step": 1714, + "time_per_iteration": 2.672525644302368 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051399, + "balance_loss_mlp": 1.00459802, + "epoch": 0.3299345902270104, + "flos": 983823727872.0, + "grad_norm": 0.03566876288837857, + "language_loss": 0.82096756, + "learning_rate": 0.0007820766419946141, + "loss": 0.83148158, + "num_input_tokens_seen": 141721552, + "router_z_loss_mlp": 0.4675293, + "step": 1715, + "time_per_iteration": 3.2718288898468018 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051636, + "balance_loss_mlp": 1.00662231, + "epoch": 0.33012697191227397, + "flos": 1406904727296.0, + "grad_norm": 0.0085720970679931, + "language_loss": 0.7967248, + "learning_rate": 0.0007818193560930102, + "loss": 0.80724114, + "num_input_tokens_seen": 141956464, + "router_z_loss_mlp": 0.44921875, + "step": 1716, + "time_per_iteration": 4.983957290649414 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065575, + "balance_loss_mlp": 1.01836789, + "epoch": 0.3303193535975375, + "flos": 506170675968.0, + "grad_norm": 0.038525927315114124, + "language_loss": 0.76583785, + "learning_rate": 0.0007815619607794288, + "loss": 0.77649361, + "num_input_tokens_seen": 142029552, + "router_z_loss_mlp": 0.47167969, + "step": 1717, + "time_per_iteration": 2.6315019130706787 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054249, + "balance_loss_mlp": 1.00713778, + "epoch": 0.3305117352828011, + "flos": 939485653248.0, + "grad_norm": 0.041342276741222116, + "language_loss": 0.83710063, + "learning_rate": 0.0007813044561538001, + "loss": 0.84764308, + "num_input_tokens_seen": 142117344, + "router_z_loss_mlp": 0.47070312, + "step": 1718, + "time_per_iteration": 3.127446174621582 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055308, + "balance_loss_mlp": 1.00814831, + "epoch": 0.3307041169680646, + "flos": 722794627584.0, + "grad_norm": 0.03526572402512133, + "language_loss": 0.88796169, + "learning_rate": 0.0007810468423160958, + "loss": 0.89851475, + "num_input_tokens_seen": 142190096, + "router_z_loss_mlp": 0.47119141, + "step": 1719, + "time_per_iteration": 2.8622305393218994 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054311, + "balance_loss_mlp": 1.00741386, + "epoch": 0.3308964986533282, + "flos": 584817004800.0, + "grad_norm": 0.029883098234782163, + "language_loss": 0.82424414, + "learning_rate": 0.0007807891193663306, + "loss": 0.83478725, + "num_input_tokens_seen": 142265584, + "router_z_loss_mlp": 0.46850586, + "step": 1720, + "time_per_iteration": 2.7917239665985107 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064031, + "balance_loss_mlp": 1.01715815, + "epoch": 0.33108888033859174, + "flos": 474525895680.0, + "grad_norm": 0.040993977150413745, + "language_loss": 0.82757467, + "learning_rate": 0.0007805312874045614, + "loss": 0.83821499, + "num_input_tokens_seen": 142330352, + "router_z_loss_mlp": 0.46826172, + "step": 1721, + "time_per_iteration": 2.516045331954956 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049599, + "balance_loss_mlp": 1.00279772, + "epoch": 0.3312812620238553, + "flos": 386996785152.0, + "grad_norm": 0.03885390252626127, + "language_loss": 0.87709427, + "learning_rate": 0.0007802733465308874, + "loss": 0.88759029, + "num_input_tokens_seen": 142392208, + "router_z_loss_mlp": 0.4675293, + "step": 1722, + "time_per_iteration": 2.4662280082702637 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047652, + "balance_loss_mlp": 1.00108933, + "epoch": 0.3314736437091189, + "flos": 495605241600.0, + "grad_norm": 0.03316625802825005, + "language_loss": 0.85110468, + "learning_rate": 0.0007800152968454501, + "loss": 0.86158121, + "num_input_tokens_seen": 142462112, + "router_z_loss_mlp": 0.46508789, + "step": 1723, + "time_per_iteration": 2.6313533782958984 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105186, + "balance_loss_mlp": 1.00515401, + "epoch": 0.33166602539438245, + "flos": 654931473408.0, + "grad_norm": 0.02722776998075876, + "language_loss": 0.90998107, + "learning_rate": 0.0007797571384484334, + "loss": 0.92049968, + "num_input_tokens_seen": 142539120, + "router_z_loss_mlp": 0.46655273, + "step": 1724, + "time_per_iteration": 2.8411970138549805 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049402, + "balance_loss_mlp": 1.00257659, + "epoch": 0.33185840707964603, + "flos": 521835591168.0, + "grad_norm": 0.03419077024576391, + "language_loss": 0.92796665, + "learning_rate": 0.0007794988714400633, + "loss": 0.93846071, + "num_input_tokens_seen": 142611520, + "router_z_loss_mlp": 0.46777344, + "step": 1725, + "time_per_iteration": 2.5964980125427246 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050523, + "balance_loss_mlp": 1.00367355, + "epoch": 0.33205078876490957, + "flos": 437899252992.0, + "grad_norm": 0.033932075991051254, + "language_loss": 0.86014992, + "learning_rate": 0.0007792404959206079, + "loss": 0.87065518, + "num_input_tokens_seen": 142676064, + "router_z_loss_mlp": 0.46801758, + "step": 1726, + "time_per_iteration": 2.491852283477783 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051801, + "balance_loss_mlp": 1.00497568, + "epoch": 0.33224317045017315, + "flos": 770095574784.0, + "grad_norm": 0.034529473302537826, + "language_loss": 0.82129228, + "learning_rate": 0.0007789820119903774, + "loss": 0.83181036, + "num_input_tokens_seen": 142750944, + "router_z_loss_mlp": 0.46777344, + "step": 1727, + "time_per_iteration": 2.9898605346679688 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01058285, + "balance_loss_mlp": 1.01260376, + "epoch": 0.3324355521354367, + "flos": 1469296103424.0, + "grad_norm": 0.013638873720884416, + "language_loss": 0.78492665, + "learning_rate": 0.0007787234197497242, + "loss": 0.79550946, + "num_input_tokens_seen": 142974032, + "router_z_loss_mlp": 0.45605469, + "step": 1728, + "time_per_iteration": 4.859704971313477 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050307, + "balance_loss_mlp": 1.00343382, + "epoch": 0.3326279338207003, + "flos": 497800175616.0, + "grad_norm": 0.033386991625918766, + "language_loss": 0.84234303, + "learning_rate": 0.0007784647192990428, + "loss": 0.85284609, + "num_input_tokens_seen": 143047280, + "router_z_loss_mlp": 0.46826172, + "step": 1729, + "time_per_iteration": 2.7268624305725098 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050854, + "balance_loss_mlp": 1.00419581, + "epoch": 0.33282031550596386, + "flos": 637054127616.0, + "grad_norm": 0.031138270474946127, + "language_loss": 0.81414318, + "learning_rate": 0.0007782059107387696, + "loss": 0.82465172, + "num_input_tokens_seen": 143124224, + "router_z_loss_mlp": 0.46606445, + "step": 1730, + "time_per_iteration": 2.85831618309021 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054467, + "balance_loss_mlp": 1.00752223, + "epoch": 0.3330126971912274, + "flos": 690722136576.0, + "grad_norm": 0.03556521205278414, + "language_loss": 0.89100444, + "learning_rate": 0.0007779469941693826, + "loss": 0.9015491, + "num_input_tokens_seen": 143194048, + "router_z_loss_mlp": 0.46899414, + "step": 1731, + "time_per_iteration": 2.8736839294433594 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01058267, + "balance_loss_mlp": 1.01168013, + "epoch": 0.333205078876491, + "flos": 567554007552.0, + "grad_norm": 0.03898705252222011, + "language_loss": 0.77083337, + "learning_rate": 0.0007776879696914029, + "loss": 0.78141606, + "num_input_tokens_seen": 143272976, + "router_z_loss_mlp": 0.46533203, + "step": 1732, + "time_per_iteration": 2.84578275680542 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055134, + "balance_loss_mlp": 1.00868976, + "epoch": 0.3333974605617545, + "flos": 642171105024.0, + "grad_norm": 0.028730663384365272, + "language_loss": 0.89631069, + "learning_rate": 0.000777428837405392, + "loss": 0.90686202, + "num_input_tokens_seen": 143346496, + "router_z_loss_mlp": 0.46386719, + "step": 1733, + "time_per_iteration": 2.8595433235168457 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049221, + "balance_loss_mlp": 1.00275302, + "epoch": 0.3335898422470181, + "flos": 462779396352.0, + "grad_norm": 0.03984590801707433, + "language_loss": 0.87746447, + "learning_rate": 0.0007771695974119544, + "loss": 0.88795674, + "num_input_tokens_seen": 143410448, + "router_z_loss_mlp": 0.46411133, + "step": 1734, + "time_per_iteration": 2.5200014114379883 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051768, + "balance_loss_mlp": 1.00537193, + "epoch": 0.33378222393228163, + "flos": 854338114560.0, + "grad_norm": 0.03554719013753984, + "language_loss": 0.76235908, + "learning_rate": 0.0007769102498117359, + "loss": 0.77287674, + "num_input_tokens_seen": 143492416, + "router_z_loss_mlp": 0.46337891, + "step": 1735, + "time_per_iteration": 3.1014633178710938 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052548, + "balance_loss_mlp": 1.00624716, + "epoch": 0.3339746056175452, + "flos": 956310246144.0, + "grad_norm": 0.03187783426815399, + "language_loss": 0.80701965, + "learning_rate": 0.000776650794705424, + "loss": 0.81754518, + "num_input_tokens_seen": 143590096, + "router_z_loss_mlp": 0.46240234, + "step": 1736, + "time_per_iteration": 3.253756046295166 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050739, + "balance_loss_mlp": 1.00434327, + "epoch": 0.33416698730280875, + "flos": 545895306240.0, + "grad_norm": 0.03238990381642275, + "language_loss": 0.83209848, + "learning_rate": 0.0007763912321937483, + "loss": 0.84260583, + "num_input_tokens_seen": 143663344, + "router_z_loss_mlp": 0.46337891, + "step": 1737, + "time_per_iteration": 2.712942361831665 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051632, + "balance_loss_mlp": 1.00525999, + "epoch": 0.33435936898807234, + "flos": 1015876776960.0, + "grad_norm": 0.036470780413058734, + "language_loss": 0.8337301, + "learning_rate": 0.0007761315623774799, + "loss": 0.84424639, + "num_input_tokens_seen": 143753072, + "router_z_loss_mlp": 0.46313477, + "step": 1738, + "time_per_iteration": 3.38946795463562 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053137, + "balance_loss_mlp": 1.00671661, + "epoch": 0.3345517506733359, + "flos": 616372356864.0, + "grad_norm": 0.034452353492031275, + "language_loss": 0.88688117, + "learning_rate": 0.0007758717853574313, + "loss": 0.89741254, + "num_input_tokens_seen": 143827280, + "router_z_loss_mlp": 0.46362305, + "step": 1739, + "time_per_iteration": 2.7438387870788574 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105524, + "balance_loss_mlp": 1.00896263, + "epoch": 0.33474413235859946, + "flos": 495570248448.0, + "grad_norm": 0.03665446817767542, + "language_loss": 0.90973008, + "learning_rate": 0.0007756119012344571, + "loss": 0.92028248, + "num_input_tokens_seen": 143895072, + "router_z_loss_mlp": 0.4621582, + "step": 1740, + "time_per_iteration": 2.5443572998046875 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105165, + "balance_loss_mlp": 1.0052774, + "epoch": 0.33493651404386304, + "flos": 629488504320.0, + "grad_norm": 0.0365358867260097, + "language_loss": 0.85516071, + "learning_rate": 0.0007753519101094535, + "loss": 0.86567724, + "num_input_tokens_seen": 143965728, + "router_z_loss_mlp": 0.46313477, + "step": 1741, + "time_per_iteration": 2.785595417022705 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050508, + "balance_loss_mlp": 1.00396836, + "epoch": 0.3351288957291266, + "flos": 514743365376.0, + "grad_norm": 0.038608286094447275, + "language_loss": 0.87042749, + "learning_rate": 0.0007750918120833575, + "loss": 0.88093251, + "num_input_tokens_seen": 144030272, + "router_z_loss_mlp": 0.46484375, + "step": 1742, + "time_per_iteration": 2.5612564086914062 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054744, + "balance_loss_mlp": 1.00825262, + "epoch": 0.33532127741439016, + "flos": 648483731712.0, + "grad_norm": 0.038902913238311417, + "language_loss": 0.88245445, + "learning_rate": 0.0007748316072571485, + "loss": 0.89300191, + "num_input_tokens_seen": 144104048, + "router_z_loss_mlp": 0.46435547, + "step": 1743, + "time_per_iteration": 2.8040030002593994 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01056064, + "balance_loss_mlp": 1.00969172, + "epoch": 0.3355136590996537, + "flos": 769789373184.0, + "grad_norm": 0.032744002461956113, + "language_loss": 0.80090916, + "learning_rate": 0.0007745712957318467, + "loss": 0.81146979, + "num_input_tokens_seen": 144180432, + "router_z_loss_mlp": 0.46313477, + "step": 1744, + "time_per_iteration": 2.955864429473877 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053057, + "balance_loss_mlp": 1.00656557, + "epoch": 0.3357060407849173, + "flos": 596650020096.0, + "grad_norm": 0.027209343707751667, + "language_loss": 0.86834347, + "learning_rate": 0.0007743108776085141, + "loss": 0.87887406, + "num_input_tokens_seen": 144258704, + "router_z_loss_mlp": 0.46435547, + "step": 1745, + "time_per_iteration": 2.8065922260284424 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059361, + "balance_loss_mlp": 1.01277399, + "epoch": 0.3358984224701808, + "flos": 599802442752.0, + "grad_norm": 0.030632877870575562, + "language_loss": 0.83193165, + "learning_rate": 0.0007740503529882543, + "loss": 0.84252524, + "num_input_tokens_seen": 144335104, + "router_z_loss_mlp": 0.46533203, + "step": 1746, + "time_per_iteration": 2.783057451248169 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01058625, + "balance_loss_mlp": 1.01218116, + "epoch": 0.3360908041554444, + "flos": 579430764288.0, + "grad_norm": 0.03209356344176002, + "language_loss": 0.91440552, + "learning_rate": 0.0007737897219722114, + "loss": 0.92499179, + "num_input_tokens_seen": 144402912, + "router_z_loss_mlp": 0.46386719, + "step": 1747, + "time_per_iteration": 2.6678693294525146 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053588, + "balance_loss_mlp": 1.00723922, + "epoch": 0.336283185840708, + "flos": 514621856256.0, + "grad_norm": 0.02947569275247992, + "language_loss": 0.81706387, + "learning_rate": 0.0007735289846615716, + "loss": 0.82759976, + "num_input_tokens_seen": 144475328, + "router_z_loss_mlp": 0.46289062, + "step": 1748, + "time_per_iteration": 2.664217948913574 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049184, + "balance_loss_mlp": 1.00312185, + "epoch": 0.3364755675259715, + "flos": 526014521856.0, + "grad_norm": 0.03437288512368296, + "language_loss": 0.83148289, + "learning_rate": 0.0007732681411575621, + "loss": 0.84197474, + "num_input_tokens_seen": 144548288, + "router_z_loss_mlp": 0.45996094, + "step": 1749, + "time_per_iteration": 2.679304361343384 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051983, + "balance_loss_mlp": 1.00613475, + "epoch": 0.3366679492112351, + "flos": 555974704128.0, + "grad_norm": 0.040002531784274646, + "language_loss": 0.88002014, + "learning_rate": 0.0007730071915614514, + "loss": 0.89053994, + "num_input_tokens_seen": 144619488, + "router_z_loss_mlp": 0.45776367, + "step": 1750, + "time_per_iteration": 2.6813647747039795 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053165, + "balance_loss_mlp": 1.00734055, + "epoch": 0.33686033089649864, + "flos": 428164940544.0, + "grad_norm": 0.03793638318473741, + "language_loss": 0.88937026, + "learning_rate": 0.0007727461359745489, + "loss": 0.89990187, + "num_input_tokens_seen": 144682560, + "router_z_loss_mlp": 0.45751953, + "step": 1751, + "time_per_iteration": 2.459137439727783 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050224, + "balance_loss_mlp": 1.00425673, + "epoch": 0.3370527125817622, + "flos": 542841060096.0, + "grad_norm": 0.030686532457312277, + "language_loss": 0.86821485, + "learning_rate": 0.0007724849744982056, + "loss": 0.87871712, + "num_input_tokens_seen": 144753328, + "router_z_loss_mlp": 0.45898438, + "step": 1752, + "time_per_iteration": 2.682023525238037 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050119, + "balance_loss_mlp": 1.00412822, + "epoch": 0.33724509426702576, + "flos": 543231832320.0, + "grad_norm": 0.03146587739195435, + "language_loss": 0.82788759, + "learning_rate": 0.0007722237072338131, + "loss": 0.8383888, + "num_input_tokens_seen": 144827312, + "router_z_loss_mlp": 0.45922852, + "step": 1753, + "time_per_iteration": 2.7289977073669434 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053084, + "balance_loss_mlp": 1.00735557, + "epoch": 0.33743747595228935, + "flos": 473753099520.0, + "grad_norm": 0.036309304678759154, + "language_loss": 0.86263937, + "learning_rate": 0.0007719623342828046, + "loss": 0.8731702, + "num_input_tokens_seen": 144893488, + "router_z_loss_mlp": 0.45654297, + "step": 1754, + "time_per_iteration": 2.5323400497436523 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046739, + "balance_loss_mlp": 1.00127256, + "epoch": 0.33762985763755293, + "flos": 470837859072.0, + "grad_norm": 0.037209700878319825, + "language_loss": 0.84580374, + "learning_rate": 0.000771700855746654, + "loss": 0.85627109, + "num_input_tokens_seen": 144961152, + "router_z_loss_mlp": 0.45385742, + "step": 1755, + "time_per_iteration": 2.585667848587036 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049151, + "balance_loss_mlp": 1.00366056, + "epoch": 0.33782223932281646, + "flos": 493251859968.0, + "grad_norm": 0.03059786996599164, + "language_loss": 0.89290714, + "learning_rate": 0.0007714392717268763, + "loss": 0.90339863, + "num_input_tokens_seen": 145030576, + "router_z_loss_mlp": 0.45410156, + "step": 1756, + "time_per_iteration": 2.5836589336395264 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048898, + "balance_loss_mlp": 1.00321686, + "epoch": 0.33801462100808005, + "flos": 466018334976.0, + "grad_norm": 0.035533831964213135, + "language_loss": 0.87473714, + "learning_rate": 0.0007711775823250273, + "loss": 0.88522607, + "num_input_tokens_seen": 145095648, + "router_z_loss_mlp": 0.45605469, + "step": 1757, + "time_per_iteration": 2.5619492530822754 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049668, + "balance_loss_mlp": 1.00417781, + "epoch": 0.3382070026933436, + "flos": 797068584960.0, + "grad_norm": 0.03198873828119691, + "language_loss": 0.84101963, + "learning_rate": 0.0007709157876427039, + "loss": 0.85151625, + "num_input_tokens_seen": 145181248, + "router_z_loss_mlp": 0.45410156, + "step": 1758, + "time_per_iteration": 3.084735870361328 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049654, + "balance_loss_mlp": 1.00414026, + "epoch": 0.33839938437860717, + "flos": 509429056512.0, + "grad_norm": 0.031347294296384644, + "language_loss": 0.86196065, + "learning_rate": 0.0007706538877815439, + "loss": 0.87245721, + "num_input_tokens_seen": 145252944, + "router_z_loss_mlp": 0.4543457, + "step": 1759, + "time_per_iteration": 2.6354048252105713 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049225, + "balance_loss_mlp": 1.00371122, + "epoch": 0.3385917660638707, + "flos": 485274077184.0, + "grad_norm": 0.03028112214235413, + "language_loss": 0.83875918, + "learning_rate": 0.0007703918828432259, + "loss": 0.84925139, + "num_input_tokens_seen": 145323168, + "router_z_loss_mlp": 0.4543457, + "step": 1760, + "time_per_iteration": 2.6017844676971436 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049146, + "balance_loss_mlp": 1.00358403, + "epoch": 0.3387841477491343, + "flos": 546416335872.0, + "grad_norm": 0.033680258429279644, + "language_loss": 0.89293355, + "learning_rate": 0.000770129772929469, + "loss": 0.90342498, + "num_input_tokens_seen": 145395776, + "router_z_loss_mlp": 0.45483398, + "step": 1761, + "time_per_iteration": 2.671287775039673 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048026, + "balance_loss_mlp": 1.00217831, + "epoch": 0.3389765294343978, + "flos": 721064342784.0, + "grad_norm": 0.03497277274463044, + "language_loss": 0.89180952, + "learning_rate": 0.0007698675581420334, + "loss": 0.90228981, + "num_input_tokens_seen": 145470576, + "router_z_loss_mlp": 0.45776367, + "step": 1762, + "time_per_iteration": 2.9236271381378174 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105243, + "balance_loss_mlp": 1.00677264, + "epoch": 0.3391689111196614, + "flos": 701264238336.0, + "grad_norm": 0.034268369898116914, + "language_loss": 0.79778481, + "learning_rate": 0.0007696052385827199, + "loss": 0.80830908, + "num_input_tokens_seen": 145548896, + "router_z_loss_mlp": 0.45581055, + "step": 1763, + "time_per_iteration": 2.9605488777160645 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055868, + "balance_loss_mlp": 1.01018691, + "epoch": 0.339361292804925, + "flos": 628249113600.0, + "grad_norm": 0.03454670185411084, + "language_loss": 0.78905737, + "learning_rate": 0.00076934281435337, + "loss": 0.79961604, + "num_input_tokens_seen": 145617136, + "router_z_loss_mlp": 0.45605469, + "step": 1764, + "time_per_iteration": 2.7454025745391846 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052159, + "balance_loss_mlp": 1.00647831, + "epoch": 0.33955367449018853, + "flos": 610795587840.0, + "grad_norm": 0.03693575970108084, + "language_loss": 0.86892688, + "learning_rate": 0.0007690802855558658, + "loss": 0.87944847, + "num_input_tokens_seen": 145696416, + "router_z_loss_mlp": 0.45605469, + "step": 1765, + "time_per_iteration": 2.8936946392059326 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054825, + "balance_loss_mlp": 1.01057434, + "epoch": 0.3397460561754521, + "flos": 1456589191680.0, + "grad_norm": 0.006269192400269108, + "language_loss": 0.76374954, + "learning_rate": 0.0007688176522921302, + "loss": 0.77429777, + "num_input_tokens_seen": 145919680, + "router_z_loss_mlp": 0.44335938, + "step": 1766, + "time_per_iteration": 4.913206100463867 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054352, + "balance_loss_mlp": 1.00855207, + "epoch": 0.33993843786071565, + "flos": 488291384832.0, + "grad_norm": 0.039386286306125895, + "language_loss": 0.89967024, + "learning_rate": 0.0007685549146641262, + "loss": 0.91021377, + "num_input_tokens_seen": 145984272, + "router_z_loss_mlp": 0.45727539, + "step": 1767, + "time_per_iteration": 2.593353271484375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050967, + "balance_loss_mlp": 1.00554788, + "epoch": 0.34013081954597923, + "flos": 418233296640.0, + "grad_norm": 0.032458575290873634, + "language_loss": 0.89062989, + "learning_rate": 0.0007682920727738579, + "loss": 0.90113962, + "num_input_tokens_seen": 146047248, + "router_z_loss_mlp": 0.45336914, + "step": 1768, + "time_per_iteration": 2.510331392288208 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054131, + "balance_loss_mlp": 1.00835514, + "epoch": 0.34032320123124277, + "flos": 438430976256.0, + "grad_norm": 0.037803385345055784, + "language_loss": 0.85379529, + "learning_rate": 0.000768029126723369, + "loss": 0.86433661, + "num_input_tokens_seen": 146111872, + "router_z_loss_mlp": 0.45703125, + "step": 1769, + "time_per_iteration": 2.5152533054351807 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054345, + "balance_loss_mlp": 1.00852144, + "epoch": 0.34051558291650635, + "flos": 458544085248.0, + "grad_norm": 0.04157155741286578, + "language_loss": 0.82432753, + "learning_rate": 0.0007677660766147447, + "loss": 0.83487099, + "num_input_tokens_seen": 146172608, + "router_z_loss_mlp": 0.45751953, + "step": 1770, + "time_per_iteration": 2.5669522285461426 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052738, + "balance_loss_mlp": 1.00858307, + "epoch": 0.3407079646017699, + "flos": 1562140489728.0, + "grad_norm": 0.006526141838203855, + "language_loss": 0.72470945, + "learning_rate": 0.0007675029225501102, + "loss": 0.73523682, + "num_input_tokens_seen": 146413584, + "router_z_loss_mlp": 0.44238281, + "step": 1771, + "time_per_iteration": 4.953578233718872 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051633, + "balance_loss_mlp": 1.00602317, + "epoch": 0.3409003462870335, + "flos": 493531816704.0, + "grad_norm": 0.043561887450476046, + "language_loss": 0.80659652, + "learning_rate": 0.0007672396646316306, + "loss": 0.81711292, + "num_input_tokens_seen": 146476992, + "router_z_loss_mlp": 0.45532227, + "step": 1772, + "time_per_iteration": 2.5720248222351074 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048985, + "balance_loss_mlp": 1.00356674, + "epoch": 0.34109272797229706, + "flos": 809822150400.0, + "grad_norm": 0.03735237922314452, + "language_loss": 0.80629146, + "learning_rate": 0.000766976302961512, + "loss": 0.81678128, + "num_input_tokens_seen": 146552848, + "router_z_loss_mlp": 0.45336914, + "step": 1773, + "time_per_iteration": 3.0438191890716553 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050599, + "balance_loss_mlp": 1.00513268, + "epoch": 0.3412851096575606, + "flos": 471100319232.0, + "grad_norm": 0.03730121261656314, + "language_loss": 0.82086515, + "learning_rate": 0.0007667128376420003, + "loss": 0.83137119, + "num_input_tokens_seen": 146617504, + "router_z_loss_mlp": 0.45385742, + "step": 1774, + "time_per_iteration": 2.5461959838867188 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052091, + "balance_loss_mlp": 1.00681531, + "epoch": 0.3414774913428242, + "flos": 596771529216.0, + "grad_norm": 0.03978671612524881, + "language_loss": 0.85611963, + "learning_rate": 0.0007664492687753817, + "loss": 0.86664057, + "num_input_tokens_seen": 146691568, + "router_z_loss_mlp": 0.4519043, + "step": 1775, + "time_per_iteration": 2.7454183101654053 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049019, + "balance_loss_mlp": 1.00362372, + "epoch": 0.3416698730280877, + "flos": 528508854528.0, + "grad_norm": 0.03225195621375244, + "language_loss": 0.82109249, + "learning_rate": 0.000766185596463983, + "loss": 0.83158267, + "num_input_tokens_seen": 146764208, + "router_z_loss_mlp": 0.453125, + "step": 1776, + "time_per_iteration": 2.636876106262207 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050456, + "balance_loss_mlp": 1.00513279, + "epoch": 0.3418622547133513, + "flos": 876118324992.0, + "grad_norm": 0.033083928099711564, + "language_loss": 0.77454132, + "learning_rate": 0.0007659218208101706, + "loss": 0.78504586, + "num_input_tokens_seen": 146847744, + "router_z_loss_mlp": 0.45239258, + "step": 1777, + "time_per_iteration": 3.097163677215576 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055706, + "balance_loss_mlp": 1.01031137, + "epoch": 0.34205463639861483, + "flos": 604877624064.0, + "grad_norm": 0.03453483859247358, + "language_loss": 0.86064076, + "learning_rate": 0.0007656579419163515, + "loss": 0.87119782, + "num_input_tokens_seen": 146918336, + "router_z_loss_mlp": 0.453125, + "step": 1778, + "time_per_iteration": 2.7452263832092285 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055225, + "balance_loss_mlp": 1.0096159, + "epoch": 0.3422470180838784, + "flos": 464715760896.0, + "grad_norm": 0.037184345749469765, + "language_loss": 0.77793133, + "learning_rate": 0.0007653939598849724, + "loss": 0.78848356, + "num_input_tokens_seen": 146982496, + "router_z_loss_mlp": 0.45532227, + "step": 1779, + "time_per_iteration": 2.5020663738250732 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01057175, + "balance_loss_mlp": 1.0134964, + "epoch": 0.34243939976914195, + "flos": 1589819222016.0, + "grad_norm": 0.009860928497574006, + "language_loss": 0.82880205, + "learning_rate": 0.0007651298748185204, + "loss": 0.83937383, + "num_input_tokens_seen": 147213600, + "router_z_loss_mlp": 0.4375, + "step": 1780, + "time_per_iteration": 4.958939552307129 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054148, + "balance_loss_mlp": 1.00849116, + "epoch": 0.34263178145440554, + "flos": 874444420608.0, + "grad_norm": 0.034671274665512654, + "language_loss": 0.80890739, + "learning_rate": 0.000764865686819522, + "loss": 0.81944883, + "num_input_tokens_seen": 147287664, + "router_z_loss_mlp": 0.45581055, + "step": 1781, + "time_per_iteration": 3.0468943119049072 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01057142, + "balance_loss_mlp": 1.01148522, + "epoch": 0.3428241631396691, + "flos": 507874715904.0, + "grad_norm": 0.02984044691012994, + "language_loss": 0.86276633, + "learning_rate": 0.0007646013959905449, + "loss": 0.87333775, + "num_input_tokens_seen": 147356800, + "router_z_loss_mlp": 0.45581055, + "step": 1782, + "time_per_iteration": 2.59788179397583 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01056783, + "balance_loss_mlp": 1.01114941, + "epoch": 0.34301654482493266, + "flos": 881525952768.0, + "grad_norm": 0.034646354408830966, + "language_loss": 0.81384498, + "learning_rate": 0.0007643370024341949, + "loss": 0.82441282, + "num_input_tokens_seen": 147432496, + "router_z_loss_mlp": 0.45556641, + "step": 1783, + "time_per_iteration": 3.0783512592315674 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048113, + "balance_loss_mlp": 1.00288546, + "epoch": 0.34320892651019624, + "flos": 432669514752.0, + "grad_norm": 0.031189947688426686, + "language_loss": 0.84145617, + "learning_rate": 0.0007640725062531195, + "loss": 0.85193729, + "num_input_tokens_seen": 147495856, + "router_z_loss_mlp": 0.45141602, + "step": 1784, + "time_per_iteration": 2.5152812004089355 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050969, + "balance_loss_mlp": 1.00559807, + "epoch": 0.3434013081954598, + "flos": 464594251776.0, + "grad_norm": 0.03760163078295718, + "language_loss": 0.86810297, + "learning_rate": 0.0007638079075500047, + "loss": 0.87861264, + "num_input_tokens_seen": 147559632, + "router_z_loss_mlp": 0.45288086, + "step": 1785, + "time_per_iteration": 2.5846633911132812 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045052, + "balance_loss_mlp": 1.0019455, + "epoch": 0.34359368988072336, + "flos": 1560677522688.0, + "grad_norm": 0.003111664808940008, + "language_loss": 0.75180668, + "learning_rate": 0.0007635432064275772, + "loss": 0.76225722, + "num_input_tokens_seen": 147794576, + "router_z_loss_mlp": 0.43164062, + "step": 1786, + "time_per_iteration": 4.94433856010437 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010488, + "balance_loss_mlp": 1.003739, + "epoch": 0.3437860715659869, + "flos": 496573423872.0, + "grad_norm": 0.03208809815455149, + "language_loss": 0.83580017, + "learning_rate": 0.0007632784029886026, + "loss": 0.8462882, + "num_input_tokens_seen": 147866960, + "router_z_loss_mlp": 0.45019531, + "step": 1787, + "time_per_iteration": 2.6222987174987793 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050031, + "balance_loss_mlp": 1.00523186, + "epoch": 0.3439784532512505, + "flos": 719610124032.0, + "grad_norm": 0.03771035877194531, + "language_loss": 0.86448389, + "learning_rate": 0.0007630134973358873, + "loss": 0.87498415, + "num_input_tokens_seen": 147947808, + "router_z_loss_mlp": 0.44799805, + "step": 1788, + "time_per_iteration": 2.9359545707702637 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047985, + "balance_loss_mlp": 1.00320995, + "epoch": 0.34417083493651407, + "flos": 566922162432.0, + "grad_norm": 0.0315223877917514, + "language_loss": 0.8730194, + "learning_rate": 0.0007627484895722763, + "loss": 0.88349926, + "num_input_tokens_seen": 148015936, + "router_z_loss_mlp": 0.44775391, + "step": 1789, + "time_per_iteration": 2.710433006286621 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048608, + "balance_loss_mlp": 1.00397587, + "epoch": 0.3443632166217776, + "flos": 797702375424.0, + "grad_norm": 0.034658336241014505, + "language_loss": 0.80973929, + "learning_rate": 0.0007624833798006552, + "loss": 0.82022536, + "num_input_tokens_seen": 148099776, + "router_z_loss_mlp": 0.4465332, + "step": 1790, + "time_per_iteration": 3.061995506286621 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049093, + "balance_loss_mlp": 1.00419891, + "epoch": 0.3445555983070412, + "flos": 570393425664.0, + "grad_norm": 0.0359941873064626, + "language_loss": 0.84664464, + "learning_rate": 0.0007622181681239483, + "loss": 0.85713559, + "num_input_tokens_seen": 148169616, + "router_z_loss_mlp": 0.44873047, + "step": 1791, + "time_per_iteration": 2.708204984664917 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046554, + "balance_loss_mlp": 1.00192165, + "epoch": 0.3447479799923047, + "flos": 569981266176.0, + "grad_norm": 0.030307911746310208, + "language_loss": 0.85264516, + "learning_rate": 0.0007619528546451202, + "loss": 0.86311066, + "num_input_tokens_seen": 148247824, + "router_z_loss_mlp": 0.4465332, + "step": 1792, + "time_per_iteration": 2.8142476081848145 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047842, + "balance_loss_mlp": 1.00323367, + "epoch": 0.3449403616775683, + "flos": 969333074688.0, + "grad_norm": 0.03266645448260783, + "language_loss": 0.84415537, + "learning_rate": 0.0007616874394671745, + "loss": 0.85463381, + "num_input_tokens_seen": 148333040, + "router_z_loss_mlp": 0.4465332, + "step": 1793, + "time_per_iteration": 3.340257406234741 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048671, + "balance_loss_mlp": 1.00411057, + "epoch": 0.34513274336283184, + "flos": 569677009920.0, + "grad_norm": 0.042713127170940564, + "language_loss": 0.85883492, + "learning_rate": 0.0007614219226931547, + "loss": 0.86932158, + "num_input_tokens_seen": 148401840, + "router_z_loss_mlp": 0.44604492, + "step": 1794, + "time_per_iteration": 2.666299343109131 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047529, + "balance_loss_mlp": 1.00301611, + "epoch": 0.3453251250480954, + "flos": 461858846208.0, + "grad_norm": 0.03409376285864792, + "language_loss": 0.85191298, + "learning_rate": 0.0007611563044261435, + "loss": 0.86238825, + "num_input_tokens_seen": 148466576, + "router_z_loss_mlp": 0.44580078, + "step": 1795, + "time_per_iteration": 2.509730577468872 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047985, + "balance_loss_mlp": 1.00340092, + "epoch": 0.34551750673335896, + "flos": 416520508416.0, + "grad_norm": 0.03871598691360063, + "language_loss": 0.87655377, + "learning_rate": 0.0007608905847692631, + "loss": 0.88703358, + "num_input_tokens_seen": 148530016, + "router_z_loss_mlp": 0.4465332, + "step": 1796, + "time_per_iteration": 2.468144416809082 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045751, + "balance_loss_mlp": 1.0012145, + "epoch": 0.34570988841862255, + "flos": 589115499264.0, + "grad_norm": 0.03133980127061019, + "language_loss": 0.87422049, + "learning_rate": 0.0007606247638256749, + "loss": 0.88467801, + "num_input_tokens_seen": 148610064, + "router_z_loss_mlp": 0.44580078, + "step": 1797, + "time_per_iteration": 2.8401029109954834 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050308, + "balance_loss_mlp": 1.00758362, + "epoch": 0.34590227010388613, + "flos": 1571145747456.0, + "grad_norm": 0.007450888717391324, + "language_loss": 0.78170294, + "learning_rate": 0.0007603588416985798, + "loss": 0.79220599, + "num_input_tokens_seen": 148835872, + "router_z_loss_mlp": 0.42773438, + "step": 1798, + "time_per_iteration": 4.913544178009033 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043892, + "balance_loss_mlp": 1.00097656, + "epoch": 0.34609465178914967, + "flos": 1540930886400.0, + "grad_norm": 0.004797214297707501, + "language_loss": 0.79327202, + "learning_rate": 0.0007600928184912179, + "loss": 0.80371094, + "num_input_tokens_seen": 149066864, + "router_z_loss_mlp": 0.4296875, + "step": 1799, + "time_per_iteration": 4.771878719329834 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049395, + "balance_loss_mlp": 1.00469148, + "epoch": 0.34628703347441325, + "flos": 610517576448.0, + "grad_norm": 0.037119753663607306, + "language_loss": 0.86850703, + "learning_rate": 0.0007598266943068686, + "loss": 0.8790009, + "num_input_tokens_seen": 149141600, + "router_z_loss_mlp": 0.44750977, + "step": 1800, + "time_per_iteration": 2.746819496154785 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050462, + "balance_loss_mlp": 1.00535274, + "epoch": 0.3464794151596768, + "flos": 474265380864.0, + "grad_norm": 0.03436691989893219, + "language_loss": 0.84791839, + "learning_rate": 0.0007595604692488507, + "loss": 0.85842299, + "num_input_tokens_seen": 149205888, + "router_z_loss_mlp": 0.45019531, + "step": 1801, + "time_per_iteration": 2.564328908920288 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050744, + "balance_loss_mlp": 1.00587356, + "epoch": 0.34667179684494037, + "flos": 606822736896.0, + "grad_norm": 0.03808690892272381, + "language_loss": 0.83437663, + "learning_rate": 0.0007592941434205215, + "loss": 0.8448841, + "num_input_tokens_seen": 149281280, + "router_z_loss_mlp": 0.44848633, + "step": 1802, + "time_per_iteration": 2.826420545578003 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059933, + "balance_loss_mlp": 1.016922, + "epoch": 0.3468641785302039, + "flos": 1568362709760.0, + "grad_norm": 0.013636299413791342, + "language_loss": 0.73571062, + "learning_rate": 0.0007590277169252782, + "loss": 0.74630988, + "num_input_tokens_seen": 149525008, + "router_z_loss_mlp": 0.43066406, + "step": 1803, + "time_per_iteration": 5.063625812530518 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050175, + "balance_loss_mlp": 1.00523341, + "epoch": 0.3470565602154675, + "flos": 908724484608.0, + "grad_norm": 0.03942668215130471, + "language_loss": 0.80763334, + "learning_rate": 0.0007587611898665566, + "loss": 0.81813502, + "num_input_tokens_seen": 149600624, + "router_z_loss_mlp": 0.44921875, + "step": 1804, + "time_per_iteration": 3.0834579467773438 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050119, + "balance_loss_mlp": 1.0052247, + "epoch": 0.347248941900731, + "flos": 640060741632.0, + "grad_norm": 0.031209613313051415, + "language_loss": 0.82727098, + "learning_rate": 0.0007584945623478315, + "loss": 0.83777213, + "num_input_tokens_seen": 149674224, + "router_z_loss_mlp": 0.44873047, + "step": 1805, + "time_per_iteration": 2.861560106277466 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051541, + "balance_loss_mlp": 1.00688517, + "epoch": 0.3474413235859946, + "flos": 848782732800.0, + "grad_norm": 0.03633023546687314, + "language_loss": 0.81859386, + "learning_rate": 0.000758227834472617, + "loss": 0.82910925, + "num_input_tokens_seen": 149758688, + "router_z_loss_mlp": 0.44702148, + "step": 1806, + "time_per_iteration": 3.0337021350860596 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052212, + "balance_loss_mlp": 1.00767589, + "epoch": 0.3476337052712582, + "flos": 516697226496.0, + "grad_norm": 0.035243207865769656, + "language_loss": 0.77929807, + "learning_rate": 0.0007579610063444664, + "loss": 0.78982013, + "num_input_tokens_seen": 149831648, + "router_z_loss_mlp": 0.44580078, + "step": 1807, + "time_per_iteration": 2.7339653968811035 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01056132, + "balance_loss_mlp": 1.01154768, + "epoch": 0.34782608695652173, + "flos": 915115845888.0, + "grad_norm": 0.03414685220945043, + "language_loss": 0.88006967, + "learning_rate": 0.0007576940780669712, + "loss": 0.89063108, + "num_input_tokens_seen": 149919440, + "router_z_loss_mlp": 0.4465332, + "step": 1808, + "time_per_iteration": 3.211806058883667 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051889, + "balance_loss_mlp": 1.00756717, + "epoch": 0.3480184686417853, + "flos": 775084240128.0, + "grad_norm": 0.07111913657628408, + "language_loss": 0.84903318, + "learning_rate": 0.0007574270497437624, + "loss": 0.85955209, + "num_input_tokens_seen": 150001632, + "router_z_loss_mlp": 0.4440918, + "step": 1809, + "time_per_iteration": 2.984511375427246 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049455, + "balance_loss_mlp": 1.00518048, + "epoch": 0.34821085032704885, + "flos": 578004735744.0, + "grad_norm": 0.031195535995176178, + "language_loss": 0.88877916, + "learning_rate": 0.000757159921478509, + "loss": 0.89927369, + "num_input_tokens_seen": 150077552, + "router_z_loss_mlp": 0.44360352, + "step": 1810, + "time_per_iteration": 2.778917074203491 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051422, + "balance_loss_mlp": 1.00888824, + "epoch": 0.34840323201231244, + "flos": 1528042205952.0, + "grad_norm": 0.009192534613281171, + "language_loss": 0.74450636, + "learning_rate": 0.0007568926933749201, + "loss": 0.75502062, + "num_input_tokens_seen": 150295328, + "router_z_loss_mlp": 0.42578125, + "step": 1811, + "time_per_iteration": 4.791734218597412 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048241, + "balance_loss_mlp": 1.0040617, + "epoch": 0.34859561369757597, + "flos": 510182410752.0, + "grad_norm": 0.038842956055274956, + "language_loss": 0.88272417, + "learning_rate": 0.0007566253655367423, + "loss": 0.89320654, + "num_input_tokens_seen": 150360496, + "router_z_loss_mlp": 0.44262695, + "step": 1812, + "time_per_iteration": 2.6542506217956543 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050997, + "balance_loss_mlp": 1.00689006, + "epoch": 0.34878799538283956, + "flos": 549757341696.0, + "grad_norm": 0.030689577509801048, + "language_loss": 0.90222162, + "learning_rate": 0.000756357938067762, + "loss": 0.91273159, + "num_input_tokens_seen": 150432064, + "router_z_loss_mlp": 0.44189453, + "step": 1813, + "time_per_iteration": 2.6897120475769043 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047642, + "balance_loss_mlp": 1.00346339, + "epoch": 0.34898037706810314, + "flos": 985195321344.0, + "grad_norm": 0.03422241032564105, + "language_loss": 0.83499646, + "learning_rate": 0.0007560904110718033, + "loss": 0.84547287, + "num_input_tokens_seen": 150512176, + "router_z_loss_mlp": 0.44262695, + "step": 1814, + "time_per_iteration": 3.3129422664642334 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045129, + "balance_loss_mlp": 1.00102115, + "epoch": 0.3491727587533667, + "flos": 682837672704.0, + "grad_norm": 0.03439092984945392, + "language_loss": 0.84187126, + "learning_rate": 0.0007558227846527297, + "loss": 0.85232258, + "num_input_tokens_seen": 150586416, + "router_z_loss_mlp": 0.44189453, + "step": 1815, + "time_per_iteration": 2.8228747844696045 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052765, + "balance_loss_mlp": 1.00880051, + "epoch": 0.34936514043863026, + "flos": 394889997312.0, + "grad_norm": 0.04066201843968592, + "language_loss": 0.84257603, + "learning_rate": 0.0007555550589144429, + "loss": 0.8531037, + "num_input_tokens_seen": 150648944, + "router_z_loss_mlp": 0.44042969, + "step": 1816, + "time_per_iteration": 2.4170055389404297 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053968, + "balance_loss_mlp": 1.01000416, + "epoch": 0.3495575221238938, + "flos": 462340992000.0, + "grad_norm": 0.036355924698056825, + "language_loss": 0.84744954, + "learning_rate": 0.000755287233960883, + "loss": 0.85798925, + "num_input_tokens_seen": 150717200, + "router_z_loss_mlp": 0.44042969, + "step": 1817, + "time_per_iteration": 2.577195405960083 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055263, + "balance_loss_mlp": 1.01115596, + "epoch": 0.3497499038091574, + "flos": 725429911296.0, + "grad_norm": 0.037028935917378006, + "language_loss": 0.78975379, + "learning_rate": 0.0007550193098960292, + "loss": 0.80030644, + "num_input_tokens_seen": 150790368, + "router_z_loss_mlp": 0.44189453, + "step": 1818, + "time_per_iteration": 2.9124276638031006 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050391, + "balance_loss_mlp": 1.00609303, + "epoch": 0.3499422854944209, + "flos": 829197456384.0, + "grad_norm": 0.03031702063556045, + "language_loss": 0.8721534, + "learning_rate": 0.0007547512868238988, + "loss": 0.88265729, + "num_input_tokens_seen": 150879872, + "router_z_loss_mlp": 0.44384766, + "step": 1819, + "time_per_iteration": 3.1275570392608643 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046383, + "balance_loss_mlp": 1.00203693, + "epoch": 0.3501346671796845, + "flos": 494543740416.0, + "grad_norm": 0.03689243892136314, + "language_loss": 0.8434422, + "learning_rate": 0.0007544831648485473, + "loss": 0.85390604, + "num_input_tokens_seen": 150953712, + "router_z_loss_mlp": 0.44433594, + "step": 1820, + "time_per_iteration": 2.6672415733337402 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053354, + "balance_loss_mlp": 1.00917482, + "epoch": 0.35032704886494803, + "flos": 579849726720.0, + "grad_norm": 0.04031883928972686, + "language_loss": 0.8166672, + "learning_rate": 0.0007542149440740694, + "loss": 0.82720077, + "num_input_tokens_seen": 151026192, + "router_z_loss_mlp": 0.44262695, + "step": 1821, + "time_per_iteration": 2.659205436706543 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051466, + "balance_loss_mlp": 1.0069536, + "epoch": 0.3505194305502116, + "flos": 585832819200.0, + "grad_norm": 0.035872862949689145, + "language_loss": 0.86380953, + "learning_rate": 0.000753946624604597, + "loss": 0.8743242, + "num_input_tokens_seen": 151100720, + "router_z_loss_mlp": 0.44604492, + "step": 1822, + "time_per_iteration": 2.748387575149536 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049848, + "balance_loss_mlp": 1.00528705, + "epoch": 0.3507118122354752, + "flos": 527979076608.0, + "grad_norm": 0.036265727976650085, + "language_loss": 0.88431466, + "learning_rate": 0.0007536782065443015, + "loss": 0.89481318, + "num_input_tokens_seen": 151166032, + "router_z_loss_mlp": 0.44628906, + "step": 1823, + "time_per_iteration": 2.608429193496704 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054753, + "balance_loss_mlp": 1.00997818, + "epoch": 0.35090419392073874, + "flos": 512546486016.0, + "grad_norm": 0.039277226542114754, + "language_loss": 0.75647306, + "learning_rate": 0.0007534096899973919, + "loss": 0.76702058, + "num_input_tokens_seen": 151232208, + "router_z_loss_mlp": 0.44799805, + "step": 1824, + "time_per_iteration": 2.702721118927002 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049221, + "balance_loss_mlp": 1.0046134, + "epoch": 0.3510965756060023, + "flos": 565196735232.0, + "grad_norm": 0.031185756782702443, + "language_loss": 0.83427215, + "learning_rate": 0.0007531410750681154, + "loss": 0.84476435, + "num_input_tokens_seen": 151308128, + "router_z_loss_mlp": 0.44677734, + "step": 1825, + "time_per_iteration": 2.7568912506103516 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053165, + "balance_loss_mlp": 1.00831807, + "epoch": 0.35128895729126586, + "flos": 1022254532352.0, + "grad_norm": 0.030666943866844928, + "language_loss": 0.87304175, + "learning_rate": 0.0007528723618607575, + "loss": 0.88357341, + "num_input_tokens_seen": 151402560, + "router_z_loss_mlp": 0.44848633, + "step": 1826, + "time_per_iteration": 3.4575371742248535 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049762, + "balance_loss_mlp": 1.00510669, + "epoch": 0.35148133897652944, + "flos": 589425591552.0, + "grad_norm": 0.04947505148138052, + "language_loss": 0.83428013, + "learning_rate": 0.0007526035504796422, + "loss": 0.84477776, + "num_input_tokens_seen": 151478816, + "router_z_loss_mlp": 0.44702148, + "step": 1827, + "time_per_iteration": 2.7913553714752197 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053021, + "balance_loss_mlp": 1.00838912, + "epoch": 0.351673720661793, + "flos": 496286664192.0, + "grad_norm": 0.03604129919469899, + "language_loss": 0.87358594, + "learning_rate": 0.0007523346410291312, + "loss": 0.88411617, + "num_input_tokens_seen": 151554528, + "router_z_loss_mlp": 0.44702148, + "step": 1828, + "time_per_iteration": 2.769817590713501 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049932, + "balance_loss_mlp": 1.00499058, + "epoch": 0.35186610234705656, + "flos": 763999721472.0, + "grad_norm": 0.036507155273352104, + "language_loss": 0.85486639, + "learning_rate": 0.0007520656336136245, + "loss": 0.86536574, + "num_input_tokens_seen": 151629440, + "router_z_loss_mlp": 0.44921875, + "step": 1829, + "time_per_iteration": 2.960890293121338 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048327, + "balance_loss_mlp": 1.00364745, + "epoch": 0.3520584840323201, + "flos": 627389801472.0, + "grad_norm": 0.0323509050656096, + "language_loss": 0.88885164, + "learning_rate": 0.0007517965283375599, + "loss": 0.89933491, + "num_input_tokens_seen": 151708544, + "router_z_loss_mlp": 0.44702148, + "step": 1830, + "time_per_iteration": 2.868405818939209 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047282, + "balance_loss_mlp": 1.00260293, + "epoch": 0.3522508657175837, + "flos": 538449246720.0, + "grad_norm": 0.03139560131485747, + "language_loss": 0.89993465, + "learning_rate": 0.0007515273253054132, + "loss": 0.91040754, + "num_input_tokens_seen": 151779152, + "router_z_loss_mlp": 0.44726562, + "step": 1831, + "time_per_iteration": 2.6341445446014404 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104766, + "balance_loss_mlp": 1.00298083, + "epoch": 0.35244324740284727, + "flos": 568502747904.0, + "grad_norm": 0.03545868131612223, + "language_loss": 0.83198845, + "learning_rate": 0.0007512580246216988, + "loss": 0.8424651, + "num_input_tokens_seen": 151853216, + "router_z_loss_mlp": 0.44726562, + "step": 1832, + "time_per_iteration": 2.691678524017334 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053022, + "balance_loss_mlp": 1.00860476, + "epoch": 0.3526356290881108, + "flos": 514055139840.0, + "grad_norm": 0.03517539350184397, + "language_loss": 0.85415643, + "learning_rate": 0.000750988626390968, + "loss": 0.86468661, + "num_input_tokens_seen": 151920416, + "router_z_loss_mlp": 0.44506836, + "step": 1833, + "time_per_iteration": 2.6027944087982178 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050095, + "balance_loss_mlp": 1.00577271, + "epoch": 0.3528280107733744, + "flos": 596973718272.0, + "grad_norm": 0.033457257877764275, + "language_loss": 0.85569251, + "learning_rate": 0.0007507191307178108, + "loss": 0.86619347, + "num_input_tokens_seen": 151990848, + "router_z_loss_mlp": 0.4440918, + "step": 1834, + "time_per_iteration": 2.8065004348754883 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054314, + "balance_loss_mlp": 1.00999165, + "epoch": 0.3530203924586379, + "flos": 552299306496.0, + "grad_norm": 0.040042804692427734, + "language_loss": 0.75668854, + "learning_rate": 0.0007504495377068543, + "loss": 0.76723164, + "num_input_tokens_seen": 152064864, + "router_z_loss_mlp": 0.4440918, + "step": 1835, + "time_per_iteration": 2.736536741256714 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052598, + "balance_loss_mlp": 1.00832355, + "epoch": 0.3532127741439015, + "flos": 654306431232.0, + "grad_norm": 0.0387965270782292, + "language_loss": 0.82353514, + "learning_rate": 0.0007501798474627642, + "loss": 0.83406115, + "num_input_tokens_seen": 152150096, + "router_z_loss_mlp": 0.44360352, + "step": 1836, + "time_per_iteration": 2.9019014835357666 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052616, + "balance_loss_mlp": 1.00824583, + "epoch": 0.35340515582916504, + "flos": 724151636736.0, + "grad_norm": 0.03634896017563763, + "language_loss": 0.84383756, + "learning_rate": 0.0007499100600902433, + "loss": 0.85436368, + "num_input_tokens_seen": 152232528, + "router_z_loss_mlp": 0.44458008, + "step": 1837, + "time_per_iteration": 3.0071663856506348 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105242, + "balance_loss_mlp": 1.00812232, + "epoch": 0.35359753751442863, + "flos": 595998733056.0, + "grad_norm": 0.039287132740407786, + "language_loss": 0.853827, + "learning_rate": 0.0007496401756940324, + "loss": 0.86435115, + "num_input_tokens_seen": 152299584, + "router_z_loss_mlp": 0.44384766, + "step": 1838, + "time_per_iteration": 2.6924545764923096 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052318, + "balance_loss_mlp": 1.00780547, + "epoch": 0.3537899191996922, + "flos": 633806440704.0, + "grad_norm": 0.041905435038062475, + "language_loss": 0.83424079, + "learning_rate": 0.0007493701943789098, + "loss": 0.84476393, + "num_input_tokens_seen": 152370368, + "router_z_loss_mlp": 0.44580078, + "step": 1839, + "time_per_iteration": 2.744781970977783 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051713, + "balance_loss_mlp": 1.00727141, + "epoch": 0.35398230088495575, + "flos": 507353686272.0, + "grad_norm": 0.0353986915713622, + "language_loss": 0.8339026, + "learning_rate": 0.000749100116249692, + "loss": 0.84441972, + "num_input_tokens_seen": 152436928, + "router_z_loss_mlp": 0.44506836, + "step": 1840, + "time_per_iteration": 2.5823822021484375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049367, + "balance_loss_mlp": 1.00490189, + "epoch": 0.35417468257021933, + "flos": 509047032576.0, + "grad_norm": 0.03988576427868324, + "language_loss": 0.86907303, + "learning_rate": 0.0007488299414112321, + "loss": 0.87956673, + "num_input_tokens_seen": 152505952, + "router_z_loss_mlp": 0.4453125, + "step": 1841, + "time_per_iteration": 2.6171295642852783 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055735, + "balance_loss_mlp": 1.01126969, + "epoch": 0.35436706425548287, + "flos": 657660076032.0, + "grad_norm": 0.035376771477334756, + "language_loss": 0.78015333, + "learning_rate": 0.0007485596699684215, + "loss": 0.79071069, + "num_input_tokens_seen": 152577408, + "router_z_loss_mlp": 0.44555664, + "step": 1842, + "time_per_iteration": 2.8393046855926514 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070571, + "balance_loss_mlp": 1.02572489, + "epoch": 0.35455944594074645, + "flos": 653889414144.0, + "grad_norm": 0.03498191670442302, + "language_loss": 0.86517459, + "learning_rate": 0.000748289302026189, + "loss": 0.87588024, + "num_input_tokens_seen": 152654480, + "router_z_loss_mlp": 0.44848633, + "step": 1843, + "time_per_iteration": 2.8524656295776367 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060154, + "balance_loss_mlp": 1.01566541, + "epoch": 0.35475182762601, + "flos": 850011429888.0, + "grad_norm": 0.03510464987001869, + "language_loss": 0.86422503, + "learning_rate": 0.0007480188376895004, + "loss": 0.87482655, + "num_input_tokens_seen": 152732304, + "router_z_loss_mlp": 0.4453125, + "step": 1844, + "time_per_iteration": 3.1228320598602295 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048378, + "balance_loss_mlp": 1.00584412, + "epoch": 0.3549442093112736, + "flos": 1524777989376.0, + "grad_norm": 0.00626506088035535, + "language_loss": 0.7381134, + "learning_rate": 0.0007477482770633596, + "loss": 0.74859715, + "num_input_tokens_seen": 152965952, + "router_z_loss_mlp": 0.42578125, + "step": 1845, + "time_per_iteration": 4.8881309032440186 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053267, + "balance_loss_mlp": 1.00906432, + "epoch": 0.3551365909965371, + "flos": 652715152128.0, + "grad_norm": 0.03760423595997357, + "language_loss": 0.78996736, + "learning_rate": 0.0007474776202528074, + "loss": 0.80050004, + "num_input_tokens_seen": 153053088, + "router_z_loss_mlp": 0.44287109, + "step": 1846, + "time_per_iteration": 2.9740474224090576 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055296, + "balance_loss_mlp": 1.01118839, + "epoch": 0.3553289726818007, + "flos": 898923098112.0, + "grad_norm": 0.04404679517400465, + "language_loss": 0.81547415, + "learning_rate": 0.000747206867362922, + "loss": 0.82602704, + "num_input_tokens_seen": 153129216, + "router_z_loss_mlp": 0.44189453, + "step": 1847, + "time_per_iteration": 3.0834994316101074 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052455, + "balance_loss_mlp": 1.00822854, + "epoch": 0.3555213543670643, + "flos": 689734512384.0, + "grad_norm": 0.03965516085145463, + "language_loss": 0.8451193, + "learning_rate": 0.0007469360184988194, + "loss": 0.85564387, + "num_input_tokens_seen": 153199360, + "router_z_loss_mlp": 0.44311523, + "step": 1848, + "time_per_iteration": 2.8074848651885986 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050526, + "balance_loss_mlp": 1.00632286, + "epoch": 0.3557137360523278, + "flos": 539604066816.0, + "grad_norm": 0.033414642983477745, + "language_loss": 0.87585986, + "learning_rate": 0.0007466650737656518, + "loss": 0.88636506, + "num_input_tokens_seen": 153269168, + "router_z_loss_mlp": 0.44287109, + "step": 1849, + "time_per_iteration": 2.604926347732544 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049824, + "balance_loss_mlp": 1.00562072, + "epoch": 0.3559061177375914, + "flos": 403154539776.0, + "grad_norm": 0.03235738057519393, + "language_loss": 0.9068622, + "learning_rate": 0.0007463940332686098, + "loss": 0.91736042, + "num_input_tokens_seen": 153333120, + "router_z_loss_mlp": 0.44287109, + "step": 1850, + "time_per_iteration": 2.4913558959960938 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01056333, + "balance_loss_mlp": 1.01196373, + "epoch": 0.35609849942285493, + "flos": 697895042304.0, + "grad_norm": 0.0320980052654178, + "language_loss": 0.85078359, + "learning_rate": 0.0007461228971129205, + "loss": 0.86134696, + "num_input_tokens_seen": 153407600, + "router_z_loss_mlp": 0.44458008, + "step": 1851, + "time_per_iteration": 2.898726463317871 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059942, + "balance_loss_mlp": 1.01557255, + "epoch": 0.3562908811081185, + "flos": 570002653440.0, + "grad_norm": 0.036011031747473804, + "language_loss": 0.86088216, + "learning_rate": 0.0007458516654038483, + "loss": 0.87148154, + "num_input_tokens_seen": 153477408, + "router_z_loss_mlp": 0.44458008, + "step": 1852, + "time_per_iteration": 2.6340625286102295 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050769, + "balance_loss_mlp": 1.00651896, + "epoch": 0.35648326279338205, + "flos": 683610468864.0, + "grad_norm": 0.03085087761867809, + "language_loss": 0.87196577, + "learning_rate": 0.0007455803382466946, + "loss": 0.88247347, + "num_input_tokens_seen": 153551888, + "router_z_loss_mlp": 0.44335938, + "step": 1853, + "time_per_iteration": 2.7936782836914062 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048744, + "balance_loss_mlp": 1.00468445, + "epoch": 0.35667564447864564, + "flos": 630341980416.0, + "grad_norm": 0.02905562967314866, + "language_loss": 0.8756358, + "learning_rate": 0.0007453089157467979, + "loss": 0.88612318, + "num_input_tokens_seen": 153626912, + "router_z_loss_mlp": 0.44140625, + "step": 1854, + "time_per_iteration": 2.8003768920898438 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053337, + "balance_loss_mlp": 1.00920558, + "epoch": 0.35686802616390917, + "flos": 815505844224.0, + "grad_norm": 0.03187136352260198, + "language_loss": 0.82840991, + "learning_rate": 0.0007450373980095341, + "loss": 0.83894324, + "num_input_tokens_seen": 153711312, + "router_z_loss_mlp": 0.44213867, + "step": 1855, + "time_per_iteration": 3.072218179702759 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052734, + "balance_loss_mlp": 1.00869787, + "epoch": 0.35706040784917276, + "flos": 527206280448.0, + "grad_norm": 0.03314729603592228, + "language_loss": 0.87318838, + "learning_rate": 0.0007447657851403155, + "loss": 0.88371575, + "num_input_tokens_seen": 153780208, + "router_z_loss_mlp": 0.44116211, + "step": 1856, + "time_per_iteration": 2.5849640369415283 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047421, + "balance_loss_mlp": 1.00338531, + "epoch": 0.35725278953443634, + "flos": 513065570304.0, + "grad_norm": 0.033114806318055315, + "language_loss": 0.79136717, + "learning_rate": 0.0007444940772445915, + "loss": 0.80184138, + "num_input_tokens_seen": 153853152, + "router_z_loss_mlp": 0.44116211, + "step": 1857, + "time_per_iteration": 2.729100227355957 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048076, + "balance_loss_mlp": 1.00404048, + "epoch": 0.3574451712196999, + "flos": 488493573888.0, + "grad_norm": 0.030889137628629628, + "language_loss": 0.80389744, + "learning_rate": 0.0007442222744278484, + "loss": 0.81437826, + "num_input_tokens_seen": 153924160, + "router_z_loss_mlp": 0.44116211, + "step": 1858, + "time_per_iteration": 2.673224687576294 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048438, + "balance_loss_mlp": 1.00433075, + "epoch": 0.35763755290496346, + "flos": 551822018304.0, + "grad_norm": 0.029026961526961815, + "language_loss": 0.8481214, + "learning_rate": 0.0007439503767956099, + "loss": 0.8586058, + "num_input_tokens_seen": 153998688, + "router_z_loss_mlp": 0.44189453, + "step": 1859, + "time_per_iteration": 2.7095680236816406 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104821, + "balance_loss_mlp": 1.00567627, + "epoch": 0.357829934590227, + "flos": 1507228232448.0, + "grad_norm": 0.007157576597672099, + "language_loss": 0.79671603, + "learning_rate": 0.0007436783844534352, + "loss": 0.80719817, + "num_input_tokens_seen": 154230960, + "router_z_loss_mlp": 0.42578125, + "step": 1860, + "time_per_iteration": 4.909587383270264 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049455, + "balance_loss_mlp": 1.00549006, + "epoch": 0.3580223162754906, + "flos": 569842260480.0, + "grad_norm": 0.027013738684289513, + "language_loss": 0.86190987, + "learning_rate": 0.000743406297506922, + "loss": 0.87240434, + "num_input_tokens_seen": 154309104, + "router_z_loss_mlp": 0.44042969, + "step": 1861, + "time_per_iteration": 2.7355735301971436 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104917, + "balance_loss_mlp": 1.00518215, + "epoch": 0.3582146979607541, + "flos": 627761131776.0, + "grad_norm": 0.0339710504259095, + "language_loss": 0.84903038, + "learning_rate": 0.0007431341160617031, + "loss": 0.8595221, + "num_input_tokens_seen": 154387424, + "router_z_loss_mlp": 0.44067383, + "step": 1862, + "time_per_iteration": 2.8932178020477295 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054928, + "balance_loss_mlp": 1.01082051, + "epoch": 0.3584070796460177, + "flos": 508319923200.0, + "grad_norm": 0.030700215862736833, + "language_loss": 0.88826722, + "learning_rate": 0.0007428618402234491, + "loss": 0.89881647, + "num_input_tokens_seen": 154459952, + "router_z_loss_mlp": 0.44189453, + "step": 1863, + "time_per_iteration": 2.6574699878692627 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105281, + "balance_loss_mlp": 1.00882196, + "epoch": 0.3585994613312813, + "flos": 607641219840.0, + "grad_norm": 0.030466419719222444, + "language_loss": 0.80836076, + "learning_rate": 0.0007425894700978668, + "loss": 0.8188889, + "num_input_tokens_seen": 154535456, + "router_z_loss_mlp": 0.44067383, + "step": 1864, + "time_per_iteration": 2.7388875484466553 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048686, + "balance_loss_mlp": 1.00467396, + "epoch": 0.3587918430165448, + "flos": 1415089579776.0, + "grad_norm": 0.030441642762586523, + "language_loss": 0.8033703, + "learning_rate": 0.0007423170057906996, + "loss": 0.8138572, + "num_input_tokens_seen": 154627568, + "router_z_loss_mlp": 0.44091797, + "step": 1865, + "time_per_iteration": 3.8431384563446045 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044851, + "balance_loss_mlp": 1.00086236, + "epoch": 0.3589842247018084, + "flos": 479514561024.0, + "grad_norm": 0.03198832631900347, + "language_loss": 0.8674798, + "learning_rate": 0.0007420444474077275, + "loss": 0.87792838, + "num_input_tokens_seen": 154694640, + "router_z_loss_mlp": 0.44067383, + "step": 1866, + "time_per_iteration": 2.5487258434295654 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046851, + "balance_loss_mlp": 1.0028863, + "epoch": 0.35917660638707194, + "flos": 505706026752.0, + "grad_norm": 0.036738697797889144, + "language_loss": 0.90374953, + "learning_rate": 0.0007417717950547671, + "loss": 0.91421801, + "num_input_tokens_seen": 154762048, + "router_z_loss_mlp": 0.44042969, + "step": 1867, + "time_per_iteration": 2.6784894466400146 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052044, + "balance_loss_mlp": 1.00960541, + "epoch": 0.3593689880723355, + "flos": 1495484645376.0, + "grad_norm": 0.0080630279180651, + "language_loss": 0.75996608, + "learning_rate": 0.0007414990488376713, + "loss": 0.77048653, + "num_input_tokens_seen": 154989952, + "router_z_loss_mlp": 0.42480469, + "step": 1868, + "time_per_iteration": 4.930212497711182 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104515, + "balance_loss_mlp": 1.00118589, + "epoch": 0.35956136975759906, + "flos": 529672422912.0, + "grad_norm": 0.03031015371847706, + "language_loss": 0.85577166, + "learning_rate": 0.0007412262088623299, + "loss": 0.86622322, + "num_input_tokens_seen": 155066992, + "router_z_loss_mlp": 0.44042969, + "step": 1869, + "time_per_iteration": 2.73066782951355 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047751, + "balance_loss_mlp": 1.00385797, + "epoch": 0.35975375144286265, + "flos": 536000600832.0, + "grad_norm": 0.03552204952813077, + "language_loss": 0.80084878, + "learning_rate": 0.0007409532752346684, + "loss": 0.81132627, + "num_input_tokens_seen": 155137616, + "router_z_loss_mlp": 0.43969727, + "step": 1870, + "time_per_iteration": 2.6379218101501465 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050376, + "balance_loss_mlp": 1.00638759, + "epoch": 0.3599461331281262, + "flos": 505929603072.0, + "grad_norm": 0.028943079800369927, + "language_loss": 0.8876543, + "learning_rate": 0.0007406802480606491, + "loss": 0.89815807, + "num_input_tokens_seen": 155209248, + "router_z_loss_mlp": 0.44067383, + "step": 1871, + "time_per_iteration": 2.6258225440979004 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049318, + "balance_loss_mlp": 1.00547302, + "epoch": 0.36013851481338977, + "flos": 512537737728.0, + "grad_norm": 0.03609789661305553, + "language_loss": 0.91903639, + "learning_rate": 0.0007404071274462707, + "loss": 0.92952955, + "num_input_tokens_seen": 155274176, + "router_z_loss_mlp": 0.43920898, + "step": 1872, + "time_per_iteration": 2.6111674308776855 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049057, + "balance_loss_mlp": 1.00494921, + "epoch": 0.36033089649865335, + "flos": 548632657152.0, + "grad_norm": 0.03255043761438457, + "language_loss": 0.84506214, + "learning_rate": 0.0007401339134975682, + "loss": 0.85555267, + "num_input_tokens_seen": 155343232, + "router_z_loss_mlp": 0.44189453, + "step": 1873, + "time_per_iteration": 2.6355786323547363 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049816, + "balance_loss_mlp": 1.00575614, + "epoch": 0.3605232781839169, + "flos": 459614334720.0, + "grad_norm": 0.03456024010205507, + "language_loss": 0.84983587, + "learning_rate": 0.0007398606063206122, + "loss": 0.86033404, + "num_input_tokens_seen": 155410080, + "router_z_loss_mlp": 0.44140625, + "step": 1874, + "time_per_iteration": 2.5788064002990723 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049812, + "balance_loss_mlp": 1.00577569, + "epoch": 0.36071565986918047, + "flos": 510564434688.0, + "grad_norm": 0.03262157431229983, + "language_loss": 0.79280519, + "learning_rate": 0.0007395872060215101, + "loss": 0.80330336, + "num_input_tokens_seen": 155476240, + "router_z_loss_mlp": 0.44116211, + "step": 1875, + "time_per_iteration": 2.59242582321167 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051751, + "balance_loss_mlp": 1.00785792, + "epoch": 0.360908041554444, + "flos": 560257647360.0, + "grad_norm": 0.03426029536230158, + "language_loss": 0.89306337, + "learning_rate": 0.0007393137127064056, + "loss": 0.9035809, + "num_input_tokens_seen": 155543392, + "router_z_loss_mlp": 0.43969727, + "step": 1876, + "time_per_iteration": 2.6217613220214844 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049616, + "balance_loss_mlp": 1.00577068, + "epoch": 0.3611004232397076, + "flos": 524879143680.0, + "grad_norm": 0.03313366432597027, + "language_loss": 0.84778088, + "learning_rate": 0.0007390401264814779, + "loss": 0.85827708, + "num_input_tokens_seen": 155613264, + "router_z_loss_mlp": 0.43920898, + "step": 1877, + "time_per_iteration": 2.621366262435913 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051372, + "balance_loss_mlp": 1.00752687, + "epoch": 0.3612928049249711, + "flos": 542033270784.0, + "grad_norm": 0.036139064810301956, + "language_loss": 0.85492337, + "learning_rate": 0.0007387664474529427, + "loss": 0.86543715, + "num_input_tokens_seen": 155683712, + "router_z_loss_mlp": 0.43920898, + "step": 1878, + "time_per_iteration": 2.6200942993164062 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051486, + "balance_loss_mlp": 1.00776029, + "epoch": 0.3614851866102347, + "flos": 553630070784.0, + "grad_norm": 0.03346030230294773, + "language_loss": 0.91826439, + "learning_rate": 0.0007384926757270518, + "loss": 0.92877924, + "num_input_tokens_seen": 155751760, + "router_z_loss_mlp": 0.43798828, + "step": 1879, + "time_per_iteration": 2.6367645263671875 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048472, + "balance_loss_mlp": 1.00481761, + "epoch": 0.36167756829549824, + "flos": 773427832320.0, + "grad_norm": 0.030641441804162946, + "language_loss": 0.80120707, + "learning_rate": 0.0007382188114100924, + "loss": 0.81169182, + "num_input_tokens_seen": 155830464, + "router_z_loss_mlp": 0.43725586, + "step": 1880, + "time_per_iteration": 2.9662272930145264 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048714, + "balance_loss_mlp": 1.0051316, + "epoch": 0.36186994998076183, + "flos": 713188627200.0, + "grad_norm": 0.030233131555612264, + "language_loss": 0.82161707, + "learning_rate": 0.0007379448546083884, + "loss": 0.83210421, + "num_input_tokens_seen": 155906208, + "router_z_loss_mlp": 0.43652344, + "step": 1881, + "time_per_iteration": 2.9433577060699463 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104791, + "balance_loss_mlp": 1.00420797, + "epoch": 0.3620623316660254, + "flos": 748901522688.0, + "grad_norm": 0.028477152913266954, + "language_loss": 0.88624489, + "learning_rate": 0.0007376708054282992, + "loss": 0.89672405, + "num_input_tokens_seen": 155983584, + "router_z_loss_mlp": 0.43774414, + "step": 1882, + "time_per_iteration": 2.9565789699554443 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047818, + "balance_loss_mlp": 1.00425851, + "epoch": 0.36225471335128895, + "flos": 483535044096.0, + "grad_norm": 0.03088815199044137, + "language_loss": 0.84632647, + "learning_rate": 0.0007373966639762201, + "loss": 0.85680467, + "num_input_tokens_seen": 156052464, + "router_z_loss_mlp": 0.4362793, + "step": 1883, + "time_per_iteration": 2.6308107376098633 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051227, + "balance_loss_mlp": 1.00762069, + "epoch": 0.36244709503655254, + "flos": 507911654400.0, + "grad_norm": 0.045291722940018896, + "language_loss": 0.89109468, + "learning_rate": 0.0007371224303585822, + "loss": 0.90160698, + "num_input_tokens_seen": 156121424, + "router_z_loss_mlp": 0.43676758, + "step": 1884, + "time_per_iteration": 2.5738682746887207 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053806, + "balance_loss_mlp": 1.01194, + "epoch": 0.36263947672181607, + "flos": 1397054741760.0, + "grad_norm": 0.007615502937667497, + "language_loss": 0.80357069, + "learning_rate": 0.0007368481046818524, + "loss": 0.81410873, + "num_input_tokens_seen": 156346144, + "router_z_loss_mlp": 0.41894531, + "step": 1885, + "time_per_iteration": 4.7547221183776855 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105859, + "balance_loss_mlp": 1.01500738, + "epoch": 0.36283185840707965, + "flos": 654523204608.0, + "grad_norm": 0.03432185210428161, + "language_loss": 0.83272493, + "learning_rate": 0.0007365736870525335, + "loss": 0.84331077, + "num_input_tokens_seen": 156420880, + "router_z_loss_mlp": 0.43652344, + "step": 1886, + "time_per_iteration": 2.8305654525756836 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049425, + "balance_loss_mlp": 1.00591362, + "epoch": 0.3630242400923432, + "flos": 489845725440.0, + "grad_norm": 0.036050619102321185, + "language_loss": 0.8310129, + "learning_rate": 0.000736299177577164, + "loss": 0.84150714, + "num_input_tokens_seen": 156485616, + "router_z_loss_mlp": 0.43579102, + "step": 1887, + "time_per_iteration": 2.632485866546631 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105207, + "balance_loss_mlp": 1.00853443, + "epoch": 0.3632166217776068, + "flos": 518232125184.0, + "grad_norm": 0.034844830144856315, + "language_loss": 0.84275633, + "learning_rate": 0.0007360245763623174, + "loss": 0.85327709, + "num_input_tokens_seen": 156557840, + "router_z_loss_mlp": 0.43603516, + "step": 1888, + "time_per_iteration": 2.6480350494384766 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049354, + "balance_loss_mlp": 1.00596213, + "epoch": 0.36340900346287036, + "flos": 647348353536.0, + "grad_norm": 0.03423797247490227, + "language_loss": 0.90607542, + "learning_rate": 0.0007357498835146039, + "loss": 0.91656893, + "num_input_tokens_seen": 156632496, + "router_z_loss_mlp": 0.43457031, + "step": 1889, + "time_per_iteration": 2.8152430057525635 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055179, + "balance_loss_mlp": 1.01154852, + "epoch": 0.3636013851481339, + "flos": 554411615232.0, + "grad_norm": 0.0362068794335816, + "language_loss": 0.87730169, + "learning_rate": 0.0007354750991406684, + "loss": 0.8878535, + "num_input_tokens_seen": 156705296, + "router_z_loss_mlp": 0.43701172, + "step": 1890, + "time_per_iteration": 2.71056866645813 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047867, + "balance_loss_mlp": 1.0042125, + "epoch": 0.3637937668333975, + "flos": 547692665088.0, + "grad_norm": 0.03762567530645649, + "language_loss": 0.81321651, + "learning_rate": 0.0007352002233471919, + "loss": 0.82369518, + "num_input_tokens_seen": 156773376, + "router_z_loss_mlp": 0.43725586, + "step": 1891, + "time_per_iteration": 2.6590068340301514 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054921, + "balance_loss_mlp": 1.01098096, + "epoch": 0.363986148518661, + "flos": 539211349248.0, + "grad_norm": 0.036762310622647384, + "language_loss": 0.79772675, + "learning_rate": 0.0007349252562408906, + "loss": 0.808276, + "num_input_tokens_seen": 156844336, + "router_z_loss_mlp": 0.44018555, + "step": 1892, + "time_per_iteration": 2.715721368789673 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0111044, + "balance_loss_mlp": 1.0663805, + "epoch": 0.3641785302039246, + "flos": 661511417856.0, + "grad_norm": 0.04360229312277944, + "language_loss": 0.82000142, + "learning_rate": 0.0007346501979285158, + "loss": 0.83110583, + "num_input_tokens_seen": 156918848, + "router_z_loss_mlp": 0.44140625, + "step": 1893, + "time_per_iteration": 2.927184820175171 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061302, + "balance_loss_mlp": 1.01934052, + "epoch": 0.36437091188918813, + "flos": 1472084965632.0, + "grad_norm": 0.015393341944361743, + "language_loss": 0.80539101, + "learning_rate": 0.0007343750485168551, + "loss": 0.81600404, + "num_input_tokens_seen": 157134736, + "router_z_loss_mlp": 0.41992188, + "step": 1894, + "time_per_iteration": 4.786630868911743 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050162, + "balance_loss_mlp": 1.00648379, + "epoch": 0.3645632935744517, + "flos": 598445433600.0, + "grad_norm": 0.030741456608760154, + "language_loss": 0.86771834, + "learning_rate": 0.0007340998081127308, + "loss": 0.87822002, + "num_input_tokens_seen": 157211920, + "router_z_loss_mlp": 0.4375, + "step": 1895, + "time_per_iteration": 2.7590408325195312 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046448, + "balance_loss_mlp": 1.00284135, + "epoch": 0.36475567525971525, + "flos": 600696748032.0, + "grad_norm": 0.032247737775586885, + "language_loss": 0.91682166, + "learning_rate": 0.0007338244768230007, + "loss": 0.92728615, + "num_input_tokens_seen": 157284224, + "router_z_loss_mlp": 0.43676758, + "step": 1896, + "time_per_iteration": 2.806001663208008 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048582, + "balance_loss_mlp": 1.00502336, + "epoch": 0.36494805694497884, + "flos": 799832180736.0, + "grad_norm": 0.03166243516623692, + "language_loss": 0.89817142, + "learning_rate": 0.0007335490547545578, + "loss": 0.90865725, + "num_input_tokens_seen": 157367920, + "router_z_loss_mlp": 0.4362793, + "step": 1897, + "time_per_iteration": 3.0448927879333496 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049034, + "balance_loss_mlp": 1.00535595, + "epoch": 0.3651404386302424, + "flos": 638478210816.0, + "grad_norm": 0.03536594015703217, + "language_loss": 0.82896376, + "learning_rate": 0.0007332735420143308, + "loss": 0.83945411, + "num_input_tokens_seen": 157438672, + "router_z_loss_mlp": 0.4375, + "step": 1898, + "time_per_iteration": 2.739990234375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047853, + "balance_loss_mlp": 1.00419891, + "epoch": 0.36533282031550596, + "flos": 492563634432.0, + "grad_norm": 0.03491103953335563, + "language_loss": 0.87321162, + "learning_rate": 0.0007329979387092826, + "loss": 0.88369012, + "num_input_tokens_seen": 157505888, + "router_z_loss_mlp": 0.43725586, + "step": 1899, + "time_per_iteration": 2.5661838054656982 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044023, + "balance_loss_mlp": 1.00020182, + "epoch": 0.36552520200076954, + "flos": 857509979136.0, + "grad_norm": 0.025671163998745472, + "language_loss": 0.84557235, + "learning_rate": 0.0007327222449464124, + "loss": 0.85601258, + "num_input_tokens_seen": 157601568, + "router_z_loss_mlp": 0.43896484, + "step": 1900, + "time_per_iteration": 3.2916476726531982 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049138, + "balance_loss_mlp": 1.00545931, + "epoch": 0.3657175836860331, + "flos": 484716109056.0, + "grad_norm": 0.033162883177173925, + "language_loss": 0.89287698, + "learning_rate": 0.0007324464608327538, + "loss": 0.90336835, + "num_input_tokens_seen": 157670992, + "router_z_loss_mlp": 0.4375, + "step": 1901, + "time_per_iteration": 2.6514644622802734 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050151, + "balance_loss_mlp": 1.00647259, + "epoch": 0.36590996537129666, + "flos": 435721815552.0, + "grad_norm": 0.0385016057803441, + "language_loss": 0.88887352, + "learning_rate": 0.0007321705864753758, + "loss": 0.89937502, + "num_input_tokens_seen": 157743616, + "router_z_loss_mlp": 0.4375, + "step": 1902, + "time_per_iteration": 2.6785683631896973 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045565, + "balance_loss_mlp": 1.00171947, + "epoch": 0.3661023470565602, + "flos": 713514270720.0, + "grad_norm": 0.027132815564249787, + "language_loss": 0.85073566, + "learning_rate": 0.0007318946219813823, + "loss": 0.86119133, + "num_input_tokens_seen": 157823520, + "router_z_loss_mlp": 0.43920898, + "step": 1903, + "time_per_iteration": 2.9874324798583984 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104835, + "balance_loss_mlp": 1.00431406, + "epoch": 0.3662947287418238, + "flos": 565823722752.0, + "grad_norm": 0.03452387251033087, + "language_loss": 0.90632051, + "learning_rate": 0.000731618567457912, + "loss": 0.91680402, + "num_input_tokens_seen": 157893248, + "router_z_loss_mlp": 0.44116211, + "step": 1904, + "time_per_iteration": 2.684290885925293 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049154, + "balance_loss_mlp": 1.00516582, + "epoch": 0.3664871104270873, + "flos": 791203110912.0, + "grad_norm": 0.032826620308443535, + "language_loss": 0.87174082, + "learning_rate": 0.000731342423012139, + "loss": 0.88223237, + "num_input_tokens_seen": 157973216, + "router_z_loss_mlp": 0.44067383, + "step": 1905, + "time_per_iteration": 3.0617177486419678 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051542, + "balance_loss_mlp": 1.00750625, + "epoch": 0.3666794921123509, + "flos": 753981561600.0, + "grad_norm": 0.03506961035904521, + "language_loss": 0.83108962, + "learning_rate": 0.0007310661887512722, + "loss": 0.84160507, + "num_input_tokens_seen": 158051088, + "router_z_loss_mlp": 0.44116211, + "step": 1906, + "time_per_iteration": 3.046901226043701 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045363, + "balance_loss_mlp": 1.0011121, + "epoch": 0.3668718737976145, + "flos": 524607935232.0, + "grad_norm": 0.03388484398579531, + "language_loss": 0.82964659, + "learning_rate": 0.0007307898647825549, + "loss": 0.84010023, + "num_input_tokens_seen": 158124368, + "router_z_loss_mlp": 0.44335938, + "step": 1907, + "time_per_iteration": 2.6592161655426025 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051828, + "balance_loss_mlp": 1.00767255, + "epoch": 0.367064255482878, + "flos": 573046205952.0, + "grad_norm": 0.03554957537225944, + "language_loss": 0.8992576, + "learning_rate": 0.0007305134512132659, + "loss": 0.90977585, + "num_input_tokens_seen": 158191472, + "router_z_loss_mlp": 0.44238281, + "step": 1908, + "time_per_iteration": 2.6961183547973633 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055033, + "balance_loss_mlp": 1.01078284, + "epoch": 0.3672566371681416, + "flos": 448054473216.0, + "grad_norm": 0.04018581054394134, + "language_loss": 0.843858, + "learning_rate": 0.0007302369481507183, + "loss": 0.85440832, + "num_input_tokens_seen": 158254384, + "router_z_loss_mlp": 0.44335938, + "step": 1909, + "time_per_iteration": 2.488203763961792 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01056026, + "balance_loss_mlp": 1.01358795, + "epoch": 0.36744901885340514, + "flos": 1543366893312.0, + "grad_norm": 0.00771809390988723, + "language_loss": 0.79961759, + "learning_rate": 0.00072996035570226, + "loss": 0.81017786, + "num_input_tokens_seen": 158486160, + "router_z_loss_mlp": 0.42480469, + "step": 1910, + "time_per_iteration": 4.828088045120239 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059395, + "balance_loss_mlp": 1.01457202, + "epoch": 0.36764140053866873, + "flos": 564762221568.0, + "grad_norm": 0.032014471163266715, + "language_loss": 0.86287534, + "learning_rate": 0.000729683673975274, + "loss": 0.87346923, + "num_input_tokens_seen": 158555616, + "router_z_loss_mlp": 0.44824219, + "step": 1911, + "time_per_iteration": 2.6982359886169434 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01058317, + "balance_loss_mlp": 1.01366162, + "epoch": 0.36783378222393226, + "flos": 1218652614144.0, + "grad_norm": 0.03007186425733569, + "language_loss": 0.8357197, + "learning_rate": 0.0007294069030771774, + "loss": 0.84630299, + "num_input_tokens_seen": 158653984, + "router_z_loss_mlp": 0.44702148, + "step": 1912, + "time_per_iteration": 3.6612210273742676 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049037, + "balance_loss_mlp": 1.0043577, + "epoch": 0.36802616390919585, + "flos": 499720988928.0, + "grad_norm": 0.03131225250708543, + "language_loss": 0.91280997, + "learning_rate": 0.0007291300431154224, + "loss": 0.92330033, + "num_input_tokens_seen": 158719728, + "router_z_loss_mlp": 0.44726562, + "step": 1913, + "time_per_iteration": 2.574129581451416 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053715, + "balance_loss_mlp": 1.01108551, + "epoch": 0.36821854559445943, + "flos": 1585618904064.0, + "grad_norm": 0.006266309435424964, + "language_loss": 0.70389736, + "learning_rate": 0.0007288530941974955, + "loss": 0.7144345, + "num_input_tokens_seen": 158952544, + "router_z_loss_mlp": 0.42675781, + "step": 1914, + "time_per_iteration": 4.960723876953125 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052577, + "balance_loss_mlp": 1.0082792, + "epoch": 0.36841092727972297, + "flos": 837090668544.0, + "grad_norm": 0.03136779226227803, + "language_loss": 0.80375087, + "learning_rate": 0.0007285760564309179, + "loss": 0.81427664, + "num_input_tokens_seen": 159039680, + "router_z_loss_mlp": 0.44384766, + "step": 1915, + "time_per_iteration": 3.0985960960388184 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010541, + "balance_loss_mlp": 1.00965917, + "epoch": 0.36860330896498655, + "flos": 691211085312.0, + "grad_norm": 0.031502418433557444, + "language_loss": 0.85988045, + "learning_rate": 0.0007282989299232448, + "loss": 0.87042141, + "num_input_tokens_seen": 159128128, + "router_z_loss_mlp": 0.4453125, + "step": 1916, + "time_per_iteration": 3.034715175628662 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055093, + "balance_loss_mlp": 1.01065195, + "epoch": 0.3687956906502501, + "flos": 555240791808.0, + "grad_norm": 0.03953946470073971, + "language_loss": 0.84794021, + "learning_rate": 0.0007280217147820668, + "loss": 0.85849106, + "num_input_tokens_seen": 159193248, + "router_z_loss_mlp": 0.4453125, + "step": 1917, + "time_per_iteration": 2.61297869682312 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053797, + "balance_loss_mlp": 1.0093317, + "epoch": 0.3689880723355137, + "flos": 577820043264.0, + "grad_norm": 0.030128455165502346, + "language_loss": 0.7994225, + "learning_rate": 0.0007277444111150079, + "loss": 0.80996048, + "num_input_tokens_seen": 159265824, + "router_z_loss_mlp": 0.44555664, + "step": 1918, + "time_per_iteration": 2.7244873046875 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052755, + "balance_loss_mlp": 1.00845671, + "epoch": 0.3691804540207772, + "flos": 529887250944.0, + "grad_norm": 0.035938670194894204, + "language_loss": 0.84948546, + "learning_rate": 0.0007274670190297272, + "loss": 0.86001301, + "num_input_tokens_seen": 159332992, + "router_z_loss_mlp": 0.44384766, + "step": 1919, + "time_per_iteration": 2.6209609508514404 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048627, + "balance_loss_mlp": 1.0041858, + "epoch": 0.3693728357060408, + "flos": 562181372928.0, + "grad_norm": 0.026922320390231402, + "language_loss": 0.82273662, + "learning_rate": 0.0007271895386339179, + "loss": 0.83322287, + "num_input_tokens_seen": 159409808, + "router_z_loss_mlp": 0.4453125, + "step": 1920, + "time_per_iteration": 2.7952609062194824 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047195, + "balance_loss_mlp": 1.00292087, + "epoch": 0.3695652173913043, + "flos": 580900534272.0, + "grad_norm": 0.03055527362799568, + "language_loss": 0.83712995, + "learning_rate": 0.0007269119700353073, + "loss": 0.84760189, + "num_input_tokens_seen": 159486128, + "router_z_loss_mlp": 0.44360352, + "step": 1921, + "time_per_iteration": 2.808595895767212 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049468, + "balance_loss_mlp": 1.00519335, + "epoch": 0.3697575990765679, + "flos": 514059997440.0, + "grad_norm": 0.029192022992987326, + "language_loss": 0.85655916, + "learning_rate": 0.0007266343133416571, + "loss": 0.86705387, + "num_input_tokens_seen": 159562224, + "router_z_loss_mlp": 0.44360352, + "step": 1922, + "time_per_iteration": 2.7229409217834473 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045757, + "balance_loss_mlp": 1.00255585, + "epoch": 0.3699499807618315, + "flos": 1573906430976.0, + "grad_norm": 0.004633598174219594, + "language_loss": 0.77116919, + "learning_rate": 0.0007263565686607632, + "loss": 0.7816267, + "num_input_tokens_seen": 159784768, + "router_z_loss_mlp": 0.43261719, + "step": 1923, + "time_per_iteration": 4.855220556259155 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049661, + "balance_loss_mlp": 1.00526702, + "epoch": 0.37014236244709503, + "flos": 498325095936.0, + "grad_norm": 0.04063724538866958, + "language_loss": 0.84789312, + "learning_rate": 0.0007260787361004556, + "loss": 0.85838968, + "num_input_tokens_seen": 159848608, + "router_z_loss_mlp": 0.44482422, + "step": 1924, + "time_per_iteration": 2.5634405612945557 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063278, + "balance_loss_mlp": 1.01998138, + "epoch": 0.3703347441323586, + "flos": 1447608233472.0, + "grad_norm": 0.011285785538321925, + "language_loss": 0.73761505, + "learning_rate": 0.0007258008157685987, + "loss": 0.7482478, + "num_input_tokens_seen": 160080928, + "router_z_loss_mlp": 0.43359375, + "step": 1925, + "time_per_iteration": 4.881471157073975 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050604, + "balance_loss_mlp": 1.00601971, + "epoch": 0.37052712581762215, + "flos": 564714589440.0, + "grad_norm": 0.030700116077417884, + "language_loss": 0.87676865, + "learning_rate": 0.0007255228077730903, + "loss": 0.88727468, + "num_input_tokens_seen": 160148976, + "router_z_loss_mlp": 0.44628906, + "step": 1926, + "time_per_iteration": 2.6604056358337402 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048583, + "balance_loss_mlp": 1.00426066, + "epoch": 0.37071950750288574, + "flos": 927571958016.0, + "grad_norm": 0.030848240929213684, + "language_loss": 0.82266426, + "learning_rate": 0.0007252447122218632, + "loss": 0.83315009, + "num_input_tokens_seen": 160233504, + "router_z_loss_mlp": 0.4440918, + "step": 1927, + "time_per_iteration": 3.189232110977173 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048396, + "balance_loss_mlp": 1.00424135, + "epoch": 0.37091188918814927, + "flos": 419201478912.0, + "grad_norm": 0.038028798643346066, + "language_loss": 0.88517463, + "learning_rate": 0.0007249665292228834, + "loss": 0.89565861, + "num_input_tokens_seen": 160299696, + "router_z_loss_mlp": 0.44238281, + "step": 1928, + "time_per_iteration": 2.6051783561706543 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048517, + "balance_loss_mlp": 1.00443351, + "epoch": 0.37110427087341286, + "flos": 464147099136.0, + "grad_norm": 0.03246756835091633, + "language_loss": 0.8426615, + "learning_rate": 0.000724688258884151, + "loss": 0.85314661, + "num_input_tokens_seen": 160367904, + "router_z_loss_mlp": 0.44165039, + "step": 1929, + "time_per_iteration": 2.5537402629852295 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105112, + "balance_loss_mlp": 1.00703681, + "epoch": 0.3712966525586764, + "flos": 851081679360.0, + "grad_norm": 0.026814038228573516, + "language_loss": 0.86998665, + "learning_rate": 0.0007244099013137002, + "loss": 0.88049793, + "num_input_tokens_seen": 160453600, + "router_z_loss_mlp": 0.44165039, + "step": 1930, + "time_per_iteration": 3.091195821762085 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052812, + "balance_loss_mlp": 1.00901484, + "epoch": 0.37148903424394, + "flos": 927559319040.0, + "grad_norm": 0.03484228463474462, + "language_loss": 0.89224607, + "learning_rate": 0.0007241314566195993, + "loss": 0.90277416, + "num_input_tokens_seen": 160543472, + "router_z_loss_mlp": 0.4387207, + "step": 1931, + "time_per_iteration": 3.2276151180267334 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050081, + "balance_loss_mlp": 1.00616395, + "epoch": 0.37168141592920356, + "flos": 520821722112.0, + "grad_norm": 0.033577876196724185, + "language_loss": 0.86394525, + "learning_rate": 0.0007238529249099496, + "loss": 0.87444603, + "num_input_tokens_seen": 160614016, + "router_z_loss_mlp": 0.43994141, + "step": 1932, + "time_per_iteration": 2.6099538803100586 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043194, + "balance_loss_mlp": 1.00075531, + "epoch": 0.3718737976144671, + "flos": 1449062452224.0, + "grad_norm": 0.005805601038449312, + "language_loss": 0.77856874, + "learning_rate": 0.0007235743062928872, + "loss": 0.78900075, + "num_input_tokens_seen": 160828640, + "router_z_loss_mlp": 0.42480469, + "step": 1933, + "time_per_iteration": 4.864013910293579 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051383, + "balance_loss_mlp": 1.00741839, + "epoch": 0.3720661792997307, + "flos": 760954223616.0, + "grad_norm": 0.031651541573232696, + "language_loss": 0.81381935, + "learning_rate": 0.000723295600876581, + "loss": 0.82433319, + "num_input_tokens_seen": 160913088, + "router_z_loss_mlp": 0.44042969, + "step": 1934, + "time_per_iteration": 3.003988742828369 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047402, + "balance_loss_mlp": 1.00353265, + "epoch": 0.3722585609849942, + "flos": 518045487360.0, + "grad_norm": 0.031160015664157277, + "language_loss": 0.88386387, + "learning_rate": 0.0007230168087692344, + "loss": 0.89433783, + "num_input_tokens_seen": 160982960, + "router_z_loss_mlp": 0.43945312, + "step": 1935, + "time_per_iteration": 2.6490824222564697 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045452, + "balance_loss_mlp": 1.00165451, + "epoch": 0.3724509426702578, + "flos": 783869812224.0, + "grad_norm": 0.03743087194604022, + "language_loss": 0.82867873, + "learning_rate": 0.0007227379300790839, + "loss": 0.83913326, + "num_input_tokens_seen": 161066000, + "router_z_loss_mlp": 0.4387207, + "step": 1936, + "time_per_iteration": 3.010700225830078 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044084, + "balance_loss_mlp": 1.00011992, + "epoch": 0.37264332435552133, + "flos": 392599799040.0, + "grad_norm": 0.032423549870759565, + "language_loss": 0.86443603, + "learning_rate": 0.0007224589649143997, + "loss": 0.87487686, + "num_input_tokens_seen": 161131040, + "router_z_loss_mlp": 0.44042969, + "step": 1937, + "time_per_iteration": 2.54010272026062 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044521, + "balance_loss_mlp": 1.00072384, + "epoch": 0.3728357060407849, + "flos": 543913254912.0, + "grad_norm": 0.03387233199209411, + "language_loss": 0.81436574, + "learning_rate": 0.0007221799133834861, + "loss": 0.82481098, + "num_input_tokens_seen": 161201248, + "router_z_loss_mlp": 0.4387207, + "step": 1938, + "time_per_iteration": 2.6355655193328857 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045317, + "balance_loss_mlp": 1.00154293, + "epoch": 0.3730280877260485, + "flos": 434484370176.0, + "grad_norm": 0.03416430777388856, + "language_loss": 0.82122993, + "learning_rate": 0.00072190077559468, + "loss": 0.83168304, + "num_input_tokens_seen": 161266288, + "router_z_loss_mlp": 0.43847656, + "step": 1939, + "time_per_iteration": 2.5033867359161377 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049643, + "balance_loss_mlp": 1.00579786, + "epoch": 0.37322046941131204, + "flos": 532511841024.0, + "grad_norm": 0.031902006564455146, + "language_loss": 0.89473069, + "learning_rate": 0.0007216215516563527, + "loss": 0.90522707, + "num_input_tokens_seen": 161335648, + "router_z_loss_mlp": 0.43920898, + "step": 1940, + "time_per_iteration": 2.685201406478882 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049207, + "balance_loss_mlp": 1.00538588, + "epoch": 0.3734128510965756, + "flos": 532576969728.0, + "grad_norm": 0.03682978505173481, + "language_loss": 0.83770883, + "learning_rate": 0.0007213422416769083, + "loss": 0.84820092, + "num_input_tokens_seen": 161403440, + "router_z_loss_mlp": 0.43896484, + "step": 1941, + "time_per_iteration": 2.5981826782226562 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104851, + "balance_loss_mlp": 1.00454593, + "epoch": 0.37360523278183916, + "flos": 501433777152.0, + "grad_norm": 0.029644951468961563, + "language_loss": 0.75750655, + "learning_rate": 0.0007210628457647849, + "loss": 0.76799166, + "num_input_tokens_seen": 161472864, + "router_z_loss_mlp": 0.44042969, + "step": 1942, + "time_per_iteration": 2.5780391693115234 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047618, + "balance_loss_mlp": 1.00365365, + "epoch": 0.37379761446710275, + "flos": 549112857600.0, + "grad_norm": 0.03283775645447924, + "language_loss": 0.79155779, + "learning_rate": 0.000720783364028453, + "loss": 0.80203396, + "num_input_tokens_seen": 161548096, + "router_z_loss_mlp": 0.44042969, + "step": 1943, + "time_per_iteration": 2.7498555183410645 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052612, + "balance_loss_mlp": 1.0085758, + "epoch": 0.3739899961523663, + "flos": 476740271616.0, + "grad_norm": 0.03229344723146533, + "language_loss": 0.88345349, + "learning_rate": 0.0007205037965764177, + "loss": 0.89397967, + "num_input_tokens_seen": 161615600, + "router_z_loss_mlp": 0.44116211, + "step": 1944, + "time_per_iteration": 2.559565305709839 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049461, + "balance_loss_mlp": 1.00533009, + "epoch": 0.37418237783762986, + "flos": 613077037824.0, + "grad_norm": 0.033726561022773015, + "language_loss": 0.85856438, + "learning_rate": 0.0007202241435172161, + "loss": 0.86905897, + "num_input_tokens_seen": 161687408, + "router_z_loss_mlp": 0.44213867, + "step": 1945, + "time_per_iteration": 2.7495012283325195 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105036, + "balance_loss_mlp": 1.00618136, + "epoch": 0.3743747595228934, + "flos": 767629432320.0, + "grad_norm": 0.030482282234963888, + "language_loss": 0.88839138, + "learning_rate": 0.0007199444049594198, + "loss": 0.89889503, + "num_input_tokens_seen": 161764224, + "router_z_loss_mlp": 0.44262695, + "step": 1946, + "time_per_iteration": 2.927438259124756 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105097, + "balance_loss_mlp": 1.00679135, + "epoch": 0.374567141208157, + "flos": 525491546880.0, + "grad_norm": 0.03274984488565387, + "language_loss": 0.84098482, + "learning_rate": 0.0007196645810116322, + "loss": 0.85149455, + "num_input_tokens_seen": 161835520, + "router_z_loss_mlp": 0.44262695, + "step": 1947, + "time_per_iteration": 2.669954538345337 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051405, + "balance_loss_mlp": 1.00717854, + "epoch": 0.37475952289342057, + "flos": 682614096384.0, + "grad_norm": 0.03500222096290466, + "language_loss": 0.84308642, + "learning_rate": 0.0007193846717824912, + "loss": 0.85360044, + "num_input_tokens_seen": 161912000, + "router_z_loss_mlp": 0.44311523, + "step": 1948, + "time_per_iteration": 2.873595714569092 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054386, + "balance_loss_mlp": 1.01018322, + "epoch": 0.3749519045786841, + "flos": 461216307456.0, + "grad_norm": 0.03758393676626501, + "language_loss": 0.89286113, + "learning_rate": 0.0007191046773806669, + "loss": 0.90340507, + "num_input_tokens_seen": 161977296, + "router_z_loss_mlp": 0.44287109, + "step": 1949, + "time_per_iteration": 2.5632805824279785 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052491, + "balance_loss_mlp": 1.00816894, + "epoch": 0.3751442862639477, + "flos": 956388013824.0, + "grad_norm": 0.04355990755149793, + "language_loss": 0.83803475, + "learning_rate": 0.0007188245979148631, + "loss": 0.84855968, + "num_input_tokens_seen": 162051888, + "router_z_loss_mlp": 0.4440918, + "step": 1950, + "time_per_iteration": 3.153048515319824 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050564, + "balance_loss_mlp": 1.00619411, + "epoch": 0.3753366679492112, + "flos": 528806307840.0, + "grad_norm": 0.034134677221205334, + "language_loss": 0.88437903, + "learning_rate": 0.0007185444334938157, + "loss": 0.89488459, + "num_input_tokens_seen": 162124384, + "router_z_loss_mlp": 0.44458008, + "step": 1951, + "time_per_iteration": 2.77795147895813 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052982, + "balance_loss_mlp": 1.0084213, + "epoch": 0.3755290496344748, + "flos": 522849460224.0, + "grad_norm": 0.03641649118573359, + "language_loss": 0.85489821, + "learning_rate": 0.0007182641842262947, + "loss": 0.86542803, + "num_input_tokens_seen": 162191440, + "router_z_loss_mlp": 0.44628906, + "step": 1952, + "time_per_iteration": 2.6038033962249756 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063178, + "balance_loss_mlp": 1.01852179, + "epoch": 0.37572143131973834, + "flos": 622372945920.0, + "grad_norm": 0.036303705105214745, + "language_loss": 0.78406018, + "learning_rate": 0.0007179838502211022, + "loss": 0.79469192, + "num_input_tokens_seen": 162268480, + "router_z_loss_mlp": 0.44702148, + "step": 1953, + "time_per_iteration": 2.8537991046905518 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050235, + "balance_loss_mlp": 1.00565112, + "epoch": 0.37591381300500193, + "flos": 772274957568.0, + "grad_norm": 0.033405608161133214, + "language_loss": 0.87193865, + "learning_rate": 0.0007177034315870738, + "loss": 0.88244104, + "num_input_tokens_seen": 162346752, + "router_z_loss_mlp": 0.44677734, + "step": 1954, + "time_per_iteration": 2.9944725036621094 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049324, + "balance_loss_mlp": 1.00469208, + "epoch": 0.37610619469026546, + "flos": 521481757440.0, + "grad_norm": 0.05036646851246907, + "language_loss": 0.91552407, + "learning_rate": 0.0007174229284330773, + "loss": 0.92601728, + "num_input_tokens_seen": 162415120, + "router_z_loss_mlp": 0.44702148, + "step": 1955, + "time_per_iteration": 2.607128143310547 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046853, + "balance_loss_mlp": 1.0023644, + "epoch": 0.37629857637552905, + "flos": 599971584000.0, + "grad_norm": 0.029911324472659546, + "language_loss": 0.87468076, + "learning_rate": 0.0007171423408680141, + "loss": 0.88514924, + "num_input_tokens_seen": 162493280, + "router_z_loss_mlp": 0.44555664, + "step": 1956, + "time_per_iteration": 2.8234241008758545 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047093, + "balance_loss_mlp": 1.00272334, + "epoch": 0.37649095806079264, + "flos": 566019108864.0, + "grad_norm": 0.03303955535560464, + "language_loss": 0.90624022, + "learning_rate": 0.0007168616690008176, + "loss": 0.91671115, + "num_input_tokens_seen": 162560736, + "router_z_loss_mlp": 0.44458008, + "step": 1957, + "time_per_iteration": 2.645219326019287 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047066, + "balance_loss_mlp": 1.00271976, + "epoch": 0.37668333974605617, + "flos": 593569529088.0, + "grad_norm": 0.03512927569377508, + "language_loss": 0.86650079, + "learning_rate": 0.0007165809129404545, + "loss": 0.87697142, + "num_input_tokens_seen": 162630688, + "router_z_loss_mlp": 0.44433594, + "step": 1958, + "time_per_iteration": 2.762319564819336 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105165, + "balance_loss_mlp": 1.00742376, + "epoch": 0.37687572143131975, + "flos": 420365047296.0, + "grad_norm": 0.03381206580119959, + "language_loss": 0.8673501, + "learning_rate": 0.0007163000727959239, + "loss": 0.87786663, + "num_input_tokens_seen": 162694304, + "router_z_loss_mlp": 0.44311523, + "step": 1959, + "time_per_iteration": 2.4887454509735107 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047108, + "balance_loss_mlp": 1.00466919, + "epoch": 0.3770681031165833, + "flos": 1360387269888.0, + "grad_norm": 0.007286715675134549, + "language_loss": 0.77959073, + "learning_rate": 0.0007160191486762575, + "loss": 0.79006183, + "num_input_tokens_seen": 162920336, + "router_z_loss_mlp": 0.42480469, + "step": 1960, + "time_per_iteration": 4.844388961791992 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053466, + "balance_loss_mlp": 1.00938201, + "epoch": 0.3772604848018469, + "flos": 646154649600.0, + "grad_norm": 0.030030705089392724, + "language_loss": 0.85244703, + "learning_rate": 0.00071573814069052, + "loss": 0.86298174, + "num_input_tokens_seen": 163000720, + "router_z_loss_mlp": 0.44165039, + "step": 1961, + "time_per_iteration": 2.93870210647583 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043853, + "balance_loss_mlp": 0.99976981, + "epoch": 0.3774528664871104, + "flos": 903202150656.0, + "grad_norm": 0.029467737659617427, + "language_loss": 0.88618672, + "learning_rate": 0.0007154570489478081, + "loss": 0.89662528, + "num_input_tokens_seen": 163085680, + "router_z_loss_mlp": 0.44165039, + "step": 1962, + "time_per_iteration": 3.2101829051971436 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046572, + "balance_loss_mlp": 1.00241697, + "epoch": 0.377645248172374, + "flos": 789464077824.0, + "grad_norm": 0.02894999631439154, + "language_loss": 0.87102842, + "learning_rate": 0.0007151758735572514, + "loss": 0.88149416, + "num_input_tokens_seen": 163162224, + "router_z_loss_mlp": 0.44238281, + "step": 1963, + "time_per_iteration": 3.0217864513397217 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046842, + "balance_loss_mlp": 1.00282979, + "epoch": 0.3778376298576376, + "flos": 587925686016.0, + "grad_norm": 0.035422959183698866, + "language_loss": 0.81287247, + "learning_rate": 0.0007148946146280119, + "loss": 0.82334089, + "num_input_tokens_seen": 163237920, + "router_z_loss_mlp": 0.44091797, + "step": 1964, + "time_per_iteration": 2.9066553115844727 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01056515, + "balance_loss_mlp": 1.01407623, + "epoch": 0.3780300115429011, + "flos": 1399672528896.0, + "grad_norm": 0.012885740561533653, + "language_loss": 0.72192144, + "learning_rate": 0.000714613272269284, + "loss": 0.73248661, + "num_input_tokens_seen": 163455760, + "router_z_loss_mlp": 0.42480469, + "step": 1965, + "time_per_iteration": 4.874085426330566 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055702, + "balance_loss_mlp": 1.01316833, + "epoch": 0.3782223932281647, + "flos": 1360634178816.0, + "grad_norm": 0.008484298942656315, + "language_loss": 0.75341946, + "learning_rate": 0.0007143318465902943, + "loss": 0.76397645, + "num_input_tokens_seen": 163678064, + "router_z_loss_mlp": 0.42578125, + "step": 1966, + "time_per_iteration": 4.964066743850708 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048666, + "balance_loss_mlp": 1.00467777, + "epoch": 0.37841477491342823, + "flos": 705517046016.0, + "grad_norm": 0.02737284959483133, + "language_loss": 0.8436377, + "learning_rate": 0.0007140503377003022, + "loss": 0.85412437, + "num_input_tokens_seen": 163764320, + "router_z_loss_mlp": 0.44067383, + "step": 1967, + "time_per_iteration": 3.014033555984497 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105175, + "balance_loss_mlp": 1.00764298, + "epoch": 0.3786071565986918, + "flos": 530156514048.0, + "grad_norm": 0.03014770490429956, + "language_loss": 0.85294402, + "learning_rate": 0.000713768745708599, + "loss": 0.86346149, + "num_input_tokens_seen": 163831808, + "router_z_loss_mlp": 0.44189453, + "step": 1968, + "time_per_iteration": 2.6359875202178955 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052555, + "balance_loss_mlp": 1.0084002, + "epoch": 0.37879953828395535, + "flos": 994901443584.0, + "grad_norm": 0.03323886334735767, + "language_loss": 0.78270096, + "learning_rate": 0.0007134870707245085, + "loss": 0.79322648, + "num_input_tokens_seen": 163918128, + "router_z_loss_mlp": 0.44238281, + "step": 1969, + "time_per_iteration": 3.276670455932617 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054281, + "balance_loss_mlp": 1.01010239, + "epoch": 0.37899191996921894, + "flos": 627793212672.0, + "grad_norm": 0.033324026165203316, + "language_loss": 0.84867144, + "learning_rate": 0.0007132053128573864, + "loss": 0.85921425, + "num_input_tokens_seen": 163987552, + "router_z_loss_mlp": 0.44262695, + "step": 1970, + "time_per_iteration": 2.747647523880005 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051473, + "balance_loss_mlp": 1.00727034, + "epoch": 0.37918430165448247, + "flos": 687520136448.0, + "grad_norm": 0.034311044198206936, + "language_loss": 0.84702653, + "learning_rate": 0.0007129234722166211, + "loss": 0.85754126, + "num_input_tokens_seen": 164063248, + "router_z_loss_mlp": 0.44287109, + "step": 1971, + "time_per_iteration": 2.8502755165100098 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104953, + "balance_loss_mlp": 1.00535131, + "epoch": 0.37937668333974606, + "flos": 476618762496.0, + "grad_norm": 0.028798969169212138, + "language_loss": 0.91637433, + "learning_rate": 0.0007126415489116328, + "loss": 0.92686969, + "num_input_tokens_seen": 164133776, + "router_z_loss_mlp": 0.44262695, + "step": 1972, + "time_per_iteration": 2.703598737716675 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049775, + "balance_loss_mlp": 1.00559556, + "epoch": 0.37956906502500964, + "flos": 708825004032.0, + "grad_norm": 0.033945121596029554, + "language_loss": 0.81780016, + "learning_rate": 0.0007123595430518736, + "loss": 0.82829797, + "num_input_tokens_seen": 164206672, + "router_z_loss_mlp": 0.44262695, + "step": 1973, + "time_per_iteration": 2.859210252761841 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047681, + "balance_loss_mlp": 1.00345445, + "epoch": 0.3797614467102732, + "flos": 427559340288.0, + "grad_norm": 0.03504063937858188, + "language_loss": 0.86830699, + "learning_rate": 0.0007120774547468282, + "loss": 0.87878382, + "num_input_tokens_seen": 164271968, + "router_z_loss_mlp": 0.44311523, + "step": 1974, + "time_per_iteration": 2.5465054512023926 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105377, + "balance_loss_mlp": 1.00944817, + "epoch": 0.37995382839553676, + "flos": 482881811712.0, + "grad_norm": 0.031503790568027705, + "language_loss": 0.82317638, + "learning_rate": 0.0007117952841060128, + "loss": 0.83371413, + "num_input_tokens_seen": 164342800, + "router_z_loss_mlp": 0.4440918, + "step": 1975, + "time_per_iteration": 2.789965867996216 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053612, + "balance_loss_mlp": 1.00924242, + "epoch": 0.3801462100808003, + "flos": 561671036928.0, + "grad_norm": 0.03572346778222672, + "language_loss": 0.84539783, + "learning_rate": 0.0007115130312389756, + "loss": 0.85593396, + "num_input_tokens_seen": 164414928, + "router_z_loss_mlp": 0.44433594, + "step": 1976, + "time_per_iteration": 2.7104804515838623 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046782, + "balance_loss_mlp": 1.00236499, + "epoch": 0.3803385917660639, + "flos": 465888077568.0, + "grad_norm": 0.03508123942848817, + "language_loss": 0.80071044, + "learning_rate": 0.0007112306962552973, + "loss": 0.81117821, + "num_input_tokens_seen": 164483312, + "router_z_loss_mlp": 0.44506836, + "step": 1977, + "time_per_iteration": 2.644700527191162 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053171, + "balance_loss_mlp": 1.00863445, + "epoch": 0.3805309734513274, + "flos": 522905840640.0, + "grad_norm": 0.0297417361696937, + "language_loss": 0.8625899, + "learning_rate": 0.0007109482792645896, + "loss": 0.87312162, + "num_input_tokens_seen": 164555760, + "router_z_loss_mlp": 0.44580078, + "step": 1978, + "time_per_iteration": 2.736924171447754 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052388, + "balance_loss_mlp": 1.00780404, + "epoch": 0.380723355136591, + "flos": 592553714688.0, + "grad_norm": 0.03207088172149068, + "language_loss": 0.84620887, + "learning_rate": 0.0007106657803764969, + "loss": 0.85673285, + "num_input_tokens_seen": 164626768, + "router_z_loss_mlp": 0.44628906, + "step": 1979, + "time_per_iteration": 2.797027111053467 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053174, + "balance_loss_mlp": 1.00851822, + "epoch": 0.38091573682185453, + "flos": 623855354880.0, + "grad_norm": 0.034228405400289826, + "language_loss": 0.82734859, + "learning_rate": 0.0007103831997006948, + "loss": 0.83788031, + "num_input_tokens_seen": 164698016, + "router_z_loss_mlp": 0.4465332, + "step": 1980, + "time_per_iteration": 2.774831771850586 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050596, + "balance_loss_mlp": 1.00601208, + "epoch": 0.3811081185071181, + "flos": 570176652288.0, + "grad_norm": 0.02916230611543443, + "language_loss": 0.85986841, + "learning_rate": 0.0007101005373468908, + "loss": 0.87037432, + "num_input_tokens_seen": 164780320, + "router_z_loss_mlp": 0.4465332, + "step": 1981, + "time_per_iteration": 2.889430284500122 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051202, + "balance_loss_mlp": 1.00647449, + "epoch": 0.3813005001923817, + "flos": 585991266816.0, + "grad_norm": 0.029260882769569122, + "language_loss": 0.87282979, + "learning_rate": 0.0007098177934248242, + "loss": 0.88334191, + "num_input_tokens_seen": 164854400, + "router_z_loss_mlp": 0.44726562, + "step": 1982, + "time_per_iteration": 2.734011173248291 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049702, + "balance_loss_mlp": 1.00509369, + "epoch": 0.38149288187764524, + "flos": 622811350272.0, + "grad_norm": 0.03279838714755621, + "language_loss": 0.86164075, + "learning_rate": 0.0007095349680442661, + "loss": 0.87213778, + "num_input_tokens_seen": 164932896, + "router_z_loss_mlp": 0.44677734, + "step": 1983, + "time_per_iteration": 2.8532214164733887 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049432, + "balance_loss_mlp": 1.00496709, + "epoch": 0.3816852635629088, + "flos": 571798066944.0, + "grad_norm": 0.03407469020321441, + "language_loss": 0.79342288, + "learning_rate": 0.0007092520613150188, + "loss": 0.80391723, + "num_input_tokens_seen": 165002896, + "router_z_loss_mlp": 0.4453125, + "step": 1984, + "time_per_iteration": 2.6656527519226074 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055001, + "balance_loss_mlp": 1.01058352, + "epoch": 0.38187764524817236, + "flos": 566679144192.0, + "grad_norm": 0.03287674379309895, + "language_loss": 0.81891948, + "learning_rate": 0.0007089690733469165, + "loss": 0.82946956, + "num_input_tokens_seen": 165074704, + "router_z_loss_mlp": 0.44506836, + "step": 1985, + "time_per_iteration": 2.6921868324279785 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104986, + "balance_loss_mlp": 1.00544298, + "epoch": 0.38207002693343595, + "flos": 632399854080.0, + "grad_norm": 0.03591516825864857, + "language_loss": 0.8265506, + "learning_rate": 0.000708686004249825, + "loss": 0.83704919, + "num_input_tokens_seen": 165149136, + "router_z_loss_mlp": 0.44506836, + "step": 1986, + "time_per_iteration": 2.771472454071045 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046713, + "balance_loss_mlp": 1.0026772, + "epoch": 0.3822624086186995, + "flos": 549841912320.0, + "grad_norm": 0.027805852633017242, + "language_loss": 0.91746366, + "learning_rate": 0.0007084028541336413, + "loss": 0.92793083, + "num_input_tokens_seen": 165220864, + "router_z_loss_mlp": 0.44116211, + "step": 1987, + "time_per_iteration": 2.7168381214141846 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049881, + "balance_loss_mlp": 1.00572634, + "epoch": 0.38245479030396307, + "flos": 615067837440.0, + "grad_norm": 0.03052630202850825, + "language_loss": 0.86906445, + "learning_rate": 0.0007081196231082942, + "loss": 0.87956333, + "num_input_tokens_seen": 165301568, + "router_z_loss_mlp": 0.44238281, + "step": 1988, + "time_per_iteration": 2.8021280765533447 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104727, + "balance_loss_mlp": 1.00325835, + "epoch": 0.38264717198922665, + "flos": 669304508160.0, + "grad_norm": 0.03253134732635267, + "language_loss": 0.8090933, + "learning_rate": 0.0007078363112837436, + "loss": 0.81956601, + "num_input_tokens_seen": 165373152, + "router_z_loss_mlp": 0.44091797, + "step": 1989, + "time_per_iteration": 2.812901020050049 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046216, + "balance_loss_mlp": 1.00232375, + "epoch": 0.3828395536744902, + "flos": 455687170560.0, + "grad_norm": 0.03353740504071411, + "language_loss": 0.8610149, + "learning_rate": 0.000707552918769981, + "loss": 0.87147707, + "num_input_tokens_seen": 165439136, + "router_z_loss_mlp": 0.43969727, + "step": 1990, + "time_per_iteration": 2.503817081451416 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047528, + "balance_loss_mlp": 1.0038017, + "epoch": 0.3830319353597538, + "flos": 500483091456.0, + "grad_norm": 0.030831133245435974, + "language_loss": 0.84298265, + "learning_rate": 0.000707269445677029, + "loss": 0.85345787, + "num_input_tokens_seen": 165514624, + "router_z_loss_mlp": 0.43798828, + "step": 1991, + "time_per_iteration": 2.77250599861145 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047533, + "balance_loss_mlp": 1.00373507, + "epoch": 0.3832243170450173, + "flos": 745467197952.0, + "grad_norm": 0.03142895241328533, + "language_loss": 0.85860848, + "learning_rate": 0.0007069858921149416, + "loss": 0.86908376, + "num_input_tokens_seen": 165594512, + "router_z_loss_mlp": 0.4387207, + "step": 1992, + "time_per_iteration": 3.001058578491211 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047239, + "balance_loss_mlp": 1.00363255, + "epoch": 0.3834166987302809, + "flos": 579346193664.0, + "grad_norm": 0.027707623231004064, + "language_loss": 0.86360574, + "learning_rate": 0.0007067022581938043, + "loss": 0.87407815, + "num_input_tokens_seen": 165673968, + "router_z_loss_mlp": 0.43676758, + "step": 1993, + "time_per_iteration": 2.896017551422119 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049302, + "balance_loss_mlp": 1.00579047, + "epoch": 0.3836090804155444, + "flos": 537609376512.0, + "grad_norm": 0.038344647976828676, + "language_loss": 0.83944476, + "learning_rate": 0.0007064185440237334, + "loss": 0.8499378, + "num_input_tokens_seen": 165747664, + "router_z_loss_mlp": 0.43579102, + "step": 1994, + "time_per_iteration": 2.8133461475372314 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051736, + "balance_loss_mlp": 1.00820076, + "epoch": 0.383801462100808, + "flos": 603052075008.0, + "grad_norm": 0.0304270283066245, + "language_loss": 0.85033917, + "learning_rate": 0.0007061347497148764, + "loss": 0.86085653, + "num_input_tokens_seen": 165824624, + "router_z_loss_mlp": 0.43603516, + "step": 1995, + "time_per_iteration": 2.829977035522461 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050646, + "balance_loss_mlp": 1.00694358, + "epoch": 0.38399384378607154, + "flos": 573799560192.0, + "grad_norm": 0.034646706108572276, + "language_loss": 0.86866224, + "learning_rate": 0.0007058508753774122, + "loss": 0.87916863, + "num_input_tokens_seen": 165896304, + "router_z_loss_mlp": 0.43774414, + "step": 1996, + "time_per_iteration": 2.684966564178467 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049901, + "balance_loss_mlp": 1.00629473, + "epoch": 0.38418622547133513, + "flos": 537780463104.0, + "grad_norm": 0.03333459391135046, + "language_loss": 0.87270373, + "learning_rate": 0.0007055669211215505, + "loss": 0.88320273, + "num_input_tokens_seen": 165961312, + "router_z_loss_mlp": 0.43676758, + "step": 1997, + "time_per_iteration": 2.623508930206299 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054497, + "balance_loss_mlp": 1.01079535, + "epoch": 0.3843786071565987, + "flos": 574014388224.0, + "grad_norm": 0.04127067736406929, + "language_loss": 0.78599155, + "learning_rate": 0.0007052828870575322, + "loss": 0.79653656, + "num_input_tokens_seen": 166028064, + "router_z_loss_mlp": 0.43774414, + "step": 1998, + "time_per_iteration": 2.644423723220825 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051343, + "balance_loss_mlp": 1.00761676, + "epoch": 0.38457098884186225, + "flos": 730080294144.0, + "grad_norm": 0.03146347648703673, + "language_loss": 0.87266672, + "learning_rate": 0.0007049987732956291, + "loss": 0.88318008, + "num_input_tokens_seen": 166110272, + "router_z_loss_mlp": 0.43798828, + "step": 1999, + "time_per_iteration": 2.963409185409546 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048157, + "balance_loss_mlp": 1.00447905, + "epoch": 0.38476337052712584, + "flos": 584621618688.0, + "grad_norm": 0.024706606255084192, + "language_loss": 0.83278054, + "learning_rate": 0.0007047145799461439, + "loss": 0.84326208, + "num_input_tokens_seen": 166193088, + "router_z_loss_mlp": 0.4375, + "step": 2000, + "time_per_iteration": 2.86661434173584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048325, + "balance_loss_mlp": 1.00459874, + "epoch": 0.38495575221238937, + "flos": 554159848704.0, + "grad_norm": 0.03147773281119346, + "language_loss": 0.83074015, + "learning_rate": 0.00070443030711941, + "loss": 0.84122348, + "num_input_tokens_seen": 166271776, + "router_z_loss_mlp": 0.43798828, + "step": 2001, + "time_per_iteration": 2.778719425201416 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045384, + "balance_loss_mlp": 1.00175321, + "epoch": 0.38514813389765296, + "flos": 655678024704.0, + "grad_norm": 0.03168685191580143, + "language_loss": 0.82975376, + "learning_rate": 0.0007041459549257924, + "loss": 0.84020758, + "num_input_tokens_seen": 166350000, + "router_z_loss_mlp": 0.43701172, + "step": 2002, + "time_per_iteration": 2.8597054481506348 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046243, + "balance_loss_mlp": 1.00261223, + "epoch": 0.3853405155829165, + "flos": 869647250688.0, + "grad_norm": 0.03552713767777679, + "language_loss": 0.78954732, + "learning_rate": 0.0007038615234756859, + "loss": 0.80000973, + "num_input_tokens_seen": 166434336, + "router_z_loss_mlp": 0.43701172, + "step": 2003, + "time_per_iteration": 3.167647123336792 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050486, + "balance_loss_mlp": 1.00697505, + "epoch": 0.3855328972681801, + "flos": 547469088768.0, + "grad_norm": 0.03596547507231522, + "language_loss": 0.84374714, + "learning_rate": 0.000703577012879517, + "loss": 0.85425198, + "num_input_tokens_seen": 166503952, + "router_z_loss_mlp": 0.43579102, + "step": 2004, + "time_per_iteration": 2.644718885421753 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047488, + "balance_loss_mlp": 1.00397706, + "epoch": 0.3857252789534436, + "flos": 535099492608.0, + "grad_norm": 0.03525407945169758, + "language_loss": 0.89214581, + "learning_rate": 0.0007032924232477423, + "loss": 0.90262067, + "num_input_tokens_seen": 166575168, + "router_z_loss_mlp": 0.43579102, + "step": 2005, + "time_per_iteration": 2.6340301036834717 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053847, + "balance_loss_mlp": 1.01023984, + "epoch": 0.3859176606387072, + "flos": 492767768832.0, + "grad_norm": 0.0325086763316175, + "language_loss": 0.80829036, + "learning_rate": 0.0007030077546908493, + "loss": 0.81882888, + "num_input_tokens_seen": 166647552, + "router_z_loss_mlp": 0.43676758, + "step": 2006, + "time_per_iteration": 2.6427574157714844 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051659, + "balance_loss_mlp": 1.00969696, + "epoch": 0.3861100423239708, + "flos": 1490158675968.0, + "grad_norm": 0.006099468603868092, + "language_loss": 0.83064663, + "learning_rate": 0.0007027230073193561, + "loss": 0.84116316, + "num_input_tokens_seen": 166875088, + "router_z_loss_mlp": 0.41992188, + "step": 2007, + "time_per_iteration": 4.792185068130493 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047511, + "balance_loss_mlp": 1.00383234, + "epoch": 0.3863024240092343, + "flos": 474693091584.0, + "grad_norm": 0.0379943815396184, + "language_loss": 0.79703128, + "learning_rate": 0.0007024381812438117, + "loss": 0.80750644, + "num_input_tokens_seen": 166939344, + "router_z_loss_mlp": 0.4375, + "step": 2008, + "time_per_iteration": 2.6320388317108154 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01058901, + "balance_loss_mlp": 1.0153178, + "epoch": 0.3864948056944979, + "flos": 717979961088.0, + "grad_norm": 0.04179543058298576, + "language_loss": 0.84345418, + "learning_rate": 0.0007021532765747951, + "loss": 0.85404319, + "num_input_tokens_seen": 167014992, + "router_z_loss_mlp": 0.43652344, + "step": 2009, + "time_per_iteration": 3.0408942699432373 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01057193, + "balance_loss_mlp": 1.01370513, + "epoch": 0.38668718737976143, + "flos": 728955609600.0, + "grad_norm": 0.033678441310908816, + "language_loss": 0.80296206, + "learning_rate": 0.0007018682934229162, + "loss": 0.81353402, + "num_input_tokens_seen": 167092096, + "router_z_loss_mlp": 0.43554688, + "step": 2010, + "time_per_iteration": 2.9119958877563477 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053718, + "balance_loss_mlp": 1.01025474, + "epoch": 0.386879569065025, + "flos": 526489864704.0, + "grad_norm": 0.031759350944825356, + "language_loss": 0.83489478, + "learning_rate": 0.0007015832318988152, + "loss": 0.84543192, + "num_input_tokens_seen": 167162144, + "router_z_loss_mlp": 0.43530273, + "step": 2011, + "time_per_iteration": 2.625828981399536 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048202, + "balance_loss_mlp": 1.00643158, + "epoch": 0.38707195075028855, + "flos": 1530727067136.0, + "grad_norm": 0.008010138125144308, + "language_loss": 0.73890078, + "learning_rate": 0.000701298092113163, + "loss": 0.74938273, + "num_input_tokens_seen": 167391536, + "router_z_loss_mlp": 0.41796875, + "step": 2012, + "time_per_iteration": 4.969848155975342 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049038, + "balance_loss_mlp": 1.00555038, + "epoch": 0.38726433243555214, + "flos": 558386411520.0, + "grad_norm": 0.029387859415775444, + "language_loss": 0.84841448, + "learning_rate": 0.0007010128741766604, + "loss": 0.85890484, + "num_input_tokens_seen": 167466000, + "router_z_loss_mlp": 0.43554688, + "step": 2013, + "time_per_iteration": 2.808583974838257 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045643, + "balance_loss_mlp": 1.00205982, + "epoch": 0.38745671412081567, + "flos": 554756700672.0, + "grad_norm": 0.037665143906504196, + "language_loss": 0.84820414, + "learning_rate": 0.0007007275782000391, + "loss": 0.85866058, + "num_input_tokens_seen": 167536144, + "router_z_loss_mlp": 0.43652344, + "step": 2014, + "time_per_iteration": 2.6201975345611572 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051317, + "balance_loss_mlp": 1.00775766, + "epoch": 0.38764909580607926, + "flos": 459345071616.0, + "grad_norm": 0.03590133597746071, + "language_loss": 0.85486585, + "learning_rate": 0.0007004422042940605, + "loss": 0.86537898, + "num_input_tokens_seen": 167600064, + "router_z_loss_mlp": 0.4362793, + "step": 2015, + "time_per_iteration": 2.5167059898376465 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051464, + "balance_loss_mlp": 1.00792837, + "epoch": 0.38784147749134285, + "flos": 523259674368.0, + "grad_norm": 0.036833384765870066, + "language_loss": 0.90223992, + "learning_rate": 0.0007001567525695169, + "loss": 0.9127546, + "num_input_tokens_seen": 167666576, + "router_z_loss_mlp": 0.43603516, + "step": 2016, + "time_per_iteration": 2.663416624069214 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042557, + "balance_loss_mlp": 0.99923599, + "epoch": 0.3880338591766064, + "flos": 667401191424.0, + "grad_norm": 0.027528515382714943, + "language_loss": 0.84397906, + "learning_rate": 0.0006998712231372303, + "loss": 0.85440457, + "num_input_tokens_seen": 167753296, + "router_z_loss_mlp": 0.43383789, + "step": 2017, + "time_per_iteration": 2.982222080230713 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047147, + "balance_loss_mlp": 1.00389743, + "epoch": 0.38822624086186996, + "flos": 595176359424.0, + "grad_norm": 0.028816590459513517, + "language_loss": 0.86776507, + "learning_rate": 0.0006995856161080532, + "loss": 0.87823659, + "num_input_tokens_seen": 167834080, + "router_z_loss_mlp": 0.43310547, + "step": 2018, + "time_per_iteration": 2.8449933528900146 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046566, + "balance_loss_mlp": 1.00300694, + "epoch": 0.3884186225471335, + "flos": 613682638080.0, + "grad_norm": 0.032032500930829794, + "language_loss": 0.82425624, + "learning_rate": 0.0006992999315928679, + "loss": 0.83472192, + "num_input_tokens_seen": 167912368, + "router_z_loss_mlp": 0.4362793, + "step": 2019, + "time_per_iteration": 2.803743362426758 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104734, + "balance_loss_mlp": 1.00401926, + "epoch": 0.3886110042323971, + "flos": 608244874752.0, + "grad_norm": 0.027721707471257077, + "language_loss": 0.86241317, + "learning_rate": 0.0006990141697025871, + "loss": 0.87288654, + "num_input_tokens_seen": 167991968, + "router_z_loss_mlp": 0.43383789, + "step": 2020, + "time_per_iteration": 2.7804739475250244 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046585, + "balance_loss_mlp": 1.00481415, + "epoch": 0.3888033859176606, + "flos": 1531196573952.0, + "grad_norm": 0.004554603876592686, + "language_loss": 0.76359642, + "learning_rate": 0.0006987283305481533, + "loss": 0.77406228, + "num_input_tokens_seen": 168212128, + "router_z_loss_mlp": 0.41796875, + "step": 2021, + "time_per_iteration": 4.76949667930603 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104808, + "balance_loss_mlp": 1.00478315, + "epoch": 0.3889957676029242, + "flos": 693672370176.0, + "grad_norm": 0.038162906437672096, + "language_loss": 0.8292582, + "learning_rate": 0.0006984424142405392, + "loss": 0.83973902, + "num_input_tokens_seen": 168287440, + "router_z_loss_mlp": 0.43359375, + "step": 2022, + "time_per_iteration": 2.7983930110931396 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049484, + "balance_loss_mlp": 1.00599611, + "epoch": 0.3891881492881878, + "flos": 516195638784.0, + "grad_norm": 0.03974199995652067, + "language_loss": 0.82402384, + "learning_rate": 0.0006981564208907474, + "loss": 0.83451867, + "num_input_tokens_seen": 168354704, + "router_z_loss_mlp": 0.43554688, + "step": 2023, + "time_per_iteration": 2.613600730895996 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050586, + "balance_loss_mlp": 1.00707471, + "epoch": 0.3893805309734513, + "flos": 630176729856.0, + "grad_norm": 0.03303002735023947, + "language_loss": 0.90586042, + "learning_rate": 0.0006978703506098102, + "loss": 0.91636622, + "num_input_tokens_seen": 168424272, + "router_z_loss_mlp": 0.43579102, + "step": 2024, + "time_per_iteration": 2.7258403301239014 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050901, + "balance_loss_mlp": 1.00748503, + "epoch": 0.3895729126587149, + "flos": 545207080704.0, + "grad_norm": 0.0334033578711094, + "language_loss": 0.88520938, + "learning_rate": 0.00069758420350879, + "loss": 0.89571834, + "num_input_tokens_seen": 168488912, + "router_z_loss_mlp": 0.43481445, + "step": 2025, + "time_per_iteration": 2.6406970024108887 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047711, + "balance_loss_mlp": 1.00427127, + "epoch": 0.38976529434397844, + "flos": 619407161088.0, + "grad_norm": 0.03600656764113765, + "language_loss": 0.86979783, + "learning_rate": 0.000697297979698779, + "loss": 0.88027489, + "num_input_tokens_seen": 168563248, + "router_z_loss_mlp": 0.43505859, + "step": 2026, + "time_per_iteration": 2.729025363922119 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046507, + "balance_loss_mlp": 1.00297225, + "epoch": 0.38995767602924203, + "flos": 836346062592.0, + "grad_norm": 0.030634369701250594, + "language_loss": 0.84155977, + "learning_rate": 0.0006970116792908992, + "loss": 0.85202479, + "num_input_tokens_seen": 168648272, + "router_z_loss_mlp": 0.43603516, + "step": 2027, + "time_per_iteration": 3.0780837535858154 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054265, + "balance_loss_mlp": 1.01070547, + "epoch": 0.39015005771450556, + "flos": 542647619328.0, + "grad_norm": 0.03376343400122794, + "language_loss": 0.81809974, + "learning_rate": 0.000696725302396302, + "loss": 0.82864237, + "num_input_tokens_seen": 168721760, + "router_z_loss_mlp": 0.4362793, + "step": 2028, + "time_per_iteration": 2.6632442474365234 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046425, + "balance_loss_mlp": 1.00277102, + "epoch": 0.39034243939976915, + "flos": 1009142275584.0, + "grad_norm": 0.030316104633677343, + "language_loss": 0.86213875, + "learning_rate": 0.0006964388491261692, + "loss": 0.872603, + "num_input_tokens_seen": 168803664, + "router_z_loss_mlp": 0.43725586, + "step": 2029, + "time_per_iteration": 3.2410776615142822 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052185, + "balance_loss_mlp": 1.00848317, + "epoch": 0.3905348210850327, + "flos": 680241272832.0, + "grad_norm": 0.03528753395725821, + "language_loss": 0.88294208, + "learning_rate": 0.0006961523195917114, + "loss": 0.89346391, + "num_input_tokens_seen": 168879184, + "router_z_loss_mlp": 0.43774414, + "step": 2030, + "time_per_iteration": 2.8754475116729736 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104739, + "balance_loss_mlp": 1.00375915, + "epoch": 0.39072720277029627, + "flos": 549989666304.0, + "grad_norm": 0.032806843563698423, + "language_loss": 0.78588331, + "learning_rate": 0.0006958657139041696, + "loss": 0.79635721, + "num_input_tokens_seen": 168957808, + "router_z_loss_mlp": 0.43701172, + "step": 2031, + "time_per_iteration": 2.7329561710357666 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047604, + "balance_loss_mlp": 1.00554657, + "epoch": 0.39091958445555985, + "flos": 1551054025728.0, + "grad_norm": 0.008088132411436895, + "language_loss": 0.76712966, + "learning_rate": 0.0006955790321748136, + "loss": 0.77760577, + "num_input_tokens_seen": 169194416, + "router_z_loss_mlp": 0.42089844, + "step": 2032, + "time_per_iteration": 4.958296298980713 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048975, + "balance_loss_mlp": 1.00529635, + "epoch": 0.3911119661408234, + "flos": 505052794368.0, + "grad_norm": 0.03533188094946227, + "language_loss": 0.78901434, + "learning_rate": 0.0006952922745149434, + "loss": 0.7995041, + "num_input_tokens_seen": 169263552, + "router_z_loss_mlp": 0.4375, + "step": 2033, + "time_per_iteration": 2.6192519664764404 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050645, + "balance_loss_mlp": 1.00684798, + "epoch": 0.391304347826087, + "flos": 558330031104.0, + "grad_norm": 0.032114717040763616, + "language_loss": 0.88009661, + "learning_rate": 0.000695005441035888, + "loss": 0.89060307, + "num_input_tokens_seen": 169333696, + "router_z_loss_mlp": 0.4387207, + "step": 2034, + "time_per_iteration": 2.6519060134887695 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045589, + "balance_loss_mlp": 1.00334167, + "epoch": 0.3914967295113505, + "flos": 1502944322304.0, + "grad_norm": 0.004600085335304226, + "language_loss": 0.73723435, + "learning_rate": 0.0006947185318490064, + "loss": 0.7476902, + "num_input_tokens_seen": 169556416, + "router_z_loss_mlp": 0.42285156, + "step": 2035, + "time_per_iteration": 4.875830888748169 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049757, + "balance_loss_mlp": 1.00581694, + "epoch": 0.3916891111966141, + "flos": 708330219264.0, + "grad_norm": 0.02756997110289995, + "language_loss": 0.81809461, + "learning_rate": 0.0006944315470656863, + "loss": 0.82859218, + "num_input_tokens_seen": 169643312, + "router_z_loss_mlp": 0.44018555, + "step": 2036, + "time_per_iteration": 2.9486818313598633 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104941, + "balance_loss_mlp": 1.00537384, + "epoch": 0.3918814928818776, + "flos": 557409480960.0, + "grad_norm": 0.03430912315299504, + "language_loss": 0.91194409, + "learning_rate": 0.000694144486797345, + "loss": 0.92243814, + "num_input_tokens_seen": 169712560, + "router_z_loss_mlp": 0.44116211, + "step": 2037, + "time_per_iteration": 2.661637783050537 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053711, + "balance_loss_mlp": 1.01155853, + "epoch": 0.3920738745671412, + "flos": 1541688131328.0, + "grad_norm": 0.009695617032389551, + "language_loss": 0.79520434, + "learning_rate": 0.0006938573511554296, + "loss": 0.80574143, + "num_input_tokens_seen": 169914912, + "router_z_loss_mlp": 0.421875, + "step": 2038, + "time_per_iteration": 4.676162004470825 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050597, + "balance_loss_mlp": 1.00672829, + "epoch": 0.39226625625240474, + "flos": 499805559552.0, + "grad_norm": 0.03059706599431713, + "language_loss": 0.9011066, + "learning_rate": 0.0006935701402514156, + "loss": 0.91161263, + "num_input_tokens_seen": 169978848, + "router_z_loss_mlp": 0.43945312, + "step": 2039, + "time_per_iteration": 2.5921828746795654 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01040813, + "balance_loss_mlp": 0.99837494, + "epoch": 0.39245863793766833, + "flos": 1350453680640.0, + "grad_norm": 0.0024785612799689367, + "language_loss": 0.73034894, + "learning_rate": 0.0006932828541968083, + "loss": 0.74075705, + "num_input_tokens_seen": 170211488, + "router_z_loss_mlp": 0.42480469, + "step": 2040, + "time_per_iteration": 4.920953273773193 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045626, + "balance_loss_mlp": 1.00180471, + "epoch": 0.3926510196229319, + "flos": 1348115873280.0, + "grad_norm": 0.032003611488688986, + "language_loss": 0.84899294, + "learning_rate": 0.0006929954931031422, + "loss": 0.85944915, + "num_input_tokens_seen": 170298528, + "router_z_loss_mlp": 0.43896484, + "step": 2041, + "time_per_iteration": 3.7454288005828857 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045245, + "balance_loss_mlp": 1.00144792, + "epoch": 0.39284340130819545, + "flos": 500604600576.0, + "grad_norm": 0.027328608847006428, + "language_loss": 0.89267606, + "learning_rate": 0.0006927080570819805, + "loss": 0.9031285, + "num_input_tokens_seen": 170365680, + "router_z_loss_mlp": 0.4387207, + "step": 2042, + "time_per_iteration": 2.6191000938415527 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049281, + "balance_loss_mlp": 1.00565004, + "epoch": 0.39303578299345904, + "flos": 521342751744.0, + "grad_norm": 0.03887631720492337, + "language_loss": 0.81479704, + "learning_rate": 0.0006924205462449161, + "loss": 0.82528985, + "num_input_tokens_seen": 170432224, + "router_z_loss_mlp": 0.43701172, + "step": 2043, + "time_per_iteration": 2.6156415939331055 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048281, + "balance_loss_mlp": 1.00467432, + "epoch": 0.39322816467872257, + "flos": 909539076864.0, + "grad_norm": 0.03230930456366714, + "language_loss": 0.82451463, + "learning_rate": 0.0006921329607035702, + "loss": 0.83499742, + "num_input_tokens_seen": 170517920, + "router_z_loss_mlp": 0.43676758, + "step": 2044, + "time_per_iteration": 3.248239040374756 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050512, + "balance_loss_mlp": 1.0066911, + "epoch": 0.39342054636398616, + "flos": 518642339328.0, + "grad_norm": 0.028076885263619615, + "language_loss": 0.88591248, + "learning_rate": 0.0006918453005695938, + "loss": 0.89641762, + "num_input_tokens_seen": 170589072, + "router_z_loss_mlp": 0.43896484, + "step": 2045, + "time_per_iteration": 2.6417062282562256 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048241, + "balance_loss_mlp": 1.00430059, + "epoch": 0.3936129280492497, + "flos": 549012735744.0, + "grad_norm": 0.027900695924135757, + "language_loss": 0.84910023, + "learning_rate": 0.0006915575659546662, + "loss": 0.85958266, + "num_input_tokens_seen": 170657856, + "router_z_loss_mlp": 0.44018555, + "step": 2046, + "time_per_iteration": 2.6784913539886475 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053966, + "balance_loss_mlp": 1.0100733, + "epoch": 0.3938053097345133, + "flos": 527141151744.0, + "grad_norm": 0.03448231278490725, + "language_loss": 0.81310439, + "learning_rate": 0.0006912697569704959, + "loss": 0.82364404, + "num_input_tokens_seen": 170723696, + "router_z_loss_mlp": 0.43969727, + "step": 2047, + "time_per_iteration": 2.6214752197265625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050494, + "balance_loss_mlp": 1.00679207, + "epoch": 0.39399769141977686, + "flos": 472589531136.0, + "grad_norm": 0.03168334850546869, + "language_loss": 0.87124646, + "learning_rate": 0.0006909818737288205, + "loss": 0.88175148, + "num_input_tokens_seen": 170789536, + "router_z_loss_mlp": 0.43774414, + "step": 2048, + "time_per_iteration": 2.6057982444763184 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051405, + "balance_loss_mlp": 1.00775015, + "epoch": 0.3941900731050404, + "flos": 502727602944.0, + "grad_norm": 0.03501112209435681, + "language_loss": 0.81578481, + "learning_rate": 0.000690693916341406, + "loss": 0.82629883, + "num_input_tokens_seen": 170859232, + "router_z_loss_mlp": 0.43725586, + "step": 2049, + "time_per_iteration": 2.6459243297576904 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052667, + "balance_loss_mlp": 1.00910771, + "epoch": 0.394382454790304, + "flos": 582007722240.0, + "grad_norm": 0.03071224069667877, + "language_loss": 0.83009964, + "learning_rate": 0.0006904058849200475, + "loss": 0.8406263, + "num_input_tokens_seen": 170931568, + "router_z_loss_mlp": 0.4362793, + "step": 2050, + "time_per_iteration": 2.766828775405884 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046395, + "balance_loss_mlp": 1.00243104, + "epoch": 0.3945748364755675, + "flos": 514845432576.0, + "grad_norm": 0.030877215482718844, + "language_loss": 0.85563171, + "learning_rate": 0.0006901177795765683, + "loss": 0.86609566, + "num_input_tokens_seen": 170999856, + "router_z_loss_mlp": 0.44042969, + "step": 2051, + "time_per_iteration": 2.659912109375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051919, + "balance_loss_mlp": 1.00807357, + "epoch": 0.3947672181608311, + "flos": 595058740992.0, + "grad_norm": 0.03343854917241654, + "language_loss": 0.821091, + "learning_rate": 0.0006898296004228213, + "loss": 0.8316102, + "num_input_tokens_seen": 171072320, + "router_z_loss_mlp": 0.43920898, + "step": 2052, + "time_per_iteration": 2.7115862369537354 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046516, + "balance_loss_mlp": 1.00455475, + "epoch": 0.39495959984609463, + "flos": 1551052080384.0, + "grad_norm": 0.003971648916451202, + "language_loss": 0.7812674, + "learning_rate": 0.0006895413475706873, + "loss": 0.79173255, + "num_input_tokens_seen": 171304128, + "router_z_loss_mlp": 0.41992188, + "step": 2053, + "time_per_iteration": 4.894740343093872 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051907, + "balance_loss_mlp": 1.00818145, + "epoch": 0.3951519815313582, + "flos": 497524109568.0, + "grad_norm": 0.03573797234588687, + "language_loss": 0.80267316, + "learning_rate": 0.0006892530211320763, + "loss": 0.81319225, + "num_input_tokens_seen": 171377392, + "router_z_loss_mlp": 0.43798828, + "step": 2054, + "time_per_iteration": 2.767686605453491 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104549, + "balance_loss_mlp": 1.00193131, + "epoch": 0.39534436321662175, + "flos": 532223136000.0, + "grad_norm": 0.03591265467553322, + "language_loss": 0.84680569, + "learning_rate": 0.000688964621218926, + "loss": 0.85726058, + "num_input_tokens_seen": 171447424, + "router_z_loss_mlp": 0.4362793, + "step": 2055, + "time_per_iteration": 2.6054694652557373 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048043, + "balance_loss_mlp": 1.004722, + "epoch": 0.39553674490188534, + "flos": 703725523200.0, + "grad_norm": 0.03424008758122415, + "language_loss": 0.8074584, + "learning_rate": 0.0006886761479432037, + "loss": 0.8179388, + "num_input_tokens_seen": 171519920, + "router_z_loss_mlp": 0.43383789, + "step": 2056, + "time_per_iteration": 2.8390727043151855 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047733, + "balance_loss_mlp": 1.0042696, + "epoch": 0.3957291265871489, + "flos": 410656979712.0, + "grad_norm": 0.03388460034269331, + "language_loss": 0.85256028, + "learning_rate": 0.0006883876014169045, + "loss": 0.86303759, + "num_input_tokens_seen": 171583856, + "router_z_loss_mlp": 0.43530273, + "step": 2057, + "time_per_iteration": 2.554170846939087 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051678, + "balance_loss_mlp": 1.00814319, + "epoch": 0.39592150827241246, + "flos": 619639485696.0, + "grad_norm": 0.03722447028160607, + "language_loss": 0.90694773, + "learning_rate": 0.000688098981752052, + "loss": 0.91746461, + "num_input_tokens_seen": 171656064, + "router_z_loss_mlp": 0.43603516, + "step": 2058, + "time_per_iteration": 2.733053684234619 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049177, + "balance_loss_mlp": 1.00568974, + "epoch": 0.39611388995767605, + "flos": 822721524480.0, + "grad_norm": 0.04279286873756595, + "language_loss": 0.80609208, + "learning_rate": 0.0006878102890606982, + "loss": 0.81658387, + "num_input_tokens_seen": 171738800, + "router_z_loss_mlp": 0.43554688, + "step": 2059, + "time_per_iteration": 3.084789752960205 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047553, + "balance_loss_mlp": 1.00416124, + "epoch": 0.3963062716429396, + "flos": 493214921472.0, + "grad_norm": 0.03961147378322192, + "language_loss": 0.81771576, + "learning_rate": 0.0006875215234549239, + "loss": 0.82819128, + "num_input_tokens_seen": 171803664, + "router_z_loss_mlp": 0.43457031, + "step": 2060, + "time_per_iteration": 2.5823421478271484 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046932, + "balance_loss_mlp": 1.00351596, + "epoch": 0.39649865332820317, + "flos": 585834764544.0, + "grad_norm": 0.03854635921535854, + "language_loss": 0.8654902, + "learning_rate": 0.0006872326850468376, + "loss": 0.87595946, + "num_input_tokens_seen": 171871968, + "router_z_loss_mlp": 0.43481445, + "step": 2061, + "time_per_iteration": 2.705690860748291 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048788, + "balance_loss_mlp": 1.0052762, + "epoch": 0.3966910350134667, + "flos": 459512267520.0, + "grad_norm": 0.037411346592439484, + "language_loss": 0.79843795, + "learning_rate": 0.0006869437739485762, + "loss": 0.80892581, + "num_input_tokens_seen": 171942368, + "router_z_loss_mlp": 0.43579102, + "step": 2062, + "time_per_iteration": 2.5978832244873047 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050469, + "balance_loss_mlp": 1.00710082, + "epoch": 0.3968834166987303, + "flos": 509615694336.0, + "grad_norm": 0.03224635872548594, + "language_loss": 0.93265009, + "learning_rate": 0.0006866547902723053, + "loss": 0.94315481, + "num_input_tokens_seen": 172012336, + "router_z_loss_mlp": 0.43432617, + "step": 2063, + "time_per_iteration": 2.7325148582458496 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048279, + "balance_loss_mlp": 1.00502992, + "epoch": 0.3970757983839938, + "flos": 573743179776.0, + "grad_norm": 0.0353853142482034, + "language_loss": 0.80804694, + "learning_rate": 0.000686365734130218, + "loss": 0.81852973, + "num_input_tokens_seen": 172084640, + "router_z_loss_mlp": 0.43310547, + "step": 2064, + "time_per_iteration": 2.719521999359131 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046826, + "balance_loss_mlp": 1.00350547, + "epoch": 0.3972681800692574, + "flos": 482586303744.0, + "grad_norm": 0.03284702600830507, + "language_loss": 0.8411094, + "learning_rate": 0.000686076605634536, + "loss": 0.8515777, + "num_input_tokens_seen": 172152992, + "router_z_loss_mlp": 0.43383789, + "step": 2065, + "time_per_iteration": 2.6333730220794678 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051782, + "balance_loss_mlp": 1.00822306, + "epoch": 0.397460561754521, + "flos": 488905733376.0, + "grad_norm": 0.0324228687482344, + "language_loss": 0.84781277, + "learning_rate": 0.0006857874048975088, + "loss": 0.85833061, + "num_input_tokens_seen": 172219312, + "router_z_loss_mlp": 0.4362793, + "step": 2066, + "time_per_iteration": 2.5906848907470703 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049991, + "balance_loss_mlp": 1.00659895, + "epoch": 0.3976529434397845, + "flos": 422896318464.0, + "grad_norm": 0.03171433053589848, + "language_loss": 0.8744958, + "learning_rate": 0.0006854981320314142, + "loss": 0.8849957, + "num_input_tokens_seen": 172282112, + "router_z_loss_mlp": 0.43457031, + "step": 2067, + "time_per_iteration": 2.4699788093566895 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045752, + "balance_loss_mlp": 1.00240779, + "epoch": 0.3978453251250481, + "flos": 546622415616.0, + "grad_norm": 0.03563960500295594, + "language_loss": 0.8728829, + "learning_rate": 0.0006852087871485579, + "loss": 0.88334048, + "num_input_tokens_seen": 172347872, + "router_z_loss_mlp": 0.43408203, + "step": 2068, + "time_per_iteration": 2.6414859294891357 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044908, + "balance_loss_mlp": 1.00163472, + "epoch": 0.39803770681031164, + "flos": 652002627072.0, + "grad_norm": 0.03732729296318665, + "language_loss": 0.82978511, + "learning_rate": 0.0006849193703612735, + "loss": 0.84023428, + "num_input_tokens_seen": 172418560, + "router_z_loss_mlp": 0.43334961, + "step": 2069, + "time_per_iteration": 2.791269063949585 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104332, + "balance_loss_mlp": 0.999928, + "epoch": 0.39823008849557523, + "flos": 741427272960.0, + "grad_norm": 0.030595728613543666, + "language_loss": 0.78243995, + "learning_rate": 0.0006846298817819225, + "loss": 0.79287314, + "num_input_tokens_seen": 172497984, + "router_z_loss_mlp": 0.43457031, + "step": 2070, + "time_per_iteration": 2.9561986923217773 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045511, + "balance_loss_mlp": 1.00235701, + "epoch": 0.39842247018083876, + "flos": 385889597184.0, + "grad_norm": 0.036398106493658954, + "language_loss": 0.81909132, + "learning_rate": 0.0006843403215228945, + "loss": 0.82954645, + "num_input_tokens_seen": 172560112, + "router_z_loss_mlp": 0.43212891, + "step": 2071, + "time_per_iteration": 2.4993679523468018 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045222, + "balance_loss_mlp": 1.00218797, + "epoch": 0.39861485186610235, + "flos": 534763155456.0, + "grad_norm": 0.028807086351499752, + "language_loss": 0.8150484, + "learning_rate": 0.0006840506896966065, + "loss": 0.82550067, + "num_input_tokens_seen": 172636192, + "router_z_loss_mlp": 0.4309082, + "step": 2072, + "time_per_iteration": 2.7684881687164307 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049113, + "balance_loss_mlp": 1.00595963, + "epoch": 0.39880723355136594, + "flos": 644413671168.0, + "grad_norm": 0.03625588542647267, + "language_loss": 0.83127856, + "learning_rate": 0.0006837609864155038, + "loss": 0.8417697, + "num_input_tokens_seen": 172715264, + "router_z_loss_mlp": 0.43212891, + "step": 2073, + "time_per_iteration": 2.8514270782470703 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051094, + "balance_loss_mlp": 1.00782108, + "epoch": 0.39899961523662947, + "flos": 516892612608.0, + "grad_norm": 0.031931162968107815, + "language_loss": 0.83936673, + "learning_rate": 0.0006834712117920592, + "loss": 0.84987766, + "num_input_tokens_seen": 172783456, + "router_z_loss_mlp": 0.43334961, + "step": 2074, + "time_per_iteration": 2.6099319458007812 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048921, + "balance_loss_mlp": 1.00583923, + "epoch": 0.39919199692189306, + "flos": 465338857728.0, + "grad_norm": 0.040350277752625376, + "language_loss": 0.86345923, + "learning_rate": 0.0006831813659387729, + "loss": 0.87394845, + "num_input_tokens_seen": 172848928, + "router_z_loss_mlp": 0.43139648, + "step": 2075, + "time_per_iteration": 2.5189003944396973 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047413, + "balance_loss_mlp": 1.00421119, + "epoch": 0.3993843786071566, + "flos": 532679036928.0, + "grad_norm": 0.031639049857806745, + "language_loss": 0.84865057, + "learning_rate": 0.0006828914489681733, + "loss": 0.85912478, + "num_input_tokens_seen": 172921152, + "router_z_loss_mlp": 0.43261719, + "step": 2076, + "time_per_iteration": 2.7052366733551025 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045652, + "balance_loss_mlp": 1.00252223, + "epoch": 0.3995767602924202, + "flos": 505024604160.0, + "grad_norm": 0.02906284980485529, + "language_loss": 0.85967886, + "learning_rate": 0.0006826014609928162, + "loss": 0.87013543, + "num_input_tokens_seen": 172998864, + "router_z_loss_mlp": 0.43188477, + "step": 2077, + "time_per_iteration": 2.7127158641815186 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046635, + "balance_loss_mlp": 1.00514984, + "epoch": 0.3997691419776837, + "flos": 1457473781760.0, + "grad_norm": 0.010869866041652092, + "language_loss": 0.83199388, + "learning_rate": 0.0006823114021252846, + "loss": 0.84246022, + "num_input_tokens_seen": 173219216, + "router_z_loss_mlp": 0.41503906, + "step": 2078, + "time_per_iteration": 4.8602213859558105 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048968, + "balance_loss_mlp": 1.00586236, + "epoch": 0.3999615236629473, + "flos": 531756541440.0, + "grad_norm": 0.03484656463436615, + "language_loss": 0.80513203, + "learning_rate": 0.0006820212724781896, + "loss": 0.81562173, + "num_input_tokens_seen": 173292000, + "router_z_loss_mlp": 0.43164062, + "step": 2079, + "time_per_iteration": 2.6769065856933594 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050357, + "balance_loss_mlp": 1.00732243, + "epoch": 0.4001539053482108, + "flos": 696362088960.0, + "grad_norm": 0.03370335981625205, + "language_loss": 0.84624374, + "learning_rate": 0.0006817310721641694, + "loss": 0.85674727, + "num_input_tokens_seen": 173365568, + "router_z_loss_mlp": 0.4309082, + "step": 2080, + "time_per_iteration": 2.8362321853637695 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049278, + "balance_loss_mlp": 1.00619566, + "epoch": 0.4003462870334744, + "flos": 521379690240.0, + "grad_norm": 0.0372462453928972, + "language_loss": 0.84107649, + "learning_rate": 0.00068144080129589, + "loss": 0.85156924, + "num_input_tokens_seen": 173430144, + "router_z_loss_mlp": 0.43139648, + "step": 2081, + "time_per_iteration": 2.673391342163086 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047044, + "balance_loss_mlp": 1.00400949, + "epoch": 0.400538668718738, + "flos": 493503626496.0, + "grad_norm": 0.03624950820375382, + "language_loss": 0.83452618, + "learning_rate": 0.0006811504599860441, + "loss": 0.84499657, + "num_input_tokens_seen": 173494464, + "router_z_loss_mlp": 0.4309082, + "step": 2082, + "time_per_iteration": 2.5872161388397217 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048557, + "balance_loss_mlp": 1.0056175, + "epoch": 0.40073105040400153, + "flos": 491452555776.0, + "grad_norm": 0.03058886918361784, + "language_loss": 0.86615109, + "learning_rate": 0.0006808600483473526, + "loss": 0.87663668, + "num_input_tokens_seen": 173577168, + "router_z_loss_mlp": 0.42993164, + "step": 2083, + "time_per_iteration": 2.9167916774749756 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044713, + "balance_loss_mlp": 1.00165451, + "epoch": 0.4009234320892651, + "flos": 563540327424.0, + "grad_norm": 0.029579631805043773, + "language_loss": 0.86442864, + "learning_rate": 0.0006805695664925629, + "loss": 0.87487578, + "num_input_tokens_seen": 173655632, + "router_z_loss_mlp": 0.43115234, + "step": 2084, + "time_per_iteration": 2.8129522800445557 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046102, + "balance_loss_mlp": 1.00328159, + "epoch": 0.40111581377452865, + "flos": 426853618176.0, + "grad_norm": 0.03869673141168483, + "language_loss": 0.84653956, + "learning_rate": 0.0006802790145344506, + "loss": 0.85700059, + "num_input_tokens_seen": 173719040, + "router_z_loss_mlp": 0.42871094, + "step": 2085, + "time_per_iteration": 2.4816439151763916 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047549, + "balance_loss_mlp": 1.00480056, + "epoch": 0.40130819545979224, + "flos": 613643754240.0, + "grad_norm": 0.033294901740297575, + "language_loss": 0.87748265, + "learning_rate": 0.0006799883925858176, + "loss": 0.88795811, + "num_input_tokens_seen": 173796704, + "router_z_loss_mlp": 0.42797852, + "step": 2086, + "time_per_iteration": 2.883460760116577 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010467, + "balance_loss_mlp": 1.00397515, + "epoch": 0.40150057714505577, + "flos": 524451432960.0, + "grad_norm": 0.03567087941007639, + "language_loss": 0.85852945, + "learning_rate": 0.0006796977007594933, + "loss": 0.86899644, + "num_input_tokens_seen": 173862352, + "router_z_loss_mlp": 0.42773438, + "step": 2087, + "time_per_iteration": 2.6274635791778564 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049167, + "balance_loss_mlp": 1.00641906, + "epoch": 0.40169295883031936, + "flos": 562554648576.0, + "grad_norm": 0.03237434691106299, + "language_loss": 0.86948609, + "learning_rate": 0.0006794069391683345, + "loss": 0.87997776, + "num_input_tokens_seen": 173935408, + "router_z_loss_mlp": 0.42797852, + "step": 2088, + "time_per_iteration": 2.7452995777130127 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044795, + "balance_loss_mlp": 1.00204611, + "epoch": 0.4018853405155829, + "flos": 520020735744.0, + "grad_norm": 0.03787206100605993, + "language_loss": 0.81785774, + "learning_rate": 0.0006791161079252248, + "loss": 0.82830572, + "num_input_tokens_seen": 174007152, + "router_z_loss_mlp": 0.42797852, + "step": 2089, + "time_per_iteration": 2.7205429077148438 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104968, + "balance_loss_mlp": 1.00683641, + "epoch": 0.4020777222008465, + "flos": 527288905728.0, + "grad_norm": 0.03117280194599123, + "language_loss": 0.83103907, + "learning_rate": 0.0006788252071430747, + "loss": 0.84153581, + "num_input_tokens_seen": 174074976, + "router_z_loss_mlp": 0.42895508, + "step": 2090, + "time_per_iteration": 2.659057378768921 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105285, + "balance_loss_mlp": 1.01000619, + "epoch": 0.40227010388611006, + "flos": 526841753088.0, + "grad_norm": 0.038447003118097976, + "language_loss": 0.86962426, + "learning_rate": 0.0006785342369348222, + "loss": 0.88015276, + "num_input_tokens_seen": 174149392, + "router_z_loss_mlp": 0.42895508, + "step": 2091, + "time_per_iteration": 2.7038679122924805 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046593, + "balance_loss_mlp": 1.00374973, + "epoch": 0.4024624855713736, + "flos": 433227482880.0, + "grad_norm": 0.04129881296644863, + "language_loss": 0.80178273, + "learning_rate": 0.0006782431974134316, + "loss": 0.81224871, + "num_input_tokens_seen": 174214656, + "router_z_loss_mlp": 0.42895508, + "step": 2092, + "time_per_iteration": 2.522822618484497 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044742, + "balance_loss_mlp": 1.00185025, + "epoch": 0.4026548672566372, + "flos": 768092136192.0, + "grad_norm": 0.028161411572745265, + "language_loss": 0.89556634, + "learning_rate": 0.0006779520886918949, + "loss": 0.90601373, + "num_input_tokens_seen": 174296064, + "router_z_loss_mlp": 0.42944336, + "step": 2093, + "time_per_iteration": 3.059269905090332 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051245, + "balance_loss_mlp": 1.00847256, + "epoch": 0.4028472489419007, + "flos": 644118163200.0, + "grad_norm": 0.031871945568835235, + "language_loss": 0.81586826, + "learning_rate": 0.0006776609108832301, + "loss": 0.82638067, + "num_input_tokens_seen": 174370896, + "router_z_loss_mlp": 0.42822266, + "step": 2094, + "time_per_iteration": 2.824986457824707 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050062, + "balance_loss_mlp": 1.00707567, + "epoch": 0.4030396306271643, + "flos": 492824149248.0, + "grad_norm": 0.03027887325873737, + "language_loss": 0.85679066, + "learning_rate": 0.0006773696641004828, + "loss": 0.86729133, + "num_input_tokens_seen": 174438448, + "router_z_loss_mlp": 0.43041992, + "step": 2095, + "time_per_iteration": 2.575521230697632 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050315, + "balance_loss_mlp": 1.00742352, + "epoch": 0.40323201231242783, + "flos": 903195347712.0, + "grad_norm": 0.03549236004367387, + "language_loss": 0.78398442, + "learning_rate": 0.0006770783484567247, + "loss": 0.7944876, + "num_input_tokens_seen": 174525952, + "router_z_loss_mlp": 0.42944336, + "step": 2096, + "time_per_iteration": 3.1476502418518066 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047183, + "balance_loss_mlp": 1.00417244, + "epoch": 0.4034243939976914, + "flos": 571730992896.0, + "grad_norm": 0.04456027219971551, + "language_loss": 0.86790794, + "learning_rate": 0.000676786964065055, + "loss": 0.87837982, + "num_input_tokens_seen": 174607200, + "router_z_loss_mlp": 0.43066406, + "step": 2097, + "time_per_iteration": 2.826936960220337 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049393, + "balance_loss_mlp": 1.00635874, + "epoch": 0.403616775682955, + "flos": 508460874240.0, + "grad_norm": 0.03200015951198879, + "language_loss": 0.79479361, + "learning_rate": 0.0006764955110385986, + "loss": 0.80528748, + "num_input_tokens_seen": 174680976, + "router_z_loss_mlp": 0.4309082, + "step": 2098, + "time_per_iteration": 2.732429027557373 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105173, + "balance_loss_mlp": 1.0086236, + "epoch": 0.40380915736821854, + "flos": 520411507968.0, + "grad_norm": 0.033549102084289066, + "language_loss": 0.81161886, + "learning_rate": 0.0006762039894905083, + "loss": 0.82213616, + "num_input_tokens_seen": 174753152, + "router_z_loss_mlp": 0.43164062, + "step": 2099, + "time_per_iteration": 2.638117790222168 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104845, + "balance_loss_mlp": 1.00524902, + "epoch": 0.40400153905348213, + "flos": 442887918336.0, + "grad_norm": 0.03592642868139018, + "language_loss": 0.80970824, + "learning_rate": 0.000675912399533962, + "loss": 0.82019281, + "num_input_tokens_seen": 174817184, + "router_z_loss_mlp": 0.43261719, + "step": 2100, + "time_per_iteration": 2.58172345161438 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049035, + "balance_loss_mlp": 1.00585735, + "epoch": 0.40419392073874566, + "flos": 773705843712.0, + "grad_norm": 0.032245854328407444, + "language_loss": 0.85358262, + "learning_rate": 0.0006756207412821656, + "loss": 0.86407304, + "num_input_tokens_seen": 174898128, + "router_z_loss_mlp": 0.43237305, + "step": 2101, + "time_per_iteration": 3.0158467292785645 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053218, + "balance_loss_mlp": 1.01006424, + "epoch": 0.40438630242400925, + "flos": 767990068992.0, + "grad_norm": 0.03424537155124627, + "language_loss": 0.81043333, + "learning_rate": 0.0006753290148483505, + "loss": 0.82096547, + "num_input_tokens_seen": 174981872, + "router_z_loss_mlp": 0.43212891, + "step": 2102, + "time_per_iteration": 3.0169148445129395 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050406, + "balance_loss_mlp": 1.0073241, + "epoch": 0.4045786841092728, + "flos": 416129736192.0, + "grad_norm": 0.032341452227877814, + "language_loss": 0.79544723, + "learning_rate": 0.0006750372203457752, + "loss": 0.80595136, + "num_input_tokens_seen": 175044976, + "router_z_loss_mlp": 0.43139648, + "step": 2103, + "time_per_iteration": 2.459439277648926 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104576, + "balance_loss_mlp": 1.00274944, + "epoch": 0.40477106579453637, + "flos": 540309788928.0, + "grad_norm": 0.028365330829485943, + "language_loss": 0.87031502, + "learning_rate": 0.0006747453578877242, + "loss": 0.88077265, + "num_input_tokens_seen": 175121104, + "router_z_loss_mlp": 0.43066406, + "step": 2104, + "time_per_iteration": 2.704583168029785 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047141, + "balance_loss_mlp": 1.00413048, + "epoch": 0.4049634474797999, + "flos": 828092213760.0, + "grad_norm": 0.03564801319951872, + "language_loss": 0.83885705, + "learning_rate": 0.0006744534275875085, + "loss": 0.84932852, + "num_input_tokens_seen": 175194512, + "router_z_loss_mlp": 0.43066406, + "step": 2105, + "time_per_iteration": 3.070952892303467 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049443, + "balance_loss_mlp": 1.00631273, + "epoch": 0.4051558291650635, + "flos": 573753873408.0, + "grad_norm": 0.03321600555114549, + "language_loss": 0.86069483, + "learning_rate": 0.0006741614295584657, + "loss": 0.87118924, + "num_input_tokens_seen": 175264176, + "router_z_loss_mlp": 0.43188477, + "step": 2106, + "time_per_iteration": 2.677860736846924 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051059, + "balance_loss_mlp": 1.00802493, + "epoch": 0.4053482108503271, + "flos": 733245355776.0, + "grad_norm": 0.034313991245887424, + "language_loss": 0.78860825, + "learning_rate": 0.0006738693639139595, + "loss": 0.79911888, + "num_input_tokens_seen": 175347488, + "router_z_loss_mlp": 0.4309082, + "step": 2107, + "time_per_iteration": 3.021329402923584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104746, + "balance_loss_mlp": 1.0043304, + "epoch": 0.4055405925355906, + "flos": 1214950971648.0, + "grad_norm": 0.03202932182515954, + "language_loss": 0.77947468, + "learning_rate": 0.0006735772307673796, + "loss": 0.7899493, + "num_input_tokens_seen": 175438336, + "router_z_loss_mlp": 0.43188477, + "step": 2108, + "time_per_iteration": 3.524618148803711 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104775, + "balance_loss_mlp": 1.00476336, + "epoch": 0.4057329742208542, + "flos": 717108988416.0, + "grad_norm": 0.03284224075250963, + "language_loss": 0.84037805, + "learning_rate": 0.0006732850302321421, + "loss": 0.85085559, + "num_input_tokens_seen": 175510912, + "router_z_loss_mlp": 0.43041992, + "step": 2109, + "time_per_iteration": 2.9528980255126953 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047125, + "balance_loss_mlp": 1.00423336, + "epoch": 0.4059253559061177, + "flos": 565953980160.0, + "grad_norm": 0.033245578967332844, + "language_loss": 0.85031784, + "learning_rate": 0.00067299276242169, + "loss": 0.86078906, + "num_input_tokens_seen": 175583040, + "router_z_loss_mlp": 0.42944336, + "step": 2110, + "time_per_iteration": 2.715207815170288 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046326, + "balance_loss_mlp": 1.00493622, + "epoch": 0.4061177375913813, + "flos": 1597189459200.0, + "grad_norm": 0.00881896921345328, + "language_loss": 0.74382168, + "learning_rate": 0.0006727004274494908, + "loss": 0.75428492, + "num_input_tokens_seen": 175817952, + "router_z_loss_mlp": 0.4140625, + "step": 2111, + "time_per_iteration": 4.921623468399048 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045683, + "balance_loss_mlp": 1.00276768, + "epoch": 0.40631011927664484, + "flos": 616622178048.0, + "grad_norm": 0.03872377126422628, + "language_loss": 0.78301811, + "learning_rate": 0.0006724080254290395, + "loss": 0.79347491, + "num_input_tokens_seen": 175896352, + "router_z_loss_mlp": 0.4296875, + "step": 2112, + "time_per_iteration": 2.7997756004333496 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104896, + "balance_loss_mlp": 1.00606835, + "epoch": 0.40650250096190843, + "flos": 558748993536.0, + "grad_norm": 0.03550284292845091, + "language_loss": 0.90693575, + "learning_rate": 0.0006721155564738566, + "loss": 0.91742539, + "num_input_tokens_seen": 175967152, + "router_z_loss_mlp": 0.42944336, + "step": 2113, + "time_per_iteration": 2.6585686206817627 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01041939, + "balance_loss_mlp": 1.00054932, + "epoch": 0.40669488264717196, + "flos": 1583545479168.0, + "grad_norm": 0.009767435928617773, + "language_loss": 0.78622639, + "learning_rate": 0.0006718230206974884, + "loss": 0.79664576, + "num_input_tokens_seen": 176205248, + "router_z_loss_mlp": 0.4140625, + "step": 2114, + "time_per_iteration": 4.948775053024292 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047453, + "balance_loss_mlp": 1.00460887, + "epoch": 0.40688726433243555, + "flos": 508656260352.0, + "grad_norm": 0.031160170727070474, + "language_loss": 0.86169994, + "learning_rate": 0.0006715304182135078, + "loss": 0.8721745, + "num_input_tokens_seen": 176276208, + "router_z_loss_mlp": 0.42895508, + "step": 2115, + "time_per_iteration": 2.6279850006103516 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047221, + "balance_loss_mlp": 1.00449646, + "epoch": 0.40707964601769914, + "flos": 590352944640.0, + "grad_norm": 0.04782787246513916, + "language_loss": 0.89337373, + "learning_rate": 0.0006712377491355127, + "loss": 0.90384591, + "num_input_tokens_seen": 176355072, + "router_z_loss_mlp": 0.42773438, + "step": 2116, + "time_per_iteration": 2.863960027694702 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047149, + "balance_loss_mlp": 1.00449598, + "epoch": 0.40727202770296267, + "flos": 581651943168.0, + "grad_norm": 0.026696862883813798, + "language_loss": 0.81451207, + "learning_rate": 0.0006709450135771274, + "loss": 0.8249836, + "num_input_tokens_seen": 176444592, + "router_z_loss_mlp": 0.42700195, + "step": 2117, + "time_per_iteration": 2.94854998588562 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104874, + "balance_loss_mlp": 1.00589585, + "epoch": 0.40746440938822626, + "flos": 505109174784.0, + "grad_norm": 0.029498043522937258, + "language_loss": 0.87031925, + "learning_rate": 0.0006706522116520023, + "loss": 0.88080668, + "num_input_tokens_seen": 176516144, + "router_z_loss_mlp": 0.42895508, + "step": 2118, + "time_per_iteration": 2.6655611991882324 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051527, + "balance_loss_mlp": 1.00880289, + "epoch": 0.4076567910734898, + "flos": 606711921408.0, + "grad_norm": 0.03542644850365937, + "language_loss": 0.83226359, + "learning_rate": 0.0006703593434738127, + "loss": 0.84277886, + "num_input_tokens_seen": 176585712, + "router_z_loss_mlp": 0.42773438, + "step": 2119, + "time_per_iteration": 2.7478883266448975 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049424, + "balance_loss_mlp": 1.00662768, + "epoch": 0.4078491727587534, + "flos": 480519681792.0, + "grad_norm": 0.032767120193604775, + "language_loss": 0.788118, + "learning_rate": 0.0006700664091562604, + "loss": 0.79861224, + "num_input_tokens_seen": 176654736, + "router_z_loss_mlp": 0.4284668, + "step": 2120, + "time_per_iteration": 2.532407760620117 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054491, + "balance_loss_mlp": 1.01167095, + "epoch": 0.4080415544440169, + "flos": 511419856128.0, + "grad_norm": 0.031947051498113735, + "language_loss": 0.85428649, + "learning_rate": 0.0006697734088130725, + "loss": 0.86483139, + "num_input_tokens_seen": 176722800, + "router_z_loss_mlp": 0.42871094, + "step": 2121, + "time_per_iteration": 2.6053290367126465 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051862, + "balance_loss_mlp": 1.00899482, + "epoch": 0.4082339361292805, + "flos": 735928271616.0, + "grad_norm": 0.0331707162631359, + "language_loss": 0.86154819, + "learning_rate": 0.0006694803425580018, + "loss": 0.87206686, + "num_input_tokens_seen": 176800320, + "router_z_loss_mlp": 0.42919922, + "step": 2122, + "time_per_iteration": 2.995340585708618 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051457, + "balance_loss_mlp": 1.00863671, + "epoch": 0.4084263178145441, + "flos": 458405079552.0, + "grad_norm": 0.03582566166827548, + "language_loss": 0.85069245, + "learning_rate": 0.0006691872105048268, + "loss": 0.86120701, + "num_input_tokens_seen": 176867440, + "router_z_loss_mlp": 0.42871094, + "step": 2123, + "time_per_iteration": 2.6434147357940674 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049594, + "balance_loss_mlp": 1.00655949, + "epoch": 0.4086186994998076, + "flos": 564026363904.0, + "grad_norm": 0.030981369506813725, + "language_loss": 0.84940457, + "learning_rate": 0.0006688940127673513, + "loss": 0.85990047, + "num_input_tokens_seen": 176942048, + "router_z_loss_mlp": 0.4309082, + "step": 2124, + "time_per_iteration": 2.677267074584961 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051992, + "balance_loss_mlp": 1.00914872, + "epoch": 0.4088110811850712, + "flos": 574894109184.0, + "grad_norm": 0.03166953679677798, + "language_loss": 0.86061293, + "learning_rate": 0.0006686007494594049, + "loss": 0.87113285, + "num_input_tokens_seen": 177025104, + "router_z_loss_mlp": 0.42895508, + "step": 2125, + "time_per_iteration": 2.806321620941162 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051345, + "balance_loss_mlp": 1.00845325, + "epoch": 0.40900346287033473, + "flos": 457847111424.0, + "grad_norm": 0.04138148105998068, + "language_loss": 0.81154513, + "learning_rate": 0.0006683074206948425, + "loss": 0.82205856, + "num_input_tokens_seen": 177089296, + "router_z_loss_mlp": 0.42944336, + "step": 2126, + "time_per_iteration": 2.5422966480255127 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051657, + "balance_loss_mlp": 1.00878966, + "epoch": 0.4091958445555983, + "flos": 618595481088.0, + "grad_norm": 0.03139043933990307, + "language_loss": 0.81871778, + "learning_rate": 0.0006680140265875443, + "loss": 0.82923436, + "num_input_tokens_seen": 177163648, + "router_z_loss_mlp": 0.42919922, + "step": 2127, + "time_per_iteration": 2.8402438163757324 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047483, + "balance_loss_mlp": 1.0048064, + "epoch": 0.40938822624086185, + "flos": 473371075584.0, + "grad_norm": 0.031125843736347292, + "language_loss": 0.96506268, + "learning_rate": 0.0006677205672514162, + "loss": 0.97553754, + "num_input_tokens_seen": 177233856, + "router_z_loss_mlp": 0.42724609, + "step": 2128, + "time_per_iteration": 2.6291539669036865 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047334, + "balance_loss_mlp": 1.00460982, + "epoch": 0.40958060792612544, + "flos": 571118589696.0, + "grad_norm": 0.02838685720934929, + "language_loss": 0.89474666, + "learning_rate": 0.000667427042800389, + "loss": 0.90522003, + "num_input_tokens_seen": 177309824, + "router_z_loss_mlp": 0.42773438, + "step": 2129, + "time_per_iteration": 2.749999761581421 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047174, + "balance_loss_mlp": 1.00435364, + "epoch": 0.40977298961138897, + "flos": 610471889664.0, + "grad_norm": 0.033304274322438925, + "language_loss": 0.8343153, + "learning_rate": 0.0006671334533484192, + "loss": 0.84478706, + "num_input_tokens_seen": 177380592, + "router_z_loss_mlp": 0.42871094, + "step": 2130, + "time_per_iteration": 2.778238296508789 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049285, + "balance_loss_mlp": 1.00636995, + "epoch": 0.40996537129665256, + "flos": 582873837312.0, + "grad_norm": 0.027360354791446346, + "language_loss": 0.83860981, + "learning_rate": 0.0006668397990094881, + "loss": 0.84910274, + "num_input_tokens_seen": 177454720, + "router_z_loss_mlp": 0.4296875, + "step": 2131, + "time_per_iteration": 2.711257219314575 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104432, + "balance_loss_mlp": 1.00145221, + "epoch": 0.41015775298191615, + "flos": 517554593280.0, + "grad_norm": 0.031461982022778785, + "language_loss": 0.85118818, + "learning_rate": 0.0006665460798976027, + "loss": 0.86163139, + "num_input_tokens_seen": 177528224, + "router_z_loss_mlp": 0.42919922, + "step": 2132, + "time_per_iteration": 2.7143847942352295 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046552, + "balance_loss_mlp": 1.00370777, + "epoch": 0.4103501346671797, + "flos": 511446100992.0, + "grad_norm": 0.02874706903740214, + "language_loss": 0.82064044, + "learning_rate": 0.0006662522961267947, + "loss": 0.83110595, + "num_input_tokens_seen": 177598176, + "router_z_loss_mlp": 0.42895508, + "step": 2133, + "time_per_iteration": 2.683544635772705 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104504, + "balance_loss_mlp": 1.00212467, + "epoch": 0.41054251635244327, + "flos": 550927713024.0, + "grad_norm": 0.027003210560574007, + "language_loss": 0.87900901, + "learning_rate": 0.0006659584478111211, + "loss": 0.88945937, + "num_input_tokens_seen": 177675840, + "router_z_loss_mlp": 0.4296875, + "step": 2134, + "time_per_iteration": 2.781217336654663 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104529, + "balance_loss_mlp": 1.00254142, + "epoch": 0.4107348980377068, + "flos": 841299734784.0, + "grad_norm": 0.03651700728131785, + "language_loss": 0.83066756, + "learning_rate": 0.000665664535064664, + "loss": 0.84112048, + "num_input_tokens_seen": 177751376, + "router_z_loss_mlp": 0.42797852, + "step": 2135, + "time_per_iteration": 3.067751169204712 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104661, + "balance_loss_mlp": 1.00390935, + "epoch": 0.4109272797229704, + "flos": 504764089344.0, + "grad_norm": 0.03160666135819327, + "language_loss": 0.83225, + "learning_rate": 0.0006653705580015303, + "loss": 0.84271616, + "num_input_tokens_seen": 177825264, + "router_z_loss_mlp": 0.42749023, + "step": 2136, + "time_per_iteration": 2.6899030208587646 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048088, + "balance_loss_mlp": 1.00521994, + "epoch": 0.4111196614082339, + "flos": 612024284928.0, + "grad_norm": 0.02957451828286975, + "language_loss": 0.87109792, + "learning_rate": 0.0006650765167358523, + "loss": 0.8815788, + "num_input_tokens_seen": 177901680, + "router_z_loss_mlp": 0.42919922, + "step": 2137, + "time_per_iteration": 2.8179140090942383 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048701, + "balance_loss_mlp": 1.00590456, + "epoch": 0.4113120430934975, + "flos": 454104639744.0, + "grad_norm": 0.033800673848535426, + "language_loss": 0.91012341, + "learning_rate": 0.0006647824113817864, + "loss": 0.92061043, + "num_input_tokens_seen": 177965264, + "router_z_loss_mlp": 0.4284668, + "step": 2138, + "time_per_iteration": 2.5263419151306152 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049295, + "balance_loss_mlp": 1.00635624, + "epoch": 0.41150442477876104, + "flos": 542710802688.0, + "grad_norm": 0.028316546184043286, + "language_loss": 0.818874, + "learning_rate": 0.000664488242053515, + "loss": 0.82936704, + "num_input_tokens_seen": 178039712, + "router_z_loss_mlp": 0.42993164, + "step": 2139, + "time_per_iteration": 2.770169258117676 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046579, + "balance_loss_mlp": 1.0037353, + "epoch": 0.4116968064640246, + "flos": 577392332544.0, + "grad_norm": 0.027329597632332964, + "language_loss": 0.84529692, + "learning_rate": 0.0006641940088652445, + "loss": 0.8557626, + "num_input_tokens_seen": 178114080, + "router_z_loss_mlp": 0.42895508, + "step": 2140, + "time_per_iteration": 2.761660575866699 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046986, + "balance_loss_mlp": 1.00416613, + "epoch": 0.4118891881492882, + "flos": 497150833920.0, + "grad_norm": 0.03165424709394261, + "language_loss": 0.82833397, + "learning_rate": 0.0006638997119312065, + "loss": 0.83880383, + "num_input_tokens_seen": 178188032, + "router_z_loss_mlp": 0.42871094, + "step": 2141, + "time_per_iteration": 2.6978652477264404 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071571, + "balance_loss_mlp": 1.02980042, + "epoch": 0.41208156983455174, + "flos": 1541573425152.0, + "grad_norm": 0.013007961614308571, + "language_loss": 0.75063306, + "learning_rate": 0.0006636053513656568, + "loss": 0.76134878, + "num_input_tokens_seen": 178395328, + "router_z_loss_mlp": 0.41796875, + "step": 2142, + "time_per_iteration": 4.915013551712036 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048885, + "balance_loss_mlp": 1.00611305, + "epoch": 0.41227395151981533, + "flos": 586058340864.0, + "grad_norm": 0.033991757131589403, + "language_loss": 0.85150123, + "learning_rate": 0.000663310927282877, + "loss": 0.86199009, + "num_input_tokens_seen": 178471952, + "router_z_loss_mlp": 0.42822266, + "step": 2143, + "time_per_iteration": 2.7552297115325928 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049148, + "balance_loss_mlp": 1.00635242, + "epoch": 0.41246633320507886, + "flos": 443893039104.0, + "grad_norm": 0.031026250164357557, + "language_loss": 0.8627826, + "learning_rate": 0.000663016439797172, + "loss": 0.87327409, + "num_input_tokens_seen": 178542192, + "router_z_loss_mlp": 0.4284668, + "step": 2144, + "time_per_iteration": 2.627795934677124 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048727, + "balance_loss_mlp": 1.00593042, + "epoch": 0.41265871489034245, + "flos": 581095920384.0, + "grad_norm": 0.032902127624834396, + "language_loss": 0.81700695, + "learning_rate": 0.0006627218890228724, + "loss": 0.82749426, + "num_input_tokens_seen": 178622736, + "router_z_loss_mlp": 0.4284668, + "step": 2145, + "time_per_iteration": 2.7726335525512695 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051442, + "balance_loss_mlp": 1.00852692, + "epoch": 0.412851096575606, + "flos": 762529951488.0, + "grad_norm": 0.03700396426728773, + "language_loss": 0.8427214, + "learning_rate": 0.0006624272750743326, + "loss": 0.85323578, + "num_input_tokens_seen": 178705808, + "router_z_loss_mlp": 0.4296875, + "step": 2146, + "time_per_iteration": 3.047786235809326 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051555, + "balance_loss_mlp": 1.00854468, + "epoch": 0.41304347826086957, + "flos": 556521978624.0, + "grad_norm": 0.0279029176228374, + "language_loss": 0.83148611, + "learning_rate": 0.0006621325980659322, + "loss": 0.84200168, + "num_input_tokens_seen": 178781200, + "router_z_loss_mlp": 0.43066406, + "step": 2147, + "time_per_iteration": 2.7805261611938477 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105086, + "balance_loss_mlp": 1.00796807, + "epoch": 0.41323585994613315, + "flos": 666894746112.0, + "grad_norm": 0.03289726182172815, + "language_loss": 0.82395911, + "learning_rate": 0.000661837858112075, + "loss": 0.83446777, + "num_input_tokens_seen": 178855072, + "router_z_loss_mlp": 0.42944336, + "step": 2148, + "time_per_iteration": 2.8236329555511475 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044478, + "balance_loss_mlp": 1.00153887, + "epoch": 0.4134282416313967, + "flos": 549785531904.0, + "grad_norm": 0.03194652549549522, + "language_loss": 0.89158356, + "learning_rate": 0.0006615430553271888, + "loss": 0.90202832, + "num_input_tokens_seen": 178927936, + "router_z_loss_mlp": 0.42993164, + "step": 2149, + "time_per_iteration": 2.7931926250457764 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043808, + "balance_loss_mlp": 1.00101149, + "epoch": 0.4136206233166603, + "flos": 647513604096.0, + "grad_norm": 0.02946183128139913, + "language_loss": 0.8604427, + "learning_rate": 0.0006612481898257264, + "loss": 0.87088078, + "num_input_tokens_seen": 179007792, + "router_z_loss_mlp": 0.4284668, + "step": 2150, + "time_per_iteration": 2.853116512298584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045658, + "balance_loss_mlp": 1.00279057, + "epoch": 0.4138130050019238, + "flos": 518364327936.0, + "grad_norm": 0.034556300996824205, + "language_loss": 0.85756087, + "learning_rate": 0.000660953261722165, + "loss": 0.86801755, + "num_input_tokens_seen": 179075200, + "router_z_loss_mlp": 0.42919922, + "step": 2151, + "time_per_iteration": 2.5899548530578613 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048523, + "balance_loss_mlp": 1.00575054, + "epoch": 0.4140053866871874, + "flos": 610369822464.0, + "grad_norm": 0.032804683798420206, + "language_loss": 0.83155799, + "learning_rate": 0.0006606582711310055, + "loss": 0.84204322, + "num_input_tokens_seen": 179144448, + "router_z_loss_mlp": 0.42822266, + "step": 2152, + "time_per_iteration": 2.7591912746429443 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045382, + "balance_loss_mlp": 1.00258613, + "epoch": 0.4141977683724509, + "flos": 580846099200.0, + "grad_norm": 0.031179869336458114, + "language_loss": 0.84146237, + "learning_rate": 0.0006603632181667736, + "loss": 0.85191619, + "num_input_tokens_seen": 179215776, + "router_z_loss_mlp": 0.4284668, + "step": 2153, + "time_per_iteration": 2.661051034927368 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045906, + "balance_loss_mlp": 1.00470734, + "epoch": 0.4143901500577145, + "flos": 1310178863616.0, + "grad_norm": 0.005957353398288201, + "language_loss": 0.78943324, + "learning_rate": 0.0006600681029440187, + "loss": 0.79989231, + "num_input_tokens_seen": 179436688, + "router_z_loss_mlp": 0.41210938, + "step": 2154, + "time_per_iteration": 4.908870458602905 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046845, + "balance_loss_mlp": 1.00416827, + "epoch": 0.41458253174297804, + "flos": 461122988544.0, + "grad_norm": 0.03503771604154275, + "language_loss": 0.82412434, + "learning_rate": 0.0006597729255773153, + "loss": 0.83459282, + "num_input_tokens_seen": 179503264, + "router_z_loss_mlp": 0.42724609, + "step": 2155, + "time_per_iteration": 2.51566481590271 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048208, + "balance_loss_mlp": 1.00531614, + "epoch": 0.41477491342824163, + "flos": 554439805440.0, + "grad_norm": 0.033219020360443, + "language_loss": 0.82733047, + "learning_rate": 0.0006594776861812608, + "loss": 0.83781254, + "num_input_tokens_seen": 179574864, + "router_z_loss_mlp": 0.42944336, + "step": 2156, + "time_per_iteration": 2.7139203548431396 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047832, + "balance_loss_mlp": 1.00501156, + "epoch": 0.4149672951135052, + "flos": 699086800896.0, + "grad_norm": 0.029687792529517126, + "language_loss": 0.87240821, + "learning_rate": 0.0006591823848704776, + "loss": 0.88288647, + "num_input_tokens_seen": 179658208, + "router_z_loss_mlp": 0.42871094, + "step": 2157, + "time_per_iteration": 2.950136661529541 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104915, + "balance_loss_mlp": 1.00647271, + "epoch": 0.41515967679876875, + "flos": 566837591808.0, + "grad_norm": 0.02753963183350331, + "language_loss": 0.82045114, + "learning_rate": 0.0006588870217596117, + "loss": 0.83094263, + "num_input_tokens_seen": 179732320, + "router_z_loss_mlp": 0.42724609, + "step": 2158, + "time_per_iteration": 2.742954730987549 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047178, + "balance_loss_mlp": 1.00440574, + "epoch": 0.41535205848403234, + "flos": 502178383104.0, + "grad_norm": 0.03782519840746282, + "language_loss": 0.86309534, + "learning_rate": 0.0006585915969633334, + "loss": 0.8735671, + "num_input_tokens_seen": 179801616, + "router_z_loss_mlp": 0.42822266, + "step": 2159, + "time_per_iteration": 2.6314492225646973 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048815, + "balance_loss_mlp": 1.00599504, + "epoch": 0.41554444016929587, + "flos": 608702721024.0, + "grad_norm": 0.03160589415450587, + "language_loss": 0.8965854, + "learning_rate": 0.0006582961105963366, + "loss": 0.90707356, + "num_input_tokens_seen": 179876112, + "router_z_loss_mlp": 0.42871094, + "step": 2160, + "time_per_iteration": 2.779524564743042 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052466, + "balance_loss_mlp": 1.0094316, + "epoch": 0.41573682185455946, + "flos": 530156514048.0, + "grad_norm": 0.0316987683946157, + "language_loss": 0.78011453, + "learning_rate": 0.0006580005627733395, + "loss": 0.79063922, + "num_input_tokens_seen": 179949936, + "router_z_loss_mlp": 0.4309082, + "step": 2161, + "time_per_iteration": 2.655961275100708 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053795, + "balance_loss_mlp": 1.01095116, + "epoch": 0.415929203539823, + "flos": 506038473216.0, + "grad_norm": 0.030200496407476712, + "language_loss": 0.82344484, + "learning_rate": 0.0006577049536090838, + "loss": 0.83398283, + "num_input_tokens_seen": 180023184, + "router_z_loss_mlp": 0.42895508, + "step": 2162, + "time_per_iteration": 2.734727144241333 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048283, + "balance_loss_mlp": 1.00536776, + "epoch": 0.4161215852250866, + "flos": 583824523008.0, + "grad_norm": 0.03528478058898885, + "language_loss": 0.86106777, + "learning_rate": 0.000657409283218335, + "loss": 0.87155068, + "num_input_tokens_seen": 180091728, + "router_z_loss_mlp": 0.4296875, + "step": 2163, + "time_per_iteration": 2.659733533859253 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051194, + "balance_loss_mlp": 1.00844538, + "epoch": 0.4163139669103501, + "flos": 491760702720.0, + "grad_norm": 0.03176725688202085, + "language_loss": 0.81183624, + "learning_rate": 0.0006571135517158829, + "loss": 0.82234824, + "num_input_tokens_seen": 180162096, + "router_z_loss_mlp": 0.42797852, + "step": 2164, + "time_per_iteration": 2.639364004135132 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104361, + "balance_loss_mlp": 1.00241089, + "epoch": 0.4165063485956137, + "flos": 1291023243264.0, + "grad_norm": 0.009317160244550511, + "language_loss": 0.76764059, + "learning_rate": 0.0006568177592165404, + "loss": 0.77807671, + "num_input_tokens_seen": 180380912, + "router_z_loss_mlp": 0.41210938, + "step": 2165, + "time_per_iteration": 4.755609750747681 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048778, + "balance_loss_mlp": 1.00600576, + "epoch": 0.4166987302808773, + "flos": 496258473984.0, + "grad_norm": 0.03907979296448248, + "language_loss": 0.83556676, + "learning_rate": 0.0006565219058351444, + "loss": 0.84605455, + "num_input_tokens_seen": 180447424, + "router_z_loss_mlp": 0.42822266, + "step": 2166, + "time_per_iteration": 2.549835443496704 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043838, + "balance_loss_mlp": 1.00087476, + "epoch": 0.4168911119661408, + "flos": 465067649280.0, + "grad_norm": 0.0316582334519174, + "language_loss": 0.83126116, + "learning_rate": 0.0006562259916865553, + "loss": 0.8416996, + "num_input_tokens_seen": 180516336, + "router_z_loss_mlp": 0.43017578, + "step": 2167, + "time_per_iteration": 2.577807664871216 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045446, + "balance_loss_mlp": 1.00253069, + "epoch": 0.4170834936514044, + "flos": 537943768320.0, + "grad_norm": 0.03263228805326442, + "language_loss": 0.79910517, + "learning_rate": 0.0006559300168856573, + "loss": 0.8095597, + "num_input_tokens_seen": 180589824, + "router_z_loss_mlp": 0.4296875, + "step": 2168, + "time_per_iteration": 2.716322898864746 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051109, + "balance_loss_mlp": 1.00819373, + "epoch": 0.41727587533666793, + "flos": 551750086656.0, + "grad_norm": 0.029704951266317694, + "language_loss": 0.86753178, + "learning_rate": 0.0006556339815473577, + "loss": 0.87804294, + "num_input_tokens_seen": 180661296, + "router_z_loss_mlp": 0.4296875, + "step": 2169, + "time_per_iteration": 2.627387762069702 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044935, + "balance_loss_mlp": 1.00204313, + "epoch": 0.4174682570219315, + "flos": 632378466816.0, + "grad_norm": 0.03018462927838879, + "language_loss": 0.86615288, + "learning_rate": 0.000655337885786588, + "loss": 0.87660229, + "num_input_tokens_seen": 180744896, + "router_z_loss_mlp": 0.42944336, + "step": 2170, + "time_per_iteration": 2.8836913108825684 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045073, + "balance_loss_mlp": 1.00211012, + "epoch": 0.41766063870719505, + "flos": 520756593408.0, + "grad_norm": 0.03274558076895909, + "language_loss": 0.85911119, + "learning_rate": 0.0006550417297183025, + "loss": 0.86956197, + "num_input_tokens_seen": 180813008, + "router_z_loss_mlp": 0.43017578, + "step": 2171, + "time_per_iteration": 2.6085855960845947 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054021, + "balance_loss_mlp": 1.0111295, + "epoch": 0.41785302039245864, + "flos": 559055195136.0, + "grad_norm": 0.03215226267597247, + "language_loss": 0.82142568, + "learning_rate": 0.0006547455134574793, + "loss": 0.83196592, + "num_input_tokens_seen": 180886480, + "router_z_loss_mlp": 0.42944336, + "step": 2172, + "time_per_iteration": 2.7207438945770264 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048636, + "balance_loss_mlp": 1.0057919, + "epoch": 0.41804540207772223, + "flos": 790028848896.0, + "grad_norm": 0.03152263917705172, + "language_loss": 0.84573895, + "learning_rate": 0.0006544492371191198, + "loss": 0.85622525, + "num_input_tokens_seen": 180973776, + "router_z_loss_mlp": 0.42895508, + "step": 2173, + "time_per_iteration": 3.1091549396514893 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050974, + "balance_loss_mlp": 1.00791526, + "epoch": 0.41823778376298576, + "flos": 905891869440.0, + "grad_norm": 0.03158772894298815, + "language_loss": 0.83616948, + "learning_rate": 0.0006541529008182485, + "loss": 0.84667921, + "num_input_tokens_seen": 181062768, + "router_z_loss_mlp": 0.43115234, + "step": 2174, + "time_per_iteration": 3.1934547424316406 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050617, + "balance_loss_mlp": 1.0074867, + "epoch": 0.41843016544824935, + "flos": 512574676224.0, + "grad_norm": 0.036197783568866736, + "language_loss": 0.87799633, + "learning_rate": 0.0006538565046699136, + "loss": 0.88850248, + "num_input_tokens_seen": 181129872, + "router_z_loss_mlp": 0.43188477, + "step": 2175, + "time_per_iteration": 2.6156668663024902 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043414, + "balance_loss_mlp": 1.00047445, + "epoch": 0.4186225471335129, + "flos": 654290880000.0, + "grad_norm": 0.03486733903162065, + "language_loss": 0.81864989, + "learning_rate": 0.0006535600487891862, + "loss": 0.82908404, + "num_input_tokens_seen": 181208112, + "router_z_loss_mlp": 0.42993164, + "step": 2176, + "time_per_iteration": 2.7715044021606445 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050534, + "balance_loss_mlp": 1.00778568, + "epoch": 0.41881492881877647, + "flos": 570226229760.0, + "grad_norm": 0.03182850960977162, + "language_loss": 0.89874047, + "learning_rate": 0.0006532635332911603, + "loss": 0.90924585, + "num_input_tokens_seen": 181278736, + "router_z_loss_mlp": 0.42797852, + "step": 2177, + "time_per_iteration": 2.714635133743286 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046295, + "balance_loss_mlp": 1.00352275, + "epoch": 0.41900731050404, + "flos": 913485682944.0, + "grad_norm": 0.031061931256926825, + "language_loss": 0.81313407, + "learning_rate": 0.0006529669582909541, + "loss": 0.82359695, + "num_input_tokens_seen": 181362512, + "router_z_loss_mlp": 0.42822266, + "step": 2178, + "time_per_iteration": 3.2592601776123047 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052105, + "balance_loss_mlp": 1.00923753, + "epoch": 0.4191996921893036, + "flos": 536784090624.0, + "grad_norm": 0.03590517964257674, + "language_loss": 0.86468148, + "learning_rate": 0.0006526703239037077, + "loss": 0.87520254, + "num_input_tokens_seen": 181432080, + "router_z_loss_mlp": 0.42919922, + "step": 2179, + "time_per_iteration": 2.6636452674865723 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045539, + "balance_loss_mlp": 1.00259995, + "epoch": 0.4193920738745671, + "flos": 583731204096.0, + "grad_norm": 0.030716470700417473, + "language_loss": 0.86737585, + "learning_rate": 0.0006523736302445851, + "loss": 0.87783122, + "num_input_tokens_seen": 181507296, + "router_z_loss_mlp": 0.42993164, + "step": 2180, + "time_per_iteration": 2.801374673843384 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048274, + "balance_loss_mlp": 1.00535846, + "epoch": 0.4195844555598307, + "flos": 1337802205440.0, + "grad_norm": 0.03692120158624074, + "language_loss": 0.77735525, + "learning_rate": 0.0006520768774287728, + "loss": 0.78783798, + "num_input_tokens_seen": 181599408, + "router_z_loss_mlp": 0.4296875, + "step": 2181, + "time_per_iteration": 3.781163454055786 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048528, + "balance_loss_mlp": 1.00568438, + "epoch": 0.4197768372450943, + "flos": 599997828864.0, + "grad_norm": 0.02986751846873145, + "language_loss": 0.85868645, + "learning_rate": 0.0006517800655714806, + "loss": 0.86917174, + "num_input_tokens_seen": 181674944, + "router_z_loss_mlp": 0.42895508, + "step": 2182, + "time_per_iteration": 2.8340775966644287 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047266, + "balance_loss_mlp": 1.00454116, + "epoch": 0.4199692189303578, + "flos": 736597055232.0, + "grad_norm": 0.031915917751050384, + "language_loss": 0.8544265, + "learning_rate": 0.0006514831947879407, + "loss": 0.86489916, + "num_input_tokens_seen": 181756704, + "router_z_loss_mlp": 0.42773438, + "step": 2183, + "time_per_iteration": 2.943141460418701 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048279, + "balance_loss_mlp": 1.005602, + "epoch": 0.4201616006156214, + "flos": 751663173120.0, + "grad_norm": 0.03318909585917556, + "language_loss": 0.78676963, + "learning_rate": 0.0006511862651934091, + "loss": 0.79725242, + "num_input_tokens_seen": 181837952, + "router_z_loss_mlp": 0.42724609, + "step": 2184, + "time_per_iteration": 3.0779521465301514 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049999, + "balance_loss_mlp": 1.00713122, + "epoch": 0.42035398230088494, + "flos": 548092185600.0, + "grad_norm": 0.030200903128349884, + "language_loss": 0.82675183, + "learning_rate": 0.0006508892769031638, + "loss": 0.83725178, + "num_input_tokens_seen": 181906896, + "router_z_loss_mlp": 0.42919922, + "step": 2185, + "time_per_iteration": 2.6862621307373047 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052696, + "balance_loss_mlp": 1.0098995, + "epoch": 0.42054636398614853, + "flos": 618048206592.0, + "grad_norm": 0.035053166321698394, + "language_loss": 0.87309551, + "learning_rate": 0.000650592230032506, + "loss": 0.88362241, + "num_input_tokens_seen": 181974976, + "router_z_loss_mlp": 0.4284668, + "step": 2186, + "time_per_iteration": 2.7250919342041016 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051072, + "balance_loss_mlp": 1.00813246, + "epoch": 0.42073874567141206, + "flos": 641667571968.0, + "grad_norm": 0.033545410607481084, + "language_loss": 0.85750729, + "learning_rate": 0.0006502951246967595, + "loss": 0.86801797, + "num_input_tokens_seen": 182054704, + "router_z_loss_mlp": 0.42993164, + "step": 2187, + "time_per_iteration": 2.8897902965545654 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051911, + "balance_loss_mlp": 1.00911534, + "epoch": 0.42093112735667565, + "flos": 494823697152.0, + "grad_norm": 0.02963421973388752, + "language_loss": 0.87416923, + "learning_rate": 0.0006499979610112706, + "loss": 0.88468838, + "num_input_tokens_seen": 182129696, + "router_z_loss_mlp": 0.4284668, + "step": 2188, + "time_per_iteration": 2.690762519836426 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044871, + "balance_loss_mlp": 1.00219369, + "epoch": 0.4211235090419392, + "flos": 543437912064.0, + "grad_norm": 0.03405892185917734, + "language_loss": 0.84498167, + "learning_rate": 0.000649700739091409, + "loss": 0.85543036, + "num_input_tokens_seen": 182203792, + "router_z_loss_mlp": 0.42724609, + "step": 2189, + "time_per_iteration": 2.7150561809539795 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050289, + "balance_loss_mlp": 1.00918579, + "epoch": 0.42131589072720277, + "flos": 1535391055872.0, + "grad_norm": 0.006162303642849888, + "language_loss": 0.73836273, + "learning_rate": 0.0006494034590525657, + "loss": 0.7488656, + "num_input_tokens_seen": 182432080, + "router_z_loss_mlp": 0.41113281, + "step": 2190, + "time_per_iteration": 4.829074382781982 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047388, + "balance_loss_mlp": 1.00466371, + "epoch": 0.42150827241246636, + "flos": 567936031488.0, + "grad_norm": 0.029782751851152003, + "language_loss": 0.85824835, + "learning_rate": 0.0006491061210101557, + "loss": 0.8687222, + "num_input_tokens_seen": 182500256, + "router_z_loss_mlp": 0.42773438, + "step": 2191, + "time_per_iteration": 2.7018613815307617 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044672, + "balance_loss_mlp": 1.00197124, + "epoch": 0.4217006540977299, + "flos": 708842500608.0, + "grad_norm": 0.03166528206992478, + "language_loss": 0.84430063, + "learning_rate": 0.0006488087250796157, + "loss": 0.85474735, + "num_input_tokens_seen": 182582912, + "router_z_loss_mlp": 0.42749023, + "step": 2192, + "time_per_iteration": 2.907424211502075 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045117, + "balance_loss_mlp": 1.00236881, + "epoch": 0.4218930357829935, + "flos": 628562118144.0, + "grad_norm": 0.02920565844268777, + "language_loss": 0.82024074, + "learning_rate": 0.0006485112713764049, + "loss": 0.83069193, + "num_input_tokens_seen": 182670304, + "router_z_loss_mlp": 0.42797852, + "step": 2193, + "time_per_iteration": 2.9393887519836426 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047298, + "balance_loss_mlp": 1.00435925, + "epoch": 0.422085417468257, + "flos": 461290184448.0, + "grad_norm": 0.02925244938415649, + "language_loss": 0.84264457, + "learning_rate": 0.0006482137600160051, + "loss": 0.85311759, + "num_input_tokens_seen": 182735024, + "router_z_loss_mlp": 0.42993164, + "step": 2194, + "time_per_iteration": 2.549301862716675 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050742, + "balance_loss_mlp": 1.00780332, + "epoch": 0.4222777991535206, + "flos": 474981796608.0, + "grad_norm": 0.030629871462955913, + "language_loss": 0.85158336, + "learning_rate": 0.0006479161911139206, + "loss": 0.86209077, + "num_input_tokens_seen": 182805024, + "router_z_loss_mlp": 0.42993164, + "step": 2195, + "time_per_iteration": 2.6384336948394775 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105418, + "balance_loss_mlp": 1.01116967, + "epoch": 0.4224701808387841, + "flos": 471844925184.0, + "grad_norm": 0.03651823295441523, + "language_loss": 0.8580153, + "learning_rate": 0.0006476185647856778, + "loss": 0.8685571, + "num_input_tokens_seen": 182871360, + "router_z_loss_mlp": 0.43066406, + "step": 2196, + "time_per_iteration": 2.61171817779541 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050335, + "balance_loss_mlp": 1.00737166, + "epoch": 0.4226625625240477, + "flos": 678823992576.0, + "grad_norm": 0.03269819945270571, + "language_loss": 0.81914455, + "learning_rate": 0.0006473208811468255, + "loss": 0.8296479, + "num_input_tokens_seen": 182952912, + "router_z_loss_mlp": 0.43017578, + "step": 2197, + "time_per_iteration": 2.892245292663574 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049097, + "balance_loss_mlp": 1.00611031, + "epoch": 0.4228549442093113, + "flos": 504559954944.0, + "grad_norm": 0.030930986611316814, + "language_loss": 0.84766257, + "learning_rate": 0.0006470231403129347, + "loss": 0.85815352, + "num_input_tokens_seen": 183022016, + "router_z_loss_mlp": 0.43041992, + "step": 2198, + "time_per_iteration": 2.64943265914917 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104433, + "balance_loss_mlp": 1.00119996, + "epoch": 0.42304732589457483, + "flos": 613075092480.0, + "grad_norm": 0.027263393707605364, + "language_loss": 0.81978631, + "learning_rate": 0.0006467253423995988, + "loss": 0.83022958, + "num_input_tokens_seen": 183101776, + "router_z_loss_mlp": 0.43188477, + "step": 2199, + "time_per_iteration": 2.8850364685058594 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048401, + "balance_loss_mlp": 1.00527155, + "epoch": 0.4232397075798384, + "flos": 516649594368.0, + "grad_norm": 0.03785502815659436, + "language_loss": 0.79452145, + "learning_rate": 0.000646427487522433, + "loss": 0.80500549, + "num_input_tokens_seen": 183171392, + "router_z_loss_mlp": 0.43188477, + "step": 2200, + "time_per_iteration": 2.694916009902954 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050341, + "balance_loss_mlp": 1.00713968, + "epoch": 0.42343208926510195, + "flos": 590934245376.0, + "grad_norm": 0.030735047123199966, + "language_loss": 0.83900952, + "learning_rate": 0.0006461295757970749, + "loss": 0.84951293, + "num_input_tokens_seen": 183253936, + "router_z_loss_mlp": 0.43261719, + "step": 2201, + "time_per_iteration": 2.835726737976074 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046215, + "balance_loss_mlp": 1.00320446, + "epoch": 0.42362447095036554, + "flos": 641819216640.0, + "grad_norm": 0.03465447846020762, + "language_loss": 0.82287079, + "learning_rate": 0.0006458316073391839, + "loss": 0.83333296, + "num_input_tokens_seen": 183333744, + "router_z_loss_mlp": 0.43066406, + "step": 2202, + "time_per_iteration": 2.8503153324127197 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045543, + "balance_loss_mlp": 1.00241327, + "epoch": 0.42381685263562907, + "flos": 513718802688.0, + "grad_norm": 0.030503622319833546, + "language_loss": 0.88278598, + "learning_rate": 0.0006455335822644422, + "loss": 0.89324141, + "num_input_tokens_seen": 183401904, + "router_z_loss_mlp": 0.43188477, + "step": 2203, + "time_per_iteration": 2.6294915676116943 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050025, + "balance_loss_mlp": 1.00689554, + "epoch": 0.42400923432089266, + "flos": 547822922496.0, + "grad_norm": 0.03601428124518316, + "language_loss": 0.78504658, + "learning_rate": 0.0006452355006885527, + "loss": 0.79554689, + "num_input_tokens_seen": 183471312, + "router_z_loss_mlp": 0.43188477, + "step": 2204, + "time_per_iteration": 2.7194669246673584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050575, + "balance_loss_mlp": 1.00756454, + "epoch": 0.4242016160061562, + "flos": 623288638464.0, + "grad_norm": 0.038292152226624715, + "language_loss": 0.88211453, + "learning_rate": 0.0006449373627272412, + "loss": 0.89262021, + "num_input_tokens_seen": 183539184, + "router_z_loss_mlp": 0.43066406, + "step": 2205, + "time_per_iteration": 2.760643243789673 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048111, + "balance_loss_mlp": 1.00495708, + "epoch": 0.4243939976914198, + "flos": 572972328960.0, + "grad_norm": 0.03657249930928273, + "language_loss": 0.83085704, + "learning_rate": 0.0006446391684962553, + "loss": 0.84133816, + "num_input_tokens_seen": 183607504, + "router_z_loss_mlp": 0.43212891, + "step": 2206, + "time_per_iteration": 2.656205892562866 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050624, + "balance_loss_mlp": 1.00766063, + "epoch": 0.42458637937668336, + "flos": 449665194240.0, + "grad_norm": 0.03531472123955245, + "language_loss": 0.83588743, + "learning_rate": 0.000644340918111364, + "loss": 0.84639364, + "num_input_tokens_seen": 183674720, + "router_z_loss_mlp": 0.43017578, + "step": 2207, + "time_per_iteration": 2.563599109649658 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047594, + "balance_loss_mlp": 1.00460744, + "epoch": 0.4247787610619469, + "flos": 436336164096.0, + "grad_norm": 0.035922125926704504, + "language_loss": 0.8567791, + "learning_rate": 0.0006440426116883585, + "loss": 0.86725497, + "num_input_tokens_seen": 183740448, + "router_z_loss_mlp": 0.43041992, + "step": 2208, + "time_per_iteration": 2.5554726123809814 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050276, + "balance_loss_mlp": 1.00743186, + "epoch": 0.4249711427472105, + "flos": 497122643712.0, + "grad_norm": 0.02878008588010938, + "language_loss": 0.86522639, + "learning_rate": 0.0006437442493430519, + "loss": 0.87572914, + "num_input_tokens_seen": 183812640, + "router_z_loss_mlp": 0.42895508, + "step": 2209, + "time_per_iteration": 2.698664426803589 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046212, + "balance_loss_mlp": 1.00334466, + "epoch": 0.425163524432474, + "flos": 657108910848.0, + "grad_norm": 0.03332162137783894, + "language_loss": 0.87084454, + "learning_rate": 0.000643445831191278, + "loss": 0.88130671, + "num_input_tokens_seen": 183895312, + "router_z_loss_mlp": 0.42919922, + "step": 2210, + "time_per_iteration": 2.919759750366211 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104789, + "balance_loss_mlp": 1.00526094, + "epoch": 0.4253559061177376, + "flos": 651779050752.0, + "grad_norm": 0.0360276634161647, + "language_loss": 0.82163692, + "learning_rate": 0.0006431473573488937, + "loss": 0.83211577, + "num_input_tokens_seen": 183966384, + "router_z_loss_mlp": 0.42675781, + "step": 2211, + "time_per_iteration": 2.7520995140075684 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051038, + "balance_loss_mlp": 1.00836086, + "epoch": 0.42554828780300114, + "flos": 555203853312.0, + "grad_norm": 0.03839138543396186, + "language_loss": 0.85743141, + "learning_rate": 0.0006428488279317765, + "loss": 0.86794186, + "num_input_tokens_seen": 184031728, + "router_z_loss_mlp": 0.42724609, + "step": 2212, + "time_per_iteration": 2.6509060859680176 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046098, + "balance_loss_mlp": 1.00356376, + "epoch": 0.4257406694882647, + "flos": 515422842624.0, + "grad_norm": 0.03572196481521071, + "language_loss": 0.88174772, + "learning_rate": 0.0006425502430558259, + "loss": 0.89220864, + "num_input_tokens_seen": 184096160, + "router_z_loss_mlp": 0.42578125, + "step": 2213, + "time_per_iteration": 2.6220855712890625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104908, + "balance_loss_mlp": 1.00623667, + "epoch": 0.42593305117352825, + "flos": 516705974784.0, + "grad_norm": 0.03258136107598633, + "language_loss": 0.85395515, + "learning_rate": 0.0006422516028369628, + "loss": 0.86444604, + "num_input_tokens_seen": 184169664, + "router_z_loss_mlp": 0.42895508, + "step": 2214, + "time_per_iteration": 2.6463093757629395 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043634, + "balance_loss_mlp": 1.00069499, + "epoch": 0.42612543285879184, + "flos": 589238953728.0, + "grad_norm": 0.0291937048711678, + "language_loss": 0.83896095, + "learning_rate": 0.0006419529073911296, + "loss": 0.8493973, + "num_input_tokens_seen": 184249152, + "router_z_loss_mlp": 0.42993164, + "step": 2215, + "time_per_iteration": 2.910792112350464 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048088, + "balance_loss_mlp": 1.0052923, + "epoch": 0.42631781454405543, + "flos": 636752783616.0, + "grad_norm": 0.03192715722055512, + "language_loss": 0.86142385, + "learning_rate": 0.0006416541568342901, + "loss": 0.87190473, + "num_input_tokens_seen": 184326816, + "router_z_loss_mlp": 0.4284668, + "step": 2216, + "time_per_iteration": 2.846374750137329 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046508, + "balance_loss_mlp": 1.00366437, + "epoch": 0.42651019622931896, + "flos": 542246153472.0, + "grad_norm": 0.029068811164029314, + "language_loss": 0.84547782, + "learning_rate": 0.0006413553512824297, + "loss": 0.8559429, + "num_input_tokens_seen": 184404336, + "router_z_loss_mlp": 0.42895508, + "step": 2217, + "time_per_iteration": 2.7738640308380127 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047446, + "balance_loss_mlp": 1.00467396, + "epoch": 0.42670257791458255, + "flos": 559224336384.0, + "grad_norm": 0.03125487953761627, + "language_loss": 0.85257965, + "learning_rate": 0.0006410564908515549, + "loss": 0.86305416, + "num_input_tokens_seen": 184472320, + "router_z_loss_mlp": 0.42822266, + "step": 2218, + "time_per_iteration": 2.654423713684082 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050321, + "balance_loss_mlp": 1.00757229, + "epoch": 0.4268949595998461, + "flos": 622450713600.0, + "grad_norm": 0.03350458888486861, + "language_loss": 0.85655409, + "learning_rate": 0.0006407575756576935, + "loss": 0.86705726, + "num_input_tokens_seen": 184544704, + "router_z_loss_mlp": 0.42797852, + "step": 2219, + "time_per_iteration": 2.7789905071258545 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047517, + "balance_loss_mlp": 1.00479233, + "epoch": 0.42708734128510967, + "flos": 539015963136.0, + "grad_norm": 0.029341516559542476, + "language_loss": 0.87978554, + "learning_rate": 0.0006404586058168951, + "loss": 0.8902607, + "num_input_tokens_seen": 184622544, + "router_z_loss_mlp": 0.42773438, + "step": 2220, + "time_per_iteration": 2.7526872158050537 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047218, + "balance_loss_mlp": 1.00456524, + "epoch": 0.4272797229703732, + "flos": 503862981120.0, + "grad_norm": 0.03177497968579407, + "language_loss": 0.87384629, + "learning_rate": 0.0006401595814452296, + "loss": 0.88431847, + "num_input_tokens_seen": 184692544, + "router_z_loss_mlp": 0.42700195, + "step": 2221, + "time_per_iteration": 2.620292901992798 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045405, + "balance_loss_mlp": 1.00282323, + "epoch": 0.4274721046556368, + "flos": 493438497792.0, + "grad_norm": 0.03138650703960668, + "language_loss": 0.81104958, + "learning_rate": 0.000639860502658789, + "loss": 0.82150364, + "num_input_tokens_seen": 184760480, + "router_z_loss_mlp": 0.42626953, + "step": 2222, + "time_per_iteration": 2.6335668563842773 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052846, + "balance_loss_mlp": 1.01007414, + "epoch": 0.4276644863409004, + "flos": 569462181888.0, + "grad_norm": 0.029337527326174825, + "language_loss": 0.84956491, + "learning_rate": 0.0006395613695736853, + "loss": 0.86009336, + "num_input_tokens_seen": 184834080, + "router_z_loss_mlp": 0.42822266, + "step": 2223, + "time_per_iteration": 2.69158935546875 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053205, + "balance_loss_mlp": 1.01059997, + "epoch": 0.4278568680261639, + "flos": 608563715328.0, + "grad_norm": 0.03527650476558936, + "language_loss": 0.8254534, + "learning_rate": 0.0006392621823060529, + "loss": 0.83598542, + "num_input_tokens_seen": 184905872, + "router_z_loss_mlp": 0.42651367, + "step": 2224, + "time_per_iteration": 2.7607972621917725 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042531, + "balance_loss_mlp": 0.99978256, + "epoch": 0.4280492497114275, + "flos": 561579663360.0, + "grad_norm": 0.03854840542263403, + "language_loss": 0.8576616, + "learning_rate": 0.0006389629409720465, + "loss": 0.86808693, + "num_input_tokens_seen": 184972320, + "router_z_loss_mlp": 0.42797852, + "step": 2225, + "time_per_iteration": 2.675492525100708 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046106, + "balance_loss_mlp": 1.00333333, + "epoch": 0.428241631396691, + "flos": 721902267648.0, + "grad_norm": 0.035169952304445494, + "language_loss": 0.89023572, + "learning_rate": 0.0006386636456878417, + "loss": 0.90069675, + "num_input_tokens_seen": 185051040, + "router_z_loss_mlp": 0.42822266, + "step": 2226, + "time_per_iteration": 2.8786110877990723 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046721, + "balance_loss_mlp": 1.00397301, + "epoch": 0.4284340130819546, + "flos": 430370568192.0, + "grad_norm": 0.04053005061098929, + "language_loss": 0.92206526, + "learning_rate": 0.0006383642965696353, + "loss": 0.93253243, + "num_input_tokens_seen": 185113552, + "router_z_loss_mlp": 0.42797852, + "step": 2227, + "time_per_iteration": 2.468848705291748 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048091, + "balance_loss_mlp": 1.00519955, + "epoch": 0.42862639476721814, + "flos": 526160330496.0, + "grad_norm": 0.0312355764309364, + "language_loss": 0.83643448, + "learning_rate": 0.000638064893733645, + "loss": 0.84691536, + "num_input_tokens_seen": 185185056, + "router_z_loss_mlp": 0.42944336, + "step": 2228, + "time_per_iteration": 2.7273313999176025 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048787, + "balance_loss_mlp": 1.0059433, + "epoch": 0.42881877645248173, + "flos": 466378971648.0, + "grad_norm": 0.033088247906643435, + "language_loss": 0.90412128, + "learning_rate": 0.000637765437296109, + "loss": 0.91460913, + "num_input_tokens_seen": 185257248, + "router_z_loss_mlp": 0.42895508, + "step": 2229, + "time_per_iteration": 2.6459994316101074 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104348, + "balance_loss_mlp": 1.00051713, + "epoch": 0.42901115813774526, + "flos": 561356087040.0, + "grad_norm": 0.033851055909267555, + "language_loss": 0.85812581, + "learning_rate": 0.000637465927373287, + "loss": 0.86856055, + "num_input_tokens_seen": 185324800, + "router_z_loss_mlp": 0.43017578, + "step": 2230, + "time_per_iteration": 2.6650984287261963 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051305, + "balance_loss_mlp": 1.00843728, + "epoch": 0.42920353982300885, + "flos": 562528403712.0, + "grad_norm": 0.03941473686966497, + "language_loss": 0.79439276, + "learning_rate": 0.000637166364081459, + "loss": 0.80490577, + "num_input_tokens_seen": 185393408, + "router_z_loss_mlp": 0.42919922, + "step": 2231, + "time_per_iteration": 2.6497089862823486 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045172, + "balance_loss_mlp": 1.00242341, + "epoch": 0.42939592150827244, + "flos": 557316162048.0, + "grad_norm": 0.0345529023969128, + "language_loss": 0.84757453, + "learning_rate": 0.0006368667475369256, + "loss": 0.85802627, + "num_input_tokens_seen": 185467968, + "router_z_loss_mlp": 0.42797852, + "step": 2232, + "time_per_iteration": 2.7934672832489014 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048443, + "balance_loss_mlp": 1.00753021, + "epoch": 0.42958830319353597, + "flos": 1524945185280.0, + "grad_norm": 0.006396251355867503, + "language_loss": 0.78527778, + "learning_rate": 0.0006365670778560084, + "loss": 0.79576218, + "num_input_tokens_seen": 185705232, + "router_z_loss_mlp": 0.40917969, + "step": 2233, + "time_per_iteration": 6.342620372772217 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045082, + "balance_loss_mlp": 1.0040741, + "epoch": 0.42978068487879956, + "flos": 1498872316416.0, + "grad_norm": 0.003657386104401554, + "language_loss": 0.78895426, + "learning_rate": 0.0006362673551550494, + "loss": 0.7994051, + "num_input_tokens_seen": 185932672, + "router_z_loss_mlp": 0.41015625, + "step": 2234, + "time_per_iteration": 4.862509250640869 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044747, + "balance_loss_mlp": 1.00209367, + "epoch": 0.4299730665640631, + "flos": 548063995392.0, + "grad_norm": 0.029617650166464796, + "language_loss": 0.86346197, + "learning_rate": 0.0006359675795504112, + "loss": 0.87390947, + "num_input_tokens_seen": 186006288, + "router_z_loss_mlp": 0.42700195, + "step": 2235, + "time_per_iteration": 2.747687339782715 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044967, + "balance_loss_mlp": 1.0022428, + "epoch": 0.4301654482493267, + "flos": 1131116700672.0, + "grad_norm": 0.034530900471349386, + "language_loss": 0.74852663, + "learning_rate": 0.0006356677511584775, + "loss": 0.75897634, + "num_input_tokens_seen": 186097168, + "router_z_loss_mlp": 0.42773438, + "step": 2236, + "time_per_iteration": 3.4453399181365967 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104557, + "balance_loss_mlp": 1.00291729, + "epoch": 0.4303578299345902, + "flos": 496742565120.0, + "grad_norm": 0.03572959525697719, + "language_loss": 0.8668766, + "learning_rate": 0.0006353678700956511, + "loss": 0.87733233, + "num_input_tokens_seen": 186163904, + "router_z_loss_mlp": 0.42700195, + "step": 2237, + "time_per_iteration": 2.562898874282837 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044768, + "balance_loss_mlp": 1.00228131, + "epoch": 0.4305502116198538, + "flos": 616930324992.0, + "grad_norm": 0.03185512314906856, + "language_loss": 0.84350532, + "learning_rate": 0.0006350679364783569, + "loss": 0.853953, + "num_input_tokens_seen": 186233888, + "router_z_loss_mlp": 0.42529297, + "step": 2238, + "time_per_iteration": 2.7968668937683105 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044024, + "balance_loss_mlp": 1.00139523, + "epoch": 0.4307425933051173, + "flos": 560322776064.0, + "grad_norm": 0.03209283293682184, + "language_loss": 0.85997605, + "learning_rate": 0.0006347679504230393, + "loss": 0.87041628, + "num_input_tokens_seen": 186301168, + "router_z_loss_mlp": 0.42675781, + "step": 2239, + "time_per_iteration": 2.634075880050659 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042998, + "balance_loss_mlp": 1.00039279, + "epoch": 0.4309349749903809, + "flos": 973818206976.0, + "grad_norm": 0.03253096283776471, + "language_loss": 0.77016532, + "learning_rate": 0.0006344679120461632, + "loss": 0.7805953, + "num_input_tokens_seen": 186392096, + "router_z_loss_mlp": 0.42651367, + "step": 2240, + "time_per_iteration": 3.334874153137207 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044857, + "balance_loss_mlp": 1.00222731, + "epoch": 0.4311273566756445, + "flos": 542973262848.0, + "grad_norm": 0.034862997803941254, + "language_loss": 0.8043505, + "learning_rate": 0.0006341678214642134, + "loss": 0.81479907, + "num_input_tokens_seen": 186458000, + "router_z_loss_mlp": 0.42675781, + "step": 2241, + "time_per_iteration": 2.6504814624786377 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046418, + "balance_loss_mlp": 1.00386059, + "epoch": 0.43131973836090803, + "flos": 763112219136.0, + "grad_norm": 0.032836493574204505, + "language_loss": 0.83329326, + "learning_rate": 0.0006338676787936963, + "loss": 0.84375745, + "num_input_tokens_seen": 186544992, + "router_z_loss_mlp": 0.42602539, + "step": 2242, + "time_per_iteration": 3.0819406509399414 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049013, + "balance_loss_mlp": 1.0064075, + "epoch": 0.4315121200461716, + "flos": 555603373824.0, + "grad_norm": 0.03474898353682057, + "language_loss": 0.8436116, + "learning_rate": 0.0006335674841511367, + "loss": 0.85410172, + "num_input_tokens_seen": 186614960, + "router_z_loss_mlp": 0.42651367, + "step": 2243, + "time_per_iteration": 2.688323974609375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044865, + "balance_loss_mlp": 1.00395203, + "epoch": 0.43170450173143515, + "flos": 1488689872896.0, + "grad_norm": 0.005657229041031833, + "language_loss": 0.7918117, + "learning_rate": 0.000633267237653081, + "loss": 0.80226028, + "num_input_tokens_seen": 186854288, + "router_z_loss_mlp": 0.40917969, + "step": 2244, + "time_per_iteration": 5.0437562465667725 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01041748, + "balance_loss_mlp": 1.00093079, + "epoch": 0.43189688341669874, + "flos": 1476910325760.0, + "grad_norm": 0.004174711640612148, + "language_loss": 0.77365553, + "learning_rate": 0.0006329669394160953, + "loss": 0.784073, + "num_input_tokens_seen": 187090272, + "router_z_loss_mlp": 0.40820312, + "step": 2245, + "time_per_iteration": 4.930269002914429 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105129, + "balance_loss_mlp": 1.00870872, + "epoch": 0.43208926510196227, + "flos": 493985772288.0, + "grad_norm": 0.03367129883883542, + "language_loss": 0.83325648, + "learning_rate": 0.0006326665895567652, + "loss": 0.84376937, + "num_input_tokens_seen": 187157584, + "router_z_loss_mlp": 0.42626953, + "step": 2246, + "time_per_iteration": 2.6496520042419434 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045175, + "balance_loss_mlp": 1.0025456, + "epoch": 0.43228164678722586, + "flos": 521303867904.0, + "grad_norm": 0.0373506965449987, + "language_loss": 0.88340402, + "learning_rate": 0.0006323661881916976, + "loss": 0.89385581, + "num_input_tokens_seen": 187229408, + "router_z_loss_mlp": 0.42675781, + "step": 2247, + "time_per_iteration": 2.7220535278320312 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104444, + "balance_loss_mlp": 1.00188208, + "epoch": 0.4324740284724894, + "flos": 797396173824.0, + "grad_norm": 0.03547023876634794, + "language_loss": 0.8184936, + "learning_rate": 0.0006320657354375179, + "loss": 0.82893801, + "num_input_tokens_seen": 187304384, + "router_z_loss_mlp": 0.42602539, + "step": 2248, + "time_per_iteration": 2.939730405807495 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047221, + "balance_loss_mlp": 1.00463986, + "epoch": 0.432666410157753, + "flos": 483098585088.0, + "grad_norm": 0.03653679675435745, + "language_loss": 0.87333679, + "learning_rate": 0.0006317652314108726, + "loss": 0.88380903, + "num_input_tokens_seen": 187368064, + "router_z_loss_mlp": 0.42626953, + "step": 2249, + "time_per_iteration": 2.554605007171631 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104756, + "balance_loss_mlp": 1.00512183, + "epoch": 0.43285879184301657, + "flos": 501210200832.0, + "grad_norm": 0.035110898136686476, + "language_loss": 0.91870761, + "learning_rate": 0.0006314646762284277, + "loss": 0.92918324, + "num_input_tokens_seen": 187436320, + "router_z_loss_mlp": 0.42480469, + "step": 2250, + "time_per_iteration": 2.6592071056365967 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051212, + "balance_loss_mlp": 1.01029968, + "epoch": 0.4330511735282801, + "flos": 1513793592576.0, + "grad_norm": 0.004753866691066904, + "language_loss": 0.75425828, + "learning_rate": 0.0006311640700068691, + "loss": 0.76477039, + "num_input_tokens_seen": 187670912, + "router_z_loss_mlp": 0.40917969, + "step": 2251, + "time_per_iteration": 4.880429267883301 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050666, + "balance_loss_mlp": 1.00837088, + "epoch": 0.4332435552135437, + "flos": 700838472960.0, + "grad_norm": 0.03213295924784481, + "language_loss": 0.77973437, + "learning_rate": 0.0006308634128629022, + "loss": 0.790241, + "num_input_tokens_seen": 187746432, + "router_z_loss_mlp": 0.42333984, + "step": 2252, + "time_per_iteration": 2.882138729095459 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048462, + "balance_loss_mlp": 1.00621462, + "epoch": 0.4334359368988072, + "flos": 593483013120.0, + "grad_norm": 0.03310670466815904, + "language_loss": 0.87855673, + "learning_rate": 0.0006305627049132531, + "loss": 0.8890413, + "num_input_tokens_seen": 187820032, + "router_z_loss_mlp": 0.42285156, + "step": 2253, + "time_per_iteration": 2.756601095199585 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052718, + "balance_loss_mlp": 1.01049364, + "epoch": 0.4336283185840708, + "flos": 844276213248.0, + "grad_norm": 0.028181128656308053, + "language_loss": 0.86222875, + "learning_rate": 0.0006302619462746662, + "loss": 0.87275594, + "num_input_tokens_seen": 187904400, + "router_z_loss_mlp": 0.42260742, + "step": 2254, + "time_per_iteration": 3.1384341716766357 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049827, + "balance_loss_mlp": 1.00748384, + "epoch": 0.43382070026933434, + "flos": 627402440448.0, + "grad_norm": 0.031912731462448586, + "language_loss": 0.90840006, + "learning_rate": 0.0006299611370639069, + "loss": 0.91889828, + "num_input_tokens_seen": 187973264, + "router_z_loss_mlp": 0.42382812, + "step": 2255, + "time_per_iteration": 2.712411642074585 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049827, + "balance_loss_mlp": 1.00746036, + "epoch": 0.4340130819545979, + "flos": 592210574592.0, + "grad_norm": 0.034079381595113686, + "language_loss": 0.79521996, + "learning_rate": 0.0006296602773977593, + "loss": 0.80571818, + "num_input_tokens_seen": 188039984, + "router_z_loss_mlp": 0.42407227, + "step": 2256, + "time_per_iteration": 2.714035987854004 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044192, + "balance_loss_mlp": 1.00182462, + "epoch": 0.4342054636398615, + "flos": 491956088832.0, + "grad_norm": 0.031173748742501443, + "language_loss": 0.88170785, + "learning_rate": 0.0006293593673930277, + "loss": 0.89214981, + "num_input_tokens_seen": 188113456, + "router_z_loss_mlp": 0.42407227, + "step": 2257, + "time_per_iteration": 2.6403400897979736 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050154, + "balance_loss_mlp": 1.00771534, + "epoch": 0.43439784532512504, + "flos": 700261062912.0, + "grad_norm": 0.031956889919079245, + "language_loss": 0.79138076, + "learning_rate": 0.0006290584071665358, + "loss": 0.80188227, + "num_input_tokens_seen": 188192480, + "router_z_loss_mlp": 0.42480469, + "step": 2258, + "time_per_iteration": 2.88726544380188 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051787, + "balance_loss_mlp": 1.00942004, + "epoch": 0.43459022701038863, + "flos": 486802172928.0, + "grad_norm": 0.03220669099915263, + "language_loss": 0.82764459, + "learning_rate": 0.0006287573968351266, + "loss": 0.83816242, + "num_input_tokens_seen": 188258784, + "router_z_loss_mlp": 0.42407227, + "step": 2259, + "time_per_iteration": 2.556873083114624 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045397, + "balance_loss_mlp": 1.00314939, + "epoch": 0.43478260869565216, + "flos": 644267862528.0, + "grad_norm": 0.0421666552527836, + "language_loss": 0.83019865, + "learning_rate": 0.0006284563365156626, + "loss": 0.84065259, + "num_input_tokens_seen": 188331312, + "router_z_loss_mlp": 0.42285156, + "step": 2260, + "time_per_iteration": 2.7845253944396973 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044012, + "balance_loss_mlp": 1.0014782, + "epoch": 0.43497499038091575, + "flos": 427010120448.0, + "grad_norm": 0.03632893260701325, + "language_loss": 0.87946701, + "learning_rate": 0.0006281552263250261, + "loss": 0.88990712, + "num_input_tokens_seen": 188393712, + "router_z_loss_mlp": 0.42578125, + "step": 2261, + "time_per_iteration": 2.4605414867401123 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050743, + "balance_loss_mlp": 1.00973511, + "epoch": 0.4351673720661793, + "flos": 1541527738368.0, + "grad_norm": 0.007050141628338806, + "language_loss": 0.80691534, + "learning_rate": 0.000627854066380118, + "loss": 0.81742275, + "num_input_tokens_seen": 188621152, + "router_z_loss_mlp": 0.41015625, + "step": 2262, + "time_per_iteration": 4.901712656021118 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105392, + "balance_loss_mlp": 1.01160097, + "epoch": 0.43535975375144287, + "flos": 750466556928.0, + "grad_norm": 0.036118497785784055, + "language_loss": 0.8206706, + "learning_rate": 0.0006275528567978593, + "loss": 0.83120978, + "num_input_tokens_seen": 188697120, + "router_z_loss_mlp": 0.42358398, + "step": 2263, + "time_per_iteration": 2.9023561477661133 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049817, + "balance_loss_mlp": 1.00749719, + "epoch": 0.4355521354367064, + "flos": 862752356352.0, + "grad_norm": 0.037575674234966834, + "language_loss": 0.82972687, + "learning_rate": 0.0006272515976951898, + "loss": 0.84022498, + "num_input_tokens_seen": 188778480, + "router_z_loss_mlp": 0.42358398, + "step": 2264, + "time_per_iteration": 3.062626361846924 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043084, + "balance_loss_mlp": 1.00086057, + "epoch": 0.43574451712197, + "flos": 735843700992.0, + "grad_norm": 0.027621901281680974, + "language_loss": 0.7971707, + "learning_rate": 0.0006269502891890687, + "loss": 0.80760157, + "num_input_tokens_seen": 188863616, + "router_z_loss_mlp": 0.42260742, + "step": 2265, + "time_per_iteration": 3.006544351577759 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047735, + "balance_loss_mlp": 1.00548732, + "epoch": 0.4359368988072336, + "flos": 571713496320.0, + "grad_norm": 0.03795602123750952, + "language_loss": 0.88080567, + "learning_rate": 0.0006266489313964743, + "loss": 0.89128304, + "num_input_tokens_seen": 188933984, + "router_z_loss_mlp": 0.42285156, + "step": 2266, + "time_per_iteration": 2.7217609882354736 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048608, + "balance_loss_mlp": 1.00633645, + "epoch": 0.4361292804924971, + "flos": 556671677952.0, + "grad_norm": 0.02985944883667051, + "language_loss": 0.86046827, + "learning_rate": 0.0006263475244344041, + "loss": 0.87095433, + "num_input_tokens_seen": 189012976, + "router_z_loss_mlp": 0.4230957, + "step": 2267, + "time_per_iteration": 2.844616651535034 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048894, + "balance_loss_mlp": 1.00688469, + "epoch": 0.4363216621777607, + "flos": 558349473024.0, + "grad_norm": 0.03645132335916721, + "language_loss": 0.84930134, + "learning_rate": 0.0006260460684198746, + "loss": 0.85979033, + "num_input_tokens_seen": 189079664, + "router_z_loss_mlp": 0.42041016, + "step": 2268, + "time_per_iteration": 2.6209938526153564 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046517, + "balance_loss_mlp": 1.00457883, + "epoch": 0.4365140438630242, + "flos": 479197665792.0, + "grad_norm": 0.03681259693925087, + "language_loss": 0.84888554, + "learning_rate": 0.0006257445634699213, + "loss": 0.85935068, + "num_input_tokens_seen": 189144688, + "router_z_loss_mlp": 0.41967773, + "step": 2269, + "time_per_iteration": 2.5371193885803223 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048717, + "balance_loss_mlp": 1.00675571, + "epoch": 0.4367064255482878, + "flos": 580008174336.0, + "grad_norm": 0.03379370609735099, + "language_loss": 0.83707798, + "learning_rate": 0.0006254430097015993, + "loss": 0.84756517, + "num_input_tokens_seen": 189213984, + "router_z_loss_mlp": 0.41992188, + "step": 2270, + "time_per_iteration": 2.663670539855957 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053223, + "balance_loss_mlp": 1.01278687, + "epoch": 0.43689880723355135, + "flos": 1462274830848.0, + "grad_norm": 0.005499517712732893, + "language_loss": 0.76479089, + "learning_rate": 0.0006251414072319815, + "loss": 0.77532315, + "num_input_tokens_seen": 189434416, + "router_z_loss_mlp": 0.40429688, + "step": 2271, + "time_per_iteration": 4.872848033905029 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051249, + "balance_loss_mlp": 1.00945389, + "epoch": 0.43709118891881493, + "flos": 668874852096.0, + "grad_norm": 0.028346757116800847, + "language_loss": 0.85555887, + "learning_rate": 0.0006248397561781609, + "loss": 0.86607134, + "num_input_tokens_seen": 189513248, + "router_z_loss_mlp": 0.41821289, + "step": 2272, + "time_per_iteration": 2.8525848388671875 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052054, + "balance_loss_mlp": 1.01004434, + "epoch": 0.43728357060407846, + "flos": 545914748160.0, + "grad_norm": 0.03971939435737374, + "language_loss": 0.86681366, + "learning_rate": 0.0006245380566572482, + "loss": 0.87733418, + "num_input_tokens_seen": 189585392, + "router_z_loss_mlp": 0.42041016, + "step": 2273, + "time_per_iteration": 2.65950608253479 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052441, + "balance_loss_mlp": 1.01047897, + "epoch": 0.43747595228934205, + "flos": 748185106944.0, + "grad_norm": 0.03474296828051499, + "language_loss": 0.764799, + "learning_rate": 0.0006242363087863744, + "loss": 0.77532339, + "num_input_tokens_seen": 189667552, + "router_z_loss_mlp": 0.41992188, + "step": 2274, + "time_per_iteration": 3.009678363800049 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044115, + "balance_loss_mlp": 1.00212932, + "epoch": 0.43766833397460564, + "flos": 632530111488.0, + "grad_norm": 0.043644038275203835, + "language_loss": 0.86733937, + "learning_rate": 0.0006239345126826878, + "loss": 0.87778056, + "num_input_tokens_seen": 189742048, + "router_z_loss_mlp": 0.42016602, + "step": 2275, + "time_per_iteration": 2.7913572788238525 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042942, + "balance_loss_mlp": 1.00093269, + "epoch": 0.43786071565986917, + "flos": 532099681536.0, + "grad_norm": 0.03488456741245989, + "language_loss": 0.84520668, + "learning_rate": 0.0006236326684633561, + "loss": 0.85563612, + "num_input_tokens_seen": 189817968, + "router_z_loss_mlp": 0.42041016, + "step": 2276, + "time_per_iteration": 2.868460178375244 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047564, + "balance_loss_mlp": 1.00567341, + "epoch": 0.43805309734513276, + "flos": 539558380032.0, + "grad_norm": 0.04090877877929134, + "language_loss": 0.75841373, + "learning_rate": 0.0006233307762455658, + "loss": 0.76888937, + "num_input_tokens_seen": 189882608, + "router_z_loss_mlp": 0.41918945, + "step": 2277, + "time_per_iteration": 2.675471782684326 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047689, + "balance_loss_mlp": 1.00563169, + "epoch": 0.4382454790303963, + "flos": 865965050112.0, + "grad_norm": 0.057141626101515054, + "language_loss": 0.83989596, + "learning_rate": 0.0006230288361465216, + "loss": 0.85037291, + "num_input_tokens_seen": 189960608, + "router_z_loss_mlp": 0.42089844, + "step": 2278, + "time_per_iteration": 3.0322673320770264 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047559, + "balance_loss_mlp": 1.005216, + "epoch": 0.4384378607156599, + "flos": 766802201088.0, + "grad_norm": 0.03709867443192191, + "language_loss": 0.85241038, + "learning_rate": 0.0006227268482834473, + "loss": 0.86288601, + "num_input_tokens_seen": 190035472, + "router_z_loss_mlp": 0.42382812, + "step": 2279, + "time_per_iteration": 2.900203227996826 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044865, + "balance_loss_mlp": 1.0024029, + "epoch": 0.4386302424009234, + "flos": 669797347584.0, + "grad_norm": 0.03112976006735108, + "language_loss": 0.87510288, + "learning_rate": 0.000622424812773585, + "loss": 0.88555157, + "num_input_tokens_seen": 190109312, + "router_z_loss_mlp": 0.42504883, + "step": 2280, + "time_per_iteration": 2.8384146690368652 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048428, + "balance_loss_mlp": 1.00591767, + "epoch": 0.438822624086187, + "flos": 486150885888.0, + "grad_norm": 0.037274279546085635, + "language_loss": 0.8020004, + "learning_rate": 0.000622122729734195, + "loss": 0.81248468, + "num_input_tokens_seen": 190174176, + "router_z_loss_mlp": 0.42553711, + "step": 2281, + "time_per_iteration": 2.6004860401153564 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048001, + "balance_loss_mlp": 1.00549114, + "epoch": 0.4390150057714506, + "flos": 500259515136.0, + "grad_norm": 0.032261530197162686, + "language_loss": 0.88006121, + "learning_rate": 0.0006218205992825566, + "loss": 0.8905412, + "num_input_tokens_seen": 190243888, + "router_z_loss_mlp": 0.42553711, + "step": 2282, + "time_per_iteration": 2.619781494140625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049893, + "balance_loss_mlp": 1.00745404, + "epoch": 0.4392073874567141, + "flos": 559352648448.0, + "grad_norm": 0.035010140104523226, + "language_loss": 0.8217926, + "learning_rate": 0.0006215184215359671, + "loss": 0.83229148, + "num_input_tokens_seen": 190317504, + "router_z_loss_mlp": 0.42480469, + "step": 2283, + "time_per_iteration": 2.7295265197753906 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104749, + "balance_loss_mlp": 1.00495577, + "epoch": 0.4393997691419777, + "flos": 606423216384.0, + "grad_norm": 0.031848598857185544, + "language_loss": 0.86998332, + "learning_rate": 0.0006212161966117425, + "loss": 0.88045812, + "num_input_tokens_seen": 190390160, + "router_z_loss_mlp": 0.42578125, + "step": 2284, + "time_per_iteration": 2.718440532684326 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048609, + "balance_loss_mlp": 1.00607538, + "epoch": 0.43959215082724123, + "flos": 805484772096.0, + "grad_norm": 0.035712970592664255, + "language_loss": 0.82239711, + "learning_rate": 0.0006209139246272164, + "loss": 0.83288318, + "num_input_tokens_seen": 190467600, + "router_z_loss_mlp": 0.42578125, + "step": 2285, + "time_per_iteration": 2.9688222408294678 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050536, + "balance_loss_mlp": 1.00793087, + "epoch": 0.4397845325125048, + "flos": 488608280064.0, + "grad_norm": 0.03687327973299051, + "language_loss": 0.82202113, + "learning_rate": 0.0006206116056997421, + "loss": 0.8325265, + "num_input_tokens_seen": 190534192, + "router_z_loss_mlp": 0.42651367, + "step": 2286, + "time_per_iteration": 2.5476558208465576 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048309, + "balance_loss_mlp": 1.00579894, + "epoch": 0.43997691419776835, + "flos": 481785317376.0, + "grad_norm": 0.030160303580515496, + "language_loss": 0.8299154, + "learning_rate": 0.0006203092399466892, + "loss": 0.84039849, + "num_input_tokens_seen": 190601440, + "router_z_loss_mlp": 0.42553711, + "step": 2287, + "time_per_iteration": 2.5308852195739746 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047877, + "balance_loss_mlp": 1.00539064, + "epoch": 0.44016929588303194, + "flos": 484129950720.0, + "grad_norm": 0.02729114822665251, + "language_loss": 0.85650307, + "learning_rate": 0.0006200068274854473, + "loss": 0.8669818, + "num_input_tokens_seen": 190672528, + "router_z_loss_mlp": 0.42529297, + "step": 2288, + "time_per_iteration": 2.6596133708953857 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045912, + "balance_loss_mlp": 1.00361645, + "epoch": 0.4403616775682955, + "flos": 573024818688.0, + "grad_norm": 0.028573956325372987, + "language_loss": 0.86632061, + "learning_rate": 0.0006197043684334229, + "loss": 0.87677968, + "num_input_tokens_seen": 190750704, + "router_z_loss_mlp": 0.42333984, + "step": 2289, + "time_per_iteration": 2.773327350616455 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047356, + "balance_loss_mlp": 1.00496542, + "epoch": 0.44055405925355906, + "flos": 632000333568.0, + "grad_norm": 0.03542319310998882, + "language_loss": 0.80357343, + "learning_rate": 0.0006194018629080411, + "loss": 0.81404698, + "num_input_tokens_seen": 190821664, + "router_z_loss_mlp": 0.42431641, + "step": 2290, + "time_per_iteration": 2.7465741634368896 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046814, + "balance_loss_mlp": 1.00444698, + "epoch": 0.44074644093882265, + "flos": 537826149888.0, + "grad_norm": 0.033710926441732514, + "language_loss": 0.82429153, + "learning_rate": 0.0006190993110267451, + "loss": 0.83475971, + "num_input_tokens_seen": 190893888, + "router_z_loss_mlp": 0.42407227, + "step": 2291, + "time_per_iteration": 2.734936237335205 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104638, + "balance_loss_mlp": 1.00401258, + "epoch": 0.4409388226240862, + "flos": 464166541056.0, + "grad_norm": 0.03677198311176373, + "language_loss": 0.84841394, + "learning_rate": 0.0006187967129069958, + "loss": 0.85887772, + "num_input_tokens_seen": 190956800, + "router_z_loss_mlp": 0.42407227, + "step": 2292, + "time_per_iteration": 2.491478443145752 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048193, + "balance_loss_mlp": 1.00604105, + "epoch": 0.44113120430934977, + "flos": 567161289984.0, + "grad_norm": 0.027373577802651455, + "language_loss": 0.87309539, + "learning_rate": 0.0006184940686662722, + "loss": 0.88357735, + "num_input_tokens_seen": 191032048, + "router_z_loss_mlp": 0.421875, + "step": 2293, + "time_per_iteration": 2.7358779907226562 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045965, + "balance_loss_mlp": 1.00371683, + "epoch": 0.4413235859946133, + "flos": 544675357440.0, + "grad_norm": 0.03072432375615432, + "language_loss": 0.9056381, + "learning_rate": 0.0006181913784220714, + "loss": 0.91609776, + "num_input_tokens_seen": 191099952, + "router_z_loss_mlp": 0.42285156, + "step": 2294, + "time_per_iteration": 2.6358015537261963 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045582, + "balance_loss_mlp": 1.00485992, + "epoch": 0.4415159676798769, + "flos": 1573305688320.0, + "grad_norm": 0.007789835090792861, + "language_loss": 0.80553782, + "learning_rate": 0.0006178886422919078, + "loss": 0.81599367, + "num_input_tokens_seen": 191335968, + "router_z_loss_mlp": 0.40722656, + "step": 2295, + "time_per_iteration": 4.902246713638306 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044044, + "balance_loss_mlp": 1.00181961, + "epoch": 0.4417083493651404, + "flos": 660013457664.0, + "grad_norm": 0.029698143477661094, + "language_loss": 0.80193049, + "learning_rate": 0.0006175858603933146, + "loss": 0.8123709, + "num_input_tokens_seen": 191410112, + "router_z_loss_mlp": 0.42260742, + "step": 2296, + "time_per_iteration": 2.8894712924957275 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010469, + "balance_loss_mlp": 1.00477171, + "epoch": 0.441900731050404, + "flos": 741818045184.0, + "grad_norm": 0.03343125158047759, + "language_loss": 0.81235009, + "learning_rate": 0.0006172830328438416, + "loss": 0.82281911, + "num_input_tokens_seen": 191491552, + "router_z_loss_mlp": 0.42163086, + "step": 2297, + "time_per_iteration": 3.03363299369812 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043079, + "balance_loss_mlp": 1.00080705, + "epoch": 0.44209311273566754, + "flos": 540596548608.0, + "grad_norm": 0.03516131163144532, + "language_loss": 0.87775767, + "learning_rate": 0.0006169801597610572, + "loss": 0.88818848, + "num_input_tokens_seen": 191567872, + "router_z_loss_mlp": 0.4230957, + "step": 2298, + "time_per_iteration": 2.7615511417388916 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047937, + "balance_loss_mlp": 1.00580859, + "epoch": 0.4422854944209311, + "flos": 622730670336.0, + "grad_norm": 0.03691263796350213, + "language_loss": 0.90342188, + "learning_rate": 0.0006166772412625469, + "loss": 0.91390121, + "num_input_tokens_seen": 191638032, + "router_z_loss_mlp": 0.42163086, + "step": 2299, + "time_per_iteration": 2.757885456085205 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044546, + "balance_loss_mlp": 1.00208378, + "epoch": 0.4424778761061947, + "flos": 660061089792.0, + "grad_norm": 0.03315959572172903, + "language_loss": 0.82509053, + "learning_rate": 0.0006163742774659141, + "loss": 0.835536, + "num_input_tokens_seen": 191709104, + "router_z_loss_mlp": 0.42504883, + "step": 2300, + "time_per_iteration": 2.8489365577697754 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045508, + "balance_loss_mlp": 1.00316477, + "epoch": 0.44267025779145824, + "flos": 569703254784.0, + "grad_norm": 0.02877714461404429, + "language_loss": 0.86486191, + "learning_rate": 0.0006160712684887801, + "loss": 0.87531698, + "num_input_tokens_seen": 191787072, + "router_z_loss_mlp": 0.42382812, + "step": 2301, + "time_per_iteration": 2.783581495285034 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043512, + "balance_loss_mlp": 1.00126386, + "epoch": 0.44286263947672183, + "flos": 497819617536.0, + "grad_norm": 0.032325076823307486, + "language_loss": 0.82883227, + "learning_rate": 0.0006157682144487832, + "loss": 0.83926737, + "num_input_tokens_seen": 191863040, + "router_z_loss_mlp": 0.42285156, + "step": 2302, + "time_per_iteration": 2.8138058185577393 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046294, + "balance_loss_mlp": 1.00395119, + "epoch": 0.44305502116198536, + "flos": 610608950016.0, + "grad_norm": 0.032307808069359366, + "language_loss": 0.83262819, + "learning_rate": 0.0006154651154635793, + "loss": 0.84309107, + "num_input_tokens_seen": 191940352, + "router_z_loss_mlp": 0.42382812, + "step": 2303, + "time_per_iteration": 2.9065494537353516 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045793, + "balance_loss_mlp": 1.00349796, + "epoch": 0.44324740284724895, + "flos": 471742857984.0, + "grad_norm": 0.03422426159351285, + "language_loss": 0.85742319, + "learning_rate": 0.0006151619716508421, + "loss": 0.86788118, + "num_input_tokens_seen": 192006896, + "router_z_loss_mlp": 0.42333984, + "step": 2304, + "time_per_iteration": 2.5973682403564453 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104609, + "balance_loss_mlp": 1.00377095, + "epoch": 0.4434397845325125, + "flos": 579812788224.0, + "grad_norm": 0.032225909976612614, + "language_loss": 0.87212336, + "learning_rate": 0.0006148587831282625, + "loss": 0.88258433, + "num_input_tokens_seen": 192075312, + "router_z_loss_mlp": 0.42358398, + "step": 2305, + "time_per_iteration": 2.6349332332611084 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046734, + "balance_loss_mlp": 1.00563049, + "epoch": 0.44363216621777607, + "flos": 1499997967872.0, + "grad_norm": 0.0072841640427745245, + "language_loss": 0.79176068, + "learning_rate": 0.0006145555500135483, + "loss": 0.80222803, + "num_input_tokens_seen": 192304816, + "router_z_loss_mlp": 0.41113281, + "step": 2306, + "time_per_iteration": 4.920953989028931 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047952, + "balance_loss_mlp": 1.00565624, + "epoch": 0.44382454790303966, + "flos": 478285863936.0, + "grad_norm": 0.035350800366555836, + "language_loss": 0.87850344, + "learning_rate": 0.0006142522724244255, + "loss": 0.88898295, + "num_input_tokens_seen": 192369232, + "router_z_loss_mlp": 0.42333984, + "step": 2307, + "time_per_iteration": 2.5206384658813477 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044529, + "balance_loss_mlp": 1.00361633, + "epoch": 0.4440169295883032, + "flos": 1547306696448.0, + "grad_norm": 0.0037013242818687312, + "language_loss": 0.76484716, + "learning_rate": 0.0006139489504786368, + "loss": 0.77529252, + "num_input_tokens_seen": 192600176, + "router_z_loss_mlp": 0.40917969, + "step": 2308, + "time_per_iteration": 4.906585454940796 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047856, + "balance_loss_mlp": 1.00570333, + "epoch": 0.4442093112735668, + "flos": 592291254528.0, + "grad_norm": 0.03559804859588436, + "language_loss": 0.78114909, + "learning_rate": 0.000613645584293942, + "loss": 0.79162765, + "num_input_tokens_seen": 192675424, + "router_z_loss_mlp": 0.421875, + "step": 2309, + "time_per_iteration": 2.9084970951080322 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049948, + "balance_loss_mlp": 1.00767648, + "epoch": 0.4444016929588303, + "flos": 531328830720.0, + "grad_norm": 0.036447190975963356, + "language_loss": 0.83448339, + "learning_rate": 0.0006133421739881185, + "loss": 0.84498286, + "num_input_tokens_seen": 192747552, + "router_z_loss_mlp": 0.4230957, + "step": 2310, + "time_per_iteration": 2.652672052383423 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044821, + "balance_loss_mlp": 1.0026927, + "epoch": 0.4445940746440939, + "flos": 621389212416.0, + "grad_norm": 0.035906278639006764, + "language_loss": 0.83511341, + "learning_rate": 0.0006130387196789605, + "loss": 0.84556162, + "num_input_tokens_seen": 192819984, + "router_z_loss_mlp": 0.42163086, + "step": 2311, + "time_per_iteration": 2.747197151184082 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045555, + "balance_loss_mlp": 1.00328362, + "epoch": 0.4447864563293574, + "flos": 630376973568.0, + "grad_norm": 0.027043038636915952, + "language_loss": 0.84677482, + "learning_rate": 0.0006127352214842795, + "loss": 0.85723037, + "num_input_tokens_seen": 192906080, + "router_z_loss_mlp": 0.4230957, + "step": 2312, + "time_per_iteration": 3.0515668392181396 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045637, + "balance_loss_mlp": 1.00327015, + "epoch": 0.444978838014621, + "flos": 652002627072.0, + "grad_norm": 0.034195517498726076, + "language_loss": 0.85929281, + "learning_rate": 0.0006124316795219041, + "loss": 0.86974919, + "num_input_tokens_seen": 192972336, + "router_z_loss_mlp": 0.42407227, + "step": 2313, + "time_per_iteration": 2.778184652328491 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050022, + "balance_loss_mlp": 1.00786984, + "epoch": 0.44517121969988455, + "flos": 613589319168.0, + "grad_norm": 0.029604729226228255, + "language_loss": 0.82924336, + "learning_rate": 0.0006121280939096794, + "loss": 0.83974361, + "num_input_tokens_seen": 193045744, + "router_z_loss_mlp": 0.421875, + "step": 2314, + "time_per_iteration": 2.7615392208099365 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045707, + "balance_loss_mlp": 1.00350666, + "epoch": 0.44536360138514813, + "flos": 489715468032.0, + "grad_norm": 0.036472505020621125, + "language_loss": 0.8826952, + "learning_rate": 0.000611824464765468, + "loss": 0.89315224, + "num_input_tokens_seen": 193115248, + "router_z_loss_mlp": 0.42236328, + "step": 2315, + "time_per_iteration": 2.67606782913208 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01058411, + "balance_loss_mlp": 1.01759338, + "epoch": 0.4455559830704117, + "flos": 1519056390144.0, + "grad_norm": 0.01193419136680653, + "language_loss": 0.78594941, + "learning_rate": 0.0006115207922071492, + "loss": 0.79653352, + "num_input_tokens_seen": 193330816, + "router_z_loss_mlp": 0.40820312, + "step": 2316, + "time_per_iteration": 4.725375652313232 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045978, + "balance_loss_mlp": 1.00384891, + "epoch": 0.44574836475567525, + "flos": 616817564160.0, + "grad_norm": 0.032139423648612636, + "language_loss": 0.85745513, + "learning_rate": 0.000611217076352619, + "loss": 0.86791497, + "num_input_tokens_seen": 193407616, + "router_z_loss_mlp": 0.42163086, + "step": 2317, + "time_per_iteration": 2.8277692794799805 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046443, + "balance_loss_mlp": 1.00429094, + "epoch": 0.44594074644093884, + "flos": 507434366208.0, + "grad_norm": 0.030845694350894858, + "language_loss": 0.83782113, + "learning_rate": 0.0006109133173197905, + "loss": 0.84828556, + "num_input_tokens_seen": 193482624, + "router_z_loss_mlp": 0.421875, + "step": 2318, + "time_per_iteration": 2.740814685821533 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044287, + "balance_loss_mlp": 1.0021348, + "epoch": 0.44613312812620237, + "flos": 728313070848.0, + "grad_norm": 0.03532114030566384, + "language_loss": 0.86011016, + "learning_rate": 0.0006106095152265935, + "loss": 0.87055302, + "num_input_tokens_seen": 193555952, + "router_z_loss_mlp": 0.421875, + "step": 2319, + "time_per_iteration": 2.982090473175049 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048334, + "balance_loss_mlp": 1.00615764, + "epoch": 0.44632550981146596, + "flos": 637058985216.0, + "grad_norm": 0.029959494040304766, + "language_loss": 0.85331011, + "learning_rate": 0.0006103056701909739, + "loss": 0.86379343, + "num_input_tokens_seen": 193636672, + "router_z_loss_mlp": 0.42211914, + "step": 2320, + "time_per_iteration": 2.911764621734619 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050916, + "balance_loss_mlp": 1.00878716, + "epoch": 0.4465178914967295, + "flos": 828618100992.0, + "grad_norm": 0.026414177364328564, + "language_loss": 0.83389866, + "learning_rate": 0.0006100017823308956, + "loss": 0.8444078, + "num_input_tokens_seen": 193721728, + "router_z_loss_mlp": 0.42163086, + "step": 2321, + "time_per_iteration": 3.166370153427124 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048729, + "balance_loss_mlp": 1.00672007, + "epoch": 0.4467102731819931, + "flos": 667033751808.0, + "grad_norm": 0.03675396641442824, + "language_loss": 0.80177474, + "learning_rate": 0.0006096978517643377, + "loss": 0.81226206, + "num_input_tokens_seen": 193795456, + "router_z_loss_mlp": 0.42041016, + "step": 2322, + "time_per_iteration": 2.7839677333831787 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049863, + "balance_loss_mlp": 1.00780618, + "epoch": 0.4469026548672566, + "flos": 513970569216.0, + "grad_norm": 0.036357166954029595, + "language_loss": 0.84299958, + "learning_rate": 0.0006093938786092968, + "loss": 0.85349822, + "num_input_tokens_seen": 193865520, + "router_z_loss_mlp": 0.42089844, + "step": 2323, + "time_per_iteration": 2.6366002559661865 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052625, + "balance_loss_mlp": 1.01054394, + "epoch": 0.4470950365525202, + "flos": 685286318592.0, + "grad_norm": 0.03621901423501995, + "language_loss": 0.9042533, + "learning_rate": 0.0006090898629837857, + "loss": 0.91477954, + "num_input_tokens_seen": 193935040, + "router_z_loss_mlp": 0.42114258, + "step": 2324, + "time_per_iteration": 2.8338427543640137 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047182, + "balance_loss_mlp": 1.00514829, + "epoch": 0.4472874182377838, + "flos": 628535873280.0, + "grad_norm": 0.028780974393906523, + "language_loss": 0.87792349, + "learning_rate": 0.0006087858050058337, + "loss": 0.88839531, + "num_input_tokens_seen": 194009120, + "router_z_loss_mlp": 0.4206543, + "step": 2325, + "time_per_iteration": 2.7868492603302 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047425, + "balance_loss_mlp": 1.00534439, + "epoch": 0.4474797999230473, + "flos": 548241884928.0, + "grad_norm": 0.03362424978515615, + "language_loss": 0.83227015, + "learning_rate": 0.0006084817047934866, + "loss": 0.84274435, + "num_input_tokens_seen": 194076672, + "router_z_loss_mlp": 0.42114258, + "step": 2326, + "time_per_iteration": 2.6603922843933105 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105345, + "balance_loss_mlp": 1.01144028, + "epoch": 0.4476721816083109, + "flos": 456757420032.0, + "grad_norm": 0.033869443234677665, + "language_loss": 0.90294945, + "learning_rate": 0.0006081775624648066, + "loss": 0.91348392, + "num_input_tokens_seen": 194142320, + "router_z_loss_mlp": 0.42041016, + "step": 2327, + "time_per_iteration": 2.563965082168579 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049502, + "balance_loss_mlp": 1.00730181, + "epoch": 0.44786456329357444, + "flos": 482501733120.0, + "grad_norm": 0.03973119590818811, + "language_loss": 0.83093679, + "learning_rate": 0.0006078733781378721, + "loss": 0.8414318, + "num_input_tokens_seen": 194208560, + "router_z_loss_mlp": 0.42236328, + "step": 2328, + "time_per_iteration": 2.5500621795654297 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01056217, + "balance_loss_mlp": 1.01401651, + "epoch": 0.448056944978838, + "flos": 553237353216.0, + "grad_norm": 0.0336771809947293, + "language_loss": 0.82818258, + "learning_rate": 0.0006075691519307781, + "loss": 0.83874476, + "num_input_tokens_seen": 194288080, + "router_z_loss_mlp": 0.42236328, + "step": 2329, + "time_per_iteration": 2.8369436264038086 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053357, + "balance_loss_mlp": 1.01125205, + "epoch": 0.44824932666410156, + "flos": 551917282560.0, + "grad_norm": 0.03290883990888194, + "language_loss": 0.81853932, + "learning_rate": 0.0006072648839616356, + "loss": 0.82907289, + "num_input_tokens_seen": 194358464, + "router_z_loss_mlp": 0.42138672, + "step": 2330, + "time_per_iteration": 2.707853078842163 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050692, + "balance_loss_mlp": 1.00861132, + "epoch": 0.44844170834936514, + "flos": 990273414912.0, + "grad_norm": 0.029288900679948552, + "language_loss": 0.83132529, + "learning_rate": 0.0006069605743485718, + "loss": 0.84183216, + "num_input_tokens_seen": 194456112, + "router_z_loss_mlp": 0.42114258, + "step": 2331, + "time_per_iteration": 3.347529649734497 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053351, + "balance_loss_mlp": 1.011127, + "epoch": 0.44863409003462873, + "flos": 592451647488.0, + "grad_norm": 0.033148459483392366, + "language_loss": 0.84139442, + "learning_rate": 0.0006066562232097303, + "loss": 0.85192794, + "num_input_tokens_seen": 194526880, + "router_z_loss_mlp": 0.42260742, + "step": 2332, + "time_per_iteration": 2.7059993743896484 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048256, + "balance_loss_mlp": 1.00600874, + "epoch": 0.44882647171989226, + "flos": 725985934080.0, + "grad_norm": 0.033171968523288915, + "language_loss": 0.86700636, + "learning_rate": 0.0006063518306632708, + "loss": 0.87748891, + "num_input_tokens_seen": 194606800, + "router_z_loss_mlp": 0.42285156, + "step": 2333, + "time_per_iteration": 2.9296460151672363 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048294, + "balance_loss_mlp": 1.00607038, + "epoch": 0.44901885340515585, + "flos": 535991852544.0, + "grad_norm": 0.03657763323068719, + "language_loss": 0.83056581, + "learning_rate": 0.0006060473968273688, + "loss": 0.84104872, + "num_input_tokens_seen": 194679856, + "router_z_loss_mlp": 0.42260742, + "step": 2334, + "time_per_iteration": 2.6368448734283447 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104747, + "balance_loss_mlp": 1.0070343, + "epoch": 0.4492112350904194, + "flos": 1558693526016.0, + "grad_norm": 0.008278759352477436, + "language_loss": 0.77879542, + "learning_rate": 0.000605742921820216, + "loss": 0.7892701, + "num_input_tokens_seen": 194906320, + "router_z_loss_mlp": 0.40429688, + "step": 2335, + "time_per_iteration": 4.866518497467041 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050045, + "balance_loss_mlp": 1.00951385, + "epoch": 0.44940361677568297, + "flos": 1526703660288.0, + "grad_norm": 0.009772749846677187, + "language_loss": 0.81005216, + "learning_rate": 0.0006054384057600202, + "loss": 0.82055259, + "num_input_tokens_seen": 195129152, + "router_z_loss_mlp": 0.40527344, + "step": 2336, + "time_per_iteration": 4.832434892654419 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049508, + "balance_loss_mlp": 1.00759399, + "epoch": 0.4495959984609465, + "flos": 383321387520.0, + "grad_norm": 0.039418428301582195, + "language_loss": 0.88819385, + "learning_rate": 0.0006051338487650047, + "loss": 0.89868897, + "num_input_tokens_seen": 195189792, + "router_z_loss_mlp": 0.41943359, + "step": 2337, + "time_per_iteration": 2.4261343479156494 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104738, + "balance_loss_mlp": 1.00537109, + "epoch": 0.4497883801462101, + "flos": 498883064064.0, + "grad_norm": 0.03829280299631375, + "language_loss": 0.83062887, + "learning_rate": 0.0006048292509534095, + "loss": 0.84110272, + "num_input_tokens_seen": 195258640, + "router_z_loss_mlp": 0.42041016, + "step": 2338, + "time_per_iteration": 2.5792438983917236 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046244, + "balance_loss_mlp": 1.00425851, + "epoch": 0.4499807618314736, + "flos": 615590812416.0, + "grad_norm": 0.03236488600067343, + "language_loss": 0.78186011, + "learning_rate": 0.0006045246124434895, + "loss": 0.79232258, + "num_input_tokens_seen": 195327984, + "router_z_loss_mlp": 0.42016602, + "step": 2339, + "time_per_iteration": 2.736332654953003 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049178, + "balance_loss_mlp": 1.00704992, + "epoch": 0.4501731435167372, + "flos": 1007068850688.0, + "grad_norm": 0.0336222564343559, + "language_loss": 0.8735106, + "learning_rate": 0.0006042199333535162, + "loss": 0.88400233, + "num_input_tokens_seen": 195409504, + "router_z_loss_mlp": 0.42163086, + "step": 2340, + "time_per_iteration": 3.3217411041259766 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048828, + "balance_loss_mlp": 1.0066278, + "epoch": 0.4503655252020008, + "flos": 822328806912.0, + "grad_norm": 0.031746848330129245, + "language_loss": 0.8445214, + "learning_rate": 0.0006039152138017763, + "loss": 0.85500968, + "num_input_tokens_seen": 195489424, + "router_z_loss_mlp": 0.42236328, + "step": 2341, + "time_per_iteration": 3.027831792831421 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046938, + "balance_loss_mlp": 1.00464213, + "epoch": 0.4505579068872643, + "flos": 487414576128.0, + "grad_norm": 0.03971234339866032, + "language_loss": 0.84330553, + "learning_rate": 0.0006036104539065726, + "loss": 0.85377491, + "num_input_tokens_seen": 195562128, + "router_z_loss_mlp": 0.42333984, + "step": 2342, + "time_per_iteration": 2.6650640964508057 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042622, + "balance_loss_mlp": 1.00030267, + "epoch": 0.4507502885725279, + "flos": 886336728576.0, + "grad_norm": 0.030953760348096254, + "language_loss": 0.8473978, + "learning_rate": 0.000603305653786223, + "loss": 0.85782403, + "num_input_tokens_seen": 195646800, + "router_z_loss_mlp": 0.42358398, + "step": 2343, + "time_per_iteration": 3.146728277206421 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045238, + "balance_loss_mlp": 1.00284708, + "epoch": 0.45094267025779144, + "flos": 579422016000.0, + "grad_norm": 0.032254310776320565, + "language_loss": 0.84862161, + "learning_rate": 0.0006030008135590622, + "loss": 0.859074, + "num_input_tokens_seen": 195719648, + "router_z_loss_mlp": 0.42431641, + "step": 2344, + "time_per_iteration": 2.716326951980591 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046244, + "balance_loss_mlp": 1.00387657, + "epoch": 0.45113505194305503, + "flos": 526442232576.0, + "grad_norm": 0.029625683171065443, + "language_loss": 0.81110835, + "learning_rate": 0.0006026959333434387, + "loss": 0.82157081, + "num_input_tokens_seen": 195794800, + "router_z_loss_mlp": 0.42407227, + "step": 2345, + "time_per_iteration": 2.757293939590454 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046277, + "balance_loss_mlp": 1.00379133, + "epoch": 0.45132743362831856, + "flos": 503116429824.0, + "grad_norm": 0.029442245536271623, + "language_loss": 0.77997512, + "learning_rate": 0.0006023910132577181, + "loss": 0.79043788, + "num_input_tokens_seen": 195866848, + "router_z_loss_mlp": 0.42529297, + "step": 2346, + "time_per_iteration": 2.6643226146698 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044307, + "balance_loss_mlp": 1.00201178, + "epoch": 0.45151981531358215, + "flos": 432836710656.0, + "grad_norm": 0.03508285710405181, + "language_loss": 0.85304409, + "learning_rate": 0.0006020860534202806, + "loss": 0.86348718, + "num_input_tokens_seen": 195930640, + "router_z_loss_mlp": 0.42333984, + "step": 2347, + "time_per_iteration": 2.508922815322876 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046743, + "balance_loss_mlp": 1.00444722, + "epoch": 0.4517121969988457, + "flos": 713494828800.0, + "grad_norm": 0.031320840574665956, + "language_loss": 0.81720173, + "learning_rate": 0.0006017810539495224, + "loss": 0.8276692, + "num_input_tokens_seen": 196014240, + "router_z_loss_mlp": 0.42333984, + "step": 2348, + "time_per_iteration": 2.916851282119751 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046985, + "balance_loss_mlp": 1.00459409, + "epoch": 0.45190457868410927, + "flos": 580557394176.0, + "grad_norm": 0.03199810496833265, + "language_loss": 0.82887936, + "learning_rate": 0.0006014760149638547, + "loss": 0.83934915, + "num_input_tokens_seen": 196083296, + "router_z_loss_mlp": 0.42431641, + "step": 2349, + "time_per_iteration": 2.6583147048950195 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044305, + "balance_loss_mlp": 1.00189018, + "epoch": 0.45209696036937286, + "flos": 483628363008.0, + "grad_norm": 0.034942038630734404, + "language_loss": 0.89322019, + "learning_rate": 0.000601170936581704, + "loss": 0.90366322, + "num_input_tokens_seen": 196147840, + "router_z_loss_mlp": 0.42456055, + "step": 2350, + "time_per_iteration": 2.5171234607696533 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051553, + "balance_loss_mlp": 1.00906706, + "epoch": 0.4522893420546364, + "flos": 541260474624.0, + "grad_norm": 0.03828852417675836, + "language_loss": 0.85383743, + "learning_rate": 0.0006008658189215121, + "loss": 0.86435294, + "num_input_tokens_seen": 196219008, + "router_z_loss_mlp": 0.42529297, + "step": 2351, + "time_per_iteration": 2.6463332176208496 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049763, + "balance_loss_mlp": 1.00725281, + "epoch": 0.4524817237399, + "flos": 497691305472.0, + "grad_norm": 0.039190213199739796, + "language_loss": 0.80507791, + "learning_rate": 0.0006005606621017366, + "loss": 0.81557548, + "num_input_tokens_seen": 196287792, + "router_z_loss_mlp": 0.42553711, + "step": 2352, + "time_per_iteration": 2.5637879371643066 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048153, + "balance_loss_mlp": 1.00597668, + "epoch": 0.4526741054251635, + "flos": 653841782016.0, + "grad_norm": 0.04275245206988235, + "language_loss": 0.80476063, + "learning_rate": 0.0006002554662408496, + "loss": 0.81524217, + "num_input_tokens_seen": 196371776, + "router_z_loss_mlp": 0.42211914, + "step": 2353, + "time_per_iteration": 2.8951141834259033 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047375, + "balance_loss_mlp": 1.00500786, + "epoch": 0.4528664871104271, + "flos": 572004146688.0, + "grad_norm": 0.03654890079235127, + "language_loss": 0.91683698, + "learning_rate": 0.0005999502314573388, + "loss": 0.92731076, + "num_input_tokens_seen": 196441840, + "router_z_loss_mlp": 0.42407227, + "step": 2354, + "time_per_iteration": 2.64512300491333 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051522, + "balance_loss_mlp": 1.00927448, + "epoch": 0.45305886879569063, + "flos": 459679463424.0, + "grad_norm": 0.03675635166201985, + "language_loss": 0.86984789, + "learning_rate": 0.0005996449578697066, + "loss": 0.88036311, + "num_input_tokens_seen": 196510464, + "router_z_loss_mlp": 0.42285156, + "step": 2355, + "time_per_iteration": 2.6577048301696777 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048974, + "balance_loss_mlp": 1.0069412, + "epoch": 0.4532512504809542, + "flos": 506207614464.0, + "grad_norm": 0.033984488129296754, + "language_loss": 0.81732345, + "learning_rate": 0.0005993396455964709, + "loss": 0.82781321, + "num_input_tokens_seen": 196583888, + "router_z_loss_mlp": 0.4206543, + "step": 2356, + "time_per_iteration": 2.7086563110351562 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048885, + "balance_loss_mlp": 1.0067569, + "epoch": 0.4534436321662178, + "flos": 583312241664.0, + "grad_norm": 0.03467705138292274, + "language_loss": 0.82385033, + "learning_rate": 0.0005990342947561647, + "loss": 0.8343392, + "num_input_tokens_seen": 196652816, + "router_z_loss_mlp": 0.42163086, + "step": 2357, + "time_per_iteration": 2.6705219745635986 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047018, + "balance_loss_mlp": 1.00484145, + "epoch": 0.45363601385148133, + "flos": 550773156096.0, + "grad_norm": 0.03186226313127573, + "language_loss": 0.78742826, + "learning_rate": 0.0005987289054673351, + "loss": 0.79789847, + "num_input_tokens_seen": 196720208, + "router_z_loss_mlp": 0.42211914, + "step": 2358, + "time_per_iteration": 2.6073710918426514 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105212, + "balance_loss_mlp": 1.01063538, + "epoch": 0.4538283955367449, + "flos": 1477793937408.0, + "grad_norm": 0.008894510659601113, + "language_loss": 0.76575738, + "learning_rate": 0.0005984234778485451, + "loss": 0.77627861, + "num_input_tokens_seen": 196947696, + "router_z_loss_mlp": 0.41503906, + "step": 2359, + "time_per_iteration": 4.796559810638428 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044324, + "balance_loss_mlp": 1.00245762, + "epoch": 0.45402077722200845, + "flos": 585797826048.0, + "grad_norm": 0.043889208643714143, + "language_loss": 0.91937214, + "learning_rate": 0.0005981180120183722, + "loss": 0.92981529, + "num_input_tokens_seen": 197015712, + "router_z_loss_mlp": 0.41894531, + "step": 2360, + "time_per_iteration": 2.6962461471557617 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104759, + "balance_loss_mlp": 1.00584316, + "epoch": 0.45421315890727204, + "flos": 532889974272.0, + "grad_norm": 0.05191452902852925, + "language_loss": 0.85740328, + "learning_rate": 0.0005978125080954089, + "loss": 0.86787915, + "num_input_tokens_seen": 197094880, + "router_z_loss_mlp": 0.41772461, + "step": 2361, + "time_per_iteration": 2.777160882949829 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049759, + "balance_loss_mlp": 1.00794065, + "epoch": 0.4544055405925356, + "flos": 786552728064.0, + "grad_norm": 0.0404371323010207, + "language_loss": 0.77941048, + "learning_rate": 0.000597506966198262, + "loss": 0.78990805, + "num_input_tokens_seen": 197176448, + "router_z_loss_mlp": 0.41845703, + "step": 2362, + "time_per_iteration": 2.9561667442321777 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048479, + "balance_loss_mlp": 1.00663614, + "epoch": 0.45459792227779916, + "flos": 519202252800.0, + "grad_norm": 0.0386377549927772, + "language_loss": 0.84570003, + "learning_rate": 0.0005972013864455536, + "loss": 0.85618478, + "num_input_tokens_seen": 197243520, + "router_z_loss_mlp": 0.41870117, + "step": 2363, + "time_per_iteration": 2.577075958251953 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049274, + "balance_loss_mlp": 1.00757432, + "epoch": 0.4547903039630627, + "flos": 538598946048.0, + "grad_norm": 0.03734609962487706, + "language_loss": 0.86156821, + "learning_rate": 0.0005968957689559203, + "loss": 0.87206089, + "num_input_tokens_seen": 197311536, + "router_z_loss_mlp": 0.41723633, + "step": 2364, + "time_per_iteration": 2.663912773132324 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047351, + "balance_loss_mlp": 1.00543737, + "epoch": 0.4549826856483263, + "flos": 529691864832.0, + "grad_norm": 0.03600076061776594, + "language_loss": 0.89443278, + "learning_rate": 0.0005965901138480131, + "loss": 0.90490627, + "num_input_tokens_seen": 197382752, + "router_z_loss_mlp": 0.41943359, + "step": 2365, + "time_per_iteration": 2.635735034942627 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048292, + "balance_loss_mlp": 1.00633037, + "epoch": 0.45517506733358987, + "flos": 521983345152.0, + "grad_norm": 0.04096543812015268, + "language_loss": 0.87860775, + "learning_rate": 0.0005962844212404982, + "loss": 0.88909072, + "num_input_tokens_seen": 197456592, + "router_z_loss_mlp": 0.41992188, + "step": 2366, + "time_per_iteration": 2.675039291381836 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049904, + "balance_loss_mlp": 1.00799048, + "epoch": 0.4553674490188534, + "flos": 452009827584.0, + "grad_norm": 0.02917585056549172, + "language_loss": 0.88090932, + "learning_rate": 0.0005959786912520558, + "loss": 0.89140838, + "num_input_tokens_seen": 197525408, + "router_z_loss_mlp": 0.41943359, + "step": 2367, + "time_per_iteration": 2.605693817138672 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046028, + "balance_loss_mlp": 1.00399494, + "epoch": 0.455559830704117, + "flos": 547745154816.0, + "grad_norm": 0.029185999772899627, + "language_loss": 0.84459692, + "learning_rate": 0.0005956729240013806, + "loss": 0.85505724, + "num_input_tokens_seen": 197608480, + "router_z_loss_mlp": 0.4206543, + "step": 2368, + "time_per_iteration": 2.792929172515869 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104665, + "balance_loss_mlp": 1.00447345, + "epoch": 0.4557522123893805, + "flos": 584866582272.0, + "grad_norm": 0.02991931447914949, + "language_loss": 0.92050606, + "learning_rate": 0.0005953671196071824, + "loss": 0.93097258, + "num_input_tokens_seen": 197678416, + "router_z_loss_mlp": 0.42211914, + "step": 2369, + "time_per_iteration": 2.7024593353271484 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052055, + "balance_loss_mlp": 1.00992644, + "epoch": 0.4559445940746441, + "flos": 527484291840.0, + "grad_norm": 0.03299201390628513, + "language_loss": 0.80723774, + "learning_rate": 0.0005950612781881846, + "loss": 0.81775832, + "num_input_tokens_seen": 197753424, + "router_z_loss_mlp": 0.42163086, + "step": 2370, + "time_per_iteration": 2.7288575172424316 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048274, + "balance_loss_mlp": 1.0061928, + "epoch": 0.45613697575990764, + "flos": 653368384512.0, + "grad_norm": 0.034012751150725565, + "language_loss": 0.76432264, + "learning_rate": 0.0005947553998631259, + "loss": 0.77480543, + "num_input_tokens_seen": 197832080, + "router_z_loss_mlp": 0.42114258, + "step": 2371, + "time_per_iteration": 2.865060567855835 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051777, + "balance_loss_mlp": 1.00976777, + "epoch": 0.4563293574451712, + "flos": 868624633344.0, + "grad_norm": 0.02789239974176414, + "language_loss": 0.79458821, + "learning_rate": 0.000594449484750758, + "loss": 0.80510592, + "num_input_tokens_seen": 197919536, + "router_z_loss_mlp": 0.42041016, + "step": 2372, + "time_per_iteration": 3.147550344467163 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044503, + "balance_loss_mlp": 1.00242209, + "epoch": 0.45652173913043476, + "flos": 499132885248.0, + "grad_norm": 0.03342359133343608, + "language_loss": 0.83513892, + "learning_rate": 0.0005941435329698484, + "loss": 0.84558398, + "num_input_tokens_seen": 197991872, + "router_z_loss_mlp": 0.42114258, + "step": 2373, + "time_per_iteration": 2.6924219131469727 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046399, + "balance_loss_mlp": 1.00441325, + "epoch": 0.45671412081569834, + "flos": 561959741952.0, + "grad_norm": 0.03267163379038315, + "language_loss": 0.83796972, + "learning_rate": 0.0005938375446391778, + "loss": 0.84843373, + "num_input_tokens_seen": 198063392, + "router_z_loss_mlp": 0.42016602, + "step": 2374, + "time_per_iteration": 2.731687307357788 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044896, + "balance_loss_mlp": 1.00281477, + "epoch": 0.45690650250096193, + "flos": 504123495936.0, + "grad_norm": 0.03711297965033783, + "language_loss": 0.89367199, + "learning_rate": 0.0005935315198775415, + "loss": 0.90412098, + "num_input_tokens_seen": 198131232, + "router_z_loss_mlp": 0.42114258, + "step": 2375, + "time_per_iteration": 2.679049015045166 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046184, + "balance_loss_mlp": 1.0040555, + "epoch": 0.45709888418622546, + "flos": 431599265280.0, + "grad_norm": 0.033405413713201326, + "language_loss": 0.87559128, + "learning_rate": 0.0005932254588037486, + "loss": 0.88605309, + "num_input_tokens_seen": 198194944, + "router_z_loss_mlp": 0.42163086, + "step": 2376, + "time_per_iteration": 2.5139987468719482 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045834, + "balance_loss_mlp": 1.00384891, + "epoch": 0.45729126587148905, + "flos": 526693999104.0, + "grad_norm": 0.034118342932564036, + "language_loss": 0.86638731, + "learning_rate": 0.000592919361536623, + "loss": 0.87684566, + "num_input_tokens_seen": 198265728, + "router_z_loss_mlp": 0.42016602, + "step": 2377, + "time_per_iteration": 2.652921438217163 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047367, + "balance_loss_mlp": 1.00545263, + "epoch": 0.4574836475567526, + "flos": 639148939776.0, + "grad_norm": 0.03214355149845838, + "language_loss": 0.89487022, + "learning_rate": 0.0005926132281950017, + "loss": 0.90534389, + "num_input_tokens_seen": 198336640, + "router_z_loss_mlp": 0.41943359, + "step": 2378, + "time_per_iteration": 2.7740533351898193 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050302, + "balance_loss_mlp": 1.00819683, + "epoch": 0.45767602924201617, + "flos": 650791426560.0, + "grad_norm": 0.03291422707035226, + "language_loss": 0.85368007, + "learning_rate": 0.0005923070588977367, + "loss": 0.86418307, + "num_input_tokens_seen": 198413552, + "router_z_loss_mlp": 0.42138672, + "step": 2379, + "time_per_iteration": 2.8456881046295166 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050793, + "balance_loss_mlp": 1.00873554, + "epoch": 0.4578684109272797, + "flos": 747963475968.0, + "grad_norm": 0.03509802642472786, + "language_loss": 0.86739749, + "learning_rate": 0.0005920008537636931, + "loss": 0.87790543, + "num_input_tokens_seen": 198490864, + "router_z_loss_mlp": 0.42089844, + "step": 2380, + "time_per_iteration": 2.910720109939575 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048858, + "balance_loss_mlp": 1.00692058, + "epoch": 0.4580607926125433, + "flos": 642729073152.0, + "grad_norm": 0.029242782263759974, + "language_loss": 0.87235177, + "learning_rate": 0.0005916946129117504, + "loss": 0.88284034, + "num_input_tokens_seen": 198571200, + "router_z_loss_mlp": 0.41967773, + "step": 2381, + "time_per_iteration": 2.8813161849975586 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051516, + "balance_loss_mlp": 1.00948262, + "epoch": 0.4582531742978069, + "flos": 803240260608.0, + "grad_norm": 0.03239264438363608, + "language_loss": 0.81130052, + "learning_rate": 0.0005913883364608017, + "loss": 0.82181567, + "num_input_tokens_seen": 198658624, + "router_z_loss_mlp": 0.4206543, + "step": 2382, + "time_per_iteration": 3.062751531600952 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105103, + "balance_loss_mlp": 1.00914025, + "epoch": 0.4584455559830704, + "flos": 685518643200.0, + "grad_norm": 0.031797549541833704, + "language_loss": 0.88895178, + "learning_rate": 0.0005910820245297542, + "loss": 0.8994621, + "num_input_tokens_seen": 198731312, + "router_z_loss_mlp": 0.41918945, + "step": 2383, + "time_per_iteration": 2.8653757572174072 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045812, + "balance_loss_mlp": 1.00387442, + "epoch": 0.458637937668334, + "flos": 519282932736.0, + "grad_norm": 0.03550111139800055, + "language_loss": 0.80986464, + "learning_rate": 0.000590775677237529, + "loss": 0.82032269, + "num_input_tokens_seen": 198805296, + "router_z_loss_mlp": 0.41967773, + "step": 2384, + "time_per_iteration": 2.7324440479278564 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046116, + "balance_loss_mlp": 1.0042969, + "epoch": 0.4588303193535975, + "flos": 506533257984.0, + "grad_norm": 0.03366806840699952, + "language_loss": 0.80683196, + "learning_rate": 0.0005904692947030601, + "loss": 0.81729311, + "num_input_tokens_seen": 198872112, + "router_z_loss_mlp": 0.41845703, + "step": 2385, + "time_per_iteration": 2.5837819576263428 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043605, + "balance_loss_mlp": 1.00176287, + "epoch": 0.4590227010388611, + "flos": 496909761024.0, + "grad_norm": 0.03855013464211847, + "language_loss": 0.89966094, + "learning_rate": 0.0005901628770452963, + "loss": 0.91009706, + "num_input_tokens_seen": 198938480, + "router_z_loss_mlp": 0.41870117, + "step": 2386, + "time_per_iteration": 2.60300350189209 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043124, + "balance_loss_mlp": 1.00132906, + "epoch": 0.45921508272412465, + "flos": 494602066176.0, + "grad_norm": 0.034718704885035666, + "language_loss": 0.87768519, + "learning_rate": 0.000589856424383199, + "loss": 0.88811642, + "num_input_tokens_seen": 199008608, + "router_z_loss_mlp": 0.41821289, + "step": 2387, + "time_per_iteration": 2.6108267307281494 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044169, + "balance_loss_mlp": 1.00232685, + "epoch": 0.45940746440938823, + "flos": 692593372416.0, + "grad_norm": 0.03330437261727838, + "language_loss": 0.83652228, + "learning_rate": 0.000589549936835744, + "loss": 0.846964, + "num_input_tokens_seen": 199084592, + "router_z_loss_mlp": 0.41870117, + "step": 2388, + "time_per_iteration": 2.8968546390533447 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104723, + "balance_loss_mlp": 1.00545883, + "epoch": 0.45959984609465176, + "flos": 504737844480.0, + "grad_norm": 0.03238722342606361, + "language_loss": 0.79404306, + "learning_rate": 0.0005892434145219202, + "loss": 0.80451536, + "num_input_tokens_seen": 199151504, + "router_z_loss_mlp": 0.41796875, + "step": 2389, + "time_per_iteration": 2.6019601821899414 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045231, + "balance_loss_mlp": 1.00350749, + "epoch": 0.45979222777991535, + "flos": 677840259072.0, + "grad_norm": 0.03571192687498619, + "language_loss": 0.83136904, + "learning_rate": 0.0005889368575607303, + "loss": 0.84182131, + "num_input_tokens_seen": 199224528, + "router_z_loss_mlp": 0.41748047, + "step": 2390, + "time_per_iteration": 2.8418307304382324 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042843, + "balance_loss_mlp": 1.00107241, + "epoch": 0.45998460946517894, + "flos": 779039594496.0, + "grad_norm": 0.031212653964934608, + "language_loss": 0.79287618, + "learning_rate": 0.00058863026607119, + "loss": 0.80330467, + "num_input_tokens_seen": 199312512, + "router_z_loss_mlp": 0.41796875, + "step": 2391, + "time_per_iteration": 3.0931389331817627 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045182, + "balance_loss_mlp": 1.00333977, + "epoch": 0.46017699115044247, + "flos": 853022901504.0, + "grad_norm": 0.035796836390277, + "language_loss": 0.80142331, + "learning_rate": 0.0005883236401723287, + "loss": 0.8118751, + "num_input_tokens_seen": 199397216, + "router_z_loss_mlp": 0.41870117, + "step": 2392, + "time_per_iteration": 3.170374631881714 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044116, + "balance_loss_mlp": 1.00222623, + "epoch": 0.46036937283570606, + "flos": 576964621824.0, + "grad_norm": 0.03330985308732758, + "language_loss": 0.84980971, + "learning_rate": 0.0005880169799831893, + "loss": 0.86025083, + "num_input_tokens_seen": 199464288, + "router_z_loss_mlp": 0.41918945, + "step": 2393, + "time_per_iteration": 2.693976879119873 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048284, + "balance_loss_mlp": 1.00641727, + "epoch": 0.4605617545209696, + "flos": 613120779264.0, + "grad_norm": 0.03386951364717573, + "language_loss": 0.82288468, + "learning_rate": 0.0005877102856228278, + "loss": 0.83336759, + "num_input_tokens_seen": 199538096, + "router_z_loss_mlp": 0.41894531, + "step": 2394, + "time_per_iteration": 2.8137876987457275 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104836, + "balance_loss_mlp": 1.0063504, + "epoch": 0.4607541362062332, + "flos": 534159500544.0, + "grad_norm": 0.06543347642857557, + "language_loss": 0.85095239, + "learning_rate": 0.0005874035572103133, + "loss": 0.86143595, + "num_input_tokens_seen": 199609504, + "router_z_loss_mlp": 0.42041016, + "step": 2395, + "time_per_iteration": 2.6604816913604736 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046268, + "balance_loss_mlp": 1.0043298, + "epoch": 0.4609465178914967, + "flos": 648474983424.0, + "grad_norm": 0.04503809754512356, + "language_loss": 0.83026469, + "learning_rate": 0.0005870967948647288, + "loss": 0.84072733, + "num_input_tokens_seen": 199678960, + "router_z_loss_mlp": 0.41967773, + "step": 2396, + "time_per_iteration": 2.8022336959838867 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047306, + "balance_loss_mlp": 1.00658417, + "epoch": 0.4611388995767603, + "flos": 1469501204736.0, + "grad_norm": 0.004136605290049959, + "language_loss": 0.743083, + "learning_rate": 0.0005867899987051693, + "loss": 0.75355613, + "num_input_tokens_seen": 199903568, + "router_z_loss_mlp": 0.40722656, + "step": 2397, + "time_per_iteration": 5.5826334953308105 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045565, + "balance_loss_mlp": 1.00350773, + "epoch": 0.46133128126202383, + "flos": 724477280256.0, + "grad_norm": 0.03194619056097999, + "language_loss": 0.86316049, + "learning_rate": 0.0005864831688507443, + "loss": 0.8736161, + "num_input_tokens_seen": 199988672, + "router_z_loss_mlp": 0.42089844, + "step": 2398, + "time_per_iteration": 3.0160725116729736 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051143, + "balance_loss_mlp": 1.00903809, + "epoch": 0.4615236629472874, + "flos": 549114802944.0, + "grad_norm": 0.0336665595141197, + "language_loss": 0.75746781, + "learning_rate": 0.0005861763054205754, + "loss": 0.76797926, + "num_input_tokens_seen": 200062304, + "router_z_loss_mlp": 0.42138672, + "step": 2399, + "time_per_iteration": 2.7720346450805664 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052341, + "balance_loss_mlp": 1.01011705, + "epoch": 0.461716044632551, + "flos": 603460343808.0, + "grad_norm": 0.030278987672658065, + "language_loss": 0.80694187, + "learning_rate": 0.0005858694085337976, + "loss": 0.81746531, + "num_input_tokens_seen": 200138464, + "router_z_loss_mlp": 0.42260742, + "step": 2400, + "time_per_iteration": 2.790825366973877 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049426, + "balance_loss_mlp": 1.00722611, + "epoch": 0.46190842631781454, + "flos": 475437697536.0, + "grad_norm": 0.03561782978750914, + "language_loss": 0.83960855, + "learning_rate": 0.0005855624783095589, + "loss": 0.85010278, + "num_input_tokens_seen": 200205728, + "router_z_loss_mlp": 0.42236328, + "step": 2401, + "time_per_iteration": 2.5512595176696777 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051216, + "balance_loss_mlp": 1.00930238, + "epoch": 0.4621008080030781, + "flos": 438402786048.0, + "grad_norm": 0.034731386600305836, + "language_loss": 0.85895813, + "learning_rate": 0.00058525551486702, + "loss": 0.86947024, + "num_input_tokens_seen": 200269824, + "router_z_loss_mlp": 0.41943359, + "step": 2402, + "time_per_iteration": 2.5168349742889404 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049789, + "balance_loss_mlp": 1.0077796, + "epoch": 0.46229318968834165, + "flos": 526498612992.0, + "grad_norm": 0.03903258697063272, + "language_loss": 0.81848848, + "learning_rate": 0.0005849485183253548, + "loss": 0.82898641, + "num_input_tokens_seen": 200341264, + "router_z_loss_mlp": 0.42041016, + "step": 2403, + "time_per_iteration": 2.640596389770508 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043904, + "balance_loss_mlp": 1.00213277, + "epoch": 0.46248557137360524, + "flos": 440534536704.0, + "grad_norm": 0.0318215105397156, + "language_loss": 0.87703103, + "learning_rate": 0.0005846414888037501, + "loss": 0.88747007, + "num_input_tokens_seen": 200405632, + "router_z_loss_mlp": 0.41796875, + "step": 2404, + "time_per_iteration": 2.4814634323120117 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046869, + "balance_loss_mlp": 1.00516927, + "epoch": 0.4626779530588688, + "flos": 618773370624.0, + "grad_norm": 0.036713203920182555, + "language_loss": 0.8266353, + "learning_rate": 0.0005843344264214049, + "loss": 0.83710396, + "num_input_tokens_seen": 200479312, + "router_z_loss_mlp": 0.41723633, + "step": 2405, + "time_per_iteration": 2.7493507862091064 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046289, + "balance_loss_mlp": 1.00461316, + "epoch": 0.46287033474413236, + "flos": 671360436480.0, + "grad_norm": 0.031131832431387497, + "language_loss": 0.85281026, + "learning_rate": 0.0005840273312975317, + "loss": 0.86327314, + "num_input_tokens_seen": 200552976, + "router_z_loss_mlp": 0.41699219, + "step": 2406, + "time_per_iteration": 2.8235156536102295 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045049, + "balance_loss_mlp": 1.00332618, + "epoch": 0.46306271642939595, + "flos": 481199159040.0, + "grad_norm": 0.037353418102982906, + "language_loss": 0.90573472, + "learning_rate": 0.0005837202035513555, + "loss": 0.91618526, + "num_input_tokens_seen": 200621088, + "router_z_loss_mlp": 0.41748047, + "step": 2407, + "time_per_iteration": 2.5672457218170166 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043171, + "balance_loss_mlp": 1.001472, + "epoch": 0.4632550981146595, + "flos": 581858022912.0, + "grad_norm": 0.03272683029516706, + "language_loss": 0.81903768, + "learning_rate": 0.0005834130433021136, + "loss": 0.82946944, + "num_input_tokens_seen": 200698400, + "router_z_loss_mlp": 0.41723633, + "step": 2408, + "time_per_iteration": 4.229294538497925 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042173, + "balance_loss_mlp": 1.00044954, + "epoch": 0.46344747979992307, + "flos": 525018149376.0, + "grad_norm": 0.030754893265702864, + "language_loss": 0.73835284, + "learning_rate": 0.0005831058506690563, + "loss": 0.74877453, + "num_input_tokens_seen": 200767264, + "router_z_loss_mlp": 0.41748047, + "step": 2409, + "time_per_iteration": 2.614616632461548 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043301, + "balance_loss_mlp": 1.00183976, + "epoch": 0.4636398614851866, + "flos": 747813776640.0, + "grad_norm": 0.03608107183813509, + "language_loss": 0.86105043, + "learning_rate": 0.0005827986257714464, + "loss": 0.87148345, + "num_input_tokens_seen": 200841440, + "router_z_loss_mlp": 0.41479492, + "step": 2410, + "time_per_iteration": 2.953162670135498 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051055, + "balance_loss_mlp": 1.00935507, + "epoch": 0.4638322431704502, + "flos": 597646392576.0, + "grad_norm": 0.032192415237476964, + "language_loss": 0.89042687, + "learning_rate": 0.0005824913687285591, + "loss": 0.90093744, + "num_input_tokens_seen": 200911296, + "router_z_loss_mlp": 0.41723633, + "step": 2411, + "time_per_iteration": 2.685081958770752 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045225, + "balance_loss_mlp": 1.00357294, + "epoch": 0.4640246248557137, + "flos": 540533365248.0, + "grad_norm": 0.03324810257023632, + "language_loss": 0.82180583, + "learning_rate": 0.0005821840796596821, + "loss": 0.83225811, + "num_input_tokens_seen": 200981920, + "router_z_loss_mlp": 0.41674805, + "step": 2412, + "time_per_iteration": 2.7183375358581543 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045715, + "balance_loss_mlp": 1.00403953, + "epoch": 0.4642170065409773, + "flos": 563809590528.0, + "grad_norm": 0.030050486484180242, + "language_loss": 0.80926406, + "learning_rate": 0.0005818767586841158, + "loss": 0.81972128, + "num_input_tokens_seen": 201059392, + "router_z_loss_mlp": 0.41699219, + "step": 2413, + "time_per_iteration": 2.7701165676116943 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050594, + "balance_loss_mlp": 1.00884688, + "epoch": 0.46440938822624084, + "flos": 532062743040.0, + "grad_norm": 0.027541485530404662, + "language_loss": 0.86138541, + "learning_rate": 0.0005815694059211726, + "loss": 0.87189138, + "num_input_tokens_seen": 201130192, + "router_z_loss_mlp": 0.41772461, + "step": 2414, + "time_per_iteration": 2.668760061264038 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104752, + "balance_loss_mlp": 1.00717926, + "epoch": 0.4646017699115044, + "flos": 1529627649024.0, + "grad_norm": 0.008676045744997887, + "language_loss": 0.80873632, + "learning_rate": 0.0005812620214901778, + "loss": 0.81921148, + "num_input_tokens_seen": 201354720, + "router_z_loss_mlp": 0.40332031, + "step": 2415, + "time_per_iteration": 4.801916599273682 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054371, + "balance_loss_mlp": 1.01403046, + "epoch": 0.464794151596768, + "flos": 1544174682624.0, + "grad_norm": 0.009441918844152984, + "language_loss": 0.7694506, + "learning_rate": 0.000580954605510468, + "loss": 0.77999437, + "num_input_tokens_seen": 201592096, + "router_z_loss_mlp": 0.40332031, + "step": 2416, + "time_per_iteration": 4.990759372711182 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045947, + "balance_loss_mlp": 1.0040803, + "epoch": 0.46498653328203154, + "flos": 502539019776.0, + "grad_norm": 0.03083676606802021, + "language_loss": 0.86654723, + "learning_rate": 0.0005806471581013931, + "loss": 0.87700671, + "num_input_tokens_seen": 201666160, + "router_z_loss_mlp": 0.41894531, + "step": 2417, + "time_per_iteration": 2.6697516441345215 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046587, + "balance_loss_mlp": 1.00452995, + "epoch": 0.46517891496729513, + "flos": 677301732864.0, + "grad_norm": 0.03671323650301262, + "language_loss": 0.79226685, + "learning_rate": 0.0005803396793823146, + "loss": 0.80273271, + "num_input_tokens_seen": 201733552, + "router_z_loss_mlp": 0.42089844, + "step": 2418, + "time_per_iteration": 2.8375697135925293 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054201, + "balance_loss_mlp": 1.01212037, + "epoch": 0.46537129665255866, + "flos": 586512296448.0, + "grad_norm": 0.037063881541601694, + "language_loss": 0.86435425, + "learning_rate": 0.0005800321694726065, + "loss": 0.87489623, + "num_input_tokens_seen": 201806128, + "router_z_loss_mlp": 0.42114258, + "step": 2419, + "time_per_iteration": 2.7743778228759766 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053515, + "balance_loss_mlp": 1.01136279, + "epoch": 0.46556367833782225, + "flos": 588821936640.0, + "grad_norm": 0.0340005426894483, + "language_loss": 0.87128568, + "learning_rate": 0.0005797246284916545, + "loss": 0.8818208, + "num_input_tokens_seen": 201874224, + "router_z_loss_mlp": 0.421875, + "step": 2420, + "time_per_iteration": 2.6835851669311523 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049313, + "balance_loss_mlp": 1.00878143, + "epoch": 0.4657560600230858, + "flos": 1488584893440.0, + "grad_norm": 0.006163961209168608, + "language_loss": 0.77505189, + "learning_rate": 0.0005794170565588569, + "loss": 0.78554499, + "num_input_tokens_seen": 202111648, + "router_z_loss_mlp": 0.40527344, + "step": 2421, + "time_per_iteration": 4.943193197250366 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047739, + "balance_loss_mlp": 1.00570607, + "epoch": 0.46594844170834937, + "flos": 581393373696.0, + "grad_norm": 0.03388172676180004, + "language_loss": 0.8850925, + "learning_rate": 0.0005791094537936233, + "loss": 0.89556992, + "num_input_tokens_seen": 202183344, + "router_z_loss_mlp": 0.4206543, + "step": 2422, + "time_per_iteration": 2.694913148880005 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047655, + "balance_loss_mlp": 1.00559843, + "epoch": 0.4661408233936129, + "flos": 513571048704.0, + "grad_norm": 0.036220885297141736, + "language_loss": 0.82194817, + "learning_rate": 0.0005788018203153762, + "loss": 0.83242476, + "num_input_tokens_seen": 202252512, + "router_z_loss_mlp": 0.42089844, + "step": 2423, + "time_per_iteration": 2.582130193710327 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104833, + "balance_loss_mlp": 1.006392, + "epoch": 0.4663332050788765, + "flos": 492033856512.0, + "grad_norm": 0.03516767090589214, + "language_loss": 0.86157548, + "learning_rate": 0.000578494156243549, + "loss": 0.87205875, + "num_input_tokens_seen": 202320096, + "router_z_loss_mlp": 0.41967773, + "step": 2424, + "time_per_iteration": 2.569465160369873 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047846, + "balance_loss_mlp": 1.0060271, + "epoch": 0.4665255867641401, + "flos": 513708109056.0, + "grad_norm": 0.03097112252036683, + "language_loss": 0.89247042, + "learning_rate": 0.0005781864616975878, + "loss": 0.90294886, + "num_input_tokens_seen": 202391552, + "router_z_loss_mlp": 0.41845703, + "step": 2425, + "time_per_iteration": 2.6580159664154053 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043502, + "balance_loss_mlp": 1.00175464, + "epoch": 0.4667179684494036, + "flos": 425707546368.0, + "grad_norm": 0.0331787429652153, + "language_loss": 0.84786129, + "learning_rate": 0.0005778787367969502, + "loss": 0.85829628, + "num_input_tokens_seen": 202457328, + "router_z_loss_mlp": 0.41772461, + "step": 2426, + "time_per_iteration": 2.577146291732788 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046987, + "balance_loss_mlp": 1.00526416, + "epoch": 0.4669103501346672, + "flos": 709224524544.0, + "grad_norm": 0.030186535385466236, + "language_loss": 0.81415391, + "learning_rate": 0.0005775709816611053, + "loss": 0.82462376, + "num_input_tokens_seen": 202535888, + "router_z_loss_mlp": 0.41748047, + "step": 2427, + "time_per_iteration": 2.946763515472412 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044811, + "balance_loss_mlp": 1.00294447, + "epoch": 0.4671027318199307, + "flos": 555946513920.0, + "grad_norm": 0.029160974795623382, + "language_loss": 0.83887118, + "learning_rate": 0.0005772631964095346, + "loss": 0.84931928, + "num_input_tokens_seen": 202608400, + "router_z_loss_mlp": 0.41894531, + "step": 2428, + "time_per_iteration": 2.7246575355529785 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047997, + "balance_loss_mlp": 1.0062499, + "epoch": 0.4672951135051943, + "flos": 568196546304.0, + "grad_norm": 0.03470882192857659, + "language_loss": 0.86100912, + "learning_rate": 0.000576955381161731, + "loss": 0.87148911, + "num_input_tokens_seen": 202677712, + "router_z_loss_mlp": 0.41772461, + "step": 2429, + "time_per_iteration": 2.6618916988372803 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051296, + "balance_loss_mlp": 1.00959647, + "epoch": 0.46748749519045785, + "flos": 425418841344.0, + "grad_norm": 0.034295751127670006, + "language_loss": 0.86858582, + "learning_rate": 0.0005766475360371985, + "loss": 0.87909877, + "num_input_tokens_seen": 202743824, + "router_z_loss_mlp": 0.41723633, + "step": 2430, + "time_per_iteration": 2.6010043621063232 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048678, + "balance_loss_mlp": 1.00697899, + "epoch": 0.46767987687572143, + "flos": 539371742208.0, + "grad_norm": 0.034969896754344705, + "language_loss": 0.85521102, + "learning_rate": 0.0005763396611554536, + "loss": 0.86569786, + "num_input_tokens_seen": 202813072, + "router_z_loss_mlp": 0.41723633, + "step": 2431, + "time_per_iteration": 2.6345412731170654 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045821, + "balance_loss_mlp": 1.00409806, + "epoch": 0.467872258560985, + "flos": 825076851456.0, + "grad_norm": 0.03589185796451142, + "language_loss": 0.80950278, + "learning_rate": 0.0005760317566360237, + "loss": 0.81996095, + "num_input_tokens_seen": 202886576, + "router_z_loss_mlp": 0.41748047, + "step": 2432, + "time_per_iteration": 3.0410006046295166 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050036, + "balance_loss_mlp": 1.0083127, + "epoch": 0.46806464024624855, + "flos": 662854821120.0, + "grad_norm": 0.03375923289076794, + "language_loss": 0.86271471, + "learning_rate": 0.000575723822598448, + "loss": 0.87321508, + "num_input_tokens_seen": 202956736, + "router_z_loss_mlp": 0.41748047, + "step": 2433, + "time_per_iteration": 2.7712388038635254 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044282, + "balance_loss_mlp": 1.00251079, + "epoch": 0.46825702193151214, + "flos": 757055249664.0, + "grad_norm": 0.029730946872360612, + "language_loss": 0.82302332, + "learning_rate": 0.0005754158591622773, + "loss": 0.83346617, + "num_input_tokens_seen": 203036432, + "router_z_loss_mlp": 0.41796875, + "step": 2434, + "time_per_iteration": 2.9708468914031982 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049984, + "balance_loss_mlp": 1.00818896, + "epoch": 0.4684494036167757, + "flos": 440310960384.0, + "grad_norm": 0.03563934149764459, + "language_loss": 0.83011699, + "learning_rate": 0.0005751078664470732, + "loss": 0.84061682, + "num_input_tokens_seen": 203101904, + "router_z_loss_mlp": 0.41821289, + "step": 2435, + "time_per_iteration": 2.5696167945861816 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046457, + "balance_loss_mlp": 1.00468564, + "epoch": 0.46864178530203926, + "flos": 533749286400.0, + "grad_norm": 0.031914354194682755, + "language_loss": 0.86557531, + "learning_rate": 0.0005747998445724094, + "loss": 0.87603986, + "num_input_tokens_seen": 203170272, + "router_z_loss_mlp": 0.41796875, + "step": 2436, + "time_per_iteration": 2.6336376667022705 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047101, + "balance_loss_mlp": 1.00535429, + "epoch": 0.4688341669873028, + "flos": 577826846208.0, + "grad_norm": 0.03221336233810001, + "language_loss": 0.89470494, + "learning_rate": 0.0005744917936578707, + "loss": 0.90517592, + "num_input_tokens_seen": 203243920, + "router_z_loss_mlp": 0.41772461, + "step": 2437, + "time_per_iteration": 2.7748000621795654 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054309, + "balance_loss_mlp": 1.0126332, + "epoch": 0.4690265486725664, + "flos": 540718057728.0, + "grad_norm": 0.029623138174113085, + "language_loss": 0.84520715, + "learning_rate": 0.0005741837138230526, + "loss": 0.85575026, + "num_input_tokens_seen": 203321760, + "router_z_loss_mlp": 0.41699219, + "step": 2438, + "time_per_iteration": 2.717194080352783 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047928, + "balance_loss_mlp": 1.0061574, + "epoch": 0.4692189303578299, + "flos": 771882240000.0, + "grad_norm": 0.03250588789777806, + "language_loss": 0.86937356, + "learning_rate": 0.0005738756051875627, + "loss": 0.87985283, + "num_input_tokens_seen": 203409088, + "router_z_loss_mlp": 0.41796875, + "step": 2439, + "time_per_iteration": 3.0656278133392334 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050138, + "balance_loss_mlp": 1.00846255, + "epoch": 0.4694113120430935, + "flos": 572514482688.0, + "grad_norm": 0.03167805631394848, + "language_loss": 0.84031767, + "learning_rate": 0.0005735674678710192, + "loss": 0.85081905, + "num_input_tokens_seen": 203481680, + "router_z_loss_mlp": 0.41699219, + "step": 2440, + "time_per_iteration": 2.6962802410125732 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010485, + "balance_loss_mlp": 1.00675285, + "epoch": 0.4696036937283571, + "flos": 750095226624.0, + "grad_norm": 0.037443971636707395, + "language_loss": 0.82144701, + "learning_rate": 0.0005732593019930517, + "loss": 0.83193195, + "num_input_tokens_seen": 203554848, + "router_z_loss_mlp": 0.41772461, + "step": 2441, + "time_per_iteration": 2.9041428565979004 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050874, + "balance_loss_mlp": 1.00915074, + "epoch": 0.4697960754136206, + "flos": 494443618560.0, + "grad_norm": 0.033679899008564836, + "language_loss": 0.87957233, + "learning_rate": 0.0005729511076733008, + "loss": 0.89008105, + "num_input_tokens_seen": 203624816, + "router_z_loss_mlp": 0.41748047, + "step": 2442, + "time_per_iteration": 2.6734514236450195 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01056181, + "balance_loss_mlp": 1.01433861, + "epoch": 0.4699884570988842, + "flos": 726361155072.0, + "grad_norm": 0.036289078656904894, + "language_loss": 0.85521489, + "learning_rate": 0.000572642885031418, + "loss": 0.86577672, + "num_input_tokens_seen": 203698256, + "router_z_loss_mlp": 0.41870117, + "step": 2443, + "time_per_iteration": 2.9099576473236084 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052417, + "balance_loss_mlp": 1.01062274, + "epoch": 0.47018083878414774, + "flos": 556578359040.0, + "grad_norm": 0.03125880297204364, + "language_loss": 0.81027329, + "learning_rate": 0.0005723346341870662, + "loss": 0.82079738, + "num_input_tokens_seen": 203772672, + "router_z_loss_mlp": 0.41821289, + "step": 2444, + "time_per_iteration": 2.7017409801483154 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046603, + "balance_loss_mlp": 1.00480783, + "epoch": 0.4703732204694113, + "flos": 424962940416.0, + "grad_norm": 0.03329454905005034, + "language_loss": 0.86812586, + "learning_rate": 0.0005720263552599188, + "loss": 0.8785919, + "num_input_tokens_seen": 203835904, + "router_z_loss_mlp": 0.41821289, + "step": 2445, + "time_per_iteration": 2.462155818939209 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044259, + "balance_loss_mlp": 1.00239313, + "epoch": 0.47056560215467486, + "flos": 704756888832.0, + "grad_norm": 0.03166905827629482, + "language_loss": 0.80339378, + "learning_rate": 0.0005717180483696604, + "loss": 0.81383634, + "num_input_tokens_seen": 203914704, + "router_z_loss_mlp": 0.41894531, + "step": 2446, + "time_per_iteration": 2.8927905559539795 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043265, + "balance_loss_mlp": 1.00115991, + "epoch": 0.47075798383993844, + "flos": 556013587968.0, + "grad_norm": 0.03197533000624638, + "language_loss": 0.8331126, + "learning_rate": 0.0005714097136359862, + "loss": 0.8435452, + "num_input_tokens_seen": 203985072, + "router_z_loss_mlp": 0.42138672, + "step": 2447, + "time_per_iteration": 2.632544994354248 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043755, + "balance_loss_mlp": 1.00169826, + "epoch": 0.470950365525202, + "flos": 565494188544.0, + "grad_norm": 0.028044805803111937, + "language_loss": 0.87163484, + "learning_rate": 0.0005711013511786027, + "loss": 0.88207239, + "num_input_tokens_seen": 204061904, + "router_z_loss_mlp": 0.42089844, + "step": 2448, + "time_per_iteration": 2.781325578689575 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049475, + "balance_loss_mlp": 1.00768065, + "epoch": 0.47114274721046556, + "flos": 535499013120.0, + "grad_norm": 0.029728682222295192, + "language_loss": 0.84444499, + "learning_rate": 0.0005707929611172263, + "loss": 0.8549397, + "num_input_tokens_seen": 204137392, + "router_z_loss_mlp": 0.41821289, + "step": 2449, + "time_per_iteration": 2.704754114151001 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104782, + "balance_loss_mlp": 1.00576317, + "epoch": 0.47133512889572915, + "flos": 474078743040.0, + "grad_norm": 0.03341999970225476, + "language_loss": 0.84505057, + "learning_rate": 0.000570484543571585, + "loss": 0.85552877, + "num_input_tokens_seen": 204202752, + "router_z_loss_mlp": 0.42089844, + "step": 2450, + "time_per_iteration": 2.56648850440979 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043207, + "balance_loss_mlp": 1.00129259, + "epoch": 0.4715275105809927, + "flos": 459968168448.0, + "grad_norm": 0.03640704052870178, + "language_loss": 0.83504367, + "learning_rate": 0.0005701760986614171, + "loss": 0.84547579, + "num_input_tokens_seen": 204266960, + "router_z_loss_mlp": 0.41943359, + "step": 2451, + "time_per_iteration": 2.5392374992370605 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047118, + "balance_loss_mlp": 1.00522745, + "epoch": 0.47171989226625627, + "flos": 422887570176.0, + "grad_norm": 0.0300201122524448, + "language_loss": 0.87997985, + "learning_rate": 0.0005698676265064714, + "loss": 0.89045107, + "num_input_tokens_seen": 204331216, + "router_z_loss_mlp": 0.41918945, + "step": 2452, + "time_per_iteration": 2.501518487930298 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045823, + "balance_loss_mlp": 1.00378954, + "epoch": 0.4719122739515198, + "flos": 458376889344.0, + "grad_norm": 0.036567202146268483, + "language_loss": 0.89326543, + "learning_rate": 0.0005695591272265074, + "loss": 0.90372366, + "num_input_tokens_seen": 204397216, + "router_z_loss_mlp": 0.4206543, + "step": 2453, + "time_per_iteration": 2.5203113555908203 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049066, + "balance_loss_mlp": 1.00703239, + "epoch": 0.4721046556367834, + "flos": 516017749248.0, + "grad_norm": 0.03590555599096038, + "language_loss": 0.82296801, + "learning_rate": 0.0005692506009412954, + "loss": 0.83345866, + "num_input_tokens_seen": 204469952, + "router_z_loss_mlp": 0.4206543, + "step": 2454, + "time_per_iteration": 2.703277826309204 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050072, + "balance_loss_mlp": 1.00982666, + "epoch": 0.4722970373220469, + "flos": 1575706702080.0, + "grad_norm": 0.007700978657663942, + "language_loss": 0.7755127, + "learning_rate": 0.0005689420477706156, + "loss": 0.78601336, + "num_input_tokens_seen": 204701152, + "router_z_loss_mlp": 0.40234375, + "step": 2455, + "time_per_iteration": 4.935078859329224 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045884, + "balance_loss_mlp": 1.00380278, + "epoch": 0.4724894190073105, + "flos": 587395908096.0, + "grad_norm": 0.032995428661028114, + "language_loss": 0.90020776, + "learning_rate": 0.0005686334678342593, + "loss": 0.91066664, + "num_input_tokens_seen": 204778144, + "router_z_loss_mlp": 0.42114258, + "step": 2456, + "time_per_iteration": 2.913954019546509 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104492, + "balance_loss_mlp": 1.00291097, + "epoch": 0.4726818006925741, + "flos": 869073731328.0, + "grad_norm": 0.0323844824027511, + "language_loss": 0.82033843, + "learning_rate": 0.0005683248612520274, + "loss": 0.83078766, + "num_input_tokens_seen": 204853376, + "router_z_loss_mlp": 0.42041016, + "step": 2457, + "time_per_iteration": 4.4027345180511475 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104796, + "balance_loss_mlp": 1.0055697, + "epoch": 0.4728741823778376, + "flos": 754228470528.0, + "grad_norm": 0.03548497467281451, + "language_loss": 0.84315181, + "learning_rate": 0.0005680162281437321, + "loss": 0.85363138, + "num_input_tokens_seen": 204925280, + "router_z_loss_mlp": 0.42431641, + "step": 2458, + "time_per_iteration": 2.8824384212493896 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048551, + "balance_loss_mlp": 1.00649393, + "epoch": 0.4730665640631012, + "flos": 539658501888.0, + "grad_norm": 0.029540383226657484, + "language_loss": 0.85216498, + "learning_rate": 0.000567707568629195, + "loss": 0.86265045, + "num_input_tokens_seen": 205000592, + "router_z_loss_mlp": 0.42089844, + "step": 2459, + "time_per_iteration": 2.7024879455566406 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105311, + "balance_loss_mlp": 1.01088595, + "epoch": 0.47325894574836475, + "flos": 492683198208.0, + "grad_norm": 0.02914158825310119, + "language_loss": 0.8318013, + "learning_rate": 0.0005673988828282486, + "loss": 0.84233236, + "num_input_tokens_seen": 205073968, + "router_z_loss_mlp": 0.42260742, + "step": 2460, + "time_per_iteration": 2.680508852005005 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045398, + "balance_loss_mlp": 1.00341213, + "epoch": 0.47345132743362833, + "flos": 765832073472.0, + "grad_norm": 0.11223827549321637, + "language_loss": 0.8158704, + "learning_rate": 0.0005670901708607352, + "loss": 0.82632446, + "num_input_tokens_seen": 205153536, + "router_z_loss_mlp": 0.42016602, + "step": 2461, + "time_per_iteration": 2.963573455810547 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105079, + "balance_loss_mlp": 1.00873268, + "epoch": 0.47364370911889186, + "flos": 541169101056.0, + "grad_norm": 0.03621241484942453, + "language_loss": 0.84821182, + "learning_rate": 0.0005667814328465076, + "loss": 0.85871977, + "num_input_tokens_seen": 205220944, + "router_z_loss_mlp": 0.42089844, + "step": 2462, + "time_per_iteration": 2.623180389404297 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052459, + "balance_loss_mlp": 1.01042545, + "epoch": 0.47383609080415545, + "flos": 407092397568.0, + "grad_norm": 0.0408736366196423, + "language_loss": 0.82667732, + "learning_rate": 0.0005664726689054285, + "loss": 0.83720195, + "num_input_tokens_seen": 205282688, + "router_z_loss_mlp": 0.4206543, + "step": 2463, + "time_per_iteration": 2.463602304458618 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054545, + "balance_loss_mlp": 1.01253569, + "epoch": 0.474028472489419, + "flos": 454439031552.0, + "grad_norm": 0.030418063351129263, + "language_loss": 0.81695265, + "learning_rate": 0.0005661638791573704, + "loss": 0.82749808, + "num_input_tokens_seen": 205357360, + "router_z_loss_mlp": 0.42041016, + "step": 2464, + "time_per_iteration": 2.736748695373535 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048425, + "balance_loss_mlp": 1.00651097, + "epoch": 0.47422085417468257, + "flos": 493195479552.0, + "grad_norm": 0.029840540723241396, + "language_loss": 0.87200695, + "learning_rate": 0.0005658550637222164, + "loss": 0.88249123, + "num_input_tokens_seen": 205424352, + "router_z_loss_mlp": 0.41943359, + "step": 2465, + "time_per_iteration": 2.618978261947632 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047266, + "balance_loss_mlp": 1.00532842, + "epoch": 0.47441323585994616, + "flos": 740126644224.0, + "grad_norm": 0.027711669007488924, + "language_loss": 0.82591414, + "learning_rate": 0.0005655462227198592, + "loss": 0.8363868, + "num_input_tokens_seen": 205502912, + "router_z_loss_mlp": 0.41967773, + "step": 2466, + "time_per_iteration": 2.9003212451934814 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045592, + "balance_loss_mlp": 1.00363016, + "epoch": 0.4746056175452097, + "flos": 485675543040.0, + "grad_norm": 0.03086334809399425, + "language_loss": 0.84889436, + "learning_rate": 0.0005652373562702016, + "loss": 0.85935026, + "num_input_tokens_seen": 205571168, + "router_z_loss_mlp": 0.41992188, + "step": 2467, + "time_per_iteration": 2.635524272918701 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050978, + "balance_loss_mlp": 1.00913572, + "epoch": 0.4747979992304733, + "flos": 462006600192.0, + "grad_norm": 0.030700027016666232, + "language_loss": 0.89103687, + "learning_rate": 0.000564928464493156, + "loss": 0.9015466, + "num_input_tokens_seen": 205639648, + "router_z_loss_mlp": 0.41870117, + "step": 2468, + "time_per_iteration": 2.5902397632598877 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050963, + "balance_loss_mlp": 1.00900185, + "epoch": 0.4749903809157368, + "flos": 865880479488.0, + "grad_norm": 0.04027391649848807, + "language_loss": 0.82258296, + "learning_rate": 0.000564619547508645, + "loss": 0.83309263, + "num_input_tokens_seen": 205721536, + "router_z_loss_mlp": 0.41992188, + "step": 2469, + "time_per_iteration": 3.071483850479126 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050762, + "balance_loss_mlp": 1.00877666, + "epoch": 0.4751827626010004, + "flos": 506552699904.0, + "grad_norm": 0.03439249398490307, + "language_loss": 0.83728659, + "learning_rate": 0.0005643106054366008, + "loss": 0.84779418, + "num_input_tokens_seen": 205788512, + "router_z_loss_mlp": 0.42016602, + "step": 2470, + "time_per_iteration": 2.5717906951904297 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054243, + "balance_loss_mlp": 1.01240063, + "epoch": 0.47537514428626393, + "flos": 560453033472.0, + "grad_norm": 0.030831302101538484, + "language_loss": 0.80302799, + "learning_rate": 0.000564001638396965, + "loss": 0.81357038, + "num_input_tokens_seen": 205863104, + "router_z_loss_mlp": 0.41870117, + "step": 2471, + "time_per_iteration": 2.807666540145874 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010519, + "balance_loss_mlp": 1.01008177, + "epoch": 0.4755675259715275, + "flos": 835677278976.0, + "grad_norm": 0.03000607606640632, + "language_loss": 0.82444054, + "learning_rate": 0.0005636926465096897, + "loss": 0.83495951, + "num_input_tokens_seen": 205940688, + "router_z_loss_mlp": 0.41845703, + "step": 2472, + "time_per_iteration": 3.0930862426757812 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052858, + "balance_loss_mlp": 1.01106381, + "epoch": 0.47575990765679105, + "flos": 509233670400.0, + "grad_norm": 0.03423576863830587, + "language_loss": 0.88083971, + "learning_rate": 0.0005633836298947363, + "loss": 0.89136827, + "num_input_tokens_seen": 206008352, + "router_z_loss_mlp": 0.41821289, + "step": 2473, + "time_per_iteration": 2.5820775032043457 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050107, + "balance_loss_mlp": 1.00819325, + "epoch": 0.47595228934205464, + "flos": 592963928832.0, + "grad_norm": 0.03298724569498326, + "language_loss": 0.71285135, + "learning_rate": 0.000563074588672075, + "loss": 0.72335243, + "num_input_tokens_seen": 206078240, + "router_z_loss_mlp": 0.41943359, + "step": 2474, + "time_per_iteration": 2.693268299102783 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054207, + "balance_loss_mlp": 1.01231647, + "epoch": 0.4761446710273182, + "flos": 581684024064.0, + "grad_norm": 0.03213378714772974, + "language_loss": 0.85775197, + "learning_rate": 0.0005627655229616868, + "loss": 0.86829406, + "num_input_tokens_seen": 206148896, + "router_z_loss_mlp": 0.41918945, + "step": 2475, + "time_per_iteration": 2.719207286834717 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051223, + "balance_loss_mlp": 1.00933242, + "epoch": 0.47633705271258175, + "flos": 674080290816.0, + "grad_norm": 0.026991444464169446, + "language_loss": 0.9029963, + "learning_rate": 0.0005624564328835616, + "loss": 0.91350853, + "num_input_tokens_seen": 206223792, + "router_z_loss_mlp": 0.41918945, + "step": 2476, + "time_per_iteration": 2.793189764022827 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054365, + "balance_loss_mlp": 1.0125705, + "epoch": 0.47652943439784534, + "flos": 542971317504.0, + "grad_norm": 0.02962321585608733, + "language_loss": 0.84663439, + "learning_rate": 0.0005621473185576986, + "loss": 0.85717803, + "num_input_tokens_seen": 206299376, + "router_z_loss_mlp": 0.41821289, + "step": 2477, + "time_per_iteration": 2.7773327827453613 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050086, + "balance_loss_mlp": 1.00822008, + "epoch": 0.4767218160831089, + "flos": 525847325952.0, + "grad_norm": 0.03556533386707064, + "language_loss": 0.87709439, + "learning_rate": 0.0005618381801041068, + "loss": 0.8875953, + "num_input_tokens_seen": 206367936, + "router_z_loss_mlp": 0.41894531, + "step": 2478, + "time_per_iteration": 2.6155920028686523 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053209, + "balance_loss_mlp": 1.0111047, + "epoch": 0.47691419776837246, + "flos": 569127790080.0, + "grad_norm": 0.035286823129286084, + "language_loss": 0.83750623, + "learning_rate": 0.0005615290176428044, + "loss": 0.84803832, + "num_input_tokens_seen": 206438864, + "router_z_loss_mlp": 0.42138672, + "step": 2479, + "time_per_iteration": 2.6538074016571045 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049988, + "balance_loss_mlp": 1.00802612, + "epoch": 0.477106579453636, + "flos": 532025804544.0, + "grad_norm": 0.0314839310376407, + "language_loss": 0.85928833, + "learning_rate": 0.0005612198312938187, + "loss": 0.86978817, + "num_input_tokens_seen": 206516656, + "router_z_loss_mlp": 0.41992188, + "step": 2480, + "time_per_iteration": 2.781107187271118 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051481, + "balance_loss_mlp": 1.00937629, + "epoch": 0.4772989611388996, + "flos": 595502002944.0, + "grad_norm": 0.03185012593036433, + "language_loss": 0.79825139, + "learning_rate": 0.0005609106211771868, + "loss": 0.80876625, + "num_input_tokens_seen": 206595040, + "router_z_loss_mlp": 0.42138672, + "step": 2481, + "time_per_iteration": 2.854200839996338 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049319, + "balance_loss_mlp": 1.00702322, + "epoch": 0.4774913428241631, + "flos": 545708668416.0, + "grad_norm": 0.032298555104441296, + "language_loss": 0.89798552, + "learning_rate": 0.0005606013874129543, + "loss": 0.90847874, + "num_input_tokens_seen": 206670192, + "router_z_loss_mlp": 0.42333984, + "step": 2482, + "time_per_iteration": 2.8364884853363037 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044489, + "balance_loss_mlp": 1.00214577, + "epoch": 0.4776837245094267, + "flos": 541130217216.0, + "grad_norm": 0.031860038244933726, + "language_loss": 0.8004725, + "learning_rate": 0.0005602921301211768, + "loss": 0.81091738, + "num_input_tokens_seen": 206746992, + "router_z_loss_mlp": 0.42382812, + "step": 2483, + "time_per_iteration": 2.719606399536133 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044245, + "balance_loss_mlp": 1.00185454, + "epoch": 0.4778761061946903, + "flos": 472756727040.0, + "grad_norm": 0.037639636071959574, + "language_loss": 0.82567894, + "learning_rate": 0.0005599828494219185, + "loss": 0.83612138, + "num_input_tokens_seen": 206813584, + "router_z_loss_mlp": 0.42431641, + "step": 2484, + "time_per_iteration": 2.5541560649871826 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047753, + "balance_loss_mlp": 1.00548136, + "epoch": 0.4780684878799538, + "flos": 727338085632.0, + "grad_norm": 0.033674716450053835, + "language_loss": 0.89748895, + "learning_rate": 0.0005596735454352527, + "loss": 0.90796649, + "num_input_tokens_seen": 206885840, + "router_z_loss_mlp": 0.4230957, + "step": 2485, + "time_per_iteration": 2.9516124725341797 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051411, + "balance_loss_mlp": 1.00921071, + "epoch": 0.4782608695652174, + "flos": 549954673152.0, + "grad_norm": 0.03622289239904689, + "language_loss": 0.86092174, + "learning_rate": 0.0005593642182812619, + "loss": 0.87143582, + "num_input_tokens_seen": 206955104, + "router_z_loss_mlp": 0.42236328, + "step": 2486, + "time_per_iteration": 2.643221139907837 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054193, + "balance_loss_mlp": 1.01192153, + "epoch": 0.47845325125048094, + "flos": 831403084032.0, + "grad_norm": 0.035916445699024475, + "language_loss": 0.84163451, + "learning_rate": 0.0005590548680800378, + "loss": 0.85217643, + "num_input_tokens_seen": 207039792, + "router_z_loss_mlp": 0.4230957, + "step": 2487, + "time_per_iteration": 3.1013588905334473 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105581, + "balance_loss_mlp": 1.01356208, + "epoch": 0.4786456329357445, + "flos": 515271197952.0, + "grad_norm": 0.032399463516541584, + "language_loss": 0.76797146, + "learning_rate": 0.0005587454949516804, + "loss": 0.77852952, + "num_input_tokens_seen": 207115632, + "router_z_loss_mlp": 0.42285156, + "step": 2488, + "time_per_iteration": 2.7681314945220947 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105205, + "balance_loss_mlp": 1.00992179, + "epoch": 0.47883801462100806, + "flos": 565730403840.0, + "grad_norm": 0.034669501918414815, + "language_loss": 0.88538134, + "learning_rate": 0.0005584360990162993, + "loss": 0.89590186, + "num_input_tokens_seen": 207184336, + "router_z_loss_mlp": 0.42163086, + "step": 2489, + "time_per_iteration": 2.6323490142822266 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105419, + "balance_loss_mlp": 1.01196563, + "epoch": 0.47903039630627164, + "flos": 580705148160.0, + "grad_norm": 0.028676455513171533, + "language_loss": 0.85944891, + "learning_rate": 0.0005581266803940124, + "loss": 0.86999071, + "num_input_tokens_seen": 207258720, + "router_z_loss_mlp": 0.42260742, + "step": 2490, + "time_per_iteration": 2.758180856704712 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051095, + "balance_loss_mlp": 1.00891864, + "epoch": 0.47922277799153523, + "flos": 620086638336.0, + "grad_norm": 0.029629924190795385, + "language_loss": 0.8824507, + "learning_rate": 0.0005578172392049471, + "loss": 0.89296162, + "num_input_tokens_seen": 207329216, + "router_z_loss_mlp": 0.42211914, + "step": 2491, + "time_per_iteration": 2.733055353164673 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049931, + "balance_loss_mlp": 1.00787377, + "epoch": 0.47941515967679876, + "flos": 640859782656.0, + "grad_norm": 0.03401187912624355, + "language_loss": 0.84927547, + "learning_rate": 0.0005575077755692386, + "loss": 0.85977477, + "num_input_tokens_seen": 207403712, + "router_z_loss_mlp": 0.42089844, + "step": 2492, + "time_per_iteration": 2.7897393703460693 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051779, + "balance_loss_mlp": 1.00988865, + "epoch": 0.47960754136206235, + "flos": 520876157184.0, + "grad_norm": 0.02611914925979928, + "language_loss": 0.8632732, + "learning_rate": 0.0005571982896070316, + "loss": 0.87379098, + "num_input_tokens_seen": 207477120, + "router_z_loss_mlp": 0.41918945, + "step": 2493, + "time_per_iteration": 2.667999744415283 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051996, + "balance_loss_mlp": 1.01010633, + "epoch": 0.4797999230473259, + "flos": 476032604160.0, + "grad_norm": 0.03441931276085345, + "language_loss": 0.90227294, + "learning_rate": 0.0005568887814384792, + "loss": 0.9127928, + "num_input_tokens_seen": 207544592, + "router_z_loss_mlp": 0.41918945, + "step": 2494, + "time_per_iteration": 2.5400681495666504 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105198, + "balance_loss_mlp": 1.01023245, + "epoch": 0.47999230473258947, + "flos": 533069809152.0, + "grad_norm": 0.031194267436751296, + "language_loss": 0.87632048, + "learning_rate": 0.000556579251183743, + "loss": 0.88684028, + "num_input_tokens_seen": 207613808, + "router_z_loss_mlp": 0.41772461, + "step": 2495, + "time_per_iteration": 2.662360906600952 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047828, + "balance_loss_mlp": 1.00615287, + "epoch": 0.480184686417853, + "flos": 602606867712.0, + "grad_norm": 0.03455941378420467, + "language_loss": 0.8073976, + "learning_rate": 0.0005562696989629936, + "loss": 0.81787586, + "num_input_tokens_seen": 207684464, + "router_z_loss_mlp": 0.41699219, + "step": 2496, + "time_per_iteration": 2.677384614944458 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049959, + "balance_loss_mlp": 1.00837922, + "epoch": 0.4803770681031166, + "flos": 529262208768.0, + "grad_norm": 0.02987635047659329, + "language_loss": 0.83264202, + "learning_rate": 0.0005559601248964095, + "loss": 0.84314156, + "num_input_tokens_seen": 207754016, + "router_z_loss_mlp": 0.41601562, + "step": 2497, + "time_per_iteration": 2.629697322845459 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052806, + "balance_loss_mlp": 1.01132119, + "epoch": 0.4805694497883801, + "flos": 512229590784.0, + "grad_norm": 0.031958617017597245, + "language_loss": 0.86286914, + "learning_rate": 0.0005556505291041783, + "loss": 0.87339711, + "num_input_tokens_seen": 207827104, + "router_z_loss_mlp": 0.41503906, + "step": 2498, + "time_per_iteration": 2.6821835041046143 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105189, + "balance_loss_mlp": 1.0103811, + "epoch": 0.4807618314736437, + "flos": 601606604544.0, + "grad_norm": 0.02993690761083535, + "language_loss": 0.84804475, + "learning_rate": 0.0005553409117064954, + "loss": 0.85856366, + "num_input_tokens_seen": 207907824, + "router_z_loss_mlp": 0.4152832, + "step": 2499, + "time_per_iteration": 2.868149518966675 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047722, + "balance_loss_mlp": 1.00626087, + "epoch": 0.4809542131589073, + "flos": 570030843648.0, + "grad_norm": 0.03218775088546566, + "language_loss": 0.85501659, + "learning_rate": 0.0005550312728235654, + "loss": 0.86549377, + "num_input_tokens_seen": 207975632, + "router_z_loss_mlp": 0.41479492, + "step": 2500, + "time_per_iteration": 2.6775684356689453 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049066, + "balance_loss_mlp": 1.00767624, + "epoch": 0.4811465948441708, + "flos": 577166810880.0, + "grad_norm": 0.03560315442462447, + "language_loss": 0.84339613, + "learning_rate": 0.0005547216125756003, + "loss": 0.85388672, + "num_input_tokens_seen": 208048000, + "router_z_loss_mlp": 0.4140625, + "step": 2501, + "time_per_iteration": 2.730938196182251 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051501, + "balance_loss_mlp": 1.01011145, + "epoch": 0.4813389765294344, + "flos": 825298482432.0, + "grad_norm": 0.030150461655227775, + "language_loss": 0.82324314, + "learning_rate": 0.0005544119310828211, + "loss": 0.83375812, + "num_input_tokens_seen": 208132592, + "router_z_loss_mlp": 0.4140625, + "step": 2502, + "time_per_iteration": 3.113402843475342 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053515, + "balance_loss_mlp": 1.01203024, + "epoch": 0.48153135821469795, + "flos": 636700293888.0, + "grad_norm": 0.03404405348604493, + "language_loss": 0.85394537, + "learning_rate": 0.0005541022284654568, + "loss": 0.8644805, + "num_input_tokens_seen": 208215824, + "router_z_loss_mlp": 0.41503906, + "step": 2503, + "time_per_iteration": 2.946800708770752 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055542, + "balance_loss_mlp": 1.01393807, + "epoch": 0.48172373989996153, + "flos": 504709654272.0, + "grad_norm": 0.029988445312160498, + "language_loss": 0.84392428, + "learning_rate": 0.0005537925048437446, + "loss": 0.85447979, + "num_input_tokens_seen": 208284304, + "router_z_loss_mlp": 0.41625977, + "step": 2504, + "time_per_iteration": 2.5928125381469727 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053543, + "balance_loss_mlp": 1.0131073, + "epoch": 0.48191612158522507, + "flos": 1535568945408.0, + "grad_norm": 0.009640282548559968, + "language_loss": 0.75751472, + "learning_rate": 0.00055348276033793, + "loss": 0.76805007, + "num_input_tokens_seen": 208510224, + "router_z_loss_mlp": 0.40429688, + "step": 2505, + "time_per_iteration": 4.956170320510864 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105109, + "balance_loss_mlp": 1.00936711, + "epoch": 0.48210850327048865, + "flos": 703813006080.0, + "grad_norm": 0.02927379087328487, + "language_loss": 0.88880217, + "learning_rate": 0.0005531729950682664, + "loss": 0.89931303, + "num_input_tokens_seen": 208596816, + "router_z_loss_mlp": 0.41748047, + "step": 2506, + "time_per_iteration": 2.9935836791992188 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052672, + "balance_loss_mlp": 1.01106763, + "epoch": 0.4823008849557522, + "flos": 440701732608.0, + "grad_norm": 0.04047033106809228, + "language_loss": 0.85417378, + "learning_rate": 0.000552863209155015, + "loss": 0.86470056, + "num_input_tokens_seen": 208659616, + "router_z_loss_mlp": 0.41625977, + "step": 2507, + "time_per_iteration": 2.4729647636413574 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053204, + "balance_loss_mlp": 1.01157653, + "epoch": 0.48249326664101577, + "flos": 472813107456.0, + "grad_norm": 0.04603508602748786, + "language_loss": 0.82726657, + "learning_rate": 0.0005525534027184461, + "loss": 0.8377986, + "num_input_tokens_seen": 208728080, + "router_z_loss_mlp": 0.41650391, + "step": 2508, + "time_per_iteration": 2.5513370037078857 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055774, + "balance_loss_mlp": 1.01421785, + "epoch": 0.48268564832627936, + "flos": 564315068928.0, + "grad_norm": 0.02879273586569962, + "language_loss": 0.83137357, + "learning_rate": 0.0005522435758788365, + "loss": 0.84193128, + "num_input_tokens_seen": 208803376, + "router_z_loss_mlp": 0.41577148, + "step": 2509, + "time_per_iteration": 2.753450393676758 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055715, + "balance_loss_mlp": 1.01415896, + "epoch": 0.4828780300115429, + "flos": 630843568128.0, + "grad_norm": 0.03460020680283242, + "language_loss": 0.80409563, + "learning_rate": 0.0005519337287564721, + "loss": 0.8146528, + "num_input_tokens_seen": 208876656, + "router_z_loss_mlp": 0.41577148, + "step": 2510, + "time_per_iteration": 2.790820360183716 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051713, + "balance_loss_mlp": 1.01020396, + "epoch": 0.4830704116968065, + "flos": 633005454336.0, + "grad_norm": 0.032398618840687954, + "language_loss": 0.83713245, + "learning_rate": 0.000551623861471646, + "loss": 0.84764957, + "num_input_tokens_seen": 208950224, + "router_z_loss_mlp": 0.4152832, + "step": 2511, + "time_per_iteration": 2.750471353530884 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01056118, + "balance_loss_mlp": 1.01596832, + "epoch": 0.48326279338207, + "flos": 1572619408128.0, + "grad_norm": 0.008656675131842123, + "language_loss": 0.78818834, + "learning_rate": 0.0005513139741446594, + "loss": 0.79874945, + "num_input_tokens_seen": 209173984, + "router_z_loss_mlp": 0.40136719, + "step": 2512, + "time_per_iteration": 4.832056999206543 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048019, + "balance_loss_mlp": 1.00636733, + "epoch": 0.4834551750673336, + "flos": 510238791168.0, + "grad_norm": 0.030652937711335218, + "language_loss": 0.87039137, + "learning_rate": 0.0005510040668958211, + "loss": 0.88087165, + "num_input_tokens_seen": 209242832, + "router_z_loss_mlp": 0.41674805, + "step": 2513, + "time_per_iteration": 2.593559741973877 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053741, + "balance_loss_mlp": 1.0134964, + "epoch": 0.48364755675259713, + "flos": 1531828419072.0, + "grad_norm": 0.007806244380112886, + "language_loss": 0.77760583, + "learning_rate": 0.0005506941398454483, + "loss": 0.78814328, + "num_input_tokens_seen": 209473520, + "router_z_loss_mlp": 0.40234375, + "step": 2514, + "time_per_iteration": 4.834583282470703 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049855, + "balance_loss_mlp": 1.00810826, + "epoch": 0.4838399384378607, + "flos": 566047299072.0, + "grad_norm": 0.0392841259920432, + "language_loss": 0.83837014, + "learning_rate": 0.0005503841931138645, + "loss": 0.84886873, + "num_input_tokens_seen": 209544208, + "router_z_loss_mlp": 0.41772461, + "step": 2515, + "time_per_iteration": 2.704660177230835 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049181, + "balance_loss_mlp": 1.00741005, + "epoch": 0.4840323201231243, + "flos": 388542377472.0, + "grad_norm": 0.03590543250931975, + "language_loss": 0.82853907, + "learning_rate": 0.0005500742268214025, + "loss": 0.83903086, + "num_input_tokens_seen": 209607408, + "router_z_loss_mlp": 0.41796875, + "step": 2516, + "time_per_iteration": 2.4684557914733887 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048399, + "balance_loss_mlp": 1.00662851, + "epoch": 0.48422470180838784, + "flos": 632176277760.0, + "grad_norm": 0.031370714323768, + "language_loss": 0.8605336, + "learning_rate": 0.0005497642410884014, + "loss": 0.87101769, + "num_input_tokens_seen": 209683392, + "router_z_loss_mlp": 0.41796875, + "step": 2517, + "time_per_iteration": 2.7523274421691895 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049853, + "balance_loss_mlp": 1.00808239, + "epoch": 0.4844170834936514, + "flos": 500313950208.0, + "grad_norm": 0.02829147010426611, + "language_loss": 0.85602349, + "learning_rate": 0.0005494542360352085, + "loss": 0.86652207, + "num_input_tokens_seen": 209753184, + "router_z_loss_mlp": 0.41796875, + "step": 2518, + "time_per_iteration": 2.635472059249878 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050712, + "balance_loss_mlp": 1.00882208, + "epoch": 0.48460946517891496, + "flos": 552195293952.0, + "grad_norm": 0.029973626664194793, + "language_loss": 0.86134493, + "learning_rate": 0.0005491442117821783, + "loss": 0.87185204, + "num_input_tokens_seen": 209829568, + "router_z_loss_mlp": 0.41918945, + "step": 2519, + "time_per_iteration": 2.686150550842285 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050979, + "balance_loss_mlp": 1.00916088, + "epoch": 0.48480184686417854, + "flos": 530462715648.0, + "grad_norm": 0.03547836116600895, + "language_loss": 0.87863553, + "learning_rate": 0.0005488341684496732, + "loss": 0.88914526, + "num_input_tokens_seen": 209902176, + "router_z_loss_mlp": 0.41845703, + "step": 2520, + "time_per_iteration": 2.6380345821380615 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053374, + "balance_loss_mlp": 1.01155508, + "epoch": 0.4849942285494421, + "flos": 533048421888.0, + "grad_norm": 0.030317982530802673, + "language_loss": 0.92374247, + "learning_rate": 0.0005485241061580624, + "loss": 0.93427622, + "num_input_tokens_seen": 209969168, + "router_z_loss_mlp": 0.41845703, + "step": 2521, + "time_per_iteration": 2.7106375694274902 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048164, + "balance_loss_mlp": 1.00639331, + "epoch": 0.48518661023470566, + "flos": 723973747200.0, + "grad_norm": 0.029300799536016952, + "language_loss": 0.85061228, + "learning_rate": 0.0005482140250277228, + "loss": 0.86109388, + "num_input_tokens_seen": 210049616, + "router_z_loss_mlp": 0.41796875, + "step": 2522, + "time_per_iteration": 2.998014450073242 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050292, + "balance_loss_mlp": 1.00859261, + "epoch": 0.4853789919199692, + "flos": 507156354816.0, + "grad_norm": 0.033835684591452045, + "language_loss": 0.87858051, + "learning_rate": 0.0005479039251790387, + "loss": 0.88908345, + "num_input_tokens_seen": 210118512, + "router_z_loss_mlp": 0.41723633, + "step": 2523, + "time_per_iteration": 2.6554031372070312 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046448, + "balance_loss_mlp": 1.00470078, + "epoch": 0.4855713736052328, + "flos": 661700001024.0, + "grad_norm": 0.033801552668461764, + "language_loss": 0.85375023, + "learning_rate": 0.0005475938067324014, + "loss": 0.86421466, + "num_input_tokens_seen": 210193728, + "router_z_loss_mlp": 0.41772461, + "step": 2524, + "time_per_iteration": 2.8294761180877686 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105012, + "balance_loss_mlp": 1.00839663, + "epoch": 0.48576375529049637, + "flos": 437890504704.0, + "grad_norm": 0.03215141471545655, + "language_loss": 0.84198898, + "learning_rate": 0.0005472836698082098, + "loss": 0.85249019, + "num_input_tokens_seen": 210258832, + "router_z_loss_mlp": 0.41748047, + "step": 2525, + "time_per_iteration": 2.553400754928589 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050288, + "balance_loss_mlp": 1.00858843, + "epoch": 0.4859561369757599, + "flos": 582845647104.0, + "grad_norm": 0.029048493067812663, + "language_loss": 0.84421259, + "learning_rate": 0.0005469735145268694, + "loss": 0.85471547, + "num_input_tokens_seen": 210335280, + "router_z_loss_mlp": 0.41723633, + "step": 2526, + "time_per_iteration": 2.741071939468384 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052939, + "balance_loss_mlp": 1.01121581, + "epoch": 0.4861485186610235, + "flos": 488933923584.0, + "grad_norm": 0.035658567470948505, + "language_loss": 0.81546867, + "learning_rate": 0.0005466633410087933, + "loss": 0.82599807, + "num_input_tokens_seen": 210407072, + "router_z_loss_mlp": 0.41748047, + "step": 2527, + "time_per_iteration": 2.7008073329925537 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01057697, + "balance_loss_mlp": 1.01735687, + "epoch": 0.486340900346287, + "flos": 1561113981696.0, + "grad_norm": 0.006481424575109751, + "language_loss": 0.77260822, + "learning_rate": 0.0005463531493744017, + "loss": 0.78318518, + "num_input_tokens_seen": 210644544, + "router_z_loss_mlp": 0.40332031, + "step": 2528, + "time_per_iteration": 4.889545679092407 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048632, + "balance_loss_mlp": 1.00719464, + "epoch": 0.4865332820315506, + "flos": 483990945024.0, + "grad_norm": 0.029120047594960542, + "language_loss": 0.88662624, + "learning_rate": 0.0005460429397441214, + "loss": 0.89711249, + "num_input_tokens_seen": 210711760, + "router_z_loss_mlp": 0.41455078, + "step": 2529, + "time_per_iteration": 4.04598331451416 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048668, + "balance_loss_mlp": 1.00706387, + "epoch": 0.48672566371681414, + "flos": 536857967616.0, + "grad_norm": 0.030816613356667605, + "language_loss": 0.87420261, + "learning_rate": 0.0005457327122383866, + "loss": 0.88468921, + "num_input_tokens_seen": 210783040, + "router_z_loss_mlp": 0.41625977, + "step": 2530, + "time_per_iteration": 2.613560676574707 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055492, + "balance_loss_mlp": 1.01515198, + "epoch": 0.4869180454020777, + "flos": 1415833195776.0, + "grad_norm": 0.0094125035005948, + "language_loss": 0.74636483, + "learning_rate": 0.0005454224669776385, + "loss": 0.75691986, + "num_input_tokens_seen": 211002128, + "router_z_loss_mlp": 0.40332031, + "step": 2531, + "time_per_iteration": 4.826287269592285 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104985, + "balance_loss_mlp": 1.00831711, + "epoch": 0.48711042708734126, + "flos": 574227270912.0, + "grad_norm": 0.03266780624208146, + "language_loss": 0.76332569, + "learning_rate": 0.0005451122040823244, + "loss": 0.77382421, + "num_input_tokens_seen": 211080080, + "router_z_loss_mlp": 0.41552734, + "step": 2532, + "time_per_iteration": 2.805912494659424 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046061, + "balance_loss_mlp": 1.00438511, + "epoch": 0.48730280877260485, + "flos": 627817512192.0, + "grad_norm": 0.03502227574741412, + "language_loss": 0.77874511, + "learning_rate": 0.0005448019236728997, + "loss": 0.78920573, + "num_input_tokens_seen": 211162944, + "router_z_loss_mlp": 0.41699219, + "step": 2533, + "time_per_iteration": 2.865936040878296 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048209, + "balance_loss_mlp": 1.00670052, + "epoch": 0.48749519045786843, + "flos": 513468981504.0, + "grad_norm": 0.035197852276093636, + "language_loss": 0.85303891, + "learning_rate": 0.0005444916258698255, + "loss": 0.86352104, + "num_input_tokens_seen": 211230448, + "router_z_loss_mlp": 0.4152832, + "step": 2534, + "time_per_iteration": 2.6375105381011963 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045312, + "balance_loss_mlp": 1.00399435, + "epoch": 0.48768757214313196, + "flos": 526479171072.0, + "grad_norm": 0.030578272272676787, + "language_loss": 0.86534977, + "learning_rate": 0.0005441813107935704, + "loss": 0.87580293, + "num_input_tokens_seen": 211301248, + "router_z_loss_mlp": 0.41333008, + "step": 2535, + "time_per_iteration": 2.6708908081054688 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044819, + "balance_loss_mlp": 1.0033108, + "epoch": 0.48787995382839555, + "flos": 506031670272.0, + "grad_norm": 0.03128667529665633, + "language_loss": 0.86385322, + "learning_rate": 0.0005438709785646091, + "loss": 0.87430143, + "num_input_tokens_seen": 211369888, + "router_z_loss_mlp": 0.4152832, + "step": 2536, + "time_per_iteration": 2.587376117706299 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047364, + "balance_loss_mlp": 1.00599802, + "epoch": 0.4880723355136591, + "flos": 576248206080.0, + "grad_norm": 0.031424284702784445, + "language_loss": 0.87241846, + "learning_rate": 0.0005435606293034234, + "loss": 0.88289213, + "num_input_tokens_seen": 211441808, + "router_z_loss_mlp": 0.41381836, + "step": 2537, + "time_per_iteration": 2.6678061485290527 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045759, + "balance_loss_mlp": 1.00425005, + "epoch": 0.48826471719892267, + "flos": 562537152000.0, + "grad_norm": 0.03574143188627203, + "language_loss": 0.85282528, + "learning_rate": 0.0005432502631305016, + "loss": 0.8632828, + "num_input_tokens_seen": 211511216, + "router_z_loss_mlp": 0.4152832, + "step": 2538, + "time_per_iteration": 2.7138583660125732 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104917, + "balance_loss_mlp": 1.00763726, + "epoch": 0.4884570988841862, + "flos": 727549022976.0, + "grad_norm": 0.02708673321136359, + "language_loss": 0.84024864, + "learning_rate": 0.0005429398801663386, + "loss": 0.85074031, + "num_input_tokens_seen": 211589264, + "router_z_loss_mlp": 0.41552734, + "step": 2539, + "time_per_iteration": 2.964188814163208 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049407, + "balance_loss_mlp": 1.00797033, + "epoch": 0.4886494805694498, + "flos": 431924908800.0, + "grad_norm": 0.037537890597472735, + "language_loss": 0.83715379, + "learning_rate": 0.0005426294805314355, + "loss": 0.84764791, + "num_input_tokens_seen": 211652928, + "router_z_loss_mlp": 0.41455078, + "step": 2540, + "time_per_iteration": 2.5386080741882324 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044162, + "balance_loss_mlp": 1.00251019, + "epoch": 0.4888418622547134, + "flos": 674345663232.0, + "grad_norm": 0.02795943805212824, + "language_loss": 0.80757105, + "learning_rate": 0.0005423190643463003, + "loss": 0.81801265, + "num_input_tokens_seen": 211741664, + "router_z_loss_mlp": 0.41674805, + "step": 2541, + "time_per_iteration": 3.0026512145996094 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043307, + "balance_loss_mlp": 1.00182211, + "epoch": 0.4890342439399769, + "flos": 542936324352.0, + "grad_norm": 0.03490297591946719, + "language_loss": 0.83297753, + "learning_rate": 0.0005420086317314473, + "loss": 0.84341061, + "num_input_tokens_seen": 211809136, + "router_z_loss_mlp": 0.41503906, + "step": 2542, + "time_per_iteration": 2.713738441467285 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104604, + "balance_loss_mlp": 1.00457919, + "epoch": 0.4892266256252405, + "flos": 591863543808.0, + "grad_norm": 0.03220316860335889, + "language_loss": 0.81509852, + "learning_rate": 0.0005416981828073971, + "loss": 0.8255589, + "num_input_tokens_seen": 211883136, + "router_z_loss_mlp": 0.41479492, + "step": 2543, + "time_per_iteration": 2.833582639694214 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049698, + "balance_loss_mlp": 1.00983429, + "epoch": 0.48941900731050403, + "flos": 1519657121280.0, + "grad_norm": 0.011925691275285389, + "language_loss": 0.77115011, + "learning_rate": 0.0005413877176946765, + "loss": 0.78164709, + "num_input_tokens_seen": 212117488, + "router_z_loss_mlp": 0.3984375, + "step": 2544, + "time_per_iteration": 4.825795412063599 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044942, + "balance_loss_mlp": 1.00319445, + "epoch": 0.4896113889957676, + "flos": 471519281664.0, + "grad_norm": 0.035595787649594084, + "language_loss": 0.85265428, + "learning_rate": 0.000541077236513819, + "loss": 0.86310375, + "num_input_tokens_seen": 212181952, + "router_z_loss_mlp": 0.41772461, + "step": 2545, + "time_per_iteration": 2.5318596363067627 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046977, + "balance_loss_mlp": 1.00515878, + "epoch": 0.48980377068103115, + "flos": 497552299776.0, + "grad_norm": 0.029954814135253697, + "language_loss": 0.8290776, + "learning_rate": 0.0005407667393853638, + "loss": 0.8395474, + "num_input_tokens_seen": 212252608, + "router_z_loss_mlp": 0.41845703, + "step": 2546, + "time_per_iteration": 2.6808276176452637 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049449, + "balance_loss_mlp": 1.00765431, + "epoch": 0.48999615236629473, + "flos": 694108829184.0, + "grad_norm": 0.033072726692276254, + "language_loss": 0.83875388, + "learning_rate": 0.0005404562264298569, + "loss": 0.84924835, + "num_input_tokens_seen": 212328560, + "router_z_loss_mlp": 0.41821289, + "step": 2547, + "time_per_iteration": 2.8665168285369873 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105088, + "balance_loss_mlp": 1.00894189, + "epoch": 0.49018853405155827, + "flos": 542749686528.0, + "grad_norm": 0.0323259245637504, + "language_loss": 0.84166187, + "learning_rate": 0.0005401456977678498, + "loss": 0.85217071, + "num_input_tokens_seen": 212399616, + "router_z_loss_mlp": 0.41967773, + "step": 2548, + "time_per_iteration": 2.646385431289673 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054184, + "balance_loss_mlp": 1.01248467, + "epoch": 0.49038091573682185, + "flos": 697109607168.0, + "grad_norm": 0.03434023749691101, + "language_loss": 0.7811271, + "learning_rate": 0.0005398351535199008, + "loss": 0.79166895, + "num_input_tokens_seen": 212482352, + "router_z_loss_mlp": 0.41723633, + "step": 2549, + "time_per_iteration": 3.0581490993499756 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01056036, + "balance_loss_mlp": 1.01443195, + "epoch": 0.49057329742208544, + "flos": 598063409664.0, + "grad_norm": 0.032237778563639685, + "language_loss": 0.84733725, + "learning_rate": 0.0005395245938065735, + "loss": 0.85789764, + "num_input_tokens_seen": 212559504, + "router_z_loss_mlp": 0.41625977, + "step": 2550, + "time_per_iteration": 2.7877790927886963 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052801, + "balance_loss_mlp": 1.01105404, + "epoch": 0.490765679107349, + "flos": 514417721856.0, + "grad_norm": 0.03812364840268788, + "language_loss": 0.82968283, + "learning_rate": 0.0005392140187484379, + "loss": 0.84021086, + "num_input_tokens_seen": 212625664, + "router_z_loss_mlp": 0.41772461, + "step": 2551, + "time_per_iteration": 2.59513521194458 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052602, + "balance_loss_mlp": 1.01097441, + "epoch": 0.49095806079261256, + "flos": 630843568128.0, + "grad_norm": 0.028435741934699065, + "language_loss": 0.8977747, + "learning_rate": 0.0005389034284660701, + "loss": 0.90830076, + "num_input_tokens_seen": 212702000, + "router_z_loss_mlp": 0.41650391, + "step": 2552, + "time_per_iteration": 2.8811471462249756 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051565, + "balance_loss_mlp": 1.00979364, + "epoch": 0.4911504424778761, + "flos": 916793640960.0, + "grad_norm": 0.038088038632412044, + "language_loss": 0.82567823, + "learning_rate": 0.000538592823080052, + "loss": 0.83619392, + "num_input_tokens_seen": 212785376, + "router_z_loss_mlp": 0.41796875, + "step": 2553, + "time_per_iteration": 3.147981882095337 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104899, + "balance_loss_mlp": 1.00736189, + "epoch": 0.4913428241631397, + "flos": 439855059456.0, + "grad_norm": 0.03635352086596181, + "language_loss": 0.85271204, + "learning_rate": 0.000538282202710971, + "loss": 0.86320198, + "num_input_tokens_seen": 212848176, + "router_z_loss_mlp": 0.41650391, + "step": 2554, + "time_per_iteration": 2.5295345783233643 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050306, + "balance_loss_mlp": 1.00865471, + "epoch": 0.4915352058484032, + "flos": 637240765440.0, + "grad_norm": 0.03576310950851386, + "language_loss": 0.82746387, + "learning_rate": 0.000537971567479421, + "loss": 0.83796692, + "num_input_tokens_seen": 212917888, + "router_z_loss_mlp": 0.41674805, + "step": 2555, + "time_per_iteration": 2.7715530395507812 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047308, + "balance_loss_mlp": 1.00567997, + "epoch": 0.4917275875336668, + "flos": 505510640640.0, + "grad_norm": 0.03586911519664752, + "language_loss": 0.88338435, + "learning_rate": 0.0005376609175060011, + "loss": 0.89385736, + "num_input_tokens_seen": 212986288, + "router_z_loss_mlp": 0.41650391, + "step": 2556, + "time_per_iteration": 2.6225156784057617 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044015, + "balance_loss_mlp": 1.00252998, + "epoch": 0.49191996921893033, + "flos": 655734405120.0, + "grad_norm": 0.03188042342455107, + "language_loss": 0.80798948, + "learning_rate": 0.0005373502529113162, + "loss": 0.81842965, + "num_input_tokens_seen": 213059504, + "router_z_loss_mlp": 0.41503906, + "step": 2557, + "time_per_iteration": 2.809008836746216 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046392, + "balance_loss_mlp": 1.00485921, + "epoch": 0.4921123509041939, + "flos": 493399613952.0, + "grad_norm": 0.03491285747037794, + "language_loss": 0.8216666, + "learning_rate": 0.0005370395738159773, + "loss": 0.83213049, + "num_input_tokens_seen": 213129984, + "router_z_loss_mlp": 0.41552734, + "step": 2558, + "time_per_iteration": 2.6442172527313232 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047723, + "balance_loss_mlp": 1.00619018, + "epoch": 0.4923047325894575, + "flos": 547208573952.0, + "grad_norm": 0.0376599347248576, + "language_loss": 0.83764005, + "learning_rate": 0.0005367288803406003, + "loss": 0.84811723, + "num_input_tokens_seen": 213199184, + "router_z_loss_mlp": 0.41552734, + "step": 2559, + "time_per_iteration": 2.6496431827545166 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044601, + "balance_loss_mlp": 1.00299704, + "epoch": 0.49249711427472104, + "flos": 597590012160.0, + "grad_norm": 0.034513710641845094, + "language_loss": 0.81748044, + "learning_rate": 0.0005364181726058073, + "loss": 0.8279264, + "num_input_tokens_seen": 213272480, + "router_z_loss_mlp": 0.41625977, + "step": 2560, + "time_per_iteration": 2.677976608276367 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046275, + "balance_loss_mlp": 1.0049566, + "epoch": 0.4926894959599846, + "flos": 498809187072.0, + "grad_norm": 0.0360523922041074, + "language_loss": 0.83156157, + "learning_rate": 0.0005361074507322261, + "loss": 0.84202433, + "num_input_tokens_seen": 213338704, + "router_z_loss_mlp": 0.41333008, + "step": 2561, + "time_per_iteration": 2.5902929306030273 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047121, + "balance_loss_mlp": 1.00575542, + "epoch": 0.49288187764524816, + "flos": 537183611136.0, + "grad_norm": 0.03594243708601782, + "language_loss": 0.81942439, + "learning_rate": 0.000535796714840489, + "loss": 0.82989568, + "num_input_tokens_seen": 213406016, + "router_z_loss_mlp": 0.41381836, + "step": 2562, + "time_per_iteration": 2.6181418895721436 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047955, + "balance_loss_mlp": 1.00658977, + "epoch": 0.49307425933051174, + "flos": 642713521920.0, + "grad_norm": 0.03700989683335547, + "language_loss": 0.84345794, + "learning_rate": 0.0005354859650512348, + "loss": 0.85393751, + "num_input_tokens_seen": 213474016, + "router_z_loss_mlp": 0.41381836, + "step": 2563, + "time_per_iteration": 2.7921204566955566 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048021, + "balance_loss_mlp": 1.00670326, + "epoch": 0.4932666410157753, + "flos": 517265888256.0, + "grad_norm": 0.0348037560143354, + "language_loss": 0.8771596, + "learning_rate": 0.0005351752014851074, + "loss": 0.88763982, + "num_input_tokens_seen": 213539696, + "router_z_loss_mlp": 0.41333008, + "step": 2564, + "time_per_iteration": 2.602555990219116 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048335, + "balance_loss_mlp": 1.00694537, + "epoch": 0.49345902270103886, + "flos": 602652554496.0, + "grad_norm": 0.04115766537624956, + "language_loss": 0.83900678, + "learning_rate": 0.0005348644242627553, + "loss": 0.84949011, + "num_input_tokens_seen": 213609504, + "router_z_loss_mlp": 0.4140625, + "step": 2565, + "time_per_iteration": 2.7332029342651367 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010522, + "balance_loss_mlp": 1.01195526, + "epoch": 0.49365140438630245, + "flos": 1496984550912.0, + "grad_norm": 0.005471138804527184, + "language_loss": 0.75286627, + "learning_rate": 0.0005345536335048336, + "loss": 0.76338828, + "num_input_tokens_seen": 213846064, + "router_z_loss_mlp": 0.40234375, + "step": 2566, + "time_per_iteration": 4.974903583526611 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051209, + "balance_loss_mlp": 1.00991523, + "epoch": 0.493843786071566, + "flos": 630789133056.0, + "grad_norm": 0.031108020693620165, + "language_loss": 0.8259182, + "learning_rate": 0.0005342428293320013, + "loss": 0.83643031, + "num_input_tokens_seen": 213923216, + "router_z_loss_mlp": 0.41308594, + "step": 2567, + "time_per_iteration": 2.774355173110962 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054719, + "balance_loss_mlp": 1.01332963, + "epoch": 0.49403616775682957, + "flos": 618690745344.0, + "grad_norm": 0.04042101882964004, + "language_loss": 0.84698522, + "learning_rate": 0.0005339320118649238, + "loss": 0.85753244, + "num_input_tokens_seen": 213994096, + "router_z_loss_mlp": 0.4140625, + "step": 2568, + "time_per_iteration": 2.7593345642089844 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050518, + "balance_loss_mlp": 1.0091759, + "epoch": 0.4942285494420931, + "flos": 578814470400.0, + "grad_norm": 0.03306097920847627, + "language_loss": 0.87056893, + "learning_rate": 0.000533621181224271, + "loss": 0.88107407, + "num_input_tokens_seen": 214069104, + "router_z_loss_mlp": 0.41357422, + "step": 2569, + "time_per_iteration": 2.815171957015991 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045042, + "balance_loss_mlp": 1.00358069, + "epoch": 0.4944209311273567, + "flos": 631466664960.0, + "grad_norm": 0.04400973771206172, + "language_loss": 0.82116252, + "learning_rate": 0.0005333103375307182, + "loss": 0.83161294, + "num_input_tokens_seen": 214150368, + "router_z_loss_mlp": 0.41479492, + "step": 2570, + "time_per_iteration": 2.86649227142334 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048831, + "balance_loss_mlp": 1.00751352, + "epoch": 0.4946133128126202, + "flos": 588719869440.0, + "grad_norm": 0.030724614795269025, + "language_loss": 0.86645854, + "learning_rate": 0.0005329994809049451, + "loss": 0.87694681, + "num_input_tokens_seen": 214220112, + "router_z_loss_mlp": 0.41333008, + "step": 2571, + "time_per_iteration": 2.717759847640991 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044319, + "balance_loss_mlp": 1.00297725, + "epoch": 0.4948056944978838, + "flos": 584847140352.0, + "grad_norm": 0.02937251460087377, + "language_loss": 0.88108343, + "learning_rate": 0.0005326886114676375, + "loss": 0.89152658, + "num_input_tokens_seen": 214294480, + "router_z_loss_mlp": 0.41357422, + "step": 2572, + "time_per_iteration": 2.767547369003296 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043439, + "balance_loss_mlp": 1.00207376, + "epoch": 0.49499807618314734, + "flos": 482781689856.0, + "grad_norm": 0.032763972727654474, + "language_loss": 0.88217831, + "learning_rate": 0.0005323777293394854, + "loss": 0.8926127, + "num_input_tokens_seen": 214359568, + "router_z_loss_mlp": 0.41381836, + "step": 2573, + "time_per_iteration": 2.557117223739624 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044617, + "balance_loss_mlp": 1.00318027, + "epoch": 0.4951904578684109, + "flos": 520038232320.0, + "grad_norm": 0.044201740478413694, + "language_loss": 0.82535017, + "learning_rate": 0.000532066834641184, + "loss": 0.83579636, + "num_input_tokens_seen": 214432032, + "router_z_loss_mlp": 0.41455078, + "step": 2574, + "time_per_iteration": 2.6565427780151367 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043705, + "balance_loss_mlp": 1.00202954, + "epoch": 0.4953828395536745, + "flos": 536578010880.0, + "grad_norm": 0.03171877270725238, + "language_loss": 0.85277009, + "learning_rate": 0.0005317559274934334, + "loss": 0.8632071, + "num_input_tokens_seen": 214504096, + "router_z_loss_mlp": 0.41699219, + "step": 2575, + "time_per_iteration": 2.720740795135498 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048187, + "balance_loss_mlp": 1.00653565, + "epoch": 0.49557522123893805, + "flos": 529607294208.0, + "grad_norm": 0.03640176927698583, + "language_loss": 0.81348443, + "learning_rate": 0.0005314450080169382, + "loss": 0.82396632, + "num_input_tokens_seen": 214575920, + "router_z_loss_mlp": 0.41674805, + "step": 2576, + "time_per_iteration": 2.6694118976593018 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048974, + "balance_loss_mlp": 1.00729847, + "epoch": 0.49576760292420163, + "flos": 428918294784.0, + "grad_norm": 0.03343170538339807, + "language_loss": 0.81225574, + "learning_rate": 0.0005311340763324083, + "loss": 0.82274544, + "num_input_tokens_seen": 214641664, + "router_z_loss_mlp": 0.41699219, + "step": 2577, + "time_per_iteration": 2.5676074028015137 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050384, + "balance_loss_mlp": 1.00866091, + "epoch": 0.49595998460946517, + "flos": 566316562176.0, + "grad_norm": 0.031028578783915843, + "language_loss": 0.83262658, + "learning_rate": 0.0005308231325605578, + "loss": 0.84313035, + "num_input_tokens_seen": 214711744, + "router_z_loss_mlp": 0.41748047, + "step": 2578, + "time_per_iteration": 2.6750431060791016 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050534, + "balance_loss_mlp": 1.00893033, + "epoch": 0.49615236629472875, + "flos": 703814951424.0, + "grad_norm": 0.16493684193156796, + "language_loss": 0.7742933, + "learning_rate": 0.0005305121768221061, + "loss": 0.78479862, + "num_input_tokens_seen": 214802256, + "router_z_loss_mlp": 0.41625977, + "step": 2579, + "time_per_iteration": 3.083477020263672 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047222, + "balance_loss_mlp": 1.00688171, + "epoch": 0.4963447479799923, + "flos": 1444755209472.0, + "grad_norm": 0.004557610476670616, + "language_loss": 0.75038326, + "learning_rate": 0.000530201209237777, + "loss": 0.76085544, + "num_input_tokens_seen": 215023648, + "router_z_loss_mlp": 0.40332031, + "step": 2580, + "time_per_iteration": 4.820146083831787 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047965, + "balance_loss_mlp": 1.00602686, + "epoch": 0.49653712966525587, + "flos": 538664074752.0, + "grad_norm": 0.031551785699882776, + "language_loss": 0.92325974, + "learning_rate": 0.0005298902299282984, + "loss": 0.93373942, + "num_input_tokens_seen": 215094080, + "router_z_loss_mlp": 0.41967773, + "step": 2581, + "time_per_iteration": 2.619842529296875 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050513, + "balance_loss_mlp": 1.00840831, + "epoch": 0.4967295113505194, + "flos": 608396519424.0, + "grad_norm": 0.03377113658216861, + "language_loss": 0.8488903, + "learning_rate": 0.0005295792390144033, + "loss": 0.8593955, + "num_input_tokens_seen": 215165456, + "router_z_loss_mlp": 0.42138672, + "step": 2582, + "time_per_iteration": 2.722321033477783 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050969, + "balance_loss_mlp": 1.00872111, + "epoch": 0.496921893035783, + "flos": 475531016448.0, + "grad_norm": 0.04081472802053015, + "language_loss": 0.84166956, + "learning_rate": 0.0005292682366168294, + "loss": 0.85217929, + "num_input_tokens_seen": 215229344, + "router_z_loss_mlp": 0.42285156, + "step": 2583, + "time_per_iteration": 2.5314435958862305 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104609, + "balance_loss_mlp": 1.00393724, + "epoch": 0.4971142747210466, + "flos": 598603881216.0, + "grad_norm": 0.03300753756436905, + "language_loss": 0.80573511, + "learning_rate": 0.0005289572228563181, + "loss": 0.81619596, + "num_input_tokens_seen": 215305616, + "router_z_loss_mlp": 0.421875, + "step": 2584, + "time_per_iteration": 2.7332074642181396 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050666, + "balance_loss_mlp": 1.00846612, + "epoch": 0.4973066564063101, + "flos": 600735631872.0, + "grad_norm": 0.03199938195942058, + "language_loss": 0.83498567, + "learning_rate": 0.000528646197853616, + "loss": 0.8454923, + "num_input_tokens_seen": 215378128, + "router_z_loss_mlp": 0.42236328, + "step": 2585, + "time_per_iteration": 2.748955249786377 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051581, + "balance_loss_mlp": 1.00938058, + "epoch": 0.4974990380915737, + "flos": 650770039296.0, + "grad_norm": 0.03327645798274956, + "language_loss": 0.86559486, + "learning_rate": 0.0005283351617294735, + "loss": 0.87611067, + "num_input_tokens_seen": 215453536, + "router_z_loss_mlp": 0.42236328, + "step": 2586, + "time_per_iteration": 2.9175055027008057 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051655, + "balance_loss_mlp": 1.01093292, + "epoch": 0.49769141977683723, + "flos": 1532442767616.0, + "grad_norm": 0.005920405298637117, + "language_loss": 0.7663666, + "learning_rate": 0.0005280241146046456, + "loss": 0.77688324, + "num_input_tokens_seen": 215689440, + "router_z_loss_mlp": 0.40722656, + "step": 2587, + "time_per_iteration": 4.992246627807617 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051641, + "balance_loss_mlp": 1.00936949, + "epoch": 0.4978838014621008, + "flos": 537398439168.0, + "grad_norm": 0.03485872476270145, + "language_loss": 0.87171799, + "learning_rate": 0.0005277130565998916, + "loss": 0.88223433, + "num_input_tokens_seen": 215759600, + "router_z_loss_mlp": 0.4230957, + "step": 2588, + "time_per_iteration": 2.7742838859558105 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048839, + "balance_loss_mlp": 1.00666261, + "epoch": 0.49807618314736435, + "flos": 540746247936.0, + "grad_norm": 0.02719767735149213, + "language_loss": 0.82424593, + "learning_rate": 0.0005274019878359748, + "loss": 0.83473426, + "num_input_tokens_seen": 215833920, + "router_z_loss_mlp": 0.42211914, + "step": 2589, + "time_per_iteration": 2.7111029624938965 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049239, + "balance_loss_mlp": 1.00699103, + "epoch": 0.49826856483262794, + "flos": 543522482688.0, + "grad_norm": 0.03488772819740132, + "language_loss": 0.87582624, + "learning_rate": 0.0005270909084336628, + "loss": 0.88631868, + "num_input_tokens_seen": 215903616, + "router_z_loss_mlp": 0.42285156, + "step": 2590, + "time_per_iteration": 2.6801702976226807 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051383, + "balance_loss_mlp": 1.00911105, + "epoch": 0.4984609465178915, + "flos": 523361741568.0, + "grad_norm": 0.03538182267925601, + "language_loss": 0.89689445, + "learning_rate": 0.0005267798185137276, + "loss": 0.90740824, + "num_input_tokens_seen": 215974832, + "router_z_loss_mlp": 0.4230957, + "step": 2591, + "time_per_iteration": 2.673933506011963 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048091, + "balance_loss_mlp": 1.00577164, + "epoch": 0.49865332820315506, + "flos": 575705789184.0, + "grad_norm": 0.03191547825845594, + "language_loss": 0.90023857, + "learning_rate": 0.0005264687181969444, + "loss": 0.91071951, + "num_input_tokens_seen": 216045024, + "router_z_loss_mlp": 0.42358398, + "step": 2592, + "time_per_iteration": 2.729825735092163 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047144, + "balance_loss_mlp": 1.00484908, + "epoch": 0.49884570988841864, + "flos": 1015211884032.0, + "grad_norm": 0.03571151562514848, + "language_loss": 0.75975507, + "learning_rate": 0.0005261576076040937, + "loss": 0.77022654, + "num_input_tokens_seen": 216129024, + "router_z_loss_mlp": 0.42333984, + "step": 2593, + "time_per_iteration": 3.284675359725952 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047205, + "balance_loss_mlp": 1.00502849, + "epoch": 0.4990380915736822, + "flos": 560648419584.0, + "grad_norm": 0.032935336602121515, + "language_loss": 0.84734505, + "learning_rate": 0.0005258464868559591, + "loss": 0.85781705, + "num_input_tokens_seen": 216197648, + "router_z_loss_mlp": 0.42211914, + "step": 2594, + "time_per_iteration": 2.638974905014038 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049735, + "balance_loss_mlp": 1.00772595, + "epoch": 0.49923047325894576, + "flos": 499944565248.0, + "grad_norm": 0.031535831762229155, + "language_loss": 0.89198703, + "learning_rate": 0.0005255353560733284, + "loss": 0.90248442, + "num_input_tokens_seen": 216263904, + "router_z_loss_mlp": 0.42041016, + "step": 2595, + "time_per_iteration": 2.5665078163146973 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044674, + "balance_loss_mlp": 1.00414276, + "epoch": 0.4994228549442093, + "flos": 1499790921216.0, + "grad_norm": 0.005502914482473529, + "language_loss": 0.75578642, + "learning_rate": 0.0005252242153769931, + "loss": 0.76623321, + "num_input_tokens_seen": 216493152, + "router_z_loss_mlp": 0.40527344, + "step": 2596, + "time_per_iteration": 4.774062395095825 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050249, + "balance_loss_mlp": 1.0082401, + "epoch": 0.4996152366294729, + "flos": 558514723584.0, + "grad_norm": 0.032060383149289634, + "language_loss": 0.83672047, + "learning_rate": 0.0005249130648877492, + "loss": 0.84722298, + "num_input_tokens_seen": 216567216, + "router_z_loss_mlp": 0.42041016, + "step": 2597, + "time_per_iteration": 2.7558000087738037 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051517, + "balance_loss_mlp": 1.00950754, + "epoch": 0.4998076183147364, + "flos": 416483569920.0, + "grad_norm": 0.036130927396763525, + "language_loss": 0.85007888, + "learning_rate": 0.0005246019047263953, + "loss": 0.86059409, + "num_input_tokens_seen": 216630624, + "router_z_loss_mlp": 0.42041016, + "step": 2598, + "time_per_iteration": 2.4761478900909424 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045154, + "balance_loss_mlp": 1.00300181, + "epoch": 0.5, + "flos": 468326029824.0, + "grad_norm": 0.035928472301153966, + "language_loss": 0.83319026, + "learning_rate": 0.0005242907350137353, + "loss": 0.84364176, + "num_input_tokens_seen": 216696576, + "router_z_loss_mlp": 0.421875, + "step": 2599, + "time_per_iteration": 2.551312208175659 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046402, + "balance_loss_mlp": 1.00439322, + "epoch": 0.5001923816852636, + "flos": 483756675072.0, + "grad_norm": 0.03511658446114867, + "language_loss": 0.79463625, + "learning_rate": 0.0005239795558705754, + "loss": 0.80510032, + "num_input_tokens_seen": 216767584, + "router_z_loss_mlp": 0.42041016, + "step": 2600, + "time_per_iteration": 2.6441214084625244 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044749, + "balance_loss_mlp": 1.00278771, + "epoch": 0.5003847633705272, + "flos": 534856474368.0, + "grad_norm": 0.03015144944524051, + "language_loss": 0.89835393, + "learning_rate": 0.0005236683674177264, + "loss": 0.90880144, + "num_input_tokens_seen": 216834320, + "router_z_loss_mlp": 0.41992188, + "step": 2601, + "time_per_iteration": 2.669487953186035 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049517, + "balance_loss_mlp": 1.00746012, + "epoch": 0.5005771450557907, + "flos": 739056394752.0, + "grad_norm": 0.03236196452732128, + "language_loss": 0.82869333, + "learning_rate": 0.0005233571697760021, + "loss": 0.83918852, + "num_input_tokens_seen": 216907312, + "router_z_loss_mlp": 0.42089844, + "step": 2602, + "time_per_iteration": 2.85748028755188 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044698, + "balance_loss_mlp": 1.00264096, + "epoch": 0.5007695267410542, + "flos": 780307175424.0, + "grad_norm": 0.03720253600362933, + "language_loss": 0.83658135, + "learning_rate": 0.0005230459630662203, + "loss": 0.84702832, + "num_input_tokens_seen": 216979872, + "router_z_loss_mlp": 0.42089844, + "step": 2603, + "time_per_iteration": 2.9300596714019775 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045575, + "balance_loss_mlp": 1.00358939, + "epoch": 0.5009619084263178, + "flos": 624619402752.0, + "grad_norm": 0.038089595528021734, + "language_loss": 0.82175541, + "learning_rate": 0.0005227347474092022, + "loss": 0.83221114, + "num_input_tokens_seen": 217054000, + "router_z_loss_mlp": 0.42016602, + "step": 2604, + "time_per_iteration": 2.7056775093078613 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048274, + "balance_loss_mlp": 1.00621724, + "epoch": 0.5011542901115814, + "flos": 532193000448.0, + "grad_norm": 0.026542730624890497, + "language_loss": 0.84019673, + "learning_rate": 0.0005224235229257724, + "loss": 0.85067946, + "num_input_tokens_seen": 217126784, + "router_z_loss_mlp": 0.42089844, + "step": 2605, + "time_per_iteration": 2.6953065395355225 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048861, + "balance_loss_mlp": 1.00680435, + "epoch": 0.5013466717968449, + "flos": 528628418304.0, + "grad_norm": 0.028335807962849974, + "language_loss": 0.87261045, + "learning_rate": 0.0005221122897367589, + "loss": 0.88309902, + "num_input_tokens_seen": 217203056, + "router_z_loss_mlp": 0.42089844, + "step": 2606, + "time_per_iteration": 2.7901618480682373 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051336, + "balance_loss_mlp": 1.00939834, + "epoch": 0.5015390534821085, + "flos": 567089358336.0, + "grad_norm": 0.03672669743645021, + "language_loss": 0.81618142, + "learning_rate": 0.0005218010479629932, + "loss": 0.82669473, + "num_input_tokens_seen": 217273280, + "router_z_loss_mlp": 0.41967773, + "step": 2607, + "time_per_iteration": 2.6298229694366455 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047474, + "balance_loss_mlp": 1.00551248, + "epoch": 0.5017314351673721, + "flos": 567768835584.0, + "grad_norm": 0.038374388481505664, + "language_loss": 0.82467473, + "learning_rate": 0.0005214897977253102, + "loss": 0.83514941, + "num_input_tokens_seen": 217345568, + "router_z_loss_mlp": 0.41992188, + "step": 2608, + "time_per_iteration": 2.6571240425109863 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044229, + "balance_loss_mlp": 1.00231516, + "epoch": 0.5019238168526357, + "flos": 523387986432.0, + "grad_norm": 0.030375370520194293, + "language_loss": 0.84678638, + "learning_rate": 0.0005211785391445473, + "loss": 0.85722864, + "num_input_tokens_seen": 217422848, + "router_z_loss_mlp": 0.41943359, + "step": 2609, + "time_per_iteration": 2.7354485988616943 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045007, + "balance_loss_mlp": 1.00309336, + "epoch": 0.5021161985378992, + "flos": 642637699584.0, + "grad_norm": 0.0345609683707489, + "language_loss": 0.80034763, + "learning_rate": 0.0005208672723415467, + "loss": 0.81079769, + "num_input_tokens_seen": 217502896, + "router_z_loss_mlp": 0.41943359, + "step": 2610, + "time_per_iteration": 2.8003506660461426 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104625, + "balance_loss_mlp": 1.00431252, + "epoch": 0.5023085802231627, + "flos": 592423457280.0, + "grad_norm": 0.034384432252957974, + "language_loss": 0.79919124, + "learning_rate": 0.0005205559974371525, + "loss": 0.8096537, + "num_input_tokens_seen": 217575072, + "router_z_loss_mlp": 0.41967773, + "step": 2611, + "time_per_iteration": 2.801931142807007 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044798, + "balance_loss_mlp": 1.00283635, + "epoch": 0.5025009619084263, + "flos": 473334137088.0, + "grad_norm": 0.0314075616675113, + "language_loss": 0.83085155, + "learning_rate": 0.0005202447145522123, + "loss": 0.84129953, + "num_input_tokens_seen": 217644976, + "router_z_loss_mlp": 0.41992188, + "step": 2612, + "time_per_iteration": 2.7084405422210693 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104663, + "balance_loss_mlp": 1.00476372, + "epoch": 0.5026933435936899, + "flos": 456077942784.0, + "grad_norm": 0.03248187925620893, + "language_loss": 0.79969329, + "learning_rate": 0.0005199334238075769, + "loss": 0.81015956, + "num_input_tokens_seen": 217712816, + "router_z_loss_mlp": 0.41894531, + "step": 2613, + "time_per_iteration": 2.5416245460510254 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045641, + "balance_loss_mlp": 1.00367975, + "epoch": 0.5028857252789535, + "flos": 492722082048.0, + "grad_norm": 0.030734349084793038, + "language_loss": 0.92369366, + "learning_rate": 0.0005196221253241, + "loss": 0.93415004, + "num_input_tokens_seen": 217780256, + "router_z_loss_mlp": 0.41992188, + "step": 2614, + "time_per_iteration": 2.5504183769226074 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045477, + "balance_loss_mlp": 1.00344431, + "epoch": 0.503078106964217, + "flos": 626731711488.0, + "grad_norm": 0.0333228394962432, + "language_loss": 0.83482671, + "learning_rate": 0.0005193108192226383, + "loss": 0.84528148, + "num_input_tokens_seen": 217848496, + "router_z_loss_mlp": 0.4206543, + "step": 2615, + "time_per_iteration": 2.7415342330932617 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046371, + "balance_loss_mlp": 1.00445676, + "epoch": 0.5032704886494805, + "flos": 580138431744.0, + "grad_norm": 0.028161477664975402, + "language_loss": 0.87796414, + "learning_rate": 0.000518999505624052, + "loss": 0.88842779, + "num_input_tokens_seen": 217919216, + "router_z_loss_mlp": 0.41943359, + "step": 2616, + "time_per_iteration": 2.703958749771118 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044834, + "balance_loss_mlp": 1.00289583, + "epoch": 0.5034628703347441, + "flos": 472846155264.0, + "grad_norm": 0.026579731156649716, + "language_loss": 0.83874726, + "learning_rate": 0.000518688184649203, + "loss": 0.84919554, + "num_input_tokens_seen": 217996096, + "router_z_loss_mlp": 0.41967773, + "step": 2617, + "time_per_iteration": 2.7804102897644043 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046877, + "balance_loss_mlp": 1.00501108, + "epoch": 0.5036552520200077, + "flos": 490813907712.0, + "grad_norm": 0.028739225931260208, + "language_loss": 0.84081781, + "learning_rate": 0.0005183768564189577, + "loss": 0.85128659, + "num_input_tokens_seen": 218063072, + "router_z_loss_mlp": 0.41894531, + "step": 2618, + "time_per_iteration": 2.559967517852783 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049251, + "balance_loss_mlp": 1.00724185, + "epoch": 0.5038476337052713, + "flos": 495216414720.0, + "grad_norm": 0.040417435174145346, + "language_loss": 0.82122672, + "learning_rate": 0.0005180655210541838, + "loss": 0.83171928, + "num_input_tokens_seen": 218131056, + "router_z_loss_mlp": 0.42041016, + "step": 2619, + "time_per_iteration": 2.569495677947998 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046465, + "balance_loss_mlp": 1.00471759, + "epoch": 0.5040400153905348, + "flos": 601740752640.0, + "grad_norm": 0.03616333015321602, + "language_loss": 0.83923668, + "learning_rate": 0.0005177541786757527, + "loss": 0.84970129, + "num_input_tokens_seen": 218203536, + "router_z_loss_mlp": 0.41772461, + "step": 2620, + "time_per_iteration": 2.7744040489196777 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048715, + "balance_loss_mlp": 1.0068723, + "epoch": 0.5042323970757984, + "flos": 812920137984.0, + "grad_norm": 0.03309299686066053, + "language_loss": 0.83304209, + "learning_rate": 0.000517442829404538, + "loss": 0.84352922, + "num_input_tokens_seen": 218283008, + "router_z_loss_mlp": 0.41870117, + "step": 2621, + "time_per_iteration": 2.97257137298584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048042, + "balance_loss_mlp": 1.00610471, + "epoch": 0.504424778761062, + "flos": 628607804928.0, + "grad_norm": 0.035914844760130495, + "language_loss": 0.87778026, + "learning_rate": 0.0005171314733614166, + "loss": 0.88826072, + "num_input_tokens_seen": 218362096, + "router_z_loss_mlp": 0.41967773, + "step": 2622, + "time_per_iteration": 2.8732259273529053 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051056, + "balance_loss_mlp": 1.0091418, + "epoch": 0.5046171604463255, + "flos": 516957741312.0, + "grad_norm": 0.03505567711141955, + "language_loss": 0.79205, + "learning_rate": 0.0005168201106672671, + "loss": 0.80256051, + "num_input_tokens_seen": 218439440, + "router_z_loss_mlp": 0.41943359, + "step": 2623, + "time_per_iteration": 2.773688316345215 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047941, + "balance_loss_mlp": 1.00590754, + "epoch": 0.504809542131589, + "flos": 528853939968.0, + "grad_norm": 0.0377301000829576, + "language_loss": 0.8564831, + "learning_rate": 0.0005165087414429717, + "loss": 0.86696255, + "num_input_tokens_seen": 218505936, + "router_z_loss_mlp": 0.4206543, + "step": 2624, + "time_per_iteration": 2.6755454540252686 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051892, + "balance_loss_mlp": 1.0100261, + "epoch": 0.5050019238168526, + "flos": 555175663104.0, + "grad_norm": 0.03350143092818485, + "language_loss": 0.83751678, + "learning_rate": 0.0005161973658094144, + "loss": 0.84803575, + "num_input_tokens_seen": 218573824, + "router_z_loss_mlp": 0.41894531, + "step": 2625, + "time_per_iteration": 2.6260385513305664 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105232, + "balance_loss_mlp": 1.01057339, + "epoch": 0.5051943055021162, + "flos": 575929365504.0, + "grad_norm": 0.030667351452066165, + "language_loss": 0.83093894, + "learning_rate": 0.000515885983887482, + "loss": 0.84146214, + "num_input_tokens_seen": 218648016, + "router_z_loss_mlp": 0.41772461, + "step": 2626, + "time_per_iteration": 2.7437500953674316 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104859, + "balance_loss_mlp": 1.00686646, + "epoch": 0.5053866871873798, + "flos": 497682557184.0, + "grad_norm": 0.033924054159163435, + "language_loss": 0.84715843, + "learning_rate": 0.0005155745957980636, + "loss": 0.85764432, + "num_input_tokens_seen": 218714128, + "router_z_loss_mlp": 0.41748047, + "step": 2627, + "time_per_iteration": 2.625260353088379 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048009, + "balance_loss_mlp": 1.00638068, + "epoch": 0.5055790688726434, + "flos": 503220442368.0, + "grad_norm": 0.03037314022037546, + "language_loss": 0.89067703, + "learning_rate": 0.000515263201662051, + "loss": 0.90115714, + "num_input_tokens_seen": 218784800, + "router_z_loss_mlp": 0.41650391, + "step": 2628, + "time_per_iteration": 2.68068265914917 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047259, + "balance_loss_mlp": 1.00565541, + "epoch": 0.5057714505579068, + "flos": 846768600576.0, + "grad_norm": 0.031311962044338205, + "language_loss": 0.83074951, + "learning_rate": 0.0005149518016003378, + "loss": 0.84122205, + "num_input_tokens_seen": 218868256, + "router_z_loss_mlp": 0.41625977, + "step": 2629, + "time_per_iteration": 3.208085060119629 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048667, + "balance_loss_mlp": 1.00720644, + "epoch": 0.5059638322431704, + "flos": 498809187072.0, + "grad_norm": 0.03517894489413756, + "language_loss": 0.82677329, + "learning_rate": 0.0005146403957338206, + "loss": 0.83725995, + "num_input_tokens_seen": 218932496, + "router_z_loss_mlp": 0.41479492, + "step": 2630, + "time_per_iteration": 2.5591788291931152 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045923, + "balance_loss_mlp": 1.0044378, + "epoch": 0.506156213928434, + "flos": 619114565376.0, + "grad_norm": 0.029747387185900163, + "language_loss": 0.82375658, + "learning_rate": 0.0005143289841833975, + "loss": 0.83421576, + "num_input_tokens_seen": 219010672, + "router_z_loss_mlp": 0.41503906, + "step": 2631, + "time_per_iteration": 2.8919997215270996 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045386, + "balance_loss_mlp": 1.00394928, + "epoch": 0.5063485956136976, + "flos": 425790171648.0, + "grad_norm": 0.040524041139339724, + "language_loss": 0.82811654, + "learning_rate": 0.0005140175670699696, + "loss": 0.83857036, + "num_input_tokens_seen": 219077104, + "router_z_loss_mlp": 0.41455078, + "step": 2632, + "time_per_iteration": 2.6062378883361816 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045222, + "balance_loss_mlp": 1.0038327, + "epoch": 0.5065409772989612, + "flos": 571070957568.0, + "grad_norm": 0.026263595366118216, + "language_loss": 0.83201623, + "learning_rate": 0.0005137061445144395, + "loss": 0.84246838, + "num_input_tokens_seen": 219164880, + "router_z_loss_mlp": 0.4140625, + "step": 2633, + "time_per_iteration": 2.9138190746307373 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044282, + "balance_loss_mlp": 1.00282133, + "epoch": 0.5067333589842247, + "flos": 629970650112.0, + "grad_norm": 0.032671607566671305, + "language_loss": 0.87714005, + "learning_rate": 0.000513394716637712, + "loss": 0.8875829, + "num_input_tokens_seen": 219237376, + "router_z_loss_mlp": 0.41479492, + "step": 2634, + "time_per_iteration": 2.7618257999420166 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044567, + "balance_loss_mlp": 1.00422668, + "epoch": 0.5069257406694883, + "flos": 1451098938624.0, + "grad_norm": 0.004578936312393245, + "language_loss": 0.79191709, + "learning_rate": 0.0005130832835606946, + "loss": 0.8023628, + "num_input_tokens_seen": 219467632, + "router_z_loss_mlp": 0.40332031, + "step": 2635, + "time_per_iteration": 4.85358738899231 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050199, + "balance_loss_mlp": 1.00869, + "epoch": 0.5071181223547518, + "flos": 640058796288.0, + "grad_norm": 0.03342633817994969, + "language_loss": 0.81428993, + "learning_rate": 0.0005127718454042958, + "loss": 0.82479185, + "num_input_tokens_seen": 219545392, + "router_z_loss_mlp": 0.4152832, + "step": 2636, + "time_per_iteration": 2.8021318912506104 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044553, + "balance_loss_mlp": 1.00304461, + "epoch": 0.5073105040400154, + "flos": 714873225216.0, + "grad_norm": 0.031182962990379204, + "language_loss": 0.85094464, + "learning_rate": 0.0005124604022894269, + "loss": 0.86139023, + "num_input_tokens_seen": 219623104, + "router_z_loss_mlp": 0.4152832, + "step": 2637, + "time_per_iteration": 2.934414863586426 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047203, + "balance_loss_mlp": 1.00676727, + "epoch": 0.5075028857252789, + "flos": 1439614899456.0, + "grad_norm": 0.007557162842452459, + "language_loss": 0.77188224, + "learning_rate": 0.000512148954337001, + "loss": 0.7823543, + "num_input_tokens_seen": 219853328, + "router_z_loss_mlp": 0.40429688, + "step": 2638, + "time_per_iteration": 4.820345878601074 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104601, + "balance_loss_mlp": 1.00435817, + "epoch": 0.5076952674105425, + "flos": 572308402944.0, + "grad_norm": 0.03427455588588844, + "language_loss": 0.83839953, + "learning_rate": 0.0005118375016679325, + "loss": 0.84885961, + "num_input_tokens_seen": 219925024, + "router_z_loss_mlp": 0.41674805, + "step": 2639, + "time_per_iteration": 2.753891706466675 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104483, + "balance_loss_mlp": 1.00327373, + "epoch": 0.5078876490958061, + "flos": 517713040896.0, + "grad_norm": 0.0397313189962262, + "language_loss": 0.81205344, + "learning_rate": 0.0005115260444031382, + "loss": 0.82250178, + "num_input_tokens_seen": 219992752, + "router_z_loss_mlp": 0.41577148, + "step": 2640, + "time_per_iteration": 2.5884034633636475 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104464, + "balance_loss_mlp": 1.0042038, + "epoch": 0.5080800307810697, + "flos": 1587622342656.0, + "grad_norm": 0.00452780467183982, + "language_loss": 0.78731823, + "learning_rate": 0.000511214582663537, + "loss": 0.79776466, + "num_input_tokens_seen": 220224160, + "router_z_loss_mlp": 0.40429688, + "step": 2641, + "time_per_iteration": 5.021141290664673 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048765, + "balance_loss_mlp": 1.0071131, + "epoch": 0.5082724124663333, + "flos": 486187824384.0, + "grad_norm": 0.03665123216497768, + "language_loss": 0.87927556, + "learning_rate": 0.0005109031165700483, + "loss": 0.88976324, + "num_input_tokens_seen": 220289504, + "router_z_loss_mlp": 0.41674805, + "step": 2642, + "time_per_iteration": 2.564768075942993 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044813, + "balance_loss_mlp": 1.00313723, + "epoch": 0.5084647941515967, + "flos": 683443272960.0, + "grad_norm": 0.03222315683418769, + "language_loss": 0.84105259, + "learning_rate": 0.0005105916462435945, + "loss": 0.85150075, + "num_input_tokens_seen": 220361376, + "router_z_loss_mlp": 0.41699219, + "step": 2643, + "time_per_iteration": 2.8432576656341553 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046727, + "balance_loss_mlp": 1.0049082, + "epoch": 0.5086571758368603, + "flos": 549813722112.0, + "grad_norm": 0.031341979306324576, + "language_loss": 0.85911554, + "learning_rate": 0.0005102801718050989, + "loss": 0.86958289, + "num_input_tokens_seen": 220434720, + "router_z_loss_mlp": 0.41845703, + "step": 2644, + "time_per_iteration": 2.7012667655944824 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104828, + "balance_loss_mlp": 1.00658011, + "epoch": 0.5088495575221239, + "flos": 565079116800.0, + "grad_norm": 0.03553781912080262, + "language_loss": 0.89604807, + "learning_rate": 0.0005099686933754867, + "loss": 0.90653086, + "num_input_tokens_seen": 220506208, + "router_z_loss_mlp": 0.41723633, + "step": 2645, + "time_per_iteration": 2.774092197418213 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047355, + "balance_loss_mlp": 1.00551212, + "epoch": 0.5090419392073875, + "flos": 552512189184.0, + "grad_norm": 0.03374447512064937, + "language_loss": 0.84807706, + "learning_rate": 0.0005096572110756845, + "loss": 0.85855055, + "num_input_tokens_seen": 220577456, + "router_z_loss_mlp": 0.41870117, + "step": 2646, + "time_per_iteration": 2.691534996032715 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050517, + "balance_loss_mlp": 1.00857961, + "epoch": 0.509234320892651, + "flos": 568884771840.0, + "grad_norm": 0.0280586539552875, + "language_loss": 0.86222303, + "learning_rate": 0.0005093457250266205, + "loss": 0.87272823, + "num_input_tokens_seen": 220649648, + "router_z_loss_mlp": 0.41967773, + "step": 2647, + "time_per_iteration": 2.669032573699951 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049346, + "balance_loss_mlp": 1.00750375, + "epoch": 0.5094267025779146, + "flos": 583694265600.0, + "grad_norm": 0.03456739808544309, + "language_loss": 0.83707237, + "learning_rate": 0.000509034235349224, + "loss": 0.84756589, + "num_input_tokens_seen": 220721168, + "router_z_loss_mlp": 0.41870117, + "step": 2648, + "time_per_iteration": 2.7174429893493652 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048368, + "balance_loss_mlp": 1.00657344, + "epoch": 0.5096190842631781, + "flos": 593139873024.0, + "grad_norm": 0.03190176036185227, + "language_loss": 0.81830442, + "learning_rate": 0.0005087227421644266, + "loss": 0.82878816, + "num_input_tokens_seen": 220796464, + "router_z_loss_mlp": 0.41821289, + "step": 2649, + "time_per_iteration": 2.730527877807617 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104451, + "balance_loss_mlp": 1.00278723, + "epoch": 0.5098114659484417, + "flos": 514584917760.0, + "grad_norm": 0.03166339002539628, + "language_loss": 0.86503744, + "learning_rate": 0.0005084112455931602, + "loss": 0.87548256, + "num_input_tokens_seen": 220862976, + "router_z_loss_mlp": 0.41748047, + "step": 2650, + "time_per_iteration": 2.588543176651001 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048046, + "balance_loss_mlp": 1.00627494, + "epoch": 0.5100038476337053, + "flos": 485601666048.0, + "grad_norm": 0.03514605484852806, + "language_loss": 0.85810292, + "learning_rate": 0.0005080997457563586, + "loss": 0.86858344, + "num_input_tokens_seen": 220926432, + "router_z_loss_mlp": 0.41796875, + "step": 2651, + "time_per_iteration": 2.547510862350464 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053822, + "balance_loss_mlp": 1.01214612, + "epoch": 0.5101962293189688, + "flos": 462555820032.0, + "grad_norm": 0.03981395249249623, + "language_loss": 0.79794431, + "learning_rate": 0.0005077882427749569, + "loss": 0.80848241, + "num_input_tokens_seen": 220993008, + "router_z_loss_mlp": 0.41699219, + "step": 2652, + "time_per_iteration": 2.5867154598236084 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052855, + "balance_loss_mlp": 1.0111798, + "epoch": 0.5103886110042324, + "flos": 588133711104.0, + "grad_norm": 0.03576387090025985, + "language_loss": 0.8527801, + "learning_rate": 0.0005074767367698913, + "loss": 0.86330867, + "num_input_tokens_seen": 221059248, + "router_z_loss_mlp": 0.41699219, + "step": 2653, + "time_per_iteration": 2.668619155883789 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052633, + "balance_loss_mlp": 1.01083803, + "epoch": 0.510580992689496, + "flos": 846679172352.0, + "grad_norm": 0.03324234024932545, + "language_loss": 0.84336531, + "learning_rate": 0.0005071652278620988, + "loss": 0.85389161, + "num_input_tokens_seen": 221133712, + "router_z_loss_mlp": 0.41821289, + "step": 2654, + "time_per_iteration": 3.0502736568450928 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052226, + "balance_loss_mlp": 1.01043141, + "epoch": 0.5107733743747596, + "flos": 659811268608.0, + "grad_norm": 0.033221976859431776, + "language_loss": 0.83371234, + "learning_rate": 0.0005068537161725186, + "loss": 0.84423465, + "num_input_tokens_seen": 221202192, + "router_z_loss_mlp": 0.41821289, + "step": 2655, + "time_per_iteration": 2.7732832431793213 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049493, + "balance_loss_mlp": 1.00784123, + "epoch": 0.510965756060023, + "flos": 702961475328.0, + "grad_norm": 0.03652104464060243, + "language_loss": 0.84970605, + "learning_rate": 0.0005065422018220893, + "loss": 0.860201, + "num_input_tokens_seen": 221277104, + "router_z_loss_mlp": 0.41674805, + "step": 2656, + "time_per_iteration": 2.8670201301574707 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045888, + "balance_loss_mlp": 1.00430822, + "epoch": 0.5111581377452866, + "flos": 560941982208.0, + "grad_norm": 0.03459233510222537, + "language_loss": 0.80690587, + "learning_rate": 0.0005062306849317521, + "loss": 0.81736469, + "num_input_tokens_seen": 221352320, + "router_z_loss_mlp": 0.41601562, + "step": 2657, + "time_per_iteration": 2.8002302646636963 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043485, + "balance_loss_mlp": 1.00202358, + "epoch": 0.5113505194305502, + "flos": 610146246144.0, + "grad_norm": 0.03554743150534212, + "language_loss": 0.83936596, + "learning_rate": 0.0005059191656224487, + "loss": 0.84980083, + "num_input_tokens_seen": 221421056, + "router_z_loss_mlp": 0.41479492, + "step": 2658, + "time_per_iteration": 2.716935157775879 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045233, + "balance_loss_mlp": 1.0037955, + "epoch": 0.5115429011158138, + "flos": 535535951616.0, + "grad_norm": 0.03199868953010379, + "language_loss": 0.89635181, + "learning_rate": 0.0005056076440151212, + "loss": 0.90680414, + "num_input_tokens_seen": 221492064, + "router_z_loss_mlp": 0.41455078, + "step": 2659, + "time_per_iteration": 2.6661012172698975 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042381, + "balance_loss_mlp": 1.0019455, + "epoch": 0.5117352828010774, + "flos": 1365275813376.0, + "grad_norm": 0.005851878799964376, + "language_loss": 0.76288116, + "learning_rate": 0.0005052961202307133, + "loss": 0.773305, + "num_input_tokens_seen": 221724672, + "router_z_loss_mlp": 0.40429688, + "step": 2660, + "time_per_iteration": 4.8821775913238525 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047021, + "balance_loss_mlp": 1.00551248, + "epoch": 0.5119276644863409, + "flos": 634931125248.0, + "grad_norm": 0.030472593638878876, + "language_loss": 0.87624103, + "learning_rate": 0.0005049845943901691, + "loss": 0.88671124, + "num_input_tokens_seen": 221800144, + "router_z_loss_mlp": 0.4152832, + "step": 2661, + "time_per_iteration": 2.868314743041992 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045807, + "balance_loss_mlp": 1.00434649, + "epoch": 0.5121200461716044, + "flos": 586781559552.0, + "grad_norm": 0.035240788892260635, + "language_loss": 0.87104362, + "learning_rate": 0.0005046730666144338, + "loss": 0.88150167, + "num_input_tokens_seen": 221877168, + "router_z_loss_mlp": 0.41479492, + "step": 2662, + "time_per_iteration": 2.7716057300567627 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044648, + "balance_loss_mlp": 1.00323498, + "epoch": 0.512312427856868, + "flos": 1034224608000.0, + "grad_norm": 0.027938837780362106, + "language_loss": 0.8826527, + "learning_rate": 0.0005043615370244532, + "loss": 0.89309919, + "num_input_tokens_seen": 221964208, + "router_z_loss_mlp": 0.41430664, + "step": 2663, + "time_per_iteration": 3.4280622005462646 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046261, + "balance_loss_mlp": 1.00611115, + "epoch": 0.5125048095421316, + "flos": 1540901729280.0, + "grad_norm": 0.006786755652655265, + "language_loss": 0.78244388, + "learning_rate": 0.0005040500057411736, + "loss": 0.7929064, + "num_input_tokens_seen": 222179264, + "router_z_loss_mlp": 0.40136719, + "step": 2664, + "time_per_iteration": 4.68994140625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045195, + "balance_loss_mlp": 1.003901, + "epoch": 0.5126971912273951, + "flos": 592328193024.0, + "grad_norm": 0.02608573212926663, + "language_loss": 0.86075294, + "learning_rate": 0.0005037384728855425, + "loss": 0.87120485, + "num_input_tokens_seen": 222259504, + "router_z_loss_mlp": 0.41308594, + "step": 2665, + "time_per_iteration": 2.7917027473449707 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046893, + "balance_loss_mlp": 1.00552762, + "epoch": 0.5128895729126587, + "flos": 552718268928.0, + "grad_norm": 0.03821611985083245, + "language_loss": 0.85252321, + "learning_rate": 0.0005034269385785075, + "loss": 0.86299217, + "num_input_tokens_seen": 222330512, + "router_z_loss_mlp": 0.41381836, + "step": 2666, + "time_per_iteration": 2.63472318649292 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047467, + "balance_loss_mlp": 1.00605392, + "epoch": 0.5130819545979223, + "flos": 482232470016.0, + "grad_norm": 0.03834683208397515, + "language_loss": 0.85133517, + "learning_rate": 0.0005031154029410168, + "loss": 0.86180985, + "num_input_tokens_seen": 222394000, + "router_z_loss_mlp": 0.41430664, + "step": 2667, + "time_per_iteration": 2.517110824584961 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049696, + "balance_loss_mlp": 1.00837803, + "epoch": 0.5132743362831859, + "flos": 476768461824.0, + "grad_norm": 0.033096203996997774, + "language_loss": 0.87656248, + "learning_rate": 0.0005028038660940197, + "loss": 0.88705945, + "num_input_tokens_seen": 222459344, + "router_z_loss_mlp": 0.41333008, + "step": 2668, + "time_per_iteration": 2.5096347332000732 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105007, + "balance_loss_mlp": 1.00870478, + "epoch": 0.5134667179684494, + "flos": 504903095040.0, + "grad_norm": 0.028882778070319505, + "language_loss": 0.84998578, + "learning_rate": 0.0005024923281584648, + "loss": 0.86048645, + "num_input_tokens_seen": 222528912, + "router_z_loss_mlp": 0.41381836, + "step": 2669, + "time_per_iteration": 2.6474804878234863 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048807, + "balance_loss_mlp": 1.0076561, + "epoch": 0.5136590996537129, + "flos": 505005162240.0, + "grad_norm": 0.03165719334287126, + "language_loss": 0.8319236, + "learning_rate": 0.0005021807892553026, + "loss": 0.84241164, + "num_input_tokens_seen": 222604704, + "router_z_loss_mlp": 0.41162109, + "step": 2670, + "time_per_iteration": 2.7183725833892822 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044063, + "balance_loss_mlp": 1.00269723, + "epoch": 0.5138514813389765, + "flos": 625800467712.0, + "grad_norm": 0.030310171756311025, + "language_loss": 0.85420138, + "learning_rate": 0.0005018692495054828, + "loss": 0.86464202, + "num_input_tokens_seen": 222677888, + "router_z_loss_mlp": 0.41381836, + "step": 2671, + "time_per_iteration": 2.772813081741333 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043444, + "balance_loss_mlp": 1.00224543, + "epoch": 0.5140438630242401, + "flos": 584634257664.0, + "grad_norm": 0.030896406933945995, + "language_loss": 0.80988181, + "learning_rate": 0.0005015577090299561, + "loss": 0.82031626, + "num_input_tokens_seen": 222751936, + "router_z_loss_mlp": 0.41210938, + "step": 2672, + "time_per_iteration": 2.6667463779449463 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049805, + "balance_loss_mlp": 1.00858212, + "epoch": 0.5142362447095037, + "flos": 488905733376.0, + "grad_norm": 0.032429697018958814, + "language_loss": 0.87124586, + "learning_rate": 0.0005012461679496729, + "loss": 0.88174391, + "num_input_tokens_seen": 222819616, + "router_z_loss_mlp": 0.41235352, + "step": 2673, + "time_per_iteration": 2.6442089080810547 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104556, + "balance_loss_mlp": 1.00431406, + "epoch": 0.5144286263947672, + "flos": 527885757696.0, + "grad_norm": 0.03122591363863073, + "language_loss": 0.88052714, + "learning_rate": 0.0005009346263855848, + "loss": 0.89098281, + "num_input_tokens_seen": 222888448, + "router_z_loss_mlp": 0.41259766, + "step": 2674, + "time_per_iteration": 2.602527379989624 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048679, + "balance_loss_mlp": 1.00736094, + "epoch": 0.5146210080800308, + "flos": 487590520320.0, + "grad_norm": 0.029060606816111258, + "language_loss": 0.84209937, + "learning_rate": 0.0005006230844586422, + "loss": 0.85258621, + "num_input_tokens_seen": 222964736, + "router_z_loss_mlp": 0.41333008, + "step": 2675, + "time_per_iteration": 2.8685102462768555 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043387, + "balance_loss_mlp": 1.00216484, + "epoch": 0.5148133897652943, + "flos": 516975237888.0, + "grad_norm": 0.028587045609365692, + "language_loss": 0.79492688, + "learning_rate": 0.0005003115422897968, + "loss": 0.80536079, + "num_input_tokens_seen": 223040944, + "router_z_loss_mlp": 0.41235352, + "step": 2676, + "time_per_iteration": 2.765714168548584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01041586, + "balance_loss_mlp": 1.00024414, + "epoch": 0.5150057714505579, + "flos": 512212094208.0, + "grad_norm": 0.033131913333961045, + "language_loss": 0.87827182, + "learning_rate": 0.0005, + "loss": 0.88868773, + "num_input_tokens_seen": 223109632, + "router_z_loss_mlp": 0.41357422, + "step": 2677, + "time_per_iteration": 2.705502986907959 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047696, + "balance_loss_mlp": 1.00623488, + "epoch": 0.5151981531358215, + "flos": 912391133952.0, + "grad_norm": 0.03328612222334398, + "language_loss": 0.79844034, + "learning_rate": 0.0004996884577102033, + "loss": 0.80891728, + "num_input_tokens_seen": 223191648, + "router_z_loss_mlp": 0.41479492, + "step": 2678, + "time_per_iteration": 3.112602949142456 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049572, + "balance_loss_mlp": 1.00801528, + "epoch": 0.515390534821085, + "flos": 472930725888.0, + "grad_norm": 0.03414850275815592, + "language_loss": 0.85192269, + "learning_rate": 0.000499376915541358, + "loss": 0.86241841, + "num_input_tokens_seen": 223265920, + "router_z_loss_mlp": 0.41577148, + "step": 2679, + "time_per_iteration": 2.732088565826416 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046331, + "balance_loss_mlp": 1.00475073, + "epoch": 0.5155829165063486, + "flos": 651358142976.0, + "grad_norm": 0.0316115868451719, + "language_loss": 0.81490767, + "learning_rate": 0.0004990653736144155, + "loss": 0.82537097, + "num_input_tokens_seen": 223340688, + "router_z_loss_mlp": 0.41601562, + "step": 2680, + "time_per_iteration": 2.9006052017211914 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104584, + "balance_loss_mlp": 1.00425994, + "epoch": 0.5157752981916122, + "flos": 415161553920.0, + "grad_norm": 0.034873868180568895, + "language_loss": 0.86566359, + "learning_rate": 0.0004987538320503271, + "loss": 0.876122, + "num_input_tokens_seen": 223404064, + "router_z_loss_mlp": 0.41601562, + "step": 2681, + "time_per_iteration": 2.5385584831237793 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049108, + "balance_loss_mlp": 1.00750434, + "epoch": 0.5159676798768758, + "flos": 554932644864.0, + "grad_norm": 0.03448939758068617, + "language_loss": 0.83127022, + "learning_rate": 0.0004984422909700442, + "loss": 0.84176129, + "num_input_tokens_seen": 223476784, + "router_z_loss_mlp": 0.41625977, + "step": 2682, + "time_per_iteration": 2.7167794704437256 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105107, + "balance_loss_mlp": 1.00944197, + "epoch": 0.5161600615621393, + "flos": 587621429760.0, + "grad_norm": 0.033752660754493145, + "language_loss": 0.84206975, + "learning_rate": 0.0004981307504945173, + "loss": 0.85258043, + "num_input_tokens_seen": 223542832, + "router_z_loss_mlp": 0.41650391, + "step": 2683, + "time_per_iteration": 2.6896650791168213 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050267, + "balance_loss_mlp": 1.00856805, + "epoch": 0.5163524432474028, + "flos": 589948566528.0, + "grad_norm": 0.03498305011402451, + "language_loss": 0.90086776, + "learning_rate": 0.0004978192107446976, + "loss": 0.9113704, + "num_input_tokens_seen": 223617968, + "router_z_loss_mlp": 0.41723633, + "step": 2684, + "time_per_iteration": 2.7550315856933594 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046286, + "balance_loss_mlp": 1.00456297, + "epoch": 0.5165448249326664, + "flos": 504905040384.0, + "grad_norm": 0.03233825392148911, + "language_loss": 0.87956327, + "learning_rate": 0.0004975076718415353, + "loss": 0.89002615, + "num_input_tokens_seen": 223689504, + "router_z_loss_mlp": 0.41748047, + "step": 2685, + "time_per_iteration": 2.5969831943511963 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046859, + "balance_loss_mlp": 1.00515938, + "epoch": 0.51673720661793, + "flos": 417647138304.0, + "grad_norm": 0.0327603501643271, + "language_loss": 0.91275072, + "learning_rate": 0.0004971961339059806, + "loss": 0.9232192, + "num_input_tokens_seen": 223752288, + "router_z_loss_mlp": 0.41723633, + "step": 2686, + "time_per_iteration": 2.488780975341797 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048798, + "balance_loss_mlp": 1.00714636, + "epoch": 0.5169295883031936, + "flos": 600075596544.0, + "grad_norm": 0.03249247039046824, + "language_loss": 0.84663117, + "learning_rate": 0.0004968845970589832, + "loss": 0.8571192, + "num_input_tokens_seen": 223822304, + "router_z_loss_mlp": 0.41674805, + "step": 2687, + "time_per_iteration": 2.7266340255737305 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047672, + "balance_loss_mlp": 1.00597274, + "epoch": 0.517121969988457, + "flos": 557911068672.0, + "grad_norm": 0.03510688251477249, + "language_loss": 0.85442108, + "learning_rate": 0.0004965730614214926, + "loss": 0.86489779, + "num_input_tokens_seen": 223888592, + "router_z_loss_mlp": 0.41723633, + "step": 2688, + "time_per_iteration": 2.669203758239746 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048775, + "balance_loss_mlp": 1.00721848, + "epoch": 0.5173143516737206, + "flos": 470375155200.0, + "grad_norm": 0.031768698442390816, + "language_loss": 0.85484231, + "learning_rate": 0.0004962615271144576, + "loss": 0.86533004, + "num_input_tokens_seen": 223952880, + "router_z_loss_mlp": 0.41577148, + "step": 2689, + "time_per_iteration": 2.508864164352417 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047195, + "balance_loss_mlp": 1.00578225, + "epoch": 0.5175067333589842, + "flos": 721379292672.0, + "grad_norm": 0.036604011276375, + "language_loss": 0.83442801, + "learning_rate": 0.0004959499942588264, + "loss": 0.84490001, + "num_input_tokens_seen": 224030000, + "router_z_loss_mlp": 0.41430664, + "step": 2690, + "time_per_iteration": 2.937147617340088 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054985, + "balance_loss_mlp": 1.01473999, + "epoch": 0.5176991150442478, + "flos": 1469344702464.0, + "grad_norm": 0.008104040921495323, + "language_loss": 0.78200024, + "learning_rate": 0.0004956384629755469, + "loss": 0.79255009, + "num_input_tokens_seen": 224252384, + "router_z_loss_mlp": 0.40234375, + "step": 2691, + "time_per_iteration": 4.793481111526489 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047518, + "balance_loss_mlp": 1.00593746, + "epoch": 0.5178914967295114, + "flos": 613784705280.0, + "grad_norm": 0.029651978346564224, + "language_loss": 0.85819978, + "learning_rate": 0.0004953269333855661, + "loss": 0.86867493, + "num_input_tokens_seen": 224324640, + "router_z_loss_mlp": 0.41601562, + "step": 2692, + "time_per_iteration": 2.7456183433532715 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054253, + "balance_loss_mlp": 1.01293516, + "epoch": 0.5180838784147749, + "flos": 501981051648.0, + "grad_norm": 0.03275547277888071, + "language_loss": 0.85017627, + "learning_rate": 0.0004950154056098309, + "loss": 0.86071873, + "num_input_tokens_seen": 224398368, + "router_z_loss_mlp": 0.41333008, + "step": 2693, + "time_per_iteration": 2.710204839706421 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052407, + "balance_loss_mlp": 1.01108897, + "epoch": 0.5182762601000385, + "flos": 690042659328.0, + "grad_norm": 0.03430000909694698, + "language_loss": 0.84476924, + "learning_rate": 0.0004947038797692867, + "loss": 0.85529327, + "num_input_tokens_seen": 224465456, + "router_z_loss_mlp": 0.41333008, + "step": 2694, + "time_per_iteration": 2.846104860305786 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053132, + "balance_loss_mlp": 1.01169479, + "epoch": 0.518468641785302, + "flos": 666801427200.0, + "grad_norm": 0.031372779584062496, + "language_loss": 0.77936417, + "learning_rate": 0.0004943923559848789, + "loss": 0.78989553, + "num_input_tokens_seen": 224540960, + "router_z_loss_mlp": 0.41455078, + "step": 2695, + "time_per_iteration": 2.780346155166626 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054261, + "balance_loss_mlp": 1.01303816, + "epoch": 0.5186610234705656, + "flos": 567814522368.0, + "grad_norm": 0.025403978054072948, + "language_loss": 0.9097802, + "learning_rate": 0.0004940808343775515, + "loss": 0.92032284, + "num_input_tokens_seen": 224613200, + "router_z_loss_mlp": 0.41235352, + "step": 2696, + "time_per_iteration": 2.6940221786499023 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052298, + "balance_loss_mlp": 1.01093256, + "epoch": 0.5188534051558291, + "flos": 429793158144.0, + "grad_norm": 0.033988353521974116, + "language_loss": 0.8254481, + "learning_rate": 0.0004937693150682479, + "loss": 0.83597112, + "num_input_tokens_seen": 224677456, + "router_z_loss_mlp": 0.41381836, + "step": 2697, + "time_per_iteration": 2.5146913528442383 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048575, + "balance_loss_mlp": 1.00725734, + "epoch": 0.5190457868410927, + "flos": 547412708352.0, + "grad_norm": 0.031596370266791504, + "language_loss": 0.77111042, + "learning_rate": 0.0004934577981779107, + "loss": 0.78159618, + "num_input_tokens_seen": 224745600, + "router_z_loss_mlp": 0.41333008, + "step": 2698, + "time_per_iteration": 2.6567137241363525 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044661, + "balance_loss_mlp": 1.00327134, + "epoch": 0.5192381685263563, + "flos": 549746648064.0, + "grad_norm": 0.029705122804042017, + "language_loss": 0.81764138, + "learning_rate": 0.0004931462838274817, + "loss": 0.82808805, + "num_input_tokens_seen": 224826944, + "router_z_loss_mlp": 0.4140625, + "step": 2699, + "time_per_iteration": 2.817087173461914 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050227, + "balance_loss_mlp": 1.00895715, + "epoch": 0.5194305502116199, + "flos": 576350273280.0, + "grad_norm": 0.03619468074242637, + "language_loss": 0.84569639, + "learning_rate": 0.0004928347721379011, + "loss": 0.85619867, + "num_input_tokens_seen": 224895280, + "router_z_loss_mlp": 0.4128418, + "step": 2700, + "time_per_iteration": 2.6439361572265625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049587, + "balance_loss_mlp": 1.00831699, + "epoch": 0.5196229318968835, + "flos": 435218282496.0, + "grad_norm": 0.03299749227833017, + "language_loss": 0.82266027, + "learning_rate": 0.0004925232632301089, + "loss": 0.83315617, + "num_input_tokens_seen": 224961632, + "router_z_loss_mlp": 0.4128418, + "step": 2701, + "time_per_iteration": 2.5564098358154297 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045313, + "balance_loss_mlp": 1.00409007, + "epoch": 0.5198153135821469, + "flos": 559986438912.0, + "grad_norm": 0.03181007655018395, + "language_loss": 0.79940033, + "learning_rate": 0.0004922117572250431, + "loss": 0.80985349, + "num_input_tokens_seen": 225032816, + "router_z_loss_mlp": 0.41235352, + "step": 2702, + "time_per_iteration": 2.651662826538086 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048783, + "balance_loss_mlp": 1.00763237, + "epoch": 0.5200076952674105, + "flos": 566835646464.0, + "grad_norm": 0.030877309828348475, + "language_loss": 0.81538028, + "learning_rate": 0.0004919002542436414, + "loss": 0.82586813, + "num_input_tokens_seen": 225112736, + "router_z_loss_mlp": 0.41162109, + "step": 2703, + "time_per_iteration": 2.829218864440918 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051433, + "balance_loss_mlp": 1.01028192, + "epoch": 0.5202000769526741, + "flos": 572273409792.0, + "grad_norm": 0.031996161034096735, + "language_loss": 0.81638157, + "learning_rate": 0.0004915887544068399, + "loss": 0.82689589, + "num_input_tokens_seen": 225182672, + "router_z_loss_mlp": 0.41162109, + "step": 2704, + "time_per_iteration": 2.6583306789398193 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052276, + "balance_loss_mlp": 1.01110101, + "epoch": 0.5203924586379377, + "flos": 695467783680.0, + "grad_norm": 0.03456723160752419, + "language_loss": 0.7851603, + "learning_rate": 0.0004912772578355736, + "loss": 0.79568309, + "num_input_tokens_seen": 225260272, + "router_z_loss_mlp": 0.41186523, + "step": 2705, + "time_per_iteration": 2.9061107635498047 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051355, + "balance_loss_mlp": 1.01010871, + "epoch": 0.5205848403232012, + "flos": 567691067904.0, + "grad_norm": 0.03253184462937942, + "language_loss": 0.83445644, + "learning_rate": 0.000490965764650776, + "loss": 0.84497005, + "num_input_tokens_seen": 225337120, + "router_z_loss_mlp": 0.41259766, + "step": 2706, + "time_per_iteration": 2.8724799156188965 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051791, + "balance_loss_mlp": 1.01042521, + "epoch": 0.5207772220084648, + "flos": 1216205913600.0, + "grad_norm": 0.03130848752928153, + "language_loss": 0.83192623, + "learning_rate": 0.0004906542749733798, + "loss": 0.84244412, + "num_input_tokens_seen": 225433984, + "router_z_loss_mlp": 0.41381836, + "step": 2707, + "time_per_iteration": 3.6585958003997803 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049141, + "balance_loss_mlp": 1.00770402, + "epoch": 0.5209696036937284, + "flos": 594032232960.0, + "grad_norm": 0.02732760694007456, + "language_loss": 0.85709697, + "learning_rate": 0.0004903427889243156, + "loss": 0.86758834, + "num_input_tokens_seen": 225512112, + "router_z_loss_mlp": 0.41455078, + "step": 2708, + "time_per_iteration": 2.871150016784668 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044527, + "balance_loss_mlp": 1.00294721, + "epoch": 0.5211619853789919, + "flos": 523956648192.0, + "grad_norm": 0.03352920522422817, + "language_loss": 0.85979593, + "learning_rate": 0.0004900313066245134, + "loss": 0.87024117, + "num_input_tokens_seen": 225586944, + "router_z_loss_mlp": 0.41601562, + "step": 2709, + "time_per_iteration": 2.6438417434692383 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104781, + "balance_loss_mlp": 1.00632536, + "epoch": 0.5213543670642555, + "flos": 503861035776.0, + "grad_norm": 0.03205745002268137, + "language_loss": 0.81327069, + "learning_rate": 0.0004897198281949012, + "loss": 0.82374883, + "num_input_tokens_seen": 225657184, + "router_z_loss_mlp": 0.41503906, + "step": 2710, + "time_per_iteration": 2.693906307220459 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049613, + "balance_loss_mlp": 1.00800931, + "epoch": 0.521546748749519, + "flos": 587072209920.0, + "grad_norm": 0.036857631666753196, + "language_loss": 0.78204525, + "learning_rate": 0.0004894083537564057, + "loss": 0.79254138, + "num_input_tokens_seen": 225729968, + "router_z_loss_mlp": 0.41625977, + "step": 2711, + "time_per_iteration": 2.7300491333007812 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045008, + "balance_loss_mlp": 1.00333273, + "epoch": 0.5217391304347826, + "flos": 571266343680.0, + "grad_norm": 0.030696577254243577, + "language_loss": 0.81681752, + "learning_rate": 0.0004890968834299519, + "loss": 0.82726759, + "num_input_tokens_seen": 225801808, + "router_z_loss_mlp": 0.41699219, + "step": 2712, + "time_per_iteration": 2.746556043624878 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049825, + "balance_loss_mlp": 1.00831652, + "epoch": 0.5219315121200462, + "flos": 543920057856.0, + "grad_norm": 0.028956363679279982, + "language_loss": 0.79082847, + "learning_rate": 0.0004887854173364633, + "loss": 0.80132675, + "num_input_tokens_seen": 225878576, + "router_z_loss_mlp": 0.4152832, + "step": 2713, + "time_per_iteration": 2.733306884765625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051945, + "balance_loss_mlp": 1.01045978, + "epoch": 0.5221238938053098, + "flos": 551531367936.0, + "grad_norm": 0.030815907554272836, + "language_loss": 0.82228422, + "learning_rate": 0.0004884739555968617, + "loss": 0.83280361, + "num_input_tokens_seen": 225960096, + "router_z_loss_mlp": 0.41503906, + "step": 2714, + "time_per_iteration": 2.815034866333008 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054211, + "balance_loss_mlp": 1.01425171, + "epoch": 0.5223162754905732, + "flos": 1358392579584.0, + "grad_norm": 0.009025254493072253, + "language_loss": 0.78977054, + "learning_rate": 0.0004881624983320676, + "loss": 0.80031264, + "num_input_tokens_seen": 226184960, + "router_z_loss_mlp": 0.39941406, + "step": 2715, + "time_per_iteration": 5.005860090255737 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047232, + "balance_loss_mlp": 1.00550854, + "epoch": 0.5225086571758368, + "flos": 568974200064.0, + "grad_norm": 0.030755982791586634, + "language_loss": 0.87142956, + "learning_rate": 0.0004878510456629992, + "loss": 0.88190192, + "num_input_tokens_seen": 226271328, + "router_z_loss_mlp": 0.41748047, + "step": 2716, + "time_per_iteration": 2.9582624435424805 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048687, + "balance_loss_mlp": 1.00713038, + "epoch": 0.5227010388611004, + "flos": 501136323840.0, + "grad_norm": 0.03155972783921746, + "language_loss": 0.85419679, + "learning_rate": 0.00048753959771057314, + "loss": 0.86468375, + "num_input_tokens_seen": 226340080, + "router_z_loss_mlp": 0.41577148, + "step": 2717, + "time_per_iteration": 2.623081684112549 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104979, + "balance_loss_mlp": 1.00832856, + "epoch": 0.522893420546364, + "flos": 598799267328.0, + "grad_norm": 0.035176839616525644, + "language_loss": 0.83230948, + "learning_rate": 0.0004872281545957044, + "loss": 0.84280741, + "num_input_tokens_seen": 226415120, + "router_z_loss_mlp": 0.41479492, + "step": 2718, + "time_per_iteration": 2.7231285572052 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059587, + "balance_loss_mlp": 1.01800716, + "epoch": 0.5230858022316276, + "flos": 665922673152.0, + "grad_norm": 0.03224340083556492, + "language_loss": 0.86415994, + "learning_rate": 0.0004869167164393055, + "loss": 0.8747558, + "num_input_tokens_seen": 226501200, + "router_z_loss_mlp": 0.41601562, + "step": 2719, + "time_per_iteration": 2.9305646419525146 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054772, + "balance_loss_mlp": 1.0132159, + "epoch": 0.5232781839168911, + "flos": 605034126336.0, + "grad_norm": 0.0287825993415993, + "language_loss": 0.89917624, + "learning_rate": 0.00048660528336228793, + "loss": 0.909724, + "num_input_tokens_seen": 226582064, + "router_z_loss_mlp": 0.41577148, + "step": 2720, + "time_per_iteration": 2.788072347640991 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049748, + "balance_loss_mlp": 1.0080725, + "epoch": 0.5234705656021547, + "flos": 551841460224.0, + "grad_norm": 0.02763684671666484, + "language_loss": 0.90116215, + "learning_rate": 0.0004862938554855606, + "loss": 0.91165972, + "num_input_tokens_seen": 226656448, + "router_z_loss_mlp": 0.41699219, + "step": 2721, + "time_per_iteration": 2.775818109512329 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051445, + "balance_loss_mlp": 1.00965011, + "epoch": 0.5236629472874182, + "flos": 505295812608.0, + "grad_norm": 0.03601660428487822, + "language_loss": 0.86817378, + "learning_rate": 0.0004859824329300304, + "loss": 0.87868822, + "num_input_tokens_seen": 226725568, + "router_z_loss_mlp": 0.41821289, + "step": 2722, + "time_per_iteration": 2.587228536605835 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053208, + "balance_loss_mlp": 1.01138973, + "epoch": 0.5238553289726818, + "flos": 548697785856.0, + "grad_norm": 0.03170706554102953, + "language_loss": 0.83958352, + "learning_rate": 0.00048567101581660244, + "loss": 0.85011566, + "num_input_tokens_seen": 226795728, + "router_z_loss_mlp": 0.41845703, + "step": 2723, + "time_per_iteration": 2.6208062171936035 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050325, + "balance_loss_mlp": 1.00843501, + "epoch": 0.5240477106579453, + "flos": 533004680448.0, + "grad_norm": 0.03335820140898581, + "language_loss": 0.87488234, + "learning_rate": 0.00048535960426617956, + "loss": 0.88538557, + "num_input_tokens_seen": 226865344, + "router_z_loss_mlp": 0.41918945, + "step": 2724, + "time_per_iteration": 2.5951199531555176 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050726, + "balance_loss_mlp": 1.00883543, + "epoch": 0.5242400923432089, + "flos": 619090265856.0, + "grad_norm": 0.03212273913620546, + "language_loss": 0.8244487, + "learning_rate": 0.0004850481983996621, + "loss": 0.83495593, + "num_input_tokens_seen": 226936800, + "router_z_loss_mlp": 0.41918945, + "step": 2725, + "time_per_iteration": 2.747008800506592 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049694, + "balance_loss_mlp": 1.00785124, + "epoch": 0.5244324740284725, + "flos": 417590757888.0, + "grad_norm": 0.03280670580990367, + "language_loss": 0.88229245, + "learning_rate": 0.0004847367983379492, + "loss": 0.89278936, + "num_input_tokens_seen": 226998448, + "router_z_loss_mlp": 0.41870117, + "step": 2726, + "time_per_iteration": 2.437721014022827 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049333, + "balance_loss_mlp": 1.00770533, + "epoch": 0.5246248557137361, + "flos": 627732941568.0, + "grad_norm": 0.03120006141405487, + "language_loss": 0.79435945, + "learning_rate": 0.00048442540420193643, + "loss": 0.80485278, + "num_input_tokens_seen": 227081872, + "router_z_loss_mlp": 0.41650391, + "step": 2727, + "time_per_iteration": 2.927518844604492 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105055, + "balance_loss_mlp": 1.00911331, + "epoch": 0.5248172373989997, + "flos": 1250403352320.0, + "grad_norm": 0.03663625191481743, + "language_loss": 0.7991612, + "learning_rate": 0.0004841140161125182, + "loss": 0.80966663, + "num_input_tokens_seen": 227167744, + "router_z_loss_mlp": 0.41455078, + "step": 2728, + "time_per_iteration": 3.574690818786621 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053819, + "balance_loss_mlp": 1.01250064, + "epoch": 0.5250096190842631, + "flos": 507883464192.0, + "grad_norm": 0.03360211420143325, + "language_loss": 0.85387456, + "learning_rate": 0.0004838026341905857, + "loss": 0.86441278, + "num_input_tokens_seen": 227239136, + "router_z_loss_mlp": 0.41333008, + "step": 2729, + "time_per_iteration": 2.7263481616973877 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046844, + "balance_loss_mlp": 1.00547838, + "epoch": 0.5252020007695267, + "flos": 612508376064.0, + "grad_norm": 0.029211194306351093, + "language_loss": 0.85320604, + "learning_rate": 0.00048349125855702844, + "loss": 0.86367452, + "num_input_tokens_seen": 227311968, + "router_z_loss_mlp": 0.41381836, + "step": 2730, + "time_per_iteration": 2.775851011276245 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047856, + "balance_loss_mlp": 1.00649071, + "epoch": 0.5253943824547903, + "flos": 540292292352.0, + "grad_norm": 0.02938539212610817, + "language_loss": 0.81675971, + "learning_rate": 0.00048317988933273287, + "loss": 0.82723826, + "num_input_tokens_seen": 227385248, + "router_z_loss_mlp": 0.41381836, + "step": 2731, + "time_per_iteration": 2.7763831615448 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047356, + "balance_loss_mlp": 1.00613368, + "epoch": 0.5255867641400539, + "flos": 699338567424.0, + "grad_norm": 0.033934632058623626, + "language_loss": 0.82549971, + "learning_rate": 0.00048286852663858367, + "loss": 0.83597326, + "num_input_tokens_seen": 227464640, + "router_z_loss_mlp": 0.41235352, + "step": 2732, + "time_per_iteration": 2.96213698387146 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052513, + "balance_loss_mlp": 1.01131439, + "epoch": 0.5257791458253175, + "flos": 668549208576.0, + "grad_norm": 0.03297641476237434, + "language_loss": 0.84432375, + "learning_rate": 0.000482557170595462, + "loss": 0.85484892, + "num_input_tokens_seen": 227542192, + "router_z_loss_mlp": 0.41210938, + "step": 2733, + "time_per_iteration": 2.840514659881592 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050533, + "balance_loss_mlp": 1.00943005, + "epoch": 0.525971527510581, + "flos": 484605293568.0, + "grad_norm": 0.032410991276381265, + "language_loss": 0.88272679, + "learning_rate": 0.0004822458213242475, + "loss": 0.89323211, + "num_input_tokens_seen": 227606096, + "router_z_loss_mlp": 0.41113281, + "step": 2734, + "time_per_iteration": 2.560474157333374 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047172, + "balance_loss_mlp": 1.00613987, + "epoch": 0.5261639091958445, + "flos": 831348648960.0, + "grad_norm": 0.03341440797603734, + "language_loss": 0.86630881, + "learning_rate": 0.00048193447894581627, + "loss": 0.87678051, + "num_input_tokens_seen": 227689552, + "router_z_loss_mlp": 0.41040039, + "step": 2735, + "time_per_iteration": 3.1240243911743164 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105002, + "balance_loss_mlp": 1.00886869, + "epoch": 0.5263562908811081, + "flos": 521733523968.0, + "grad_norm": 0.03226346413051534, + "language_loss": 0.88327318, + "learning_rate": 0.00048162314358104243, + "loss": 0.89377338, + "num_input_tokens_seen": 227760784, + "router_z_loss_mlp": 0.41162109, + "step": 2736, + "time_per_iteration": 2.599510669708252 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047062, + "balance_loss_mlp": 1.00581563, + "epoch": 0.5265486725663717, + "flos": 576098506752.0, + "grad_norm": 0.03477073688653673, + "language_loss": 0.84006953, + "learning_rate": 0.0004813118153507969, + "loss": 0.85054016, + "num_input_tokens_seen": 227834304, + "router_z_loss_mlp": 0.41259766, + "step": 2737, + "time_per_iteration": 2.7309916019439697 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01057175, + "balance_loss_mlp": 1.0173111, + "epoch": 0.5267410542516352, + "flos": 1550561186304.0, + "grad_norm": 0.008968329145720436, + "language_loss": 0.82447124, + "learning_rate": 0.0004810004943759482, + "loss": 0.83504307, + "num_input_tokens_seen": 228057232, + "router_z_loss_mlp": 0.3984375, + "step": 2738, + "time_per_iteration": 4.815824747085571 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054104, + "balance_loss_mlp": 1.01311994, + "epoch": 0.5269334359368988, + "flos": 931462183680.0, + "grad_norm": 0.03276977156640091, + "language_loss": 0.84196591, + "learning_rate": 0.00048068918077736163, + "loss": 0.85250694, + "num_input_tokens_seen": 228140816, + "router_z_loss_mlp": 0.40991211, + "step": 2739, + "time_per_iteration": 3.2470173835754395 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051244, + "balance_loss_mlp": 1.01004505, + "epoch": 0.5271258176221624, + "flos": 656635513344.0, + "grad_norm": 0.03436954846361053, + "language_loss": 0.82138938, + "learning_rate": 0.0004803778746759001, + "loss": 0.83190179, + "num_input_tokens_seen": 228216208, + "router_z_loss_mlp": 0.41210938, + "step": 2740, + "time_per_iteration": 2.920330286026001 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051176, + "balance_loss_mlp": 1.01007247, + "epoch": 0.527318199307426, + "flos": 544062954240.0, + "grad_norm": 0.045913237701965745, + "language_loss": 0.82631075, + "learning_rate": 0.00048006657619242317, + "loss": 0.83682251, + "num_input_tokens_seen": 228283184, + "router_z_loss_mlp": 0.41113281, + "step": 2741, + "time_per_iteration": 2.612001419067383 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045962, + "balance_loss_mlp": 1.00462067, + "epoch": 0.5275105809926895, + "flos": 448899201024.0, + "grad_norm": 0.036563153452021165, + "language_loss": 0.78434455, + "learning_rate": 0.00047975528544778775, + "loss": 0.7948041, + "num_input_tokens_seen": 228351328, + "router_z_loss_mlp": 0.41357422, + "step": 2742, + "time_per_iteration": 2.590146064758301 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042619, + "balance_loss_mlp": 1.00130069, + "epoch": 0.527702962677953, + "flos": 580053861120.0, + "grad_norm": 0.038221984800347206, + "language_loss": 0.89132345, + "learning_rate": 0.00047944400256284754, + "loss": 0.90174961, + "num_input_tokens_seen": 228423632, + "router_z_loss_mlp": 0.41333008, + "step": 2743, + "time_per_iteration": 2.691096305847168 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046686, + "balance_loss_mlp": 1.00548708, + "epoch": 0.5278953443632166, + "flos": 654010923264.0, + "grad_norm": 0.03476413811576821, + "language_loss": 0.80653423, + "learning_rate": 0.0004791327276584532, + "loss": 0.8170011, + "num_input_tokens_seen": 228498736, + "router_z_loss_mlp": 0.41210938, + "step": 2744, + "time_per_iteration": 2.8089282512664795 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048783, + "balance_loss_mlp": 1.00753677, + "epoch": 0.5280877260484802, + "flos": 515049566976.0, + "grad_norm": 0.03187296499214836, + "language_loss": 0.81036532, + "learning_rate": 0.00047882146085545264, + "loss": 0.82085317, + "num_input_tokens_seen": 228569056, + "router_z_loss_mlp": 0.41259766, + "step": 2745, + "time_per_iteration": 2.646883010864258 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055283, + "balance_loss_mlp": 1.01541901, + "epoch": 0.5282801077337438, + "flos": 1448715421440.0, + "grad_norm": 0.006687794222264933, + "language_loss": 0.75402379, + "learning_rate": 0.00047851020227469, + "loss": 0.76457667, + "num_input_tokens_seen": 228800560, + "router_z_loss_mlp": 0.3984375, + "step": 2746, + "time_per_iteration": 4.967897653579712 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048206, + "balance_loss_mlp": 1.00703144, + "epoch": 0.5284724894190073, + "flos": 605967315456.0, + "grad_norm": 0.03667028691338261, + "language_loss": 0.80105197, + "learning_rate": 0.00047819895203700684, + "loss": 0.81153399, + "num_input_tokens_seen": 228869216, + "router_z_loss_mlp": 0.41186523, + "step": 2747, + "time_per_iteration": 2.7146098613739014 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105164, + "balance_loss_mlp": 1.01187134, + "epoch": 0.5286648711042709, + "flos": 1498106323200.0, + "grad_norm": 0.006729060992495368, + "language_loss": 0.75512433, + "learning_rate": 0.0004778877102632412, + "loss": 0.76564074, + "num_input_tokens_seen": 229085520, + "router_z_loss_mlp": 0.39746094, + "step": 2748, + "time_per_iteration": 4.6327197551727295 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045337, + "balance_loss_mlp": 1.00416195, + "epoch": 0.5288572527895344, + "flos": 598834260480.0, + "grad_norm": 0.03692084834433464, + "language_loss": 0.89385319, + "learning_rate": 0.0004775764770742277, + "loss": 0.90430653, + "num_input_tokens_seen": 229160912, + "router_z_loss_mlp": 0.41186523, + "step": 2749, + "time_per_iteration": 2.807567834854126 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045141, + "balance_loss_mlp": 1.00394237, + "epoch": 0.529049634474798, + "flos": 558440846592.0, + "grad_norm": 0.03911259999059639, + "language_loss": 0.87067056, + "learning_rate": 0.00047726525259079777, + "loss": 0.88112199, + "num_input_tokens_seen": 229235792, + "router_z_loss_mlp": 0.41210938, + "step": 2750, + "time_per_iteration": 2.7838735580444336 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044638, + "balance_loss_mlp": 1.00348663, + "epoch": 0.5292420161600616, + "flos": 582435432960.0, + "grad_norm": 0.03406590895995427, + "language_loss": 0.89342177, + "learning_rate": 0.0004769540369337798, + "loss": 0.9038682, + "num_input_tokens_seen": 229309984, + "router_z_loss_mlp": 0.41162109, + "step": 2751, + "time_per_iteration": 2.716430902481079 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010453, + "balance_loss_mlp": 1.00405395, + "epoch": 0.5294343978453251, + "flos": 609564945408.0, + "grad_norm": 0.0303004693379624, + "language_loss": 0.8646909, + "learning_rate": 0.00047664283022399794, + "loss": 0.87514395, + "num_input_tokens_seen": 229394000, + "router_z_loss_mlp": 0.41259766, + "step": 2752, + "time_per_iteration": 2.8746426105499268 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048518, + "balance_loss_mlp": 1.00736678, + "epoch": 0.5296267795305887, + "flos": 647710935552.0, + "grad_norm": 0.032209809873809676, + "language_loss": 0.81781971, + "learning_rate": 0.00047633163258227376, + "loss": 0.82830489, + "num_input_tokens_seen": 229474320, + "router_z_loss_mlp": 0.41162109, + "step": 2753, + "time_per_iteration": 2.859628677368164 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048376, + "balance_loss_mlp": 1.0070343, + "epoch": 0.5298191612158523, + "flos": 560806867200.0, + "grad_norm": 0.034095977821307535, + "language_loss": 0.85918152, + "learning_rate": 0.0004760204441294247, + "loss": 0.86966527, + "num_input_tokens_seen": 229543072, + "router_z_loss_mlp": 0.41357422, + "step": 2754, + "time_per_iteration": 2.642761707305908 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049622, + "balance_loss_mlp": 1.00842357, + "epoch": 0.5300115429011159, + "flos": 515132192256.0, + "grad_norm": 0.03324074908377848, + "language_loss": 0.86806327, + "learning_rate": 0.00047570926498626486, + "loss": 0.87855953, + "num_input_tokens_seen": 229615296, + "router_z_loss_mlp": 0.41210938, + "step": 2755, + "time_per_iteration": 2.688204765319824 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048264, + "balance_loss_mlp": 1.00699341, + "epoch": 0.5302039245863793, + "flos": 674050155264.0, + "grad_norm": 0.032282959747224574, + "language_loss": 0.82332271, + "learning_rate": 0.00047539809527360474, + "loss": 0.83380532, + "num_input_tokens_seen": 229693728, + "router_z_loss_mlp": 0.4128418, + "step": 2756, + "time_per_iteration": 2.891369104385376 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051726, + "balance_loss_mlp": 1.01052761, + "epoch": 0.5303963062716429, + "flos": 732157609728.0, + "grad_norm": 0.027910460797545535, + "language_loss": 0.82830453, + "learning_rate": 0.0004750869351122511, + "loss": 0.83882177, + "num_input_tokens_seen": 229772144, + "router_z_loss_mlp": 0.41210938, + "step": 2757, + "time_per_iteration": 2.9782614707946777 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051325, + "balance_loss_mlp": 1.01015055, + "epoch": 0.5305886879569065, + "flos": 574552914432.0, + "grad_norm": 0.03118318769242836, + "language_loss": 0.82440865, + "learning_rate": 0.00047477578462300685, + "loss": 0.83492196, + "num_input_tokens_seen": 229847024, + "router_z_loss_mlp": 0.41186523, + "step": 2758, + "time_per_iteration": 2.7210254669189453 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104635, + "balance_loss_mlp": 1.00498474, + "epoch": 0.5307810696421701, + "flos": 696729528576.0, + "grad_norm": 0.03181982217221047, + "language_loss": 0.79867083, + "learning_rate": 0.0004744646439266718, + "loss": 0.8091343, + "num_input_tokens_seen": 229932416, + "router_z_loss_mlp": 0.41381836, + "step": 2759, + "time_per_iteration": 2.997299909591675 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046488, + "balance_loss_mlp": 1.005265, + "epoch": 0.5309734513274337, + "flos": 650203322880.0, + "grad_norm": 0.04897119780065821, + "language_loss": 0.92728293, + "learning_rate": 0.000474153513144041, + "loss": 0.93774784, + "num_input_tokens_seen": 230010976, + "router_z_loss_mlp": 0.41235352, + "step": 2760, + "time_per_iteration": 2.9030909538269043 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047525, + "balance_loss_mlp": 1.00618315, + "epoch": 0.5311658330126972, + "flos": 606056743680.0, + "grad_norm": 0.03383323202633534, + "language_loss": 0.87311566, + "learning_rate": 0.00047384239239590633, + "loss": 0.88359094, + "num_input_tokens_seen": 230093344, + "router_z_loss_mlp": 0.41357422, + "step": 2761, + "time_per_iteration": 2.8522770404815674 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049914, + "balance_loss_mlp": 1.00859571, + "epoch": 0.5313582146979607, + "flos": 559317655296.0, + "grad_norm": 0.03320129260812799, + "language_loss": 0.89026552, + "learning_rate": 0.0004735312818030556, + "loss": 0.90076458, + "num_input_tokens_seen": 230165520, + "router_z_loss_mlp": 0.41333008, + "step": 2762, + "time_per_iteration": 2.6917500495910645 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045201, + "balance_loss_mlp": 1.00390708, + "epoch": 0.5315505963832243, + "flos": 509446553088.0, + "grad_norm": 0.032512052220750494, + "language_loss": 0.8324827, + "learning_rate": 0.0004732201814862727, + "loss": 0.84293473, + "num_input_tokens_seen": 230237808, + "router_z_loss_mlp": 0.41308594, + "step": 2763, + "time_per_iteration": 2.7620086669921875 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045981, + "balance_loss_mlp": 1.00461555, + "epoch": 0.5317429780684879, + "flos": 627669758208.0, + "grad_norm": 0.03302669202039023, + "language_loss": 0.81508183, + "learning_rate": 0.0004729090915663373, + "loss": 0.82554156, + "num_input_tokens_seen": 230321568, + "router_z_loss_mlp": 0.41381836, + "step": 2764, + "time_per_iteration": 2.827430248260498 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044748, + "balance_loss_mlp": 1.00333464, + "epoch": 0.5319353597537514, + "flos": 477699705600.0, + "grad_norm": 0.039772813062738895, + "language_loss": 0.85676539, + "learning_rate": 0.00047259801216402534, + "loss": 0.86721289, + "num_input_tokens_seen": 230385376, + "router_z_loss_mlp": 0.41430664, + "step": 2765, + "time_per_iteration": 2.5082104206085205 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104674, + "balance_loss_mlp": 1.00535059, + "epoch": 0.532127741439015, + "flos": 502634284032.0, + "grad_norm": 0.03926492526470634, + "language_loss": 0.86841261, + "learning_rate": 0.00047228694340010845, + "loss": 0.87888008, + "num_input_tokens_seen": 230449760, + "router_z_loss_mlp": 0.4140625, + "step": 2766, + "time_per_iteration": 2.549739360809326 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047042, + "balance_loss_mlp": 1.00555718, + "epoch": 0.5323201231242786, + "flos": 1166484510720.0, + "grad_norm": 0.033303639033777616, + "language_loss": 0.86118937, + "learning_rate": 0.0004719758853953544, + "loss": 0.87165976, + "num_input_tokens_seen": 230536592, + "router_z_loss_mlp": 0.41503906, + "step": 2767, + "time_per_iteration": 3.5872445106506348 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050417, + "balance_loss_mlp": 1.00888503, + "epoch": 0.5325125048095422, + "flos": 379541977344.0, + "grad_norm": 0.045646551162954616, + "language_loss": 0.84812796, + "learning_rate": 0.00047166483827052645, + "loss": 0.85863209, + "num_input_tokens_seen": 230596688, + "router_z_loss_mlp": 0.41552734, + "step": 2768, + "time_per_iteration": 2.4177846908569336 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01057545, + "balance_loss_mlp": 1.01796722, + "epoch": 0.5327048864948057, + "flos": 1544750147328.0, + "grad_norm": 0.015563445131555704, + "language_loss": 0.77078491, + "learning_rate": 0.00047135380214638413, + "loss": 0.78136033, + "num_input_tokens_seen": 230829408, + "router_z_loss_mlp": 0.39550781, + "step": 2769, + "time_per_iteration": 4.974437236785889 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045958, + "balance_loss_mlp": 1.00447309, + "epoch": 0.5328972681800692, + "flos": 912862586112.0, + "grad_norm": 0.03252924413682995, + "language_loss": 0.84066141, + "learning_rate": 0.000471042777143682, + "loss": 0.85112101, + "num_input_tokens_seen": 230912528, + "router_z_loss_mlp": 0.41503906, + "step": 2770, + "time_per_iteration": 3.204782724380493 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104834, + "balance_loss_mlp": 1.00680697, + "epoch": 0.5330896498653328, + "flos": 474851539200.0, + "grad_norm": 0.03462661973501109, + "language_loss": 0.80093729, + "learning_rate": 0.0004707317633831707, + "loss": 0.81142068, + "num_input_tokens_seen": 230979424, + "router_z_loss_mlp": 0.41552734, + "step": 2771, + "time_per_iteration": 2.566772699356079 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049499, + "balance_loss_mlp": 1.00789511, + "epoch": 0.5332820315505964, + "flos": 502634284032.0, + "grad_norm": 0.03484250248812788, + "language_loss": 0.78787035, + "learning_rate": 0.00047042076098559673, + "loss": 0.79836535, + "num_input_tokens_seen": 231046416, + "router_z_loss_mlp": 0.41625977, + "step": 2772, + "time_per_iteration": 2.5929906368255615 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046414, + "balance_loss_mlp": 1.00454724, + "epoch": 0.53347441323586, + "flos": 926033168640.0, + "grad_norm": 0.038112679556298976, + "language_loss": 0.74248701, + "learning_rate": 0.00047010977007170174, + "loss": 0.75295115, + "num_input_tokens_seen": 231136064, + "router_z_loss_mlp": 0.41894531, + "step": 2773, + "time_per_iteration": 3.221947193145752 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051169, + "balance_loss_mlp": 1.00956452, + "epoch": 0.5336667949211235, + "flos": 575540538624.0, + "grad_norm": 0.03388488907034337, + "language_loss": 0.83005095, + "learning_rate": 0.00046979879076222334, + "loss": 0.8405627, + "num_input_tokens_seen": 231203616, + "router_z_loss_mlp": 0.41625977, + "step": 2774, + "time_per_iteration": 2.7014822959899902 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049211, + "balance_loss_mlp": 1.00767875, + "epoch": 0.533859176606387, + "flos": 1066392363264.0, + "grad_norm": 0.03095569704566717, + "language_loss": 0.85300922, + "learning_rate": 0.0004694878231778939, + "loss": 0.86350143, + "num_input_tokens_seen": 231287008, + "router_z_loss_mlp": 0.41552734, + "step": 2775, + "time_per_iteration": 3.368795156478882 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048588, + "balance_loss_mlp": 1.00700808, + "epoch": 0.5340515582916506, + "flos": 747907095552.0, + "grad_norm": 0.030429614039409136, + "language_loss": 0.84799051, + "learning_rate": 0.0004691768674394423, + "loss": 0.8584764, + "num_input_tokens_seen": 231365296, + "router_z_loss_mlp": 0.41601562, + "step": 2776, + "time_per_iteration": 2.958280324935913 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052456, + "balance_loss_mlp": 1.01230621, + "epoch": 0.5342439399769142, + "flos": 1448821379328.0, + "grad_norm": 0.012202915272427423, + "language_loss": 0.84484011, + "learning_rate": 0.0004688659236675918, + "loss": 0.85536468, + "num_input_tokens_seen": 231579040, + "router_z_loss_mlp": 0.40136719, + "step": 2777, + "time_per_iteration": 4.774897575378418 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049236, + "balance_loss_mlp": 1.00908661, + "epoch": 0.5344363216621778, + "flos": 1430699069952.0, + "grad_norm": 0.005918596107012712, + "language_loss": 0.76653534, + "learning_rate": 0.00046855499198306187, + "loss": 0.77702767, + "num_input_tokens_seen": 231812736, + "router_z_loss_mlp": 0.40136719, + "step": 2778, + "time_per_iteration": 4.978635549545288 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051694, + "balance_loss_mlp": 1.01039958, + "epoch": 0.5346287033474413, + "flos": 528676050432.0, + "grad_norm": 0.029867236989907914, + "language_loss": 0.79874206, + "learning_rate": 0.00046824407250656676, + "loss": 0.80925894, + "num_input_tokens_seen": 231883840, + "router_z_loss_mlp": 0.41308594, + "step": 2779, + "time_per_iteration": 2.610321044921875 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049124, + "balance_loss_mlp": 1.00790143, + "epoch": 0.5348210850327049, + "flos": 511756193280.0, + "grad_norm": 0.03028632537310572, + "language_loss": 0.83974576, + "learning_rate": 0.0004679331653588161, + "loss": 0.85023701, + "num_input_tokens_seen": 231955360, + "router_z_loss_mlp": 0.41235352, + "step": 2780, + "time_per_iteration": 2.641401529312134 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046669, + "balance_loss_mlp": 1.00530362, + "epoch": 0.5350134667179685, + "flos": 463626069504.0, + "grad_norm": 0.032724184133620285, + "language_loss": 0.86073065, + "learning_rate": 0.0004676222706605147, + "loss": 0.87119734, + "num_input_tokens_seen": 232027088, + "router_z_loss_mlp": 0.41381836, + "step": 2781, + "time_per_iteration": 2.6093719005584717 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046994, + "balance_loss_mlp": 1.005795, + "epoch": 0.535205848403232, + "flos": 710118829824.0, + "grad_norm": 0.033538440780340566, + "language_loss": 0.85521388, + "learning_rate": 0.0004673113885323626, + "loss": 0.86568379, + "num_input_tokens_seen": 232099472, + "router_z_loss_mlp": 0.41210938, + "step": 2782, + "time_per_iteration": 2.8278369903564453 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044719, + "balance_loss_mlp": 1.00337684, + "epoch": 0.5353982300884956, + "flos": 895793029632.0, + "grad_norm": 0.03115315889801346, + "language_loss": 0.79367262, + "learning_rate": 0.00046700051909505494, + "loss": 0.80411977, + "num_input_tokens_seen": 232182528, + "router_z_loss_mlp": 0.41357422, + "step": 2783, + "time_per_iteration": 3.181025743484497 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045329, + "balance_loss_mlp": 1.00410628, + "epoch": 0.5355906117737591, + "flos": 537025163520.0, + "grad_norm": 0.03272022966866855, + "language_loss": 0.84359205, + "learning_rate": 0.000466689662469282, + "loss": 0.85404533, + "num_input_tokens_seen": 232253344, + "router_z_loss_mlp": 0.41235352, + "step": 2784, + "time_per_iteration": 2.623128890991211 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045278, + "balance_loss_mlp": 1.00419891, + "epoch": 0.5357829934590227, + "flos": 870328673280.0, + "grad_norm": 0.0344669350963294, + "language_loss": 0.84610772, + "learning_rate": 0.00046637881877572917, + "loss": 0.85656047, + "num_input_tokens_seen": 232337232, + "router_z_loss_mlp": 0.41088867, + "step": 2785, + "time_per_iteration": 3.079174757003784 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010433, + "balance_loss_mlp": 1.00229168, + "epoch": 0.5359753751442863, + "flos": 554446608384.0, + "grad_norm": 0.028858393123854686, + "language_loss": 0.85135722, + "learning_rate": 0.0004660679881350764, + "loss": 0.86179018, + "num_input_tokens_seen": 232412864, + "router_z_loss_mlp": 0.41015625, + "step": 2786, + "time_per_iteration": 2.7473020553588867 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01041367, + "balance_loss_mlp": 1.00150299, + "epoch": 0.5361677568295499, + "flos": 1483759533312.0, + "grad_norm": 0.0067453290840893895, + "language_loss": 0.75608146, + "learning_rate": 0.0004657571706679988, + "loss": 0.76649511, + "num_input_tokens_seen": 232639888, + "router_z_loss_mlp": 0.3984375, + "step": 2787, + "time_per_iteration": 5.041473627090454 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043788, + "balance_loss_mlp": 1.0027802, + "epoch": 0.5363601385148133, + "flos": 807642767616.0, + "grad_norm": 0.03504389904677532, + "language_loss": 0.78613555, + "learning_rate": 0.0004654463664951667, + "loss": 0.79657346, + "num_input_tokens_seen": 232719248, + "router_z_loss_mlp": 0.41015625, + "step": 2788, + "time_per_iteration": 2.9798529148101807 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048738, + "balance_loss_mlp": 1.00775349, + "epoch": 0.5365525202000769, + "flos": 508879836672.0, + "grad_norm": 0.03320853792290129, + "language_loss": 0.8327626, + "learning_rate": 0.0004651355757372447, + "loss": 0.84325004, + "num_input_tokens_seen": 232788464, + "router_z_loss_mlp": 0.40991211, + "step": 2789, + "time_per_iteration": 2.643827438354492 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048286, + "balance_loss_mlp": 1.00720656, + "epoch": 0.5367449018853405, + "flos": 530015563008.0, + "grad_norm": 0.032066447391342436, + "language_loss": 0.8626231, + "learning_rate": 0.00046482479851489274, + "loss": 0.87310588, + "num_input_tokens_seen": 232859792, + "router_z_loss_mlp": 0.41088867, + "step": 2790, + "time_per_iteration": 2.7637765407562256 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046715, + "balance_loss_mlp": 1.0056597, + "epoch": 0.5369372835706041, + "flos": 651217191936.0, + "grad_norm": 0.038515792328953954, + "language_loss": 0.78515691, + "learning_rate": 0.00046451403494876525, + "loss": 0.79562402, + "num_input_tokens_seen": 232941472, + "router_z_loss_mlp": 0.41064453, + "step": 2791, + "time_per_iteration": 2.9090025424957275 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046127, + "balance_loss_mlp": 1.00504696, + "epoch": 0.5371296652558677, + "flos": 585628684800.0, + "grad_norm": 0.03231753899308558, + "language_loss": 0.84747189, + "learning_rate": 0.0004642032851595111, + "loss": 0.85793316, + "num_input_tokens_seen": 233017120, + "router_z_loss_mlp": 0.41088867, + "step": 2792, + "time_per_iteration": 2.775444507598877 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048329, + "balance_loss_mlp": 1.00717819, + "epoch": 0.5373220469411312, + "flos": 597084533760.0, + "grad_norm": 0.03483653357210067, + "language_loss": 0.85361469, + "learning_rate": 0.00046389254926777404, + "loss": 0.86409795, + "num_input_tokens_seen": 233095408, + "router_z_loss_mlp": 0.41162109, + "step": 2793, + "time_per_iteration": 2.8168118000030518 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045027, + "balance_loss_mlp": 1.00378096, + "epoch": 0.5375144286263948, + "flos": 1116279016704.0, + "grad_norm": 0.03171846878783484, + "language_loss": 0.78282589, + "learning_rate": 0.0004635818273941926, + "loss": 0.79327619, + "num_input_tokens_seen": 233191056, + "router_z_loss_mlp": 0.41259766, + "step": 2794, + "time_per_iteration": 3.5206284523010254 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044308, + "balance_loss_mlp": 1.00301409, + "epoch": 0.5377068103116583, + "flos": 596769583872.0, + "grad_norm": 0.0416500636560626, + "language_loss": 0.82705241, + "learning_rate": 0.0004632711196593997, + "loss": 0.83749551, + "num_input_tokens_seen": 233265536, + "router_z_loss_mlp": 0.41308594, + "step": 2795, + "time_per_iteration": 2.81925892829895 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010512, + "balance_loss_mlp": 1.0100255, + "epoch": 0.5378991919969219, + "flos": 885650448384.0, + "grad_norm": 0.03764518727969069, + "language_loss": 0.85939819, + "learning_rate": 0.00046296042618402297, + "loss": 0.86991024, + "num_input_tokens_seen": 233348224, + "router_z_loss_mlp": 0.41186523, + "step": 2796, + "time_per_iteration": 3.076819658279419 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047791, + "balance_loss_mlp": 1.00666356, + "epoch": 0.5380915736821854, + "flos": 711951181824.0, + "grad_norm": 0.02842771896049368, + "language_loss": 0.79539001, + "learning_rate": 0.0004626497470886839, + "loss": 0.80586791, + "num_input_tokens_seen": 233429344, + "router_z_loss_mlp": 0.41137695, + "step": 2797, + "time_per_iteration": 2.9846107959747314 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049616, + "balance_loss_mlp": 1.00844073, + "epoch": 0.538283955367449, + "flos": 558115203072.0, + "grad_norm": 0.029565541443496178, + "language_loss": 0.82388103, + "learning_rate": 0.00046233908249399897, + "loss": 0.83437717, + "num_input_tokens_seen": 233504944, + "router_z_loss_mlp": 0.41186523, + "step": 2798, + "time_per_iteration": 2.7782254219055176 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053826, + "balance_loss_mlp": 1.01255548, + "epoch": 0.5384763370527126, + "flos": 514482850560.0, + "grad_norm": 0.03320479864481119, + "language_loss": 0.78804994, + "learning_rate": 0.00046202843252057905, + "loss": 0.79858828, + "num_input_tokens_seen": 233573072, + "router_z_loss_mlp": 0.4128418, + "step": 2799, + "time_per_iteration": 2.60296368598938 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051127, + "balance_loss_mlp": 1.00985634, + "epoch": 0.5386687187379762, + "flos": 490720588800.0, + "grad_norm": 0.036707180351256564, + "language_loss": 0.84230787, + "learning_rate": 0.00046171779728902896, + "loss": 0.8528192, + "num_input_tokens_seen": 233640896, + "router_z_loss_mlp": 0.4128418, + "step": 2800, + "time_per_iteration": 2.5585505962371826 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046392, + "balance_loss_mlp": 1.00514555, + "epoch": 0.5388611004232398, + "flos": 483628363008.0, + "grad_norm": 0.04683117604826235, + "language_loss": 0.86678994, + "learning_rate": 0.000461407176919948, + "loss": 0.87725389, + "num_input_tokens_seen": 233703904, + "router_z_loss_mlp": 0.41259766, + "step": 2801, + "time_per_iteration": 2.5158677101135254 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045703, + "balance_loss_mlp": 1.00440919, + "epoch": 0.5390534821085032, + "flos": 562089999360.0, + "grad_norm": 0.033429611400543416, + "language_loss": 0.85806906, + "learning_rate": 0.00046109657153392997, + "loss": 0.8685261, + "num_input_tokens_seen": 233779248, + "router_z_loss_mlp": 0.41308594, + "step": 2802, + "time_per_iteration": 2.685462236404419 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047109, + "balance_loss_mlp": 1.00591016, + "epoch": 0.5392458637937668, + "flos": 489361634304.0, + "grad_norm": 0.036955437438287664, + "language_loss": 0.83497781, + "learning_rate": 0.0004607859812515622, + "loss": 0.84544891, + "num_input_tokens_seen": 233847520, + "router_z_loss_mlp": 0.41210938, + "step": 2803, + "time_per_iteration": 2.6187045574188232 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054182, + "balance_loss_mlp": 1.01300752, + "epoch": 0.5394382454790304, + "flos": 513050019072.0, + "grad_norm": 0.03744234433888121, + "language_loss": 0.88279247, + "learning_rate": 0.00046047540619342667, + "loss": 0.89333427, + "num_input_tokens_seen": 233911328, + "router_z_loss_mlp": 0.41186523, + "step": 2804, + "time_per_iteration": 2.5895795822143555 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046381, + "balance_loss_mlp": 1.00525355, + "epoch": 0.539630627164294, + "flos": 568689385728.0, + "grad_norm": 0.033797229327163864, + "language_loss": 0.80605161, + "learning_rate": 0.00046016484648009933, + "loss": 0.81651545, + "num_input_tokens_seen": 233987104, + "router_z_loss_mlp": 0.41137695, + "step": 2805, + "time_per_iteration": 2.691092014312744 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047278, + "balance_loss_mlp": 1.00612748, + "epoch": 0.5398230088495575, + "flos": 527503733760.0, + "grad_norm": 0.03721333567310717, + "language_loss": 0.8141259, + "learning_rate": 0.0004598543022321501, + "loss": 0.82459861, + "num_input_tokens_seen": 234057216, + "router_z_loss_mlp": 0.41162109, + "step": 2806, + "time_per_iteration": 2.6083474159240723 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044352, + "balance_loss_mlp": 1.00312901, + "epoch": 0.5400153905348211, + "flos": 539853888000.0, + "grad_norm": 0.03209862982455251, + "language_loss": 0.80560988, + "learning_rate": 0.0004595437735701433, + "loss": 0.81605339, + "num_input_tokens_seen": 234129984, + "router_z_loss_mlp": 0.41235352, + "step": 2807, + "time_per_iteration": 2.688770055770874 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104467, + "balance_loss_mlp": 1.00354242, + "epoch": 0.5402077722200846, + "flos": 514665597696.0, + "grad_norm": 0.03651112385557252, + "language_loss": 0.83778703, + "learning_rate": 0.00045923326061463623, + "loss": 0.84823376, + "num_input_tokens_seen": 234203920, + "router_z_loss_mlp": 0.41137695, + "step": 2808, + "time_per_iteration": 2.761165142059326 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046543, + "balance_loss_mlp": 1.00534451, + "epoch": 0.5404001539053482, + "flos": 677567105280.0, + "grad_norm": 0.031915220360544935, + "language_loss": 0.81941223, + "learning_rate": 0.00045892276348618113, + "loss": 0.82987767, + "num_input_tokens_seen": 234285440, + "router_z_loss_mlp": 0.41210938, + "step": 2809, + "time_per_iteration": 2.9716503620147705 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105954, + "balance_loss_mlp": 1.01948547, + "epoch": 0.5405925355906118, + "flos": 1558191938304.0, + "grad_norm": 0.009079850654737754, + "language_loss": 0.78260827, + "learning_rate": 0.0004586122823053235, + "loss": 0.79320371, + "num_input_tokens_seen": 234521424, + "router_z_loss_mlp": 0.40039062, + "step": 2810, + "time_per_iteration": 4.989593029022217 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051922, + "balance_loss_mlp": 1.01069915, + "epoch": 0.5407849172758753, + "flos": 648538166784.0, + "grad_norm": 0.030063831285765737, + "language_loss": 0.81372178, + "learning_rate": 0.000458301817192603, + "loss": 0.82424104, + "num_input_tokens_seen": 234601632, + "router_z_loss_mlp": 0.41235352, + "step": 2811, + "time_per_iteration": 2.855461359024048 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063999, + "balance_loss_mlp": 1.02404022, + "epoch": 0.5409772989611389, + "flos": 1410483893760.0, + "grad_norm": 0.010433444863556941, + "language_loss": 0.8084178, + "learning_rate": 0.00045799136826855263, + "loss": 0.81905782, + "num_input_tokens_seen": 234825776, + "router_z_loss_mlp": 0.39941406, + "step": 2812, + "time_per_iteration": 4.82320761680603 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048585, + "balance_loss_mlp": 1.00748193, + "epoch": 0.5411696806464025, + "flos": 555545048064.0, + "grad_norm": 0.0337189850887645, + "language_loss": 0.87703073, + "learning_rate": 0.00045768093565369983, + "loss": 0.88751662, + "num_input_tokens_seen": 234901504, + "router_z_loss_mlp": 0.41113281, + "step": 2813, + "time_per_iteration": 2.7693569660186768 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047899, + "balance_loss_mlp": 1.00660491, + "epoch": 0.5413620623316661, + "flos": 529205828352.0, + "grad_norm": 0.032417929995103685, + "language_loss": 0.82523155, + "learning_rate": 0.0004573705194685646, + "loss": 0.83571053, + "num_input_tokens_seen": 234970288, + "router_z_loss_mlp": 0.41308594, + "step": 2814, + "time_per_iteration": 2.6525402069091797 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047552, + "balance_loss_mlp": 1.00637758, + "epoch": 0.5415544440169295, + "flos": 599852020224.0, + "grad_norm": 0.03532378336462207, + "language_loss": 0.85743833, + "learning_rate": 0.00045706011983366157, + "loss": 0.86791384, + "num_input_tokens_seen": 235039984, + "router_z_loss_mlp": 0.41186523, + "step": 2815, + "time_per_iteration": 2.67850661277771 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049782, + "balance_loss_mlp": 1.0085113, + "epoch": 0.5417468257021931, + "flos": 471714667776.0, + "grad_norm": 0.039926593194372036, + "language_loss": 0.83561838, + "learning_rate": 0.00045674973686949847, + "loss": 0.84611619, + "num_input_tokens_seen": 235105232, + "router_z_loss_mlp": 0.4128418, + "step": 2816, + "time_per_iteration": 2.56265926361084 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049575, + "balance_loss_mlp": 1.00839996, + "epoch": 0.5419392073874567, + "flos": 682191243264.0, + "grad_norm": 0.04027281254885066, + "language_loss": 0.85790694, + "learning_rate": 0.0004564393706965766, + "loss": 0.86840272, + "num_input_tokens_seen": 235192560, + "router_z_loss_mlp": 0.41186523, + "step": 2817, + "time_per_iteration": 2.955655574798584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048253, + "balance_loss_mlp": 1.00700641, + "epoch": 0.5421315890727203, + "flos": 463337364480.0, + "grad_norm": 0.033241337033607515, + "language_loss": 0.82050943, + "learning_rate": 0.00045612902143539116, + "loss": 0.83099198, + "num_input_tokens_seen": 235258448, + "router_z_loss_mlp": 0.41259766, + "step": 2818, + "time_per_iteration": 2.546567440032959 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043229, + "balance_loss_mlp": 1.0021013, + "epoch": 0.5423239707579839, + "flos": 437890504704.0, + "grad_norm": 0.03727551718578137, + "language_loss": 0.82264733, + "learning_rate": 0.00045581868920642986, + "loss": 0.83307964, + "num_input_tokens_seen": 235322176, + "router_z_loss_mlp": 0.41137695, + "step": 2819, + "time_per_iteration": 2.4746038913726807 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043245, + "balance_loss_mlp": 1.00197434, + "epoch": 0.5425163524432474, + "flos": 459306187776.0, + "grad_norm": 0.035271404401503774, + "language_loss": 0.80009091, + "learning_rate": 0.00045550837413017457, + "loss": 0.81052339, + "num_input_tokens_seen": 235390960, + "router_z_loss_mlp": 0.4128418, + "step": 2820, + "time_per_iteration": 2.598879098892212 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044733, + "balance_loss_mlp": 1.00348616, + "epoch": 0.542708734128511, + "flos": 420410734080.0, + "grad_norm": 0.029285477013781286, + "language_loss": 0.8579312, + "learning_rate": 0.0004551980763271005, + "loss": 0.86837852, + "num_input_tokens_seen": 235460976, + "router_z_loss_mlp": 0.41259766, + "step": 2821, + "time_per_iteration": 2.650609254837036 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050376, + "balance_loss_mlp": 1.00917685, + "epoch": 0.5429011158137745, + "flos": 679709549568.0, + "grad_norm": 0.038877958454501954, + "language_loss": 0.84286433, + "learning_rate": 0.0004548877959176756, + "loss": 0.8533681, + "num_input_tokens_seen": 235540912, + "router_z_loss_mlp": 0.41210938, + "step": 2822, + "time_per_iteration": 2.831773042678833 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049844, + "balance_loss_mlp": 1.00857341, + "epoch": 0.5430934974990381, + "flos": 541968142080.0, + "grad_norm": 0.03541809911924704, + "language_loss": 0.8707608, + "learning_rate": 0.00045457753302236166, + "loss": 0.8812592, + "num_input_tokens_seen": 235608736, + "router_z_loss_mlp": 0.4128418, + "step": 2823, + "time_per_iteration": 2.609090805053711 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048491, + "balance_loss_mlp": 1.00726891, + "epoch": 0.5432858791843016, + "flos": 659644072704.0, + "grad_norm": 0.03671475643697152, + "language_loss": 0.87739956, + "learning_rate": 0.00045426728776161353, + "loss": 0.8878845, + "num_input_tokens_seen": 235678720, + "router_z_loss_mlp": 0.41235352, + "step": 2824, + "time_per_iteration": 2.802915334701538 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046992, + "balance_loss_mlp": 1.00574553, + "epoch": 0.5434782608695652, + "flos": 532967741952.0, + "grad_norm": 0.03427907044877429, + "language_loss": 0.82057846, + "learning_rate": 0.00045395706025587863, + "loss": 0.83104837, + "num_input_tokens_seen": 235748704, + "router_z_loss_mlp": 0.41259766, + "step": 2825, + "time_per_iteration": 2.6308939456939697 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043048, + "balance_loss_mlp": 1.00194418, + "epoch": 0.5436706425548288, + "flos": 609633964800.0, + "grad_norm": 0.034616126048734014, + "language_loss": 0.8290934, + "learning_rate": 0.00045364685062559843, + "loss": 0.83952391, + "num_input_tokens_seen": 235828224, + "router_z_loss_mlp": 0.41113281, + "step": 2826, + "time_per_iteration": 2.8231375217437744 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047689, + "balance_loss_mlp": 1.006657, + "epoch": 0.5438630242400924, + "flos": 706773933312.0, + "grad_norm": 0.03098010756730768, + "language_loss": 0.92170852, + "learning_rate": 0.0004533366589912067, + "loss": 0.93218541, + "num_input_tokens_seen": 235909392, + "router_z_loss_mlp": 0.41040039, + "step": 2827, + "time_per_iteration": 2.9529805183410645 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105042, + "balance_loss_mlp": 1.00912547, + "epoch": 0.544055405925356, + "flos": 857839513344.0, + "grad_norm": 0.036966152235284246, + "language_loss": 0.78087002, + "learning_rate": 0.0004530264854731306, + "loss": 0.79137421, + "num_input_tokens_seen": 235983888, + "router_z_loss_mlp": 0.41308594, + "step": 2828, + "time_per_iteration": 3.0584123134613037 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050357, + "balance_loss_mlp": 1.00913441, + "epoch": 0.5442477876106194, + "flos": 572968438272.0, + "grad_norm": 0.03388858680916364, + "language_loss": 0.84792554, + "learning_rate": 0.00045271633019179034, + "loss": 0.85842907, + "num_input_tokens_seen": 236063056, + "router_z_loss_mlp": 0.41235352, + "step": 2829, + "time_per_iteration": 2.827160596847534 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046647, + "balance_loss_mlp": 1.00532901, + "epoch": 0.544440169295883, + "flos": 626803643136.0, + "grad_norm": 0.02947280635893411, + "language_loss": 0.88373405, + "learning_rate": 0.0004524061932675986, + "loss": 0.89420056, + "num_input_tokens_seen": 236141104, + "router_z_loss_mlp": 0.41333008, + "step": 2830, + "time_per_iteration": 2.8206188678741455 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048863, + "balance_loss_mlp": 1.00768852, + "epoch": 0.5446325509811466, + "flos": 837641833728.0, + "grad_norm": 0.03760239902604625, + "language_loss": 0.87454915, + "learning_rate": 0.00045209607482096125, + "loss": 0.88503784, + "num_input_tokens_seen": 236220320, + "router_z_loss_mlp": 0.41186523, + "step": 2831, + "time_per_iteration": 3.0359649658203125 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047225, + "balance_loss_mlp": 1.00600255, + "epoch": 0.5448249326664102, + "flos": 484390465536.0, + "grad_norm": 0.03560900416786153, + "language_loss": 0.8480038, + "learning_rate": 0.0004517859749722772, + "loss": 0.85847604, + "num_input_tokens_seen": 236288208, + "router_z_loss_mlp": 0.41235352, + "step": 2832, + "time_per_iteration": 2.689295768737793 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050991, + "balance_loss_mlp": 1.00972044, + "epoch": 0.5450173143516738, + "flos": 562346623488.0, + "grad_norm": 0.03426430427633819, + "language_loss": 0.79531574, + "learning_rate": 0.0004514758938419376, + "loss": 0.80582559, + "num_input_tokens_seen": 236366864, + "router_z_loss_mlp": 0.4128418, + "step": 2833, + "time_per_iteration": 2.8727176189422607 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049419, + "balance_loss_mlp": 1.00965118, + "epoch": 0.5452096960369373, + "flos": 1473588761856.0, + "grad_norm": 0.014550980978032766, + "language_loss": 0.76920587, + "learning_rate": 0.0004511658315503268, + "loss": 0.77970004, + "num_input_tokens_seen": 236597120, + "router_z_loss_mlp": 0.39746094, + "step": 2834, + "time_per_iteration": 4.9399590492248535 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046862, + "balance_loss_mlp": 1.00556791, + "epoch": 0.5454020777222008, + "flos": 466018334976.0, + "grad_norm": 0.03248736316688099, + "language_loss": 0.84558713, + "learning_rate": 0.00045085578821782175, + "loss": 0.85605574, + "num_input_tokens_seen": 236664192, + "router_z_loss_mlp": 0.41308594, + "step": 2835, + "time_per_iteration": 2.5900182723999023 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01057518, + "balance_loss_mlp": 1.01784515, + "epoch": 0.5455944594074644, + "flos": 1472617667328.0, + "grad_norm": 0.013168056581512213, + "language_loss": 0.76134741, + "learning_rate": 0.0004505457639647917, + "loss": 0.77192259, + "num_input_tokens_seen": 236888784, + "router_z_loss_mlp": 0.39648438, + "step": 2836, + "time_per_iteration": 4.910645961761475 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052223, + "balance_loss_mlp": 1.01100063, + "epoch": 0.545786841092728, + "flos": 534305309184.0, + "grad_norm": 0.02738620901632673, + "language_loss": 0.81102663, + "learning_rate": 0.00045023575891159866, + "loss": 0.82154894, + "num_input_tokens_seen": 236962528, + "router_z_loss_mlp": 0.41235352, + "step": 2837, + "time_per_iteration": 2.7457492351531982 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046684, + "balance_loss_mlp": 1.00682068, + "epoch": 0.5459792227779915, + "flos": 1355428740096.0, + "grad_norm": 0.008010480990562174, + "language_loss": 0.74763811, + "learning_rate": 0.00044992577317859764, + "loss": 0.75810492, + "num_input_tokens_seen": 237179360, + "router_z_loss_mlp": 0.3984375, + "step": 2838, + "time_per_iteration": 4.94202995300293 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048729, + "balance_loss_mlp": 1.00748277, + "epoch": 0.5461716044632551, + "flos": 639073117440.0, + "grad_norm": 0.02877585305336934, + "language_loss": 0.78956163, + "learning_rate": 0.0004496158068861354, + "loss": 0.80004895, + "num_input_tokens_seen": 237256240, + "router_z_loss_mlp": 0.41259766, + "step": 2839, + "time_per_iteration": 2.808370590209961 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047651, + "balance_loss_mlp": 1.00642872, + "epoch": 0.5463639861485187, + "flos": 603926938368.0, + "grad_norm": 0.03433602558833516, + "language_loss": 0.81297666, + "learning_rate": 0.00044930586015455207, + "loss": 0.82345319, + "num_input_tokens_seen": 237334272, + "router_z_loss_mlp": 0.41235352, + "step": 2840, + "time_per_iteration": 2.782735824584961 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048126, + "balance_loss_mlp": 1.00695133, + "epoch": 0.5465563678337823, + "flos": 643753635840.0, + "grad_norm": 0.02662038136573285, + "language_loss": 0.89087546, + "learning_rate": 0.000448995933104179, + "loss": 0.9013567, + "num_input_tokens_seen": 237415408, + "router_z_loss_mlp": 0.41186523, + "step": 2841, + "time_per_iteration": 2.869476318359375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050304, + "balance_loss_mlp": 1.0090816, + "epoch": 0.5467487495190458, + "flos": 615365290752.0, + "grad_norm": 0.03719587304070891, + "language_loss": 0.80725658, + "learning_rate": 0.00044868602585534077, + "loss": 0.81775963, + "num_input_tokens_seen": 237493232, + "router_z_loss_mlp": 0.41235352, + "step": 2842, + "time_per_iteration": 2.843027353286743 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046698, + "balance_loss_mlp": 1.00552344, + "epoch": 0.5469411312043093, + "flos": 462128109312.0, + "grad_norm": 0.03959126806850753, + "language_loss": 0.89450765, + "learning_rate": 0.0004483761385283541, + "loss": 0.90497464, + "num_input_tokens_seen": 237556624, + "router_z_loss_mlp": 0.41186523, + "step": 2843, + "time_per_iteration": 2.5162315368652344 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044431, + "balance_loss_mlp": 1.00332797, + "epoch": 0.5471335128895729, + "flos": 562267888896.0, + "grad_norm": 0.03475490738980998, + "language_loss": 0.82207608, + "learning_rate": 0.0004480662712435281, + "loss": 0.83252037, + "num_input_tokens_seen": 237632048, + "router_z_loss_mlp": 0.41113281, + "step": 2844, + "time_per_iteration": 2.7367589473724365 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045579, + "balance_loss_mlp": 1.0045476, + "epoch": 0.5473258945748365, + "flos": 519686343936.0, + "grad_norm": 0.032685207895773144, + "language_loss": 0.8903448, + "learning_rate": 0.0004477564241211635, + "loss": 0.90080059, + "num_input_tokens_seen": 237699840, + "router_z_loss_mlp": 0.41040039, + "step": 2845, + "time_per_iteration": 2.6059961318969727 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047529, + "balance_loss_mlp": 1.00640178, + "epoch": 0.5475182762601001, + "flos": 434744884992.0, + "grad_norm": 0.035185291050346845, + "language_loss": 0.87463105, + "learning_rate": 0.0004474465972815541, + "loss": 0.88510644, + "num_input_tokens_seen": 237762560, + "router_z_loss_mlp": 0.41137695, + "step": 2846, + "time_per_iteration": 2.5159108638763428 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049143, + "balance_loss_mlp": 1.00808775, + "epoch": 0.5477106579453636, + "flos": 512574676224.0, + "grad_norm": 0.03033857724648134, + "language_loss": 0.88145, + "learning_rate": 0.000447136790844985, + "loss": 0.89194143, + "num_input_tokens_seen": 237837152, + "router_z_loss_mlp": 0.41064453, + "step": 2847, + "time_per_iteration": 2.7494916915893555 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049923, + "balance_loss_mlp": 1.00889075, + "epoch": 0.5479030396306271, + "flos": 677141339904.0, + "grad_norm": 0.030728657632270156, + "language_loss": 0.81529921, + "learning_rate": 0.00044682700493173385, + "loss": 0.82579845, + "num_input_tokens_seen": 237909488, + "router_z_loss_mlp": 0.41040039, + "step": 2848, + "time_per_iteration": 2.8558499813079834 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043616, + "balance_loss_mlp": 1.00260758, + "epoch": 0.5480954213158907, + "flos": 877579346688.0, + "grad_norm": 0.03576262257130289, + "language_loss": 0.80969125, + "learning_rate": 0.00044651723966207004, + "loss": 0.82012743, + "num_input_tokens_seen": 237991056, + "router_z_loss_mlp": 0.41015625, + "step": 2849, + "time_per_iteration": 3.1599223613739014 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048693, + "balance_loss_mlp": 1.00768459, + "epoch": 0.5482878030011543, + "flos": 623175877632.0, + "grad_norm": 0.0450385792128453, + "language_loss": 0.79220605, + "learning_rate": 0.00044620749515625536, + "loss": 0.80269301, + "num_input_tokens_seen": 238064576, + "router_z_loss_mlp": 0.41015625, + "step": 2850, + "time_per_iteration": 2.816164255142212 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044849, + "balance_loss_mlp": 1.00376952, + "epoch": 0.5484801846864179, + "flos": 498258021888.0, + "grad_norm": 0.033687612572946876, + "language_loss": 0.85353971, + "learning_rate": 0.00044589777153454334, + "loss": 0.86398828, + "num_input_tokens_seen": 238136464, + "router_z_loss_mlp": 0.41088867, + "step": 2851, + "time_per_iteration": 2.767086982727051 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042735, + "balance_loss_mlp": 1.00158429, + "epoch": 0.5486725663716814, + "flos": 443354512896.0, + "grad_norm": 0.032917884516517996, + "language_loss": 0.84102762, + "learning_rate": 0.00044558806891717895, + "loss": 0.85145497, + "num_input_tokens_seen": 238198912, + "router_z_loss_mlp": 0.41162109, + "step": 2852, + "time_per_iteration": 2.4791274070739746 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046594, + "balance_loss_mlp": 1.00560999, + "epoch": 0.548864948056945, + "flos": 656348753664.0, + "grad_norm": 0.02926310360240776, + "language_loss": 0.80048501, + "learning_rate": 0.0004452783874243998, + "loss": 0.81095093, + "num_input_tokens_seen": 238275184, + "router_z_loss_mlp": 0.40991211, + "step": 2853, + "time_per_iteration": 2.8510489463806152 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051891, + "balance_loss_mlp": 1.01100183, + "epoch": 0.5490573297422086, + "flos": 547141499904.0, + "grad_norm": 0.035598285504377866, + "language_loss": 0.85552013, + "learning_rate": 0.00044496872717643475, + "loss": 0.86603898, + "num_input_tokens_seen": 238348496, + "router_z_loss_mlp": 0.40893555, + "step": 2854, + "time_per_iteration": 2.6640069484710693 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107375, + "balance_loss_mlp": 1.03398132, + "epoch": 0.5492497114274721, + "flos": 1593763882752.0, + "grad_norm": 0.015003928091872471, + "language_loss": 0.77089292, + "learning_rate": 0.00044465908829350453, + "loss": 0.7816304, + "num_input_tokens_seen": 238578464, + "router_z_loss_mlp": 0.39746094, + "step": 2855, + "time_per_iteration": 4.924941778182983 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048775, + "balance_loss_mlp": 1.00791013, + "epoch": 0.5494420931127356, + "flos": 752270718720.0, + "grad_norm": 0.03382110809465603, + "language_loss": 0.82668245, + "learning_rate": 0.0004443494708958217, + "loss": 0.83717024, + "num_input_tokens_seen": 238660256, + "router_z_loss_mlp": 0.40869141, + "step": 2856, + "time_per_iteration": 2.9736838340759277 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049194, + "balance_loss_mlp": 1.00837672, + "epoch": 0.5496344747979992, + "flos": 627305230848.0, + "grad_norm": 0.02827813290363101, + "language_loss": 0.81289691, + "learning_rate": 0.0004440398751035906, + "loss": 0.82338881, + "num_input_tokens_seen": 238745856, + "router_z_loss_mlp": 0.40820312, + "step": 2857, + "time_per_iteration": 2.943936347961426 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053668, + "balance_loss_mlp": 1.01289868, + "epoch": 0.5498268564832628, + "flos": 524125789440.0, + "grad_norm": 0.04150845511788398, + "language_loss": 0.8407867, + "learning_rate": 0.00044373030103700645, + "loss": 0.85132337, + "num_input_tokens_seen": 238813888, + "router_z_loss_mlp": 0.40771484, + "step": 2858, + "time_per_iteration": 2.5977840423583984 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047985, + "balance_loss_mlp": 1.00719178, + "epoch": 0.5500192381685264, + "flos": 605778732288.0, + "grad_norm": 0.03313045470580536, + "language_loss": 0.80440414, + "learning_rate": 0.000443420748816257, + "loss": 0.81488407, + "num_input_tokens_seen": 238885440, + "router_z_loss_mlp": 0.40795898, + "step": 2859, + "time_per_iteration": 2.7645347118377686 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049935, + "balance_loss_mlp": 1.00914145, + "epoch": 0.55021161985379, + "flos": 521655756288.0, + "grad_norm": 0.037659665058523445, + "language_loss": 0.79047614, + "learning_rate": 0.0004431112185615208, + "loss": 0.8009755, + "num_input_tokens_seen": 238960944, + "router_z_loss_mlp": 0.40795898, + "step": 2860, + "time_per_iteration": 2.7862706184387207 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043913, + "balance_loss_mlp": 1.00302446, + "epoch": 0.5504040015390534, + "flos": 490655460096.0, + "grad_norm": 0.03348154415794888, + "language_loss": 0.8037793, + "learning_rate": 0.00044280171039296845, + "loss": 0.8142184, + "num_input_tokens_seen": 239030592, + "router_z_loss_mlp": 0.40893555, + "step": 2861, + "time_per_iteration": 2.6561086177825928 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052487, + "balance_loss_mlp": 1.01166964, + "epoch": 0.550596383224317, + "flos": 576862554624.0, + "grad_norm": 0.03513860333112342, + "language_loss": 0.88868964, + "learning_rate": 0.0004424922244307616, + "loss": 0.89921451, + "num_input_tokens_seen": 239097440, + "router_z_loss_mlp": 0.40820312, + "step": 2862, + "time_per_iteration": 2.7066099643707275 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053826, + "balance_loss_mlp": 1.01298499, + "epoch": 0.5507887649095806, + "flos": 643634072064.0, + "grad_norm": 0.03653258974946179, + "language_loss": 0.82663441, + "learning_rate": 0.00044218276079505315, + "loss": 0.83717263, + "num_input_tokens_seen": 239179872, + "router_z_loss_mlp": 0.40844727, + "step": 2863, + "time_per_iteration": 2.87058162689209 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049384, + "balance_loss_mlp": 1.00856698, + "epoch": 0.5509811465948442, + "flos": 532865674752.0, + "grad_norm": 0.034931125724459874, + "language_loss": 0.75083911, + "learning_rate": 0.0004418733196059876, + "loss": 0.76133299, + "num_input_tokens_seen": 239251264, + "router_z_loss_mlp": 0.40820312, + "step": 2864, + "time_per_iteration": 2.690927743911743 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048154, + "balance_loss_mlp": 1.00719357, + "epoch": 0.5511735282801077, + "flos": 655984226304.0, + "grad_norm": 0.03582782743987034, + "language_loss": 0.80482149, + "learning_rate": 0.0004415639009837008, + "loss": 0.81530309, + "num_input_tokens_seen": 239326688, + "router_z_loss_mlp": 0.40966797, + "step": 2865, + "time_per_iteration": 2.8515002727508545 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050322, + "balance_loss_mlp": 1.00948107, + "epoch": 0.5513659099653713, + "flos": 530610469632.0, + "grad_norm": 0.03216902856467023, + "language_loss": 0.82250589, + "learning_rate": 0.00044125450504831955, + "loss": 0.83300906, + "num_input_tokens_seen": 239401248, + "router_z_loss_mlp": 0.40844727, + "step": 2866, + "time_per_iteration": 2.743833303451538 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053087, + "balance_loss_mlp": 1.01229346, + "epoch": 0.5515582916506349, + "flos": 555974704128.0, + "grad_norm": 0.03636447949545943, + "language_loss": 0.827411, + "learning_rate": 0.0004409451319199622, + "loss": 0.83794183, + "num_input_tokens_seen": 239471600, + "router_z_loss_mlp": 0.40795898, + "step": 2867, + "time_per_iteration": 2.654466390609741 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045403, + "balance_loss_mlp": 1.00439477, + "epoch": 0.5517506733358984, + "flos": 736772999424.0, + "grad_norm": 0.03752588301556939, + "language_loss": 0.85160595, + "learning_rate": 0.0004406357817187381, + "loss": 0.86206001, + "num_input_tokens_seen": 239548592, + "router_z_loss_mlp": 0.41015625, + "step": 2868, + "time_per_iteration": 2.9610273838043213 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051499, + "balance_loss_mlp": 1.01065779, + "epoch": 0.551943055021162, + "flos": 1117190818560.0, + "grad_norm": 0.028811275091252902, + "language_loss": 0.81857193, + "learning_rate": 0.0004403264545647474, + "loss": 0.8290869, + "num_input_tokens_seen": 239644432, + "router_z_loss_mlp": 0.40844727, + "step": 2869, + "time_per_iteration": 3.511462450027466 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043012, + "balance_loss_mlp": 1.00195587, + "epoch": 0.5521354367064255, + "flos": 545502588672.0, + "grad_norm": 0.03184831617373855, + "language_loss": 0.85004073, + "learning_rate": 0.00044001715057808154, + "loss": 0.86047089, + "num_input_tokens_seen": 239723392, + "router_z_loss_mlp": 0.41064453, + "step": 2870, + "time_per_iteration": 2.744248390197754 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048495, + "balance_loss_mlp": 1.00746286, + "epoch": 0.5523278183916891, + "flos": 937872986880.0, + "grad_norm": 0.03348956391566461, + "language_loss": 0.81933939, + "learning_rate": 0.0004397078698788232, + "loss": 0.82982433, + "num_input_tokens_seen": 239806896, + "router_z_loss_mlp": 0.41040039, + "step": 2871, + "time_per_iteration": 3.193040132522583 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052254, + "balance_loss_mlp": 1.01277161, + "epoch": 0.5525202000769527, + "flos": 1469101684224.0, + "grad_norm": 0.00853782264427079, + "language_loss": 0.80442369, + "learning_rate": 0.0004393986125870456, + "loss": 0.81494617, + "num_input_tokens_seen": 240037824, + "router_z_loss_mlp": 0.39453125, + "step": 2872, + "time_per_iteration": 4.887877702713013 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050207, + "balance_loss_mlp": 1.00917542, + "epoch": 0.5527125817622163, + "flos": 490785717504.0, + "grad_norm": 0.036240955421061, + "language_loss": 0.78392744, + "learning_rate": 0.00043908937882281343, + "loss": 0.79442948, + "num_input_tokens_seen": 240107952, + "router_z_loss_mlp": 0.41040039, + "step": 2873, + "time_per_iteration": 2.5992209911346436 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045201, + "balance_loss_mlp": 1.00414526, + "epoch": 0.5529049634474797, + "flos": 636149128704.0, + "grad_norm": 0.03461125376652938, + "language_loss": 0.82969832, + "learning_rate": 0.0004387801687061814, + "loss": 0.84015036, + "num_input_tokens_seen": 240183824, + "router_z_loss_mlp": 0.41064453, + "step": 2874, + "time_per_iteration": 2.8166332244873047 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045216, + "balance_loss_mlp": 1.00408852, + "epoch": 0.5530973451327433, + "flos": 582435432960.0, + "grad_norm": 0.031639900781256135, + "language_loss": 0.81371784, + "learning_rate": 0.0004384709823571958, + "loss": 0.82416999, + "num_input_tokens_seen": 240259296, + "router_z_loss_mlp": 0.41137695, + "step": 2875, + "time_per_iteration": 2.7777786254882812 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045687, + "balance_loss_mlp": 1.00458348, + "epoch": 0.5532897268180069, + "flos": 1124330676480.0, + "grad_norm": 0.03430168550584483, + "language_loss": 0.83714402, + "learning_rate": 0.0004381618198958932, + "loss": 0.84760094, + "num_input_tokens_seen": 240346768, + "router_z_loss_mlp": 0.41113281, + "step": 2876, + "time_per_iteration": 3.517432451248169 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046562, + "balance_loss_mlp": 1.00536335, + "epoch": 0.5534821085032705, + "flos": 638513203968.0, + "grad_norm": 0.03082674119581989, + "language_loss": 0.83886576, + "learning_rate": 0.00043785268144230137, + "loss": 0.84933138, + "num_input_tokens_seen": 240429344, + "router_z_loss_mlp": 0.41210938, + "step": 2877, + "time_per_iteration": 2.9488272666931152 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048169, + "balance_loss_mlp": 1.0069226, + "epoch": 0.5536744901885341, + "flos": 572217029376.0, + "grad_norm": 0.037462471463683845, + "language_loss": 0.8303535, + "learning_rate": 0.00043754356711643837, + "loss": 0.84083521, + "num_input_tokens_seen": 240497008, + "router_z_loss_mlp": 0.41259766, + "step": 2878, + "time_per_iteration": 2.669304370880127 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045925, + "balance_loss_mlp": 1.00479829, + "epoch": 0.5538668718737976, + "flos": 596917337856.0, + "grad_norm": 0.03146432649645385, + "language_loss": 0.84558415, + "learning_rate": 0.0004372344770383132, + "loss": 0.8560434, + "num_input_tokens_seen": 240578432, + "router_z_loss_mlp": 0.41137695, + "step": 2879, + "time_per_iteration": 2.855231761932373 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050888, + "balance_loss_mlp": 1.0097847, + "epoch": 0.5540592535590612, + "flos": 533719150848.0, + "grad_norm": 0.0358528854453713, + "language_loss": 0.83432066, + "learning_rate": 0.00043692541132792507, + "loss": 0.84482956, + "num_input_tokens_seen": 240649136, + "router_z_loss_mlp": 0.41113281, + "step": 2880, + "time_per_iteration": 2.662008047103882 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051879, + "balance_loss_mlp": 1.01070428, + "epoch": 0.5542516352443247, + "flos": 413505146112.0, + "grad_norm": 0.035032849721931915, + "language_loss": 0.83894408, + "learning_rate": 0.00043661637010526384, + "loss": 0.84946287, + "num_input_tokens_seen": 240714240, + "router_z_loss_mlp": 0.41186523, + "step": 2881, + "time_per_iteration": 2.507699489593506 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104607, + "balance_loss_mlp": 1.00484717, + "epoch": 0.5544440169295883, + "flos": 548678343936.0, + "grad_norm": 0.03314086611141918, + "language_loss": 0.83246458, + "learning_rate": 0.00043630735349031025, + "loss": 0.84292531, + "num_input_tokens_seen": 240786928, + "router_z_loss_mlp": 0.41235352, + "step": 2882, + "time_per_iteration": 2.70409893989563 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047409, + "balance_loss_mlp": 1.00623393, + "epoch": 0.5546363986148518, + "flos": 623034926592.0, + "grad_norm": 0.03282028788454341, + "language_loss": 0.82495463, + "learning_rate": 0.00043599836160303495, + "loss": 0.83542871, + "num_input_tokens_seen": 240865328, + "router_z_loss_mlp": 0.41186523, + "step": 2883, + "time_per_iteration": 2.900757312774658 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046682, + "balance_loss_mlp": 1.00550687, + "epoch": 0.5548287803001154, + "flos": 706580492544.0, + "grad_norm": 0.029978122278870225, + "language_loss": 0.78110325, + "learning_rate": 0.0004356893945633995, + "loss": 0.79157007, + "num_input_tokens_seen": 240945680, + "router_z_loss_mlp": 0.41186523, + "step": 2884, + "time_per_iteration": 2.975062608718872 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046147, + "balance_loss_mlp": 1.00501966, + "epoch": 0.555021161985379, + "flos": 505184997120.0, + "grad_norm": 0.033025085572570244, + "language_loss": 0.82143605, + "learning_rate": 0.0004353804524913551, + "loss": 0.83189756, + "num_input_tokens_seen": 241010800, + "router_z_loss_mlp": 0.41137695, + "step": 2885, + "time_per_iteration": 2.6369645595550537 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046205, + "balance_loss_mlp": 1.00512528, + "epoch": 0.5552135436706426, + "flos": 617210281728.0, + "grad_norm": 0.0369840001422722, + "language_loss": 0.82350749, + "learning_rate": 0.0004350715355068441, + "loss": 0.83396947, + "num_input_tokens_seen": 241085328, + "router_z_loss_mlp": 0.41088867, + "step": 2886, + "time_per_iteration": 2.727186441421509 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044964, + "balance_loss_mlp": 1.00393176, + "epoch": 0.5554059253559062, + "flos": 464817828096.0, + "grad_norm": 0.043659618464352824, + "language_loss": 0.80073905, + "learning_rate": 0.00043476264372979847, + "loss": 0.8111887, + "num_input_tokens_seen": 241149600, + "router_z_loss_mlp": 0.41040039, + "step": 2887, + "time_per_iteration": 2.5368049144744873 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044535, + "balance_loss_mlp": 1.00357509, + "epoch": 0.5555983070411696, + "flos": 1564876885248.0, + "grad_norm": 0.03408551435207337, + "language_loss": 0.79322737, + "learning_rate": 0.0004344537772801408, + "loss": 0.80367273, + "num_input_tokens_seen": 241244832, + "router_z_loss_mlp": 0.40966797, + "step": 2888, + "time_per_iteration": 3.869920015335083 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01057491, + "balance_loss_mlp": 1.01791382, + "epoch": 0.5557906887264332, + "flos": 1471229544192.0, + "grad_norm": 0.014769088101488215, + "language_loss": 0.73422456, + "learning_rate": 0.0004341449362777836, + "loss": 0.74479944, + "num_input_tokens_seen": 241479728, + "router_z_loss_mlp": 0.39550781, + "step": 2889, + "time_per_iteration": 4.936699867248535 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047136, + "balance_loss_mlp": 1.00608003, + "epoch": 0.5559830704116968, + "flos": 530864181504.0, + "grad_norm": 0.0376436874687178, + "language_loss": 0.83696067, + "learning_rate": 0.0004338361208426298, + "loss": 0.84743202, + "num_input_tokens_seen": 241545616, + "router_z_loss_mlp": 0.41064453, + "step": 2890, + "time_per_iteration": 2.6094541549682617 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051706, + "balance_loss_mlp": 1.01069844, + "epoch": 0.5561754520969604, + "flos": 652519766016.0, + "grad_norm": 0.029226912064567154, + "language_loss": 0.81876659, + "learning_rate": 0.00043352733109457164, + "loss": 0.82928365, + "num_input_tokens_seen": 241629040, + "router_z_loss_mlp": 0.41015625, + "step": 2891, + "time_per_iteration": 2.8833718299865723 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050883, + "balance_loss_mlp": 1.00985098, + "epoch": 0.556367833782224, + "flos": 735620124672.0, + "grad_norm": 0.029092214279724596, + "language_loss": 0.84975475, + "learning_rate": 0.00043321856715349244, + "loss": 0.86026359, + "num_input_tokens_seen": 241706272, + "router_z_loss_mlp": 0.41040039, + "step": 2892, + "time_per_iteration": 2.9798240661621094 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046881, + "balance_loss_mlp": 1.00575387, + "epoch": 0.5565602154674875, + "flos": 673641886464.0, + "grad_norm": 0.03553967461394851, + "language_loss": 0.81101406, + "learning_rate": 0.00043290982913926466, + "loss": 0.8214829, + "num_input_tokens_seen": 241782304, + "router_z_loss_mlp": 0.41137695, + "step": 2893, + "time_per_iteration": 2.8139491081237793 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045687, + "balance_loss_mlp": 1.00463176, + "epoch": 0.556752597152751, + "flos": 587504778240.0, + "grad_norm": 0.036653967015968944, + "language_loss": 0.84921324, + "learning_rate": 0.0004326011171717514, + "loss": 0.85967016, + "num_input_tokens_seen": 241868576, + "router_z_loss_mlp": 0.41064453, + "step": 2894, + "time_per_iteration": 2.9087953567504883 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046707, + "balance_loss_mlp": 1.00555551, + "epoch": 0.5569449788380146, + "flos": 438691491072.0, + "grad_norm": 0.03515530628910635, + "language_loss": 0.81422639, + "learning_rate": 0.0004322924313708051, + "loss": 0.82469344, + "num_input_tokens_seen": 241933696, + "router_z_loss_mlp": 0.41162109, + "step": 2895, + "time_per_iteration": 2.529937505722046 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051552, + "balance_loss_mlp": 1.01054382, + "epoch": 0.5571373605232782, + "flos": 503248632576.0, + "grad_norm": 0.03724847922393753, + "language_loss": 0.84896851, + "learning_rate": 0.0004319837718562681, + "loss": 0.85948396, + "num_input_tokens_seen": 242003056, + "router_z_loss_mlp": 0.41015625, + "step": 2896, + "time_per_iteration": 2.6142115592956543 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047151, + "balance_loss_mlp": 1.00599957, + "epoch": 0.5573297422085417, + "flos": 578590894080.0, + "grad_norm": 0.04905398235042313, + "language_loss": 0.83417499, + "learning_rate": 0.0004316751387479726, + "loss": 0.84464645, + "num_input_tokens_seen": 242076368, + "router_z_loss_mlp": 0.41162109, + "step": 2897, + "time_per_iteration": 2.7738893032073975 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046687, + "balance_loss_mlp": 1.00555933, + "epoch": 0.5575221238938053, + "flos": 1346049251328.0, + "grad_norm": 0.03588075887117774, + "language_loss": 0.82779884, + "learning_rate": 0.0004313665321657409, + "loss": 0.83826572, + "num_input_tokens_seen": 242161600, + "router_z_loss_mlp": 0.41137695, + "step": 2898, + "time_per_iteration": 3.725510835647583 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047713, + "balance_loss_mlp": 1.00672877, + "epoch": 0.5577145055790689, + "flos": 603099707136.0, + "grad_norm": 0.03720848090960627, + "language_loss": 0.80283779, + "learning_rate": 0.00043105795222938436, + "loss": 0.81331486, + "num_input_tokens_seen": 242237904, + "router_z_loss_mlp": 0.40991211, + "step": 2899, + "time_per_iteration": 2.7282700538635254 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049139, + "balance_loss_mlp": 1.00829744, + "epoch": 0.5579068872643325, + "flos": 563691972096.0, + "grad_norm": 0.03568825250494595, + "language_loss": 0.79214776, + "learning_rate": 0.00043074939905870467, + "loss": 0.80263913, + "num_input_tokens_seen": 242306736, + "router_z_loss_mlp": 0.40844727, + "step": 2900, + "time_per_iteration": 2.696354389190674 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104484, + "balance_loss_mlp": 1.00399923, + "epoch": 0.558099268949596, + "flos": 545589104640.0, + "grad_norm": 0.04035642488371941, + "language_loss": 0.81151342, + "learning_rate": 0.0004304408727734927, + "loss": 0.82196188, + "num_input_tokens_seen": 242376000, + "router_z_loss_mlp": 0.40844727, + "step": 2901, + "time_per_iteration": 2.6394877433776855 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044507, + "balance_loss_mlp": 1.00366592, + "epoch": 0.5582916506348595, + "flos": 553853647104.0, + "grad_norm": 0.036813902208390564, + "language_loss": 0.89428526, + "learning_rate": 0.0004301323734935288, + "loss": 0.90473032, + "num_input_tokens_seen": 242447056, + "router_z_loss_mlp": 0.40844727, + "step": 2902, + "time_per_iteration": 2.659945249557495 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047193, + "balance_loss_mlp": 1.00635207, + "epoch": 0.5584840323201231, + "flos": 544425536256.0, + "grad_norm": 0.03290970227186249, + "language_loss": 0.87933898, + "learning_rate": 0.000429823901338583, + "loss": 0.88981086, + "num_input_tokens_seen": 242514400, + "router_z_loss_mlp": 0.40844727, + "step": 2903, + "time_per_iteration": 2.643388032913208 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045236, + "balance_loss_mlp": 1.00432324, + "epoch": 0.5586764140053867, + "flos": 817023246336.0, + "grad_norm": 0.03162840926526219, + "language_loss": 0.87249023, + "learning_rate": 0.00042951545642841513, + "loss": 0.88294262, + "num_input_tokens_seen": 242601616, + "router_z_loss_mlp": 0.40917969, + "step": 2904, + "time_per_iteration": 3.0901763439178467 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047509, + "balance_loss_mlp": 1.00642967, + "epoch": 0.5588687956906503, + "flos": 487416521472.0, + "grad_norm": 0.02951660315659268, + "language_loss": 0.87151515, + "learning_rate": 0.0004292070388827737, + "loss": 0.88199031, + "num_input_tokens_seen": 242669648, + "router_z_loss_mlp": 0.41088867, + "step": 2905, + "time_per_iteration": 2.6241614818573 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050702, + "balance_loss_mlp": 1.00967062, + "epoch": 0.5590611773759138, + "flos": 453069383424.0, + "grad_norm": 0.03428125950398782, + "language_loss": 0.81863332, + "learning_rate": 0.00042889864882139753, + "loss": 0.82914031, + "num_input_tokens_seen": 242737456, + "router_z_loss_mlp": 0.41040039, + "step": 2906, + "time_per_iteration": 2.6295247077941895 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051332, + "balance_loss_mlp": 1.01025224, + "epoch": 0.5592535590611774, + "flos": 521957100288.0, + "grad_norm": 0.03203389874594117, + "language_loss": 0.82458705, + "learning_rate": 0.0004285902863640139, + "loss": 0.83510035, + "num_input_tokens_seen": 242807008, + "router_z_loss_mlp": 0.41088867, + "step": 2907, + "time_per_iteration": 2.6310994625091553 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044647, + "balance_loss_mlp": 1.00366294, + "epoch": 0.5594459407464409, + "flos": 553601880576.0, + "grad_norm": 0.029509403523767207, + "language_loss": 0.86282808, + "learning_rate": 0.00042828195163033966, + "loss": 0.87327456, + "num_input_tokens_seen": 242877328, + "router_z_loss_mlp": 0.40991211, + "step": 2908, + "time_per_iteration": 2.720059871673584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104389, + "balance_loss_mlp": 1.00285828, + "epoch": 0.5596383224317045, + "flos": 485788303872.0, + "grad_norm": 0.032784621074408576, + "language_loss": 0.796462, + "learning_rate": 0.0004279736447400812, + "loss": 0.80690086, + "num_input_tokens_seen": 242943152, + "router_z_loss_mlp": 0.41040039, + "step": 2909, + "time_per_iteration": 2.562958240509033 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044175, + "balance_loss_mlp": 1.00323904, + "epoch": 0.5598307041169681, + "flos": 612380064000.0, + "grad_norm": 0.03125271468065307, + "language_loss": 0.78822809, + "learning_rate": 0.00042766536581293385, + "loss": 0.79866982, + "num_input_tokens_seen": 243014656, + "router_z_loss_mlp": 0.40942383, + "step": 2910, + "time_per_iteration": 2.742727041244507 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104403, + "balance_loss_mlp": 1.00297463, + "epoch": 0.5600230858022316, + "flos": 489917657088.0, + "grad_norm": 0.033084161668713065, + "language_loss": 0.80192208, + "learning_rate": 0.0004273571149685819, + "loss": 0.81236243, + "num_input_tokens_seen": 243089040, + "router_z_loss_mlp": 0.41064453, + "step": 2911, + "time_per_iteration": 2.7333109378814697 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01041877, + "balance_loss_mlp": 1.00091636, + "epoch": 0.5602154674874952, + "flos": 599982277632.0, + "grad_norm": 0.033670817346998394, + "language_loss": 0.84396589, + "learning_rate": 0.00042704889232669937, + "loss": 0.8543846, + "num_input_tokens_seen": 243162480, + "router_z_loss_mlp": 0.40966797, + "step": 2912, + "time_per_iteration": 2.7085225582122803 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044153, + "balance_loss_mlp": 1.00336003, + "epoch": 0.5604078491727588, + "flos": 587063461632.0, + "grad_norm": 0.043754524068974454, + "language_loss": 0.8611334, + "learning_rate": 0.0004267406980069484, + "loss": 0.87157494, + "num_input_tokens_seen": 243232880, + "router_z_loss_mlp": 0.40795898, + "step": 2913, + "time_per_iteration": 2.747812271118164 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043762, + "balance_loss_mlp": 1.00275385, + "epoch": 0.5606002308580224, + "flos": 542328778752.0, + "grad_norm": 0.02876490223829942, + "language_loss": 0.7993964, + "learning_rate": 0.0004264325321289808, + "loss": 0.80983406, + "num_input_tokens_seen": 243309168, + "router_z_loss_mlp": 0.41015625, + "step": 2914, + "time_per_iteration": 2.8028316497802734 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043869, + "balance_loss_mlp": 1.0028609, + "epoch": 0.5607926125432858, + "flos": 585079464960.0, + "grad_norm": 0.03419971609404561, + "language_loss": 0.86714381, + "learning_rate": 0.00042612439481243736, + "loss": 0.87758255, + "num_input_tokens_seen": 243382064, + "router_z_loss_mlp": 0.41015625, + "step": 2915, + "time_per_iteration": 2.7691102027893066 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045259, + "balance_loss_mlp": 1.00417948, + "epoch": 0.5609849942285494, + "flos": 628631137536.0, + "grad_norm": 0.0372312942186238, + "language_loss": 0.90099525, + "learning_rate": 0.00042581628617694735, + "loss": 0.91144788, + "num_input_tokens_seen": 243452064, + "router_z_loss_mlp": 0.41088867, + "step": 2916, + "time_per_iteration": 2.7420172691345215 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043547, + "balance_loss_mlp": 1.00261009, + "epoch": 0.561177375913813, + "flos": 589455727104.0, + "grad_norm": 0.03338895186153077, + "language_loss": 0.82208467, + "learning_rate": 0.0004255082063421296, + "loss": 0.83252013, + "num_input_tokens_seen": 243525600, + "router_z_loss_mlp": 0.40942383, + "step": 2917, + "time_per_iteration": 2.673243999481201 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042752, + "balance_loss_mlp": 1.0016005, + "epoch": 0.5613697575990766, + "flos": 528144327168.0, + "grad_norm": 0.03066260992789867, + "language_loss": 0.85543269, + "learning_rate": 0.00042520015542759065, + "loss": 0.86586022, + "num_input_tokens_seen": 243605536, + "router_z_loss_mlp": 0.41162109, + "step": 2918, + "time_per_iteration": 2.879850387573242 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104302, + "balance_loss_mlp": 1.00201178, + "epoch": 0.5615621392843402, + "flos": 643875144960.0, + "grad_norm": 0.028477148441929827, + "language_loss": 0.88382292, + "learning_rate": 0.00042489213355292687, + "loss": 0.89425319, + "num_input_tokens_seen": 243684208, + "router_z_loss_mlp": 0.41015625, + "step": 2919, + "time_per_iteration": 2.9279518127441406 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044234, + "balance_loss_mlp": 1.00315475, + "epoch": 0.5617545209696037, + "flos": 428657779968.0, + "grad_norm": 0.03756668389237789, + "language_loss": 0.81703657, + "learning_rate": 0.00042458414083772276, + "loss": 0.82747889, + "num_input_tokens_seen": 243749376, + "router_z_loss_mlp": 0.41088867, + "step": 2920, + "time_per_iteration": 2.5474023818969727 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044671, + "balance_loss_mlp": 1.00371051, + "epoch": 0.5619469026548672, + "flos": 569590493952.0, + "grad_norm": 0.029467937694277743, + "language_loss": 0.85509026, + "learning_rate": 0.000424276177401552, + "loss": 0.86553693, + "num_input_tokens_seen": 243828096, + "router_z_loss_mlp": 0.40966797, + "step": 2921, + "time_per_iteration": 2.797123670578003 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043566, + "balance_loss_mlp": 1.00260556, + "epoch": 0.5621392843401308, + "flos": 506244552960.0, + "grad_norm": 0.03575401527758356, + "language_loss": 0.86372185, + "learning_rate": 0.0004239682433639763, + "loss": 0.87415743, + "num_input_tokens_seen": 243896752, + "router_z_loss_mlp": 0.40966797, + "step": 2922, + "time_per_iteration": 2.6631922721862793 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043748, + "balance_loss_mlp": 1.00281191, + "epoch": 0.5623316660253944, + "flos": 518010494208.0, + "grad_norm": 0.03518251960287723, + "language_loss": 0.86062789, + "learning_rate": 0.0004236603388445467, + "loss": 0.87106532, + "num_input_tokens_seen": 243964592, + "router_z_loss_mlp": 0.40942383, + "step": 2923, + "time_per_iteration": 2.60380482673645 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044966, + "balance_loss_mlp": 1.00410116, + "epoch": 0.5625240477106579, + "flos": 607139632128.0, + "grad_norm": 0.03089029411800112, + "language_loss": 0.82301855, + "learning_rate": 0.00042335246396280166, + "loss": 0.8334682, + "num_input_tokens_seen": 244036656, + "router_z_loss_mlp": 0.40869141, + "step": 2924, + "time_per_iteration": 2.7605555057525635 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045584, + "balance_loss_mlp": 1.00462389, + "epoch": 0.5627164293959215, + "flos": 451341043968.0, + "grad_norm": 0.04701230911743114, + "language_loss": 0.91272092, + "learning_rate": 0.0004230446188382693, + "loss": 0.92317677, + "num_input_tokens_seen": 244102704, + "router_z_loss_mlp": 0.40966797, + "step": 2925, + "time_per_iteration": 2.5571765899658203 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042518, + "balance_loss_mlp": 1.00158191, + "epoch": 0.5629088110811851, + "flos": 743437514496.0, + "grad_norm": 0.0349005963329915, + "language_loss": 0.81125653, + "learning_rate": 0.0004227368035904654, + "loss": 0.82168174, + "num_input_tokens_seen": 244186640, + "router_z_loss_mlp": 0.40942383, + "step": 2926, + "time_per_iteration": 3.0334270000457764 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043071, + "balance_loss_mlp": 1.00211096, + "epoch": 0.5631011927664487, + "flos": 497980010496.0, + "grad_norm": 0.0467260030557379, + "language_loss": 0.83361161, + "learning_rate": 0.00042242901833889474, + "loss": 0.84404236, + "num_input_tokens_seen": 244257680, + "router_z_loss_mlp": 0.40966797, + "step": 2927, + "time_per_iteration": 2.6271822452545166 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042546, + "balance_loss_mlp": 1.00153816, + "epoch": 0.5632935744517122, + "flos": 887595561216.0, + "grad_norm": 0.03653524957968277, + "language_loss": 0.8629514, + "learning_rate": 0.0004221212632030501, + "loss": 0.87337685, + "num_input_tokens_seen": 244331248, + "router_z_loss_mlp": 0.41015625, + "step": 2928, + "time_per_iteration": 3.1174416542053223 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046315, + "balance_loss_mlp": 1.00542605, + "epoch": 0.5634859561369757, + "flos": 605902186752.0, + "grad_norm": 0.04110669316721802, + "language_loss": 0.80746865, + "learning_rate": 0.0004218135383024124, + "loss": 0.81793177, + "num_input_tokens_seen": 244403920, + "router_z_loss_mlp": 0.40893555, + "step": 2929, + "time_per_iteration": 2.705615758895874 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01041411, + "balance_loss_mlp": 1.00056946, + "epoch": 0.5636783378222393, + "flos": 454903680768.0, + "grad_norm": 0.0339470495466753, + "language_loss": 0.85614669, + "learning_rate": 0.0004215058437564511, + "loss": 0.86656082, + "num_input_tokens_seen": 244470464, + "router_z_loss_mlp": 0.40844727, + "step": 2930, + "time_per_iteration": 2.5682146549224854 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01040814, + "balance_loss_mlp": 1.00006831, + "epoch": 0.5638707195075029, + "flos": 519462767616.0, + "grad_norm": 0.03372410984042782, + "language_loss": 0.82691574, + "learning_rate": 0.00042119817968462397, + "loss": 0.83732378, + "num_input_tokens_seen": 244536864, + "router_z_loss_mlp": 0.4074707, + "step": 2931, + "time_per_iteration": 2.6308341026306152 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105222, + "balance_loss_mlp": 1.01135468, + "epoch": 0.5640631011927665, + "flos": 565845110016.0, + "grad_norm": 0.03794773284405352, + "language_loss": 0.87544155, + "learning_rate": 0.0004208905462063766, + "loss": 0.88596374, + "num_input_tokens_seen": 244603344, + "router_z_loss_mlp": 0.40869141, + "step": 2932, + "time_per_iteration": 2.6615707874298096 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049556, + "balance_loss_mlp": 1.00866711, + "epoch": 0.56425548287803, + "flos": 518038684416.0, + "grad_norm": 0.03232798556838129, + "language_loss": 0.84722394, + "learning_rate": 0.00042058294344114315, + "loss": 0.85771948, + "num_input_tokens_seen": 244671984, + "router_z_loss_mlp": 0.40893555, + "step": 2933, + "time_per_iteration": 2.6182868480682373 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049116, + "balance_loss_mlp": 1.0083226, + "epoch": 0.5644478645632935, + "flos": 855670824192.0, + "grad_norm": 0.03170317888214056, + "language_loss": 0.78432804, + "learning_rate": 0.0004202753715083456, + "loss": 0.79481918, + "num_input_tokens_seen": 244754000, + "router_z_loss_mlp": 0.40795898, + "step": 2934, + "time_per_iteration": 3.0613481998443604 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045, + "balance_loss_mlp": 1.00420666, + "epoch": 0.5646402462485571, + "flos": 554496185856.0, + "grad_norm": 0.03929055225526713, + "language_loss": 0.81611717, + "learning_rate": 0.0004199678305273936, + "loss": 0.82656717, + "num_input_tokens_seen": 244820896, + "router_z_loss_mlp": 0.40795898, + "step": 2935, + "time_per_iteration": 2.634765386581421 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046418, + "balance_loss_mlp": 1.00552905, + "epoch": 0.5648326279338207, + "flos": 687312111360.0, + "grad_norm": 0.02956036273454178, + "language_loss": 0.8172124, + "learning_rate": 0.0004196603206176854, + "loss": 0.82767659, + "num_input_tokens_seen": 244904464, + "router_z_loss_mlp": 0.40893555, + "step": 2936, + "time_per_iteration": 2.9358084201812744 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048743, + "balance_loss_mlp": 1.00783014, + "epoch": 0.5650250096190843, + "flos": 804683785728.0, + "grad_norm": 0.03257366451462874, + "language_loss": 0.84142041, + "learning_rate": 0.000419352841898607, + "loss": 0.85190785, + "num_input_tokens_seen": 244983760, + "router_z_loss_mlp": 0.40917969, + "step": 2937, + "time_per_iteration": 2.9652152061462402 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049809, + "balance_loss_mlp": 1.00891984, + "epoch": 0.5652173913043478, + "flos": 583145045760.0, + "grad_norm": 0.037245032295536384, + "language_loss": 0.7792089, + "learning_rate": 0.000419045394489532, + "loss": 0.78970701, + "num_input_tokens_seen": 245053184, + "router_z_loss_mlp": 0.40893555, + "step": 2938, + "time_per_iteration": 2.6814448833465576 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048464, + "balance_loss_mlp": 1.00752795, + "epoch": 0.5654097729896114, + "flos": 822168413952.0, + "grad_norm": 0.03166469527574581, + "language_loss": 0.76863134, + "learning_rate": 0.0004187379785098224, + "loss": 0.77911597, + "num_input_tokens_seen": 245137408, + "router_z_loss_mlp": 0.40942383, + "step": 2939, + "time_per_iteration": 3.1437690258026123 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049709, + "balance_loss_mlp": 1.00881994, + "epoch": 0.565602154674875, + "flos": 785482478592.0, + "grad_norm": 0.035451368889273006, + "language_loss": 0.84531581, + "learning_rate": 0.00041843059407882744, + "loss": 0.85581291, + "num_input_tokens_seen": 245215504, + "router_z_loss_mlp": 0.40893555, + "step": 2940, + "time_per_iteration": 2.9561386108398438 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046361, + "balance_loss_mlp": 1.00554383, + "epoch": 0.5657945363601385, + "flos": 550744965888.0, + "grad_norm": 0.033205673863039784, + "language_loss": 0.83385015, + "learning_rate": 0.0004181232413158842, + "loss": 0.84431374, + "num_input_tokens_seen": 245286032, + "router_z_loss_mlp": 0.40820312, + "step": 2941, + "time_per_iteration": 2.6476027965545654 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047072, + "balance_loss_mlp": 1.0061357, + "epoch": 0.5659869180454021, + "flos": 669332698368.0, + "grad_norm": 0.03636978251075169, + "language_loss": 0.83073509, + "learning_rate": 0.0004178159203403179, + "loss": 0.84120584, + "num_input_tokens_seen": 245359040, + "router_z_loss_mlp": 0.40942383, + "step": 2942, + "time_per_iteration": 2.835840940475464 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049418, + "balance_loss_mlp": 1.00862479, + "epoch": 0.5661792997306656, + "flos": 500949686016.0, + "grad_norm": 0.030415094414242012, + "language_loss": 0.8213833, + "learning_rate": 0.0004175086312714409, + "loss": 0.83187747, + "num_input_tokens_seen": 245426384, + "router_z_loss_mlp": 0.40795898, + "step": 2943, + "time_per_iteration": 2.6258370876312256 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104981, + "balance_loss_mlp": 1.00911188, + "epoch": 0.5663716814159292, + "flos": 602363849472.0, + "grad_norm": 0.030374801338140925, + "language_loss": 0.84196591, + "learning_rate": 0.00041720137422855366, + "loss": 0.85246402, + "num_input_tokens_seen": 245501216, + "router_z_loss_mlp": 0.40698242, + "step": 2944, + "time_per_iteration": 2.753483772277832 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050206, + "balance_loss_mlp": 1.00948393, + "epoch": 0.5665640631011928, + "flos": 542033270784.0, + "grad_norm": 0.0327328941542846, + "language_loss": 0.79511452, + "learning_rate": 0.00041689414933094383, + "loss": 0.80561656, + "num_input_tokens_seen": 245571600, + "router_z_loss_mlp": 0.40722656, + "step": 2945, + "time_per_iteration": 2.6251614093780518 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047738, + "balance_loss_mlp": 1.00701642, + "epoch": 0.5667564447864564, + "flos": 603062768640.0, + "grad_norm": 0.03650681858880775, + "language_loss": 0.81631696, + "learning_rate": 0.00041658695669788653, + "loss": 0.82679439, + "num_input_tokens_seen": 245645632, + "router_z_loss_mlp": 0.40722656, + "step": 2946, + "time_per_iteration": 2.7196879386901855 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045027, + "balance_loss_mlp": 1.00432932, + "epoch": 0.5669488264717198, + "flos": 660723070464.0, + "grad_norm": 0.039783949444703086, + "language_loss": 0.82089484, + "learning_rate": 0.00041627979644864453, + "loss": 0.83134508, + "num_input_tokens_seen": 245715776, + "router_z_loss_mlp": 0.40698242, + "step": 2947, + "time_per_iteration": 2.8414080142974854 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043086, + "balance_loss_mlp": 1.00243521, + "epoch": 0.5671412081569834, + "flos": 486383210496.0, + "grad_norm": 0.029571262892964766, + "language_loss": 0.81883216, + "learning_rate": 0.0004159726687024683, + "loss": 0.82926297, + "num_input_tokens_seen": 245785328, + "router_z_loss_mlp": 0.40649414, + "step": 2948, + "time_per_iteration": 2.6365981101989746 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043624, + "balance_loss_mlp": 1.0029496, + "epoch": 0.567333589842247, + "flos": 731061115392.0, + "grad_norm": 0.03568675680792695, + "language_loss": 0.79577011, + "learning_rate": 0.00041566557357859506, + "loss": 0.80620635, + "num_input_tokens_seen": 245858000, + "router_z_loss_mlp": 0.40673828, + "step": 2949, + "time_per_iteration": 2.8660199642181396 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046952, + "balance_loss_mlp": 1.00618231, + "epoch": 0.5675259715275106, + "flos": 970559826432.0, + "grad_norm": 0.03148848509964497, + "language_loss": 0.79963183, + "learning_rate": 0.0004153585111962502, + "loss": 0.81010127, + "num_input_tokens_seen": 245950640, + "router_z_loss_mlp": 0.40771484, + "step": 2950, + "time_per_iteration": 3.284973382949829 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049042, + "balance_loss_mlp": 1.00824845, + "epoch": 0.5677183532127742, + "flos": 566214494976.0, + "grad_norm": 0.035222224981726044, + "language_loss": 0.84893769, + "learning_rate": 0.0004150514816746453, + "loss": 0.85942811, + "num_input_tokens_seen": 246019568, + "router_z_loss_mlp": 0.40795898, + "step": 2951, + "time_per_iteration": 2.688965082168579 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053398, + "balance_loss_mlp": 1.0126282, + "epoch": 0.5679107348980377, + "flos": 552746459136.0, + "grad_norm": 0.03211470229094595, + "language_loss": 0.86231828, + "learning_rate": 0.0004147444851329802, + "loss": 0.87285221, + "num_input_tokens_seen": 246089520, + "router_z_loss_mlp": 0.40771484, + "step": 2952, + "time_per_iteration": 2.654975175857544 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050037, + "balance_loss_mlp": 1.00929093, + "epoch": 0.5681031165833013, + "flos": 820841540352.0, + "grad_norm": 0.031520082579240216, + "language_loss": 0.86395264, + "learning_rate": 0.00041443752169044126, + "loss": 0.87445295, + "num_input_tokens_seen": 246165920, + "router_z_loss_mlp": 0.4074707, + "step": 2953, + "time_per_iteration": 2.9978690147399902 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044738, + "balance_loss_mlp": 1.00384951, + "epoch": 0.5682954982685648, + "flos": 619146646272.0, + "grad_norm": 0.031195671435834585, + "language_loss": 0.85214126, + "learning_rate": 0.0004141305914662025, + "loss": 0.86258864, + "num_input_tokens_seen": 246238672, + "router_z_loss_mlp": 0.40893555, + "step": 2954, + "time_per_iteration": 2.7177786827087402 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052939, + "balance_loss_mlp": 1.01214588, + "epoch": 0.5684878799538284, + "flos": 649252637184.0, + "grad_norm": 0.03230481359903608, + "language_loss": 0.81020069, + "learning_rate": 0.0004138236945794246, + "loss": 0.82073009, + "num_input_tokens_seen": 246320208, + "router_z_loss_mlp": 0.40795898, + "step": 2955, + "time_per_iteration": 2.8862104415893555 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051516, + "balance_loss_mlp": 1.01065099, + "epoch": 0.5686802616390919, + "flos": 807354062592.0, + "grad_norm": 0.038353041221636526, + "language_loss": 0.84374332, + "learning_rate": 0.00041351683114925576, + "loss": 0.85425854, + "num_input_tokens_seen": 246406464, + "router_z_loss_mlp": 0.40869141, + "step": 2956, + "time_per_iteration": 3.0500295162200928 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052157, + "balance_loss_mlp": 1.01126814, + "epoch": 0.5688726433243555, + "flos": 548176756224.0, + "grad_norm": 0.03189027766628176, + "language_loss": 0.87115657, + "learning_rate": 0.0004132100012948308, + "loss": 0.8816781, + "num_input_tokens_seen": 246477456, + "router_z_loss_mlp": 0.40893555, + "step": 2957, + "time_per_iteration": 2.6317861080169678 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104687, + "balance_loss_mlp": 1.00593376, + "epoch": 0.5690650250096191, + "flos": 487546778880.0, + "grad_norm": 0.03605588885155363, + "language_loss": 0.84833193, + "learning_rate": 0.00041290320513527145, + "loss": 0.85880065, + "num_input_tokens_seen": 246541744, + "router_z_loss_mlp": 0.40942383, + "step": 2958, + "time_per_iteration": 2.567070960998535 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010482, + "balance_loss_mlp": 1.00733471, + "epoch": 0.5692574066948827, + "flos": 578555900928.0, + "grad_norm": 0.030752617047449367, + "language_loss": 0.85344827, + "learning_rate": 0.0004125964427896867, + "loss": 0.86393028, + "num_input_tokens_seen": 246611440, + "router_z_loss_mlp": 0.40869141, + "step": 2959, + "time_per_iteration": 2.672534704208374 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047545, + "balance_loss_mlp": 1.00663245, + "epoch": 0.5694497883801463, + "flos": 455220576000.0, + "grad_norm": 0.04229544295686443, + "language_loss": 0.79680836, + "learning_rate": 0.0004122897143771723, + "loss": 0.80728376, + "num_input_tokens_seen": 246676496, + "router_z_loss_mlp": 0.40917969, + "step": 2960, + "time_per_iteration": 2.545262575149536 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046303, + "balance_loss_mlp": 1.00534308, + "epoch": 0.5696421700654097, + "flos": 560583290880.0, + "grad_norm": 0.03127363894209499, + "language_loss": 0.82077289, + "learning_rate": 0.0004119830200168109, + "loss": 0.83123589, + "num_input_tokens_seen": 246746464, + "router_z_loss_mlp": 0.40966797, + "step": 2961, + "time_per_iteration": 2.663581609725952 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045878, + "balance_loss_mlp": 1.00510836, + "epoch": 0.5698345517506733, + "flos": 466502426112.0, + "grad_norm": 0.0350478630821908, + "language_loss": 0.89062726, + "learning_rate": 0.0004116763598276714, + "loss": 0.90108603, + "num_input_tokens_seen": 246811808, + "router_z_loss_mlp": 0.40771484, + "step": 2962, + "time_per_iteration": 2.521552801132202 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047207, + "balance_loss_mlp": 1.00641382, + "epoch": 0.5700269334359369, + "flos": 607192121856.0, + "grad_norm": 0.031424704719117534, + "language_loss": 0.81706619, + "learning_rate": 0.00041136973392881017, + "loss": 0.82753831, + "num_input_tokens_seen": 246890432, + "router_z_loss_mlp": 0.40795898, + "step": 2963, + "time_per_iteration": 2.91904878616333 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043706, + "balance_loss_mlp": 1.00296056, + "epoch": 0.5702193151212005, + "flos": 563857222656.0, + "grad_norm": 0.03326860309508315, + "language_loss": 0.82831907, + "learning_rate": 0.00041106314243926983, + "loss": 0.83875614, + "num_input_tokens_seen": 246959616, + "router_z_loss_mlp": 0.4074707, + "step": 2964, + "time_per_iteration": 2.7399420738220215 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044055, + "balance_loss_mlp": 1.00340486, + "epoch": 0.570411696806464, + "flos": 524310481920.0, + "grad_norm": 0.03332690132244082, + "language_loss": 0.8800739, + "learning_rate": 0.0004107565854780798, + "loss": 0.89051443, + "num_input_tokens_seen": 247030656, + "router_z_loss_mlp": 0.40649414, + "step": 2965, + "time_per_iteration": 2.6200034618377686 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046373, + "balance_loss_mlp": 1.00565064, + "epoch": 0.5706040784917276, + "flos": 719473063680.0, + "grad_norm": 0.03436086388372073, + "language_loss": 0.81524932, + "learning_rate": 0.000410450063164256, + "loss": 0.82571304, + "num_input_tokens_seen": 247105872, + "router_z_loss_mlp": 0.40722656, + "step": 2966, + "time_per_iteration": 2.8336212635040283 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048641, + "balance_loss_mlp": 1.00787103, + "epoch": 0.5707964601769911, + "flos": 477671515392.0, + "grad_norm": 0.03782244517116874, + "language_loss": 0.82540762, + "learning_rate": 0.00041014357561680115, + "loss": 0.83589399, + "num_input_tokens_seen": 247170448, + "router_z_loss_mlp": 0.40771484, + "step": 2967, + "time_per_iteration": 2.5143654346466064 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047818, + "balance_loss_mlp": 1.00714386, + "epoch": 0.5709888418622547, + "flos": 581217429504.0, + "grad_norm": 0.030421169355448613, + "language_loss": 0.86193347, + "learning_rate": 0.0004098371229547039, + "loss": 0.87241161, + "num_input_tokens_seen": 247240400, + "router_z_loss_mlp": 0.40673828, + "step": 2968, + "time_per_iteration": 2.6610617637634277 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01057983, + "balance_loss_mlp": 1.01869202, + "epoch": 0.5711812235475183, + "flos": 1583195536128.0, + "grad_norm": 0.0076189717983582966, + "language_loss": 0.80010808, + "learning_rate": 0.0004095307052969399, + "loss": 0.8106879, + "num_input_tokens_seen": 247469136, + "router_z_loss_mlp": 0.39257812, + "step": 2969, + "time_per_iteration": 4.76263952255249 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048674, + "balance_loss_mlp": 1.00790465, + "epoch": 0.5713736052327818, + "flos": 469498346496.0, + "grad_norm": 0.03484927048715074, + "language_loss": 0.80634308, + "learning_rate": 0.00040922432276247107, + "loss": 0.81682986, + "num_input_tokens_seen": 247537712, + "router_z_loss_mlp": 0.40771484, + "step": 2970, + "time_per_iteration": 2.5514628887176514 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046224, + "balance_loss_mlp": 1.0054065, + "epoch": 0.5715659869180454, + "flos": 538755448320.0, + "grad_norm": 0.029079861926461517, + "language_loss": 0.84918243, + "learning_rate": 0.0004089179754702457, + "loss": 0.85964465, + "num_input_tokens_seen": 247613872, + "router_z_loss_mlp": 0.40820312, + "step": 2971, + "time_per_iteration": 2.749539613723755 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044804, + "balance_loss_mlp": 1.00396252, + "epoch": 0.571758368603309, + "flos": 657251807232.0, + "grad_norm": 0.03418066993480882, + "language_loss": 0.80556142, + "learning_rate": 0.00040861166353919843, + "loss": 0.81600946, + "num_input_tokens_seen": 247686064, + "router_z_loss_mlp": 0.40844727, + "step": 2972, + "time_per_iteration": 2.814680814743042 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052102, + "balance_loss_mlp": 1.011356, + "epoch": 0.5719507502885726, + "flos": 669100373760.0, + "grad_norm": 0.031053974574008693, + "language_loss": 0.82602715, + "learning_rate": 0.00040830538708824983, + "loss": 0.83654815, + "num_input_tokens_seen": 247760384, + "router_z_loss_mlp": 0.4074707, + "step": 2973, + "time_per_iteration": 2.904085636138916 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050783, + "balance_loss_mlp": 1.01018071, + "epoch": 0.572143131973836, + "flos": 477280743168.0, + "grad_norm": 0.03419925971016847, + "language_loss": 0.82092619, + "learning_rate": 0.000407999146236307, + "loss": 0.83143401, + "num_input_tokens_seen": 247824768, + "router_z_loss_mlp": 0.40600586, + "step": 2974, + "time_per_iteration": 2.549262046813965 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051203, + "balance_loss_mlp": 1.01062381, + "epoch": 0.5723355136590996, + "flos": 540535310592.0, + "grad_norm": 0.03597856382327793, + "language_loss": 0.83747095, + "learning_rate": 0.0004076929411022634, + "loss": 0.847983, + "num_input_tokens_seen": 247894448, + "router_z_loss_mlp": 0.40576172, + "step": 2975, + "time_per_iteration": 2.602869987487793 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053058, + "balance_loss_mlp": 1.01235974, + "epoch": 0.5725278953443632, + "flos": 825650370816.0, + "grad_norm": 0.037415312483521146, + "language_loss": 0.8006742, + "learning_rate": 0.0004073867718049982, + "loss": 0.81120479, + "num_input_tokens_seen": 247976432, + "router_z_loss_mlp": 0.40698242, + "step": 2976, + "time_per_iteration": 3.139498472213745 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050411, + "balance_loss_mlp": 1.00966477, + "epoch": 0.5727202770296268, + "flos": 588570170112.0, + "grad_norm": 0.037681082671355684, + "language_loss": 0.83124882, + "learning_rate": 0.00040708063846337704, + "loss": 0.84175301, + "num_input_tokens_seen": 248048800, + "router_z_loss_mlp": 0.4074707, + "step": 2977, + "time_per_iteration": 2.7134242057800293 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050315, + "balance_loss_mlp": 1.00937819, + "epoch": 0.5729126587148904, + "flos": 447941712384.0, + "grad_norm": 0.03249864108633733, + "language_loss": 0.81268066, + "learning_rate": 0.00040677454119625143, + "loss": 0.82318383, + "num_input_tokens_seen": 248116496, + "router_z_loss_mlp": 0.40942383, + "step": 2978, + "time_per_iteration": 2.5775671005249023 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049596, + "balance_loss_mlp": 1.00870752, + "epoch": 0.5731050404001539, + "flos": 520467888384.0, + "grad_norm": 0.034012599703189976, + "language_loss": 0.83670664, + "learning_rate": 0.0004064684801224587, + "loss": 0.84720254, + "num_input_tokens_seen": 248184960, + "router_z_loss_mlp": 0.40893555, + "step": 2979, + "time_per_iteration": 2.6424074172973633 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047576, + "balance_loss_mlp": 1.00675905, + "epoch": 0.5732974220854175, + "flos": 505771155456.0, + "grad_norm": 0.032486782592384814, + "language_loss": 0.80872238, + "learning_rate": 0.00040616245536082224, + "loss": 0.81919813, + "num_input_tokens_seen": 248252208, + "router_z_loss_mlp": 0.40820312, + "step": 2980, + "time_per_iteration": 2.57401704788208 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050822, + "balance_loss_mlp": 1.01000464, + "epoch": 0.573489803770681, + "flos": 593678399232.0, + "grad_norm": 0.028956426653120197, + "language_loss": 0.82143462, + "learning_rate": 0.00040585646703015165, + "loss": 0.8319428, + "num_input_tokens_seen": 248333312, + "router_z_loss_mlp": 0.40820312, + "step": 2981, + "time_per_iteration": 2.828683614730835 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050941, + "balance_loss_mlp": 1.01010036, + "epoch": 0.5736821854559446, + "flos": 490870288128.0, + "grad_norm": 0.04412597729133787, + "language_loss": 0.78605878, + "learning_rate": 0.0004055505152492419, + "loss": 0.79656816, + "num_input_tokens_seen": 248403808, + "router_z_loss_mlp": 0.40844727, + "step": 2982, + "time_per_iteration": 2.640928268432617 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048392, + "balance_loss_mlp": 1.00747919, + "epoch": 0.5738745671412081, + "flos": 459202175232.0, + "grad_norm": 0.034256342510568284, + "language_loss": 0.74769032, + "learning_rate": 0.00040524460013687425, + "loss": 0.7581743, + "num_input_tokens_seen": 248477184, + "router_z_loss_mlp": 0.40917969, + "step": 2983, + "time_per_iteration": 2.7067794799804688 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105312, + "balance_loss_mlp": 1.0123024, + "epoch": 0.5740669488264717, + "flos": 581621807616.0, + "grad_norm": 0.029467935021435916, + "language_loss": 0.81554836, + "learning_rate": 0.0004049387218118155, + "loss": 0.82607955, + "num_input_tokens_seen": 248565552, + "router_z_loss_mlp": 0.40820312, + "step": 2984, + "time_per_iteration": 2.9581944942474365 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045523, + "balance_loss_mlp": 1.00468242, + "epoch": 0.5742593305117353, + "flos": 525574172160.0, + "grad_norm": 0.03631391131249333, + "language_loss": 0.85729742, + "learning_rate": 0.00040463288039281777, + "loss": 0.86775261, + "num_input_tokens_seen": 248635456, + "router_z_loss_mlp": 0.40844727, + "step": 2985, + "time_per_iteration": 2.7224113941192627 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01056683, + "balance_loss_mlp": 1.01729584, + "epoch": 0.5744517121969989, + "flos": 1557269442816.0, + "grad_norm": 0.010841110534864203, + "language_loss": 0.77876419, + "learning_rate": 0.0004043270759986194, + "loss": 0.78933102, + "num_input_tokens_seen": 248870160, + "router_z_loss_mlp": 0.39355469, + "step": 2986, + "time_per_iteration": 5.064981698989868 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051641, + "balance_loss_mlp": 1.01089525, + "epoch": 0.5746440938822625, + "flos": 753203907840.0, + "grad_norm": 0.045288596232844924, + "language_loss": 0.82885808, + "learning_rate": 0.0004040213087479444, + "loss": 0.83937448, + "num_input_tokens_seen": 248946960, + "router_z_loss_mlp": 0.4074707, + "step": 2987, + "time_per_iteration": 2.98020601272583 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043481, + "balance_loss_mlp": 1.00266409, + "epoch": 0.5748364755675259, + "flos": 502857860352.0, + "grad_norm": 0.036149920431262125, + "language_loss": 0.85748988, + "learning_rate": 0.0004037155787595018, + "loss": 0.86792469, + "num_input_tokens_seen": 249014128, + "router_z_loss_mlp": 0.40820312, + "step": 2988, + "time_per_iteration": 2.5745627880096436 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051033, + "balance_loss_mlp": 1.01026356, + "epoch": 0.5750288572527895, + "flos": 505198603008.0, + "grad_norm": 0.03371383384616788, + "language_loss": 0.81460357, + "learning_rate": 0.000403409886151987, + "loss": 0.82511389, + "num_input_tokens_seen": 249090016, + "router_z_loss_mlp": 0.40771484, + "step": 2989, + "time_per_iteration": 2.9434561729431152 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045067, + "balance_loss_mlp": 1.00558472, + "epoch": 0.5752212389380531, + "flos": 1544678215680.0, + "grad_norm": 0.006920775411585041, + "language_loss": 0.81999105, + "learning_rate": 0.0004031042310440799, + "loss": 0.83044171, + "num_input_tokens_seen": 249305552, + "router_z_loss_mlp": 0.39453125, + "step": 2990, + "time_per_iteration": 4.784885406494141 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104316, + "balance_loss_mlp": 1.00367737, + "epoch": 0.5754136206233167, + "flos": 1570674295296.0, + "grad_norm": 0.003743957088283973, + "language_loss": 0.781986, + "learning_rate": 0.00040279861355444656, + "loss": 0.79241765, + "num_input_tokens_seen": 249523408, + "router_z_loss_mlp": 0.39453125, + "step": 2991, + "time_per_iteration": 4.776461362838745 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049264, + "balance_loss_mlp": 1.00842321, + "epoch": 0.5756060023085803, + "flos": 799562917632.0, + "grad_norm": 0.03045005809397815, + "language_loss": 0.77561808, + "learning_rate": 0.00040249303380173807, + "loss": 0.78611076, + "num_input_tokens_seen": 249616624, + "router_z_loss_mlp": 0.40844727, + "step": 2992, + "time_per_iteration": 3.0843074321746826 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104555, + "balance_loss_mlp": 1.00451803, + "epoch": 0.5757983839938438, + "flos": 589034819328.0, + "grad_norm": 0.034529184723129894, + "language_loss": 0.79738832, + "learning_rate": 0.00040218749190459126, + "loss": 0.8078438, + "num_input_tokens_seen": 249689936, + "router_z_loss_mlp": 0.41040039, + "step": 2993, + "time_per_iteration": 2.7403366565704346 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045195, + "balance_loss_mlp": 1.00428283, + "epoch": 0.5759907656791073, + "flos": 517852046592.0, + "grad_norm": 0.035278528612120996, + "language_loss": 0.82955313, + "learning_rate": 0.00040188198798162775, + "loss": 0.84000504, + "num_input_tokens_seen": 249759984, + "router_z_loss_mlp": 0.40917969, + "step": 2994, + "time_per_iteration": 2.6673707962036133 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048075, + "balance_loss_mlp": 1.00718617, + "epoch": 0.5761831473643709, + "flos": 588290213376.0, + "grad_norm": 0.029287821677584636, + "language_loss": 0.85980493, + "learning_rate": 0.000401576522151455, + "loss": 0.87028569, + "num_input_tokens_seen": 249837888, + "router_z_loss_mlp": 0.40893555, + "step": 2995, + "time_per_iteration": 2.788686513900757 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049047, + "balance_loss_mlp": 1.00815868, + "epoch": 0.5763755290496345, + "flos": 545009749248.0, + "grad_norm": 0.03018415670660867, + "language_loss": 0.8281709, + "learning_rate": 0.0004012710945326651, + "loss": 0.83866143, + "num_input_tokens_seen": 249913584, + "router_z_loss_mlp": 0.40893555, + "step": 2996, + "time_per_iteration": 2.7784581184387207 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047721, + "balance_loss_mlp": 1.00685585, + "epoch": 0.576567910734898, + "flos": 627428685312.0, + "grad_norm": 0.030965553916741433, + "language_loss": 0.81781155, + "learning_rate": 0.0004009657052438355, + "loss": 0.82828873, + "num_input_tokens_seen": 249992144, + "router_z_loss_mlp": 0.40869141, + "step": 2997, + "time_per_iteration": 2.787832498550415 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046757, + "balance_loss_mlp": 1.00593948, + "epoch": 0.5767602924201616, + "flos": 539278423296.0, + "grad_norm": 0.0362963808148575, + "language_loss": 0.86264056, + "learning_rate": 0.00040066035440352904, + "loss": 0.87310815, + "num_input_tokens_seen": 250060736, + "router_z_loss_mlp": 0.40820312, + "step": 2998, + "time_per_iteration": 2.6896724700927734 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045353, + "balance_loss_mlp": 1.00558472, + "epoch": 0.5769526741054252, + "flos": 1563026046720.0, + "grad_norm": 0.005169215201186531, + "language_loss": 0.79293132, + "learning_rate": 0.0004003550421302934, + "loss": 0.8033849, + "num_input_tokens_seen": 250296864, + "router_z_loss_mlp": 0.39746094, + "step": 2999, + "time_per_iteration": 4.891216039657593 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043976, + "balance_loss_mlp": 1.00318265, + "epoch": 0.5771450557906888, + "flos": 469172702976.0, + "grad_norm": 0.037596514401195116, + "language_loss": 0.7668246, + "learning_rate": 0.00040004976854266145, + "loss": 0.77726436, + "num_input_tokens_seen": 250362528, + "router_z_loss_mlp": 0.40795898, + "step": 3000, + "time_per_iteration": 2.51895809173584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045702, + "balance_loss_mlp": 1.00478971, + "epoch": 0.5773374374759523, + "flos": 575633857536.0, + "grad_norm": 0.03248080927364981, + "language_loss": 0.81750363, + "learning_rate": 0.0003997445337591505, + "loss": 0.82796073, + "num_input_tokens_seen": 250432768, + "router_z_loss_mlp": 0.40917969, + "step": 3001, + "time_per_iteration": 2.692239999771118 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048668, + "balance_loss_mlp": 1.0079695, + "epoch": 0.5775298191612158, + "flos": 529505227008.0, + "grad_norm": 0.031913043384180086, + "language_loss": 0.74606609, + "learning_rate": 0.0003994393378982635, + "loss": 0.75655282, + "num_input_tokens_seen": 250501504, + "router_z_loss_mlp": 0.40698242, + "step": 3002, + "time_per_iteration": 2.665146589279175 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053272, + "balance_loss_mlp": 1.01369476, + "epoch": 0.5777222008464794, + "flos": 1306899095808.0, + "grad_norm": 0.010106387724362367, + "language_loss": 0.79538, + "learning_rate": 0.00039913418107848786, + "loss": 0.80591273, + "num_input_tokens_seen": 250733632, + "router_z_loss_mlp": 0.39550781, + "step": 3003, + "time_per_iteration": 4.803764581680298 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104489, + "balance_loss_mlp": 1.00409698, + "epoch": 0.577914582531743, + "flos": 604793053440.0, + "grad_norm": 0.0386937293491606, + "language_loss": 0.88557941, + "learning_rate": 0.0003988290634182961, + "loss": 0.89602828, + "num_input_tokens_seen": 250809152, + "router_z_loss_mlp": 0.40795898, + "step": 3004, + "time_per_iteration": 2.7506465911865234 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050043, + "balance_loss_mlp": 1.00943995, + "epoch": 0.5781069642170066, + "flos": 487833538560.0, + "grad_norm": 0.034765884683499934, + "language_loss": 0.81038988, + "learning_rate": 0.0003985239850361453, + "loss": 0.82089031, + "num_input_tokens_seen": 250879152, + "router_z_loss_mlp": 0.40600586, + "step": 3005, + "time_per_iteration": 2.5988621711730957 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047258, + "balance_loss_mlp": 1.00653589, + "epoch": 0.5782993459022701, + "flos": 507414924288.0, + "grad_norm": 0.036479253397917216, + "language_loss": 0.85073388, + "learning_rate": 0.0003982189460504777, + "loss": 0.86120641, + "num_input_tokens_seen": 250949904, + "router_z_loss_mlp": 0.40722656, + "step": 3006, + "time_per_iteration": 2.694517135620117 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104981, + "balance_loss_mlp": 1.00913548, + "epoch": 0.5784917275875336, + "flos": 603295093248.0, + "grad_norm": 0.03899121610040523, + "language_loss": 0.79739761, + "learning_rate": 0.00039791394657971935, + "loss": 0.80789566, + "num_input_tokens_seen": 251020976, + "router_z_loss_mlp": 0.40673828, + "step": 3007, + "time_per_iteration": 2.694913387298584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044533, + "balance_loss_mlp": 1.00376368, + "epoch": 0.5786841092727972, + "flos": 522588945408.0, + "grad_norm": 0.03653808704233678, + "language_loss": 0.84952617, + "learning_rate": 0.00039760898674228205, + "loss": 0.85997152, + "num_input_tokens_seen": 251093280, + "router_z_loss_mlp": 0.40771484, + "step": 3008, + "time_per_iteration": 2.6486122608184814 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045443, + "balance_loss_mlp": 1.00476897, + "epoch": 0.5788764909580608, + "flos": 768836742144.0, + "grad_norm": 0.02798603221606654, + "language_loss": 0.81355041, + "learning_rate": 0.0003973040666565613, + "loss": 0.82400489, + "num_input_tokens_seen": 251181376, + "router_z_loss_mlp": 0.40673828, + "step": 3009, + "time_per_iteration": 3.029721975326538 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046651, + "balance_loss_mlp": 1.00590491, + "epoch": 0.5790688726433244, + "flos": 600332220672.0, + "grad_norm": 0.03710521046969438, + "language_loss": 0.82796824, + "learning_rate": 0.000396999186440938, + "loss": 0.8384347, + "num_input_tokens_seen": 251256176, + "router_z_loss_mlp": 0.4074707, + "step": 3010, + "time_per_iteration": 2.866637945175171 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048008, + "balance_loss_mlp": 1.00711966, + "epoch": 0.5792612543285879, + "flos": 524106347520.0, + "grad_norm": 0.03822457095680595, + "language_loss": 0.85752803, + "learning_rate": 0.000396694346213777, + "loss": 0.86800808, + "num_input_tokens_seen": 251325344, + "router_z_loss_mlp": 0.40893555, + "step": 3011, + "time_per_iteration": 2.6125171184539795 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045119, + "balance_loss_mlp": 1.00430202, + "epoch": 0.5794536360138515, + "flos": 878080934400.0, + "grad_norm": 0.030461633114119882, + "language_loss": 0.8396455, + "learning_rate": 0.0003963895460934276, + "loss": 0.8500967, + "num_input_tokens_seen": 251406656, + "router_z_loss_mlp": 0.40820312, + "step": 3012, + "time_per_iteration": 3.1341123580932617 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047321, + "balance_loss_mlp": 1.00631309, + "epoch": 0.5796460176991151, + "flos": 402299118336.0, + "grad_norm": 0.04162907217084141, + "language_loss": 0.85323715, + "learning_rate": 0.00039608478619822376, + "loss": 0.86371034, + "num_input_tokens_seen": 251467760, + "router_z_loss_mlp": 0.41015625, + "step": 3013, + "time_per_iteration": 2.45570969581604 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045447, + "balance_loss_mlp": 1.00448704, + "epoch": 0.5798383993843786, + "flos": 619676424192.0, + "grad_norm": 0.02973237056850944, + "language_loss": 0.8328954, + "learning_rate": 0.00039578006664648394, + "loss": 0.84334981, + "num_input_tokens_seen": 251542272, + "router_z_loss_mlp": 0.40966797, + "step": 3014, + "time_per_iteration": 2.796370506286621 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044429, + "balance_loss_mlp": 1.00351644, + "epoch": 0.5800307810696421, + "flos": 845793615360.0, + "grad_norm": 0.037256106488294125, + "language_loss": 0.81995672, + "learning_rate": 0.0003954753875565105, + "loss": 0.83040106, + "num_input_tokens_seen": 251625584, + "router_z_loss_mlp": 0.40917969, + "step": 3015, + "time_per_iteration": 3.0796241760253906 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045336, + "balance_loss_mlp": 1.00442326, + "epoch": 0.5802231627549057, + "flos": 570365235456.0, + "grad_norm": 0.0302253929683373, + "language_loss": 0.82961631, + "learning_rate": 0.00039517074904659057, + "loss": 0.84006965, + "num_input_tokens_seen": 251696704, + "router_z_loss_mlp": 0.40917969, + "step": 3016, + "time_per_iteration": 2.6984057426452637 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105015, + "balance_loss_mlp": 1.00921345, + "epoch": 0.5804155444401693, + "flos": 661663062528.0, + "grad_norm": 0.033398230079863866, + "language_loss": 0.85268873, + "learning_rate": 0.00039486615123499535, + "loss": 0.86319029, + "num_input_tokens_seen": 251774784, + "router_z_loss_mlp": 0.40942383, + "step": 3017, + "time_per_iteration": 2.8348796367645264 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051377, + "balance_loss_mlp": 1.01022601, + "epoch": 0.5806079261254329, + "flos": 515058315264.0, + "grad_norm": 0.030637451118741787, + "language_loss": 0.85653043, + "learning_rate": 0.00039456159423997996, + "loss": 0.86704421, + "num_input_tokens_seen": 251844768, + "router_z_loss_mlp": 0.41162109, + "step": 3018, + "time_per_iteration": 2.6296215057373047 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048366, + "balance_loss_mlp": 1.00740576, + "epoch": 0.5808003078106965, + "flos": 529718109696.0, + "grad_norm": 0.03062870911456177, + "language_loss": 0.90210342, + "learning_rate": 0.00039425707817978406, + "loss": 0.91258705, + "num_input_tokens_seen": 251912736, + "router_z_loss_mlp": 0.40966797, + "step": 3019, + "time_per_iteration": 2.631979465484619 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048286, + "balance_loss_mlp": 1.00720644, + "epoch": 0.58099268949596, + "flos": 477997158912.0, + "grad_norm": 0.03679030272618613, + "language_loss": 0.84110886, + "learning_rate": 0.00039395260317263124, + "loss": 0.85159171, + "num_input_tokens_seen": 251979328, + "router_z_loss_mlp": 0.41088867, + "step": 3020, + "time_per_iteration": 2.584413528442383 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050357, + "balance_loss_mlp": 1.00930095, + "epoch": 0.5811850711812235, + "flos": 518688026112.0, + "grad_norm": 0.03473628129951431, + "language_loss": 0.85378569, + "learning_rate": 0.0003936481693367291, + "loss": 0.86428928, + "num_input_tokens_seen": 252050928, + "router_z_loss_mlp": 0.41064453, + "step": 3021, + "time_per_iteration": 2.6612508296966553 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049465, + "balance_loss_mlp": 1.00833774, + "epoch": 0.5813774528664871, + "flos": 617627298816.0, + "grad_norm": 0.037803518868136904, + "language_loss": 0.88371962, + "learning_rate": 0.0003933437767902697, + "loss": 0.89421427, + "num_input_tokens_seen": 252126496, + "router_z_loss_mlp": 0.41137695, + "step": 3022, + "time_per_iteration": 2.7910103797912598 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045958, + "balance_loss_mlp": 1.00499725, + "epoch": 0.5815698345517507, + "flos": 568604815104.0, + "grad_norm": 0.03314052138705104, + "language_loss": 0.78534555, + "learning_rate": 0.00039303942565142825, + "loss": 0.7958051, + "num_input_tokens_seen": 252203008, + "router_z_loss_mlp": 0.40966797, + "step": 3023, + "time_per_iteration": 2.7066261768341064 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046075, + "balance_loss_mlp": 1.00525796, + "epoch": 0.5817622162370142, + "flos": 564304375296.0, + "grad_norm": 0.034500169077956666, + "language_loss": 0.76946682, + "learning_rate": 0.0003927351160383644, + "loss": 0.77992761, + "num_input_tokens_seen": 252283440, + "router_z_loss_mlp": 0.40820312, + "step": 3024, + "time_per_iteration": 2.785215377807617 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044544, + "balance_loss_mlp": 1.00370252, + "epoch": 0.5819545979222778, + "flos": 460154806272.0, + "grad_norm": 0.03482271460519531, + "language_loss": 0.78468955, + "learning_rate": 0.000392430848069222, + "loss": 0.79513502, + "num_input_tokens_seen": 252351760, + "router_z_loss_mlp": 0.40844727, + "step": 3025, + "time_per_iteration": 2.5435454845428467 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104326, + "balance_loss_mlp": 1.00244236, + "epoch": 0.5821469796075414, + "flos": 542517361920.0, + "grad_norm": 0.03539348008973476, + "language_loss": 0.83090204, + "learning_rate": 0.00039212662186212795, + "loss": 0.8413347, + "num_input_tokens_seen": 252418480, + "router_z_loss_mlp": 0.40820312, + "step": 3026, + "time_per_iteration": 2.6203463077545166 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046114, + "balance_loss_mlp": 1.00534403, + "epoch": 0.582339361292805, + "flos": 553341365760.0, + "grad_norm": 0.030591419392928903, + "language_loss": 0.77452922, + "learning_rate": 0.0003918224375351934, + "loss": 0.78499031, + "num_input_tokens_seen": 252493712, + "router_z_loss_mlp": 0.40771484, + "step": 3027, + "time_per_iteration": 2.700643301010132 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047214, + "balance_loss_mlp": 1.00646877, + "epoch": 0.5825317429780685, + "flos": 497448287232.0, + "grad_norm": 0.03355698207676345, + "language_loss": 0.79253477, + "learning_rate": 0.0003915182952065135, + "loss": 0.80300689, + "num_input_tokens_seen": 252566096, + "router_z_loss_mlp": 0.4074707, + "step": 3028, + "time_per_iteration": 2.693223714828491 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043151, + "balance_loss_mlp": 1.00247645, + "epoch": 0.582724124663332, + "flos": 565255060992.0, + "grad_norm": 0.03374091506860629, + "language_loss": 0.88055015, + "learning_rate": 0.0003912141949941664, + "loss": 0.89098167, + "num_input_tokens_seen": 252639424, + "router_z_loss_mlp": 0.40673828, + "step": 3029, + "time_per_iteration": 2.674584150314331 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043287, + "balance_loss_mlp": 1.00249338, + "epoch": 0.5829165063485956, + "flos": 493112854272.0, + "grad_norm": 0.039605660090179254, + "language_loss": 0.83319384, + "learning_rate": 0.0003909101370162143, + "loss": 0.84362668, + "num_input_tokens_seen": 252706672, + "router_z_loss_mlp": 0.40795898, + "step": 3030, + "time_per_iteration": 2.592111587524414 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046574, + "balance_loss_mlp": 1.00718689, + "epoch": 0.5831088880338592, + "flos": 1531879941888.0, + "grad_norm": 0.006346134957791291, + "language_loss": 0.72433889, + "learning_rate": 0.00039060612139070326, + "loss": 0.73480463, + "num_input_tokens_seen": 252932464, + "router_z_loss_mlp": 0.39355469, + "step": 3031, + "time_per_iteration": 4.929339170455933 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047591, + "balance_loss_mlp": 1.00686908, + "epoch": 0.5833012697191228, + "flos": 619209829632.0, + "grad_norm": 0.03163493287885039, + "language_loss": 0.83241516, + "learning_rate": 0.0003903021482356622, + "loss": 0.8428911, + "num_input_tokens_seen": 253011920, + "router_z_loss_mlp": 0.40722656, + "step": 3032, + "time_per_iteration": 2.7828269004821777 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045762, + "balance_loss_mlp": 1.00508761, + "epoch": 0.5834936514043862, + "flos": 769294588416.0, + "grad_norm": 0.028764675594544035, + "language_loss": 0.83318806, + "learning_rate": 0.00038999821766910465, + "loss": 0.84364575, + "num_input_tokens_seen": 253091552, + "router_z_loss_mlp": 0.40673828, + "step": 3033, + "time_per_iteration": 2.976440906524658 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046889, + "balance_loss_mlp": 1.00616705, + "epoch": 0.5836860330896498, + "flos": 459316881408.0, + "grad_norm": 0.03570453873198092, + "language_loss": 0.86074644, + "learning_rate": 0.00038969432980902606, + "loss": 0.87121534, + "num_input_tokens_seen": 253158608, + "router_z_loss_mlp": 0.40722656, + "step": 3034, + "time_per_iteration": 2.5605523586273193 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049232, + "balance_loss_mlp": 1.00975037, + "epoch": 0.5838784147749134, + "flos": 1364198760960.0, + "grad_norm": 0.006741388763220325, + "language_loss": 0.79784501, + "learning_rate": 0.0003893904847734068, + "loss": 0.80833733, + "num_input_tokens_seen": 253381184, + "router_z_loss_mlp": 0.39453125, + "step": 3035, + "time_per_iteration": 4.870011329650879 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046432, + "balance_loss_mlp": 1.00566232, + "epoch": 0.584070796460177, + "flos": 568289865216.0, + "grad_norm": 0.0320953374409888, + "language_loss": 0.82746142, + "learning_rate": 0.00038908668268020953, + "loss": 0.83792579, + "num_input_tokens_seen": 253452880, + "router_z_loss_mlp": 0.40771484, + "step": 3036, + "time_per_iteration": 2.6482043266296387 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046786, + "balance_loss_mlp": 1.00582528, + "epoch": 0.5842631781454406, + "flos": 612666823680.0, + "grad_norm": 0.032158289179941596, + "language_loss": 0.85682309, + "learning_rate": 0.00038878292364738097, + "loss": 0.86729091, + "num_input_tokens_seen": 253530000, + "router_z_loss_mlp": 0.40966797, + "step": 3037, + "time_per_iteration": 2.7571158409118652 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104852, + "balance_loss_mlp": 1.00758314, + "epoch": 0.5844555598307041, + "flos": 464333736960.0, + "grad_norm": 0.037716829310632, + "language_loss": 0.87422657, + "learning_rate": 0.0003884792077928508, + "loss": 0.88471174, + "num_input_tokens_seen": 253593504, + "router_z_loss_mlp": 0.40942383, + "step": 3038, + "time_per_iteration": 2.5060815811157227 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046212, + "balance_loss_mlp": 1.00522828, + "epoch": 0.5846479415159677, + "flos": 411058445568.0, + "grad_norm": 0.036592459093467214, + "language_loss": 0.77285695, + "learning_rate": 0.0003881755352345322, + "loss": 0.78331912, + "num_input_tokens_seen": 253657904, + "router_z_loss_mlp": 0.40991211, + "step": 3039, + "time_per_iteration": 2.558833360671997 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049516, + "balance_loss_mlp": 1.0084126, + "epoch": 0.5848403232012312, + "flos": 492266181120.0, + "grad_norm": 0.028436591435814704, + "language_loss": 0.87703776, + "learning_rate": 0.0003878719060903207, + "loss": 0.88753295, + "num_input_tokens_seen": 253725280, + "router_z_loss_mlp": 0.41113281, + "step": 3040, + "time_per_iteration": 2.563680410385132 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048574, + "balance_loss_mlp": 1.0073278, + "epoch": 0.5850327048864948, + "flos": 585509121024.0, + "grad_norm": 0.03942000109029475, + "language_loss": 0.8397156, + "learning_rate": 0.0003875683204780961, + "loss": 0.85020131, + "num_input_tokens_seen": 253795040, + "router_z_loss_mlp": 0.41259766, + "step": 3041, + "time_per_iteration": 2.707235336303711 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046188, + "balance_loss_mlp": 1.00506115, + "epoch": 0.5852250865717584, + "flos": 652719042816.0, + "grad_norm": 0.03661913957485838, + "language_loss": 0.85946143, + "learning_rate": 0.00038726477851572043, + "loss": 0.86992323, + "num_input_tokens_seen": 253866384, + "router_z_loss_mlp": 0.41137695, + "step": 3042, + "time_per_iteration": 2.7779452800750732 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048593, + "balance_loss_mlp": 1.00753701, + "epoch": 0.5854174682570219, + "flos": 535620522240.0, + "grad_norm": 0.03519010087747146, + "language_loss": 0.80754662, + "learning_rate": 0.0003869612803210395, + "loss": 0.81803256, + "num_input_tokens_seen": 253935712, + "router_z_loss_mlp": 0.41064453, + "step": 3043, + "time_per_iteration": 2.64778733253479 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051479, + "balance_loss_mlp": 1.01044726, + "epoch": 0.5856098499422855, + "flos": 510759820800.0, + "grad_norm": 0.03494290194274924, + "language_loss": 0.83645654, + "learning_rate": 0.0003866578260118817, + "loss": 0.84697139, + "num_input_tokens_seen": 254003152, + "router_z_loss_mlp": 0.41040039, + "step": 3044, + "time_per_iteration": 2.596379041671753 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049295, + "balance_loss_mlp": 1.00828719, + "epoch": 0.5858022316275491, + "flos": 594993612288.0, + "grad_norm": 0.03849486234726574, + "language_loss": 0.83826196, + "learning_rate": 0.0003863544157060581, + "loss": 0.84875488, + "num_input_tokens_seen": 254072816, + "router_z_loss_mlp": 0.41015625, + "step": 3045, + "time_per_iteration": 2.6666998863220215 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049733, + "balance_loss_mlp": 1.0086298, + "epoch": 0.5859946133128127, + "flos": 560318885376.0, + "grad_norm": 0.02876341489298987, + "language_loss": 0.82639688, + "learning_rate": 0.0003860510495213634, + "loss": 0.83689421, + "num_input_tokens_seen": 254152800, + "router_z_loss_mlp": 0.41113281, + "step": 3046, + "time_per_iteration": 2.865504264831543 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049402, + "balance_loss_mlp": 1.00827503, + "epoch": 0.5861869949980761, + "flos": 554756700672.0, + "grad_norm": 0.0396946944562825, + "language_loss": 0.78689963, + "learning_rate": 0.0003857477275755746, + "loss": 0.79739368, + "num_input_tokens_seen": 254224384, + "router_z_loss_mlp": 0.41137695, + "step": 3047, + "time_per_iteration": 2.624819278717041 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049982, + "balance_loss_mlp": 1.00887823, + "epoch": 0.5863793766833397, + "flos": 720055331328.0, + "grad_norm": 0.02972376125592825, + "language_loss": 0.84339547, + "learning_rate": 0.00038544444998645167, + "loss": 0.85389531, + "num_input_tokens_seen": 254310960, + "router_z_loss_mlp": 0.41113281, + "step": 3048, + "time_per_iteration": 2.990790367126465 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048538, + "balance_loss_mlp": 1.00750625, + "epoch": 0.5865717583686033, + "flos": 473286504960.0, + "grad_norm": 0.034605288898392046, + "language_loss": 0.82032233, + "learning_rate": 0.00038514121687173767, + "loss": 0.83080769, + "num_input_tokens_seen": 254378336, + "router_z_loss_mlp": 0.41040039, + "step": 3049, + "time_per_iteration": 2.596529960632324 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049884, + "balance_loss_mlp": 1.0088284, + "epoch": 0.5867641400538669, + "flos": 814847754240.0, + "grad_norm": 0.03903750410866887, + "language_loss": 0.82380903, + "learning_rate": 0.00038483802834915807, + "loss": 0.83430791, + "num_input_tokens_seen": 254454352, + "router_z_loss_mlp": 0.41064453, + "step": 3050, + "time_per_iteration": 2.9996161460876465 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045889, + "balance_loss_mlp": 1.00480914, + "epoch": 0.5869565217391305, + "flos": 487518588672.0, + "grad_norm": 0.0350404565928551, + "language_loss": 0.79904723, + "learning_rate": 0.00038453488453642074, + "loss": 0.80950606, + "num_input_tokens_seen": 254526352, + "router_z_loss_mlp": 0.41088867, + "step": 3051, + "time_per_iteration": 2.7099759578704834 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047395, + "balance_loss_mlp": 1.00626779, + "epoch": 0.587148903424394, + "flos": 570512989440.0, + "grad_norm": 0.03324549798167153, + "language_loss": 0.8786602, + "learning_rate": 0.00038423178555121697, + "loss": 0.88913417, + "num_input_tokens_seen": 254598720, + "router_z_loss_mlp": 0.41137695, + "step": 3052, + "time_per_iteration": 2.684868097305298 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044815, + "balance_loss_mlp": 1.00359285, + "epoch": 0.5873412851096576, + "flos": 748695442944.0, + "grad_norm": 0.0344494509074348, + "language_loss": 0.86014688, + "learning_rate": 0.00038392873151121994, + "loss": 0.87059504, + "num_input_tokens_seen": 254683664, + "router_z_loss_mlp": 0.41235352, + "step": 3053, + "time_per_iteration": 3.073838949203491 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042019, + "balance_loss_mlp": 1.00079656, + "epoch": 0.5875336667949211, + "flos": 529188331776.0, + "grad_norm": 0.03507235034672983, + "language_loss": 0.83636832, + "learning_rate": 0.0003836257225340859, + "loss": 0.84678853, + "num_input_tokens_seen": 254754688, + "router_z_loss_mlp": 0.41235352, + "step": 3054, + "time_per_iteration": 2.6333680152893066 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104176, + "balance_loss_mlp": 1.000633, + "epoch": 0.5877260484801847, + "flos": 825641622528.0, + "grad_norm": 0.032727897026981576, + "language_loss": 0.82534069, + "learning_rate": 0.00038332275873745336, + "loss": 0.83575833, + "num_input_tokens_seen": 254838976, + "router_z_loss_mlp": 0.41137695, + "step": 3055, + "time_per_iteration": 3.051757335662842 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044126, + "balance_loss_mlp": 1.00292683, + "epoch": 0.5879184301654482, + "flos": 592694665728.0, + "grad_norm": 0.030899230424817493, + "language_loss": 0.83323562, + "learning_rate": 0.0003830198402389431, + "loss": 0.84367692, + "num_input_tokens_seen": 254912912, + "router_z_loss_mlp": 0.41210938, + "step": 3056, + "time_per_iteration": 2.6873278617858887 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043037, + "balance_loss_mlp": 1.00317383, + "epoch": 0.5881108118507118, + "flos": 1549226531328.0, + "grad_norm": 0.008859615514711313, + "language_loss": 0.77348936, + "learning_rate": 0.0003827169671561585, + "loss": 0.78391969, + "num_input_tokens_seen": 255151488, + "router_z_loss_mlp": 0.3984375, + "step": 3057, + "time_per_iteration": 5.044417142868042 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045675, + "balance_loss_mlp": 1.00461972, + "epoch": 0.5883031935359754, + "flos": 490599079680.0, + "grad_norm": 0.03687508634060279, + "language_loss": 0.83287209, + "learning_rate": 0.0003824141396066855, + "loss": 0.84332883, + "num_input_tokens_seen": 255218896, + "router_z_loss_mlp": 0.41064453, + "step": 3058, + "time_per_iteration": 2.57017183303833 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045783, + "balance_loss_mlp": 1.00458455, + "epoch": 0.588495575221239, + "flos": 583981025280.0, + "grad_norm": 0.03543871049956236, + "language_loss": 0.83470112, + "learning_rate": 0.000382111357708092, + "loss": 0.84515893, + "num_input_tokens_seen": 255287408, + "router_z_loss_mlp": 0.41210938, + "step": 3059, + "time_per_iteration": 2.710636615753174 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046053, + "balance_loss_mlp": 1.00492609, + "epoch": 0.5886879569065026, + "flos": 662240472576.0, + "grad_norm": 0.03467029745908185, + "language_loss": 0.84034348, + "learning_rate": 0.00038180862157792864, + "loss": 0.85080403, + "num_input_tokens_seen": 255358432, + "router_z_loss_mlp": 0.41137695, + "step": 3060, + "time_per_iteration": 2.765730619430542 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045299, + "balance_loss_mlp": 1.00429142, + "epoch": 0.588880338591766, + "flos": 563720162304.0, + "grad_norm": 0.034528332603885874, + "language_loss": 0.82661986, + "learning_rate": 0.0003815059313337279, + "loss": 0.83707285, + "num_input_tokens_seen": 255425744, + "router_z_loss_mlp": 0.41015625, + "step": 3061, + "time_per_iteration": 2.6512649059295654 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044327, + "balance_loss_mlp": 1.00339055, + "epoch": 0.5890727202770296, + "flos": 555853195008.0, + "grad_norm": 0.028645191608940447, + "language_loss": 0.78527474, + "learning_rate": 0.00038120328709300436, + "loss": 0.79571807, + "num_input_tokens_seen": 255505808, + "router_z_loss_mlp": 0.40942383, + "step": 3062, + "time_per_iteration": 2.839588165283203 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044224, + "balance_loss_mlp": 1.00321651, + "epoch": 0.5892651019622932, + "flos": 656702587392.0, + "grad_norm": 0.03868775593308096, + "language_loss": 0.83858323, + "learning_rate": 0.0003809006889732549, + "loss": 0.84902555, + "num_input_tokens_seen": 255580160, + "router_z_loss_mlp": 0.41015625, + "step": 3063, + "time_per_iteration": 2.80668306350708 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044985, + "balance_loss_mlp": 1.00395334, + "epoch": 0.5894574836475568, + "flos": 454132829952.0, + "grad_norm": 0.034675820144419535, + "language_loss": 0.8846643, + "learning_rate": 0.0003805981370919589, + "loss": 0.89511412, + "num_input_tokens_seen": 255644016, + "router_z_loss_mlp": 0.41040039, + "step": 3064, + "time_per_iteration": 2.4926044940948486 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046275, + "balance_loss_mlp": 1.00509965, + "epoch": 0.5896498653328203, + "flos": 520112109312.0, + "grad_norm": 0.03109338069781882, + "language_loss": 0.843858, + "learning_rate": 0.0003802956315665771, + "loss": 0.85432076, + "num_input_tokens_seen": 255718192, + "router_z_loss_mlp": 0.41186523, + "step": 3065, + "time_per_iteration": 2.6821701526641846 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046507, + "balance_loss_mlp": 1.00530875, + "epoch": 0.5898422470180839, + "flos": 550084930560.0, + "grad_norm": 0.039548358411626815, + "language_loss": 0.82298601, + "learning_rate": 0.0003799931725145529, + "loss": 0.83345109, + "num_input_tokens_seen": 255787696, + "router_z_loss_mlp": 0.41210938, + "step": 3066, + "time_per_iteration": 2.6161272525787354 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046312, + "balance_loss_mlp": 1.00532758, + "epoch": 0.5900346287033474, + "flos": 525380731392.0, + "grad_norm": 0.034195441532662435, + "language_loss": 0.86171907, + "learning_rate": 0.00037969076005331083, + "loss": 0.87218219, + "num_input_tokens_seen": 255862992, + "router_z_loss_mlp": 0.40991211, + "step": 3067, + "time_per_iteration": 2.769503116607666 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046067, + "balance_loss_mlp": 1.00515461, + "epoch": 0.590227010388611, + "flos": 568215988224.0, + "grad_norm": 0.03443045458348014, + "language_loss": 0.88715112, + "learning_rate": 0.00037938839430025817, + "loss": 0.8976118, + "num_input_tokens_seen": 255931872, + "router_z_loss_mlp": 0.40917969, + "step": 3068, + "time_per_iteration": 2.626838207244873 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046372, + "balance_loss_mlp": 1.00557816, + "epoch": 0.5904193920738746, + "flos": 584456368128.0, + "grad_norm": 0.03106221395948033, + "language_loss": 0.86157519, + "learning_rate": 0.0003790860753727835, + "loss": 0.8720389, + "num_input_tokens_seen": 256004656, + "router_z_loss_mlp": 0.40795898, + "step": 3069, + "time_per_iteration": 2.825906991958618 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044904, + "balance_loss_mlp": 1.0041821, + "epoch": 0.5906117737591381, + "flos": 530797107456.0, + "grad_norm": 0.033655572520404166, + "language_loss": 0.83318973, + "learning_rate": 0.00037878380338825766, + "loss": 0.84363884, + "num_input_tokens_seen": 256076944, + "router_z_loss_mlp": 0.40722656, + "step": 3070, + "time_per_iteration": 2.6605753898620605 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043416, + "balance_loss_mlp": 1.00264668, + "epoch": 0.5908041554444017, + "flos": 685516697856.0, + "grad_norm": 0.032255816781200916, + "language_loss": 0.81519401, + "learning_rate": 0.00037848157846403287, + "loss": 0.82562816, + "num_input_tokens_seen": 256154768, + "router_z_loss_mlp": 0.40771484, + "step": 3071, + "time_per_iteration": 2.8913676738739014 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047866, + "balance_loss_mlp": 1.00712073, + "epoch": 0.5909965371296653, + "flos": 551133792768.0, + "grad_norm": 0.033304308768315895, + "language_loss": 0.83666503, + "learning_rate": 0.0003781794007174435, + "loss": 0.84714377, + "num_input_tokens_seen": 256230896, + "router_z_loss_mlp": 0.4074707, + "step": 3072, + "time_per_iteration": 2.7170376777648926 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044559, + "balance_loss_mlp": 1.00498199, + "epoch": 0.5911889188149289, + "flos": 1495645038336.0, + "grad_norm": 0.0062576164066865435, + "language_loss": 0.74074531, + "learning_rate": 0.0003778772702658051, + "loss": 0.7511909, + "num_input_tokens_seen": 256462336, + "router_z_loss_mlp": 0.39550781, + "step": 3073, + "time_per_iteration": 4.848031282424927 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053183, + "balance_loss_mlp": 1.01248538, + "epoch": 0.5913813005001923, + "flos": 488886291456.0, + "grad_norm": 0.03164327687157731, + "language_loss": 0.81542623, + "learning_rate": 0.0003775751872264152, + "loss": 0.82595801, + "num_input_tokens_seen": 256539376, + "router_z_loss_mlp": 0.40698242, + "step": 3074, + "time_per_iteration": 2.7835612297058105 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047821, + "balance_loss_mlp": 1.00721872, + "epoch": 0.5915736821854559, + "flos": 574522778880.0, + "grad_norm": 0.03137518576611995, + "language_loss": 0.87806273, + "learning_rate": 0.0003772731517165527, + "loss": 0.88854092, + "num_input_tokens_seen": 256617728, + "router_z_loss_mlp": 0.40600586, + "step": 3075, + "time_per_iteration": 2.7984819412231445 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045123, + "balance_loss_mlp": 1.00451982, + "epoch": 0.5917660638707195, + "flos": 790861916160.0, + "grad_norm": 0.03467745447845496, + "language_loss": 0.83953345, + "learning_rate": 0.0003769711638534784, + "loss": 0.84998471, + "num_input_tokens_seen": 256696032, + "router_z_loss_mlp": 0.40600586, + "step": 3076, + "time_per_iteration": 2.9498283863067627 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045943, + "balance_loss_mlp": 1.0053643, + "epoch": 0.5919584455559831, + "flos": 529756993536.0, + "grad_norm": 0.038274807826461636, + "language_loss": 0.7910676, + "learning_rate": 0.00037666922375443446, + "loss": 0.80152702, + "num_input_tokens_seen": 256767360, + "router_z_loss_mlp": 0.40576172, + "step": 3077, + "time_per_iteration": 2.595907211303711 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043771, + "balance_loss_mlp": 1.00312054, + "epoch": 0.5921508272412467, + "flos": 561753662208.0, + "grad_norm": 0.037448898185008676, + "language_loss": 0.82402956, + "learning_rate": 0.00037636733153664396, + "loss": 0.83446729, + "num_input_tokens_seen": 256844848, + "router_z_loss_mlp": 0.40649414, + "step": 3078, + "time_per_iteration": 2.8082337379455566 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050449, + "balance_loss_mlp": 1.00984669, + "epoch": 0.5923432089265102, + "flos": 564334510848.0, + "grad_norm": 0.04535413457726027, + "language_loss": 0.80388999, + "learning_rate": 0.0003760654873173124, + "loss": 0.81439447, + "num_input_tokens_seen": 256916688, + "router_z_loss_mlp": 0.40600586, + "step": 3079, + "time_per_iteration": 2.6586430072784424 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048929, + "balance_loss_mlp": 1.00832665, + "epoch": 0.5925355906117737, + "flos": 496751313408.0, + "grad_norm": 0.032303837876808815, + "language_loss": 0.82224989, + "learning_rate": 0.00037576369121362566, + "loss": 0.83273923, + "num_input_tokens_seen": 256985520, + "router_z_loss_mlp": 0.40600586, + "step": 3080, + "time_per_iteration": 2.5874335765838623 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049072, + "balance_loss_mlp": 1.00846922, + "epoch": 0.5927279722970373, + "flos": 567493736448.0, + "grad_norm": 0.03169427730059961, + "language_loss": 0.82085633, + "learning_rate": 0.0003754619433427516, + "loss": 0.83134699, + "num_input_tokens_seen": 257067552, + "router_z_loss_mlp": 0.40600586, + "step": 3081, + "time_per_iteration": 2.9037671089172363 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044705, + "balance_loss_mlp": 1.00400662, + "epoch": 0.5929203539823009, + "flos": 668160381696.0, + "grad_norm": 0.04430970694991959, + "language_loss": 0.78507918, + "learning_rate": 0.0003751602438218392, + "loss": 0.79552627, + "num_input_tokens_seen": 257138896, + "router_z_loss_mlp": 0.40698242, + "step": 3082, + "time_per_iteration": 2.77486252784729 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042632, + "balance_loss_mlp": 1.00195801, + "epoch": 0.5931127356675644, + "flos": 556786384128.0, + "grad_norm": 0.03446517582568327, + "language_loss": 0.84122735, + "learning_rate": 0.0003748585927680186, + "loss": 0.8516537, + "num_input_tokens_seen": 257210592, + "router_z_loss_mlp": 0.40673828, + "step": 3083, + "time_per_iteration": 2.6401243209838867 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047685, + "balance_loss_mlp": 1.00698733, + "epoch": 0.593305117352828, + "flos": 536243619072.0, + "grad_norm": 0.03379156982252967, + "language_loss": 0.83284605, + "learning_rate": 0.00037455699029840086, + "loss": 0.84332293, + "num_input_tokens_seen": 257276208, + "router_z_loss_mlp": 0.40698242, + "step": 3084, + "time_per_iteration": 2.6359477043151855 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047934, + "balance_loss_mlp": 1.00723624, + "epoch": 0.5934974990380916, + "flos": 595058740992.0, + "grad_norm": 0.03375272766067447, + "language_loss": 0.84866869, + "learning_rate": 0.0003742554365300787, + "loss": 0.85914803, + "num_input_tokens_seen": 257351920, + "router_z_loss_mlp": 0.40698242, + "step": 3085, + "time_per_iteration": 2.7629523277282715 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047831, + "balance_loss_mlp": 1.00727594, + "epoch": 0.5936898807233552, + "flos": 714015858432.0, + "grad_norm": 0.08464198739198994, + "language_loss": 0.79301089, + "learning_rate": 0.0003739539315801255, + "loss": 0.80348921, + "num_input_tokens_seen": 257430016, + "router_z_loss_mlp": 0.40551758, + "step": 3086, + "time_per_iteration": 2.9152019023895264 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105055, + "balance_loss_mlp": 1.01004303, + "epoch": 0.5938822624086187, + "flos": 392749498368.0, + "grad_norm": 0.03659508144201786, + "language_loss": 0.92428821, + "learning_rate": 0.000373652475565596, + "loss": 0.93479371, + "num_input_tokens_seen": 257492224, + "router_z_loss_mlp": 0.4050293, + "step": 3087, + "time_per_iteration": 2.4702134132385254 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050448, + "balance_loss_mlp": 1.00982189, + "epoch": 0.5940746440938822, + "flos": 481336219392.0, + "grad_norm": 0.034289442552625136, + "language_loss": 0.81692433, + "learning_rate": 0.00037335106860352587, + "loss": 0.82742882, + "num_input_tokens_seen": 257567824, + "router_z_loss_mlp": 0.40625, + "step": 3088, + "time_per_iteration": 2.675694704055786 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043994, + "balance_loss_mlp": 1.00322449, + "epoch": 0.5942670257791458, + "flos": 484307840256.0, + "grad_norm": 0.03351872550432346, + "language_loss": 0.8348605, + "learning_rate": 0.00037304971081093146, + "loss": 0.84530044, + "num_input_tokens_seen": 257635488, + "router_z_loss_mlp": 0.40771484, + "step": 3089, + "time_per_iteration": 2.5974292755126953 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042488, + "balance_loss_mlp": 1.00181389, + "epoch": 0.5944594074644094, + "flos": 549058422528.0, + "grad_norm": 0.03144984032595776, + "language_loss": 0.81257939, + "learning_rate": 0.00037274840230481024, + "loss": 0.82300425, + "num_input_tokens_seen": 257709552, + "router_z_loss_mlp": 0.40673828, + "step": 3090, + "time_per_iteration": 2.7465951442718506 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104328, + "balance_loss_mlp": 1.00262976, + "epoch": 0.594651789149673, + "flos": 450129843456.0, + "grad_norm": 0.0354227551067568, + "language_loss": 0.79578584, + "learning_rate": 0.00037244714320214077, + "loss": 0.80621862, + "num_input_tokens_seen": 257775520, + "router_z_loss_mlp": 0.40649414, + "step": 3091, + "time_per_iteration": 2.532076597213745 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045549, + "balance_loss_mlp": 1.00489831, + "epoch": 0.5948441708349365, + "flos": 597466557696.0, + "grad_norm": 0.033875543124705955, + "language_loss": 0.83456963, + "learning_rate": 0.000372145933619882, + "loss": 0.84502512, + "num_input_tokens_seen": 257858560, + "router_z_loss_mlp": 0.40649414, + "step": 3092, + "time_per_iteration": 2.888296127319336 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045702, + "balance_loss_mlp": 1.00502765, + "epoch": 0.5950365525202, + "flos": 549581397504.0, + "grad_norm": 0.03918584024885415, + "language_loss": 0.83476591, + "learning_rate": 0.000371844773674974, + "loss": 0.84522295, + "num_input_tokens_seen": 257928048, + "router_z_loss_mlp": 0.40673828, + "step": 3093, + "time_per_iteration": 2.641191244125366 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042191, + "balance_loss_mlp": 1.00146902, + "epoch": 0.5952289342054636, + "flos": 655964784384.0, + "grad_norm": 0.03345437818943746, + "language_loss": 0.82307684, + "learning_rate": 0.0003715436634843375, + "loss": 0.83349872, + "num_input_tokens_seen": 258003088, + "router_z_loss_mlp": 0.40722656, + "step": 3094, + "time_per_iteration": 2.8391387462615967 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042413, + "balance_loss_mlp": 1.00185752, + "epoch": 0.5954213158907272, + "flos": 604604470272.0, + "grad_norm": 0.028714859262846556, + "language_loss": 0.8123939, + "learning_rate": 0.00037124260316487355, + "loss": 0.82281804, + "num_input_tokens_seen": 258084880, + "router_z_loss_mlp": 0.40551758, + "step": 3095, + "time_per_iteration": 2.8300905227661133 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048221, + "balance_loss_mlp": 1.00742722, + "epoch": 0.5956136975759908, + "flos": 487268767488.0, + "grad_norm": 0.03390156256560374, + "language_loss": 0.89901024, + "learning_rate": 0.0003709415928334643, + "loss": 0.90949249, + "num_input_tokens_seen": 258152032, + "router_z_loss_mlp": 0.40795898, + "step": 3096, + "time_per_iteration": 2.594320297241211 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104465, + "balance_loss_mlp": 1.00376081, + "epoch": 0.5958060792612543, + "flos": 660041647872.0, + "grad_norm": 0.036547009459556086, + "language_loss": 0.8143428, + "learning_rate": 0.00037064063260697233, + "loss": 0.82478929, + "num_input_tokens_seen": 258228896, + "router_z_loss_mlp": 0.40893555, + "step": 3097, + "time_per_iteration": 2.853452205657959 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044795, + "balance_loss_mlp": 1.00397766, + "epoch": 0.5959984609465179, + "flos": 724996364544.0, + "grad_norm": 0.03336502037481855, + "language_loss": 0.78911316, + "learning_rate": 0.0003703397226022407, + "loss": 0.79956114, + "num_input_tokens_seen": 258311152, + "router_z_loss_mlp": 0.40820312, + "step": 3098, + "time_per_iteration": 3.0299534797668457 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050957, + "balance_loss_mlp": 1.01147461, + "epoch": 0.5961908426317815, + "flos": 1523221703424.0, + "grad_norm": 0.010872658804754508, + "language_loss": 0.75499874, + "learning_rate": 0.00037003886293609335, + "loss": 0.76550829, + "num_input_tokens_seen": 258540656, + "router_z_loss_mlp": 0.39453125, + "step": 3099, + "time_per_iteration": 4.950707674026489 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044673, + "balance_loss_mlp": 1.00387943, + "epoch": 0.596383224317045, + "flos": 533647219200.0, + "grad_norm": 0.033784299285581076, + "language_loss": 0.84084308, + "learning_rate": 0.0003697380537253339, + "loss": 0.85128987, + "num_input_tokens_seen": 258608960, + "router_z_loss_mlp": 0.40795898, + "step": 3100, + "time_per_iteration": 2.6651411056518555 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044743, + "balance_loss_mlp": 1.00394928, + "epoch": 0.5965756060023086, + "flos": 592367076864.0, + "grad_norm": 0.032025449945388196, + "language_loss": 0.82004619, + "learning_rate": 0.0003694372950867471, + "loss": 0.83049357, + "num_input_tokens_seen": 258684304, + "router_z_loss_mlp": 0.40795898, + "step": 3101, + "time_per_iteration": 2.7825992107391357 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044062, + "balance_loss_mlp": 1.00341129, + "epoch": 0.5967679876875721, + "flos": 863470717440.0, + "grad_norm": 0.0338522286072748, + "language_loss": 0.78029126, + "learning_rate": 0.0003691365871370976, + "loss": 0.79073191, + "num_input_tokens_seen": 258769472, + "router_z_loss_mlp": 0.40649414, + "step": 3102, + "time_per_iteration": 3.0174319744110107 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044179, + "balance_loss_mlp": 1.00340927, + "epoch": 0.5969603693728357, + "flos": 554878209792.0, + "grad_norm": 0.03201933469342105, + "language_loss": 0.85875535, + "learning_rate": 0.00036883592999313093, + "loss": 0.86919713, + "num_input_tokens_seen": 258841696, + "router_z_loss_mlp": 0.40771484, + "step": 3103, + "time_per_iteration": 2.683260679244995 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043823, + "balance_loss_mlp": 1.00314891, + "epoch": 0.5971527510580993, + "flos": 719937712896.0, + "grad_norm": 0.039464615758245, + "language_loss": 0.79932439, + "learning_rate": 0.0003685353237715722, + "loss": 0.80976272, + "num_input_tokens_seen": 258915616, + "router_z_loss_mlp": 0.40673828, + "step": 3104, + "time_per_iteration": 2.8593432903289795 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043868, + "balance_loss_mlp": 1.00312221, + "epoch": 0.5973451327433629, + "flos": 648863810304.0, + "grad_norm": 0.031062495288944163, + "language_loss": 0.82383978, + "learning_rate": 0.0003682347685891274, + "loss": 0.83427846, + "num_input_tokens_seen": 258994080, + "router_z_loss_mlp": 0.4074707, + "step": 3105, + "time_per_iteration": 2.840812921524048 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045744, + "balance_loss_mlp": 1.00504565, + "epoch": 0.5975375144286263, + "flos": 723090135552.0, + "grad_norm": 0.03430317325592521, + "language_loss": 0.81334996, + "learning_rate": 0.0003679342645624822, + "loss": 0.82380736, + "num_input_tokens_seen": 259075968, + "router_z_loss_mlp": 0.40698242, + "step": 3106, + "time_per_iteration": 2.961186408996582 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045947, + "balance_loss_mlp": 1.00520086, + "epoch": 0.5977298961138899, + "flos": 752344595712.0, + "grad_norm": 0.03201923744385334, + "language_loss": 0.82261443, + "learning_rate": 0.0003676338118083025, + "loss": 0.83307385, + "num_input_tokens_seen": 259162512, + "router_z_loss_mlp": 0.4074707, + "step": 3107, + "time_per_iteration": 2.9809908866882324 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105353, + "balance_loss_mlp": 1.01264107, + "epoch": 0.5979222777991535, + "flos": 531999559680.0, + "grad_norm": 0.03643788911431517, + "language_loss": 0.79681456, + "learning_rate": 0.0003673334104432347, + "loss": 0.8073498, + "num_input_tokens_seen": 259228752, + "router_z_loss_mlp": 0.40893555, + "step": 3108, + "time_per_iteration": 2.5879976749420166 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052396, + "balance_loss_mlp": 1.01157844, + "epoch": 0.5981146594844171, + "flos": 622915362816.0, + "grad_norm": 0.031178647905512342, + "language_loss": 0.84073299, + "learning_rate": 0.0003670330605839048, + "loss": 0.85125697, + "num_input_tokens_seen": 259303440, + "router_z_loss_mlp": 0.40820312, + "step": 3109, + "time_per_iteration": 2.843069314956665 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049611, + "balance_loss_mlp": 1.00877023, + "epoch": 0.5983070411696807, + "flos": 604710428160.0, + "grad_norm": 0.03611015998230635, + "language_loss": 0.77344596, + "learning_rate": 0.0003667327623469191, + "loss": 0.7839421, + "num_input_tokens_seen": 259378752, + "router_z_loss_mlp": 0.40844727, + "step": 3110, + "time_per_iteration": 2.7326698303222656 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045229, + "balance_loss_mlp": 1.00438774, + "epoch": 0.5984994228549442, + "flos": 634670610432.0, + "grad_norm": 0.03877534508876671, + "language_loss": 0.78326917, + "learning_rate": 0.00036643251584886333, + "loss": 0.79372144, + "num_input_tokens_seen": 259454336, + "router_z_loss_mlp": 0.40844727, + "step": 3111, + "time_per_iteration": 2.784482717514038 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105463, + "balance_loss_mlp": 1.01369393, + "epoch": 0.5986918045402078, + "flos": 526294478592.0, + "grad_norm": 0.03280596002015671, + "language_loss": 0.82781613, + "learning_rate": 0.00036613232120630393, + "loss": 0.83836246, + "num_input_tokens_seen": 259518960, + "router_z_loss_mlp": 0.40942383, + "step": 3112, + "time_per_iteration": 2.5862860679626465 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105133, + "balance_loss_mlp": 1.0103699, + "epoch": 0.5988841862254713, + "flos": 484140644352.0, + "grad_norm": 0.03859230842611924, + "language_loss": 0.80514455, + "learning_rate": 0.00036583217853578643, + "loss": 0.81565785, + "num_input_tokens_seen": 259584352, + "router_z_loss_mlp": 0.40966797, + "step": 3113, + "time_per_iteration": 2.565713405609131 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048995, + "balance_loss_mlp": 1.00805807, + "epoch": 0.5990765679107349, + "flos": 1142123451648.0, + "grad_norm": 0.034390898471739054, + "language_loss": 0.77730286, + "learning_rate": 0.000365532087953837, + "loss": 0.78779286, + "num_input_tokens_seen": 259693152, + "router_z_loss_mlp": 0.40942383, + "step": 3114, + "time_per_iteration": 3.646124839782715 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049853, + "balance_loss_mlp": 1.00889242, + "epoch": 0.5992689495959984, + "flos": 518019242496.0, + "grad_norm": 0.033850887819700186, + "language_loss": 0.89597213, + "learning_rate": 0.00036523204957696065, + "loss": 0.90647066, + "num_input_tokens_seen": 259762048, + "router_z_loss_mlp": 0.40966797, + "step": 3115, + "time_per_iteration": 2.594458818435669 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050487, + "balance_loss_mlp": 1.00952673, + "epoch": 0.599461331281262, + "flos": 745942540800.0, + "grad_norm": 0.044244117222237124, + "language_loss": 0.81526911, + "learning_rate": 0.00036493206352164324, + "loss": 0.82577395, + "num_input_tokens_seen": 259843184, + "router_z_loss_mlp": 0.40966797, + "step": 3116, + "time_per_iteration": 2.9088714122772217 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046863, + "balance_loss_mlp": 1.0058552, + "epoch": 0.5996537129665256, + "flos": 593484958464.0, + "grad_norm": 0.034019953192927346, + "language_loss": 0.85863578, + "learning_rate": 0.000364632129904349, + "loss": 0.8691045, + "num_input_tokens_seen": 259912720, + "router_z_loss_mlp": 0.41015625, + "step": 3117, + "time_per_iteration": 2.7059812545776367 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055187, + "balance_loss_mlp": 1.01415479, + "epoch": 0.5998460946517892, + "flos": 560116696320.0, + "grad_norm": 0.0363455836603733, + "language_loss": 0.78243721, + "learning_rate": 0.00036433224884152283, + "loss": 0.79298902, + "num_input_tokens_seen": 259985472, + "router_z_loss_mlp": 0.41040039, + "step": 3118, + "time_per_iteration": 2.7368576526641846 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049846, + "balance_loss_mlp": 1.00879073, + "epoch": 0.6000384763370528, + "flos": 485536537344.0, + "grad_norm": 0.037553840644260136, + "language_loss": 0.78583586, + "learning_rate": 0.00036403242044958875, + "loss": 0.79633433, + "num_input_tokens_seen": 260050336, + "router_z_loss_mlp": 0.41064453, + "step": 3119, + "time_per_iteration": 2.5575714111328125 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105366, + "balance_loss_mlp": 1.01267588, + "epoch": 0.6002308580223162, + "flos": 597878717184.0, + "grad_norm": 0.03820222884564333, + "language_loss": 0.91700655, + "learning_rate": 0.0003637326448449507, + "loss": 0.9275431, + "num_input_tokens_seen": 260120304, + "router_z_loss_mlp": 0.40991211, + "step": 3120, + "time_per_iteration": 2.742879629135132 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044338, + "balance_loss_mlp": 1.00335419, + "epoch": 0.6004232397075798, + "flos": 546220949760.0, + "grad_norm": 0.03312076086842182, + "language_loss": 0.86720824, + "learning_rate": 0.00036343292214399177, + "loss": 0.87765157, + "num_input_tokens_seen": 260198304, + "router_z_loss_mlp": 0.40991211, + "step": 3121, + "time_per_iteration": 2.827937364578247 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048688, + "balance_loss_mlp": 1.00777555, + "epoch": 0.6006156213928434, + "flos": 631151715072.0, + "grad_norm": 0.0990751082853954, + "language_loss": 0.77571696, + "learning_rate": 0.00036313325246307456, + "loss": 0.78620386, + "num_input_tokens_seen": 260277664, + "router_z_loss_mlp": 0.40917969, + "step": 3122, + "time_per_iteration": 2.844771146774292 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044272, + "balance_loss_mlp": 1.00347829, + "epoch": 0.600808003078107, + "flos": 583405560576.0, + "grad_norm": 0.0330511855915857, + "language_loss": 0.87869143, + "learning_rate": 0.0003628336359185411, + "loss": 0.88913417, + "num_input_tokens_seen": 260350096, + "router_z_loss_mlp": 0.40795898, + "step": 3123, + "time_per_iteration": 2.728536367416382 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048661, + "balance_loss_mlp": 1.00810611, + "epoch": 0.6010003847633705, + "flos": 636439779072.0, + "grad_norm": 0.035612142743683524, + "language_loss": 0.75946915, + "learning_rate": 0.000362534072626713, + "loss": 0.76995575, + "num_input_tokens_seen": 260421888, + "router_z_loss_mlp": 0.40551758, + "step": 3124, + "time_per_iteration": 2.7660484313964844 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049761, + "balance_loss_mlp": 1.00915837, + "epoch": 0.6011927664486341, + "flos": 720031031808.0, + "grad_norm": 0.034873879848328126, + "language_loss": 0.81774855, + "learning_rate": 0.00036223456270389093, + "loss": 0.82824624, + "num_input_tokens_seen": 260499616, + "router_z_loss_mlp": 0.40600586, + "step": 3125, + "time_per_iteration": 2.943265676498413 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050049, + "balance_loss_mlp": 1.00939894, + "epoch": 0.6013851481338977, + "flos": 500055380736.0, + "grad_norm": 0.03349756434082021, + "language_loss": 0.81548929, + "learning_rate": 0.00036193510626635517, + "loss": 0.82598984, + "num_input_tokens_seen": 260572048, + "router_z_loss_mlp": 0.40649414, + "step": 3126, + "time_per_iteration": 2.7160630226135254 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049829, + "balance_loss_mlp": 1.00929773, + "epoch": 0.6015775298191612, + "flos": 750876771072.0, + "grad_norm": 0.03275922867012815, + "language_loss": 0.81968188, + "learning_rate": 0.0003616357034303649, + "loss": 0.83018017, + "num_input_tokens_seen": 260644720, + "router_z_loss_mlp": 0.40527344, + "step": 3127, + "time_per_iteration": 2.9286913871765137 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047883, + "balance_loss_mlp": 1.00725627, + "epoch": 0.6017699115044248, + "flos": 594264557568.0, + "grad_norm": 0.02908266373706377, + "language_loss": 0.79201299, + "learning_rate": 0.0003613363543121584, + "loss": 0.80249178, + "num_input_tokens_seen": 260724864, + "router_z_loss_mlp": 0.40625, + "step": 3128, + "time_per_iteration": 2.917598009109497 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046286, + "balance_loss_mlp": 1.00568318, + "epoch": 0.6019622931896883, + "flos": 516202441728.0, + "grad_norm": 0.031364349484999776, + "language_loss": 0.85277975, + "learning_rate": 0.00036103705902795357, + "loss": 0.86324257, + "num_input_tokens_seen": 260800896, + "router_z_loss_mlp": 0.40600586, + "step": 3129, + "time_per_iteration": 2.7694129943847656 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051152, + "balance_loss_mlp": 1.01047814, + "epoch": 0.6021546748749519, + "flos": 491473943040.0, + "grad_norm": 0.0392414269589035, + "language_loss": 0.80161059, + "learning_rate": 0.0003607378176939471, + "loss": 0.81212205, + "num_input_tokens_seen": 260872736, + "router_z_loss_mlp": 0.40673828, + "step": 3130, + "time_per_iteration": 2.622267961502075 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055595, + "balance_loss_mlp": 1.01494503, + "epoch": 0.6023470565602155, + "flos": 542115896064.0, + "grad_norm": 0.037876950900112984, + "language_loss": 0.82781708, + "learning_rate": 0.00036043863042631465, + "loss": 0.83837301, + "num_input_tokens_seen": 260943264, + "router_z_loss_mlp": 0.40649414, + "step": 3131, + "time_per_iteration": 2.7120039463043213 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052284, + "balance_loss_mlp": 1.01163399, + "epoch": 0.6025394382454791, + "flos": 846464344320.0, + "grad_norm": 0.039947813860245845, + "language_loss": 0.76966566, + "learning_rate": 0.00036013949734121133, + "loss": 0.78018856, + "num_input_tokens_seen": 261030064, + "router_z_loss_mlp": 0.40649414, + "step": 3132, + "time_per_iteration": 3.127255916595459 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050529, + "balance_loss_mlp": 1.00990224, + "epoch": 0.6027318199307425, + "flos": 578258447616.0, + "grad_norm": 0.03419044123662342, + "language_loss": 0.8313787, + "learning_rate": 0.00035984041855477043, + "loss": 0.84188402, + "num_input_tokens_seen": 261106496, + "router_z_loss_mlp": 0.40625, + "step": 3133, + "time_per_iteration": 2.7259347438812256 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051548, + "balance_loss_mlp": 1.01216125, + "epoch": 0.6029242016160061, + "flos": 1474255600128.0, + "grad_norm": 0.0070819988580959, + "language_loss": 0.78709894, + "learning_rate": 0.00035954139418310495, + "loss": 0.79761446, + "num_input_tokens_seen": 261343248, + "router_z_loss_mlp": 0.39355469, + "step": 3134, + "time_per_iteration": 4.934648513793945 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052223, + "balance_loss_mlp": 1.01171601, + "epoch": 0.6031165833012697, + "flos": 481783372032.0, + "grad_norm": 0.03833547758664617, + "language_loss": 0.80612588, + "learning_rate": 0.00035924242434230637, + "loss": 0.81664813, + "num_input_tokens_seen": 261416704, + "router_z_loss_mlp": 0.4050293, + "step": 3135, + "time_per_iteration": 2.691655397415161 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105043, + "balance_loss_mlp": 1.00985157, + "epoch": 0.6033089649865333, + "flos": 500465594880.0, + "grad_norm": 0.04302606138210952, + "language_loss": 0.79556847, + "learning_rate": 0.00035894350914844516, + "loss": 0.80607277, + "num_input_tokens_seen": 261486688, + "router_z_loss_mlp": 0.40576172, + "step": 3136, + "time_per_iteration": 2.6602935791015625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048879, + "balance_loss_mlp": 1.00827622, + "epoch": 0.6035013466717969, + "flos": 557724430848.0, + "grad_norm": 0.03619946216792389, + "language_loss": 0.83608747, + "learning_rate": 0.0003586446487175703, + "loss": 0.84657621, + "num_input_tokens_seen": 261557344, + "router_z_loss_mlp": 0.40600586, + "step": 3137, + "time_per_iteration": 2.7028918266296387 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050046, + "balance_loss_mlp": 1.00944352, + "epoch": 0.6036937283570604, + "flos": 595996787712.0, + "grad_norm": 0.03316873106558702, + "language_loss": 0.8565768, + "learning_rate": 0.0003583458431657099, + "loss": 0.86707723, + "num_input_tokens_seen": 261626240, + "router_z_loss_mlp": 0.40600586, + "step": 3138, + "time_per_iteration": 2.730760097503662 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051204, + "balance_loss_mlp": 1.01048255, + "epoch": 0.603886110042324, + "flos": 542059515648.0, + "grad_norm": 0.041412274215224906, + "language_loss": 0.83086127, + "learning_rate": 0.00035804709260887056, + "loss": 0.84137332, + "num_input_tokens_seen": 261696368, + "router_z_loss_mlp": 0.40722656, + "step": 3139, + "time_per_iteration": 2.6989586353302 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049269, + "balance_loss_mlp": 1.00852323, + "epoch": 0.6040784917275875, + "flos": 519656208384.0, + "grad_norm": 0.031983597535220364, + "language_loss": 0.89732921, + "learning_rate": 0.0003577483971630373, + "loss": 0.90782189, + "num_input_tokens_seen": 261769104, + "router_z_loss_mlp": 0.4074707, + "step": 3140, + "time_per_iteration": 2.697202205657959 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049635, + "balance_loss_mlp": 1.00888968, + "epoch": 0.6042708734128511, + "flos": 662014950912.0, + "grad_norm": 0.02881540865080385, + "language_loss": 0.85653752, + "learning_rate": 0.00035744975694417414, + "loss": 0.86703384, + "num_input_tokens_seen": 261844880, + "router_z_loss_mlp": 0.4074707, + "step": 3141, + "time_per_iteration": 2.853609085083008 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049159, + "balance_loss_mlp": 1.00838912, + "epoch": 0.6044632550981146, + "flos": 573517658112.0, + "grad_norm": 0.037282810981105224, + "language_loss": 0.83199489, + "learning_rate": 0.00035715117206822344, + "loss": 0.8424865, + "num_input_tokens_seen": 261923280, + "router_z_loss_mlp": 0.40771484, + "step": 3142, + "time_per_iteration": 2.778184175491333 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104897, + "balance_loss_mlp": 1.00812936, + "epoch": 0.6046556367833782, + "flos": 547729603584.0, + "grad_norm": 0.035085942615977306, + "language_loss": 0.81379992, + "learning_rate": 0.0003568526426511065, + "loss": 0.82428956, + "num_input_tokens_seen": 261990832, + "router_z_loss_mlp": 0.40844727, + "step": 3143, + "time_per_iteration": 2.626789093017578 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047012, + "balance_loss_mlp": 1.00612307, + "epoch": 0.6048480184686418, + "flos": 778175424768.0, + "grad_norm": 0.035762108913210126, + "language_loss": 0.83504343, + "learning_rate": 0.000356554168808722, + "loss": 0.84551358, + "num_input_tokens_seen": 262063760, + "router_z_loss_mlp": 0.40893555, + "step": 3144, + "time_per_iteration": 2.987703323364258 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043207, + "balance_loss_mlp": 1.00229406, + "epoch": 0.6050404001539054, + "flos": 658376491776.0, + "grad_norm": 0.03425886740508031, + "language_loss": 0.85222483, + "learning_rate": 0.00035625575065694837, + "loss": 0.86265695, + "num_input_tokens_seen": 262137968, + "router_z_loss_mlp": 0.40917969, + "step": 3145, + "time_per_iteration": 2.8534908294677734 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044606, + "balance_loss_mlp": 1.00359786, + "epoch": 0.605232781839169, + "flos": 550082985216.0, + "grad_norm": 0.03070859084954421, + "language_loss": 0.78136766, + "learning_rate": 0.0003559573883116415, + "loss": 0.79181373, + "num_input_tokens_seen": 262211264, + "router_z_loss_mlp": 0.41015625, + "step": 3146, + "time_per_iteration": 2.701352119445801 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044167, + "balance_loss_mlp": 1.00323021, + "epoch": 0.6054251635244324, + "flos": 606642902016.0, + "grad_norm": 0.029138241099590467, + "language_loss": 0.8591851, + "learning_rate": 0.00035565908188863604, + "loss": 0.8696267, + "num_input_tokens_seen": 262289648, + "router_z_loss_mlp": 0.40942383, + "step": 3147, + "time_per_iteration": 2.919374465942383 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047433, + "balance_loss_mlp": 1.00640118, + "epoch": 0.605617545209696, + "flos": 614809267968.0, + "grad_norm": 0.029609984696998014, + "language_loss": 0.8021152, + "learning_rate": 0.00035536083150374464, + "loss": 0.81258953, + "num_input_tokens_seen": 262362704, + "router_z_loss_mlp": 0.41040039, + "step": 3148, + "time_per_iteration": 2.7596092224121094 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053307, + "balance_loss_mlp": 1.01382446, + "epoch": 0.6058099268949596, + "flos": 1501610634240.0, + "grad_norm": 0.006207951084567088, + "language_loss": 0.74747956, + "learning_rate": 0.00035506263727275893, + "loss": 0.75801259, + "num_input_tokens_seen": 262596864, + "router_z_loss_mlp": 0.39453125, + "step": 3149, + "time_per_iteration": 4.876317739486694 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051295, + "balance_loss_mlp": 1.01014411, + "epoch": 0.6060023085802232, + "flos": 671705521920.0, + "grad_norm": 0.034498143829504634, + "language_loss": 0.86414444, + "learning_rate": 0.0003547644993114475, + "loss": 0.87465739, + "num_input_tokens_seen": 262671088, + "router_z_loss_mlp": 0.41162109, + "step": 3150, + "time_per_iteration": 2.845522403717041 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052323, + "balance_loss_mlp": 1.01110053, + "epoch": 0.6061946902654868, + "flos": 607306828032.0, + "grad_norm": 0.035670233665724194, + "language_loss": 0.80287176, + "learning_rate": 0.00035446641773555806, + "loss": 0.81339502, + "num_input_tokens_seen": 262743888, + "router_z_loss_mlp": 0.41235352, + "step": 3151, + "time_per_iteration": 2.7565760612487793 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053632, + "balance_loss_mlp": 1.01236176, + "epoch": 0.6063870719507503, + "flos": 558953127936.0, + "grad_norm": 0.031088575801094406, + "language_loss": 0.8789348, + "learning_rate": 0.000354168392660816, + "loss": 0.88947117, + "num_input_tokens_seen": 262819616, + "router_z_loss_mlp": 0.4128418, + "step": 3152, + "time_per_iteration": 2.747297525405884 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049898, + "balance_loss_mlp": 1.00865126, + "epoch": 0.6065794536360138, + "flos": 558282398976.0, + "grad_norm": 0.032072657791302916, + "language_loss": 0.83342856, + "learning_rate": 0.0003538704242029252, + "loss": 0.84392756, + "num_input_tokens_seen": 262893984, + "router_z_loss_mlp": 0.41259766, + "step": 3153, + "time_per_iteration": 2.7606263160705566 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050833, + "balance_loss_mlp": 1.0096823, + "epoch": 0.6067718353212774, + "flos": 691382171904.0, + "grad_norm": 0.035512545115511426, + "language_loss": 0.78534603, + "learning_rate": 0.0003535725124775672, + "loss": 0.79585433, + "num_input_tokens_seen": 262969648, + "router_z_loss_mlp": 0.41162109, + "step": 3154, + "time_per_iteration": 2.832683801651001 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046236, + "balance_loss_mlp": 1.00510859, + "epoch": 0.606964217006541, + "flos": 522903895296.0, + "grad_norm": 0.031701324925560485, + "language_loss": 0.87189692, + "learning_rate": 0.00035327465760040126, + "loss": 0.88235927, + "num_input_tokens_seen": 263042048, + "router_z_loss_mlp": 0.41137695, + "step": 3155, + "time_per_iteration": 2.6946585178375244 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047466, + "balance_loss_mlp": 1.00643396, + "epoch": 0.6071565986918045, + "flos": 642713521920.0, + "grad_norm": 0.0351469249432502, + "language_loss": 0.85231131, + "learning_rate": 0.00035297685968706526, + "loss": 0.86278605, + "num_input_tokens_seen": 263108032, + "router_z_loss_mlp": 0.41040039, + "step": 3156, + "time_per_iteration": 2.7586491107940674 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045389, + "balance_loss_mlp": 1.00416672, + "epoch": 0.6073489803770681, + "flos": 561653540352.0, + "grad_norm": 0.03543028352480344, + "language_loss": 0.83488154, + "learning_rate": 0.00035267911885317454, + "loss": 0.84533542, + "num_input_tokens_seen": 263175184, + "router_z_loss_mlp": 0.41235352, + "step": 3157, + "time_per_iteration": 2.678812026977539 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051077, + "balance_loss_mlp": 1.00997388, + "epoch": 0.6075413620623317, + "flos": 587202467328.0, + "grad_norm": 0.03110064511501168, + "language_loss": 0.81796658, + "learning_rate": 0.0003523814352143222, + "loss": 0.82847732, + "num_input_tokens_seen": 263252768, + "router_z_loss_mlp": 0.41113281, + "step": 3158, + "time_per_iteration": 2.8277432918548584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052557, + "balance_loss_mlp": 1.01128709, + "epoch": 0.6077337437475953, + "flos": 631972143360.0, + "grad_norm": 0.03468149601951464, + "language_loss": 0.9173736, + "learning_rate": 0.00035208380888607937, + "loss": 0.92789918, + "num_input_tokens_seen": 263328720, + "router_z_loss_mlp": 0.4128418, + "step": 3159, + "time_per_iteration": 2.787712574005127 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052471, + "balance_loss_mlp": 1.01289368, + "epoch": 0.6079261254328588, + "flos": 1471626152448.0, + "grad_norm": 0.014144477200468554, + "language_loss": 0.79461986, + "learning_rate": 0.000351786239983995, + "loss": 0.80514455, + "num_input_tokens_seen": 263554656, + "router_z_loss_mlp": 0.39550781, + "step": 3160, + "time_per_iteration": 4.879680871963501 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053925, + "balance_loss_mlp": 1.01444244, + "epoch": 0.6081185071181223, + "flos": 1526205963264.0, + "grad_norm": 0.006801374803666016, + "language_loss": 0.7569223, + "learning_rate": 0.00035148872862359517, + "loss": 0.76746154, + "num_input_tokens_seen": 263791600, + "router_z_loss_mlp": 0.39453125, + "step": 3161, + "time_per_iteration": 5.0031373500823975 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051684, + "balance_loss_mlp": 1.0106045, + "epoch": 0.6083108888033859, + "flos": 557435725824.0, + "grad_norm": 0.030142563258654227, + "language_loss": 0.82224369, + "learning_rate": 0.00035119127492038446, + "loss": 0.83276057, + "num_input_tokens_seen": 263869744, + "router_z_loss_mlp": 0.41088867, + "step": 3162, + "time_per_iteration": 2.80432391166687 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053395, + "balance_loss_mlp": 1.01229131, + "epoch": 0.6085032704886495, + "flos": 842556622080.0, + "grad_norm": 0.03512464115253957, + "language_loss": 0.83202064, + "learning_rate": 0.00035089387898984436, + "loss": 0.84255463, + "num_input_tokens_seen": 263946624, + "router_z_loss_mlp": 0.41113281, + "step": 3163, + "time_per_iteration": 3.1297876834869385 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049274, + "balance_loss_mlp": 1.008147, + "epoch": 0.6086956521739131, + "flos": 685993986048.0, + "grad_norm": 0.03637672327155598, + "language_loss": 0.82543135, + "learning_rate": 0.0003505965409474343, + "loss": 0.83592415, + "num_input_tokens_seen": 264022064, + "router_z_loss_mlp": 0.41137695, + "step": 3164, + "time_per_iteration": 2.9028842449188232 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044903, + "balance_loss_mlp": 1.00382376, + "epoch": 0.6088880338591766, + "flos": 536866715904.0, + "grad_norm": 0.035078655431856474, + "language_loss": 0.86721897, + "learning_rate": 0.0003502992609085913, + "loss": 0.87766796, + "num_input_tokens_seen": 264089520, + "router_z_loss_mlp": 0.41088867, + "step": 3165, + "time_per_iteration": 2.752734422683716 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045246, + "balance_loss_mlp": 1.0041908, + "epoch": 0.6090804155444401, + "flos": 732882773760.0, + "grad_norm": 0.030998406489771316, + "language_loss": 0.82771933, + "learning_rate": 0.00035000203898872954, + "loss": 0.83817178, + "num_input_tokens_seen": 264173056, + "router_z_loss_mlp": 0.41064453, + "step": 3166, + "time_per_iteration": 2.9903385639190674 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104524, + "balance_loss_mlp": 1.00420833, + "epoch": 0.6092727972297037, + "flos": 700243566336.0, + "grad_norm": 0.03412494871544842, + "language_loss": 0.85219544, + "learning_rate": 0.0003497048753032406, + "loss": 0.86264783, + "num_input_tokens_seen": 264250912, + "router_z_loss_mlp": 0.41040039, + "step": 3167, + "time_per_iteration": 2.8939006328582764 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052068, + "balance_loss_mlp": 1.01117909, + "epoch": 0.6094651789149673, + "flos": 1053677681664.0, + "grad_norm": 0.032839303584214885, + "language_loss": 0.81472063, + "learning_rate": 0.000349407769967494, + "loss": 0.82524133, + "num_input_tokens_seen": 264342800, + "router_z_loss_mlp": 0.40893555, + "step": 3168, + "time_per_iteration": 3.384226083755493 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044648, + "balance_loss_mlp": 1.0035919, + "epoch": 0.6096575606002309, + "flos": 504095305728.0, + "grad_norm": 0.03315731648792901, + "language_loss": 0.85102254, + "learning_rate": 0.0003491107230968361, + "loss": 0.86146903, + "num_input_tokens_seen": 264413664, + "router_z_loss_mlp": 0.41064453, + "step": 3169, + "time_per_iteration": 2.6621110439300537 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104943, + "balance_loss_mlp": 1.00837409, + "epoch": 0.6098499422854944, + "flos": 586864184832.0, + "grad_norm": 0.02773637180026576, + "language_loss": 0.82196522, + "learning_rate": 0.00034881373480659085, + "loss": 0.83245957, + "num_input_tokens_seen": 264494944, + "router_z_loss_mlp": 0.41064453, + "step": 3170, + "time_per_iteration": 2.8139965534210205 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048985, + "balance_loss_mlp": 1.00795305, + "epoch": 0.610042323970758, + "flos": 470160327168.0, + "grad_norm": 0.03906179499333773, + "language_loss": 0.78314018, + "learning_rate": 0.0003485168052120594, + "loss": 0.79363, + "num_input_tokens_seen": 264561664, + "router_z_loss_mlp": 0.41040039, + "step": 3171, + "time_per_iteration": 2.5483758449554443 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052409, + "balance_loss_mlp": 1.01142442, + "epoch": 0.6102347056560216, + "flos": 515199266304.0, + "grad_norm": 0.03618411847150492, + "language_loss": 0.80390579, + "learning_rate": 0.00034821993442851973, + "loss": 0.81442988, + "num_input_tokens_seen": 264626256, + "router_z_loss_mlp": 0.40991211, + "step": 3172, + "time_per_iteration": 2.590830087661743 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049707, + "balance_loss_mlp": 1.00884163, + "epoch": 0.6104270873412851, + "flos": 469964941056.0, + "grad_norm": 0.03897584044245514, + "language_loss": 0.82572639, + "learning_rate": 0.00034792312257122735, + "loss": 0.83622348, + "num_input_tokens_seen": 264692768, + "router_z_loss_mlp": 0.40869141, + "step": 3173, + "time_per_iteration": 2.594754457473755 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049236, + "balance_loss_mlp": 1.00834739, + "epoch": 0.6106194690265486, + "flos": 550940352000.0, + "grad_norm": 0.03632239406226319, + "language_loss": 0.81349075, + "learning_rate": 0.00034762636975541506, + "loss": 0.82398319, + "num_input_tokens_seen": 264764816, + "router_z_loss_mlp": 0.40893555, + "step": 3174, + "time_per_iteration": 2.6291897296905518 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046645, + "balance_loss_mlp": 1.00563669, + "epoch": 0.6108118507118122, + "flos": 473881411584.0, + "grad_norm": 0.03249903592127121, + "language_loss": 0.81528097, + "learning_rate": 0.0003473296760962923, + "loss": 0.82574743, + "num_input_tokens_seen": 264837968, + "router_z_loss_mlp": 0.41015625, + "step": 3175, + "time_per_iteration": 2.6912500858306885 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052322, + "balance_loss_mlp": 1.01264954, + "epoch": 0.6110042323970758, + "flos": 1448182731264.0, + "grad_norm": 0.007043978800011362, + "language_loss": 0.78533739, + "learning_rate": 0.00034703304170904617, + "loss": 0.79586065, + "num_input_tokens_seen": 265058336, + "router_z_loss_mlp": 0.39648438, + "step": 3176, + "time_per_iteration": 4.679258108139038 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104727, + "balance_loss_mlp": 1.00635707, + "epoch": 0.6111966140823394, + "flos": 795542434560.0, + "grad_norm": 0.03450548999539666, + "language_loss": 0.81482762, + "learning_rate": 0.00034673646670883976, + "loss": 0.82530034, + "num_input_tokens_seen": 265135920, + "router_z_loss_mlp": 0.40917969, + "step": 3177, + "time_per_iteration": 2.9776415824890137 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104406, + "balance_loss_mlp": 1.0043869, + "epoch": 0.611388995767603, + "flos": 1561066349568.0, + "grad_norm": 0.006895739494838764, + "language_loss": 0.75715023, + "learning_rate": 0.0003464399512108141, + "loss": 0.76759082, + "num_input_tokens_seen": 265374464, + "router_z_loss_mlp": 0.39648438, + "step": 3178, + "time_per_iteration": 4.9859678745269775 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046014, + "balance_loss_mlp": 1.00512564, + "epoch": 0.6115813774528664, + "flos": 713486080512.0, + "grad_norm": 0.037712756689321836, + "language_loss": 0.81948996, + "learning_rate": 0.0003461434953300865, + "loss": 0.82995009, + "num_input_tokens_seen": 265450112, + "router_z_loss_mlp": 0.40893555, + "step": 3179, + "time_per_iteration": 2.9206619262695312 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046107, + "balance_loss_mlp": 1.0051471, + "epoch": 0.61177375913813, + "flos": 685690696704.0, + "grad_norm": 0.02737860550975636, + "language_loss": 0.81828141, + "learning_rate": 0.0003458470991817515, + "loss": 0.8287425, + "num_input_tokens_seen": 265534336, + "router_z_loss_mlp": 0.40966797, + "step": 3180, + "time_per_iteration": 3.0038623809814453 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046359, + "balance_loss_mlp": 1.00537503, + "epoch": 0.6119661408233936, + "flos": 512667995136.0, + "grad_norm": 0.03551722244255775, + "language_loss": 0.85187316, + "learning_rate": 0.0003455507628808802, + "loss": 0.86233675, + "num_input_tokens_seen": 265604480, + "router_z_loss_mlp": 0.40991211, + "step": 3181, + "time_per_iteration": 2.623522996902466 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048614, + "balance_loss_mlp": 1.0076772, + "epoch": 0.6121585225086572, + "flos": 557856633600.0, + "grad_norm": 0.04043393522454786, + "language_loss": 0.85139406, + "learning_rate": 0.00034525448654252076, + "loss": 0.86188018, + "num_input_tokens_seen": 265670848, + "router_z_loss_mlp": 0.40942383, + "step": 3182, + "time_per_iteration": 2.701493501663208 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053271, + "balance_loss_mlp": 1.0125016, + "epoch": 0.6123509041939207, + "flos": 562910427648.0, + "grad_norm": 0.044342295152579134, + "language_loss": 0.83549857, + "learning_rate": 0.0003449582702816976, + "loss": 0.84603125, + "num_input_tokens_seen": 265739584, + "router_z_loss_mlp": 0.40771484, + "step": 3183, + "time_per_iteration": 2.6956191062927246 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050719, + "balance_loss_mlp": 1.00980616, + "epoch": 0.6125432858791843, + "flos": 559131017472.0, + "grad_norm": 0.0337797622344846, + "language_loss": 0.833462, + "learning_rate": 0.0003446621142134122, + "loss": 0.84396923, + "num_input_tokens_seen": 265810368, + "router_z_loss_mlp": 0.40917969, + "step": 3184, + "time_per_iteration": 2.639289379119873 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049865, + "balance_loss_mlp": 1.0089761, + "epoch": 0.6127356675644479, + "flos": 415897411584.0, + "grad_norm": 0.038637283425345254, + "language_loss": 0.84757721, + "learning_rate": 0.0003443660184526424, + "loss": 0.85807586, + "num_input_tokens_seen": 265871616, + "router_z_loss_mlp": 0.40893555, + "step": 3185, + "time_per_iteration": 2.4257092475891113 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049417, + "balance_loss_mlp": 1.00855243, + "epoch": 0.6129280492497114, + "flos": 605034126336.0, + "grad_norm": 0.03183522344564459, + "language_loss": 0.86949629, + "learning_rate": 0.0003440699831143429, + "loss": 0.87999046, + "num_input_tokens_seen": 265946672, + "router_z_loss_mlp": 0.40869141, + "step": 3186, + "time_per_iteration": 2.775930404663086 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051794, + "balance_loss_mlp": 1.01092947, + "epoch": 0.613120430934975, + "flos": 520865463552.0, + "grad_norm": 0.03426856833524134, + "language_loss": 0.82819283, + "learning_rate": 0.0003437740083134449, + "loss": 0.83871073, + "num_input_tokens_seen": 266020640, + "router_z_loss_mlp": 0.40869141, + "step": 3187, + "time_per_iteration": 2.696072816848755 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049174, + "balance_loss_mlp": 1.00835705, + "epoch": 0.6133128126202385, + "flos": 512081836800.0, + "grad_norm": 0.03992475023697304, + "language_loss": 0.84158587, + "learning_rate": 0.00034347809416485574, + "loss": 0.8520776, + "num_input_tokens_seen": 266085776, + "router_z_loss_mlp": 0.40820312, + "step": 3188, + "time_per_iteration": 2.6222550868988037 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052376, + "balance_loss_mlp": 1.01158273, + "epoch": 0.6135051943055021, + "flos": 608757156096.0, + "grad_norm": 0.032577275408737616, + "language_loss": 0.82338852, + "learning_rate": 0.0003431822407834597, + "loss": 0.83391231, + "num_input_tokens_seen": 266157104, + "router_z_loss_mlp": 0.40795898, + "step": 3189, + "time_per_iteration": 2.818133592605591 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050392, + "balance_loss_mlp": 1.00959849, + "epoch": 0.6136975759907657, + "flos": 1162010072064.0, + "grad_norm": 0.04434341362834108, + "language_loss": 0.84634304, + "learning_rate": 0.00034288644828411706, + "loss": 0.85684693, + "num_input_tokens_seen": 266244144, + "router_z_loss_mlp": 0.40795898, + "step": 3190, + "time_per_iteration": 3.4801251888275146 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049956, + "balance_loss_mlp": 1.00911534, + "epoch": 0.6138899576760293, + "flos": 708173716992.0, + "grad_norm": 0.03680261410998276, + "language_loss": 0.76343262, + "learning_rate": 0.0003425907167816649, + "loss": 0.77393216, + "num_input_tokens_seen": 266319040, + "router_z_loss_mlp": 0.40844727, + "step": 3191, + "time_per_iteration": 2.859435558319092 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049983, + "balance_loss_mlp": 1.00914156, + "epoch": 0.6140823393612928, + "flos": 587619484416.0, + "grad_norm": 0.036153352426406216, + "language_loss": 0.85233247, + "learning_rate": 0.00034229504639091623, + "loss": 0.86283231, + "num_input_tokens_seen": 266390784, + "router_z_loss_mlp": 0.40844727, + "step": 3192, + "time_per_iteration": 2.7828218936920166 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047312, + "balance_loss_mlp": 1.00656581, + "epoch": 0.6142747210465563, + "flos": 805619887104.0, + "grad_norm": 0.035035162625632645, + "language_loss": 0.80565524, + "learning_rate": 0.0003419994372266606, + "loss": 0.81612837, + "num_input_tokens_seen": 266483216, + "router_z_loss_mlp": 0.4074707, + "step": 3193, + "time_per_iteration": 3.1529080867767334 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046771, + "balance_loss_mlp": 1.00593019, + "epoch": 0.6144671027318199, + "flos": 530545340928.0, + "grad_norm": 0.02881776150326524, + "language_loss": 0.82229221, + "learning_rate": 0.00034170388940366335, + "loss": 0.83275998, + "num_input_tokens_seen": 266557344, + "router_z_loss_mlp": 0.40844727, + "step": 3194, + "time_per_iteration": 2.733793258666992 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046878, + "balance_loss_mlp": 1.00598967, + "epoch": 0.6146594844170835, + "flos": 806913712896.0, + "grad_norm": 0.03443984664399312, + "language_loss": 0.8074832, + "learning_rate": 0.0003414084030366667, + "loss": 0.81795198, + "num_input_tokens_seen": 266639488, + "router_z_loss_mlp": 0.40893555, + "step": 3195, + "time_per_iteration": 3.1194753646850586 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049907, + "balance_loss_mlp": 1.00906587, + "epoch": 0.6148518661023471, + "flos": 502762596096.0, + "grad_norm": 0.03247725998101352, + "language_loss": 0.83429492, + "learning_rate": 0.0003411129782403883, + "loss": 0.84479403, + "num_input_tokens_seen": 266711168, + "router_z_loss_mlp": 0.40844727, + "step": 3196, + "time_per_iteration": 2.701995849609375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048743, + "balance_loss_mlp": 1.00785387, + "epoch": 0.6150442477876106, + "flos": 511699812864.0, + "grad_norm": 0.05177418573029483, + "language_loss": 0.85667449, + "learning_rate": 0.0003408176151295225, + "loss": 0.86716187, + "num_input_tokens_seen": 266777632, + "router_z_loss_mlp": 0.40893555, + "step": 3197, + "time_per_iteration": 2.6645357608795166 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046183, + "balance_loss_mlp": 1.0052464, + "epoch": 0.6152366294728742, + "flos": 527998518528.0, + "grad_norm": 0.03939493376677649, + "language_loss": 0.7823236, + "learning_rate": 0.00034052231381873944, + "loss": 0.79278541, + "num_input_tokens_seen": 266842880, + "router_z_loss_mlp": 0.40942383, + "step": 3198, + "time_per_iteration": 2.6415092945098877 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049656, + "balance_loss_mlp": 1.00881481, + "epoch": 0.6154290111581378, + "flos": 474282877440.0, + "grad_norm": 0.04031967856737408, + "language_loss": 0.85886127, + "learning_rate": 0.00034022707442268494, + "loss": 0.86935782, + "num_input_tokens_seen": 266909504, + "router_z_loss_mlp": 0.40844727, + "step": 3199, + "time_per_iteration": 2.5885183811187744 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050537, + "balance_loss_mlp": 1.00976777, + "epoch": 0.6156213928434013, + "flos": 551934779136.0, + "grad_norm": 0.028515598642512706, + "language_loss": 0.82251477, + "learning_rate": 0.0003399318970559813, + "loss": 0.83302015, + "num_input_tokens_seen": 266988880, + "router_z_loss_mlp": 0.40771484, + "step": 3200, + "time_per_iteration": 2.819209337234497 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050709, + "balance_loss_mlp": 1.00998724, + "epoch": 0.6158137745286649, + "flos": 752362092288.0, + "grad_norm": 0.030934752464501728, + "language_loss": 0.84934688, + "learning_rate": 0.00033963678183322656, + "loss": 0.85985398, + "num_input_tokens_seen": 267074512, + "router_z_loss_mlp": 0.40722656, + "step": 3201, + "time_per_iteration": 3.0306894779205322 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051067, + "balance_loss_mlp": 1.01027346, + "epoch": 0.6160061562139284, + "flos": 556905947904.0, + "grad_norm": 0.03121820045207164, + "language_loss": 0.83180207, + "learning_rate": 0.0003393417288689945, + "loss": 0.84231275, + "num_input_tokens_seen": 267147952, + "router_z_loss_mlp": 0.40795898, + "step": 3202, + "time_per_iteration": 2.748361587524414 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050686, + "balance_loss_mlp": 1.00989294, + "epoch": 0.616198537899192, + "flos": 743467650048.0, + "grad_norm": 0.04116101332214976, + "language_loss": 0.76590461, + "learning_rate": 0.00033904673827783504, + "loss": 0.77641141, + "num_input_tokens_seen": 267224368, + "router_z_loss_mlp": 0.40795898, + "step": 3203, + "time_per_iteration": 2.9209775924682617 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052852, + "balance_loss_mlp": 1.01193893, + "epoch": 0.6163909195844556, + "flos": 479775075840.0, + "grad_norm": 0.031654400686770015, + "language_loss": 0.82428539, + "learning_rate": 0.00033875181017427357, + "loss": 0.83481383, + "num_input_tokens_seen": 267292688, + "router_z_loss_mlp": 0.40917969, + "step": 3204, + "time_per_iteration": 2.6138155460357666 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104652, + "balance_loss_mlp": 1.00551248, + "epoch": 0.6165833012697192, + "flos": 532666397952.0, + "grad_norm": 0.03324868864618939, + "language_loss": 0.81742775, + "learning_rate": 0.00033845694467281133, + "loss": 0.82789296, + "num_input_tokens_seen": 267371888, + "router_z_loss_mlp": 0.41015625, + "step": 3205, + "time_per_iteration": 2.8665361404418945 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045816, + "balance_loss_mlp": 1.0049988, + "epoch": 0.6167756829549826, + "flos": 809295284736.0, + "grad_norm": 0.03418345099687322, + "language_loss": 0.83676243, + "learning_rate": 0.00033816214188792516, + "loss": 0.8472206, + "num_input_tokens_seen": 267458784, + "router_z_loss_mlp": 0.40820312, + "step": 3206, + "time_per_iteration": 3.176194190979004 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104581, + "balance_loss_mlp": 1.00504088, + "epoch": 0.6169680646402462, + "flos": 489910854144.0, + "grad_norm": 0.03420383958613512, + "language_loss": 0.8597641, + "learning_rate": 0.00033786740193406784, + "loss": 0.87022221, + "num_input_tokens_seen": 267528528, + "router_z_loss_mlp": 0.40771484, + "step": 3207, + "time_per_iteration": 2.60602068901062 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046077, + "balance_loss_mlp": 1.00528312, + "epoch": 0.6171604463255098, + "flos": 620204256768.0, + "grad_norm": 0.033645733240054064, + "language_loss": 0.81914175, + "learning_rate": 0.00033757272492566736, + "loss": 0.82960248, + "num_input_tokens_seen": 267611152, + "router_z_loss_mlp": 0.40795898, + "step": 3208, + "time_per_iteration": 2.929311990737915 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045887, + "balance_loss_mlp": 1.00502181, + "epoch": 0.6173528280107734, + "flos": 529895999232.0, + "grad_norm": 0.030436054236508022, + "language_loss": 0.87530887, + "learning_rate": 0.0003372781109771278, + "loss": 0.8857677, + "num_input_tokens_seen": 267681520, + "router_z_loss_mlp": 0.40869141, + "step": 3209, + "time_per_iteration": 2.725886821746826 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044721, + "balance_loss_mlp": 1.00390351, + "epoch": 0.617545209696037, + "flos": 597737766144.0, + "grad_norm": 0.031193081131094685, + "language_loss": 0.77093422, + "learning_rate": 0.0003369835602028281, + "loss": 0.78138143, + "num_input_tokens_seen": 267758768, + "router_z_loss_mlp": 0.40820312, + "step": 3210, + "time_per_iteration": 2.7928357124328613 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042714, + "balance_loss_mlp": 1.00196826, + "epoch": 0.6177375913813005, + "flos": 476106481152.0, + "grad_norm": 0.036241731553070825, + "language_loss": 0.80260098, + "learning_rate": 0.0003366890727171232, + "loss": 0.81302822, + "num_input_tokens_seen": 267831056, + "router_z_loss_mlp": 0.4074707, + "step": 3211, + "time_per_iteration": 2.688157558441162 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046468, + "balance_loss_mlp": 1.00565052, + "epoch": 0.617929973066564, + "flos": 530881678080.0, + "grad_norm": 0.03703049785450956, + "language_loss": 0.7920953, + "learning_rate": 0.00033639464863434313, + "loss": 0.80255997, + "num_input_tokens_seen": 267898416, + "router_z_loss_mlp": 0.40820312, + "step": 3212, + "time_per_iteration": 2.6376640796661377 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105003, + "balance_loss_mlp": 1.01045227, + "epoch": 0.6181223547518276, + "flos": 1422835026432.0, + "grad_norm": 0.010124003783497993, + "language_loss": 0.78442466, + "learning_rate": 0.00033610028806879363, + "loss": 0.79492497, + "num_input_tokens_seen": 268112864, + "router_z_loss_mlp": 0.39550781, + "step": 3213, + "time_per_iteration": 4.704723596572876 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047001, + "balance_loss_mlp": 1.00618351, + "epoch": 0.6183147364370912, + "flos": 741696536064.0, + "grad_norm": 0.03266398965494079, + "language_loss": 0.79975474, + "learning_rate": 0.00033580599113475543, + "loss": 0.81022477, + "num_input_tokens_seen": 268198368, + "router_z_loss_mlp": 0.40820312, + "step": 3214, + "time_per_iteration": 2.9861807823181152 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049402, + "balance_loss_mlp": 1.00875139, + "epoch": 0.6185071181223547, + "flos": 382483462656.0, + "grad_norm": 0.034946308334165094, + "language_loss": 0.86866862, + "learning_rate": 0.00033551175794648507, + "loss": 0.87916261, + "num_input_tokens_seen": 268260704, + "router_z_loss_mlp": 0.40649414, + "step": 3215, + "time_per_iteration": 2.462238311767578 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050777, + "balance_loss_mlp": 1.01005554, + "epoch": 0.6186994998076183, + "flos": 464305546752.0, + "grad_norm": 0.05487149837237803, + "language_loss": 0.82309055, + "learning_rate": 0.00033521758861821365, + "loss": 0.83359838, + "num_input_tokens_seen": 268328256, + "router_z_loss_mlp": 0.40722656, + "step": 3216, + "time_per_iteration": 2.6265599727630615 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048602, + "balance_loss_mlp": 1.00778484, + "epoch": 0.6188918814928819, + "flos": 486252953088.0, + "grad_norm": 0.035787768578127474, + "language_loss": 0.89356089, + "learning_rate": 0.0003349234832641479, + "loss": 0.90404689, + "num_input_tokens_seen": 268394016, + "router_z_loss_mlp": 0.40820312, + "step": 3217, + "time_per_iteration": 2.600252628326416 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105038, + "balance_loss_mlp": 1.00956285, + "epoch": 0.6190842631781455, + "flos": 658598122752.0, + "grad_norm": 0.04394177664040498, + "language_loss": 0.81214905, + "learning_rate": 0.00033462944199846975, + "loss": 0.82265282, + "num_input_tokens_seen": 268478512, + "router_z_loss_mlp": 0.40820312, + "step": 3218, + "time_per_iteration": 3.059032917022705 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048303, + "balance_loss_mlp": 1.00748599, + "epoch": 0.619276644863409, + "flos": 404467807488.0, + "grad_norm": 0.03662586595942604, + "language_loss": 0.87058449, + "learning_rate": 0.00033433546493533606, + "loss": 0.88106751, + "num_input_tokens_seen": 268540304, + "router_z_loss_mlp": 0.40820312, + "step": 3219, + "time_per_iteration": 2.464569091796875 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049492, + "balance_loss_mlp": 1.00876999, + "epoch": 0.6194690265486725, + "flos": 584241540096.0, + "grad_norm": 0.03704236392673744, + "language_loss": 0.8459326, + "learning_rate": 0.00033404155218887897, + "loss": 0.85642755, + "num_input_tokens_seen": 268611136, + "router_z_loss_mlp": 0.40722656, + "step": 3220, + "time_per_iteration": 2.717883825302124 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048251, + "balance_loss_mlp": 1.00745773, + "epoch": 0.6196614082339361, + "flos": 505385240832.0, + "grad_norm": 0.03422152158197648, + "language_loss": 0.87844843, + "learning_rate": 0.00033374770387320534, + "loss": 0.88893092, + "num_input_tokens_seen": 268684992, + "router_z_loss_mlp": 0.40795898, + "step": 3221, + "time_per_iteration": 2.7630932331085205 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050081, + "balance_loss_mlp": 1.00921607, + "epoch": 0.6198537899191997, + "flos": 576526217472.0, + "grad_norm": 0.03373583765668511, + "language_loss": 0.85412097, + "learning_rate": 0.00033345392010239737, + "loss": 0.86462182, + "num_input_tokens_seen": 268758096, + "router_z_loss_mlp": 0.40869141, + "step": 3222, + "time_per_iteration": 2.7410025596618652 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050416, + "balance_loss_mlp": 1.00952721, + "epoch": 0.6200461716044633, + "flos": 594303441408.0, + "grad_norm": 0.03547804945622036, + "language_loss": 0.82924426, + "learning_rate": 0.0003331602009905118, + "loss": 0.83974844, + "num_input_tokens_seen": 268834432, + "router_z_loss_mlp": 0.40893555, + "step": 3223, + "time_per_iteration": 2.8037710189819336 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051712, + "balance_loss_mlp": 1.01098979, + "epoch": 0.6202385532897268, + "flos": 667411885056.0, + "grad_norm": 0.03269956620721502, + "language_loss": 0.84572297, + "learning_rate": 0.00033286654665158085, + "loss": 0.85624015, + "num_input_tokens_seen": 268921168, + "router_z_loss_mlp": 0.40722656, + "step": 3224, + "time_per_iteration": 2.948554754257202 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050192, + "balance_loss_mlp": 1.00939882, + "epoch": 0.6204309349749904, + "flos": 485927309568.0, + "grad_norm": 0.03423910891288116, + "language_loss": 0.88386071, + "learning_rate": 0.0003325729571996109, + "loss": 0.89436263, + "num_input_tokens_seen": 268991440, + "router_z_loss_mlp": 0.40795898, + "step": 3225, + "time_per_iteration": 2.6549041271209717 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049912, + "balance_loss_mlp": 1.00914264, + "epoch": 0.6206233166602539, + "flos": 585218470656.0, + "grad_norm": 0.03260898019544377, + "language_loss": 0.84271944, + "learning_rate": 0.000332279432748584, + "loss": 0.85321862, + "num_input_tokens_seen": 269061024, + "router_z_loss_mlp": 0.40771484, + "step": 3226, + "time_per_iteration": 2.716174840927124 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048998, + "balance_loss_mlp": 1.00827563, + "epoch": 0.6208156983455175, + "flos": 477912588288.0, + "grad_norm": 0.031713525688758036, + "language_loss": 0.87778246, + "learning_rate": 0.00033198597341245576, + "loss": 0.88827246, + "num_input_tokens_seen": 269130560, + "router_z_loss_mlp": 0.40722656, + "step": 3227, + "time_per_iteration": 2.596343994140625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046541, + "balance_loss_mlp": 1.00591445, + "epoch": 0.6210080800307811, + "flos": 790469198592.0, + "grad_norm": 0.02931098854288103, + "language_loss": 0.82211602, + "learning_rate": 0.00033169257930515763, + "loss": 0.8325814, + "num_input_tokens_seen": 269213280, + "router_z_loss_mlp": 0.40625, + "step": 3228, + "time_per_iteration": 3.0495920181274414 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050916, + "balance_loss_mlp": 1.01036096, + "epoch": 0.6212004617160446, + "flos": 608917549056.0, + "grad_norm": 0.05193251609129224, + "language_loss": 0.83099496, + "learning_rate": 0.0003313992505405951, + "loss": 0.8415041, + "num_input_tokens_seen": 269286384, + "router_z_loss_mlp": 0.40551758, + "step": 3229, + "time_per_iteration": 2.7221577167510986 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049521, + "balance_loss_mlp": 1.00896585, + "epoch": 0.6213928434013082, + "flos": 587612681472.0, + "grad_norm": 0.04085502918766405, + "language_loss": 0.81571418, + "learning_rate": 0.0003311059872326487, + "loss": 0.82620943, + "num_input_tokens_seen": 269353296, + "router_z_loss_mlp": 0.40551758, + "step": 3230, + "time_per_iteration": 2.6938486099243164 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051014, + "balance_loss_mlp": 1.0103395, + "epoch": 0.6215852250865718, + "flos": 537109734144.0, + "grad_norm": 0.03319484231219387, + "language_loss": 0.79486078, + "learning_rate": 0.0003308127894951734, + "loss": 0.80537093, + "num_input_tokens_seen": 269422304, + "router_z_loss_mlp": 0.40673828, + "step": 3231, + "time_per_iteration": 2.6565897464752197 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047093, + "balance_loss_mlp": 1.00634694, + "epoch": 0.6217776067718354, + "flos": 619313842176.0, + "grad_norm": 0.044149605083951216, + "language_loss": 0.8665247, + "learning_rate": 0.00033051965744199834, + "loss": 0.87699568, + "num_input_tokens_seen": 269498784, + "router_z_loss_mlp": 0.4074707, + "step": 3232, + "time_per_iteration": 2.7405452728271484 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104641, + "balance_loss_mlp": 1.00575984, + "epoch": 0.6219699884570988, + "flos": 547100670720.0, + "grad_norm": 0.03240939524045973, + "language_loss": 0.90891719, + "learning_rate": 0.0003302265911869276, + "loss": 0.91938138, + "num_input_tokens_seen": 269581264, + "router_z_loss_mlp": 0.40649414, + "step": 3233, + "time_per_iteration": 2.9264018535614014 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048831, + "balance_loss_mlp": 1.00827634, + "epoch": 0.6221623701423624, + "flos": 482156647680.0, + "grad_norm": 0.04042837420673253, + "language_loss": 0.8472892, + "learning_rate": 0.0003299335908437397, + "loss": 0.85777748, + "num_input_tokens_seen": 269649408, + "router_z_loss_mlp": 0.40551758, + "step": 3234, + "time_per_iteration": 2.6122491359710693 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104817, + "balance_loss_mlp": 1.00751972, + "epoch": 0.622354751827626, + "flos": 380872741632.0, + "grad_norm": 0.045523891323386655, + "language_loss": 0.80743796, + "learning_rate": 0.0003296406565261873, + "loss": 0.81791961, + "num_input_tokens_seen": 269711648, + "router_z_loss_mlp": 0.40649414, + "step": 3235, + "time_per_iteration": 2.4912121295928955 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052853, + "balance_loss_mlp": 1.01241732, + "epoch": 0.6225471335128896, + "flos": 669072183552.0, + "grad_norm": 0.032252040846456206, + "language_loss": 0.85526693, + "learning_rate": 0.0003293477883479978, + "loss": 0.86579549, + "num_input_tokens_seen": 269787376, + "router_z_loss_mlp": 0.40429688, + "step": 3236, + "time_per_iteration": 2.8378734588623047 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049915, + "balance_loss_mlp": 1.00943148, + "epoch": 0.6227395151981532, + "flos": 772628791296.0, + "grad_norm": 0.03861340277154514, + "language_loss": 0.80045772, + "learning_rate": 0.0003290549864228727, + "loss": 0.81095684, + "num_input_tokens_seen": 269863008, + "router_z_loss_mlp": 0.40478516, + "step": 3237, + "time_per_iteration": 2.9996402263641357 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044656, + "balance_loss_mlp": 1.00410068, + "epoch": 0.6229318968834167, + "flos": 485358647808.0, + "grad_norm": 0.03163121059903129, + "language_loss": 0.87001842, + "learning_rate": 0.0003287622508644875, + "loss": 0.88046503, + "num_input_tokens_seen": 269939552, + "router_z_loss_mlp": 0.40551758, + "step": 3238, + "time_per_iteration": 2.8210766315460205 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051288, + "balance_loss_mlp": 1.01082802, + "epoch": 0.6231242785686802, + "flos": 463877836032.0, + "grad_norm": 0.03974001893419822, + "language_loss": 0.87119055, + "learning_rate": 0.0003284695817864923, + "loss": 0.88170344, + "num_input_tokens_seen": 270002752, + "router_z_loss_mlp": 0.40454102, + "step": 3239, + "time_per_iteration": 2.4931445121765137 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048406, + "balance_loss_mlp": 1.00773168, + "epoch": 0.6233166602539438, + "flos": 610211374848.0, + "grad_norm": 0.03997150810707431, + "language_loss": 0.84201944, + "learning_rate": 0.0003281769793025116, + "loss": 0.85250354, + "num_input_tokens_seen": 270075696, + "router_z_loss_mlp": 0.40673828, + "step": 3240, + "time_per_iteration": 2.71476674079895 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050596, + "balance_loss_mlp": 1.00999331, + "epoch": 0.6235090419392074, + "flos": 440115574272.0, + "grad_norm": 0.053967997241239116, + "language_loss": 0.9023276, + "learning_rate": 0.00032788444352614346, + "loss": 0.91283357, + "num_input_tokens_seen": 270139872, + "router_z_loss_mlp": 0.40600586, + "step": 3241, + "time_per_iteration": 2.5143325328826904 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048966, + "balance_loss_mlp": 1.00826836, + "epoch": 0.6237014236244709, + "flos": 505901412864.0, + "grad_norm": 0.03953535493242474, + "language_loss": 0.81586522, + "learning_rate": 0.0003275919745709606, + "loss": 0.82635486, + "num_input_tokens_seen": 270206752, + "router_z_loss_mlp": 0.40698242, + "step": 3242, + "time_per_iteration": 2.6041946411132812 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052265, + "balance_loss_mlp": 1.01171017, + "epoch": 0.6238938053097345, + "flos": 513996814080.0, + "grad_norm": 0.03348358487194809, + "language_loss": 0.82661837, + "learning_rate": 0.00032729957255050936, + "loss": 0.83714104, + "num_input_tokens_seen": 270275472, + "router_z_loss_mlp": 0.40551758, + "step": 3243, + "time_per_iteration": 2.6362357139587402 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053144, + "balance_loss_mlp": 1.01263702, + "epoch": 0.6240861869949981, + "flos": 738023083776.0, + "grad_norm": 0.04011709848771047, + "language_loss": 0.82433391, + "learning_rate": 0.0003270072375783102, + "loss": 0.83486533, + "num_input_tokens_seen": 270348336, + "router_z_loss_mlp": 0.4050293, + "step": 3244, + "time_per_iteration": 2.890136241912842 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048944, + "balance_loss_mlp": 1.00855565, + "epoch": 0.6242785686802617, + "flos": 495709254144.0, + "grad_norm": 0.03469894111823996, + "language_loss": 0.80177683, + "learning_rate": 0.00032671496976785774, + "loss": 0.81226623, + "num_input_tokens_seen": 270416496, + "router_z_loss_mlp": 0.40380859, + "step": 3245, + "time_per_iteration": 2.6587681770324707 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051304, + "balance_loss_mlp": 1.01091611, + "epoch": 0.6244709503655252, + "flos": 747234421248.0, + "grad_norm": 0.03291682412434118, + "language_loss": 0.76093823, + "learning_rate": 0.0003264227692326205, + "loss": 0.77145123, + "num_input_tokens_seen": 270501680, + "router_z_loss_mlp": 0.40380859, + "step": 3246, + "time_per_iteration": 3.0954296588897705 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050368, + "balance_loss_mlp": 1.00995624, + "epoch": 0.6246633320507887, + "flos": 493551258624.0, + "grad_norm": 0.036876384824843206, + "language_loss": 0.86561215, + "learning_rate": 0.00032613063608604055, + "loss": 0.8761158, + "num_input_tokens_seen": 270568656, + "router_z_loss_mlp": 0.40405273, + "step": 3247, + "time_per_iteration": 2.632049560546875 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043396, + "balance_loss_mlp": 1.00296032, + "epoch": 0.6248557137360523, + "flos": 518392518144.0, + "grad_norm": 0.03391504049871655, + "language_loss": 0.84063625, + "learning_rate": 0.0003258385704415343, + "loss": 0.85107023, + "num_input_tokens_seen": 270636160, + "router_z_loss_mlp": 0.40429688, + "step": 3248, + "time_per_iteration": 2.580336809158325 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043586, + "balance_loss_mlp": 1.00317442, + "epoch": 0.6250480954213159, + "flos": 520429004544.0, + "grad_norm": 0.028687824097281916, + "language_loss": 0.83734399, + "learning_rate": 0.0003255465724124915, + "loss": 0.84777981, + "num_input_tokens_seen": 270708816, + "router_z_loss_mlp": 0.40405273, + "step": 3249, + "time_per_iteration": 2.699963331222534 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046143, + "balance_loss_mlp": 1.00580287, + "epoch": 0.6252404771065795, + "flos": 517070502144.0, + "grad_norm": 0.03444404266219843, + "language_loss": 0.83187747, + "learning_rate": 0.00032525464211227587, + "loss": 0.84233886, + "num_input_tokens_seen": 270778016, + "router_z_loss_mlp": 0.40332031, + "step": 3250, + "time_per_iteration": 2.590261697769165 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045897, + "balance_loss_mlp": 1.0055331, + "epoch": 0.6254328587918431, + "flos": 577997932800.0, + "grad_norm": 0.03271100856558234, + "language_loss": 0.86164498, + "learning_rate": 0.0003249627796542249, + "loss": 0.87210405, + "num_input_tokens_seen": 270847072, + "router_z_loss_mlp": 0.40356445, + "step": 3251, + "time_per_iteration": 2.706554412841797 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046601, + "balance_loss_mlp": 1.006284, + "epoch": 0.6256252404771065, + "flos": 599105468928.0, + "grad_norm": 0.035746905542485746, + "language_loss": 0.84805512, + "learning_rate": 0.00032467098515164943, + "loss": 0.8585211, + "num_input_tokens_seen": 270926320, + "router_z_loss_mlp": 0.40307617, + "step": 3252, + "time_per_iteration": 2.870948076248169 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044339, + "balance_loss_mlp": 1.00411773, + "epoch": 0.6258176221623701, + "flos": 509361982464.0, + "grad_norm": 0.036795712439313615, + "language_loss": 0.84738171, + "learning_rate": 0.00032437925871783456, + "loss": 0.85782516, + "num_input_tokens_seen": 270997904, + "router_z_loss_mlp": 0.40209961, + "step": 3253, + "time_per_iteration": 2.6761369705200195 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104486, + "balance_loss_mlp": 1.00468659, + "epoch": 0.6260100038476337, + "flos": 640805347584.0, + "grad_norm": 0.03851108593477808, + "language_loss": 0.85338682, + "learning_rate": 0.00032408760046603803, + "loss": 0.86383539, + "num_input_tokens_seen": 271074256, + "router_z_loss_mlp": 0.40161133, + "step": 3254, + "time_per_iteration": 2.8586931228637695 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043618, + "balance_loss_mlp": 1.00344408, + "epoch": 0.6262023855328973, + "flos": 842452609536.0, + "grad_norm": 0.03391057824911436, + "language_loss": 0.78393734, + "learning_rate": 0.00032379601050949193, + "loss": 0.79437345, + "num_input_tokens_seen": 271155152, + "router_z_loss_mlp": 0.40161133, + "step": 3255, + "time_per_iteration": 3.0973715782165527 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046535, + "balance_loss_mlp": 1.00629032, + "epoch": 0.6263947672181608, + "flos": 523157607168.0, + "grad_norm": 0.03422589562212714, + "language_loss": 0.8863821, + "learning_rate": 0.0003235044889614013, + "loss": 0.89684743, + "num_input_tokens_seen": 271224784, + "router_z_loss_mlp": 0.40234375, + "step": 3256, + "time_per_iteration": 2.643688917160034 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046154, + "balance_loss_mlp": 1.00593293, + "epoch": 0.6265871489034244, + "flos": 608290561536.0, + "grad_norm": 0.06509285278700487, + "language_loss": 0.84065372, + "learning_rate": 0.0003232130359349451, + "loss": 0.85111523, + "num_input_tokens_seen": 271303584, + "router_z_loss_mlp": 0.40209961, + "step": 3257, + "time_per_iteration": 2.859252452850342 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047074, + "balance_loss_mlp": 1.00690067, + "epoch": 0.626779530588688, + "flos": 589594732800.0, + "grad_norm": 0.03191133097735202, + "language_loss": 0.82224607, + "learning_rate": 0.0003229216515432751, + "loss": 0.83271682, + "num_input_tokens_seen": 271379632, + "router_z_loss_mlp": 0.40161133, + "step": 3258, + "time_per_iteration": 2.7475619316101074 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046523, + "balance_loss_mlp": 1.00625372, + "epoch": 0.6269719122739515, + "flos": 439538164224.0, + "grad_norm": 0.04023600043450841, + "language_loss": 0.80242079, + "learning_rate": 0.0003226303358995174, + "loss": 0.81288606, + "num_input_tokens_seen": 271447808, + "router_z_loss_mlp": 0.40258789, + "step": 3259, + "time_per_iteration": 2.5837466716766357 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104682, + "balance_loss_mlp": 1.00647962, + "epoch": 0.6271642939592151, + "flos": 564015670272.0, + "grad_norm": 0.027274694738231114, + "language_loss": 0.88901317, + "learning_rate": 0.00032233908911677, + "loss": 0.89948136, + "num_input_tokens_seen": 271526768, + "router_z_loss_mlp": 0.40332031, + "step": 3260, + "time_per_iteration": 2.825246810913086 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044855, + "balance_loss_mlp": 1.00465786, + "epoch": 0.6273566756444786, + "flos": 515653221888.0, + "grad_norm": 0.03753718779185775, + "language_loss": 0.81557947, + "learning_rate": 0.0003220479113081053, + "loss": 0.82602805, + "num_input_tokens_seen": 271597840, + "router_z_loss_mlp": 0.40185547, + "step": 3261, + "time_per_iteration": 2.7426939010620117 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046001, + "balance_loss_mlp": 1.00566065, + "epoch": 0.6275490573297422, + "flos": 586588118784.0, + "grad_norm": 0.04387524863401932, + "language_loss": 0.79368806, + "learning_rate": 0.00032175680258656836, + "loss": 0.80414808, + "num_input_tokens_seen": 271668352, + "router_z_loss_mlp": 0.40332031, + "step": 3262, + "time_per_iteration": 2.704888343811035 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047409, + "balance_loss_mlp": 1.007092, + "epoch": 0.6277414390150058, + "flos": 560544407040.0, + "grad_norm": 0.03394703934758085, + "language_loss": 0.80846763, + "learning_rate": 0.00032146576306517794, + "loss": 0.81894171, + "num_input_tokens_seen": 271743936, + "router_z_loss_mlp": 0.40307617, + "step": 3263, + "time_per_iteration": 2.744232654571533 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045775, + "balance_loss_mlp": 1.00529134, + "epoch": 0.6279338207002694, + "flos": 613841085696.0, + "grad_norm": 0.03564897241316152, + "language_loss": 0.81241357, + "learning_rate": 0.0003211747928569255, + "loss": 0.82287127, + "num_input_tokens_seen": 271817008, + "router_z_loss_mlp": 0.40478516, + "step": 3264, + "time_per_iteration": 2.7210609912872314 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047934, + "balance_loss_mlp": 1.00754583, + "epoch": 0.6281262023855329, + "flos": 626933900544.0, + "grad_norm": 0.03587918693245657, + "language_loss": 0.81859601, + "learning_rate": 0.0003208838920747754, + "loss": 0.82907528, + "num_input_tokens_seen": 271896960, + "router_z_loss_mlp": 0.40380859, + "step": 3265, + "time_per_iteration": 2.828963041305542 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044178, + "balance_loss_mlp": 1.00379026, + "epoch": 0.6283185840707964, + "flos": 1125420367872.0, + "grad_norm": 0.03507856752255015, + "language_loss": 0.77222586, + "learning_rate": 0.0003205930608316656, + "loss": 0.78266764, + "num_input_tokens_seen": 271985008, + "router_z_loss_mlp": 0.40380859, + "step": 3266, + "time_per_iteration": 3.4536292552948 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104288, + "balance_loss_mlp": 1.00251615, + "epoch": 0.62851096575606, + "flos": 516332699136.0, + "grad_norm": 0.05679261767260983, + "language_loss": 0.85571408, + "learning_rate": 0.00032030229924050673, + "loss": 0.86614287, + "num_input_tokens_seen": 272056368, + "router_z_loss_mlp": 0.40356445, + "step": 3267, + "time_per_iteration": 2.669522285461426 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048611, + "balance_loss_mlp": 1.00815153, + "epoch": 0.6287033474413236, + "flos": 405062714112.0, + "grad_norm": 0.035560546659782886, + "language_loss": 0.80196536, + "learning_rate": 0.00032001160741418247, + "loss": 0.81245148, + "num_input_tokens_seen": 272123424, + "router_z_loss_mlp": 0.40454102, + "step": 3268, + "time_per_iteration": 2.6049489974975586 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044655, + "balance_loss_mlp": 1.00421953, + "epoch": 0.6288957291265872, + "flos": 526759127808.0, + "grad_norm": 0.05710921395997567, + "language_loss": 0.8274591, + "learning_rate": 0.0003197209854655494, + "loss": 0.83790565, + "num_input_tokens_seen": 272193008, + "router_z_loss_mlp": 0.40429688, + "step": 3269, + "time_per_iteration": 2.6551384925842285 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043687, + "balance_loss_mlp": 1.00313175, + "epoch": 0.6290881108118507, + "flos": 604958304000.0, + "grad_norm": 0.03774804220071916, + "language_loss": 0.75090307, + "learning_rate": 0.0003194304335074371, + "loss": 0.7613399, + "num_input_tokens_seen": 272275328, + "router_z_loss_mlp": 0.40551758, + "step": 3270, + "time_per_iteration": 2.851900577545166 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049729, + "balance_loss_mlp": 1.0093174, + "epoch": 0.6292804924971143, + "flos": 438598172160.0, + "grad_norm": 0.03683695296075174, + "language_loss": 0.89063656, + "learning_rate": 0.0003191399516526475, + "loss": 0.90113389, + "num_input_tokens_seen": 272339328, + "router_z_loss_mlp": 0.40405273, + "step": 3271, + "time_per_iteration": 2.5034451484680176 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045325, + "balance_loss_mlp": 1.00488937, + "epoch": 0.6294728741823779, + "flos": 607845354240.0, + "grad_norm": 0.03066213341534494, + "language_loss": 0.79802763, + "learning_rate": 0.0003188495400139559, + "loss": 0.80848092, + "num_input_tokens_seen": 272416336, + "router_z_loss_mlp": 0.40429688, + "step": 3272, + "time_per_iteration": 2.780644178390503 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045156, + "balance_loss_mlp": 1.00486362, + "epoch": 0.6296652558676414, + "flos": 702774837504.0, + "grad_norm": 0.038362375592622004, + "language_loss": 0.85288656, + "learning_rate": 0.00031855919870411013, + "loss": 0.86333817, + "num_input_tokens_seen": 272490368, + "router_z_loss_mlp": 0.40283203, + "step": 3273, + "time_per_iteration": 2.8482918739318848 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048015, + "balance_loss_mlp": 1.00769854, + "epoch": 0.6298576375529049, + "flos": 524944272384.0, + "grad_norm": 0.03395775035270535, + "language_loss": 0.85278755, + "learning_rate": 0.0003182689278358305, + "loss": 0.86326772, + "num_input_tokens_seen": 272562992, + "router_z_loss_mlp": 0.40307617, + "step": 3274, + "time_per_iteration": 2.7457242012023926 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046085, + "balance_loss_mlp": 1.00567281, + "epoch": 0.6300500192381685, + "flos": 476926909440.0, + "grad_norm": 0.036436552387549975, + "language_loss": 0.80145383, + "learning_rate": 0.0003179787275218105, + "loss": 0.81191462, + "num_input_tokens_seen": 272629456, + "router_z_loss_mlp": 0.40405273, + "step": 3275, + "time_per_iteration": 2.567723274230957 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044022, + "balance_loss_mlp": 1.00372946, + "epoch": 0.6302424009234321, + "flos": 521891971584.0, + "grad_norm": 0.03333768301867296, + "language_loss": 0.84862459, + "learning_rate": 0.0003176885978747155, + "loss": 0.85906482, + "num_input_tokens_seen": 272697440, + "router_z_loss_mlp": 0.40283203, + "step": 3276, + "time_per_iteration": 2.6513776779174805 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046077, + "balance_loss_mlp": 1.00587988, + "epoch": 0.6304347826086957, + "flos": 695858555904.0, + "grad_norm": 0.03467401587057451, + "language_loss": 0.83325267, + "learning_rate": 0.0003173985390071839, + "loss": 0.84371352, + "num_input_tokens_seen": 272774080, + "router_z_loss_mlp": 0.40185547, + "step": 3277, + "time_per_iteration": 2.876150131225586 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054981, + "balance_loss_mlp": 1.01578522, + "epoch": 0.6306271642939593, + "flos": 1470032928000.0, + "grad_norm": 0.010139969116537896, + "language_loss": 0.77900457, + "learning_rate": 0.00031710855103182675, + "loss": 0.78955436, + "num_input_tokens_seen": 272998512, + "router_z_loss_mlp": 0.39160156, + "step": 3278, + "time_per_iteration": 4.770167589187622 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045902, + "balance_loss_mlp": 1.00548971, + "epoch": 0.6308195459792227, + "flos": 602930565888.0, + "grad_norm": 0.03526553994141675, + "language_loss": 0.81487232, + "learning_rate": 0.00031681863406122704, + "loss": 0.82533133, + "num_input_tokens_seen": 273074672, + "router_z_loss_mlp": 0.40405273, + "step": 3279, + "time_per_iteration": 2.7587971687316895 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043917, + "balance_loss_mlp": 1.0036478, + "epoch": 0.6310119276644863, + "flos": 728237248512.0, + "grad_norm": 0.034493081934242914, + "language_loss": 0.85473228, + "learning_rate": 0.00031652878820794087, + "loss": 0.86517143, + "num_input_tokens_seen": 273157904, + "router_z_loss_mlp": 0.40258789, + "step": 3280, + "time_per_iteration": 2.9854700565338135 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045188, + "balance_loss_mlp": 1.00484729, + "epoch": 0.6312043093497499, + "flos": 520819776768.0, + "grad_norm": 0.037869406847462164, + "language_loss": 0.8647517, + "learning_rate": 0.00031623901358449627, + "loss": 0.87520361, + "num_input_tokens_seen": 273228160, + "router_z_loss_mlp": 0.40332031, + "step": 3281, + "time_per_iteration": 2.626267910003662 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044292, + "balance_loss_mlp": 1.00399899, + "epoch": 0.6313966910350135, + "flos": 532223136000.0, + "grad_norm": 0.03407480500665165, + "language_loss": 0.88792193, + "learning_rate": 0.0003159493103033936, + "loss": 0.89836484, + "num_input_tokens_seen": 273295872, + "router_z_loss_mlp": 0.40283203, + "step": 3282, + "time_per_iteration": 2.574249505996704 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046448, + "balance_loss_mlp": 1.00734711, + "epoch": 0.631589072720277, + "flos": 1382996656896.0, + "grad_norm": 0.01146599852639075, + "language_loss": 0.79919052, + "learning_rate": 0.00031565967847710564, + "loss": 0.80965501, + "num_input_tokens_seen": 273524320, + "router_z_loss_mlp": 0.390625, + "step": 3283, + "time_per_iteration": 4.8656487464904785 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047306, + "balance_loss_mlp": 1.00684595, + "epoch": 0.6317814544055406, + "flos": 625874344704.0, + "grad_norm": 0.030628800549983924, + "language_loss": 0.83010268, + "learning_rate": 0.0003153701182180776, + "loss": 0.84057581, + "num_input_tokens_seen": 273598544, + "router_z_loss_mlp": 0.40454102, + "step": 3284, + "time_per_iteration": 2.803232431411743 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047972, + "balance_loss_mlp": 1.00751245, + "epoch": 0.6319738360908042, + "flos": 499097892096.0, + "grad_norm": 0.036572578748274465, + "language_loss": 0.82564306, + "learning_rate": 0.00031508062963872655, + "loss": 0.83612275, + "num_input_tokens_seen": 273666000, + "router_z_loss_mlp": 0.40454102, + "step": 3285, + "time_per_iteration": 2.5559017658233643 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046554, + "balance_loss_mlp": 1.00602329, + "epoch": 0.6321662177760677, + "flos": 580909282560.0, + "grad_norm": 0.041327466784405305, + "language_loss": 0.80268341, + "learning_rate": 0.0003147912128514423, + "loss": 0.81314898, + "num_input_tokens_seen": 273742672, + "router_z_loss_mlp": 0.40527344, + "step": 3286, + "time_per_iteration": 2.7093169689178467 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044263, + "balance_loss_mlp": 1.00380301, + "epoch": 0.6323585994613313, + "flos": 602606867712.0, + "grad_norm": 0.0363944042801657, + "language_loss": 0.87847489, + "learning_rate": 0.0003145018679685859, + "loss": 0.88891751, + "num_input_tokens_seen": 273813984, + "router_z_loss_mlp": 0.40454102, + "step": 3287, + "time_per_iteration": 2.741680145263672 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047425, + "balance_loss_mlp": 1.00691795, + "epoch": 0.6325509811465948, + "flos": 529633539072.0, + "grad_norm": 0.02715728015284293, + "language_loss": 0.88303924, + "learning_rate": 0.00031421259510249134, + "loss": 0.89351344, + "num_input_tokens_seen": 273892848, + "router_z_loss_mlp": 0.4050293, + "step": 3288, + "time_per_iteration": 2.793593406677246 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050359, + "balance_loss_mlp": 1.00975657, + "epoch": 0.6327433628318584, + "flos": 575345152512.0, + "grad_norm": 0.03790719604682011, + "language_loss": 0.8176173, + "learning_rate": 0.00031392339436546414, + "loss": 0.82812083, + "num_input_tokens_seen": 273971696, + "router_z_loss_mlp": 0.40600586, + "step": 3289, + "time_per_iteration": 2.806328773498535 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105028, + "balance_loss_mlp": 1.00960624, + "epoch": 0.632935744517122, + "flos": 518112561408.0, + "grad_norm": 0.04130029787255878, + "language_loss": 0.84016752, + "learning_rate": 0.00031363426586978205, + "loss": 0.85067028, + "num_input_tokens_seen": 274048096, + "router_z_loss_mlp": 0.40673828, + "step": 3290, + "time_per_iteration": 2.815406322479248 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104727, + "balance_loss_mlp": 1.00676227, + "epoch": 0.6331281262023856, + "flos": 618597426432.0, + "grad_norm": 0.031083560389852355, + "language_loss": 0.85119176, + "learning_rate": 0.0003133452097276947, + "loss": 0.86166441, + "num_input_tokens_seen": 274122848, + "router_z_loss_mlp": 0.4050293, + "step": 3291, + "time_per_iteration": 2.7325408458709717 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104523, + "balance_loss_mlp": 1.00465119, + "epoch": 0.633320507887649, + "flos": 594116803584.0, + "grad_norm": 0.03244834687463976, + "language_loss": 0.84650022, + "learning_rate": 0.0003130562260514238, + "loss": 0.85695255, + "num_input_tokens_seen": 274198320, + "router_z_loss_mlp": 0.40576172, + "step": 3292, + "time_per_iteration": 2.7858352661132812 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046449, + "balance_loss_mlp": 1.00582266, + "epoch": 0.6335128895729126, + "flos": 583496934144.0, + "grad_norm": 0.03053589669397976, + "language_loss": 0.8217054, + "learning_rate": 0.0003127673149531626, + "loss": 0.83216989, + "num_input_tokens_seen": 274274944, + "router_z_loss_mlp": 0.40625, + "step": 3293, + "time_per_iteration": 2.755866050720215 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045667, + "balance_loss_mlp": 1.00506401, + "epoch": 0.6337052712581762, + "flos": 453974382336.0, + "grad_norm": 0.03437959175785583, + "language_loss": 0.83448106, + "learning_rate": 0.0003124784765450762, + "loss": 0.84493768, + "num_input_tokens_seen": 274342384, + "router_z_loss_mlp": 0.40600586, + "step": 3294, + "time_per_iteration": 2.555196762084961 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045908, + "balance_loss_mlp": 1.00535333, + "epoch": 0.6338976529434398, + "flos": 574515975936.0, + "grad_norm": 0.03647562664134654, + "language_loss": 0.810781, + "learning_rate": 0.0003121897109393017, + "loss": 0.82124007, + "num_input_tokens_seen": 274417568, + "router_z_loss_mlp": 0.40551758, + "step": 3295, + "time_per_iteration": 2.726447582244873 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044921, + "balance_loss_mlp": 1.00441372, + "epoch": 0.6340900346287034, + "flos": 509809135104.0, + "grad_norm": 0.0325303094953836, + "language_loss": 0.89509195, + "learning_rate": 0.0003119010182479481, + "loss": 0.90554118, + "num_input_tokens_seen": 274488960, + "router_z_loss_mlp": 0.4050293, + "step": 3296, + "time_per_iteration": 2.6128556728363037 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044733, + "balance_loss_mlp": 1.00422609, + "epoch": 0.6342824163139669, + "flos": 480715067904.0, + "grad_norm": 0.036682379732438104, + "language_loss": 0.8339026, + "learning_rate": 0.00031161239858309563, + "loss": 0.84434992, + "num_input_tokens_seen": 274556880, + "router_z_loss_mlp": 0.4050293, + "step": 3297, + "time_per_iteration": 2.571183443069458 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043714, + "balance_loss_mlp": 1.00323093, + "epoch": 0.6344747979992305, + "flos": 573111334656.0, + "grad_norm": 0.03822576874130642, + "language_loss": 0.83954668, + "learning_rate": 0.0003113238520567964, + "loss": 0.84998387, + "num_input_tokens_seen": 274624944, + "router_z_loss_mlp": 0.40478516, + "step": 3298, + "time_per_iteration": 2.677607297897339 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01041846, + "balance_loss_mlp": 1.00143397, + "epoch": 0.634667179684494, + "flos": 607046313216.0, + "grad_norm": 0.03748382415323519, + "language_loss": 0.818299, + "learning_rate": 0.00031103537878107403, + "loss": 0.82871747, + "num_input_tokens_seen": 274695152, + "router_z_loss_mlp": 0.40405273, + "step": 3299, + "time_per_iteration": 2.731858730316162 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01041231, + "balance_loss_mlp": 1.0007478, + "epoch": 0.6348595613697576, + "flos": 648129897984.0, + "grad_norm": 0.036818455755728355, + "language_loss": 0.80712759, + "learning_rate": 0.0003107469788679238, + "loss": 0.81753987, + "num_input_tokens_seen": 274767840, + "router_z_loss_mlp": 0.40478516, + "step": 3300, + "time_per_iteration": 2.811863660812378 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01041324, + "balance_loss_mlp": 1.00088787, + "epoch": 0.6350519430550212, + "flos": 640273624320.0, + "grad_norm": 0.03493243312285999, + "language_loss": 0.872877, + "learning_rate": 0.00031045865242931267, + "loss": 0.88329029, + "num_input_tokens_seen": 274839312, + "router_z_loss_mlp": 0.40429688, + "step": 3301, + "time_per_iteration": 2.7718210220336914 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042406, + "balance_loss_mlp": 1.00206506, + "epoch": 0.6352443247402847, + "flos": 687831195648.0, + "grad_norm": 0.031178821676135258, + "language_loss": 0.83354819, + "learning_rate": 0.00031017039957717877, + "loss": 0.84397227, + "num_input_tokens_seen": 274922704, + "router_z_loss_mlp": 0.40332031, + "step": 3302, + "time_per_iteration": 3.0323870182037354 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050725, + "balance_loss_mlp": 1.01028883, + "epoch": 0.6354367064255483, + "flos": 560526910464.0, + "grad_norm": 0.03426704048429257, + "language_loss": 0.89209497, + "learning_rate": 0.0003098822204234318, + "loss": 0.9026022, + "num_input_tokens_seen": 274992848, + "router_z_loss_mlp": 0.40429688, + "step": 3303, + "time_per_iteration": 2.688183069229126 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048749, + "balance_loss_mlp": 1.00831378, + "epoch": 0.6356290881108119, + "flos": 981062077440.0, + "grad_norm": 0.05617774198225317, + "language_loss": 0.88024724, + "learning_rate": 0.00030959411507995273, + "loss": 0.89073473, + "num_input_tokens_seen": 275071456, + "router_z_loss_mlp": 0.40429688, + "step": 3304, + "time_per_iteration": 3.2071332931518555 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050985, + "balance_loss_mlp": 1.01050138, + "epoch": 0.6358214697960755, + "flos": 529373024256.0, + "grad_norm": 0.04089277764533041, + "language_loss": 0.81679875, + "learning_rate": 0.00030930608365859407, + "loss": 0.82730865, + "num_input_tokens_seen": 275140512, + "router_z_loss_mlp": 0.40478516, + "step": 3305, + "time_per_iteration": 2.6791036128997803 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052235, + "balance_loss_mlp": 1.01184678, + "epoch": 0.6360138514813389, + "flos": 517869543168.0, + "grad_norm": 0.03251934179180288, + "language_loss": 0.88227487, + "learning_rate": 0.00030901812627117943, + "loss": 0.89279723, + "num_input_tokens_seen": 275210896, + "router_z_loss_mlp": 0.40380859, + "step": 3306, + "time_per_iteration": 2.643564462661743 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047687, + "balance_loss_mlp": 1.00720358, + "epoch": 0.6362062331666025, + "flos": 467470608384.0, + "grad_norm": 0.0425448547397637, + "language_loss": 0.85627687, + "learning_rate": 0.000308730243029504, + "loss": 0.8667537, + "num_input_tokens_seen": 275279888, + "router_z_loss_mlp": 0.40478516, + "step": 3307, + "time_per_iteration": 2.5909810066223145 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049053, + "balance_loss_mlp": 1.00854588, + "epoch": 0.6363986148518661, + "flos": 550773156096.0, + "grad_norm": 0.03484330169343757, + "language_loss": 0.80282146, + "learning_rate": 0.0003084424340453339, + "loss": 0.81331193, + "num_input_tokens_seen": 275357056, + "router_z_loss_mlp": 0.4050293, + "step": 3308, + "time_per_iteration": 2.84796142578125 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048791, + "balance_loss_mlp": 1.00830781, + "epoch": 0.6365909965371297, + "flos": 584158914816.0, + "grad_norm": 0.03632736574425893, + "language_loss": 0.82740968, + "learning_rate": 0.0003081546994304064, + "loss": 0.83789754, + "num_input_tokens_seen": 275428240, + "router_z_loss_mlp": 0.40478516, + "step": 3309, + "time_per_iteration": 2.7956221103668213 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105091, + "balance_loss_mlp": 1.01052189, + "epoch": 0.6367833782223933, + "flos": 532288264704.0, + "grad_norm": 0.03383722740926899, + "language_loss": 0.83152783, + "learning_rate": 0.0003078670392964298, + "loss": 0.8420369, + "num_input_tokens_seen": 275497568, + "router_z_loss_mlp": 0.40380859, + "step": 3310, + "time_per_iteration": 2.6194021701812744 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049103, + "balance_loss_mlp": 1.00883412, + "epoch": 0.6369757599076568, + "flos": 570588811776.0, + "grad_norm": 0.03520180951361345, + "language_loss": 0.83487624, + "learning_rate": 0.00030757945375508406, + "loss": 0.84536731, + "num_input_tokens_seen": 275569616, + "router_z_loss_mlp": 0.40258789, + "step": 3311, + "time_per_iteration": 2.636317729949951 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046751, + "balance_loss_mlp": 1.00614858, + "epoch": 0.6371681415929203, + "flos": 541054394880.0, + "grad_norm": 0.03810911352031966, + "language_loss": 0.81548536, + "learning_rate": 0.00030729194291801944, + "loss": 0.82595289, + "num_input_tokens_seen": 275641408, + "router_z_loss_mlp": 0.40600586, + "step": 3312, + "time_per_iteration": 2.6490793228149414 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045052, + "balance_loss_mlp": 1.00452065, + "epoch": 0.6373605232781839, + "flos": 484531416576.0, + "grad_norm": 0.03667535496624994, + "language_loss": 0.77428758, + "learning_rate": 0.00030700450689685787, + "loss": 0.78473806, + "num_input_tokens_seen": 275706608, + "router_z_loss_mlp": 0.40527344, + "step": 3313, + "time_per_iteration": 2.535402774810791 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046144, + "balance_loss_mlp": 1.00575566, + "epoch": 0.6375529049634475, + "flos": 579817645824.0, + "grad_norm": 0.03891693330572632, + "language_loss": 0.85701913, + "learning_rate": 0.00030671714580319186, + "loss": 0.86748058, + "num_input_tokens_seen": 275785952, + "router_z_loss_mlp": 0.40380859, + "step": 3314, + "time_per_iteration": 2.8058876991271973 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044916, + "balance_loss_mlp": 1.00433683, + "epoch": 0.637745286648711, + "flos": 683480211456.0, + "grad_norm": 0.11702238081113171, + "language_loss": 0.83888423, + "learning_rate": 0.0003064298597485846, + "loss": 0.84933341, + "num_input_tokens_seen": 275866240, + "router_z_loss_mlp": 0.40576172, + "step": 3315, + "time_per_iteration": 2.8778491020202637 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045329, + "balance_loss_mlp": 1.00489366, + "epoch": 0.6379376683339746, + "flos": 505649646336.0, + "grad_norm": 0.05211428291246213, + "language_loss": 0.84419525, + "learning_rate": 0.00030614264884457054, + "loss": 0.85464859, + "num_input_tokens_seen": 275936176, + "router_z_loss_mlp": 0.40429688, + "step": 3316, + "time_per_iteration": 2.624901533126831 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050301, + "balance_loss_mlp": 1.00977015, + "epoch": 0.6381300500192382, + "flos": 503025056256.0, + "grad_norm": 0.0426813784455398, + "language_loss": 0.77854991, + "learning_rate": 0.000305855513202655, + "loss": 0.7890529, + "num_input_tokens_seen": 276004608, + "router_z_loss_mlp": 0.40527344, + "step": 3317, + "time_per_iteration": 2.5690500736236572 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048316, + "balance_loss_mlp": 1.0077374, + "epoch": 0.6383224317045018, + "flos": 401367874560.0, + "grad_norm": 0.04267134147869369, + "language_loss": 0.78333461, + "learning_rate": 0.0003055684529343138, + "loss": 0.79381788, + "num_input_tokens_seen": 276066688, + "router_z_loss_mlp": 0.40576172, + "step": 3318, + "time_per_iteration": 2.4513895511627197 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054177, + "balance_loss_mlp": 1.01378846, + "epoch": 0.6385148133897653, + "flos": 500363527680.0, + "grad_norm": 0.0362987336754338, + "language_loss": 0.78882575, + "learning_rate": 0.00030528146815099374, + "loss": 0.79936755, + "num_input_tokens_seen": 276140000, + "router_z_loss_mlp": 0.40380859, + "step": 3319, + "time_per_iteration": 2.6613929271698 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01058923, + "balance_loss_mlp": 1.01851058, + "epoch": 0.6387071950750288, + "flos": 528695492352.0, + "grad_norm": 0.033070910188452485, + "language_loss": 0.72438365, + "learning_rate": 0.00030499455896411203, + "loss": 0.73497283, + "num_input_tokens_seen": 276209840, + "router_z_loss_mlp": 0.40405273, + "step": 3320, + "time_per_iteration": 2.641817092895508 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062973, + "balance_loss_mlp": 1.02330017, + "epoch": 0.6388995767602924, + "flos": 1459106856960.0, + "grad_norm": 0.013037560040261834, + "language_loss": 0.76300812, + "learning_rate": 0.0003047077254850568, + "loss": 0.77363789, + "num_input_tokens_seen": 276444784, + "router_z_loss_mlp": 0.39648438, + "step": 3321, + "time_per_iteration": 4.960562705993652 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048303, + "balance_loss_mlp": 1.00777197, + "epoch": 0.639091958445556, + "flos": 605171186688.0, + "grad_norm": 0.03633146914450565, + "language_loss": 0.77279496, + "learning_rate": 0.0003044209678251865, + "loss": 0.78327799, + "num_input_tokens_seen": 276522768, + "router_z_loss_mlp": 0.40527344, + "step": 3322, + "time_per_iteration": 2.875474691390991 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048137, + "balance_loss_mlp": 1.00762939, + "epoch": 0.6392843401308196, + "flos": 585665623296.0, + "grad_norm": 0.031694233880752425, + "language_loss": 0.85324746, + "learning_rate": 0.0003041342860958306, + "loss": 0.86372876, + "num_input_tokens_seen": 276597104, + "router_z_loss_mlp": 0.4050293, + "step": 3323, + "time_per_iteration": 2.7719669342041016 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049921, + "balance_loss_mlp": 1.00939035, + "epoch": 0.6394767218160831, + "flos": 515729044224.0, + "grad_norm": 0.03911936056883103, + "language_loss": 0.91999781, + "learning_rate": 0.00030384768040828857, + "loss": 0.93049705, + "num_input_tokens_seen": 276670256, + "router_z_loss_mlp": 0.40527344, + "step": 3324, + "time_per_iteration": 2.6998729705810547 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046675, + "balance_loss_mlp": 1.00607193, + "epoch": 0.6396691035013466, + "flos": 542777876736.0, + "grad_norm": 0.04757896669484628, + "language_loss": 0.86295962, + "learning_rate": 0.00030356115087383094, + "loss": 0.87342638, + "num_input_tokens_seen": 276737680, + "router_z_loss_mlp": 0.40600586, + "step": 3325, + "time_per_iteration": 2.701478958129883 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050797, + "balance_loss_mlp": 1.01033795, + "epoch": 0.6398614851866102, + "flos": 526554993408.0, + "grad_norm": 0.04173120766563636, + "language_loss": 0.85232729, + "learning_rate": 0.00030327469760369803, + "loss": 0.86283523, + "num_input_tokens_seen": 276803808, + "router_z_loss_mlp": 0.40454102, + "step": 3326, + "time_per_iteration": 2.5700113773345947 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048429, + "balance_loss_mlp": 1.0079217, + "epoch": 0.6400538668718738, + "flos": 624135311616.0, + "grad_norm": 0.07319214553535336, + "language_loss": 0.85706425, + "learning_rate": 0.0003029883207091009, + "loss": 0.86754858, + "num_input_tokens_seen": 276874752, + "router_z_loss_mlp": 0.4050293, + "step": 3327, + "time_per_iteration": 2.7076821327209473 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044376, + "balance_loss_mlp": 1.00391674, + "epoch": 0.6402462485571374, + "flos": 504455942400.0, + "grad_norm": 0.03613290239480707, + "language_loss": 0.78819323, + "learning_rate": 0.00030270202030122095, + "loss": 0.79863703, + "num_input_tokens_seen": 276947200, + "router_z_loss_mlp": 0.40454102, + "step": 3328, + "time_per_iteration": 2.7022666931152344 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043909, + "balance_loss_mlp": 1.00337768, + "epoch": 0.6404386302424009, + "flos": 820663650816.0, + "grad_norm": 0.036325579184177476, + "language_loss": 0.8635475, + "learning_rate": 0.00030241579649121, + "loss": 0.8739866, + "num_input_tokens_seen": 277025712, + "router_z_loss_mlp": 0.40527344, + "step": 3329, + "time_per_iteration": 2.985426902770996 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048537, + "balance_loss_mlp": 1.0080061, + "epoch": 0.6406310119276645, + "flos": 472793665536.0, + "grad_norm": 0.03267380509371782, + "language_loss": 0.80188096, + "learning_rate": 0.00030212964939018994, + "loss": 0.81236637, + "num_input_tokens_seen": 277091264, + "router_z_loss_mlp": 0.40527344, + "step": 3330, + "time_per_iteration": 2.550344228744507 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048063, + "balance_loss_mlp": 1.00753188, + "epoch": 0.6408233936129281, + "flos": 426489090816.0, + "grad_norm": 0.03827308355906826, + "language_loss": 0.86015689, + "learning_rate": 0.0003018435791092527, + "loss": 0.87063748, + "num_input_tokens_seen": 277154608, + "router_z_loss_mlp": 0.40527344, + "step": 3331, + "time_per_iteration": 2.4880104064941406 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042416, + "balance_loss_mlp": 1.00186062, + "epoch": 0.6410157752981916, + "flos": 550838284800.0, + "grad_norm": 0.0342671152523666, + "language_loss": 0.81525755, + "learning_rate": 0.00030155758575946083, + "loss": 0.82568169, + "num_input_tokens_seen": 277222176, + "router_z_loss_mlp": 0.40551758, + "step": 3332, + "time_per_iteration": 2.6726834774017334 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043186, + "balance_loss_mlp": 1.00267851, + "epoch": 0.6412081569834551, + "flos": 476861780736.0, + "grad_norm": 0.03538778522895548, + "language_loss": 0.84473503, + "learning_rate": 0.0003012716694518467, + "loss": 0.85516679, + "num_input_tokens_seen": 277289600, + "router_z_loss_mlp": 0.4050293, + "step": 3333, + "time_per_iteration": 2.5853443145751953 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042688, + "balance_loss_mlp": 1.00206196, + "epoch": 0.6414005386687187, + "flos": 542031325440.0, + "grad_norm": 0.03182184712742977, + "language_loss": 0.85642707, + "learning_rate": 0.000300985830297413, + "loss": 0.86685395, + "num_input_tokens_seen": 277362784, + "router_z_loss_mlp": 0.40625, + "step": 3334, + "time_per_iteration": 2.699078321456909 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042335, + "balance_loss_mlp": 1.00170887, + "epoch": 0.6415929203539823, + "flos": 1042957690368.0, + "grad_norm": 0.0341924045479309, + "language_loss": 0.88431525, + "learning_rate": 0.00030070006840713205, + "loss": 0.89473861, + "num_input_tokens_seen": 277449728, + "router_z_loss_mlp": 0.40625, + "step": 3335, + "time_per_iteration": 3.373852014541626 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046743, + "balance_loss_mlp": 1.0060693, + "epoch": 0.6417853020392459, + "flos": 649580226048.0, + "grad_norm": 0.035751052988779126, + "language_loss": 0.74186742, + "learning_rate": 0.000300414383891947, + "loss": 0.75233489, + "num_input_tokens_seen": 277527552, + "router_z_loss_mlp": 0.40673828, + "step": 3336, + "time_per_iteration": 2.86029314994812 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104222, + "balance_loss_mlp": 1.0014739, + "epoch": 0.6419776837245095, + "flos": 501944113152.0, + "grad_norm": 0.02988455094961003, + "language_loss": 0.89225817, + "learning_rate": 0.00030012877686276973, + "loss": 0.90268028, + "num_input_tokens_seen": 277603568, + "router_z_loss_mlp": 0.4074707, + "step": 3337, + "time_per_iteration": 2.72491455078125 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046417, + "balance_loss_mlp": 1.00569534, + "epoch": 0.642170065409773, + "flos": 621779984640.0, + "grad_norm": 0.03237702044621704, + "language_loss": 0.87225235, + "learning_rate": 0.0002998432474304832, + "loss": 0.88271654, + "num_input_tokens_seen": 277679696, + "router_z_loss_mlp": 0.40722656, + "step": 3338, + "time_per_iteration": 2.7576870918273926 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051331, + "balance_loss_mlp": 1.01165771, + "epoch": 0.6423624470950365, + "flos": 1426641648384.0, + "grad_norm": 0.016568770215616015, + "language_loss": 0.79237342, + "learning_rate": 0.0002995577957059395, + "loss": 0.80288672, + "num_input_tokens_seen": 277913056, + "router_z_loss_mlp": 0.39648438, + "step": 3339, + "time_per_iteration": 4.923727035522461 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01041967, + "balance_loss_mlp": 1.00143564, + "epoch": 0.6425548287803001, + "flos": 563440205568.0, + "grad_norm": 0.03881466361138169, + "language_loss": 0.890571, + "learning_rate": 0.00029927242179996107, + "loss": 0.90099066, + "num_input_tokens_seen": 277983168, + "router_z_loss_mlp": 0.40527344, + "step": 3340, + "time_per_iteration": 2.7034361362457275 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042658, + "balance_loss_mlp": 1.00212634, + "epoch": 0.6427472104655637, + "flos": 586614363648.0, + "grad_norm": 0.030378234734855056, + "language_loss": 0.83618605, + "learning_rate": 0.0002989871258233398, + "loss": 0.84661257, + "num_input_tokens_seen": 278057600, + "router_z_loss_mlp": 0.40527344, + "step": 3341, + "time_per_iteration": 2.7497901916503906 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042745, + "balance_loss_mlp": 1.00211823, + "epoch": 0.6429395921508272, + "flos": 405147284736.0, + "grad_norm": 0.03870957855804831, + "language_loss": 0.83240426, + "learning_rate": 0.0002987019078868373, + "loss": 0.84283173, + "num_input_tokens_seen": 278119232, + "router_z_loss_mlp": 0.40625, + "step": 3342, + "time_per_iteration": 2.425215005874634 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044277, + "balance_loss_mlp": 1.00362682, + "epoch": 0.6431319738360908, + "flos": 549833164032.0, + "grad_norm": 0.031726413731120486, + "language_loss": 0.82255763, + "learning_rate": 0.00029841676810118484, + "loss": 0.83300042, + "num_input_tokens_seen": 278187456, + "router_z_loss_mlp": 0.40649414, + "step": 3343, + "time_per_iteration": 2.693652629852295 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044358, + "balance_loss_mlp": 1.00368381, + "epoch": 0.6433243555213544, + "flos": 794706455040.0, + "grad_norm": 0.03684738873998065, + "language_loss": 0.87695611, + "learning_rate": 0.0002981317065770839, + "loss": 0.88739967, + "num_input_tokens_seen": 278262176, + "router_z_loss_mlp": 0.40673828, + "step": 3344, + "time_per_iteration": 3.0393459796905518 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104227, + "balance_loss_mlp": 1.00147617, + "epoch": 0.643516737206618, + "flos": 584113228032.0, + "grad_norm": 0.0395181937617663, + "language_loss": 0.81428736, + "learning_rate": 0.00029784672342520493, + "loss": 0.82471007, + "num_input_tokens_seen": 278328816, + "router_z_loss_mlp": 0.40795898, + "step": 3345, + "time_per_iteration": 2.6979730129241943 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045514, + "balance_loss_mlp": 1.00479162, + "epoch": 0.6437091188918815, + "flos": 519751472640.0, + "grad_norm": 0.07302138379312399, + "language_loss": 0.8401407, + "learning_rate": 0.00029756181875618834, + "loss": 0.85059583, + "num_input_tokens_seen": 278395824, + "router_z_loss_mlp": 0.40722656, + "step": 3346, + "time_per_iteration": 2.609215497970581 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046536, + "balance_loss_mlp": 1.00588584, + "epoch": 0.643901500577145, + "flos": 385787529984.0, + "grad_norm": 0.039174224295971255, + "language_loss": 0.83988988, + "learning_rate": 0.0002972769926806439, + "loss": 0.85035521, + "num_input_tokens_seen": 278457696, + "router_z_loss_mlp": 0.40649414, + "step": 3347, + "time_per_iteration": 2.4672152996063232 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044082, + "balance_loss_mlp": 1.00345576, + "epoch": 0.6440938822624086, + "flos": 484698612480.0, + "grad_norm": 0.03574243057214525, + "language_loss": 0.88977337, + "learning_rate": 0.0002969922453091508, + "loss": 0.9002142, + "num_input_tokens_seen": 278526992, + "router_z_loss_mlp": 0.40625, + "step": 3348, + "time_per_iteration": 2.615544557571411 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044846, + "balance_loss_mlp": 1.00414753, + "epoch": 0.6442862639476722, + "flos": 541638607872.0, + "grad_norm": 0.030177655617681567, + "language_loss": 0.85437477, + "learning_rate": 0.00029670757675225777, + "loss": 0.86482322, + "num_input_tokens_seen": 278601120, + "router_z_loss_mlp": 0.40698242, + "step": 3349, + "time_per_iteration": 2.7615771293640137 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047798, + "balance_loss_mlp": 1.0071243, + "epoch": 0.6444786456329358, + "flos": 527959634688.0, + "grad_norm": 0.036762953036999044, + "language_loss": 0.79762578, + "learning_rate": 0.0002964229871204831, + "loss": 0.8081038, + "num_input_tokens_seen": 278668208, + "router_z_loss_mlp": 0.40673828, + "step": 3350, + "time_per_iteration": 2.6479439735412598 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048493, + "balance_loss_mlp": 1.00781858, + "epoch": 0.6446710273181993, + "flos": 699162623232.0, + "grad_norm": 0.0356496056156774, + "language_loss": 0.84474576, + "learning_rate": 0.00029613847652431403, + "loss": 0.85523063, + "num_input_tokens_seen": 278742832, + "router_z_loss_mlp": 0.40673828, + "step": 3351, + "time_per_iteration": 2.852724313735962 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045888, + "balance_loss_mlp": 1.00514281, + "epoch": 0.6448634090034628, + "flos": 626300110080.0, + "grad_norm": 0.031569039076812924, + "language_loss": 0.79828554, + "learning_rate": 0.0002958540450742078, + "loss": 0.80874443, + "num_input_tokens_seen": 278829744, + "router_z_loss_mlp": 0.4074707, + "step": 3352, + "time_per_iteration": 2.943434238433838 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104848, + "balance_loss_mlp": 1.0077343, + "epoch": 0.6450557906887264, + "flos": 602166518016.0, + "grad_norm": 0.03244355782647549, + "language_loss": 0.7780689, + "learning_rate": 0.0002955696928805901, + "loss": 0.78855366, + "num_input_tokens_seen": 278908592, + "router_z_loss_mlp": 0.4074707, + "step": 3353, + "time_per_iteration": 2.9107890129089355 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046337, + "balance_loss_mlp": 1.0057348, + "epoch": 0.64524817237399, + "flos": 647385292032.0, + "grad_norm": 0.03305835241833302, + "language_loss": 0.86728162, + "learning_rate": 0.0002952854200538563, + "loss": 0.87774503, + "num_input_tokens_seen": 278986960, + "router_z_loss_mlp": 0.40600586, + "step": 3354, + "time_per_iteration": 2.8001787662506104 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104505, + "balance_loss_mlp": 1.00430393, + "epoch": 0.6454405540592536, + "flos": 474367448064.0, + "grad_norm": 0.03406107124883384, + "language_loss": 0.8233161, + "learning_rate": 0.000295001226704371, + "loss": 0.83376658, + "num_input_tokens_seen": 279054896, + "router_z_loss_mlp": 0.4074707, + "step": 3355, + "time_per_iteration": 2.6213538646698 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044402, + "balance_loss_mlp": 1.00372756, + "epoch": 0.6456329357445171, + "flos": 613020657408.0, + "grad_norm": 0.03542934708236725, + "language_loss": 0.82853353, + "learning_rate": 0.00029471711294246783, + "loss": 0.83897758, + "num_input_tokens_seen": 279126816, + "router_z_loss_mlp": 0.40673828, + "step": 3356, + "time_per_iteration": 2.790909767150879 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044152, + "balance_loss_mlp": 1.00362051, + "epoch": 0.6458253174297807, + "flos": 732932351232.0, + "grad_norm": 0.03702752169183614, + "language_loss": 0.82778573, + "learning_rate": 0.0002944330788784494, + "loss": 0.83822721, + "num_input_tokens_seen": 279197552, + "router_z_loss_mlp": 0.40527344, + "step": 3357, + "time_per_iteration": 2.8837075233459473 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044846, + "balance_loss_mlp": 1.00424361, + "epoch": 0.6460176991150443, + "flos": 571555048704.0, + "grad_norm": 0.04139380130769849, + "language_loss": 0.84656543, + "learning_rate": 0.00029414912462258786, + "loss": 0.85701388, + "num_input_tokens_seen": 279275440, + "router_z_loss_mlp": 0.40600586, + "step": 3358, + "time_per_iteration": 2.8205137252807617 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046066, + "balance_loss_mlp": 1.00543988, + "epoch": 0.6462100808003078, + "flos": 584243485440.0, + "grad_norm": 0.03729295118772339, + "language_loss": 0.81916165, + "learning_rate": 0.00029386525028512366, + "loss": 0.82962239, + "num_input_tokens_seen": 279349168, + "router_z_loss_mlp": 0.40625, + "step": 3359, + "time_per_iteration": 2.7342734336853027 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044545, + "balance_loss_mlp": 1.00391877, + "epoch": 0.6464024624855714, + "flos": 485011617024.0, + "grad_norm": 0.03542298422939795, + "language_loss": 0.87396795, + "learning_rate": 0.0002935814559762666, + "loss": 0.88441336, + "num_input_tokens_seen": 279427600, + "router_z_loss_mlp": 0.40625, + "step": 3360, + "time_per_iteration": 2.7663137912750244 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044184, + "balance_loss_mlp": 1.00362873, + "epoch": 0.6465948441708349, + "flos": 528843246336.0, + "grad_norm": 0.034215531166731795, + "language_loss": 0.80432177, + "learning_rate": 0.0002932977418061957, + "loss": 0.81476361, + "num_input_tokens_seen": 279496608, + "router_z_loss_mlp": 0.40551758, + "step": 3361, + "time_per_iteration": 2.680459976196289 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043549, + "balance_loss_mlp": 1.00299382, + "epoch": 0.6467872258560985, + "flos": 670626524160.0, + "grad_norm": 0.03987324070915456, + "language_loss": 0.81433517, + "learning_rate": 0.00029301410788505833, + "loss": 0.82477069, + "num_input_tokens_seen": 279568448, + "router_z_loss_mlp": 0.40551758, + "step": 3362, + "time_per_iteration": 2.772834539413452 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042359, + "balance_loss_mlp": 1.00178003, + "epoch": 0.6469796075413621, + "flos": 433040845056.0, + "grad_norm": 0.046274531894689615, + "language_loss": 0.81467456, + "learning_rate": 0.00029273055432297126, + "loss": 0.82509816, + "num_input_tokens_seen": 279631952, + "router_z_loss_mlp": 0.40576172, + "step": 3363, + "time_per_iteration": 2.49839448928833 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042492, + "balance_loss_mlp": 1.00188959, + "epoch": 0.6471719892266257, + "flos": 805102748160.0, + "grad_norm": 0.03834251982821679, + "language_loss": 0.81200004, + "learning_rate": 0.00029244708123001917, + "loss": 0.82242495, + "num_input_tokens_seen": 279706880, + "router_z_loss_mlp": 0.40600586, + "step": 3364, + "time_per_iteration": 2.968705177307129 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042662, + "balance_loss_mlp": 1.00215495, + "epoch": 0.6473643709118891, + "flos": 578349821184.0, + "grad_norm": 0.036932041933641975, + "language_loss": 0.84809864, + "learning_rate": 0.0002921636887162565, + "loss": 0.85852528, + "num_input_tokens_seen": 279778864, + "router_z_loss_mlp": 0.4050293, + "step": 3365, + "time_per_iteration": 2.7454428672790527 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044497, + "balance_loss_mlp": 1.00398982, + "epoch": 0.6475567525971527, + "flos": 762788520960.0, + "grad_norm": 0.046091211557592264, + "language_loss": 0.8445828, + "learning_rate": 0.00029188037689170595, + "loss": 0.85502779, + "num_input_tokens_seen": 279853328, + "router_z_loss_mlp": 0.4050293, + "step": 3366, + "time_per_iteration": 2.9878523349761963 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043297, + "balance_loss_mlp": 1.00274241, + "epoch": 0.6477491342824163, + "flos": 844501734912.0, + "grad_norm": 0.04252046587739173, + "language_loss": 0.84425056, + "learning_rate": 0.0002915971458663586, + "loss": 0.85468358, + "num_input_tokens_seen": 279928464, + "router_z_loss_mlp": 0.40551758, + "step": 3367, + "time_per_iteration": 3.052515745162964 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050585, + "balance_loss_mlp": 1.01003003, + "epoch": 0.6479415159676799, + "flos": 886382415360.0, + "grad_norm": 0.03864645902049365, + "language_loss": 0.82315862, + "learning_rate": 0.00029131399575017494, + "loss": 0.83366442, + "num_input_tokens_seen": 280015680, + "router_z_loss_mlp": 0.40551758, + "step": 3368, + "time_per_iteration": 3.1613588333129883 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050945, + "balance_loss_mlp": 1.01034212, + "epoch": 0.6481338976529435, + "flos": 616724245248.0, + "grad_norm": 0.06720988527624061, + "language_loss": 0.86632174, + "learning_rate": 0.0002910309266530836, + "loss": 0.87683117, + "num_input_tokens_seen": 280093904, + "router_z_loss_mlp": 0.40600586, + "step": 3369, + "time_per_iteration": 2.800647497177124 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051276, + "balance_loss_mlp": 1.01067364, + "epoch": 0.648326279338207, + "flos": 511020335616.0, + "grad_norm": 0.03423893349875194, + "language_loss": 0.85872662, + "learning_rate": 0.0002907479386849814, + "loss": 0.86923945, + "num_input_tokens_seen": 280161584, + "router_z_loss_mlp": 0.40600586, + "step": 3370, + "time_per_iteration": 2.6336069107055664 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105095, + "balance_loss_mlp": 1.0103476, + "epoch": 0.6485186610234706, + "flos": 703869386496.0, + "grad_norm": 0.03204560465373447, + "language_loss": 0.80689716, + "learning_rate": 0.0002904650319557339, + "loss": 0.81740665, + "num_input_tokens_seen": 280248016, + "router_z_loss_mlp": 0.40600586, + "step": 3371, + "time_per_iteration": 2.9737660884857178 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054287, + "balance_loss_mlp": 1.01349366, + "epoch": 0.6487110427087341, + "flos": 561746859264.0, + "grad_norm": 0.039912158099113866, + "language_loss": 0.81825972, + "learning_rate": 0.0002901822065751758, + "loss": 0.82880259, + "num_input_tokens_seen": 280319024, + "router_z_loss_mlp": 0.40795898, + "step": 3372, + "time_per_iteration": 2.678905487060547 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054182, + "balance_loss_mlp": 1.01341212, + "epoch": 0.6489034243939977, + "flos": 681302774016.0, + "grad_norm": 0.03214296467255679, + "language_loss": 0.86033392, + "learning_rate": 0.0002898994626531093, + "loss": 0.87087572, + "num_input_tokens_seen": 280393200, + "router_z_loss_mlp": 0.40771484, + "step": 3373, + "time_per_iteration": 2.9144790172576904 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047579, + "balance_loss_mlp": 1.00688112, + "epoch": 0.6490958060792612, + "flos": 475372568832.0, + "grad_norm": 0.03458153211721296, + "language_loss": 0.88523054, + "learning_rate": 0.00028961680029930526, + "loss": 0.8957063, + "num_input_tokens_seen": 280456944, + "router_z_loss_mlp": 0.40698242, + "step": 3374, + "time_per_iteration": 2.5657663345336914 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048477, + "balance_loss_mlp": 1.00794625, + "epoch": 0.6492881877645248, + "flos": 590003001600.0, + "grad_norm": 0.03430965952422358, + "language_loss": 0.77826953, + "learning_rate": 0.00028933421962350317, + "loss": 0.78875428, + "num_input_tokens_seen": 280534352, + "router_z_loss_mlp": 0.40527344, + "step": 3375, + "time_per_iteration": 2.782069683074951 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053456, + "balance_loss_mlp": 1.0128299, + "epoch": 0.6494805694497884, + "flos": 643588385280.0, + "grad_norm": 0.03575939394791191, + "language_loss": 0.84478199, + "learning_rate": 0.0002890517207354104, + "loss": 0.85531658, + "num_input_tokens_seen": 280608912, + "router_z_loss_mlp": 0.40625, + "step": 3376, + "time_per_iteration": 2.837724447250366 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047454, + "balance_loss_mlp": 1.00689936, + "epoch": 0.649672951135052, + "flos": 532837484544.0, + "grad_norm": 0.034227306744160566, + "language_loss": 0.82481575, + "learning_rate": 0.0002887693037447029, + "loss": 0.83529025, + "num_input_tokens_seen": 280678848, + "router_z_loss_mlp": 0.40551758, + "step": 3377, + "time_per_iteration": 2.579442262649536 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104723, + "balance_loss_mlp": 1.00662696, + "epoch": 0.6498653328203156, + "flos": 548446019328.0, + "grad_norm": 0.03719565127882316, + "language_loss": 0.82554042, + "learning_rate": 0.00028848696876102443, + "loss": 0.83601272, + "num_input_tokens_seen": 280750224, + "router_z_loss_mlp": 0.40600586, + "step": 3378, + "time_per_iteration": 2.6242425441741943 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047252, + "balance_loss_mlp": 1.00650632, + "epoch": 0.650057714505579, + "flos": 463161420288.0, + "grad_norm": 0.037917560954429594, + "language_loss": 0.8430717, + "learning_rate": 0.00028820471589398723, + "loss": 0.85354424, + "num_input_tokens_seen": 280817488, + "router_z_loss_mlp": 0.4074707, + "step": 3379, + "time_per_iteration": 2.5716495513916016 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046426, + "balance_loss_mlp": 1.00565684, + "epoch": 0.6502500961908426, + "flos": 511241966592.0, + "grad_norm": 0.04232947369873583, + "language_loss": 0.78428495, + "learning_rate": 0.00028792254525317196, + "loss": 0.79474926, + "num_input_tokens_seen": 280887440, + "router_z_loss_mlp": 0.40771484, + "step": 3380, + "time_per_iteration": 2.6657466888427734 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104445, + "balance_loss_mlp": 1.00377584, + "epoch": 0.6504424778761062, + "flos": 580911227904.0, + "grad_norm": 0.0355389042104645, + "language_loss": 0.8194313, + "learning_rate": 0.00028764045694812645, + "loss": 0.82987577, + "num_input_tokens_seen": 280959072, + "router_z_loss_mlp": 0.40673828, + "step": 3381, + "time_per_iteration": 2.75962233543396 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047874, + "balance_loss_mlp": 1.00727105, + "epoch": 0.6506348595613698, + "flos": 520467888384.0, + "grad_norm": 0.04062665752895993, + "language_loss": 0.76926279, + "learning_rate": 0.0002873584510883671, + "loss": 0.77974153, + "num_input_tokens_seen": 281025376, + "router_z_loss_mlp": 0.40600586, + "step": 3382, + "time_per_iteration": 2.5889906883239746 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049006, + "balance_loss_mlp": 1.00837946, + "epoch": 0.6508272412466333, + "flos": 511363475712.0, + "grad_norm": 0.029998580027972052, + "language_loss": 0.86699784, + "learning_rate": 0.0002870765277833788, + "loss": 0.8774879, + "num_input_tokens_seen": 281097616, + "router_z_loss_mlp": 0.40625, + "step": 3383, + "time_per_iteration": 2.6930124759674072 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049139, + "balance_loss_mlp": 1.00863218, + "epoch": 0.6510196229318969, + "flos": 626805588480.0, + "grad_norm": 0.03382855215234118, + "language_loss": 0.80910194, + "learning_rate": 0.00028679468714261347, + "loss": 0.81959337, + "num_input_tokens_seen": 281170192, + "router_z_loss_mlp": 0.4050293, + "step": 3384, + "time_per_iteration": 2.793992280960083 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048064, + "balance_loss_mlp": 1.00750864, + "epoch": 0.6512120046171604, + "flos": 475670022144.0, + "grad_norm": 0.034347459077756264, + "language_loss": 0.77632761, + "learning_rate": 0.0002865129292754918, + "loss": 0.78680825, + "num_input_tokens_seen": 281238832, + "router_z_loss_mlp": 0.40551758, + "step": 3385, + "time_per_iteration": 2.5745677947998047 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051635, + "balance_loss_mlp": 1.01115131, + "epoch": 0.651404386302424, + "flos": 553031273472.0, + "grad_norm": 0.0319561697529533, + "language_loss": 0.82687205, + "learning_rate": 0.00028623125429140105, + "loss": 0.8373884, + "num_input_tokens_seen": 281319472, + "router_z_loss_mlp": 0.40478516, + "step": 3386, + "time_per_iteration": 2.8197057247161865 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049254, + "balance_loss_mlp": 1.00874698, + "epoch": 0.6515967679876876, + "flos": 524375610624.0, + "grad_norm": 0.03843989341560043, + "language_loss": 0.87771493, + "learning_rate": 0.00028594966229969785, + "loss": 0.8882075, + "num_input_tokens_seen": 281391168, + "router_z_loss_mlp": 0.4050293, + "step": 3387, + "time_per_iteration": 2.6713032722473145 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049331, + "balance_loss_mlp": 1.00899053, + "epoch": 0.6517891496729511, + "flos": 575017563648.0, + "grad_norm": 0.03692798161206562, + "language_loss": 0.8182978, + "learning_rate": 0.00028566815340970577, + "loss": 0.82879114, + "num_input_tokens_seen": 281465664, + "router_z_loss_mlp": 0.40332031, + "step": 3388, + "time_per_iteration": 2.7321841716766357 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048844, + "balance_loss_mlp": 1.0084554, + "epoch": 0.6519815313582147, + "flos": 556990518528.0, + "grad_norm": 0.03423866481728588, + "language_loss": 0.81470537, + "learning_rate": 0.0002853867277307162, + "loss": 0.82519382, + "num_input_tokens_seen": 281532928, + "router_z_loss_mlp": 0.40380859, + "step": 3389, + "time_per_iteration": 2.7031924724578857 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049788, + "balance_loss_mlp": 1.00937581, + "epoch": 0.6521739130434783, + "flos": 481522857216.0, + "grad_norm": 0.03513339122298917, + "language_loss": 0.82942468, + "learning_rate": 0.00028510538537198824, + "loss": 0.83992255, + "num_input_tokens_seen": 281601680, + "router_z_loss_mlp": 0.40405273, + "step": 3390, + "time_per_iteration": 2.703963279724121 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050398, + "balance_loss_mlp": 1.00993848, + "epoch": 0.6523662947287419, + "flos": 667021112832.0, + "grad_norm": 0.03209400617836455, + "language_loss": 0.86939168, + "learning_rate": 0.00028482412644274867, + "loss": 0.87989569, + "num_input_tokens_seen": 281679488, + "router_z_loss_mlp": 0.40454102, + "step": 3391, + "time_per_iteration": 2.9381484985351562 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049716, + "balance_loss_mlp": 1.00920916, + "epoch": 0.6525586764140053, + "flos": 549702906624.0, + "grad_norm": 0.03739783573884853, + "language_loss": 0.75139832, + "learning_rate": 0.00028454295105219207, + "loss": 0.76189548, + "num_input_tokens_seen": 281751056, + "router_z_loss_mlp": 0.4050293, + "step": 3392, + "time_per_iteration": 2.658132314682007 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047552, + "balance_loss_mlp": 1.00706887, + "epoch": 0.6527510580992689, + "flos": 804391190016.0, + "grad_norm": 0.02478431190679109, + "language_loss": 0.79875654, + "learning_rate": 0.0002842618593094802, + "loss": 0.80923212, + "num_input_tokens_seen": 281841008, + "router_z_loss_mlp": 0.40478516, + "step": 3393, + "time_per_iteration": 3.1278936862945557 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046198, + "balance_loss_mlp": 1.00571501, + "epoch": 0.6529434397845325, + "flos": 672376250880.0, + "grad_norm": 0.04113995840272075, + "language_loss": 0.80790162, + "learning_rate": 0.00028398085132374243, + "loss": 0.81836367, + "num_input_tokens_seen": 281908016, + "router_z_loss_mlp": 0.40478516, + "step": 3394, + "time_per_iteration": 2.8653299808502197 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046221, + "balance_loss_mlp": 1.00571322, + "epoch": 0.6531358214697961, + "flos": 829876933632.0, + "grad_norm": 0.032703635981260123, + "language_loss": 0.85031712, + "learning_rate": 0.0002836999272040761, + "loss": 0.86077929, + "num_input_tokens_seen": 281989072, + "router_z_loss_mlp": 0.4050293, + "step": 3395, + "time_per_iteration": 3.131331205368042 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050486, + "balance_loss_mlp": 1.01005006, + "epoch": 0.6533282031550597, + "flos": 488393452032.0, + "grad_norm": 0.04317230929037854, + "language_loss": 0.84511197, + "learning_rate": 0.00028341908705954575, + "loss": 0.85561681, + "num_input_tokens_seen": 282053152, + "router_z_loss_mlp": 0.40429688, + "step": 3396, + "time_per_iteration": 2.5415916442871094 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048904, + "balance_loss_mlp": 1.0094223, + "epoch": 0.6535205848403232, + "flos": 1561105233408.0, + "grad_norm": 0.006364223174853702, + "language_loss": 0.81761813, + "learning_rate": 0.00028313833099918265, + "loss": 0.82810712, + "num_input_tokens_seen": 282283984, + "router_z_loss_mlp": 0.39453125, + "step": 3397, + "time_per_iteration": 4.924402236938477 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047528, + "balance_loss_mlp": 1.0069257, + "epoch": 0.6537129665255867, + "flos": 494704133376.0, + "grad_norm": 0.03394309019693363, + "language_loss": 0.78847253, + "learning_rate": 0.00028285765913198604, + "loss": 0.79894781, + "num_input_tokens_seen": 282353008, + "router_z_loss_mlp": 0.40600586, + "step": 3398, + "time_per_iteration": 2.595367193222046 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046629, + "balance_loss_mlp": 1.00595522, + "epoch": 0.6539053482108503, + "flos": 606143259648.0, + "grad_norm": 0.03316024353093433, + "language_loss": 0.82683516, + "learning_rate": 0.0002825770715669227, + "loss": 0.83730143, + "num_input_tokens_seen": 282427648, + "router_z_loss_mlp": 0.40673828, + "step": 3399, + "time_per_iteration": 2.7097129821777344 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048676, + "balance_loss_mlp": 1.00807345, + "epoch": 0.6540977298961139, + "flos": 578881544448.0, + "grad_norm": 0.0428136910892252, + "language_loss": 0.81872654, + "learning_rate": 0.00028229656841292634, + "loss": 0.82921332, + "num_input_tokens_seen": 282502128, + "router_z_loss_mlp": 0.40600586, + "step": 3400, + "time_per_iteration": 2.6833486557006836 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045032, + "balance_loss_mlp": 1.00442982, + "epoch": 0.6542901115813774, + "flos": 512770062336.0, + "grad_norm": 0.04250142071298369, + "language_loss": 0.76713872, + "learning_rate": 0.0002820161497788979, + "loss": 0.77758902, + "num_input_tokens_seen": 282569360, + "router_z_loss_mlp": 0.40600586, + "step": 3401, + "time_per_iteration": 2.626732349395752 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048894, + "balance_loss_mlp": 1.00838673, + "epoch": 0.654482493266641, + "flos": 626675331072.0, + "grad_norm": 0.03960445373110503, + "language_loss": 0.87829405, + "learning_rate": 0.00028173581577370545, + "loss": 0.88878298, + "num_input_tokens_seen": 282645472, + "router_z_loss_mlp": 0.4050293, + "step": 3402, + "time_per_iteration": 2.7741096019744873 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048023, + "balance_loss_mlp": 1.00753999, + "epoch": 0.6546748749519046, + "flos": 525063836160.0, + "grad_norm": 0.03167040591829995, + "language_loss": 0.79177642, + "learning_rate": 0.0002814555665061844, + "loss": 0.80225664, + "num_input_tokens_seen": 282717568, + "router_z_loss_mlp": 0.40478516, + "step": 3403, + "time_per_iteration": 2.664350986480713 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047128, + "balance_loss_mlp": 1.00664401, + "epoch": 0.6548672566371682, + "flos": 480274718208.0, + "grad_norm": 0.036729511728986385, + "language_loss": 0.78224975, + "learning_rate": 0.00028117540208513715, + "loss": 0.79272103, + "num_input_tokens_seen": 282791408, + "router_z_loss_mlp": 0.40478516, + "step": 3404, + "time_per_iteration": 2.6802027225494385 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043621, + "balance_loss_mlp": 1.00306582, + "epoch": 0.6550596383224317, + "flos": 617136404736.0, + "grad_norm": 0.034100585633273374, + "language_loss": 0.85354125, + "learning_rate": 0.00028089532261933313, + "loss": 0.86397743, + "num_input_tokens_seen": 282862992, + "router_z_loss_mlp": 0.40551758, + "step": 3405, + "time_per_iteration": 2.7186086177825928 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046281, + "balance_loss_mlp": 1.00567865, + "epoch": 0.6552520200076952, + "flos": 489808786944.0, + "grad_norm": 0.041360786835332355, + "language_loss": 0.86205178, + "learning_rate": 0.0002806153282175087, + "loss": 0.87251461, + "num_input_tokens_seen": 282930448, + "router_z_loss_mlp": 0.40600586, + "step": 3406, + "time_per_iteration": 2.5789847373962402 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046471, + "balance_loss_mlp": 1.00584447, + "epoch": 0.6554444016929588, + "flos": 688859649024.0, + "grad_norm": 0.034986799312927766, + "language_loss": 0.8358103, + "learning_rate": 0.0002803354189883679, + "loss": 0.84627509, + "num_input_tokens_seen": 283010864, + "router_z_loss_mlp": 0.40625, + "step": 3407, + "time_per_iteration": 2.837360382080078 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050644, + "balance_loss_mlp": 1.01023173, + "epoch": 0.6556367833782224, + "flos": 544171824384.0, + "grad_norm": 0.032399307772020214, + "language_loss": 0.86254793, + "learning_rate": 0.00028005559504058053, + "loss": 0.87305439, + "num_input_tokens_seen": 283082240, + "router_z_loss_mlp": 0.40405273, + "step": 3408, + "time_per_iteration": 2.7328412532806396 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047247, + "balance_loss_mlp": 1.00673985, + "epoch": 0.655829165063486, + "flos": 674731577856.0, + "grad_norm": 0.033393765710147245, + "language_loss": 0.77549541, + "learning_rate": 0.0002797758564827838, + "loss": 0.78596783, + "num_input_tokens_seen": 283156656, + "router_z_loss_mlp": 0.4050293, + "step": 3409, + "time_per_iteration": 2.8037917613983154 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048239, + "balance_loss_mlp": 1.00761223, + "epoch": 0.6560215467487496, + "flos": 532837484544.0, + "grad_norm": 0.037569861592142095, + "language_loss": 0.83625042, + "learning_rate": 0.0002794962034235824, + "loss": 0.84673285, + "num_input_tokens_seen": 283223584, + "router_z_loss_mlp": 0.40625, + "step": 3410, + "time_per_iteration": 2.660435676574707 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048327, + "balance_loss_mlp": 1.00789142, + "epoch": 0.656213928434013, + "flos": 592460395776.0, + "grad_norm": 0.035927702009128905, + "language_loss": 0.75148469, + "learning_rate": 0.00027921663597154695, + "loss": 0.76196802, + "num_input_tokens_seen": 283297680, + "router_z_loss_mlp": 0.40429688, + "step": 3411, + "time_per_iteration": 2.7516040802001953 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050596, + "balance_loss_mlp": 1.01015997, + "epoch": 0.6564063101192766, + "flos": 416678956032.0, + "grad_norm": 0.07901014031845595, + "language_loss": 0.81708795, + "learning_rate": 0.00027893715423521525, + "loss": 0.82759392, + "num_input_tokens_seen": 283359744, + "router_z_loss_mlp": 0.40429688, + "step": 3412, + "time_per_iteration": 2.4704418182373047 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045819, + "balance_loss_mlp": 1.00547826, + "epoch": 0.6565986918045402, + "flos": 454271835648.0, + "grad_norm": 0.03411050033810387, + "language_loss": 0.84291053, + "learning_rate": 0.00027865775832309163, + "loss": 0.85336864, + "num_input_tokens_seen": 283430688, + "router_z_loss_mlp": 0.40332031, + "step": 3413, + "time_per_iteration": 2.6385068893432617 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048788, + "balance_loss_mlp": 1.00854325, + "epoch": 0.6567910734898038, + "flos": 548799853056.0, + "grad_norm": 0.036374593364126635, + "language_loss": 0.86917508, + "learning_rate": 0.00027837844834364733, + "loss": 0.87966299, + "num_input_tokens_seen": 283498048, + "router_z_loss_mlp": 0.40234375, + "step": 3414, + "time_per_iteration": 2.6444642543792725 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048805, + "balance_loss_mlp": 1.00860763, + "epoch": 0.6569834551750673, + "flos": 656765770752.0, + "grad_norm": 0.03225713211671443, + "language_loss": 0.87055808, + "learning_rate": 0.00027809922440532, + "loss": 0.88104612, + "num_input_tokens_seen": 283573040, + "router_z_loss_mlp": 0.40185547, + "step": 3415, + "time_per_iteration": 2.847615957260132 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051486, + "balance_loss_mlp": 1.01114511, + "epoch": 0.6571758368603309, + "flos": 540811376640.0, + "grad_norm": 0.035988230545184526, + "language_loss": 0.81540048, + "learning_rate": 0.00027782008661651406, + "loss": 0.82591534, + "num_input_tokens_seen": 283651696, + "router_z_loss_mlp": 0.40332031, + "step": 3416, + "time_per_iteration": 2.767226457595825 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049379, + "balance_loss_mlp": 1.00906193, + "epoch": 0.6573682185455945, + "flos": 498379531008.0, + "grad_norm": 0.03451446989535273, + "language_loss": 0.87885237, + "learning_rate": 0.00027754103508560013, + "loss": 0.88934618, + "num_input_tokens_seen": 283721824, + "router_z_loss_mlp": 0.40307617, + "step": 3417, + "time_per_iteration": 2.6277449131011963 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045088, + "balance_loss_mlp": 1.00481939, + "epoch": 0.657560600230858, + "flos": 448353871872.0, + "grad_norm": 0.03502749433462501, + "language_loss": 0.8376503, + "learning_rate": 0.0002772620699209163, + "loss": 0.8481012, + "num_input_tokens_seen": 283786960, + "router_z_loss_mlp": 0.40258789, + "step": 3418, + "time_per_iteration": 2.603851318359375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01041901, + "balance_loss_mlp": 1.00168002, + "epoch": 0.6577529819161216, + "flos": 482920695552.0, + "grad_norm": 0.033924516533442195, + "language_loss": 0.80503142, + "learning_rate": 0.0002769831912307658, + "loss": 0.81545043, + "num_input_tokens_seen": 283853808, + "router_z_loss_mlp": 0.40209961, + "step": 3419, + "time_per_iteration": 2.567737340927124 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010435, + "balance_loss_mlp": 1.00313556, + "epoch": 0.6579453636013851, + "flos": 531860553984.0, + "grad_norm": 0.04823961507786352, + "language_loss": 0.80877286, + "learning_rate": 0.00027670439912341917, + "loss": 0.81920785, + "num_input_tokens_seen": 283920960, + "router_z_loss_mlp": 0.40356445, + "step": 3420, + "time_per_iteration": 2.639587163925171 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043978, + "balance_loss_mlp": 1.00354195, + "epoch": 0.6581377452866487, + "flos": 629243540736.0, + "grad_norm": 0.032258458979824364, + "language_loss": 0.84138131, + "learning_rate": 0.0002764256937071129, + "loss": 0.85182106, + "num_input_tokens_seen": 283992416, + "router_z_loss_mlp": 0.40429688, + "step": 3421, + "time_per_iteration": 2.793288469314575 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043983, + "balance_loss_mlp": 1.00347602, + "epoch": 0.6583301269719123, + "flos": 549674716416.0, + "grad_norm": 0.033092634832732, + "language_loss": 0.87840796, + "learning_rate": 0.00027614707509005036, + "loss": 0.88884783, + "num_input_tokens_seen": 284061760, + "router_z_loss_mlp": 0.4050293, + "step": 3422, + "time_per_iteration": 2.672691822052002 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044325, + "balance_loss_mlp": 1.0038892, + "epoch": 0.6585225086571759, + "flos": 428397265152.0, + "grad_norm": 0.041046610709459384, + "language_loss": 0.7990576, + "learning_rate": 0.0002758685433804008, + "loss": 0.80950087, + "num_input_tokens_seen": 284124848, + "router_z_loss_mlp": 0.40429688, + "step": 3423, + "time_per_iteration": 2.5028507709503174 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104497, + "balance_loss_mlp": 1.00448632, + "epoch": 0.6587148903424394, + "flos": 861050261760.0, + "grad_norm": 0.040364444047634805, + "language_loss": 0.7997486, + "learning_rate": 0.00027559009868630005, + "loss": 0.81019825, + "num_input_tokens_seen": 284206272, + "router_z_loss_mlp": 0.40478516, + "step": 3424, + "time_per_iteration": 3.1220815181732178 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047075, + "balance_loss_mlp": 1.00671124, + "epoch": 0.6589072720277029, + "flos": 807037167360.0, + "grad_norm": 0.05893519085395252, + "language_loss": 0.80930316, + "learning_rate": 0.0002753117411158491, + "loss": 0.81977397, + "num_input_tokens_seen": 284293696, + "router_z_loss_mlp": 0.40356445, + "step": 3425, + "time_per_iteration": 3.0889339447021484 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047077, + "balance_loss_mlp": 1.00676, + "epoch": 0.6590996537129665, + "flos": 549674716416.0, + "grad_norm": 0.03274381739097603, + "language_loss": 0.90609264, + "learning_rate": 0.0002750334707771168, + "loss": 0.91656339, + "num_input_tokens_seen": 284360192, + "router_z_loss_mlp": 0.40307617, + "step": 3426, + "time_per_iteration": 2.6541290283203125 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046695, + "balance_loss_mlp": 1.00647414, + "epoch": 0.6592920353982301, + "flos": 455109760512.0, + "grad_norm": 0.03777224687776173, + "language_loss": 0.81529361, + "learning_rate": 0.0002747552877781369, + "loss": 0.82576048, + "num_input_tokens_seen": 284423680, + "router_z_loss_mlp": 0.40209961, + "step": 3427, + "time_per_iteration": 2.5356411933898926 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048756, + "balance_loss_mlp": 1.00858271, + "epoch": 0.6594844170834937, + "flos": 568261675008.0, + "grad_norm": 0.03735814383850805, + "language_loss": 0.82849789, + "learning_rate": 0.0002744771922269097, + "loss": 0.83898544, + "num_input_tokens_seen": 284495712, + "router_z_loss_mlp": 0.40161133, + "step": 3428, + "time_per_iteration": 2.7781617641448975 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047717, + "balance_loss_mlp": 1.00761461, + "epoch": 0.6596767987687572, + "flos": 1189755878400.0, + "grad_norm": 0.035375644925624505, + "language_loss": 0.82642734, + "learning_rate": 0.0002741991842314015, + "loss": 0.83690447, + "num_input_tokens_seen": 284583440, + "router_z_loss_mlp": 0.40087891, + "step": 3429, + "time_per_iteration": 3.484401226043701 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050876, + "balance_loss_mlp": 1.01070201, + "epoch": 0.6598691804540208, + "flos": 504468581376.0, + "grad_norm": 0.033809257581419436, + "language_loss": 0.86197507, + "learning_rate": 0.0002739212638995445, + "loss": 0.87248385, + "num_input_tokens_seen": 284649168, + "router_z_loss_mlp": 0.40161133, + "step": 3430, + "time_per_iteration": 2.557008743286133 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104672, + "balance_loss_mlp": 1.00654662, + "epoch": 0.6600615621392844, + "flos": 532399080192.0, + "grad_norm": 0.03652926945024374, + "language_loss": 0.83438206, + "learning_rate": 0.00027364343133923696, + "loss": 0.84484929, + "num_input_tokens_seen": 284723136, + "router_z_loss_mlp": 0.40161133, + "step": 3431, + "time_per_iteration": 2.662047863006592 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010455, + "balance_loss_mlp": 1.00534999, + "epoch": 0.6602539438245479, + "flos": 566557635072.0, + "grad_norm": 0.03543857868011933, + "language_loss": 0.83350068, + "learning_rate": 0.0002733656866583431, + "loss": 0.84395564, + "num_input_tokens_seen": 284792752, + "router_z_loss_mlp": 0.40136719, + "step": 3432, + "time_per_iteration": 2.676973581314087 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045709, + "balance_loss_mlp": 1.00558269, + "epoch": 0.6604463255098114, + "flos": 858592867584.0, + "grad_norm": 0.037899677341019365, + "language_loss": 0.83285594, + "learning_rate": 0.0002730880299646927, + "loss": 0.8433131, + "num_input_tokens_seen": 284871008, + "router_z_loss_mlp": 0.40112305, + "step": 3433, + "time_per_iteration": 3.0207436084747314 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045936, + "balance_loss_mlp": 1.00585747, + "epoch": 0.660638707195075, + "flos": 675680318208.0, + "grad_norm": 0.03767896728200409, + "language_loss": 0.85914338, + "learning_rate": 0.0002728104613660821, + "loss": 0.8696028, + "num_input_tokens_seen": 284945184, + "router_z_loss_mlp": 0.40063477, + "step": 3434, + "time_per_iteration": 2.847806215286255 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043278, + "balance_loss_mlp": 1.0032711, + "epoch": 0.6608310888803386, + "flos": 890524407552.0, + "grad_norm": 0.03485230588781084, + "language_loss": 0.8359797, + "learning_rate": 0.0002725329809702729, + "loss": 0.84641242, + "num_input_tokens_seen": 285029296, + "router_z_loss_mlp": 0.39990234, + "step": 3435, + "time_per_iteration": 3.1851022243499756 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043009, + "balance_loss_mlp": 1.0028832, + "epoch": 0.6610234705656022, + "flos": 1138108804608.0, + "grad_norm": 0.04206643775716819, + "language_loss": 0.76903141, + "learning_rate": 0.0002722555888849921, + "loss": 0.7794615, + "num_input_tokens_seen": 285124720, + "router_z_loss_mlp": 0.40112305, + "step": 3436, + "time_per_iteration": 3.453571081161499 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044798, + "balance_loss_mlp": 1.00474417, + "epoch": 0.6612158522508658, + "flos": 468959820288.0, + "grad_norm": 0.03417683071505001, + "language_loss": 0.80971491, + "learning_rate": 0.00027197828521793334, + "loss": 0.82016289, + "num_input_tokens_seen": 285191360, + "router_z_loss_mlp": 0.40039062, + "step": 3437, + "time_per_iteration": 2.5737972259521484 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049055, + "balance_loss_mlp": 1.00892961, + "epoch": 0.6614082339361292, + "flos": 572774997504.0, + "grad_norm": 0.03444646564186984, + "language_loss": 0.85238397, + "learning_rate": 0.0002717010700767552, + "loss": 0.86287451, + "num_input_tokens_seen": 285262624, + "router_z_loss_mlp": 0.40112305, + "step": 3438, + "time_per_iteration": 2.6816329956054688 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047082, + "balance_loss_mlp": 1.00700414, + "epoch": 0.6616006156213928, + "flos": 499460474112.0, + "grad_norm": 0.039408018339583364, + "language_loss": 0.7639091, + "learning_rate": 0.00027142394356908226, + "loss": 0.77437991, + "num_input_tokens_seen": 285328512, + "router_z_loss_mlp": 0.40063477, + "step": 3439, + "time_per_iteration": 2.6397507190704346 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01056341, + "balance_loss_mlp": 1.01604831, + "epoch": 0.6617929973066564, + "flos": 603610043136.0, + "grad_norm": 0.03512262783038589, + "language_loss": 0.85516727, + "learning_rate": 0.00027114690580250456, + "loss": 0.8657307, + "num_input_tokens_seen": 285406128, + "router_z_loss_mlp": 0.40283203, + "step": 3440, + "time_per_iteration": 2.8226699829101562 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046554, + "balance_loss_mlp": 1.00607085, + "epoch": 0.66198537899192, + "flos": 523995532032.0, + "grad_norm": 0.03484935524221126, + "language_loss": 0.87502497, + "learning_rate": 0.0002708699568845776, + "loss": 0.88549048, + "num_input_tokens_seen": 285474704, + "router_z_loss_mlp": 0.40478516, + "step": 3441, + "time_per_iteration": 2.666151762008667 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054733, + "balance_loss_mlp": 1.01563263, + "epoch": 0.6621777606771835, + "flos": 1569612794112.0, + "grad_norm": 0.008720086595697616, + "language_loss": 0.79287779, + "learning_rate": 0.00027059309692282265, + "loss": 0.80342519, + "num_input_tokens_seen": 285698704, + "router_z_loss_mlp": 0.390625, + "step": 3442, + "time_per_iteration": 4.902445316314697 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043667, + "balance_loss_mlp": 1.00320721, + "epoch": 0.6623701423624471, + "flos": 527690371584.0, + "grad_norm": 0.04147844177514617, + "language_loss": 0.83753407, + "learning_rate": 0.0002703163260247261, + "loss": 0.84797072, + "num_input_tokens_seen": 285767936, + "router_z_loss_mlp": 0.40454102, + "step": 3443, + "time_per_iteration": 2.6544172763824463 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042135, + "balance_loss_mlp": 1.00157988, + "epoch": 0.6625625240477107, + "flos": 529216521984.0, + "grad_norm": 0.040243971726719965, + "language_loss": 0.82285839, + "learning_rate": 0.0002700396442977399, + "loss": 0.83327973, + "num_input_tokens_seen": 285839456, + "router_z_loss_mlp": 0.40551758, + "step": 3444, + "time_per_iteration": 2.659823179244995 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046923, + "balance_loss_mlp": 1.00648713, + "epoch": 0.6627549057329742, + "flos": 474196361472.0, + "grad_norm": 0.03873462944333031, + "language_loss": 0.84804982, + "learning_rate": 0.0002697630518492817, + "loss": 0.85851908, + "num_input_tokens_seen": 285905904, + "router_z_loss_mlp": 0.40429688, + "step": 3445, + "time_per_iteration": 2.6407060623168945 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042694, + "balance_loss_mlp": 1.00218678, + "epoch": 0.6629472874182378, + "flos": 529012387584.0, + "grad_norm": 0.03365832032426446, + "language_loss": 0.86288029, + "learning_rate": 0.0002694865487867343, + "loss": 0.87330723, + "num_input_tokens_seen": 285975520, + "router_z_loss_mlp": 0.4050293, + "step": 3446, + "time_per_iteration": 2.6234817504882812 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01040416, + "balance_loss_mlp": 0.99986076, + "epoch": 0.6631396691035013, + "flos": 614379611904.0, + "grad_norm": 0.029868994053189296, + "language_loss": 0.85050064, + "learning_rate": 0.0002692101352174453, + "loss": 0.86090481, + "num_input_tokens_seen": 286050320, + "router_z_loss_mlp": 0.40551758, + "step": 3447, + "time_per_iteration": 2.7610418796539307 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054226, + "balance_loss_mlp": 1.01357543, + "epoch": 0.6633320507887649, + "flos": 610434951168.0, + "grad_norm": 0.03566276224507284, + "language_loss": 0.85075617, + "learning_rate": 0.00026893381124872787, + "loss": 0.86129844, + "num_input_tokens_seen": 286120672, + "router_z_loss_mlp": 0.40649414, + "step": 3448, + "time_per_iteration": 2.7092947959899902 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104727, + "balance_loss_mlp": 1.0067625, + "epoch": 0.6635244324740285, + "flos": 751142143488.0, + "grad_norm": 0.03834758690665688, + "language_loss": 0.81510758, + "learning_rate": 0.00026865757698786097, + "loss": 0.82558024, + "num_input_tokens_seen": 286201152, + "router_z_loss_mlp": 0.4050293, + "step": 3449, + "time_per_iteration": 3.0252504348754883 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050032, + "balance_loss_mlp": 1.00952482, + "epoch": 0.6637168141592921, + "flos": 665748674304.0, + "grad_norm": 0.03495621172774381, + "language_loss": 0.82439375, + "learning_rate": 0.000268381432542088, + "loss": 0.83489406, + "num_input_tokens_seen": 286274512, + "router_z_loss_mlp": 0.4050293, + "step": 3450, + "time_per_iteration": 2.847905397415161 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046417, + "balance_loss_mlp": 1.00598156, + "epoch": 0.6639091958445555, + "flos": 607921176576.0, + "grad_norm": 0.03480028422588226, + "language_loss": 0.80330265, + "learning_rate": 0.00026810537801861807, + "loss": 0.8137669, + "num_input_tokens_seen": 286349808, + "router_z_loss_mlp": 0.40429688, + "step": 3451, + "time_per_iteration": 2.8109076023101807 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044982, + "balance_loss_mlp": 1.00442719, + "epoch": 0.6641015775298191, + "flos": 477680263680.0, + "grad_norm": 0.03370448580538907, + "language_loss": 0.81616271, + "learning_rate": 0.0002678294135246243, + "loss": 0.82661253, + "num_input_tokens_seen": 286422912, + "router_z_loss_mlp": 0.40551758, + "step": 3452, + "time_per_iteration": 2.77632999420166 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043861, + "balance_loss_mlp": 1.00337756, + "epoch": 0.6642939592150827, + "flos": 905596361472.0, + "grad_norm": 0.035596990972813804, + "language_loss": 0.87064171, + "learning_rate": 0.0002675535391672463, + "loss": 0.88108027, + "num_input_tokens_seen": 286501072, + "router_z_loss_mlp": 0.40478516, + "step": 3453, + "time_per_iteration": 3.1011788845062256 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046586, + "balance_loss_mlp": 1.00610256, + "epoch": 0.6644863409003463, + "flos": 582938966016.0, + "grad_norm": 0.03233314445792202, + "language_loss": 0.86734712, + "learning_rate": 0.0002672777550535877, + "loss": 0.87781298, + "num_input_tokens_seen": 286580480, + "router_z_loss_mlp": 0.40478516, + "step": 3454, + "time_per_iteration": 2.799320936203003 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047695, + "balance_loss_mlp": 1.00714028, + "epoch": 0.6646787225856099, + "flos": 479970461952.0, + "grad_norm": 0.04849178662998588, + "language_loss": 0.85994661, + "learning_rate": 0.00026700206129071747, + "loss": 0.87042361, + "num_input_tokens_seen": 286646208, + "router_z_loss_mlp": 0.40551758, + "step": 3455, + "time_per_iteration": 2.5544278621673584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044362, + "balance_loss_mlp": 1.00371206, + "epoch": 0.6648711042708734, + "flos": 450828762624.0, + "grad_norm": 0.04059200209413719, + "language_loss": 0.89189559, + "learning_rate": 0.00026672645798566925, + "loss": 0.90233922, + "num_input_tokens_seen": 286710624, + "router_z_loss_mlp": 0.40649414, + "step": 3456, + "time_per_iteration": 2.501304864883423 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047273, + "balance_loss_mlp": 1.00669408, + "epoch": 0.665063485956137, + "flos": 860597273088.0, + "grad_norm": 0.0398485152985426, + "language_loss": 0.7998091, + "learning_rate": 0.00026645094524544225, + "loss": 0.81028181, + "num_input_tokens_seen": 286799472, + "router_z_loss_mlp": 0.40576172, + "step": 3457, + "time_per_iteration": 3.276411294937134 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043384, + "balance_loss_mlp": 1.00266171, + "epoch": 0.6652558676414005, + "flos": 605472530688.0, + "grad_norm": 0.027841742129180558, + "language_loss": 0.75740635, + "learning_rate": 0.00026617552317699945, + "loss": 0.76784027, + "num_input_tokens_seen": 286874752, + "router_z_loss_mlp": 0.40722656, + "step": 3458, + "time_per_iteration": 2.801248550415039 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046474, + "balance_loss_mlp": 1.00591886, + "epoch": 0.6654482493266641, + "flos": 511411107840.0, + "grad_norm": 0.036000642082667296, + "language_loss": 0.87457603, + "learning_rate": 0.0002659001918872693, + "loss": 0.88504076, + "num_input_tokens_seen": 286943312, + "router_z_loss_mlp": 0.40551758, + "step": 3459, + "time_per_iteration": 2.6388814449310303 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050084, + "balance_loss_mlp": 1.00948107, + "epoch": 0.6656406310119277, + "flos": 566661647616.0, + "grad_norm": 0.03405161677383315, + "language_loss": 0.81573474, + "learning_rate": 0.0002656249514831449, + "loss": 0.82623559, + "num_input_tokens_seen": 287010000, + "router_z_loss_mlp": 0.40600586, + "step": 3460, + "time_per_iteration": 2.6583993434906006 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052206, + "balance_loss_mlp": 1.01155555, + "epoch": 0.6658330126971912, + "flos": 1026060187392.0, + "grad_norm": 0.03356522396560915, + "language_loss": 0.87476516, + "learning_rate": 0.00026534980207148416, + "loss": 0.88528717, + "num_input_tokens_seen": 287101456, + "router_z_loss_mlp": 0.40649414, + "step": 3461, + "time_per_iteration": 3.4255144596099854 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050463, + "balance_loss_mlp": 1.00962222, + "epoch": 0.6660253943824548, + "flos": 818234446848.0, + "grad_norm": 0.03543783293435262, + "language_loss": 0.74157602, + "learning_rate": 0.0002650747437591097, + "loss": 0.75208062, + "num_input_tokens_seen": 287182848, + "router_z_loss_mlp": 0.40844727, + "step": 3462, + "time_per_iteration": 2.99372935295105 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048088, + "balance_loss_mlp": 1.00879669, + "epoch": 0.6662177760677184, + "flos": 1499533318656.0, + "grad_norm": 0.007196146037648728, + "language_loss": 0.8187958, + "learning_rate": 0.00026479977665280806, + "loss": 0.82927668, + "num_input_tokens_seen": 287417920, + "router_z_loss_mlp": 0.39257812, + "step": 3463, + "time_per_iteration": 5.021228075027466 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047386, + "balance_loss_mlp": 1.00668836, + "epoch": 0.666410157752982, + "flos": 501108133632.0, + "grad_norm": 0.0343393236578971, + "language_loss": 0.8738476, + "learning_rate": 0.00026452490085933155, + "loss": 0.88432145, + "num_input_tokens_seen": 287483776, + "router_z_loss_mlp": 0.40698242, + "step": 3464, + "time_per_iteration": 2.5860917568206787 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048288, + "balance_loss_mlp": 1.00747108, + "epoch": 0.6666025394382454, + "flos": 482139151104.0, + "grad_norm": 0.04334646456147875, + "language_loss": 0.90236807, + "learning_rate": 0.00026425011648539614, + "loss": 0.91285098, + "num_input_tokens_seen": 287548176, + "router_z_loss_mlp": 0.40820312, + "step": 3465, + "time_per_iteration": 2.5441110134124756 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049485, + "balance_loss_mlp": 1.00864422, + "epoch": 0.666794921123509, + "flos": 547692665088.0, + "grad_norm": 0.03397954120439615, + "language_loss": 0.83244991, + "learning_rate": 0.00026397542363768267, + "loss": 0.84294474, + "num_input_tokens_seen": 287618496, + "router_z_loss_mlp": 0.40844727, + "step": 3466, + "time_per_iteration": 2.74092698097229 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051821, + "balance_loss_mlp": 1.01097989, + "epoch": 0.6669873028087726, + "flos": 472943364864.0, + "grad_norm": 0.036434069598551014, + "language_loss": 0.82217574, + "learning_rate": 0.0002637008224228362, + "loss": 0.83269393, + "num_input_tokens_seen": 287684032, + "router_z_loss_mlp": 0.40844727, + "step": 3467, + "time_per_iteration": 2.5710275173187256 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048674, + "balance_loss_mlp": 1.00785649, + "epoch": 0.6671796844940362, + "flos": 548500454400.0, + "grad_norm": 0.030766968440674072, + "language_loss": 0.8512944, + "learning_rate": 0.00026342631294746653, + "loss": 0.86178112, + "num_input_tokens_seen": 287757680, + "router_z_loss_mlp": 0.40820312, + "step": 3468, + "time_per_iteration": 2.7195847034454346 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045875, + "balance_loss_mlp": 1.00493824, + "epoch": 0.6673720661792998, + "flos": 1072123689216.0, + "grad_norm": 0.03165025767658557, + "language_loss": 0.81300414, + "learning_rate": 0.0002631518953181476, + "loss": 0.8234629, + "num_input_tokens_seen": 287848992, + "router_z_loss_mlp": 0.40942383, + "step": 3469, + "time_per_iteration": 3.4572696685791016 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045563, + "balance_loss_mlp": 1.00588989, + "epoch": 0.6675644478645633, + "flos": 1527113874432.0, + "grad_norm": 0.008139756237930116, + "language_loss": 0.76325285, + "learning_rate": 0.000262877569641418, + "loss": 0.77370852, + "num_input_tokens_seen": 288085680, + "router_z_loss_mlp": 0.39648438, + "step": 3470, + "time_per_iteration": 4.91265869140625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044307, + "balance_loss_mlp": 1.00341797, + "epoch": 0.6677568295498268, + "flos": 580844153856.0, + "grad_norm": 0.03268114077515944, + "language_loss": 0.80885828, + "learning_rate": 0.00026260333602377985, + "loss": 0.81930137, + "num_input_tokens_seen": 288161568, + "router_z_loss_mlp": 0.40893555, + "step": 3471, + "time_per_iteration": 2.7573940753936768 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043519, + "balance_loss_mlp": 1.00277328, + "epoch": 0.6679492112350904, + "flos": 384791157504.0, + "grad_norm": 0.03558012533984873, + "language_loss": 0.87711406, + "learning_rate": 0.0002623291945717007, + "loss": 0.88754922, + "num_input_tokens_seen": 288224032, + "router_z_loss_mlp": 0.4074707, + "step": 3472, + "time_per_iteration": 2.442338466644287 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044342, + "balance_loss_mlp": 1.00364411, + "epoch": 0.668141592920354, + "flos": 1152616954368.0, + "grad_norm": 0.0328139503917561, + "language_loss": 0.84606934, + "learning_rate": 0.00026205514539161175, + "loss": 0.85651278, + "num_input_tokens_seen": 288312912, + "router_z_loss_mlp": 0.40698242, + "step": 3473, + "time_per_iteration": 3.503469705581665 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044285, + "balance_loss_mlp": 1.00353932, + "epoch": 0.6683339746056175, + "flos": 562292188416.0, + "grad_norm": 0.030626159125144124, + "language_loss": 0.84382141, + "learning_rate": 0.00026178118858990773, + "loss": 0.85426426, + "num_input_tokens_seen": 288394224, + "router_z_loss_mlp": 0.4074707, + "step": 3474, + "time_per_iteration": 2.8285627365112305 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043984, + "balance_loss_mlp": 1.00330997, + "epoch": 0.6685263562908811, + "flos": 515329523712.0, + "grad_norm": 0.030456650520625777, + "language_loss": 0.8459208, + "learning_rate": 0.0002615073242729483, + "loss": 0.85636061, + "num_input_tokens_seen": 288462976, + "router_z_loss_mlp": 0.40673828, + "step": 3475, + "time_per_iteration": 2.637474775314331 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047916, + "balance_loss_mlp": 1.00726593, + "epoch": 0.6687187379761447, + "flos": 631002015744.0, + "grad_norm": 0.030827527571606016, + "language_loss": 0.85137111, + "learning_rate": 0.0002612335525470573, + "loss": 0.86185026, + "num_input_tokens_seen": 288542032, + "router_z_loss_mlp": 0.40649414, + "step": 3476, + "time_per_iteration": 2.823110342025757 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048175, + "balance_loss_mlp": 1.00759649, + "epoch": 0.6689111196614083, + "flos": 536688826368.0, + "grad_norm": 0.0342797401257031, + "language_loss": 0.78870076, + "learning_rate": 0.0002609598735185221, + "loss": 0.79918253, + "num_input_tokens_seen": 288610704, + "router_z_loss_mlp": 0.40576172, + "step": 3477, + "time_per_iteration": 2.6825544834136963 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048062, + "balance_loss_mlp": 1.00736415, + "epoch": 0.6691035013466718, + "flos": 604161208320.0, + "grad_norm": 0.031585406138604756, + "language_loss": 0.83722425, + "learning_rate": 0.00026068628729359445, + "loss": 0.84770489, + "num_input_tokens_seen": 288686080, + "router_z_loss_mlp": 0.40698242, + "step": 3478, + "time_per_iteration": 2.77055287361145 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049609, + "balance_loss_mlp": 1.00893426, + "epoch": 0.6692958830319353, + "flos": 634128193536.0, + "grad_norm": 0.03192222919752024, + "language_loss": 0.76639205, + "learning_rate": 0.00026041279397848996, + "loss": 0.77688813, + "num_input_tokens_seen": 288764944, + "router_z_loss_mlp": 0.40673828, + "step": 3479, + "time_per_iteration": 2.8836774826049805 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046574, + "balance_loss_mlp": 1.00585258, + "epoch": 0.6694882647171989, + "flos": 646749556224.0, + "grad_norm": 0.03482378260676791, + "language_loss": 0.83261842, + "learning_rate": 0.00026013939367938797, + "loss": 0.84308422, + "num_input_tokens_seen": 288847856, + "router_z_loss_mlp": 0.40722656, + "step": 3480, + "time_per_iteration": 2.8915905952453613 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044065, + "balance_loss_mlp": 1.00339055, + "epoch": 0.6696806464024625, + "flos": 570762810624.0, + "grad_norm": 0.033098295415039676, + "language_loss": 0.81370211, + "learning_rate": 0.00025986608650243204, + "loss": 0.82414275, + "num_input_tokens_seen": 288929360, + "router_z_loss_mlp": 0.40673828, + "step": 3481, + "time_per_iteration": 2.785128116607666 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01041793, + "balance_loss_mlp": 1.00114262, + "epoch": 0.6698730280877261, + "flos": 623964225024.0, + "grad_norm": 0.029494842151893377, + "language_loss": 0.79968995, + "learning_rate": 0.0002595928725537293, + "loss": 0.81010795, + "num_input_tokens_seen": 289010160, + "router_z_loss_mlp": 0.40649414, + "step": 3482, + "time_per_iteration": 2.862269639968872 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044098, + "balance_loss_mlp": 1.00361502, + "epoch": 0.6700654097729896, + "flos": 503509147392.0, + "grad_norm": 0.04687738924835003, + "language_loss": 0.88447571, + "learning_rate": 0.0002593197519393509, + "loss": 0.89491665, + "num_input_tokens_seen": 289077392, + "router_z_loss_mlp": 0.40478516, + "step": 3483, + "time_per_iteration": 2.5955467224121094 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044048, + "balance_loss_mlp": 1.00356483, + "epoch": 0.6702577914582531, + "flos": 625119045120.0, + "grad_norm": 0.03040614525342857, + "language_loss": 0.79865301, + "learning_rate": 0.00025904672476533165, + "loss": 0.80909348, + "num_input_tokens_seen": 289157248, + "router_z_loss_mlp": 0.40478516, + "step": 3484, + "time_per_iteration": 2.83461594581604 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047488, + "balance_loss_mlp": 1.00695693, + "epoch": 0.6704501731435167, + "flos": 457213320960.0, + "grad_norm": 0.03431199864252877, + "language_loss": 0.83164477, + "learning_rate": 0.0002587737911376704, + "loss": 0.84211963, + "num_input_tokens_seen": 289224864, + "router_z_loss_mlp": 0.40527344, + "step": 3485, + "time_per_iteration": 2.6094586849212646 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043694, + "balance_loss_mlp": 1.00313866, + "epoch": 0.6706425548287803, + "flos": 544258340352.0, + "grad_norm": 0.0329892912769069, + "language_loss": 0.84059811, + "learning_rate": 0.00025850095116232885, + "loss": 0.851035, + "num_input_tokens_seen": 289293488, + "router_z_loss_mlp": 0.40551758, + "step": 3486, + "time_per_iteration": 2.686342477798462 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043806, + "balance_loss_mlp": 1.00332236, + "epoch": 0.6708349365140439, + "flos": 635180946432.0, + "grad_norm": 0.03091711657706004, + "language_loss": 0.78076321, + "learning_rate": 0.000258228204945233, + "loss": 0.79120129, + "num_input_tokens_seen": 289370560, + "router_z_loss_mlp": 0.40478516, + "step": 3487, + "time_per_iteration": 2.9295520782470703 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044942, + "balance_loss_mlp": 1.0044347, + "epoch": 0.6710273181993074, + "flos": 641903787264.0, + "grad_norm": 0.032938145156071165, + "language_loss": 0.85185027, + "learning_rate": 0.00025795555259227254, + "loss": 0.86229968, + "num_input_tokens_seen": 289440096, + "router_z_loss_mlp": 0.4050293, + "step": 3488, + "time_per_iteration": 2.79502534866333 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046195, + "balance_loss_mlp": 1.00561631, + "epoch": 0.671219699884571, + "flos": 555025963776.0, + "grad_norm": 0.02894865619678765, + "language_loss": 0.84055519, + "learning_rate": 0.00025768299420930046, + "loss": 0.85101712, + "num_input_tokens_seen": 289515808, + "router_z_loss_mlp": 0.40576172, + "step": 3489, + "time_per_iteration": 2.779972553253174 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046589, + "balance_loss_mlp": 1.00622523, + "epoch": 0.6714120815698346, + "flos": 732782651904.0, + "grad_norm": 0.0327604861643189, + "language_loss": 0.8377071, + "learning_rate": 0.0002574105299021332, + "loss": 0.84817296, + "num_input_tokens_seen": 289591344, + "router_z_loss_mlp": 0.40356445, + "step": 3490, + "time_per_iteration": 2.893480062484741 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044671, + "balance_loss_mlp": 1.00433028, + "epoch": 0.6716044632550981, + "flos": 689947395072.0, + "grad_norm": 0.03209886664090861, + "language_loss": 0.8471486, + "learning_rate": 0.00025713815977655084, + "loss": 0.85759532, + "num_input_tokens_seen": 289672032, + "router_z_loss_mlp": 0.40332031, + "step": 3491, + "time_per_iteration": 2.957084894180298 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044489, + "balance_loss_mlp": 1.0041728, + "epoch": 0.6717968449403616, + "flos": 461587637760.0, + "grad_norm": 0.0366727841184643, + "language_loss": 0.85291302, + "learning_rate": 0.0002568658839382969, + "loss": 0.8633579, + "num_input_tokens_seen": 289738304, + "router_z_loss_mlp": 0.40307617, + "step": 3492, + "time_per_iteration": 2.5661098957061768 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054708, + "balance_loss_mlp": 1.01429558, + "epoch": 0.6719892266256252, + "flos": 502597345536.0, + "grad_norm": 0.0394893912508571, + "language_loss": 0.8491143, + "learning_rate": 0.00025659370249307814, + "loss": 0.85966134, + "num_input_tokens_seen": 289804304, + "router_z_loss_mlp": 0.40405273, + "step": 3493, + "time_per_iteration": 2.6122422218322754 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051838, + "balance_loss_mlp": 1.01144993, + "epoch": 0.6721816083108888, + "flos": 684737098752.0, + "grad_norm": 0.033378667785843884, + "language_loss": 0.85795897, + "learning_rate": 0.00025632161554656473, + "loss": 0.86847734, + "num_input_tokens_seen": 289877696, + "router_z_loss_mlp": 0.40380859, + "step": 3494, + "time_per_iteration": 2.8829426765441895 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049339, + "balance_loss_mlp": 1.00897467, + "epoch": 0.6723739899961524, + "flos": 586896265728.0, + "grad_norm": 0.03541855963970859, + "language_loss": 0.8296122, + "learning_rate": 0.00025604962320439017, + "loss": 0.84010559, + "num_input_tokens_seen": 289947296, + "router_z_loss_mlp": 0.40356445, + "step": 3495, + "time_per_iteration": 2.7043375968933105 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104451, + "balance_loss_mlp": 1.00421739, + "epoch": 0.672566371681416, + "flos": 507740567808.0, + "grad_norm": 0.03528245901985063, + "language_loss": 0.82875669, + "learning_rate": 0.0002557777255721516, + "loss": 0.83920175, + "num_input_tokens_seen": 290020080, + "router_z_loss_mlp": 0.40283203, + "step": 3496, + "time_per_iteration": 2.719181776046753 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045131, + "balance_loss_mlp": 1.00479066, + "epoch": 0.6727587533666795, + "flos": 536736458496.0, + "grad_norm": 0.036828443855142154, + "language_loss": 0.81081581, + "learning_rate": 0.0002555059227554087, + "loss": 0.82126713, + "num_input_tokens_seen": 290094544, + "router_z_loss_mlp": 0.40332031, + "step": 3497, + "time_per_iteration": 2.7057156562805176 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045175, + "balance_loss_mlp": 1.0047394, + "epoch": 0.672951135051943, + "flos": 604037753856.0, + "grad_norm": 0.0344810885559189, + "language_loss": 0.78363037, + "learning_rate": 0.00025523421485968453, + "loss": 0.79408205, + "num_input_tokens_seen": 290173520, + "router_z_loss_mlp": 0.40429688, + "step": 3498, + "time_per_iteration": 2.8460867404937744 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043336, + "balance_loss_mlp": 1.00299513, + "epoch": 0.6731435167372066, + "flos": 812679065088.0, + "grad_norm": 0.0462085280228462, + "language_loss": 0.8591696, + "learning_rate": 0.00025496260199046585, + "loss": 0.86960292, + "num_input_tokens_seen": 290248240, + "router_z_loss_mlp": 0.40332031, + "step": 3499, + "time_per_iteration": 2.971506357192993 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043184, + "balance_loss_mlp": 1.0028913, + "epoch": 0.6733358984224702, + "flos": 612751394304.0, + "grad_norm": 0.03556230846218865, + "language_loss": 0.84967017, + "learning_rate": 0.000254691084253202, + "loss": 0.86010194, + "num_input_tokens_seen": 290326288, + "router_z_loss_mlp": 0.40283203, + "step": 3500, + "time_per_iteration": 2.8486316204071045 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043459, + "balance_loss_mlp": 1.00318992, + "epoch": 0.6735282801077337, + "flos": 559968942336.0, + "grad_norm": 0.24449978816738047, + "language_loss": 0.77738857, + "learning_rate": 0.00025441966175330567, + "loss": 0.7878232, + "num_input_tokens_seen": 290395984, + "router_z_loss_mlp": 0.40258789, + "step": 3501, + "time_per_iteration": 2.631596803665161 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104251, + "balance_loss_mlp": 1.0023365, + "epoch": 0.6737206617929973, + "flos": 673633138176.0, + "grad_norm": 0.03266233971307438, + "language_loss": 0.80253637, + "learning_rate": 0.00025414833459615183, + "loss": 0.81296146, + "num_input_tokens_seen": 290470224, + "router_z_loss_mlp": 0.40161133, + "step": 3502, + "time_per_iteration": 2.822633981704712 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044113, + "balance_loss_mlp": 1.00398731, + "epoch": 0.6739130434782609, + "flos": 634642420224.0, + "grad_norm": 0.03194426719542878, + "language_loss": 0.80720419, + "learning_rate": 0.0002538771028870796, + "loss": 0.81764531, + "num_input_tokens_seen": 290542864, + "router_z_loss_mlp": 0.40112305, + "step": 3503, + "time_per_iteration": 2.8278305530548096 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044086, + "balance_loss_mlp": 1.00376928, + "epoch": 0.6741054251635245, + "flos": 532546834176.0, + "grad_norm": 0.03505319293998398, + "language_loss": 0.82144141, + "learning_rate": 0.0002536059667313903, + "loss": 0.8318823, + "num_input_tokens_seen": 290617248, + "router_z_loss_mlp": 0.40307617, + "step": 3504, + "time_per_iteration": 2.772728443145752 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045901, + "balance_loss_mlp": 1.00548911, + "epoch": 0.674297806848788, + "flos": 543652740096.0, + "grad_norm": 0.033634031590092824, + "language_loss": 0.89796269, + "learning_rate": 0.0002533349262343483, + "loss": 0.90842175, + "num_input_tokens_seen": 290690112, + "router_z_loss_mlp": 0.40405273, + "step": 3505, + "time_per_iteration": 2.6931023597717285 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048869, + "balance_loss_mlp": 1.00845683, + "epoch": 0.6744901885340515, + "flos": 464455246080.0, + "grad_norm": 0.03724604036951252, + "language_loss": 0.82972419, + "learning_rate": 0.0002530639815011807, + "loss": 0.84021288, + "num_input_tokens_seen": 290756352, + "router_z_loss_mlp": 0.40405273, + "step": 3506, + "time_per_iteration": 2.5213606357574463 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104646, + "balance_loss_mlp": 1.00604796, + "epoch": 0.6746825702193151, + "flos": 633022950912.0, + "grad_norm": 0.0353973537861221, + "language_loss": 0.85602045, + "learning_rate": 0.0002527931326370781, + "loss": 0.866485, + "num_input_tokens_seen": 290829776, + "router_z_loss_mlp": 0.40405273, + "step": 3507, + "time_per_iteration": 2.7929484844207764 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046008, + "balance_loss_mlp": 1.00554848, + "epoch": 0.6748749519045787, + "flos": 672393747456.0, + "grad_norm": 0.038454630804936565, + "language_loss": 0.83645785, + "learning_rate": 0.00025252237974719276, + "loss": 0.84691793, + "num_input_tokens_seen": 290900736, + "router_z_loss_mlp": 0.40454102, + "step": 3508, + "time_per_iteration": 2.8264431953430176 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044153, + "balance_loss_mlp": 1.00374067, + "epoch": 0.6750673335898423, + "flos": 768493602048.0, + "grad_norm": 0.034781380834319586, + "language_loss": 0.81037247, + "learning_rate": 0.00025225172293664056, + "loss": 0.82081401, + "num_input_tokens_seen": 290981696, + "router_z_loss_mlp": 0.40405273, + "step": 3509, + "time_per_iteration": 2.988295078277588 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045361, + "balance_loss_mlp": 1.00597382, + "epoch": 0.6752597152751059, + "flos": 1515907846656.0, + "grad_norm": 0.0075717383905430985, + "language_loss": 0.76933134, + "learning_rate": 0.00025198116231049954, + "loss": 0.77978498, + "num_input_tokens_seen": 291217888, + "router_z_loss_mlp": 0.39355469, + "step": 3510, + "time_per_iteration": 4.925229787826538 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043762, + "balance_loss_mlp": 1.00339806, + "epoch": 0.6754520969603693, + "flos": 688534005504.0, + "grad_norm": 0.03671107253105254, + "language_loss": 0.85454929, + "learning_rate": 0.00025171069797381106, + "loss": 0.8649869, + "num_input_tokens_seen": 291287856, + "router_z_loss_mlp": 0.40356445, + "step": 3511, + "time_per_iteration": 2.8605566024780273 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042968, + "balance_loss_mlp": 1.00265193, + "epoch": 0.6756444786456329, + "flos": 501618469632.0, + "grad_norm": 0.03363257909810701, + "language_loss": 0.82468766, + "learning_rate": 0.00025144033003157864, + "loss": 0.83511734, + "num_input_tokens_seen": 291354912, + "router_z_loss_mlp": 0.40307617, + "step": 3512, + "time_per_iteration": 2.6560440063476562 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043999, + "balance_loss_mlp": 1.00382507, + "epoch": 0.6758368603308965, + "flos": 493660128768.0, + "grad_norm": 0.04010660433283205, + "language_loss": 0.79292786, + "learning_rate": 0.00025117005858876806, + "loss": 0.80336791, + "num_input_tokens_seen": 291426816, + "router_z_loss_mlp": 0.40161133, + "step": 3513, + "time_per_iteration": 2.6984188556671143 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044124, + "balance_loss_mlp": 1.00392663, + "epoch": 0.6760292420161601, + "flos": 557044953600.0, + "grad_norm": 0.035892201444293004, + "language_loss": 0.86103761, + "learning_rate": 0.000250899883750308, + "loss": 0.8714788, + "num_input_tokens_seen": 291497648, + "router_z_loss_mlp": 0.40185547, + "step": 3514, + "time_per_iteration": 2.7181315422058105 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046252, + "balance_loss_mlp": 1.00600672, + "epoch": 0.6762216237014236, + "flos": 608722162944.0, + "grad_norm": 0.033450458947787066, + "language_loss": 0.81925356, + "learning_rate": 0.00025062980562109006, + "loss": 0.82971609, + "num_input_tokens_seen": 291568080, + "router_z_loss_mlp": 0.40234375, + "step": 3515, + "time_per_iteration": 2.78231143951416 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043998, + "balance_loss_mlp": 1.00377643, + "epoch": 0.6764140053866872, + "flos": 534928406016.0, + "grad_norm": 0.037161832732059044, + "language_loss": 0.83539182, + "learning_rate": 0.0002503598243059677, + "loss": 0.84583181, + "num_input_tokens_seen": 291644896, + "router_z_loss_mlp": 0.40209961, + "step": 3516, + "time_per_iteration": 2.7860419750213623 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046044, + "balance_loss_mlp": 1.00584662, + "epoch": 0.6766063870719508, + "flos": 505862529024.0, + "grad_norm": 0.041409918101289474, + "language_loss": 0.80496907, + "learning_rate": 0.0002500899399097568, + "loss": 0.81542951, + "num_input_tokens_seen": 291716864, + "router_z_loss_mlp": 0.40185547, + "step": 3517, + "time_per_iteration": 2.6418778896331787 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01041487, + "balance_loss_mlp": 1.00131381, + "epoch": 0.6767987687572143, + "flos": 514194145536.0, + "grad_norm": 0.03808875517476391, + "language_loss": 0.86208284, + "learning_rate": 0.0002498201525372359, + "loss": 0.87249774, + "num_input_tokens_seen": 291786000, + "router_z_loss_mlp": 0.40161133, + "step": 3518, + "time_per_iteration": 2.569801092147827 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01041708, + "balance_loss_mlp": 1.00148714, + "epoch": 0.6769911504424779, + "flos": 526079650560.0, + "grad_norm": 0.03452143000851854, + "language_loss": 0.83818328, + "learning_rate": 0.00024955046229314584, + "loss": 0.84860039, + "num_input_tokens_seen": 291854768, + "router_z_loss_mlp": 0.40209961, + "step": 3519, + "time_per_iteration": 2.602756977081299 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043706, + "balance_loss_mlp": 1.00353205, + "epoch": 0.6771835321277414, + "flos": 450837510912.0, + "grad_norm": 0.03417107794198843, + "language_loss": 0.87895727, + "learning_rate": 0.00024928086928218947, + "loss": 0.8893944, + "num_input_tokens_seen": 291918096, + "router_z_loss_mlp": 0.40161133, + "step": 3520, + "time_per_iteration": 2.4941091537475586 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044656, + "balance_loss_mlp": 1.00443459, + "epoch": 0.677375913813005, + "flos": 710674852608.0, + "grad_norm": 0.03642632041664857, + "language_loss": 0.76859355, + "learning_rate": 0.00024901137360903216, + "loss": 0.7790401, + "num_input_tokens_seen": 291998752, + "router_z_loss_mlp": 0.40209961, + "step": 3521, + "time_per_iteration": 2.985905408859253 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044359, + "balance_loss_mlp": 1.00404227, + "epoch": 0.6775682954982686, + "flos": 429346005504.0, + "grad_norm": 0.039972484461639736, + "language_loss": 0.81834614, + "learning_rate": 0.00024874197537830115, + "loss": 0.82878971, + "num_input_tokens_seen": 292065056, + "router_z_loss_mlp": 0.40307617, + "step": 3522, + "time_per_iteration": 2.525432586669922 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045522, + "balance_loss_mlp": 1.0052768, + "epoch": 0.6777606771835322, + "flos": 438821748480.0, + "grad_norm": 0.0378942066794791, + "language_loss": 0.83926749, + "learning_rate": 0.00024847267469458684, + "loss": 0.84972268, + "num_input_tokens_seen": 292129248, + "router_z_loss_mlp": 0.40234375, + "step": 3523, + "time_per_iteration": 2.519306182861328 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045594, + "balance_loss_mlp": 1.0053252, + "epoch": 0.6779530588687956, + "flos": 776788280064.0, + "grad_norm": 0.03620909543605363, + "language_loss": 0.78424889, + "learning_rate": 0.00024820347166244034, + "loss": 0.79470479, + "num_input_tokens_seen": 292206080, + "router_z_loss_mlp": 0.40258789, + "step": 3524, + "time_per_iteration": 3.016852378845215 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045119, + "balance_loss_mlp": 1.00494587, + "epoch": 0.6781454405540592, + "flos": 572905254912.0, + "grad_norm": 0.03295614224458047, + "language_loss": 0.85541701, + "learning_rate": 0.0002479343663863755, + "loss": 0.86586821, + "num_input_tokens_seen": 292280192, + "router_z_loss_mlp": 0.40161133, + "step": 3525, + "time_per_iteration": 2.7807812690734863 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046011, + "balance_loss_mlp": 1.00586104, + "epoch": 0.6783378222393228, + "flos": 485983689984.0, + "grad_norm": 0.034679626335120894, + "language_loss": 0.77479804, + "learning_rate": 0.00024766535897086876, + "loss": 0.78525817, + "num_input_tokens_seen": 292347792, + "router_z_loss_mlp": 0.40136719, + "step": 3526, + "time_per_iteration": 2.599513530731201 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047237, + "balance_loss_mlp": 1.0070163, + "epoch": 0.6785302039245864, + "flos": 483832497408.0, + "grad_norm": 0.03442955801383442, + "language_loss": 0.797737, + "learning_rate": 0.0002473964495203578, + "loss": 0.80820936, + "num_input_tokens_seen": 292420032, + "router_z_loss_mlp": 0.40209961, + "step": 3527, + "time_per_iteration": 2.6847755908966064 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046675, + "balance_loss_mlp": 1.00640571, + "epoch": 0.67872258560985, + "flos": 525862877184.0, + "grad_norm": 0.03305823044562006, + "language_loss": 0.861408, + "learning_rate": 0.0002471276381392425, + "loss": 0.87187475, + "num_input_tokens_seen": 292497792, + "router_z_loss_mlp": 0.40258789, + "step": 3528, + "time_per_iteration": 4.207594156265259 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043282, + "balance_loss_mlp": 1.00437164, + "epoch": 0.6789149672951135, + "flos": 1555894937088.0, + "grad_norm": 0.004731891717640295, + "language_loss": 0.78188634, + "learning_rate": 0.0002468589249318848, + "loss": 0.79231918, + "num_input_tokens_seen": 292726704, + "router_z_loss_mlp": 0.38867188, + "step": 3529, + "time_per_iteration": 4.977165222167969 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046125, + "balance_loss_mlp": 1.00583255, + "epoch": 0.6791073489803771, + "flos": 742686105600.0, + "grad_norm": 0.033652850666290056, + "language_loss": 0.84582424, + "learning_rate": 0.00024659031000260826, + "loss": 0.85628551, + "num_input_tokens_seen": 292802320, + "router_z_loss_mlp": 0.40283203, + "step": 3530, + "time_per_iteration": 2.9048852920532227 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104749, + "balance_loss_mlp": 1.00703049, + "epoch": 0.6792997306656406, + "flos": 577448712960.0, + "grad_norm": 0.040150019342018534, + "language_loss": 0.81559235, + "learning_rate": 0.0002463217934556985, + "loss": 0.82606721, + "num_input_tokens_seen": 292870480, + "router_z_loss_mlp": 0.40454102, + "step": 3531, + "time_per_iteration": 2.6925132274627686 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046124, + "balance_loss_mlp": 1.00692749, + "epoch": 0.6794921123509042, + "flos": 1506546809856.0, + "grad_norm": 0.009705737357192788, + "language_loss": 0.7653209, + "learning_rate": 0.000246053375395403, + "loss": 0.77578211, + "num_input_tokens_seen": 293100752, + "router_z_loss_mlp": 0.39160156, + "step": 3532, + "time_per_iteration": 4.747551202774048 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01041824, + "balance_loss_mlp": 1.00138783, + "epoch": 0.6796844940361677, + "flos": 700141499136.0, + "grad_norm": 0.03694517286226913, + "language_loss": 0.84159917, + "learning_rate": 0.0002457850559259306, + "loss": 0.8520174, + "num_input_tokens_seen": 293178192, + "router_z_loss_mlp": 0.40429688, + "step": 3533, + "time_per_iteration": 2.8468008041381836 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043634, + "balance_loss_mlp": 1.00326967, + "epoch": 0.6798768757214313, + "flos": 553816708608.0, + "grad_norm": 0.03486714477103508, + "language_loss": 0.82139623, + "learning_rate": 0.00024551683515145275, + "loss": 0.83183265, + "num_input_tokens_seen": 293246368, + "router_z_loss_mlp": 0.40356445, + "step": 3534, + "time_per_iteration": 2.6637539863586426 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043023, + "balance_loss_mlp": 1.00261092, + "epoch": 0.6800692574066949, + "flos": 523976090112.0, + "grad_norm": 0.03293406934357783, + "language_loss": 0.87167442, + "learning_rate": 0.0002452487131761014, + "loss": 0.88210464, + "num_input_tokens_seen": 293320656, + "router_z_loss_mlp": 0.40405273, + "step": 3535, + "time_per_iteration": 2.719104051589966 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01041768, + "balance_loss_mlp": 1.00128436, + "epoch": 0.6802616390919585, + "flos": 575130324480.0, + "grad_norm": 0.03513185710250464, + "language_loss": 0.80471444, + "learning_rate": 0.00024498069010397093, + "loss": 0.81513214, + "num_input_tokens_seen": 293388592, + "router_z_loss_mlp": 0.40478516, + "step": 3536, + "time_per_iteration": 2.656780242919922 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042968, + "balance_loss_mlp": 1.00250804, + "epoch": 0.6804540207772221, + "flos": 489129309696.0, + "grad_norm": 0.03285150643596687, + "language_loss": 0.85294282, + "learning_rate": 0.00024471276603911697, + "loss": 0.86337245, + "num_input_tokens_seen": 293453936, + "router_z_loss_mlp": 0.40454102, + "step": 3537, + "time_per_iteration": 2.5711469650268555 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046239, + "balance_loss_mlp": 1.00566006, + "epoch": 0.6806464024624855, + "flos": 579745714176.0, + "grad_norm": 0.0319685563784025, + "language_loss": 0.79588819, + "learning_rate": 0.0002444449410855572, + "loss": 0.80635059, + "num_input_tokens_seen": 293527664, + "router_z_loss_mlp": 0.40576172, + "step": 3538, + "time_per_iteration": 2.7366206645965576 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048213, + "balance_loss_mlp": 1.00777721, + "epoch": 0.6808387841477491, + "flos": 554793639168.0, + "grad_norm": 0.028008178154431115, + "language_loss": 0.8488512, + "learning_rate": 0.00024417721534727033, + "loss": 0.85933334, + "num_input_tokens_seen": 293599344, + "router_z_loss_mlp": 0.40429688, + "step": 3539, + "time_per_iteration": 2.6501903533935547 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047596, + "balance_loss_mlp": 1.00716043, + "epoch": 0.6810311658330127, + "flos": 427754726400.0, + "grad_norm": 0.0434584868230971, + "language_loss": 0.83537716, + "learning_rate": 0.00024390958892819687, + "loss": 0.84585309, + "num_input_tokens_seen": 293663088, + "router_z_loss_mlp": 0.40429688, + "step": 3540, + "time_per_iteration": 2.5052664279937744 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046344, + "balance_loss_mlp": 1.00571704, + "epoch": 0.6812235475182763, + "flos": 573461277696.0, + "grad_norm": 0.03693481574756638, + "language_loss": 0.81626362, + "learning_rate": 0.0002436420619322381, + "loss": 0.82672703, + "num_input_tokens_seen": 293741296, + "router_z_loss_mlp": 0.40625, + "step": 3541, + "time_per_iteration": 2.832705497741699 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049574, + "balance_loss_mlp": 1.00901949, + "epoch": 0.6814159292035398, + "flos": 502994920704.0, + "grad_norm": 0.03366403266770877, + "language_loss": 0.83297849, + "learning_rate": 0.0002433746344632577, + "loss": 0.84347427, + "num_input_tokens_seen": 293815840, + "router_z_loss_mlp": 0.40551758, + "step": 3542, + "time_per_iteration": 2.672311782836914 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050212, + "balance_loss_mlp": 1.00972831, + "epoch": 0.6816083108888034, + "flos": 766956758016.0, + "grad_norm": 0.03487918397791305, + "language_loss": 0.80590951, + "learning_rate": 0.00024310730662508006, + "loss": 0.81641161, + "num_input_tokens_seen": 293896368, + "router_z_loss_mlp": 0.40478516, + "step": 3543, + "time_per_iteration": 3.086225986480713 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051043, + "balance_loss_mlp": 1.0106312, + "epoch": 0.681800692574067, + "flos": 480480797952.0, + "grad_norm": 0.03000398684674813, + "language_loss": 0.88137174, + "learning_rate": 0.0002428400785214911, + "loss": 0.89188218, + "num_input_tokens_seen": 293963344, + "router_z_loss_mlp": 0.40405273, + "step": 3544, + "time_per_iteration": 2.5797877311706543 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050788, + "balance_loss_mlp": 1.01030433, + "epoch": 0.6819930742593305, + "flos": 692834445312.0, + "grad_norm": 0.03498907792035314, + "language_loss": 0.83317804, + "learning_rate": 0.00024257295025623794, + "loss": 0.84368593, + "num_input_tokens_seen": 294035440, + "router_z_loss_mlp": 0.40478516, + "step": 3545, + "time_per_iteration": 2.817002534866333 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049009, + "balance_loss_mlp": 1.00852597, + "epoch": 0.6821854559445941, + "flos": 679355715840.0, + "grad_norm": 0.03355065924517062, + "language_loss": 0.81087142, + "learning_rate": 0.00024230592193302892, + "loss": 0.82136154, + "num_input_tokens_seen": 294116944, + "router_z_loss_mlp": 0.40478516, + "step": 3546, + "time_per_iteration": 2.9010307788848877 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045229, + "balance_loss_mlp": 1.00469804, + "epoch": 0.6823778376298576, + "flos": 463133230080.0, + "grad_norm": 0.04387981272485442, + "language_loss": 0.85039532, + "learning_rate": 0.00024203899365553372, + "loss": 0.86084759, + "num_input_tokens_seen": 294178976, + "router_z_loss_mlp": 0.40527344, + "step": 3547, + "time_per_iteration": 2.5003862380981445 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045105, + "balance_loss_mlp": 1.00543213, + "epoch": 0.6825702193151212, + "flos": 1478176939776.0, + "grad_norm": 0.005965966657319216, + "language_loss": 0.76734358, + "learning_rate": 0.00024177216552738302, + "loss": 0.7777946, + "num_input_tokens_seen": 294384960, + "router_z_loss_mlp": 0.39648438, + "step": 3548, + "time_per_iteration": 4.51382303237915 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043331, + "balance_loss_mlp": 1.00299025, + "epoch": 0.6827626010003848, + "flos": 724414096896.0, + "grad_norm": 0.03369751046554337, + "language_loss": 0.83353454, + "learning_rate": 0.00024150543765216848, + "loss": 0.84396785, + "num_input_tokens_seen": 294461408, + "router_z_loss_mlp": 0.40332031, + "step": 3549, + "time_per_iteration": 2.868882179260254 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043901, + "balance_loss_mlp": 1.00348902, + "epoch": 0.6829549826856484, + "flos": 559940752128.0, + "grad_norm": 0.03314347093854088, + "language_loss": 0.83934271, + "learning_rate": 0.00024123881013344352, + "loss": 0.84978169, + "num_input_tokens_seen": 294530624, + "router_z_loss_mlp": 0.40405273, + "step": 3550, + "time_per_iteration": 2.673149347305298 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043918, + "balance_loss_mlp": 1.00352979, + "epoch": 0.6831473643709118, + "flos": 626134859520.0, + "grad_norm": 0.03193969534774964, + "language_loss": 0.80188608, + "learning_rate": 0.00024097228307472202, + "loss": 0.81232524, + "num_input_tokens_seen": 294606784, + "router_z_loss_mlp": 0.40380859, + "step": 3551, + "time_per_iteration": 2.783318519592285 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044109, + "balance_loss_mlp": 1.00367296, + "epoch": 0.6833397460561754, + "flos": 715098746880.0, + "grad_norm": 0.03508880753124507, + "language_loss": 0.82590389, + "learning_rate": 0.00024070585657947846, + "loss": 0.83634502, + "num_input_tokens_seen": 294686960, + "router_z_loss_mlp": 0.40429688, + "step": 3552, + "time_per_iteration": 2.87227725982666 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044585, + "balance_loss_mlp": 1.00414932, + "epoch": 0.683532127741439, + "flos": 465727684608.0, + "grad_norm": 0.028861941577793874, + "language_loss": 0.86039191, + "learning_rate": 0.00024043953075114934, + "loss": 0.87083775, + "num_input_tokens_seen": 294759712, + "router_z_loss_mlp": 0.40429688, + "step": 3553, + "time_per_iteration": 2.685239315032959 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044014, + "balance_loss_mlp": 1.00353038, + "epoch": 0.6837245094267026, + "flos": 583340431872.0, + "grad_norm": 0.03309577009255294, + "language_loss": 0.89582229, + "learning_rate": 0.00024017330569313128, + "loss": 0.9062624, + "num_input_tokens_seen": 294830592, + "router_z_loss_mlp": 0.40478516, + "step": 3554, + "time_per_iteration": 2.738507032394409 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044038, + "balance_loss_mlp": 1.00345898, + "epoch": 0.6839168911119662, + "flos": 795524937984.0, + "grad_norm": 0.03513613894761906, + "language_loss": 0.75376379, + "learning_rate": 0.0002399071815087821, + "loss": 0.7642042, + "num_input_tokens_seen": 294907504, + "router_z_loss_mlp": 0.40576172, + "step": 3555, + "time_per_iteration": 3.038098096847534 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049451, + "balance_loss_mlp": 1.00908649, + "epoch": 0.6841092727972297, + "flos": 581115362304.0, + "grad_norm": 0.037584614918211315, + "language_loss": 0.84306592, + "learning_rate": 0.00023964115830142025, + "loss": 0.85356045, + "num_input_tokens_seen": 294977600, + "router_z_loss_mlp": 0.40356445, + "step": 3556, + "time_per_iteration": 2.6664743423461914 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047993, + "balance_loss_mlp": 1.0076046, + "epoch": 0.6843016544824932, + "flos": 384595771392.0, + "grad_norm": 0.04136622286730017, + "language_loss": 0.88220561, + "learning_rate": 0.00023937523617432522, + "loss": 0.89268553, + "num_input_tokens_seen": 295039408, + "router_z_loss_mlp": 0.40380859, + "step": 3557, + "time_per_iteration": 2.429532289505005 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104626, + "balance_loss_mlp": 1.00582457, + "epoch": 0.6844940361677568, + "flos": 1441289793792.0, + "grad_norm": 0.032795620592968935, + "language_loss": 0.87315959, + "learning_rate": 0.00023910941523073705, + "loss": 0.88362217, + "num_input_tokens_seen": 295142928, + "router_z_loss_mlp": 0.40429688, + "step": 3558, + "time_per_iteration": 3.8917641639709473 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104589, + "balance_loss_mlp": 1.00550175, + "epoch": 0.6846864178530204, + "flos": 521900719872.0, + "grad_norm": 0.03199772830475091, + "language_loss": 0.86959422, + "learning_rate": 0.0002388436955738566, + "loss": 0.8800531, + "num_input_tokens_seen": 295215504, + "router_z_loss_mlp": 0.40380859, + "step": 3559, + "time_per_iteration": 2.6707799434661865 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045488, + "balance_loss_mlp": 1.00514805, + "epoch": 0.6848787995382839, + "flos": 719230045440.0, + "grad_norm": 0.030101152384031323, + "language_loss": 0.81828642, + "learning_rate": 0.00023857807730684523, + "loss": 0.82874131, + "num_input_tokens_seen": 295291024, + "router_z_loss_mlp": 0.40332031, + "step": 3560, + "time_per_iteration": 2.8835229873657227 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044745, + "balance_loss_mlp": 1.00440454, + "epoch": 0.6850711812235475, + "flos": 512162516736.0, + "grad_norm": 0.03806744815323664, + "language_loss": 0.83236831, + "learning_rate": 0.00023831256053282547, + "loss": 0.84281576, + "num_input_tokens_seen": 295363248, + "router_z_loss_mlp": 0.40332031, + "step": 3561, + "time_per_iteration": 2.723851203918457 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104642, + "balance_loss_mlp": 1.0061748, + "epoch": 0.6852635629088111, + "flos": 669432820224.0, + "grad_norm": 0.034115256160246236, + "language_loss": 0.78766859, + "learning_rate": 0.00023804714535488003, + "loss": 0.79813278, + "num_input_tokens_seen": 295442032, + "router_z_loss_mlp": 0.40234375, + "step": 3562, + "time_per_iteration": 2.862870454788208 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048828, + "balance_loss_mlp": 1.00953674, + "epoch": 0.6854559445940747, + "flos": 1526367323136.0, + "grad_norm": 0.0075236953863810525, + "language_loss": 0.7980963, + "learning_rate": 0.0002377818318760519, + "loss": 0.80858457, + "num_input_tokens_seen": 295680560, + "router_z_loss_mlp": 0.39257812, + "step": 3563, + "time_per_iteration": 4.951240539550781 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045746, + "balance_loss_mlp": 1.00550103, + "epoch": 0.6856483262793382, + "flos": 455137950720.0, + "grad_norm": 0.03558245087854763, + "language_loss": 0.81134826, + "learning_rate": 0.00023751662019934488, + "loss": 0.82180572, + "num_input_tokens_seen": 295745712, + "router_z_loss_mlp": 0.40234375, + "step": 3564, + "time_per_iteration": 2.5381388664245605 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043955, + "balance_loss_mlp": 1.00378108, + "epoch": 0.6858407079646017, + "flos": 616689252096.0, + "grad_norm": 0.034154017668987145, + "language_loss": 0.79535556, + "learning_rate": 0.00023725151042772364, + "loss": 0.80579513, + "num_input_tokens_seen": 295815104, + "router_z_loss_mlp": 0.40161133, + "step": 3565, + "time_per_iteration": 2.8012731075286865 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044161, + "balance_loss_mlp": 1.00394011, + "epoch": 0.6860330896498653, + "flos": 467095387392.0, + "grad_norm": 0.03227163562068172, + "language_loss": 0.83989513, + "learning_rate": 0.00023698650266411276, + "loss": 0.85033673, + "num_input_tokens_seen": 295882928, + "router_z_loss_mlp": 0.40209961, + "step": 3566, + "time_per_iteration": 2.6114397048950195 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045444, + "balance_loss_mlp": 1.00527024, + "epoch": 0.6862254713351289, + "flos": 865839650304.0, + "grad_norm": 0.03269984364116833, + "language_loss": 0.83511543, + "learning_rate": 0.00023672159701139755, + "loss": 0.84556985, + "num_input_tokens_seen": 295970960, + "router_z_loss_mlp": 0.40161133, + "step": 3567, + "time_per_iteration": 3.2268896102905273 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045242, + "balance_loss_mlp": 1.00504482, + "epoch": 0.6864178530203925, + "flos": 448091411712.0, + "grad_norm": 0.03951724412418829, + "language_loss": 0.86782575, + "learning_rate": 0.00023645679357242296, + "loss": 0.87827814, + "num_input_tokens_seen": 296036128, + "router_z_loss_mlp": 0.40185547, + "step": 3568, + "time_per_iteration": 2.5142667293548584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045229, + "balance_loss_mlp": 1.00512683, + "epoch": 0.6866102347056561, + "flos": 425212761600.0, + "grad_norm": 0.04100777191651884, + "language_loss": 0.84717417, + "learning_rate": 0.00023619209244999534, + "loss": 0.85762644, + "num_input_tokens_seen": 296101440, + "router_z_loss_mlp": 0.40087891, + "step": 3569, + "time_per_iteration": 2.506850004196167 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043496, + "balance_loss_mlp": 1.00339389, + "epoch": 0.6868026163909196, + "flos": 473334137088.0, + "grad_norm": 0.0410478225777228, + "language_loss": 0.85724694, + "learning_rate": 0.0002359274937468806, + "loss": 0.86768192, + "num_input_tokens_seen": 296165504, + "router_z_loss_mlp": 0.40087891, + "step": 3570, + "time_per_iteration": 2.5271074771881104 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044524, + "balance_loss_mlp": 1.00446975, + "epoch": 0.6869949980761831, + "flos": 465206654976.0, + "grad_norm": 0.037625801670490476, + "language_loss": 0.78364801, + "learning_rate": 0.00023566299756580512, + "loss": 0.79409337, + "num_input_tokens_seen": 296236880, + "router_z_loss_mlp": 0.40039062, + "step": 3571, + "time_per_iteration": 2.641204595565796 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047048, + "balance_loss_mlp": 1.00682652, + "epoch": 0.6871873797614467, + "flos": 427131629568.0, + "grad_norm": 0.03563606510751839, + "language_loss": 0.78681505, + "learning_rate": 0.0002353986040094551, + "loss": 0.79728556, + "num_input_tokens_seen": 296299776, + "router_z_loss_mlp": 0.40209961, + "step": 3572, + "time_per_iteration": 2.508169412612915 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051965, + "balance_loss_mlp": 1.01188707, + "epoch": 0.6873797614467103, + "flos": 444555019776.0, + "grad_norm": 0.03726905033743987, + "language_loss": 0.79780114, + "learning_rate": 0.00023513431318047796, + "loss": 0.80832076, + "num_input_tokens_seen": 296365408, + "router_z_loss_mlp": 0.40063477, + "step": 3573, + "time_per_iteration": 2.524447441101074 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049651, + "balance_loss_mlp": 1.00952482, + "epoch": 0.6875721431319738, + "flos": 993915764736.0, + "grad_norm": 0.03636326410660492, + "language_loss": 0.77452493, + "learning_rate": 0.00023487012518147977, + "loss": 0.78502142, + "num_input_tokens_seen": 296445488, + "router_z_loss_mlp": 0.40112305, + "step": 3574, + "time_per_iteration": 3.220405340194702 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050375, + "balance_loss_mlp": 1.0102489, + "epoch": 0.6877645248172374, + "flos": 1287448957440.0, + "grad_norm": 0.03573540340682003, + "language_loss": 0.8513974, + "learning_rate": 0.00023460604011502772, + "loss": 0.86190116, + "num_input_tokens_seen": 296529936, + "router_z_loss_mlp": 0.40112305, + "step": 3575, + "time_per_iteration": 3.642275094985962 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050219, + "balance_loss_mlp": 1.01016474, + "epoch": 0.687956906502501, + "flos": 878230633728.0, + "grad_norm": 0.03712322767152043, + "language_loss": 0.86061072, + "learning_rate": 0.00023434205808364845, + "loss": 0.87111294, + "num_input_tokens_seen": 296607488, + "router_z_loss_mlp": 0.40039062, + "step": 3576, + "time_per_iteration": 3.093545436859131 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047023, + "balance_loss_mlp": 1.00680172, + "epoch": 0.6881492881877646, + "flos": 564471571200.0, + "grad_norm": 0.039318035109250464, + "language_loss": 0.86179203, + "learning_rate": 0.00023407817918982932, + "loss": 0.87226224, + "num_input_tokens_seen": 296678672, + "router_z_loss_mlp": 0.40209961, + "step": 3577, + "time_per_iteration": 2.755629777908325 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104676, + "balance_loss_mlp": 1.00656283, + "epoch": 0.6883416698730281, + "flos": 796510616832.0, + "grad_norm": 0.03470611198905491, + "language_loss": 0.79102242, + "learning_rate": 0.00023381440353601718, + "loss": 0.80149001, + "num_input_tokens_seen": 296758896, + "router_z_loss_mlp": 0.40185547, + "step": 3578, + "time_per_iteration": 2.990251302719116 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045579, + "balance_loss_mlp": 1.00540566, + "epoch": 0.6885340515582916, + "flos": 724880691456.0, + "grad_norm": 0.04272273427793483, + "language_loss": 0.86559987, + "learning_rate": 0.00023355073122461822, + "loss": 0.87605572, + "num_input_tokens_seen": 296830736, + "router_z_loss_mlp": 0.40161133, + "step": 3579, + "time_per_iteration": 2.9245500564575195 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045443, + "balance_loss_mlp": 1.00522161, + "epoch": 0.6887264332435552, + "flos": 1012522165248.0, + "grad_norm": 0.033292192645982856, + "language_loss": 0.83352244, + "learning_rate": 0.00023328716235799973, + "loss": 0.84397686, + "num_input_tokens_seen": 296911504, + "router_z_loss_mlp": 0.40209961, + "step": 3580, + "time_per_iteration": 3.2759361267089844 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049535, + "balance_loss_mlp": 1.00936127, + "epoch": 0.6889188149288188, + "flos": 586347045888.0, + "grad_norm": 0.03483646378728446, + "language_loss": 0.84317255, + "learning_rate": 0.00023302369703848803, + "loss": 0.85366791, + "num_input_tokens_seen": 296981488, + "router_z_loss_mlp": 0.40161133, + "step": 3581, + "time_per_iteration": 2.692676544189453 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047528, + "balance_loss_mlp": 1.00718749, + "epoch": 0.6891111966140824, + "flos": 637277703936.0, + "grad_norm": 0.03603221184194459, + "language_loss": 0.80829328, + "learning_rate": 0.00023276033536836937, + "loss": 0.81876856, + "num_input_tokens_seen": 297054896, + "router_z_loss_mlp": 0.40332031, + "step": 3582, + "time_per_iteration": 2.7863240242004395 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043146, + "balance_loss_mlp": 1.00297284, + "epoch": 0.6893035782993459, + "flos": 496312909056.0, + "grad_norm": 0.032647536159740746, + "language_loss": 0.85196984, + "learning_rate": 0.00023249707744988984, + "loss": 0.86240131, + "num_input_tokens_seen": 297128224, + "router_z_loss_mlp": 0.40161133, + "step": 3583, + "time_per_iteration": 2.6404688358306885 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104516, + "balance_loss_mlp": 1.00486732, + "epoch": 0.6894959599846094, + "flos": 459149685504.0, + "grad_norm": 0.038319027803205424, + "language_loss": 0.82998735, + "learning_rate": 0.00023223392338525529, + "loss": 0.84043896, + "num_input_tokens_seen": 297191312, + "router_z_loss_mlp": 0.40283203, + "step": 3584, + "time_per_iteration": 2.526021957397461 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050689, + "balance_loss_mlp": 1.01030123, + "epoch": 0.689688341669873, + "flos": 506057915136.0, + "grad_norm": 0.03433951849080314, + "language_loss": 0.79221714, + "learning_rate": 0.00023197087327663107, + "loss": 0.802724, + "num_input_tokens_seen": 297261904, + "router_z_loss_mlp": 0.40380859, + "step": 3585, + "time_per_iteration": 2.6632885932922363 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044933, + "balance_loss_mlp": 1.00449693, + "epoch": 0.6898807233551366, + "flos": 765219670272.0, + "grad_norm": 0.036720139480463, + "language_loss": 0.81855822, + "learning_rate": 0.00023170792722614243, + "loss": 0.82900751, + "num_input_tokens_seen": 297338352, + "router_z_loss_mlp": 0.40429688, + "step": 3586, + "time_per_iteration": 2.8943870067596436 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046135, + "balance_loss_mlp": 1.00577044, + "epoch": 0.6900731050404002, + "flos": 584573986560.0, + "grad_norm": 0.03037103532376505, + "language_loss": 0.84293818, + "learning_rate": 0.00023144508533587377, + "loss": 0.85339952, + "num_input_tokens_seen": 297416688, + "router_z_loss_mlp": 0.40356445, + "step": 3587, + "time_per_iteration": 2.826327085494995 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046611, + "balance_loss_mlp": 1.00622261, + "epoch": 0.6902654867256637, + "flos": 713206123776.0, + "grad_norm": 0.03728824809581911, + "language_loss": 0.79222, + "learning_rate": 0.0002311823477078698, + "loss": 0.8026861, + "num_input_tokens_seen": 297499968, + "router_z_loss_mlp": 0.40380859, + "step": 3588, + "time_per_iteration": 2.9109723567962646 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046452, + "balance_loss_mlp": 1.00611138, + "epoch": 0.6904578684109273, + "flos": 598304482560.0, + "grad_norm": 0.034163579129476235, + "language_loss": 0.85722661, + "learning_rate": 0.00023091971444413428, + "loss": 0.8676911, + "num_input_tokens_seen": 297574480, + "router_z_loss_mlp": 0.40332031, + "step": 3589, + "time_per_iteration": 4.1711201667785645 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044858, + "balance_loss_mlp": 1.00454128, + "epoch": 0.6906502500961909, + "flos": 586177904640.0, + "grad_norm": 0.030860818872724436, + "language_loss": 0.82910645, + "learning_rate": 0.00023065718564663012, + "loss": 0.83955508, + "num_input_tokens_seen": 297645360, + "router_z_loss_mlp": 0.40307617, + "step": 3590, + "time_per_iteration": 2.7104885578155518 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044712, + "balance_loss_mlp": 1.00551605, + "epoch": 0.6908426317814544, + "flos": 1591143183360.0, + "grad_norm": 0.007096149350185522, + "language_loss": 0.73911589, + "learning_rate": 0.00023039476141728011, + "loss": 0.74956298, + "num_input_tokens_seen": 297879472, + "router_z_loss_mlp": 0.39160156, + "step": 3591, + "time_per_iteration": 5.0011866092681885 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045736, + "balance_loss_mlp": 1.0053004, + "epoch": 0.6910350134667179, + "flos": 501805107456.0, + "grad_norm": 0.029643353067264133, + "language_loss": 0.81368697, + "learning_rate": 0.0002301324418579666, + "loss": 0.82414436, + "num_input_tokens_seen": 297950672, + "router_z_loss_mlp": 0.40429688, + "step": 3592, + "time_per_iteration": 2.710340738296509 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050949, + "balance_loss_mlp": 1.01184845, + "epoch": 0.6912273951519815, + "flos": 1412135443968.0, + "grad_norm": 0.014289812000501409, + "language_loss": 0.78688473, + "learning_rate": 0.00022987022707053107, + "loss": 0.79739422, + "num_input_tokens_seen": 298171728, + "router_z_loss_mlp": 0.390625, + "step": 3593, + "time_per_iteration": 4.750281810760498 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044393, + "balance_loss_mlp": 1.00402915, + "epoch": 0.6914197768372451, + "flos": 636557397504.0, + "grad_norm": 0.0367015241777211, + "language_loss": 0.8129431, + "learning_rate": 0.00022960811715677415, + "loss": 0.82338709, + "num_input_tokens_seen": 298250304, + "router_z_loss_mlp": 0.40356445, + "step": 3594, + "time_per_iteration": 2.846938133239746 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045248, + "balance_loss_mlp": 1.00485945, + "epoch": 0.6916121585225087, + "flos": 559202949120.0, + "grad_norm": 0.030135543775537642, + "language_loss": 0.82059658, + "learning_rate": 0.00022934611221845608, + "loss": 0.83104908, + "num_input_tokens_seen": 298328000, + "router_z_loss_mlp": 0.40380859, + "step": 3595, + "time_per_iteration": 2.8187928199768066 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049786, + "balance_loss_mlp": 1.00925434, + "epoch": 0.6918045402077723, + "flos": 530293574400.0, + "grad_norm": 0.0337393790819551, + "language_loss": 0.78598142, + "learning_rate": 0.00022908421235729609, + "loss": 0.79647928, + "num_input_tokens_seen": 298406832, + "router_z_loss_mlp": 0.40527344, + "step": 3596, + "time_per_iteration": 2.7116031646728516 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104948, + "balance_loss_mlp": 1.00894892, + "epoch": 0.6919969218930357, + "flos": 571426736640.0, + "grad_norm": 0.033365686577519565, + "language_loss": 0.8572033, + "learning_rate": 0.0002288224176749728, + "loss": 0.86769807, + "num_input_tokens_seen": 298477584, + "router_z_loss_mlp": 0.40527344, + "step": 3597, + "time_per_iteration": 2.6345982551574707 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053776, + "balance_loss_mlp": 1.01334012, + "epoch": 0.6921893035782993, + "flos": 684504774144.0, + "grad_norm": 0.03882210113784689, + "language_loss": 0.79009509, + "learning_rate": 0.00022856072827312385, + "loss": 0.80063289, + "num_input_tokens_seen": 298551872, + "router_z_loss_mlp": 0.40429688, + "step": 3598, + "time_per_iteration": 2.7988228797912598 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055733, + "balance_loss_mlp": 1.01532125, + "epoch": 0.6923816852635629, + "flos": 547794732288.0, + "grad_norm": 0.03734800797345761, + "language_loss": 0.77726078, + "learning_rate": 0.00022829914425334598, + "loss": 0.78781813, + "num_input_tokens_seen": 298619680, + "router_z_loss_mlp": 0.40405273, + "step": 3599, + "time_per_iteration": 2.628700017929077 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053695, + "balance_loss_mlp": 1.01318777, + "epoch": 0.6925740669488265, + "flos": 511057274112.0, + "grad_norm": 0.04268943868915618, + "language_loss": 0.81083095, + "learning_rate": 0.0002280376657171956, + "loss": 0.82136786, + "num_input_tokens_seen": 298690080, + "router_z_loss_mlp": 0.4050293, + "step": 3600, + "time_per_iteration": 2.6388540267944336 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047669, + "balance_loss_mlp": 1.00723314, + "epoch": 0.69276644863409, + "flos": 870914831616.0, + "grad_norm": 0.03151516530710953, + "language_loss": 0.76992857, + "learning_rate": 0.00022777629276618706, + "loss": 0.78040528, + "num_input_tokens_seen": 298777712, + "router_z_loss_mlp": 0.40429688, + "step": 3601, + "time_per_iteration": 3.086951732635498 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048126, + "balance_loss_mlp": 1.00776148, + "epoch": 0.6929588303193536, + "flos": 626918349312.0, + "grad_norm": 0.03515773513290382, + "language_loss": 0.77888995, + "learning_rate": 0.0002275150255017947, + "loss": 0.78937119, + "num_input_tokens_seen": 298854368, + "router_z_loss_mlp": 0.40356445, + "step": 3602, + "time_per_iteration": 2.8207640647888184 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049511, + "balance_loss_mlp": 1.01031494, + "epoch": 0.6931512120046172, + "flos": 1548807568896.0, + "grad_norm": 0.008023369975758985, + "language_loss": 0.75732672, + "learning_rate": 0.0002272538640254511, + "loss": 0.76782179, + "num_input_tokens_seen": 299091664, + "router_z_loss_mlp": 0.39160156, + "step": 3603, + "time_per_iteration": 5.031715631484985 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046665, + "balance_loss_mlp": 1.00765991, + "epoch": 0.6933435936898807, + "flos": 1451326405632.0, + "grad_norm": 0.006067349506475221, + "language_loss": 0.75127101, + "learning_rate": 0.0002269928084385487, + "loss": 0.7617377, + "num_input_tokens_seen": 299312656, + "router_z_loss_mlp": 0.38964844, + "step": 3604, + "time_per_iteration": 4.695592641830444 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045297, + "balance_loss_mlp": 1.00505233, + "epoch": 0.6935359753751443, + "flos": 541931203584.0, + "grad_norm": 0.03296159343177749, + "language_loss": 0.85254478, + "learning_rate": 0.0002267318588424379, + "loss": 0.86299777, + "num_input_tokens_seen": 299381136, + "router_z_loss_mlp": 0.40234375, + "step": 3605, + "time_per_iteration": 2.62107253074646 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043618, + "balance_loss_mlp": 1.00330186, + "epoch": 0.6937283570604078, + "flos": 720691067136.0, + "grad_norm": 0.03433808415235627, + "language_loss": 0.87899154, + "learning_rate": 0.00022647101533842845, + "loss": 0.88942766, + "num_input_tokens_seen": 299455216, + "router_z_loss_mlp": 0.40307617, + "step": 3606, + "time_per_iteration": 2.9330387115478516 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043345, + "balance_loss_mlp": 1.00302887, + "epoch": 0.6939207387456714, + "flos": 523194545664.0, + "grad_norm": 0.042523396404585766, + "language_loss": 0.76967436, + "learning_rate": 0.00022621027802778872, + "loss": 0.7801078, + "num_input_tokens_seen": 299524352, + "router_z_loss_mlp": 0.40307617, + "step": 3607, + "time_per_iteration": 2.6252737045288086 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044527, + "balance_loss_mlp": 1.00430596, + "epoch": 0.694113120430935, + "flos": 536402066688.0, + "grad_norm": 0.03600646931475283, + "language_loss": 0.7913326, + "learning_rate": 0.00022594964701174586, + "loss": 0.80177784, + "num_input_tokens_seen": 299594960, + "router_z_loss_mlp": 0.40209961, + "step": 3608, + "time_per_iteration": 2.674360513687134 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044111, + "balance_loss_mlp": 1.00391352, + "epoch": 0.6943055021161986, + "flos": 524395052544.0, + "grad_norm": 0.03489608183841533, + "language_loss": 0.85372239, + "learning_rate": 0.00022568912239148586, + "loss": 0.86416358, + "num_input_tokens_seen": 299662560, + "router_z_loss_mlp": 0.40185547, + "step": 3609, + "time_per_iteration": 2.610682964324951 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043745, + "balance_loss_mlp": 1.0034523, + "epoch": 0.694497883801462, + "flos": 485971051008.0, + "grad_norm": 0.03140889244124769, + "language_loss": 0.81940842, + "learning_rate": 0.00022542870426815344, + "loss": 0.82984591, + "num_input_tokens_seen": 299734896, + "router_z_loss_mlp": 0.40283203, + "step": 3610, + "time_per_iteration": 2.7095394134521484 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043278, + "balance_loss_mlp": 1.00303328, + "epoch": 0.6946902654867256, + "flos": 462425562624.0, + "grad_norm": 0.03725802111731568, + "language_loss": 0.86767513, + "learning_rate": 0.00022516839274285173, + "loss": 0.87810791, + "num_input_tokens_seen": 299799424, + "router_z_loss_mlp": 0.40234375, + "step": 3611, + "time_per_iteration": 2.5144243240356445 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042525, + "balance_loss_mlp": 1.00223184, + "epoch": 0.6948826471719892, + "flos": 513868502016.0, + "grad_norm": 0.03700002274884872, + "language_loss": 0.75493568, + "learning_rate": 0.00022490818791664265, + "loss": 0.76536095, + "num_input_tokens_seen": 299868272, + "router_z_loss_mlp": 0.40283203, + "step": 3612, + "time_per_iteration": 2.5950610637664795 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044496, + "balance_loss_mlp": 1.00429881, + "epoch": 0.6950750288572528, + "flos": 558256154112.0, + "grad_norm": 0.03078051424242557, + "language_loss": 0.86039829, + "learning_rate": 0.00022464808989054676, + "loss": 0.87084323, + "num_input_tokens_seen": 299939136, + "router_z_loss_mlp": 0.40185547, + "step": 3613, + "time_per_iteration": 2.6489851474761963 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104585, + "balance_loss_mlp": 1.00567627, + "epoch": 0.6952674105425164, + "flos": 543522482688.0, + "grad_norm": 0.037582150054456365, + "language_loss": 0.76400638, + "learning_rate": 0.00022438809876554284, + "loss": 0.77446485, + "num_input_tokens_seen": 300009472, + "router_z_loss_mlp": 0.40161133, + "step": 3614, + "time_per_iteration": 2.666472911834717 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046875, + "balance_loss_mlp": 1.00672579, + "epoch": 0.6954597922277799, + "flos": 547857915648.0, + "grad_norm": 0.03577219625118018, + "language_loss": 0.81085944, + "learning_rate": 0.00022412821464256873, + "loss": 0.82132822, + "num_input_tokens_seen": 300081008, + "router_z_loss_mlp": 0.40136719, + "step": 3615, + "time_per_iteration": 2.6799051761627197 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010449, + "balance_loss_mlp": 1.00479829, + "epoch": 0.6956521739130435, + "flos": 520541765376.0, + "grad_norm": 0.03709092288517812, + "language_loss": 0.82944018, + "learning_rate": 0.00022386843762252023, + "loss": 0.83988917, + "num_input_tokens_seen": 300149856, + "router_z_loss_mlp": 0.40087891, + "step": 3616, + "time_per_iteration": 2.600679397583008 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045093, + "balance_loss_mlp": 1.00496709, + "epoch": 0.695844555598307, + "flos": 467264528640.0, + "grad_norm": 0.03687910314272662, + "language_loss": 0.8069849, + "learning_rate": 0.00022360876780625193, + "loss": 0.81743586, + "num_input_tokens_seen": 300217344, + "router_z_loss_mlp": 0.40112305, + "step": 3617, + "time_per_iteration": 2.5893161296844482 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044723, + "balance_loss_mlp": 1.00462067, + "epoch": 0.6960369372835706, + "flos": 601932248064.0, + "grad_norm": 0.02883770808166936, + "language_loss": 0.80609798, + "learning_rate": 0.00022334920529457604, + "loss": 0.81654525, + "num_input_tokens_seen": 300305584, + "router_z_loss_mlp": 0.40087891, + "step": 3618, + "time_per_iteration": 2.8958587646484375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045417, + "balance_loss_mlp": 1.00538695, + "epoch": 0.6962293189688342, + "flos": 645466424064.0, + "grad_norm": 0.029378827731847603, + "language_loss": 0.88201439, + "learning_rate": 0.00022308975018826423, + "loss": 0.89246857, + "num_input_tokens_seen": 300386480, + "router_z_loss_mlp": 0.40014648, + "step": 3619, + "time_per_iteration": 2.849514961242676 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104494, + "balance_loss_mlp": 1.00476694, + "epoch": 0.6964217006540977, + "flos": 639958674432.0, + "grad_norm": 0.03836514772463411, + "language_loss": 0.84951282, + "learning_rate": 0.00022283040258804564, + "loss": 0.85996217, + "num_input_tokens_seen": 300461840, + "router_z_loss_mlp": 0.40161133, + "step": 3620, + "time_per_iteration": 2.755397319793701 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042358, + "balance_loss_mlp": 1.00220859, + "epoch": 0.6966140823393613, + "flos": 653387826432.0, + "grad_norm": 0.036503412775040926, + "language_loss": 0.84546065, + "learning_rate": 0.00022257116259460802, + "loss": 0.85588425, + "num_input_tokens_seen": 300540400, + "router_z_loss_mlp": 0.40136719, + "step": 3621, + "time_per_iteration": 2.8644983768463135 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043603, + "balance_loss_mlp": 1.00342941, + "epoch": 0.6968064640246249, + "flos": 705825192960.0, + "grad_norm": 0.030665085995137797, + "language_loss": 0.81856084, + "learning_rate": 0.00022231203030859725, + "loss": 0.82899684, + "num_input_tokens_seen": 300624240, + "router_z_loss_mlp": 0.40161133, + "step": 3622, + "time_per_iteration": 3.017775297164917 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048188, + "balance_loss_mlp": 1.00803852, + "epoch": 0.6969988457098885, + "flos": 493531816704.0, + "grad_norm": 0.04078314735210094, + "language_loss": 0.8408944, + "learning_rate": 0.00022205300583061737, + "loss": 0.85137624, + "num_input_tokens_seen": 300689728, + "router_z_loss_mlp": 0.40136719, + "step": 3623, + "time_per_iteration": 2.5776522159576416 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045162, + "balance_loss_mlp": 1.00615692, + "epoch": 0.6971912273951519, + "flos": 1355615377920.0, + "grad_norm": 0.00769674903149883, + "language_loss": 0.82838202, + "learning_rate": 0.00022179408926123063, + "loss": 0.83883369, + "num_input_tokens_seen": 300913152, + "router_z_loss_mlp": 0.38964844, + "step": 3624, + "time_per_iteration": 4.895683288574219 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046564, + "balance_loss_mlp": 1.00653315, + "epoch": 0.6973836090804155, + "flos": 603575049984.0, + "grad_norm": 0.03550964133883238, + "language_loss": 0.77939522, + "learning_rate": 0.00022153528070095735, + "loss": 0.7898609, + "num_input_tokens_seen": 300985824, + "router_z_loss_mlp": 0.40014648, + "step": 3625, + "time_per_iteration": 2.73093581199646 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046221, + "balance_loss_mlp": 1.00628626, + "epoch": 0.6975759907656791, + "flos": 525111468288.0, + "grad_norm": 0.03728439171184861, + "language_loss": 0.88488603, + "learning_rate": 0.00022127658025027568, + "loss": 0.89534825, + "num_input_tokens_seen": 301058048, + "router_z_loss_mlp": 0.39916992, + "step": 3626, + "time_per_iteration": 2.645886182785034 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047821, + "balance_loss_mlp": 1.00771928, + "epoch": 0.6977683724509427, + "flos": 481878636288.0, + "grad_norm": 0.032998889272974, + "language_loss": 0.85482383, + "learning_rate": 0.00022101798800962258, + "loss": 0.86530197, + "num_input_tokens_seen": 301127472, + "router_z_loss_mlp": 0.40087891, + "step": 3627, + "time_per_iteration": 2.6026127338409424 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048765, + "balance_loss_mlp": 1.00875843, + "epoch": 0.6979607541362063, + "flos": 523641698304.0, + "grad_norm": 0.041862603089362516, + "language_loss": 0.79471421, + "learning_rate": 0.00022075950407939227, + "loss": 0.80520177, + "num_input_tokens_seen": 301193920, + "router_z_loss_mlp": 0.39990234, + "step": 3628, + "time_per_iteration": 2.61621356010437 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045103, + "balance_loss_mlp": 1.00514364, + "epoch": 0.6981531358214698, + "flos": 549116748288.0, + "grad_norm": 0.03728815941445965, + "language_loss": 0.83285969, + "learning_rate": 0.0002205011285599367, + "loss": 0.84331071, + "num_input_tokens_seen": 301264256, + "router_z_loss_mlp": 0.39941406, + "step": 3629, + "time_per_iteration": 2.6081953048706055 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01041921, + "balance_loss_mlp": 1.00200999, + "epoch": 0.6983455175067333, + "flos": 701276877312.0, + "grad_norm": 0.05573052179945255, + "language_loss": 0.80735791, + "learning_rate": 0.00022024286155156658, + "loss": 0.81777716, + "num_input_tokens_seen": 301337696, + "router_z_loss_mlp": 0.39892578, + "step": 3630, + "time_per_iteration": 2.828234910964966 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01041029, + "balance_loss_mlp": 1.00106966, + "epoch": 0.6985378991919969, + "flos": 486120750336.0, + "grad_norm": 0.034934255505656486, + "language_loss": 0.86530191, + "learning_rate": 0.00021998470315454994, + "loss": 0.87571216, + "num_input_tokens_seen": 301407776, + "router_z_loss_mlp": 0.39941406, + "step": 3631, + "time_per_iteration": 2.689331293106079 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01040234, + "balance_loss_mlp": 1.00034702, + "epoch": 0.6987302808772605, + "flos": 559893120000.0, + "grad_norm": 0.03380510665243889, + "language_loss": 0.86876583, + "learning_rate": 0.00021972665346911275, + "loss": 0.87916821, + "num_input_tokens_seen": 301475120, + "router_z_loss_mlp": 0.39868164, + "step": 3632, + "time_per_iteration": 2.689023017883301 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01040303, + "balance_loss_mlp": 1.00032043, + "epoch": 0.698922662562524, + "flos": 484568355072.0, + "grad_norm": 0.03644538242957212, + "language_loss": 0.80445158, + "learning_rate": 0.00021946871259543877, + "loss": 0.81485462, + "num_input_tokens_seen": 301542416, + "router_z_loss_mlp": 0.3996582, + "step": 3633, + "time_per_iteration": 2.584099292755127 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01040446, + "balance_loss_mlp": 1.00048685, + "epoch": 0.6991150442477876, + "flos": 720206976000.0, + "grad_norm": 0.03286124329654603, + "language_loss": 0.83436686, + "learning_rate": 0.00021921088063366957, + "loss": 0.84477133, + "num_input_tokens_seen": 301620672, + "router_z_loss_mlp": 0.39941406, + "step": 3634, + "time_per_iteration": 2.9156620502471924 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043215, + "balance_loss_mlp": 1.00328004, + "epoch": 0.6993074259330512, + "flos": 490160675328.0, + "grad_norm": 0.03268452893811677, + "language_loss": 0.82517856, + "learning_rate": 0.00021895315768390435, + "loss": 0.83561075, + "num_input_tokens_seen": 301688016, + "router_z_loss_mlp": 0.39916992, + "step": 3635, + "time_per_iteration": 2.5866551399230957 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051152, + "balance_loss_mlp": 1.01126432, + "epoch": 0.6994998076183148, + "flos": 719469172992.0, + "grad_norm": 0.02932000302360117, + "language_loss": 0.88269186, + "learning_rate": 0.00021869554384619999, + "loss": 0.89320338, + "num_input_tokens_seen": 301771184, + "router_z_loss_mlp": 0.39868164, + "step": 3636, + "time_per_iteration": 2.971536159515381 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050583, + "balance_loss_mlp": 1.01057589, + "epoch": 0.6996921893035783, + "flos": 580164676608.0, + "grad_norm": 0.03639524799705141, + "language_loss": 0.81240088, + "learning_rate": 0.00021843803922057115, + "loss": 0.82290673, + "num_input_tokens_seen": 301844528, + "router_z_loss_mlp": 0.39990234, + "step": 3637, + "time_per_iteration": 2.725170612335205 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050538, + "balance_loss_mlp": 1.01060295, + "epoch": 0.6998845709888418, + "flos": 519675650304.0, + "grad_norm": 0.03468807829141317, + "language_loss": 0.82837808, + "learning_rate": 0.00021818064390698977, + "loss": 0.83888352, + "num_input_tokens_seen": 301914960, + "router_z_loss_mlp": 0.39916992, + "step": 3638, + "time_per_iteration": 2.633237838745117 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043144, + "balance_loss_mlp": 1.00311363, + "epoch": 0.7000769526741054, + "flos": 622096879872.0, + "grad_norm": 0.03453806338856074, + "language_loss": 0.87273943, + "learning_rate": 0.0002179233580053861, + "loss": 0.8831709, + "num_input_tokens_seen": 301986352, + "router_z_loss_mlp": 0.40014648, + "step": 3639, + "time_per_iteration": 2.7544472217559814 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043096, + "balance_loss_mlp": 1.0029943, + "epoch": 0.700269334359369, + "flos": 561056688384.0, + "grad_norm": 0.033530662596956085, + "language_loss": 0.85948008, + "learning_rate": 0.00021766618161564688, + "loss": 0.86991107, + "num_input_tokens_seen": 302060544, + "router_z_loss_mlp": 0.40087891, + "step": 3640, + "time_per_iteration": 2.7110095024108887 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01041375, + "balance_loss_mlp": 1.00132048, + "epoch": 0.7004617160446326, + "flos": 484362275328.0, + "grad_norm": 0.03557696097422109, + "language_loss": 0.87556666, + "learning_rate": 0.00021740911483761677, + "loss": 0.88598037, + "num_input_tokens_seen": 302127232, + "router_z_loss_mlp": 0.40039062, + "step": 3641, + "time_per_iteration": 2.5866122245788574 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042265, + "balance_loss_mlp": 1.00230658, + "epoch": 0.7006540977298961, + "flos": 698322753024.0, + "grad_norm": 0.029252813269696705, + "language_loss": 0.92278117, + "learning_rate": 0.00021715215777109837, + "loss": 0.93320382, + "num_input_tokens_seen": 302207056, + "router_z_loss_mlp": 0.39941406, + "step": 3642, + "time_per_iteration": 2.9658164978027344 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055988, + "balance_loss_mlp": 1.01600468, + "epoch": 0.7008464794151597, + "flos": 505771155456.0, + "grad_norm": 0.0370639666427534, + "language_loss": 0.84983593, + "learning_rate": 0.00021689531051585103, + "loss": 0.86039579, + "num_input_tokens_seen": 302275632, + "router_z_loss_mlp": 0.3996582, + "step": 3643, + "time_per_iteration": 2.605422019958496 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055864, + "balance_loss_mlp": 1.01583362, + "epoch": 0.7010388611004232, + "flos": 538273302528.0, + "grad_norm": 0.03585337078400258, + "language_loss": 0.8111937, + "learning_rate": 0.00021663857317159196, + "loss": 0.82175231, + "num_input_tokens_seen": 302343600, + "router_z_loss_mlp": 0.40014648, + "step": 3644, + "time_per_iteration": 2.601376533508301 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053049, + "balance_loss_mlp": 1.01304281, + "epoch": 0.7012312427856868, + "flos": 548315761920.0, + "grad_norm": 0.03435070912909032, + "language_loss": 0.82316148, + "learning_rate": 0.00021638194583799487, + "loss": 0.83369195, + "num_input_tokens_seen": 302414656, + "router_z_loss_mlp": 0.39990234, + "step": 3645, + "time_per_iteration": 2.686854839324951 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046541, + "balance_loss_mlp": 1.00636709, + "epoch": 0.7014236244709504, + "flos": 942974413056.0, + "grad_norm": 0.03710405842133189, + "language_loss": 0.83184248, + "learning_rate": 0.00021612542861469176, + "loss": 0.84230787, + "num_input_tokens_seen": 302495120, + "router_z_loss_mlp": 0.40161133, + "step": 3646, + "time_per_iteration": 3.2522597312927246 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01056268, + "balance_loss_mlp": 1.01623774, + "epoch": 0.7016160061562139, + "flos": 526209907968.0, + "grad_norm": 0.03458129081843451, + "language_loss": 0.82967472, + "learning_rate": 0.00021586902160127135, + "loss": 0.84023744, + "num_input_tokens_seen": 302563024, + "router_z_loss_mlp": 0.40014648, + "step": 3647, + "time_per_iteration": 2.592898368835449 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054208, + "balance_loss_mlp": 1.01410604, + "epoch": 0.7018083878414775, + "flos": 374245165056.0, + "grad_norm": 0.045887676858618894, + "language_loss": 0.74931926, + "learning_rate": 0.00021561272489727974, + "loss": 0.75986135, + "num_input_tokens_seen": 302624544, + "router_z_loss_mlp": 0.40087891, + "step": 3648, + "time_per_iteration": 2.46085786819458 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047959, + "balance_loss_mlp": 1.00785732, + "epoch": 0.7020007695267411, + "flos": 528834498048.0, + "grad_norm": 0.03554324535987718, + "language_loss": 0.81039417, + "learning_rate": 0.0002153565386022199, + "loss": 0.82087374, + "num_input_tokens_seen": 302697856, + "router_z_loss_mlp": 0.40087891, + "step": 3649, + "time_per_iteration": 2.695328712463379 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104917, + "balance_loss_mlp": 1.00897217, + "epoch": 0.7021931512120047, + "flos": 691373423616.0, + "grad_norm": 0.035617603587249046, + "language_loss": 0.82844687, + "learning_rate": 0.00021510046281555262, + "loss": 0.83893853, + "num_input_tokens_seen": 302771984, + "router_z_loss_mlp": 0.40185547, + "step": 3650, + "time_per_iteration": 2.8195676803588867 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047858, + "balance_loss_mlp": 1.00761259, + "epoch": 0.7023855328972681, + "flos": 640926856704.0, + "grad_norm": 0.042051655567710275, + "language_loss": 0.82163751, + "learning_rate": 0.0002148444976366949, + "loss": 0.83211613, + "num_input_tokens_seen": 302838832, + "router_z_loss_mlp": 0.40234375, + "step": 3651, + "time_per_iteration": 2.7910640239715576 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049797, + "balance_loss_mlp": 1.00969481, + "epoch": 0.7025779145825317, + "flos": 562007374080.0, + "grad_norm": 0.03669409965522196, + "language_loss": 0.8294403, + "learning_rate": 0.00021458864316502136, + "loss": 0.83993822, + "num_input_tokens_seen": 302909952, + "router_z_loss_mlp": 0.40087891, + "step": 3652, + "time_per_iteration": 2.7377076148986816 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050304, + "balance_loss_mlp": 1.01020181, + "epoch": 0.7027702962677953, + "flos": 448371368448.0, + "grad_norm": 0.037398832167444995, + "language_loss": 0.87441307, + "learning_rate": 0.0002143328994998634, + "loss": 0.88491613, + "num_input_tokens_seen": 302973056, + "router_z_loss_mlp": 0.40087891, + "step": 3653, + "time_per_iteration": 2.510070323944092 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048411, + "balance_loss_mlp": 1.00833249, + "epoch": 0.7029626779530589, + "flos": 623714403840.0, + "grad_norm": 0.0361167635185571, + "language_loss": 0.78985465, + "learning_rate": 0.00021407726674050982, + "loss": 0.80033875, + "num_input_tokens_seen": 303054656, + "router_z_loss_mlp": 0.40063477, + "step": 3654, + "time_per_iteration": 2.8577005863189697 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049092, + "balance_loss_mlp": 1.00903809, + "epoch": 0.7031550596383225, + "flos": 630734697984.0, + "grad_norm": 0.031984411751134825, + "language_loss": 0.87403131, + "learning_rate": 0.0002138217449862061, + "loss": 0.88452226, + "num_input_tokens_seen": 303124256, + "router_z_loss_mlp": 0.40039062, + "step": 3655, + "time_per_iteration": 2.731257915496826 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051587, + "balance_loss_mlp": 1.01160455, + "epoch": 0.703347441323586, + "flos": 531860553984.0, + "grad_norm": 0.032014026327257146, + "language_loss": 0.7905367, + "learning_rate": 0.00021356633433615403, + "loss": 0.80105257, + "num_input_tokens_seen": 303192720, + "router_z_loss_mlp": 0.3996582, + "step": 3656, + "time_per_iteration": 2.6462786197662354 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051956, + "balance_loss_mlp": 1.01192546, + "epoch": 0.7035398230088495, + "flos": 694916618496.0, + "grad_norm": 0.025544718758457735, + "language_loss": 0.83906752, + "learning_rate": 0.0002133110348895133, + "loss": 0.84958708, + "num_input_tokens_seen": 303275968, + "router_z_loss_mlp": 0.40014648, + "step": 3657, + "time_per_iteration": 2.968036413192749 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051549, + "balance_loss_mlp": 1.01158977, + "epoch": 0.7037322046941131, + "flos": 969667466496.0, + "grad_norm": 0.030163391429171182, + "language_loss": 0.85463339, + "learning_rate": 0.0002130558467453999, + "loss": 0.8651489, + "num_input_tokens_seen": 303367296, + "router_z_loss_mlp": 0.39941406, + "step": 3658, + "time_per_iteration": 3.3951528072357178 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047875, + "balance_loss_mlp": 1.00789237, + "epoch": 0.7039245863793767, + "flos": 503926164480.0, + "grad_norm": 0.029582354045105844, + "language_loss": 0.84755009, + "learning_rate": 0.0002128007700028865, + "loss": 0.85802877, + "num_input_tokens_seen": 303442768, + "router_z_loss_mlp": 0.3996582, + "step": 3659, + "time_per_iteration": 2.754249334335327 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044711, + "balance_loss_mlp": 1.00460947, + "epoch": 0.7041169680646402, + "flos": 466938885120.0, + "grad_norm": 0.03694565934757681, + "language_loss": 0.8474158, + "learning_rate": 0.00021254580476100276, + "loss": 0.85786295, + "num_input_tokens_seen": 303508304, + "router_z_loss_mlp": 0.40087891, + "step": 3660, + "time_per_iteration": 2.576219081878662 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043606, + "balance_loss_mlp": 1.00359917, + "epoch": 0.7043093497499038, + "flos": 633322349568.0, + "grad_norm": 0.037641747763634714, + "language_loss": 0.79470807, + "learning_rate": 0.00021229095111873497, + "loss": 0.80514407, + "num_input_tokens_seen": 303579312, + "router_z_loss_mlp": 0.39990234, + "step": 3661, + "time_per_iteration": 2.7775161266326904 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043368, + "balance_loss_mlp": 1.00333726, + "epoch": 0.7045017314351674, + "flos": 544096002048.0, + "grad_norm": 0.03023690962448049, + "language_loss": 0.86693418, + "learning_rate": 0.0002120362091750261, + "loss": 0.87736779, + "num_input_tokens_seen": 303658384, + "router_z_loss_mlp": 0.40014648, + "step": 3662, + "time_per_iteration": 2.815168857574463 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042656, + "balance_loss_mlp": 1.00267351, + "epoch": 0.704694113120431, + "flos": 429141871104.0, + "grad_norm": 0.036907150984541, + "language_loss": 0.87510955, + "learning_rate": 0.00021178157902877566, + "loss": 0.88553607, + "num_input_tokens_seen": 303721136, + "router_z_loss_mlp": 0.3996582, + "step": 3663, + "time_per_iteration": 2.458578109741211 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104365, + "balance_loss_mlp": 1.00373864, + "epoch": 0.7048864948056945, + "flos": 651713922048.0, + "grad_norm": 0.04106624653226338, + "language_loss": 0.87760627, + "learning_rate": 0.0002115270607788397, + "loss": 0.88804281, + "num_input_tokens_seen": 303792368, + "router_z_loss_mlp": 0.39892578, + "step": 3664, + "time_per_iteration": 2.756804943084717 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044342, + "balance_loss_mlp": 1.00445461, + "epoch": 0.705078876490958, + "flos": 413494452480.0, + "grad_norm": 0.03442797785772838, + "language_loss": 0.86509478, + "learning_rate": 0.00021127265452403133, + "loss": 0.87553817, + "num_input_tokens_seen": 303856336, + "router_z_loss_mlp": 0.39868164, + "step": 3665, + "time_per_iteration": 2.534076690673828 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043503, + "balance_loss_mlp": 1.0045929, + "epoch": 0.7052712581762216, + "flos": 1423150943232.0, + "grad_norm": 0.008458198264264957, + "language_loss": 0.84091628, + "learning_rate": 0.0002110183603631199, + "loss": 0.85135132, + "num_input_tokens_seen": 304089856, + "router_z_loss_mlp": 0.38867188, + "step": 3666, + "time_per_iteration": 4.859800815582275 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042605, + "balance_loss_mlp": 1.00266969, + "epoch": 0.7054636398614852, + "flos": 494070342912.0, + "grad_norm": 0.037128971718994215, + "language_loss": 0.833794, + "learning_rate": 0.00021076417839483065, + "loss": 0.84422016, + "num_input_tokens_seen": 304164752, + "router_z_loss_mlp": 0.39916992, + "step": 3667, + "time_per_iteration": 2.7798430919647217 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042765, + "balance_loss_mlp": 1.00283003, + "epoch": 0.7056560215467488, + "flos": 451377982464.0, + "grad_norm": 0.031014936324499143, + "language_loss": 0.85416818, + "learning_rate": 0.00021051010871784589, + "loss": 0.86459577, + "num_input_tokens_seen": 304229568, + "router_z_loss_mlp": 0.39916992, + "step": 3668, + "time_per_iteration": 2.560455560684204 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043031, + "balance_loss_mlp": 1.00304842, + "epoch": 0.7058484032320124, + "flos": 566818149888.0, + "grad_norm": 0.030353159640158514, + "language_loss": 0.79448986, + "learning_rate": 0.0002102561514308045, + "loss": 0.8049202, + "num_input_tokens_seen": 304299408, + "router_z_loss_mlp": 0.3996582, + "step": 3669, + "time_per_iteration": 2.7246358394622803 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042421, + "balance_loss_mlp": 1.00234294, + "epoch": 0.7060407849172758, + "flos": 568103227392.0, + "grad_norm": 0.03405380367536788, + "language_loss": 0.82700998, + "learning_rate": 0.00021000230663230135, + "loss": 0.83743417, + "num_input_tokens_seen": 304367936, + "router_z_loss_mlp": 0.40063477, + "step": 3670, + "time_per_iteration": 2.6809375286102295 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104297, + "balance_loss_mlp": 1.00293946, + "epoch": 0.7062331666025394, + "flos": 469713174528.0, + "grad_norm": 0.035705889445470915, + "language_loss": 0.83772206, + "learning_rate": 0.00020974857442088762, + "loss": 0.8481518, + "num_input_tokens_seen": 304438368, + "router_z_loss_mlp": 0.40014648, + "step": 3671, + "time_per_iteration": 2.6487808227539062 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043458, + "balance_loss_mlp": 1.00330818, + "epoch": 0.706425548287803, + "flos": 596417695488.0, + "grad_norm": 0.03583731061026118, + "language_loss": 0.89143217, + "learning_rate": 0.00020949495489507104, + "loss": 0.90186673, + "num_input_tokens_seen": 304508720, + "router_z_loss_mlp": 0.40136719, + "step": 3672, + "time_per_iteration": 2.704887628555298 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043397, + "balance_loss_mlp": 1.00331914, + "epoch": 0.7066179299730666, + "flos": 476814148608.0, + "grad_norm": 0.034102097435369114, + "language_loss": 0.84997833, + "learning_rate": 0.00020924144815331525, + "loss": 0.86041224, + "num_input_tokens_seen": 304576128, + "router_z_loss_mlp": 0.40063477, + "step": 3673, + "time_per_iteration": 2.5945112705230713 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042664, + "balance_loss_mlp": 1.0026809, + "epoch": 0.7068103116583301, + "flos": 507436311552.0, + "grad_norm": 0.033684521411270194, + "language_loss": 0.83985698, + "learning_rate": 0.00020898805429404044, + "loss": 0.85028362, + "num_input_tokens_seen": 304642416, + "router_z_loss_mlp": 0.3996582, + "step": 3674, + "time_per_iteration": 2.5818920135498047 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042668, + "balance_loss_mlp": 1.00266123, + "epoch": 0.7070026933435937, + "flos": 680575664640.0, + "grad_norm": 0.03512873001655734, + "language_loss": 0.78734016, + "learning_rate": 0.0002087347734156228, + "loss": 0.7977668, + "num_input_tokens_seen": 304719312, + "router_z_loss_mlp": 0.39990234, + "step": 3675, + "time_per_iteration": 2.8316643238067627 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044552, + "balance_loss_mlp": 1.00447345, + "epoch": 0.7071950750288573, + "flos": 473166941184.0, + "grad_norm": 0.03289895415072129, + "language_loss": 0.79907787, + "learning_rate": 0.00020848160561639452, + "loss": 0.8095234, + "num_input_tokens_seen": 304789296, + "router_z_loss_mlp": 0.40063477, + "step": 3676, + "time_per_iteration": 2.662691354751587 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043714, + "balance_loss_mlp": 1.00358856, + "epoch": 0.7073874567141208, + "flos": 474684343296.0, + "grad_norm": 0.031178438211795275, + "language_loss": 0.86372793, + "learning_rate": 0.0002082285509946445, + "loss": 0.87416512, + "num_input_tokens_seen": 304854320, + "router_z_loss_mlp": 0.40112305, + "step": 3677, + "time_per_iteration": 2.54286789894104 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043907, + "balance_loss_mlp": 1.0038054, + "epoch": 0.7075798383993844, + "flos": 547037487360.0, + "grad_norm": 0.033007214142821914, + "language_loss": 0.83766264, + "learning_rate": 0.00020797560964861683, + "loss": 0.84810174, + "num_input_tokens_seen": 304932784, + "router_z_loss_mlp": 0.40087891, + "step": 3678, + "time_per_iteration": 2.7636282444000244 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043429, + "balance_loss_mlp": 1.00335097, + "epoch": 0.7077722200846479, + "flos": 663391401984.0, + "grad_norm": 0.033779282823635445, + "language_loss": 0.81209165, + "learning_rate": 0.0002077227816765122, + "loss": 0.82252598, + "num_input_tokens_seen": 305018080, + "router_z_loss_mlp": 0.40063477, + "step": 3679, + "time_per_iteration": 3.0056393146514893 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047443, + "balance_loss_mlp": 1.00824738, + "epoch": 0.7079646017699115, + "flos": 1533303046656.0, + "grad_norm": 0.005266739458106997, + "language_loss": 0.76447725, + "learning_rate": 0.0002074700671764869, + "loss": 0.7749517, + "num_input_tokens_seen": 305241216, + "router_z_loss_mlp": 0.39160156, + "step": 3680, + "time_per_iteration": 4.76727819442749 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104493, + "balance_loss_mlp": 1.00492287, + "epoch": 0.7081569834551751, + "flos": 622646099712.0, + "grad_norm": 0.03129589389619307, + "language_loss": 0.78969026, + "learning_rate": 0.00020721746624665383, + "loss": 0.80013955, + "num_input_tokens_seen": 305311376, + "router_z_loss_mlp": 0.39990234, + "step": 3681, + "time_per_iteration": 2.72866153717041 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044317, + "balance_loss_mlp": 1.00419092, + "epoch": 0.7083493651404387, + "flos": 796035273984.0, + "grad_norm": 0.031303476473040825, + "language_loss": 0.80593359, + "learning_rate": 0.00020696497898508114, + "loss": 0.81637675, + "num_input_tokens_seen": 305392736, + "router_z_loss_mlp": 0.40112305, + "step": 3682, + "time_per_iteration": 3.041132926940918 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044378, + "balance_loss_mlp": 1.00425231, + "epoch": 0.7085417468257021, + "flos": 815162704128.0, + "grad_norm": 0.03799512363441117, + "language_loss": 0.78282857, + "learning_rate": 0.00020671260548979316, + "loss": 0.79327232, + "num_input_tokens_seen": 305470896, + "router_z_loss_mlp": 0.40112305, + "step": 3683, + "time_per_iteration": 2.980470895767212 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046933, + "balance_loss_mlp": 1.00675917, + "epoch": 0.7087341285109657, + "flos": 701797906944.0, + "grad_norm": 0.03765603647775186, + "language_loss": 0.85959506, + "learning_rate": 0.00020646034585876982, + "loss": 0.87006438, + "num_input_tokens_seen": 305547072, + "router_z_loss_mlp": 0.40161133, + "step": 3684, + "time_per_iteration": 2.83225417137146 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043586, + "balance_loss_mlp": 1.00350797, + "epoch": 0.7089265101962293, + "flos": 597735820800.0, + "grad_norm": 0.030001144776417084, + "language_loss": 0.8503226, + "learning_rate": 0.00020620820018994718, + "loss": 0.86075842, + "num_input_tokens_seen": 305624512, + "router_z_loss_mlp": 0.40063477, + "step": 3685, + "time_per_iteration": 2.808814287185669 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043428, + "balance_loss_mlp": 1.00334978, + "epoch": 0.7091188918814929, + "flos": 488167930368.0, + "grad_norm": 0.039691244265052834, + "language_loss": 0.82984829, + "learning_rate": 0.00020595616858121675, + "loss": 0.84028256, + "num_input_tokens_seen": 305695088, + "router_z_loss_mlp": 0.40063477, + "step": 3686, + "time_per_iteration": 2.696423292160034 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104324, + "balance_loss_mlp": 1.00316179, + "epoch": 0.7093112735667565, + "flos": 601256661504.0, + "grad_norm": 0.03416651463344776, + "language_loss": 0.81164849, + "learning_rate": 0.00020570425113042586, + "loss": 0.82208097, + "num_input_tokens_seen": 305763680, + "router_z_loss_mlp": 0.40063477, + "step": 3687, + "time_per_iteration": 2.735722303390503 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042849, + "balance_loss_mlp": 1.00281906, + "epoch": 0.70950365525202, + "flos": 506850153216.0, + "grad_norm": 0.03675476987666338, + "language_loss": 0.86545879, + "learning_rate": 0.0002054524479353776, + "loss": 0.87588727, + "num_input_tokens_seen": 305835008, + "router_z_loss_mlp": 0.40014648, + "step": 3688, + "time_per_iteration": 2.6537790298461914 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042758, + "balance_loss_mlp": 1.0026803, + "epoch": 0.7096960369372836, + "flos": 733425190656.0, + "grad_norm": 0.03699911632186226, + "language_loss": 0.81610233, + "learning_rate": 0.00020520075909383063, + "loss": 0.82652992, + "num_input_tokens_seen": 305909072, + "router_z_loss_mlp": 0.40063477, + "step": 3689, + "time_per_iteration": 2.8920962810516357 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045653, + "balance_loss_mlp": 1.00576556, + "epoch": 0.7098884186225471, + "flos": 973652956416.0, + "grad_norm": 0.0320857463001868, + "language_loss": 0.811288, + "learning_rate": 0.00020494918470349916, + "loss": 0.82174444, + "num_input_tokens_seen": 305994752, + "router_z_loss_mlp": 0.39868164, + "step": 3690, + "time_per_iteration": 3.3136045932769775 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045022, + "balance_loss_mlp": 1.00513482, + "epoch": 0.7100808003078107, + "flos": 505258874112.0, + "grad_norm": 0.03898509483209187, + "language_loss": 0.86111224, + "learning_rate": 0.00020469772486205297, + "loss": 0.87156248, + "num_input_tokens_seen": 306062960, + "router_z_loss_mlp": 0.39868164, + "step": 3691, + "time_per_iteration": 2.6186795234680176 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047063, + "balance_loss_mlp": 1.00715148, + "epoch": 0.7102731819930742, + "flos": 541390732032.0, + "grad_norm": 0.07359850513533242, + "language_loss": 0.81684911, + "learning_rate": 0.0002044463796671177, + "loss": 0.82731974, + "num_input_tokens_seen": 306134224, + "router_z_loss_mlp": 0.39892578, + "step": 3692, + "time_per_iteration": 2.7347307205200195 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047542, + "balance_loss_mlp": 1.00767887, + "epoch": 0.7104655636783378, + "flos": 621628339968.0, + "grad_norm": 0.03494472731168418, + "language_loss": 0.80876124, + "learning_rate": 0.00020419514921627408, + "loss": 0.8192367, + "num_input_tokens_seen": 306214512, + "router_z_loss_mlp": 0.3984375, + "step": 3693, + "time_per_iteration": 2.9353420734405518 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045573, + "balance_loss_mlp": 1.00568545, + "epoch": 0.7106579453636014, + "flos": 558377663232.0, + "grad_norm": 0.034076048259573104, + "language_loss": 0.77580255, + "learning_rate": 0.00020394403360705855, + "loss": 0.78625828, + "num_input_tokens_seen": 306283232, + "router_z_loss_mlp": 0.39868164, + "step": 3694, + "time_per_iteration": 2.7425014972686768 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01041714, + "balance_loss_mlp": 1.00187469, + "epoch": 0.710850327048865, + "flos": 514063888128.0, + "grad_norm": 0.03425732262265505, + "language_loss": 0.88495499, + "learning_rate": 0.00020369303293696228, + "loss": 0.89537215, + "num_input_tokens_seen": 306351536, + "router_z_loss_mlp": 0.39819336, + "step": 3695, + "time_per_iteration": 2.6524975299835205 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01041228, + "balance_loss_mlp": 1.00138831, + "epoch": 0.7110427087341286, + "flos": 424507039488.0, + "grad_norm": 0.03544655381873144, + "language_loss": 0.78715348, + "learning_rate": 0.00020344214730343304, + "loss": 0.79756576, + "num_input_tokens_seen": 306419040, + "router_z_loss_mlp": 0.39819336, + "step": 3696, + "time_per_iteration": 2.5949435234069824 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044579, + "balance_loss_mlp": 1.0046916, + "epoch": 0.711235090419392, + "flos": 578654077440.0, + "grad_norm": 0.028723552959570162, + "language_loss": 0.79433203, + "learning_rate": 0.00020319137680387296, + "loss": 0.80477786, + "num_input_tokens_seen": 306503248, + "router_z_loss_mlp": 0.39868164, + "step": 3697, + "time_per_iteration": 2.9308555126190186 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044656, + "balance_loss_mlp": 1.00476861, + "epoch": 0.7114274721046556, + "flos": 448985716992.0, + "grad_norm": 0.03974363326367457, + "language_loss": 0.81048799, + "learning_rate": 0.0002029407215356398, + "loss": 0.82093453, + "num_input_tokens_seen": 306566288, + "router_z_loss_mlp": 0.39868164, + "step": 3698, + "time_per_iteration": 2.51846981048584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047508, + "balance_loss_mlp": 1.00747764, + "epoch": 0.7116198537899192, + "flos": 623093252352.0, + "grad_norm": 0.03573092214562991, + "language_loss": 0.83794999, + "learning_rate": 0.00020269018159604663, + "loss": 0.84842503, + "num_input_tokens_seen": 306633344, + "router_z_loss_mlp": 0.40014648, + "step": 3699, + "time_per_iteration": 2.7074286937713623 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047537, + "balance_loss_mlp": 1.00760162, + "epoch": 0.7118122354751828, + "flos": 499720988928.0, + "grad_norm": 0.03677211843520988, + "language_loss": 0.82181633, + "learning_rate": 0.00020243975708236162, + "loss": 0.83229172, + "num_input_tokens_seen": 306701328, + "router_z_loss_mlp": 0.39916992, + "step": 3700, + "time_per_iteration": 2.564375877380371 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046517, + "balance_loss_mlp": 1.00660574, + "epoch": 0.7120046171604463, + "flos": 573845246976.0, + "grad_norm": 0.03454353277878698, + "language_loss": 0.86407083, + "learning_rate": 0.00020218944809180818, + "loss": 0.87453598, + "num_input_tokens_seen": 306773168, + "router_z_loss_mlp": 0.39892578, + "step": 3701, + "time_per_iteration": 2.7084884643554688 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046652, + "balance_loss_mlp": 1.00657344, + "epoch": 0.7121969988457099, + "flos": 573771369984.0, + "grad_norm": 0.03303682180607054, + "language_loss": 0.8533892, + "learning_rate": 0.00020193925472156493, + "loss": 0.86385572, + "num_input_tokens_seen": 306845312, + "router_z_loss_mlp": 0.40063477, + "step": 3702, + "time_per_iteration": 2.7079381942749023 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044945, + "balance_loss_mlp": 1.00603485, + "epoch": 0.7123893805309734, + "flos": 1526823224064.0, + "grad_norm": 0.008337798105396301, + "language_loss": 0.74289167, + "learning_rate": 0.00020168917706876537, + "loss": 0.75334108, + "num_input_tokens_seen": 307079216, + "router_z_loss_mlp": 0.38867188, + "step": 3703, + "time_per_iteration": 4.8932740688323975 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01057166, + "balance_loss_mlp": 1.01696837, + "epoch": 0.712581762216237, + "flos": 616414152960.0, + "grad_norm": 0.03156423949245577, + "language_loss": 0.84361899, + "learning_rate": 0.00020143921523049863, + "loss": 0.85419071, + "num_input_tokens_seen": 307163568, + "router_z_loss_mlp": 0.40185547, + "step": 3704, + "time_per_iteration": 2.9233312606811523 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052935, + "balance_loss_mlp": 1.01285696, + "epoch": 0.7127741439015006, + "flos": 598875089664.0, + "grad_norm": 0.03941549169831495, + "language_loss": 0.84401309, + "learning_rate": 0.00020118936930380837, + "loss": 0.85454243, + "num_input_tokens_seen": 307232800, + "router_z_loss_mlp": 0.40063477, + "step": 3705, + "time_per_iteration": 2.7015953063964844 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047968, + "balance_loss_mlp": 1.00774693, + "epoch": 0.7129665255867641, + "flos": 538440498432.0, + "grad_norm": 0.03692779593562928, + "language_loss": 0.81897098, + "learning_rate": 0.0002009396393856932, + "loss": 0.82945073, + "num_input_tokens_seen": 307307216, + "router_z_loss_mlp": 0.40209961, + "step": 3706, + "time_per_iteration": 2.649216890335083 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047729, + "balance_loss_mlp": 1.00746036, + "epoch": 0.7131589072720277, + "flos": 527521230336.0, + "grad_norm": 0.035672100544370096, + "language_loss": 0.82740968, + "learning_rate": 0.00020069002557310673, + "loss": 0.83788699, + "num_input_tokens_seen": 307377472, + "router_z_loss_mlp": 0.40258789, + "step": 3707, + "time_per_iteration": 2.670691967010498 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043206, + "balance_loss_mlp": 1.00281823, + "epoch": 0.7133512889572913, + "flos": 532097736192.0, + "grad_norm": 0.0323096227749812, + "language_loss": 0.77545685, + "learning_rate": 0.00020044052796295807, + "loss": 0.78588891, + "num_input_tokens_seen": 307456880, + "router_z_loss_mlp": 0.40380859, + "step": 3708, + "time_per_iteration": 2.791064500808716 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048487, + "balance_loss_mlp": 1.00821805, + "epoch": 0.7135436706425549, + "flos": 504551206656.0, + "grad_norm": 0.04325770643622515, + "language_loss": 0.82374418, + "learning_rate": 0.00020019114665211063, + "loss": 0.83422899, + "num_input_tokens_seen": 307524784, + "router_z_loss_mlp": 0.40258789, + "step": 3709, + "time_per_iteration": 2.6297860145568848 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046781, + "balance_loss_mlp": 1.00648808, + "epoch": 0.7137360523278183, + "flos": 516968434944.0, + "grad_norm": 0.035345949050593885, + "language_loss": 0.81970435, + "learning_rate": 0.00019994188173738276, + "loss": 0.83017212, + "num_input_tokens_seen": 307591408, + "router_z_loss_mlp": 0.40283203, + "step": 3710, + "time_per_iteration": 2.6330204010009766 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047924, + "balance_loss_mlp": 1.00755966, + "epoch": 0.7139284340130819, + "flos": 511537474560.0, + "grad_norm": 0.03739330083001905, + "language_loss": 0.81062478, + "learning_rate": 0.0001996927333155477, + "loss": 0.82110405, + "num_input_tokens_seen": 307662912, + "router_z_loss_mlp": 0.40356445, + "step": 3711, + "time_per_iteration": 2.74644136428833 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049541, + "balance_loss_mlp": 1.0092001, + "epoch": 0.7141208156983455, + "flos": 891800736768.0, + "grad_norm": 0.03143322017513776, + "language_loss": 0.85805249, + "learning_rate": 0.00019944370148333346, + "loss": 0.86854792, + "num_input_tokens_seen": 307752256, + "router_z_loss_mlp": 0.40332031, + "step": 3712, + "time_per_iteration": 3.1481471061706543 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049102, + "balance_loss_mlp": 1.00871384, + "epoch": 0.7143131973836091, + "flos": 536884212480.0, + "grad_norm": 0.034489718193939395, + "language_loss": 0.80643392, + "learning_rate": 0.00019919478633742278, + "loss": 0.81692493, + "num_input_tokens_seen": 307821504, + "router_z_loss_mlp": 0.40380859, + "step": 3713, + "time_per_iteration": 2.6485395431518555 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048925, + "balance_loss_mlp": 1.00848949, + "epoch": 0.7145055790688727, + "flos": 474627962880.0, + "grad_norm": 0.04039016318386717, + "language_loss": 0.85767764, + "learning_rate": 0.00019894598797445302, + "loss": 0.86816686, + "num_input_tokens_seen": 307886464, + "router_z_loss_mlp": 0.40429688, + "step": 3714, + "time_per_iteration": 2.5401811599731445 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050713, + "balance_loss_mlp": 1.01037288, + "epoch": 0.7146979607541362, + "flos": 571702802688.0, + "grad_norm": 0.03221862991626059, + "language_loss": 0.82471192, + "learning_rate": 0.00019869730649101615, + "loss": 0.83521909, + "num_input_tokens_seen": 307962736, + "router_z_loss_mlp": 0.40332031, + "step": 3715, + "time_per_iteration": 2.75704288482666 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105074, + "balance_loss_mlp": 1.0103991, + "epoch": 0.7148903424393998, + "flos": 841139341824.0, + "grad_norm": 0.03811132383920714, + "language_loss": 0.72900105, + "learning_rate": 0.00019844874198365943, + "loss": 0.73950851, + "num_input_tokens_seen": 308046592, + "router_z_loss_mlp": 0.40332031, + "step": 3716, + "time_per_iteration": 3.115915536880493 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049607, + "balance_loss_mlp": 1.00921834, + "epoch": 0.7150827241246633, + "flos": 542879943936.0, + "grad_norm": 0.037838986549668586, + "language_loss": 0.84377575, + "learning_rate": 0.00019820029454888362, + "loss": 0.85427183, + "num_input_tokens_seen": 308119920, + "router_z_loss_mlp": 0.40380859, + "step": 3717, + "time_per_iteration": 2.7640199661254883 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052212, + "balance_loss_mlp": 1.01282501, + "epoch": 0.7152751058099269, + "flos": 1587190741248.0, + "grad_norm": 0.009155096775058921, + "language_loss": 0.74521267, + "learning_rate": 0.00019795196428314455, + "loss": 0.7557348, + "num_input_tokens_seen": 308361024, + "router_z_loss_mlp": 0.39355469, + "step": 3718, + "time_per_iteration": 5.020099639892578 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043306, + "balance_loss_mlp": 1.00296581, + "epoch": 0.7154674874951905, + "flos": 518429456640.0, + "grad_norm": 0.0370915875215028, + "language_loss": 0.80511153, + "learning_rate": 0.0001977037512828529, + "loss": 0.81554461, + "num_input_tokens_seen": 308429808, + "router_z_loss_mlp": 0.40332031, + "step": 3719, + "time_per_iteration": 2.593027114868164 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043585, + "balance_loss_mlp": 1.00326824, + "epoch": 0.715659869180454, + "flos": 603640178688.0, + "grad_norm": 0.03300286270545162, + "language_loss": 0.86582744, + "learning_rate": 0.0001974556556443734, + "loss": 0.87626332, + "num_input_tokens_seen": 308501888, + "router_z_loss_mlp": 0.40307617, + "step": 3720, + "time_per_iteration": 2.725634813308716 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047164, + "balance_loss_mlp": 1.0068953, + "epoch": 0.7158522508657176, + "flos": 532770410496.0, + "grad_norm": 0.029643200911988788, + "language_loss": 0.89179665, + "learning_rate": 0.00019720767746402547, + "loss": 0.90226829, + "num_input_tokens_seen": 308576368, + "router_z_loss_mlp": 0.40258789, + "step": 3721, + "time_per_iteration": 2.727351188659668 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061105, + "balance_loss_mlp": 1.02069271, + "epoch": 0.7160446325509812, + "flos": 558646926336.0, + "grad_norm": 0.03644218382348141, + "language_loss": 0.80571723, + "learning_rate": 0.00019695981683808222, + "loss": 0.81632823, + "num_input_tokens_seen": 308651936, + "router_z_loss_mlp": 0.40405273, + "step": 3722, + "time_per_iteration": 2.7068886756896973 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01056867, + "balance_loss_mlp": 1.01662219, + "epoch": 0.7162370142362448, + "flos": 692283280128.0, + "grad_norm": 0.03246359808294338, + "language_loss": 0.85348076, + "learning_rate": 0.00019671207386277225, + "loss": 0.86404943, + "num_input_tokens_seen": 308737264, + "router_z_loss_mlp": 0.40234375, + "step": 3723, + "time_per_iteration": 2.9236690998077393 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046407, + "balance_loss_mlp": 1.00611401, + "epoch": 0.7164293959215082, + "flos": 795459809280.0, + "grad_norm": 0.035040971125857495, + "language_loss": 0.78785622, + "learning_rate": 0.0001964644486342777, + "loss": 0.79832029, + "num_input_tokens_seen": 308811776, + "router_z_loss_mlp": 0.40283203, + "step": 3724, + "time_per_iteration": 2.9631621837615967 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045406, + "balance_loss_mlp": 1.00506568, + "epoch": 0.7166217776067718, + "flos": 495205721088.0, + "grad_norm": 0.03180638125163834, + "language_loss": 0.86850977, + "learning_rate": 0.00019621694124873524, + "loss": 0.87896389, + "num_input_tokens_seen": 308886704, + "router_z_loss_mlp": 0.40332031, + "step": 3725, + "time_per_iteration": 2.6598877906799316 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049065, + "balance_loss_mlp": 1.00958252, + "epoch": 0.7168141592920354, + "flos": 1403964220416.0, + "grad_norm": 0.007874165171020433, + "language_loss": 0.76540077, + "learning_rate": 0.00019596955180223557, + "loss": 0.77589142, + "num_input_tokens_seen": 309113456, + "router_z_loss_mlp": 0.39453125, + "step": 3726, + "time_per_iteration": 4.864764451980591 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049401, + "balance_loss_mlp": 1.00922716, + "epoch": 0.717006540977299, + "flos": 794600497152.0, + "grad_norm": 0.03337333426789978, + "language_loss": 0.77893984, + "learning_rate": 0.00019572228039082428, + "loss": 0.78943384, + "num_input_tokens_seen": 309198768, + "router_z_loss_mlp": 0.40161133, + "step": 3727, + "time_per_iteration": 3.107271432876587 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050786, + "balance_loss_mlp": 1.01066017, + "epoch": 0.7171989226625626, + "flos": 555964010496.0, + "grad_norm": 0.028215345270395674, + "language_loss": 0.84187287, + "learning_rate": 0.0001954751271105002, + "loss": 0.85238069, + "num_input_tokens_seen": 309279680, + "router_z_loss_mlp": 0.40112305, + "step": 3728, + "time_per_iteration": 2.8074874877929688 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049671, + "balance_loss_mlp": 1.00940251, + "epoch": 0.717391304347826, + "flos": 557062450176.0, + "grad_norm": 0.03474148956732634, + "language_loss": 0.81498766, + "learning_rate": 0.00019522809205721687, + "loss": 0.8254844, + "num_input_tokens_seen": 309359152, + "router_z_loss_mlp": 0.40258789, + "step": 3729, + "time_per_iteration": 2.736825704574585 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048977, + "balance_loss_mlp": 1.00885069, + "epoch": 0.7175836860330896, + "flos": 539955955200.0, + "grad_norm": 0.033940302209900526, + "language_loss": 0.83540523, + "learning_rate": 0.0001949811753268816, + "loss": 0.84589505, + "num_input_tokens_seen": 309432800, + "router_z_loss_mlp": 0.40112305, + "step": 3730, + "time_per_iteration": 2.732431173324585 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047309, + "balance_loss_mlp": 1.00720656, + "epoch": 0.7177760677183532, + "flos": 516651539712.0, + "grad_norm": 0.04023515024908783, + "language_loss": 0.83238113, + "learning_rate": 0.00019473437701535634, + "loss": 0.8428542, + "num_input_tokens_seen": 309499456, + "router_z_loss_mlp": 0.40087891, + "step": 3731, + "time_per_iteration": 2.608720064163208 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044599, + "balance_loss_mlp": 1.00444937, + "epoch": 0.7179684494036168, + "flos": 675940833024.0, + "grad_norm": 0.03223034722468918, + "language_loss": 0.90125024, + "learning_rate": 0.00019448769721845677, + "loss": 0.9116962, + "num_input_tokens_seen": 309571056, + "router_z_loss_mlp": 0.40136719, + "step": 3732, + "time_per_iteration": 2.8010287284851074 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043647, + "balance_loss_mlp": 1.00342619, + "epoch": 0.7181608310888803, + "flos": 470876742912.0, + "grad_norm": 0.03459418465075036, + "language_loss": 0.86262, + "learning_rate": 0.00019424113603195203, + "loss": 0.87305647, + "num_input_tokens_seen": 309635040, + "router_z_loss_mlp": 0.40209961, + "step": 3733, + "time_per_iteration": 2.5431971549987793 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044053, + "balance_loss_mlp": 1.0037843, + "epoch": 0.7183532127741439, + "flos": 595185107712.0, + "grad_norm": 0.037144823365086815, + "language_loss": 0.8025893, + "learning_rate": 0.0001939946935515657, + "loss": 0.81302989, + "num_input_tokens_seen": 309713696, + "router_z_loss_mlp": 0.40258789, + "step": 3734, + "time_per_iteration": 2.8843894004821777 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045582, + "balance_loss_mlp": 1.00533688, + "epoch": 0.7185455944594075, + "flos": 499916375040.0, + "grad_norm": 0.03883855208122221, + "language_loss": 0.8098954, + "learning_rate": 0.0001937483698729755, + "loss": 0.82035124, + "num_input_tokens_seen": 309782864, + "router_z_loss_mlp": 0.40234375, + "step": 3735, + "time_per_iteration": 2.6381587982177734 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042706, + "balance_loss_mlp": 1.00243688, + "epoch": 0.718737976144671, + "flos": 816308775936.0, + "grad_norm": 0.032230667359085925, + "language_loss": 0.82948256, + "learning_rate": 0.0001935021650918128, + "loss": 0.83990961, + "num_input_tokens_seen": 309867056, + "router_z_loss_mlp": 0.40258789, + "step": 3736, + "time_per_iteration": 3.0015594959259033 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043327, + "balance_loss_mlp": 1.00332057, + "epoch": 0.7189303578299346, + "flos": 439240710912.0, + "grad_norm": 0.03694442625738843, + "language_loss": 0.87466842, + "learning_rate": 0.0001932560793036625, + "loss": 0.88510168, + "num_input_tokens_seen": 309929744, + "router_z_loss_mlp": 0.39990234, + "step": 3737, + "time_per_iteration": 2.522517204284668 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043371, + "balance_loss_mlp": 1.00341213, + "epoch": 0.7191227395151981, + "flos": 550447512576.0, + "grad_norm": 0.0396546540063306, + "language_loss": 0.86941743, + "learning_rate": 0.00019301011260406382, + "loss": 0.87985116, + "num_input_tokens_seen": 309998128, + "router_z_loss_mlp": 0.39941406, + "step": 3738, + "time_per_iteration": 2.6374080181121826 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046219, + "balance_loss_mlp": 1.00616467, + "epoch": 0.7193151212004617, + "flos": 628081917696.0, + "grad_norm": 0.032473190286521646, + "language_loss": 0.80187446, + "learning_rate": 0.00019276426508850936, + "loss": 0.81233668, + "num_input_tokens_seen": 310065472, + "router_z_loss_mlp": 0.40039062, + "step": 3739, + "time_per_iteration": 2.7331862449645996 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046281, + "balance_loss_mlp": 1.00620306, + "epoch": 0.7195075028857253, + "flos": 742441142016.0, + "grad_norm": 0.03365291152671841, + "language_loss": 0.80674922, + "learning_rate": 0.00019251853685244564, + "loss": 0.8172121, + "num_input_tokens_seen": 310152960, + "router_z_loss_mlp": 0.40063477, + "step": 3740, + "time_per_iteration": 3.040309429168701 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044377, + "balance_loss_mlp": 1.00410771, + "epoch": 0.7196998845709889, + "flos": 804291068160.0, + "grad_norm": 0.03612611127551407, + "language_loss": 0.81356812, + "learning_rate": 0.00019227292799127283, + "loss": 0.82401186, + "num_input_tokens_seen": 310234080, + "router_z_loss_mlp": 0.40258789, + "step": 3741, + "time_per_iteration": 3.0432052612304688 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044387, + "balance_loss_mlp": 1.00416589, + "epoch": 0.7198922662562524, + "flos": 926777774592.0, + "grad_norm": 0.036362359760093145, + "language_loss": 0.79752231, + "learning_rate": 0.00019202743860034454, + "loss": 0.80796617, + "num_input_tokens_seen": 310330208, + "router_z_loss_mlp": 0.40209961, + "step": 3742, + "time_per_iteration": 3.223635196685791 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049155, + "balance_loss_mlp": 1.0088625, + "epoch": 0.7200846479415159, + "flos": 581208681216.0, + "grad_norm": 0.0348094997574978, + "language_loss": 0.84359837, + "learning_rate": 0.00019178206877496873, + "loss": 0.85408992, + "num_input_tokens_seen": 310402960, + "router_z_loss_mlp": 0.40283203, + "step": 3743, + "time_per_iteration": 2.6937367916107178 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045555, + "balance_loss_mlp": 1.0053103, + "epoch": 0.7202770296267795, + "flos": 558840367104.0, + "grad_norm": 0.028995122197605715, + "language_loss": 0.85587943, + "learning_rate": 0.0001915368186104059, + "loss": 0.86633497, + "num_input_tokens_seen": 310479776, + "router_z_loss_mlp": 0.40234375, + "step": 3744, + "time_per_iteration": 2.737929582595825 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047401, + "balance_loss_mlp": 1.00722742, + "epoch": 0.7204694113120431, + "flos": 673772143872.0, + "grad_norm": 0.03601847406415609, + "language_loss": 0.81636101, + "learning_rate": 0.0001912916882018706, + "loss": 0.82683504, + "num_input_tokens_seen": 310555952, + "router_z_loss_mlp": 0.40161133, + "step": 3745, + "time_per_iteration": 2.8627820014953613 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010475, + "balance_loss_mlp": 1.00727844, + "epoch": 0.7206617929973067, + "flos": 800596228608.0, + "grad_norm": 0.04088395220221656, + "language_loss": 0.80132556, + "learning_rate": 0.00019104667764453125, + "loss": 0.8118006, + "num_input_tokens_seen": 310634784, + "router_z_loss_mlp": 0.40209961, + "step": 3746, + "time_per_iteration": 3.0283303260803223 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050376, + "balance_loss_mlp": 1.01020253, + "epoch": 0.7208541746825702, + "flos": 532939551744.0, + "grad_norm": 0.030159350032508997, + "language_loss": 0.80461586, + "learning_rate": 0.00019080178703350926, + "loss": 0.81511962, + "num_input_tokens_seen": 310703216, + "router_z_loss_mlp": 0.40161133, + "step": 3747, + "time_per_iteration": 2.6268179416656494 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049649, + "balance_loss_mlp": 1.00945175, + "epoch": 0.7210465563678338, + "flos": 536169742080.0, + "grad_norm": 0.034039887094515435, + "language_loss": 0.83305407, + "learning_rate": 0.00019055701646387952, + "loss": 0.84355056, + "num_input_tokens_seen": 310776816, + "router_z_loss_mlp": 0.40185547, + "step": 3748, + "time_per_iteration": 2.642871618270874 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050407, + "balance_loss_mlp": 1.0114975, + "epoch": 0.7212389380530974, + "flos": 1537249652736.0, + "grad_norm": 0.008513050614024542, + "language_loss": 0.80472684, + "learning_rate": 0.00019031236603067042, + "loss": 0.81523097, + "num_input_tokens_seen": 310987056, + "router_z_loss_mlp": 0.38867188, + "step": 3749, + "time_per_iteration": 4.767102003097534 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046475, + "balance_loss_mlp": 1.00627732, + "epoch": 0.7214313197383609, + "flos": 462453752832.0, + "grad_norm": 0.03442724668025846, + "language_loss": 0.86840045, + "learning_rate": 0.00019006783582886368, + "loss": 0.87886518, + "num_input_tokens_seen": 311051648, + "router_z_loss_mlp": 0.40185547, + "step": 3750, + "time_per_iteration": 2.5307884216308594 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044143, + "balance_loss_mlp": 1.00384998, + "epoch": 0.7216237014236244, + "flos": 1038913874688.0, + "grad_norm": 0.03633272884659257, + "language_loss": 0.83278096, + "learning_rate": 0.00018982342595339437, + "loss": 0.84322238, + "num_input_tokens_seen": 311146272, + "router_z_loss_mlp": 0.40283203, + "step": 3751, + "time_per_iteration": 3.5147032737731934 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044645, + "balance_loss_mlp": 1.00437641, + "epoch": 0.721816083108888, + "flos": 897451382784.0, + "grad_norm": 0.033868816355573705, + "language_loss": 0.82631296, + "learning_rate": 0.00018957913649915076, + "loss": 0.83675945, + "num_input_tokens_seen": 311223760, + "router_z_loss_mlp": 0.40258789, + "step": 3752, + "time_per_iteration": 3.1239399909973145 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044756, + "balance_loss_mlp": 1.00446343, + "epoch": 0.7220084647941516, + "flos": 524312427264.0, + "grad_norm": 0.03748349952969219, + "language_loss": 0.80553722, + "learning_rate": 0.00018933496756097428, + "loss": 0.81598485, + "num_input_tokens_seen": 311290336, + "router_z_loss_mlp": 0.40283203, + "step": 3753, + "time_per_iteration": 2.6250908374786377 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045195, + "balance_loss_mlp": 1.00487828, + "epoch": 0.7222008464794152, + "flos": 817472344320.0, + "grad_norm": 0.035953196977106826, + "language_loss": 0.82196552, + "learning_rate": 0.0001890909192336603, + "loss": 0.83241749, + "num_input_tokens_seen": 311366240, + "router_z_loss_mlp": 0.40307617, + "step": 3754, + "time_per_iteration": 3.015929698944092 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104444, + "balance_loss_mlp": 1.00417137, + "epoch": 0.7223932281646788, + "flos": 750373238016.0, + "grad_norm": 0.03340807501662783, + "language_loss": 0.70701879, + "learning_rate": 0.00018884699161195623, + "loss": 0.7174632, + "num_input_tokens_seen": 311445184, + "router_z_loss_mlp": 0.40258789, + "step": 3755, + "time_per_iteration": 2.934309959411621 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043188, + "balance_loss_mlp": 1.00279963, + "epoch": 0.7225856098499422, + "flos": 746989457664.0, + "grad_norm": 0.03539660333033103, + "language_loss": 0.77625644, + "learning_rate": 0.00018860318479056327, + "loss": 0.78668833, + "num_input_tokens_seen": 311527280, + "router_z_loss_mlp": 0.40380859, + "step": 3756, + "time_per_iteration": 3.092843770980835 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045659, + "balance_loss_mlp": 1.00541389, + "epoch": 0.7227779915352058, + "flos": 548435325696.0, + "grad_norm": 0.03162886339795087, + "language_loss": 0.84069121, + "learning_rate": 0.00018835949886413555, + "loss": 0.85114777, + "num_input_tokens_seen": 311601552, + "router_z_loss_mlp": 0.40234375, + "step": 3757, + "time_per_iteration": 2.697178602218628 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047884, + "balance_loss_mlp": 1.00756705, + "epoch": 0.7229703732204694, + "flos": 531506720256.0, + "grad_norm": 0.03673346832571833, + "language_loss": 0.78688115, + "learning_rate": 0.0001881159339272806, + "loss": 0.79735994, + "num_input_tokens_seen": 311670736, + "router_z_loss_mlp": 0.40307617, + "step": 3758, + "time_per_iteration": 2.672168731689453 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046216, + "balance_loss_mlp": 1.00597107, + "epoch": 0.723162754905733, + "flos": 529366221312.0, + "grad_norm": 0.03397833212706175, + "language_loss": 0.79266065, + "learning_rate": 0.00018787249007455858, + "loss": 0.80312276, + "num_input_tokens_seen": 311736800, + "router_z_loss_mlp": 0.40234375, + "step": 3759, + "time_per_iteration": 2.587975025177002 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046442, + "balance_loss_mlp": 1.00629199, + "epoch": 0.7233551365909965, + "flos": 656060048640.0, + "grad_norm": 0.03524788149604232, + "language_loss": 0.71597099, + "learning_rate": 0.00018762916740048302, + "loss": 0.72643542, + "num_input_tokens_seen": 311806064, + "router_z_loss_mlp": 0.40136719, + "step": 3760, + "time_per_iteration": 2.7926323413848877 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047118, + "balance_loss_mlp": 1.00701594, + "epoch": 0.7235475182762601, + "flos": 523444366848.0, + "grad_norm": 0.0316872797389574, + "language_loss": 0.86490506, + "learning_rate": 0.0001873859659995195, + "loss": 0.87537622, + "num_input_tokens_seen": 311881280, + "router_z_loss_mlp": 0.40087891, + "step": 3761, + "time_per_iteration": 2.7313694953918457 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047248, + "balance_loss_mlp": 1.00721729, + "epoch": 0.7237398999615237, + "flos": 610322190336.0, + "grad_norm": 0.03701947835091587, + "language_loss": 0.84027237, + "learning_rate": 0.0001871428859660878, + "loss": 0.85074484, + "num_input_tokens_seen": 311953696, + "router_z_loss_mlp": 0.40014648, + "step": 3762, + "time_per_iteration": 2.724437952041626 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047082, + "balance_loss_mlp": 1.00707483, + "epoch": 0.7239322816467872, + "flos": 660282720768.0, + "grad_norm": 0.032017946801170455, + "language_loss": 0.82444721, + "learning_rate": 0.00018689992739455975, + "loss": 0.83491802, + "num_input_tokens_seen": 312032752, + "router_z_loss_mlp": 0.39990234, + "step": 3763, + "time_per_iteration": 2.8985331058502197 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045695, + "balance_loss_mlp": 1.00566471, + "epoch": 0.7241246633320508, + "flos": 970941850368.0, + "grad_norm": 0.0325077929756691, + "language_loss": 0.8663789, + "learning_rate": 0.00018665709037926027, + "loss": 0.87683582, + "num_input_tokens_seen": 312120800, + "router_z_loss_mlp": 0.40014648, + "step": 3764, + "time_per_iteration": 3.3307945728302 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043215, + "balance_loss_mlp": 1.00323248, + "epoch": 0.7243170450173143, + "flos": 516000252672.0, + "grad_norm": 0.037062443743513, + "language_loss": 0.85301733, + "learning_rate": 0.00018641437501446694, + "loss": 0.86344957, + "num_input_tokens_seen": 312188416, + "router_z_loss_mlp": 0.3996582, + "step": 3765, + "time_per_iteration": 2.57521915435791 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01041755, + "balance_loss_mlp": 1.00170028, + "epoch": 0.7245094267025779, + "flos": 560806867200.0, + "grad_norm": 0.03616258332607596, + "language_loss": 0.82752323, + "learning_rate": 0.0001861717813944104, + "loss": 0.83794075, + "num_input_tokens_seen": 312257792, + "router_z_loss_mlp": 0.40039062, + "step": 3766, + "time_per_iteration": 2.6512858867645264 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042886, + "balance_loss_mlp": 1.00287974, + "epoch": 0.7247018083878415, + "flos": 613775956992.0, + "grad_norm": 0.03625673893536532, + "language_loss": 0.79743433, + "learning_rate": 0.00018592930961327365, + "loss": 0.80786318, + "num_input_tokens_seen": 312328544, + "router_z_loss_mlp": 0.39990234, + "step": 3767, + "time_per_iteration": 2.704402208328247 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045962, + "balance_loss_mlp": 1.00588417, + "epoch": 0.7248941900731051, + "flos": 635871117312.0, + "grad_norm": 0.03196657989519071, + "language_loss": 0.88960397, + "learning_rate": 0.00018568695976519273, + "loss": 0.90006363, + "num_input_tokens_seen": 312405888, + "router_z_loss_mlp": 0.40063477, + "step": 3768, + "time_per_iteration": 2.764528751373291 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046055, + "balance_loss_mlp": 1.0059768, + "epoch": 0.7250865717583687, + "flos": 425837803776.0, + "grad_norm": 0.0390622861553884, + "language_loss": 0.80584097, + "learning_rate": 0.00018544473194425593, + "loss": 0.81630147, + "num_input_tokens_seen": 312469552, + "router_z_loss_mlp": 0.40063477, + "step": 3769, + "time_per_iteration": 2.4841666221618652 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043424, + "balance_loss_mlp": 1.00329816, + "epoch": 0.7252789534436321, + "flos": 636398949888.0, + "grad_norm": 0.04244308423853245, + "language_loss": 0.79393184, + "learning_rate": 0.00018520262624450485, + "loss": 0.80436611, + "num_input_tokens_seen": 312548848, + "router_z_loss_mlp": 0.40112305, + "step": 3770, + "time_per_iteration": 2.8432021141052246 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046388, + "balance_loss_mlp": 1.00638103, + "epoch": 0.7254713351288957, + "flos": 618354408192.0, + "grad_norm": 0.03205335937009439, + "language_loss": 0.87801862, + "learning_rate": 0.00018496064275993324, + "loss": 0.88848257, + "num_input_tokens_seen": 312622016, + "router_z_loss_mlp": 0.39990234, + "step": 3771, + "time_per_iteration": 2.753740072250366 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046271, + "balance_loss_mlp": 1.00612164, + "epoch": 0.7256637168141593, + "flos": 768291412992.0, + "grad_norm": 0.038084131410306525, + "language_loss": 0.82372004, + "learning_rate": 0.00018471878158448686, + "loss": 0.83418274, + "num_input_tokens_seen": 312696960, + "router_z_loss_mlp": 0.40136719, + "step": 3772, + "time_per_iteration": 2.917302370071411 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048364, + "balance_loss_mlp": 1.0082382, + "epoch": 0.7258560984994229, + "flos": 496727980800.0, + "grad_norm": 0.02992069132066452, + "language_loss": 0.84553695, + "learning_rate": 0.00018447704281206512, + "loss": 0.85602057, + "num_input_tokens_seen": 312774352, + "router_z_loss_mlp": 0.40112305, + "step": 3773, + "time_per_iteration": 2.843857765197754 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048932, + "balance_loss_mlp": 1.00878251, + "epoch": 0.7260484801846864, + "flos": 531142192896.0, + "grad_norm": 0.03465658020099934, + "language_loss": 0.83523774, + "learning_rate": 0.0001842354265365191, + "loss": 0.84572709, + "num_input_tokens_seen": 312849600, + "router_z_loss_mlp": 0.40136719, + "step": 3774, + "time_per_iteration": 2.6899774074554443 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046049, + "balance_loss_mlp": 1.00592351, + "epoch": 0.72624086186995, + "flos": 626108614656.0, + "grad_norm": 0.036794080035960464, + "language_loss": 0.81133199, + "learning_rate": 0.0001839939328516526, + "loss": 0.82179248, + "num_input_tokens_seen": 312922688, + "router_z_loss_mlp": 0.40112305, + "step": 3775, + "time_per_iteration": 2.75508451461792 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104583, + "balance_loss_mlp": 1.0056808, + "epoch": 0.7264332435552135, + "flos": 717805962240.0, + "grad_norm": 0.03611168561837021, + "language_loss": 0.82141531, + "learning_rate": 0.0001837525618512218, + "loss": 0.83187354, + "num_input_tokens_seen": 312997728, + "router_z_loss_mlp": 0.40136719, + "step": 3776, + "time_per_iteration": 2.8697876930236816 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047141, + "balance_loss_mlp": 1.0069201, + "epoch": 0.7266256252404771, + "flos": 682242766080.0, + "grad_norm": 0.036803325831150785, + "language_loss": 0.83319986, + "learning_rate": 0.00018351131362893519, + "loss": 0.84367126, + "num_input_tokens_seen": 313067168, + "router_z_loss_mlp": 0.40209961, + "step": 3777, + "time_per_iteration": 2.7980828285217285 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046144, + "balance_loss_mlp": 1.00580347, + "epoch": 0.7268180069257407, + "flos": 519918668544.0, + "grad_norm": 0.038913474879357805, + "language_loss": 0.81077832, + "learning_rate": 0.00018327018827845364, + "loss": 0.82123971, + "num_input_tokens_seen": 313134688, + "router_z_loss_mlp": 0.40332031, + "step": 3778, + "time_per_iteration": 2.610944986343384 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045641, + "balance_loss_mlp": 1.00537205, + "epoch": 0.7270103886110042, + "flos": 513673115904.0, + "grad_norm": 0.03821848161600015, + "language_loss": 0.88036418, + "learning_rate": 0.00018302918589339036, + "loss": 0.89082056, + "num_input_tokens_seen": 313204816, + "router_z_loss_mlp": 0.40258789, + "step": 3779, + "time_per_iteration": 2.6776628494262695 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044448, + "balance_loss_mlp": 1.00413156, + "epoch": 0.7272027702962678, + "flos": 547692665088.0, + "grad_norm": 0.03543573147287282, + "language_loss": 0.90566671, + "learning_rate": 0.00018278830656731054, + "loss": 0.91611117, + "num_input_tokens_seen": 313274288, + "router_z_loss_mlp": 0.40307617, + "step": 3780, + "time_per_iteration": 2.6612467765808105 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043906, + "balance_loss_mlp": 1.003613, + "epoch": 0.7273951519815314, + "flos": 594155687424.0, + "grad_norm": 0.02879348395383923, + "language_loss": 0.86881804, + "learning_rate": 0.00018254755039373222, + "loss": 0.87925708, + "num_input_tokens_seen": 313344800, + "router_z_loss_mlp": 0.40283203, + "step": 3781, + "time_per_iteration": 2.724158763885498 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045168, + "balance_loss_mlp": 1.00482738, + "epoch": 0.727587533666795, + "flos": 607139632128.0, + "grad_norm": 0.03859798712496429, + "language_loss": 0.84525704, + "learning_rate": 0.0001823069174661252, + "loss": 0.85570872, + "num_input_tokens_seen": 313417840, + "router_z_loss_mlp": 0.40332031, + "step": 3782, + "time_per_iteration": 2.7668051719665527 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044246, + "balance_loss_mlp": 1.00395322, + "epoch": 0.7277799153520584, + "flos": 514026949632.0, + "grad_norm": 0.03650439895450689, + "language_loss": 0.78873003, + "learning_rate": 0.00018206640787791112, + "loss": 0.79917252, + "num_input_tokens_seen": 313485936, + "router_z_loss_mlp": 0.40283203, + "step": 3783, + "time_per_iteration": 2.649040699005127 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042952, + "balance_loss_mlp": 1.00268304, + "epoch": 0.727972297037322, + "flos": 538794332160.0, + "grad_norm": 0.03501392489574684, + "language_loss": 0.86669183, + "learning_rate": 0.00018182602172246416, + "loss": 0.87712133, + "num_input_tokens_seen": 313553136, + "router_z_loss_mlp": 0.40258789, + "step": 3784, + "time_per_iteration": 2.603267192840576 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045035, + "balance_loss_mlp": 1.00474274, + "epoch": 0.7281646787225856, + "flos": 536076423168.0, + "grad_norm": 0.037923852732183974, + "language_loss": 0.77186882, + "learning_rate": 0.00018158575909311075, + "loss": 0.78231919, + "num_input_tokens_seen": 313620128, + "router_z_loss_mlp": 0.40283203, + "step": 3785, + "time_per_iteration": 2.6864423751831055 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045231, + "balance_loss_mlp": 1.00489032, + "epoch": 0.7283570604078492, + "flos": 626210681856.0, + "grad_norm": 0.0363846490797151, + "language_loss": 0.80090117, + "learning_rate": 0.000181345620083129, + "loss": 0.81135345, + "num_input_tokens_seen": 313696432, + "router_z_loss_mlp": 0.40332031, + "step": 3786, + "time_per_iteration": 2.7992641925811768 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045844, + "balance_loss_mlp": 1.00548017, + "epoch": 0.7285494420931128, + "flos": 535255994880.0, + "grad_norm": 0.04682580138791378, + "language_loss": 0.86931181, + "learning_rate": 0.00018110560478574927, + "loss": 0.87977034, + "num_input_tokens_seen": 313768416, + "router_z_loss_mlp": 0.40356445, + "step": 3787, + "time_per_iteration": 2.680211305618286 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043183, + "balance_loss_mlp": 1.00277114, + "epoch": 0.7287418237783763, + "flos": 667741419264.0, + "grad_norm": 0.04795946543380901, + "language_loss": 0.80688787, + "learning_rate": 0.0001808657132941533, + "loss": 0.81731963, + "num_input_tokens_seen": 313839888, + "router_z_loss_mlp": 0.40405273, + "step": 3788, + "time_per_iteration": 2.793989658355713 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104313, + "balance_loss_mlp": 1.00286126, + "epoch": 0.7289342054636399, + "flos": 551639271168.0, + "grad_norm": 0.04788875018667363, + "language_loss": 0.83400464, + "learning_rate": 0.00018062594570147572, + "loss": 0.84443599, + "num_input_tokens_seen": 313908832, + "router_z_loss_mlp": 0.40258789, + "step": 3789, + "time_per_iteration": 2.5800626277923584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043681, + "balance_loss_mlp": 1.00331712, + "epoch": 0.7291265871489034, + "flos": 689139605760.0, + "grad_norm": 0.0306016583616733, + "language_loss": 0.86152685, + "learning_rate": 0.00018038630210080243, + "loss": 0.87196368, + "num_input_tokens_seen": 313982672, + "router_z_loss_mlp": 0.40356445, + "step": 3790, + "time_per_iteration": 2.791778326034546 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01041726, + "balance_loss_mlp": 1.00133801, + "epoch": 0.729318968834167, + "flos": 573771369984.0, + "grad_norm": 0.03320164846736232, + "language_loss": 0.8504535, + "learning_rate": 0.0001801467825851712, + "loss": 0.86087084, + "num_input_tokens_seen": 314057184, + "router_z_loss_mlp": 0.40380859, + "step": 3791, + "time_per_iteration": 2.7736573219299316 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043265, + "balance_loss_mlp": 1.00278151, + "epoch": 0.7295113505194305, + "flos": 587165528832.0, + "grad_norm": 0.039500127545913186, + "language_loss": 0.79190361, + "learning_rate": 0.00017990738724757172, + "loss": 0.80233628, + "num_input_tokens_seen": 314137344, + "router_z_loss_mlp": 0.40478516, + "step": 3792, + "time_per_iteration": 2.8482463359832764 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043868, + "balance_loss_mlp": 1.00345612, + "epoch": 0.7297037322046941, + "flos": 708442980096.0, + "grad_norm": 0.03263259511522569, + "language_loss": 0.82787073, + "learning_rate": 0.00017966811618094598, + "loss": 0.83830941, + "num_input_tokens_seen": 314214464, + "router_z_loss_mlp": 0.40405273, + "step": 3793, + "time_per_iteration": 2.889319658279419 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044424, + "balance_loss_mlp": 1.0039407, + "epoch": 0.7298961138899577, + "flos": 488308881408.0, + "grad_norm": 0.03689917900491825, + "language_loss": 0.85408473, + "learning_rate": 0.00017942896947818664, + "loss": 0.86452901, + "num_input_tokens_seen": 314280432, + "router_z_loss_mlp": 0.40478516, + "step": 3794, + "time_per_iteration": 2.550274133682251 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043034, + "balance_loss_mlp": 1.00383759, + "epoch": 0.7300884955752213, + "flos": 1368624600576.0, + "grad_norm": 0.005828351386569188, + "language_loss": 0.74825054, + "learning_rate": 0.000179189947232139, + "loss": 0.75868088, + "num_input_tokens_seen": 314497152, + "router_z_loss_mlp": 0.39160156, + "step": 3795, + "time_per_iteration": 4.89626932144165 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042163, + "balance_loss_mlp": 1.00179839, + "epoch": 0.7302808772604849, + "flos": 532837484544.0, + "grad_norm": 0.04171921070399138, + "language_loss": 0.85686743, + "learning_rate": 0.00017895104953559947, + "loss": 0.86728907, + "num_input_tokens_seen": 314565488, + "router_z_loss_mlp": 0.40356445, + "step": 3796, + "time_per_iteration": 2.57736873626709 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042422, + "balance_loss_mlp": 1.00203419, + "epoch": 0.7304732589457483, + "flos": 437063273472.0, + "grad_norm": 0.04046264333697194, + "language_loss": 0.90178061, + "learning_rate": 0.00017871227648131672, + "loss": 0.91220486, + "num_input_tokens_seen": 314627392, + "router_z_loss_mlp": 0.40380859, + "step": 3797, + "time_per_iteration": 2.474209785461426 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104528, + "balance_loss_mlp": 1.00489223, + "epoch": 0.7306656406310119, + "flos": 452604734208.0, + "grad_norm": 0.029697022991301388, + "language_loss": 0.82934296, + "learning_rate": 0.0001784736281619907, + "loss": 0.83979571, + "num_input_tokens_seen": 314695440, + "router_z_loss_mlp": 0.40380859, + "step": 3798, + "time_per_iteration": 2.5923726558685303 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044562, + "balance_loss_mlp": 1.00407827, + "epoch": 0.7308580223162755, + "flos": 513030577152.0, + "grad_norm": 0.032710497654363443, + "language_loss": 0.75410861, + "learning_rate": 0.00017823510467027232, + "loss": 0.7645542, + "num_input_tokens_seen": 314772592, + "router_z_loss_mlp": 0.40478516, + "step": 3799, + "time_per_iteration": 2.7622478008270264 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045673, + "balance_loss_mlp": 1.00521374, + "epoch": 0.7310504040015391, + "flos": 376283596800.0, + "grad_norm": 0.039904062723008, + "language_loss": 0.79136682, + "learning_rate": 0.00017799670609876516, + "loss": 0.80182356, + "num_input_tokens_seen": 314836192, + "router_z_loss_mlp": 0.40454102, + "step": 3800, + "time_per_iteration": 2.493797540664673 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042729, + "balance_loss_mlp": 1.00222194, + "epoch": 0.7312427856868026, + "flos": 550382383872.0, + "grad_norm": 0.0325229913216085, + "language_loss": 0.89329851, + "learning_rate": 0.00017775843254002366, + "loss": 0.90372574, + "num_input_tokens_seen": 314908400, + "router_z_loss_mlp": 0.4050293, + "step": 3801, + "time_per_iteration": 2.7277941703796387 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047641, + "balance_loss_mlp": 1.00727654, + "epoch": 0.7314351673720662, + "flos": 768678294528.0, + "grad_norm": 0.03330924575668911, + "language_loss": 0.84167385, + "learning_rate": 0.00017752028408655367, + "loss": 0.8521502, + "num_input_tokens_seen": 314995280, + "router_z_loss_mlp": 0.40356445, + "step": 3802, + "time_per_iteration": 3.040632486343384 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104856, + "balance_loss_mlp": 1.00824392, + "epoch": 0.7316275490573297, + "flos": 487705226496.0, + "grad_norm": 0.03826862590336393, + "language_loss": 0.8564449, + "learning_rate": 0.00017728226083081272, + "loss": 0.86693048, + "num_input_tokens_seen": 315063056, + "router_z_loss_mlp": 0.40307617, + "step": 3803, + "time_per_iteration": 2.5550501346588135 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048554, + "balance_loss_mlp": 1.00833249, + "epoch": 0.7318199307425933, + "flos": 474413134848.0, + "grad_norm": 0.03815942500131441, + "language_loss": 0.82039976, + "learning_rate": 0.00017704436286520965, + "loss": 0.83088529, + "num_input_tokens_seen": 315128896, + "router_z_loss_mlp": 0.40209961, + "step": 3804, + "time_per_iteration": 2.58294677734375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048179, + "balance_loss_mlp": 1.00793362, + "epoch": 0.7320123124278569, + "flos": 550512641280.0, + "grad_norm": 0.03634721787215332, + "language_loss": 0.8514055, + "learning_rate": 0.0001768065902821046, + "loss": 0.86188722, + "num_input_tokens_seen": 315198464, + "router_z_loss_mlp": 0.40234375, + "step": 3805, + "time_per_iteration": 2.6493990421295166 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046919, + "balance_loss_mlp": 1.00665021, + "epoch": 0.7322046941131204, + "flos": 571900134144.0, + "grad_norm": 0.03447588355898286, + "language_loss": 0.82488358, + "learning_rate": 0.00017656894317380907, + "loss": 0.83535278, + "num_input_tokens_seen": 315270240, + "router_z_loss_mlp": 0.40258789, + "step": 3806, + "time_per_iteration": 2.7446413040161133 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043594, + "balance_loss_mlp": 1.00468445, + "epoch": 0.732397075798384, + "flos": 1472503928064.0, + "grad_norm": 0.008037479366224719, + "language_loss": 0.76031268, + "learning_rate": 0.00017633142163258565, + "loss": 0.77074862, + "num_input_tokens_seen": 315502448, + "router_z_loss_mlp": 0.38867188, + "step": 3807, + "time_per_iteration": 5.015838623046875 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044453, + "balance_loss_mlp": 1.00413609, + "epoch": 0.7325894574836476, + "flos": 465831697152.0, + "grad_norm": 0.038585998350043275, + "language_loss": 0.84359336, + "learning_rate": 0.00017609402575064875, + "loss": 0.85403788, + "num_input_tokens_seen": 315569472, + "router_z_loss_mlp": 0.40307617, + "step": 3808, + "time_per_iteration": 2.5619466304779053 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044203, + "balance_loss_mlp": 1.00398183, + "epoch": 0.7327818391689112, + "flos": 496482050304.0, + "grad_norm": 0.03775450514575077, + "language_loss": 0.81649804, + "learning_rate": 0.00017585675562016367, + "loss": 0.82694006, + "num_input_tokens_seen": 315637632, + "router_z_loss_mlp": 0.40209961, + "step": 3809, + "time_per_iteration": 2.5793349742889404 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044159, + "balance_loss_mlp": 1.00524902, + "epoch": 0.7329742208541746, + "flos": 1436682162432.0, + "grad_norm": 0.007309956802170158, + "language_loss": 0.77212846, + "learning_rate": 0.0001756196113332465, + "loss": 0.78257012, + "num_input_tokens_seen": 315863648, + "router_z_loss_mlp": 0.38867188, + "step": 3810, + "time_per_iteration": 4.810467720031738 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043162, + "balance_loss_mlp": 1.00303602, + "epoch": 0.7331666025394382, + "flos": 497869195008.0, + "grad_norm": 0.0392578744691446, + "language_loss": 0.85801327, + "learning_rate": 0.00017538259298196474, + "loss": 0.86844486, + "num_input_tokens_seen": 315930752, + "router_z_loss_mlp": 0.40112305, + "step": 3811, + "time_per_iteration": 2.5858519077301025 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046746, + "balance_loss_mlp": 1.00657213, + "epoch": 0.7333589842247018, + "flos": 539639059968.0, + "grad_norm": 0.03309973691359967, + "language_loss": 0.82286286, + "learning_rate": 0.00017514570065833745, + "loss": 0.83333039, + "num_input_tokens_seen": 316006400, + "router_z_loss_mlp": 0.40161133, + "step": 3812, + "time_per_iteration": 2.693704843521118 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045338, + "balance_loss_mlp": 1.00525999, + "epoch": 0.7335513659099654, + "flos": 492042604800.0, + "grad_norm": 0.03925978819405336, + "language_loss": 0.81363267, + "learning_rate": 0.00017490893445433426, + "loss": 0.82408601, + "num_input_tokens_seen": 316075824, + "router_z_loss_mlp": 0.40063477, + "step": 3813, + "time_per_iteration": 2.608065128326416 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044158, + "balance_loss_mlp": 1.00384104, + "epoch": 0.733743747595229, + "flos": 563253567744.0, + "grad_norm": 0.033972583106890976, + "language_loss": 0.82267326, + "learning_rate": 0.00017467229446187587, + "loss": 0.83311474, + "num_input_tokens_seen": 316148336, + "router_z_loss_mlp": 0.40307617, + "step": 3814, + "time_per_iteration": 2.6955394744873047 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043196, + "balance_loss_mlp": 1.00290346, + "epoch": 0.7339361292804925, + "flos": 539649753600.0, + "grad_norm": 0.03487524168244714, + "language_loss": 0.81803584, + "learning_rate": 0.00017443578077283424, + "loss": 0.82846785, + "num_input_tokens_seen": 316220960, + "router_z_loss_mlp": 0.40283203, + "step": 3815, + "time_per_iteration": 2.6844675540924072 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047596, + "balance_loss_mlp": 1.00742269, + "epoch": 0.734128510965756, + "flos": 549561955584.0, + "grad_norm": 0.03210943726156845, + "language_loss": 0.85443103, + "learning_rate": 0.0001741993934790319, + "loss": 0.86490697, + "num_input_tokens_seen": 316295824, + "router_z_loss_mlp": 0.40161133, + "step": 3816, + "time_per_iteration": 2.754804849624634 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104727, + "balance_loss_mlp": 1.0069536, + "epoch": 0.7343208926510196, + "flos": 541202148864.0, + "grad_norm": 0.03979674876858525, + "language_loss": 0.84579813, + "learning_rate": 0.00017396313267224273, + "loss": 0.85627079, + "num_input_tokens_seen": 316368064, + "router_z_loss_mlp": 0.40307617, + "step": 3817, + "time_per_iteration": 2.7152209281921387 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046673, + "balance_loss_mlp": 1.00638032, + "epoch": 0.7345132743362832, + "flos": 572171342592.0, + "grad_norm": 0.03405657916649516, + "language_loss": 0.88968074, + "learning_rate": 0.0001737269984441912, + "loss": 0.9001475, + "num_input_tokens_seen": 316437440, + "router_z_loss_mlp": 0.40283203, + "step": 3818, + "time_per_iteration": 2.63198184967041 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049333, + "balance_loss_mlp": 1.00906372, + "epoch": 0.7347056560215467, + "flos": 546481464576.0, + "grad_norm": 0.04751068267806247, + "language_loss": 0.85475308, + "learning_rate": 0.00017349099088655263, + "loss": 0.86524642, + "num_input_tokens_seen": 316511936, + "router_z_loss_mlp": 0.40258789, + "step": 3819, + "time_per_iteration": 2.796168804168701 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046287, + "balance_loss_mlp": 1.0060178, + "epoch": 0.7348980377068103, + "flos": 597077730816.0, + "grad_norm": 0.03129969376285051, + "language_loss": 0.81227374, + "learning_rate": 0.00017325511009095375, + "loss": 0.82273662, + "num_input_tokens_seen": 316584304, + "router_z_loss_mlp": 0.40258789, + "step": 3820, + "time_per_iteration": 2.7165210247039795 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046487, + "balance_loss_mlp": 1.00621831, + "epoch": 0.7350904193920739, + "flos": 539612815104.0, + "grad_norm": 0.03503609859827407, + "language_loss": 0.84185189, + "learning_rate": 0.00017301935614897113, + "loss": 0.8523168, + "num_input_tokens_seen": 316659024, + "router_z_loss_mlp": 0.40258789, + "step": 3821, + "time_per_iteration": 2.7012970447540283 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046737, + "balance_loss_mlp": 1.00656378, + "epoch": 0.7352828010773375, + "flos": 514061942784.0, + "grad_norm": 0.02996543941139594, + "language_loss": 0.8232463, + "learning_rate": 0.00017278372915213274, + "loss": 0.83371365, + "num_input_tokens_seen": 316732544, + "router_z_loss_mlp": 0.40161133, + "step": 3822, + "time_per_iteration": 2.646228313446045 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105101, + "balance_loss_mlp": 1.01171875, + "epoch": 0.735475182762601, + "flos": 1557258749184.0, + "grad_norm": 0.004879497460224864, + "language_loss": 0.79893845, + "learning_rate": 0.00017254822919191693, + "loss": 0.80944854, + "num_input_tokens_seen": 316967104, + "router_z_loss_mlp": 0.39257812, + "step": 3823, + "time_per_iteration": 5.001528024673462 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046562, + "balance_loss_mlp": 1.00636435, + "epoch": 0.7356675644478645, + "flos": 682612151040.0, + "grad_norm": 0.0358517187113506, + "language_loss": 0.8115629, + "learning_rate": 0.00017231285635975314, + "loss": 0.82202852, + "num_input_tokens_seen": 317048304, + "router_z_loss_mlp": 0.40185547, + "step": 3824, + "time_per_iteration": 2.916127920150757 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046328, + "balance_loss_mlp": 1.00615466, + "epoch": 0.7358599461331281, + "flos": 516232577280.0, + "grad_norm": 0.05204398731861849, + "language_loss": 0.83695984, + "learning_rate": 0.00017207761074702115, + "loss": 0.8474232, + "num_input_tokens_seen": 317115968, + "router_z_loss_mlp": 0.40161133, + "step": 3825, + "time_per_iteration": 2.62750506401062 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104673, + "balance_loss_mlp": 1.00662804, + "epoch": 0.7360523278183917, + "flos": 444917601792.0, + "grad_norm": 0.03194798623104488, + "language_loss": 0.84528393, + "learning_rate": 0.0001718424924450514, + "loss": 0.85575122, + "num_input_tokens_seen": 317185680, + "router_z_loss_mlp": 0.40087891, + "step": 3826, + "time_per_iteration": 2.61261248588562 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046993, + "balance_loss_mlp": 1.00684357, + "epoch": 0.7362447095036553, + "flos": 604551980544.0, + "grad_norm": 0.028984397633237662, + "language_loss": 0.86482602, + "learning_rate": 0.00017160750154512482, + "loss": 0.875296, + "num_input_tokens_seen": 317258800, + "router_z_loss_mlp": 0.40136719, + "step": 3827, + "time_per_iteration": 2.6998865604400635 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043184, + "balance_loss_mlp": 1.00305784, + "epoch": 0.7364370911889189, + "flos": 554251222272.0, + "grad_norm": 0.040234447169501614, + "language_loss": 0.8371399, + "learning_rate": 0.0001713726381384731, + "loss": 0.84757173, + "num_input_tokens_seen": 317334608, + "router_z_loss_mlp": 0.40112305, + "step": 3828, + "time_per_iteration": 2.746196746826172 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01041809, + "balance_loss_mlp": 1.00163531, + "epoch": 0.7366294728741823, + "flos": 449990837760.0, + "grad_norm": 0.03659096604544618, + "language_loss": 0.81686258, + "learning_rate": 0.00017113790231627812, + "loss": 0.82728064, + "num_input_tokens_seen": 317397504, + "router_z_loss_mlp": 0.40161133, + "step": 3829, + "time_per_iteration": 2.5232386589050293 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043747, + "balance_loss_mlp": 1.00445557, + "epoch": 0.7368218545594459, + "flos": 1538705816832.0, + "grad_norm": 0.007725694552394297, + "language_loss": 0.79258227, + "learning_rate": 0.0001709032941696726, + "loss": 0.80301964, + "num_input_tokens_seen": 317611472, + "router_z_loss_mlp": 0.39257812, + "step": 3830, + "time_per_iteration": 4.843308448791504 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044253, + "balance_loss_mlp": 1.00405526, + "epoch": 0.7370142362447095, + "flos": 516473650176.0, + "grad_norm": 0.03681023024701871, + "language_loss": 0.82271254, + "learning_rate": 0.00017066881378973936, + "loss": 0.83315504, + "num_input_tokens_seen": 317681328, + "router_z_loss_mlp": 0.40185547, + "step": 3831, + "time_per_iteration": 2.684302806854248 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045812, + "balance_loss_mlp": 1.00578201, + "epoch": 0.7372066179299731, + "flos": 501905229312.0, + "grad_norm": 0.03287634093560934, + "language_loss": 0.83259964, + "learning_rate": 0.00017043446126751189, + "loss": 0.84305775, + "num_input_tokens_seen": 317752336, + "router_z_loss_mlp": 0.40014648, + "step": 3832, + "time_per_iteration": 2.710259199142456 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044175, + "balance_loss_mlp": 1.00409663, + "epoch": 0.7373989996152366, + "flos": 559167955968.0, + "grad_norm": 0.03638251388363948, + "language_loss": 0.76960367, + "learning_rate": 0.00017020023669397376, + "loss": 0.78004539, + "num_input_tokens_seen": 317824112, + "router_z_loss_mlp": 0.40063477, + "step": 3833, + "time_per_iteration": 2.735877752304077 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050598, + "balance_loss_mlp": 1.01054347, + "epoch": 0.7375913813005002, + "flos": 507781396992.0, + "grad_norm": 0.059668100448601574, + "language_loss": 0.82237148, + "learning_rate": 0.0001699661401600589, + "loss": 0.8328774, + "num_input_tokens_seen": 317889120, + "router_z_loss_mlp": 0.40039062, + "step": 3834, + "time_per_iteration": 2.579663038253784 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047411, + "balance_loss_mlp": 1.007357, + "epoch": 0.7377837629857638, + "flos": 487156006656.0, + "grad_norm": 0.03637906521459096, + "language_loss": 0.78828633, + "learning_rate": 0.00016973217175665205, + "loss": 0.79876041, + "num_input_tokens_seen": 317953792, + "router_z_loss_mlp": 0.40039062, + "step": 3835, + "time_per_iteration": 2.6623384952545166 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046513, + "balance_loss_mlp": 1.00731659, + "epoch": 0.7379761446710273, + "flos": 1417880375808.0, + "grad_norm": 0.007661340220520439, + "language_loss": 0.8116616, + "learning_rate": 0.00016949833157458755, + "loss": 0.82212675, + "num_input_tokens_seen": 318184848, + "router_z_loss_mlp": 0.39160156, + "step": 3836, + "time_per_iteration": 4.928514003753662 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046331, + "balance_loss_mlp": 1.00634801, + "epoch": 0.7381685263562909, + "flos": 630910642176.0, + "grad_norm": 0.035800200298820535, + "language_loss": 0.84820634, + "learning_rate": 0.00016926461970465047, + "loss": 0.85866964, + "num_input_tokens_seen": 318259296, + "router_z_loss_mlp": 0.3996582, + "step": 3837, + "time_per_iteration": 2.762173891067505 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043931, + "balance_loss_mlp": 1.00382948, + "epoch": 0.7383609080415544, + "flos": 740652531456.0, + "grad_norm": 0.029602535209274302, + "language_loss": 0.84896356, + "learning_rate": 0.00016903103623757516, + "loss": 0.85940289, + "num_input_tokens_seen": 318344704, + "router_z_loss_mlp": 0.40087891, + "step": 3838, + "time_per_iteration": 3.0506296157836914 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045591, + "balance_loss_mlp": 1.00541735, + "epoch": 0.738553289726818, + "flos": 551257247232.0, + "grad_norm": 0.038121042805401205, + "language_loss": 0.807634, + "learning_rate": 0.00016879758126404738, + "loss": 0.8180899, + "num_input_tokens_seen": 318416128, + "router_z_loss_mlp": 0.40161133, + "step": 3839, + "time_per_iteration": 2.715830087661743 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104385, + "balance_loss_mlp": 1.00372398, + "epoch": 0.7387456714120816, + "flos": 911776785408.0, + "grad_norm": 0.03920302310428291, + "language_loss": 0.80385631, + "learning_rate": 0.00016856425487470216, + "loss": 0.81429482, + "num_input_tokens_seen": 318498128, + "router_z_loss_mlp": 0.40112305, + "step": 3840, + "time_per_iteration": 3.1212151050567627 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044355, + "balance_loss_mlp": 1.00422895, + "epoch": 0.7389380530973452, + "flos": 854197163520.0, + "grad_norm": 0.035349098992081385, + "language_loss": 0.79466581, + "learning_rate": 0.00016833105716012486, + "loss": 0.80510932, + "num_input_tokens_seen": 318578048, + "router_z_loss_mlp": 0.40112305, + "step": 3841, + "time_per_iteration": 3.1690988540649414 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044188, + "balance_loss_mlp": 1.0040617, + "epoch": 0.7391304347826086, + "flos": 818421084672.0, + "grad_norm": 0.0368177293104177, + "language_loss": 0.85204184, + "learning_rate": 0.00016809798821085088, + "loss": 0.86248374, + "num_input_tokens_seen": 318654784, + "router_z_loss_mlp": 0.40112305, + "step": 3842, + "time_per_iteration": 3.033186435699463 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104542, + "balance_loss_mlp": 1.00536537, + "epoch": 0.7393228164678722, + "flos": 573938565888.0, + "grad_norm": 0.03389595177699646, + "language_loss": 0.89421487, + "learning_rate": 0.00016786504811736565, + "loss": 0.90466905, + "num_input_tokens_seen": 318727680, + "router_z_loss_mlp": 0.40039062, + "step": 3843, + "time_per_iteration": 2.723698616027832 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104513, + "balance_loss_mlp": 1.00500441, + "epoch": 0.7395151981531358, + "flos": 686576253696.0, + "grad_norm": 0.0300135100261375, + "language_loss": 0.83072603, + "learning_rate": 0.00016763223697010442, + "loss": 0.84117734, + "num_input_tokens_seen": 318807568, + "router_z_loss_mlp": 0.40112305, + "step": 3844, + "time_per_iteration": 2.975797414779663 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043993, + "balance_loss_mlp": 1.00389111, + "epoch": 0.7397075798383994, + "flos": 557455167744.0, + "grad_norm": 0.04240767697887406, + "language_loss": 0.84802914, + "learning_rate": 0.00016739955485945256, + "loss": 0.85846901, + "num_input_tokens_seen": 318881792, + "router_z_loss_mlp": 0.40087891, + "step": 3845, + "time_per_iteration": 2.720717191696167 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044705, + "balance_loss_mlp": 1.00448394, + "epoch": 0.739899961523663, + "flos": 547822922496.0, + "grad_norm": 0.04053063595065812, + "language_loss": 0.86230588, + "learning_rate": 0.00016716700187574513, + "loss": 0.87275296, + "num_input_tokens_seen": 318951552, + "router_z_loss_mlp": 0.40209961, + "step": 3846, + "time_per_iteration": 2.703578472137451 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045075, + "balance_loss_mlp": 1.00492537, + "epoch": 0.7400923432089265, + "flos": 610304693760.0, + "grad_norm": 0.03543720475620032, + "language_loss": 0.84347486, + "learning_rate": 0.0001669345781092675, + "loss": 0.85392559, + "num_input_tokens_seen": 319022304, + "router_z_loss_mlp": 0.40136719, + "step": 3847, + "time_per_iteration": 2.7703194618225098 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044797, + "balance_loss_mlp": 1.00455177, + "epoch": 0.7402847248941901, + "flos": 592180439040.0, + "grad_norm": 0.0397830502127856, + "language_loss": 0.87809312, + "learning_rate": 0.0001667022836502546, + "loss": 0.8885411, + "num_input_tokens_seen": 319093200, + "router_z_loss_mlp": 0.40234375, + "step": 3848, + "time_per_iteration": 2.760023355484009 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046535, + "balance_loss_mlp": 1.00629044, + "epoch": 0.7404771065794536, + "flos": 478305305856.0, + "grad_norm": 0.03878201132992699, + "language_loss": 0.83579338, + "learning_rate": 0.00016647011858889077, + "loss": 0.84625876, + "num_input_tokens_seen": 319159712, + "router_z_loss_mlp": 0.40234375, + "step": 3849, + "time_per_iteration": 2.566498041152954 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044055, + "balance_loss_mlp": 1.00385714, + "epoch": 0.7406694882647172, + "flos": 497467729152.0, + "grad_norm": 0.04044358723064945, + "language_loss": 0.86492926, + "learning_rate": 0.00016623808301531056, + "loss": 0.87536979, + "num_input_tokens_seen": 319230544, + "router_z_loss_mlp": 0.40185547, + "step": 3850, + "time_per_iteration": 2.659444808959961 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043918, + "balance_loss_mlp": 1.00367308, + "epoch": 0.7408618699499807, + "flos": 563327444736.0, + "grad_norm": 0.04103255616090965, + "language_loss": 0.79822052, + "learning_rate": 0.00016600617701959842, + "loss": 0.80865979, + "num_input_tokens_seen": 319305440, + "router_z_loss_mlp": 0.40234375, + "step": 3851, + "time_per_iteration": 2.7590160369873047 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044136, + "balance_loss_mlp": 1.0050354, + "epoch": 0.7410542516352443, + "flos": 1391472136704.0, + "grad_norm": 0.004180276378427017, + "language_loss": 0.78843814, + "learning_rate": 0.00016577440069178811, + "loss": 0.7988795, + "num_input_tokens_seen": 319534384, + "router_z_loss_mlp": 0.390625, + "step": 3852, + "time_per_iteration": 4.960458040237427 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047168, + "balance_loss_mlp": 1.00699425, + "epoch": 0.7412466333205079, + "flos": 671212682496.0, + "grad_norm": 0.032734679500117485, + "language_loss": 0.81693292, + "learning_rate": 0.00016554275412186315, + "loss": 0.82740462, + "num_input_tokens_seen": 319610960, + "router_z_loss_mlp": 0.40161133, + "step": 3853, + "time_per_iteration": 2.8345468044281006 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045922, + "balance_loss_mlp": 1.00579631, + "epoch": 0.7414390150057715, + "flos": 490319122944.0, + "grad_norm": 0.03898197484032271, + "language_loss": 0.81142187, + "learning_rate": 0.0001653112373997568, + "loss": 0.82188106, + "num_input_tokens_seen": 319683872, + "router_z_loss_mlp": 0.40112305, + "step": 3854, + "time_per_iteration": 2.6750757694244385 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048137, + "balance_loss_mlp": 1.00786769, + "epoch": 0.7416313966910351, + "flos": 600494558976.0, + "grad_norm": 0.046812555930759385, + "language_loss": 0.75529599, + "learning_rate": 0.0001650798506153517, + "loss": 0.76577735, + "num_input_tokens_seen": 319750032, + "router_z_loss_mlp": 0.40258789, + "step": 3855, + "time_per_iteration": 2.7398931980133057 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044745, + "balance_loss_mlp": 1.00440431, + "epoch": 0.7418237783762985, + "flos": 543587611392.0, + "grad_norm": 0.04165043457756402, + "language_loss": 0.84612322, + "learning_rate": 0.00016484859385848023, + "loss": 0.85657072, + "num_input_tokens_seen": 319818864, + "router_z_loss_mlp": 0.40332031, + "step": 3856, + "time_per_iteration": 2.6185436248779297 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047216, + "balance_loss_mlp": 1.00692356, + "epoch": 0.7420161600615621, + "flos": 545224577280.0, + "grad_norm": 0.03738954086230496, + "language_loss": 0.77780879, + "learning_rate": 0.0001646174672189243, + "loss": 0.78828102, + "num_input_tokens_seen": 319888816, + "router_z_loss_mlp": 0.40283203, + "step": 3857, + "time_per_iteration": 2.689188241958618 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046862, + "balance_loss_mlp": 1.00661707, + "epoch": 0.7422085417468257, + "flos": 528211401216.0, + "grad_norm": 0.03526154422012509, + "language_loss": 0.80570501, + "learning_rate": 0.00016438647078641488, + "loss": 0.81617367, + "num_input_tokens_seen": 319956176, + "router_z_loss_mlp": 0.40234375, + "step": 3858, + "time_per_iteration": 2.5922017097473145 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042793, + "balance_loss_mlp": 1.00247657, + "epoch": 0.7424009234320893, + "flos": 509761502976.0, + "grad_norm": 0.033547873778652565, + "language_loss": 0.83657616, + "learning_rate": 0.00016415560465063344, + "loss": 0.84700406, + "num_input_tokens_seen": 320028560, + "router_z_loss_mlp": 0.40307617, + "step": 3859, + "time_per_iteration": 2.7559196949005127 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042486, + "balance_loss_mlp": 1.00216925, + "epoch": 0.7425933051173528, + "flos": 513607987200.0, + "grad_norm": 0.0418042544684692, + "language_loss": 0.79894865, + "learning_rate": 0.0001639248689012095, + "loss": 0.8093735, + "num_input_tokens_seen": 320096112, + "router_z_loss_mlp": 0.40307617, + "step": 3860, + "time_per_iteration": 2.5863146781921387 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042581, + "balance_loss_mlp": 1.00235939, + "epoch": 0.7427856868026164, + "flos": 459378119424.0, + "grad_norm": 0.03937431006783476, + "language_loss": 0.88026142, + "learning_rate": 0.00016369426362772271, + "loss": 0.89068723, + "num_input_tokens_seen": 320168992, + "router_z_loss_mlp": 0.40209961, + "step": 3861, + "time_per_iteration": 2.761857271194458 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046937, + "balance_loss_mlp": 1.00681162, + "epoch": 0.74297806848788, + "flos": 606188946432.0, + "grad_norm": 0.03201159100602054, + "language_loss": 0.80801797, + "learning_rate": 0.00016346378891970233, + "loss": 0.81848741, + "num_input_tokens_seen": 320247264, + "router_z_loss_mlp": 0.40112305, + "step": 3862, + "time_per_iteration": 2.8071773052215576 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047986, + "balance_loss_mlp": 1.00797915, + "epoch": 0.7431704501731435, + "flos": 893071229952.0, + "grad_norm": 0.0336740145247338, + "language_loss": 0.81989479, + "learning_rate": 0.00016323344486662633, + "loss": 0.8303746, + "num_input_tokens_seen": 320338992, + "router_z_loss_mlp": 0.39990234, + "step": 3863, + "time_per_iteration": 3.324979066848755 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048133, + "balance_loss_mlp": 1.0081737, + "epoch": 0.7433628318584071, + "flos": 593352755712.0, + "grad_norm": 0.03174757765296807, + "language_loss": 0.78870291, + "learning_rate": 0.00016300323155792247, + "loss": 0.7991842, + "num_input_tokens_seen": 320422096, + "router_z_loss_mlp": 0.39941406, + "step": 3864, + "time_per_iteration": 2.9272854328155518 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052134, + "balance_loss_mlp": 1.01215136, + "epoch": 0.7435552135436706, + "flos": 478190599680.0, + "grad_norm": 0.033980491156459056, + "language_loss": 0.89128578, + "learning_rate": 0.00016277314908296687, + "loss": 0.90180707, + "num_input_tokens_seen": 320492640, + "router_z_loss_mlp": 0.3996582, + "step": 3865, + "time_per_iteration": 2.6214301586151123 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050851, + "balance_loss_mlp": 1.01086855, + "epoch": 0.7437475952289342, + "flos": 674432179200.0, + "grad_norm": 0.04325039484001494, + "language_loss": 0.76593798, + "learning_rate": 0.00016254319753108604, + "loss": 0.77644652, + "num_input_tokens_seen": 320565264, + "router_z_loss_mlp": 0.3996582, + "step": 3866, + "time_per_iteration": 2.899153232574463 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047117, + "balance_loss_mlp": 1.00706291, + "epoch": 0.7439399769141978, + "flos": 771771424512.0, + "grad_norm": 0.03836259627327615, + "language_loss": 0.77282906, + "learning_rate": 0.00016231337699155492, + "loss": 0.78330016, + "num_input_tokens_seen": 320647584, + "router_z_loss_mlp": 0.40039062, + "step": 3867, + "time_per_iteration": 3.037646532058716 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046954, + "balance_loss_mlp": 1.00680459, + "epoch": 0.7441323585994614, + "flos": 649039754496.0, + "grad_norm": 0.035166098424979836, + "language_loss": 0.78786439, + "learning_rate": 0.0001620836875535977, + "loss": 0.79833388, + "num_input_tokens_seen": 320722752, + "router_z_loss_mlp": 0.40136719, + "step": 3868, + "time_per_iteration": 2.850342273712158 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044105, + "balance_loss_mlp": 1.00385988, + "epoch": 0.7443247402847248, + "flos": 566501254656.0, + "grad_norm": 0.03170658148117992, + "language_loss": 0.81203747, + "learning_rate": 0.00016185412930638766, + "loss": 0.82247853, + "num_input_tokens_seen": 320802496, + "router_z_loss_mlp": 0.40234375, + "step": 3869, + "time_per_iteration": 2.845094680786133 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042983, + "balance_loss_mlp": 1.00283337, + "epoch": 0.7445171219699884, + "flos": 579680585472.0, + "grad_norm": 0.03566273998402668, + "language_loss": 0.8328712, + "learning_rate": 0.00016162470233904765, + "loss": 0.843301, + "num_input_tokens_seen": 320872496, + "router_z_loss_mlp": 0.40136719, + "step": 3870, + "time_per_iteration": 2.720104217529297 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043223, + "balance_loss_mlp": 1.00304985, + "epoch": 0.744709503655252, + "flos": 620030257920.0, + "grad_norm": 0.03479057330030947, + "language_loss": 0.82728422, + "learning_rate": 0.00016139540674064856, + "loss": 0.83771646, + "num_input_tokens_seen": 320944992, + "router_z_loss_mlp": 0.40161133, + "step": 3871, + "time_per_iteration": 2.7673120498657227 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042095, + "balance_loss_mlp": 1.00208843, + "epoch": 0.7449018853405156, + "flos": 529681171200.0, + "grad_norm": 0.03196452770059439, + "language_loss": 0.78282529, + "learning_rate": 0.00016116624260021113, + "loss": 0.79324627, + "num_input_tokens_seen": 321020208, + "router_z_loss_mlp": 0.39990234, + "step": 3872, + "time_per_iteration": 2.7602975368499756 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042056, + "balance_loss_mlp": 1.00197768, + "epoch": 0.7450942670257792, + "flos": 434223855360.0, + "grad_norm": 0.03942691463996184, + "language_loss": 0.84282726, + "learning_rate": 0.0001609372100067046, + "loss": 0.85324788, + "num_input_tokens_seen": 321085984, + "router_z_loss_mlp": 0.40063477, + "step": 3873, + "time_per_iteration": 2.557443618774414 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043349, + "balance_loss_mlp": 1.00324678, + "epoch": 0.7452866487110427, + "flos": 698166250752.0, + "grad_norm": 0.03979606180562333, + "language_loss": 0.85209823, + "learning_rate": 0.0001607083090490475, + "loss": 0.86253166, + "num_input_tokens_seen": 321163200, + "router_z_loss_mlp": 0.40087891, + "step": 3874, + "time_per_iteration": 2.9215829372406006 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042214, + "balance_loss_mlp": 1.00213587, + "epoch": 0.7454790303963063, + "flos": 513280398336.0, + "grad_norm": 0.038948732221191794, + "language_loss": 0.80756831, + "learning_rate": 0.00016047953981610714, + "loss": 0.81799042, + "num_input_tokens_seen": 321237328, + "router_z_loss_mlp": 0.40063477, + "step": 3875, + "time_per_iteration": 2.7751615047454834 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042316, + "balance_loss_mlp": 1.00331116, + "epoch": 0.7456714120815698, + "flos": 1328876637696.0, + "grad_norm": 0.007608844356592571, + "language_loss": 0.7972964, + "learning_rate": 0.00016025090239669916, + "loss": 0.80771959, + "num_input_tokens_seen": 321456192, + "router_z_loss_mlp": 0.38964844, + "step": 3876, + "time_per_iteration": 4.963236331939697 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104273, + "balance_loss_mlp": 1.00258029, + "epoch": 0.7458637937668334, + "flos": 722972517120.0, + "grad_norm": 0.03405336651276997, + "language_loss": 0.81492639, + "learning_rate": 0.0001600223968795889, + "loss": 0.82535368, + "num_input_tokens_seen": 321530560, + "router_z_loss_mlp": 0.40136719, + "step": 3877, + "time_per_iteration": 2.910365581512451 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01040028, + "balance_loss_mlp": 1.00102234, + "epoch": 0.746056175452097, + "flos": 1504869014784.0, + "grad_norm": 0.004565558570820898, + "language_loss": 0.75696075, + "learning_rate": 0.00015979402335349004, + "loss": 0.76736104, + "num_input_tokens_seen": 321760928, + "router_z_loss_mlp": 0.38964844, + "step": 3878, + "time_per_iteration": 4.932594060897827 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042019, + "balance_loss_mlp": 1.00196517, + "epoch": 0.7462485571373605, + "flos": 521295119616.0, + "grad_norm": 0.03746689938213739, + "language_loss": 0.82366681, + "learning_rate": 0.00015956578190706483, + "loss": 0.83408701, + "num_input_tokens_seen": 321833248, + "router_z_loss_mlp": 0.40039062, + "step": 3879, + "time_per_iteration": 2.6971168518066406 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043092, + "balance_loss_mlp": 1.00282276, + "epoch": 0.7464409388226241, + "flos": 482167341312.0, + "grad_norm": 0.03527801182694915, + "language_loss": 0.76289219, + "learning_rate": 0.00015933767262892468, + "loss": 0.77332312, + "num_input_tokens_seen": 321905904, + "router_z_loss_mlp": 0.40258789, + "step": 3880, + "time_per_iteration": 2.739508628845215 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043029, + "balance_loss_mlp": 1.00290275, + "epoch": 0.7466333205078877, + "flos": 487742164992.0, + "grad_norm": 0.04213099092543845, + "language_loss": 0.82585847, + "learning_rate": 0.00015910969560762927, + "loss": 0.83628881, + "num_input_tokens_seen": 321971920, + "router_z_loss_mlp": 0.40112305, + "step": 3881, + "time_per_iteration": 2.562812089920044 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01041177, + "balance_loss_mlp": 1.00102758, + "epoch": 0.7468257021931513, + "flos": 612408254208.0, + "grad_norm": 0.03436500005268551, + "language_loss": 0.83349586, + "learning_rate": 0.00015888185093168727, + "loss": 0.84390759, + "num_input_tokens_seen": 322041904, + "router_z_loss_mlp": 0.40136719, + "step": 3882, + "time_per_iteration": 2.775710105895996 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044594, + "balance_loss_mlp": 1.00434875, + "epoch": 0.7470180838784147, + "flos": 534485144064.0, + "grad_norm": 0.033392076126709996, + "language_loss": 0.81580567, + "learning_rate": 0.00015865413868955581, + "loss": 0.82625163, + "num_input_tokens_seen": 322110816, + "router_z_loss_mlp": 0.40234375, + "step": 3883, + "time_per_iteration": 2.641209125518799 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042889, + "balance_loss_mlp": 1.00276339, + "epoch": 0.7472104655636783, + "flos": 740673918720.0, + "grad_norm": 0.03165690169757385, + "language_loss": 0.83215499, + "learning_rate": 0.00015842655896964054, + "loss": 0.84258389, + "num_input_tokens_seen": 322192704, + "router_z_loss_mlp": 0.40112305, + "step": 3884, + "time_per_iteration": 3.0401206016540527 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042132, + "balance_loss_mlp": 1.00191045, + "epoch": 0.7474028472489419, + "flos": 641502321408.0, + "grad_norm": 0.03740320780985832, + "language_loss": 0.74281669, + "learning_rate": 0.00015819911186029567, + "loss": 0.75323802, + "num_input_tokens_seen": 322263888, + "router_z_loss_mlp": 0.40209961, + "step": 3885, + "time_per_iteration": 2.7730581760406494 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104359, + "balance_loss_mlp": 1.00332129, + "epoch": 0.7475952289342055, + "flos": 591326962944.0, + "grad_norm": 0.03361665798046632, + "language_loss": 0.8701033, + "learning_rate": 0.00015797179744982443, + "loss": 0.88053918, + "num_input_tokens_seen": 322331936, + "router_z_loss_mlp": 0.40258789, + "step": 3886, + "time_per_iteration": 2.708472967147827 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043352, + "balance_loss_mlp": 1.00303507, + "epoch": 0.7477876106194691, + "flos": 489220683264.0, + "grad_norm": 0.029904604338816032, + "language_loss": 0.79095513, + "learning_rate": 0.00015774461582647765, + "loss": 0.80138862, + "num_input_tokens_seen": 322402032, + "router_z_loss_mlp": 0.40307617, + "step": 3887, + "time_per_iteration": 2.619105100631714 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044379, + "balance_loss_mlp": 1.00406253, + "epoch": 0.7479799923047326, + "flos": 555790011648.0, + "grad_norm": 0.036783241933874694, + "language_loss": 0.81563759, + "learning_rate": 0.00015751756707845505, + "loss": 0.82608134, + "num_input_tokens_seen": 322472512, + "router_z_loss_mlp": 0.40307617, + "step": 3888, + "time_per_iteration": 2.639768123626709 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01041973, + "balance_loss_mlp": 1.00170422, + "epoch": 0.7481723739899961, + "flos": 768791055360.0, + "grad_norm": 0.03246382733666718, + "language_loss": 0.88938636, + "learning_rate": 0.00015729065129390502, + "loss": 0.89980614, + "num_input_tokens_seen": 322555104, + "router_z_loss_mlp": 0.40258789, + "step": 3889, + "time_per_iteration": 3.0039196014404297 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01041937, + "balance_loss_mlp": 1.00162077, + "epoch": 0.7483647556752597, + "flos": 497161527552.0, + "grad_norm": 0.037416161983298064, + "language_loss": 0.82518947, + "learning_rate": 0.0001570638685609241, + "loss": 0.83560884, + "num_input_tokens_seen": 322621904, + "router_z_loss_mlp": 0.40307617, + "step": 3890, + "time_per_iteration": 2.6009106636047363 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042747, + "balance_loss_mlp": 1.00238311, + "epoch": 0.7485571373605233, + "flos": 473826976512.0, + "grad_norm": 0.0374886975546847, + "language_loss": 0.80841064, + "learning_rate": 0.00015683721896755693, + "loss": 0.81883812, + "num_input_tokens_seen": 322688928, + "router_z_loss_mlp": 0.40356445, + "step": 3891, + "time_per_iteration": 2.5633225440979004 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050194, + "balance_loss_mlp": 1.0109024, + "epoch": 0.7487495190457868, + "flos": 1557901287936.0, + "grad_norm": 0.009107033640044568, + "language_loss": 0.82210493, + "learning_rate": 0.00015661070260179682, + "loss": 0.83260679, + "num_input_tokens_seen": 322928464, + "router_z_loss_mlp": 0.39257812, + "step": 3892, + "time_per_iteration": 4.94974160194397 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046825, + "balance_loss_mlp": 1.00665128, + "epoch": 0.7489419007310504, + "flos": 582967156224.0, + "grad_norm": 0.04143959916189291, + "language_loss": 0.85828441, + "learning_rate": 0.00015638431955158528, + "loss": 0.8687526, + "num_input_tokens_seen": 323002672, + "router_z_loss_mlp": 0.40161133, + "step": 3893, + "time_per_iteration": 2.6816978454589844 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047314, + "balance_loss_mlp": 1.0072118, + "epoch": 0.749134282416314, + "flos": 568698134016.0, + "grad_norm": 0.030135437984765083, + "language_loss": 0.81634343, + "learning_rate": 0.00015615806990481186, + "loss": 0.82681662, + "num_input_tokens_seen": 323076480, + "router_z_loss_mlp": 0.40087891, + "step": 3894, + "time_per_iteration": 2.7294962406158447 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046959, + "balance_loss_mlp": 1.0068568, + "epoch": 0.7493266641015776, + "flos": 534166303488.0, + "grad_norm": 0.0348465154646137, + "language_loss": 0.84720361, + "learning_rate": 0.00015593195374931452, + "loss": 0.85767317, + "num_input_tokens_seen": 323151840, + "router_z_loss_mlp": 0.40087891, + "step": 3895, + "time_per_iteration": 2.7430076599121094 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047, + "balance_loss_mlp": 1.00685048, + "epoch": 0.7495190457868411, + "flos": 524718750720.0, + "grad_norm": 0.040656951694221274, + "language_loss": 0.80276871, + "learning_rate": 0.00015570597117287922, + "loss": 0.81323874, + "num_input_tokens_seen": 323223376, + "router_z_loss_mlp": 0.40136719, + "step": 3896, + "time_per_iteration": 2.6507298946380615 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01041154, + "balance_loss_mlp": 1.00107622, + "epoch": 0.7497114274721046, + "flos": 515190518016.0, + "grad_norm": 0.03462966662761621, + "language_loss": 0.78418148, + "learning_rate": 0.0001554801222632406, + "loss": 0.79459298, + "num_input_tokens_seen": 323290288, + "router_z_loss_mlp": 0.40063477, + "step": 3897, + "time_per_iteration": 2.595093250274658 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042035, + "balance_loss_mlp": 1.00186181, + "epoch": 0.7499038091573682, + "flos": 495997959168.0, + "grad_norm": 0.03336183376647943, + "language_loss": 0.85394609, + "learning_rate": 0.00015525440710808052, + "loss": 0.86436647, + "num_input_tokens_seen": 323359568, + "router_z_loss_mlp": 0.40161133, + "step": 3898, + "time_per_iteration": 2.643571376800537 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043313, + "balance_loss_mlp": 1.00302041, + "epoch": 0.7500961908426318, + "flos": 738989320704.0, + "grad_norm": 0.03519199778666105, + "language_loss": 0.78480381, + "learning_rate": 0.00015502882579502953, + "loss": 0.79523695, + "num_input_tokens_seen": 323436688, + "router_z_loss_mlp": 0.40283203, + "step": 3899, + "time_per_iteration": 2.97965669631958 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043279, + "balance_loss_mlp": 1.00303352, + "epoch": 0.7502885725278954, + "flos": 534537633792.0, + "grad_norm": 0.03091865582012727, + "language_loss": 0.85061979, + "learning_rate": 0.00015480337841166592, + "loss": 0.86105257, + "num_input_tokens_seen": 323510032, + "router_z_loss_mlp": 0.40234375, + "step": 3900, + "time_per_iteration": 2.7444653511047363 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043426, + "balance_loss_mlp": 1.00322855, + "epoch": 0.7504809542131589, + "flos": 590559024384.0, + "grad_norm": 0.034641340110691664, + "language_loss": 0.83055896, + "learning_rate": 0.00015457806504551647, + "loss": 0.84099317, + "num_input_tokens_seen": 323588896, + "router_z_loss_mlp": 0.40185547, + "step": 3901, + "time_per_iteration": 2.847846269607544 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047237, + "balance_loss_mlp": 1.0071348, + "epoch": 0.7506733358984224, + "flos": 512583424512.0, + "grad_norm": 0.03350221131006084, + "language_loss": 0.78925437, + "learning_rate": 0.0001543528857840554, + "loss": 0.79972672, + "num_input_tokens_seen": 323661280, + "router_z_loss_mlp": 0.40087891, + "step": 3902, + "time_per_iteration": 2.6609957218170166 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047342, + "balance_loss_mlp": 1.00728762, + "epoch": 0.750865717583686, + "flos": 540383665920.0, + "grad_norm": 0.03644816467758723, + "language_loss": 0.80910051, + "learning_rate": 0.000154127840714705, + "loss": 0.81957394, + "num_input_tokens_seen": 323739200, + "router_z_loss_mlp": 0.40039062, + "step": 3903, + "time_per_iteration": 2.778198003768921 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048232, + "balance_loss_mlp": 1.00810659, + "epoch": 0.7510580992689496, + "flos": 477541257984.0, + "grad_norm": 0.040090358516612946, + "language_loss": 0.8254571, + "learning_rate": 0.00015390292992483557, + "loss": 0.83593941, + "num_input_tokens_seen": 323802816, + "router_z_loss_mlp": 0.40112305, + "step": 3904, + "time_per_iteration": 2.5382485389709473 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047868, + "balance_loss_mlp": 1.0078609, + "epoch": 0.7512504809542132, + "flos": 580201615104.0, + "grad_norm": 0.03358602757025677, + "language_loss": 0.84426451, + "learning_rate": 0.00015367815350176523, + "loss": 0.85474312, + "num_input_tokens_seen": 323879488, + "router_z_loss_mlp": 0.39990234, + "step": 3905, + "time_per_iteration": 2.741651773452759 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104803, + "balance_loss_mlp": 1.00804722, + "epoch": 0.7514428626394767, + "flos": 419564060928.0, + "grad_norm": 0.03247714739847641, + "language_loss": 0.83377486, + "learning_rate": 0.00015345351153275987, + "loss": 0.84425521, + "num_input_tokens_seen": 323944512, + "router_z_loss_mlp": 0.3996582, + "step": 3906, + "time_per_iteration": 2.5285587310791016 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01041564, + "balance_loss_mlp": 1.0013901, + "epoch": 0.7516352443247403, + "flos": 642255675648.0, + "grad_norm": 0.03199624670716249, + "language_loss": 0.81475991, + "learning_rate": 0.00015322900410503332, + "loss": 0.82517552, + "num_input_tokens_seen": 324020688, + "router_z_loss_mlp": 0.40161133, + "step": 3907, + "time_per_iteration": 2.814133405685425 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043835, + "balance_loss_mlp": 1.00366187, + "epoch": 0.7518276260100039, + "flos": 582192414720.0, + "grad_norm": 0.03412627966929826, + "language_loss": 0.77873939, + "learning_rate": 0.00015300463130574703, + "loss": 0.78917778, + "num_input_tokens_seen": 324098080, + "router_z_loss_mlp": 0.40161133, + "step": 3908, + "time_per_iteration": 2.909247875213623 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042204, + "balance_loss_mlp": 1.00210238, + "epoch": 0.7520200076952674, + "flos": 688616630784.0, + "grad_norm": 0.028908861637072923, + "language_loss": 0.82461572, + "learning_rate": 0.00015278039322201033, + "loss": 0.83503771, + "num_input_tokens_seen": 324183968, + "router_z_loss_mlp": 0.40087891, + "step": 3909, + "time_per_iteration": 2.9831857681274414 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044247, + "balance_loss_mlp": 1.00419319, + "epoch": 0.7522123893805309, + "flos": 487416521472.0, + "grad_norm": 0.03727501857461446, + "language_loss": 0.8023864, + "learning_rate": 0.00015255628994088004, + "loss": 0.8128289, + "num_input_tokens_seen": 324249568, + "router_z_loss_mlp": 0.40039062, + "step": 3910, + "time_per_iteration": 2.5681653022766113 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104353, + "balance_loss_mlp": 1.00352335, + "epoch": 0.7524047710657945, + "flos": 820592686080.0, + "grad_norm": 0.03692479601662457, + "language_loss": 0.75641394, + "learning_rate": 0.00015233232154936082, + "loss": 0.76684928, + "num_input_tokens_seen": 324345312, + "router_z_loss_mlp": 0.39990234, + "step": 3911, + "time_per_iteration": 3.284299612045288 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046246, + "balance_loss_mlp": 1.00616753, + "epoch": 0.7525971527510581, + "flos": 700782092544.0, + "grad_norm": 0.03573003611692562, + "language_loss": 0.76908588, + "learning_rate": 0.0001521084881344048, + "loss": 0.77954835, + "num_input_tokens_seen": 324419056, + "router_z_loss_mlp": 0.40063477, + "step": 3912, + "time_per_iteration": 2.8574602603912354 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01041534, + "balance_loss_mlp": 1.00155079, + "epoch": 0.7527895344363217, + "flos": 634950567168.0, + "grad_norm": 0.03264325310237669, + "language_loss": 0.8679074, + "learning_rate": 0.00015188478978291208, + "loss": 0.87832272, + "num_input_tokens_seen": 324490848, + "router_z_loss_mlp": 0.3996582, + "step": 3913, + "time_per_iteration": 2.7522592544555664 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01041792, + "balance_loss_mlp": 1.00173748, + "epoch": 0.7529819161215853, + "flos": 563933044992.0, + "grad_norm": 0.03193556827495635, + "language_loss": 0.86971831, + "learning_rate": 0.00015166122658173014, + "loss": 0.88013625, + "num_input_tokens_seen": 324565648, + "router_z_loss_mlp": 0.40039062, + "step": 3914, + "time_per_iteration": 2.8044931888580322 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042313, + "balance_loss_mlp": 1.00230658, + "epoch": 0.7531742978068487, + "flos": 691957636608.0, + "grad_norm": 0.032939092122736, + "language_loss": 0.89373708, + "learning_rate": 0.00015143779861765332, + "loss": 0.90416014, + "num_input_tokens_seen": 324642832, + "router_z_loss_mlp": 0.39990234, + "step": 3915, + "time_per_iteration": 2.895873546600342 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042622, + "balance_loss_mlp": 1.00266302, + "epoch": 0.7533666794921123, + "flos": 682307894784.0, + "grad_norm": 0.030283450917942635, + "language_loss": 0.81763279, + "learning_rate": 0.00015121450597742458, + "loss": 0.82805902, + "num_input_tokens_seen": 324718336, + "router_z_loss_mlp": 0.39941406, + "step": 3916, + "time_per_iteration": 2.8187012672424316 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042765, + "balance_loss_mlp": 1.00275862, + "epoch": 0.7535590611773759, + "flos": 624814788864.0, + "grad_norm": 0.03530069245734392, + "language_loss": 0.79033458, + "learning_rate": 0.00015099134874773369, + "loss": 0.80076224, + "num_input_tokens_seen": 324787744, + "router_z_loss_mlp": 0.39990234, + "step": 3917, + "time_per_iteration": 2.729224443435669 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042185, + "balance_loss_mlp": 1.0022496, + "epoch": 0.7537514428626395, + "flos": 520494133248.0, + "grad_norm": 0.030735782054698856, + "language_loss": 0.80733752, + "learning_rate": 0.00015076832701521793, + "loss": 0.81775939, + "num_input_tokens_seen": 324863280, + "router_z_loss_mlp": 0.39916992, + "step": 3918, + "time_per_iteration": 2.7341344356536865 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104636, + "balance_loss_mlp": 1.00632989, + "epoch": 0.753943824547903, + "flos": 725035248384.0, + "grad_norm": 0.03833991263993651, + "language_loss": 0.82337809, + "learning_rate": 0.000150545440866462, + "loss": 0.83384174, + "num_input_tokens_seen": 324949600, + "router_z_loss_mlp": 0.40014648, + "step": 3919, + "time_per_iteration": 2.988217353820801 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045499, + "balance_loss_mlp": 1.00544465, + "epoch": 0.7541362062331666, + "flos": 438467914752.0, + "grad_norm": 0.03907672700659196, + "language_loss": 0.78807712, + "learning_rate": 0.000150322690387998, + "loss": 0.79853213, + "num_input_tokens_seen": 325013808, + "router_z_loss_mlp": 0.40039062, + "step": 3920, + "time_per_iteration": 2.503204107284546 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048491, + "balance_loss_mlp": 1.00841296, + "epoch": 0.7543285879184302, + "flos": 566344752384.0, + "grad_norm": 0.03511209658305934, + "language_loss": 0.7581147, + "learning_rate": 0.00015010007566630535, + "loss": 0.76859963, + "num_input_tokens_seen": 325084832, + "router_z_loss_mlp": 0.40063477, + "step": 3921, + "time_per_iteration": 2.785719633102417 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046349, + "balance_loss_mlp": 1.00624704, + "epoch": 0.7545209696036937, + "flos": 522059167488.0, + "grad_norm": 0.043005780548435554, + "language_loss": 0.81968284, + "learning_rate": 0.00014987759678781077, + "loss": 0.83014631, + "num_input_tokens_seen": 325155120, + "router_z_loss_mlp": 0.40087891, + "step": 3922, + "time_per_iteration": 2.611830711364746 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045351, + "balance_loss_mlp": 1.00524938, + "epoch": 0.7547133512889573, + "flos": 617210281728.0, + "grad_norm": 0.034097045182419745, + "language_loss": 0.82924581, + "learning_rate": 0.00014965525383888795, + "loss": 0.83969939, + "num_input_tokens_seen": 325235632, + "router_z_loss_mlp": 0.40087891, + "step": 3923, + "time_per_iteration": 2.7791478633880615 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104536, + "balance_loss_mlp": 1.00532925, + "epoch": 0.7549057329742208, + "flos": 752142406656.0, + "grad_norm": 0.03232128162967594, + "language_loss": 0.72664821, + "learning_rate": 0.00014943304690585851, + "loss": 0.73710179, + "num_input_tokens_seen": 325309696, + "router_z_loss_mlp": 0.40014648, + "step": 3924, + "time_per_iteration": 2.8950600624084473 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047725, + "balance_loss_mlp": 1.00767088, + "epoch": 0.7550981146594844, + "flos": 515451032832.0, + "grad_norm": 0.03846404623424841, + "language_loss": 0.79993105, + "learning_rate": 0.0001492109760749908, + "loss": 0.81040823, + "num_input_tokens_seen": 325375744, + "router_z_loss_mlp": 0.40039062, + "step": 3925, + "time_per_iteration": 2.582379102706909 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047733, + "balance_loss_mlp": 1.00763071, + "epoch": 0.755290496344748, + "flos": 523027349760.0, + "grad_norm": 0.03160852953683284, + "language_loss": 0.80470473, + "learning_rate": 0.00014898904143250002, + "loss": 0.81518203, + "num_input_tokens_seen": 325448384, + "router_z_loss_mlp": 0.40087891, + "step": 3926, + "time_per_iteration": 2.642066240310669 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047142, + "balance_loss_mlp": 1.00804138, + "epoch": 0.7554828780300116, + "flos": 1417706376960.0, + "grad_norm": 0.005903328707274883, + "language_loss": 0.75755203, + "learning_rate": 0.00014876724306454886, + "loss": 0.76802349, + "num_input_tokens_seen": 325678672, + "router_z_loss_mlp": 0.390625, + "step": 3927, + "time_per_iteration": 4.909573793411255 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045265, + "balance_loss_mlp": 1.00513911, + "epoch": 0.7556752597152752, + "flos": 557986891008.0, + "grad_norm": 0.0318859682760306, + "language_loss": 0.80794632, + "learning_rate": 0.0001485455810572474, + "loss": 0.81839895, + "num_input_tokens_seen": 325746656, + "router_z_loss_mlp": 0.40112305, + "step": 3928, + "time_per_iteration": 2.635267734527588 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044887, + "balance_loss_mlp": 1.00466609, + "epoch": 0.7558676414005386, + "flos": 564742779648.0, + "grad_norm": 0.029085057110465686, + "language_loss": 0.84313619, + "learning_rate": 0.00014832405549665236, + "loss": 0.853585, + "num_input_tokens_seen": 325820304, + "router_z_loss_mlp": 0.40209961, + "step": 3929, + "time_per_iteration": 2.7366552352905273 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104529, + "balance_loss_mlp": 1.00514007, + "epoch": 0.7560600230858022, + "flos": 562535206656.0, + "grad_norm": 0.03398651483995001, + "language_loss": 0.79036754, + "learning_rate": 0.00014810266646876746, + "loss": 0.80082047, + "num_input_tokens_seen": 325895584, + "router_z_loss_mlp": 0.40136719, + "step": 3930, + "time_per_iteration": 2.748523712158203 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045678, + "balance_loss_mlp": 1.00550461, + "epoch": 0.7562524047710658, + "flos": 720958384896.0, + "grad_norm": 0.03398387115243252, + "language_loss": 0.78128892, + "learning_rate": 0.00014788141405954364, + "loss": 0.79174572, + "num_input_tokens_seen": 325976752, + "router_z_loss_mlp": 0.40161133, + "step": 3931, + "time_per_iteration": 3.0010688304901123 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046674, + "balance_loss_mlp": 1.0064286, + "epoch": 0.7564447864563294, + "flos": 544397346048.0, + "grad_norm": 0.04087931734394053, + "language_loss": 0.85259515, + "learning_rate": 0.00014766029835487865, + "loss": 0.8630619, + "num_input_tokens_seen": 326047152, + "router_z_loss_mlp": 0.40234375, + "step": 3932, + "time_per_iteration": 2.7051644325256348 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045407, + "balance_loss_mlp": 1.00528109, + "epoch": 0.7566371681415929, + "flos": 727095067392.0, + "grad_norm": 0.040524003150174424, + "language_loss": 0.80254388, + "learning_rate": 0.0001474393194406173, + "loss": 0.812998, + "num_input_tokens_seen": 326119056, + "router_z_loss_mlp": 0.40112305, + "step": 3933, + "time_per_iteration": 2.88698410987854 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045593, + "balance_loss_mlp": 1.00546694, + "epoch": 0.7568295498268565, + "flos": 577807404288.0, + "grad_norm": 0.03205492443871288, + "language_loss": 0.80140668, + "learning_rate": 0.00014721847740255112, + "loss": 0.81186259, + "num_input_tokens_seen": 326196736, + "router_z_loss_mlp": 0.40112305, + "step": 3934, + "time_per_iteration": 2.8201425075531006 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042671, + "balance_loss_mlp": 1.00357056, + "epoch": 0.75702193151212, + "flos": 1523218791168.0, + "grad_norm": 0.006266777740012466, + "language_loss": 0.73911923, + "learning_rate": 0.00014699777232641853, + "loss": 0.74954593, + "num_input_tokens_seen": 326404752, + "router_z_loss_mlp": 0.390625, + "step": 3935, + "time_per_iteration": 4.622663736343384 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01041056, + "balance_loss_mlp": 1.00100183, + "epoch": 0.7572143131973836, + "flos": 526489864704.0, + "grad_norm": 0.04266541401518767, + "language_loss": 0.78904128, + "learning_rate": 0.00014677720429790526, + "loss": 0.79945183, + "num_input_tokens_seen": 326472832, + "router_z_loss_mlp": 0.40039062, + "step": 3936, + "time_per_iteration": 2.5691311359405518 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104174, + "balance_loss_mlp": 1.00159049, + "epoch": 0.7574066948826472, + "flos": 551823963648.0, + "grad_norm": 0.029232134856981343, + "language_loss": 0.85000217, + "learning_rate": 0.0001465567734026429, + "loss": 0.86041951, + "num_input_tokens_seen": 326546976, + "router_z_loss_mlp": 0.40136719, + "step": 3937, + "time_per_iteration": 2.6958813667297363 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045039, + "balance_loss_mlp": 1.00488961, + "epoch": 0.7575990765679107, + "flos": 396769981440.0, + "grad_norm": 0.04157992306337891, + "language_loss": 0.82874024, + "learning_rate": 0.00014633647972621034, + "loss": 0.83919066, + "num_input_tokens_seen": 326609296, + "router_z_loss_mlp": 0.40136719, + "step": 3938, + "time_per_iteration": 2.443800449371338 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045648, + "balance_loss_mlp": 1.00556958, + "epoch": 0.7577914582531743, + "flos": 586186652928.0, + "grad_norm": 0.031504909373110845, + "language_loss": 0.86987495, + "learning_rate": 0.00014611632335413354, + "loss": 0.8803314, + "num_input_tokens_seen": 326687168, + "router_z_loss_mlp": 0.40063477, + "step": 3939, + "time_per_iteration": 2.7657620906829834 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043556, + "balance_loss_mlp": 1.00357366, + "epoch": 0.7579838399384379, + "flos": 822485309184.0, + "grad_norm": 0.033895333971604005, + "language_loss": 0.83048445, + "learning_rate": 0.00014589630437188456, + "loss": 0.84091997, + "num_input_tokens_seen": 326777760, + "router_z_loss_mlp": 0.3996582, + "step": 3940, + "time_per_iteration": 3.1827540397644043 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045396, + "balance_loss_mlp": 1.00527036, + "epoch": 0.7581762216237015, + "flos": 444806786304.0, + "grad_norm": 0.03886523682666057, + "language_loss": 0.78962266, + "learning_rate": 0.00014567642286488253, + "loss": 0.8000766, + "num_input_tokens_seen": 326843952, + "router_z_loss_mlp": 0.40112305, + "step": 3941, + "time_per_iteration": 2.5701324939727783 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045219, + "balance_loss_mlp": 1.00506926, + "epoch": 0.7583686033089649, + "flos": 541939951872.0, + "grad_norm": 0.03861315862447661, + "language_loss": 0.79739159, + "learning_rate": 0.00014545667891849258, + "loss": 0.8078438, + "num_input_tokens_seen": 326911296, + "router_z_loss_mlp": 0.40136719, + "step": 3942, + "time_per_iteration": 2.6185083389282227 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046879, + "balance_loss_mlp": 1.00675285, + "epoch": 0.7585609849942285, + "flos": 523613508096.0, + "grad_norm": 0.03344324045472487, + "language_loss": 0.82940769, + "learning_rate": 0.00014523707261802733, + "loss": 0.83987653, + "num_input_tokens_seen": 326977776, + "router_z_loss_mlp": 0.40112305, + "step": 3943, + "time_per_iteration": 2.615499973297119 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045818, + "balance_loss_mlp": 1.00564396, + "epoch": 0.7587533666794921, + "flos": 542908134144.0, + "grad_norm": 0.03989389594451329, + "language_loss": 0.81696534, + "learning_rate": 0.00014501760404874527, + "loss": 0.82742351, + "num_input_tokens_seen": 327050240, + "router_z_loss_mlp": 0.40161133, + "step": 3944, + "time_per_iteration": 2.7015254497528076 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047263, + "balance_loss_mlp": 1.00713706, + "epoch": 0.7589457483647557, + "flos": 607521656064.0, + "grad_norm": 0.037013243760391015, + "language_loss": 0.86645532, + "learning_rate": 0.00014479827329585176, + "loss": 0.87692797, + "num_input_tokens_seen": 327119952, + "router_z_loss_mlp": 0.40112305, + "step": 3945, + "time_per_iteration": 2.707260847091675 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048064, + "balance_loss_mlp": 1.008057, + "epoch": 0.7591381300500193, + "flos": 556252715520.0, + "grad_norm": 0.030362278965781222, + "language_loss": 0.85217047, + "learning_rate": 0.00014457908044449846, + "loss": 0.86265111, + "num_input_tokens_seen": 327192640, + "router_z_loss_mlp": 0.39990234, + "step": 3946, + "time_per_iteration": 2.7425830364227295 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048648, + "balance_loss_mlp": 1.00868881, + "epoch": 0.7593305117352828, + "flos": 530814604032.0, + "grad_norm": 0.0320699776647955, + "language_loss": 0.83156931, + "learning_rate": 0.00014436002557978371, + "loss": 0.8420558, + "num_input_tokens_seen": 327271008, + "router_z_loss_mlp": 0.39941406, + "step": 3947, + "time_per_iteration": 2.852153778076172 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052135, + "balance_loss_mlp": 1.01313019, + "epoch": 0.7595228934205464, + "flos": 1505925658368.0, + "grad_norm": 0.007143494000939788, + "language_loss": 0.76643145, + "learning_rate": 0.00014414110878675201, + "loss": 0.77695286, + "num_input_tokens_seen": 327505392, + "router_z_loss_mlp": 0.38964844, + "step": 3948, + "time_per_iteration": 4.8901238441467285 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043417, + "balance_loss_mlp": 1.00338674, + "epoch": 0.7597152751058099, + "flos": 456468715008.0, + "grad_norm": 0.03356126441084979, + "language_loss": 0.80132592, + "learning_rate": 0.0001439223301503945, + "loss": 0.81176007, + "num_input_tokens_seen": 327569392, + "router_z_loss_mlp": 0.40014648, + "step": 3949, + "time_per_iteration": 2.5245442390441895 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042713, + "balance_loss_mlp": 1.0028255, + "epoch": 0.7599076567910735, + "flos": 686799830016.0, + "grad_norm": 0.04215278284699455, + "language_loss": 0.76435691, + "learning_rate": 0.00014370368975564834, + "loss": 0.77478409, + "num_input_tokens_seen": 327648304, + "router_z_loss_mlp": 0.39868164, + "step": 3950, + "time_per_iteration": 3.002926826477051 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042825, + "balance_loss_mlp": 1.0027709, + "epoch": 0.760100038476337, + "flos": 533495574528.0, + "grad_norm": 0.03911832457042585, + "language_loss": 0.84080267, + "learning_rate": 0.00014348518768739766, + "loss": 0.85123098, + "num_input_tokens_seen": 327725600, + "router_z_loss_mlp": 0.40039062, + "step": 3951, + "time_per_iteration": 2.7287793159484863 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046905, + "balance_loss_mlp": 1.00780487, + "epoch": 0.7602924201616006, + "flos": 1474919526144.0, + "grad_norm": 0.009800306556812065, + "language_loss": 0.7672804, + "learning_rate": 0.00014326682403047243, + "loss": 0.77774942, + "num_input_tokens_seen": 327954048, + "router_z_loss_mlp": 0.390625, + "step": 3952, + "time_per_iteration": 4.851192951202393 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01041682, + "balance_loss_mlp": 1.00179482, + "epoch": 0.7604848018468642, + "flos": 776041728768.0, + "grad_norm": 0.043428396505350506, + "language_loss": 0.86555135, + "learning_rate": 0.00014304859886964867, + "loss": 0.87596822, + "num_input_tokens_seen": 328034656, + "router_z_loss_mlp": 0.39868164, + "step": 3953, + "time_per_iteration": 3.0201337337493896 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044206, + "balance_loss_mlp": 1.00415182, + "epoch": 0.7606771835321278, + "flos": 559261274880.0, + "grad_norm": 0.03249370950181494, + "language_loss": 0.8406316, + "learning_rate": 0.00014283051228964878, + "loss": 0.85107362, + "num_input_tokens_seen": 328107264, + "router_z_loss_mlp": 0.40039062, + "step": 3954, + "time_per_iteration": 2.6745314598083496 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046758, + "balance_loss_mlp": 1.00687051, + "epoch": 0.7608695652173914, + "flos": 526433484288.0, + "grad_norm": 0.03436460979792566, + "language_loss": 0.83105361, + "learning_rate": 0.00014261256437514197, + "loss": 0.84152114, + "num_input_tokens_seen": 328177168, + "router_z_loss_mlp": 0.39868164, + "step": 3955, + "time_per_iteration": 2.6607260704040527 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046091, + "balance_loss_mlp": 1.0060848, + "epoch": 0.7610619469026548, + "flos": 616168222464.0, + "grad_norm": 0.03814764574124358, + "language_loss": 0.82773203, + "learning_rate": 0.0001423947552107428, + "loss": 0.83819294, + "num_input_tokens_seen": 328245360, + "router_z_loss_mlp": 0.39990234, + "step": 3956, + "time_per_iteration": 2.731502056121826 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01040395, + "balance_loss_mlp": 1.00053155, + "epoch": 0.7612543285879184, + "flos": 864818978304.0, + "grad_norm": 0.03440554152429829, + "language_loss": 0.77563798, + "learning_rate": 0.00014217708488101243, + "loss": 0.78604192, + "num_input_tokens_seen": 328326560, + "router_z_loss_mlp": 0.3984375, + "step": 3957, + "time_per_iteration": 3.0592825412750244 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01040698, + "balance_loss_mlp": 1.00076258, + "epoch": 0.761446710273182, + "flos": 554728510464.0, + "grad_norm": 0.045631291273616384, + "language_loss": 0.77730322, + "learning_rate": 0.0001419595534704579, + "loss": 0.78771019, + "num_input_tokens_seen": 328395760, + "router_z_loss_mlp": 0.39916992, + "step": 3958, + "time_per_iteration": 2.693791389465332 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01041415, + "balance_loss_mlp": 1.00143242, + "epoch": 0.7616390919584456, + "flos": 468326029824.0, + "grad_norm": 0.03770259597334161, + "language_loss": 0.81622386, + "learning_rate": 0.00014174216106353237, + "loss": 0.82663804, + "num_input_tokens_seen": 328464560, + "router_z_loss_mlp": 0.3996582, + "step": 3959, + "time_per_iteration": 2.6240234375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043866, + "balance_loss_mlp": 1.00385952, + "epoch": 0.7618314736437091, + "flos": 499432283904.0, + "grad_norm": 0.036732960604225574, + "language_loss": 0.76590341, + "learning_rate": 0.00014152490774463512, + "loss": 0.77634203, + "num_input_tokens_seen": 328532640, + "router_z_loss_mlp": 0.39990234, + "step": 3960, + "time_per_iteration": 2.6385769844055176 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042369, + "balance_loss_mlp": 1.00236273, + "epoch": 0.7620238553289727, + "flos": 435452552448.0, + "grad_norm": 0.04258907673967457, + "language_loss": 0.87829125, + "learning_rate": 0.00014130779359811135, + "loss": 0.88871497, + "num_input_tokens_seen": 328595392, + "router_z_loss_mlp": 0.39990234, + "step": 3961, + "time_per_iteration": 2.530336380004883 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046467, + "balance_loss_mlp": 1.00657988, + "epoch": 0.7622162370142362, + "flos": 665542594560.0, + "grad_norm": 0.03171084912805384, + "language_loss": 0.86222768, + "learning_rate": 0.0001410908187082521, + "loss": 0.87269235, + "num_input_tokens_seen": 328676368, + "router_z_loss_mlp": 0.39868164, + "step": 3962, + "time_per_iteration": 2.8736419677734375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047801, + "balance_loss_mlp": 1.0077945, + "epoch": 0.7624086186994998, + "flos": 559028950272.0, + "grad_norm": 0.03864138857233312, + "language_loss": 0.84107929, + "learning_rate": 0.0001408739831592949, + "loss": 0.85155731, + "num_input_tokens_seen": 328745136, + "router_z_loss_mlp": 0.39990234, + "step": 3963, + "time_per_iteration": 2.639000415802002 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048183, + "balance_loss_mlp": 1.00829566, + "epoch": 0.7626010003847634, + "flos": 630287545344.0, + "grad_norm": 0.04234358402280358, + "language_loss": 0.77802932, + "learning_rate": 0.0001406572870354224, + "loss": 0.78851116, + "num_input_tokens_seen": 328820384, + "router_z_loss_mlp": 0.39868164, + "step": 3964, + "time_per_iteration": 2.855811834335327 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045894, + "balance_loss_mlp": 1.00591099, + "epoch": 0.7627933820700269, + "flos": 438849938688.0, + "grad_norm": 0.03234706292902695, + "language_loss": 0.87125206, + "learning_rate": 0.00014044073042076337, + "loss": 0.88171101, + "num_input_tokens_seen": 328884976, + "router_z_loss_mlp": 0.3996582, + "step": 3965, + "time_per_iteration": 2.5181050300598145 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045874, + "balance_loss_mlp": 1.00586748, + "epoch": 0.7629857637552905, + "flos": 533794973184.0, + "grad_norm": 0.028534394430764273, + "language_loss": 0.89329129, + "learning_rate": 0.00014022431339939302, + "loss": 0.90375006, + "num_input_tokens_seen": 328957792, + "router_z_loss_mlp": 0.39990234, + "step": 3966, + "time_per_iteration": 2.671855926513672 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046871, + "balance_loss_mlp": 1.00679314, + "epoch": 0.7631781454405541, + "flos": 681237645312.0, + "grad_norm": 0.04110089752084587, + "language_loss": 0.78748721, + "learning_rate": 0.00014000803605533163, + "loss": 0.79795587, + "num_input_tokens_seen": 329034960, + "router_z_loss_mlp": 0.40063477, + "step": 3967, + "time_per_iteration": 2.8315372467041016 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104216, + "balance_loss_mlp": 1.00203407, + "epoch": 0.7633705271258177, + "flos": 508489064448.0, + "grad_norm": 0.04146307364785201, + "language_loss": 0.8433795, + "learning_rate": 0.00013979189847254553, + "loss": 0.85380107, + "num_input_tokens_seen": 329100848, + "router_z_loss_mlp": 0.40112305, + "step": 3968, + "time_per_iteration": 2.601447582244873 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044622, + "balance_loss_mlp": 1.00454402, + "epoch": 0.7635629088110811, + "flos": 620039006208.0, + "grad_norm": 0.03458604771119312, + "language_loss": 0.81047332, + "learning_rate": 0.00013957590073494674, + "loss": 0.82091957, + "num_input_tokens_seen": 329181120, + "router_z_loss_mlp": 0.40063477, + "step": 3969, + "time_per_iteration": 2.8777170181274414 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044405, + "balance_loss_mlp": 1.00430274, + "epoch": 0.7637552904963447, + "flos": 639567902208.0, + "grad_norm": 0.03961564196889536, + "language_loss": 0.79463089, + "learning_rate": 0.0001393600429263931, + "loss": 0.80507493, + "num_input_tokens_seen": 329249888, + "router_z_loss_mlp": 0.40087891, + "step": 3970, + "time_per_iteration": 2.7422754764556885 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042656, + "balance_loss_mlp": 1.0035553, + "epoch": 0.7639476721816083, + "flos": 1566686860032.0, + "grad_norm": 0.00740169880788124, + "language_loss": 0.74744886, + "learning_rate": 0.00013914432513068792, + "loss": 0.75787538, + "num_input_tokens_seen": 329483824, + "router_z_loss_mlp": 0.390625, + "step": 3971, + "time_per_iteration": 4.935492038726807 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01040469, + "balance_loss_mlp": 1.00043809, + "epoch": 0.7641400538668719, + "flos": 497020576512.0, + "grad_norm": 0.032719762183458435, + "language_loss": 0.81907034, + "learning_rate": 0.0001389287474315804, + "loss": 0.82947505, + "num_input_tokens_seen": 329553536, + "router_z_loss_mlp": 0.40014648, + "step": 3972, + "time_per_iteration": 2.630120038986206 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046444, + "balance_loss_mlp": 1.00638986, + "epoch": 0.7643324355521355, + "flos": 579515334912.0, + "grad_norm": 0.03140885431358122, + "language_loss": 0.80818957, + "learning_rate": 0.00013871330991276505, + "loss": 0.81865394, + "num_input_tokens_seen": 329621856, + "router_z_loss_mlp": 0.40039062, + "step": 3973, + "time_per_iteration": 2.685450553894043 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042624, + "balance_loss_mlp": 1.00257015, + "epoch": 0.764524817237399, + "flos": 786233887488.0, + "grad_norm": 0.035934794543156075, + "language_loss": 0.81384689, + "learning_rate": 0.00013849801265788247, + "loss": 0.82427323, + "num_input_tokens_seen": 329708192, + "router_z_loss_mlp": 0.40039062, + "step": 3974, + "time_per_iteration": 3.039971113204956 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042366, + "balance_loss_mlp": 1.00235903, + "epoch": 0.7647171989226625, + "flos": 527299599360.0, + "grad_norm": 0.03568861441891304, + "language_loss": 0.83377182, + "learning_rate": 0.00013828285575051818, + "loss": 0.84419549, + "num_input_tokens_seen": 329774704, + "router_z_loss_mlp": 0.39990234, + "step": 3975, + "time_per_iteration": 2.6113204956054688 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042461, + "balance_loss_mlp": 1.00243056, + "epoch": 0.7649095806079261, + "flos": 556029139200.0, + "grad_norm": 0.03438397238975277, + "language_loss": 0.84555364, + "learning_rate": 0.0001380678392742035, + "loss": 0.85597825, + "num_input_tokens_seen": 329846432, + "router_z_loss_mlp": 0.40014648, + "step": 3976, + "time_per_iteration": 2.702728509902954 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042138, + "balance_loss_mlp": 1.0021317, + "epoch": 0.7651019622931897, + "flos": 650389960704.0, + "grad_norm": 0.02964586673443437, + "language_loss": 0.84697402, + "learning_rate": 0.00013785296331241526, + "loss": 0.85739541, + "num_input_tokens_seen": 329926336, + "router_z_loss_mlp": 0.39990234, + "step": 3977, + "time_per_iteration": 2.8500404357910156 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042805, + "balance_loss_mlp": 1.00282192, + "epoch": 0.7652943439784533, + "flos": 1048113551616.0, + "grad_norm": 0.03693742198159439, + "language_loss": 0.8784855, + "learning_rate": 0.00013763822794857583, + "loss": 0.88891351, + "num_input_tokens_seen": 330009536, + "router_z_loss_mlp": 0.3996582, + "step": 3978, + "time_per_iteration": 3.2964861392974854 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042764, + "balance_loss_mlp": 1.00266194, + "epoch": 0.7654867256637168, + "flos": 505415376384.0, + "grad_norm": 0.03301663266188199, + "language_loss": 0.9032107, + "learning_rate": 0.00013742363326605278, + "loss": 0.91363835, + "num_input_tokens_seen": 330083264, + "router_z_loss_mlp": 0.40087891, + "step": 3979, + "time_per_iteration": 2.7543904781341553 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042974, + "balance_loss_mlp": 1.00289607, + "epoch": 0.7656791073489804, + "flos": 575864236800.0, + "grad_norm": 0.031055895405363115, + "language_loss": 0.78887016, + "learning_rate": 0.00013720917934815935, + "loss": 0.79929984, + "num_input_tokens_seen": 330157120, + "router_z_loss_mlp": 0.40063477, + "step": 3980, + "time_per_iteration": 2.757488489151001 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043263, + "balance_loss_mlp": 1.0031848, + "epoch": 0.765871489034244, + "flos": 493792331520.0, + "grad_norm": 0.04115022529331337, + "language_loss": 0.83214378, + "learning_rate": 0.00013699486627815344, + "loss": 0.84257638, + "num_input_tokens_seen": 330224560, + "router_z_loss_mlp": 0.40063477, + "step": 3981, + "time_per_iteration": 2.6013007164001465 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043462, + "balance_loss_mlp": 1.00347948, + "epoch": 0.7660638707195075, + "flos": 487051994112.0, + "grad_norm": 0.036811021847235705, + "language_loss": 0.83011079, + "learning_rate": 0.00013678069413923928, + "loss": 0.84054542, + "num_input_tokens_seen": 330292000, + "router_z_loss_mlp": 0.3996582, + "step": 3982, + "time_per_iteration": 2.647836208343506 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042258, + "balance_loss_mlp": 1.00225163, + "epoch": 0.766256252404771, + "flos": 445243245312.0, + "grad_norm": 0.03517202501681349, + "language_loss": 0.8304435, + "learning_rate": 0.00013656666301456555, + "loss": 0.84086609, + "num_input_tokens_seen": 330357472, + "router_z_loss_mlp": 0.39990234, + "step": 3983, + "time_per_iteration": 2.5181782245635986 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045473, + "balance_loss_mlp": 1.00541902, + "epoch": 0.7664486340900346, + "flos": 486214069248.0, + "grad_norm": 0.03304538519441237, + "language_loss": 0.84839791, + "learning_rate": 0.0001363527729872267, + "loss": 0.85885262, + "num_input_tokens_seen": 330427792, + "router_z_loss_mlp": 0.40039062, + "step": 3984, + "time_per_iteration": 2.7154600620269775 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104531, + "balance_loss_mlp": 1.00527942, + "epoch": 0.7666410157752982, + "flos": 647385292032.0, + "grad_norm": 0.036051539426371945, + "language_loss": 0.77239299, + "learning_rate": 0.00013613902414026207, + "loss": 0.78284609, + "num_input_tokens_seen": 330500320, + "router_z_loss_mlp": 0.40014648, + "step": 3985, + "time_per_iteration": 2.793349027633667 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042297, + "balance_loss_mlp": 1.00238621, + "epoch": 0.7668333974605618, + "flos": 775661650176.0, + "grad_norm": 0.03427802042896287, + "language_loss": 0.82765865, + "learning_rate": 0.00013592541655665642, + "loss": 0.83808166, + "num_input_tokens_seen": 330581696, + "router_z_loss_mlp": 0.39892578, + "step": 3986, + "time_per_iteration": 2.9631149768829346 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042605, + "balance_loss_mlp": 1.00257468, + "epoch": 0.7670257791458254, + "flos": 614513760000.0, + "grad_norm": 0.03630429655058752, + "language_loss": 0.85794669, + "learning_rate": 0.00013571195031933947, + "loss": 0.86837274, + "num_input_tokens_seen": 330648000, + "router_z_loss_mlp": 0.40014648, + "step": 3987, + "time_per_iteration": 2.684053659439087 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01040779, + "balance_loss_mlp": 1.0018692, + "epoch": 0.7672181608310888, + "flos": 1488365207808.0, + "grad_norm": 0.004720848952888087, + "language_loss": 0.80481339, + "learning_rate": 0.00013549862551118626, + "loss": 0.81522119, + "num_input_tokens_seen": 330873872, + "router_z_loss_mlp": 0.38867188, + "step": 3988, + "time_per_iteration": 4.726950168609619 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042981, + "balance_loss_mlp": 1.0030216, + "epoch": 0.7674105425163524, + "flos": 611867782656.0, + "grad_norm": 0.03766281507369906, + "language_loss": 0.85887635, + "learning_rate": 0.00013528544221501655, + "loss": 0.86930621, + "num_input_tokens_seen": 330945760, + "router_z_loss_mlp": 0.39941406, + "step": 3989, + "time_per_iteration": 2.710402011871338 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043109, + "balance_loss_mlp": 1.00315046, + "epoch": 0.767602924201616, + "flos": 846605295360.0, + "grad_norm": 0.0329376529812033, + "language_loss": 0.82137692, + "learning_rate": 0.00013507240051359586, + "loss": 0.83180797, + "num_input_tokens_seen": 331025584, + "router_z_loss_mlp": 0.39941406, + "step": 3990, + "time_per_iteration": 3.0520286560058594 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043386, + "balance_loss_mlp": 1.00342703, + "epoch": 0.7677953058868796, + "flos": 528146272512.0, + "grad_norm": 0.038347091036525886, + "language_loss": 0.8687346, + "learning_rate": 0.00013485950048963425, + "loss": 0.87916845, + "num_input_tokens_seen": 331093008, + "router_z_loss_mlp": 0.39941406, + "step": 3991, + "time_per_iteration": 2.597275495529175 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105357, + "balance_loss_mlp": 1.01365852, + "epoch": 0.7679876875721431, + "flos": 925112618496.0, + "grad_norm": 0.036512387474733066, + "language_loss": 0.83205199, + "learning_rate": 0.00013464674222578643, + "loss": 0.84258771, + "num_input_tokens_seen": 331177120, + "router_z_loss_mlp": 0.39892578, + "step": 3992, + "time_per_iteration": 3.1764492988586426 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053875, + "balance_loss_mlp": 1.01389194, + "epoch": 0.7681800692574067, + "flos": 459019428096.0, + "grad_norm": 0.03635515300980307, + "language_loss": 0.83761203, + "learning_rate": 0.00013443412580465292, + "loss": 0.84815073, + "num_input_tokens_seen": 331245424, + "router_z_loss_mlp": 0.3996582, + "step": 3993, + "time_per_iteration": 2.583146810531616 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053421, + "balance_loss_mlp": 1.01348555, + "epoch": 0.7683724509426703, + "flos": 659733500928.0, + "grad_norm": 0.040381204925205964, + "language_loss": 0.84726322, + "learning_rate": 0.00013422165130877857, + "loss": 0.85779738, + "num_input_tokens_seen": 331327504, + "router_z_loss_mlp": 0.39916992, + "step": 3994, + "time_per_iteration": 2.8877995014190674 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044809, + "balance_loss_mlp": 1.00473106, + "epoch": 0.7685648326279338, + "flos": 556339231488.0, + "grad_norm": 0.052990639724004036, + "language_loss": 0.80869007, + "learning_rate": 0.00013400931882065327, + "loss": 0.81913817, + "num_input_tokens_seen": 331398464, + "router_z_loss_mlp": 0.40063477, + "step": 3995, + "time_per_iteration": 2.693859815597534 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043655, + "balance_loss_mlp": 1.00367188, + "epoch": 0.7687572143131974, + "flos": 688744942848.0, + "grad_norm": 0.032666888186809864, + "language_loss": 0.81219018, + "learning_rate": 0.0001337971284227118, + "loss": 0.82262671, + "num_input_tokens_seen": 331484592, + "router_z_loss_mlp": 0.3996582, + "step": 3996, + "time_per_iteration": 3.0207791328430176 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01041985, + "balance_loss_mlp": 1.00307465, + "epoch": 0.7689495959984609, + "flos": 1492668559872.0, + "grad_norm": 0.00690868544016345, + "language_loss": 0.76118422, + "learning_rate": 0.00013358508019733388, + "loss": 0.77160406, + "num_input_tokens_seen": 331721360, + "router_z_loss_mlp": 0.38867188, + "step": 3997, + "time_per_iteration": 4.991567134857178 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044617, + "balance_loss_mlp": 1.00463462, + "epoch": 0.7691419776837245, + "flos": 571500613632.0, + "grad_norm": 0.032008326579370035, + "language_loss": 0.80634248, + "learning_rate": 0.0001333731742268438, + "loss": 0.81678867, + "num_input_tokens_seen": 331794240, + "router_z_loss_mlp": 0.3996582, + "step": 3998, + "time_per_iteration": 2.698580026626587 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01040606, + "balance_loss_mlp": 1.00064719, + "epoch": 0.7693343593689881, + "flos": 521191107072.0, + "grad_norm": 0.03337650423069263, + "language_loss": 0.85920131, + "learning_rate": 0.0001331614105935109, + "loss": 0.86960733, + "num_input_tokens_seen": 331866496, + "router_z_loss_mlp": 0.39941406, + "step": 3999, + "time_per_iteration": 2.693692684173584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044961, + "balance_loss_mlp": 1.00495398, + "epoch": 0.7695267410542517, + "flos": 661552247040.0, + "grad_norm": 0.031590911772699855, + "language_loss": 0.84561241, + "learning_rate": 0.00013294978937954883, + "loss": 0.85606205, + "num_input_tokens_seen": 331936592, + "router_z_loss_mlp": 0.39990234, + "step": 4000, + "time_per_iteration": 2.7991349697113037 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045492, + "balance_loss_mlp": 1.00548565, + "epoch": 0.7697191227395151, + "flos": 547859860992.0, + "grad_norm": 0.04547292617376322, + "language_loss": 0.8583228, + "learning_rate": 0.00013273831066711655, + "loss": 0.86877775, + "num_input_tokens_seen": 332003536, + "router_z_loss_mlp": 0.39990234, + "step": 4001, + "time_per_iteration": 2.640451192855835 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010435, + "balance_loss_mlp": 1.00354123, + "epoch": 0.7699115044247787, + "flos": 541696933632.0, + "grad_norm": 0.030960933943813315, + "language_loss": 0.80473912, + "learning_rate": 0.00013252697453831747, + "loss": 0.8151741, + "num_input_tokens_seen": 332075248, + "router_z_loss_mlp": 0.39941406, + "step": 4002, + "time_per_iteration": 2.709754467010498 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044178, + "balance_loss_mlp": 1.00417161, + "epoch": 0.7701038861100423, + "flos": 564143982336.0, + "grad_norm": 0.03227531523104023, + "language_loss": 0.82851601, + "learning_rate": 0.00013231578107519916, + "loss": 0.83895779, + "num_input_tokens_seen": 332158944, + "router_z_loss_mlp": 0.39990234, + "step": 4003, + "time_per_iteration": 2.914151191711426 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043647, + "balance_loss_mlp": 1.0037117, + "epoch": 0.7702962677953059, + "flos": 482734057728.0, + "grad_norm": 0.0383418204368582, + "language_loss": 0.83275282, + "learning_rate": 0.00013210473035975422, + "loss": 0.84318936, + "num_input_tokens_seen": 332226368, + "router_z_loss_mlp": 0.39916992, + "step": 4004, + "time_per_iteration": 2.605908155441284 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043327, + "balance_loss_mlp": 1.0033915, + "epoch": 0.7704886494805695, + "flos": 771806417664.0, + "grad_norm": 0.03621639997578191, + "language_loss": 0.85901195, + "learning_rate": 0.0001318938224739201, + "loss": 0.8694452, + "num_input_tokens_seen": 332314784, + "router_z_loss_mlp": 0.39916992, + "step": 4005, + "time_per_iteration": 3.059812545776367 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044187, + "balance_loss_mlp": 1.00441921, + "epoch": 0.770681031165833, + "flos": 602318162688.0, + "grad_norm": 0.030887976595528478, + "language_loss": 0.84163052, + "learning_rate": 0.00013168305749957843, + "loss": 0.85207236, + "num_input_tokens_seen": 332387952, + "router_z_loss_mlp": 0.39746094, + "step": 4006, + "time_per_iteration": 2.730853796005249 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01040448, + "balance_loss_mlp": 1.00063193, + "epoch": 0.7708734128510966, + "flos": 497096398848.0, + "grad_norm": 0.03317085046195358, + "language_loss": 0.83013129, + "learning_rate": 0.00013147243551855532, + "loss": 0.84053576, + "num_input_tokens_seen": 332456352, + "router_z_loss_mlp": 0.39794922, + "step": 4007, + "time_per_iteration": 2.6102261543273926 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01040123, + "balance_loss_mlp": 1.00025964, + "epoch": 0.7710657945363601, + "flos": 568455115776.0, + "grad_norm": 0.02959339881613439, + "language_loss": 0.81033671, + "learning_rate": 0.00013126195661262148, + "loss": 0.82073796, + "num_input_tokens_seen": 332534288, + "router_z_loss_mlp": 0.3984375, + "step": 4008, + "time_per_iteration": 2.8038330078125 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042071, + "balance_loss_mlp": 1.00230277, + "epoch": 0.7712581762216237, + "flos": 605750542080.0, + "grad_norm": 0.030762375032726955, + "language_loss": 0.8689748, + "learning_rate": 0.00013105162086349216, + "loss": 0.87939554, + "num_input_tokens_seen": 332615440, + "router_z_loss_mlp": 0.39746094, + "step": 4009, + "time_per_iteration": 2.8229057788848877 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045827, + "balance_loss_mlp": 1.00593925, + "epoch": 0.7714505579068872, + "flos": 531997614336.0, + "grad_norm": 0.03203683238249966, + "language_loss": 0.86152643, + "learning_rate": 0.00013084142835282687, + "loss": 0.87198472, + "num_input_tokens_seen": 332687360, + "router_z_loss_mlp": 0.39868164, + "step": 4010, + "time_per_iteration": 2.6913058757781982 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104528, + "balance_loss_mlp": 1.00637054, + "epoch": 0.7716429395921508, + "flos": 1425382815744.0, + "grad_norm": 0.007782218935032237, + "language_loss": 0.79884362, + "learning_rate": 0.00013063137916222956, + "loss": 0.80929649, + "num_input_tokens_seen": 332919936, + "router_z_loss_mlp": 0.38867188, + "step": 4011, + "time_per_iteration": 4.785134315490723 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043373, + "balance_loss_mlp": 1.00353301, + "epoch": 0.7718353212774144, + "flos": 579587266560.0, + "grad_norm": 0.03553512598849003, + "language_loss": 0.89913195, + "learning_rate": 0.0001304214733732485, + "loss": 0.90956569, + "num_input_tokens_seen": 332990096, + "router_z_loss_mlp": 0.39819336, + "step": 4012, + "time_per_iteration": 4.228041648864746 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104725, + "balance_loss_mlp": 1.00733876, + "epoch": 0.772027702962678, + "flos": 511773689856.0, + "grad_norm": 0.036769707264373286, + "language_loss": 0.83085632, + "learning_rate": 0.00013021171106737672, + "loss": 0.8413288, + "num_input_tokens_seen": 333063616, + "router_z_loss_mlp": 0.39892578, + "step": 4013, + "time_per_iteration": 2.6609246730804443 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043913, + "balance_loss_mlp": 1.00402582, + "epoch": 0.7722200846479416, + "flos": 526748434176.0, + "grad_norm": 0.0322565513109964, + "language_loss": 0.80160201, + "learning_rate": 0.00013000209232605071, + "loss": 0.81204116, + "num_input_tokens_seen": 333136368, + "router_z_loss_mlp": 0.39868164, + "step": 4014, + "time_per_iteration": 2.6655430793762207 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042333, + "balance_loss_mlp": 1.00244582, + "epoch": 0.772412466333205, + "flos": 480602307072.0, + "grad_norm": 0.033386370052076744, + "language_loss": 0.80578887, + "learning_rate": 0.0001297926172306519, + "loss": 0.81621224, + "num_input_tokens_seen": 333207136, + "router_z_loss_mlp": 0.39868164, + "step": 4015, + "time_per_iteration": 2.6234195232391357 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042218, + "balance_loss_mlp": 1.00240195, + "epoch": 0.7726048480184686, + "flos": 907314007296.0, + "grad_norm": 0.032763935043036714, + "language_loss": 0.79579479, + "learning_rate": 0.0001295832858625055, + "loss": 0.8062169, + "num_input_tokens_seen": 333291920, + "router_z_loss_mlp": 0.39794922, + "step": 4016, + "time_per_iteration": 3.251309394836426 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042332, + "balance_loss_mlp": 1.0024209, + "epoch": 0.7727972297037322, + "flos": 632567049984.0, + "grad_norm": 0.031482852227098596, + "language_loss": 0.70049077, + "learning_rate": 0.00012937409830288154, + "loss": 0.71091413, + "num_input_tokens_seen": 333369824, + "router_z_loss_mlp": 0.39892578, + "step": 4017, + "time_per_iteration": 2.821953296661377 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043688, + "balance_loss_mlp": 1.00377691, + "epoch": 0.7729896113889958, + "flos": 415673835264.0, + "grad_norm": 0.04152117908534408, + "language_loss": 0.85515797, + "learning_rate": 0.00012916505463299362, + "loss": 0.86559486, + "num_input_tokens_seen": 333434192, + "router_z_loss_mlp": 0.39892578, + "step": 4018, + "time_per_iteration": 2.5182814598083496 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043716, + "balance_loss_mlp": 1.00375712, + "epoch": 0.7731819930742593, + "flos": 670105494528.0, + "grad_norm": 0.03808310048663825, + "language_loss": 0.78672588, + "learning_rate": 0.00012895615493399972, + "loss": 0.79716301, + "num_input_tokens_seen": 333509696, + "router_z_loss_mlp": 0.39941406, + "step": 4019, + "time_per_iteration": 2.8195626735687256 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042435, + "balance_loss_mlp": 1.00249946, + "epoch": 0.7733743747595229, + "flos": 490859594496.0, + "grad_norm": 0.04130359033653859, + "language_loss": 0.83203042, + "learning_rate": 0.00012874739928700192, + "loss": 0.84245479, + "num_input_tokens_seen": 333575184, + "router_z_loss_mlp": 0.39916992, + "step": 4020, + "time_per_iteration": 2.561997652053833 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042542, + "balance_loss_mlp": 1.00260687, + "epoch": 0.7735667564447865, + "flos": 660888321024.0, + "grad_norm": 0.03933419760406932, + "language_loss": 0.8045736, + "learning_rate": 0.00012853878777304624, + "loss": 0.81499898, + "num_input_tokens_seen": 333651568, + "router_z_loss_mlp": 0.39916992, + "step": 4021, + "time_per_iteration": 2.866426706314087 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104584, + "balance_loss_mlp": 1.00597656, + "epoch": 0.77375913813005, + "flos": 534491947008.0, + "grad_norm": 0.03165766491354683, + "language_loss": 0.84674478, + "learning_rate": 0.000128330320473123, + "loss": 0.85720319, + "num_input_tokens_seen": 333726400, + "router_z_loss_mlp": 0.3984375, + "step": 4022, + "time_per_iteration": 2.689208984375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053924, + "balance_loss_mlp": 1.01501465, + "epoch": 0.7739515198153136, + "flos": 1523382096384.0, + "grad_norm": 0.014118198615631215, + "language_loss": 0.783319, + "learning_rate": 0.00012812199746816628, + "loss": 0.79385823, + "num_input_tokens_seen": 333960224, + "router_z_loss_mlp": 0.38867188, + "step": 4023, + "time_per_iteration": 4.873432874679565 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046542, + "balance_loss_mlp": 1.00660741, + "epoch": 0.7741439015005771, + "flos": 641252500224.0, + "grad_norm": 0.03719222986938954, + "language_loss": 0.8204658, + "learning_rate": 0.0001279138188390543, + "loss": 0.83093119, + "num_input_tokens_seen": 334033904, + "router_z_loss_mlp": 0.39916992, + "step": 4024, + "time_per_iteration": 2.7891459465026855 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042489, + "balance_loss_mlp": 1.00267303, + "epoch": 0.7743362831858407, + "flos": 667025003520.0, + "grad_norm": 0.033177934187398395, + "language_loss": 0.86806941, + "learning_rate": 0.00012770578466660915, + "loss": 0.87849432, + "num_input_tokens_seen": 334107904, + "router_z_loss_mlp": 0.39794922, + "step": 4025, + "time_per_iteration": 2.8528504371643066 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01041267, + "balance_loss_mlp": 1.00135612, + "epoch": 0.7745286648711043, + "flos": 563994283008.0, + "grad_norm": 0.03246135787328845, + "language_loss": 0.82025433, + "learning_rate": 0.0001274978950315968, + "loss": 0.83066702, + "num_input_tokens_seen": 334184048, + "router_z_loss_mlp": 0.39892578, + "step": 4026, + "time_per_iteration": 2.8042469024658203 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104306, + "balance_loss_mlp": 1.00322056, + "epoch": 0.7747210465563679, + "flos": 517962862080.0, + "grad_norm": 0.03718030635862113, + "language_loss": 0.83308971, + "learning_rate": 0.00012729015001472716, + "loss": 0.84352028, + "num_input_tokens_seen": 334257152, + "router_z_loss_mlp": 0.39819336, + "step": 4027, + "time_per_iteration": 2.6950860023498535 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042837, + "balance_loss_mlp": 1.00294983, + "epoch": 0.7749134282416313, + "flos": 635369529600.0, + "grad_norm": 0.032368305886577194, + "language_loss": 0.81846035, + "learning_rate": 0.00012708254969665418, + "loss": 0.82888865, + "num_input_tokens_seen": 334331312, + "router_z_loss_mlp": 0.39868164, + "step": 4028, + "time_per_iteration": 2.7516164779663086 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043156, + "balance_loss_mlp": 1.00326848, + "epoch": 0.7751058099268949, + "flos": 496351792896.0, + "grad_norm": 0.03964938582220019, + "language_loss": 0.83793879, + "learning_rate": 0.00012687509415797526, + "loss": 0.84837031, + "num_input_tokens_seen": 334397344, + "router_z_loss_mlp": 0.39868164, + "step": 4029, + "time_per_iteration": 2.5878894329071045 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104315, + "balance_loss_mlp": 1.003286, + "epoch": 0.7752981916121585, + "flos": 511363475712.0, + "grad_norm": 0.03526859549006244, + "language_loss": 0.8169387, + "learning_rate": 0.00012666778347923208, + "loss": 0.82737017, + "num_input_tokens_seen": 334467872, + "router_z_loss_mlp": 0.3984375, + "step": 4030, + "time_per_iteration": 2.717336893081665 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104185, + "balance_loss_mlp": 1.00208211, + "epoch": 0.7754905732974221, + "flos": 498566168832.0, + "grad_norm": 0.03835604282300423, + "language_loss": 0.84299457, + "learning_rate": 0.0001264606177409092, + "loss": 0.85341311, + "num_input_tokens_seen": 334539088, + "router_z_loss_mlp": 0.39746094, + "step": 4031, + "time_per_iteration": 2.6836674213409424 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01041813, + "balance_loss_mlp": 1.00206888, + "epoch": 0.7756829549826857, + "flos": 481783372032.0, + "grad_norm": 0.032423363351723834, + "language_loss": 0.86526835, + "learning_rate": 0.00012625359702343609, + "loss": 0.87568641, + "num_input_tokens_seen": 334612576, + "router_z_loss_mlp": 0.3972168, + "step": 4032, + "time_per_iteration": 2.74953031539917 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042791, + "balance_loss_mlp": 1.00316608, + "epoch": 0.7758753366679492, + "flos": 553686451200.0, + "grad_norm": 0.036449679892663574, + "language_loss": 0.85421842, + "learning_rate": 0.00012604672140718504, + "loss": 0.86464632, + "num_input_tokens_seen": 334677824, + "router_z_loss_mlp": 0.39599609, + "step": 4033, + "time_per_iteration": 2.6570351123809814 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042981, + "balance_loss_mlp": 1.0033319, + "epoch": 0.7760677183532128, + "flos": 705065035776.0, + "grad_norm": 0.035522641284568106, + "language_loss": 0.78343493, + "learning_rate": 0.00012583999097247233, + "loss": 0.79386473, + "num_input_tokens_seen": 334751456, + "router_z_loss_mlp": 0.39624023, + "step": 4034, + "time_per_iteration": 2.828260660171509 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042433, + "balance_loss_mlp": 1.00273633, + "epoch": 0.7762601000384763, + "flos": 524479623168.0, + "grad_norm": 0.037193057814734455, + "language_loss": 0.80287337, + "learning_rate": 0.0001256334057995578, + "loss": 0.81329775, + "num_input_tokens_seen": 334823008, + "router_z_loss_mlp": 0.39672852, + "step": 4035, + "time_per_iteration": 2.694047689437866 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042576, + "balance_loss_mlp": 1.00292659, + "epoch": 0.7764524817237399, + "flos": 558618736128.0, + "grad_norm": 0.03306447256493109, + "language_loss": 0.85536653, + "learning_rate": 0.000125426965968645, + "loss": 0.86579227, + "num_input_tokens_seen": 334896048, + "router_z_loss_mlp": 0.39624023, + "step": 4036, + "time_per_iteration": 2.668351173400879 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042205, + "balance_loss_mlp": 1.00257981, + "epoch": 0.7766448634090035, + "flos": 580817908992.0, + "grad_norm": 0.03814563398191554, + "language_loss": 0.83102942, + "learning_rate": 0.00012522067155988092, + "loss": 0.84145141, + "num_input_tokens_seen": 334964416, + "router_z_loss_mlp": 0.39599609, + "step": 4037, + "time_per_iteration": 2.738696336746216 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01041497, + "balance_loss_mlp": 1.00182426, + "epoch": 0.776837245094267, + "flos": 636819857664.0, + "grad_norm": 0.03633176837238025, + "language_loss": 0.75620854, + "learning_rate": 0.00012501452265335617, + "loss": 0.76662356, + "num_input_tokens_seen": 335043360, + "router_z_loss_mlp": 0.39648438, + "step": 4038, + "time_per_iteration": 2.8050642013549805 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01041765, + "balance_loss_mlp": 1.00211596, + "epoch": 0.7770296267795306, + "flos": 615814388736.0, + "grad_norm": 0.05953534229047703, + "language_loss": 0.82882428, + "learning_rate": 0.0001248085193291047, + "loss": 0.83924192, + "num_input_tokens_seen": 335113216, + "router_z_loss_mlp": 0.39624023, + "step": 4039, + "time_per_iteration": 2.7656314373016357 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104134, + "balance_loss_mlp": 1.00166762, + "epoch": 0.7772220084647942, + "flos": 880297255680.0, + "grad_norm": 0.03559940349726857, + "language_loss": 0.82936066, + "learning_rate": 0.00012460266166710443, + "loss": 0.83977401, + "num_input_tokens_seen": 335195824, + "router_z_loss_mlp": 0.39648438, + "step": 4040, + "time_per_iteration": 3.203681468963623 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01041686, + "balance_loss_mlp": 1.00201309, + "epoch": 0.7774143901500578, + "flos": 841039219968.0, + "grad_norm": 0.03667780998396207, + "language_loss": 0.78218615, + "learning_rate": 0.00012439694974727633, + "loss": 0.79260302, + "num_input_tokens_seen": 335269712, + "router_z_loss_mlp": 0.39648438, + "step": 4041, + "time_per_iteration": 3.1048929691314697 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042796, + "balance_loss_mlp": 1.00305152, + "epoch": 0.7776067718353212, + "flos": 569229857280.0, + "grad_norm": 0.03323606563363869, + "language_loss": 0.80526865, + "learning_rate": 0.00012419138364948458, + "loss": 0.8156966, + "num_input_tokens_seen": 335343408, + "router_z_loss_mlp": 0.3972168, + "step": 4042, + "time_per_iteration": 2.759185791015625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104225, + "balance_loss_mlp": 1.00255311, + "epoch": 0.7777991535205848, + "flos": 747210121728.0, + "grad_norm": 0.04016086024334982, + "language_loss": 0.83042264, + "learning_rate": 0.00012398596345353702, + "loss": 0.84084511, + "num_input_tokens_seen": 335415360, + "router_z_loss_mlp": 0.39672852, + "step": 4043, + "time_per_iteration": 2.8885416984558105 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055285, + "balance_loss_mlp": 1.01556456, + "epoch": 0.7779915352058484, + "flos": 539183159040.0, + "grad_norm": 0.05452361141280675, + "language_loss": 0.8397001, + "learning_rate": 0.0001237806892391851, + "loss": 0.85025299, + "num_input_tokens_seen": 335491568, + "router_z_loss_mlp": 0.39697266, + "step": 4044, + "time_per_iteration": 2.6936380863189697 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051551, + "balance_loss_mlp": 1.01173472, + "epoch": 0.778183916891112, + "flos": 635955687936.0, + "grad_norm": 0.03830611533598255, + "language_loss": 0.81336337, + "learning_rate": 0.0001235755610861233, + "loss": 0.82387888, + "num_input_tokens_seen": 335567200, + "router_z_loss_mlp": 0.39794922, + "step": 4045, + "time_per_iteration": 2.8001327514648438 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051224, + "balance_loss_mlp": 1.01140773, + "epoch": 0.7783762985763756, + "flos": 589790118912.0, + "grad_norm": 0.03835840399941748, + "language_loss": 0.85962141, + "learning_rate": 0.0001233705790739893, + "loss": 0.87013364, + "num_input_tokens_seen": 335640512, + "router_z_loss_mlp": 0.39794922, + "step": 4046, + "time_per_iteration": 2.7678940296173096 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046919, + "balance_loss_mlp": 1.00712717, + "epoch": 0.7785686802616391, + "flos": 932241782784.0, + "grad_norm": 0.03816222734497192, + "language_loss": 0.75308621, + "learning_rate": 0.0001231657432823643, + "loss": 0.76355535, + "num_input_tokens_seen": 335726016, + "router_z_loss_mlp": 0.39770508, + "step": 4047, + "time_per_iteration": 3.2008919715881348 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104796, + "balance_loss_mlp": 1.00816751, + "epoch": 0.7787610619469026, + "flos": 498956941056.0, + "grad_norm": 0.03863312039595469, + "language_loss": 0.79339081, + "learning_rate": 0.0001229610537907725, + "loss": 0.80387038, + "num_input_tokens_seen": 335794864, + "router_z_loss_mlp": 0.39770508, + "step": 4048, + "time_per_iteration": 2.6606078147888184 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047514, + "balance_loss_mlp": 1.00776947, + "epoch": 0.7789534436321662, + "flos": 516651539712.0, + "grad_norm": 0.03926321418096956, + "language_loss": 0.91044021, + "learning_rate": 0.00012275651067868143, + "loss": 0.92091531, + "num_input_tokens_seen": 335860928, + "router_z_loss_mlp": 0.3972168, + "step": 4049, + "time_per_iteration": 2.5831098556518555 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048801, + "balance_loss_mlp": 1.00903308, + "epoch": 0.7791458253174298, + "flos": 990062477568.0, + "grad_norm": 0.03241253923352413, + "language_loss": 0.80757916, + "learning_rate": 0.00012255211402550182, + "loss": 0.81806719, + "num_input_tokens_seen": 335945728, + "router_z_loss_mlp": 0.39746094, + "step": 4050, + "time_per_iteration": 3.227099657058716 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045853, + "balance_loss_mlp": 1.00596547, + "epoch": 0.7793382070026933, + "flos": 630185478144.0, + "grad_norm": 0.040685190405043196, + "language_loss": 0.77082014, + "learning_rate": 0.00012234786391058727, + "loss": 0.78127873, + "num_input_tokens_seen": 336014848, + "router_z_loss_mlp": 0.39868164, + "step": 4051, + "time_per_iteration": 2.803809881210327 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044261, + "balance_loss_mlp": 1.00439727, + "epoch": 0.7795305886879569, + "flos": 532763607552.0, + "grad_norm": 0.14552341545352887, + "language_loss": 0.85931647, + "learning_rate": 0.0001221437604132352, + "loss": 0.86975908, + "num_input_tokens_seen": 336080096, + "router_z_loss_mlp": 0.3984375, + "step": 4052, + "time_per_iteration": 2.6521799564361572 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044071, + "balance_loss_mlp": 1.00425482, + "epoch": 0.7797229703732205, + "flos": 613142166528.0, + "grad_norm": 0.03707443730916314, + "language_loss": 0.81622672, + "learning_rate": 0.0001219398036126852, + "loss": 0.82666743, + "num_input_tokens_seen": 336154640, + "router_z_loss_mlp": 0.39794922, + "step": 4053, + "time_per_iteration": 2.7498905658721924 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043695, + "balance_loss_mlp": 1.00385571, + "epoch": 0.7799153520584841, + "flos": 873796045824.0, + "grad_norm": 0.03756906141902607, + "language_loss": 0.78616834, + "learning_rate": 0.00012173599358812027, + "loss": 0.79660529, + "num_input_tokens_seen": 336244160, + "router_z_loss_mlp": 0.39819336, + "step": 4054, + "time_per_iteration": 3.2531538009643555 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010442, + "balance_loss_mlp": 1.00424063, + "epoch": 0.7801077337437476, + "flos": 584745073152.0, + "grad_norm": 0.034551857273689666, + "language_loss": 0.83048439, + "learning_rate": 0.0001215323304186668, + "loss": 0.84092641, + "num_input_tokens_seen": 336317936, + "router_z_loss_mlp": 0.39941406, + "step": 4055, + "time_per_iteration": 2.802626371383667 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104311, + "balance_loss_mlp": 1.00329399, + "epoch": 0.7803001154290111, + "flos": 602281224192.0, + "grad_norm": 0.03735081367855325, + "language_loss": 0.87971795, + "learning_rate": 0.00012132881418339364, + "loss": 0.890149, + "num_input_tokens_seen": 336389504, + "router_z_loss_mlp": 0.39794922, + "step": 4056, + "time_per_iteration": 2.779559850692749 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043118, + "balance_loss_mlp": 1.00430298, + "epoch": 0.7804924971142747, + "flos": 1482928411392.0, + "grad_norm": 0.004870984594471592, + "language_loss": 0.77517563, + "learning_rate": 0.00012112544496131306, + "loss": 0.7856068, + "num_input_tokens_seen": 336615536, + "router_z_loss_mlp": 0.38769531, + "step": 4057, + "time_per_iteration": 4.857725620269775 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043294, + "balance_loss_mlp": 1.00350153, + "epoch": 0.7806848787995383, + "flos": 631516242432.0, + "grad_norm": 0.03794339503679321, + "language_loss": 0.77468872, + "learning_rate": 0.00012092222283137944, + "loss": 0.78512168, + "num_input_tokens_seen": 336686400, + "router_z_loss_mlp": 0.39770508, + "step": 4058, + "time_per_iteration": 2.742105722427368 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045322, + "balance_loss_mlp": 1.00650787, + "epoch": 0.7808772604848019, + "flos": 1420747984128.0, + "grad_norm": 0.008365987604131462, + "language_loss": 0.7890631, + "learning_rate": 0.00012071914787249111, + "loss": 0.79951632, + "num_input_tokens_seen": 336912704, + "router_z_loss_mlp": 0.38769531, + "step": 4059, + "time_per_iteration": 4.828707695007324 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043702, + "balance_loss_mlp": 1.00390983, + "epoch": 0.7810696421700654, + "flos": 733104404736.0, + "grad_norm": 0.03231348100854236, + "language_loss": 0.83930951, + "learning_rate": 0.00012051622016348856, + "loss": 0.84974658, + "num_input_tokens_seen": 336997040, + "router_z_loss_mlp": 0.39770508, + "step": 4060, + "time_per_iteration": 2.9990715980529785 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043011, + "balance_loss_mlp": 1.00324297, + "epoch": 0.781262023855329, + "flos": 425837803776.0, + "grad_norm": 0.036166261595935334, + "language_loss": 0.84719038, + "learning_rate": 0.00012031343978315539, + "loss": 0.85762048, + "num_input_tokens_seen": 337059760, + "router_z_loss_mlp": 0.39746094, + "step": 4061, + "time_per_iteration": 2.4627366065979004 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042268, + "balance_loss_mlp": 1.00247562, + "epoch": 0.7814544055405925, + "flos": 502074370560.0, + "grad_norm": 0.0342232602285917, + "language_loss": 0.83237028, + "learning_rate": 0.00012011080681021774, + "loss": 0.84279293, + "num_input_tokens_seen": 337128528, + "router_z_loss_mlp": 0.39770508, + "step": 4062, + "time_per_iteration": 2.689143657684326 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104349, + "balance_loss_mlp": 1.00372207, + "epoch": 0.7816467872258561, + "flos": 463393744896.0, + "grad_norm": 0.03454181235361348, + "language_loss": 0.86497313, + "learning_rate": 0.00011990832132334512, + "loss": 0.87540805, + "num_input_tokens_seen": 337194112, + "router_z_loss_mlp": 0.39746094, + "step": 4063, + "time_per_iteration": 2.554494619369507 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042941, + "balance_loss_mlp": 1.00317287, + "epoch": 0.7818391689111197, + "flos": 742108695552.0, + "grad_norm": 0.04030756572766353, + "language_loss": 0.82932305, + "learning_rate": 0.00011970598340114897, + "loss": 0.8397525, + "num_input_tokens_seen": 337270416, + "router_z_loss_mlp": 0.39746094, + "step": 4064, + "time_per_iteration": 2.970621109008789 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042161, + "balance_loss_mlp": 1.00241697, + "epoch": 0.7820315505963832, + "flos": 548806656000.0, + "grad_norm": 0.039516882872222964, + "language_loss": 0.84180045, + "learning_rate": 0.00011950379312218396, + "loss": 0.85222203, + "num_input_tokens_seen": 337343024, + "router_z_loss_mlp": 0.3972168, + "step": 4065, + "time_per_iteration": 2.7288360595703125 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104416, + "balance_loss_mlp": 1.00446284, + "epoch": 0.7822239322816468, + "flos": 730260129024.0, + "grad_norm": 0.03113922880228995, + "language_loss": 0.86965168, + "learning_rate": 0.00011930175056494719, + "loss": 0.88009328, + "num_input_tokens_seen": 337417232, + "router_z_loss_mlp": 0.39672852, + "step": 4066, + "time_per_iteration": 2.9733567237854004 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043531, + "balance_loss_mlp": 1.00383461, + "epoch": 0.7824163139669104, + "flos": 452986758144.0, + "grad_norm": 0.03027995654836667, + "language_loss": 0.76300895, + "learning_rate": 0.00011909985580787885, + "loss": 0.77344429, + "num_input_tokens_seen": 337488224, + "router_z_loss_mlp": 0.39672852, + "step": 4067, + "time_per_iteration": 2.63247013092041 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042284, + "balance_loss_mlp": 1.0023967, + "epoch": 0.782608695652174, + "flos": 541621111296.0, + "grad_norm": 0.030067199560216216, + "language_loss": 0.81511915, + "learning_rate": 0.00011889810892936137, + "loss": 0.82554203, + "num_input_tokens_seen": 337564928, + "router_z_loss_mlp": 0.39868164, + "step": 4068, + "time_per_iteration": 2.725503444671631 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042501, + "balance_loss_mlp": 1.00256538, + "epoch": 0.7828010773374374, + "flos": 501429886464.0, + "grad_norm": 0.036639479010935665, + "language_loss": 0.77685481, + "learning_rate": 0.00011869651000771959, + "loss": 0.78727984, + "num_input_tokens_seen": 337641632, + "router_z_loss_mlp": 0.39916992, + "step": 4069, + "time_per_iteration": 2.831753730773926 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042656, + "balance_loss_mlp": 1.00274503, + "epoch": 0.782993459022701, + "flos": 601918642176.0, + "grad_norm": 0.036456028329252196, + "language_loss": 0.83725941, + "learning_rate": 0.00011849505912122117, + "loss": 0.84768599, + "num_input_tokens_seen": 337711968, + "router_z_loss_mlp": 0.39892578, + "step": 4070, + "time_per_iteration": 2.7105395793914795 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042326, + "balance_loss_mlp": 1.00246227, + "epoch": 0.7831858407079646, + "flos": 811476612864.0, + "grad_norm": 0.03866218742365993, + "language_loss": 0.78222632, + "learning_rate": 0.00011829375634807654, + "loss": 0.79264963, + "num_input_tokens_seen": 337795792, + "router_z_loss_mlp": 0.3984375, + "step": 4071, + "time_per_iteration": 3.082258939743042 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043026, + "balance_loss_mlp": 1.00321043, + "epoch": 0.7833782223932282, + "flos": 808014097920.0, + "grad_norm": 0.03240130540030076, + "language_loss": 0.81343973, + "learning_rate": 0.00011809260176643821, + "loss": 0.82386994, + "num_input_tokens_seen": 337875584, + "router_z_loss_mlp": 0.39794922, + "step": 4072, + "time_per_iteration": 3.0537989139556885 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042829, + "balance_loss_mlp": 1.00296557, + "epoch": 0.7835706040784918, + "flos": 521900719872.0, + "grad_norm": 0.03900176982337939, + "language_loss": 0.84087825, + "learning_rate": 0.00011789159545440131, + "loss": 0.85130656, + "num_input_tokens_seen": 337942304, + "router_z_loss_mlp": 0.3984375, + "step": 4073, + "time_per_iteration": 2.628188133239746 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042917, + "balance_loss_mlp": 1.00314867, + "epoch": 0.7837629857637552, + "flos": 506744195328.0, + "grad_norm": 0.031003851704209363, + "language_loss": 0.82853079, + "learning_rate": 0.00011769073749000348, + "loss": 0.83895999, + "num_input_tokens_seen": 338020864, + "router_z_loss_mlp": 0.39746094, + "step": 4074, + "time_per_iteration": 2.7814579010009766 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043359, + "balance_loss_mlp": 1.00359035, + "epoch": 0.7839553674490188, + "flos": 517135630848.0, + "grad_norm": 0.03896088374638199, + "language_loss": 0.76594853, + "learning_rate": 0.0001174900279512246, + "loss": 0.77638209, + "num_input_tokens_seen": 338089584, + "router_z_loss_mlp": 0.39746094, + "step": 4075, + "time_per_iteration": 2.5712804794311523 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043376, + "balance_loss_mlp": 1.00363171, + "epoch": 0.7841477491342824, + "flos": 507651139584.0, + "grad_norm": 0.03246431097284687, + "language_loss": 0.82211149, + "learning_rate": 0.00011728946691598707, + "loss": 0.83254528, + "num_input_tokens_seen": 338159568, + "router_z_loss_mlp": 0.3972168, + "step": 4076, + "time_per_iteration": 2.604954242706299 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043762, + "balance_loss_mlp": 1.00401795, + "epoch": 0.784340130819546, + "flos": 720905895168.0, + "grad_norm": 0.038070904406741414, + "language_loss": 0.76823437, + "learning_rate": 0.00011708905446215561, + "loss": 0.77867198, + "num_input_tokens_seen": 338233952, + "router_z_loss_mlp": 0.3972168, + "step": 4077, + "time_per_iteration": 2.8703718185424805 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043664, + "balance_loss_mlp": 1.00389552, + "epoch": 0.7845325125048095, + "flos": 515514216192.0, + "grad_norm": 0.030616823376727165, + "language_loss": 0.80449855, + "learning_rate": 0.00011688879066753711, + "loss": 0.81493515, + "num_input_tokens_seen": 338309568, + "router_z_loss_mlp": 0.39746094, + "step": 4078, + "time_per_iteration": 2.693617582321167 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042875, + "balance_loss_mlp": 1.00313067, + "epoch": 0.7847248941900731, + "flos": 467051645952.0, + "grad_norm": 0.040474516708916684, + "language_loss": 0.87913537, + "learning_rate": 0.00011668867560988122, + "loss": 0.88956416, + "num_input_tokens_seen": 338375920, + "router_z_loss_mlp": 0.3972168, + "step": 4079, + "time_per_iteration": 2.5590639114379883 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104555, + "balance_loss_mlp": 1.00582922, + "epoch": 0.7849172758753367, + "flos": 504084612096.0, + "grad_norm": 0.03640725809465974, + "language_loss": 0.84891224, + "learning_rate": 0.00011648870936687916, + "loss": 0.85936773, + "num_input_tokens_seen": 338452208, + "router_z_loss_mlp": 0.39697266, + "step": 4080, + "time_per_iteration": 2.7692296504974365 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046484, + "balance_loss_mlp": 1.00678754, + "epoch": 0.7851096575606002, + "flos": 533032870656.0, + "grad_norm": 0.04308382250768319, + "language_loss": 0.79184526, + "learning_rate": 0.00011628889201616461, + "loss": 0.80231011, + "num_input_tokens_seen": 338522864, + "router_z_loss_mlp": 0.39672852, + "step": 4081, + "time_per_iteration": 2.6643264293670654 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043861, + "balance_loss_mlp": 1.00411689, + "epoch": 0.7853020392458638, + "flos": 571044712704.0, + "grad_norm": 0.03315243630239655, + "language_loss": 0.82372963, + "learning_rate": 0.00011608922363531393, + "loss": 0.8341682, + "num_input_tokens_seen": 338591024, + "router_z_loss_mlp": 0.3972168, + "step": 4082, + "time_per_iteration": 2.6805782318115234 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043731, + "balance_loss_mlp": 1.00403428, + "epoch": 0.7854944209311273, + "flos": 833992680960.0, + "grad_norm": 0.03684416800395475, + "language_loss": 0.83803403, + "learning_rate": 0.00011588970430184504, + "loss": 0.84847128, + "num_input_tokens_seen": 338669616, + "router_z_loss_mlp": 0.39672852, + "step": 4083, + "time_per_iteration": 3.0843493938446045 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043694, + "balance_loss_mlp": 1.00404453, + "epoch": 0.7856868026163909, + "flos": 561011001600.0, + "grad_norm": 0.030260484959858683, + "language_loss": 0.82344627, + "learning_rate": 0.00011569033409321822, + "loss": 0.83388317, + "num_input_tokens_seen": 338740416, + "router_z_loss_mlp": 0.39624023, + "step": 4084, + "time_per_iteration": 2.692643165588379 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044727, + "balance_loss_mlp": 1.0050776, + "epoch": 0.7858791843016545, + "flos": 546268581888.0, + "grad_norm": 0.039325334071154384, + "language_loss": 0.73417258, + "learning_rate": 0.00011549111308683591, + "loss": 0.74461985, + "num_input_tokens_seen": 338807664, + "router_z_loss_mlp": 0.39624023, + "step": 4085, + "time_per_iteration": 2.6917884349823 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044945, + "balance_loss_mlp": 1.00529623, + "epoch": 0.7860715659869181, + "flos": 381840923904.0, + "grad_norm": 0.042614016338838545, + "language_loss": 0.8128258, + "learning_rate": 0.00011529204136004251, + "loss": 0.82327527, + "num_input_tokens_seen": 338869472, + "router_z_loss_mlp": 0.39624023, + "step": 4086, + "time_per_iteration": 2.4572253227233887 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044585, + "balance_loss_mlp": 1.0049361, + "epoch": 0.7862639476721817, + "flos": 568513441536.0, + "grad_norm": 0.03346159984299651, + "language_loss": 0.84931922, + "learning_rate": 0.00011509311899012459, + "loss": 0.85976499, + "num_input_tokens_seen": 338941312, + "router_z_loss_mlp": 0.39624023, + "step": 4087, + "time_per_iteration": 2.763591766357422 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043283, + "balance_loss_mlp": 1.00353885, + "epoch": 0.7864563293574451, + "flos": 546323016960.0, + "grad_norm": 0.03949651761127577, + "language_loss": 0.78551108, + "learning_rate": 0.00011489434605431053, + "loss": 0.79594392, + "num_input_tokens_seen": 339010208, + "router_z_loss_mlp": 0.3972168, + "step": 4088, + "time_per_iteration": 2.6439645290374756 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042959, + "balance_loss_mlp": 1.00321484, + "epoch": 0.7866487110427087, + "flos": 564649460736.0, + "grad_norm": 0.036592949661453156, + "language_loss": 0.81577885, + "learning_rate": 0.0001146957226297708, + "loss": 0.82620847, + "num_input_tokens_seen": 339081232, + "router_z_loss_mlp": 0.3972168, + "step": 4089, + "time_per_iteration": 2.679487705230713 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042934, + "balance_loss_mlp": 1.00321376, + "epoch": 0.7868410927279723, + "flos": 729559264512.0, + "grad_norm": 0.030545920555930417, + "language_loss": 0.76902366, + "learning_rate": 0.00011449724879361827, + "loss": 0.77945304, + "num_input_tokens_seen": 339161040, + "router_z_loss_mlp": 0.39697266, + "step": 4090, + "time_per_iteration": 2.9623334407806396 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043691, + "balance_loss_mlp": 1.00404155, + "epoch": 0.7870334744132359, + "flos": 522447994368.0, + "grad_norm": 0.042680254244296036, + "language_loss": 0.74582481, + "learning_rate": 0.00011429892462290687, + "loss": 0.75626171, + "num_input_tokens_seen": 339233984, + "router_z_loss_mlp": 0.39624023, + "step": 4091, + "time_per_iteration": 2.718287229537964 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104412, + "balance_loss_mlp": 1.00435197, + "epoch": 0.7872258560984994, + "flos": 452363661312.0, + "grad_norm": 0.033106880677710115, + "language_loss": 0.83571684, + "learning_rate": 0.00011410075019463295, + "loss": 0.84615809, + "num_input_tokens_seen": 339303168, + "router_z_loss_mlp": 0.39746094, + "step": 4092, + "time_per_iteration": 2.627462148666382 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043831, + "balance_loss_mlp": 1.00413382, + "epoch": 0.787418237783763, + "flos": 516250073856.0, + "grad_norm": 0.03274569080250533, + "language_loss": 0.80842328, + "learning_rate": 0.00011390272558573461, + "loss": 0.8188616, + "num_input_tokens_seen": 339374512, + "router_z_loss_mlp": 0.39672852, + "step": 4093, + "time_per_iteration": 2.678356409072876 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044441, + "balance_loss_mlp": 1.00474429, + "epoch": 0.7876106194690266, + "flos": 486057566976.0, + "grad_norm": 0.03217400572636969, + "language_loss": 0.80303454, + "learning_rate": 0.00011370485087309202, + "loss": 0.81347895, + "num_input_tokens_seen": 339442720, + "router_z_loss_mlp": 0.39672852, + "step": 4094, + "time_per_iteration": 2.6190710067749023 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044167, + "balance_loss_mlp": 1.00449383, + "epoch": 0.7878030011542901, + "flos": 543930751488.0, + "grad_norm": 0.036296400111331464, + "language_loss": 0.79175836, + "learning_rate": 0.00011350712613352688, + "loss": 0.80220002, + "num_input_tokens_seen": 339508800, + "router_z_loss_mlp": 0.39648438, + "step": 4095, + "time_per_iteration": 2.705301284790039 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044306, + "balance_loss_mlp": 1.00463307, + "epoch": 0.7879953828395537, + "flos": 517749979392.0, + "grad_norm": 0.042475497231540135, + "language_loss": 0.79742056, + "learning_rate": 0.00011330955144380283, + "loss": 0.80786359, + "num_input_tokens_seen": 339578048, + "router_z_loss_mlp": 0.39648438, + "step": 4096, + "time_per_iteration": 2.592628240585327 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043089, + "balance_loss_mlp": 1.00336826, + "epoch": 0.7881877645248172, + "flos": 583377370368.0, + "grad_norm": 0.033751498450810845, + "language_loss": 0.86674351, + "learning_rate": 0.00011311212688062483, + "loss": 0.87717438, + "num_input_tokens_seen": 339650176, + "router_z_loss_mlp": 0.39697266, + "step": 4097, + "time_per_iteration": 2.8006155490875244 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043341, + "balance_loss_mlp": 1.0035969, + "epoch": 0.7883801462100808, + "flos": 590328645120.0, + "grad_norm": 0.0369008039403456, + "language_loss": 0.78409964, + "learning_rate": 0.0001129148525206402, + "loss": 0.79453301, + "num_input_tokens_seen": 339727312, + "router_z_loss_mlp": 0.3972168, + "step": 4098, + "time_per_iteration": 2.824293375015259 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043459, + "balance_loss_mlp": 1.00373876, + "epoch": 0.7885725278953444, + "flos": 482742806016.0, + "grad_norm": 0.04185353422422626, + "language_loss": 0.86687458, + "learning_rate": 0.00011271772844043759, + "loss": 0.87730914, + "num_input_tokens_seen": 339801344, + "router_z_loss_mlp": 0.39697266, + "step": 4099, + "time_per_iteration": 2.6993777751922607 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043421, + "balance_loss_mlp": 1.00372386, + "epoch": 0.788764909580608, + "flos": 758099254272.0, + "grad_norm": 0.0413483333130522, + "language_loss": 0.76537859, + "learning_rate": 0.00011252075471654727, + "loss": 0.77581275, + "num_input_tokens_seen": 339877840, + "router_z_loss_mlp": 0.39672852, + "step": 4100, + "time_per_iteration": 2.9176177978515625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043485, + "balance_loss_mlp": 1.00374079, + "epoch": 0.7889572912658714, + "flos": 703880080128.0, + "grad_norm": 0.0322415537049841, + "language_loss": 0.7816056, + "learning_rate": 0.00011232393142544133, + "loss": 0.79204047, + "num_input_tokens_seen": 339959568, + "router_z_loss_mlp": 0.3972168, + "step": 4101, + "time_per_iteration": 2.9494380950927734 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044086, + "balance_loss_mlp": 1.00436497, + "epoch": 0.789149672951135, + "flos": 737841303552.0, + "grad_norm": 0.03312890995407851, + "language_loss": 0.83342379, + "learning_rate": 0.00011212725864353323, + "loss": 0.84386468, + "num_input_tokens_seen": 340043600, + "router_z_loss_mlp": 0.39697266, + "step": 4102, + "time_per_iteration": 3.066310405731201 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104599, + "balance_loss_mlp": 1.00727081, + "epoch": 0.7893420546363986, + "flos": 1484490533376.0, + "grad_norm": 0.0037033448465983686, + "language_loss": 0.76335925, + "learning_rate": 0.00011193073644717822, + "loss": 0.77381915, + "num_input_tokens_seen": 340270608, + "router_z_loss_mlp": 0.38671875, + "step": 4103, + "time_per_iteration": 4.842837810516357 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043191, + "balance_loss_mlp": 1.00349379, + "epoch": 0.7895344363216622, + "flos": 510080343552.0, + "grad_norm": 0.04492862133379161, + "language_loss": 0.75946063, + "learning_rate": 0.00011173436491267291, + "loss": 0.76989251, + "num_input_tokens_seen": 340338784, + "router_z_loss_mlp": 0.39672852, + "step": 4104, + "time_per_iteration": 2.6089494228363037 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043204, + "balance_loss_mlp": 1.00348377, + "epoch": 0.7897268180069258, + "flos": 543038391552.0, + "grad_norm": 0.035594569075133434, + "language_loss": 0.82524866, + "learning_rate": 0.0001115381441162554, + "loss": 0.83568072, + "num_input_tokens_seen": 340407744, + "router_z_loss_mlp": 0.39697266, + "step": 4105, + "time_per_iteration": 2.610574245452881 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043671, + "balance_loss_mlp": 1.00495148, + "epoch": 0.7899191996921893, + "flos": 1415752515840.0, + "grad_norm": 0.004244579927016686, + "language_loss": 0.73583722, + "learning_rate": 0.00011134207413410557, + "loss": 0.74627399, + "num_input_tokens_seen": 340635824, + "router_z_loss_mlp": 0.38671875, + "step": 4106, + "time_per_iteration": 4.910478830337524 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042244, + "balance_loss_mlp": 1.00254726, + "epoch": 0.7901115813774529, + "flos": 624022550784.0, + "grad_norm": 0.03217840063053149, + "language_loss": 0.855353, + "learning_rate": 0.00011114615504234465, + "loss": 0.86577547, + "num_input_tokens_seen": 340710928, + "router_z_loss_mlp": 0.39672852, + "step": 4107, + "time_per_iteration": 2.746295690536499 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044215, + "balance_loss_mlp": 1.0045898, + "epoch": 0.7903039630627164, + "flos": 646805936640.0, + "grad_norm": 0.033942053342870586, + "language_loss": 0.81416857, + "learning_rate": 0.00011095038691703468, + "loss": 0.82461071, + "num_input_tokens_seen": 340786128, + "router_z_loss_mlp": 0.39599609, + "step": 4108, + "time_per_iteration": 2.8708901405334473 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104442, + "balance_loss_mlp": 1.00479472, + "epoch": 0.79049634474798, + "flos": 595612818432.0, + "grad_norm": 0.037550083801842486, + "language_loss": 0.83416122, + "learning_rate": 0.00011075476983417998, + "loss": 0.84460539, + "num_input_tokens_seen": 340861616, + "router_z_loss_mlp": 0.39599609, + "step": 4109, + "time_per_iteration": 2.8592021465301514 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043584, + "balance_loss_mlp": 1.00393546, + "epoch": 0.7906887264332435, + "flos": 717332564736.0, + "grad_norm": 0.03806568849711228, + "language_loss": 0.7824564, + "learning_rate": 0.00011055930386972579, + "loss": 0.79289222, + "num_input_tokens_seen": 340934480, + "router_z_loss_mlp": 0.39624023, + "step": 4110, + "time_per_iteration": 2.860257625579834 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01041295, + "balance_loss_mlp": 1.00162232, + "epoch": 0.7908811081185071, + "flos": 791261436672.0, + "grad_norm": 0.034643176312320036, + "language_loss": 0.78703582, + "learning_rate": 0.00011036398909955863, + "loss": 0.79744881, + "num_input_tokens_seen": 341014912, + "router_z_loss_mlp": 0.39648438, + "step": 4111, + "time_per_iteration": 2.9770195484161377 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043101, + "balance_loss_mlp": 1.0034523, + "epoch": 0.7910734898037707, + "flos": 643076103936.0, + "grad_norm": 0.033380496511460814, + "language_loss": 0.8228001, + "learning_rate": 0.00011016882559950648, + "loss": 0.83323109, + "num_input_tokens_seen": 341090608, + "router_z_loss_mlp": 0.39624023, + "step": 4112, + "time_per_iteration": 2.8614118099212646 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043994, + "balance_loss_mlp": 1.00446427, + "epoch": 0.7912658714890343, + "flos": 670561395456.0, + "grad_norm": 0.037601887407010925, + "language_loss": 0.80818218, + "learning_rate": 0.00010997381344533853, + "loss": 0.81862211, + "num_input_tokens_seen": 341160992, + "router_z_loss_mlp": 0.39501953, + "step": 4113, + "time_per_iteration": 2.806915521621704 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045873, + "balance_loss_mlp": 1.00639081, + "epoch": 0.7914582531742979, + "flos": 558887999232.0, + "grad_norm": 0.03473923170116899, + "language_loss": 0.81077361, + "learning_rate": 0.00010977895271276517, + "loss": 0.82123232, + "num_input_tokens_seen": 341232032, + "router_z_loss_mlp": 0.39453125, + "step": 4114, + "time_per_iteration": 2.710866928100586 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046255, + "balance_loss_mlp": 1.00667739, + "epoch": 0.7916506348595613, + "flos": 571192466688.0, + "grad_norm": 0.03381455786010569, + "language_loss": 0.80545115, + "learning_rate": 0.00010958424347743807, + "loss": 0.81591368, + "num_input_tokens_seen": 341303888, + "router_z_loss_mlp": 0.39550781, + "step": 4115, + "time_per_iteration": 2.720463991165161 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044883, + "balance_loss_mlp": 1.00528204, + "epoch": 0.7918430165448249, + "flos": 719647062528.0, + "grad_norm": 0.03312205517130564, + "language_loss": 0.8089326, + "learning_rate": 0.00010938968581494991, + "loss": 0.81938136, + "num_input_tokens_seen": 341385616, + "router_z_loss_mlp": 0.39575195, + "step": 4116, + "time_per_iteration": 2.9487526416778564 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044455, + "balance_loss_mlp": 1.004758, + "epoch": 0.7920353982300885, + "flos": 554737258752.0, + "grad_norm": 0.04353090133720626, + "language_loss": 0.79680514, + "learning_rate": 0.000109195279800835, + "loss": 0.80724961, + "num_input_tokens_seen": 341460976, + "router_z_loss_mlp": 0.39672852, + "step": 4117, + "time_per_iteration": 2.7193853855133057 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046231, + "balance_loss_mlp": 1.0065577, + "epoch": 0.7922277799153521, + "flos": 811541741568.0, + "grad_norm": 0.051618169063903374, + "language_loss": 0.76734924, + "learning_rate": 0.00010900102551056834, + "loss": 0.77781159, + "num_input_tokens_seen": 341537328, + "router_z_loss_mlp": 0.39648438, + "step": 4118, + "time_per_iteration": 3.0203771591186523 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046142, + "balance_loss_mlp": 1.00644493, + "epoch": 0.7924201616006156, + "flos": 422245031424.0, + "grad_norm": 0.03727479456025455, + "language_loss": 0.84903586, + "learning_rate": 0.00010880692301956601, + "loss": 0.85949719, + "num_input_tokens_seen": 341600272, + "router_z_loss_mlp": 0.39672852, + "step": 4119, + "time_per_iteration": 2.5143675804138184 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104384, + "balance_loss_mlp": 1.00416684, + "epoch": 0.7926125432858792, + "flos": 619105817088.0, + "grad_norm": 0.030768589003691713, + "language_loss": 0.86626256, + "learning_rate": 0.00010861297240318518, + "loss": 0.876701, + "num_input_tokens_seen": 341682096, + "router_z_loss_mlp": 0.39648438, + "step": 4120, + "time_per_iteration": 2.870023250579834 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045785, + "balance_loss_mlp": 1.00611162, + "epoch": 0.7928049249711427, + "flos": 603611988480.0, + "grad_norm": 0.0348759372841926, + "language_loss": 0.8754127, + "learning_rate": 0.00010841917373672444, + "loss": 0.88587052, + "num_input_tokens_seen": 341754912, + "router_z_loss_mlp": 0.39648438, + "step": 4121, + "time_per_iteration": 2.7993838787078857 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045406, + "balance_loss_mlp": 1.00568569, + "epoch": 0.7929973066564063, + "flos": 657232365312.0, + "grad_norm": 0.04825872036668382, + "language_loss": 0.79469776, + "learning_rate": 0.00010822552709542293, + "loss": 0.80515188, + "num_input_tokens_seen": 341831152, + "router_z_loss_mlp": 0.39697266, + "step": 4122, + "time_per_iteration": 2.8277747631073 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104501, + "balance_loss_mlp": 1.00526559, + "epoch": 0.7931896883416699, + "flos": 537435377664.0, + "grad_norm": 0.033652478318624945, + "language_loss": 0.86540711, + "learning_rate": 0.0001080320325544612, + "loss": 0.87585717, + "num_input_tokens_seen": 341903552, + "router_z_loss_mlp": 0.3972168, + "step": 4123, + "time_per_iteration": 2.7195277214050293 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043876, + "balance_loss_mlp": 1.00394118, + "epoch": 0.7933820700269334, + "flos": 499069701888.0, + "grad_norm": 0.034451341323961555, + "language_loss": 0.83510745, + "learning_rate": 0.00010783869018895997, + "loss": 0.84554619, + "num_input_tokens_seen": 341972256, + "router_z_loss_mlp": 0.39916992, + "step": 4124, + "time_per_iteration": 2.577709197998047 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044871, + "balance_loss_mlp": 1.00495958, + "epoch": 0.793574451712197, + "flos": 538496878848.0, + "grad_norm": 0.03367415266088285, + "language_loss": 0.84549522, + "learning_rate": 0.00010764550007398189, + "loss": 0.85594392, + "num_input_tokens_seen": 342040496, + "router_z_loss_mlp": 0.39892578, + "step": 4125, + "time_per_iteration": 4.054261207580566 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045425, + "balance_loss_mlp": 1.00556159, + "epoch": 0.7937668333974606, + "flos": 489259567104.0, + "grad_norm": 0.03475053715190497, + "language_loss": 0.82054108, + "learning_rate": 0.00010745246228452982, + "loss": 0.83099532, + "num_input_tokens_seen": 342108512, + "router_z_loss_mlp": 0.3984375, + "step": 4126, + "time_per_iteration": 2.5979418754577637 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045362, + "balance_loss_mlp": 1.0054512, + "epoch": 0.7939592150827242, + "flos": 528480664320.0, + "grad_norm": 0.03444144820805524, + "language_loss": 0.8203451, + "learning_rate": 0.00010725957689554771, + "loss": 0.83079869, + "num_input_tokens_seen": 342183568, + "router_z_loss_mlp": 0.39892578, + "step": 4127, + "time_per_iteration": 2.7990803718566895 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043644, + "balance_loss_mlp": 1.0037564, + "epoch": 0.7941515967679876, + "flos": 542804121600.0, + "grad_norm": 0.027974353873713647, + "language_loss": 0.84939337, + "learning_rate": 0.00010706684398192013, + "loss": 0.85982978, + "num_input_tokens_seen": 342259920, + "router_z_loss_mlp": 0.39868164, + "step": 4128, + "time_per_iteration": 2.6992971897125244 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043335, + "balance_loss_mlp": 1.00342357, + "epoch": 0.7943439784532512, + "flos": 519524005632.0, + "grad_norm": 0.0378035902598828, + "language_loss": 0.82137024, + "learning_rate": 0.00010687426361847313, + "loss": 0.83180356, + "num_input_tokens_seen": 342330192, + "router_z_loss_mlp": 0.39892578, + "step": 4129, + "time_per_iteration": 2.7055931091308594 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043645, + "balance_loss_mlp": 1.00368559, + "epoch": 0.7945363601385148, + "flos": 510060901632.0, + "grad_norm": 0.033194408400906726, + "language_loss": 0.86515343, + "learning_rate": 0.00010668183587997254, + "loss": 0.87558991, + "num_input_tokens_seen": 342398944, + "router_z_loss_mlp": 0.39941406, + "step": 4130, + "time_per_iteration": 2.6280934810638428 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043077, + "balance_loss_mlp": 1.00318933, + "epoch": 0.7947287418237784, + "flos": 652402147584.0, + "grad_norm": 0.029896706291295146, + "language_loss": 0.77920771, + "learning_rate": 0.0001064895608411256, + "loss": 0.78963846, + "num_input_tokens_seen": 342474000, + "router_z_loss_mlp": 0.39868164, + "step": 4131, + "time_per_iteration": 2.8259942531585693 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042645, + "balance_loss_mlp": 1.00282872, + "epoch": 0.794921123509042, + "flos": 697374012672.0, + "grad_norm": 0.04755906636232369, + "language_loss": 0.80848777, + "learning_rate": 0.00010629743857657998, + "loss": 0.81891429, + "num_input_tokens_seen": 342549184, + "router_z_loss_mlp": 0.39794922, + "step": 4132, + "time_per_iteration": 2.8961074352264404 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047832, + "balance_loss_mlp": 1.00901794, + "epoch": 0.7951135051943055, + "flos": 1406079441408.0, + "grad_norm": 0.006864430064478978, + "language_loss": 0.70598668, + "learning_rate": 0.0001061054691609244, + "loss": 0.716465, + "num_input_tokens_seen": 342767376, + "router_z_loss_mlp": 0.38769531, + "step": 4133, + "time_per_iteration": 4.614002704620361 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045085, + "balance_loss_mlp": 1.00524557, + "epoch": 0.795305886879569, + "flos": 811450368000.0, + "grad_norm": 0.03507425862831722, + "language_loss": 0.82587457, + "learning_rate": 0.00010591365266868802, + "loss": 0.83632547, + "num_input_tokens_seen": 342845024, + "router_z_loss_mlp": 0.39819336, + "step": 4134, + "time_per_iteration": 2.9641194343566895 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044941, + "balance_loss_mlp": 1.0061264, + "epoch": 0.7954982685648326, + "flos": 1429216660992.0, + "grad_norm": 0.005948416138120475, + "language_loss": 0.75511783, + "learning_rate": 0.00010572198917434018, + "loss": 0.76556724, + "num_input_tokens_seen": 343072496, + "router_z_loss_mlp": 0.38769531, + "step": 4135, + "time_per_iteration": 4.960731029510498 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045363, + "balance_loss_mlp": 1.00557125, + "epoch": 0.7956906502500962, + "flos": 390748005120.0, + "grad_norm": 0.05367501121915611, + "language_loss": 0.80196106, + "learning_rate": 0.00010553047875229166, + "loss": 0.81241471, + "num_input_tokens_seen": 343136928, + "router_z_loss_mlp": 0.39770508, + "step": 4136, + "time_per_iteration": 2.5680596828460693 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045778, + "balance_loss_mlp": 1.00596261, + "epoch": 0.7958830319353598, + "flos": 516586411008.0, + "grad_norm": 0.03268572059370949, + "language_loss": 0.83743113, + "learning_rate": 0.00010533912147689328, + "loss": 0.84788889, + "num_input_tokens_seen": 343207440, + "router_z_loss_mlp": 0.39794922, + "step": 4137, + "time_per_iteration": 2.6882131099700928 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044712, + "balance_loss_mlp": 1.00492001, + "epoch": 0.7960754136206233, + "flos": 494927709696.0, + "grad_norm": 0.03240268195496617, + "language_loss": 0.82921439, + "learning_rate": 0.00010514791742243656, + "loss": 0.83966154, + "num_input_tokens_seen": 343273744, + "router_z_loss_mlp": 0.39770508, + "step": 4138, + "time_per_iteration": 2.5695807933807373 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044621, + "balance_loss_mlp": 1.0049001, + "epoch": 0.7962677953058869, + "flos": 657006843648.0, + "grad_norm": 0.03902501447603489, + "language_loss": 0.83096194, + "learning_rate": 0.00010495686666315341, + "loss": 0.84140819, + "num_input_tokens_seen": 343357648, + "router_z_loss_mlp": 0.39697266, + "step": 4139, + "time_per_iteration": 2.8975212574005127 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044322, + "balance_loss_mlp": 1.00443506, + "epoch": 0.7964601769911505, + "flos": 543420415488.0, + "grad_norm": 0.04091295752087844, + "language_loss": 0.777354, + "learning_rate": 0.00010476596927321635, + "loss": 0.78779727, + "num_input_tokens_seen": 343425344, + "router_z_loss_mlp": 0.39868164, + "step": 4140, + "time_per_iteration": 2.654552459716797 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047667, + "balance_loss_mlp": 1.00785065, + "epoch": 0.796652558676414, + "flos": 538827379968.0, + "grad_norm": 0.03162317226196635, + "language_loss": 0.80818027, + "learning_rate": 0.00010457522532673835, + "loss": 0.81865692, + "num_input_tokens_seen": 343504960, + "router_z_loss_mlp": 0.39794922, + "step": 4141, + "time_per_iteration": 2.842707633972168 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044598, + "balance_loss_mlp": 1.00480628, + "epoch": 0.7968449403616775, + "flos": 476052046080.0, + "grad_norm": 0.03609806445163449, + "language_loss": 0.83603644, + "learning_rate": 0.00010438463489777272, + "loss": 0.8464824, + "num_input_tokens_seen": 343570832, + "router_z_loss_mlp": 0.39770508, + "step": 4142, + "time_per_iteration": 2.5717051029205322 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042671, + "balance_loss_mlp": 1.00287914, + "epoch": 0.7970373220469411, + "flos": 568726324224.0, + "grad_norm": 0.03529843245430609, + "language_loss": 0.7784009, + "learning_rate": 0.00010419419806031316, + "loss": 0.78882766, + "num_input_tokens_seen": 343639808, + "router_z_loss_mlp": 0.39770508, + "step": 4143, + "time_per_iteration": 2.6530473232269287 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044491, + "balance_loss_mlp": 1.00467539, + "epoch": 0.7972297037322047, + "flos": 557351155200.0, + "grad_norm": 0.03335474096113663, + "language_loss": 0.84457743, + "learning_rate": 0.00010400391488829403, + "loss": 0.85502243, + "num_input_tokens_seen": 343715232, + "router_z_loss_mlp": 0.39794922, + "step": 4144, + "time_per_iteration": 2.832122564315796 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044556, + "balance_loss_mlp": 1.00471592, + "epoch": 0.7974220854174683, + "flos": 577307761920.0, + "grad_norm": 0.030245112607884015, + "language_loss": 0.87015516, + "learning_rate": 0.00010381378545558984, + "loss": 0.88060075, + "num_input_tokens_seen": 343787168, + "router_z_loss_mlp": 0.39819336, + "step": 4145, + "time_per_iteration": 2.6970877647399902 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104421, + "balance_loss_mlp": 1.00434661, + "epoch": 0.7976144671027319, + "flos": 484056073728.0, + "grad_norm": 0.03356319241102144, + "language_loss": 0.8495326, + "learning_rate": 0.00010362380983601505, + "loss": 0.85997462, + "num_input_tokens_seen": 343853600, + "router_z_loss_mlp": 0.3984375, + "step": 4146, + "time_per_iteration": 2.5355587005615234 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044045, + "balance_loss_mlp": 1.00420487, + "epoch": 0.7978068487879953, + "flos": 1079654319360.0, + "grad_norm": 0.028459484935127146, + "language_loss": 0.79190552, + "learning_rate": 0.00010343398810332477, + "loss": 0.80234593, + "num_input_tokens_seen": 343942816, + "router_z_loss_mlp": 0.39819336, + "step": 4147, + "time_per_iteration": 3.4484007358551025 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043591, + "balance_loss_mlp": 1.00370336, + "epoch": 0.7979992304732589, + "flos": 735016469760.0, + "grad_norm": 0.038421904097834796, + "language_loss": 0.84714222, + "learning_rate": 0.00010324432033121467, + "loss": 0.8575781, + "num_input_tokens_seen": 344021232, + "router_z_loss_mlp": 0.39868164, + "step": 4148, + "time_per_iteration": 2.8759710788726807 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044627, + "balance_loss_mlp": 1.00476301, + "epoch": 0.7981916121585225, + "flos": 416750887680.0, + "grad_norm": 0.03692074531599656, + "language_loss": 0.84042895, + "learning_rate": 0.00010305480659332005, + "loss": 0.85087514, + "num_input_tokens_seen": 344089616, + "router_z_loss_mlp": 0.3984375, + "step": 4149, + "time_per_iteration": 2.6903555393218994 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044845, + "balance_loss_mlp": 1.00493348, + "epoch": 0.7983839938437861, + "flos": 466213721088.0, + "grad_norm": 0.03398705424173267, + "language_loss": 0.84049666, + "learning_rate": 0.00010286544696321682, + "loss": 0.85094512, + "num_input_tokens_seen": 344154992, + "router_z_loss_mlp": 0.39892578, + "step": 4150, + "time_per_iteration": 2.5223419666290283 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047155, + "balance_loss_mlp": 1.00731564, + "epoch": 0.7985763755290496, + "flos": 511623990528.0, + "grad_norm": 0.03850329476813429, + "language_loss": 0.80184937, + "learning_rate": 0.00010267624151442073, + "loss": 0.81232083, + "num_input_tokens_seen": 344225232, + "router_z_loss_mlp": 0.39819336, + "step": 4151, + "time_per_iteration": 2.6372790336608887 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045244, + "balance_loss_mlp": 1.00545216, + "epoch": 0.7987687572143132, + "flos": 1012279147008.0, + "grad_norm": 0.036156953147693155, + "language_loss": 0.81612265, + "learning_rate": 0.000102487190320388, + "loss": 0.8265751, + "num_input_tokens_seen": 344309120, + "router_z_loss_mlp": 0.39770508, + "step": 4152, + "time_per_iteration": 3.3100497722625732 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046289, + "balance_loss_mlp": 1.00644886, + "epoch": 0.7989611388995768, + "flos": 1022749317120.0, + "grad_norm": 0.0483734968534093, + "language_loss": 0.80480343, + "learning_rate": 0.00010229829345451475, + "loss": 0.81526625, + "num_input_tokens_seen": 344394112, + "router_z_loss_mlp": 0.39819336, + "step": 4153, + "time_per_iteration": 3.305338144302368 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048343, + "balance_loss_mlp": 1.00855112, + "epoch": 0.7991535205848403, + "flos": 1103038447872.0, + "grad_norm": 0.03770888532142324, + "language_loss": 0.80308628, + "learning_rate": 0.00010210955099013724, + "loss": 0.81356978, + "num_input_tokens_seen": 344476512, + "router_z_loss_mlp": 0.39770508, + "step": 4154, + "time_per_iteration": 3.409900188446045 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047808, + "balance_loss_mlp": 1.00789726, + "epoch": 0.7993459022701039, + "flos": 836280933888.0, + "grad_norm": 0.04128229855953485, + "language_loss": 0.77654159, + "learning_rate": 0.00010192096300053167, + "loss": 0.78701961, + "num_input_tokens_seen": 344561088, + "router_z_loss_mlp": 0.39892578, + "step": 4155, + "time_per_iteration": 3.1075351238250732 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043815, + "balance_loss_mlp": 1.00387979, + "epoch": 0.7995382839553674, + "flos": 523770010368.0, + "grad_norm": 0.043215230874116634, + "language_loss": 0.85791343, + "learning_rate": 0.00010173252955891477, + "loss": 0.86835158, + "num_input_tokens_seen": 344639424, + "router_z_loss_mlp": 0.39916992, + "step": 4156, + "time_per_iteration": 2.741454839706421 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104347, + "balance_loss_mlp": 1.00358212, + "epoch": 0.799730665640631, + "flos": 538859460864.0, + "grad_norm": 0.0402681416401722, + "language_loss": 0.73719215, + "learning_rate": 0.00010154425073844253, + "loss": 0.74762684, + "num_input_tokens_seen": 344710048, + "router_z_loss_mlp": 0.39868164, + "step": 4157, + "time_per_iteration": 2.709291458129883 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043712, + "balance_loss_mlp": 1.00384808, + "epoch": 0.7999230473258946, + "flos": 506068608768.0, + "grad_norm": 0.03223966585630621, + "language_loss": 0.82729542, + "learning_rate": 0.00010135612661221138, + "loss": 0.83773249, + "num_input_tokens_seen": 344776832, + "router_z_loss_mlp": 0.3984375, + "step": 4158, + "time_per_iteration": 2.557003974914551 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043464, + "balance_loss_mlp": 1.00357628, + "epoch": 0.8001154290111582, + "flos": 1028977373184.0, + "grad_norm": 0.03912877230354993, + "language_loss": 0.82057023, + "learning_rate": 0.00010116815725325751, + "loss": 0.83100486, + "num_input_tokens_seen": 344864928, + "router_z_loss_mlp": 0.39868164, + "step": 4159, + "time_per_iteration": 3.304746389389038 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043882, + "balance_loss_mlp": 1.00401807, + "epoch": 0.8003078106964217, + "flos": 752270718720.0, + "grad_norm": 0.03707561964119669, + "language_loss": 0.81281012, + "learning_rate": 0.00010098034273455725, + "loss": 0.82324892, + "num_input_tokens_seen": 344944048, + "router_z_loss_mlp": 0.3984375, + "step": 4160, + "time_per_iteration": 2.93477463722229 + } + ], + "logging_steps": 1.0, + "max_steps": 5198, + "num_input_tokens_seen": 344944048, + "num_train_epochs": 1, + "save_steps": 1040, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 9407559341441024.0, + "train_batch_size": 16, + "trial_name": null, + "trial_params": null +} diff --git a/sft_pretrain/Full_xmoe/checkpoint-4160/training_args.bin b/sft_pretrain/Full_xmoe/checkpoint-4160/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..e62437ed6fbf4cf3ea22fcfae3749bb9df2d0109 --- /dev/null +++ b/sft_pretrain/Full_xmoe/checkpoint-4160/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4144fbe7f1cf435dbbf0ef9621414cb3e97a5ff4a560571b878000caf2931b07 +size 7992 diff --git a/sft_pretrain/Full_xmoe/checkpoint-4160/zero_to_fp32.py b/sft_pretrain/Full_xmoe/checkpoint-4160/zero_to_fp32.py new file mode 100644 index 0000000000000000000000000000000000000000..24cc342e78d1a006c782b3a4cd68d9ce786d8fd8 --- /dev/null +++ b/sft_pretrain/Full_xmoe/checkpoint-4160/zero_to_fp32.py @@ -0,0 +1,604 @@ +#!/usr/bin/env python + +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +# This script extracts fp32 consolidated weights from a zero 1, 2 and 3 DeepSpeed checkpoints. It gets +# copied into the top level checkpoint dir, so the user can easily do the conversion at any point in +# the future. Once extracted, the weights don't require DeepSpeed and can be used in any +# application. +# +# example: python zero_to_fp32.py . pytorch_model.bin + +import argparse +import torch +import glob +import math +import os +import re +from collections import OrderedDict +from dataclasses import dataclass + +# while this script doesn't use deepspeed to recover data, since the checkpoints are pickled with +# DeepSpeed data structures it has to be available in the current python environment. +from deepspeed.utils import logger +from deepspeed.checkpoint.constants import (DS_VERSION, OPTIMIZER_STATE_DICT, SINGLE_PARTITION_OF_FP32_GROUPS, + FP32_FLAT_GROUPS, ZERO_STAGE, PARTITION_COUNT, PARAM_SHAPES, BUFFER_NAMES, + FROZEN_PARAM_SHAPES, FROZEN_PARAM_FRAGMENTS) + + +@dataclass +class zero_model_state: + buffers: dict() + param_shapes: dict() + shared_params: list + ds_version: int + frozen_param_shapes: dict() + frozen_param_fragments: dict() + + +debug = 0 + +# load to cpu +device = torch.device('cpu') + + +def atoi(text): + return int(text) if text.isdigit() else text + + +def natural_keys(text): + ''' + alist.sort(key=natural_keys) sorts in human order + http://nedbatchelder.com/blog/200712/human_sorting.html + (See Toothy's implementation in the comments) + ''' + return [atoi(c) for c in re.split(r'(\d+)', text)] + + +def get_model_state_file(checkpoint_dir, zero_stage): + if not os.path.isdir(checkpoint_dir): + raise FileNotFoundError(f"Directory '{checkpoint_dir}' doesn't exist") + + # there should be only one file + if zero_stage <= 2: + file = os.path.join(checkpoint_dir, "mp_rank_00_model_states.pt") + elif zero_stage == 3: + file = os.path.join(checkpoint_dir, "zero_pp_rank_0_mp_rank_00_model_states.pt") + + if not os.path.exists(file): + raise FileNotFoundError(f"can't find model states file at '{file}'") + + return file + + +def get_checkpoint_files(checkpoint_dir, glob_pattern): + # XXX: need to test that this simple glob rule works for multi-node setup too + ckpt_files = sorted(glob.glob(os.path.join(checkpoint_dir, glob_pattern)), key=natural_keys) + + if len(ckpt_files) == 0: + raise FileNotFoundError(f"can't find {glob_pattern} files in directory '{checkpoint_dir}'") + + return ckpt_files + + +def get_optim_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_optim_states.pt") + + +def get_model_state_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_model_states.pt") + + +def parse_model_states(files): + zero_model_states = [] + for file in files: + state_dict = torch.load(file, map_location=device) + + if BUFFER_NAMES not in state_dict: + raise ValueError(f"{file} is not a model state checkpoint") + buffer_names = state_dict[BUFFER_NAMES] + if debug: + print("Found buffers:", buffer_names) + + # recover just the buffers while restoring them to fp32 if they were saved in fp16 + buffers = {k: v.float() for k, v in state_dict["module"].items() if k in buffer_names} + param_shapes = state_dict[PARAM_SHAPES] + + # collect parameters that are included in param_shapes + param_names = [] + for s in param_shapes: + for name in s.keys(): + param_names.append(name) + + # update with frozen parameters + frozen_param_shapes = state_dict.get(FROZEN_PARAM_SHAPES, None) + if frozen_param_shapes is not None: + if debug: + print(f"Found frozen_param_shapes: {frozen_param_shapes}") + param_names += list(frozen_param_shapes.keys()) + + # handle shared params + shared_params = [[k, v] for k, v in state_dict["shared_params"].items()] + + ds_version = state_dict.get(DS_VERSION, None) + + frozen_param_fragments = state_dict.get(FROZEN_PARAM_FRAGMENTS, None) + + z_model_state = zero_model_state(buffers=buffers, + param_shapes=param_shapes, + shared_params=shared_params, + ds_version=ds_version, + frozen_param_shapes=frozen_param_shapes, + frozen_param_fragments=frozen_param_fragments) + zero_model_states.append(z_model_state) + + return zero_model_states + + +def parse_optim_states(files, ds_checkpoint_dir): + + total_files = len(files) + state_dicts = [] + for f in files: + state_dict = torch.load(f, map_location=device) + # immediately discard the potentially huge 2 optimizer states as we only care for fp32 master weights + # and also handle the case where it was already removed by another helper script + state_dict["optimizer_state_dict"].pop("optimizer_state_dict", None) + state_dicts.append(state_dict) + + if not ZERO_STAGE in state_dicts[0][OPTIMIZER_STATE_DICT]: + raise ValueError(f"{files[0]} is not a zero checkpoint") + zero_stage = state_dicts[0][OPTIMIZER_STATE_DICT][ZERO_STAGE] + world_size = state_dicts[0][OPTIMIZER_STATE_DICT][PARTITION_COUNT] + + # For ZeRO-2 each param group can have different partition_count as data parallelism for expert + # parameters can be different from data parallelism for non-expert parameters. So we can just + # use the max of the partition_count to get the dp world_size. + + if type(world_size) is list: + world_size = max(world_size) + + if world_size != total_files: + raise ValueError( + f"Expected {world_size} of '*_optim_states.pt' under '{ds_checkpoint_dir}' but found {total_files} files. " + "Possibly due to an overwrite of an old checkpoint, or a checkpoint didn't get saved by one or more processes." + ) + + # the groups are named differently in each stage + if zero_stage <= 2: + fp32_groups_key = SINGLE_PARTITION_OF_FP32_GROUPS + elif zero_stage == 3: + fp32_groups_key = FP32_FLAT_GROUPS + else: + raise ValueError(f"unknown zero stage {zero_stage}") + + if zero_stage <= 2: + fp32_flat_groups = [state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key] for i in range(len(state_dicts))] + elif zero_stage == 3: + # if there is more than one param group, there will be multiple flattened tensors - one + # flattened tensor per group - for simplicity merge them into a single tensor + # + # XXX: could make the script more memory efficient for when there are multiple groups - it + # will require matching the sub-lists of param_shapes for each param group flattened tensor + + fp32_flat_groups = [ + torch.cat(state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key], 0) for i in range(len(state_dicts)) + ] + + return zero_stage, world_size, fp32_flat_groups + + +def _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters): + """ + Returns fp32 state_dict reconstructed from ds checkpoint + + Args: + - ``ds_checkpoint_dir``: path to the deepspeed checkpoint folder (where the optimizer files are) + + """ + print(f"Processing zero checkpoint '{ds_checkpoint_dir}'") + + optim_files = get_optim_files(ds_checkpoint_dir) + zero_stage, world_size, fp32_flat_groups = parse_optim_states(optim_files, ds_checkpoint_dir) + print(f"Detected checkpoint of type zero stage {zero_stage}, world_size: {world_size}") + + model_files = get_model_state_files(ds_checkpoint_dir) + + zero_model_states = parse_model_states(model_files) + print(f'Parsing checkpoint created by deepspeed=={zero_model_states[0].ds_version}') + + if zero_stage <= 2: + return _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + elif zero_stage == 3: + return _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + + +def _zero2_merge_frozen_params(state_dict, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + frozen_param_fragments = zero_model_states[0].frozen_param_fragments + + if debug: + num_elem = sum(s.numel() for s in frozen_param_shapes.values()) + print(f'rank 0: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + wanted_params = len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in frozen_param_fragments.values()]) + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + state_dict[name] = frozen_param_fragments[name] + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +def _has_callable(obj, fn): + attr = getattr(obj, fn, None) + return callable(attr) + + +def _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + + # Reconstruction protocol: + # + # XXX: document this + + if debug: + for i in range(world_size): + for j in range(len(fp32_flat_groups[0])): + print(f"{FP32_FLAT_GROUPS}[{i}][{j}].shape={fp32_flat_groups[i][j].shape}") + + # XXX: memory usage doubles here (zero2) + num_param_groups = len(fp32_flat_groups[0]) + merged_single_partition_of_fp32_groups = [] + for i in range(num_param_groups): + merged_partitions = [sd[i] for sd in fp32_flat_groups] + full_single_fp32_vector = torch.cat(merged_partitions, 0) + merged_single_partition_of_fp32_groups.append(full_single_fp32_vector) + avail_numel = sum( + [full_single_fp32_vector.numel() for full_single_fp32_vector in merged_single_partition_of_fp32_groups]) + + if debug: + wanted_params = sum([len(shapes) for shapes in param_shapes]) + wanted_numel = sum([sum(shape.numel() for shape in shapes.values()) for shapes in param_shapes]) + # not asserting if there is a mismatch due to possible padding + print(f"Have {avail_numel} numels to process.") + print(f"Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + total_numel = 0 + total_params = 0 + for shapes, full_single_fp32_vector in zip(param_shapes, merged_single_partition_of_fp32_groups): + offset = 0 + avail_numel = full_single_fp32_vector.numel() + for name, shape in shapes.items(): + + unpartitioned_numel = shape.numel() if _has_callable(shape, 'numel') else math.prod(shape) + total_numel += unpartitioned_numel + total_params += 1 + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + state_dict[name] = full_single_fp32_vector.narrow(0, offset, unpartitioned_numel).view(shape) + offset += unpartitioned_numel + + # Z2 started to align to 2*world_size to improve nccl performance. Therefore both offset and + # avail_numel can differ by anywhere between 0..2*world_size. Due to two unrelated complex + # paddings performed in the code it's almost impossible to predict the exact numbers w/o the + # live optimizer object, so we are checking that the numbers are within the right range + align_to = 2 * world_size + + def zero2_align(x): + return align_to * math.ceil(x / align_to) + + if debug: + print(f"original offset={offset}, avail_numel={avail_numel}") + + offset = zero2_align(offset) + avail_numel = zero2_align(avail_numel) + + if debug: + print(f"aligned offset={offset}, avail_numel={avail_numel}") + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero2_merge_frozen_params(state_dict, zero_model_states) + + _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def zero3_partitioned_param_info(unpartitioned_numel, world_size): + remainder = unpartitioned_numel % world_size + padding_numel = (world_size - remainder) if remainder else 0 + partitioned_numel = math.ceil(unpartitioned_numel / world_size) + return partitioned_numel, padding_numel + + +def _zero3_merge_frozen_params(state_dict, world_size, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + if debug: + for i in range(world_size): + num_elem = sum(s.numel() for s in zero_model_states[i].frozen_param_fragments.values()) + print(f'rank {i}: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + wanted_params = len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in zero_model_states[0].frozen_param_fragments.values()]) * world_size + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in zero_model_states[0].frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + param_frags = tuple(model_state.frozen_param_fragments[name] for model_state in zero_model_states) + state_dict[name] = torch.cat(param_frags, 0).narrow(0, 0, unpartitioned_numel).view(shape) + + partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Frozen params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +def _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + avail_numel = fp32_flat_groups[0].numel() * world_size + # Reconstruction protocol: For zero3 we need to zip the partitions together at boundary of each + # param, re-consolidating each param, while dealing with padding if any + + # merge list of dicts, preserving order + param_shapes = {k: v for d in param_shapes for k, v in d.items()} + + if debug: + for i in range(world_size): + print(f"{FP32_FLAT_GROUPS}[{i}].shape={fp32_flat_groups[i].shape}") + + wanted_params = len(param_shapes) + wanted_numel = sum(shape.numel() for shape in param_shapes.values()) + # not asserting if there is a mismatch due to possible padding + avail_numel = fp32_flat_groups[0].numel() * world_size + print(f"Trainable params: Have {avail_numel} numels to process.") + print(f"Trainable params: Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + offset = 0 + total_numel = 0 + total_params = 0 + for name, shape in param_shapes.items(): + + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + total_params += 1 + + partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Trainable params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + # XXX: memory usage doubles here + state_dict[name] = torch.cat( + tuple(fp32_flat_groups[i].narrow(0, offset, partitioned_numel) for i in range(world_size)), + 0).narrow(0, 0, unpartitioned_numel).view(shape) + offset += partitioned_numel + + offset *= world_size + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed Trainable fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero3_merge_frozen_params(state_dict, world_size, zero_model_states) + + _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag=None, exclude_frozen_parameters=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated state_dict that can be loaded with + ``load_state_dict()`` and used for training without DeepSpeed or shared with others, for example + via a model hub. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in 'latest' file. e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + + Returns: + - pytorch ``state_dict`` + + Note: this approach may not work if your application doesn't have sufficient free CPU memory and + you may need to use the offline approach using the ``zero_to_fp32.py`` script that is saved with + the checkpoint. + + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint + # do the training and checkpoint saving + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir) # already on cpu + model = model.cpu() # move to cpu + model.load_state_dict(state_dict) + # submit to model hub or save the model to share with others + + In this example the ``model`` will no longer be usable in the deepspeed context of the same + application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + If you want it all done for you, use ``load_state_dict_from_zero_checkpoint`` instead. + + """ + if tag is None: + latest_path = os.path.join(checkpoint_dir, 'latest') + if os.path.isfile(latest_path): + with open(latest_path, 'r') as fd: + tag = fd.read().strip() + else: + raise ValueError(f"Unable to find 'latest' file at {latest_path}") + + ds_checkpoint_dir = os.path.join(checkpoint_dir, tag) + + if not os.path.isdir(ds_checkpoint_dir): + raise FileNotFoundError(f"Directory '{ds_checkpoint_dir}' doesn't exist") + + return _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters) + + +def convert_zero_checkpoint_to_fp32_state_dict(checkpoint_dir, output_file, tag=None, exclude_frozen_parameters=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` file that can be + loaded with ``torch.load(file)`` + ``load_state_dict()`` and used for training without DeepSpeed. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``output_file``: path to the pytorch fp32 state_dict output file (e.g. path/pytorch_model.bin) + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + """ + + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag, exclude_frozen_parameters) + print(f"Saving fp32 state dict to {output_file}") + torch.save(state_dict, output_file) + + +def load_state_dict_from_zero_checkpoint(model, checkpoint_dir, tag=None): + """ + 1. Put the provided model to cpu + 2. Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` + 3. Load it into the provided model + + Args: + - ``model``: the model object to update + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + + Returns: + - ``model`: modified model + + Make sure you have plenty of CPU memory available before you call this function. If you don't + have enough use the ``zero_to_fp32.py`` utility to do the conversion. You will find it + conveniently placed for you in the checkpoint folder. + + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import load_state_dict_from_zero_checkpoint + model = load_state_dict_from_zero_checkpoint(trainer.model, checkpoint_dir) + # submit to model hub or save the model to share with others + + Note, that once this was run, the ``model`` will no longer be usable in the deepspeed context + of the same application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + """ + logger.info(f"Extracting fp32 weights") + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag) + + logger.info(f"Overwriting model with fp32 weights") + model = model.cpu() + model.load_state_dict(state_dict, strict=False) + + return model + + +if __name__ == "__main__": + + parser = argparse.ArgumentParser() + parser.add_argument("checkpoint_dir", + type=str, + help="path to the desired checkpoint folder, e.g., path/checkpoint-12") + parser.add_argument( + "output_file", + type=str, + help="path to the pytorch fp32 state_dict output file (e.g. path/checkpoint-12/pytorch_model.bin)") + parser.add_argument("-t", + "--tag", + type=str, + default=None, + help="checkpoint tag used as a unique identifier for checkpoint. e.g., global_step1") + parser.add_argument("--exclude_frozen_parameters", action='store_true', help="exclude frozen parameters") + parser.add_argument("-d", "--debug", action='store_true', help="enable debug") + args = parser.parse_args() + + debug = args.debug + + convert_zero_checkpoint_to_fp32_state_dict(args.checkpoint_dir, + args.output_file, + tag=args.tag, + exclude_frozen_parameters=args.exclude_frozen_parameters) diff --git a/sft_pretrain/Full_xmoe/checkpoint-5198/added_tokens.json b/sft_pretrain/Full_xmoe/checkpoint-5198/added_tokens.json new file mode 100644 index 0000000000000000000000000000000000000000..c9d3d3a1b74d87e381e471f7b33784015d2dc0ea --- /dev/null +++ b/sft_pretrain/Full_xmoe/checkpoint-5198/added_tokens.json @@ -0,0 +1,13 @@ +{ + "<|assistant|>": 32001, + "<|endoftext|>": 32000, + "<|end|>": 32007, + "<|placeholder1|>": 32002, + "<|placeholder2|>": 32003, + "<|placeholder3|>": 32004, + "<|placeholder4|>": 32005, + "<|placeholder5|>": 32008, + "<|placeholder6|>": 32009, + "<|system|>": 32006, + "<|user|>": 32010 +} diff --git a/sft_pretrain/Full_xmoe/checkpoint-5198/config.json b/sft_pretrain/Full_xmoe/checkpoint-5198/config.json new file mode 100644 index 0000000000000000000000000000000000000000..5ed860286ec8c9b3f17e5234326d2ed728ca6a65 --- /dev/null +++ b/sft_pretrain/Full_xmoe/checkpoint-5198/config.json @@ -0,0 +1,200 @@ +{ + "_name_or_path": "/cm/archive/namnv78/checkpoints/phi35-siglip224/pft", + "architectures": [ + "LlavaPhiForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "auto_map": { + "AutoConfig": "configuration_phi3.Phi3Config", + "AutoModelForCausalLM": "modeling_phi3.Phi3ForCausalLM" + }, + "bal_comp_loss_coef": 0.01, + "balance_loss_coef": 0.01, + "bos_token_id": 1, + "clip_smoe": false, + "diversity_loss_coef": 0.01, + "dropout": false, + "e_loss_coef": 0.001, + "embd_pdrop": 0.0, + "entropy_advance_loss": false, + "eos_token_id": 32000, + "freeze_backbone": false, + "freeze_mm_mlp_adapter": false, + "hidden_act": "silu", + "hidden_size": 3072, + "hybrid": false, + "image_aspect_ratio": "pad", + "init_weight": true, + "initializer_range": 0.02, + "intermediate_size": 8192, + "is_cosine": false, + "is_norm_weight": false, + "local_rank": 0, + "loss1": "balanceloss", + "loss2": "zloss", + "luna": false, + "max_compete_in_iter": 3, + "max_position_embeddings": 131072, + "mlp_smoe": true, + "mm_hidden_size": 1152, + "mm_patch_merge_type": "flat", + "mm_projector_lr": null, + "mm_projector_type": "moe", + "mm_use_im_patch_token": false, + "mm_use_im_start_end": false, + "mm_vision_select_feature": "patch", + "mm_vision_select_layer": -2, + "mm_vision_tower": "google/siglip-so400m-patch14-224", + "model_name_or_path": "/cm/archive/namnv78/checkpoints/phi35-siglip224/pft", + "model_type": "llava_phi", + "moe_name": "xmoe", + "norm_softmax": false, + "normalization": false, + "num_attention_heads": 32, + "num_experts": 8, + "num_hidden_layers": 32, + "num_key_value_heads": 32, + "num_layers": 3, + "num_selected": 4, + "number_of_previous_tokens": 2, + "original_max_position_embeddings": 4096, + "pad_token_id": 32000, + "pretrain_mm_mlp_adapter": null, + "rate_compete": 0.2, + "rate_flip": 0.05, + "resid_pdrop": 0.0, + "rms_norm_eps": 1e-05, + "rope_scaling": { + "long_factor": [ + 1.0800000429153442, + 1.1100000143051147, + 1.1399999856948853, + 1.340000033378601, + 1.5899999141693115, + 1.600000023841858, + 1.6200000047683716, + 2.620000123977661, + 3.2300000190734863, + 3.2300000190734863, + 4.789999961853027, + 7.400000095367432, + 7.700000286102295, + 9.09000015258789, + 12.199999809265137, + 17.670000076293945, + 24.46000099182129, + 28.57000160217285, + 30.420001983642578, + 30.840002059936523, + 32.590003967285156, + 32.93000411987305, + 42.320003509521484, + 44.96000289916992, + 50.340003967285156, + 50.45000457763672, + 57.55000305175781, + 57.93000411987305, + 58.21000289916992, + 60.1400032043457, + 62.61000442504883, + 62.62000274658203, + 62.71000289916992, + 63.1400032043457, + 63.1400032043457, + 63.77000427246094, + 63.93000411987305, + 63.96000289916992, + 63.970001220703125, + 64.02999877929688, + 64.06999969482422, + 64.08000183105469, + 64.12000274658203, + 64.41000366210938, + 64.4800033569336, + 64.51000213623047, + 64.52999877929688, + 64.83999633789062 + ], + "short_factor": [ + 1.0, + 1.0199999809265137, + 1.0299999713897705, + 1.0299999713897705, + 1.0499999523162842, + 1.0499999523162842, + 1.0499999523162842, + 1.0499999523162842, + 1.0499999523162842, + 1.0699999332427979, + 1.0999999046325684, + 1.1099998950958252, + 1.1599998474121094, + 1.1599998474121094, + 1.1699998378753662, + 1.2899998426437378, + 1.339999794960022, + 1.679999828338623, + 1.7899998426437378, + 1.8199998140335083, + 1.8499997854232788, + 1.8799997568130493, + 1.9099997282028198, + 1.9399996995925903, + 1.9899996519088745, + 2.0199997425079346, + 2.0199997425079346, + 2.0199997425079346, + 2.0199997425079346, + 2.0199997425079346, + 2.0199997425079346, + 2.0299997329711914, + 2.0299997329711914, + 2.0299997329711914, + 2.0299997329711914, + 2.0299997329711914, + 2.0299997329711914, + 2.0299997329711914, + 2.0299997329711914, + 2.0299997329711914, + 2.0799996852874756, + 2.0899996757507324, + 2.189999580383301, + 2.2199995517730713, + 2.5899994373321533, + 2.729999542236328, + 2.749999523162842, + 2.8399994373321533 + ], + "type": "longrope" + }, + "rope_theta": 10000.0, + "router_loss_coef": 0.01, + "router_theta": 0.1, + "router_z_loss_coef": 0.001, + "scales": [ + 1, + 3 + ], + "sliding_window": 262144, + "sparse_upcycling": false, + "strategy_train": "base", + "tie_word_embeddings": false, + "tokenizer_model_max_length": 2048, + "tokenizer_padding_side": "right", + "topk_max": 2, + "topk_min": 1, + "torch_dtype": "bfloat16", + "training": true, + "transformers_version": "4.43.0", + "tune_mm_mlp_adapter": false, + "unit_test": true, + "use_cache": false, + "use_mm_proj": true, + "use_old": false, + "version": "phi35", + "vision_tower": "google/siglip-so400m-patch14-224", + "vision_tower_dir": "/cm/archive/namnv78/checkpoints/phi35-siglip224/pft/clip.bin", + "vocab_size": 32064, + "warm_up": 0.05 +} diff --git a/sft_pretrain/Full_xmoe/checkpoint-5198/generation_config.json b/sft_pretrain/Full_xmoe/checkpoint-5198/generation_config.json new file mode 100644 index 0000000000000000000000000000000000000000..dad5c4578f0dc5969b38755d095fc30c368bb54a --- /dev/null +++ b/sft_pretrain/Full_xmoe/checkpoint-5198/generation_config.json @@ -0,0 +1,12 @@ +{ + "_from_model_config": true, + "bos_token_id": 1, + "do_sample": true, + "eos_token_id": [ + 32007, + 32001, + 32000 + ], + "pad_token_id": 32000, + "transformers_version": "4.43.0" +} diff --git a/sft_pretrain/Full_xmoe/checkpoint-5198/global_step5198/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt b/sft_pretrain/Full_xmoe/checkpoint-5198/global_step5198/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..905a4c90bc083a732694563ea8bd226b68eaef32 --- /dev/null +++ b/sft_pretrain/Full_xmoe/checkpoint-5198/global_step5198/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8cda93fdbb0271c2d052039b2f58ae94785e40a2c1a98605662af4d2d1b6703a +size 396609872 diff --git a/sft_pretrain/Full_xmoe/checkpoint-5198/global_step5198/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt b/sft_pretrain/Full_xmoe/checkpoint-5198/global_step5198/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..ea59253fd74806fba8391ad25272a27a8a943747 --- /dev/null +++ b/sft_pretrain/Full_xmoe/checkpoint-5198/global_step5198/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6f636e93f6e6914138fee9a7438817a0e60998ae30f727cf0c3bd696c00b0f07 +size 396609872 diff --git a/sft_pretrain/Full_xmoe/checkpoint-5198/global_step5198/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt b/sft_pretrain/Full_xmoe/checkpoint-5198/global_step5198/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..13dbf2f8c10e2d54edb11f72473131ead18b52e3 --- /dev/null +++ b/sft_pretrain/Full_xmoe/checkpoint-5198/global_step5198/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8651a485753c5af79d920ec42bf34bd983789f866977012b4d290b200c6f1bdd +size 396609872 diff --git a/sft_pretrain/Full_xmoe/checkpoint-5198/global_step5198/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt b/sft_pretrain/Full_xmoe/checkpoint-5198/global_step5198/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..4a94b875ed8d220ea0a3981eec91dedbeb15cd07 --- /dev/null +++ b/sft_pretrain/Full_xmoe/checkpoint-5198/global_step5198/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:537104dfd5eac995c5fa7bd6cbd720628d6e21271c5f6a60d71932032a4f0111 +size 396609872 diff --git a/sft_pretrain/Full_xmoe/checkpoint-5198/global_step5198/zero_pp_rank_0_mp_rank_00_model_states.pt b/sft_pretrain/Full_xmoe/checkpoint-5198/global_step5198/zero_pp_rank_0_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..60e8fd19de1a759072172b148f101dbb69c23a0e --- /dev/null +++ b/sft_pretrain/Full_xmoe/checkpoint-5198/global_step5198/zero_pp_rank_0_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:59b3ba66c704737eaeed3baa812b803c6ec9e171522c56ff9f7aea6e32a8b39c +size 2117322914 diff --git a/sft_pretrain/Full_xmoe/checkpoint-5198/global_step5198/zero_pp_rank_1_mp_rank_00_model_states.pt b/sft_pretrain/Full_xmoe/checkpoint-5198/global_step5198/zero_pp_rank_1_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..ec56e94231d07e1734651863bce505f1a2b3d431 --- /dev/null +++ b/sft_pretrain/Full_xmoe/checkpoint-5198/global_step5198/zero_pp_rank_1_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8a9c2b409a386931ad32ef217586e77da1b65f0c53f80513871121b85d5ee3e2 +size 2117322914 diff --git a/sft_pretrain/Full_xmoe/checkpoint-5198/global_step5198/zero_pp_rank_2_mp_rank_00_model_states.pt b/sft_pretrain/Full_xmoe/checkpoint-5198/global_step5198/zero_pp_rank_2_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..656de63a30286d79c01640f0412f29a6332d456e --- /dev/null +++ b/sft_pretrain/Full_xmoe/checkpoint-5198/global_step5198/zero_pp_rank_2_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:640b0d60bb9e5012a64fd1eda1f8ef52b6bc080cc92afbd4c99c8bd526e36d85 +size 2117322914 diff --git a/sft_pretrain/Full_xmoe/checkpoint-5198/global_step5198/zero_pp_rank_3_mp_rank_00_model_states.pt b/sft_pretrain/Full_xmoe/checkpoint-5198/global_step5198/zero_pp_rank_3_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..69b054bbfeb0cb8f084235892d084addde142160 --- /dev/null +++ b/sft_pretrain/Full_xmoe/checkpoint-5198/global_step5198/zero_pp_rank_3_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:dfc1946d4570eb8e928b861bd7a7184455b7a844d7742e41ca38d8dff82511e5 +size 2117322914 diff --git a/sft_pretrain/Full_xmoe/checkpoint-5198/latest b/sft_pretrain/Full_xmoe/checkpoint-5198/latest new file mode 100644 index 0000000000000000000000000000000000000000..c0e63763d1d13a0ca7a3b62ff8f5cd1d69cc4978 --- /dev/null +++ b/sft_pretrain/Full_xmoe/checkpoint-5198/latest @@ -0,0 +1 @@ +global_step5198 \ No newline at end of file diff --git a/sft_pretrain/Full_xmoe/checkpoint-5198/model-00001-of-00002.safetensors b/sft_pretrain/Full_xmoe/checkpoint-5198/model-00001-of-00002.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..29d76f5d80605301aab2bba59b53a5e2582094c4 --- /dev/null +++ b/sft_pretrain/Full_xmoe/checkpoint-5198/model-00001-of-00002.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fe6c4f6ef38e8993629091331e0bbf23484cc88bdfd038f0dd17b6ec2800d855 +size 4972489328 diff --git a/sft_pretrain/Full_xmoe/checkpoint-5198/model-00002-of-00002.safetensors b/sft_pretrain/Full_xmoe/checkpoint-5198/model-00002-of-00002.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..06069d52819dbbf22163c9da5588880b2cc1c3d8 --- /dev/null +++ b/sft_pretrain/Full_xmoe/checkpoint-5198/model-00002-of-00002.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b44129307c0d6bc8186640d841927892e858873599257b9253937fa4c18940df +size 3759044016 diff --git a/sft_pretrain/Full_xmoe/checkpoint-5198/model.safetensors.index.json b/sft_pretrain/Full_xmoe/checkpoint-5198/model.safetensors.index.json new file mode 100644 index 0000000000000000000000000000000000000000..507806fb086ee2ffdb4c1df263574fc5a7cfa513 --- /dev/null +++ b/sft_pretrain/Full_xmoe/checkpoint-5198/model.safetensors.index.json @@ -0,0 +1,675 @@ +{ + "metadata": { + "total_size": 8731443248 + }, + "weight_map": { + "lm_head.weight": "model-00002-of-00002.safetensors", + "model.embed_tokens.weight": "model-00001-of-00002.safetensors", + "model.layers.0.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.0.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.0.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.0.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.0.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.0.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.1.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.1.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.1.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.1.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.1.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.1.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.10.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.10.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.10.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.10.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.10.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.10.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.11.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.11.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.11.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.11.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.11.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.11.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.12.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.12.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.12.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.12.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.12.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.12.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.13.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.13.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.13.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.13.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.13.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.13.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.14.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.14.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.14.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.14.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.14.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.14.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.15.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.15.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.15.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.15.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.15.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.15.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.16.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.16.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.16.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.16.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.16.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.16.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.17.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.17.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.17.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.17.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.17.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.17.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.18.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.18.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.18.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.18.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.18.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.18.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.19.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.19.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.19.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.19.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.19.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.19.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.2.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.2.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.2.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.2.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.2.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.2.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.20.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.20.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.20.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.20.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.20.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.20.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.21.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.21.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.21.mlp.gate_up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.21.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.21.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.21.self_attn.qkv_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.22.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.22.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.22.mlp.gate_up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.22.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.22.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.22.self_attn.qkv_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.23.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.23.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.23.mlp.gate_up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.23.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.23.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.23.self_attn.qkv_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.24.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.24.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.24.mlp.gate_up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.24.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.24.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.24.self_attn.qkv_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.25.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.25.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.25.mlp.gate_up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.25.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.25.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.25.self_attn.qkv_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.26.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.26.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.26.mlp.gate_up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.26.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.26.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.26.self_attn.qkv_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.27.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.27.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.27.mlp.gate_up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.27.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.27.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.27.self_attn.qkv_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.28.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.28.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.28.mlp.gate_up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.28.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.28.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.28.self_attn.qkv_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.29.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.29.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.29.mlp.gate_up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.29.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.29.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.29.self_attn.qkv_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.3.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.3.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.3.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.3.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.3.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.3.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.30.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.30.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.30.mlp.gate_up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.30.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.30.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.30.self_attn.qkv_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.31.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.31.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.31.mlp.gate_up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.31.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.31.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.31.self_attn.qkv_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.4.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.4.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.4.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.4.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.4.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.4.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.5.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.5.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.5.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.5.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.5.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.5.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.6.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.6.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.6.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.6.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.6.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.6.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.7.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.7.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.7.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.7.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.7.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.7.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.8.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.8.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.8.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.8.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.8.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.8.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.9.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.9.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.9.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.9.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.9.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.9.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.mm_projector.layer_norm.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.layer_norm.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.expert_embeddings": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.0.0.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.0.0.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.0.2.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.0.2.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.1.0.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.1.0.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.1.2.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.1.2.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.2.0.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.2.0.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.2.2.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.2.2.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.3.0.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.3.0.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.3.2.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.3.2.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.4.0.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.4.0.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.4.2.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.4.2.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.5.0.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.5.0.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.5.2.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.5.2.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.6.0.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.6.0.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.6.2.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.6.2.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.7.0.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.7.0.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.7.2.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.7.2.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.gate.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.gate.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.inp_reduction.weight": "model-00002-of-00002.safetensors", + "model.norm.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.embeddings.patch_embedding.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.embeddings.patch_embedding.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.embeddings.position_embedding.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.self_attn.v_proj.weight": "model-00002-of-00002.safetensors" + } +} diff --git a/sft_pretrain/Full_xmoe/checkpoint-5198/rng_state_0.pth b/sft_pretrain/Full_xmoe/checkpoint-5198/rng_state_0.pth new file mode 100644 index 0000000000000000000000000000000000000000..ef4849062bcdc8ffd2246c07673ba196a8d61a6d --- /dev/null +++ b/sft_pretrain/Full_xmoe/checkpoint-5198/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fae2114fffe9b1eea30e28bbdb4ce59046b0079ea5b8dc4682079f609d49d787 +size 14960 diff --git a/sft_pretrain/Full_xmoe/checkpoint-5198/rng_state_1.pth b/sft_pretrain/Full_xmoe/checkpoint-5198/rng_state_1.pth new file mode 100644 index 0000000000000000000000000000000000000000..2fcb2b640bc236c26aa841680d34a91240247970 --- /dev/null +++ b/sft_pretrain/Full_xmoe/checkpoint-5198/rng_state_1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d4ff5f3a53530ac868291e2667c8f824bfa1f4fa1ce880df8223a7165ef38e11 +size 14960 diff --git a/sft_pretrain/Full_xmoe/checkpoint-5198/rng_state_2.pth b/sft_pretrain/Full_xmoe/checkpoint-5198/rng_state_2.pth new file mode 100644 index 0000000000000000000000000000000000000000..00c3f989de00e6d58ca7345ae6f65fee0afcbdcd --- /dev/null +++ b/sft_pretrain/Full_xmoe/checkpoint-5198/rng_state_2.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:91f80a7779b0034e70106ba6cb0e3e686052334c20ce54453ee3977cc0219d15 +size 14960 diff --git a/sft_pretrain/Full_xmoe/checkpoint-5198/rng_state_3.pth b/sft_pretrain/Full_xmoe/checkpoint-5198/rng_state_3.pth new file mode 100644 index 0000000000000000000000000000000000000000..f289913854ee3fa52a86e282421da07d85b8a4c4 --- /dev/null +++ b/sft_pretrain/Full_xmoe/checkpoint-5198/rng_state_3.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ece3bc0d0e16c43ef245cc787cbd0d63d08d460f489c4cd52adf6501b9281a18 +size 14960 diff --git a/sft_pretrain/Full_xmoe/checkpoint-5198/special_tokens_map.json b/sft_pretrain/Full_xmoe/checkpoint-5198/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..3e4d5a5bc1cb51753cc9ae0305ece0da60052b10 --- /dev/null +++ b/sft_pretrain/Full_xmoe/checkpoint-5198/special_tokens_map.json @@ -0,0 +1,24 @@ +{ + "bos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "<|endoftext|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": "", + "unk_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } +} diff --git a/sft_pretrain/Full_xmoe/checkpoint-5198/tokenizer.model b/sft_pretrain/Full_xmoe/checkpoint-5198/tokenizer.model new file mode 100644 index 0000000000000000000000000000000000000000..6c00c742ce03c627d6cd5b795984876fa49fa899 --- /dev/null +++ b/sft_pretrain/Full_xmoe/checkpoint-5198/tokenizer.model @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9e556afd44213b6bd1be2b850ebbbd98f5481437a8021afaf58ee7fb1818d347 +size 499723 diff --git a/sft_pretrain/Full_xmoe/checkpoint-5198/tokenizer_config.json b/sft_pretrain/Full_xmoe/checkpoint-5198/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..d579bb0b91b24b214ea3c2e487e27a65017cdc4a --- /dev/null +++ b/sft_pretrain/Full_xmoe/checkpoint-5198/tokenizer_config.json @@ -0,0 +1,132 @@ +{ + "add_bos_token": false, + "add_eos_token": false, + "add_prefix_space": true, + "added_tokens_decoder": { + "0": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "1": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "2": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": false + }, + "32000": { + "content": "<|endoftext|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "32001": { + "content": "<|assistant|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "32002": { + "content": "<|placeholder1|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "32003": { + "content": "<|placeholder2|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "32004": { + "content": "<|placeholder3|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "32005": { + "content": "<|placeholder4|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "32006": { + "content": "<|system|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "32007": { + "content": "<|end|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "32008": { + "content": "<|placeholder5|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "32009": { + "content": "<|placeholder6|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "32010": { + "content": "<|user|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + } + }, + "bos_token": "", + "chat_template": "{% for message in messages %}{% if message['role'] == 'system' and message['content'] %}{{'<|system|>\n' + message['content'] + '<|end|>\n'}}{% elif message['role'] == 'user' %}{{'<|user|>\n' + message['content'] + '<|end|>\n'}}{% elif message['role'] == 'assistant' %}{{'<|assistant|>\n' + message['content'] + '<|end|>\n'}}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ '<|assistant|>\n' }}{% else %}{{ eos_token }}{% endif %}", + "clean_up_tokenization_spaces": false, + "eos_token": "<|endoftext|>", + "legacy": false, + "model_max_length": 2048, + "pad_token": "", + "padding_side": "right", + "sp_model_kwargs": {}, + "spaces_between_special_tokens": false, + "tokenizer_class": "LlamaTokenizer", + "unk_token": "", + "use_default_system_prompt": false +} diff --git a/sft_pretrain/Full_xmoe/checkpoint-5198/trainer_state.json b/sft_pretrain/Full_xmoe/checkpoint-5198/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..e1858438d2ac34274779642f60b99310df16d043 --- /dev/null +++ b/sft_pretrain/Full_xmoe/checkpoint-5198/trainer_state.json @@ -0,0 +1,78003 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.0, + "eval_steps": 500, + "global_step": 5198, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0334678, + "balance_loss_mlp": 2.48847342, + "epoch": 0.00019238168526356292, + "flos": 471022563072.0, + "grad_norm": 15.010934477254423, + "language_loss": 2.91277003, + "learning_rate": 0.0, + "loss": 1.95375419, + "num_input_tokens_seen": 67104, + "router_z_loss_mlp": 8.6015625, + "step": 1, + "time_per_iteration": 23.313215732574463 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.03608113, + "balance_loss_mlp": 3.00043201, + "epoch": 0.00038476337052712584, + "flos": 505538830848.0, + "grad_norm": 25.821694542927546, + "language_loss": 10.7459116, + "learning_rate": 0.00013726078121135892, + "loss": 10.78199196, + "num_input_tokens_seen": 134080, + "router_z_loss_mlp": 6.06640625, + "step": 2, + "time_per_iteration": 2.6342098712921143 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.03648002, + "balance_loss_mlp": 3.03803182, + "epoch": 0.0005771450557906887, + "flos": 600334166016.0, + "grad_norm": 27.537763142134942, + "language_loss": 10.88985825, + "learning_rate": 0.00021755319103969496, + "loss": 10.9263401, + "num_input_tokens_seen": 205152, + "router_z_loss_mlp": 6.08984375, + "step": 3, + "time_per_iteration": 2.9129159450531006 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.03639085, + "balance_loss_mlp": 3.03521824, + "epoch": 0.0007695267410542517, + "flos": 581497386240.0, + "grad_norm": 10.719163482624658, + "language_loss": 8.79598808, + "learning_rate": 0.00027452156242271784, + "loss": 8.83237934, + "num_input_tokens_seen": 269664, + "router_z_loss_mlp": 6.02734375, + "step": 4, + "time_per_iteration": 2.72357439994812 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.03604871, + "balance_loss_mlp": 3.01435566, + "epoch": 0.0009619084263178145, + "flos": 487154061312.0, + "grad_norm": 22.68157363884245, + "language_loss": 9.41989708, + "learning_rate": 0.0003187096642208417, + "loss": 9.45594501, + "num_input_tokens_seen": 338560, + "router_z_loss_mlp": 5.8984375, + "step": 5, + "time_per_iteration": 2.6791844367980957 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.03472164, + "balance_loss_mlp": 2.9011035, + "epoch": 0.0011542901115813775, + "flos": 561167503872.0, + "grad_norm": 7.113488232519407, + "language_loss": 9.41725159, + "learning_rate": 0.0003548139722510539, + "loss": 9.45197296, + "num_input_tokens_seen": 410112, + "router_z_loss_mlp": 5.72265625, + "step": 6, + "time_per_iteration": 2.7308623790740967 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.03266853, + "balance_loss_mlp": 2.70799947, + "epoch": 0.0013466717968449403, + "flos": 534951738624.0, + "grad_norm": 3.189932925125429, + "language_loss": 8.01036549, + "learning_rate": 0.00038533972973918044, + "loss": 8.0430336, + "num_input_tokens_seen": 477552, + "router_z_loss_mlp": 5.59765625, + "step": 7, + "time_per_iteration": 2.6907436847686768 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02962571, + "balance_loss_mlp": 2.41211033, + "epoch": 0.0015390534821085034, + "flos": 493334485248.0, + "grad_norm": 5.13822781788523, + "language_loss": 7.84486008, + "learning_rate": 0.0004117823436340768, + "loss": 7.87448597, + "num_input_tokens_seen": 549184, + "router_z_loss_mlp": 5.51171875, + "step": 8, + "time_per_iteration": 2.6274044513702393 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02550478, + "balance_loss_mlp": 2.0114615, + "epoch": 0.0017314351673720662, + "flos": 565776090624.0, + "grad_norm": 3.8232757327488405, + "language_loss": 7.62468719, + "learning_rate": 0.00043510638207938993, + "loss": 7.65019178, + "num_input_tokens_seen": 622880, + "router_z_loss_mlp": 5.39453125, + "step": 9, + "time_per_iteration": 2.7688682079315186 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02337757, + "balance_loss_mlp": 1.81705093, + "epoch": 0.001923816852635629, + "flos": 594509521152.0, + "grad_norm": 3.0012265425900817, + "language_loss": 6.96830463, + "learning_rate": 0.00045597044543220066, + "loss": 6.99168253, + "num_input_tokens_seen": 693584, + "router_z_loss_mlp": 5.20703125, + "step": 10, + "time_per_iteration": 2.736985921859741 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02262083, + "balance_loss_mlp": 1.74290299, + "epoch": 0.002116198537899192, + "flos": 610895709696.0, + "grad_norm": 2.2728267884834983, + "language_loss": 6.92078686, + "learning_rate": 0.00047484428652143135, + "loss": 6.94340801, + "num_input_tokens_seen": 774432, + "router_z_loss_mlp": 5.19140625, + "step": 11, + "time_per_iteration": 2.8857340812683105 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02308547, + "balance_loss_mlp": 1.78135598, + "epoch": 0.002308580223162755, + "flos": 546175262976.0, + "grad_norm": 4.334726148282724, + "language_loss": 6.71077013, + "learning_rate": 0.0004920747534624128, + "loss": 6.73385572, + "num_input_tokens_seen": 844304, + "router_z_loss_mlp": 5.2734375, + "step": 12, + "time_per_iteration": 2.635601282119751 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02317905, + "balance_loss_mlp": 1.79147708, + "epoch": 0.002500961908426318, + "flos": 645924270336.0, + "grad_norm": 3.1568536142119923, + "language_loss": 6.53248501, + "learning_rate": 0.0005079252465375872, + "loss": 6.55566406, + "num_input_tokens_seen": 915104, + "router_z_loss_mlp": 5.265625, + "step": 13, + "time_per_iteration": 2.8112540245056152 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02242807, + "balance_loss_mlp": 1.72019386, + "epoch": 0.0026933435936898806, + "flos": 488849352960.0, + "grad_norm": 7.572425831928954, + "language_loss": 6.47189951, + "learning_rate": 0.0005226005109505393, + "loss": 6.49432755, + "num_input_tokens_seen": 982720, + "router_z_loss_mlp": 5.2265625, + "step": 14, + "time_per_iteration": 2.590078353881836 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02247915, + "balance_loss_mlp": 1.72415757, + "epoch": 0.0028857252789534437, + "flos": 435526429440.0, + "grad_norm": 2.3229781853457747, + "language_loss": 6.01724243, + "learning_rate": 0.0005362628552605367, + "loss": 6.03972149, + "num_input_tokens_seen": 1050528, + "router_z_loss_mlp": 5.23828125, + "step": 15, + "time_per_iteration": 2.636983871459961 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02135688, + "balance_loss_mlp": 1.62108541, + "epoch": 0.0030781069642170067, + "flos": 597841778688.0, + "grad_norm": 4.36506198708269, + "language_loss": 5.46747923, + "learning_rate": 0.0005490431248454357, + "loss": 5.48883629, + "num_input_tokens_seen": 1116512, + "router_z_loss_mlp": 5.14453125, + "step": 16, + "time_per_iteration": 2.6904103755950928 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02173305, + "balance_loss_mlp": 1.67586899, + "epoch": 0.0032704886494805694, + "flos": 1541513154048.0, + "grad_norm": 0.3693165783384919, + "language_loss": 0.75705111, + "learning_rate": 0.0005610483427624225, + "loss": 0.77878416, + "num_input_tokens_seen": 1351216, + "router_z_loss_mlp": 4.96875, + "step": 17, + "time_per_iteration": 6.815098285675049 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01958957, + "balance_loss_mlp": 1.45846832, + "epoch": 0.0034628703347441324, + "flos": 474971102976.0, + "grad_norm": 7.376330921510473, + "language_loss": 3.16160107, + "learning_rate": 0.0005723671632907488, + "loss": 3.18119049, + "num_input_tokens_seen": 1420512, + "router_z_loss_mlp": 5.0, + "step": 18, + "time_per_iteration": 2.7730185985565186 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01974299, + "balance_loss_mlp": 1.48144007, + "epoch": 0.0036552520200076955, + "flos": 449478556416.0, + "grad_norm": 2.0435067055151803, + "language_loss": 1.8205657, + "learning_rate": 0.0005830738490244919, + "loss": 1.84030867, + "num_input_tokens_seen": 1484976, + "router_z_loss_mlp": 4.921875, + "step": 19, + "time_per_iteration": 2.5196421146392822 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02215561, + "balance_loss_mlp": 1.73147547, + "epoch": 0.003847633705271258, + "flos": 637351580928.0, + "grad_norm": 2.199322832792736, + "language_loss": 1.81859815, + "learning_rate": 0.0005932312266435596, + "loss": 1.84075379, + "num_input_tokens_seen": 1557392, + "router_z_loss_mlp": 4.83203125, + "step": 20, + "time_per_iteration": 2.7772061824798584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02397049, + "balance_loss_mlp": 1.91639686, + "epoch": 0.004040015390534821, + "flos": 590591105280.0, + "grad_norm": 2.068137361611091, + "language_loss": 1.81285238, + "learning_rate": 0.0006028929207788754, + "loss": 1.83682299, + "num_input_tokens_seen": 1626064, + "router_z_loss_mlp": 4.796875, + "step": 21, + "time_per_iteration": 2.7197327613830566 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02949394, + "balance_loss_mlp": 2.47560835, + "epoch": 0.004232397075798384, + "flos": 757866929664.0, + "grad_norm": 0.9893066861855494, + "language_loss": 1.43565178, + "learning_rate": 0.0006121050677327902, + "loss": 1.46514571, + "num_input_tokens_seen": 1696528, + "router_z_loss_mlp": 4.7265625, + "step": 22, + "time_per_iteration": 2.8821635246276855 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.04240368, + "balance_loss_mlp": 3.77421188, + "epoch": 0.004424778761061947, + "flos": 527727310080.0, + "grad_norm": 1.6702760591351544, + "language_loss": 1.36044598, + "learning_rate": 0.0006209076479463684, + "loss": 1.40284979, + "num_input_tokens_seen": 1765936, + "router_z_loss_mlp": 4.6484375, + "step": 23, + "time_per_iteration": 2.6194069385528564 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.04405254, + "balance_loss_mlp": 3.93871665, + "epoch": 0.00461716044632551, + "flos": 549218815488.0, + "grad_norm": 1.6356367296774819, + "language_loss": 1.46302319, + "learning_rate": 0.0006293355346737718, + "loss": 1.50707567, + "num_input_tokens_seen": 1841632, + "router_z_loss_mlp": 4.65234375, + "step": 24, + "time_per_iteration": 2.741433620452881 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.03977472, + "balance_loss_mlp": 3.50483179, + "epoch": 0.004809542131589073, + "flos": 568752569088.0, + "grad_norm": 1.079559317914091, + "language_loss": 1.33177948, + "learning_rate": 0.0006374193284416834, + "loss": 1.37155437, + "num_input_tokens_seen": 1920256, + "router_z_loss_mlp": 4.71484375, + "step": 25, + "time_per_iteration": 2.902089834213257 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.03127712, + "balance_loss_mlp": 2.642483, + "epoch": 0.005001923816852636, + "flos": 471584410368.0, + "grad_norm": 0.4847890845471295, + "language_loss": 1.26058078, + "learning_rate": 0.0006451860277489461, + "loss": 1.29185796, + "num_input_tokens_seen": 1986528, + "router_z_loss_mlp": 4.84375, + "step": 26, + "time_per_iteration": 2.6045680046081543 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02733563, + "balance_loss_mlp": 2.23154879, + "epoch": 0.005194305502116198, + "flos": 416381502720.0, + "grad_norm": 0.2845036760864029, + "language_loss": 1.33193052, + "learning_rate": 0.0006526595731190848, + "loss": 1.35926616, + "num_input_tokens_seen": 2048016, + "router_z_loss_mlp": 5.015625, + "step": 27, + "time_per_iteration": 2.4412264823913574 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02759137, + "balance_loss_mlp": 2.2411015, + "epoch": 0.005386687187379761, + "flos": 629996894976.0, + "grad_norm": 0.34713687972437796, + "language_loss": 1.22031224, + "learning_rate": 0.0006598612921618983, + "loss": 1.24790359, + "num_input_tokens_seen": 2127664, + "router_z_loss_mlp": 5.1796875, + "step": 28, + "time_per_iteration": 2.80483078956604 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02575766, + "balance_loss_mlp": 2.05010033, + "epoch": 0.005579068872643324, + "flos": 888021326592.0, + "grad_norm": 0.3062478898066755, + "language_loss": 1.16221631, + "learning_rate": 0.0006668102665011454, + "loss": 1.18797398, + "num_input_tokens_seen": 2213952, + "router_z_loss_mlp": 5.2578125, + "step": 29, + "time_per_iteration": 3.243164300918579 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02507804, + "balance_loss_mlp": 1.97527242, + "epoch": 0.005771450557906887, + "flos": 548658902016.0, + "grad_norm": 0.22276861521731073, + "language_loss": 1.24634933, + "learning_rate": 0.0006735236364718957, + "loss": 1.27142727, + "num_input_tokens_seen": 2284736, + "router_z_loss_mlp": 5.328125, + "step": 30, + "time_per_iteration": 2.7701382637023926 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02465182, + "balance_loss_mlp": 1.93226886, + "epoch": 0.00596383224317045, + "flos": 533069809152.0, + "grad_norm": 0.21102664747409663, + "language_loss": 1.23222375, + "learning_rate": 0.0006800168558381346, + "loss": 1.25687563, + "num_input_tokens_seen": 2354384, + "router_z_loss_mlp": 5.33203125, + "step": 31, + "time_per_iteration": 2.635246515274048 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02445382, + "balance_loss_mlp": 1.91552007, + "epoch": 0.0061562139284340135, + "flos": 590163394560.0, + "grad_norm": 0.21886797396213825, + "language_loss": 1.26610851, + "learning_rate": 0.0006863039060567947, + "loss": 1.29056239, + "num_input_tokens_seen": 2419440, + "router_z_loss_mlp": 5.30078125, + "step": 32, + "time_per_iteration": 2.7791683673858643 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02338603, + "balance_loss_mlp": 1.80950415, + "epoch": 0.006348595613697576, + "flos": 619442154240.0, + "grad_norm": 0.18971916612404452, + "language_loss": 1.17543316, + "learning_rate": 0.0006923974775611263, + "loss": 1.19881916, + "num_input_tokens_seen": 2496368, + "router_z_loss_mlp": 5.29296875, + "step": 33, + "time_per_iteration": 2.836601495742798 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02160521, + "balance_loss_mlp": 1.64134097, + "epoch": 0.006540977298961139, + "flos": 779300109312.0, + "grad_norm": 0.13369632510289112, + "language_loss": 1.13907146, + "learning_rate": 0.0006983091239737814, + "loss": 1.16067672, + "num_input_tokens_seen": 2573280, + "router_z_loss_mlp": 5.19140625, + "step": 34, + "time_per_iteration": 3.021479606628418 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0221033, + "balance_loss_mlp": 1.69649041, + "epoch": 0.006733358984224702, + "flos": 668373264384.0, + "grad_norm": 0.11522706717853448, + "language_loss": 1.11973858, + "learning_rate": 0.0007040493939600222, + "loss": 1.14184177, + "num_input_tokens_seen": 2647248, + "router_z_loss_mlp": 5.13671875, + "step": 35, + "time_per_iteration": 2.9400346279144287 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0227657, + "balance_loss_mlp": 1.76997864, + "epoch": 0.006925740669488265, + "flos": 565496133888.0, + "grad_norm": 0.11143421895921844, + "language_loss": 1.12295914, + "learning_rate": 0.0007096279445021078, + "loss": 1.14572477, + "num_input_tokens_seen": 2720736, + "router_z_loss_mlp": 5.0625, + "step": 36, + "time_per_iteration": 2.698153495788574 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02284885, + "balance_loss_mlp": 1.78668559, + "epoch": 0.007118122354751828, + "flos": 551112405504.0, + "grad_norm": 0.11733654674395574, + "language_loss": 1.1734066, + "learning_rate": 0.0007150536386503726, + "loss": 1.19625545, + "num_input_tokens_seen": 2800336, + "router_z_loss_mlp": 4.9765625, + "step": 37, + "time_per_iteration": 2.8579084873199463 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02268399, + "balance_loss_mlp": 1.77782845, + "epoch": 0.007310504040015391, + "flos": 703814951424.0, + "grad_norm": 0.14208952684155102, + "language_loss": 1.10088778, + "learning_rate": 0.0007203346302358509, + "loss": 1.12357187, + "num_input_tokens_seen": 2883184, + "router_z_loss_mlp": 4.8984375, + "step": 38, + "time_per_iteration": 2.928835391998291 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02220017, + "balance_loss_mlp": 1.73555112, + "epoch": 0.007502885725278953, + "flos": 600501361920.0, + "grad_norm": 0.142042154575746, + "language_loss": 1.15486813, + "learning_rate": 0.000725478437577282, + "loss": 1.17706823, + "num_input_tokens_seen": 2960736, + "router_z_loss_mlp": 4.8359375, + "step": 39, + "time_per_iteration": 2.8706436157226562 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0209897, + "balance_loss_mlp": 1.62251425, + "epoch": 0.007695267410542516, + "flos": 561428018688.0, + "grad_norm": 0.13255726845543458, + "language_loss": 1.10233212, + "learning_rate": 0.0007304920078549186, + "loss": 1.12332189, + "num_input_tokens_seen": 3033472, + "router_z_loss_mlp": 4.75390625, + "step": 40, + "time_per_iteration": 2.6895179748535156 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01939831, + "balance_loss_mlp": 1.46986008, + "epoch": 0.007887649095806078, + "flos": 509231725056.0, + "grad_norm": 0.11166218824526469, + "language_loss": 1.12161303, + "learning_rate": 0.0007353817735343603, + "loss": 1.14101124, + "num_input_tokens_seen": 3107824, + "router_z_loss_mlp": 4.6875, + "step": 41, + "time_per_iteration": 2.709167957305908 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0184399, + "balance_loss_mlp": 1.3778342, + "epoch": 0.008080030781069641, + "flos": 504905040384.0, + "grad_norm": 0.06254207778511488, + "language_loss": 1.07663667, + "learning_rate": 0.0007401537019902344, + "loss": 1.09507656, + "num_input_tokens_seen": 3176528, + "router_z_loss_mlp": 4.6484375, + "step": 42, + "time_per_iteration": 2.5947837829589844 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01789021, + "balance_loss_mlp": 1.32896876, + "epoch": 0.008272412466333205, + "flos": 519106988544.0, + "grad_norm": 0.07012531219711775, + "language_loss": 1.09992051, + "learning_rate": 0.0007448133392900729, + "loss": 1.11781073, + "num_input_tokens_seen": 3254256, + "router_z_loss_mlp": 4.5859375, + "step": 43, + "time_per_iteration": 2.6997878551483154 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01787217, + "balance_loss_mlp": 1.32983518, + "epoch": 0.008464794151596768, + "flos": 609184866816.0, + "grad_norm": 0.09276066699658307, + "language_loss": 1.05755496, + "learning_rate": 0.0007493658489441491, + "loss": 1.07542706, + "num_input_tokens_seen": 3340224, + "router_z_loss_mlp": 4.56640625, + "step": 44, + "time_per_iteration": 2.8852477073669434 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0177156, + "balance_loss_mlp": 1.31913674, + "epoch": 0.00865717583686033, + "flos": 539007214848.0, + "grad_norm": 0.11478380715178954, + "language_loss": 1.09959674, + "learning_rate": 0.0007538160463002316, + "loss": 1.11731243, + "num_input_tokens_seen": 3409216, + "router_z_loss_mlp": 4.53125, + "step": 45, + "time_per_iteration": 2.685568332672119 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01802016, + "balance_loss_mlp": 1.35378933, + "epoch": 0.008849557522123894, + "flos": 509010094080.0, + "grad_norm": 0.14537339285711792, + "language_loss": 1.13533509, + "learning_rate": 0.0007581684291577274, + "loss": 1.15335524, + "num_input_tokens_seen": 3478352, + "router_z_loss_mlp": 4.49609375, + "step": 46, + "time_per_iteration": 2.5798568725585938 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01764716, + "balance_loss_mlp": 1.31915987, + "epoch": 0.009041939207387457, + "flos": 626508135168.0, + "grad_norm": 0.13285081251714825, + "language_loss": 1.15270185, + "learning_rate": 0.0007624272050891776, + "loss": 1.17034888, + "num_input_tokens_seen": 3555616, + "router_z_loss_mlp": 4.46875, + "step": 47, + "time_per_iteration": 2.822632312774658 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0175788, + "balance_loss_mlp": 1.31461263, + "epoch": 0.00923432089265102, + "flos": 550610817792.0, + "grad_norm": 0.11934546954286276, + "language_loss": 1.04916859, + "learning_rate": 0.0007665963158851307, + "loss": 1.06674731, + "num_input_tokens_seen": 3634512, + "router_z_loss_mlp": 4.4453125, + "step": 48, + "time_per_iteration": 2.7924864292144775 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01741735, + "balance_loss_mlp": 1.29846764, + "epoch": 0.009426702577914583, + "flos": 563679333120.0, + "grad_norm": 0.08548395668661983, + "language_loss": 1.13647461, + "learning_rate": 0.0007706794594783609, + "loss": 1.15389204, + "num_input_tokens_seen": 3708480, + "router_z_loss_mlp": 4.4453125, + "step": 49, + "time_per_iteration": 2.734813928604126 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01727457, + "balance_loss_mlp": 1.28838515, + "epoch": 0.009619084263178146, + "flos": 617926697472.0, + "grad_norm": 0.06892583067190382, + "language_loss": 1.12110853, + "learning_rate": 0.0007746801096530423, + "loss": 1.13838315, + "num_input_tokens_seen": 3783472, + "router_z_loss_mlp": 4.40234375, + "step": 50, + "time_per_iteration": 2.7447421550750732 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01719129, + "balance_loss_mlp": 1.28043914, + "epoch": 0.009811465948441709, + "flos": 542489171712.0, + "grad_norm": 0.04778558244894799, + "language_loss": 1.16797209, + "learning_rate": 0.0007786015338021173, + "loss": 1.1851635, + "num_input_tokens_seen": 3851360, + "router_z_loss_mlp": 4.3984375, + "step": 51, + "time_per_iteration": 2.65645694732666 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01722789, + "balance_loss_mlp": 1.28562462, + "epoch": 0.010003847633705272, + "flos": 536977531392.0, + "grad_norm": 0.06217135289779639, + "language_loss": 1.09074998, + "learning_rate": 0.0007824468089603051, + "loss": 1.10797799, + "num_input_tokens_seen": 3923056, + "router_z_loss_mlp": 4.3828125, + "step": 52, + "time_per_iteration": 2.7218713760375977 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01697539, + "balance_loss_mlp": 1.26380801, + "epoch": 0.010196229318968833, + "flos": 910806657792.0, + "grad_norm": 0.04206474108062499, + "language_loss": 1.08130515, + "learning_rate": 0.0007862188363098669, + "loss": 1.09828055, + "num_input_tokens_seen": 4004528, + "router_z_loss_mlp": 4.34765625, + "step": 53, + "time_per_iteration": 3.149973154067993 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01668333, + "balance_loss_mlp": 1.23765349, + "epoch": 0.010388611004232396, + "flos": 586970142720.0, + "grad_norm": 0.050634309517598654, + "language_loss": 1.08688021, + "learning_rate": 0.0007899203543304438, + "loss": 1.10356343, + "num_input_tokens_seen": 4078704, + "router_z_loss_mlp": 4.31640625, + "step": 54, + "time_per_iteration": 2.7033088207244873 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01691162, + "balance_loss_mlp": 1.26315343, + "epoch": 0.01058099268949596, + "flos": 503472208896.0, + "grad_norm": 0.06464656169002964, + "language_loss": 1.22991037, + "learning_rate": 0.0007935539507422731, + "loss": 1.246822, + "num_input_tokens_seen": 4143600, + "router_z_loss_mlp": 4.2890625, + "step": 55, + "time_per_iteration": 2.601745843887329 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.017059, + "balance_loss_mlp": 1.28017938, + "epoch": 0.010773374374759523, + "flos": 545558969088.0, + "grad_norm": 0.06403483907250343, + "language_loss": 1.12561536, + "learning_rate": 0.0007971220733732573, + "loss": 1.14267421, + "num_input_tokens_seen": 4217904, + "router_z_loss_mlp": 4.265625, + "step": 56, + "time_per_iteration": 2.677314281463623 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0169453, + "balance_loss_mlp": 1.27262425, + "epoch": 0.010965756060023086, + "flos": 527286960384.0, + "grad_norm": 0.061369678053330295, + "language_loss": 1.07931721, + "learning_rate": 0.0008006270400641869, + "loss": 1.09626245, + "num_input_tokens_seen": 4293920, + "router_z_loss_mlp": 4.2265625, + "step": 57, + "time_per_iteration": 2.7162468433380127 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01699229, + "balance_loss_mlp": 1.27846837, + "epoch": 0.011158137745286649, + "flos": 578098054656.0, + "grad_norm": 0.06126094216688289, + "language_loss": 1.08923888, + "learning_rate": 0.0008040710477125043, + "loss": 1.10623109, + "num_input_tokens_seen": 4370080, + "router_z_loss_mlp": 4.21484375, + "step": 58, + "time_per_iteration": 2.724116563796997 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01648065, + "balance_loss_mlp": 1.23150039, + "epoch": 0.011350519430550212, + "flos": 530314961664.0, + "grad_norm": 0.059594432794803906, + "language_loss": 1.09501219, + "learning_rate": 0.0008074561805429771, + "loss": 1.11149275, + "num_input_tokens_seen": 4439792, + "router_z_loss_mlp": 4.171875, + "step": 59, + "time_per_iteration": 2.613821268081665 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01628698, + "balance_loss_mlp": 1.21594822, + "epoch": 0.011542901115813775, + "flos": 556971076608.0, + "grad_norm": 0.046387810099464834, + "language_loss": 1.0703913, + "learning_rate": 0.0008107844176832545, + "loss": 1.08667827, + "num_input_tokens_seen": 4510800, + "router_z_loss_mlp": 4.1328125, + "step": 60, + "time_per_iteration": 2.6809566020965576 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01602811, + "balance_loss_mlp": 1.19349384, + "epoch": 0.011735282801077338, + "flos": 573176463360.0, + "grad_norm": 0.036957475185327084, + "language_loss": 1.08104563, + "learning_rate": 0.0008140576401132568, + "loss": 1.09707379, + "num_input_tokens_seen": 4581136, + "router_z_loss_mlp": 4.09765625, + "step": 61, + "time_per_iteration": 2.644085645675659 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01596506, + "balance_loss_mlp": 1.19024038, + "epoch": 0.0119276644863409, + "flos": 616717442304.0, + "grad_norm": 0.034032461682055544, + "language_loss": 1.09685671, + "learning_rate": 0.0008172776370494935, + "loss": 1.11282182, + "num_input_tokens_seen": 4650352, + "router_z_loss_mlp": 4.06640625, + "step": 62, + "time_per_iteration": 2.7589328289031982 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01605764, + "balance_loss_mlp": 1.20255029, + "epoch": 0.012120046171604464, + "flos": 502085064192.0, + "grad_norm": 0.035968497482949544, + "language_loss": 1.17104983, + "learning_rate": 0.0008204461118185703, + "loss": 1.18710756, + "num_input_tokens_seen": 4716336, + "router_z_loss_mlp": 4.03515625, + "step": 63, + "time_per_iteration": 2.594369411468506 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01603311, + "balance_loss_mlp": 1.20353031, + "epoch": 0.012312427856868027, + "flos": 474302319360.0, + "grad_norm": 0.04911792883083492, + "language_loss": 1.06295228, + "learning_rate": 0.0008235646872681536, + "loss": 1.07898545, + "num_input_tokens_seen": 4781648, + "router_z_loss_mlp": 3.99609375, + "step": 64, + "time_per_iteration": 2.5651702880859375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01599528, + "balance_loss_mlp": 1.20279896, + "epoch": 0.012504809542131588, + "flos": 539471864064.0, + "grad_norm": 0.049725750424410776, + "language_loss": 1.06296277, + "learning_rate": 0.0008266349107584288, + "loss": 1.07895803, + "num_input_tokens_seen": 4852320, + "router_z_loss_mlp": 3.95898438, + "step": 65, + "time_per_iteration": 2.6876485347747803 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01596697, + "balance_loss_mlp": 1.20492756, + "epoch": 0.012697191227395151, + "flos": 609857541120.0, + "grad_norm": 0.056540756097456804, + "language_loss": 1.08585978, + "learning_rate": 0.0008296582587724851, + "loss": 1.10182667, + "num_input_tokens_seen": 4922016, + "router_z_loss_mlp": 3.91210938, + "step": 66, + "time_per_iteration": 2.71223783493042 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01587883, + "balance_loss_mlp": 1.19821179, + "epoch": 0.012889572912658714, + "flos": 769398600960.0, + "grad_norm": 0.04465917834699911, + "language_loss": 1.0627861, + "learning_rate": 0.0008326361411800136, + "loss": 1.07866502, + "num_input_tokens_seen": 5000128, + "router_z_loss_mlp": 3.89648438, + "step": 67, + "time_per_iteration": 2.9413115978240967 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01577237, + "balance_loss_mlp": 1.19099891, + "epoch": 0.013081954597922277, + "flos": 535021724928.0, + "grad_norm": 0.05343660826588632, + "language_loss": 1.06744349, + "learning_rate": 0.0008355699051851403, + "loss": 1.08321595, + "num_input_tokens_seen": 5074512, + "router_z_loss_mlp": 3.86132812, + "step": 68, + "time_per_iteration": 2.726212501525879 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0157129, + "balance_loss_mlp": 1.18829489, + "epoch": 0.01327433628318584, + "flos": 574181584128.0, + "grad_norm": 0.041490887209285586, + "language_loss": 1.14052749, + "learning_rate": 0.0008384608389860635, + "loss": 1.15624034, + "num_input_tokens_seen": 5141856, + "router_z_loss_mlp": 3.828125, + "step": 69, + "time_per_iteration": 2.6679208278656006 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0156381, + "balance_loss_mlp": 1.18386579, + "epoch": 0.013466717968449404, + "flos": 498259967232.0, + "grad_norm": 0.03618836919088814, + "language_loss": 1.04182374, + "learning_rate": 0.000841310175171381, + "loss": 1.05746174, + "num_input_tokens_seen": 5209280, + "router_z_loss_mlp": 3.796875, + "step": 70, + "time_per_iteration": 2.6277127265930176 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01563963, + "balance_loss_mlp": 1.18592632, + "epoch": 0.013659099653712967, + "flos": 566622763776.0, + "grad_norm": 0.04320101591589407, + "language_loss": 1.02295327, + "learning_rate": 0.000844119093875517, + "loss": 1.03859293, + "num_input_tokens_seen": 5285424, + "router_z_loss_mlp": 3.77734375, + "step": 71, + "time_per_iteration": 2.7236883640289307 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01558639, + "balance_loss_mlp": 1.18403625, + "epoch": 0.01385148133897653, + "flos": 574943686656.0, + "grad_norm": 0.03416580025853519, + "language_loss": 1.06855714, + "learning_rate": 0.0008468887257134666, + "loss": 1.08414352, + "num_input_tokens_seen": 5358624, + "router_z_loss_mlp": 3.7421875, + "step": 72, + "time_per_iteration": 2.6696412563323975 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01558456, + "balance_loss_mlp": 1.18499684, + "epoch": 0.014043863024240093, + "flos": 577959048960.0, + "grad_norm": 0.037886537215891476, + "language_loss": 1.09368944, + "learning_rate": 0.0008496201545131264, + "loss": 1.10927403, + "num_input_tokens_seen": 5429792, + "router_z_loss_mlp": 3.73046875, + "step": 73, + "time_per_iteration": 2.701594591140747 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01545785, + "balance_loss_mlp": 1.17575896, + "epoch": 0.014236244709503656, + "flos": 940265252352.0, + "grad_norm": 0.04766211184506119, + "language_loss": 1.07240248, + "learning_rate": 0.0008523144198617317, + "loss": 1.08786011, + "num_input_tokens_seen": 5518608, + "router_z_loss_mlp": 3.6953125, + "step": 74, + "time_per_iteration": 3.1882145404815674 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01551426, + "balance_loss_mlp": 1.18387985, + "epoch": 0.014428626394767219, + "flos": 529496478720.0, + "grad_norm": 0.031986864242930464, + "language_loss": 1.06216824, + "learning_rate": 0.0008549725194813783, + "loss": 1.0776825, + "num_input_tokens_seen": 5590576, + "router_z_loss_mlp": 3.66992188, + "step": 75, + "time_per_iteration": 2.666274309158325 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01546662, + "balance_loss_mlp": 1.18102288, + "epoch": 0.014621008080030782, + "flos": 805283549952.0, + "grad_norm": 0.03321604497436844, + "language_loss": 1.05779314, + "learning_rate": 0.0008575954114472099, + "loss": 1.07325983, + "num_input_tokens_seen": 5674224, + "router_z_loss_mlp": 3.65039062, + "step": 76, + "time_per_iteration": 3.1192731857299805 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01547179, + "balance_loss_mlp": 1.18478322, + "epoch": 0.014813389765294343, + "flos": 698357746176.0, + "grad_norm": 0.03477979781895141, + "language_loss": 1.02737951, + "learning_rate": 0.0008601840162606118, + "loss": 1.04285145, + "num_input_tokens_seen": 5757648, + "router_z_loss_mlp": 3.6171875, + "step": 77, + "time_per_iteration": 3.0015783309936523 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01547226, + "balance_loss_mlp": 1.18788171, + "epoch": 0.015005771450557906, + "flos": 598165476864.0, + "grad_norm": 0.032631512960834254, + "language_loss": 1.09477437, + "learning_rate": 0.000862739218788641, + "loss": 1.11024666, + "num_input_tokens_seen": 5837600, + "router_z_loss_mlp": 3.58984375, + "step": 78, + "time_per_iteration": 2.790245771408081 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01536731, + "balance_loss_mlp": 1.18177319, + "epoch": 0.01519815313582147, + "flos": 550493199360.0, + "grad_norm": 0.0308447873241268, + "language_loss": 1.07131243, + "learning_rate": 0.0008652618700799138, + "loss": 1.0866797, + "num_input_tokens_seen": 5907248, + "router_z_loss_mlp": 3.55664062, + "step": 79, + "time_per_iteration": 2.6302430629730225 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01532812, + "balance_loss_mlp": 1.18033433, + "epoch": 0.015390534821085032, + "flos": 431440817664.0, + "grad_norm": 0.04595099678969376, + "language_loss": 1.06556606, + "learning_rate": 0.0008677527890662774, + "loss": 1.08089423, + "num_input_tokens_seen": 5970864, + "router_z_loss_mlp": 3.53125, + "step": 80, + "time_per_iteration": 2.4970459938049316 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01520539, + "balance_loss_mlp": 1.17130363, + "epoch": 0.015582916506348595, + "flos": 525185345280.0, + "grad_norm": 0.030530536654869142, + "language_loss": 1.07461143, + "learning_rate": 0.0008702127641587799, + "loss": 1.08981681, + "num_input_tokens_seen": 6040800, + "router_z_loss_mlp": 3.49804688, + "step": 81, + "time_per_iteration": 2.6258630752563477 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01512144, + "balance_loss_mlp": 1.16500628, + "epoch": 0.015775298191612157, + "flos": 576617591040.0, + "grad_norm": 0.026948447424875538, + "language_loss": 1.02672768, + "learning_rate": 0.0008726425547457192, + "loss": 1.04184914, + "num_input_tokens_seen": 6111840, + "router_z_loss_mlp": 3.4765625, + "step": 82, + "time_per_iteration": 2.7344956398010254 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01517079, + "balance_loss_mlp": 1.17375636, + "epoch": 0.01596767987687572, + "flos": 611440071936.0, + "grad_norm": 0.03479426421062965, + "language_loss": 1.02940345, + "learning_rate": 0.0008750428925998964, + "loss": 1.04457426, + "num_input_tokens_seen": 6183872, + "router_z_loss_mlp": 3.4375, + "step": 83, + "time_per_iteration": 2.738685369491577 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01509349, + "balance_loss_mlp": 1.16850555, + "epoch": 0.016160061562139283, + "flos": 568233484800.0, + "grad_norm": 0.05178756375238081, + "language_loss": 1.08039558, + "learning_rate": 0.0008774144832015932, + "loss": 1.09548914, + "num_input_tokens_seen": 6255760, + "router_z_loss_mlp": 3.41210938, + "step": 84, + "time_per_iteration": 2.6948299407958984 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02575775, + "balance_loss_mlp": 2.26144409, + "epoch": 0.016352443247402846, + "flos": 1414502431488.0, + "grad_norm": 0.37456313977874084, + "language_loss": 0.74774313, + "learning_rate": 0.0008797580069832641, + "loss": 0.7735008, + "num_input_tokens_seen": 6472960, + "router_z_loss_mlp": 3.140625, + "step": 85, + "time_per_iteration": 4.596364974975586 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01517697, + "balance_loss_mlp": 1.17895198, + "epoch": 0.01654482493266641, + "flos": 731786279424.0, + "grad_norm": 0.04138572693056026, + "language_loss": 1.03059626, + "learning_rate": 0.0008820741205014318, + "loss": 1.04577315, + "num_input_tokens_seen": 6548912, + "router_z_loss_mlp": 3.390625, + "step": 86, + "time_per_iteration": 2.901047706604004 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01566516, + "balance_loss_mlp": 1.22757995, + "epoch": 0.016737206617929972, + "flos": 537405242112.0, + "grad_norm": 0.0588613682629828, + "language_loss": 1.04849172, + "learning_rate": 0.0008843634575408404, + "loss": 1.06415701, + "num_input_tokens_seen": 6621520, + "router_z_loss_mlp": 3.39257812, + "step": 87, + "time_per_iteration": 2.6739823818206787 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01583525, + "balance_loss_mlp": 1.24497032, + "epoch": 0.016929588303193535, + "flos": 538130406144.0, + "grad_norm": 0.09131872689500015, + "language_loss": 1.06101418, + "learning_rate": 0.0008866266301555082, + "loss": 1.07684946, + "num_input_tokens_seen": 6698432, + "router_z_loss_mlp": 3.38867188, + "step": 88, + "time_per_iteration": 2.741093635559082 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0156783, + "balance_loss_mlp": 1.23118281, + "epoch": 0.017121969988457098, + "flos": 527792438784.0, + "grad_norm": 0.07103005743700296, + "language_loss": 1.07027078, + "learning_rate": 0.0008888642296509615, + "loss": 1.08594918, + "num_input_tokens_seen": 6764336, + "router_z_loss_mlp": 3.36914062, + "step": 89, + "time_per_iteration": 2.622267007827759 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01554346, + "balance_loss_mlp": 1.2196058, + "epoch": 0.01731435167372066, + "flos": 626768649984.0, + "grad_norm": 0.057543283798364535, + "language_loss": 1.11941445, + "learning_rate": 0.0008910768275115906, + "loss": 1.13495779, + "num_input_tokens_seen": 6839392, + "router_z_loss_mlp": 3.34960938, + "step": 90, + "time_per_iteration": 2.778939962387085 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01536545, + "balance_loss_mlp": 1.20409441, + "epoch": 0.017506733358984224, + "flos": 497385103872.0, + "grad_norm": 0.06951140803051024, + "language_loss": 1.07318401, + "learning_rate": 0.0008932649762767675, + "loss": 1.08854938, + "num_input_tokens_seen": 6907344, + "router_z_loss_mlp": 3.32617188, + "step": 91, + "time_per_iteration": 2.5841660499572754 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01529864, + "balance_loss_mlp": 1.20122755, + "epoch": 0.017699115044247787, + "flos": 747218870016.0, + "grad_norm": 0.037985069994816135, + "language_loss": 1.10022223, + "learning_rate": 0.0008954292103690864, + "loss": 1.11552095, + "num_input_tokens_seen": 6982464, + "router_z_loss_mlp": 3.28710938, + "step": 92, + "time_per_iteration": 2.976200580596924 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01525091, + "balance_loss_mlp": 1.19893408, + "epoch": 0.01789149672951135, + "flos": 516521282304.0, + "grad_norm": 0.05507041657686672, + "language_loss": 1.1172272, + "learning_rate": 0.0008975700468778296, + "loss": 1.13247812, + "num_input_tokens_seen": 7049712, + "router_z_loss_mlp": 3.26171875, + "step": 93, + "time_per_iteration": 2.5778274536132812 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01518338, + "balance_loss_mlp": 1.19427943, + "epoch": 0.018083878414774913, + "flos": 587230657536.0, + "grad_norm": 0.047907590915393955, + "language_loss": 1.05762661, + "learning_rate": 0.0008996879863005366, + "loss": 1.07280993, + "num_input_tokens_seen": 7120288, + "router_z_loss_mlp": 3.24023438, + "step": 94, + "time_per_iteration": 2.6827101707458496 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01506508, + "balance_loss_mlp": 1.18664575, + "epoch": 0.018276260100038477, + "flos": 498370782720.0, + "grad_norm": 0.03950158468897577, + "language_loss": 1.05640411, + "learning_rate": 0.0009017835132453337, + "loss": 1.07146931, + "num_input_tokens_seen": 7188896, + "router_z_loss_mlp": 3.19726562, + "step": 95, + "time_per_iteration": 2.5879104137420654 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01488471, + "balance_loss_mlp": 1.17223215, + "epoch": 0.01846864178530204, + "flos": 641233058304.0, + "grad_norm": 0.042611409633865054, + "language_loss": 1.05607677, + "learning_rate": 0.0009038570970964896, + "loss": 1.07096148, + "num_input_tokens_seen": 7259536, + "router_z_loss_mlp": 3.16015625, + "step": 96, + "time_per_iteration": 2.761634349822998 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01487316, + "balance_loss_mlp": 1.17374837, + "epoch": 0.018661023470565603, + "flos": 512667995136.0, + "grad_norm": 0.026597294022958493, + "language_loss": 1.02809072, + "learning_rate": 0.0009059091926454854, + "loss": 1.04296374, + "num_input_tokens_seen": 7326752, + "router_z_loss_mlp": 3.1328125, + "step": 97, + "time_per_iteration": 2.602036952972412 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01487556, + "balance_loss_mlp": 1.17742097, + "epoch": 0.018853405155829166, + "flos": 932697683712.0, + "grad_norm": 0.04097414840704221, + "language_loss": 1.01764143, + "learning_rate": 0.0009079402406897198, + "loss": 1.03251696, + "num_input_tokens_seen": 7417488, + "router_z_loss_mlp": 3.09765625, + "step": 98, + "time_per_iteration": 3.2514705657958984 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01483888, + "balance_loss_mlp": 1.17642295, + "epoch": 0.01904578684109273, + "flos": 577587718656.0, + "grad_norm": 0.027217181555243938, + "language_loss": 1.03385735, + "learning_rate": 0.0009099506686008212, + "loss": 1.04869628, + "num_input_tokens_seen": 7493136, + "router_z_loss_mlp": 3.0703125, + "step": 99, + "time_per_iteration": 2.7867672443389893 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01473245, + "balance_loss_mlp": 1.16883183, + "epoch": 0.019238168526356292, + "flos": 559521789696.0, + "grad_norm": 0.02943095981266107, + "language_loss": 1.06245995, + "learning_rate": 0.0009119408908644013, + "loss": 1.07719231, + "num_input_tokens_seen": 7560896, + "router_z_loss_mlp": 3.0390625, + "step": 100, + "time_per_iteration": 2.718982219696045 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01466855, + "balance_loss_mlp": 1.164922, + "epoch": 0.019430550211619855, + "flos": 725104267776.0, + "grad_norm": 0.035830377247789626, + "language_loss": 1.12020779, + "learning_rate": 0.0009139113095929519, + "loss": 1.13487625, + "num_input_tokens_seen": 7629040, + "router_z_loss_mlp": 3.01367188, + "step": 101, + "time_per_iteration": 2.9023444652557373 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0146708, + "balance_loss_mlp": 1.16781712, + "epoch": 0.019622931896883418, + "flos": 500456846592.0, + "grad_norm": 0.031534744220975436, + "language_loss": 1.0658195, + "learning_rate": 0.0009158623150134762, + "loss": 1.08049035, + "num_input_tokens_seen": 7694256, + "router_z_loss_mlp": 2.98632812, + "step": 102, + "time_per_iteration": 2.5731325149536133 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01479653, + "balance_loss_mlp": 1.1828692, + "epoch": 0.01981531358214698, + "flos": 510282532608.0, + "grad_norm": 0.0334583858191085, + "language_loss": 1.05968487, + "learning_rate": 0.000917794285931332, + "loss": 1.07448149, + "num_input_tokens_seen": 7762256, + "router_z_loss_mlp": 2.9609375, + "step": 103, + "time_per_iteration": 2.656132221221924 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01477107, + "balance_loss_mlp": 1.18184972, + "epoch": 0.020007695267410544, + "flos": 522393559296.0, + "grad_norm": 0.033386157220771755, + "language_loss": 0.97816026, + "learning_rate": 0.0009197075901716639, + "loss": 0.99293131, + "num_input_tokens_seen": 7834400, + "router_z_loss_mlp": 2.9453125, + "step": 104, + "time_per_iteration": 2.7207133769989014 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01472947, + "balance_loss_mlp": 1.1811223, + "epoch": 0.020200076952674107, + "flos": 534444314880.0, + "grad_norm": 0.03432724584635873, + "language_loss": 1.08410704, + "learning_rate": 0.0009216025849997171, + "loss": 1.09883642, + "num_input_tokens_seen": 7911184, + "router_z_loss_mlp": 2.92382812, + "step": 105, + "time_per_iteration": 2.783440113067627 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01461838, + "balance_loss_mlp": 1.17115784, + "epoch": 0.020392458637937667, + "flos": 686083414272.0, + "grad_norm": 0.04360543496830388, + "language_loss": 1.02907205, + "learning_rate": 0.0009234796175212258, + "loss": 1.04369044, + "num_input_tokens_seen": 7985280, + "router_z_loss_mlp": 2.9140625, + "step": 106, + "time_per_iteration": 2.914760112762451 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01450941, + "balance_loss_mlp": 1.1615957, + "epoch": 0.02058484032320123, + "flos": 703415430912.0, + "grad_norm": 0.03266429542390293, + "language_loss": 1.06572628, + "learning_rate": 0.000925339025064007, + "loss": 1.08023572, + "num_input_tokens_seen": 8068320, + "router_z_loss_mlp": 2.90039062, + "step": 107, + "time_per_iteration": 2.951838254928589 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01453976, + "balance_loss_mlp": 1.16558492, + "epoch": 0.020777222008464793, + "flos": 640328059392.0, + "grad_norm": 0.03192051704400644, + "language_loss": 0.99516582, + "learning_rate": 0.0009271811355418027, + "loss": 1.00970554, + "num_input_tokens_seen": 8148144, + "router_z_loss_mlp": 2.890625, + "step": 108, + "time_per_iteration": 2.897881507873535 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01449031, + "balance_loss_mlp": 1.16159379, + "epoch": 0.020969603693728356, + "flos": 683321763840.0, + "grad_norm": 0.04466737388011785, + "language_loss": 1.06219566, + "learning_rate": 0.0009290062678013548, + "loss": 1.07668602, + "num_input_tokens_seen": 8222256, + "router_z_loss_mlp": 2.88085938, + "step": 109, + "time_per_iteration": 2.8423218727111816 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01430047, + "balance_loss_mlp": 1.14413536, + "epoch": 0.02116198537899192, + "flos": 534420015360.0, + "grad_norm": 0.034258615277409615, + "language_loss": 1.04797208, + "learning_rate": 0.0009308147319536321, + "loss": 1.06227255, + "num_input_tokens_seen": 8292432, + "router_z_loss_mlp": 2.86523438, + "step": 110, + "time_per_iteration": 2.6316323280334473 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01425495, + "balance_loss_mlp": 1.14053667, + "epoch": 0.021354367064255482, + "flos": 718728457728.0, + "grad_norm": 0.048864006828935096, + "language_loss": 1.11352324, + "learning_rate": 0.0009326068296900676, + "loss": 1.12777817, + "num_input_tokens_seen": 8365024, + "router_z_loss_mlp": 2.85546875, + "step": 111, + "time_per_iteration": 2.8313205242156982 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01416698, + "balance_loss_mlp": 1.13269377, + "epoch": 0.021546748749519045, + "flos": 520624390656.0, + "grad_norm": 0.040751650479700946, + "language_loss": 1.01643181, + "learning_rate": 0.0009343828545846161, + "loss": 1.03059864, + "num_input_tokens_seen": 8442448, + "router_z_loss_mlp": 2.84570312, + "step": 112, + "time_per_iteration": 2.7729175090789795 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01401674, + "balance_loss_mlp": 1.11805177, + "epoch": 0.021739130434782608, + "flos": 506161927680.0, + "grad_norm": 0.042106341000359294, + "language_loss": 1.06266427, + "learning_rate": 0.0009361430923823841, + "loss": 1.07668102, + "num_input_tokens_seen": 8508992, + "router_z_loss_mlp": 2.84179688, + "step": 113, + "time_per_iteration": 2.5920841693878174 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01394311, + "balance_loss_mlp": 1.11126053, + "epoch": 0.02193151212004617, + "flos": 464427055872.0, + "grad_norm": 0.07156510336232694, + "language_loss": 1.09574234, + "learning_rate": 0.0009378878212755459, + "loss": 1.10968542, + "num_input_tokens_seen": 8574048, + "router_z_loss_mlp": 2.8359375, + "step": 114, + "time_per_iteration": 2.5213706493377686 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01376382, + "balance_loss_mlp": 1.09371293, + "epoch": 0.022123893805309734, + "flos": 553332617472.0, + "grad_norm": 0.03568103744776456, + "language_loss": 0.9948864, + "learning_rate": 0.0009396173121672103, + "loss": 1.0086503, + "num_input_tokens_seen": 8647808, + "router_z_loss_mlp": 2.83203125, + "step": 115, + "time_per_iteration": 2.654648780822754 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01351182, + "balance_loss_mlp": 1.0677501, + "epoch": 0.022316275490573297, + "flos": 637379771136.0, + "grad_norm": 0.04471438423319615, + "language_loss": 1.05214882, + "learning_rate": 0.0009413318289238633, + "loss": 1.06566072, + "num_input_tokens_seen": 8719760, + "router_z_loss_mlp": 2.83984375, + "step": 116, + "time_per_iteration": 2.7842695713043213 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01311882, + "balance_loss_mlp": 1.0282588, + "epoch": 0.02250865717583686, + "flos": 800316271872.0, + "grad_norm": 0.046340717018109684, + "language_loss": 0.97282118, + "learning_rate": 0.0009430316286169771, + "loss": 0.98593992, + "num_input_tokens_seen": 8798752, + "router_z_loss_mlp": 2.84179688, + "step": 117, + "time_per_iteration": 3.015839099884033 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01377985, + "balance_loss_mlp": 1.09283674, + "epoch": 0.022701038861100423, + "flos": 457063621632.0, + "grad_norm": 0.07808854544893538, + "language_loss": 1.02862036, + "learning_rate": 0.0009447169617543361, + "loss": 1.04240024, + "num_input_tokens_seen": 8866848, + "router_z_loss_mlp": 2.85742188, + "step": 118, + "time_per_iteration": 2.582919120788574 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01371579, + "balance_loss_mlp": 1.08871901, + "epoch": 0.022893420546363986, + "flos": 584187105024.0, + "grad_norm": 0.08661397198668377, + "language_loss": 1.09685123, + "learning_rate": 0.0009463880725016029, + "loss": 1.11056697, + "num_input_tokens_seen": 8935488, + "router_z_loss_mlp": 2.83398438, + "step": 119, + "time_per_iteration": 2.6932969093322754 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01312523, + "balance_loss_mlp": 1.03252411, + "epoch": 0.02308580223162755, + "flos": 562478826240.0, + "grad_norm": 0.04303328442288268, + "language_loss": 1.04977584, + "learning_rate": 0.0009480451988946134, + "loss": 1.06290102, + "num_input_tokens_seen": 9015344, + "router_z_loss_mlp": 2.8046875, + "step": 120, + "time_per_iteration": 2.8070547580718994 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01299284, + "balance_loss_mlp": 1.02252805, + "epoch": 0.023278183916891113, + "flos": 772646287872.0, + "grad_norm": 0.03799067846502037, + "language_loss": 1.05637264, + "learning_rate": 0.0009496885730428627, + "loss": 1.0693655, + "num_input_tokens_seen": 9094672, + "router_z_loss_mlp": 2.77148438, + "step": 121, + "time_per_iteration": 3.014753580093384 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0130842, + "balance_loss_mlp": 1.03376198, + "epoch": 0.023470565602154676, + "flos": 554431057152.0, + "grad_norm": 0.04194740398285866, + "language_loss": 1.04016769, + "learning_rate": 0.0009513184213246156, + "loss": 1.05325174, + "num_input_tokens_seen": 9160608, + "router_z_loss_mlp": 2.75, + "step": 122, + "time_per_iteration": 2.633074998855591 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01316034, + "balance_loss_mlp": 1.04442739, + "epoch": 0.02366294728741824, + "flos": 561167503872.0, + "grad_norm": 0.038872106950025416, + "language_loss": 1.07101583, + "learning_rate": 0.0009529349645740552, + "loss": 1.08417618, + "num_input_tokens_seen": 9228704, + "router_z_loss_mlp": 2.71875, + "step": 123, + "time_per_iteration": 2.6846470832824707 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01320226, + "balance_loss_mlp": 1.05014575, + "epoch": 0.0238553289726818, + "flos": 469517788416.0, + "grad_norm": 0.03403697644067516, + "language_loss": 1.05937934, + "learning_rate": 0.0009545384182608524, + "loss": 1.07258177, + "num_input_tokens_seen": 9294288, + "router_z_loss_mlp": 2.703125, + "step": 124, + "time_per_iteration": 2.5332376956939697 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01326404, + "balance_loss_mlp": 1.05880272, + "epoch": 0.024047710657945365, + "flos": 561104320512.0, + "grad_norm": 0.042208642163400256, + "language_loss": 1.03444421, + "learning_rate": 0.0009561289926625252, + "loss": 1.04770815, + "num_input_tokens_seen": 9368048, + "router_z_loss_mlp": 2.67773438, + "step": 125, + "time_per_iteration": 2.68180251121521 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01324487, + "balance_loss_mlp": 1.05841172, + "epoch": 0.024240092343208928, + "flos": 505771155456.0, + "grad_norm": 0.03944680997458598, + "language_loss": 1.08491933, + "learning_rate": 0.0009577068930299292, + "loss": 1.0981642, + "num_input_tokens_seen": 9434848, + "router_z_loss_mlp": 2.66210938, + "step": 126, + "time_per_iteration": 2.602088689804077 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01323529, + "balance_loss_mlp": 1.05936122, + "epoch": 0.02443247402847249, + "flos": 436753181184.0, + "grad_norm": 0.04017271590188075, + "language_loss": 1.04077768, + "learning_rate": 0.0009592723197462087, + "loss": 1.05401289, + "num_input_tokens_seen": 9504112, + "router_z_loss_mlp": 2.64257812, + "step": 127, + "time_per_iteration": 2.643617630004883 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01318089, + "balance_loss_mlp": 1.05563784, + "epoch": 0.024624855713736054, + "flos": 685069545216.0, + "grad_norm": 0.03549644551725154, + "language_loss": 1.0056293, + "learning_rate": 0.0009608254684795125, + "loss": 1.01881027, + "num_input_tokens_seen": 9590032, + "router_z_loss_mlp": 2.625, + "step": 128, + "time_per_iteration": 2.949061632156372 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01309242, + "balance_loss_mlp": 1.04831672, + "epoch": 0.024817237398999614, + "flos": 526114643712.0, + "grad_norm": 0.03183934804306691, + "language_loss": 1.03377914, + "learning_rate": 0.0009623665303297678, + "loss": 1.04687166, + "num_input_tokens_seen": 9663040, + "router_z_loss_mlp": 2.609375, + "step": 129, + "time_per_iteration": 2.7315783500671387 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0130104, + "balance_loss_mlp": 1.04106867, + "epoch": 0.025009619084263177, + "flos": 656887279872.0, + "grad_norm": 0.038944166016075116, + "language_loss": 1.07603359, + "learning_rate": 0.0009638956919697878, + "loss": 1.08904397, + "num_input_tokens_seen": 9736544, + "router_z_loss_mlp": 2.59960938, + "step": 130, + "time_per_iteration": 2.9588887691497803 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01293161, + "balance_loss_mlp": 1.03395224, + "epoch": 0.02520200076952674, + "flos": 455370275328.0, + "grad_norm": 0.03345888261117193, + "language_loss": 0.99743778, + "learning_rate": 0.0009654131357809714, + "loss": 1.0103693, + "num_input_tokens_seen": 9804656, + "router_z_loss_mlp": 2.59179688, + "step": 131, + "time_per_iteration": 2.5802786350250244 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01296775, + "balance_loss_mlp": 1.03966463, + "epoch": 0.025394382454790303, + "flos": 841269599232.0, + "grad_norm": 0.04496153180844387, + "language_loss": 1.08517051, + "learning_rate": 0.0009669190399838441, + "loss": 1.09813821, + "num_input_tokens_seen": 9888864, + "router_z_loss_mlp": 2.5703125, + "step": 132, + "time_per_iteration": 3.1034374237060547 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01297684, + "balance_loss_mlp": 1.04190826, + "epoch": 0.025586764140053866, + "flos": 582229353216.0, + "grad_norm": 0.044253016077327914, + "language_loss": 1.0183959, + "learning_rate": 0.0009684135787636724, + "loss": 1.03137255, + "num_input_tokens_seen": 9968208, + "router_z_loss_mlp": 2.55664062, + "step": 133, + "time_per_iteration": 2.8056888580322266 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01284726, + "balance_loss_mlp": 1.03066742, + "epoch": 0.02577914582531743, + "flos": 791678453760.0, + "grad_norm": 0.04023348500073193, + "language_loss": 1.06134284, + "learning_rate": 0.0009698969223913726, + "loss": 1.07419014, + "num_input_tokens_seen": 10049664, + "router_z_loss_mlp": 2.5390625, + "step": 134, + "time_per_iteration": 3.0520598888397217 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01279327, + "balance_loss_mlp": 1.02717578, + "epoch": 0.025971527510580992, + "flos": 596063861760.0, + "grad_norm": 0.02965492003563146, + "language_loss": 1.08660483, + "learning_rate": 0.0009713692373399265, + "loss": 1.09939814, + "num_input_tokens_seen": 10120096, + "router_z_loss_mlp": 2.51953125, + "step": 135, + "time_per_iteration": 2.679379463195801 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01931427, + "balance_loss_mlp": 1.66744995, + "epoch": 0.026163909195844555, + "flos": 1581077391360.0, + "grad_norm": 0.18396358569787127, + "language_loss": 0.79456228, + "learning_rate": 0.0009728306863964993, + "loss": 0.81387651, + "num_input_tokens_seen": 10348976, + "router_z_loss_mlp": 2.640625, + "step": 136, + "time_per_iteration": 5.69318151473999 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01580238, + "balance_loss_mlp": 1.32083893, + "epoch": 0.026356290881108118, + "flos": 1505163555840.0, + "grad_norm": 0.11058621392355464, + "language_loss": 0.77811038, + "learning_rate": 0.0009742814287704512, + "loss": 0.79391277, + "num_input_tokens_seen": 10576512, + "router_z_loss_mlp": 2.59375, + "step": 137, + "time_per_iteration": 4.930646896362305 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01336039, + "balance_loss_mlp": 1.08846498, + "epoch": 0.02654867256637168, + "flos": 598341421056.0, + "grad_norm": 0.05793494017899448, + "language_loss": 1.01254559, + "learning_rate": 0.0009757216201974225, + "loss": 1.02590609, + "num_input_tokens_seen": 10659168, + "router_z_loss_mlp": 2.47265625, + "step": 138, + "time_per_iteration": 2.8532111644744873 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01376264, + "balance_loss_mlp": 1.13059723, + "epoch": 0.026741054251635244, + "flos": 546136379136.0, + "grad_norm": 0.07027637242601113, + "language_loss": 1.06507492, + "learning_rate": 0.0009771514130396581, + "loss": 1.07883763, + "num_input_tokens_seen": 10731584, + "router_z_loss_mlp": 2.453125, + "step": 139, + "time_per_iteration": 2.742065668106079 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01373402, + "balance_loss_mlp": 1.12792611, + "epoch": 0.026933435936898807, + "flos": 507846525696.0, + "grad_norm": 0.06681977417406691, + "language_loss": 1.06790614, + "learning_rate": 0.00097857095638274, + "loss": 1.08164012, + "num_input_tokens_seen": 10799456, + "router_z_loss_mlp": 2.45117188, + "step": 140, + "time_per_iteration": 2.689812660217285 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01350241, + "balance_loss_mlp": 1.10533786, + "epoch": 0.02712581762216237, + "flos": 742254504192.0, + "grad_norm": 0.04346752833457442, + "language_loss": 0.97943556, + "learning_rate": 0.0009799803961288726, + "loss": 0.99293798, + "num_input_tokens_seen": 10886416, + "router_z_loss_mlp": 2.4453125, + "step": 141, + "time_per_iteration": 3.064852714538574 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01340988, + "balance_loss_mlp": 1.09684777, + "epoch": 0.027318199307425933, + "flos": 849779105280.0, + "grad_norm": 0.04419232462487818, + "language_loss": 1.04253626, + "learning_rate": 0.000981379875086876, + "loss": 1.05594611, + "num_input_tokens_seen": 10966064, + "router_z_loss_mlp": 2.4375, + "step": 142, + "time_per_iteration": 3.049978494644165 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01342845, + "balance_loss_mlp": 1.09870481, + "epoch": 0.027510580992689496, + "flos": 576638978304.0, + "grad_norm": 0.03936283820829166, + "language_loss": 0.99339008, + "learning_rate": 0.0009827695330590185, + "loss": 1.00681853, + "num_input_tokens_seen": 11039712, + "router_z_loss_mlp": 2.4375, + "step": 143, + "time_per_iteration": 2.677050828933716 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01360296, + "balance_loss_mlp": 1.11729932, + "epoch": 0.02770296267795306, + "flos": 773790414336.0, + "grad_norm": 0.036415015399305896, + "language_loss": 0.98794824, + "learning_rate": 0.0009841495069248256, + "loss": 1.00155115, + "num_input_tokens_seen": 11123984, + "router_z_loss_mlp": 2.42578125, + "step": 144, + "time_per_iteration": 2.9983932971954346 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01369915, + "balance_loss_mlp": 1.12768197, + "epoch": 0.027895344363216622, + "flos": 570449806080.0, + "grad_norm": 0.04357781303470995, + "language_loss": 0.98341697, + "learning_rate": 0.0009855199307219871, + "loss": 0.99711609, + "num_input_tokens_seen": 11192864, + "router_z_loss_mlp": 2.41796875, + "step": 145, + "time_per_iteration": 2.6622605323791504 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0136275, + "balance_loss_mlp": 1.12261522, + "epoch": 0.028087726048480186, + "flos": 548409080832.0, + "grad_norm": 0.032618269384273584, + "language_loss": 1.00131154, + "learning_rate": 0.0009868809357244854, + "loss": 1.01493907, + "num_input_tokens_seen": 11261760, + "router_z_loss_mlp": 2.39648438, + "step": 146, + "time_per_iteration": 2.7002813816070557 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01347166, + "balance_loss_mlp": 1.10836601, + "epoch": 0.02828010773374375, + "flos": 525873570816.0, + "grad_norm": 0.032542426789695725, + "language_loss": 1.04416764, + "learning_rate": 0.0009882326505180556, + "loss": 1.05763924, + "num_input_tokens_seen": 11334736, + "router_z_loss_mlp": 2.3828125, + "step": 147, + "time_per_iteration": 2.710149049758911 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01334853, + "balance_loss_mlp": 1.09815085, + "epoch": 0.02847248941900731, + "flos": 773772917760.0, + "grad_norm": 0.045451062042893155, + "language_loss": 1.02790403, + "learning_rate": 0.0009895752010730906, + "loss": 1.04125249, + "num_input_tokens_seen": 11409872, + "router_z_loss_mlp": 2.36132812, + "step": 148, + "time_per_iteration": 2.965888261795044 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01328294, + "balance_loss_mlp": 1.0936898, + "epoch": 0.028664871104270875, + "flos": 535470822912.0, + "grad_norm": 0.03549847888949514, + "language_loss": 1.08720016, + "learning_rate": 0.0009909087108150867, + "loss": 1.10048318, + "num_input_tokens_seen": 11481024, + "router_z_loss_mlp": 2.33984375, + "step": 149, + "time_per_iteration": 2.759585380554199 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01328431, + "balance_loss_mlp": 1.09649718, + "epoch": 0.028857252789534438, + "flos": 368605212672.0, + "grad_norm": 0.04584721914032896, + "language_loss": 1.09262538, + "learning_rate": 0.0009922333006927371, + "loss": 1.10590982, + "num_input_tokens_seen": 11544240, + "router_z_loss_mlp": 2.3125, + "step": 150, + "time_per_iteration": 2.5677716732025146 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0132956, + "balance_loss_mlp": 1.09896171, + "epoch": 0.029049634474798, + "flos": 516484343808.0, + "grad_norm": 0.054837011337671125, + "language_loss": 1.02855873, + "learning_rate": 0.0009935490892437632, + "loss": 1.04185438, + "num_input_tokens_seen": 11610416, + "router_z_loss_mlp": 2.29882812, + "step": 151, + "time_per_iteration": 2.5842795372009277 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01323589, + "balance_loss_mlp": 1.09623301, + "epoch": 0.029242016160061564, + "flos": 589349769216.0, + "grad_norm": 0.041624099188269474, + "language_loss": 1.01284385, + "learning_rate": 0.0009948561926585687, + "loss": 1.02607965, + "num_input_tokens_seen": 11687488, + "router_z_loss_mlp": 2.2734375, + "step": 152, + "time_per_iteration": 2.7717602252960205 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01309484, + "balance_loss_mlp": 1.08422625, + "epoch": 0.029434397845325123, + "flos": 553137231360.0, + "grad_norm": 0.04242067063834005, + "language_loss": 1.0541966, + "learning_rate": 0.0009961547248418122, + "loss": 1.0672915, + "num_input_tokens_seen": 11754576, + "router_z_loss_mlp": 2.25976562, + "step": 153, + "time_per_iteration": 2.6492583751678467 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01303287, + "balance_loss_mlp": 1.07898307, + "epoch": 0.029626779530588686, + "flos": 604608360960.0, + "grad_norm": 0.03242941124289258, + "language_loss": 1.02145946, + "learning_rate": 0.0009974447974719707, + "loss": 1.03449237, + "num_input_tokens_seen": 11831360, + "router_z_loss_mlp": 2.25, + "step": 154, + "time_per_iteration": 2.7111871242523193 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01303637, + "balance_loss_mlp": 1.08181214, + "epoch": 0.02981916121585225, + "flos": 622218388992.0, + "grad_norm": 0.03743420896054, + "language_loss": 1.03581393, + "learning_rate": 0.0009987265200589763, + "loss": 1.0488503, + "num_input_tokens_seen": 11902192, + "router_z_loss_mlp": 2.22460938, + "step": 155, + "time_per_iteration": 2.7590832710266113 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01281243, + "balance_loss_mlp": 1.06151628, + "epoch": 0.030011542901115813, + "flos": 662881065984.0, + "grad_norm": 0.03665146617631418, + "language_loss": 1.03448439, + "learning_rate": 0.001, + "loss": 1.04729688, + "num_input_tokens_seen": 11979088, + "router_z_loss_mlp": 2.203125, + "step": 156, + "time_per_iteration": 2.868732452392578 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01262046, + "balance_loss_mlp": 1.04441714, + "epoch": 0.030203924586379376, + "flos": 652819164672.0, + "grad_norm": 0.048414208125286275, + "language_loss": 1.0101347, + "learning_rate": 0.0009999999029413921, + "loss": 1.02275515, + "num_input_tokens_seen": 12059200, + "router_z_loss_mlp": 2.18164062, + "step": 157, + "time_per_iteration": 2.8458704948425293 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01249467, + "balance_loss_mlp": 1.03393674, + "epoch": 0.03039630627164294, + "flos": 532444766976.0, + "grad_norm": 0.038165698108555156, + "language_loss": 1.02398324, + "learning_rate": 0.0009999996117656068, + "loss": 1.03647804, + "num_input_tokens_seen": 12134944, + "router_z_loss_mlp": 2.16015625, + "step": 158, + "time_per_iteration": 2.7255747318267822 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01250196, + "balance_loss_mlp": 1.03657281, + "epoch": 0.030588687956906502, + "flos": 587295786240.0, + "grad_norm": 0.04636715302465643, + "language_loss": 0.95869231, + "learning_rate": 0.0009999991264727564, + "loss": 0.97119427, + "num_input_tokens_seen": 12207936, + "router_z_loss_mlp": 2.140625, + "step": 159, + "time_per_iteration": 2.7805936336517334 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0126418, + "balance_loss_mlp": 1.05284619, + "epoch": 0.030781069642170065, + "flos": 514287464448.0, + "grad_norm": 0.055354258548617474, + "language_loss": 1.07316554, + "learning_rate": 0.0009999984470630296, + "loss": 1.08580732, + "num_input_tokens_seen": 12273200, + "router_z_loss_mlp": 2.1171875, + "step": 160, + "time_per_iteration": 2.6011087894439697 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01284107, + "balance_loss_mlp": 1.07372677, + "epoch": 0.030973451327433628, + "flos": 719560546560.0, + "grad_norm": 0.03499871632601644, + "language_loss": 0.95530587, + "learning_rate": 0.0009999975735366902, + "loss": 0.96814692, + "num_input_tokens_seen": 12359600, + "router_z_loss_mlp": 2.10742188, + "step": 161, + "time_per_iteration": 3.083415985107422 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01283442, + "balance_loss_mlp": 1.07439709, + "epoch": 0.03116583301269719, + "flos": 1111615994880.0, + "grad_norm": 0.03722431710536786, + "language_loss": 0.96960843, + "learning_rate": 0.0009999965058940775, + "loss": 0.9824428, + "num_input_tokens_seen": 12443936, + "router_z_loss_mlp": 2.09375, + "step": 162, + "time_per_iteration": 3.5389657020568848 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01264072, + "balance_loss_mlp": 1.05655301, + "epoch": 0.031358214697960754, + "flos": 451833883392.0, + "grad_norm": 0.04231417263227255, + "language_loss": 1.04135799, + "learning_rate": 0.0009999952441356057, + "loss": 1.05399871, + "num_input_tokens_seen": 12507488, + "router_z_loss_mlp": 2.078125, + "step": 163, + "time_per_iteration": 2.5445146560668945 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01239952, + "balance_loss_mlp": 1.03357697, + "epoch": 0.031550596383224314, + "flos": 1257087309312.0, + "grad_norm": 0.03293922474511325, + "language_loss": 1.04807603, + "learning_rate": 0.000999993788261765, + "loss": 1.06047547, + "num_input_tokens_seen": 12594096, + "router_z_loss_mlp": 2.06640625, + "step": 164, + "time_per_iteration": 3.603273391723633 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01233685, + "balance_loss_mlp": 1.02769136, + "epoch": 0.03174297806848788, + "flos": 669323950080.0, + "grad_norm": 0.03785089383184646, + "language_loss": 1.05591631, + "learning_rate": 0.00099999213827312, + "loss": 1.06825328, + "num_input_tokens_seen": 12669424, + "router_z_loss_mlp": 2.0625, + "step": 165, + "time_per_iteration": 2.822242498397827 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01237294, + "balance_loss_mlp": 1.03206336, + "epoch": 0.03193535975375144, + "flos": 552364435200.0, + "grad_norm": 0.03413051380570177, + "language_loss": 1.00392842, + "learning_rate": 0.000999990294170312, + "loss": 1.01630139, + "num_input_tokens_seen": 12740080, + "router_z_loss_mlp": 2.0546875, + "step": 166, + "time_per_iteration": 2.6473989486694336 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0124218, + "balance_loss_mlp": 1.03790259, + "epoch": 0.032127741439015006, + "flos": 544740486144.0, + "grad_norm": 0.02951320831702663, + "language_loss": 1.04371905, + "learning_rate": 0.0009999882559540566, + "loss": 1.0561409, + "num_input_tokens_seen": 12810576, + "router_z_loss_mlp": 2.04492188, + "step": 167, + "time_per_iteration": 2.654994487762451 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01249753, + "balance_loss_mlp": 1.04661989, + "epoch": 0.032320123124278566, + "flos": 549514323456.0, + "grad_norm": 0.03217165834370848, + "language_loss": 1.01348543, + "learning_rate": 0.000999986023625145, + "loss": 1.02598298, + "num_input_tokens_seen": 12887904, + "router_z_loss_mlp": 2.03320312, + "step": 168, + "time_per_iteration": 2.759324550628662 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01736656, + "balance_loss_mlp": 1.53829193, + "epoch": 0.03251250480954213, + "flos": 1308817963776.0, + "grad_norm": 0.15145695156494207, + "language_loss": 0.78924417, + "learning_rate": 0.0009999835971844441, + "loss": 0.8066107, + "num_input_tokens_seen": 13107344, + "router_z_loss_mlp": 1.9765625, + "step": 169, + "time_per_iteration": 4.9954283237457275 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0125768, + "balance_loss_mlp": 1.05588245, + "epoch": 0.03270488649480569, + "flos": 562202760192.0, + "grad_norm": 0.04037677915440104, + "language_loss": 1.01481748, + "learning_rate": 0.0009999809766328958, + "loss": 1.02739429, + "num_input_tokens_seen": 13175552, + "router_z_loss_mlp": 2.01953125, + "step": 170, + "time_per_iteration": 2.6656970977783203 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01250876, + "balance_loss_mlp": 1.0494597, + "epoch": 0.03289726818006926, + "flos": 483339657984.0, + "grad_norm": 0.04232720535630845, + "language_loss": 1.03883123, + "learning_rate": 0.0009999781619715177, + "loss": 1.0513401, + "num_input_tokens_seen": 13242384, + "router_z_loss_mlp": 2.015625, + "step": 171, + "time_per_iteration": 2.5408902168273926 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01238141, + "balance_loss_mlp": 1.03786898, + "epoch": 0.03308964986533282, + "flos": 675821269248.0, + "grad_norm": 0.04278552863969592, + "language_loss": 1.04043615, + "learning_rate": 0.000999975153201402, + "loss": 1.05281758, + "num_input_tokens_seen": 13316160, + "router_z_loss_mlp": 2.00390625, + "step": 172, + "time_per_iteration": 2.85229754447937 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01233367, + "balance_loss_mlp": 1.03385854, + "epoch": 0.033282031550596385, + "flos": 610341632256.0, + "grad_norm": 0.04144744195910536, + "language_loss": 1.01965618, + "learning_rate": 0.0009999719503237174, + "loss": 1.03198993, + "num_input_tokens_seen": 13387664, + "router_z_loss_mlp": 1.9921875, + "step": 173, + "time_per_iteration": 2.7612979412078857 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01234993, + "balance_loss_mlp": 1.03739214, + "epoch": 0.033474413235859944, + "flos": 468996758784.0, + "grad_norm": 0.06741318195929925, + "language_loss": 1.10547054, + "learning_rate": 0.0009999685533397073, + "loss": 1.1178205, + "num_input_tokens_seen": 13454528, + "router_z_loss_mlp": 1.97265625, + "step": 174, + "time_per_iteration": 2.5750949382781982 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01246896, + "balance_loss_mlp": 1.05101097, + "epoch": 0.03366679492112351, + "flos": 580715841792.0, + "grad_norm": 0.0354258140398677, + "language_loss": 1.02665091, + "learning_rate": 0.00099996496225069, + "loss": 1.03911996, + "num_input_tokens_seen": 13522528, + "router_z_loss_mlp": 1.95605469, + "step": 175, + "time_per_iteration": 2.6886191368103027 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0124614, + "balance_loss_mlp": 1.05168545, + "epoch": 0.03385917660638707, + "flos": 638886479616.0, + "grad_norm": 0.036851717024697625, + "language_loss": 1.04551578, + "learning_rate": 0.0009999611770580604, + "loss": 1.0579772, + "num_input_tokens_seen": 13601120, + "router_z_loss_mlp": 1.94433594, + "step": 176, + "time_per_iteration": 2.8528547286987305 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01227252, + "balance_loss_mlp": 1.03422809, + "epoch": 0.03405155829165064, + "flos": 442740164352.0, + "grad_norm": 0.05003520598604069, + "language_loss": 1.03819132, + "learning_rate": 0.0009999571977632876, + "loss": 1.0504638, + "num_input_tokens_seen": 13666384, + "router_z_loss_mlp": 1.9296875, + "step": 177, + "time_per_iteration": 2.6220269203186035 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01224145, + "balance_loss_mlp": 1.03188384, + "epoch": 0.034243939976914196, + "flos": 467275222272.0, + "grad_norm": 0.0554689754659714, + "language_loss": 1.0658946, + "learning_rate": 0.0009999530243679166, + "loss": 1.07813609, + "num_input_tokens_seen": 13733968, + "router_z_loss_mlp": 1.921875, + "step": 178, + "time_per_iteration": 2.5593671798706055 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01235914, + "balance_loss_mlp": 1.04479802, + "epoch": 0.03443632166217776, + "flos": 780713498880.0, + "grad_norm": 0.03675993055709111, + "language_loss": 1.01102996, + "learning_rate": 0.0009999486568735675, + "loss": 1.02338898, + "num_input_tokens_seen": 13818960, + "router_z_loss_mlp": 1.91015625, + "step": 179, + "time_per_iteration": 3.083312749862671 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01235549, + "balance_loss_mlp": 1.04548192, + "epoch": 0.03462870334744132, + "flos": 1265760120576.0, + "grad_norm": 0.04656515886260978, + "language_loss": 1.01660061, + "learning_rate": 0.0009999440952819362, + "loss": 1.02895617, + "num_input_tokens_seen": 13912448, + "router_z_loss_mlp": 1.89941406, + "step": 180, + "time_per_iteration": 3.691354513168335 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01231777, + "balance_loss_mlp": 1.04390287, + "epoch": 0.03482108503270489, + "flos": 608303200512.0, + "grad_norm": 0.04339398829325753, + "language_loss": 1.02140999, + "learning_rate": 0.0009999393395947935, + "loss": 1.03372765, + "num_input_tokens_seen": 13990752, + "router_z_loss_mlp": 1.87695312, + "step": 181, + "time_per_iteration": 2.8826780319213867 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01222143, + "balance_loss_mlp": 1.03617644, + "epoch": 0.03501346671796845, + "flos": 539315361792.0, + "grad_norm": 0.033650569268787865, + "language_loss": 1.05363226, + "learning_rate": 0.0009999343898139858, + "loss": 1.06585371, + "num_input_tokens_seen": 14058608, + "router_z_loss_mlp": 1.85742188, + "step": 182, + "time_per_iteration": 2.6785037517547607 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01217643, + "balance_loss_mlp": 1.03329813, + "epoch": 0.035205848403232015, + "flos": 519499706112.0, + "grad_norm": 0.04889617812287003, + "language_loss": 1.03914642, + "learning_rate": 0.0009999292459414348, + "loss": 1.05132294, + "num_input_tokens_seen": 14126656, + "router_z_loss_mlp": 1.84082031, + "step": 183, + "time_per_iteration": 2.648263931274414 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01223183, + "balance_loss_mlp": 1.04103076, + "epoch": 0.035398230088495575, + "flos": 473334137088.0, + "grad_norm": 0.03546540132303448, + "language_loss": 1.08284354, + "learning_rate": 0.0009999239079791374, + "loss": 1.09507537, + "num_input_tokens_seen": 14195840, + "router_z_loss_mlp": 1.81835938, + "step": 184, + "time_per_iteration": 2.6003947257995605 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01229231, + "balance_loss_mlp": 1.04908144, + "epoch": 0.03559061177375914, + "flos": 513095705856.0, + "grad_norm": 0.03580873522044792, + "language_loss": 1.00877666, + "learning_rate": 0.0009999183759291659, + "loss": 1.02106905, + "num_input_tokens_seen": 14269936, + "router_z_loss_mlp": 1.79785156, + "step": 185, + "time_per_iteration": 2.7518959045410156 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01229953, + "balance_loss_mlp": 1.05161583, + "epoch": 0.0357829934590227, + "flos": 478350992640.0, + "grad_norm": 0.05401643684385997, + "language_loss": 1.03586912, + "learning_rate": 0.0009999126497936682, + "loss": 1.04816866, + "num_input_tokens_seen": 14334848, + "router_z_loss_mlp": 1.78710938, + "step": 186, + "time_per_iteration": 2.565373659133911 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01218003, + "balance_loss_mlp": 1.04052448, + "epoch": 0.03597537514428627, + "flos": 645885386496.0, + "grad_norm": 0.027605248849540943, + "language_loss": 1.06344712, + "learning_rate": 0.0009999067295748676, + "loss": 1.07562721, + "num_input_tokens_seen": 14407888, + "router_z_loss_mlp": 1.77832031, + "step": 187, + "time_per_iteration": 2.862023115158081 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01208675, + "balance_loss_mlp": 1.03167319, + "epoch": 0.03616775682954983, + "flos": 582270182400.0, + "grad_norm": 0.041753828035088196, + "language_loss": 1.04174721, + "learning_rate": 0.000999900615275062, + "loss": 1.05383396, + "num_input_tokens_seen": 14479072, + "router_z_loss_mlp": 1.7734375, + "step": 188, + "time_per_iteration": 2.7248780727386475 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01206757, + "balance_loss_mlp": 1.02994609, + "epoch": 0.03636013851481339, + "flos": 383265007104.0, + "grad_norm": 0.05119808239604003, + "language_loss": 1.10189009, + "learning_rate": 0.0009998943068966256, + "loss": 1.11395764, + "num_input_tokens_seen": 14540944, + "router_z_loss_mlp": 1.77148438, + "step": 189, + "time_per_iteration": 2.487445592880249 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01216253, + "balance_loss_mlp": 1.04010975, + "epoch": 0.03655252020007695, + "flos": 584308614144.0, + "grad_norm": 0.029643950017142998, + "language_loss": 1.04644084, + "learning_rate": 0.0009998878044420072, + "loss": 1.05860329, + "num_input_tokens_seen": 14611392, + "router_z_loss_mlp": 1.76464844, + "step": 190, + "time_per_iteration": 2.736809015274048 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.012186, + "balance_loss_mlp": 1.04321897, + "epoch": 0.03674490188534051, + "flos": 472598279424.0, + "grad_norm": 0.03987592529636011, + "language_loss": 1.00565469, + "learning_rate": 0.0009998811079137318, + "loss": 1.01784062, + "num_input_tokens_seen": 14679776, + "router_z_loss_mlp": 1.75683594, + "step": 191, + "time_per_iteration": 2.6006946563720703 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01214791, + "balance_loss_mlp": 1.04017353, + "epoch": 0.03693728357060408, + "flos": 529411908096.0, + "grad_norm": 0.03601320862003297, + "language_loss": 1.01597381, + "learning_rate": 0.0009998742173143987, + "loss": 1.02812171, + "num_input_tokens_seen": 14749712, + "router_z_loss_mlp": 1.74902344, + "step": 192, + "time_per_iteration": 2.6246893405914307 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01200861, + "balance_loss_mlp": 1.02719736, + "epoch": 0.03712966525586764, + "flos": 800346407424.0, + "grad_norm": 0.02962706666311765, + "language_loss": 1.0204885, + "learning_rate": 0.0009998671326466833, + "loss": 1.03249693, + "num_input_tokens_seen": 14827136, + "router_z_loss_mlp": 1.73925781, + "step": 193, + "time_per_iteration": 2.9852418899536133 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01194118, + "balance_loss_mlp": 1.02121651, + "epoch": 0.037322046941131205, + "flos": 831359342592.0, + "grad_norm": 0.049736474928026, + "language_loss": 1.0340569, + "learning_rate": 0.0009998598539133362, + "loss": 1.04599798, + "num_input_tokens_seen": 14902880, + "router_z_loss_mlp": 1.73144531, + "step": 194, + "time_per_iteration": 3.0510568618774414 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01194861, + "balance_loss_mlp": 1.02339077, + "epoch": 0.037514428626394765, + "flos": 438589423872.0, + "grad_norm": 0.030819097200883293, + "language_loss": 1.03682184, + "learning_rate": 0.0009998523811171828, + "loss": 1.04877055, + "num_input_tokens_seen": 14967264, + "router_z_loss_mlp": 1.71679688, + "step": 195, + "time_per_iteration": 2.5203936100006104 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01197718, + "balance_loss_mlp": 1.0269146, + "epoch": 0.03770681031165833, + "flos": 512639804928.0, + "grad_norm": 0.031890398221933944, + "language_loss": 1.04342675, + "learning_rate": 0.0009998447142611248, + "loss": 1.05540395, + "num_input_tokens_seen": 15039104, + "router_z_loss_mlp": 1.70996094, + "step": 196, + "time_per_iteration": 2.659193754196167 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01193737, + "balance_loss_mlp": 1.02341044, + "epoch": 0.03789919199692189, + "flos": 808843274496.0, + "grad_norm": 0.030368823498634023, + "language_loss": 0.97672093, + "learning_rate": 0.0009998368533481387, + "loss": 0.98865831, + "num_input_tokens_seen": 15124864, + "router_z_loss_mlp": 1.70507812, + "step": 197, + "time_per_iteration": 3.031437397003174 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01185957, + "balance_loss_mlp": 1.01677489, + "epoch": 0.03809157368218546, + "flos": 691792386048.0, + "grad_norm": 0.027429804092446938, + "language_loss": 1.00742936, + "learning_rate": 0.0009998287983812762, + "loss": 1.01928902, + "num_input_tokens_seen": 15199680, + "router_z_loss_mlp": 1.69335938, + "step": 198, + "time_per_iteration": 2.8533172607421875 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01186443, + "balance_loss_mlp": 1.01764262, + "epoch": 0.03828395536744902, + "flos": 519004921344.0, + "grad_norm": 0.029672573654994608, + "language_loss": 1.06761527, + "learning_rate": 0.0009998205493636646, + "loss": 1.07947969, + "num_input_tokens_seen": 15270176, + "router_z_loss_mlp": 1.68945312, + "step": 199, + "time_per_iteration": 2.6512415409088135 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01190294, + "balance_loss_mlp": 1.02197027, + "epoch": 0.038476337052712584, + "flos": 582763021824.0, + "grad_norm": 0.03300049351517658, + "language_loss": 0.99112457, + "learning_rate": 0.0009998121062985063, + "loss": 1.00302756, + "num_input_tokens_seen": 15343168, + "router_z_loss_mlp": 1.68457031, + "step": 200, + "time_per_iteration": 2.6979846954345703 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01187054, + "balance_loss_mlp": 1.01996994, + "epoch": 0.03866871873797614, + "flos": 578273998848.0, + "grad_norm": 0.03164459486115397, + "language_loss": 1.0110172, + "learning_rate": 0.0009998034691890794, + "loss": 1.02288771, + "num_input_tokens_seen": 15417328, + "router_z_loss_mlp": 1.671875, + "step": 201, + "time_per_iteration": 2.80670166015625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01183327, + "balance_loss_mlp": 1.01672018, + "epoch": 0.03886110042323971, + "flos": 541772755968.0, + "grad_norm": 0.032663388617215364, + "language_loss": 1.05587053, + "learning_rate": 0.0009997946380387369, + "loss": 1.06770372, + "num_input_tokens_seen": 15489488, + "router_z_loss_mlp": 1.66699219, + "step": 202, + "time_per_iteration": 2.6591310501098633 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01179406, + "balance_loss_mlp": 1.01394379, + "epoch": 0.03905348210850327, + "flos": 719240739072.0, + "grad_norm": 0.030305493428663434, + "language_loss": 1.08528447, + "learning_rate": 0.0009997856128509076, + "loss": 1.09707844, + "num_input_tokens_seen": 15558944, + "router_z_loss_mlp": 1.65527344, + "step": 203, + "time_per_iteration": 2.9006340503692627 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01181527, + "balance_loss_mlp": 1.01720893, + "epoch": 0.039245863793766836, + "flos": 428397265152.0, + "grad_norm": 0.03189317300504765, + "language_loss": 1.03375864, + "learning_rate": 0.0009997763936290952, + "loss": 1.04557395, + "num_input_tokens_seen": 15625024, + "router_z_loss_mlp": 1.64355469, + "step": 204, + "time_per_iteration": 2.5836358070373535 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01178747, + "balance_loss_mlp": 1.01538289, + "epoch": 0.039438245479030395, + "flos": 664270156032.0, + "grad_norm": 0.033629424624266296, + "language_loss": 1.0866276, + "learning_rate": 0.0009997669803768789, + "loss": 1.09841514, + "num_input_tokens_seen": 15697120, + "router_z_loss_mlp": 1.63378906, + "step": 205, + "time_per_iteration": 2.7809464931488037 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01180514, + "balance_loss_mlp": 1.01791251, + "epoch": 0.03963062716429396, + "flos": 636496159488.0, + "grad_norm": 0.025840840316256445, + "language_loss": 1.03755617, + "learning_rate": 0.0009997573730979134, + "loss": 1.04936123, + "num_input_tokens_seen": 15768752, + "router_z_loss_mlp": 1.62597656, + "step": 206, + "time_per_iteration": 2.7759904861450195 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01207138, + "balance_loss_mlp": 1.04272461, + "epoch": 0.03982300884955752, + "flos": 1421589799680.0, + "grad_norm": 0.03078548913711826, + "language_loss": 0.79193199, + "learning_rate": 0.0009997475717959284, + "loss": 0.80400336, + "num_input_tokens_seen": 15980624, + "router_z_loss_mlp": 1.64453125, + "step": 207, + "time_per_iteration": 4.6622114181518555 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01177297, + "balance_loss_mlp": 1.0162214, + "epoch": 0.04001539053482109, + "flos": 690520914432.0, + "grad_norm": 0.03233621027438014, + "language_loss": 1.02104092, + "learning_rate": 0.0009997375764747294, + "loss": 1.03281379, + "num_input_tokens_seen": 16067232, + "router_z_loss_mlp": 1.61035156, + "step": 208, + "time_per_iteration": 2.9808952808380127 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01181785, + "balance_loss_mlp": 1.02156758, + "epoch": 0.04020777222008465, + "flos": 534752461824.0, + "grad_norm": 0.037334696417832054, + "language_loss": 0.99876916, + "learning_rate": 0.0009997273871381967, + "loss": 1.01058698, + "num_input_tokens_seen": 16139808, + "router_z_loss_mlp": 1.6015625, + "step": 209, + "time_per_iteration": 2.6938650608062744 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01183132, + "balance_loss_mlp": 1.02396429, + "epoch": 0.040400153905348214, + "flos": 568997532672.0, + "grad_norm": 0.03228633343407045, + "language_loss": 1.04497194, + "learning_rate": 0.0009997170037902862, + "loss": 1.05680323, + "num_input_tokens_seen": 16210848, + "router_z_loss_mlp": 1.59082031, + "step": 210, + "time_per_iteration": 2.722900629043579 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01189763, + "balance_loss_mlp": 1.03145349, + "epoch": 0.040592535590611774, + "flos": 714679784448.0, + "grad_norm": 0.026587079094436805, + "language_loss": 1.0723207, + "learning_rate": 0.0009997064264350292, + "loss": 1.08421838, + "num_input_tokens_seen": 16283984, + "router_z_loss_mlp": 1.58203125, + "step": 211, + "time_per_iteration": 2.8636813163757324 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01186902, + "balance_loss_mlp": 1.02954614, + "epoch": 0.04078491727587533, + "flos": 579207187968.0, + "grad_norm": 0.028855359605628288, + "language_loss": 1.01311755, + "learning_rate": 0.0009996956550765317, + "loss": 1.02498662, + "num_input_tokens_seen": 16353904, + "router_z_loss_mlp": 1.57226562, + "step": 212, + "time_per_iteration": 2.6752002239227295 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01183355, + "balance_loss_mlp": 1.0270474, + "epoch": 0.0409772989611389, + "flos": 553369555968.0, + "grad_norm": 0.03615073574048419, + "language_loss": 0.96463609, + "learning_rate": 0.0009996846897189762, + "loss": 0.97646964, + "num_input_tokens_seen": 16425488, + "router_z_loss_mlp": 1.56152344, + "step": 213, + "time_per_iteration": 2.618417501449585 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01180441, + "balance_loss_mlp": 1.02470577, + "epoch": 0.04116968064640246, + "flos": 556764996864.0, + "grad_norm": 0.04473264124517712, + "language_loss": 1.02233624, + "learning_rate": 0.0009996735303666193, + "loss": 1.03414059, + "num_input_tokens_seen": 16498016, + "router_z_loss_mlp": 1.55566406, + "step": 214, + "time_per_iteration": 2.7398550510406494 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0118203, + "balance_loss_mlp": 1.026963, + "epoch": 0.041362062331666026, + "flos": 579652395264.0, + "grad_norm": 0.027182691243245845, + "language_loss": 1.04435229, + "learning_rate": 0.0009996621770237937, + "loss": 1.05617261, + "num_input_tokens_seen": 16573744, + "router_z_loss_mlp": 1.54882812, + "step": 215, + "time_per_iteration": 2.7773804664611816 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01182535, + "balance_loss_mlp": 1.02775347, + "epoch": 0.041554444016929586, + "flos": 612701816832.0, + "grad_norm": 0.028683660550217302, + "language_loss": 1.00582075, + "learning_rate": 0.0009996506296949073, + "loss": 1.01764607, + "num_input_tokens_seen": 16655344, + "router_z_loss_mlp": 1.54589844, + "step": 216, + "time_per_iteration": 2.877587080001831 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01180569, + "balance_loss_mlp": 1.02607429, + "epoch": 0.04174682570219315, + "flos": 529151393280.0, + "grad_norm": 0.031901868987761664, + "language_loss": 1.00452459, + "learning_rate": 0.0009996388883844428, + "loss": 1.01633024, + "num_input_tokens_seen": 16726480, + "router_z_loss_mlp": 1.54296875, + "step": 217, + "time_per_iteration": 2.6346311569213867 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01173664, + "balance_loss_mlp": 1.02002692, + "epoch": 0.04193920738745671, + "flos": 512500799232.0, + "grad_norm": 0.02715845750356807, + "language_loss": 1.03465486, + "learning_rate": 0.0009996269530969588, + "loss": 1.04639161, + "num_input_tokens_seen": 16792112, + "router_z_loss_mlp": 1.53417969, + "step": 218, + "time_per_iteration": 2.6205921173095703 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01170474, + "balance_loss_mlp": 1.0176959, + "epoch": 0.04213158907272028, + "flos": 572553366528.0, + "grad_norm": 0.03606301207395498, + "language_loss": 1.04169452, + "learning_rate": 0.0009996148238370888, + "loss": 1.05339921, + "num_input_tokens_seen": 16862960, + "router_z_loss_mlp": 1.52539062, + "step": 219, + "time_per_iteration": 2.8047173023223877 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01169557, + "balance_loss_mlp": 1.01725543, + "epoch": 0.04232397075798384, + "flos": 965905552896.0, + "grad_norm": 0.026524392964530758, + "language_loss": 0.99111861, + "learning_rate": 0.0009996025006095421, + "loss": 1.00281417, + "num_input_tokens_seen": 16950416, + "router_z_loss_mlp": 1.52050781, + "step": 220, + "time_per_iteration": 3.315859317779541 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147995, + "balance_loss_mlp": 0.99693298, + "epoch": 0.042516352443247404, + "flos": 1472733340416.0, + "grad_norm": 0.01509407607306266, + "language_loss": 0.77783144, + "learning_rate": 0.0009995899834191028, + "loss": 0.78931135, + "num_input_tokens_seen": 17180944, + "router_z_loss_mlp": 1.5078125, + "step": 221, + "time_per_iteration": 5.540910243988037 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01166927, + "balance_loss_mlp": 1.0164367, + "epoch": 0.042708734128510964, + "flos": 655892852736.0, + "grad_norm": 0.029367950869880366, + "language_loss": 0.99126619, + "learning_rate": 0.0009995772722706307, + "loss": 1.00293541, + "num_input_tokens_seen": 17257792, + "router_z_loss_mlp": 1.50195312, + "step": 222, + "time_per_iteration": 2.901489019393921 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01167445, + "balance_loss_mlp": 1.01705015, + "epoch": 0.04290111581377453, + "flos": 432734643456.0, + "grad_norm": 0.04040999725558835, + "language_loss": 1.13508129, + "learning_rate": 0.0009995643671690604, + "loss": 1.1467557, + "num_input_tokens_seen": 17320288, + "router_z_loss_mlp": 1.50097656, + "step": 223, + "time_per_iteration": 2.5576720237731934 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01168153, + "balance_loss_mlp": 1.01823533, + "epoch": 0.04309349749903809, + "flos": 645867889920.0, + "grad_norm": 0.02824445481068148, + "language_loss": 1.00763512, + "learning_rate": 0.0009995512681194023, + "loss": 1.01931667, + "num_input_tokens_seen": 17396672, + "router_z_loss_mlp": 1.49609375, + "step": 224, + "time_per_iteration": 2.9571568965911865 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01167559, + "balance_loss_mlp": 1.01840472, + "epoch": 0.04328587918430166, + "flos": 832897153536.0, + "grad_norm": 0.025764365733734692, + "language_loss": 0.98235118, + "learning_rate": 0.0009995379751267417, + "loss": 0.99402678, + "num_input_tokens_seen": 17488096, + "router_z_loss_mlp": 1.48828125, + "step": 225, + "time_per_iteration": 3.2627484798431396 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01166832, + "balance_loss_mlp": 1.01824963, + "epoch": 0.043478260869565216, + "flos": 526116589056.0, + "grad_norm": 0.03531387708455554, + "language_loss": 1.00006318, + "learning_rate": 0.0009995244881962398, + "loss": 1.01173151, + "num_input_tokens_seen": 17557632, + "router_z_loss_mlp": 1.48242188, + "step": 226, + "time_per_iteration": 2.624209403991699 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01170136, + "balance_loss_mlp": 1.02212548, + "epoch": 0.04367064255482878, + "flos": 440413027584.0, + "grad_norm": 0.039279482080902435, + "language_loss": 1.01293874, + "learning_rate": 0.0009995108073331323, + "loss": 1.02464008, + "num_input_tokens_seen": 17626672, + "router_z_loss_mlp": 1.4765625, + "step": 227, + "time_per_iteration": 2.6042520999908447 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01164096, + "balance_loss_mlp": 1.01742136, + "epoch": 0.04386302424009234, + "flos": 508467677184.0, + "grad_norm": 0.03801127181345805, + "language_loss": 1.03535032, + "learning_rate": 0.0009994969325427309, + "loss": 1.04699123, + "num_input_tokens_seen": 17698624, + "router_z_loss_mlp": 1.46582031, + "step": 228, + "time_per_iteration": 2.6691603660583496 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01163607, + "balance_loss_mlp": 1.01769507, + "epoch": 0.04405540592535591, + "flos": 541744565760.0, + "grad_norm": 0.03512041362752814, + "language_loss": 1.00143218, + "learning_rate": 0.0009994828638304218, + "loss": 1.0130682, + "num_input_tokens_seen": 17767760, + "router_z_loss_mlp": 1.46191406, + "step": 229, + "time_per_iteration": 2.627833366394043 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01164617, + "balance_loss_mlp": 1.01927722, + "epoch": 0.04424778761061947, + "flos": 447309867264.0, + "grad_norm": 0.03576658395893793, + "language_loss": 1.06260157, + "learning_rate": 0.0009994686012016675, + "loss": 1.07424784, + "num_input_tokens_seen": 17833664, + "router_z_loss_mlp": 1.45703125, + "step": 230, + "time_per_iteration": 2.515491247177124 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01159156, + "balance_loss_mlp": 1.01448417, + "epoch": 0.044440169295883035, + "flos": 701982599424.0, + "grad_norm": 0.03592315304636455, + "language_loss": 1.05298328, + "learning_rate": 0.000999454144662005, + "loss": 1.06457496, + "num_input_tokens_seen": 17908880, + "router_z_loss_mlp": 1.45019531, + "step": 231, + "time_per_iteration": 2.918896436691284 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01156937, + "balance_loss_mlp": 1.01274192, + "epoch": 0.044632550981146595, + "flos": 589427536896.0, + "grad_norm": 0.032106980286660924, + "language_loss": 0.996499, + "learning_rate": 0.0009994394942170468, + "loss": 1.00806844, + "num_input_tokens_seen": 17978208, + "router_z_loss_mlp": 1.4453125, + "step": 232, + "time_per_iteration": 2.700378179550171 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01169343, + "balance_loss_mlp": 1.02524316, + "epoch": 0.04482493266641016, + "flos": 555855140352.0, + "grad_norm": 0.03061962333593277, + "language_loss": 0.97402102, + "learning_rate": 0.0009994246498724808, + "loss": 0.9857145, + "num_input_tokens_seen": 18049296, + "router_z_loss_mlp": 1.44433594, + "step": 233, + "time_per_iteration": 2.692657232284546 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01171534, + "balance_loss_mlp": 1.02848291, + "epoch": 0.04501731435167372, + "flos": 724070956800.0, + "grad_norm": 0.03598428268947968, + "language_loss": 1.00358808, + "learning_rate": 0.00099940961163407, + "loss": 1.01530337, + "num_input_tokens_seen": 18123296, + "router_z_loss_mlp": 1.43359375, + "step": 234, + "time_per_iteration": 2.8496198654174805 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01167121, + "balance_loss_mlp": 1.02473748, + "epoch": 0.04520969603693728, + "flos": 512798252544.0, + "grad_norm": 0.03236637347420306, + "language_loss": 1.0231185, + "learning_rate": 0.0009993943795076528, + "loss": 1.03478956, + "num_input_tokens_seen": 18192784, + "router_z_loss_mlp": 1.42675781, + "step": 235, + "time_per_iteration": 2.6304001808166504 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01157951, + "balance_loss_mlp": 1.01623452, + "epoch": 0.04540207772220085, + "flos": 365878555392.0, + "grad_norm": 0.04557463461025321, + "language_loss": 1.04854226, + "learning_rate": 0.0009993789534991427, + "loss": 1.06012177, + "num_input_tokens_seen": 18254064, + "router_z_loss_mlp": 1.41992188, + "step": 236, + "time_per_iteration": 2.500347852706909 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01156422, + "balance_loss_mlp": 1.01613641, + "epoch": 0.045594459407464406, + "flos": 523724323584.0, + "grad_norm": 0.028810086143122388, + "language_loss": 0.99360317, + "learning_rate": 0.0009993633336145287, + "loss": 1.00516737, + "num_input_tokens_seen": 18325728, + "router_z_loss_mlp": 1.40527344, + "step": 237, + "time_per_iteration": 2.6991968154907227 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01156358, + "balance_loss_mlp": 1.01664495, + "epoch": 0.04578684109272797, + "flos": 673116966144.0, + "grad_norm": 0.036851747197037266, + "language_loss": 1.03695393, + "learning_rate": 0.0009993475198598752, + "loss": 1.04851758, + "num_input_tokens_seen": 18408608, + "router_z_loss_mlp": 1.39941406, + "step": 238, + "time_per_iteration": 3.0150160789489746 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01160083, + "balance_loss_mlp": 1.02084696, + "epoch": 0.04597922277799153, + "flos": 542621374464.0, + "grad_norm": 0.03967898438127139, + "language_loss": 1.00323462, + "learning_rate": 0.0009993315122413212, + "loss": 1.01483548, + "num_input_tokens_seen": 18471920, + "router_z_loss_mlp": 1.39453125, + "step": 239, + "time_per_iteration": 2.6226179599761963 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0115528, + "balance_loss_mlp": 1.01690221, + "epoch": 0.0461716044632551, + "flos": 459994413312.0, + "grad_norm": 0.029756199222484733, + "language_loss": 1.00536144, + "learning_rate": 0.0009993153107650818, + "loss": 1.01691425, + "num_input_tokens_seen": 18540496, + "router_z_loss_mlp": 1.38574219, + "step": 240, + "time_per_iteration": 2.635673999786377 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01154071, + "balance_loss_mlp": 1.01607406, + "epoch": 0.04636398614851866, + "flos": 456171261696.0, + "grad_norm": 0.03103837756937707, + "language_loss": 0.99882519, + "learning_rate": 0.0009992989154374468, + "loss": 1.01036584, + "num_input_tokens_seen": 18606944, + "router_z_loss_mlp": 1.38183594, + "step": 241, + "time_per_iteration": 2.5449135303497314 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0115588, + "balance_loss_mlp": 1.01836014, + "epoch": 0.046556367833782225, + "flos": 557902320384.0, + "grad_norm": 0.06487144756994469, + "language_loss": 1.0686537, + "learning_rate": 0.0009992823262647817, + "loss": 1.08021247, + "num_input_tokens_seen": 18679520, + "router_z_loss_mlp": 1.37695312, + "step": 242, + "time_per_iteration": 2.705120325088501 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011561, + "balance_loss_mlp": 1.01905739, + "epoch": 0.046748749519045785, + "flos": 594088613376.0, + "grad_norm": 0.03633512017688626, + "language_loss": 1.00915635, + "learning_rate": 0.0009992655432535264, + "loss": 1.02071738, + "num_input_tokens_seen": 18756656, + "router_z_loss_mlp": 1.37207031, + "step": 243, + "time_per_iteration": 2.8158721923828125 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01160044, + "balance_loss_mlp": 1.02347767, + "epoch": 0.04694113120430935, + "flos": 570942645504.0, + "grad_norm": 0.036353271768507285, + "language_loss": 1.01172018, + "learning_rate": 0.0009992485664101973, + "loss": 1.02332067, + "num_input_tokens_seen": 18829792, + "router_z_loss_mlp": 1.3671875, + "step": 244, + "time_per_iteration": 2.723409414291382 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01156505, + "balance_loss_mlp": 1.0207969, + "epoch": 0.04713351288957291, + "flos": 865246689024.0, + "grad_norm": 0.05316255083066814, + "language_loss": 1.03417325, + "learning_rate": 0.000999231395741385, + "loss": 1.04573822, + "num_input_tokens_seen": 18906864, + "router_z_loss_mlp": 1.35839844, + "step": 245, + "time_per_iteration": 3.1441562175750732 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01155254, + "balance_loss_mlp": 1.02011812, + "epoch": 0.04732589457483648, + "flos": 538236364032.0, + "grad_norm": 0.039550829703112036, + "language_loss": 1.01375949, + "learning_rate": 0.0009992140312537557, + "loss": 1.02531195, + "num_input_tokens_seen": 18973632, + "router_z_loss_mlp": 1.35253906, + "step": 246, + "time_per_iteration": 2.6407320499420166 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01158298, + "balance_loss_mlp": 1.02402055, + "epoch": 0.04751827626010004, + "flos": 763272612096.0, + "grad_norm": 0.029332271702031103, + "language_loss": 0.96132767, + "learning_rate": 0.000999196472954051, + "loss": 0.97291064, + "num_input_tokens_seen": 19052944, + "router_z_loss_mlp": 1.34375, + "step": 247, + "time_per_iteration": 2.9791386127471924 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0115741, + "balance_loss_mlp": 1.02313232, + "epoch": 0.0477106579453636, + "flos": 1583128462080.0, + "grad_norm": 0.019406803026512872, + "language_loss": 0.79424852, + "learning_rate": 0.0009991787208490878, + "loss": 0.80582267, + "num_input_tokens_seen": 19286288, + "router_z_loss_mlp": 1.34375, + "step": 248, + "time_per_iteration": 5.547277927398682 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0115733, + "balance_loss_mlp": 1.02457833, + "epoch": 0.04790303963062716, + "flos": 458693784576.0, + "grad_norm": 0.04949407998464004, + "language_loss": 1.04053593, + "learning_rate": 0.0009991607749457578, + "loss": 1.05210924, + "num_input_tokens_seen": 19349296, + "router_z_loss_mlp": 1.328125, + "step": 249, + "time_per_iteration": 2.610372304916382 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01158188, + "balance_loss_mlp": 1.02629459, + "epoch": 0.04809542131589073, + "flos": 783787186944.0, + "grad_norm": 0.03428496832179458, + "language_loss": 1.01565814, + "learning_rate": 0.0009991426352510286, + "loss": 1.02723992, + "num_input_tokens_seen": 19428416, + "router_z_loss_mlp": 1.31933594, + "step": 250, + "time_per_iteration": 2.9723451137542725 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01158477, + "balance_loss_mlp": 1.0272516, + "epoch": 0.04828780300115429, + "flos": 560322776064.0, + "grad_norm": 0.03370153589925739, + "language_loss": 1.02967048, + "learning_rate": 0.0009991243017719422, + "loss": 1.04125512, + "num_input_tokens_seen": 19498688, + "router_z_loss_mlp": 1.3125, + "step": 251, + "time_per_iteration": 2.691317319869995 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0115263, + "balance_loss_mlp": 1.02149975, + "epoch": 0.048480184686417856, + "flos": 502922989056.0, + "grad_norm": 0.033537523086657674, + "language_loss": 0.98110956, + "learning_rate": 0.0009991057745156165, + "loss": 0.99263585, + "num_input_tokens_seen": 19567568, + "router_z_loss_mlp": 1.31152344, + "step": 252, + "time_per_iteration": 2.615726947784424 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01126877, + "balance_loss_mlp": 0.99641418, + "epoch": 0.048672566371681415, + "flos": 1539471810048.0, + "grad_norm": 0.00943295316075806, + "language_loss": 0.81910986, + "learning_rate": 0.0009990870534892446, + "loss": 0.83037865, + "num_input_tokens_seen": 19796368, + "router_z_loss_mlp": 1.3046875, + "step": 253, + "time_per_iteration": 5.119662523269653 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01155145, + "balance_loss_mlp": 1.02439594, + "epoch": 0.04886494805694498, + "flos": 538952779776.0, + "grad_norm": 0.04101934284448647, + "language_loss": 1.06555986, + "learning_rate": 0.0009990681387000943, + "loss": 1.07711136, + "num_input_tokens_seen": 19870480, + "router_z_loss_mlp": 1.30761719, + "step": 254, + "time_per_iteration": 2.7494144439697266 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153346, + "balance_loss_mlp": 1.02316916, + "epoch": 0.04905732974220854, + "flos": 681485521152.0, + "grad_norm": 0.029284228955777224, + "language_loss": 1.01195645, + "learning_rate": 0.0009990490301555093, + "loss": 1.02348995, + "num_input_tokens_seen": 19956288, + "router_z_loss_mlp": 1.30175781, + "step": 255, + "time_per_iteration": 2.9595844745635986 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113356, + "balance_loss_mlp": 1.00462341, + "epoch": 0.04924971142747211, + "flos": 1424277573120.0, + "grad_norm": 0.011666997955433429, + "language_loss": 0.79215157, + "learning_rate": 0.0009990297278629078, + "loss": 0.80348712, + "num_input_tokens_seen": 20180080, + "router_z_loss_mlp": 1.2890625, + "step": 256, + "time_per_iteration": 4.918023347854614 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01126785, + "balance_loss_mlp": 0.99822998, + "epoch": 0.04944209311273567, + "flos": 1561239381504.0, + "grad_norm": 0.006197531934497474, + "language_loss": 0.79242742, + "learning_rate": 0.000999010231829784, + "loss": 0.80369532, + "num_input_tokens_seen": 20413456, + "router_z_loss_mlp": 1.28515625, + "step": 257, + "time_per_iteration": 4.996341228485107 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01127556, + "balance_loss_mlp": 0.99976349, + "epoch": 0.04963447479799923, + "flos": 1574173748736.0, + "grad_norm": 0.01126324229515774, + "language_loss": 0.69975883, + "learning_rate": 0.0009989905420637066, + "loss": 0.71103442, + "num_input_tokens_seen": 20644736, + "router_z_loss_mlp": 1.27734375, + "step": 258, + "time_per_iteration": 4.951507329940796 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01167552, + "balance_loss_mlp": 1.03966403, + "epoch": 0.049826856483262794, + "flos": 626499386880.0, + "grad_norm": 0.07394024090910019, + "language_loss": 0.96613419, + "learning_rate": 0.0009989706585723202, + "loss": 0.97780967, + "num_input_tokens_seen": 20719040, + "router_z_loss_mlp": 1.27832031, + "step": 259, + "time_per_iteration": 2.819796085357666 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01158437, + "balance_loss_mlp": 1.03073978, + "epoch": 0.05001923816852635, + "flos": 505156806912.0, + "grad_norm": 0.042054435700702504, + "language_loss": 1.02184892, + "learning_rate": 0.0009989505813633442, + "loss": 1.0334332, + "num_input_tokens_seen": 20789376, + "router_z_loss_mlp": 1.27636719, + "step": 260, + "time_per_iteration": 2.671597719192505 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149384, + "balance_loss_mlp": 1.02206886, + "epoch": 0.05021161985378992, + "flos": 588468102912.0, + "grad_norm": 0.05343186989039486, + "language_loss": 1.02308297, + "learning_rate": 0.000998930310444573, + "loss": 1.03457689, + "num_input_tokens_seen": 20857856, + "router_z_loss_mlp": 1.27246094, + "step": 261, + "time_per_iteration": 2.7573728561401367 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145576, + "balance_loss_mlp": 1.01883233, + "epoch": 0.05040400153905348, + "flos": 634403292672.0, + "grad_norm": 0.052960623500171895, + "language_loss": 1.00806391, + "learning_rate": 0.0009989098458238765, + "loss": 1.01951981, + "num_input_tokens_seen": 20931232, + "router_z_loss_mlp": 1.26660156, + "step": 262, + "time_per_iteration": 2.7937912940979004 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146699, + "balance_loss_mlp": 1.02033675, + "epoch": 0.050596383224317046, + "flos": 554809190400.0, + "grad_norm": 0.04531187332347281, + "language_loss": 0.99888676, + "learning_rate": 0.0009988891875091998, + "loss": 1.0103538, + "num_input_tokens_seen": 21012672, + "router_z_loss_mlp": 1.26269531, + "step": 263, + "time_per_iteration": 2.811218500137329 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145189, + "balance_loss_mlp": 1.01939976, + "epoch": 0.050788764909580605, + "flos": 550762462464.0, + "grad_norm": 0.03965392167411722, + "language_loss": 0.94696999, + "learning_rate": 0.0009988683355085636, + "loss": 0.95842183, + "num_input_tokens_seen": 21088592, + "router_z_loss_mlp": 1.25683594, + "step": 264, + "time_per_iteration": 2.7378242015838623 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141586, + "balance_loss_mlp": 1.01617777, + "epoch": 0.05098114659484417, + "flos": 606345448704.0, + "grad_norm": 0.024717188615823983, + "language_loss": 1.02827787, + "learning_rate": 0.000998847289830063, + "loss": 1.03969371, + "num_input_tokens_seen": 21169840, + "router_z_loss_mlp": 1.25292969, + "step": 265, + "time_per_iteration": 2.8625917434692383 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142152, + "balance_loss_mlp": 1.01693416, + "epoch": 0.05117352828010773, + "flos": 439473035520.0, + "grad_norm": 0.036783183293041616, + "language_loss": 0.96527213, + "learning_rate": 0.0009988260504818682, + "loss": 0.97669363, + "num_input_tokens_seen": 21236144, + "router_z_loss_mlp": 1.25097656, + "step": 266, + "time_per_iteration": 2.5658230781555176 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138539, + "balance_loss_mlp": 1.0135119, + "epoch": 0.0513659099653713, + "flos": 506031670272.0, + "grad_norm": 0.04116504124695153, + "language_loss": 1.03285778, + "learning_rate": 0.000998804617472226, + "loss": 1.0442431, + "num_input_tokens_seen": 21304864, + "router_z_loss_mlp": 1.24902344, + "step": 267, + "time_per_iteration": 2.63395094871521 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138256, + "balance_loss_mlp": 1.01418352, + "epoch": 0.05155829165063486, + "flos": 696715922688.0, + "grad_norm": 0.034853618125567455, + "language_loss": 0.98327756, + "learning_rate": 0.0009987829908094568, + "loss": 0.9946602, + "num_input_tokens_seen": 21377504, + "router_z_loss_mlp": 1.23925781, + "step": 268, + "time_per_iteration": 2.8239262104034424 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136912, + "balance_loss_mlp": 1.01331627, + "epoch": 0.051750673335898424, + "flos": 1350302059008.0, + "grad_norm": 0.042488112993129025, + "language_loss": 1.04893267, + "learning_rate": 0.0009987611705019569, + "loss": 1.0603019, + "num_input_tokens_seen": 21463840, + "router_z_loss_mlp": 1.234375, + "step": 269, + "time_per_iteration": 4.33854079246521 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137181, + "balance_loss_mlp": 1.01387095, + "epoch": 0.051943055021161984, + "flos": 490590331392.0, + "grad_norm": 0.037116049987967636, + "language_loss": 1.03026497, + "learning_rate": 0.0009987391565581978, + "loss": 1.04163671, + "num_input_tokens_seen": 21531184, + "router_z_loss_mlp": 1.23144531, + "step": 270, + "time_per_iteration": 2.609722852706909 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136969, + "balance_loss_mlp": 1.01365864, + "epoch": 0.05213543670642555, + "flos": 546880985088.0, + "grad_norm": 0.03927026934880779, + "language_loss": 0.95517516, + "learning_rate": 0.000998716948986726, + "loss": 0.96654487, + "num_input_tokens_seen": 21612224, + "router_z_loss_mlp": 1.23144531, + "step": 271, + "time_per_iteration": 2.797673225402832 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137765, + "balance_loss_mlp": 1.01512277, + "epoch": 0.05232781839168911, + "flos": 604673489664.0, + "grad_norm": 0.04118655717732696, + "language_loss": 0.97937191, + "learning_rate": 0.0009986945477961633, + "loss": 0.9907496, + "num_input_tokens_seen": 21681024, + "router_z_loss_mlp": 1.22460938, + "step": 272, + "time_per_iteration": 2.6988775730133057 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135438, + "balance_loss_mlp": 1.01336777, + "epoch": 0.052520200076952676, + "flos": 539656556544.0, + "grad_norm": 0.027940819886650203, + "language_loss": 1.02222085, + "learning_rate": 0.0009986719529952066, + "loss": 1.0335753, + "num_input_tokens_seen": 21761616, + "router_z_loss_mlp": 1.21875, + "step": 273, + "time_per_iteration": 2.9503016471862793 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01133251, + "balance_loss_mlp": 1.01175284, + "epoch": 0.052712581762216236, + "flos": 464333736960.0, + "grad_norm": 0.036678205813438995, + "language_loss": 1.02377117, + "learning_rate": 0.000998649164592628, + "loss": 1.0351038, + "num_input_tokens_seen": 21828416, + "router_z_loss_mlp": 1.21289062, + "step": 274, + "time_per_iteration": 2.575183868408203 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134193, + "balance_loss_mlp": 1.01279056, + "epoch": 0.0529049634474798, + "flos": 549106054656.0, + "grad_norm": 0.029580362230619023, + "language_loss": 1.00386071, + "learning_rate": 0.0009986261825972748, + "loss": 1.01520276, + "num_input_tokens_seen": 21901600, + "router_z_loss_mlp": 1.21191406, + "step": 275, + "time_per_iteration": 2.781388521194458 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136428, + "balance_loss_mlp": 1.01578796, + "epoch": 0.05309734513274336, + "flos": 619201081344.0, + "grad_norm": 0.028327187192750843, + "language_loss": 1.01742268, + "learning_rate": 0.000998603007018069, + "loss": 1.0287869, + "num_input_tokens_seen": 21979312, + "router_z_loss_mlp": 1.20410156, + "step": 276, + "time_per_iteration": 2.8231008052825928 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137197, + "balance_loss_mlp": 1.01665294, + "epoch": 0.05328972681800693, + "flos": 606618602496.0, + "grad_norm": 0.02408735734832513, + "language_loss": 1.00149679, + "learning_rate": 0.0009985796378640089, + "loss": 1.01286888, + "num_input_tokens_seen": 22053776, + "router_z_loss_mlp": 1.203125, + "step": 277, + "time_per_iteration": 2.721719264984131 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136169, + "balance_loss_mlp": 1.01610124, + "epoch": 0.05348210850327049, + "flos": 605731100160.0, + "grad_norm": 0.0319931943489141, + "language_loss": 0.99697894, + "learning_rate": 0.0009985560751441665, + "loss": 1.0083406, + "num_input_tokens_seen": 22134304, + "router_z_loss_mlp": 1.19824219, + "step": 278, + "time_per_iteration": 2.835160255432129 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01133809, + "balance_loss_mlp": 1.01412332, + "epoch": 0.053674490188534055, + "flos": 631998388224.0, + "grad_norm": 0.030840524384760076, + "language_loss": 1.0228467, + "learning_rate": 0.00099853231886769, + "loss": 1.03418469, + "num_input_tokens_seen": 22212896, + "router_z_loss_mlp": 1.19433594, + "step": 279, + "time_per_iteration": 2.8541102409362793 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01131641, + "balance_loss_mlp": 1.01243138, + "epoch": 0.053866871873797614, + "flos": 480174596352.0, + "grad_norm": 0.030057370429500904, + "language_loss": 1.01521945, + "learning_rate": 0.0009985083690438024, + "loss": 1.02653599, + "num_input_tokens_seen": 22287216, + "router_z_loss_mlp": 1.18945312, + "step": 280, + "time_per_iteration": 2.778996706008911 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01133594, + "balance_loss_mlp": 1.01514757, + "epoch": 0.054059253559061174, + "flos": 789490322688.0, + "grad_norm": 0.030570218765999514, + "language_loss": 0.92515564, + "learning_rate": 0.0009984842256818016, + "loss": 0.93649161, + "num_input_tokens_seen": 22370864, + "router_z_loss_mlp": 1.18164062, + "step": 281, + "time_per_iteration": 3.113694429397583 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137735, + "balance_loss_mlp": 1.01928854, + "epoch": 0.05425163524432474, + "flos": 629506000896.0, + "grad_norm": 0.043548376252248826, + "language_loss": 1.03102541, + "learning_rate": 0.0009984598887910613, + "loss": 1.04240274, + "num_input_tokens_seen": 22440080, + "router_z_loss_mlp": 1.18164062, + "step": 282, + "time_per_iteration": 2.8303444385528564 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01132534, + "balance_loss_mlp": 1.01504183, + "epoch": 0.0544440169295883, + "flos": 616993508352.0, + "grad_norm": 0.05077708884656826, + "language_loss": 0.98823464, + "learning_rate": 0.0009984353583810297, + "loss": 0.99956, + "num_input_tokens_seen": 22517936, + "router_z_loss_mlp": 1.171875, + "step": 283, + "time_per_iteration": 2.835850954055786 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01129981, + "balance_loss_mlp": 1.01315546, + "epoch": 0.05463639861485187, + "flos": 648930884352.0, + "grad_norm": 0.03524270200319673, + "language_loss": 1.0117259, + "learning_rate": 0.0009984106344612302, + "loss": 1.02302563, + "num_input_tokens_seen": 22590480, + "router_z_loss_mlp": 1.16503906, + "step": 284, + "time_per_iteration": 2.760528564453125 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01129453, + "balance_loss_mlp": 1.01319993, + "epoch": 0.054828780300115426, + "flos": 798585987072.0, + "grad_norm": 0.03078454247465455, + "language_loss": 0.96210134, + "learning_rate": 0.0009983857170412615, + "loss": 0.97339588, + "num_input_tokens_seen": 22668144, + "router_z_loss_mlp": 1.15917969, + "step": 285, + "time_per_iteration": 2.9911587238311768 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01131741, + "balance_loss_mlp": 1.01567924, + "epoch": 0.05502116198537899, + "flos": 550799400960.0, + "grad_norm": 0.028192528419898312, + "language_loss": 0.95645988, + "learning_rate": 0.000998360606130798, + "loss": 0.96777725, + "num_input_tokens_seen": 22749648, + "router_z_loss_mlp": 1.15722656, + "step": 286, + "time_per_iteration": 2.8603405952453613 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01119957, + "balance_loss_mlp": 1.00475311, + "epoch": 0.05521354367064255, + "flos": 1410909659136.0, + "grad_norm": 0.016802553847575376, + "language_loss": 0.69073117, + "learning_rate": 0.0009983353017395877, + "loss": 0.70193076, + "num_input_tokens_seen": 22982752, + "router_z_loss_mlp": 1.1484375, + "step": 287, + "time_per_iteration": 4.872994899749756 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139216, + "balance_loss_mlp": 1.02372622, + "epoch": 0.05540592535590612, + "flos": 646612495872.0, + "grad_norm": 0.03160477576624613, + "language_loss": 1.01500821, + "learning_rate": 0.0009983098038774552, + "loss": 1.02640033, + "num_input_tokens_seen": 23053584, + "router_z_loss_mlp": 1.15136719, + "step": 288, + "time_per_iteration": 2.7645044326782227 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01119652, + "balance_loss_mlp": 1.00521088, + "epoch": 0.05559830704116968, + "flos": 1514318512896.0, + "grad_norm": 0.011772143096286682, + "language_loss": 0.78170228, + "learning_rate": 0.0009982841125542993, + "loss": 0.79289877, + "num_input_tokens_seen": 23280256, + "router_z_loss_mlp": 1.140625, + "step": 289, + "time_per_iteration": 4.783201456069946 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150059, + "balance_loss_mlp": 1.03542745, + "epoch": 0.055790688726433245, + "flos": 509335737600.0, + "grad_norm": 0.037615798403722346, + "language_loss": 1.00063777, + "learning_rate": 0.0009982582277800948, + "loss": 1.01213825, + "num_input_tokens_seen": 23345760, + "router_z_loss_mlp": 1.14257812, + "step": 290, + "time_per_iteration": 2.5825588703155518 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142418, + "balance_loss_mlp": 1.02873969, + "epoch": 0.055983070411696804, + "flos": 659075410944.0, + "grad_norm": 0.03490310528255379, + "language_loss": 1.06654799, + "learning_rate": 0.0009982321495648908, + "loss": 1.07797217, + "num_input_tokens_seen": 23420720, + "router_z_loss_mlp": 1.13671875, + "step": 291, + "time_per_iteration": 2.8099231719970703 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137522, + "balance_loss_mlp": 1.02470279, + "epoch": 0.05617545209696037, + "flos": 588476851200.0, + "grad_norm": 0.035465642673631545, + "language_loss": 0.97683877, + "learning_rate": 0.0009982058779188115, + "loss": 0.98821402, + "num_input_tokens_seen": 23492576, + "router_z_loss_mlp": 1.13183594, + "step": 292, + "time_per_iteration": 2.7125580310821533 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136096, + "balance_loss_mlp": 1.02384841, + "epoch": 0.05636783378222393, + "flos": 612788332800.0, + "grad_norm": 0.032210362870472055, + "language_loss": 1.05647731, + "learning_rate": 0.0009981794128520567, + "loss": 1.06783831, + "num_input_tokens_seen": 23569824, + "router_z_loss_mlp": 1.12597656, + "step": 293, + "time_per_iteration": 2.7916390895843506 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135514, + "balance_loss_mlp": 1.0241251, + "epoch": 0.0565602154674875, + "flos": 669424071936.0, + "grad_norm": 0.03595229916115603, + "language_loss": 1.02550793, + "learning_rate": 0.000998152754374901, + "loss": 1.03686309, + "num_input_tokens_seen": 23649984, + "router_z_loss_mlp": 1.1171875, + "step": 294, + "time_per_iteration": 2.8770558834075928 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134115, + "balance_loss_mlp": 1.0227263, + "epoch": 0.05675259715275106, + "flos": 618365101824.0, + "grad_norm": 0.028486588423889302, + "language_loss": 0.98274708, + "learning_rate": 0.0009981259024976943, + "loss": 0.99408829, + "num_input_tokens_seen": 23722032, + "router_z_loss_mlp": 1.1171875, + "step": 295, + "time_per_iteration": 2.729853630065918 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01133246, + "balance_loss_mlp": 1.02204788, + "epoch": 0.05694497883801462, + "flos": 753154330368.0, + "grad_norm": 0.04188437456637708, + "language_loss": 0.968624, + "learning_rate": 0.0009980988572308612, + "loss": 0.97995651, + "num_input_tokens_seen": 23797376, + "router_z_loss_mlp": 1.11523438, + "step": 296, + "time_per_iteration": 3.0135345458984375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01132496, + "balance_loss_mlp": 1.02187026, + "epoch": 0.05713736052327818, + "flos": 713382067968.0, + "grad_norm": 0.0305883196599643, + "language_loss": 0.9903996, + "learning_rate": 0.0009980716185849015, + "loss": 1.0017246, + "num_input_tokens_seen": 23880496, + "router_z_loss_mlp": 1.109375, + "step": 297, + "time_per_iteration": 2.9962668418884277 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01129278, + "balance_loss_mlp": 1.01865172, + "epoch": 0.05732974220854175, + "flos": 469936750848.0, + "grad_norm": 0.029025981508343963, + "language_loss": 0.95620793, + "learning_rate": 0.0009980441865703904, + "loss": 0.96750069, + "num_input_tokens_seen": 23950016, + "router_z_loss_mlp": 1.109375, + "step": 298, + "time_per_iteration": 2.67486572265625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01126421, + "balance_loss_mlp": 1.0163666, + "epoch": 0.05752212389380531, + "flos": 602541739008.0, + "grad_norm": 0.028406065642448373, + "language_loss": 1.04190016, + "learning_rate": 0.000998016561197978, + "loss": 1.05316436, + "num_input_tokens_seen": 24020064, + "router_z_loss_mlp": 1.10351562, + "step": 299, + "time_per_iteration": 2.7435965538024902 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01127499, + "balance_loss_mlp": 1.01773107, + "epoch": 0.057714505579068875, + "flos": 679950622464.0, + "grad_norm": 0.02999406165417261, + "language_loss": 0.957955, + "learning_rate": 0.0009979887424783895, + "loss": 0.96922994, + "num_input_tokens_seen": 24095360, + "router_z_loss_mlp": 1.10058594, + "step": 300, + "time_per_iteration": 2.868412494659424 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01127678, + "balance_loss_mlp": 1.01800561, + "epoch": 0.057906887264332435, + "flos": 597012602112.0, + "grad_norm": 0.033381964405594114, + "language_loss": 0.95279002, + "learning_rate": 0.0009979607304224248, + "loss": 0.96406674, + "num_input_tokens_seen": 24164608, + "router_z_loss_mlp": 1.09960938, + "step": 301, + "time_per_iteration": 2.7196099758148193 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01127179, + "balance_loss_mlp": 1.01760185, + "epoch": 0.058099268949596, + "flos": 553165421568.0, + "grad_norm": 0.029428698202492602, + "language_loss": 1.02305853, + "learning_rate": 0.000997932525040959, + "loss": 1.03433037, + "num_input_tokens_seen": 24233840, + "router_z_loss_mlp": 1.09863281, + "step": 302, + "time_per_iteration": 2.645131826400757 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01126073, + "balance_loss_mlp": 1.0166868, + "epoch": 0.05829165063485956, + "flos": 509231725056.0, + "grad_norm": 0.033454482596205204, + "language_loss": 1.04832363, + "learning_rate": 0.000997904126344943, + "loss": 1.05958426, + "num_input_tokens_seen": 24302928, + "router_z_loss_mlp": 1.09667969, + "step": 303, + "time_per_iteration": 2.60955810546875 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125584, + "balance_loss_mlp": 1.0157212, + "epoch": 0.05848403232012313, + "flos": 616363608576.0, + "grad_norm": 0.0319979050325151, + "language_loss": 1.00779867, + "learning_rate": 0.0009978755343454018, + "loss": 1.01905453, + "num_input_tokens_seen": 24377024, + "router_z_loss_mlp": 1.1015625, + "step": 304, + "time_per_iteration": 2.733825206756592 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124254, + "balance_loss_mlp": 1.01467645, + "epoch": 0.05867641400538669, + "flos": 501079943424.0, + "grad_norm": 0.03385536533959698, + "language_loss": 1.01509869, + "learning_rate": 0.0009978467490534355, + "loss": 1.0263412, + "num_input_tokens_seen": 24442736, + "router_z_loss_mlp": 1.09863281, + "step": 305, + "time_per_iteration": 2.6263206005096436 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121932, + "balance_loss_mlp": 1.01292717, + "epoch": 0.05886879569065025, + "flos": 532379638272.0, + "grad_norm": 0.03088897761094542, + "language_loss": 0.98605353, + "learning_rate": 0.00099781777048022, + "loss": 0.99727285, + "num_input_tokens_seen": 24514800, + "router_z_loss_mlp": 1.09277344, + "step": 306, + "time_per_iteration": 2.7351841926574707 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122107, + "balance_loss_mlp": 1.01329267, + "epoch": 0.05906117737591381, + "flos": 490041111552.0, + "grad_norm": 0.034758856969872284, + "language_loss": 0.99957371, + "learning_rate": 0.0009977885986370057, + "loss": 1.01079476, + "num_input_tokens_seen": 24581648, + "router_z_loss_mlp": 1.09082031, + "step": 307, + "time_per_iteration": 2.566316843032837 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01120423, + "balance_loss_mlp": 1.01199007, + "epoch": 0.05925355906117737, + "flos": 592710216960.0, + "grad_norm": 0.0408216139096099, + "language_loss": 0.95604599, + "learning_rate": 0.000997759233535118, + "loss": 0.96725023, + "num_input_tokens_seen": 24658864, + "router_z_loss_mlp": 1.08691406, + "step": 308, + "time_per_iteration": 2.781667470932007 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01119623, + "balance_loss_mlp": 1.01147592, + "epoch": 0.05944594074644094, + "flos": 564788466432.0, + "grad_norm": 0.03543125546238922, + "language_loss": 1.01945186, + "learning_rate": 0.0009977296751859576, + "loss": 1.03064811, + "num_input_tokens_seen": 24735808, + "router_z_loss_mlp": 1.08398438, + "step": 309, + "time_per_iteration": 2.778700828552246 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121487, + "balance_loss_mlp": 1.0137223, + "epoch": 0.0596383224317045, + "flos": 539808201216.0, + "grad_norm": 0.03208598270087784, + "language_loss": 1.03591859, + "learning_rate": 0.0009976999236009998, + "loss": 1.04713345, + "num_input_tokens_seen": 24807744, + "router_z_loss_mlp": 1.08007812, + "step": 310, + "time_per_iteration": 2.790116786956787 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121449, + "balance_loss_mlp": 1.01387453, + "epoch": 0.059830704116968066, + "flos": 562053060864.0, + "grad_norm": 0.03260901983169028, + "language_loss": 1.05564129, + "learning_rate": 0.0009976699787917955, + "loss": 1.06685579, + "num_input_tokens_seen": 24876640, + "router_z_loss_mlp": 1.078125, + "step": 311, + "time_per_iteration": 2.6586148738861084 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108932, + "balance_loss_mlp": 1.00326538, + "epoch": 0.060023085802231625, + "flos": 1574050294272.0, + "grad_norm": 0.018314702584398344, + "language_loss": 0.73442996, + "learning_rate": 0.00099763984076997, + "loss": 0.74551928, + "num_input_tokens_seen": 25110864, + "router_z_loss_mlp": 1.05859375, + "step": 312, + "time_per_iteration": 4.943182945251465 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01128012, + "balance_loss_mlp": 1.02101004, + "epoch": 0.06021546748749519, + "flos": 483628363008.0, + "grad_norm": 0.04396023920554742, + "language_loss": 0.97026515, + "learning_rate": 0.0009976095095472243, + "loss": 0.98154521, + "num_input_tokens_seen": 25179328, + "router_z_loss_mlp": 1.07226562, + "step": 313, + "time_per_iteration": 2.619016408920288 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01131165, + "balance_loss_mlp": 1.02425838, + "epoch": 0.06040784917275875, + "flos": 621424205568.0, + "grad_norm": 0.03687701456451143, + "language_loss": 0.97965562, + "learning_rate": 0.0009975789851353334, + "loss": 0.99096727, + "num_input_tokens_seen": 25254128, + "router_z_loss_mlp": 1.07128906, + "step": 314, + "time_per_iteration": 2.8331894874572754 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125758, + "balance_loss_mlp": 1.01980519, + "epoch": 0.06060023085802232, + "flos": 484603348224.0, + "grad_norm": 0.029408756794299912, + "language_loss": 1.00726843, + "learning_rate": 0.0009975482675461487, + "loss": 1.01852608, + "num_input_tokens_seen": 25324624, + "router_z_loss_mlp": 1.06152344, + "step": 315, + "time_per_iteration": 2.659079074859619 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125971, + "balance_loss_mlp": 1.02001762, + "epoch": 0.06079261254328588, + "flos": 582986598144.0, + "grad_norm": 0.027344501346145803, + "language_loss": 0.98408186, + "learning_rate": 0.0009975173567915952, + "loss": 0.99534154, + "num_input_tokens_seen": 25393648, + "router_z_loss_mlp": 1.06152344, + "step": 316, + "time_per_iteration": 2.6947872638702393 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01123736, + "balance_loss_mlp": 1.01873684, + "epoch": 0.060984994228549444, + "flos": 689009348352.0, + "grad_norm": 0.03553374767777348, + "language_loss": 0.92618632, + "learning_rate": 0.000997486252883674, + "loss": 0.93742371, + "num_input_tokens_seen": 25469152, + "router_z_loss_mlp": 1.05175781, + "step": 317, + "time_per_iteration": 2.8523428440093994 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01123139, + "balance_loss_mlp": 1.01861632, + "epoch": 0.061177375913813004, + "flos": 1316749104384.0, + "grad_norm": 0.03506621320439297, + "language_loss": 0.97693729, + "learning_rate": 0.0009974549558344602, + "loss": 0.98816866, + "num_input_tokens_seen": 25560944, + "router_z_loss_mlp": 1.046875, + "step": 318, + "time_per_iteration": 3.705524206161499 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121913, + "balance_loss_mlp": 1.01805806, + "epoch": 0.06136975759907657, + "flos": 575401532928.0, + "grad_norm": 0.03493031867187039, + "language_loss": 1.07333064, + "learning_rate": 0.000997423465656105, + "loss": 1.08454978, + "num_input_tokens_seen": 25631424, + "router_z_loss_mlp": 1.04003906, + "step": 319, + "time_per_iteration": 2.75838565826416 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01119126, + "balance_loss_mlp": 1.01546133, + "epoch": 0.06156213928434013, + "flos": 528565234944.0, + "grad_norm": 0.037170039701900144, + "language_loss": 1.04350638, + "learning_rate": 0.0009973917823608335, + "loss": 1.05469775, + "num_input_tokens_seen": 25698176, + "router_z_loss_mlp": 1.03808594, + "step": 320, + "time_per_iteration": 2.6494460105895996 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117676, + "balance_loss_mlp": 1.01458335, + "epoch": 0.061754520969603696, + "flos": 496590920448.0, + "grad_norm": 0.030464742512101767, + "language_loss": 0.98981547, + "learning_rate": 0.0009973599059609462, + "loss": 1.00099218, + "num_input_tokens_seen": 25773472, + "router_z_loss_mlp": 1.03222656, + "step": 321, + "time_per_iteration": 2.7119081020355225 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116635, + "balance_loss_mlp": 1.01344728, + "epoch": 0.061946902654867256, + "flos": 441044872704.0, + "grad_norm": 0.031106795532346753, + "language_loss": 0.97035432, + "learning_rate": 0.000997327836468819, + "loss": 0.98152065, + "num_input_tokens_seen": 25841088, + "router_z_loss_mlp": 1.03320312, + "step": 322, + "time_per_iteration": 2.641977071762085 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121262, + "balance_loss_mlp": 1.01836073, + "epoch": 0.06213928434013082, + "flos": 600043515648.0, + "grad_norm": 0.031546338171402045, + "language_loss": 1.00120687, + "learning_rate": 0.000997295573896902, + "loss": 1.01241946, + "num_input_tokens_seen": 25919424, + "router_z_loss_mlp": 1.03027344, + "step": 323, + "time_per_iteration": 2.825425624847412 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113502, + "balance_loss_mlp": 1.01126862, + "epoch": 0.06233166602539438, + "flos": 1453116961536.0, + "grad_norm": 0.009515746361157745, + "language_loss": 0.8119604, + "learning_rate": 0.000997263118257721, + "loss": 0.82309544, + "num_input_tokens_seen": 26135504, + "router_z_loss_mlp": 1.0234375, + "step": 324, + "time_per_iteration": 4.7325074672698975 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108894, + "balance_loss_mlp": 1.0074234, + "epoch": 0.06252404771065795, + "flos": 1466631651072.0, + "grad_norm": 0.010337204897298672, + "language_loss": 0.78571939, + "learning_rate": 0.0009972304695638763, + "loss": 0.79680836, + "num_input_tokens_seen": 26358880, + "router_z_loss_mlp": 1.015625, + "step": 325, + "time_per_iteration": 4.845058917999268 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01131262, + "balance_loss_mlp": 1.02950513, + "epoch": 0.06271642939592151, + "flos": 465236790528.0, + "grad_norm": 0.04479189972062717, + "language_loss": 0.94122899, + "learning_rate": 0.000997197627828043, + "loss": 0.95254159, + "num_input_tokens_seen": 26425888, + "router_z_loss_mlp": 1.01855469, + "step": 326, + "time_per_iteration": 2.531477689743042 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136139, + "balance_loss_mlp": 1.03466833, + "epoch": 0.06290881108118507, + "flos": 533432391168.0, + "grad_norm": 0.03210871152906133, + "language_loss": 0.89633012, + "learning_rate": 0.0009971645930629716, + "loss": 0.9076916, + "num_input_tokens_seen": 26500656, + "router_z_loss_mlp": 1.015625, + "step": 327, + "time_per_iteration": 2.766155481338501 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01131438, + "balance_loss_mlp": 1.0305388, + "epoch": 0.06310119276644863, + "flos": 674768516352.0, + "grad_norm": 0.03217671154768682, + "language_loss": 1.03418863, + "learning_rate": 0.0009971313652814872, + "loss": 1.0455029, + "num_input_tokens_seen": 26577408, + "router_z_loss_mlp": 1.00976562, + "step": 328, + "time_per_iteration": 2.818718433380127 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125209, + "balance_loss_mlp": 1.02440596, + "epoch": 0.0632935744517122, + "flos": 772051381248.0, + "grad_norm": 0.03902843256426295, + "language_loss": 1.00692391, + "learning_rate": 0.0009970979444964903, + "loss": 1.01817608, + "num_input_tokens_seen": 26652048, + "router_z_loss_mlp": 1.00878906, + "step": 329, + "time_per_iteration": 2.9847218990325928 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01119216, + "balance_loss_mlp": 1.01869905, + "epoch": 0.06348595613697576, + "flos": 562975556352.0, + "grad_norm": 0.040034835413812295, + "language_loss": 1.01797342, + "learning_rate": 0.0009970643307209556, + "loss": 1.02916563, + "num_input_tokens_seen": 26728192, + "router_z_loss_mlp": 1.00585938, + "step": 330, + "time_per_iteration": 2.817711353302002 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112644, + "balance_loss_mlp": 1.01250839, + "epoch": 0.06367833782223932, + "flos": 677384358144.0, + "grad_norm": 0.031424074947949916, + "language_loss": 0.98358697, + "learning_rate": 0.0009970305239679334, + "loss": 0.99471337, + "num_input_tokens_seen": 26798016, + "router_z_loss_mlp": 1.00195312, + "step": 331, + "time_per_iteration": 2.8216280937194824 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011128, + "balance_loss_mlp": 1.01247358, + "epoch": 0.06387071950750288, + "flos": 496349847552.0, + "grad_norm": 0.04016029313197435, + "language_loss": 1.03082633, + "learning_rate": 0.0009969965242505483, + "loss": 1.04195428, + "num_input_tokens_seen": 26867536, + "router_z_loss_mlp": 1.00390625, + "step": 332, + "time_per_iteration": 2.631326675415039 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113411, + "balance_loss_mlp": 1.01317954, + "epoch": 0.06406310119276645, + "flos": 534557075712.0, + "grad_norm": 0.03761595064373852, + "language_loss": 0.99054992, + "learning_rate": 0.0009969623315820007, + "loss": 1.00168395, + "num_input_tokens_seen": 26941216, + "router_z_loss_mlp": 1.00292969, + "step": 333, + "time_per_iteration": 2.6700048446655273 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113642, + "balance_loss_mlp": 1.01369655, + "epoch": 0.06425548287803001, + "flos": 457165688832.0, + "grad_norm": 0.0356255093132357, + "language_loss": 0.99075055, + "learning_rate": 0.000996927945975565, + "loss": 1.00188696, + "num_input_tokens_seen": 27006560, + "router_z_loss_mlp": 0.99951172, + "step": 334, + "time_per_iteration": 2.567225933074951 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112774, + "balance_loss_mlp": 1.01282871, + "epoch": 0.06444786456329357, + "flos": 561123762432.0, + "grad_norm": 0.034265188200332725, + "language_loss": 0.96451521, + "learning_rate": 0.0009968933674445906, + "loss": 0.97564298, + "num_input_tokens_seen": 27076400, + "router_z_loss_mlp": 0.99951172, + "step": 335, + "time_per_iteration": 2.6834452152252197 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110954, + "balance_loss_mlp": 1.01100898, + "epoch": 0.06464024624855713, + "flos": 667357449984.0, + "grad_norm": 0.026754476738251005, + "language_loss": 0.980811, + "learning_rate": 0.0009968585960025028, + "loss": 0.99192053, + "num_input_tokens_seen": 27158672, + "router_z_loss_mlp": 0.99853516, + "step": 336, + "time_per_iteration": 2.9675402641296387 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112488, + "balance_loss_mlp": 1.01368713, + "epoch": 0.0648326279338207, + "flos": 1524558303744.0, + "grad_norm": 0.027483244216433014, + "language_loss": 0.77653188, + "learning_rate": 0.0009968236316628006, + "loss": 0.78765678, + "num_input_tokens_seen": 27380592, + "router_z_loss_mlp": 0.98632812, + "step": 337, + "time_per_iteration": 4.80242133140564 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115066, + "balance_loss_mlp": 1.01540756, + "epoch": 0.06502500961908426, + "flos": 1145216581632.0, + "grad_norm": 0.03509421691107687, + "language_loss": 0.96500707, + "learning_rate": 0.0009967884744390583, + "loss": 0.97615772, + "num_input_tokens_seen": 27469984, + "router_z_loss_mlp": 0.99414062, + "step": 338, + "time_per_iteration": 3.517488479614258 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118827, + "balance_loss_mlp": 1.01945412, + "epoch": 0.06521739130434782, + "flos": 583694265600.0, + "grad_norm": 0.03507378265000135, + "language_loss": 0.97375119, + "learning_rate": 0.0009967531243449256, + "loss": 0.98493946, + "num_input_tokens_seen": 27543904, + "router_z_loss_mlp": 0.9921875, + "step": 339, + "time_per_iteration": 2.713430404663086 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01119012, + "balance_loss_mlp": 1.02002037, + "epoch": 0.06540977298961138, + "flos": 498659487744.0, + "grad_norm": 0.03215705196534619, + "language_loss": 1.04762673, + "learning_rate": 0.000996717581394126, + "loss": 1.05881691, + "num_input_tokens_seen": 27609888, + "router_z_loss_mlp": 0.98876953, + "step": 340, + "time_per_iteration": 2.5391135215759277 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116775, + "balance_loss_mlp": 1.01787901, + "epoch": 0.06560215467487496, + "flos": 543904506624.0, + "grad_norm": 0.030763143460584817, + "language_loss": 1.05044627, + "learning_rate": 0.000996681845600459, + "loss": 1.06161404, + "num_input_tokens_seen": 27683936, + "router_z_loss_mlp": 0.98632812, + "step": 341, + "time_per_iteration": 2.670804262161255 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118249, + "balance_loss_mlp": 1.01963949, + "epoch": 0.06579453636013852, + "flos": 414351819264.0, + "grad_norm": 0.040583240554979534, + "language_loss": 0.9744029, + "learning_rate": 0.0009966459169777982, + "loss": 0.98558539, + "num_input_tokens_seen": 27747840, + "router_z_loss_mlp": 0.98388672, + "step": 342, + "time_per_iteration": 2.5040364265441895 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115105, + "balance_loss_mlp": 1.01706719, + "epoch": 0.06598691804540208, + "flos": 561681730560.0, + "grad_norm": 0.04164342519277061, + "language_loss": 1.05655766, + "learning_rate": 0.0009966097955400924, + "loss": 1.06770873, + "num_input_tokens_seen": 27819728, + "router_z_loss_mlp": 0.97949219, + "step": 343, + "time_per_iteration": 2.666548728942871 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112413, + "balance_loss_mlp": 1.01532912, + "epoch": 0.06617929973066564, + "flos": 573302830080.0, + "grad_norm": 0.03386977599556249, + "language_loss": 0.99970496, + "learning_rate": 0.0009965734813013652, + "loss": 1.01082909, + "num_input_tokens_seen": 27893536, + "router_z_loss_mlp": 0.97070312, + "step": 344, + "time_per_iteration": 2.8448328971862793 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109743, + "balance_loss_mlp": 1.01261127, + "epoch": 0.06637168141592921, + "flos": 491465194752.0, + "grad_norm": 0.03376822413453626, + "language_loss": 1.02026749, + "learning_rate": 0.0009965369742757151, + "loss": 1.03136492, + "num_input_tokens_seen": 27960976, + "router_z_loss_mlp": 0.97119141, + "step": 345, + "time_per_iteration": 2.568521738052368 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108276, + "balance_loss_mlp": 1.01176453, + "epoch": 0.06656406310119277, + "flos": 1081039518720.0, + "grad_norm": 0.03449730062562062, + "language_loss": 0.98245382, + "learning_rate": 0.0009965002744773152, + "loss": 0.99353665, + "num_input_tokens_seen": 28050864, + "router_z_loss_mlp": 0.96484375, + "step": 346, + "time_per_iteration": 3.501471519470215 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109602, + "balance_loss_mlp": 1.01347148, + "epoch": 0.06675644478645633, + "flos": 514723923456.0, + "grad_norm": 0.029121068034632647, + "language_loss": 0.95998263, + "learning_rate": 0.0009964633819204139, + "loss": 0.97107863, + "num_input_tokens_seen": 28122448, + "router_z_loss_mlp": 0.9609375, + "step": 347, + "time_per_iteration": 2.6675100326538086 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093636, + "balance_loss_mlp": 0.9986496, + "epoch": 0.06694882647171989, + "flos": 1450537079808.0, + "grad_norm": 0.008592618933675954, + "language_loss": 0.81801116, + "learning_rate": 0.0009964262966193338, + "loss": 0.82894754, + "num_input_tokens_seen": 28350352, + "router_z_loss_mlp": 0.94921875, + "step": 348, + "time_per_iteration": 4.92915415763855 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093124, + "balance_loss_mlp": 0.99832916, + "epoch": 0.06714120815698346, + "flos": 1555400152320.0, + "grad_norm": 0.006174818833869298, + "language_loss": 0.75153887, + "learning_rate": 0.000996389018588473, + "loss": 0.76247013, + "num_input_tokens_seen": 28585584, + "router_z_loss_mlp": 0.94726562, + "step": 349, + "time_per_iteration": 4.8783159255981445 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112042, + "balance_loss_mlp": 1.01719952, + "epoch": 0.06733358984224702, + "flos": 881617326336.0, + "grad_norm": 0.039044792628629706, + "language_loss": 0.95966816, + "learning_rate": 0.000996351547842304, + "loss": 0.97078854, + "num_input_tokens_seen": 28672512, + "router_z_loss_mlp": 0.94775391, + "step": 350, + "time_per_iteration": 3.151158094406128 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106972, + "balance_loss_mlp": 1.01222503, + "epoch": 0.06752597152751058, + "flos": 519918668544.0, + "grad_norm": 0.04011951728876299, + "language_loss": 0.94198334, + "learning_rate": 0.0009963138843953744, + "loss": 0.953053, + "num_input_tokens_seen": 28741520, + "router_z_loss_mlp": 0.94677734, + "step": 351, + "time_per_iteration": 2.6077194213867188 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111271, + "balance_loss_mlp": 1.01661849, + "epoch": 0.06771835321277414, + "flos": 540883308288.0, + "grad_norm": 0.02897454745239974, + "language_loss": 0.98297268, + "learning_rate": 0.000996276028262306, + "loss": 0.99408543, + "num_input_tokens_seen": 28814912, + "router_z_loss_mlp": 0.94580078, + "step": 352, + "time_per_iteration": 2.8440346717834473 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115128, + "balance_loss_mlp": 1.02052331, + "epoch": 0.0679107348980377, + "flos": 461615827968.0, + "grad_norm": 0.03358261828070724, + "language_loss": 1.05270672, + "learning_rate": 0.0009962379794577964, + "loss": 1.06385791, + "num_input_tokens_seen": 28882192, + "router_z_loss_mlp": 0.9453125, + "step": 353, + "time_per_iteration": 2.6153147220611572 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115897, + "balance_loss_mlp": 1.02129257, + "epoch": 0.06810311658330127, + "flos": 637208684544.0, + "grad_norm": 0.03193767698980152, + "language_loss": 0.94629884, + "learning_rate": 0.000996199737996617, + "loss": 0.95745778, + "num_input_tokens_seen": 28968576, + "router_z_loss_mlp": 0.9453125, + "step": 354, + "time_per_iteration": 2.9557363986968994 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01114833, + "balance_loss_mlp": 1.0208956, + "epoch": 0.06829549826856483, + "flos": 465627562752.0, + "grad_norm": 0.034421374529713736, + "language_loss": 1.03816652, + "learning_rate": 0.0009961613038936149, + "loss": 1.04931474, + "num_input_tokens_seen": 29036160, + "router_z_loss_mlp": 0.93847656, + "step": 355, + "time_per_iteration": 2.583648204803467 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112591, + "balance_loss_mlp": 1.01879704, + "epoch": 0.06848787995382839, + "flos": 635897362176.0, + "grad_norm": 0.027271592740405557, + "language_loss": 0.95725697, + "learning_rate": 0.000996122677163711, + "loss": 0.96838284, + "num_input_tokens_seen": 29112048, + "router_z_loss_mlp": 0.93701172, + "step": 356, + "time_per_iteration": 2.7997536659240723 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113642, + "balance_loss_mlp": 1.02022934, + "epoch": 0.06868026163909195, + "flos": 807781773312.0, + "grad_norm": 0.036098266403844226, + "language_loss": 1.02058005, + "learning_rate": 0.000996083857821902, + "loss": 1.03171647, + "num_input_tokens_seen": 29190960, + "router_z_loss_mlp": 0.93310547, + "step": 357, + "time_per_iteration": 3.0117554664611816 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113245, + "balance_loss_mlp": 1.01978505, + "epoch": 0.06887264332435553, + "flos": 440152512768.0, + "grad_norm": 0.03587140172627376, + "language_loss": 1.00045025, + "learning_rate": 0.0009960448458832588, + "loss": 1.01158273, + "num_input_tokens_seen": 29262832, + "router_z_loss_mlp": 0.93359375, + "step": 358, + "time_per_iteration": 2.6948373317718506 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110172, + "balance_loss_mlp": 1.01714087, + "epoch": 0.06906502500961909, + "flos": 485786358528.0, + "grad_norm": 0.028895953236024122, + "language_loss": 0.99980301, + "learning_rate": 0.000996005641362927, + "loss": 1.01090467, + "num_input_tokens_seen": 29329552, + "router_z_loss_mlp": 0.92919922, + "step": 359, + "time_per_iteration": 2.600889205932617 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110333, + "balance_loss_mlp": 1.01715922, + "epoch": 0.06925740669488265, + "flos": 734886212352.0, + "grad_norm": 0.03093408458560108, + "language_loss": 1.02453041, + "learning_rate": 0.0009959662442761274, + "loss": 1.0356338, + "num_input_tokens_seen": 29410784, + "router_z_loss_mlp": 0.93066406, + "step": 360, + "time_per_iteration": 2.9324746131896973 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107676, + "balance_loss_mlp": 1.01445436, + "epoch": 0.0694497883801462, + "flos": 553571745024.0, + "grad_norm": 0.03028505188811882, + "language_loss": 0.95860314, + "learning_rate": 0.000995926654638155, + "loss": 0.96967983, + "num_input_tokens_seen": 29486992, + "router_z_loss_mlp": 0.93115234, + "step": 361, + "time_per_iteration": 2.8280868530273438 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104746, + "balance_loss_mlp": 1.01157248, + "epoch": 0.06964217006540978, + "flos": 679244900352.0, + "grad_norm": 0.03450824772288923, + "language_loss": 0.98644811, + "learning_rate": 0.00099588687246438, + "loss": 0.99749553, + "num_input_tokens_seen": 29557232, + "router_z_loss_mlp": 0.93066406, + "step": 362, + "time_per_iteration": 2.8108932971954346 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108438, + "balance_loss_mlp": 1.01535928, + "epoch": 0.06983455175067334, + "flos": 525261167616.0, + "grad_norm": 0.03621302361184023, + "language_loss": 1.06105995, + "learning_rate": 0.0009958468977702471, + "loss": 1.07214439, + "num_input_tokens_seen": 29625344, + "router_z_loss_mlp": 0.9296875, + "step": 363, + "time_per_iteration": 2.6087372303009033 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135422, + "balance_loss_mlp": 1.04272461, + "epoch": 0.0700269334359369, + "flos": 1580176283136.0, + "grad_norm": 0.03651647631774479, + "language_loss": 0.79734707, + "learning_rate": 0.0009958067305712761, + "loss": 0.80870128, + "num_input_tokens_seen": 29843664, + "router_z_loss_mlp": 0.92578125, + "step": 364, + "time_per_iteration": 4.806072235107422 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104861, + "balance_loss_mlp": 1.01254511, + "epoch": 0.07021931512120046, + "flos": 1014858050304.0, + "grad_norm": 0.04058448706036458, + "language_loss": 0.94071019, + "learning_rate": 0.0009957663708830612, + "loss": 0.9517588, + "num_input_tokens_seen": 29927152, + "router_z_loss_mlp": 0.921875, + "step": 365, + "time_per_iteration": 3.30859637260437 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110656, + "balance_loss_mlp": 1.01862633, + "epoch": 0.07041169680646403, + "flos": 824432367360.0, + "grad_norm": 0.04186203278400794, + "language_loss": 0.98041129, + "learning_rate": 0.0009957258187212714, + "loss": 0.9915179, + "num_input_tokens_seen": 30004928, + "router_z_loss_mlp": 0.91894531, + "step": 366, + "time_per_iteration": 3.00058913230896 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097015, + "balance_loss_mlp": 1.00565338, + "epoch": 0.07060407849172759, + "flos": 1417293250560.0, + "grad_norm": 0.011820269564466843, + "language_loss": 0.79194862, + "learning_rate": 0.0009956850741016502, + "loss": 0.80291873, + "num_input_tokens_seen": 30230256, + "router_z_loss_mlp": 0.91210938, + "step": 367, + "time_per_iteration": 4.794500827789307 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113703, + "balance_loss_mlp": 1.02186394, + "epoch": 0.07079646017699115, + "flos": 513942379008.0, + "grad_norm": 0.041641563183133855, + "language_loss": 0.94691038, + "learning_rate": 0.0009956441370400167, + "loss": 0.95804739, + "num_input_tokens_seen": 30301200, + "router_z_loss_mlp": 0.91699219, + "step": 368, + "time_per_iteration": 2.63948917388916 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0111577, + "balance_loss_mlp": 1.02436066, + "epoch": 0.07098884186225471, + "flos": 541549179648.0, + "grad_norm": 0.03426405251061256, + "language_loss": 1.00885093, + "learning_rate": 0.0009956030075522636, + "loss": 1.02000868, + "num_input_tokens_seen": 30377024, + "router_z_loss_mlp": 0.91259766, + "step": 369, + "time_per_iteration": 2.74157452583313 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107449, + "balance_loss_mlp": 1.01613438, + "epoch": 0.07118122354751828, + "flos": 549739845120.0, + "grad_norm": 0.030296400642036637, + "language_loss": 1.0031743, + "learning_rate": 0.0009955616856543587, + "loss": 1.01424885, + "num_input_tokens_seen": 30448896, + "router_z_loss_mlp": 0.91162109, + "step": 370, + "time_per_iteration": 2.6210479736328125 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105786, + "balance_loss_mlp": 1.01475775, + "epoch": 0.07137360523278184, + "flos": 622077437952.0, + "grad_norm": 0.029509682347833893, + "language_loss": 0.92550498, + "learning_rate": 0.0009955201713623448, + "loss": 0.93656284, + "num_input_tokens_seen": 30523584, + "router_z_loss_mlp": 0.90869141, + "step": 371, + "time_per_iteration": 2.757277011871338 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092491, + "balance_loss_mlp": 1.00284576, + "epoch": 0.0715659869180454, + "flos": 1505976202752.0, + "grad_norm": 0.005566886599578838, + "language_loss": 0.76672721, + "learning_rate": 0.000995478464692339, + "loss": 0.77765214, + "num_input_tokens_seen": 30757920, + "router_z_loss_mlp": 0.89648438, + "step": 372, + "time_per_iteration": 4.947838306427002 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01126764, + "balance_loss_mlp": 1.0361172, + "epoch": 0.07175836860330896, + "flos": 496482050304.0, + "grad_norm": 0.040308561934975694, + "language_loss": 1.05629396, + "learning_rate": 0.0009954365656605333, + "loss": 1.06756163, + "num_input_tokens_seen": 30824960, + "router_z_loss_mlp": 0.90478516, + "step": 373, + "time_per_iteration": 2.5537302494049072 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124141, + "balance_loss_mlp": 1.03416181, + "epoch": 0.07195075028857253, + "flos": 787082505984.0, + "grad_norm": 0.034789914575730614, + "language_loss": 0.98912442, + "learning_rate": 0.0009953944742831947, + "loss": 1.00036585, + "num_input_tokens_seen": 30902224, + "router_z_loss_mlp": 0.89892578, + "step": 374, + "time_per_iteration": 2.976074695587158 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106044, + "balance_loss_mlp": 1.01678061, + "epoch": 0.0721431319738361, + "flos": 594347182848.0, + "grad_norm": 0.029628456658550576, + "language_loss": 1.02558136, + "learning_rate": 0.0009953521905766642, + "loss": 1.03664172, + "num_input_tokens_seen": 30984784, + "router_z_loss_mlp": 0.89404297, + "step": 375, + "time_per_iteration": 2.9556005001068115 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101275, + "balance_loss_mlp": 1.01234496, + "epoch": 0.07233551365909965, + "flos": 549329630976.0, + "grad_norm": 0.034208323574026145, + "language_loss": 1.01073325, + "learning_rate": 0.0009953097145573577, + "loss": 1.02174592, + "num_input_tokens_seen": 31055376, + "router_z_loss_mlp": 0.89111328, + "step": 376, + "time_per_iteration": 2.6449482440948486 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106433, + "balance_loss_mlp": 1.01759815, + "epoch": 0.07252789534436321, + "flos": 959169106176.0, + "grad_norm": 0.031040198427254525, + "language_loss": 0.98588479, + "learning_rate": 0.000995267046241766, + "loss": 0.99694908, + "num_input_tokens_seen": 31144944, + "router_z_loss_mlp": 0.89013672, + "step": 377, + "time_per_iteration": 3.2564361095428467 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106989, + "balance_loss_mlp": 1.01877415, + "epoch": 0.07272027702962677, + "flos": 508656260352.0, + "grad_norm": 0.029229214223645432, + "language_loss": 0.98238575, + "learning_rate": 0.0009952241856464547, + "loss": 0.99345565, + "num_input_tokens_seen": 31213392, + "router_z_loss_mlp": 0.88378906, + "step": 378, + "time_per_iteration": 2.5843191146850586 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108111, + "balance_loss_mlp": 1.02013505, + "epoch": 0.07291265871489035, + "flos": 613552380672.0, + "grad_norm": 0.03194005050639913, + "language_loss": 1.05557346, + "learning_rate": 0.0009951811327880632, + "loss": 1.06665444, + "num_input_tokens_seen": 31289840, + "router_z_loss_mlp": 0.88134766, + "step": 379, + "time_per_iteration": 2.727449655532837 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107323, + "balance_loss_mlp": 1.01934636, + "epoch": 0.0731050404001539, + "flos": 496742565120.0, + "grad_norm": 0.03092115392183015, + "language_loss": 0.98400533, + "learning_rate": 0.0009951378876833063, + "loss": 0.99507862, + "num_input_tokens_seen": 31357600, + "router_z_loss_mlp": 0.88134766, + "step": 380, + "time_per_iteration": 2.5320205688476562 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101258, + "balance_loss_mlp": 1.01332915, + "epoch": 0.07329742208541747, + "flos": 641130991104.0, + "grad_norm": 0.032065094183830696, + "language_loss": 1.04703462, + "learning_rate": 0.0009950944503489736, + "loss": 1.05804706, + "num_input_tokens_seen": 31428896, + "router_z_loss_mlp": 0.88085938, + "step": 381, + "time_per_iteration": 2.7422876358032227 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102134, + "balance_loss_mlp": 1.01453876, + "epoch": 0.07348980377068103, + "flos": 817741607424.0, + "grad_norm": 0.030510114485064205, + "language_loss": 0.99112171, + "learning_rate": 0.0009950508208019285, + "loss": 1.00214303, + "num_input_tokens_seen": 31507424, + "router_z_loss_mlp": 0.87744141, + "step": 382, + "time_per_iteration": 3.046475410461426 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101212, + "balance_loss_mlp": 1.01323569, + "epoch": 0.0736821854559446, + "flos": 509670129408.0, + "grad_norm": 0.035756321159612754, + "language_loss": 1.03789318, + "learning_rate": 0.0009950069990591096, + "loss": 1.04890537, + "num_input_tokens_seen": 31576768, + "router_z_loss_mlp": 0.88134766, + "step": 383, + "time_per_iteration": 2.620088577270508 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113144, + "balance_loss_mlp": 1.02674103, + "epoch": 0.07387456714120816, + "flos": 1558050987264.0, + "grad_norm": 0.043940663043905655, + "language_loss": 0.76401371, + "learning_rate": 0.0009949629851375302, + "loss": 0.77514511, + "num_input_tokens_seen": 31797312, + "router_z_loss_mlp": 0.86523438, + "step": 384, + "time_per_iteration": 4.87653374671936 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121594, + "balance_loss_mlp": 1.03299809, + "epoch": 0.07406694882647172, + "flos": 526644421632.0, + "grad_norm": 0.039102279996233, + "language_loss": 0.96614265, + "learning_rate": 0.0009949187790542777, + "loss": 0.97735858, + "num_input_tokens_seen": 31869568, + "router_z_loss_mlp": 0.88769531, + "step": 385, + "time_per_iteration": 2.734100580215454 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112471, + "balance_loss_mlp": 1.03625691, + "epoch": 0.07425933051173528, + "flos": 498824738304.0, + "grad_norm": 0.03701278047407747, + "language_loss": 0.92462552, + "learning_rate": 0.0009948743808265148, + "loss": 0.93587261, + "num_input_tokens_seen": 31941712, + "router_z_loss_mlp": 0.88623047, + "step": 386, + "time_per_iteration": 2.7154581546783447 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125455, + "balance_loss_mlp": 1.03704965, + "epoch": 0.07445171219699885, + "flos": 506057915136.0, + "grad_norm": 0.06663512882119103, + "language_loss": 1.02268195, + "learning_rate": 0.0009948297904714782, + "loss": 1.0339365, + "num_input_tokens_seen": 32015232, + "router_z_loss_mlp": 0.88574219, + "step": 387, + "time_per_iteration": 2.68532133102417 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112575, + "balance_loss_mlp": 1.03777313, + "epoch": 0.07464409388226241, + "flos": 555117337344.0, + "grad_norm": 0.036483324457394946, + "language_loss": 0.94151849, + "learning_rate": 0.0009947850080064796, + "loss": 0.95277596, + "num_input_tokens_seen": 32094640, + "router_z_loss_mlp": 0.88134766, + "step": 388, + "time_per_iteration": 2.789128303527832 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121204, + "balance_loss_mlp": 1.03370392, + "epoch": 0.07483647556752597, + "flos": 778275546624.0, + "grad_norm": 0.0421926900222792, + "language_loss": 0.99476451, + "learning_rate": 0.0009947400334489047, + "loss": 1.00597644, + "num_input_tokens_seen": 32176640, + "router_z_loss_mlp": 0.87646484, + "step": 389, + "time_per_iteration": 2.9937496185302734 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011085, + "balance_loss_mlp": 1.02133441, + "epoch": 0.07502885725278953, + "flos": 613682638080.0, + "grad_norm": 0.0417493031738284, + "language_loss": 0.90741575, + "learning_rate": 0.0009946948668162145, + "loss": 0.91850078, + "num_input_tokens_seen": 32246704, + "router_z_loss_mlp": 0.87304688, + "step": 390, + "time_per_iteration": 2.7264010906219482 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101473, + "balance_loss_mlp": 1.01502275, + "epoch": 0.0752212389380531, + "flos": 689856021504.0, + "grad_norm": 0.03330838563423677, + "language_loss": 0.95001, + "learning_rate": 0.0009946495081259441, + "loss": 0.9610247, + "num_input_tokens_seen": 32320032, + "router_z_loss_mlp": 0.86572266, + "step": 391, + "time_per_iteration": 2.832472085952759 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097898, + "balance_loss_mlp": 1.01182938, + "epoch": 0.07541362062331666, + "flos": 767052022272.0, + "grad_norm": 0.03859494705227578, + "language_loss": 0.99014449, + "learning_rate": 0.0009946039573957035, + "loss": 1.00112355, + "num_input_tokens_seen": 32398144, + "router_z_loss_mlp": 0.86181641, + "step": 392, + "time_per_iteration": 2.925933361053467 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101692, + "balance_loss_mlp": 1.01576602, + "epoch": 0.07560600230858022, + "flos": 589909682688.0, + "grad_norm": 0.039112379024015986, + "language_loss": 0.95485294, + "learning_rate": 0.000994558214643177, + "loss": 0.9658699, + "num_input_tokens_seen": 32471984, + "router_z_loss_mlp": 0.86035156, + "step": 393, + "time_per_iteration": 2.763448476791382 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095538, + "balance_loss_mlp": 1.00961244, + "epoch": 0.07579838399384378, + "flos": 751146034176.0, + "grad_norm": 0.03818992224284351, + "language_loss": 0.96862066, + "learning_rate": 0.000994512279886123, + "loss": 0.97957599, + "num_input_tokens_seen": 32550176, + "router_z_loss_mlp": 0.86035156, + "step": 394, + "time_per_iteration": 3.143615245819092 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101397, + "balance_loss_mlp": 1.01561391, + "epoch": 0.07599076567910736, + "flos": 524551554816.0, + "grad_norm": 0.030240351127206026, + "language_loss": 0.96659988, + "learning_rate": 0.0009944661531423758, + "loss": 0.97761387, + "num_input_tokens_seen": 32620768, + "router_z_loss_mlp": 0.85888672, + "step": 395, + "time_per_iteration": 2.6748764514923096 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107513, + "balance_loss_mlp": 1.02206361, + "epoch": 0.07618314736437092, + "flos": 552186545664.0, + "grad_norm": 0.03358451790414236, + "language_loss": 0.95614338, + "learning_rate": 0.000994419834429843, + "loss": 0.96721858, + "num_input_tokens_seen": 32693472, + "router_z_loss_mlp": 0.85546875, + "step": 396, + "time_per_iteration": 2.6525089740753174 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105372, + "balance_loss_mlp": 1.01987493, + "epoch": 0.07637552904963447, + "flos": 699433831680.0, + "grad_norm": 0.04315212632526892, + "language_loss": 1.00552011, + "learning_rate": 0.0009943733237665069, + "loss": 1.01657379, + "num_input_tokens_seen": 32764976, + "router_z_loss_mlp": 0.85595703, + "step": 397, + "time_per_iteration": 2.8678157329559326 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097353, + "balance_loss_mlp": 1.01218963, + "epoch": 0.07656791073489803, + "flos": 580636128768.0, + "grad_norm": 0.029538416941692198, + "language_loss": 0.99224108, + "learning_rate": 0.0009943266211704248, + "loss": 1.0032146, + "num_input_tokens_seen": 32853104, + "router_z_loss_mlp": 0.85253906, + "step": 398, + "time_per_iteration": 3.0023248195648193 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099387, + "balance_loss_mlp": 1.01460528, + "epoch": 0.0767602924201616, + "flos": 418037910528.0, + "grad_norm": 0.03167845871290285, + "language_loss": 1.01143491, + "learning_rate": 0.000994279726659728, + "loss": 1.02242875, + "num_input_tokens_seen": 32919376, + "router_z_loss_mlp": 0.84863281, + "step": 399, + "time_per_iteration": 2.527693271636963 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107007, + "balance_loss_mlp": 1.02246368, + "epoch": 0.07695267410542517, + "flos": 483888877824.0, + "grad_norm": 0.03414294034973106, + "language_loss": 0.9968133, + "learning_rate": 0.0009942326402526231, + "loss": 1.00788331, + "num_input_tokens_seen": 32988064, + "router_z_loss_mlp": 0.84619141, + "step": 400, + "time_per_iteration": 2.5610573291778564 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112837, + "balance_loss_mlp": 1.02848434, + "epoch": 0.07714505579068873, + "flos": 532027749888.0, + "grad_norm": 0.030264499227930883, + "language_loss": 0.97403878, + "learning_rate": 0.0009941853619673902, + "loss": 0.98516715, + "num_input_tokens_seen": 33059024, + "router_z_loss_mlp": 0.84423828, + "step": 401, + "time_per_iteration": 2.680175542831421 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107236, + "balance_loss_mlp": 1.02302694, + "epoch": 0.07733743747595229, + "flos": 806440315392.0, + "grad_norm": 0.03979329481069023, + "language_loss": 1.01160502, + "learning_rate": 0.0009941378918223844, + "loss": 1.02267742, + "num_input_tokens_seen": 33137712, + "router_z_loss_mlp": 0.84277344, + "step": 402, + "time_per_iteration": 3.0908427238464355 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098686, + "balance_loss_mlp": 1.01447606, + "epoch": 0.07752981916121585, + "flos": 623614281984.0, + "grad_norm": 0.03310929598543939, + "language_loss": 0.93567806, + "learning_rate": 0.0009940902298360354, + "loss": 0.94666493, + "num_input_tokens_seen": 33211296, + "router_z_loss_mlp": 0.84277344, + "step": 403, + "time_per_iteration": 2.7569308280944824 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094976, + "balance_loss_mlp": 1.01048076, + "epoch": 0.07772220084647942, + "flos": 729543713280.0, + "grad_norm": 0.03955766616265138, + "language_loss": 1.03173304, + "learning_rate": 0.0009940423760268473, + "loss": 1.04268289, + "num_input_tokens_seen": 33283632, + "router_z_loss_mlp": 0.84570312, + "step": 404, + "time_per_iteration": 2.8456103801727295 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098252, + "balance_loss_mlp": 1.01375628, + "epoch": 0.07791458253174298, + "flos": 556469488896.0, + "grad_norm": 0.042207617679060144, + "language_loss": 0.96929657, + "learning_rate": 0.0009939943304133982, + "loss": 0.98027909, + "num_input_tokens_seen": 33350704, + "router_z_loss_mlp": 0.84570312, + "step": 405, + "time_per_iteration": 2.615145444869995 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104796, + "balance_loss_mlp": 1.02044404, + "epoch": 0.07810696421700654, + "flos": 554235671040.0, + "grad_norm": 0.04104566792755741, + "language_loss": 1.03659868, + "learning_rate": 0.0009939460930143416, + "loss": 1.04764676, + "num_input_tokens_seen": 33416272, + "router_z_loss_mlp": 0.84423828, + "step": 406, + "time_per_iteration": 2.6304614543914795 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110157, + "balance_loss_mlp": 1.01745594, + "epoch": 0.0782993459022701, + "flos": 651879172608.0, + "grad_norm": 0.0317151282671847, + "language_loss": 0.97752666, + "learning_rate": 0.0009938976638484043, + "loss": 0.98854232, + "num_input_tokens_seen": 33501824, + "router_z_loss_mlp": 0.84179688, + "step": 407, + "time_per_iteration": 2.9032115936279297 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109564, + "balance_loss_mlp": 1.01205039, + "epoch": 0.07849172758753367, + "flos": 497161527552.0, + "grad_norm": 0.04013855375776475, + "language_loss": 0.97246277, + "learning_rate": 0.0009938490429343887, + "loss": 0.98341918, + "num_input_tokens_seen": 33571456, + "router_z_loss_mlp": 0.83642578, + "step": 408, + "time_per_iteration": 2.5688796043395996 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095618, + "balance_loss_mlp": 1.01236188, + "epoch": 0.07868410927279723, + "flos": 579076930560.0, + "grad_norm": 0.0397915036848884, + "language_loss": 0.97571141, + "learning_rate": 0.0009938002302911709, + "loss": 0.98666751, + "num_input_tokens_seen": 33646320, + "router_z_loss_mlp": 0.83300781, + "step": 409, + "time_per_iteration": 2.75036883354187 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096533, + "balance_loss_mlp": 1.01365864, + "epoch": 0.07887649095806079, + "flos": 524067463680.0, + "grad_norm": 0.03678821175613874, + "language_loss": 1.00230122, + "learning_rate": 0.0009937512259377015, + "loss": 1.01326644, + "num_input_tokens_seen": 33717664, + "router_z_loss_mlp": 0.82910156, + "step": 410, + "time_per_iteration": 2.6584975719451904 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110197, + "balance_loss_mlp": 1.01938236, + "epoch": 0.07906887264332435, + "flos": 558438901248.0, + "grad_norm": 0.04956969404692801, + "language_loss": 0.989124, + "learning_rate": 0.000993702029893006, + "loss": 1.00014377, + "num_input_tokens_seen": 33794720, + "router_z_loss_mlp": 0.82617188, + "step": 411, + "time_per_iteration": 2.7666263580322266 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102247, + "balance_loss_mlp": 1.0196116, + "epoch": 0.07926125432858792, + "flos": 823364063232.0, + "grad_norm": 0.03322797228086769, + "language_loss": 0.99091381, + "learning_rate": 0.0009936526421761838, + "loss": 1.00193632, + "num_input_tokens_seen": 33868304, + "router_z_loss_mlp": 0.82666016, + "step": 412, + "time_per_iteration": 3.0222113132476807 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102099, + "balance_loss_mlp": 1.01955855, + "epoch": 0.07945363601385148, + "flos": 563394518784.0, + "grad_norm": 0.04210923401756456, + "language_loss": 1.01423764, + "learning_rate": 0.000993603062806409, + "loss": 1.02525866, + "num_input_tokens_seen": 33937424, + "router_z_loss_mlp": 0.82568359, + "step": 413, + "time_per_iteration": 2.713226079940796 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100317, + "balance_loss_mlp": 1.0176332, + "epoch": 0.07964601769911504, + "flos": 518885357568.0, + "grad_norm": 0.041362228888401006, + "language_loss": 1.04903626, + "learning_rate": 0.0009935532918029298, + "loss": 1.06003952, + "num_input_tokens_seen": 34003984, + "router_z_loss_mlp": 0.82714844, + "step": 414, + "time_per_iteration": 2.59602689743042 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095709, + "balance_loss_mlp": 1.01326394, + "epoch": 0.0798383993843786, + "flos": 540301040640.0, + "grad_norm": 0.030384950019726516, + "language_loss": 0.97377884, + "learning_rate": 0.0009935033291850694, + "loss": 0.98473597, + "num_input_tokens_seen": 34072400, + "router_z_loss_mlp": 0.82470703, + "step": 415, + "time_per_iteration": 2.6417808532714844 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094851, + "balance_loss_mlp": 1.013026, + "epoch": 0.08003078106964218, + "flos": 486122695680.0, + "grad_norm": 0.03579523867672845, + "language_loss": 1.00004411, + "learning_rate": 0.0009934531749722247, + "loss": 1.01099253, + "num_input_tokens_seen": 34142448, + "router_z_loss_mlp": 0.81835938, + "step": 416, + "time_per_iteration": 2.593029737472534 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095566, + "balance_loss_mlp": 1.01383638, + "epoch": 0.08022316275490574, + "flos": 519276129792.0, + "grad_norm": 0.0354518245662521, + "language_loss": 0.98370755, + "learning_rate": 0.0009934028291838672, + "loss": 0.99466318, + "num_input_tokens_seen": 34214080, + "router_z_loss_mlp": 0.81738281, + "step": 417, + "time_per_iteration": 2.7351250648498535 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096643, + "balance_loss_mlp": 1.01496112, + "epoch": 0.0804155444401693, + "flos": 495047273472.0, + "grad_norm": 0.032920982329526526, + "language_loss": 0.93668723, + "learning_rate": 0.0009933522918395433, + "loss": 0.94765365, + "num_input_tokens_seen": 34288448, + "router_z_loss_mlp": 0.81689453, + "step": 418, + "time_per_iteration": 2.6427221298217773 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01114799, + "balance_loss_mlp": 1.03316498, + "epoch": 0.08060792612543285, + "flos": 1584856801536.0, + "grad_norm": 0.029973653623271358, + "language_loss": 0.782511, + "learning_rate": 0.0009933015629588731, + "loss": 0.79365897, + "num_input_tokens_seen": 34521632, + "router_z_loss_mlp": 0.81640625, + "step": 419, + "time_per_iteration": 4.8632917404174805 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096521, + "balance_loss_mlp": 1.01569724, + "epoch": 0.08080030781069643, + "flos": 526359607296.0, + "grad_norm": 0.04163447523548115, + "language_loss": 1.12134457, + "learning_rate": 0.000993250642561551, + "loss": 1.13230991, + "num_input_tokens_seen": 34590080, + "router_z_loss_mlp": 0.80810547, + "step": 420, + "time_per_iteration": 2.608396053314209 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109578, + "balance_loss_mlp": 1.01505113, + "epoch": 0.08099268949595999, + "flos": 547757793792.0, + "grad_norm": 0.04746808509414602, + "language_loss": 0.97398257, + "learning_rate": 0.0009931995306673466, + "loss": 0.98494035, + "num_input_tokens_seen": 34660512, + "router_z_loss_mlp": 0.80712891, + "step": 421, + "time_per_iteration": 2.7215850353240967 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097341, + "balance_loss_mlp": 1.01670778, + "epoch": 0.08118507118122355, + "flos": 511374169344.0, + "grad_norm": 0.04020038552675014, + "language_loss": 1.02514148, + "learning_rate": 0.000993148227296103, + "loss": 1.03611493, + "num_input_tokens_seen": 34732016, + "router_z_loss_mlp": 0.80615234, + "step": 422, + "time_per_iteration": 2.625366449356079 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010968, + "balance_loss_mlp": 1.01607168, + "epoch": 0.08137745286648711, + "flos": 722002389504.0, + "grad_norm": 0.03556088777041087, + "language_loss": 0.90137196, + "learning_rate": 0.000993096732467738, + "loss": 0.91233999, + "num_input_tokens_seen": 34810416, + "router_z_loss_mlp": 0.80712891, + "step": 423, + "time_per_iteration": 2.9795689582824707 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092343, + "balance_loss_mlp": 1.0118531, + "epoch": 0.08156983455175067, + "flos": 680818682880.0, + "grad_norm": 0.04422604915428747, + "language_loss": 0.99073571, + "learning_rate": 0.0009930450462022435, + "loss": 1.00165915, + "num_input_tokens_seen": 34879504, + "router_z_loss_mlp": 0.8046875, + "step": 424, + "time_per_iteration": 2.879889726638794 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087181, + "balance_loss_mlp": 1.00783539, + "epoch": 0.08176221623701424, + "flos": 1456591137024.0, + "grad_norm": 0.006453860192715822, + "language_loss": 0.79189807, + "learning_rate": 0.0009929931685196862, + "loss": 0.8027699, + "num_input_tokens_seen": 35111584, + "router_z_loss_mlp": 0.79296875, + "step": 425, + "time_per_iteration": 4.908784627914429 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095957, + "balance_loss_mlp": 1.01541877, + "epoch": 0.0819545979222778, + "flos": 1558885044480.0, + "grad_norm": 0.04271462185638088, + "language_loss": 0.96659774, + "learning_rate": 0.0009929410994402065, + "loss": 0.9775573, + "num_input_tokens_seen": 35205664, + "router_z_loss_mlp": 0.80517578, + "step": 426, + "time_per_iteration": 3.7266876697540283 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100573, + "balance_loss_mlp": 1.02013052, + "epoch": 0.08214697960754136, + "flos": 513801427968.0, + "grad_norm": 0.040597463537132866, + "language_loss": 1.00489211, + "learning_rate": 0.0009928888389840196, + "loss": 1.01589799, + "num_input_tokens_seen": 35280144, + "router_z_loss_mlp": 0.80419922, + "step": 427, + "time_per_iteration": 2.695010185241699 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098577, + "balance_loss_mlp": 1.01822996, + "epoch": 0.08233936129280492, + "flos": 596222309376.0, + "grad_norm": 0.03622779747664415, + "language_loss": 1.02622843, + "learning_rate": 0.0009928363871714147, + "loss": 1.03721428, + "num_input_tokens_seen": 35344768, + "router_z_loss_mlp": 0.80322266, + "step": 428, + "time_per_iteration": 2.66733455657959 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097324, + "balance_loss_mlp": 1.01721525, + "epoch": 0.08253174297806849, + "flos": 573165769728.0, + "grad_norm": 0.028981657602537042, + "language_loss": 0.97141832, + "learning_rate": 0.0009927837440227556, + "loss": 0.98239154, + "num_input_tokens_seen": 35425536, + "router_z_loss_mlp": 0.80078125, + "step": 429, + "time_per_iteration": 2.8499114513397217 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093938, + "balance_loss_mlp": 1.01392436, + "epoch": 0.08272412466333205, + "flos": 624643702272.0, + "grad_norm": 0.031878488957356683, + "language_loss": 0.91184896, + "learning_rate": 0.0009927309095584798, + "loss": 0.92278832, + "num_input_tokens_seen": 35515440, + "router_z_loss_mlp": 0.79980469, + "step": 430, + "time_per_iteration": 3.020768165588379 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097624, + "balance_loss_mlp": 1.01756275, + "epoch": 0.08291650634859561, + "flos": 514995131904.0, + "grad_norm": 0.040558959270141796, + "language_loss": 1.03523278, + "learning_rate": 0.0009926778837991, + "loss": 1.0462091, + "num_input_tokens_seen": 35580192, + "router_z_loss_mlp": 0.80029297, + "step": 431, + "time_per_iteration": 2.609189033508301 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101176, + "balance_loss_mlp": 1.02125835, + "epoch": 0.08310888803385917, + "flos": 668542405632.0, + "grad_norm": 0.035092839201242565, + "language_loss": 1.01323938, + "learning_rate": 0.000992624666765202, + "loss": 1.0242511, + "num_input_tokens_seen": 35649472, + "router_z_loss_mlp": 0.79882812, + "step": 432, + "time_per_iteration": 2.817399501800537 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101699, + "balance_loss_mlp": 1.02154219, + "epoch": 0.08330126971912274, + "flos": 584491361280.0, + "grad_norm": 0.0354530922421884, + "language_loss": 0.98992586, + "learning_rate": 0.000992571258477447, + "loss": 1.00094295, + "num_input_tokens_seen": 35722848, + "router_z_loss_mlp": 0.80126953, + "step": 433, + "time_per_iteration": 2.777506113052368 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010961, + "balance_loss_mlp": 1.0161345, + "epoch": 0.0834936514043863, + "flos": 562498268160.0, + "grad_norm": 0.03167346665720251, + "language_loss": 0.92772877, + "learning_rate": 0.0009925176589565695, + "loss": 0.93868983, + "num_input_tokens_seen": 35800944, + "router_z_loss_mlp": 0.79931641, + "step": 434, + "time_per_iteration": 2.801501512527466 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093857, + "balance_loss_mlp": 1.01398647, + "epoch": 0.08368603308964986, + "flos": 495513868032.0, + "grad_norm": 0.03411426988917409, + "language_loss": 1.03318536, + "learning_rate": 0.0009924638682233791, + "loss": 1.04412401, + "num_input_tokens_seen": 35866288, + "router_z_loss_mlp": 0.79833984, + "step": 435, + "time_per_iteration": 2.573282241821289 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092567, + "balance_loss_mlp": 1.01512909, + "epoch": 0.08387841477491342, + "flos": 1391811397632.0, + "grad_norm": 0.030642245427906535, + "language_loss": 0.79564589, + "learning_rate": 0.0009924098862987589, + "loss": 0.8065716, + "num_input_tokens_seen": 36083040, + "router_z_loss_mlp": 0.7734375, + "step": 436, + "time_per_iteration": 4.596274375915527 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099407, + "balance_loss_mlp": 1.02006125, + "epoch": 0.084070796460177, + "flos": 800355155712.0, + "grad_norm": 0.040681894877429646, + "language_loss": 0.92768085, + "learning_rate": 0.0009923557132036668, + "loss": 0.93867493, + "num_input_tokens_seen": 36158816, + "router_z_loss_mlp": 0.79296875, + "step": 437, + "time_per_iteration": 3.0366878509521484 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110232, + "balance_loss_mlp": 1.02364242, + "epoch": 0.08426317814544056, + "flos": 560097254400.0, + "grad_norm": 0.034275916488964116, + "language_loss": 0.96774155, + "learning_rate": 0.0009923013489591345, + "loss": 0.97876477, + "num_input_tokens_seen": 36236432, + "router_z_loss_mlp": 0.78613281, + "step": 438, + "time_per_iteration": 2.8060851097106934 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100668, + "balance_loss_mlp": 1.0219903, + "epoch": 0.08445555983070412, + "flos": 811884881664.0, + "grad_norm": 0.035250716051411925, + "language_loss": 0.95655745, + "learning_rate": 0.0009922467935862681, + "loss": 0.96756417, + "num_input_tokens_seen": 36327952, + "router_z_loss_mlp": 0.78613281, + "step": 439, + "time_per_iteration": 3.116757869720459 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098598, + "balance_loss_mlp": 1.0204916, + "epoch": 0.08464794151596768, + "flos": 511170034944.0, + "grad_norm": 0.03561138790794706, + "language_loss": 0.98418635, + "learning_rate": 0.0009921920471062478, + "loss": 0.99517238, + "num_input_tokens_seen": 36394896, + "router_z_loss_mlp": 0.78027344, + "step": 440, + "time_per_iteration": 2.6008944511413574 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093389, + "balance_loss_mlp": 1.01561701, + "epoch": 0.08484032320123125, + "flos": 557474609664.0, + "grad_norm": 0.02914226137027636, + "language_loss": 0.96590662, + "learning_rate": 0.0009921371095403281, + "loss": 0.97684056, + "num_input_tokens_seen": 36464656, + "router_z_loss_mlp": 0.77685547, + "step": 441, + "time_per_iteration": 2.638679265975952 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094838, + "balance_loss_mlp": 1.01697087, + "epoch": 0.08503270488649481, + "flos": 528361100544.0, + "grad_norm": 0.02987504029564206, + "language_loss": 0.99685514, + "learning_rate": 0.0009920819809098379, + "loss": 1.00780344, + "num_input_tokens_seen": 36532208, + "router_z_loss_mlp": 0.77783203, + "step": 442, + "time_per_iteration": 2.5915398597717285 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089542, + "balance_loss_mlp": 1.01172209, + "epoch": 0.08522508657175837, + "flos": 615386678016.0, + "grad_norm": 0.03983619354546574, + "language_loss": 0.95535469, + "learning_rate": 0.0009920266612361798, + "loss": 0.96625006, + "num_input_tokens_seen": 36607360, + "router_z_loss_mlp": 0.77734375, + "step": 443, + "time_per_iteration": 2.724025249481201 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091681, + "balance_loss_mlp": 1.01371801, + "epoch": 0.08541746825702193, + "flos": 620987746560.0, + "grad_norm": 0.032808156584867194, + "language_loss": 0.9504559, + "learning_rate": 0.0009919711505408308, + "loss": 0.96137273, + "num_input_tokens_seen": 36680688, + "router_z_loss_mlp": 0.77880859, + "step": 444, + "time_per_iteration": 2.780973434448242 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087177, + "balance_loss_mlp": 1.00926137, + "epoch": 0.08560984994228549, + "flos": 483888877824.0, + "grad_norm": 0.03232110076143325, + "language_loss": 0.92813373, + "learning_rate": 0.000991915448845342, + "loss": 0.93900549, + "num_input_tokens_seen": 36746288, + "router_z_loss_mlp": 0.77832031, + "step": 445, + "time_per_iteration": 2.6011459827423096 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090069, + "balance_loss_mlp": 1.01243973, + "epoch": 0.08580223162754906, + "flos": 518177690112.0, + "grad_norm": 0.03377956208163177, + "language_loss": 1.02285504, + "learning_rate": 0.000991859556171339, + "loss": 1.03375578, + "num_input_tokens_seen": 36812528, + "router_z_loss_mlp": 0.77539062, + "step": 446, + "time_per_iteration": 2.606220006942749 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088539, + "balance_loss_mlp": 1.01086187, + "epoch": 0.08599461331281262, + "flos": 532520589312.0, + "grad_norm": 0.037753212584348855, + "language_loss": 1.04541254, + "learning_rate": 0.000991803472540521, + "loss": 1.0562979, + "num_input_tokens_seen": 36879248, + "router_z_loss_mlp": 0.77587891, + "step": 447, + "time_per_iteration": 2.625401735305786 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088712, + "balance_loss_mlp": 1.01113105, + "epoch": 0.08618699499807618, + "flos": 791634712320.0, + "grad_norm": 0.030920782852134367, + "language_loss": 0.98781657, + "learning_rate": 0.0009917471979746615, + "loss": 0.99870372, + "num_input_tokens_seen": 36951376, + "router_z_loss_mlp": 0.77490234, + "step": 448, + "time_per_iteration": 3.0066978931427 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089961, + "balance_loss_mlp": 1.01195049, + "epoch": 0.08637937668333974, + "flos": 567115603200.0, + "grad_norm": 0.03238149886931097, + "language_loss": 0.98317528, + "learning_rate": 0.0009916907324956086, + "loss": 0.99407488, + "num_input_tokens_seen": 37025936, + "router_z_loss_mlp": 0.77929688, + "step": 449, + "time_per_iteration": 2.7561135292053223 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091057, + "balance_loss_mlp": 1.01333201, + "epoch": 0.08657175836860331, + "flos": 446118108672.0, + "grad_norm": 0.029046506526173844, + "language_loss": 0.94927382, + "learning_rate": 0.0009916340761252837, + "loss": 0.96018445, + "num_input_tokens_seen": 37095872, + "router_z_loss_mlp": 0.77636719, + "step": 450, + "time_per_iteration": 2.6452889442443848 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089525, + "balance_loss_mlp": 1.01222932, + "epoch": 0.08676414005386687, + "flos": 845589480960.0, + "grad_norm": 0.032144406787761336, + "language_loss": 0.91630232, + "learning_rate": 0.0009915772288856832, + "loss": 0.92719758, + "num_input_tokens_seen": 37179072, + "router_z_loss_mlp": 0.77197266, + "step": 451, + "time_per_iteration": 3.0991322994232178 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108797, + "balance_loss_mlp": 1.01086605, + "epoch": 0.08695652173913043, + "flos": 604484906496.0, + "grad_norm": 0.025568476728402203, + "language_loss": 0.93134868, + "learning_rate": 0.000991520190798877, + "loss": 0.94222844, + "num_input_tokens_seen": 37260288, + "router_z_loss_mlp": 0.77001953, + "step": 452, + "time_per_iteration": 2.833534002304077 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093662, + "balance_loss_mlp": 1.01660514, + "epoch": 0.08714890342439399, + "flos": 732001107456.0, + "grad_norm": 0.03795734255344977, + "language_loss": 1.02428043, + "learning_rate": 0.0009914629618870089, + "loss": 1.03521705, + "num_input_tokens_seen": 37331136, + "router_z_loss_mlp": 0.76953125, + "step": 453, + "time_per_iteration": 2.9043643474578857 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098724, + "balance_loss_mlp": 1.02319336, + "epoch": 0.08734128510965757, + "flos": 1485456770304.0, + "grad_norm": 0.019964198948139205, + "language_loss": 0.78675872, + "learning_rate": 0.0009914055421722976, + "loss": 0.79774594, + "num_input_tokens_seen": 37559040, + "router_z_loss_mlp": 0.75390625, + "step": 454, + "time_per_iteration": 2.093019723892212 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087936, + "balance_loss_mlp": 1.01278687, + "epoch": 0.08753366679492113, + "flos": 1526269146624.0, + "grad_norm": 0.012226751630218, + "language_loss": 0.81427962, + "learning_rate": 0.0009913479316770353, + "loss": 0.82515901, + "num_input_tokens_seen": 37785136, + "router_z_loss_mlp": 0.75, + "step": 455, + "time_per_iteration": 4.905871391296387 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091043, + "balance_loss_mlp": 1.01379561, + "epoch": 0.08772604848018468, + "flos": 722525364480.0, + "grad_norm": 0.044152825797527884, + "language_loss": 0.95217329, + "learning_rate": 0.0009912901304235883, + "loss": 0.96308374, + "num_input_tokens_seen": 37858832, + "router_z_loss_mlp": 0.77148438, + "step": 456, + "time_per_iteration": 2.850330352783203 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090876, + "balance_loss_mlp": 1.01396191, + "epoch": 0.08791843016544824, + "flos": 709467542784.0, + "grad_norm": 0.038854584599924205, + "language_loss": 0.92178857, + "learning_rate": 0.000991232138434397, + "loss": 0.9326973, + "num_input_tokens_seen": 37931856, + "router_z_loss_mlp": 0.76806641, + "step": 457, + "time_per_iteration": 2.868957757949829 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091399, + "balance_loss_mlp": 1.01477098, + "epoch": 0.08811081185071182, + "flos": 474022362624.0, + "grad_norm": 0.04035146689108268, + "language_loss": 0.99321103, + "learning_rate": 0.000991173955731976, + "loss": 1.00412512, + "num_input_tokens_seen": 38002432, + "router_z_loss_mlp": 0.76513672, + "step": 458, + "time_per_iteration": 2.6747970581054688 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089272, + "balance_loss_mlp": 1.01288271, + "epoch": 0.08830319353597538, + "flos": 686315738880.0, + "grad_norm": 0.033089720334054364, + "language_loss": 1.03213239, + "learning_rate": 0.0009911155823389137, + "loss": 1.04302514, + "num_input_tokens_seen": 38081648, + "router_z_loss_mlp": 0.76269531, + "step": 459, + "time_per_iteration": 2.9462268352508545 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085442, + "balance_loss_mlp": 1.00881398, + "epoch": 0.08849557522123894, + "flos": 574609294848.0, + "grad_norm": 0.035557366742091014, + "language_loss": 0.99025905, + "learning_rate": 0.000991057018277873, + "loss": 1.00111353, + "num_input_tokens_seen": 38153424, + "router_z_loss_mlp": 0.76513672, + "step": 460, + "time_per_iteration": 2.6903369426727295 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086551, + "balance_loss_mlp": 1.00968456, + "epoch": 0.0886879569065025, + "flos": 565628336640.0, + "grad_norm": 0.039664118418905284, + "language_loss": 1.00002789, + "learning_rate": 0.0009909982635715898, + "loss": 1.01089334, + "num_input_tokens_seen": 38223008, + "router_z_loss_mlp": 0.76757812, + "step": 461, + "time_per_iteration": 2.620046615600586 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010895, + "balance_loss_mlp": 1.0128243, + "epoch": 0.08888033859176607, + "flos": 564957607680.0, + "grad_norm": 0.03231802322071402, + "language_loss": 0.98670942, + "learning_rate": 0.0009909393182428751, + "loss": 0.99760437, + "num_input_tokens_seen": 38294592, + "router_z_loss_mlp": 0.765625, + "step": 462, + "time_per_iteration": 2.6466307640075684 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090991, + "balance_loss_mlp": 1.01412475, + "epoch": 0.08907272027702963, + "flos": 466743499008.0, + "grad_norm": 0.03344290639259395, + "language_loss": 0.93214953, + "learning_rate": 0.000990880182314614, + "loss": 0.94305944, + "num_input_tokens_seen": 38365792, + "router_z_loss_mlp": 0.76757812, + "step": 463, + "time_per_iteration": 2.6666839122772217 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086555, + "balance_loss_mlp": 1.0100224, + "epoch": 0.08926510196229319, + "flos": 682844475648.0, + "grad_norm": 0.03261982194681884, + "language_loss": 0.93093467, + "learning_rate": 0.0009908208558097643, + "loss": 0.94180012, + "num_input_tokens_seen": 38447776, + "router_z_loss_mlp": 0.76416016, + "step": 464, + "time_per_iteration": 2.9068925380706787 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089482, + "balance_loss_mlp": 1.01323605, + "epoch": 0.08945748364755675, + "flos": 597822336768.0, + "grad_norm": 0.03309433671244878, + "language_loss": 0.95414662, + "learning_rate": 0.000990761338751359, + "loss": 0.9650414, + "num_input_tokens_seen": 38521632, + "router_z_loss_mlp": 0.76123047, + "step": 465, + "time_per_iteration": 2.774606227874756 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079613, + "balance_loss_mlp": 1.00732422, + "epoch": 0.08964986533282032, + "flos": 1589343879168.0, + "grad_norm": 0.03434681355524106, + "language_loss": 0.73659623, + "learning_rate": 0.0009907016311625045, + "loss": 0.74739242, + "num_input_tokens_seen": 38760528, + "router_z_loss_mlp": 0.72460938, + "step": 466, + "time_per_iteration": 4.996358394622803 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092523, + "balance_loss_mlp": 1.01646745, + "epoch": 0.08984224701808388, + "flos": 534550272768.0, + "grad_norm": 0.03379784984504044, + "language_loss": 0.98391378, + "learning_rate": 0.0009906417330663815, + "loss": 0.99483901, + "num_input_tokens_seen": 38827200, + "router_z_loss_mlp": 0.75927734, + "step": 467, + "time_per_iteration": 2.6774964332580566 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092653, + "balance_loss_mlp": 1.01678836, + "epoch": 0.09003462870334744, + "flos": 479850898176.0, + "grad_norm": 0.04271038491910547, + "language_loss": 0.94838965, + "learning_rate": 0.0009905816444862442, + "loss": 0.95931625, + "num_input_tokens_seen": 38891984, + "router_z_loss_mlp": 0.75732422, + "step": 468, + "time_per_iteration": 2.6558451652526855 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092919, + "balance_loss_mlp": 1.01691103, + "epoch": 0.090227010388611, + "flos": 654903283200.0, + "grad_norm": 0.031716132767048565, + "language_loss": 0.92225289, + "learning_rate": 0.0009905213654454216, + "loss": 0.933182, + "num_input_tokens_seen": 38977136, + "router_z_loss_mlp": 0.75878906, + "step": 469, + "time_per_iteration": 2.9322757720947266 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093686, + "balance_loss_mlp": 1.01796389, + "epoch": 0.09041939207387456, + "flos": 619359528960.0, + "grad_norm": 0.03474651138537023, + "language_loss": 1.00819349, + "learning_rate": 0.0009904608959673158, + "loss": 1.01913023, + "num_input_tokens_seen": 39052224, + "router_z_loss_mlp": 0.75585938, + "step": 470, + "time_per_iteration": 2.7938003540039062 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091787, + "balance_loss_mlp": 1.01620793, + "epoch": 0.09061177375913813, + "flos": 455296398336.0, + "grad_norm": 0.04023106246537731, + "language_loss": 1.00852847, + "learning_rate": 0.000990400236075403, + "loss": 1.01944637, + "num_input_tokens_seen": 39116832, + "router_z_loss_mlp": 0.75439453, + "step": 471, + "time_per_iteration": 2.5231049060821533 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085542, + "balance_loss_mlp": 1.01024961, + "epoch": 0.0908041554444017, + "flos": 545309147904.0, + "grad_norm": 0.036372029021066864, + "language_loss": 0.97571105, + "learning_rate": 0.0009903393857932338, + "loss": 0.98656648, + "num_input_tokens_seen": 39190528, + "router_z_loss_mlp": 0.75146484, + "step": 472, + "time_per_iteration": 2.700449228286743 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082684, + "balance_loss_mlp": 1.00786841, + "epoch": 0.09099653712966525, + "flos": 565467943680.0, + "grad_norm": 0.03263919317425628, + "language_loss": 0.95124531, + "learning_rate": 0.0009902783451444317, + "loss": 0.96207213, + "num_input_tokens_seen": 39263168, + "router_z_loss_mlp": 0.74658203, + "step": 473, + "time_per_iteration": 2.7006537914276123 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081251, + "balance_loss_mlp": 1.00667381, + "epoch": 0.09118891881492881, + "flos": 475502826240.0, + "grad_norm": 0.036465550100162274, + "language_loss": 0.98778975, + "learning_rate": 0.0009902171141526956, + "loss": 0.99860233, + "num_input_tokens_seen": 39330784, + "router_z_loss_mlp": 0.74414062, + "step": 474, + "time_per_iteration": 2.565852403640747 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081522, + "balance_loss_mlp": 1.00732613, + "epoch": 0.09138130050019239, + "flos": 546991800576.0, + "grad_norm": 0.03189281102051162, + "language_loss": 0.86324012, + "learning_rate": 0.000990155692841797, + "loss": 0.87405533, + "num_input_tokens_seen": 39417472, + "router_z_loss_mlp": 0.74023438, + "step": 475, + "time_per_iteration": 2.9694621562957764 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081909, + "balance_loss_mlp": 1.0079515, + "epoch": 0.09157368218545595, + "flos": 733974410496.0, + "grad_norm": 0.03574286330183218, + "language_loss": 0.98287529, + "learning_rate": 0.0009900940812355818, + "loss": 0.99369442, + "num_input_tokens_seen": 39488656, + "router_z_loss_mlp": 0.73779297, + "step": 476, + "time_per_iteration": 2.8549702167510986 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082166, + "balance_loss_mlp": 1.00835192, + "epoch": 0.0917660638707195, + "flos": 612073862400.0, + "grad_norm": 0.03800316101532587, + "language_loss": 0.95275486, + "learning_rate": 0.00099003227935797, + "loss": 0.96357656, + "num_input_tokens_seen": 39558224, + "router_z_loss_mlp": 0.73632812, + "step": 477, + "time_per_iteration": 2.709808349609375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084092, + "balance_loss_mlp": 1.01051593, + "epoch": 0.09195844555598306, + "flos": 657019482624.0, + "grad_norm": 0.03875864993538346, + "language_loss": 0.99037415, + "learning_rate": 0.000989970287232955, + "loss": 1.0012151, + "num_input_tokens_seen": 39629856, + "router_z_loss_mlp": 0.73486328, + "step": 478, + "time_per_iteration": 2.7670538425445557 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085912, + "balance_loss_mlp": 1.01252699, + "epoch": 0.09215082724124664, + "flos": 477541257984.0, + "grad_norm": 0.03367109557456403, + "language_loss": 0.95731258, + "learning_rate": 0.0009899081048846043, + "loss": 0.96817166, + "num_input_tokens_seen": 39695984, + "router_z_loss_mlp": 0.73339844, + "step": 479, + "time_per_iteration": 2.588352918624878 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085141, + "balance_loss_mlp": 1.01208997, + "epoch": 0.0923432089265102, + "flos": 525326296320.0, + "grad_norm": 0.0462740033589213, + "language_loss": 1.00606585, + "learning_rate": 0.0009898457323370593, + "loss": 1.01691723, + "num_input_tokens_seen": 39760256, + "router_z_loss_mlp": 0.73046875, + "step": 480, + "time_per_iteration": 2.5808160305023193 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082132, + "balance_loss_mlp": 1.00936687, + "epoch": 0.09253559061177376, + "flos": 546639912192.0, + "grad_norm": 0.03676160983227949, + "language_loss": 0.9798522, + "learning_rate": 0.000989783169614535, + "loss": 0.99067354, + "num_input_tokens_seen": 39827984, + "router_z_loss_mlp": 0.72900391, + "step": 481, + "time_per_iteration": 2.624483108520508 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097145, + "balance_loss_mlp": 1.02485657, + "epoch": 0.09272797229703732, + "flos": 1541337209856.0, + "grad_norm": 0.023489610904585654, + "language_loss": 0.78752756, + "learning_rate": 0.0009897204167413206, + "loss": 0.79849905, + "num_input_tokens_seen": 40056688, + "router_z_loss_mlp": 0.72460938, + "step": 482, + "time_per_iteration": 4.897305965423584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085543, + "balance_loss_mlp": 1.01330173, + "epoch": 0.09292035398230089, + "flos": 691065276672.0, + "grad_norm": 0.04252493421314706, + "language_loss": 0.95552129, + "learning_rate": 0.000989657473741779, + "loss": 0.96637678, + "num_input_tokens_seen": 40133120, + "router_z_loss_mlp": 0.72412109, + "step": 483, + "time_per_iteration": 2.8165738582611084 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084133, + "balance_loss_mlp": 1.01184416, + "epoch": 0.09311273566756445, + "flos": 510823004160.0, + "grad_norm": 0.03895509426778844, + "language_loss": 0.97422099, + "learning_rate": 0.0009895943406403465, + "loss": 0.98506236, + "num_input_tokens_seen": 40206464, + "router_z_loss_mlp": 0.72460938, + "step": 484, + "time_per_iteration": 2.7523326873779297 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086134, + "balance_loss_mlp": 1.01384509, + "epoch": 0.09330511735282801, + "flos": 660584064768.0, + "grad_norm": 0.04754513437429821, + "language_loss": 0.90526009, + "learning_rate": 0.0009895310174615338, + "loss": 0.91612148, + "num_input_tokens_seen": 40277744, + "router_z_loss_mlp": 0.72460938, + "step": 485, + "time_per_iteration": 2.843790292739868 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070969, + "balance_loss_mlp": 0.99982452, + "epoch": 0.09349749903809157, + "flos": 1456024420608.0, + "grad_norm": 0.007982392205281765, + "language_loss": 0.75718516, + "learning_rate": 0.0009894675042299251, + "loss": 0.76789486, + "num_input_tokens_seen": 40503664, + "router_z_loss_mlp": 0.71289062, + "step": 486, + "time_per_iteration": 4.649716138839722 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080781, + "balance_loss_mlp": 1.00877845, + "epoch": 0.09368988072335514, + "flos": 521900719872.0, + "grad_norm": 0.0379904908867083, + "language_loss": 0.94096279, + "learning_rate": 0.0009894038009701782, + "loss": 0.95177054, + "num_input_tokens_seen": 40571376, + "router_z_loss_mlp": 0.72167969, + "step": 487, + "time_per_iteration": 2.615767002105713 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085039, + "balance_loss_mlp": 1.012941, + "epoch": 0.0938822624086187, + "flos": 498752806656.0, + "grad_norm": 0.041516659048387576, + "language_loss": 0.97017074, + "learning_rate": 0.0009893399077070253, + "loss": 0.98102111, + "num_input_tokens_seen": 40638096, + "router_z_loss_mlp": 0.72265625, + "step": 488, + "time_per_iteration": 2.592867612838745 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090191, + "balance_loss_mlp": 1.01828361, + "epoch": 0.09407464409388226, + "flos": 534224629248.0, + "grad_norm": 0.031087819309936707, + "language_loss": 0.91152203, + "learning_rate": 0.0009892758244652718, + "loss": 0.92242396, + "num_input_tokens_seen": 40710992, + "router_z_loss_mlp": 0.72070312, + "step": 489, + "time_per_iteration": 2.702681541442871 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080571, + "balance_loss_mlp": 1.00852132, + "epoch": 0.09426702577914582, + "flos": 587091651840.0, + "grad_norm": 0.037758062155454256, + "language_loss": 0.98290044, + "learning_rate": 0.0009892115512697968, + "loss": 0.99370617, + "num_input_tokens_seen": 40778896, + "router_z_loss_mlp": 0.72216797, + "step": 490, + "time_per_iteration": 2.7222015857696533 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088543, + "balance_loss_mlp": 1.01649261, + "epoch": 0.0944594074644094, + "flos": 504464690688.0, + "grad_norm": 0.03400132145466818, + "language_loss": 0.98617911, + "learning_rate": 0.0009891470881455537, + "loss": 0.99706453, + "num_input_tokens_seen": 40853376, + "router_z_loss_mlp": 0.72216797, + "step": 491, + "time_per_iteration": 2.6978650093078613 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087839, + "balance_loss_mlp": 1.01626599, + "epoch": 0.09465178914967295, + "flos": 572114962176.0, + "grad_norm": 0.03537229102294209, + "language_loss": 0.97051454, + "learning_rate": 0.0009890824351175692, + "loss": 0.98139298, + "num_input_tokens_seen": 40923776, + "router_z_loss_mlp": 0.71728516, + "step": 492, + "time_per_iteration": 2.7183802127838135 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087578, + "balance_loss_mlp": 1.01590919, + "epoch": 0.09484417083493651, + "flos": 550419322368.0, + "grad_norm": 0.028677449722299516, + "language_loss": 1.00688422, + "learning_rate": 0.0009890175922109435, + "loss": 1.01776004, + "num_input_tokens_seen": 40996848, + "router_z_loss_mlp": 0.71826172, + "step": 493, + "time_per_iteration": 2.680469512939453 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082456, + "balance_loss_mlp": 1.01088285, + "epoch": 0.09503655252020007, + "flos": 825272237568.0, + "grad_norm": 0.03488638846892438, + "language_loss": 0.98808897, + "learning_rate": 0.0009889525594508513, + "loss": 0.99891359, + "num_input_tokens_seen": 41071280, + "router_z_loss_mlp": 0.71728516, + "step": 494, + "time_per_iteration": 2.983400344848633 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083066, + "balance_loss_mlp": 1.01154041, + "epoch": 0.09522893420546363, + "flos": 405518615040.0, + "grad_norm": 0.028649644857800794, + "language_loss": 0.9245472, + "learning_rate": 0.0009888873368625404, + "loss": 0.93537784, + "num_input_tokens_seen": 41136304, + "router_z_loss_mlp": 0.71679688, + "step": 495, + "time_per_iteration": 2.497526168823242 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108369, + "balance_loss_mlp": 1.01206875, + "epoch": 0.0954213158907272, + "flos": 692257035264.0, + "grad_norm": 0.03396045626839725, + "language_loss": 0.96602595, + "learning_rate": 0.0009888219244713326, + "loss": 0.97686291, + "num_input_tokens_seen": 41212384, + "router_z_loss_mlp": 0.71777344, + "step": 496, + "time_per_iteration": 2.8588504791259766 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108171, + "balance_loss_mlp": 1.01018417, + "epoch": 0.09561369757599077, + "flos": 520075170816.0, + "grad_norm": 0.039869543083186736, + "language_loss": 0.97707164, + "learning_rate": 0.0009887563223026229, + "loss": 0.98788875, + "num_input_tokens_seen": 41282528, + "router_z_loss_mlp": 0.71679688, + "step": 497, + "time_per_iteration": 2.6856894493103027 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075874, + "balance_loss_mlp": 1.00644684, + "epoch": 0.09580607926125433, + "flos": 1388784363264.0, + "grad_norm": 0.01625235818526382, + "language_loss": 0.7906816, + "learning_rate": 0.0009886905303818805, + "loss": 0.80144036, + "num_input_tokens_seen": 41512256, + "router_z_loss_mlp": 0.6953125, + "step": 498, + "time_per_iteration": 4.882593393325806 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086748, + "balance_loss_mlp": 1.0150795, + "epoch": 0.09599846094651789, + "flos": 718826634240.0, + "grad_norm": 0.03326061844711544, + "language_loss": 0.95632416, + "learning_rate": 0.0009886245487346482, + "loss": 0.9671917, + "num_input_tokens_seen": 41596816, + "router_z_loss_mlp": 0.71826172, + "step": 499, + "time_per_iteration": 3.0426785945892334 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087163, + "balance_loss_mlp": 1.01568544, + "epoch": 0.09619084263178146, + "flos": 386894717952.0, + "grad_norm": 0.04298067648683731, + "language_loss": 0.98954022, + "learning_rate": 0.0009885583773865422, + "loss": 1.00041187, + "num_input_tokens_seen": 41658544, + "router_z_loss_mlp": 0.71630859, + "step": 500, + "time_per_iteration": 2.452941417694092 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086205, + "balance_loss_mlp": 1.01467967, + "epoch": 0.09638322431704502, + "flos": 535173369600.0, + "grad_norm": 0.04172266818012015, + "language_loss": 0.95971203, + "learning_rate": 0.0009884920163632524, + "loss": 0.97057414, + "num_input_tokens_seen": 41730736, + "router_z_loss_mlp": 0.71679688, + "step": 501, + "time_per_iteration": 2.657940626144409 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080406, + "balance_loss_mlp": 1.00911927, + "epoch": 0.09657560600230858, + "flos": 501657353472.0, + "grad_norm": 0.041437287127294276, + "language_loss": 0.9960922, + "learning_rate": 0.000988425465690543, + "loss": 1.00689626, + "num_input_tokens_seen": 41797824, + "router_z_loss_mlp": 0.71435547, + "step": 502, + "time_per_iteration": 2.5540428161621094 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077905, + "balance_loss_mlp": 1.00642741, + "epoch": 0.09676798768757214, + "flos": 530332458240.0, + "grad_norm": 0.03187665411612151, + "language_loss": 0.96807587, + "learning_rate": 0.0009883587253942505, + "loss": 0.97885495, + "num_input_tokens_seen": 41875520, + "router_z_loss_mlp": 0.71630859, + "step": 503, + "time_per_iteration": 2.7744338512420654 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086901, + "balance_loss_mlp": 1.01542282, + "epoch": 0.09696036937283571, + "flos": 464557313280.0, + "grad_norm": 0.038653015311582224, + "language_loss": 1.0234406, + "learning_rate": 0.0009882917955002862, + "loss": 1.03430974, + "num_input_tokens_seen": 41942224, + "router_z_loss_mlp": 0.71630859, + "step": 504, + "time_per_iteration": 2.500669479370117 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081799, + "balance_loss_mlp": 1.01074982, + "epoch": 0.09715275105809927, + "flos": 536011294464.0, + "grad_norm": 0.035792041916504785, + "language_loss": 0.94188601, + "learning_rate": 0.0009882246760346343, + "loss": 0.95270395, + "num_input_tokens_seen": 42007552, + "router_z_loss_mlp": 0.71191406, + "step": 505, + "time_per_iteration": 2.6442148685455322 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077575, + "balance_loss_mlp": 1.00652647, + "epoch": 0.09734513274336283, + "flos": 455882556672.0, + "grad_norm": 0.04461237962136338, + "language_loss": 1.00418711, + "learning_rate": 0.0009881573670233533, + "loss": 1.01496279, + "num_input_tokens_seen": 42071760, + "router_z_loss_mlp": 0.71191406, + "step": 506, + "time_per_iteration": 2.5102410316467285 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075502, + "balance_loss_mlp": 1.00450063, + "epoch": 0.09753751442862639, + "flos": 509828577024.0, + "grad_norm": 0.03506590591484262, + "language_loss": 0.93374205, + "learning_rate": 0.0009880898684925747, + "loss": 0.94449711, + "num_input_tokens_seen": 42140688, + "router_z_loss_mlp": 0.71142578, + "step": 507, + "time_per_iteration": 2.652381658554077 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077009, + "balance_loss_mlp": 1.00624609, + "epoch": 0.09772989611388996, + "flos": 485247832320.0, + "grad_norm": 0.03501422949918711, + "language_loss": 0.92606336, + "learning_rate": 0.0009880221804685037, + "loss": 0.9368335, + "num_input_tokens_seen": 42208544, + "router_z_loss_mlp": 0.70898438, + "step": 508, + "time_per_iteration": 2.5481274127960205 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073608, + "balance_loss_mlp": 1.00456238, + "epoch": 0.09792227779915352, + "flos": 1569319231488.0, + "grad_norm": 0.011873284077886747, + "language_loss": 0.79344422, + "learning_rate": 0.000987954302977419, + "loss": 0.80418032, + "num_input_tokens_seen": 42426624, + "router_z_loss_mlp": 0.69140625, + "step": 509, + "time_per_iteration": 4.725191354751587 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076044, + "balance_loss_mlp": 1.00590122, + "epoch": 0.09811465948441708, + "flos": 588915255552.0, + "grad_norm": 0.04172960474096109, + "language_loss": 0.98818666, + "learning_rate": 0.0009878862360456733, + "loss": 0.99894708, + "num_input_tokens_seen": 42494592, + "router_z_loss_mlp": 0.70263672, + "step": 510, + "time_per_iteration": 2.7094569206237793 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078332, + "balance_loss_mlp": 1.00828481, + "epoch": 0.09830704116968064, + "flos": 614129790720.0, + "grad_norm": 0.037035801977756785, + "language_loss": 0.90851068, + "learning_rate": 0.0009878179796996922, + "loss": 0.919294, + "num_input_tokens_seen": 42564944, + "router_z_loss_mlp": 0.70166016, + "step": 511, + "time_per_iteration": 2.6973366737365723 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079637, + "balance_loss_mlp": 1.00973296, + "epoch": 0.09849942285494422, + "flos": 539936513280.0, + "grad_norm": 0.0318668020933778, + "language_loss": 0.94484478, + "learning_rate": 0.0009877495339659754, + "loss": 0.95564115, + "num_input_tokens_seen": 42645616, + "router_z_loss_mlp": 0.70019531, + "step": 512, + "time_per_iteration": 2.7476089000701904 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083598, + "balance_loss_mlp": 1.0137887, + "epoch": 0.09869180454020778, + "flos": 621604040448.0, + "grad_norm": 0.03763698097825182, + "language_loss": 0.89467418, + "learning_rate": 0.000987680898871096, + "loss": 0.90551007, + "num_input_tokens_seen": 42713632, + "router_z_loss_mlp": 0.69921875, + "step": 513, + "time_per_iteration": 2.7254321575164795 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083583, + "balance_loss_mlp": 1.01382184, + "epoch": 0.09888418622547133, + "flos": 813061089024.0, + "grad_norm": 0.049179676158016515, + "language_loss": 0.91816097, + "learning_rate": 0.0009876120744417, + "loss": 0.9289968, + "num_input_tokens_seen": 42789088, + "router_z_loss_mlp": 0.69873047, + "step": 514, + "time_per_iteration": 2.9596974849700928 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083293, + "balance_loss_mlp": 1.01357901, + "epoch": 0.0990765679107349, + "flos": 536857967616.0, + "grad_norm": 0.03966041946019195, + "language_loss": 0.99294269, + "learning_rate": 0.0009875430607045078, + "loss": 1.0037756, + "num_input_tokens_seen": 42861168, + "router_z_loss_mlp": 0.69824219, + "step": 515, + "time_per_iteration": 2.7065181732177734 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083779, + "balance_loss_mlp": 1.01439941, + "epoch": 0.09926894959599845, + "flos": 588971635968.0, + "grad_norm": 0.037836000479060286, + "language_loss": 0.94664383, + "learning_rate": 0.000987473857686313, + "loss": 0.95748156, + "num_input_tokens_seen": 42934112, + "router_z_loss_mlp": 0.69482422, + "step": 516, + "time_per_iteration": 2.712947130203247 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085113, + "balance_loss_mlp": 1.01582849, + "epoch": 0.09946133128126203, + "flos": 642387878400.0, + "grad_norm": 0.04191957443387863, + "language_loss": 0.98466003, + "learning_rate": 0.0009874044654139824, + "loss": 0.99551111, + "num_input_tokens_seen": 43005248, + "router_z_loss_mlp": 0.69384766, + "step": 517, + "time_per_iteration": 2.7391469478607178 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081227, + "balance_loss_mlp": 1.01194227, + "epoch": 0.09965371296652559, + "flos": 466726002432.0, + "grad_norm": 0.049265237591549625, + "language_loss": 0.97911566, + "learning_rate": 0.0009873348839144563, + "loss": 0.98992795, + "num_input_tokens_seen": 43070576, + "router_z_loss_mlp": 0.69384766, + "step": 518, + "time_per_iteration": 2.5496554374694824 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081078, + "balance_loss_mlp": 1.01198411, + "epoch": 0.09984609465178915, + "flos": 484559606784.0, + "grad_norm": 0.04039588305244337, + "language_loss": 0.99084902, + "learning_rate": 0.000987265113214749, + "loss": 1.00165975, + "num_input_tokens_seen": 43138048, + "router_z_loss_mlp": 0.69189453, + "step": 519, + "time_per_iteration": 2.592350721359253 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081099, + "balance_loss_mlp": 1.01200545, + "epoch": 0.1000384763370527, + "flos": 570095972352.0, + "grad_norm": 0.04690738730083641, + "language_loss": 1.01784182, + "learning_rate": 0.0009871951533419476, + "loss": 1.02865279, + "num_input_tokens_seen": 43207600, + "router_z_loss_mlp": 0.69189453, + "step": 520, + "time_per_iteration": 2.699725866317749 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077691, + "balance_loss_mlp": 1.00854921, + "epoch": 0.10023085802231628, + "flos": 546926671872.0, + "grad_norm": 0.03422053119670882, + "language_loss": 0.91227025, + "learning_rate": 0.0009871250043232132, + "loss": 0.92304718, + "num_input_tokens_seen": 43285104, + "router_z_loss_mlp": 0.69238281, + "step": 521, + "time_per_iteration": 2.74124813079834 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078273, + "balance_loss_mlp": 1.00913203, + "epoch": 0.10042323970757984, + "flos": 504440391168.0, + "grad_norm": 0.0407416967929008, + "language_loss": 0.91114902, + "learning_rate": 0.0009870546661857797, + "loss": 0.92193174, + "num_input_tokens_seen": 43353312, + "router_z_loss_mlp": 0.69238281, + "step": 522, + "time_per_iteration": 2.6524126529693604 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080712, + "balance_loss_mlp": 1.01199949, + "epoch": 0.1006156213928434, + "flos": 771725737728.0, + "grad_norm": 0.04764395650012834, + "language_loss": 1.0071038, + "learning_rate": 0.0009869841389569553, + "loss": 1.01791096, + "num_input_tokens_seen": 43427680, + "router_z_loss_mlp": 0.68798828, + "step": 523, + "time_per_iteration": 2.9797816276550293 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081234, + "balance_loss_mlp": 1.01237857, + "epoch": 0.10080800307810696, + "flos": 491009293824.0, + "grad_norm": 0.04526617857315469, + "language_loss": 0.93126583, + "learning_rate": 0.0009869134226641206, + "loss": 0.94207817, + "num_input_tokens_seen": 43495200, + "router_z_loss_mlp": 0.68945312, + "step": 524, + "time_per_iteration": 2.624396562576294 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079355, + "balance_loss_mlp": 1.01064241, + "epoch": 0.10100038476337053, + "flos": 455713415424.0, + "grad_norm": 0.04976961118682096, + "language_loss": 0.93662071, + "learning_rate": 0.0009868425173347303, + "loss": 0.94741422, + "num_input_tokens_seen": 43566256, + "router_z_loss_mlp": 0.68798828, + "step": 525, + "time_per_iteration": 2.659106731414795 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077138, + "balance_loss_mlp": 1.00809169, + "epoch": 0.10119276644863409, + "flos": 557574731520.0, + "grad_norm": 0.04197638521891018, + "language_loss": 0.9924143, + "learning_rate": 0.0009867714229963125, + "loss": 1.00318575, + "num_input_tokens_seen": 43639696, + "router_z_loss_mlp": 0.69140625, + "step": 526, + "time_per_iteration": 2.7414495944976807 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080772, + "balance_loss_mlp": 1.01201165, + "epoch": 0.10138514813389765, + "flos": 517220201472.0, + "grad_norm": 0.044929109849797505, + "language_loss": 0.96641302, + "learning_rate": 0.000986700139676468, + "loss": 0.97722065, + "num_input_tokens_seen": 43703872, + "router_z_loss_mlp": 0.68847656, + "step": 527, + "time_per_iteration": 2.620313882827759 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083057, + "balance_loss_mlp": 1.01405847, + "epoch": 0.10157752981916121, + "flos": 501564034560.0, + "grad_norm": 0.03558874762709202, + "language_loss": 0.9424324, + "learning_rate": 0.0009866286674028717, + "loss": 0.95326293, + "num_input_tokens_seen": 43774416, + "router_z_loss_mlp": 0.69091797, + "step": 528, + "time_per_iteration": 2.632835865020752 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082511, + "balance_loss_mlp": 1.01379848, + "epoch": 0.10176991150442478, + "flos": 658094589696.0, + "grad_norm": 0.042026744727430246, + "language_loss": 0.91470444, + "learning_rate": 0.0009865570062032717, + "loss": 0.9255296, + "num_input_tokens_seen": 43853376, + "router_z_loss_mlp": 0.68798828, + "step": 529, + "time_per_iteration": 2.9185874462127686 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084193, + "balance_loss_mlp": 1.01519477, + "epoch": 0.10196229318968834, + "flos": 574403215104.0, + "grad_norm": 0.031693910674612406, + "language_loss": 0.95307148, + "learning_rate": 0.0009864851561054893, + "loss": 0.96391344, + "num_input_tokens_seen": 43929632, + "router_z_loss_mlp": 0.69091797, + "step": 530, + "time_per_iteration": 2.7826597690582275 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086656, + "balance_loss_mlp": 1.01765728, + "epoch": 0.1021546748749519, + "flos": 519256687872.0, + "grad_norm": 0.0418084670656813, + "language_loss": 0.94574928, + "learning_rate": 0.0009864131171374191, + "loss": 0.95661592, + "num_input_tokens_seen": 44002144, + "router_z_loss_mlp": 0.69091797, + "step": 531, + "time_per_iteration": 2.67000150680542 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088238, + "balance_loss_mlp": 1.01919198, + "epoch": 0.10234705656021546, + "flos": 610954035456.0, + "grad_norm": 0.03906444640078033, + "language_loss": 0.94287467, + "learning_rate": 0.0009863408893270292, + "loss": 0.95375705, + "num_input_tokens_seen": 44078272, + "router_z_loss_mlp": 0.69140625, + "step": 532, + "time_per_iteration": 2.7893166542053223 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089057, + "balance_loss_mlp": 1.02029741, + "epoch": 0.10253943824547904, + "flos": 602913069312.0, + "grad_norm": 0.046708965243717, + "language_loss": 0.90346718, + "learning_rate": 0.0009862684727023605, + "loss": 0.91435778, + "num_input_tokens_seen": 44152304, + "router_z_loss_mlp": 0.68847656, + "step": 533, + "time_per_iteration": 2.7212483882904053 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079068, + "balance_loss_mlp": 1.0105468, + "epoch": 0.1027318199307426, + "flos": 664157395200.0, + "grad_norm": 0.04923575085492922, + "language_loss": 0.9286049, + "learning_rate": 0.0009861958672915283, + "loss": 0.93939555, + "num_input_tokens_seen": 44226720, + "router_z_loss_mlp": 0.68603516, + "step": 534, + "time_per_iteration": 2.8216443061828613 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080998, + "balance_loss_mlp": 1.01271474, + "epoch": 0.10292420161600616, + "flos": 684531019008.0, + "grad_norm": 0.03566434899904423, + "language_loss": 0.91122925, + "learning_rate": 0.0009861230731227201, + "loss": 0.92203927, + "num_input_tokens_seen": 44303600, + "router_z_loss_mlp": 0.68359375, + "step": 535, + "time_per_iteration": 2.8432843685150146 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082908, + "balance_loss_mlp": 1.01514912, + "epoch": 0.10311658330126972, + "flos": 491269808640.0, + "grad_norm": 0.04656876258351904, + "language_loss": 0.9494285, + "learning_rate": 0.0009860500902241973, + "loss": 0.96025753, + "num_input_tokens_seen": 44370960, + "router_z_loss_mlp": 0.67822266, + "step": 536, + "time_per_iteration": 2.601234197616577 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085787, + "balance_loss_mlp": 1.01831496, + "epoch": 0.10330896498653329, + "flos": 432687011328.0, + "grad_norm": 0.046264109011482965, + "language_loss": 0.99409795, + "learning_rate": 0.0009859769186242942, + "loss": 1.00495577, + "num_input_tokens_seen": 44435584, + "router_z_loss_mlp": 0.67529297, + "step": 537, + "time_per_iteration": 2.527156114578247 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079891, + "balance_loss_mlp": 1.01265681, + "epoch": 0.10350134667179685, + "flos": 550642898688.0, + "grad_norm": 0.04274411195548745, + "language_loss": 0.92667055, + "learning_rate": 0.0009859035583514187, + "loss": 0.93746948, + "num_input_tokens_seen": 44505456, + "router_z_loss_mlp": 0.67285156, + "step": 538, + "time_per_iteration": 2.6489107608795166 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082236, + "balance_loss_mlp": 1.01505005, + "epoch": 0.10369372835706041, + "flos": 641827964928.0, + "grad_norm": 0.04978782417937993, + "language_loss": 0.95941103, + "learning_rate": 0.0009858300094340517, + "loss": 0.97023344, + "num_input_tokens_seen": 44580208, + "router_z_loss_mlp": 0.67236328, + "step": 539, + "time_per_iteration": 2.8078534603118896 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107826, + "balance_loss_mlp": 1.01102614, + "epoch": 0.10388611004232397, + "flos": 522766834944.0, + "grad_norm": 0.04233995967203171, + "language_loss": 0.8846426, + "learning_rate": 0.0009857562719007473, + "loss": 0.8954252, + "num_input_tokens_seen": 44646576, + "router_z_loss_mlp": 0.67285156, + "step": 540, + "time_per_iteration": 2.605253219604492 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108211, + "balance_loss_mlp": 1.01487637, + "epoch": 0.10407849172758753, + "flos": 703741074432.0, + "grad_norm": 0.04489314852578161, + "language_loss": 0.9024663, + "learning_rate": 0.0009856823457801331, + "loss": 0.91328734, + "num_input_tokens_seen": 44726752, + "router_z_loss_mlp": 0.67285156, + "step": 541, + "time_per_iteration": 2.8836264610290527 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074756, + "balance_loss_mlp": 1.00737894, + "epoch": 0.1042708734128511, + "flos": 503945606400.0, + "grad_norm": 0.04545070943505171, + "language_loss": 0.97841358, + "learning_rate": 0.00098560823110091, + "loss": 0.98916113, + "num_input_tokens_seen": 44795824, + "router_z_loss_mlp": 0.67431641, + "step": 542, + "time_per_iteration": 2.629241466522217 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078174, + "balance_loss_mlp": 1.01084471, + "epoch": 0.10446325509811466, + "flos": 486641779968.0, + "grad_norm": 0.04151430298304091, + "language_loss": 0.974545, + "learning_rate": 0.000985533927891851, + "loss": 0.98532677, + "num_input_tokens_seen": 44868496, + "router_z_loss_mlp": 0.67382812, + "step": 543, + "time_per_iteration": 2.712714195251465 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078049, + "balance_loss_mlp": 1.01110125, + "epoch": 0.10465563678337822, + "flos": 569713948416.0, + "grad_norm": 0.043537531534841835, + "language_loss": 0.9559319, + "learning_rate": 0.0009854594361818044, + "loss": 0.96671236, + "num_input_tokens_seen": 44939888, + "router_z_loss_mlp": 0.66992188, + "step": 544, + "time_per_iteration": 2.66324520111084 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075343, + "balance_loss_mlp": 1.00806153, + "epoch": 0.10484801846864178, + "flos": 627243992832.0, + "grad_norm": 0.042858245855360314, + "language_loss": 0.94459403, + "learning_rate": 0.0009853847559996897, + "loss": 0.95534742, + "num_input_tokens_seen": 45012720, + "router_z_loss_mlp": 0.67333984, + "step": 545, + "time_per_iteration": 2.749379873275757 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074542, + "balance_loss_mlp": 1.00697374, + "epoch": 0.10504040015390535, + "flos": 744813965568.0, + "grad_norm": 0.04113973833070077, + "language_loss": 0.93940508, + "learning_rate": 0.0009853098873745, + "loss": 0.95015049, + "num_input_tokens_seen": 45093744, + "router_z_loss_mlp": 0.67626953, + "step": 546, + "time_per_iteration": 3.0356035232543945 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082094, + "balance_loss_mlp": 1.01457405, + "epoch": 0.10523278183916891, + "flos": 587843060736.0, + "grad_norm": 0.04039468180414331, + "language_loss": 0.92498314, + "learning_rate": 0.0009852348303353027, + "loss": 0.93580401, + "num_input_tokens_seen": 45172784, + "router_z_loss_mlp": 0.67578125, + "step": 547, + "time_per_iteration": 2.787853479385376 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080495, + "balance_loss_mlp": 1.01283157, + "epoch": 0.10542516352443247, + "flos": 871147156224.0, + "grad_norm": 0.04319215205461418, + "language_loss": 0.86143011, + "learning_rate": 0.000985159584911237, + "loss": 0.872235, + "num_input_tokens_seen": 45255600, + "router_z_loss_mlp": 0.67724609, + "step": 548, + "time_per_iteration": 3.103173017501831 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077016, + "balance_loss_mlp": 1.00949633, + "epoch": 0.10561754520969603, + "flos": 506413694208.0, + "grad_norm": 0.04405333210851084, + "language_loss": 0.94064271, + "learning_rate": 0.0009850841511315162, + "loss": 0.95141286, + "num_input_tokens_seen": 45325072, + "router_z_loss_mlp": 0.67578125, + "step": 549, + "time_per_iteration": 2.647629737854004 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107948, + "balance_loss_mlp": 1.01176953, + "epoch": 0.1058099268949596, + "flos": 561148061952.0, + "grad_norm": 0.03728506713954383, + "language_loss": 0.9326818, + "learning_rate": 0.0009850085290254256, + "loss": 0.94347662, + "num_input_tokens_seen": 45401440, + "router_z_loss_mlp": 0.67773438, + "step": 550, + "time_per_iteration": 2.7680838108062744 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081585, + "balance_loss_mlp": 1.01411295, + "epoch": 0.10600230858022316, + "flos": 563160248832.0, + "grad_norm": 0.031635589688873186, + "language_loss": 0.90350562, + "learning_rate": 0.0009849327186223246, + "loss": 0.91432148, + "num_input_tokens_seen": 45479264, + "router_z_loss_mlp": 0.67529297, + "step": 551, + "time_per_iteration": 2.7540531158447266 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077249, + "balance_loss_mlp": 1.01001453, + "epoch": 0.10619469026548672, + "flos": 495318481920.0, + "grad_norm": 0.03875875468173829, + "language_loss": 0.97612774, + "learning_rate": 0.000984856719951646, + "loss": 0.98690015, + "num_input_tokens_seen": 45547328, + "router_z_loss_mlp": 0.67285156, + "step": 552, + "time_per_iteration": 2.5471906661987305 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080098, + "balance_loss_mlp": 1.01300704, + "epoch": 0.10638707195075028, + "flos": 677465038080.0, + "grad_norm": 0.04041077275123314, + "language_loss": 0.94560456, + "learning_rate": 0.0009847805330428943, + "loss": 0.95640558, + "num_input_tokens_seen": 45631152, + "router_z_loss_mlp": 0.67138672, + "step": 553, + "time_per_iteration": 2.879901647567749 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081173, + "balance_loss_mlp": 1.01398706, + "epoch": 0.10657945363601386, + "flos": 489035990784.0, + "grad_norm": 0.051524237529684984, + "language_loss": 0.97161597, + "learning_rate": 0.0009847041579256481, + "loss": 0.98242772, + "num_input_tokens_seen": 45698208, + "router_z_loss_mlp": 0.67236328, + "step": 554, + "time_per_iteration": 2.5838425159454346 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076637, + "balance_loss_mlp": 1.00997543, + "epoch": 0.10677183532127742, + "flos": 483971503104.0, + "grad_norm": 0.03890900728724459, + "language_loss": 0.96058643, + "learning_rate": 0.0009846275946295592, + "loss": 0.97135282, + "num_input_tokens_seen": 45766640, + "router_z_loss_mlp": 0.66699219, + "step": 555, + "time_per_iteration": 2.619490623474121 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074794, + "balance_loss_mlp": 1.00813222, + "epoch": 0.10696421700654098, + "flos": 657582308352.0, + "grad_norm": 0.03350037319549477, + "language_loss": 0.89189553, + "learning_rate": 0.0009845508431843518, + "loss": 0.9026435, + "num_input_tokens_seen": 45851408, + "router_z_loss_mlp": 0.66699219, + "step": 556, + "time_per_iteration": 3.0074055194854736 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075663, + "balance_loss_mlp": 1.00895333, + "epoch": 0.10715659869180454, + "flos": 568793398272.0, + "grad_norm": 0.03867425342149035, + "language_loss": 0.90383601, + "learning_rate": 0.0009844739036198233, + "loss": 0.91459262, + "num_input_tokens_seen": 45919824, + "router_z_loss_mlp": 0.66748047, + "step": 557, + "time_per_iteration": 2.719309091567993 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073849, + "balance_loss_mlp": 1.00756896, + "epoch": 0.10734898037706811, + "flos": 541744565760.0, + "grad_norm": 0.03845092177051005, + "language_loss": 0.97656357, + "learning_rate": 0.0009843967759658448, + "loss": 0.98730206, + "num_input_tokens_seen": 45991024, + "router_z_loss_mlp": 0.66308594, + "step": 558, + "time_per_iteration": 2.679964065551758 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077583, + "balance_loss_mlp": 1.01311493, + "epoch": 0.10754136206233167, + "flos": 1479734192640.0, + "grad_norm": 0.013283033162601723, + "language_loss": 0.72767758, + "learning_rate": 0.0009843194602523592, + "loss": 0.73845339, + "num_input_tokens_seen": 46212736, + "router_z_loss_mlp": 0.64453125, + "step": 559, + "time_per_iteration": 4.837440729141235 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107582, + "balance_loss_mlp": 1.00977802, + "epoch": 0.10773374374759523, + "flos": 513412601088.0, + "grad_norm": 0.03702065367467253, + "language_loss": 0.97501957, + "learning_rate": 0.000984241956509384, + "loss": 0.98577774, + "num_input_tokens_seen": 46283920, + "router_z_loss_mlp": 0.66064453, + "step": 560, + "time_per_iteration": 2.6579978466033936 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079218, + "balance_loss_mlp": 1.01312864, + "epoch": 0.10792612543285879, + "flos": 497478422784.0, + "grad_norm": 0.05173888564395698, + "language_loss": 0.9404971, + "learning_rate": 0.0009841642647670078, + "loss": 0.9512893, + "num_input_tokens_seen": 46349664, + "router_z_loss_mlp": 0.66113281, + "step": 561, + "time_per_iteration": 2.557605743408203 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080143, + "balance_loss_mlp": 1.01429176, + "epoch": 0.10811850711812235, + "flos": 736838128128.0, + "grad_norm": 0.0493873548723288, + "language_loss": 0.88547891, + "learning_rate": 0.0009840863850553944, + "loss": 0.89628035, + "num_input_tokens_seen": 46432688, + "router_z_loss_mlp": 0.65869141, + "step": 562, + "time_per_iteration": 2.949580669403076 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077668, + "balance_loss_mlp": 1.0115304, + "epoch": 0.10831088880338592, + "flos": 612677517312.0, + "grad_norm": 0.04173462884607535, + "language_loss": 0.94150907, + "learning_rate": 0.0009840083174047782, + "loss": 0.95228577, + "num_input_tokens_seen": 46507216, + "router_z_loss_mlp": 0.66162109, + "step": 563, + "time_per_iteration": 2.733344078063965 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081559, + "balance_loss_mlp": 1.01561248, + "epoch": 0.10850327048864948, + "flos": 557498909184.0, + "grad_norm": 0.034100755270258146, + "language_loss": 0.88515103, + "learning_rate": 0.0009839300618454685, + "loss": 0.89596659, + "num_input_tokens_seen": 46590464, + "router_z_loss_mlp": 0.65966797, + "step": 564, + "time_per_iteration": 2.8846256732940674 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080528, + "balance_loss_mlp": 1.0148201, + "epoch": 0.10869565217391304, + "flos": 604437274368.0, + "grad_norm": 0.036735298053950545, + "language_loss": 0.93941957, + "learning_rate": 0.0009838516184078466, + "loss": 0.95022488, + "num_input_tokens_seen": 46666240, + "router_z_loss_mlp": 0.65722656, + "step": 565, + "time_per_iteration": 2.813284158706665 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078727, + "balance_loss_mlp": 1.01297164, + "epoch": 0.1088880338591766, + "flos": 527206280448.0, + "grad_norm": 0.040314305725270186, + "language_loss": 0.91096556, + "learning_rate": 0.0009837729871223669, + "loss": 0.92175281, + "num_input_tokens_seen": 46734288, + "router_z_loss_mlp": 0.65771484, + "step": 566, + "time_per_iteration": 2.651611089706421 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078801, + "balance_loss_mlp": 1.01318836, + "epoch": 0.10908041554444017, + "flos": 621417402624.0, + "grad_norm": 0.042325065837349046, + "language_loss": 0.91458869, + "learning_rate": 0.0009836941680195568, + "loss": 0.92537665, + "num_input_tokens_seen": 46809920, + "router_z_loss_mlp": 0.65625, + "step": 567, + "time_per_iteration": 2.8296427726745605 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081032, + "balance_loss_mlp": 1.01508534, + "epoch": 0.10927279722970373, + "flos": 899674507008.0, + "grad_norm": 0.04990856516123606, + "language_loss": 0.87414277, + "learning_rate": 0.0009836151611300166, + "loss": 0.88495302, + "num_input_tokens_seen": 46889984, + "router_z_loss_mlp": 0.65966797, + "step": 568, + "time_per_iteration": 3.2401816844940186 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107825, + "balance_loss_mlp": 1.01206517, + "epoch": 0.10946517891496729, + "flos": 529700613120.0, + "grad_norm": 0.0427731854110213, + "language_loss": 0.96863574, + "learning_rate": 0.0009835359664844194, + "loss": 0.97941828, + "num_input_tokens_seen": 46959536, + "router_z_loss_mlp": 0.66210938, + "step": 569, + "time_per_iteration": 2.6190173625946045 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064438, + "balance_loss_mlp": 1.00092316, + "epoch": 0.10965756060023085, + "flos": 1563994228992.0, + "grad_norm": 0.005811935039235345, + "language_loss": 0.81036806, + "learning_rate": 0.0009834565841135114, + "loss": 0.8210125, + "num_input_tokens_seen": 47196960, + "router_z_loss_mlp": 0.63476562, + "step": 570, + "time_per_iteration": 4.957117795944214 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080699, + "balance_loss_mlp": 1.0151341, + "epoch": 0.10984994228549443, + "flos": 514100826624.0, + "grad_norm": 0.04369440603786518, + "language_loss": 0.94858396, + "learning_rate": 0.0009833770140481118, + "loss": 0.95939088, + "num_input_tokens_seen": 47266560, + "router_z_loss_mlp": 0.65576172, + "step": 571, + "time_per_iteration": 2.6529860496520996 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086256, + "balance_loss_mlp": 1.02059519, + "epoch": 0.11004232397075799, + "flos": 956275252992.0, + "grad_norm": 0.04378732511153692, + "language_loss": 0.85010409, + "learning_rate": 0.000983297256319112, + "loss": 0.86096668, + "num_input_tokens_seen": 47348512, + "router_z_loss_mlp": 0.65673828, + "step": 572, + "time_per_iteration": 3.2036497592926025 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080603, + "balance_loss_mlp": 1.01499045, + "epoch": 0.11023470565602154, + "flos": 489229431552.0, + "grad_norm": 0.043497603291787354, + "language_loss": 0.89141667, + "learning_rate": 0.000983217310957477, + "loss": 0.90222269, + "num_input_tokens_seen": 47425392, + "router_z_loss_mlp": 0.65625, + "step": 573, + "time_per_iteration": 2.7763278484344482 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078757, + "balance_loss_mlp": 1.01333535, + "epoch": 0.1104270873412851, + "flos": 656991292416.0, + "grad_norm": 0.04901418812727031, + "language_loss": 0.9269613, + "learning_rate": 0.000983137177994244, + "loss": 0.93774891, + "num_input_tokens_seen": 47502336, + "router_z_loss_mlp": 0.65429688, + "step": 574, + "time_per_iteration": 2.8529646396636963 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080019, + "balance_loss_mlp": 1.01474011, + "epoch": 0.11061946902654868, + "flos": 724748488704.0, + "grad_norm": 0.03457948694206611, + "language_loss": 0.87449324, + "learning_rate": 0.0009830568574605235, + "loss": 0.88529336, + "num_input_tokens_seen": 47583552, + "router_z_loss_mlp": 0.65283203, + "step": 575, + "time_per_iteration": 2.94710373878479 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010791, + "balance_loss_mlp": 1.01367807, + "epoch": 0.11081185071181224, + "flos": 836869037568.0, + "grad_norm": 0.04085001299476677, + "language_loss": 0.90086508, + "learning_rate": 0.0009829763493874992, + "loss": 0.91165602, + "num_input_tokens_seen": 47663440, + "router_z_loss_mlp": 0.65429688, + "step": 576, + "time_per_iteration": 3.0296730995178223 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107807, + "balance_loss_mlp": 1.01283884, + "epoch": 0.1110042323970758, + "flos": 610283306496.0, + "grad_norm": 0.03775485835018356, + "language_loss": 0.95256275, + "learning_rate": 0.0009828956538064264, + "loss": 0.9633435, + "num_input_tokens_seen": 47741920, + "router_z_loss_mlp": 0.65234375, + "step": 577, + "time_per_iteration": 2.7944416999816895 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073929, + "balance_loss_mlp": 1.00893569, + "epoch": 0.11119661408233936, + "flos": 597040792320.0, + "grad_norm": 0.04378674390965236, + "language_loss": 0.93033826, + "learning_rate": 0.0009828147707486344, + "loss": 0.94107759, + "num_input_tokens_seen": 47815136, + "router_z_loss_mlp": 0.64990234, + "step": 578, + "time_per_iteration": 2.7034592628479004 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075842, + "balance_loss_mlp": 1.01099229, + "epoch": 0.11138899576760293, + "flos": 556888451328.0, + "grad_norm": 0.05042820660432219, + "language_loss": 0.89312434, + "learning_rate": 0.0009827337002455245, + "loss": 0.90388274, + "num_input_tokens_seen": 47881360, + "router_z_loss_mlp": 0.6484375, + "step": 579, + "time_per_iteration": 2.6187195777893066 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074948, + "balance_loss_mlp": 1.01057482, + "epoch": 0.11158137745286649, + "flos": 691063331328.0, + "grad_norm": 0.03501309245374513, + "language_loss": 0.89977694, + "learning_rate": 0.0009826524423285712, + "loss": 0.91052639, + "num_input_tokens_seen": 47962720, + "router_z_loss_mlp": 0.64355469, + "step": 580, + "time_per_iteration": 2.9009909629821777 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079601, + "balance_loss_mlp": 1.0148946, + "epoch": 0.11177375913813005, + "flos": 764307868416.0, + "grad_norm": 0.04023884017549449, + "language_loss": 0.91280103, + "learning_rate": 0.0009825709970293218, + "loss": 0.92359698, + "num_input_tokens_seen": 48035472, + "router_z_loss_mlp": 0.64697266, + "step": 581, + "time_per_iteration": 2.9111618995666504 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074116, + "balance_loss_mlp": 1.0095998, + "epoch": 0.11196614082339361, + "flos": 808031594496.0, + "grad_norm": 0.038028140255108665, + "language_loss": 0.97163212, + "learning_rate": 0.0009824893643793956, + "loss": 0.98237336, + "num_input_tokens_seen": 48116944, + "router_z_loss_mlp": 0.64501953, + "step": 582, + "time_per_iteration": 3.0907368659973145 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072862, + "balance_loss_mlp": 1.00796497, + "epoch": 0.11215852250865718, + "flos": 559725924096.0, + "grad_norm": 0.04580369165919148, + "language_loss": 0.90464842, + "learning_rate": 0.0009824075444104857, + "loss": 0.91537702, + "num_input_tokens_seen": 48187808, + "router_z_loss_mlp": 0.64892578, + "step": 583, + "time_per_iteration": 2.7276525497436523 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107742, + "balance_loss_mlp": 1.01285601, + "epoch": 0.11235090419392074, + "flos": 514576169472.0, + "grad_norm": 0.03926612419770205, + "language_loss": 0.95381963, + "learning_rate": 0.000982325537154357, + "loss": 0.96459383, + "num_input_tokens_seen": 48254464, + "router_z_loss_mlp": 0.64550781, + "step": 584, + "time_per_iteration": 2.6261777877807617 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074375, + "balance_loss_mlp": 1.0100019, + "epoch": 0.1125432858791843, + "flos": 492433377024.0, + "grad_norm": 0.043221505898455144, + "language_loss": 0.96143711, + "learning_rate": 0.0009822433426428484, + "loss": 0.97218084, + "num_input_tokens_seen": 48318784, + "router_z_loss_mlp": 0.64355469, + "step": 585, + "time_per_iteration": 2.5630125999450684 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075075, + "balance_loss_mlp": 1.01079714, + "epoch": 0.11273566756444786, + "flos": 511728003072.0, + "grad_norm": 0.04466131563000304, + "language_loss": 0.88984096, + "learning_rate": 0.0009821609609078697, + "loss": 0.90059173, + "num_input_tokens_seen": 48389248, + "router_z_loss_mlp": 0.64257812, + "step": 586, + "time_per_iteration": 2.649122953414917 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075103, + "balance_loss_mlp": 1.01077783, + "epoch": 0.11292804924971142, + "flos": 623640526848.0, + "grad_norm": 0.03579172726266892, + "language_loss": 0.91595018, + "learning_rate": 0.0009820783919814045, + "loss": 0.92670119, + "num_input_tokens_seen": 48463312, + "router_z_loss_mlp": 0.64306641, + "step": 587, + "time_per_iteration": 2.7977845668792725 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072627, + "balance_loss_mlp": 1.00830126, + "epoch": 0.113120430934975, + "flos": 479039218176.0, + "grad_norm": 0.04738669495581529, + "language_loss": 0.85574889, + "learning_rate": 0.0009819956358955095, + "loss": 0.86647511, + "num_input_tokens_seen": 48531856, + "router_z_loss_mlp": 0.64306641, + "step": 588, + "time_per_iteration": 2.59133243560791 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076686, + "balance_loss_mlp": 1.01245642, + "epoch": 0.11331281262023855, + "flos": 467991638016.0, + "grad_norm": 0.048752038127388646, + "language_loss": 0.86982751, + "learning_rate": 0.0009819126926823127, + "loss": 0.88059437, + "num_input_tokens_seen": 48596640, + "router_z_loss_mlp": 0.64208984, + "step": 589, + "time_per_iteration": 2.511939764022827 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075971, + "balance_loss_mlp": 1.01174104, + "epoch": 0.11350519430550211, + "flos": 651611854848.0, + "grad_norm": 0.04204370934342767, + "language_loss": 0.89311969, + "learning_rate": 0.000981829562374016, + "loss": 0.9038794, + "num_input_tokens_seen": 48669648, + "router_z_loss_mlp": 0.64208984, + "step": 590, + "time_per_iteration": 2.798734426498413 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107506, + "balance_loss_mlp": 1.01111591, + "epoch": 0.11369757599076567, + "flos": 558861754368.0, + "grad_norm": 0.04723710161718091, + "language_loss": 0.99783856, + "learning_rate": 0.0009817462450028933, + "loss": 1.00858927, + "num_input_tokens_seen": 48737392, + "router_z_loss_mlp": 0.63916016, + "step": 591, + "time_per_iteration": 2.717622756958008 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076867, + "balance_loss_mlp": 1.01316178, + "epoch": 0.11388995767602925, + "flos": 572306457600.0, + "grad_norm": 0.041300229846526024, + "language_loss": 0.87103492, + "learning_rate": 0.0009816627406012916, + "loss": 0.88180363, + "num_input_tokens_seen": 48817136, + "router_z_loss_mlp": 0.63671875, + "step": 592, + "time_per_iteration": 2.783677339553833 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077614, + "balance_loss_mlp": 1.01376593, + "epoch": 0.1140823393612928, + "flos": 741744168192.0, + "grad_norm": 0.04574882804976793, + "language_loss": 0.87044728, + "learning_rate": 0.0009815790492016295, + "loss": 0.88122344, + "num_input_tokens_seen": 48895808, + "router_z_loss_mlp": 0.63818359, + "step": 593, + "time_per_iteration": 2.920262336730957 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079358, + "balance_loss_mlp": 1.01560438, + "epoch": 0.11427472104655637, + "flos": 700252314624.0, + "grad_norm": 0.042792726491020304, + "language_loss": 0.89086539, + "learning_rate": 0.0009814951708363993, + "loss": 0.90165901, + "num_input_tokens_seen": 48967456, + "router_z_loss_mlp": 0.63720703, + "step": 594, + "time_per_iteration": 2.8244025707244873 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069794, + "balance_loss_mlp": 1.00799561, + "epoch": 0.11446710273181993, + "flos": 1480355344128.0, + "grad_norm": 0.0135056408383676, + "language_loss": 0.77990985, + "learning_rate": 0.0009814111055381654, + "loss": 0.79060781, + "num_input_tokens_seen": 49193152, + "router_z_loss_mlp": 0.6171875, + "step": 595, + "time_per_iteration": 4.779642105102539 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075224, + "balance_loss_mlp": 1.01189995, + "epoch": 0.1146594844170835, + "flos": 495913388544.0, + "grad_norm": 0.038757735955663945, + "language_loss": 0.90035105, + "learning_rate": 0.0009813268533395648, + "loss": 0.91110331, + "num_input_tokens_seen": 49260960, + "router_z_loss_mlp": 0.6328125, + "step": 596, + "time_per_iteration": 2.5933825969696045 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082133, + "balance_loss_mlp": 1.01895213, + "epoch": 0.11485186610234706, + "flos": 475791531264.0, + "grad_norm": 0.0538004660752225, + "language_loss": 0.90474582, + "learning_rate": 0.0009812424142733073, + "loss": 0.9155671, + "num_input_tokens_seen": 49327616, + "router_z_loss_mlp": 0.63134766, + "step": 597, + "time_per_iteration": 2.528027296066284 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073976, + "balance_loss_mlp": 1.01089013, + "epoch": 0.11504424778761062, + "flos": 732620313600.0, + "grad_norm": 0.03283482462688361, + "language_loss": 0.87953097, + "learning_rate": 0.000981157788372175, + "loss": 0.89027071, + "num_input_tokens_seen": 49412864, + "router_z_loss_mlp": 0.63037109, + "step": 598, + "time_per_iteration": 3.008469343185425 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074784, + "balance_loss_mlp": 1.01160276, + "epoch": 0.11523662947287418, + "flos": 546963610368.0, + "grad_norm": 0.037424804687157906, + "language_loss": 0.91041148, + "learning_rate": 0.0009810729756690223, + "loss": 0.92115927, + "num_input_tokens_seen": 49483584, + "router_z_loss_mlp": 0.63134766, + "step": 599, + "time_per_iteration": 2.75840163230896 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077312, + "balance_loss_mlp": 1.01408339, + "epoch": 0.11542901115813775, + "flos": 776388759552.0, + "grad_norm": 0.04126969924944996, + "language_loss": 0.9391377, + "learning_rate": 0.0009809879761967766, + "loss": 0.94991082, + "num_input_tokens_seen": 49563568, + "router_z_loss_mlp": 0.63183594, + "step": 600, + "time_per_iteration": 2.9511778354644775 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081843, + "balance_loss_mlp": 1.01828074, + "epoch": 0.11562139284340131, + "flos": 732213990144.0, + "grad_norm": 0.05544181306164312, + "language_loss": 0.88981479, + "learning_rate": 0.0009809027899884378, + "loss": 0.90063322, + "num_input_tokens_seen": 49640800, + "router_z_loss_mlp": 0.63525391, + "step": 601, + "time_per_iteration": 2.888591766357422 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076176, + "balance_loss_mlp": 1.01256609, + "epoch": 0.11581377452866487, + "flos": 537040714752.0, + "grad_norm": 0.03483284203155477, + "language_loss": 0.90335476, + "learning_rate": 0.0009808174170770779, + "loss": 0.9141165, + "num_input_tokens_seen": 49721872, + "router_z_loss_mlp": 0.63574219, + "step": 602, + "time_per_iteration": 2.7933802604675293 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073479, + "balance_loss_mlp": 1.01263428, + "epoch": 0.11600615621392843, + "flos": 1559214555648.0, + "grad_norm": 0.012041981792172347, + "language_loss": 0.84898245, + "learning_rate": 0.0009807318574958418, + "loss": 0.85971725, + "num_input_tokens_seen": 49951472, + "router_z_loss_mlp": 0.60742188, + "step": 603, + "time_per_iteration": 4.875667572021484 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079811, + "balance_loss_mlp": 1.01658237, + "epoch": 0.116198537899192, + "flos": 538468688640.0, + "grad_norm": 0.046063141341509364, + "language_loss": 0.95944118, + "learning_rate": 0.0009806461112779462, + "loss": 0.97023928, + "num_input_tokens_seen": 50021136, + "router_z_loss_mlp": 0.63183594, + "step": 604, + "time_per_iteration": 2.708552360534668 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077772, + "balance_loss_mlp": 1.01444781, + "epoch": 0.11639091958445556, + "flos": 455137950720.0, + "grad_norm": 0.05737724930332189, + "language_loss": 0.90764457, + "learning_rate": 0.0009805601784566814, + "loss": 0.91842222, + "num_input_tokens_seen": 50083888, + "router_z_loss_mlp": 0.6328125, + "step": 605, + "time_per_iteration": 2.545696496963501 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076475, + "balance_loss_mlp": 1.01329422, + "epoch": 0.11658330126971912, + "flos": 556152593664.0, + "grad_norm": 0.04016687987230144, + "language_loss": 0.97276044, + "learning_rate": 0.0009804740590654089, + "loss": 0.98352522, + "num_input_tokens_seen": 50151744, + "router_z_loss_mlp": 0.63134766, + "step": 606, + "time_per_iteration": 2.6464574337005615 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077132, + "balance_loss_mlp": 1.01399851, + "epoch": 0.11677568295498268, + "flos": 717601827840.0, + "grad_norm": 0.0453344941203476, + "language_loss": 0.91881627, + "learning_rate": 0.0009803877531375635, + "loss": 0.9295876, + "num_input_tokens_seen": 50221248, + "router_z_loss_mlp": 0.63085938, + "step": 607, + "time_per_iteration": 2.8467392921447754 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074228, + "balance_loss_mlp": 1.0111903, + "epoch": 0.11696806464024626, + "flos": 610899600384.0, + "grad_norm": 0.04469679718872237, + "language_loss": 0.92976171, + "learning_rate": 0.0009803012607066523, + "loss": 0.94050401, + "num_input_tokens_seen": 50293792, + "router_z_loss_mlp": 0.62988281, + "step": 608, + "time_per_iteration": 2.7587811946868896 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073203, + "balance_loss_mlp": 1.01030838, + "epoch": 0.11716044632550981, + "flos": 521416628736.0, + "grad_norm": 0.04044307397502579, + "language_loss": 0.91207683, + "learning_rate": 0.0009802145818062543, + "loss": 0.92280889, + "num_input_tokens_seen": 50367760, + "router_z_loss_mlp": 0.62841797, + "step": 609, + "time_per_iteration": 2.7623538970947266 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107199, + "balance_loss_mlp": 1.00919068, + "epoch": 0.11735282801077337, + "flos": 508489064448.0, + "grad_norm": 0.04251091083777229, + "language_loss": 0.93763256, + "learning_rate": 0.0009801277164700212, + "loss": 0.9483524, + "num_input_tokens_seen": 50435664, + "router_z_loss_mlp": 0.62744141, + "step": 610, + "time_per_iteration": 2.6250369548797607 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079805, + "balance_loss_mlp": 1.0171963, + "epoch": 0.11754520969603693, + "flos": 687837031680.0, + "grad_norm": 0.044835447829723894, + "language_loss": 0.91796255, + "learning_rate": 0.0009800406647316776, + "loss": 0.92876053, + "num_input_tokens_seen": 50514144, + "router_z_loss_mlp": 0.62548828, + "step": 611, + "time_per_iteration": 2.81438946723938 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01058731, + "balance_loss_mlp": 0.99807739, + "epoch": 0.1177375913813005, + "flos": 1545759158784.0, + "grad_norm": 0.00493114536612535, + "language_loss": 0.76914459, + "learning_rate": 0.0009799534266250196, + "loss": 0.77973187, + "num_input_tokens_seen": 50738448, + "router_z_loss_mlp": 0.60546875, + "step": 612, + "time_per_iteration": 4.795796871185303 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073024, + "balance_loss_mlp": 1.01008153, + "epoch": 0.11792997306656407, + "flos": 521538137856.0, + "grad_norm": 0.049162221556570344, + "language_loss": 0.91035461, + "learning_rate": 0.000979866002183916, + "loss": 0.92108488, + "num_input_tokens_seen": 50809328, + "router_z_loss_mlp": 0.62890625, + "step": 613, + "time_per_iteration": 2.6470768451690674 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071385, + "balance_loss_mlp": 1.00820458, + "epoch": 0.11812235475182763, + "flos": 667489652736.0, + "grad_norm": 0.0453482214384289, + "language_loss": 0.92239928, + "learning_rate": 0.0009797783914423082, + "loss": 0.93311322, + "num_input_tokens_seen": 50887728, + "router_z_loss_mlp": 0.63134766, + "step": 614, + "time_per_iteration": 2.8020856380462646 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107539, + "balance_loss_mlp": 1.01220894, + "epoch": 0.11831473643709119, + "flos": 622505148672.0, + "grad_norm": 0.04034391423157231, + "language_loss": 0.86097217, + "learning_rate": 0.0009796905944342094, + "loss": 0.87172604, + "num_input_tokens_seen": 50966160, + "router_z_loss_mlp": 0.63134766, + "step": 615, + "time_per_iteration": 2.839617967605591 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079046, + "balance_loss_mlp": 1.0160079, + "epoch": 0.11850711812235475, + "flos": 457695466752.0, + "grad_norm": 0.03330066749319758, + "language_loss": 0.89949274, + "learning_rate": 0.0009796026111937057, + "loss": 0.91028321, + "num_input_tokens_seen": 51035712, + "router_z_loss_mlp": 0.62988281, + "step": 616, + "time_per_iteration": 2.6211540699005127 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077601, + "balance_loss_mlp": 1.0150882, + "epoch": 0.11869949980761832, + "flos": 514928057856.0, + "grad_norm": 0.034464018290856886, + "language_loss": 0.90251315, + "learning_rate": 0.0009795144417549552, + "loss": 0.91328913, + "num_input_tokens_seen": 51108656, + "router_z_loss_mlp": 0.62451172, + "step": 617, + "time_per_iteration": 2.6946897506713867 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080332, + "balance_loss_mlp": 1.01815259, + "epoch": 0.11889188149288188, + "flos": 536157103104.0, + "grad_norm": 0.035314864293198016, + "language_loss": 0.91583192, + "learning_rate": 0.0009794260861521883, + "loss": 0.92663527, + "num_input_tokens_seen": 51185552, + "router_z_loss_mlp": 0.62109375, + "step": 618, + "time_per_iteration": 2.77822208404541 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081689, + "balance_loss_mlp": 1.01979554, + "epoch": 0.11908426317814544, + "flos": 499645166592.0, + "grad_norm": 0.042334404758790994, + "language_loss": 0.88659471, + "learning_rate": 0.0009793375444197075, + "loss": 0.89741158, + "num_input_tokens_seen": 51255808, + "router_z_loss_mlp": 0.61816406, + "step": 619, + "time_per_iteration": 2.6199400424957275 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086152, + "balance_loss_mlp": 1.02416277, + "epoch": 0.119276644863409, + "flos": 661068155904.0, + "grad_norm": 0.043937618111938345, + "language_loss": 0.86906028, + "learning_rate": 0.000979248816591888, + "loss": 0.87992179, + "num_input_tokens_seen": 51329408, + "router_z_loss_mlp": 0.61914062, + "step": 620, + "time_per_iteration": 2.789858341217041 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081653, + "balance_loss_mlp": 1.01947308, + "epoch": 0.11946902654867257, + "flos": 760153237248.0, + "grad_norm": 0.04701199265522289, + "language_loss": 0.87992656, + "learning_rate": 0.0009791599027031766, + "loss": 0.89074314, + "num_input_tokens_seen": 51408784, + "router_z_loss_mlp": 0.62109375, + "step": 621, + "time_per_iteration": 3.026487350463867 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074317, + "balance_loss_mlp": 1.01223314, + "epoch": 0.11966140823393613, + "flos": 682214575872.0, + "grad_norm": 0.0506686420393155, + "language_loss": 0.88143325, + "learning_rate": 0.0009790708027880932, + "loss": 0.89217639, + "num_input_tokens_seen": 51482592, + "router_z_loss_mlp": 0.62011719, + "step": 622, + "time_per_iteration": 2.8321774005889893 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081512, + "balance_loss_mlp": 1.02143097, + "epoch": 0.11985378991919969, + "flos": 1454300938752.0, + "grad_norm": 0.023212611497014573, + "language_loss": 0.77427292, + "learning_rate": 0.0009789815168812293, + "loss": 0.78508806, + "num_input_tokens_seen": 51712240, + "router_z_loss_mlp": 0.59960938, + "step": 623, + "time_per_iteration": 4.862462759017944 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071261, + "balance_loss_mlp": 1.00936747, + "epoch": 0.12004617160446325, + "flos": 528899626752.0, + "grad_norm": 0.04437858339694968, + "language_loss": 0.95209736, + "learning_rate": 0.0009788920450172487, + "loss": 0.96280998, + "num_input_tokens_seen": 51781440, + "router_z_loss_mlp": 0.61816406, + "step": 624, + "time_per_iteration": 2.630764961242676 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078232, + "balance_loss_mlp": 1.01619518, + "epoch": 0.12023855328972682, + "flos": 475177182720.0, + "grad_norm": 0.048047229360432486, + "language_loss": 0.92430472, + "learning_rate": 0.0009788023872308875, + "loss": 0.93508708, + "num_input_tokens_seen": 51845424, + "router_z_loss_mlp": 0.61962891, + "step": 625, + "time_per_iteration": 2.5534780025482178 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076523, + "balance_loss_mlp": 1.01682281, + "epoch": 0.12043093497499038, + "flos": 1535054718720.0, + "grad_norm": 0.022021305117703366, + "language_loss": 0.75428998, + "learning_rate": 0.0009787125435569539, + "loss": 0.7650553, + "num_input_tokens_seen": 52076496, + "router_z_loss_mlp": 0.59570312, + "step": 626, + "time_per_iteration": 4.738527536392212 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108475, + "balance_loss_mlp": 1.023, + "epoch": 0.12062331666025394, + "flos": 540915389184.0, + "grad_norm": 0.04663901515177362, + "language_loss": 0.9603011, + "learning_rate": 0.0009786225140303285, + "loss": 0.97114861, + "num_input_tokens_seen": 52143072, + "router_z_loss_mlp": 0.61669922, + "step": 627, + "time_per_iteration": 2.634160280227661 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085928, + "balance_loss_mlp": 1.02427304, + "epoch": 0.1208156983455175, + "flos": 513000441600.0, + "grad_norm": 0.042540459475059536, + "language_loss": 0.94019556, + "learning_rate": 0.0009785322986859634, + "loss": 0.95105481, + "num_input_tokens_seen": 52211888, + "router_z_loss_mlp": 0.61572266, + "step": 628, + "time_per_iteration": 2.681070327758789 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078772, + "balance_loss_mlp": 1.01725972, + "epoch": 0.12100808003078108, + "flos": 597590012160.0, + "grad_norm": 0.03866803919075334, + "language_loss": 0.94614279, + "learning_rate": 0.0009784418975588838, + "loss": 0.95693052, + "num_input_tokens_seen": 52283696, + "router_z_loss_mlp": 0.61425781, + "step": 629, + "time_per_iteration": 2.7337839603424072 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073027, + "balance_loss_mlp": 1.01132393, + "epoch": 0.12120046171604464, + "flos": 524067463680.0, + "grad_norm": 0.03279843121618067, + "language_loss": 0.94581258, + "learning_rate": 0.0009783513106841862, + "loss": 0.95654285, + "num_input_tokens_seen": 52358624, + "router_z_loss_mlp": 0.61621094, + "step": 630, + "time_per_iteration": 2.702615737915039 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080086, + "balance_loss_mlp": 1.01981354, + "epoch": 0.1213928434013082, + "flos": 1557910036224.0, + "grad_norm": 0.01502333088768157, + "language_loss": 0.76732707, + "learning_rate": 0.00097826053809704, + "loss": 0.77812791, + "num_input_tokens_seen": 52591248, + "router_z_loss_mlp": 0.6015625, + "step": 631, + "time_per_iteration": 4.998409032821655 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080339, + "balance_loss_mlp": 1.01835024, + "epoch": 0.12158522508657175, + "flos": 496388731392.0, + "grad_norm": 0.04174070683076465, + "language_loss": 0.89320499, + "learning_rate": 0.0009781695798326854, + "loss": 0.90400839, + "num_input_tokens_seen": 52659920, + "router_z_loss_mlp": 0.61914062, + "step": 632, + "time_per_iteration": 2.5908379554748535 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079195, + "balance_loss_mlp": 1.01744485, + "epoch": 0.12177760677183531, + "flos": 476590572288.0, + "grad_norm": 0.04165368210868703, + "language_loss": 0.89744723, + "learning_rate": 0.0009780784359264365, + "loss": 0.90823919, + "num_input_tokens_seen": 52728832, + "router_z_loss_mlp": 0.61669922, + "step": 633, + "time_per_iteration": 2.689202070236206 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073334, + "balance_loss_mlp": 1.01382446, + "epoch": 0.12196998845709889, + "flos": 1471787512320.0, + "grad_norm": 0.011333314510513573, + "language_loss": 0.74188697, + "learning_rate": 0.0009779871064136778, + "loss": 0.75262028, + "num_input_tokens_seen": 52949776, + "router_z_loss_mlp": 0.59375, + "step": 634, + "time_per_iteration": 4.762145757675171 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073337, + "balance_loss_mlp": 1.01187229, + "epoch": 0.12216237014236245, + "flos": 587749741824.0, + "grad_norm": 0.03178889939160208, + "language_loss": 0.88649213, + "learning_rate": 0.000977895591329867, + "loss": 0.8972255, + "num_input_tokens_seen": 53027184, + "router_z_loss_mlp": 0.61376953, + "step": 635, + "time_per_iteration": 2.7996504306793213 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075051, + "balance_loss_mlp": 1.01372933, + "epoch": 0.12235475182762601, + "flos": 599107414272.0, + "grad_norm": 0.038321985001081305, + "language_loss": 0.88459468, + "learning_rate": 0.000977803890710533, + "loss": 0.89534515, + "num_input_tokens_seen": 53101072, + "router_z_loss_mlp": 0.61230469, + "step": 636, + "time_per_iteration": 2.7200405597686768 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072884, + "balance_loss_mlp": 1.0117538, + "epoch": 0.12254713351288957, + "flos": 498761554944.0, + "grad_norm": 0.03313527469264444, + "language_loss": 0.94808865, + "learning_rate": 0.0009777120045912774, + "loss": 0.95881748, + "num_input_tokens_seen": 53172992, + "router_z_loss_mlp": 0.61035156, + "step": 637, + "time_per_iteration": 2.6253507137298584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072019, + "balance_loss_mlp": 1.01084125, + "epoch": 0.12273951519815314, + "flos": 606981184512.0, + "grad_norm": 0.04065251745031248, + "language_loss": 0.91558111, + "learning_rate": 0.0009776199330077736, + "loss": 0.92630136, + "num_input_tokens_seen": 53248256, + "router_z_loss_mlp": 0.61083984, + "step": 638, + "time_per_iteration": 2.724416732788086 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069779, + "balance_loss_mlp": 1.0086484, + "epoch": 0.1229318968834167, + "flos": 598985905152.0, + "grad_norm": 0.04427923240085457, + "language_loss": 0.94062102, + "learning_rate": 0.0009775276759957667, + "loss": 0.9513188, + "num_input_tokens_seen": 53318960, + "router_z_loss_mlp": 0.61035156, + "step": 639, + "time_per_iteration": 2.756307601928711 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070032, + "balance_loss_mlp": 1.00851989, + "epoch": 0.12312427856868026, + "flos": 679589985792.0, + "grad_norm": 0.04435656949952303, + "language_loss": 0.91938198, + "learning_rate": 0.0009774352335910745, + "loss": 0.93008226, + "num_input_tokens_seen": 53389120, + "router_z_loss_mlp": 0.61425781, + "step": 640, + "time_per_iteration": 2.8135974407196045 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072128, + "balance_loss_mlp": 1.01095021, + "epoch": 0.12331666025394382, + "flos": 610044178944.0, + "grad_norm": 0.03352322480141845, + "language_loss": 0.95842457, + "learning_rate": 0.000977342605829586, + "loss": 0.96914589, + "num_input_tokens_seen": 53459056, + "router_z_loss_mlp": 0.61083984, + "step": 641, + "time_per_iteration": 2.734373092651367 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107341, + "balance_loss_mlp": 1.01208854, + "epoch": 0.12350904193920739, + "flos": 763841273856.0, + "grad_norm": 0.04166007448412618, + "language_loss": 0.87458932, + "learning_rate": 0.0009772497927472623, + "loss": 0.88532341, + "num_input_tokens_seen": 53541552, + "router_z_loss_mlp": 0.61230469, + "step": 642, + "time_per_iteration": 3.069495677947998 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107346, + "balance_loss_mlp": 1.01199543, + "epoch": 0.12370142362447095, + "flos": 542050767360.0, + "grad_norm": 0.04189965725350253, + "language_loss": 0.86664522, + "learning_rate": 0.0009771567943801368, + "loss": 0.87737978, + "num_input_tokens_seen": 53611520, + "router_z_loss_mlp": 0.61376953, + "step": 643, + "time_per_iteration": 2.6783955097198486 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071185, + "balance_loss_mlp": 1.01000655, + "epoch": 0.12389380530973451, + "flos": 549253808640.0, + "grad_norm": 0.03907898995026106, + "language_loss": 0.90534973, + "learning_rate": 0.0009770636107643152, + "loss": 0.91606158, + "num_input_tokens_seen": 53683888, + "router_z_loss_mlp": 0.61083984, + "step": 644, + "time_per_iteration": 2.7792532444000244 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107442, + "balance_loss_mlp": 1.01343274, + "epoch": 0.12408618699499807, + "flos": 541353793536.0, + "grad_norm": 0.03775088580197231, + "language_loss": 0.89077818, + "learning_rate": 0.0009769702419359738, + "loss": 0.9015224, + "num_input_tokens_seen": 53751888, + "router_z_loss_mlp": 0.60888672, + "step": 645, + "time_per_iteration": 2.6660075187683105 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071725, + "balance_loss_mlp": 1.01083338, + "epoch": 0.12427856868026164, + "flos": 747160544256.0, + "grad_norm": 0.03491310842571494, + "language_loss": 0.90435565, + "learning_rate": 0.000976876687931362, + "loss": 0.91507292, + "num_input_tokens_seen": 53827648, + "router_z_loss_mlp": 0.60791016, + "step": 646, + "time_per_iteration": 3.028578758239746 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074215, + "balance_loss_mlp": 1.01332271, + "epoch": 0.1244709503655252, + "flos": 534745658880.0, + "grad_norm": 0.04739554944994068, + "language_loss": 0.86433625, + "learning_rate": 0.0009767829487868005, + "loss": 0.87507832, + "num_input_tokens_seen": 53896400, + "router_z_loss_mlp": 0.60791016, + "step": 647, + "time_per_iteration": 2.6323471069335938 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075713, + "balance_loss_mlp": 1.01472592, + "epoch": 0.12466333205078876, + "flos": 509112161280.0, + "grad_norm": 0.0390766896094967, + "language_loss": 0.89632404, + "learning_rate": 0.000976689024538682, + "loss": 0.90708113, + "num_input_tokens_seen": 53965904, + "router_z_loss_mlp": 0.60888672, + "step": 648, + "time_per_iteration": 2.6233997344970703 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069043, + "balance_loss_mlp": 1.00819838, + "epoch": 0.12485571373605232, + "flos": 682640341248.0, + "grad_norm": 0.04106035596266842, + "language_loss": 0.87981439, + "learning_rate": 0.0009765949152234716, + "loss": 0.89050484, + "num_input_tokens_seen": 54049792, + "router_z_loss_mlp": 0.60742188, + "step": 649, + "time_per_iteration": 2.9135711193084717 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064964, + "balance_loss_mlp": 1.00659943, + "epoch": 0.1250480954213159, + "flos": 1333201377024.0, + "grad_norm": 0.013063081234142807, + "language_loss": 0.78686082, + "learning_rate": 0.0009765006208777055, + "loss": 0.79751045, + "num_input_tokens_seen": 54262432, + "router_z_loss_mlp": 0.58203125, + "step": 650, + "time_per_iteration": 4.696362495422363 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069932, + "balance_loss_mlp": 1.0093261, + "epoch": 0.12524047710657946, + "flos": 940198178304.0, + "grad_norm": 0.03723688894295025, + "language_loss": 0.82869852, + "learning_rate": 0.0009764061415379919, + "loss": 0.83939779, + "num_input_tokens_seen": 54351568, + "router_z_loss_mlp": 0.60498047, + "step": 651, + "time_per_iteration": 3.287029504776001 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071729, + "balance_loss_mlp": 1.01078951, + "epoch": 0.12543285879184302, + "flos": 514901812992.0, + "grad_norm": 0.03842788822410913, + "language_loss": 0.90123397, + "learning_rate": 0.0009763114772410109, + "loss": 0.91195124, + "num_input_tokens_seen": 54418944, + "router_z_loss_mlp": 0.60839844, + "step": 652, + "time_per_iteration": 2.5726470947265625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071215, + "balance_loss_mlp": 1.01075244, + "epoch": 0.12562524047710658, + "flos": 719684001024.0, + "grad_norm": 0.03790395950388449, + "language_loss": 0.88320071, + "learning_rate": 0.0009762166280235146, + "loss": 0.89391285, + "num_input_tokens_seen": 54495312, + "router_z_loss_mlp": 0.60351562, + "step": 653, + "time_per_iteration": 2.9728682041168213 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073992, + "balance_loss_mlp": 1.01372027, + "epoch": 0.12581762216237014, + "flos": 564799160064.0, + "grad_norm": 0.039966468352906216, + "language_loss": 0.88308495, + "learning_rate": 0.0009761215939223267, + "loss": 0.89382488, + "num_input_tokens_seen": 54566832, + "router_z_loss_mlp": 0.6015625, + "step": 654, + "time_per_iteration": 2.7552366256713867 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071616, + "balance_loss_mlp": 1.01100981, + "epoch": 0.1260100038476337, + "flos": 482901253632.0, + "grad_norm": 0.045851790315233704, + "language_loss": 0.87049586, + "learning_rate": 0.0009760263749743428, + "loss": 0.88121206, + "num_input_tokens_seen": 54632128, + "router_z_loss_mlp": 0.60498047, + "step": 655, + "time_per_iteration": 2.5859339237213135 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073482, + "balance_loss_mlp": 1.01301908, + "epoch": 0.12620238553289725, + "flos": 576702161664.0, + "grad_norm": 0.03680601760412016, + "language_loss": 0.91127861, + "learning_rate": 0.0009759309712165299, + "loss": 0.9220134, + "num_input_tokens_seen": 54707600, + "router_z_loss_mlp": 0.60351562, + "step": 656, + "time_per_iteration": 2.7411043643951416 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069023, + "balance_loss_mlp": 1.00841653, + "epoch": 0.12639476721816084, + "flos": 532186197504.0, + "grad_norm": 0.050748048847022796, + "language_loss": 0.94208288, + "learning_rate": 0.0009758353826859272, + "loss": 0.95277309, + "num_input_tokens_seen": 54776704, + "router_z_loss_mlp": 0.60498047, + "step": 657, + "time_per_iteration": 2.5851681232452393 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071924, + "balance_loss_mlp": 1.01117456, + "epoch": 0.1265871489034244, + "flos": 691232472576.0, + "grad_norm": 0.04052834214006204, + "language_loss": 0.90056133, + "learning_rate": 0.0009757396094196456, + "loss": 0.91128063, + "num_input_tokens_seen": 54851744, + "router_z_loss_mlp": 0.60644531, + "step": 658, + "time_per_iteration": 2.9119739532470703 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071395, + "balance_loss_mlp": 1.01083672, + "epoch": 0.12677953058868796, + "flos": 538243166976.0, + "grad_norm": 0.03305987481805703, + "language_loss": 0.85138786, + "learning_rate": 0.0009756436514548673, + "loss": 0.86210179, + "num_input_tokens_seen": 54932576, + "router_z_loss_mlp": 0.60449219, + "step": 659, + "time_per_iteration": 2.8146860599517822 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070631, + "balance_loss_mlp": 1.01021552, + "epoch": 0.12697191227395152, + "flos": 520120857600.0, + "grad_norm": 0.03322369158928612, + "language_loss": 0.89052176, + "learning_rate": 0.0009755475088288466, + "loss": 0.90122807, + "num_input_tokens_seen": 55007296, + "router_z_loss_mlp": 0.60302734, + "step": 660, + "time_per_iteration": 2.7092652320861816 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070351, + "balance_loss_mlp": 1.01007843, + "epoch": 0.12716429395921508, + "flos": 567666768384.0, + "grad_norm": 0.0427017471912124, + "language_loss": 0.91535795, + "learning_rate": 0.0009754511815789095, + "loss": 0.92606151, + "num_input_tokens_seen": 55079312, + "router_z_loss_mlp": 0.6015625, + "step": 661, + "time_per_iteration": 2.790198564529419 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068315, + "balance_loss_mlp": 1.00809085, + "epoch": 0.12735667564447864, + "flos": 515142885888.0, + "grad_norm": 0.0409493229321676, + "language_loss": 0.8685838, + "learning_rate": 0.0009753546697424533, + "loss": 0.87926698, + "num_input_tokens_seen": 55151824, + "router_z_loss_mlp": 0.60107422, + "step": 662, + "time_per_iteration": 2.6784565448760986 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070378, + "balance_loss_mlp": 1.01020074, + "epoch": 0.1275490573297422, + "flos": 542321975808.0, + "grad_norm": 0.039351291895580044, + "language_loss": 0.91270494, + "learning_rate": 0.0009752579733569475, + "loss": 0.92340875, + "num_input_tokens_seen": 55224368, + "router_z_loss_mlp": 0.60058594, + "step": 663, + "time_per_iteration": 2.679379940032959 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071762, + "balance_loss_mlp": 1.01358795, + "epoch": 0.12774143901500576, + "flos": 1562027728896.0, + "grad_norm": 0.016936801864205438, + "language_loss": 0.74881387, + "learning_rate": 0.0009751610924599328, + "loss": 0.7595315, + "num_input_tokens_seen": 55453584, + "router_z_loss_mlp": 0.58007812, + "step": 664, + "time_per_iteration": 4.936127424240112 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070961, + "balance_loss_mlp": 1.01092672, + "epoch": 0.12793382070026935, + "flos": 614874396672.0, + "grad_norm": 0.047422479810277696, + "language_loss": 0.90634137, + "learning_rate": 0.0009750640270890217, + "loss": 0.91705096, + "num_input_tokens_seen": 55528000, + "router_z_loss_mlp": 0.59912109, + "step": 665, + "time_per_iteration": 2.712202548980713 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073503, + "balance_loss_mlp": 1.01361179, + "epoch": 0.1281262023855329, + "flos": 709118566656.0, + "grad_norm": 0.04721256261198653, + "language_loss": 0.97348696, + "learning_rate": 0.0009749667772818983, + "loss": 0.98422199, + "num_input_tokens_seen": 55612416, + "router_z_loss_mlp": 0.59765625, + "step": 666, + "time_per_iteration": 2.959563732147217 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065521, + "balance_loss_mlp": 1.00791931, + "epoch": 0.12831858407079647, + "flos": 1428185295360.0, + "grad_norm": 0.00958948420866419, + "language_loss": 0.76935941, + "learning_rate": 0.0009748693430763185, + "loss": 0.78001463, + "num_input_tokens_seen": 55843664, + "router_z_loss_mlp": 0.57421875, + "step": 667, + "time_per_iteration": 4.823887825012207 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071916, + "balance_loss_mlp": 1.01259768, + "epoch": 0.12851096575606002, + "flos": 450019027968.0, + "grad_norm": 0.04331482152431362, + "language_loss": 0.96237415, + "learning_rate": 0.0009747717245101093, + "loss": 0.97309327, + "num_input_tokens_seen": 55909072, + "router_z_loss_mlp": 0.59179688, + "step": 668, + "time_per_iteration": 2.5234646797180176 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071255, + "balance_loss_mlp": 1.01193655, + "epoch": 0.12870334744132358, + "flos": 480910454016.0, + "grad_norm": 0.040015395826151615, + "language_loss": 0.86231172, + "learning_rate": 0.00097467392162117, + "loss": 0.87302423, + "num_input_tokens_seen": 55978544, + "router_z_loss_mlp": 0.59179688, + "step": 669, + "time_per_iteration": 2.620121717453003 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073025, + "balance_loss_mlp": 1.01342034, + "epoch": 0.12889572912658714, + "flos": 640152115200.0, + "grad_norm": 0.03307407171369126, + "language_loss": 0.91950834, + "learning_rate": 0.0009745759344474708, + "loss": 0.9302386, + "num_input_tokens_seen": 56054144, + "router_z_loss_mlp": 0.59472656, + "step": 670, + "time_per_iteration": 2.834406852722168 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070894, + "balance_loss_mlp": 1.01114607, + "epoch": 0.1290881108118507, + "flos": 510955206912.0, + "grad_norm": 0.03904079329345599, + "language_loss": 0.90752548, + "learning_rate": 0.0009744777630270536, + "loss": 0.91823441, + "num_input_tokens_seen": 56120960, + "router_z_loss_mlp": 0.59619141, + "step": 671, + "time_per_iteration": 2.5841259956359863 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069609, + "balance_loss_mlp": 1.00967062, + "epoch": 0.12928049249711426, + "flos": 672291680256.0, + "grad_norm": 0.0427916369984872, + "language_loss": 0.94394779, + "learning_rate": 0.000974379407398032, + "loss": 0.95464385, + "num_input_tokens_seen": 56202560, + "router_z_loss_mlp": 0.59814453, + "step": 672, + "time_per_iteration": 2.8698208332061768 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072071, + "balance_loss_mlp": 1.0120368, + "epoch": 0.12947287418237785, + "flos": 795000017664.0, + "grad_norm": 0.03399258645873994, + "language_loss": 0.83039552, + "learning_rate": 0.0009742808675985913, + "loss": 0.84111625, + "num_input_tokens_seen": 56289456, + "router_z_loss_mlp": 0.59912109, + "step": 673, + "time_per_iteration": 3.1018688678741455 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067279, + "balance_loss_mlp": 1.00729334, + "epoch": 0.1296652558676414, + "flos": 486448339200.0, + "grad_norm": 0.039807509100232605, + "language_loss": 0.91899526, + "learning_rate": 0.0009741821436669876, + "loss": 0.92966807, + "num_input_tokens_seen": 56354480, + "router_z_loss_mlp": 0.59863281, + "step": 674, + "time_per_iteration": 2.6348536014556885 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068278, + "balance_loss_mlp": 1.00853038, + "epoch": 0.12985763755290497, + "flos": 454393344768.0, + "grad_norm": 0.044170807310258554, + "language_loss": 0.93403888, + "learning_rate": 0.0009740832356415492, + "loss": 0.9447217, + "num_input_tokens_seen": 56418944, + "router_z_loss_mlp": 0.59619141, + "step": 675, + "time_per_iteration": 2.483262538909912 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072639, + "balance_loss_mlp": 1.01265311, + "epoch": 0.13005001923816853, + "flos": 826435805952.0, + "grad_norm": 0.043859966784303914, + "language_loss": 0.89693773, + "learning_rate": 0.0009739841435606756, + "loss": 0.90766412, + "num_input_tokens_seen": 56492368, + "router_z_loss_mlp": 0.59863281, + "step": 676, + "time_per_iteration": 2.992385149002075 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066961, + "balance_loss_mlp": 1.00726056, + "epoch": 0.1302424009234321, + "flos": 532481705472.0, + "grad_norm": 0.03559705023164985, + "language_loss": 0.91210669, + "learning_rate": 0.0009738848674628377, + "loss": 0.92277622, + "num_input_tokens_seen": 56568128, + "router_z_loss_mlp": 0.59570312, + "step": 677, + "time_per_iteration": 2.766364574432373 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106721, + "balance_loss_mlp": 1.00765288, + "epoch": 0.13043478260869565, + "flos": 526917575424.0, + "grad_norm": 0.03838556287658105, + "language_loss": 0.90382779, + "learning_rate": 0.000973785407386578, + "loss": 0.91449988, + "num_input_tokens_seen": 56646448, + "router_z_loss_mlp": 0.59423828, + "step": 678, + "time_per_iteration": 2.772854804992676 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070658, + "balance_loss_mlp": 1.01076782, + "epoch": 0.1306271642939592, + "flos": 627417991680.0, + "grad_norm": 0.03509098765963207, + "language_loss": 0.88142246, + "learning_rate": 0.0009736857633705103, + "loss": 0.89212906, + "num_input_tokens_seen": 56732080, + "router_z_loss_mlp": 0.59765625, + "step": 679, + "time_per_iteration": 2.851567268371582 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075866, + "balance_loss_mlp": 1.01602292, + "epoch": 0.13081954597922277, + "flos": 551841460224.0, + "grad_norm": 0.03859467755451503, + "language_loss": 0.94306064, + "learning_rate": 0.0009735859354533196, + "loss": 0.95381933, + "num_input_tokens_seen": 56804432, + "router_z_loss_mlp": 0.59716797, + "step": 680, + "time_per_iteration": 2.6908183097839355 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070228, + "balance_loss_mlp": 1.01038456, + "epoch": 0.13101192766448633, + "flos": 537956407296.0, + "grad_norm": 0.04695623305024525, + "language_loss": 0.92768431, + "learning_rate": 0.0009734859236737628, + "loss": 0.93838656, + "num_input_tokens_seen": 56872512, + "router_z_loss_mlp": 0.59716797, + "step": 681, + "time_per_iteration": 2.618556261062622 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065875, + "balance_loss_mlp": 1.00631785, + "epoch": 0.13120430934974991, + "flos": 504514268160.0, + "grad_norm": 0.03771498494962771, + "language_loss": 0.94425803, + "learning_rate": 0.0009733857280706678, + "loss": 0.95491678, + "num_input_tokens_seen": 56940928, + "router_z_loss_mlp": 0.59423828, + "step": 682, + "time_per_iteration": 2.607445240020752 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068204, + "balance_loss_mlp": 1.00883758, + "epoch": 0.13139669103501347, + "flos": 615423616512.0, + "grad_norm": 0.040497909024236244, + "language_loss": 0.85748106, + "learning_rate": 0.000973285348682934, + "loss": 0.86816311, + "num_input_tokens_seen": 57012736, + "router_z_loss_mlp": 0.59228516, + "step": 683, + "time_per_iteration": 2.749258518218994 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064545, + "balance_loss_mlp": 1.00846863, + "epoch": 0.13158907272027703, + "flos": 1488218420736.0, + "grad_norm": 0.017735586482065788, + "language_loss": 0.77898371, + "learning_rate": 0.0009731847855495323, + "loss": 0.78962922, + "num_input_tokens_seen": 57243136, + "router_z_loss_mlp": 0.5625, + "step": 684, + "time_per_iteration": 4.792337894439697 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069614, + "balance_loss_mlp": 1.01053405, + "epoch": 0.1317814544055406, + "flos": 987119046912.0, + "grad_norm": 0.04121230716493085, + "language_loss": 0.86815995, + "learning_rate": 0.0009730840387095046, + "loss": 0.87885606, + "num_input_tokens_seen": 57336160, + "router_z_loss_mlp": 0.58935547, + "step": 685, + "time_per_iteration": 3.324737071990967 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068902, + "balance_loss_mlp": 1.00972676, + "epoch": 0.13197383609080415, + "flos": 612629885184.0, + "grad_norm": 0.03769323902360627, + "language_loss": 0.91733027, + "learning_rate": 0.0009729831082019642, + "loss": 0.92801929, + "num_input_tokens_seen": 57418976, + "router_z_loss_mlp": 0.59033203, + "step": 686, + "time_per_iteration": 2.883368968963623 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069054, + "balance_loss_mlp": 1.0096879, + "epoch": 0.1321662177760677, + "flos": 495555664128.0, + "grad_norm": 0.03344682577786829, + "language_loss": 0.90060174, + "learning_rate": 0.0009728819940660958, + "loss": 0.91129231, + "num_input_tokens_seen": 57490288, + "router_z_loss_mlp": 0.59228516, + "step": 687, + "time_per_iteration": 2.7771294116973877 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069131, + "balance_loss_mlp": 1.00971675, + "epoch": 0.13235859946133127, + "flos": 496844632320.0, + "grad_norm": 0.041743180753116546, + "language_loss": 0.8673048, + "learning_rate": 0.0009727806963411557, + "loss": 0.87799615, + "num_input_tokens_seen": 57556064, + "router_z_loss_mlp": 0.59277344, + "step": 688, + "time_per_iteration": 2.5879924297332764 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069414, + "balance_loss_mlp": 1.00971425, + "epoch": 0.13255098114659483, + "flos": 512768116992.0, + "grad_norm": 0.035278095584539565, + "language_loss": 0.88457793, + "learning_rate": 0.000972679215066471, + "loss": 0.89527214, + "num_input_tokens_seen": 57627248, + "router_z_loss_mlp": 0.59570312, + "step": 689, + "time_per_iteration": 2.6660075187683105 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067632, + "balance_loss_mlp": 1.00826621, + "epoch": 0.13274336283185842, + "flos": 548400332544.0, + "grad_norm": 0.043703661342582356, + "language_loss": 1.0036962, + "learning_rate": 0.0009725775502814401, + "loss": 1.01437247, + "num_input_tokens_seen": 57694832, + "router_z_loss_mlp": 0.59228516, + "step": 690, + "time_per_iteration": 2.580975294113159 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072547, + "balance_loss_mlp": 1.01313293, + "epoch": 0.13293574451712198, + "flos": 642003909120.0, + "grad_norm": 0.041755939912029, + "language_loss": 0.86554468, + "learning_rate": 0.0009724757020255327, + "loss": 0.87627012, + "num_input_tokens_seen": 57771776, + "router_z_loss_mlp": 0.59277344, + "step": 691, + "time_per_iteration": 2.895805835723877 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074323, + "balance_loss_mlp": 1.01533794, + "epoch": 0.13312812620238554, + "flos": 492470315520.0, + "grad_norm": 0.04584738151589033, + "language_loss": 0.8907311, + "learning_rate": 0.0009723736703382902, + "loss": 0.90147436, + "num_input_tokens_seen": 57836272, + "router_z_loss_mlp": 0.58837891, + "step": 692, + "time_per_iteration": 2.593621253967285 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073259, + "balance_loss_mlp": 1.01427472, + "epoch": 0.1333205078876491, + "flos": 509950086144.0, + "grad_norm": 0.042207641511909956, + "language_loss": 0.84734881, + "learning_rate": 0.0009722714552593244, + "loss": 0.85808134, + "num_input_tokens_seen": 57907232, + "router_z_loss_mlp": 0.58837891, + "step": 693, + "time_per_iteration": 2.6628286838531494 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069094, + "balance_loss_mlp": 1.01010931, + "epoch": 0.13351288957291266, + "flos": 419592251136.0, + "grad_norm": 0.04342856140262568, + "language_loss": 0.95545483, + "learning_rate": 0.000972169056828319, + "loss": 0.96614575, + "num_input_tokens_seen": 57969808, + "router_z_loss_mlp": 0.58837891, + "step": 694, + "time_per_iteration": 2.491511821746826 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068144, + "balance_loss_mlp": 1.00896847, + "epoch": 0.13370527125817622, + "flos": 617051834112.0, + "grad_norm": 0.03328111889388194, + "language_loss": 0.87929142, + "learning_rate": 0.0009720664750850283, + "loss": 0.88997287, + "num_input_tokens_seen": 58042944, + "router_z_loss_mlp": 0.59033203, + "step": 695, + "time_per_iteration": 2.802238941192627 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066112, + "balance_loss_mlp": 1.00693631, + "epoch": 0.13389765294343978, + "flos": 627170115840.0, + "grad_norm": 0.04111883948503256, + "language_loss": 0.94899035, + "learning_rate": 0.0009719637100692784, + "loss": 0.95965147, + "num_input_tokens_seen": 58116080, + "router_z_loss_mlp": 0.59033203, + "step": 696, + "time_per_iteration": 2.752716541290283 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066724, + "balance_loss_mlp": 1.00764382, + "epoch": 0.13409003462870334, + "flos": 610897655040.0, + "grad_norm": 0.03903466400724949, + "language_loss": 0.84625083, + "learning_rate": 0.0009718607618209661, + "loss": 0.85691804, + "num_input_tokens_seen": 58197616, + "router_z_loss_mlp": 0.58935547, + "step": 697, + "time_per_iteration": 2.8612687587738037 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067901, + "balance_loss_mlp": 1.00915492, + "epoch": 0.13428241631396692, + "flos": 685088987136.0, + "grad_norm": 0.03548160791415639, + "language_loss": 0.8885181, + "learning_rate": 0.0009717576303800595, + "loss": 0.89919716, + "num_input_tokens_seen": 58280480, + "router_z_loss_mlp": 0.5859375, + "step": 698, + "time_per_iteration": 3.046081304550171 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067451, + "balance_loss_mlp": 1.00870502, + "epoch": 0.13447479799923048, + "flos": 509819828736.0, + "grad_norm": 0.04099621387271608, + "language_loss": 0.8689754, + "learning_rate": 0.0009716543157865975, + "loss": 0.87964994, + "num_input_tokens_seen": 58352464, + "router_z_loss_mlp": 0.5859375, + "step": 699, + "time_per_iteration": 2.7116739749908447 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067445, + "balance_loss_mlp": 1.00893724, + "epoch": 0.13466717968449404, + "flos": 899060158464.0, + "grad_norm": 0.03800712734159662, + "language_loss": 0.8517018, + "learning_rate": 0.0009715508180806907, + "loss": 0.86237621, + "num_input_tokens_seen": 58437216, + "router_z_loss_mlp": 0.58349609, + "step": 700, + "time_per_iteration": 3.184324026107788 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066328, + "balance_loss_mlp": 1.00777256, + "epoch": 0.1348595613697576, + "flos": 991695552768.0, + "grad_norm": 0.036541360765650906, + "language_loss": 0.91219282, + "learning_rate": 0.0009714471373025202, + "loss": 0.92285609, + "num_input_tokens_seen": 58533152, + "router_z_loss_mlp": 0.58398438, + "step": 701, + "time_per_iteration": 3.4654104709625244 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064657, + "balance_loss_mlp": 1.0059582, + "epoch": 0.13505194305502116, + "flos": 488812414464.0, + "grad_norm": 0.038284394577449095, + "language_loss": 0.90020943, + "learning_rate": 0.0009713432734923386, + "loss": 0.91085601, + "num_input_tokens_seen": 58601376, + "router_z_loss_mlp": 0.58544922, + "step": 702, + "time_per_iteration": 2.6416144371032715 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067313, + "balance_loss_mlp": 1.00842357, + "epoch": 0.13524432474028472, + "flos": 614520562944.0, + "grad_norm": 0.03635122731697363, + "language_loss": 0.87970936, + "learning_rate": 0.0009712392266904696, + "loss": 0.89038247, + "num_input_tokens_seen": 58676608, + "router_z_loss_mlp": 0.58740234, + "step": 703, + "time_per_iteration": 2.73490309715271 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066615, + "balance_loss_mlp": 1.00782144, + "epoch": 0.13543670642554828, + "flos": 906275838720.0, + "grad_norm": 0.040994558071305906, + "language_loss": 0.86788869, + "learning_rate": 0.0009711349969373076, + "loss": 0.87855482, + "num_input_tokens_seen": 58759264, + "router_z_loss_mlp": 0.58642578, + "step": 704, + "time_per_iteration": 3.1667368412017822 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066356, + "balance_loss_mlp": 1.00765777, + "epoch": 0.13562908811081184, + "flos": 551748141312.0, + "grad_norm": 0.040707128775991024, + "language_loss": 0.81448901, + "learning_rate": 0.0009710305842733178, + "loss": 0.82515258, + "num_input_tokens_seen": 58834800, + "router_z_loss_mlp": 0.58544922, + "step": 705, + "time_per_iteration": 2.7456798553466797 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064755, + "balance_loss_mlp": 1.00648558, + "epoch": 0.1358214697960754, + "flos": 509038284288.0, + "grad_norm": 0.04235852839756889, + "language_loss": 0.91048527, + "learning_rate": 0.0009709259887390373, + "loss": 0.9211328, + "num_input_tokens_seen": 58901712, + "router_z_loss_mlp": 0.58105469, + "step": 706, + "time_per_iteration": 2.614645481109619 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067613, + "balance_loss_mlp": 1.0098201, + "epoch": 0.136013851481339, + "flos": 529924189440.0, + "grad_norm": 0.045207837368539144, + "language_loss": 0.92539275, + "learning_rate": 0.0009708212103750737, + "loss": 0.93606889, + "num_input_tokens_seen": 58967824, + "router_z_loss_mlp": 0.57617188, + "step": 707, + "time_per_iteration": 2.5839250087738037 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073243, + "balance_loss_mlp": 1.01525927, + "epoch": 0.13620623316660255, + "flos": 660321604608.0, + "grad_norm": 0.04139663244511697, + "language_loss": 0.88690269, + "learning_rate": 0.0009707162492221051, + "loss": 0.8976351, + "num_input_tokens_seen": 59045040, + "router_z_loss_mlp": 0.578125, + "step": 708, + "time_per_iteration": 2.8753738403320312 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106855, + "balance_loss_mlp": 1.01051939, + "epoch": 0.1363986148518661, + "flos": 673083918336.0, + "grad_norm": 0.04870142688483653, + "language_loss": 0.89226341, + "learning_rate": 0.0009706111053208815, + "loss": 0.90294898, + "num_input_tokens_seen": 59117216, + "router_z_loss_mlp": 0.57861328, + "step": 709, + "time_per_iteration": 2.792555570602417 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065256, + "balance_loss_mlp": 1.0069865, + "epoch": 0.13659099653712967, + "flos": 474004866048.0, + "grad_norm": 0.041589756065930725, + "language_loss": 0.87875092, + "learning_rate": 0.0009705057787122232, + "loss": 0.88940346, + "num_input_tokens_seen": 59183056, + "router_z_loss_mlp": 0.58105469, + "step": 710, + "time_per_iteration": 2.5474488735198975 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106841, + "balance_loss_mlp": 1.00980711, + "epoch": 0.13678337822239323, + "flos": 453648738816.0, + "grad_norm": 0.03947638411835938, + "language_loss": 0.92397159, + "learning_rate": 0.0009704002694370216, + "loss": 0.93465567, + "num_input_tokens_seen": 59247312, + "router_z_loss_mlp": 0.58447266, + "step": 711, + "time_per_iteration": 2.5812153816223145 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107162, + "balance_loss_mlp": 1.01306474, + "epoch": 0.13697575990765679, + "flos": 520626336000.0, + "grad_norm": 0.04103000756090051, + "language_loss": 0.88202429, + "learning_rate": 0.0009702945775362388, + "loss": 0.89274049, + "num_input_tokens_seen": 59317968, + "router_z_loss_mlp": 0.58398438, + "step": 712, + "time_per_iteration": 2.6084940433502197 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067699, + "balance_loss_mlp": 1.00914371, + "epoch": 0.13716814159292035, + "flos": 481366354944.0, + "grad_norm": 0.04017855754763819, + "language_loss": 0.88458985, + "learning_rate": 0.0009701887030509086, + "loss": 0.89526689, + "num_input_tokens_seen": 59387936, + "router_z_loss_mlp": 0.58398438, + "step": 713, + "time_per_iteration": 2.6361663341522217 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072477, + "balance_loss_mlp": 1.01425505, + "epoch": 0.1373605232781839, + "flos": 546750727680.0, + "grad_norm": 0.04169009137316196, + "language_loss": 0.92536753, + "learning_rate": 0.0009700826460221346, + "loss": 0.93609238, + "num_input_tokens_seen": 59460624, + "router_z_loss_mlp": 0.58056641, + "step": 714, + "time_per_iteration": 2.6997907161712646 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068882, + "balance_loss_mlp": 1.01080275, + "epoch": 0.1375529049634475, + "flos": 710071197696.0, + "grad_norm": 0.042053375460334, + "language_loss": 0.94210052, + "learning_rate": 0.0009699764064910921, + "loss": 0.95278937, + "num_input_tokens_seen": 59536752, + "router_z_loss_mlp": 0.57910156, + "step": 715, + "time_per_iteration": 2.870835542678833 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069166, + "balance_loss_mlp": 1.01099169, + "epoch": 0.13774528664871105, + "flos": 487677036288.0, + "grad_norm": 0.04018028408764831, + "language_loss": 0.88572168, + "learning_rate": 0.0009698699844990268, + "loss": 0.89641333, + "num_input_tokens_seen": 59608128, + "router_z_loss_mlp": 0.58007812, + "step": 716, + "time_per_iteration": 2.6557233333587646 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106644, + "balance_loss_mlp": 1.00817037, + "epoch": 0.1379376683339746, + "flos": 681459276288.0, + "grad_norm": 0.03631196674856893, + "language_loss": 0.89737439, + "learning_rate": 0.0009697633800872555, + "loss": 0.90803885, + "num_input_tokens_seen": 59685120, + "router_z_loss_mlp": 0.58105469, + "step": 717, + "time_per_iteration": 2.9236202239990234 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068394, + "balance_loss_mlp": 1.00998127, + "epoch": 0.13813005001923817, + "flos": 612226473984.0, + "grad_norm": 0.040527486313319094, + "language_loss": 0.9214747, + "learning_rate": 0.0009696565932971655, + "loss": 0.93215865, + "num_input_tokens_seen": 59763376, + "router_z_loss_mlp": 0.58251953, + "step": 718, + "time_per_iteration": 2.8931636810302734 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072264, + "balance_loss_mlp": 1.01394677, + "epoch": 0.13832243170450173, + "flos": 589927179264.0, + "grad_norm": 0.042228364331249636, + "language_loss": 0.91184157, + "learning_rate": 0.0009695496241702153, + "loss": 0.92256421, + "num_input_tokens_seen": 59836800, + "router_z_loss_mlp": 0.58154297, + "step": 719, + "time_per_iteration": 2.8006720542907715 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010673, + "balance_loss_mlp": 1.00917327, + "epoch": 0.1385148133897653, + "flos": 701320618752.0, + "grad_norm": 0.04012183054192491, + "language_loss": 0.87174737, + "learning_rate": 0.0009694424727479339, + "loss": 0.88242036, + "num_input_tokens_seen": 59914720, + "router_z_loss_mlp": 0.57958984, + "step": 720, + "time_per_iteration": 2.9363977909088135 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066459, + "balance_loss_mlp": 1.0081414, + "epoch": 0.13870719507502885, + "flos": 599367929088.0, + "grad_norm": 0.04032336097495746, + "language_loss": 0.90803999, + "learning_rate": 0.0009693351390719213, + "loss": 0.91870457, + "num_input_tokens_seen": 59984544, + "router_z_loss_mlp": 0.58154297, + "step": 721, + "time_per_iteration": 2.7786271572113037 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070816, + "balance_loss_mlp": 1.01273775, + "epoch": 0.1388995767602924, + "flos": 587749741824.0, + "grad_norm": 0.04179929290372652, + "language_loss": 0.92465305, + "learning_rate": 0.000969227623183848, + "loss": 0.93536115, + "num_input_tokens_seen": 60057056, + "router_z_loss_mlp": 0.57910156, + "step": 722, + "time_per_iteration": 2.777453660964966 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066594, + "balance_loss_mlp": 1.00870621, + "epoch": 0.139091958445556, + "flos": 652363263744.0, + "grad_norm": 0.041578114374578125, + "language_loss": 0.92603219, + "learning_rate": 0.0009691199251254554, + "loss": 0.9366982, + "num_input_tokens_seen": 60133232, + "router_z_loss_mlp": 0.57714844, + "step": 723, + "time_per_iteration": 2.813610553741455 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063653, + "balance_loss_mlp": 1.00586045, + "epoch": 0.13928434013081956, + "flos": 576906296064.0, + "grad_norm": 0.03663552971403626, + "language_loss": 0.88541949, + "learning_rate": 0.0009690120449385555, + "loss": 0.89605606, + "num_input_tokens_seen": 60207104, + "router_z_loss_mlp": 0.57617188, + "step": 724, + "time_per_iteration": 2.7604424953460693 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063903, + "balance_loss_mlp": 1.00582433, + "epoch": 0.13947672181608312, + "flos": 564315068928.0, + "grad_norm": 0.034271197388489986, + "language_loss": 0.93926299, + "learning_rate": 0.0009689039826650312, + "loss": 0.94990206, + "num_input_tokens_seen": 60277920, + "router_z_loss_mlp": 0.57910156, + "step": 725, + "time_per_iteration": 2.7856695652008057 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095871, + "balance_loss_mlp": 1.03941345, + "epoch": 0.13966910350134668, + "flos": 1524951988224.0, + "grad_norm": 0.03128450212810151, + "language_loss": 0.76523066, + "learning_rate": 0.000968795738346836, + "loss": 0.77618933, + "num_input_tokens_seen": 60494224, + "router_z_loss_mlp": 0.56640625, + "step": 726, + "time_per_iteration": 4.903306245803833 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067061, + "balance_loss_mlp": 1.00902975, + "epoch": 0.13986148518661023, + "flos": 500856367104.0, + "grad_norm": 0.052764167671210026, + "language_loss": 0.89172196, + "learning_rate": 0.0009686873120259941, + "loss": 0.90239263, + "num_input_tokens_seen": 60562176, + "router_z_loss_mlp": 0.57861328, + "step": 727, + "time_per_iteration": 2.6450552940368652 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072692, + "balance_loss_mlp": 1.01518559, + "epoch": 0.1400538668718738, + "flos": 599850074880.0, + "grad_norm": 0.036488800736072635, + "language_loss": 0.88047451, + "learning_rate": 0.0009685787037446004, + "loss": 0.89120144, + "num_input_tokens_seen": 60631472, + "router_z_loss_mlp": 0.57324219, + "step": 728, + "time_per_iteration": 2.763434648513794 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072512, + "balance_loss_mlp": 1.01481462, + "epoch": 0.14024624855713735, + "flos": 595169556480.0, + "grad_norm": 0.047561697925478, + "language_loss": 0.88858587, + "learning_rate": 0.0009684699135448201, + "loss": 0.89931101, + "num_input_tokens_seen": 60703488, + "router_z_loss_mlp": 0.57519531, + "step": 729, + "time_per_iteration": 2.745037078857422 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067277, + "balance_loss_mlp": 1.00962722, + "epoch": 0.1404386302424009, + "flos": 507586010880.0, + "grad_norm": 0.03094406590189725, + "language_loss": 0.9291476, + "learning_rate": 0.0009683609414688895, + "loss": 0.93982029, + "num_input_tokens_seen": 60773936, + "router_z_loss_mlp": 0.57470703, + "step": 730, + "time_per_iteration": 2.7384650707244873 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068471, + "balance_loss_mlp": 1.01039195, + "epoch": 0.14063101192766447, + "flos": 574515975936.0, + "grad_norm": 0.037780385553924656, + "language_loss": 0.87345785, + "learning_rate": 0.0009682517875591154, + "loss": 0.88414258, + "num_input_tokens_seen": 60851120, + "router_z_loss_mlp": 0.57910156, + "step": 731, + "time_per_iteration": 2.752572536468506 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071345, + "balance_loss_mlp": 1.0129801, + "epoch": 0.14082339361292806, + "flos": 565765396992.0, + "grad_norm": 0.03832964150159033, + "language_loss": 0.87666118, + "learning_rate": 0.0009681424518578749, + "loss": 0.88737464, + "num_input_tokens_seen": 60924896, + "router_z_loss_mlp": 0.58203125, + "step": 732, + "time_per_iteration": 2.7323830127716064 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068462, + "balance_loss_mlp": 1.01028764, + "epoch": 0.14101577529819162, + "flos": 464583558144.0, + "grad_norm": 0.035957988569031644, + "language_loss": 0.88670099, + "learning_rate": 0.000968032934407616, + "loss": 0.8973856, + "num_input_tokens_seen": 60996016, + "router_z_loss_mlp": 0.58007812, + "step": 733, + "time_per_iteration": 2.6479005813598633 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064525, + "balance_loss_mlp": 1.00644577, + "epoch": 0.14120815698345518, + "flos": 597262423296.0, + "grad_norm": 0.039547782577588224, + "language_loss": 0.82413781, + "learning_rate": 0.0009679232352508571, + "loss": 0.83478296, + "num_input_tokens_seen": 61072016, + "router_z_loss_mlp": 0.57910156, + "step": 734, + "time_per_iteration": 2.7924795150756836 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063696, + "balance_loss_mlp": 1.00599897, + "epoch": 0.14140053866871874, + "flos": 536232925440.0, + "grad_norm": 0.03854566850595878, + "language_loss": 0.82520735, + "learning_rate": 0.0009678133544301871, + "loss": 0.83584428, + "num_input_tokens_seen": 61144528, + "router_z_loss_mlp": 0.57519531, + "step": 735, + "time_per_iteration": 2.658731698989868 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062396, + "balance_loss_mlp": 1.00498438, + "epoch": 0.1415929203539823, + "flos": 521277623040.0, + "grad_norm": 0.0297517777524564, + "language_loss": 0.92917788, + "learning_rate": 0.0009677032919882658, + "loss": 0.93980187, + "num_input_tokens_seen": 61216960, + "router_z_loss_mlp": 0.57226562, + "step": 736, + "time_per_iteration": 2.661276340484619 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068415, + "balance_loss_mlp": 1.0113374, + "epoch": 0.14178530203924586, + "flos": 483302719488.0, + "grad_norm": 0.041037110936195734, + "language_loss": 0.92867804, + "learning_rate": 0.000967593047967823, + "loss": 0.93936217, + "num_input_tokens_seen": 61281312, + "router_z_loss_mlp": 0.56982422, + "step": 737, + "time_per_iteration": 2.52840256690979 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068115, + "balance_loss_mlp": 1.01056099, + "epoch": 0.14197768372450942, + "flos": 677840259072.0, + "grad_norm": 0.04254557939420697, + "language_loss": 0.88126308, + "learning_rate": 0.0009674826224116593, + "loss": 0.89194429, + "num_input_tokens_seen": 61355888, + "router_z_loss_mlp": 0.57373047, + "step": 738, + "time_per_iteration": 2.858147144317627 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074304, + "balance_loss_mlp": 1.0167979, + "epoch": 0.14217006540977298, + "flos": 446992972032.0, + "grad_norm": 0.045930563119643074, + "language_loss": 0.87994051, + "learning_rate": 0.0009673720153626455, + "loss": 0.89068353, + "num_input_tokens_seen": 61424288, + "router_z_loss_mlp": 0.57324219, + "step": 739, + "time_per_iteration": 2.664236545562744 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069407, + "balance_loss_mlp": 1.01199603, + "epoch": 0.14236244709503657, + "flos": 497478422784.0, + "grad_norm": 0.040566684483093814, + "language_loss": 0.88105047, + "learning_rate": 0.0009672612268637235, + "loss": 0.89174449, + "num_input_tokens_seen": 61493344, + "router_z_loss_mlp": 0.57226562, + "step": 740, + "time_per_iteration": 2.634126901626587 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069933, + "balance_loss_mlp": 1.01304626, + "epoch": 0.14255482878030012, + "flos": 649480104192.0, + "grad_norm": 0.05086050125917657, + "language_loss": 0.85906518, + "learning_rate": 0.0009671502569579048, + "loss": 0.86976457, + "num_input_tokens_seen": 61565216, + "router_z_loss_mlp": 0.56884766, + "step": 741, + "time_per_iteration": 2.7642107009887695 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071964, + "balance_loss_mlp": 1.01564944, + "epoch": 0.14274721046556368, + "flos": 537274984704.0, + "grad_norm": 0.037356444744632025, + "language_loss": 0.90824854, + "learning_rate": 0.0009670391056882719, + "loss": 0.91896814, + "num_input_tokens_seen": 61640928, + "router_z_loss_mlp": 0.56445312, + "step": 742, + "time_per_iteration": 2.7307372093200684 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069292, + "balance_loss_mlp": 1.01288199, + "epoch": 0.14293959215082724, + "flos": 958584893184.0, + "grad_norm": 0.03744948002603285, + "language_loss": 0.89976203, + "learning_rate": 0.0009669277730979776, + "loss": 0.91045499, + "num_input_tokens_seen": 61717552, + "router_z_loss_mlp": 0.56494141, + "step": 743, + "time_per_iteration": 3.2251601219177246 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068321, + "balance_loss_mlp": 1.01162553, + "epoch": 0.1431319738360908, + "flos": 694386840576.0, + "grad_norm": 0.037398516399228816, + "language_loss": 0.86562485, + "learning_rate": 0.0009668162592302449, + "loss": 0.87630802, + "num_input_tokens_seen": 61800016, + "router_z_loss_mlp": 0.56738281, + "step": 744, + "time_per_iteration": 2.924435615539551 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067141, + "balance_loss_mlp": 1.01015854, + "epoch": 0.14332435552135436, + "flos": 566503200000.0, + "grad_norm": 0.037819132294000864, + "language_loss": 0.86981773, + "learning_rate": 0.0009667045641283676, + "loss": 0.88048917, + "num_input_tokens_seen": 61865904, + "router_z_loss_mlp": 0.56933594, + "step": 745, + "time_per_iteration": 2.6744887828826904 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071811, + "balance_loss_mlp": 1.01540148, + "epoch": 0.14351673720661792, + "flos": 739696988160.0, + "grad_norm": 0.042480690817339954, + "language_loss": 0.96115947, + "learning_rate": 0.0009665926878357092, + "loss": 0.97187757, + "num_input_tokens_seen": 61945728, + "router_z_loss_mlp": 0.56591797, + "step": 746, + "time_per_iteration": 2.9137520790100098 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069908, + "balance_loss_mlp": 1.0134027, + "epoch": 0.14370911889188148, + "flos": 550352248320.0, + "grad_norm": 0.037361960218361134, + "language_loss": 0.92219329, + "learning_rate": 0.0009664806303957043, + "loss": 0.93289238, + "num_input_tokens_seen": 62016288, + "router_z_loss_mlp": 0.56542969, + "step": 747, + "time_per_iteration": 2.7734382152557373 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010709, + "balance_loss_mlp": 1.01453757, + "epoch": 0.14390150057714507, + "flos": 591590390016.0, + "grad_norm": 0.040803275102161134, + "language_loss": 0.88578373, + "learning_rate": 0.0009663683918518571, + "loss": 0.89649272, + "num_input_tokens_seen": 62097904, + "router_z_loss_mlp": 0.56542969, + "step": 748, + "time_per_iteration": 2.93782114982605 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106688, + "balance_loss_mlp": 1.0103749, + "epoch": 0.14409388226240863, + "flos": 592145445888.0, + "grad_norm": 0.040391516566669984, + "language_loss": 0.87085271, + "learning_rate": 0.0009662559722477428, + "loss": 0.88152146, + "num_input_tokens_seen": 62166736, + "router_z_loss_mlp": 0.56640625, + "step": 749, + "time_per_iteration": 2.696570873260498 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140549, + "balance_loss_mlp": 1.08542633, + "epoch": 0.1442862639476722, + "flos": 1514657762304.0, + "grad_norm": 0.043557664449290004, + "language_loss": 0.7616297, + "learning_rate": 0.0009661433716270062, + "loss": 0.77303517, + "num_input_tokens_seen": 62402512, + "router_z_loss_mlp": 0.55273438, + "step": 750, + "time_per_iteration": 5.024984836578369 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106928, + "balance_loss_mlp": 1.01263177, + "epoch": 0.14447864563293575, + "flos": 497856556032.0, + "grad_norm": 0.03544029116038115, + "language_loss": 0.90697813, + "learning_rate": 0.0009660305900333632, + "loss": 0.91767091, + "num_input_tokens_seen": 62473408, + "router_z_loss_mlp": 0.56738281, + "step": 751, + "time_per_iteration": 2.678037166595459 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078207, + "balance_loss_mlp": 1.02165437, + "epoch": 0.1446710273181993, + "flos": 590795239680.0, + "grad_norm": 0.04141635113788076, + "language_loss": 0.83649188, + "learning_rate": 0.0009659176275105992, + "loss": 0.84727395, + "num_input_tokens_seen": 62547440, + "router_z_loss_mlp": 0.56640625, + "step": 752, + "time_per_iteration": 2.714871883392334 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076963, + "balance_loss_mlp": 1.02074409, + "epoch": 0.14486340900346287, + "flos": 587013884160.0, + "grad_norm": 0.03637909883196532, + "language_loss": 0.87195009, + "learning_rate": 0.0009658044841025701, + "loss": 0.88271976, + "num_input_tokens_seen": 62620224, + "router_z_loss_mlp": 0.56396484, + "step": 753, + "time_per_iteration": 2.7753467559814453 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075522, + "balance_loss_mlp": 1.01901722, + "epoch": 0.14505579068872643, + "flos": 505741019904.0, + "grad_norm": 0.041255413340114844, + "language_loss": 0.82866222, + "learning_rate": 0.0009656911598532021, + "loss": 0.83941746, + "num_input_tokens_seen": 62690464, + "router_z_loss_mlp": 0.56591797, + "step": 754, + "time_per_iteration": 2.657831907272339 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077419, + "balance_loss_mlp": 1.02119958, + "epoch": 0.14524817237399, + "flos": 487816041984.0, + "grad_norm": 0.03637506550278126, + "language_loss": 0.9138847, + "learning_rate": 0.0009655776548064917, + "loss": 0.92465889, + "num_input_tokens_seen": 62762240, + "router_z_loss_mlp": 0.56347656, + "step": 755, + "time_per_iteration": 2.6499805450439453 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070806, + "balance_loss_mlp": 1.01477778, + "epoch": 0.14544055405925355, + "flos": 729450394368.0, + "grad_norm": 0.037726189244012505, + "language_loss": 0.89799821, + "learning_rate": 0.0009654639690065054, + "loss": 0.90870631, + "num_input_tokens_seen": 62839760, + "router_z_loss_mlp": 0.56201172, + "step": 756, + "time_per_iteration": 2.913638114929199 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070686, + "balance_loss_mlp": 1.01461017, + "epoch": 0.14563293574451713, + "flos": 594787532544.0, + "grad_norm": 0.03772784195488967, + "language_loss": 0.8914414, + "learning_rate": 0.00096535010249738, + "loss": 0.90214825, + "num_input_tokens_seen": 62910336, + "router_z_loss_mlp": 0.5625, + "step": 757, + "time_per_iteration": 2.721640110015869 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067675, + "balance_loss_mlp": 1.01121712, + "epoch": 0.1458253174297807, + "flos": 561623404800.0, + "grad_norm": 0.04410713855467511, + "language_loss": 0.84106696, + "learning_rate": 0.0009652360553233224, + "loss": 0.8517437, + "num_input_tokens_seen": 62988160, + "router_z_loss_mlp": 0.56591797, + "step": 758, + "time_per_iteration": 2.771986484527588 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080421, + "balance_loss_mlp": 1.02625275, + "epoch": 0.14601769911504425, + "flos": 1561189804032.0, + "grad_norm": 0.021986445825835567, + "language_loss": 0.73773748, + "learning_rate": 0.0009651218275286093, + "loss": 0.74854165, + "num_input_tokens_seen": 63224704, + "router_z_loss_mlp": 0.54296875, + "step": 759, + "time_per_iteration": 4.951657056808472 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064246, + "balance_loss_mlp": 1.00712132, + "epoch": 0.1462100808003078, + "flos": 867823646976.0, + "grad_norm": 0.03532102179266325, + "language_loss": 0.82350075, + "learning_rate": 0.0009650074191575883, + "loss": 0.83414322, + "num_input_tokens_seen": 63312400, + "router_z_loss_mlp": 0.56982422, + "step": 760, + "time_per_iteration": 3.2275402545928955 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078155, + "balance_loss_mlp": 1.02083874, + "epoch": 0.14640246248557137, + "flos": 524030525184.0, + "grad_norm": 0.0394901057776484, + "language_loss": 0.87295806, + "learning_rate": 0.0009648928302546766, + "loss": 0.88373965, + "num_input_tokens_seen": 63387792, + "router_z_loss_mlp": 0.57177734, + "step": 761, + "time_per_iteration": 2.6739044189453125 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108061, + "balance_loss_mlp": 1.02319896, + "epoch": 0.14659484417083493, + "flos": 1032242556672.0, + "grad_norm": 0.0381114836464334, + "language_loss": 0.86423808, + "learning_rate": 0.0009647780608643613, + "loss": 0.87504417, + "num_input_tokens_seen": 63475632, + "router_z_loss_mlp": 0.57226562, + "step": 762, + "time_per_iteration": 3.355055332183838 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084881, + "balance_loss_mlp": 1.02742219, + "epoch": 0.1467872258560985, + "flos": 501657353472.0, + "grad_norm": 0.04884269069306727, + "language_loss": 0.89483184, + "learning_rate": 0.0009646631110312001, + "loss": 0.90568066, + "num_input_tokens_seen": 63546080, + "router_z_loss_mlp": 0.57275391, + "step": 763, + "time_per_iteration": 2.638404607772827 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074108, + "balance_loss_mlp": 1.01683939, + "epoch": 0.14697960754136205, + "flos": 548936913408.0, + "grad_norm": 0.030517371118051684, + "language_loss": 0.89587164, + "learning_rate": 0.0009645479807998203, + "loss": 0.90661263, + "num_input_tokens_seen": 63622464, + "router_z_loss_mlp": 0.57128906, + "step": 764, + "time_per_iteration": 2.7784340381622314 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066881, + "balance_loss_mlp": 1.0099467, + "epoch": 0.14717198922662564, + "flos": 518902854144.0, + "grad_norm": 0.03321738346858149, + "language_loss": 0.93693149, + "learning_rate": 0.0009644326702149196, + "loss": 0.94760031, + "num_input_tokens_seen": 63694736, + "router_z_loss_mlp": 0.56884766, + "step": 765, + "time_per_iteration": 2.712148904800415 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066407, + "balance_loss_mlp": 1.009902, + "epoch": 0.1473643709118892, + "flos": 733484483328.0, + "grad_norm": 0.042813367444357694, + "language_loss": 0.86227441, + "learning_rate": 0.0009643171793212653, + "loss": 0.87293845, + "num_input_tokens_seen": 63779072, + "router_z_loss_mlp": 0.56591797, + "step": 766, + "time_per_iteration": 3.0350003242492676 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069694, + "balance_loss_mlp": 1.01357007, + "epoch": 0.14755675259715276, + "flos": 621669169152.0, + "grad_norm": 0.04397904632105779, + "language_loss": 0.90884185, + "learning_rate": 0.0009642015081636952, + "loss": 0.91953874, + "num_input_tokens_seen": 63847472, + "router_z_loss_mlp": 0.56298828, + "step": 767, + "time_per_iteration": 2.6967811584472656 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067551, + "balance_loss_mlp": 1.01185656, + "epoch": 0.14774913428241632, + "flos": 453173395968.0, + "grad_norm": 0.040409537343205924, + "language_loss": 0.89756525, + "learning_rate": 0.0009640856567871166, + "loss": 0.90824074, + "num_input_tokens_seen": 63912496, + "router_z_loss_mlp": 0.55859375, + "step": 768, + "time_per_iteration": 2.5016207695007324 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063728, + "balance_loss_mlp": 1.00803363, + "epoch": 0.14794151596767988, + "flos": 838655702784.0, + "grad_norm": 0.03518214363191685, + "language_loss": 0.90024096, + "learning_rate": 0.0009639696252365072, + "loss": 0.91087824, + "num_input_tokens_seen": 63990832, + "router_z_loss_mlp": 0.55859375, + "step": 769, + "time_per_iteration": 3.0535316467285156 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064247, + "balance_loss_mlp": 1.00874364, + "epoch": 0.14813389765294344, + "flos": 687405430272.0, + "grad_norm": 0.03578436651039587, + "language_loss": 0.83073497, + "learning_rate": 0.0009638534135569144, + "loss": 0.8413775, + "num_input_tokens_seen": 64067552, + "router_z_loss_mlp": 0.55664062, + "step": 770, + "time_per_iteration": 2.8983683586120605 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065854, + "balance_loss_mlp": 1.01039767, + "epoch": 0.148326279338207, + "flos": 510944513280.0, + "grad_norm": 0.03931230706380594, + "language_loss": 0.91550887, + "learning_rate": 0.0009637370217934554, + "loss": 0.92616743, + "num_input_tokens_seen": 64140336, + "router_z_loss_mlp": 0.55615234, + "step": 771, + "time_per_iteration": 2.6311967372894287 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061556, + "balance_loss_mlp": 1.00590932, + "epoch": 0.14851866102347056, + "flos": 589332272640.0, + "grad_norm": 0.03214719611667013, + "language_loss": 0.8436957, + "learning_rate": 0.0009636204499913175, + "loss": 0.85431123, + "num_input_tokens_seen": 64223472, + "router_z_loss_mlp": 0.55810547, + "step": 772, + "time_per_iteration": 2.8748695850372314 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066056, + "balance_loss_mlp": 1.01069546, + "epoch": 0.14871104270873411, + "flos": 692248286976.0, + "grad_norm": 0.034034874980260935, + "language_loss": 0.89455193, + "learning_rate": 0.0009635036981957581, + "loss": 0.9052124, + "num_input_tokens_seen": 64299872, + "router_z_loss_mlp": 0.55517578, + "step": 773, + "time_per_iteration": 2.8526012897491455 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063541, + "balance_loss_mlp": 1.00789392, + "epoch": 0.1489034243939977, + "flos": 656283624960.0, + "grad_norm": 0.03841304714783139, + "language_loss": 0.91971016, + "learning_rate": 0.0009633867664521043, + "loss": 0.93034559, + "num_input_tokens_seen": 64377152, + "router_z_loss_mlp": 0.55810547, + "step": 774, + "time_per_iteration": 2.823320150375366 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063492, + "balance_loss_mlp": 1.00736797, + "epoch": 0.14909580607926126, + "flos": 476796652032.0, + "grad_norm": 0.0404919947218097, + "language_loss": 0.88328946, + "learning_rate": 0.0009632696548057527, + "loss": 0.89392436, + "num_input_tokens_seen": 64443008, + "router_z_loss_mlp": 0.56298828, + "step": 775, + "time_per_iteration": 2.5567190647125244 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072265, + "balance_loss_mlp": 1.01609385, + "epoch": 0.14928818776452482, + "flos": 612284799744.0, + "grad_norm": 0.03821441574416946, + "language_loss": 0.86270714, + "learning_rate": 0.0009631523633021704, + "loss": 0.87342978, + "num_input_tokens_seen": 64519776, + "router_z_loss_mlp": 0.56347656, + "step": 776, + "time_per_iteration": 2.783348321914673 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068042, + "balance_loss_mlp": 1.01187015, + "epoch": 0.14948056944978838, + "flos": 562917230592.0, + "grad_norm": 0.039790220133906304, + "language_loss": 0.90072912, + "learning_rate": 0.0009630348919868936, + "loss": 0.9114095, + "num_input_tokens_seen": 64593712, + "router_z_loss_mlp": 0.56347656, + "step": 777, + "time_per_iteration": 2.7115018367767334 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073438, + "balance_loss_mlp": 1.01736236, + "epoch": 0.14967295113505194, + "flos": 450112346880.0, + "grad_norm": 0.044777999480791836, + "language_loss": 0.82363755, + "learning_rate": 0.0009629172409055293, + "loss": 0.83437192, + "num_input_tokens_seen": 64658448, + "router_z_loss_mlp": 0.5625, + "step": 778, + "time_per_iteration": 2.578178882598877 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079079, + "balance_loss_mlp": 1.02319324, + "epoch": 0.1498653328203155, + "flos": 572429912064.0, + "grad_norm": 0.03699200582710457, + "language_loss": 0.8876617, + "learning_rate": 0.0009627994101037531, + "loss": 0.89845246, + "num_input_tokens_seen": 64734144, + "router_z_loss_mlp": 0.56054688, + "step": 779, + "time_per_iteration": 2.7733986377716064 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107155, + "balance_loss_mlp": 1.01556909, + "epoch": 0.15005771450557906, + "flos": 632408602368.0, + "grad_norm": 0.04036301028093645, + "language_loss": 0.90477651, + "learning_rate": 0.0009626813996273114, + "loss": 0.91549194, + "num_input_tokens_seen": 64813456, + "router_z_loss_mlp": 0.56152344, + "step": 780, + "time_per_iteration": 2.8476834297180176 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064638, + "balance_loss_mlp": 1.00884771, + "epoch": 0.15025009619084262, + "flos": 579166358784.0, + "grad_norm": 0.036574622666600026, + "language_loss": 0.89819682, + "learning_rate": 0.0009625632095220198, + "loss": 0.90884316, + "num_input_tokens_seen": 64896816, + "router_z_loss_mlp": 0.55957031, + "step": 781, + "time_per_iteration": 2.8279531002044678 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065406, + "balance_loss_mlp": 1.00961614, + "epoch": 0.1504424778761062, + "flos": 484857060096.0, + "grad_norm": 0.04416373966784989, + "language_loss": 0.8858574, + "learning_rate": 0.0009624448398337637, + "loss": 0.89651144, + "num_input_tokens_seen": 64964176, + "router_z_loss_mlp": 0.55957031, + "step": 782, + "time_per_iteration": 2.512742280960083 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062842, + "balance_loss_mlp": 1.0075767, + "epoch": 0.15063485956136977, + "flos": 763895708928.0, + "grad_norm": 0.03630111779859241, + "language_loss": 0.90811443, + "learning_rate": 0.0009623262906084984, + "loss": 0.9187429, + "num_input_tokens_seen": 65042592, + "router_z_loss_mlp": 0.55419922, + "step": 783, + "time_per_iteration": 3.0409936904907227 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066453, + "balance_loss_mlp": 1.01156867, + "epoch": 0.15082724124663333, + "flos": 498676984320.0, + "grad_norm": 0.03758683048429116, + "language_loss": 0.91324949, + "learning_rate": 0.0009622075618922486, + "loss": 0.92391407, + "num_input_tokens_seen": 65114576, + "router_z_loss_mlp": 0.55029297, + "step": 784, + "time_per_iteration": 2.716580629348755 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066623, + "balance_loss_mlp": 1.01188219, + "epoch": 0.15101962293189689, + "flos": 510722882304.0, + "grad_norm": 0.0361748672236624, + "language_loss": 0.88713133, + "learning_rate": 0.0009620886537311091, + "loss": 0.89779752, + "num_input_tokens_seen": 65186640, + "router_z_loss_mlp": 0.54882812, + "step": 785, + "time_per_iteration": 2.7197515964508057 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065435, + "balance_loss_mlp": 1.01069367, + "epoch": 0.15121200461716044, + "flos": 458702532864.0, + "grad_norm": 0.0476660620131034, + "language_loss": 0.86751854, + "learning_rate": 0.000961969566171244, + "loss": 0.87817287, + "num_input_tokens_seen": 65252112, + "router_z_loss_mlp": 0.54882812, + "step": 786, + "time_per_iteration": 2.519826650619507 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063397, + "balance_loss_mlp": 1.00865602, + "epoch": 0.151404386302424, + "flos": 539017908480.0, + "grad_norm": 0.0401982478312821, + "language_loss": 0.91594857, + "learning_rate": 0.0009618502992588873, + "loss": 0.92658257, + "num_input_tokens_seen": 65318912, + "router_z_loss_mlp": 0.54882812, + "step": 787, + "time_per_iteration": 2.6427645683288574 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076455, + "balance_loss_mlp": 1.02133262, + "epoch": 0.15159676798768756, + "flos": 689617860864.0, + "grad_norm": 0.04258050045209434, + "language_loss": 0.8916502, + "learning_rate": 0.0009617308530403424, + "loss": 0.9024148, + "num_input_tokens_seen": 65395424, + "router_z_loss_mlp": 0.55273438, + "step": 788, + "time_per_iteration": 3.0662577152252197 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106461, + "balance_loss_mlp": 1.00958323, + "epoch": 0.15178914967295112, + "flos": 546433832448.0, + "grad_norm": 0.03354297731817266, + "language_loss": 0.88695067, + "learning_rate": 0.0009616112275619825, + "loss": 0.89759684, + "num_input_tokens_seen": 65470480, + "router_z_loss_mlp": 0.55175781, + "step": 789, + "time_per_iteration": 2.7230606079101562 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065845, + "balance_loss_mlp": 1.01081765, + "epoch": 0.1519815313582147, + "flos": 512815749120.0, + "grad_norm": 0.03087624340708216, + "language_loss": 0.85391772, + "learning_rate": 0.0009614914228702503, + "loss": 0.86457616, + "num_input_tokens_seen": 65544720, + "router_z_loss_mlp": 0.55175781, + "step": 790, + "time_per_iteration": 2.6690316200256348 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075082, + "balance_loss_mlp": 1.02024603, + "epoch": 0.15217391304347827, + "flos": 685458372096.0, + "grad_norm": 0.03877155611381102, + "language_loss": 0.90952718, + "learning_rate": 0.0009613714390116581, + "loss": 0.92027801, + "num_input_tokens_seen": 65627872, + "router_z_loss_mlp": 0.54980469, + "step": 791, + "time_per_iteration": 3.006898880004883 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069036, + "balance_loss_mlp": 1.01396108, + "epoch": 0.15236629472874183, + "flos": 645446982144.0, + "grad_norm": 0.03750254169389994, + "language_loss": 0.87660968, + "learning_rate": 0.0009612512760327879, + "loss": 0.88730001, + "num_input_tokens_seen": 65705264, + "router_z_loss_mlp": 0.55224609, + "step": 792, + "time_per_iteration": 2.858262062072754 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068429, + "balance_loss_mlp": 1.01297235, + "epoch": 0.1525586764140054, + "flos": 413765660928.0, + "grad_norm": 0.044925092089749936, + "language_loss": 0.86468709, + "learning_rate": 0.0009611309339802909, + "loss": 0.87537134, + "num_input_tokens_seen": 65768592, + "router_z_loss_mlp": 0.55615234, + "step": 793, + "time_per_iteration": 2.498229742050171 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070738, + "balance_loss_mlp": 1.01485312, + "epoch": 0.15275105809926895, + "flos": 804234687744.0, + "grad_norm": 0.03634630877191588, + "language_loss": 0.85518378, + "learning_rate": 0.0009610104129008881, + "loss": 0.8658911, + "num_input_tokens_seen": 65852432, + "router_z_loss_mlp": 0.56054688, + "step": 794, + "time_per_iteration": 3.119896173477173 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064568, + "balance_loss_mlp": 1.0088737, + "epoch": 0.1529434397845325, + "flos": 613543632384.0, + "grad_norm": 0.039196324818253456, + "language_loss": 0.89691782, + "learning_rate": 0.0009608897128413701, + "loss": 0.90756351, + "num_input_tokens_seen": 65927904, + "router_z_loss_mlp": 0.55859375, + "step": 795, + "time_per_iteration": 2.7244484424591064 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065096, + "balance_loss_mlp": 1.00949657, + "epoch": 0.15313582146979607, + "flos": 616472478720.0, + "grad_norm": 0.031652256183926086, + "language_loss": 0.86697376, + "learning_rate": 0.0009607688338485965, + "loss": 0.87762469, + "num_input_tokens_seen": 66006800, + "router_z_loss_mlp": 0.55761719, + "step": 796, + "time_per_iteration": 2.859959363937378 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106534, + "balance_loss_mlp": 1.00997913, + "epoch": 0.15332820315505963, + "flos": 794993214720.0, + "grad_norm": 0.036135713167076366, + "language_loss": 0.91464871, + "learning_rate": 0.0009606477759694969, + "loss": 0.92530215, + "num_input_tokens_seen": 66088608, + "router_z_loss_mlp": 0.55517578, + "step": 797, + "time_per_iteration": 3.0383169651031494 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063804, + "balance_loss_mlp": 1.00806129, + "epoch": 0.1535205848403232, + "flos": 551257247232.0, + "grad_norm": 0.04267360012583918, + "language_loss": 0.89290035, + "learning_rate": 0.0009605265392510703, + "loss": 0.90353841, + "num_input_tokens_seen": 66153616, + "router_z_loss_mlp": 0.55908203, + "step": 798, + "time_per_iteration": 2.642423152923584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063071, + "balance_loss_mlp": 1.00732899, + "epoch": 0.15371296652558677, + "flos": 536979476736.0, + "grad_norm": 0.03662373873498648, + "language_loss": 0.93232477, + "learning_rate": 0.0009604051237403846, + "loss": 0.94295549, + "num_input_tokens_seen": 66219472, + "router_z_loss_mlp": 0.55908203, + "step": 799, + "time_per_iteration": 2.6661648750305176 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062966, + "balance_loss_mlp": 1.00693774, + "epoch": 0.15390534821085033, + "flos": 396090504192.0, + "grad_norm": 0.042222005302764924, + "language_loss": 0.87381375, + "learning_rate": 0.0009602835294845776, + "loss": 0.8844434, + "num_input_tokens_seen": 66281456, + "router_z_loss_mlp": 0.56201172, + "step": 800, + "time_per_iteration": 2.4529898166656494 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060851, + "balance_loss_mlp": 1.00520432, + "epoch": 0.1540977298961139, + "flos": 536886157824.0, + "grad_norm": 0.03888031973735598, + "language_loss": 0.91938102, + "learning_rate": 0.0009601617565308565, + "loss": 0.92998952, + "num_input_tokens_seen": 66348160, + "router_z_loss_mlp": 0.55810547, + "step": 801, + "time_per_iteration": 2.6380698680877686 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064031, + "balance_loss_mlp": 1.0085746, + "epoch": 0.15429011158137745, + "flos": 725091628800.0, + "grad_norm": 0.03523983772327724, + "language_loss": 0.87975162, + "learning_rate": 0.0009600398049264977, + "loss": 0.89039195, + "num_input_tokens_seen": 66430576, + "router_z_loss_mlp": 0.55615234, + "step": 802, + "time_per_iteration": 2.9610986709594727 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064973, + "balance_loss_mlp": 1.00970769, + "epoch": 0.154482493266641, + "flos": 621749849088.0, + "grad_norm": 0.04424510077845192, + "language_loss": 0.93353879, + "learning_rate": 0.0009599176747188469, + "loss": 0.94418848, + "num_input_tokens_seen": 66506480, + "router_z_loss_mlp": 0.55419922, + "step": 803, + "time_per_iteration": 2.883296251296997 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065036, + "balance_loss_mlp": 1.00981843, + "epoch": 0.15467487495190457, + "flos": 526720243968.0, + "grad_norm": 0.03833070581853241, + "language_loss": 0.84471631, + "learning_rate": 0.0009597953659553196, + "loss": 0.85536671, + "num_input_tokens_seen": 66577680, + "router_z_loss_mlp": 0.55371094, + "step": 804, + "time_per_iteration": 2.7128705978393555 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062393, + "balance_loss_mlp": 1.00712788, + "epoch": 0.15486725663716813, + "flos": 528760621056.0, + "grad_norm": 0.03896986919959599, + "language_loss": 0.90159577, + "learning_rate": 0.0009596728786833997, + "loss": 0.9122197, + "num_input_tokens_seen": 66648496, + "router_z_loss_mlp": 0.55419922, + "step": 805, + "time_per_iteration": 2.605398178100586 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062158, + "balance_loss_mlp": 1.00684452, + "epoch": 0.1550596383224317, + "flos": 1050280295424.0, + "grad_norm": 0.039312204875199507, + "language_loss": 0.90827858, + "learning_rate": 0.0009595502129506415, + "loss": 0.91890013, + "num_input_tokens_seen": 66735216, + "router_z_loss_mlp": 0.5546875, + "step": 806, + "time_per_iteration": 3.355556011199951 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062439, + "balance_loss_mlp": 1.00736439, + "epoch": 0.15525202000769528, + "flos": 614837458176.0, + "grad_norm": 0.03934214137038287, + "language_loss": 0.83726299, + "learning_rate": 0.0009594273688046678, + "loss": 0.8478874, + "num_input_tokens_seen": 66810672, + "router_z_loss_mlp": 0.55224609, + "step": 807, + "time_per_iteration": 2.765700101852417 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062118, + "balance_loss_mlp": 1.00728118, + "epoch": 0.15544440169295884, + "flos": 534103120128.0, + "grad_norm": 0.042258492962953934, + "language_loss": 0.86714661, + "learning_rate": 0.000959304346293171, + "loss": 0.8777678, + "num_input_tokens_seen": 66879824, + "router_z_loss_mlp": 0.54980469, + "step": 808, + "time_per_iteration": 2.6490986347198486 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064119, + "balance_loss_mlp": 1.00928247, + "epoch": 0.1556367833782224, + "flos": 645887331840.0, + "grad_norm": 0.047675746935091516, + "language_loss": 0.89139616, + "learning_rate": 0.0009591811454639125, + "loss": 0.90203738, + "num_input_tokens_seen": 66949424, + "router_z_loss_mlp": 0.54980469, + "step": 809, + "time_per_iteration": 2.7880568504333496 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059631, + "balance_loss_mlp": 1.00469911, + "epoch": 0.15582916506348596, + "flos": 544953368832.0, + "grad_norm": 0.05205155355433054, + "language_loss": 0.89500809, + "learning_rate": 0.0009590577663647234, + "loss": 0.90560436, + "num_input_tokens_seen": 67024000, + "router_z_loss_mlp": 0.55078125, + "step": 810, + "time_per_iteration": 2.743067741394043 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061907, + "balance_loss_mlp": 1.0068804, + "epoch": 0.15602154674874952, + "flos": 581215484160.0, + "grad_norm": 0.039153260843753375, + "language_loss": 0.87186325, + "learning_rate": 0.0009589342090435036, + "loss": 0.88248235, + "num_input_tokens_seen": 67100672, + "router_z_loss_mlp": 0.55175781, + "step": 811, + "time_per_iteration": 2.806425094604492 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106086, + "balance_loss_mlp": 1.00607169, + "epoch": 0.15621392843401308, + "flos": 536317496064.0, + "grad_norm": 0.04937652455074429, + "language_loss": 0.88453877, + "learning_rate": 0.0009588104735482223, + "loss": 0.89514732, + "num_input_tokens_seen": 67171584, + "router_z_loss_mlp": 0.54931641, + "step": 812, + "time_per_iteration": 2.647728204727173 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060077, + "balance_loss_mlp": 1.00538397, + "epoch": 0.15640631011927664, + "flos": 551982411264.0, + "grad_norm": 0.04402679292728805, + "language_loss": 0.85281312, + "learning_rate": 0.0009586865599269177, + "loss": 0.86341381, + "num_input_tokens_seen": 67240640, + "router_z_loss_mlp": 0.54833984, + "step": 813, + "time_per_iteration": 2.642218828201294 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061354, + "balance_loss_mlp": 1.0069474, + "epoch": 0.1565986918045402, + "flos": 638636658432.0, + "grad_norm": 0.0415768255708782, + "language_loss": 0.89702487, + "learning_rate": 0.0009585624682276977, + "loss": 0.90763843, + "num_input_tokens_seen": 67312976, + "router_z_loss_mlp": 0.54541016, + "step": 814, + "time_per_iteration": 2.7770931720733643 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01058453, + "balance_loss_mlp": 1.00366414, + "epoch": 0.15679107348980378, + "flos": 491782089984.0, + "grad_norm": 0.039213144049943555, + "language_loss": 0.88436091, + "learning_rate": 0.0009584381984987386, + "loss": 0.89494538, + "num_input_tokens_seen": 67378528, + "router_z_loss_mlp": 0.54931641, + "step": 815, + "time_per_iteration": 2.617560386657715 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061581, + "balance_loss_mlp": 1.00655353, + "epoch": 0.15698345517506734, + "flos": 531003187200.0, + "grad_norm": 0.030486806446719653, + "language_loss": 0.91117728, + "learning_rate": 0.0009583137507882864, + "loss": 0.92179304, + "num_input_tokens_seen": 67449728, + "router_z_loss_mlp": 0.55175781, + "step": 816, + "time_per_iteration": 2.6757051944732666 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060617, + "balance_loss_mlp": 1.00568497, + "epoch": 0.1571758368603309, + "flos": 547078316544.0, + "grad_norm": 0.03910336486934304, + "language_loss": 0.82217371, + "learning_rate": 0.000958189125144656, + "loss": 0.83277988, + "num_input_tokens_seen": 67520512, + "router_z_loss_mlp": 0.55078125, + "step": 817, + "time_per_iteration": 2.7065701484680176 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061392, + "balance_loss_mlp": 1.00655591, + "epoch": 0.15736821854559446, + "flos": 566744272896.0, + "grad_norm": 0.03730967846547413, + "language_loss": 0.89150202, + "learning_rate": 0.0009580643216162313, + "loss": 0.90211594, + "num_input_tokens_seen": 67592464, + "router_z_loss_mlp": 0.54980469, + "step": 818, + "time_per_iteration": 2.6849937438964844 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106252, + "balance_loss_mlp": 1.00792253, + "epoch": 0.15756060023085802, + "flos": 501954806784.0, + "grad_norm": 0.041127076818974775, + "language_loss": 0.80838168, + "learning_rate": 0.0009579393402514652, + "loss": 0.81900686, + "num_input_tokens_seen": 67658928, + "router_z_loss_mlp": 0.54736328, + "step": 819, + "time_per_iteration": 2.615342378616333 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060056, + "balance_loss_mlp": 1.00560164, + "epoch": 0.15775298191612158, + "flos": 520272502272.0, + "grad_norm": 0.037825026421493144, + "language_loss": 0.91941106, + "learning_rate": 0.0009578141810988801, + "loss": 0.93001157, + "num_input_tokens_seen": 67727936, + "router_z_loss_mlp": 0.54589844, + "step": 820, + "time_per_iteration": 2.6530544757843018 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061362, + "balance_loss_mlp": 1.00666904, + "epoch": 0.15794536360138514, + "flos": 467088584448.0, + "grad_norm": 0.039348813654249644, + "language_loss": 0.92238629, + "learning_rate": 0.0009576888442070668, + "loss": 0.93299985, + "num_input_tokens_seen": 67795488, + "router_z_loss_mlp": 0.54833984, + "step": 821, + "time_per_iteration": 2.5978658199310303 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062641, + "balance_loss_mlp": 1.00809062, + "epoch": 0.1581377452866487, + "flos": 518168941824.0, + "grad_norm": 0.03790806580601569, + "language_loss": 0.93657464, + "learning_rate": 0.0009575633296246854, + "loss": 0.94720107, + "num_input_tokens_seen": 67858896, + "router_z_loss_mlp": 0.546875, + "step": 822, + "time_per_iteration": 2.582139492034912 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061663, + "balance_loss_mlp": 1.00711334, + "epoch": 0.15833012697191226, + "flos": 550838284800.0, + "grad_norm": 0.03604802690546967, + "language_loss": 0.84146446, + "learning_rate": 0.0009574376374004652, + "loss": 0.85208106, + "num_input_tokens_seen": 67924864, + "router_z_loss_mlp": 0.546875, + "step": 823, + "time_per_iteration": 2.6182329654693604 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061584, + "balance_loss_mlp": 1.00703347, + "epoch": 0.15852250865717585, + "flos": 488467329024.0, + "grad_norm": 0.0382059884648543, + "language_loss": 0.82121176, + "learning_rate": 0.000957311767583204, + "loss": 0.83182758, + "num_input_tokens_seen": 67992912, + "router_z_loss_mlp": 0.546875, + "step": 824, + "time_per_iteration": 2.584266185760498 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01057571, + "balance_loss_mlp": 1.00531006, + "epoch": 0.1587148903424394, + "flos": 1312699441152.0, + "grad_norm": 0.00659207066158758, + "language_loss": 0.8207159, + "learning_rate": 0.0009571857202217691, + "loss": 0.83129162, + "num_input_tokens_seen": 68207408, + "router_z_loss_mlp": 0.5234375, + "step": 825, + "time_per_iteration": 4.734830856323242 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064159, + "balance_loss_mlp": 1.00965643, + "epoch": 0.15890727202770297, + "flos": 467833190400.0, + "grad_norm": 0.04624650490850591, + "language_loss": 0.92764026, + "learning_rate": 0.0009570594953650961, + "loss": 0.93828189, + "num_input_tokens_seen": 68270864, + "router_z_loss_mlp": 0.54638672, + "step": 826, + "time_per_iteration": 2.5117454528808594 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106388, + "balance_loss_mlp": 1.00937772, + "epoch": 0.15909965371296653, + "flos": 778607993088.0, + "grad_norm": 0.03976637787958364, + "language_loss": 0.81327987, + "learning_rate": 0.00095693309306219, + "loss": 0.8239187, + "num_input_tokens_seen": 68355408, + "router_z_loss_mlp": 0.54638672, + "step": 827, + "time_per_iteration": 3.1954681873321533 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060739, + "balance_loss_mlp": 1.00599849, + "epoch": 0.1592920353982301, + "flos": 1079964411648.0, + "grad_norm": 0.038150784713437476, + "language_loss": 0.89750922, + "learning_rate": 0.0009568065133621244, + "loss": 0.90811658, + "num_input_tokens_seen": 68437072, + "router_z_loss_mlp": 0.54882812, + "step": 828, + "time_per_iteration": 3.3355016708374023 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060884, + "balance_loss_mlp": 1.00642896, + "epoch": 0.15948441708349365, + "flos": 726890932992.0, + "grad_norm": 0.03986186218144037, + "language_loss": 0.85834098, + "learning_rate": 0.0009566797563140422, + "loss": 0.86894989, + "num_input_tokens_seen": 68511696, + "router_z_loss_mlp": 0.54589844, + "step": 829, + "time_per_iteration": 2.873845100402832 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059272, + "balance_loss_mlp": 1.00519884, + "epoch": 0.1596767987687572, + "flos": 580076215296.0, + "grad_norm": 0.03433333328837374, + "language_loss": 0.89395094, + "learning_rate": 0.0009565528219671547, + "loss": 0.90454364, + "num_input_tokens_seen": 68587488, + "router_z_loss_mlp": 0.54199219, + "step": 830, + "time_per_iteration": 2.9566032886505127 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063846, + "balance_loss_mlp": 1.00991619, + "epoch": 0.15986918045402077, + "flos": 530026256640.0, + "grad_norm": 0.037800776955081314, + "language_loss": 0.86586118, + "learning_rate": 0.0009564257103707418, + "loss": 0.87649965, + "num_input_tokens_seen": 68655760, + "router_z_loss_mlp": 0.54052734, + "step": 831, + "time_per_iteration": 2.6305205821990967 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062726, + "balance_loss_mlp": 1.00870061, + "epoch": 0.16006156213928435, + "flos": 575670796032.0, + "grad_norm": 0.04196239075383403, + "language_loss": 0.92502224, + "learning_rate": 0.0009562984215741533, + "loss": 0.93564951, + "num_input_tokens_seen": 68724560, + "router_z_loss_mlp": 0.54150391, + "step": 832, + "time_per_iteration": 2.6781210899353027 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061607, + "balance_loss_mlp": 1.00743783, + "epoch": 0.1602539438245479, + "flos": 516675839232.0, + "grad_norm": 0.039654673227061156, + "language_loss": 0.83729708, + "learning_rate": 0.0009561709556268065, + "loss": 0.84791321, + "num_input_tokens_seen": 68795440, + "router_z_loss_mlp": 0.54296875, + "step": 833, + "time_per_iteration": 2.732191801071167 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064816, + "balance_loss_mlp": 1.01021826, + "epoch": 0.16044632550981147, + "flos": 622162008576.0, + "grad_norm": 0.03600956841171521, + "language_loss": 0.95349514, + "learning_rate": 0.0009560433125781884, + "loss": 0.96414334, + "num_input_tokens_seen": 68868176, + "router_z_loss_mlp": 0.54736328, + "step": 834, + "time_per_iteration": 4.227160215377808 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063475, + "balance_loss_mlp": 1.008973, + "epoch": 0.16063870719507503, + "flos": 562128883200.0, + "grad_norm": 0.03652136008848007, + "language_loss": 0.94107795, + "learning_rate": 0.0009559154924778544, + "loss": 0.95171273, + "num_input_tokens_seen": 68939616, + "router_z_loss_mlp": 0.54638672, + "step": 835, + "time_per_iteration": 2.7238283157348633 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066589, + "balance_loss_mlp": 1.01251614, + "epoch": 0.1608310888803386, + "flos": 806561824512.0, + "grad_norm": 0.044196177378580975, + "language_loss": 0.86185992, + "learning_rate": 0.0009557874953754284, + "loss": 0.87252581, + "num_input_tokens_seen": 69016192, + "router_z_loss_mlp": 0.54199219, + "step": 836, + "time_per_iteration": 3.03965425491333 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063553, + "balance_loss_mlp": 1.00943184, + "epoch": 0.16102347056560215, + "flos": 601695065856.0, + "grad_norm": 0.04086380423696876, + "language_loss": 0.84961462, + "learning_rate": 0.0009556593213206038, + "loss": 0.86025023, + "num_input_tokens_seen": 69089360, + "router_z_loss_mlp": 0.54248047, + "step": 837, + "time_per_iteration": 2.714165687561035 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063546, + "balance_loss_mlp": 1.0095681, + "epoch": 0.1612158522508657, + "flos": 554615749632.0, + "grad_norm": 0.03942211179170501, + "language_loss": 0.88284755, + "learning_rate": 0.0009555309703631414, + "loss": 0.89348304, + "num_input_tokens_seen": 69161952, + "router_z_loss_mlp": 0.54101562, + "step": 838, + "time_per_iteration": 2.6616575717926025 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061318, + "balance_loss_mlp": 1.00729215, + "epoch": 0.16140823393612927, + "flos": 557018708736.0, + "grad_norm": 0.03970121061853926, + "language_loss": 0.88476837, + "learning_rate": 0.0009554024425528722, + "loss": 0.89538157, + "num_input_tokens_seen": 69232432, + "router_z_loss_mlp": 0.54150391, + "step": 839, + "time_per_iteration": 2.6778693199157715 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061761, + "balance_loss_mlp": 1.00792611, + "epoch": 0.16160061562139286, + "flos": 544909627392.0, + "grad_norm": 0.03616953348933095, + "language_loss": 0.90216744, + "learning_rate": 0.0009552737379396948, + "loss": 0.91278505, + "num_input_tokens_seen": 69297696, + "router_z_loss_mlp": 0.53955078, + "step": 840, + "time_per_iteration": 2.6190080642700195 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060215, + "balance_loss_mlp": 1.00638056, + "epoch": 0.16179299730665642, + "flos": 605007881472.0, + "grad_norm": 0.03485432207779616, + "language_loss": 0.88917094, + "learning_rate": 0.0009551448565735767, + "loss": 0.89977312, + "num_input_tokens_seen": 69373888, + "router_z_loss_mlp": 0.53955078, + "step": 841, + "time_per_iteration": 2.771730422973633 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059839, + "balance_loss_mlp": 1.00624251, + "epoch": 0.16198537899191998, + "flos": 788552275968.0, + "grad_norm": 0.040424272174261144, + "language_loss": 0.855564, + "learning_rate": 0.0009550157985045543, + "loss": 0.86616236, + "num_input_tokens_seen": 69449984, + "router_z_loss_mlp": 0.53710938, + "step": 842, + "time_per_iteration": 3.014448642730713 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063416, + "balance_loss_mlp": 1.00986671, + "epoch": 0.16217776067718354, + "flos": 520830470400.0, + "grad_norm": 0.03210449059239548, + "language_loss": 0.9010545, + "learning_rate": 0.0009548865637827321, + "loss": 0.91168869, + "num_input_tokens_seen": 69522736, + "router_z_loss_mlp": 0.53662109, + "step": 843, + "time_per_iteration": 2.663733959197998 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060664, + "balance_loss_mlp": 1.00725794, + "epoch": 0.1623701423624471, + "flos": 506255246592.0, + "grad_norm": 0.04236042945807781, + "language_loss": 0.91279781, + "learning_rate": 0.0009547571524582838, + "loss": 0.92340446, + "num_input_tokens_seen": 69587184, + "router_z_loss_mlp": 0.53515625, + "step": 844, + "time_per_iteration": 2.5841143131256104 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061607, + "balance_loss_mlp": 1.00848722, + "epoch": 0.16256252404771065, + "flos": 498157900032.0, + "grad_norm": 0.043042899099755685, + "language_loss": 0.93573415, + "learning_rate": 0.0009546275645814512, + "loss": 0.94635028, + "num_input_tokens_seen": 69656560, + "router_z_loss_mlp": 0.53222656, + "step": 845, + "time_per_iteration": 2.601743221282959 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064884, + "balance_loss_mlp": 1.01152599, + "epoch": 0.16275490573297421, + "flos": 503287516416.0, + "grad_norm": 0.046422900850994125, + "language_loss": 0.90658545, + "learning_rate": 0.0009544978002025446, + "loss": 0.9172343, + "num_input_tokens_seen": 69723872, + "router_z_loss_mlp": 0.53466797, + "step": 846, + "time_per_iteration": 2.582463502883911 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062884, + "balance_loss_mlp": 1.00957346, + "epoch": 0.16294728741823777, + "flos": 508354916352.0, + "grad_norm": 0.03474620131823351, + "language_loss": 0.88017273, + "learning_rate": 0.0009543678593719434, + "loss": 0.89080155, + "num_input_tokens_seen": 69795504, + "router_z_loss_mlp": 0.53417969, + "step": 847, + "time_per_iteration": 2.7039546966552734 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067088, + "balance_loss_mlp": 1.01334834, + "epoch": 0.16313966910350133, + "flos": 510757875456.0, + "grad_norm": 0.031134263506057067, + "language_loss": 0.88570058, + "learning_rate": 0.0009542377421400945, + "loss": 0.89637142, + "num_input_tokens_seen": 69873408, + "router_z_loss_mlp": 0.53857422, + "step": 848, + "time_per_iteration": 2.79311203956604 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061983, + "balance_loss_mlp": 1.00810015, + "epoch": 0.16333205078876492, + "flos": 545057381376.0, + "grad_norm": 0.03805815068737175, + "language_loss": 0.84448338, + "learning_rate": 0.0009541074485575145, + "loss": 0.85510319, + "num_input_tokens_seen": 69944112, + "router_z_loss_mlp": 0.54003906, + "step": 849, + "time_per_iteration": 2.714644193649292 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106829, + "balance_loss_mlp": 1.01450312, + "epoch": 0.16352443247402848, + "flos": 508712640768.0, + "grad_norm": 0.03447226436126556, + "language_loss": 0.93184924, + "learning_rate": 0.0009539769786747874, + "loss": 0.94253218, + "num_input_tokens_seen": 70012288, + "router_z_loss_mlp": 0.5390625, + "step": 850, + "time_per_iteration": 2.5857110023498535 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070929, + "balance_loss_mlp": 1.01709449, + "epoch": 0.16371681415929204, + "flos": 543223084032.0, + "grad_norm": 0.036141614394747515, + "language_loss": 0.82550752, + "learning_rate": 0.0009538463325425665, + "loss": 0.83621687, + "num_input_tokens_seen": 70086560, + "router_z_loss_mlp": 0.53955078, + "step": 851, + "time_per_iteration": 2.7186405658721924 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066598, + "balance_loss_mlp": 1.01242912, + "epoch": 0.1639091958445556, + "flos": 521761714176.0, + "grad_norm": 0.03784697093976771, + "language_loss": 0.87203169, + "learning_rate": 0.0009537155102115728, + "loss": 0.8826977, + "num_input_tokens_seen": 70153968, + "router_z_loss_mlp": 0.54296875, + "step": 852, + "time_per_iteration": 2.5761775970458984 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061784, + "balance_loss_mlp": 1.00771022, + "epoch": 0.16410157752981916, + "flos": 548482957824.0, + "grad_norm": 0.03731294741121226, + "language_loss": 0.85278255, + "learning_rate": 0.0009535845117325961, + "loss": 0.8634004, + "num_input_tokens_seen": 70222496, + "router_z_loss_mlp": 0.54199219, + "step": 853, + "time_per_iteration": 2.6968846321105957 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065026, + "balance_loss_mlp": 1.01085758, + "epoch": 0.16429395921508272, + "flos": 584026712064.0, + "grad_norm": 0.031860977478103375, + "language_loss": 0.9423098, + "learning_rate": 0.0009534533371564946, + "loss": 0.95296007, + "num_input_tokens_seen": 70301680, + "router_z_loss_mlp": 0.54296875, + "step": 854, + "time_per_iteration": 2.7640349864959717 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106098, + "balance_loss_mlp": 1.00709713, + "epoch": 0.16448634090034628, + "flos": 531962621184.0, + "grad_norm": 0.03950290113288642, + "language_loss": 0.89868152, + "learning_rate": 0.0009533219865341949, + "loss": 0.90929133, + "num_input_tokens_seen": 70371152, + "router_z_loss_mlp": 0.54003906, + "step": 855, + "time_per_iteration": 2.6025009155273438 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060489, + "balance_loss_mlp": 1.00693989, + "epoch": 0.16467872258560984, + "flos": 492961209600.0, + "grad_norm": 0.03645156199748424, + "language_loss": 0.87602645, + "learning_rate": 0.0009531904599166916, + "loss": 0.88663131, + "num_input_tokens_seen": 70440832, + "router_z_loss_mlp": 0.53662109, + "step": 856, + "time_per_iteration": 2.656604290008545 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060758, + "balance_loss_mlp": 1.00730467, + "epoch": 0.16487110427087343, + "flos": 507260367360.0, + "grad_norm": 0.04426557796634758, + "language_loss": 0.86560714, + "learning_rate": 0.0009530587573550478, + "loss": 0.87621474, + "num_input_tokens_seen": 70507424, + "router_z_loss_mlp": 0.53564453, + "step": 857, + "time_per_iteration": 2.610445261001587 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01056503, + "balance_loss_mlp": 1.00538635, + "epoch": 0.16506348595613698, + "flos": 1436111555328.0, + "grad_norm": 0.010874217326465607, + "language_loss": 0.74319386, + "learning_rate": 0.0009529268789003953, + "loss": 0.75375891, + "num_input_tokens_seen": 70742320, + "router_z_loss_mlp": 0.51171875, + "step": 858, + "time_per_iteration": 4.991516590118408 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060779, + "balance_loss_mlp": 1.00718212, + "epoch": 0.16525586764140054, + "flos": 478090477824.0, + "grad_norm": 0.04454190836652637, + "language_loss": 0.91544032, + "learning_rate": 0.0009527948246039337, + "loss": 0.9260481, + "num_input_tokens_seen": 70808400, + "router_z_loss_mlp": 0.53710938, + "step": 859, + "time_per_iteration": 2.538290500640869 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01058142, + "balance_loss_mlp": 1.00425971, + "epoch": 0.1654482493266641, + "flos": 882541767168.0, + "grad_norm": 0.03991834039284953, + "language_loss": 0.88867122, + "learning_rate": 0.000952662594516931, + "loss": 0.89925265, + "num_input_tokens_seen": 70886192, + "router_z_loss_mlp": 0.54003906, + "step": 860, + "time_per_iteration": 3.083786964416504 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065202, + "balance_loss_mlp": 1.01122451, + "epoch": 0.16564063101192766, + "flos": 628106217216.0, + "grad_norm": 0.03630731527649873, + "language_loss": 0.87934124, + "learning_rate": 0.0009525301886907234, + "loss": 0.88999331, + "num_input_tokens_seen": 70964816, + "router_z_loss_mlp": 0.54101562, + "step": 861, + "time_per_iteration": 2.8606412410736084 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062775, + "balance_loss_mlp": 1.00884438, + "epoch": 0.16583301269719122, + "flos": 562593532416.0, + "grad_norm": 0.03632506699489255, + "language_loss": 0.8885988, + "learning_rate": 0.0009523976071767155, + "loss": 0.89922649, + "num_input_tokens_seen": 71037456, + "router_z_loss_mlp": 0.54052734, + "step": 862, + "time_per_iteration": 2.651202440261841 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062989, + "balance_loss_mlp": 1.0094403, + "epoch": 0.16602539438245478, + "flos": 568984893696.0, + "grad_norm": 0.03883194498572106, + "language_loss": 0.88789731, + "learning_rate": 0.00095226485002638, + "loss": 0.8985272, + "num_input_tokens_seen": 71111872, + "router_z_loss_mlp": 0.53662109, + "step": 863, + "time_per_iteration": 2.798125982284546 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063787, + "balance_loss_mlp": 1.01019073, + "epoch": 0.16621777606771834, + "flos": 576022684416.0, + "grad_norm": 0.03638934937563812, + "language_loss": 0.89892161, + "learning_rate": 0.0009521319172912576, + "loss": 0.90955949, + "num_input_tokens_seen": 71187808, + "router_z_loss_mlp": 0.53710938, + "step": 864, + "time_per_iteration": 4.098716974258423 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105997, + "balance_loss_mlp": 1.00632548, + "epoch": 0.16641015775298193, + "flos": 515598786816.0, + "grad_norm": 0.037169751839881825, + "language_loss": 0.96108532, + "learning_rate": 0.0009519988090229579, + "loss": 0.97168505, + "num_input_tokens_seen": 71261728, + "router_z_loss_mlp": 0.53759766, + "step": 865, + "time_per_iteration": 2.659381628036499 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068447, + "balance_loss_mlp": 1.01489806, + "epoch": 0.1666025394382455, + "flos": 622850234112.0, + "grad_norm": 0.04388029559541895, + "language_loss": 0.88811028, + "learning_rate": 0.0009518655252731576, + "loss": 0.89879477, + "num_input_tokens_seen": 71338352, + "router_z_loss_mlp": 0.53662109, + "step": 866, + "time_per_iteration": 2.738511323928833 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061925, + "balance_loss_mlp": 1.00880551, + "epoch": 0.16679492112350905, + "flos": 549933285888.0, + "grad_norm": 0.03352631932153436, + "language_loss": 0.91113746, + "learning_rate": 0.0009517320660936022, + "loss": 0.92175674, + "num_input_tokens_seen": 71416544, + "router_z_loss_mlp": 0.53222656, + "step": 867, + "time_per_iteration": 2.7755699157714844 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066316, + "balance_loss_mlp": 1.01343453, + "epoch": 0.1669873028087726, + "flos": 666866555904.0, + "grad_norm": 0.04051359913494383, + "language_loss": 0.84396493, + "learning_rate": 0.0009515984315361051, + "loss": 0.85462809, + "num_input_tokens_seen": 71494080, + "router_z_loss_mlp": 0.52978516, + "step": 868, + "time_per_iteration": 2.8502533435821533 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062608, + "balance_loss_mlp": 1.00944042, + "epoch": 0.16717968449403617, + "flos": 539604066816.0, + "grad_norm": 0.03969494402961726, + "language_loss": 0.88029611, + "learning_rate": 0.000951464621652548, + "loss": 0.89092225, + "num_input_tokens_seen": 71562672, + "router_z_loss_mlp": 0.53271484, + "step": 869, + "time_per_iteration": 2.6079800128936768 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065881, + "balance_loss_mlp": 1.01233244, + "epoch": 0.16737206617929973, + "flos": 531279253248.0, + "grad_norm": 0.03349656106003216, + "language_loss": 0.7990135, + "learning_rate": 0.0009513306364948804, + "loss": 0.80967236, + "num_input_tokens_seen": 71641904, + "router_z_loss_mlp": 0.53662109, + "step": 870, + "time_per_iteration": 2.824232578277588 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106371, + "balance_loss_mlp": 1.00987494, + "epoch": 0.1675644478645633, + "flos": 481757127168.0, + "grad_norm": 0.04264569815750397, + "language_loss": 0.90229708, + "learning_rate": 0.0009511964761151197, + "loss": 0.91293418, + "num_input_tokens_seen": 71709616, + "router_z_loss_mlp": 0.53955078, + "step": 871, + "time_per_iteration": 2.6326816082000732 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106642, + "balance_loss_mlp": 1.01344323, + "epoch": 0.16775682954982685, + "flos": 495542058240.0, + "grad_norm": 0.04000245460937008, + "language_loss": 0.91825569, + "learning_rate": 0.0009510621405653521, + "loss": 0.92891991, + "num_input_tokens_seen": 71776592, + "router_z_loss_mlp": 0.53076172, + "step": 872, + "time_per_iteration": 2.5802783966064453 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074346, + "balance_loss_mlp": 1.02151191, + "epoch": 0.1679492112350904, + "flos": 753406096896.0, + "grad_norm": 0.04130745072346603, + "language_loss": 0.85908926, + "learning_rate": 0.0009509276298977309, + "loss": 0.86983275, + "num_input_tokens_seen": 71856352, + "router_z_loss_mlp": 0.52929688, + "step": 873, + "time_per_iteration": 2.9676413536071777 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069963, + "balance_loss_mlp": 1.01689136, + "epoch": 0.168141592920354, + "flos": 1137733583616.0, + "grad_norm": 0.036676349776393134, + "language_loss": 0.82925022, + "learning_rate": 0.0009507929441644778, + "loss": 0.83994985, + "num_input_tokens_seen": 71948480, + "router_z_loss_mlp": 0.53173828, + "step": 874, + "time_per_iteration": 3.5441927909851074 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062646, + "balance_loss_mlp": 1.00924039, + "epoch": 0.16833397460561755, + "flos": 633554674176.0, + "grad_norm": 0.03715311549034911, + "language_loss": 0.86810201, + "learning_rate": 0.0009506580834178826, + "loss": 0.87872851, + "num_input_tokens_seen": 72019200, + "router_z_loss_mlp": 0.53515625, + "step": 875, + "time_per_iteration": 2.767840623855591 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106879, + "balance_loss_mlp": 1.01524162, + "epoch": 0.1685263562908811, + "flos": 542543606784.0, + "grad_norm": 0.041322978640758234, + "language_loss": 0.92533737, + "learning_rate": 0.0009505230477103028, + "loss": 0.93602526, + "num_input_tokens_seen": 72088672, + "router_z_loss_mlp": 0.53662109, + "step": 876, + "time_per_iteration": 2.68626070022583 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064685, + "balance_loss_mlp": 1.01151776, + "epoch": 0.16871873797614467, + "flos": 620486158848.0, + "grad_norm": 0.04979097271806245, + "language_loss": 0.82312369, + "learning_rate": 0.0009503878370941641, + "loss": 0.83377057, + "num_input_tokens_seen": 72159952, + "router_z_loss_mlp": 0.53271484, + "step": 877, + "time_per_iteration": 2.738828182220459 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067432, + "balance_loss_mlp": 1.01455081, + "epoch": 0.16891111966140823, + "flos": 607456527360.0, + "grad_norm": 0.048240798926105125, + "language_loss": 0.90597415, + "learning_rate": 0.0009502524516219595, + "loss": 0.91664839, + "num_input_tokens_seen": 72231648, + "router_z_loss_mlp": 0.52978516, + "step": 878, + "time_per_iteration": 2.7533464431762695 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065414, + "balance_loss_mlp": 1.01234174, + "epoch": 0.1691035013466718, + "flos": 553406494464.0, + "grad_norm": 0.04285435284136928, + "language_loss": 0.91275579, + "learning_rate": 0.0009501168913462506, + "loss": 0.92340994, + "num_input_tokens_seen": 72298608, + "router_z_loss_mlp": 0.53173828, + "step": 879, + "time_per_iteration": 2.6498849391937256 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106115, + "balance_loss_mlp": 1.00946045, + "epoch": 0.16929588303193535, + "flos": 1479308427264.0, + "grad_norm": 0.010969186313753012, + "language_loss": 0.79121923, + "learning_rate": 0.0009499811563196665, + "loss": 0.80183077, + "num_input_tokens_seen": 72525312, + "router_z_loss_mlp": 0.51757812, + "step": 880, + "time_per_iteration": 4.8127734661102295 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065784, + "balance_loss_mlp": 1.01228285, + "epoch": 0.1694882647171989, + "flos": 927848024064.0, + "grad_norm": 0.04254449001590413, + "language_loss": 0.86211771, + "learning_rate": 0.0009498452465949042, + "loss": 0.87277561, + "num_input_tokens_seen": 72612976, + "router_z_loss_mlp": 0.53613281, + "step": 881, + "time_per_iteration": 3.242352247238159 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059657, + "balance_loss_mlp": 1.00668061, + "epoch": 0.1696806464024625, + "flos": 547152193536.0, + "grad_norm": 0.03842920637304405, + "language_loss": 0.92758489, + "learning_rate": 0.0009497091622247285, + "loss": 0.93818152, + "num_input_tokens_seen": 72686800, + "router_z_loss_mlp": 0.53076172, + "step": 882, + "time_per_iteration": 2.7538321018218994 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066328, + "balance_loss_mlp": 1.01363766, + "epoch": 0.16987302808772606, + "flos": 530295519744.0, + "grad_norm": 0.04346709327253658, + "language_loss": 0.94739175, + "learning_rate": 0.0009495729032619723, + "loss": 0.95805502, + "num_input_tokens_seen": 72759360, + "router_z_loss_mlp": 0.52783203, + "step": 883, + "time_per_iteration": 2.681851863861084 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061141, + "balance_loss_mlp": 1.00830746, + "epoch": 0.17006540977298962, + "flos": 756479784960.0, + "grad_norm": 0.03707996109728333, + "language_loss": 0.85065424, + "learning_rate": 0.0009494364697595354, + "loss": 0.86126566, + "num_input_tokens_seen": 72831424, + "router_z_loss_mlp": 0.52929688, + "step": 884, + "time_per_iteration": 2.886613607406616 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01058078, + "balance_loss_mlp": 1.00495851, + "epoch": 0.17025779145825318, + "flos": 559875623424.0, + "grad_norm": 0.04262534374301406, + "language_loss": 0.90753883, + "learning_rate": 0.0009492998617703867, + "loss": 0.91811961, + "num_input_tokens_seen": 72901536, + "router_z_loss_mlp": 0.53222656, + "step": 885, + "time_per_iteration": 2.7197954654693604 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069962, + "balance_loss_mlp": 1.01684284, + "epoch": 0.17045017314351674, + "flos": 513217214976.0, + "grad_norm": 0.04472607646913617, + "language_loss": 0.89151132, + "learning_rate": 0.0009491630793475619, + "loss": 0.90221095, + "num_input_tokens_seen": 72970480, + "router_z_loss_mlp": 0.53222656, + "step": 886, + "time_per_iteration": 2.6023643016815186 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059759, + "balance_loss_mlp": 1.00706899, + "epoch": 0.1706425548287803, + "flos": 510013269504.0, + "grad_norm": 0.03690999998020265, + "language_loss": 0.86250949, + "learning_rate": 0.0009490261225441643, + "loss": 0.87310708, + "num_input_tokens_seen": 73053376, + "router_z_loss_mlp": 0.52783203, + "step": 887, + "time_per_iteration": 2.8811516761779785 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070816, + "balance_loss_mlp": 1.01845872, + "epoch": 0.17083493651404386, + "flos": 718715818752.0, + "grad_norm": 0.037520519160069404, + "language_loss": 0.91723603, + "learning_rate": 0.0009488889914133656, + "loss": 0.92794418, + "num_input_tokens_seen": 73136032, + "router_z_loss_mlp": 0.52441406, + "step": 888, + "time_per_iteration": 2.983920097351074 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067559, + "balance_loss_mlp": 1.01515496, + "epoch": 0.17102731819930742, + "flos": 560201266944.0, + "grad_norm": 0.034570155262309, + "language_loss": 0.90050644, + "learning_rate": 0.0009487516860084047, + "loss": 0.91118205, + "num_input_tokens_seen": 73208544, + "router_z_loss_mlp": 0.52490234, + "step": 889, + "time_per_iteration": 2.739945888519287 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061028, + "balance_loss_mlp": 1.0078603, + "epoch": 0.17121969988457098, + "flos": 495765634560.0, + "grad_norm": 0.04354558177795279, + "language_loss": 0.9033885, + "learning_rate": 0.0009486142063825884, + "loss": 0.91399872, + "num_input_tokens_seen": 73274336, + "router_z_loss_mlp": 0.53271484, + "step": 890, + "time_per_iteration": 2.541325569152832 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107373, + "balance_loss_mlp": 1.02223206, + "epoch": 0.17141208156983456, + "flos": 1552108723968.0, + "grad_norm": 0.01766408052426257, + "language_loss": 0.72426212, + "learning_rate": 0.0009484765525892909, + "loss": 0.73499948, + "num_input_tokens_seen": 73506320, + "router_z_loss_mlp": 0.515625, + "step": 891, + "time_per_iteration": 4.968579053878784 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01058616, + "balance_loss_mlp": 1.00568736, + "epoch": 0.17160446325509812, + "flos": 620700986880.0, + "grad_norm": 0.037544702591063864, + "language_loss": 0.91210532, + "learning_rate": 0.0009483387246819542, + "loss": 0.92269152, + "num_input_tokens_seen": 73578048, + "router_z_loss_mlp": 0.53027344, + "step": 892, + "time_per_iteration": 2.7970938682556152 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071655, + "balance_loss_mlp": 1.0209198, + "epoch": 0.17179684494036168, + "flos": 1384695839232.0, + "grad_norm": 0.01601076320839161, + "language_loss": 0.82285583, + "learning_rate": 0.0009482007227140877, + "loss": 0.83357239, + "num_input_tokens_seen": 73798640, + "router_z_loss_mlp": 0.5078125, + "step": 893, + "time_per_iteration": 4.629605054855347 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066274, + "balance_loss_mlp": 1.01386988, + "epoch": 0.17198922662562524, + "flos": 493642632192.0, + "grad_norm": 0.03763004911158334, + "language_loss": 0.90241146, + "learning_rate": 0.0009480625467392688, + "loss": 0.91307414, + "num_input_tokens_seen": 73867328, + "router_z_loss_mlp": 0.52490234, + "step": 894, + "time_per_iteration": 2.6142358779907227 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068771, + "balance_loss_mlp": 1.01822662, + "epoch": 0.1721816083108888, + "flos": 1461488428800.0, + "grad_norm": 0.016749035753296605, + "language_loss": 0.77994668, + "learning_rate": 0.0009479241968111421, + "loss": 0.79063439, + "num_input_tokens_seen": 74093376, + "router_z_loss_mlp": 0.50585938, + "step": 895, + "time_per_iteration": 4.811494827270508 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112065, + "balance_loss_mlp": 1.06719661, + "epoch": 0.17237398999615236, + "flos": 529205828352.0, + "grad_norm": 0.05241044192650153, + "language_loss": 0.88738441, + "learning_rate": 0.0009477856729834196, + "loss": 0.89859092, + "num_input_tokens_seen": 74169136, + "router_z_loss_mlp": 0.53564453, + "step": 896, + "time_per_iteration": 2.7389612197875977 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066584, + "balance_loss_mlp": 1.01446557, + "epoch": 0.17256637168141592, + "flos": 605027323392.0, + "grad_norm": 0.03860455021635393, + "language_loss": 0.90989411, + "learning_rate": 0.0009476469753098809, + "loss": 0.92055988, + "num_input_tokens_seen": 74236912, + "router_z_loss_mlp": 0.52197266, + "step": 897, + "time_per_iteration": 2.7175238132476807 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077196, + "balance_loss_mlp": 1.02507758, + "epoch": 0.17275875336667948, + "flos": 510694692096.0, + "grad_norm": 0.040412661310783936, + "language_loss": 0.88453948, + "learning_rate": 0.0009475081038443738, + "loss": 0.89531147, + "num_input_tokens_seen": 74305968, + "router_z_loss_mlp": 0.52197266, + "step": 898, + "time_per_iteration": 2.6398110389709473 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079459, + "balance_loss_mlp": 1.02753115, + "epoch": 0.17295113505194307, + "flos": 666502028544.0, + "grad_norm": 0.045107808798334564, + "language_loss": 0.87902451, + "learning_rate": 0.0009473690586408124, + "loss": 0.88981915, + "num_input_tokens_seen": 74384144, + "router_z_loss_mlp": 0.52001953, + "step": 899, + "time_per_iteration": 2.817730665206909 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071678, + "balance_loss_mlp": 1.01965487, + "epoch": 0.17314351673720663, + "flos": 556432550400.0, + "grad_norm": 0.03870851432877784, + "language_loss": 0.87576568, + "learning_rate": 0.0009472298397531792, + "loss": 0.88648236, + "num_input_tokens_seen": 74455040, + "router_z_loss_mlp": 0.52099609, + "step": 900, + "time_per_iteration": 2.6932764053344727 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061802, + "balance_loss_mlp": 1.00892079, + "epoch": 0.17333589842247019, + "flos": 504607587072.0, + "grad_norm": 0.03631909976073519, + "language_loss": 0.87174571, + "learning_rate": 0.0009470904472355235, + "loss": 0.88236374, + "num_input_tokens_seen": 74525248, + "router_z_loss_mlp": 0.52978516, + "step": 901, + "time_per_iteration": 2.669405460357666 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099242, + "balance_loss_mlp": 1.04593205, + "epoch": 0.17352828010773375, + "flos": 557351155200.0, + "grad_norm": 0.04839261993488341, + "language_loss": 0.80976391, + "learning_rate": 0.0009469508811419626, + "loss": 0.82075632, + "num_input_tokens_seen": 74597328, + "router_z_loss_mlp": 0.53417969, + "step": 902, + "time_per_iteration": 2.7412211894989014 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083992, + "balance_loss_mlp": 1.033638, + "epoch": 0.1737206617929973, + "flos": 1557794363136.0, + "grad_norm": 0.02136399149953286, + "language_loss": 0.7161383, + "learning_rate": 0.0009468111415266806, + "loss": 0.72697818, + "num_input_tokens_seen": 74819664, + "router_z_loss_mlp": 0.50390625, + "step": 903, + "time_per_iteration": 4.800720930099487 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075494, + "balance_loss_mlp": 1.02318478, + "epoch": 0.17391304347826086, + "flos": 517756782336.0, + "grad_norm": 0.04178806719411302, + "language_loss": 0.85797513, + "learning_rate": 0.0009466712284439292, + "loss": 0.86873007, + "num_input_tokens_seen": 74896224, + "router_z_loss_mlp": 0.52392578, + "step": 904, + "time_per_iteration": 2.7409780025482178 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076244, + "balance_loss_mlp": 1.02360141, + "epoch": 0.17410542516352442, + "flos": 542161582848.0, + "grad_norm": 0.043268311729831165, + "language_loss": 0.90273786, + "learning_rate": 0.0009465311419480276, + "loss": 0.91350031, + "num_input_tokens_seen": 74966560, + "router_z_loss_mlp": 0.52734375, + "step": 905, + "time_per_iteration": 2.7310986518859863 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068245, + "balance_loss_mlp": 1.01526833, + "epoch": 0.17429780684878798, + "flos": 625082106624.0, + "grad_norm": 0.0375699532684124, + "language_loss": 0.89484948, + "learning_rate": 0.0009463908820933622, + "loss": 0.905532, + "num_input_tokens_seen": 75045248, + "router_z_loss_mlp": 0.53076172, + "step": 906, + "time_per_iteration": 2.8575551509857178 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086696, + "balance_loss_mlp": 1.03281319, + "epoch": 0.17449018853405157, + "flos": 576849915648.0, + "grad_norm": 0.04286783530345041, + "language_loss": 0.83513701, + "learning_rate": 0.0009462504489343868, + "loss": 0.84600401, + "num_input_tokens_seen": 75123952, + "router_z_loss_mlp": 0.54003906, + "step": 907, + "time_per_iteration": 2.83085036277771 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066078, + "balance_loss_mlp": 1.0128628, + "epoch": 0.17468257021931513, + "flos": 534773849088.0, + "grad_norm": 0.0408315501053547, + "language_loss": 0.90177906, + "learning_rate": 0.0009461098425256222, + "loss": 0.91243982, + "num_input_tokens_seen": 75191728, + "router_z_loss_mlp": 0.53320312, + "step": 908, + "time_per_iteration": 2.6000654697418213 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075411, + "balance_loss_mlp": 1.02257717, + "epoch": 0.1748749519045787, + "flos": 541809694464.0, + "grad_norm": 0.0381088809784924, + "language_loss": 0.87053907, + "learning_rate": 0.0009459690629216567, + "loss": 0.88129318, + "num_input_tokens_seen": 75262224, + "router_z_loss_mlp": 0.52929688, + "step": 909, + "time_per_iteration": 2.622178316116333 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080252, + "balance_loss_mlp": 1.02770495, + "epoch": 0.17506733358984225, + "flos": 499627670016.0, + "grad_norm": 0.039096197570908604, + "language_loss": 0.88898331, + "learning_rate": 0.0009458281101771457, + "loss": 0.89978582, + "num_input_tokens_seen": 75329760, + "router_z_loss_mlp": 0.52636719, + "step": 910, + "time_per_iteration": 2.5964770317077637 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064818, + "balance_loss_mlp": 1.01217556, + "epoch": 0.1752597152751058, + "flos": 624133366272.0, + "grad_norm": 0.035444142957055544, + "language_loss": 0.83730716, + "learning_rate": 0.0009456869843468122, + "loss": 0.84795535, + "num_input_tokens_seen": 75407920, + "router_z_loss_mlp": 0.52734375, + "step": 911, + "time_per_iteration": 2.834584951400757 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059336, + "balance_loss_mlp": 1.00650251, + "epoch": 0.17545209696036937, + "flos": 521994038784.0, + "grad_norm": 0.04587594362499167, + "language_loss": 0.79429859, + "learning_rate": 0.0009455456854854459, + "loss": 0.80489194, + "num_input_tokens_seen": 75476752, + "router_z_loss_mlp": 0.52929688, + "step": 912, + "time_per_iteration": 2.627058744430542 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107498, + "balance_loss_mlp": 1.0219084, + "epoch": 0.17564447864563293, + "flos": 462946592256.0, + "grad_norm": 0.044462507375804226, + "language_loss": 0.85522115, + "learning_rate": 0.0009454042136479039, + "loss": 0.86597091, + "num_input_tokens_seen": 75542944, + "router_z_loss_mlp": 0.53173828, + "step": 913, + "time_per_iteration": 2.562453031539917 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106477, + "balance_loss_mlp": 1.01250815, + "epoch": 0.1758368603308965, + "flos": 481618121472.0, + "grad_norm": 0.03599423435064716, + "language_loss": 0.84144086, + "learning_rate": 0.0009452625688891103, + "loss": 0.85208857, + "num_input_tokens_seen": 75609840, + "router_z_loss_mlp": 0.5234375, + "step": 914, + "time_per_iteration": 2.6025402545928955 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063728, + "balance_loss_mlp": 1.0137558, + "epoch": 0.17602924201616005, + "flos": 1482087574272.0, + "grad_norm": 0.013260252544834742, + "language_loss": 0.78734738, + "learning_rate": 0.0009451207512640567, + "loss": 0.79798466, + "num_input_tokens_seen": 75819312, + "router_z_loss_mlp": 0.49902344, + "step": 915, + "time_per_iteration": 4.572151184082031 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107428, + "balance_loss_mlp": 1.0219233, + "epoch": 0.17622162370142364, + "flos": 603471037440.0, + "grad_norm": 0.044830704586910027, + "language_loss": 0.94022703, + "learning_rate": 0.0009449787608278015, + "loss": 0.95096982, + "num_input_tokens_seen": 75893984, + "router_z_loss_mlp": 0.52441406, + "step": 916, + "time_per_iteration": 2.731264114379883 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062837, + "balance_loss_mlp": 1.0104804, + "epoch": 0.1764140053866872, + "flos": 443606279424.0, + "grad_norm": 0.0370205772569368, + "language_loss": 0.92972034, + "learning_rate": 0.0009448365976354704, + "loss": 0.94034874, + "num_input_tokens_seen": 75958944, + "router_z_loss_mlp": 0.52441406, + "step": 917, + "time_per_iteration": 2.478041648864746 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073342, + "balance_loss_mlp": 1.0204134, + "epoch": 0.17660638707195075, + "flos": 501592224768.0, + "grad_norm": 0.047363321454448416, + "language_loss": 0.907022, + "learning_rate": 0.0009446942617422558, + "loss": 0.91775542, + "num_input_tokens_seen": 76024240, + "router_z_loss_mlp": 0.53027344, + "step": 918, + "time_per_iteration": 2.5698564052581787 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060825, + "balance_loss_mlp": 1.00789583, + "epoch": 0.17679876875721431, + "flos": 539984145408.0, + "grad_norm": 0.03732253291641402, + "language_loss": 0.86447889, + "learning_rate": 0.0009445517532034176, + "loss": 0.87508708, + "num_input_tokens_seen": 76095264, + "router_z_loss_mlp": 0.53027344, + "step": 919, + "time_per_iteration": 2.6916563510894775 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062144, + "balance_loss_mlp": 1.00926292, + "epoch": 0.17699115044247787, + "flos": 498715868160.0, + "grad_norm": 0.04444616550081301, + "language_loss": 0.8994987, + "learning_rate": 0.0009444090720742824, + "loss": 0.91012013, + "num_input_tokens_seen": 76163520, + "router_z_loss_mlp": 0.52978516, + "step": 920, + "time_per_iteration": 2.5798380374908447 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069798, + "balance_loss_mlp": 1.01706016, + "epoch": 0.17718353212774143, + "flos": 663916322304.0, + "grad_norm": 0.04662040468857239, + "language_loss": 0.89399016, + "learning_rate": 0.0009442662184102439, + "loss": 0.90468818, + "num_input_tokens_seen": 76233760, + "router_z_loss_mlp": 0.52832031, + "step": 921, + "time_per_iteration": 2.755929708480835 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064095, + "balance_loss_mlp": 1.01164341, + "epoch": 0.177375913813005, + "flos": 583848822528.0, + "grad_norm": 0.03479566109485236, + "language_loss": 0.88455689, + "learning_rate": 0.000944123192266763, + "loss": 0.89519787, + "num_input_tokens_seen": 76310704, + "router_z_loss_mlp": 0.52539062, + "step": 922, + "time_per_iteration": 2.8776824474334717 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062973, + "balance_loss_mlp": 1.00980616, + "epoch": 0.17756829549826855, + "flos": 553684505856.0, + "grad_norm": 0.036018663808135676, + "language_loss": 0.84559548, + "learning_rate": 0.0009439799936993671, + "loss": 0.85622525, + "num_input_tokens_seen": 76386992, + "router_z_loss_mlp": 0.53271484, + "step": 923, + "time_per_iteration": 2.708897113800049 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063296, + "balance_loss_mlp": 1.01041508, + "epoch": 0.17776067718353214, + "flos": 557372542464.0, + "grad_norm": 0.06706828820902193, + "language_loss": 0.89721078, + "learning_rate": 0.0009438366227636511, + "loss": 0.90784371, + "num_input_tokens_seen": 76453328, + "router_z_loss_mlp": 0.52978516, + "step": 924, + "time_per_iteration": 2.6524295806884766 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062711, + "balance_loss_mlp": 1.01035416, + "epoch": 0.1779530588687957, + "flos": 659652820992.0, + "grad_norm": 0.03503923634288643, + "language_loss": 0.87549317, + "learning_rate": 0.0009436930795152763, + "loss": 0.8861202, + "num_input_tokens_seen": 76529040, + "router_z_loss_mlp": 0.52441406, + "step": 925, + "time_per_iteration": 2.8627374172210693 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070161, + "balance_loss_mlp": 1.01823378, + "epoch": 0.17814544055405926, + "flos": 645672503808.0, + "grad_norm": 0.03989967380061369, + "language_loss": 0.87815237, + "learning_rate": 0.0009435493640099713, + "loss": 0.88885403, + "num_input_tokens_seen": 76604080, + "router_z_loss_mlp": 0.52001953, + "step": 926, + "time_per_iteration": 2.7886180877685547 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065048, + "balance_loss_mlp": 1.01283479, + "epoch": 0.17833782223932282, + "flos": 461885091072.0, + "grad_norm": 0.040977111340993126, + "language_loss": 0.85709256, + "learning_rate": 0.0009434054763035314, + "loss": 0.86774307, + "num_input_tokens_seen": 76674096, + "router_z_loss_mlp": 0.52294922, + "step": 927, + "time_per_iteration": 2.635576009750366 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010578, + "balance_loss_mlp": 1.00520515, + "epoch": 0.17853020392458638, + "flos": 760854101760.0, + "grad_norm": 0.029435711646972902, + "language_loss": 0.86359227, + "learning_rate": 0.0009432614164518185, + "loss": 0.8741703, + "num_input_tokens_seen": 76752144, + "router_z_loss_mlp": 0.52685547, + "step": 928, + "time_per_iteration": 2.945253849029541 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074963, + "balance_loss_mlp": 1.02203369, + "epoch": 0.17872258560984994, + "flos": 784056450048.0, + "grad_norm": 0.039066121455708196, + "language_loss": 0.84876156, + "learning_rate": 0.000943117184510762, + "loss": 0.85951114, + "num_input_tokens_seen": 76830240, + "router_z_loss_mlp": 0.53027344, + "step": 929, + "time_per_iteration": 3.0016870498657227 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092369, + "balance_loss_mlp": 1.04201508, + "epoch": 0.1789149672951135, + "flos": 1463034021120.0, + "grad_norm": 0.03241390760866092, + "language_loss": 0.78789961, + "learning_rate": 0.0009429727805363575, + "loss": 0.79882336, + "num_input_tokens_seen": 77062464, + "router_z_loss_mlp": 0.50390625, + "step": 930, + "time_per_iteration": 5.0408923625946045 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091695, + "balance_loss_mlp": 1.04005396, + "epoch": 0.17910734898037706, + "flos": 504931285248.0, + "grad_norm": 0.037670754636037675, + "language_loss": 0.90276599, + "learning_rate": 0.0009428282045846674, + "loss": 0.91368294, + "num_input_tokens_seen": 77136672, + "router_z_loss_mlp": 0.51708984, + "step": 931, + "time_per_iteration": 2.699357509613037 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093914, + "balance_loss_mlp": 1.04260671, + "epoch": 0.17929973066564064, + "flos": 747670880256.0, + "grad_norm": 0.03557447538434831, + "language_loss": 0.91468316, + "learning_rate": 0.0009426834567118214, + "loss": 0.92562228, + "num_input_tokens_seen": 77227040, + "router_z_loss_mlp": 0.51367188, + "step": 932, + "time_per_iteration": 3.0888116359710693 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095199, + "balance_loss_mlp": 1.04370034, + "epoch": 0.1794921123509042, + "flos": 714573826560.0, + "grad_norm": 0.03713873812168088, + "language_loss": 0.82311261, + "learning_rate": 0.0009425385369740155, + "loss": 0.8340646, + "num_input_tokens_seen": 77319392, + "router_z_loss_mlp": 0.515625, + "step": 933, + "time_per_iteration": 3.0156304836273193 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109606, + "balance_loss_mlp": 1.04465711, + "epoch": 0.17968449403616776, + "flos": 634362463488.0, + "grad_norm": 0.04581160448205157, + "language_loss": 0.89044029, + "learning_rate": 0.0009423934454275125, + "loss": 0.90140092, + "num_input_tokens_seen": 77394688, + "router_z_loss_mlp": 0.51464844, + "step": 934, + "time_per_iteration": 2.8524558544158936 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095874, + "balance_loss_mlp": 1.04428041, + "epoch": 0.17987687572143132, + "flos": 537378997248.0, + "grad_norm": 0.045982575553228676, + "language_loss": 0.93734717, + "learning_rate": 0.0009422481821286418, + "loss": 0.94830596, + "num_input_tokens_seen": 77468288, + "router_z_loss_mlp": 0.51660156, + "step": 935, + "time_per_iteration": 2.7354249954223633 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096416, + "balance_loss_mlp": 1.0448221, + "epoch": 0.18006925740669488, + "flos": 539119975680.0, + "grad_norm": 0.04748543050697339, + "language_loss": 0.89948702, + "learning_rate": 0.0009421027471337998, + "loss": 0.91045117, + "num_input_tokens_seen": 77535840, + "router_z_loss_mlp": 0.51660156, + "step": 936, + "time_per_iteration": 2.660287380218506 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095184, + "balance_loss_mlp": 1.04363835, + "epoch": 0.18026163909195844, + "flos": 540535310592.0, + "grad_norm": 0.04911488628490749, + "language_loss": 0.84066534, + "learning_rate": 0.0009419571404994493, + "loss": 0.8516171, + "num_input_tokens_seen": 77604000, + "router_z_loss_mlp": 0.51611328, + "step": 937, + "time_per_iteration": 2.624769687652588 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090343, + "balance_loss_mlp": 1.03865409, + "epoch": 0.180454020777222, + "flos": 501683598336.0, + "grad_norm": 0.0468107226861285, + "language_loss": 0.92304778, + "learning_rate": 0.00094181136228212, + "loss": 0.9339512, + "num_input_tokens_seen": 77671488, + "router_z_loss_mlp": 0.51757812, + "step": 938, + "time_per_iteration": 2.6784133911132812 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092284, + "balance_loss_mlp": 1.04069054, + "epoch": 0.18064640246248556, + "flos": 500007748608.0, + "grad_norm": 0.039466745711782485, + "language_loss": 0.87082231, + "learning_rate": 0.0009416654125384077, + "loss": 0.8817451, + "num_input_tokens_seen": 77746240, + "router_z_loss_mlp": 0.51660156, + "step": 939, + "time_per_iteration": 2.7231576442718506 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081085, + "balance_loss_mlp": 1.03034973, + "epoch": 0.18083878414774912, + "flos": 1522293383424.0, + "grad_norm": 0.016406546431804496, + "language_loss": 0.79772377, + "learning_rate": 0.0009415192913249752, + "loss": 0.80853462, + "num_input_tokens_seen": 77966080, + "router_z_loss_mlp": 0.5078125, + "step": 940, + "time_per_iteration": 4.919930934906006 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01329403, + "balance_loss_mlp": 1.27490067, + "epoch": 0.1810311658330127, + "flos": 728666904576.0, + "grad_norm": 0.12503564718566265, + "language_loss": 0.85519916, + "learning_rate": 0.000941372998698552, + "loss": 0.8684932, + "num_input_tokens_seen": 78049200, + "router_z_loss_mlp": 0.54638672, + "step": 941, + "time_per_iteration": 2.9731380939483643 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093385, + "balance_loss_mlp": 1.04121876, + "epoch": 0.18122354751827627, + "flos": 566045353728.0, + "grad_norm": 0.05253753965114479, + "language_loss": 0.83319217, + "learning_rate": 0.0009412265347159336, + "loss": 0.84412599, + "num_input_tokens_seen": 78122752, + "router_z_loss_mlp": 0.52246094, + "step": 942, + "time_per_iteration": 2.7150988578796387 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103842, + "balance_loss_mlp": 1.05162799, + "epoch": 0.18141592920353983, + "flos": 520318189056.0, + "grad_norm": 0.046885904923641086, + "language_loss": 0.86687338, + "learning_rate": 0.0009410798994339829, + "loss": 0.87791175, + "num_input_tokens_seen": 78194064, + "router_z_loss_mlp": 0.52294922, + "step": 943, + "time_per_iteration": 2.598576545715332 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111341, + "balance_loss_mlp": 1.05831623, + "epoch": 0.1816083108888034, + "flos": 513477729792.0, + "grad_norm": 0.04639702407841738, + "language_loss": 0.8991158, + "learning_rate": 0.000940933092909628, + "loss": 0.91022921, + "num_input_tokens_seen": 78262048, + "router_z_loss_mlp": 0.53125, + "step": 944, + "time_per_iteration": 2.611694574356079 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104212, + "balance_loss_mlp": 1.05109203, + "epoch": 0.18180069257406695, + "flos": 493373369088.0, + "grad_norm": 0.04493061679832577, + "language_loss": 0.85416293, + "learning_rate": 0.0009407861151998649, + "loss": 0.86520505, + "num_input_tokens_seen": 78330624, + "router_z_loss_mlp": 0.53222656, + "step": 945, + "time_per_iteration": 2.5710983276367188 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110023, + "balance_loss_mlp": 1.04692006, + "epoch": 0.1819930742593305, + "flos": 571231350528.0, + "grad_norm": 0.04259629183686275, + "language_loss": 0.87787771, + "learning_rate": 0.0009406389663617552, + "loss": 0.88888001, + "num_input_tokens_seen": 78400672, + "router_z_loss_mlp": 0.53417969, + "step": 946, + "time_per_iteration": 2.6741456985473633 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100116, + "balance_loss_mlp": 1.04661465, + "epoch": 0.18218545594459407, + "flos": 607111441920.0, + "grad_norm": 0.04866460503106345, + "language_loss": 0.87927794, + "learning_rate": 0.000940491646452427, + "loss": 0.89027911, + "num_input_tokens_seen": 78467952, + "router_z_loss_mlp": 0.53613281, + "step": 947, + "time_per_iteration": 2.718358278274536 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101327, + "balance_loss_mlp": 1.04753995, + "epoch": 0.18237783762985763, + "flos": 549739845120.0, + "grad_norm": 0.042994543525894185, + "language_loss": 0.92601323, + "learning_rate": 0.000940344155529075, + "loss": 0.93702656, + "num_input_tokens_seen": 78538928, + "router_z_loss_mlp": 0.5390625, + "step": 948, + "time_per_iteration": 2.624303102493286 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097087, + "balance_loss_mlp": 1.04325247, + "epoch": 0.1825702193151212, + "flos": 451675435776.0, + "grad_norm": 0.046415524987670945, + "language_loss": 0.89178842, + "learning_rate": 0.0009401964936489605, + "loss": 0.90275931, + "num_input_tokens_seen": 78602144, + "router_z_loss_mlp": 0.53955078, + "step": 949, + "time_per_iteration": 2.5104119777679443 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088983, + "balance_loss_mlp": 1.03524303, + "epoch": 0.18276260100038477, + "flos": 590385025536.0, + "grad_norm": 0.0430347708706334, + "language_loss": 0.86972219, + "learning_rate": 0.0009400486608694108, + "loss": 0.88061202, + "num_input_tokens_seen": 78673152, + "router_z_loss_mlp": 0.53857422, + "step": 950, + "time_per_iteration": 2.744044065475464 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085663, + "balance_loss_mlp": 1.03154159, + "epoch": 0.18295498268564833, + "flos": 788710723584.0, + "grad_norm": 0.040810758702646055, + "language_loss": 0.88588369, + "learning_rate": 0.0009399006572478195, + "loss": 0.89674032, + "num_input_tokens_seen": 78753872, + "router_z_loss_mlp": 0.54248047, + "step": 951, + "time_per_iteration": 3.0828475952148438 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079439, + "balance_loss_mlp": 1.02493632, + "epoch": 0.1831473643709119, + "flos": 579226629888.0, + "grad_norm": 0.03747434947067488, + "language_loss": 0.92113942, + "learning_rate": 0.0009397524828416468, + "loss": 0.93193376, + "num_input_tokens_seen": 78822640, + "router_z_loss_mlp": 0.54638672, + "step": 952, + "time_per_iteration": 2.6881086826324463 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089843, + "balance_loss_mlp": 1.03405273, + "epoch": 0.18333974605617545, + "flos": 567964221696.0, + "grad_norm": 0.0419825959367211, + "language_loss": 0.97306633, + "learning_rate": 0.0009396041377084192, + "loss": 0.9839648, + "num_input_tokens_seen": 78893792, + "router_z_loss_mlp": 0.55957031, + "step": 953, + "time_per_iteration": 2.673654556274414 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097804, + "balance_loss_mlp": 1.04191864, + "epoch": 0.183532127741439, + "flos": 528070450176.0, + "grad_norm": 0.04203850234568462, + "language_loss": 0.89016271, + "learning_rate": 0.0009394556219057295, + "loss": 0.90114069, + "num_input_tokens_seen": 78964752, + "router_z_loss_mlp": 0.56054688, + "step": 954, + "time_per_iteration": 2.7255043983459473 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107099, + "balance_loss_mlp": 1.01610565, + "epoch": 0.18372450942670257, + "flos": 595644899328.0, + "grad_norm": 0.03789415730727427, + "language_loss": 0.84751296, + "learning_rate": 0.0009393069354912362, + "loss": 0.85822284, + "num_input_tokens_seen": 79034400, + "router_z_loss_mlp": 0.55029297, + "step": 955, + "time_per_iteration": 2.7474210262298584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084139, + "balance_loss_mlp": 1.02963698, + "epoch": 0.18391689111196613, + "flos": 646284907008.0, + "grad_norm": 0.04389714766773939, + "language_loss": 0.83882308, + "learning_rate": 0.0009391580785226649, + "loss": 0.84966445, + "num_input_tokens_seen": 79109488, + "router_z_loss_mlp": 0.54638672, + "step": 956, + "time_per_iteration": 2.844409465789795 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081024, + "balance_loss_mlp": 1.02990723, + "epoch": 0.18410927279722972, + "flos": 1460394846720.0, + "grad_norm": 0.013082177800516761, + "language_loss": 0.79340446, + "learning_rate": 0.0009390090510578067, + "loss": 0.80421472, + "num_input_tokens_seen": 79327712, + "router_z_loss_mlp": 0.51171875, + "step": 957, + "time_per_iteration": 4.792405843734741 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084483, + "balance_loss_mlp": 1.030267, + "epoch": 0.18430165448249328, + "flos": 660004709376.0, + "grad_norm": 0.04089111102732722, + "language_loss": 0.88231802, + "learning_rate": 0.0009388598531545196, + "loss": 0.89316285, + "num_input_tokens_seen": 79401504, + "router_z_loss_mlp": 0.54345703, + "step": 958, + "time_per_iteration": 2.900062084197998 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084489, + "balance_loss_mlp": 1.03017747, + "epoch": 0.18449403616775684, + "flos": 518950486272.0, + "grad_norm": 0.045948437313162956, + "language_loss": 0.87467843, + "learning_rate": 0.000938710484870727, + "loss": 0.88552332, + "num_input_tokens_seen": 79466688, + "router_z_loss_mlp": 0.54443359, + "step": 959, + "time_per_iteration": 2.5785140991210938 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085738, + "balance_loss_mlp": 1.031569, + "epoch": 0.1846864178530204, + "flos": 553825456896.0, + "grad_norm": 0.04362127254920589, + "language_loss": 0.87369549, + "learning_rate": 0.0009385609462644189, + "loss": 0.88455284, + "num_input_tokens_seen": 79540288, + "router_z_loss_mlp": 0.54296875, + "step": 960, + "time_per_iteration": 2.686221122741699 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082294, + "balance_loss_mlp": 1.02774417, + "epoch": 0.18487879953828396, + "flos": 467116774656.0, + "grad_norm": 0.04468558895083242, + "language_loss": 0.86931455, + "learning_rate": 0.0009384112373936514, + "loss": 0.88013744, + "num_input_tokens_seen": 79611872, + "router_z_loss_mlp": 0.546875, + "step": 961, + "time_per_iteration": 2.633582830429077 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064427, + "balance_loss_mlp": 1.00935197, + "epoch": 0.18507118122354752, + "flos": 649684238592.0, + "grad_norm": 0.03687654302408078, + "language_loss": 0.9259429, + "learning_rate": 0.0009382613583165467, + "loss": 0.93658715, + "num_input_tokens_seen": 79689504, + "router_z_loss_mlp": 0.55224609, + "step": 962, + "time_per_iteration": 2.7910635471343994 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01458915, + "balance_loss_mlp": 1.40078855, + "epoch": 0.18526356290881107, + "flos": 627923470080.0, + "grad_norm": 0.09306974449566385, + "language_loss": 0.90611041, + "learning_rate": 0.0009381113090912928, + "loss": 0.92069954, + "num_input_tokens_seen": 79759264, + "router_z_loss_mlp": 0.57958984, + "step": 963, + "time_per_iteration": 2.7445125579833984 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078714, + "balance_loss_mlp": 1.02464056, + "epoch": 0.18545594459407463, + "flos": 433646445312.0, + "grad_norm": 0.04076594680163087, + "language_loss": 0.91471934, + "learning_rate": 0.000937961089776144, + "loss": 0.92550647, + "num_input_tokens_seen": 79824464, + "router_z_loss_mlp": 0.54199219, + "step": 964, + "time_per_iteration": 2.5835955142974854 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089429, + "balance_loss_mlp": 1.03607059, + "epoch": 0.1856483262793382, + "flos": 750427673088.0, + "grad_norm": 0.041116434601540804, + "language_loss": 0.8449949, + "learning_rate": 0.0009378107004294208, + "loss": 0.8558892, + "num_input_tokens_seen": 79907152, + "router_z_loss_mlp": 0.53466797, + "step": 965, + "time_per_iteration": 2.9773664474487305 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090833, + "balance_loss_mlp": 1.03790379, + "epoch": 0.18584070796460178, + "flos": 531402707712.0, + "grad_norm": 0.04029010126422192, + "language_loss": 0.93043375, + "learning_rate": 0.0009376601411095096, + "loss": 0.94134206, + "num_input_tokens_seen": 79976944, + "router_z_loss_mlp": 0.53027344, + "step": 966, + "time_per_iteration": 2.6703643798828125 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088702, + "balance_loss_mlp": 1.03639269, + "epoch": 0.18603308964986534, + "flos": 484084263936.0, + "grad_norm": 0.03934020689435504, + "language_loss": 0.87718618, + "learning_rate": 0.0009375094118748622, + "loss": 0.88807321, + "num_input_tokens_seen": 80042112, + "router_z_loss_mlp": 0.52392578, + "step": 967, + "time_per_iteration": 2.5719969272613525 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091813, + "balance_loss_mlp": 1.03974187, + "epoch": 0.1862254713351289, + "flos": 802682292480.0, + "grad_norm": 0.042176858736630414, + "language_loss": 0.92643285, + "learning_rate": 0.0009373585127839976, + "loss": 0.93735105, + "num_input_tokens_seen": 80118896, + "router_z_loss_mlp": 0.52148438, + "step": 968, + "time_per_iteration": 2.956153392791748 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096332, + "balance_loss_mlp": 1.04483318, + "epoch": 0.18641785302039246, + "flos": 479290984704.0, + "grad_norm": 0.04307464179422831, + "language_loss": 0.92206955, + "learning_rate": 0.0009372074438954994, + "loss": 0.93303293, + "num_input_tokens_seen": 80183360, + "router_z_loss_mlp": 0.515625, + "step": 969, + "time_per_iteration": 2.512662410736084 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092255, + "balance_loss_mlp": 1.04085171, + "epoch": 0.18661023470565602, + "flos": 389779822848.0, + "grad_norm": 0.044792080488554424, + "language_loss": 0.93312657, + "learning_rate": 0.0009370562052680181, + "loss": 0.94404912, + "num_input_tokens_seen": 80247024, + "router_z_loss_mlp": 0.51464844, + "step": 970, + "time_per_iteration": 2.4642274379730225 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109029, + "balance_loss_mlp": 1.03926873, + "epoch": 0.18680261639091958, + "flos": 565776090624.0, + "grad_norm": 0.03666794569701081, + "language_loss": 0.90593827, + "learning_rate": 0.0009369047969602695, + "loss": 0.91684115, + "num_input_tokens_seen": 80318256, + "router_z_loss_mlp": 0.51074219, + "step": 971, + "time_per_iteration": 2.6925313472747803 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090519, + "balance_loss_mlp": 1.03968859, + "epoch": 0.18699499807618314, + "flos": 480230976768.0, + "grad_norm": 0.04959033368050126, + "language_loss": 0.88274431, + "learning_rate": 0.0009367532190310357, + "loss": 0.89364946, + "num_input_tokens_seen": 80384848, + "router_z_loss_mlp": 0.50878906, + "step": 972, + "time_per_iteration": 2.5632824897766113 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095286, + "balance_loss_mlp": 1.04464579, + "epoch": 0.1871873797614467, + "flos": 554328989952.0, + "grad_norm": 0.047101191533600484, + "language_loss": 0.90956879, + "learning_rate": 0.0009366014715391644, + "loss": 0.92052168, + "num_input_tokens_seen": 80453088, + "router_z_loss_mlp": 0.50683594, + "step": 973, + "time_per_iteration": 2.6131792068481445 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087331, + "balance_loss_mlp": 1.03669059, + "epoch": 0.18737976144671029, + "flos": 553953768960.0, + "grad_norm": 0.03277863870695053, + "language_loss": 0.85193431, + "learning_rate": 0.0009364495545435693, + "loss": 0.86280763, + "num_input_tokens_seen": 80528608, + "router_z_loss_mlp": 0.50683594, + "step": 974, + "time_per_iteration": 2.768160820007324 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077828, + "balance_loss_mlp": 1.02647221, + "epoch": 0.18757214313197385, + "flos": 503248632576.0, + "grad_norm": 0.03709252074476072, + "language_loss": 0.90046728, + "learning_rate": 0.0009362974681032297, + "loss": 0.91124547, + "num_input_tokens_seen": 80599600, + "router_z_loss_mlp": 0.51416016, + "step": 975, + "time_per_iteration": 2.596752405166626 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01358762, + "balance_loss_mlp": 1.30464137, + "epoch": 0.1877645248172374, + "flos": 676292721408.0, + "grad_norm": 0.11355211768831018, + "language_loss": 0.89691889, + "learning_rate": 0.0009361452122771907, + "loss": 0.91050649, + "num_input_tokens_seen": 80677264, + "router_z_loss_mlp": 0.54248047, + "step": 976, + "time_per_iteration": 2.841670036315918 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087707, + "balance_loss_mlp": 1.03649426, + "epoch": 0.18795690650250096, + "flos": 405863700480.0, + "grad_norm": 0.05182073733860081, + "language_loss": 0.85757113, + "learning_rate": 0.0009359927871245635, + "loss": 0.86844826, + "num_input_tokens_seen": 80739776, + "router_z_loss_mlp": 0.51269531, + "step": 977, + "time_per_iteration": 2.4593758583068848 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110124, + "balance_loss_mlp": 1.04988456, + "epoch": 0.18814928818776452, + "flos": 639064369152.0, + "grad_norm": 0.04599902588150218, + "language_loss": 0.8843354, + "learning_rate": 0.0009358401927045246, + "loss": 0.89534783, + "num_input_tokens_seen": 80815200, + "router_z_loss_mlp": 0.51416016, + "step": 978, + "time_per_iteration": 2.8043553829193115 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103516, + "balance_loss_mlp": 1.05197036, + "epoch": 0.18834166987302808, + "flos": 1140117100800.0, + "grad_norm": 0.05109113713971293, + "language_loss": 0.89583617, + "learning_rate": 0.0009356874290763166, + "loss": 0.90687132, + "num_input_tokens_seen": 80905024, + "router_z_loss_mlp": 0.51611328, + "step": 979, + "time_per_iteration": 3.4783685207366943 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105346, + "balance_loss_mlp": 1.0536567, + "epoch": 0.18853405155829164, + "flos": 505816842240.0, + "grad_norm": 0.03906189308485337, + "language_loss": 0.90395761, + "learning_rate": 0.0009355344962992474, + "loss": 0.91501105, + "num_input_tokens_seen": 80976704, + "router_z_loss_mlp": 0.51757812, + "step": 980, + "time_per_iteration": 2.6457359790802 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103049, + "balance_loss_mlp": 1.05116904, + "epoch": 0.1887264332435552, + "flos": 609371504640.0, + "grad_norm": 0.038270487176229884, + "language_loss": 0.89782834, + "learning_rate": 0.0009353813944326908, + "loss": 0.9088589, + "num_input_tokens_seen": 81057152, + "router_z_loss_mlp": 0.51953125, + "step": 981, + "time_per_iteration": 2.923243761062622 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102538, + "balance_loss_mlp": 1.05070543, + "epoch": 0.1889188149288188, + "flos": 553593132288.0, + "grad_norm": 0.04212053297292714, + "language_loss": 0.84181225, + "learning_rate": 0.0009352281235360863, + "loss": 0.85283768, + "num_input_tokens_seen": 81131520, + "router_z_loss_mlp": 0.51904297, + "step": 982, + "time_per_iteration": 2.674790620803833 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103043, + "balance_loss_mlp": 1.05135345, + "epoch": 0.18911119661408235, + "flos": 419470742016.0, + "grad_norm": 0.03892833341753514, + "language_loss": 0.86323905, + "learning_rate": 0.0009350746836689389, + "loss": 0.87426949, + "num_input_tokens_seen": 81195952, + "router_z_loss_mlp": 0.51757812, + "step": 983, + "time_per_iteration": 2.5294649600982666 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103523, + "balance_loss_mlp": 1.05335999, + "epoch": 0.1893035782993459, + "flos": 1485320676864.0, + "grad_norm": 0.016207020064155576, + "language_loss": 0.81439221, + "learning_rate": 0.0009349210748908193, + "loss": 0.82542741, + "num_input_tokens_seen": 81427312, + "router_z_loss_mlp": 0.50195312, + "step": 984, + "time_per_iteration": 5.031845569610596 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094201, + "balance_loss_mlp": 1.04227316, + "epoch": 0.18949595998460947, + "flos": 509457246720.0, + "grad_norm": 0.045438139941342374, + "language_loss": 0.84563899, + "learning_rate": 0.0009347672972613634, + "loss": 0.85658097, + "num_input_tokens_seen": 81494256, + "router_z_loss_mlp": 0.52001953, + "step": 985, + "time_per_iteration": 2.6333274841308594 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090244, + "balance_loss_mlp": 1.0384593, + "epoch": 0.18968834166987303, + "flos": 532193000448.0, + "grad_norm": 0.03993027053802703, + "language_loss": 0.8704083, + "learning_rate": 0.0009346133508402735, + "loss": 0.8813107, + "num_input_tokens_seen": 81569312, + "router_z_loss_mlp": 0.51855469, + "step": 986, + "time_per_iteration": 2.751340389251709 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089761, + "balance_loss_mlp": 1.03797686, + "epoch": 0.1898807233551366, + "flos": 500754299904.0, + "grad_norm": 0.04595906606263721, + "language_loss": 0.85852754, + "learning_rate": 0.0009344592356873166, + "loss": 0.86942512, + "num_input_tokens_seen": 81637024, + "router_z_loss_mlp": 0.51855469, + "step": 987, + "time_per_iteration": 2.6785645484924316 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084349, + "balance_loss_mlp": 1.03223073, + "epoch": 0.19007310504040015, + "flos": 603360221952.0, + "grad_norm": 0.042275439246703725, + "language_loss": 0.79788595, + "learning_rate": 0.0009343049518623255, + "loss": 0.80872947, + "num_input_tokens_seen": 81709488, + "router_z_loss_mlp": 0.52197266, + "step": 988, + "time_per_iteration": 2.709439516067505 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01365061, + "balance_loss_mlp": 1.30979574, + "epoch": 0.1902654867256637, + "flos": 602765315328.0, + "grad_norm": 0.1049262798815586, + "language_loss": 0.8386007, + "learning_rate": 0.0009341504994251985, + "loss": 0.85225129, + "num_input_tokens_seen": 81787152, + "router_z_loss_mlp": 0.55419922, + "step": 989, + "time_per_iteration": 2.925954818725586 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089684, + "balance_loss_mlp": 1.03952026, + "epoch": 0.19045786841092727, + "flos": 1579234345728.0, + "grad_norm": 0.01847097645999908, + "language_loss": 0.73520499, + "learning_rate": 0.0009339958784358994, + "loss": 0.74610186, + "num_input_tokens_seen": 82030608, + "router_z_loss_mlp": 0.50195312, + "step": 990, + "time_per_iteration": 5.025054216384888 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101957, + "balance_loss_mlp": 1.04845631, + "epoch": 0.19065025009619085, + "flos": 683055412992.0, + "grad_norm": 0.039739471389523856, + "language_loss": 0.8281374, + "learning_rate": 0.0009338410889544574, + "loss": 0.83915699, + "num_input_tokens_seen": 82119872, + "router_z_loss_mlp": 0.53613281, + "step": 991, + "time_per_iteration": 3.0653748512268066 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112616, + "balance_loss_mlp": 1.05868626, + "epoch": 0.1908426317814544, + "flos": 603442847232.0, + "grad_norm": 0.04383499470371995, + "language_loss": 0.89543211, + "learning_rate": 0.000933686131040967, + "loss": 0.90655828, + "num_input_tokens_seen": 82195552, + "router_z_loss_mlp": 0.54052734, + "step": 992, + "time_per_iteration": 2.7901530265808105 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106006, + "balance_loss_mlp": 1.0517416, + "epoch": 0.19103501346671797, + "flos": 587434791936.0, + "grad_norm": 0.04122735235002176, + "language_loss": 0.92173266, + "learning_rate": 0.0009335310047555883, + "loss": 0.93279278, + "num_input_tokens_seen": 82267040, + "router_z_loss_mlp": 0.54394531, + "step": 993, + "time_per_iteration": 2.7153608798980713 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097163, + "balance_loss_mlp": 1.04285157, + "epoch": 0.19122739515198153, + "flos": 546835298304.0, + "grad_norm": 0.04052898350535971, + "language_loss": 0.89637405, + "learning_rate": 0.0009333757101585467, + "loss": 0.90734565, + "num_input_tokens_seen": 82337680, + "router_z_loss_mlp": 0.54443359, + "step": 994, + "time_per_iteration": 2.6286795139312744 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091826, + "balance_loss_mlp": 1.03732359, + "epoch": 0.1914197768372451, + "flos": 522550061568.0, + "grad_norm": 0.03850908176124289, + "language_loss": 0.94694555, + "learning_rate": 0.0009332202473101329, + "loss": 0.95786381, + "num_input_tokens_seen": 82409600, + "router_z_loss_mlp": 0.54638672, + "step": 995, + "time_per_iteration": 2.649850368499756 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072176, + "balance_loss_mlp": 1.01714945, + "epoch": 0.19161215852250865, + "flos": 612388812288.0, + "grad_norm": 0.03654296504823072, + "language_loss": 0.83743644, + "learning_rate": 0.0009330646162707028, + "loss": 0.84815824, + "num_input_tokens_seen": 82480288, + "router_z_loss_mlp": 0.55175781, + "step": 996, + "time_per_iteration": 2.7329981327056885 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059087, + "balance_loss_mlp": 1.0033443, + "epoch": 0.1918045402077722, + "flos": 848183935488.0, + "grad_norm": 0.03315860340701524, + "language_loss": 0.85236025, + "learning_rate": 0.0009329088171006779, + "loss": 0.8629511, + "num_input_tokens_seen": 82568960, + "router_z_loss_mlp": 0.55908203, + "step": 997, + "time_per_iteration": 3.135049343109131 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01290698, + "balance_loss_mlp": 1.2330482, + "epoch": 0.19199692189303577, + "flos": 466893198336.0, + "grad_norm": 0.06463762674453556, + "language_loss": 0.86239529, + "learning_rate": 0.0009327528498605446, + "loss": 0.87530231, + "num_input_tokens_seen": 82634128, + "router_z_loss_mlp": 0.57470703, + "step": 998, + "time_per_iteration": 2.5807580947875977 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072398, + "balance_loss_mlp": 1.01727533, + "epoch": 0.19218930357829936, + "flos": 532613908224.0, + "grad_norm": 0.04280698068802137, + "language_loss": 0.90856296, + "learning_rate": 0.0009325967146108548, + "loss": 0.91928697, + "num_input_tokens_seen": 82707472, + "router_z_loss_mlp": 0.55273438, + "step": 999, + "time_per_iteration": 2.637840986251831 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086346, + "balance_loss_mlp": 1.03217781, + "epoch": 0.19238168526356292, + "flos": 602728376832.0, + "grad_norm": 0.04847652630230049, + "language_loss": 0.88902158, + "learning_rate": 0.0009324404114122258, + "loss": 0.89988506, + "num_input_tokens_seen": 82775232, + "router_z_loss_mlp": 0.54296875, + "step": 1000, + "time_per_iteration": 4.1391942501068115 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090902, + "balance_loss_mlp": 1.03701913, + "epoch": 0.19257406694882648, + "flos": 573155076096.0, + "grad_norm": 0.04193719314851312, + "language_loss": 0.88362414, + "learning_rate": 0.0009322839403253397, + "loss": 0.89453316, + "num_input_tokens_seen": 82850032, + "router_z_loss_mlp": 0.54003906, + "step": 1001, + "time_per_iteration": 2.8266265392303467 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087834, + "balance_loss_mlp": 1.03395164, + "epoch": 0.19276644863409004, + "flos": 803157635328.0, + "grad_norm": 0.04353601683576214, + "language_loss": 0.85235333, + "learning_rate": 0.0009321273014109439, + "loss": 0.86323166, + "num_input_tokens_seen": 82926080, + "router_z_loss_mlp": 0.54003906, + "step": 1002, + "time_per_iteration": 2.9539175033569336 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094525, + "balance_loss_mlp": 1.04068995, + "epoch": 0.1929588303193536, + "flos": 564480319488.0, + "grad_norm": 0.03718563884895513, + "language_loss": 0.86078906, + "learning_rate": 0.0009319704947298513, + "loss": 0.87173432, + "num_input_tokens_seen": 83005200, + "router_z_loss_mlp": 0.53955078, + "step": 1003, + "time_per_iteration": 2.8760387897491455 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091693, + "balance_loss_mlp": 1.0380007, + "epoch": 0.19315121200461716, + "flos": 627988598784.0, + "grad_norm": 0.03744955738150477, + "language_loss": 0.89579475, + "learning_rate": 0.0009318135203429393, + "loss": 0.9067117, + "num_input_tokens_seen": 83077280, + "router_z_loss_mlp": 0.53808594, + "step": 1004, + "time_per_iteration": 2.7069175243377686 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094654, + "balance_loss_mlp": 1.04058087, + "epoch": 0.19334359368988072, + "flos": 518584013568.0, + "grad_norm": 0.03742742378220975, + "language_loss": 0.89228511, + "learning_rate": 0.0009316563783111511, + "loss": 0.90323162, + "num_input_tokens_seen": 83145456, + "router_z_loss_mlp": 0.54199219, + "step": 1005, + "time_per_iteration": 2.7024500370025635 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090205, + "balance_loss_mlp": 1.03598833, + "epoch": 0.19353597537514428, + "flos": 695400709632.0, + "grad_norm": 0.036019255491177425, + "language_loss": 0.83731771, + "learning_rate": 0.0009314990686954943, + "loss": 0.84821975, + "num_input_tokens_seen": 83225392, + "router_z_loss_mlp": 0.54345703, + "step": 1006, + "time_per_iteration": 2.901319980621338 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092974, + "balance_loss_mlp": 1.03866184, + "epoch": 0.19372835706040784, + "flos": 1212200981760.0, + "grad_norm": 0.03507497873235563, + "language_loss": 0.82359284, + "learning_rate": 0.000931341591557042, + "loss": 0.8345226, + "num_input_tokens_seen": 83331296, + "router_z_loss_mlp": 0.54443359, + "step": 1007, + "time_per_iteration": 3.70509672164917 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088892, + "balance_loss_mlp": 1.03467596, + "epoch": 0.19392073874567142, + "flos": 521685891840.0, + "grad_norm": 0.04354230775215961, + "language_loss": 0.88703787, + "learning_rate": 0.0009311839469569325, + "loss": 0.89792681, + "num_input_tokens_seen": 83399952, + "router_z_loss_mlp": 0.54345703, + "step": 1008, + "time_per_iteration": 2.632070302963257 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088222, + "balance_loss_mlp": 1.03386211, + "epoch": 0.19411312043093498, + "flos": 589911628032.0, + "grad_norm": 0.044503426382111445, + "language_loss": 0.88821465, + "learning_rate": 0.0009310261349563687, + "loss": 0.89909685, + "num_input_tokens_seen": 83468384, + "router_z_loss_mlp": 0.54492188, + "step": 1009, + "time_per_iteration": 2.7138211727142334 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061835, + "balance_loss_mlp": 1.0067606, + "epoch": 0.19430550211619854, + "flos": 580572945408.0, + "grad_norm": 0.029375689409949213, + "language_loss": 0.86173785, + "learning_rate": 0.0009308681556166186, + "loss": 0.87235624, + "num_input_tokens_seen": 83547952, + "router_z_loss_mlp": 0.55224609, + "step": 1010, + "time_per_iteration": 2.834946870803833 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.05162705, + "balance_loss_mlp": 5.08607721, + "epoch": 0.1944978838014621, + "flos": 622246579200.0, + "grad_norm": 0.2884784307389343, + "language_loss": 0.88793403, + "learning_rate": 0.0009307100089990152, + "loss": 0.93956107, + "num_input_tokens_seen": 83615712, + "router_z_loss_mlp": 0.76513672, + "step": 1011, + "time_per_iteration": 2.705335855484009 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094303, + "balance_loss_mlp": 1.04189909, + "epoch": 0.19469026548672566, + "flos": 599815081728.0, + "grad_norm": 0.04633555371791679, + "language_loss": 0.85740912, + "learning_rate": 0.0009305516951649568, + "loss": 0.86835217, + "num_input_tokens_seen": 83687296, + "router_z_loss_mlp": 0.52490234, + "step": 1012, + "time_per_iteration": 2.7048773765563965 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01164213, + "balance_loss_mlp": 1.11281013, + "epoch": 0.19488264717198922, + "flos": 553248046848.0, + "grad_norm": 0.04991787894778298, + "language_loss": 0.87912452, + "learning_rate": 0.0009303932141759057, + "loss": 0.89076668, + "num_input_tokens_seen": 83763168, + "router_z_loss_mlp": 0.51464844, + "step": 1013, + "time_per_iteration": 2.8072102069854736 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01211245, + "balance_loss_mlp": 1.15984225, + "epoch": 0.19507502885725278, + "flos": 667313708544.0, + "grad_norm": 0.06529111316537192, + "language_loss": 0.85445917, + "learning_rate": 0.0009302345660933902, + "loss": 0.86657166, + "num_input_tokens_seen": 83837312, + "router_z_loss_mlp": 0.51464844, + "step": 1014, + "time_per_iteration": 2.7895615100860596 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01244014, + "balance_loss_mlp": 1.19265878, + "epoch": 0.19526741054251634, + "flos": 672328618752.0, + "grad_norm": 0.06071591874537116, + "language_loss": 0.86587232, + "learning_rate": 0.0009300757509790026, + "loss": 0.87831247, + "num_input_tokens_seen": 83917120, + "router_z_loss_mlp": 0.51416016, + "step": 1015, + "time_per_iteration": 2.8867006301879883 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.012313, + "balance_loss_mlp": 1.18008745, + "epoch": 0.19545979222777993, + "flos": 448147792128.0, + "grad_norm": 0.057262662434688416, + "language_loss": 0.91914976, + "learning_rate": 0.0009299167688944005, + "loss": 0.93146276, + "num_input_tokens_seen": 83982992, + "router_z_loss_mlp": 0.51269531, + "step": 1016, + "time_per_iteration": 2.526421546936035 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01226901, + "balance_loss_mlp": 1.17568827, + "epoch": 0.1956521739130435, + "flos": 570169849344.0, + "grad_norm": 0.05343522997619492, + "language_loss": 0.87454194, + "learning_rate": 0.0009297576199013063, + "loss": 0.8868109, + "num_input_tokens_seen": 84057296, + "router_z_loss_mlp": 0.51269531, + "step": 1017, + "time_per_iteration": 2.7184784412384033 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.012071, + "balance_loss_mlp": 1.15884399, + "epoch": 0.19584455559830705, + "flos": 1458883280640.0, + "grad_norm": 0.03399393552013433, + "language_loss": 0.73002136, + "learning_rate": 0.0009295983040615071, + "loss": 0.74209231, + "num_input_tokens_seen": 84292640, + "router_z_loss_mlp": 0.48242188, + "step": 1018, + "time_per_iteration": 4.916393756866455 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01159874, + "balance_loss_mlp": 1.11199951, + "epoch": 0.1960369372835706, + "flos": 1594484189184.0, + "grad_norm": 0.02523442502037962, + "language_loss": 0.79426301, + "learning_rate": 0.0009294388214368547, + "loss": 0.80586171, + "num_input_tokens_seen": 84524448, + "router_z_loss_mlp": 0.47851562, + "step": 1019, + "time_per_iteration": 5.5991902351379395 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01163202, + "balance_loss_mlp": 1.11241901, + "epoch": 0.19622931896883417, + "flos": 617254023168.0, + "grad_norm": 0.06792637193668423, + "language_loss": 0.88615566, + "learning_rate": 0.0009292791720892659, + "loss": 0.89778763, + "num_input_tokens_seen": 84600208, + "router_z_loss_mlp": 0.50830078, + "step": 1020, + "time_per_iteration": 2.8419806957244873 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01132702, + "balance_loss_mlp": 1.08191884, + "epoch": 0.19642170065409773, + "flos": 467208148224.0, + "grad_norm": 0.044541966790476714, + "language_loss": 0.90245676, + "learning_rate": 0.0009291193560807218, + "loss": 0.91378373, + "num_input_tokens_seen": 84668032, + "router_z_loss_mlp": 0.50830078, + "step": 1021, + "time_per_iteration": 2.60357403755188 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0111942, + "balance_loss_mlp": 1.06858945, + "epoch": 0.19661408233936128, + "flos": 516288957696.0, + "grad_norm": 0.03957164107654416, + "language_loss": 0.88134921, + "learning_rate": 0.0009289593734732688, + "loss": 0.89254344, + "num_input_tokens_seen": 84738176, + "router_z_loss_mlp": 0.50878906, + "step": 1022, + "time_per_iteration": 2.6077988147735596 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115639, + "balance_loss_mlp": 1.06461763, + "epoch": 0.19680646402462484, + "flos": 393494104320.0, + "grad_norm": 0.03618938319364158, + "language_loss": 0.94921708, + "learning_rate": 0.0009287992243290175, + "loss": 0.96037352, + "num_input_tokens_seen": 84799936, + "router_z_loss_mlp": 0.51074219, + "step": 1023, + "time_per_iteration": 2.486910820007324 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104277, + "balance_loss_mlp": 1.05263603, + "epoch": 0.19699884570988843, + "flos": 627624071424.0, + "grad_norm": 0.04088238638674664, + "language_loss": 0.91379654, + "learning_rate": 0.0009286389087101435, + "loss": 0.92483938, + "num_input_tokens_seen": 84877216, + "router_z_loss_mlp": 0.51708984, + "step": 1024, + "time_per_iteration": 2.7762300968170166 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083626, + "balance_loss_mlp": 1.03126919, + "epoch": 0.197191227395152, + "flos": 559074637056.0, + "grad_norm": 0.038177798611856564, + "language_loss": 0.89866579, + "learning_rate": 0.0009284784266788864, + "loss": 0.90950203, + "num_input_tokens_seen": 84952464, + "router_z_loss_mlp": 0.52441406, + "step": 1025, + "time_per_iteration": 2.7595441341400146 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105402, + "balance_loss_mlp": 1.05275905, + "epoch": 0.19738360908041555, + "flos": 666250262016.0, + "grad_norm": 0.08120700653890094, + "language_loss": 0.93505025, + "learning_rate": 0.0009283177782975512, + "loss": 0.94610423, + "num_input_tokens_seen": 85031488, + "router_z_loss_mlp": 0.52734375, + "step": 1026, + "time_per_iteration": 2.9439735412597656 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01158523, + "balance_loss_mlp": 1.10511732, + "epoch": 0.1975759907656791, + "flos": 523511440896.0, + "grad_norm": 0.05175943009769999, + "language_loss": 0.89213437, + "learning_rate": 0.000928156963628507, + "loss": 0.9037196, + "num_input_tokens_seen": 85098384, + "router_z_loss_mlp": 0.53515625, + "step": 1027, + "time_per_iteration": 2.5648727416992188 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124606, + "balance_loss_mlp": 1.0717721, + "epoch": 0.19776837245094267, + "flos": 463485118464.0, + "grad_norm": 0.0380471847687272, + "language_loss": 0.89530945, + "learning_rate": 0.0009279959827341877, + "loss": 0.90655547, + "num_input_tokens_seen": 85172944, + "router_z_loss_mlp": 0.52929688, + "step": 1028, + "time_per_iteration": 2.7482099533081055 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01114747, + "balance_loss_mlp": 1.0622474, + "epoch": 0.19796075413620623, + "flos": 504058367232.0, + "grad_norm": 0.038077776452832945, + "language_loss": 0.88821751, + "learning_rate": 0.0009278348356770915, + "loss": 0.89936495, + "num_input_tokens_seen": 85241632, + "router_z_loss_mlp": 0.52587891, + "step": 1029, + "time_per_iteration": 2.5559866428375244 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125843, + "balance_loss_mlp": 1.07362974, + "epoch": 0.1981531358214698, + "flos": 508571689728.0, + "grad_norm": 0.03906482091144459, + "language_loss": 0.87010926, + "learning_rate": 0.0009276735225197814, + "loss": 0.88136768, + "num_input_tokens_seen": 85308992, + "router_z_loss_mlp": 0.52294922, + "step": 1030, + "time_per_iteration": 2.598353862762451 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116206, + "balance_loss_mlp": 1.06418335, + "epoch": 0.19834551750673335, + "flos": 532640153088.0, + "grad_norm": 0.039761606091750314, + "language_loss": 0.8715511, + "learning_rate": 0.0009275120433248847, + "loss": 0.88271314, + "num_input_tokens_seen": 85381936, + "router_z_loss_mlp": 0.52099609, + "step": 1031, + "time_per_iteration": 2.691051483154297 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105688, + "balance_loss_mlp": 1.05414224, + "epoch": 0.1985378991919969, + "flos": 776971027200.0, + "grad_norm": 0.03650424605094363, + "language_loss": 0.87217546, + "learning_rate": 0.0009273503981550931, + "loss": 0.88323236, + "num_input_tokens_seen": 85474352, + "router_z_loss_mlp": 0.51611328, + "step": 1032, + "time_per_iteration": 3.05829119682312 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094626, + "balance_loss_mlp": 1.04336572, + "epoch": 0.1987302808772605, + "flos": 435192037632.0, + "grad_norm": 0.04492232470085823, + "language_loss": 0.88675368, + "learning_rate": 0.0009271885870731626, + "loss": 0.89769995, + "num_input_tokens_seen": 85538416, + "router_z_loss_mlp": 0.51318359, + "step": 1033, + "time_per_iteration": 2.5097644329071045 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091384, + "balance_loss_mlp": 1.04036272, + "epoch": 0.19892266256252406, + "flos": 554654633472.0, + "grad_norm": 0.041410721104386976, + "language_loss": 0.89478087, + "learning_rate": 0.0009270266101419143, + "loss": 0.90569472, + "num_input_tokens_seen": 85604416, + "router_z_loss_mlp": 0.51074219, + "step": 1034, + "time_per_iteration": 2.6359710693359375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091336, + "balance_loss_mlp": 1.04026711, + "epoch": 0.19911504424778761, + "flos": 550949100288.0, + "grad_norm": 0.034987230226667505, + "language_loss": 0.86329561, + "learning_rate": 0.0009268644674242328, + "loss": 0.87420899, + "num_input_tokens_seen": 85677008, + "router_z_loss_mlp": 0.51123047, + "step": 1035, + "time_per_iteration": 2.679041624069214 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091574, + "balance_loss_mlp": 1.04045713, + "epoch": 0.19930742593305117, + "flos": 519313068288.0, + "grad_norm": 0.035495194235479824, + "language_loss": 0.81977046, + "learning_rate": 0.0009267021589830678, + "loss": 0.83068615, + "num_input_tokens_seen": 85745200, + "router_z_loss_mlp": 0.51171875, + "step": 1036, + "time_per_iteration": 2.6109251976013184 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01330025, + "balance_loss_mlp": 1.27871704, + "epoch": 0.19949980761831473, + "flos": 1512640717824.0, + "grad_norm": 0.0530000786951376, + "language_loss": 0.77627081, + "learning_rate": 0.0009265396848814328, + "loss": 0.78957105, + "num_input_tokens_seen": 85980608, + "router_z_loss_mlp": 0.51367188, + "step": 1037, + "time_per_iteration": 5.041083097457886 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097572, + "balance_loss_mlp": 1.04635978, + "epoch": 0.1996921893035783, + "flos": 699440634624.0, + "grad_norm": 0.03827221066614039, + "language_loss": 0.93735194, + "learning_rate": 0.000926377045182406, + "loss": 0.94832766, + "num_input_tokens_seen": 86055952, + "router_z_loss_mlp": 0.51269531, + "step": 1038, + "time_per_iteration": 2.921194314956665 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106072, + "balance_loss_mlp": 1.05443072, + "epoch": 0.19988457098884185, + "flos": 728395696128.0, + "grad_norm": 0.0388450926907903, + "language_loss": 0.89164472, + "learning_rate": 0.0009262142399491296, + "loss": 0.90270543, + "num_input_tokens_seen": 86145536, + "router_z_loss_mlp": 0.51708984, + "step": 1039, + "time_per_iteration": 3.0543293952941895 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102477, + "balance_loss_mlp": 1.05093122, + "epoch": 0.2000769526741054, + "flos": 561625350144.0, + "grad_norm": 0.04341407711707897, + "language_loss": 0.8911137, + "learning_rate": 0.0009260512692448105, + "loss": 0.90213847, + "num_input_tokens_seen": 86214480, + "router_z_loss_mlp": 0.51611328, + "step": 1040, + "time_per_iteration": 2.6906111240386963 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097091, + "balance_loss_mlp": 1.04549766, + "epoch": 0.200269334359369, + "flos": 573165769728.0, + "grad_norm": 0.03433464693573298, + "language_loss": 0.85109496, + "learning_rate": 0.000925888133132719, + "loss": 0.86206591, + "num_input_tokens_seen": 86289824, + "router_z_loss_mlp": 0.51660156, + "step": 1041, + "time_per_iteration": 2.77327561378479 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112812, + "balance_loss_mlp": 1.06465149, + "epoch": 0.20046171604463256, + "flos": 1489155500544.0, + "grad_norm": 0.023433110981570023, + "language_loss": 0.79610431, + "learning_rate": 0.0009257248316761906, + "loss": 0.8072325, + "num_input_tokens_seen": 86516384, + "router_z_loss_mlp": 0.48144531, + "step": 1042, + "time_per_iteration": 4.926042318344116 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116989, + "balance_loss_mlp": 1.06525254, + "epoch": 0.20065409772989612, + "flos": 497578544640.0, + "grad_norm": 0.04254485219096875, + "language_loss": 0.82304472, + "learning_rate": 0.0009255613649386244, + "loss": 0.83421457, + "num_input_tokens_seen": 86587296, + "router_z_loss_mlp": 0.51806641, + "step": 1043, + "time_per_iteration": 2.6593456268310547 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0111366, + "balance_loss_mlp": 1.06144655, + "epoch": 0.20084647941515968, + "flos": 580464075264.0, + "grad_norm": 0.040062947145422745, + "language_loss": 0.79980814, + "learning_rate": 0.0009253977329834838, + "loss": 0.81094474, + "num_input_tokens_seen": 86662656, + "router_z_loss_mlp": 0.52294922, + "step": 1044, + "time_per_iteration": 2.765777111053467 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110762, + "balance_loss_mlp": 1.0584054, + "epoch": 0.20103886110042324, + "flos": 643288986624.0, + "grad_norm": 0.040441822708095716, + "language_loss": 0.87291706, + "learning_rate": 0.0009252339358742965, + "loss": 0.88402474, + "num_input_tokens_seen": 86734704, + "router_z_loss_mlp": 0.52441406, + "step": 1045, + "time_per_iteration": 2.825388193130493 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105894, + "balance_loss_mlp": 1.05353701, + "epoch": 0.2012312427856868, + "flos": 442970543616.0, + "grad_norm": 0.03567593499019723, + "language_loss": 0.84250462, + "learning_rate": 0.000925069973674654, + "loss": 0.85356355, + "num_input_tokens_seen": 86806512, + "router_z_loss_mlp": 0.52441406, + "step": 1046, + "time_per_iteration": 2.609393358230591 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103182, + "balance_loss_mlp": 1.05082524, + "epoch": 0.20142362447095036, + "flos": 555473116416.0, + "grad_norm": 0.03147198417726023, + "language_loss": 0.89562172, + "learning_rate": 0.000924905846448212, + "loss": 0.90665352, + "num_input_tokens_seen": 86883440, + "router_z_loss_mlp": 0.52441406, + "step": 1047, + "time_per_iteration": 2.7771337032318115 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108941, + "balance_loss_mlp": 1.0364331, + "epoch": 0.20161600615621392, + "flos": 671555822592.0, + "grad_norm": 0.0352448826174341, + "language_loss": 0.86282432, + "learning_rate": 0.0009247415542586906, + "loss": 0.87371844, + "num_input_tokens_seen": 86960208, + "router_z_loss_mlp": 0.53076172, + "step": 1048, + "time_per_iteration": 2.8992083072662354 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089273, + "balance_loss_mlp": 1.03624833, + "epoch": 0.2018083878414775, + "flos": 574307950848.0, + "grad_norm": 0.02930747529675645, + "language_loss": 0.83574796, + "learning_rate": 0.0009245770971698735, + "loss": 0.84664071, + "num_input_tokens_seen": 87044144, + "router_z_loss_mlp": 0.53125, + "step": 1049, + "time_per_iteration": 2.890824317932129 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092759, + "balance_loss_mlp": 1.03992498, + "epoch": 0.20200076952674106, + "flos": 426795292416.0, + "grad_norm": 0.03785140598382088, + "language_loss": 0.89288604, + "learning_rate": 0.0009244124752456087, + "loss": 0.9038136, + "num_input_tokens_seen": 87109136, + "router_z_loss_mlp": 0.52929688, + "step": 1050, + "time_per_iteration": 2.5022785663604736 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078262, + "balance_loss_mlp": 1.02566695, + "epoch": 0.20219315121200462, + "flos": 537685198848.0, + "grad_norm": 0.03140637951028952, + "language_loss": 0.86254251, + "learning_rate": 0.0009242476885498081, + "loss": 0.87332511, + "num_input_tokens_seen": 87184320, + "router_z_loss_mlp": 0.52685547, + "step": 1051, + "time_per_iteration": 2.732915163040161 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080259, + "balance_loss_mlp": 1.02771127, + "epoch": 0.20238553289726818, + "flos": 478835083776.0, + "grad_norm": 0.042472274730814934, + "language_loss": 0.82148528, + "learning_rate": 0.0009240827371464474, + "loss": 0.83228779, + "num_input_tokens_seen": 87248224, + "router_z_loss_mlp": 0.52636719, + "step": 1052, + "time_per_iteration": 2.577660322189331 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076456, + "balance_loss_mlp": 1.02448094, + "epoch": 0.20257791458253174, + "flos": 1153847596800.0, + "grad_norm": 0.038862673250338535, + "language_loss": 0.85609984, + "learning_rate": 0.0009239176210995666, + "loss": 0.86686444, + "num_input_tokens_seen": 87333088, + "router_z_loss_mlp": 0.52050781, + "step": 1053, + "time_per_iteration": 3.517408609390259 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076589, + "balance_loss_mlp": 1.02485228, + "epoch": 0.2027702962677953, + "flos": 668149688064.0, + "grad_norm": 0.03591644261584591, + "language_loss": 0.94691521, + "learning_rate": 0.0009237523404732695, + "loss": 0.95768112, + "num_input_tokens_seen": 87413840, + "router_z_loss_mlp": 0.51806641, + "step": 1054, + "time_per_iteration": 2.9073944091796875 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010707, + "balance_loss_mlp": 1.01934481, + "epoch": 0.20296267795305886, + "flos": 642453007104.0, + "grad_norm": 0.03829830750428097, + "language_loss": 0.85043323, + "learning_rate": 0.0009235868953317235, + "loss": 0.86114025, + "num_input_tokens_seen": 87487168, + "router_z_loss_mlp": 0.51416016, + "step": 1055, + "time_per_iteration": 2.8769731521606445 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063995, + "balance_loss_mlp": 1.01249659, + "epoch": 0.20315505963832242, + "flos": 932130967296.0, + "grad_norm": 0.03371739794492534, + "language_loss": 0.86243355, + "learning_rate": 0.0009234212857391602, + "loss": 0.87307346, + "num_input_tokens_seen": 87573184, + "router_z_loss_mlp": 0.515625, + "step": 1056, + "time_per_iteration": 3.1701345443725586 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062013, + "balance_loss_mlp": 1.01075327, + "epoch": 0.20334744132358598, + "flos": 563288560896.0, + "grad_norm": 0.028023058598955305, + "language_loss": 0.9034453, + "learning_rate": 0.000923255511759875, + "loss": 0.91406548, + "num_input_tokens_seen": 87651968, + "router_z_loss_mlp": 0.51318359, + "step": 1057, + "time_per_iteration": 2.8186585903167725 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105938, + "balance_loss_mlp": 1.00840592, + "epoch": 0.20353982300884957, + "flos": 645429485568.0, + "grad_norm": 0.03599363132321351, + "language_loss": 0.85699975, + "learning_rate": 0.000923089573458227, + "loss": 0.86759359, + "num_input_tokens_seen": 87727792, + "router_z_loss_mlp": 0.51025391, + "step": 1058, + "time_per_iteration": 2.829428195953369 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063312, + "balance_loss_mlp": 1.01248097, + "epoch": 0.20373220469411313, + "flos": 652706403840.0, + "grad_norm": 0.03721325608628497, + "language_loss": 0.84890962, + "learning_rate": 0.0009229234708986392, + "loss": 0.85954273, + "num_input_tokens_seen": 87806048, + "router_z_loss_mlp": 0.50878906, + "step": 1059, + "time_per_iteration": 2.9125583171844482 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01119614, + "balance_loss_mlp": 1.06964111, + "epoch": 0.2039245863793767, + "flos": 1440399367680.0, + "grad_norm": 0.026200157549973457, + "language_loss": 0.81666899, + "learning_rate": 0.0009227572041455982, + "loss": 0.82786512, + "num_input_tokens_seen": 88018160, + "router_z_loss_mlp": 0.49902344, + "step": 1060, + "time_per_iteration": 4.70502233505249 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105542, + "balance_loss_mlp": 1.00468493, + "epoch": 0.20411696806464025, + "flos": 598128538368.0, + "grad_norm": 0.03644056871626998, + "language_loss": 0.85909504, + "learning_rate": 0.0009225907732636548, + "loss": 0.86964923, + "num_input_tokens_seen": 88090864, + "router_z_loss_mlp": 0.5078125, + "step": 1061, + "time_per_iteration": 2.7681198120117188 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01057413, + "balance_loss_mlp": 1.00672543, + "epoch": 0.2043093497499038, + "flos": 574897999872.0, + "grad_norm": 0.03243635340085092, + "language_loss": 0.87862682, + "learning_rate": 0.0009224241783174227, + "loss": 0.88920105, + "num_input_tokens_seen": 88161360, + "router_z_loss_mlp": 0.50732422, + "step": 1062, + "time_per_iteration": 2.682659864425659 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01058744, + "balance_loss_mlp": 1.00819898, + "epoch": 0.20450173143516737, + "flos": 631524990720.0, + "grad_norm": 0.033151959510572516, + "language_loss": 0.86810422, + "learning_rate": 0.0009222574193715802, + "loss": 0.87869167, + "num_input_tokens_seen": 88234960, + "router_z_loss_mlp": 0.50585938, + "step": 1063, + "time_per_iteration": 2.7470076084136963 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01057209, + "balance_loss_mlp": 1.00656855, + "epoch": 0.20469411312043093, + "flos": 575147821056.0, + "grad_norm": 0.03442752078644266, + "language_loss": 0.86910367, + "learning_rate": 0.000922090496490869, + "loss": 0.87967575, + "num_input_tokens_seen": 88308176, + "router_z_loss_mlp": 0.50683594, + "step": 1064, + "time_per_iteration": 2.789161443710327 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055147, + "balance_loss_mlp": 1.00465047, + "epoch": 0.20488649480569449, + "flos": 638280879360.0, + "grad_norm": 0.029149473365885022, + "language_loss": 0.90671569, + "learning_rate": 0.0009219234097400937, + "loss": 0.91726714, + "num_input_tokens_seen": 88386768, + "router_z_loss_mlp": 0.50537109, + "step": 1065, + "time_per_iteration": 2.8469130992889404 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055191, + "balance_loss_mlp": 1.00483656, + "epoch": 0.20507887649095807, + "flos": 977439169536.0, + "grad_norm": 0.03225683406068631, + "language_loss": 0.83590472, + "learning_rate": 0.0009217561591841237, + "loss": 0.84645659, + "num_input_tokens_seen": 88476576, + "router_z_loss_mlp": 0.50390625, + "step": 1066, + "time_per_iteration": 3.331498622894287 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105396, + "balance_loss_mlp": 1.00332034, + "epoch": 0.20527125817622163, + "flos": 487156006656.0, + "grad_norm": 0.037421781664849635, + "language_loss": 0.81758374, + "learning_rate": 0.0009215887448878913, + "loss": 0.82812333, + "num_input_tokens_seen": 88541968, + "router_z_loss_mlp": 0.50683594, + "step": 1067, + "time_per_iteration": 2.5782346725463867 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054496, + "balance_loss_mlp": 1.00414193, + "epoch": 0.2054636398614852, + "flos": 528211401216.0, + "grad_norm": 0.031680985043262715, + "language_loss": 0.86063826, + "learning_rate": 0.0009214211669163922, + "loss": 0.87118322, + "num_input_tokens_seen": 88615296, + "router_z_loss_mlp": 0.50390625, + "step": 1068, + "time_per_iteration": 2.689772129058838 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054583, + "balance_loss_mlp": 1.00403798, + "epoch": 0.20565602154674875, + "flos": 559324458240.0, + "grad_norm": 0.03119808154519671, + "language_loss": 0.94868428, + "learning_rate": 0.0009212534253346862, + "loss": 0.95923012, + "num_input_tokens_seen": 88691584, + "router_z_loss_mlp": 0.50585938, + "step": 1069, + "time_per_iteration": 2.760840654373169 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060819, + "balance_loss_mlp": 1.01027393, + "epoch": 0.2058484032320123, + "flos": 505221935616.0, + "grad_norm": 0.042999288209875815, + "language_loss": 0.85068119, + "learning_rate": 0.0009210855202078964, + "loss": 0.86128938, + "num_input_tokens_seen": 88756592, + "router_z_loss_mlp": 0.50585938, + "step": 1070, + "time_per_iteration": 2.6273016929626465 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01057609, + "balance_loss_mlp": 1.00687337, + "epoch": 0.20604078491727587, + "flos": 434047911168.0, + "grad_norm": 0.03672139626538296, + "language_loss": 0.88035965, + "learning_rate": 0.0009209174516012091, + "loss": 0.89093566, + "num_input_tokens_seen": 88820928, + "router_z_loss_mlp": 0.5078125, + "step": 1071, + "time_per_iteration": 2.5263099670410156 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055827, + "balance_loss_mlp": 1.0049957, + "epoch": 0.20623316660253943, + "flos": 609875037696.0, + "grad_norm": 0.03118890610347894, + "language_loss": 0.89938867, + "learning_rate": 0.0009207492195798747, + "loss": 0.90994692, + "num_input_tokens_seen": 88895440, + "router_z_loss_mlp": 0.50878906, + "step": 1072, + "time_per_iteration": 2.773094654083252 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059091, + "balance_loss_mlp": 1.00816524, + "epoch": 0.206425548287803, + "flos": 481394545152.0, + "grad_norm": 0.034846135669383375, + "language_loss": 0.85408926, + "learning_rate": 0.0009205808242092061, + "loss": 0.86468017, + "num_input_tokens_seen": 88964400, + "router_z_loss_mlp": 0.50976562, + "step": 1073, + "time_per_iteration": 2.6704161167144775 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061083, + "balance_loss_mlp": 1.01025188, + "epoch": 0.20661792997306658, + "flos": 951124249344.0, + "grad_norm": 0.036438983488896924, + "language_loss": 0.83303434, + "learning_rate": 0.0009204122655545808, + "loss": 0.84364516, + "num_input_tokens_seen": 89049600, + "router_z_loss_mlp": 0.50878906, + "step": 1074, + "time_per_iteration": 3.3605480194091797 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059315, + "balance_loss_mlp": 1.00857949, + "epoch": 0.20681031165833014, + "flos": 604617109248.0, + "grad_norm": 0.03238632395719984, + "language_loss": 0.81744164, + "learning_rate": 0.0009202435436814388, + "loss": 0.82803476, + "num_input_tokens_seen": 89119024, + "router_z_loss_mlp": 0.5078125, + "step": 1075, + "time_per_iteration": 2.6966288089752197 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106261, + "balance_loss_mlp": 1.01163661, + "epoch": 0.2070026933435937, + "flos": 710266583808.0, + "grad_norm": 0.03297439165012413, + "language_loss": 0.90137285, + "learning_rate": 0.0009200746586552836, + "loss": 0.91199899, + "num_input_tokens_seen": 89197344, + "router_z_loss_mlp": 0.51025391, + "step": 1076, + "time_per_iteration": 2.919851779937744 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01057537, + "balance_loss_mlp": 1.00675428, + "epoch": 0.20719507502885726, + "flos": 831255330048.0, + "grad_norm": 0.031928056401627374, + "language_loss": 0.84964621, + "learning_rate": 0.0009199056105416825, + "loss": 0.86022151, + "num_input_tokens_seen": 89280464, + "router_z_loss_mlp": 0.50830078, + "step": 1077, + "time_per_iteration": 3.0944886207580566 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059646, + "balance_loss_mlp": 1.00881469, + "epoch": 0.20738745671412082, + "flos": 639500828160.0, + "grad_norm": 0.033227407694906064, + "language_loss": 0.87196565, + "learning_rate": 0.0009197363994062654, + "loss": 0.88256204, + "num_input_tokens_seen": 89353344, + "router_z_loss_mlp": 0.50878906, + "step": 1078, + "time_per_iteration": 2.8505265712738037 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059879, + "balance_loss_mlp": 1.00933433, + "epoch": 0.20757983839938438, + "flos": 686984522496.0, + "grad_norm": 0.03258152966614613, + "language_loss": 0.84972161, + "learning_rate": 0.0009195670253147262, + "loss": 0.86032039, + "num_input_tokens_seen": 89439328, + "router_z_loss_mlp": 0.50585938, + "step": 1079, + "time_per_iteration": 3.0077526569366455 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064209, + "balance_loss_mlp": 1.01375961, + "epoch": 0.20777222008464794, + "flos": 520318189056.0, + "grad_norm": 0.03575722766779635, + "language_loss": 0.83075011, + "learning_rate": 0.0009193974883328216, + "loss": 0.84139216, + "num_input_tokens_seen": 89510160, + "router_z_loss_mlp": 0.50488281, + "step": 1080, + "time_per_iteration": 2.6277496814727783 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062434, + "balance_loss_mlp": 1.01212776, + "epoch": 0.2079646017699115, + "flos": 512470663680.0, + "grad_norm": 0.03316952161345372, + "language_loss": 0.87936002, + "learning_rate": 0.0009192277885263718, + "loss": 0.88998437, + "num_input_tokens_seen": 89582960, + "router_z_loss_mlp": 0.50341797, + "step": 1081, + "time_per_iteration": 2.6486003398895264 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01056126, + "balance_loss_mlp": 1.00596321, + "epoch": 0.20815698345517505, + "flos": 933468534528.0, + "grad_norm": 0.031694408237267754, + "language_loss": 0.87043977, + "learning_rate": 0.0009190579259612602, + "loss": 0.881001, + "num_input_tokens_seen": 89675488, + "router_z_loss_mlp": 0.50195312, + "step": 1082, + "time_per_iteration": 3.280133008956909 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062428, + "balance_loss_mlp": 1.01202655, + "epoch": 0.20834936514043864, + "flos": 633554674176.0, + "grad_norm": 0.03367407497844021, + "language_loss": 0.87446159, + "learning_rate": 0.000918887900703433, + "loss": 0.88508588, + "num_input_tokens_seen": 89747872, + "router_z_loss_mlp": 0.50439453, + "step": 1083, + "time_per_iteration": 2.7914657592773438 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060649, + "balance_loss_mlp": 1.01024699, + "epoch": 0.2085417468257022, + "flos": 395243831040.0, + "grad_norm": 0.03354838448754016, + "language_loss": 0.91036344, + "learning_rate": 0.0009187177128188999, + "loss": 0.92096996, + "num_input_tokens_seen": 89810176, + "router_z_loss_mlp": 0.50439453, + "step": 1084, + "time_per_iteration": 2.4803311824798584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107357, + "balance_loss_mlp": 1.02455139, + "epoch": 0.20873412851096576, + "flos": 1405197775104.0, + "grad_norm": 0.012085868941934568, + "language_loss": 0.77156538, + "learning_rate": 0.0009185473623737339, + "loss": 0.78230107, + "num_input_tokens_seen": 90038432, + "router_z_loss_mlp": 0.48925781, + "step": 1085, + "time_per_iteration": 4.883121728897095 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055927, + "balance_loss_mlp": 1.00562024, + "epoch": 0.20892651019622932, + "flos": 448762140672.0, + "grad_norm": 0.03493036575467998, + "language_loss": 0.8691588, + "learning_rate": 0.000918376849434071, + "loss": 0.87971807, + "num_input_tokens_seen": 90101568, + "router_z_loss_mlp": 0.50317383, + "step": 1086, + "time_per_iteration": 2.537820816040039 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065129, + "balance_loss_mlp": 1.01444149, + "epoch": 0.20911889188149288, + "flos": 494081036544.0, + "grad_norm": 0.040745363066357655, + "language_loss": 0.91673005, + "learning_rate": 0.0009182061740661098, + "loss": 0.9273814, + "num_input_tokens_seen": 90169344, + "router_z_loss_mlp": 0.50732422, + "step": 1087, + "time_per_iteration": 2.5920886993408203 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01056749, + "balance_loss_mlp": 1.00615633, + "epoch": 0.20931127356675644, + "flos": 842750062848.0, + "grad_norm": 0.02822254108426211, + "language_loss": 0.85810733, + "learning_rate": 0.0009180353363361127, + "loss": 0.86867487, + "num_input_tokens_seen": 90252416, + "router_z_loss_mlp": 0.50634766, + "step": 1088, + "time_per_iteration": 3.1376798152923584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060338, + "balance_loss_mlp": 1.00979316, + "epoch": 0.20950365525202, + "flos": 758525019648.0, + "grad_norm": 0.03922038165748564, + "language_loss": 0.83160806, + "learning_rate": 0.0009178643363104044, + "loss": 0.84221143, + "num_input_tokens_seen": 90337952, + "router_z_loss_mlp": 0.50585938, + "step": 1089, + "time_per_iteration": 3.124352216720581 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059844, + "balance_loss_mlp": 1.00939417, + "epoch": 0.20969603693728356, + "flos": 473492584704.0, + "grad_norm": 0.04272734591158297, + "language_loss": 0.920385, + "learning_rate": 0.0009176931740553735, + "loss": 0.93098342, + "num_input_tokens_seen": 90401488, + "router_z_loss_mlp": 0.50488281, + "step": 1090, + "time_per_iteration": 2.556528091430664 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067623, + "balance_loss_mlp": 1.01731646, + "epoch": 0.20988841862254715, + "flos": 978628982784.0, + "grad_norm": 0.03590255199570226, + "language_loss": 0.83530974, + "learning_rate": 0.0009175218496374708, + "loss": 0.84598601, + "num_input_tokens_seen": 90486144, + "router_z_loss_mlp": 0.50341797, + "step": 1091, + "time_per_iteration": 3.328984260559082 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059931, + "balance_loss_mlp": 1.00976801, + "epoch": 0.2100808003078107, + "flos": 1094819592192.0, + "grad_norm": 0.03766723451938342, + "language_loss": 0.86626744, + "learning_rate": 0.0009173503631232103, + "loss": 0.87686676, + "num_input_tokens_seen": 90571504, + "router_z_loss_mlp": 0.50170898, + "step": 1092, + "time_per_iteration": 3.4216480255126953 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01058636, + "balance_loss_mlp": 1.00832939, + "epoch": 0.21027318199307427, + "flos": 1014560596992.0, + "grad_norm": 0.047058286401960234, + "language_loss": 0.82703817, + "learning_rate": 0.0009171787145791691, + "loss": 0.83762449, + "num_input_tokens_seen": 90646016, + "router_z_loss_mlp": 0.50341797, + "step": 1093, + "time_per_iteration": 3.2454655170440674 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059608, + "balance_loss_mlp": 1.00911129, + "epoch": 0.21046556367833782, + "flos": 522413001216.0, + "grad_norm": 0.043211200123957835, + "language_loss": 0.80955076, + "learning_rate": 0.000917006904071987, + "loss": 0.8201468, + "num_input_tokens_seen": 90713440, + "router_z_loss_mlp": 0.50537109, + "step": 1094, + "time_per_iteration": 2.6560592651367188 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061033, + "balance_loss_mlp": 1.01053584, + "epoch": 0.21065794536360138, + "flos": 604840685568.0, + "grad_norm": 0.03488627405352903, + "language_loss": 0.87964189, + "learning_rate": 0.0009168349316683669, + "loss": 0.89025223, + "num_input_tokens_seen": 90788208, + "router_z_loss_mlp": 0.50537109, + "step": 1095, + "time_per_iteration": 2.794358253479004 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106293, + "balance_loss_mlp": 1.01243329, + "epoch": 0.21085032704886494, + "flos": 604558783488.0, + "grad_norm": 0.031199931973452354, + "language_loss": 0.82918072, + "learning_rate": 0.0009166627974350741, + "loss": 0.83981001, + "num_input_tokens_seen": 90873776, + "router_z_loss_mlp": 0.50537109, + "step": 1096, + "time_per_iteration": 2.89837384223938 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062823, + "balance_loss_mlp": 1.01242077, + "epoch": 0.2110427087341285, + "flos": 638832044544.0, + "grad_norm": 0.03623978918327459, + "language_loss": 0.90394479, + "learning_rate": 0.0009164905014389373, + "loss": 0.91457301, + "num_input_tokens_seen": 90945872, + "router_z_loss_mlp": 0.50439453, + "step": 1097, + "time_per_iteration": 2.79203462600708 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055619, + "balance_loss_mlp": 1.00559878, + "epoch": 0.21123509041939206, + "flos": 523930403328.0, + "grad_norm": 0.03351990521185014, + "language_loss": 0.87381279, + "learning_rate": 0.0009163180437468476, + "loss": 0.88436902, + "num_input_tokens_seen": 91016224, + "router_z_loss_mlp": 0.50024414, + "step": 1098, + "time_per_iteration": 2.6110002994537354 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01056208, + "balance_loss_mlp": 1.00647402, + "epoch": 0.21142747210465565, + "flos": 452194520064.0, + "grad_norm": 0.03619268995909484, + "language_loss": 0.86631316, + "learning_rate": 0.000916145424425759, + "loss": 0.87687522, + "num_input_tokens_seen": 91086752, + "router_z_loss_mlp": 0.49658203, + "step": 1099, + "time_per_iteration": 2.67106294631958 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060107, + "balance_loss_mlp": 1.01027727, + "epoch": 0.2116198537899192, + "flos": 877626978816.0, + "grad_norm": 0.042483916895571405, + "language_loss": 0.91832745, + "learning_rate": 0.0009159726435426885, + "loss": 0.92892849, + "num_input_tokens_seen": 91162960, + "router_z_loss_mlp": 0.49780273, + "step": 1100, + "time_per_iteration": 3.095250129699707 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052771, + "balance_loss_mlp": 1.00275087, + "epoch": 0.21181223547518277, + "flos": 524675009280.0, + "grad_norm": 0.035590136232614346, + "language_loss": 0.91126454, + "learning_rate": 0.0009157997011647154, + "loss": 0.92179227, + "num_input_tokens_seen": 91229840, + "router_z_loss_mlp": 0.49926758, + "step": 1101, + "time_per_iteration": 2.61954665184021 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01056268, + "balance_loss_mlp": 1.00615227, + "epoch": 0.21200461716044633, + "flos": 573426284544.0, + "grad_norm": 0.03167271765745466, + "language_loss": 0.86759949, + "learning_rate": 0.0009156265973589817, + "loss": 0.87816215, + "num_input_tokens_seen": 91307936, + "router_z_loss_mlp": 0.50146484, + "step": 1102, + "time_per_iteration": 2.7851946353912354 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053549, + "balance_loss_mlp": 1.00348067, + "epoch": 0.2121969988457099, + "flos": 546175262976.0, + "grad_norm": 0.033324702660241096, + "language_loss": 0.90598941, + "learning_rate": 0.0009154533321926926, + "loss": 0.91652489, + "num_input_tokens_seen": 91372848, + "router_z_loss_mlp": 0.50073242, + "step": 1103, + "time_per_iteration": 2.658358573913574 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01056655, + "balance_loss_mlp": 1.00663483, + "epoch": 0.21238938053097345, + "flos": 845355211008.0, + "grad_norm": 0.03290940631262569, + "language_loss": 0.88234645, + "learning_rate": 0.0009152799057331156, + "loss": 0.89291298, + "num_input_tokens_seen": 91452768, + "router_z_loss_mlp": 0.50024414, + "step": 1104, + "time_per_iteration": 3.1174561977386475 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01056205, + "balance_loss_mlp": 1.00623202, + "epoch": 0.212581762216237, + "flos": 447142671360.0, + "grad_norm": 0.035279899791186564, + "language_loss": 0.91767001, + "learning_rate": 0.0009151063180475805, + "loss": 0.92823207, + "num_input_tokens_seen": 91519888, + "router_z_loss_mlp": 0.5, + "step": 1105, + "time_per_iteration": 2.538922071456909 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054823, + "balance_loss_mlp": 1.00489795, + "epoch": 0.21277414390150057, + "flos": 515385904128.0, + "grad_norm": 0.03737857831356842, + "language_loss": 0.85410213, + "learning_rate": 0.0009149325692034803, + "loss": 0.86465037, + "num_input_tokens_seen": 91585744, + "router_z_loss_mlp": 0.49853516, + "step": 1106, + "time_per_iteration": 2.588087558746338 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055756, + "balance_loss_mlp": 1.00788116, + "epoch": 0.21296652558676413, + "flos": 1488514907136.0, + "grad_norm": 0.005769411809131762, + "language_loss": 0.79203427, + "learning_rate": 0.0009147586592682702, + "loss": 0.80259192, + "num_input_tokens_seen": 91805840, + "router_z_loss_mlp": 0.47851562, + "step": 1107, + "time_per_iteration": 4.901995658874512 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055609, + "balance_loss_mlp": 1.00596976, + "epoch": 0.21315890727202771, + "flos": 847451968512.0, + "grad_norm": 0.03679321288402367, + "language_loss": 0.87994891, + "learning_rate": 0.0009145845883094678, + "loss": 0.89050496, + "num_input_tokens_seen": 91885936, + "router_z_loss_mlp": 0.49584961, + "step": 1108, + "time_per_iteration": 3.034179925918579 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01057835, + "balance_loss_mlp": 1.00833917, + "epoch": 0.21335128895729127, + "flos": 630556808448.0, + "grad_norm": 0.040833312538100186, + "language_loss": 0.86006308, + "learning_rate": 0.000914410356394654, + "loss": 0.87064135, + "num_input_tokens_seen": 91959888, + "router_z_loss_mlp": 0.49438477, + "step": 1109, + "time_per_iteration": 2.793839931488037 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01058379, + "balance_loss_mlp": 1.00878823, + "epoch": 0.21354367064255483, + "flos": 712285573632.0, + "grad_norm": 0.029526159769499145, + "language_loss": 0.85111213, + "learning_rate": 0.0009142359635914709, + "loss": 0.86169595, + "num_input_tokens_seen": 92043728, + "router_z_loss_mlp": 0.49560547, + "step": 1110, + "time_per_iteration": 3.0403430461883545 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063298, + "balance_loss_mlp": 1.01375508, + "epoch": 0.2137360523278184, + "flos": 457211375616.0, + "grad_norm": 0.03547311640481051, + "language_loss": 0.85051197, + "learning_rate": 0.0009140614099676245, + "loss": 0.8611449, + "num_input_tokens_seen": 92114096, + "router_z_loss_mlp": 0.49414062, + "step": 1111, + "time_per_iteration": 2.6027371883392334 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054693, + "balance_loss_mlp": 1.00495887, + "epoch": 0.21392843401308195, + "flos": 667266076416.0, + "grad_norm": 0.03139007596896344, + "language_loss": 0.8342849, + "learning_rate": 0.0009138866955908821, + "loss": 0.84483182, + "num_input_tokens_seen": 92193552, + "router_z_loss_mlp": 0.49658203, + "step": 1112, + "time_per_iteration": 2.924180269241333 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055191, + "balance_loss_mlp": 1.00517082, + "epoch": 0.2141208156983455, + "flos": 750362544384.0, + "grad_norm": 0.03405304612319473, + "language_loss": 0.81477892, + "learning_rate": 0.0009137118205290738, + "loss": 0.82533085, + "num_input_tokens_seen": 92279248, + "router_z_loss_mlp": 0.49951172, + "step": 1113, + "time_per_iteration": 2.956289768218994 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01057131, + "balance_loss_mlp": 1.00711048, + "epoch": 0.21431319738360907, + "flos": 420011213568.0, + "grad_norm": 0.037812047895131755, + "language_loss": 0.90930229, + "learning_rate": 0.0009135367848500924, + "loss": 0.9198736, + "num_input_tokens_seen": 92344064, + "router_z_loss_mlp": 0.49975586, + "step": 1114, + "time_per_iteration": 2.5228912830352783 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106081, + "balance_loss_mlp": 1.01079023, + "epoch": 0.21450557906887263, + "flos": 610239565056.0, + "grad_norm": 0.04455846969282107, + "language_loss": 0.87261575, + "learning_rate": 0.0009133615886218927, + "loss": 0.88322389, + "num_input_tokens_seen": 92410544, + "router_z_loss_mlp": 0.5, + "step": 1115, + "time_per_iteration": 2.7146785259246826 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105379, + "balance_loss_mlp": 1.00367427, + "epoch": 0.21469796075413622, + "flos": 562975556352.0, + "grad_norm": 0.04025415931658291, + "language_loss": 0.88754129, + "learning_rate": 0.0009131862319124917, + "loss": 0.89807916, + "num_input_tokens_seen": 92480272, + "router_z_loss_mlp": 0.50097656, + "step": 1116, + "time_per_iteration": 2.702315092086792 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01058084, + "balance_loss_mlp": 1.0081588, + "epoch": 0.21489034243939978, + "flos": 595738218240.0, + "grad_norm": 0.036347556106983744, + "language_loss": 0.84819156, + "learning_rate": 0.0009130107147899691, + "loss": 0.8587724, + "num_input_tokens_seen": 92555584, + "router_z_loss_mlp": 0.49902344, + "step": 1117, + "time_per_iteration": 2.705153226852417 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055436, + "balance_loss_mlp": 1.00555849, + "epoch": 0.21508272412466334, + "flos": 442850979840.0, + "grad_norm": 0.032390780355026266, + "language_loss": 0.85796201, + "learning_rate": 0.0009128350373224665, + "loss": 0.86851633, + "num_input_tokens_seen": 92623136, + "router_z_loss_mlp": 0.49804688, + "step": 1118, + "time_per_iteration": 2.5689737796783447 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055817, + "balance_loss_mlp": 1.00775146, + "epoch": 0.2152751058099269, + "flos": 1499234898432.0, + "grad_norm": 0.005802610423144338, + "language_loss": 0.81456429, + "learning_rate": 0.0009126591995781883, + "loss": 0.82512248, + "num_input_tokens_seen": 92842608, + "router_z_loss_mlp": 0.48046875, + "step": 1119, + "time_per_iteration": 4.659603834152222 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054629, + "balance_loss_mlp": 1.00475144, + "epoch": 0.21546748749519046, + "flos": 494992838400.0, + "grad_norm": 0.03550503890551413, + "language_loss": 0.86117166, + "learning_rate": 0.0009124832016254005, + "loss": 0.87171793, + "num_input_tokens_seen": 92912960, + "router_z_loss_mlp": 0.4987793, + "step": 1120, + "time_per_iteration": 2.6080243587493896 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054572, + "balance_loss_mlp": 1.00450444, + "epoch": 0.21565986918045402, + "flos": 635695173120.0, + "grad_norm": 0.03761657282592244, + "language_loss": 0.88987935, + "learning_rate": 0.0009123070435324316, + "loss": 0.90042508, + "num_input_tokens_seen": 92982272, + "router_z_loss_mlp": 0.50097656, + "step": 1121, + "time_per_iteration": 2.8451340198516846 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062664, + "balance_loss_mlp": 1.01450348, + "epoch": 0.21585225086571758, + "flos": 1586801914368.0, + "grad_norm": 0.011675507285583616, + "language_loss": 0.77875781, + "learning_rate": 0.0009121307253676722, + "loss": 0.78938448, + "num_input_tokens_seen": 93218752, + "router_z_loss_mlp": 0.48144531, + "step": 1122, + "time_per_iteration": 5.018117666244507 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055198, + "balance_loss_mlp": 1.00541639, + "epoch": 0.21604463255098114, + "flos": 685323257088.0, + "grad_norm": 0.03443856201457266, + "language_loss": 0.87021005, + "learning_rate": 0.0009119542471995752, + "loss": 0.8807621, + "num_input_tokens_seen": 93293968, + "router_z_loss_mlp": 0.49682617, + "step": 1123, + "time_per_iteration": 2.8631908893585205 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01057142, + "balance_loss_mlp": 1.00755107, + "epoch": 0.2162370142362447, + "flos": 782308668672.0, + "grad_norm": 0.034966150945184314, + "language_loss": 0.82536203, + "learning_rate": 0.0009117776090966554, + "loss": 0.83593345, + "num_input_tokens_seen": 93367088, + "router_z_loss_mlp": 0.49511719, + "step": 1124, + "time_per_iteration": 2.9458060264587402 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01058319, + "balance_loss_mlp": 1.00877571, + "epoch": 0.21642939592150828, + "flos": 1003762838016.0, + "grad_norm": 0.03795033166932298, + "language_loss": 0.87775326, + "learning_rate": 0.0009116008111274899, + "loss": 0.88833648, + "num_input_tokens_seen": 93452944, + "router_z_loss_mlp": 0.49511719, + "step": 1125, + "time_per_iteration": 3.2748866081237793 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053055, + "balance_loss_mlp": 1.00556183, + "epoch": 0.21662177760677184, + "flos": 1485764917248.0, + "grad_norm": 0.008195913283110022, + "language_loss": 0.79106927, + "learning_rate": 0.0009114238533607176, + "loss": 0.8015998, + "num_input_tokens_seen": 93677328, + "router_z_loss_mlp": 0.47460938, + "step": 1126, + "time_per_iteration": 4.803825616836548 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105355, + "balance_loss_mlp": 1.00391161, + "epoch": 0.2168141592920354, + "flos": 888861196800.0, + "grad_norm": 0.03626284425770287, + "language_loss": 0.85553163, + "learning_rate": 0.0009112467358650396, + "loss": 0.86606717, + "num_input_tokens_seen": 93756848, + "router_z_loss_mlp": 0.49609375, + "step": 1127, + "time_per_iteration": 3.155856132507324 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01057313, + "balance_loss_mlp": 1.00753081, + "epoch": 0.21700654097729896, + "flos": 547085119488.0, + "grad_norm": 0.03272511127748384, + "language_loss": 0.87140059, + "learning_rate": 0.0009110694587092192, + "loss": 0.88197374, + "num_input_tokens_seen": 93834704, + "router_z_loss_mlp": 0.49682617, + "step": 1128, + "time_per_iteration": 2.7438507080078125 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01057646, + "balance_loss_mlp": 1.00781655, + "epoch": 0.21719892266256252, + "flos": 510536244480.0, + "grad_norm": 0.0385378102776186, + "language_loss": 0.81826651, + "learning_rate": 0.0009108920219620815, + "loss": 0.82884294, + "num_input_tokens_seen": 93904448, + "router_z_loss_mlp": 0.49829102, + "step": 1129, + "time_per_iteration": 2.6256754398345947 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105661, + "balance_loss_mlp": 1.00682795, + "epoch": 0.21739130434782608, + "flos": 544462474752.0, + "grad_norm": 0.03288593298355655, + "language_loss": 0.9021399, + "learning_rate": 0.0009107144256925133, + "loss": 0.91270602, + "num_input_tokens_seen": 93979312, + "router_z_loss_mlp": 0.49707031, + "step": 1130, + "time_per_iteration": 2.665764808654785 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055495, + "balance_loss_mlp": 1.00566518, + "epoch": 0.21758368603308964, + "flos": 617983077888.0, + "grad_norm": 0.04004849400109536, + "language_loss": 0.83221352, + "learning_rate": 0.0009105366699694638, + "loss": 0.84276843, + "num_input_tokens_seen": 94052032, + "router_z_loss_mlp": 0.49755859, + "step": 1131, + "time_per_iteration": 2.7092785835266113 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055334, + "balance_loss_mlp": 1.0055995, + "epoch": 0.2177760677183532, + "flos": 636335766528.0, + "grad_norm": 0.03327692114185805, + "language_loss": 0.82139939, + "learning_rate": 0.0009103587548619439, + "loss": 0.83195269, + "num_input_tokens_seen": 94124944, + "router_z_loss_mlp": 0.49658203, + "step": 1132, + "time_per_iteration": 2.833617925643921 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055546, + "balance_loss_mlp": 1.00585985, + "epoch": 0.2179684494036168, + "flos": 533597641728.0, + "grad_norm": 0.036557340203022134, + "language_loss": 0.8721149, + "learning_rate": 0.0009101806804390261, + "loss": 0.8826704, + "num_input_tokens_seen": 94200384, + "router_z_loss_mlp": 0.49609375, + "step": 1133, + "time_per_iteration": 2.7880306243896484 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054716, + "balance_loss_mlp": 1.0050298, + "epoch": 0.21816083108888035, + "flos": 476182303488.0, + "grad_norm": 0.03701280834454915, + "language_loss": 0.917292, + "learning_rate": 0.0009100024467698453, + "loss": 0.92783916, + "num_input_tokens_seen": 94266992, + "router_z_loss_mlp": 0.49560547, + "step": 1134, + "time_per_iteration": 2.592986822128296 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054821, + "balance_loss_mlp": 1.00513422, + "epoch": 0.2183532127741439, + "flos": 578547152640.0, + "grad_norm": 0.04183992577645213, + "language_loss": 0.83309305, + "learning_rate": 0.0009098240539235981, + "loss": 0.84364122, + "num_input_tokens_seen": 94334304, + "router_z_loss_mlp": 0.49658203, + "step": 1135, + "time_per_iteration": 2.693387269973755 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055318, + "balance_loss_mlp": 1.00558341, + "epoch": 0.21854559445940747, + "flos": 595280371968.0, + "grad_norm": 0.03379290176549673, + "language_loss": 0.88387418, + "learning_rate": 0.0009096455019695423, + "loss": 0.89442736, + "num_input_tokens_seen": 94413296, + "router_z_loss_mlp": 0.49609375, + "step": 1136, + "time_per_iteration": 2.781304359436035 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059818, + "balance_loss_mlp": 1.0098455, + "epoch": 0.21873797614467103, + "flos": 409549791744.0, + "grad_norm": 0.03874067782032871, + "language_loss": 0.90736896, + "learning_rate": 0.000909466790976998, + "loss": 0.91796714, + "num_input_tokens_seen": 94475840, + "router_z_loss_mlp": 0.49951172, + "step": 1137, + "time_per_iteration": 2.4837231636047363 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055988, + "balance_loss_mlp": 1.00620675, + "epoch": 0.21893035782993459, + "flos": 895655969280.0, + "grad_norm": 0.03281311030157744, + "language_loss": 0.83296013, + "learning_rate": 0.0009092879210153473, + "loss": 0.84352005, + "num_input_tokens_seen": 94555184, + "router_z_loss_mlp": 0.49682617, + "step": 1138, + "time_per_iteration": 3.156329870223999 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01058359, + "balance_loss_mlp": 1.00862455, + "epoch": 0.21912273951519814, + "flos": 468569048064.0, + "grad_norm": 0.03332829582894704, + "language_loss": 0.89480728, + "learning_rate": 0.0009091088921540333, + "loss": 0.90539086, + "num_input_tokens_seen": 94622656, + "router_z_loss_mlp": 0.49731445, + "step": 1139, + "time_per_iteration": 2.5444674491882324 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060322, + "balance_loss_mlp": 1.01197052, + "epoch": 0.2193151212004617, + "flos": 1535180118528.0, + "grad_norm": 0.009447727830516332, + "language_loss": 0.75508678, + "learning_rate": 0.0009089297044625615, + "loss": 0.76569003, + "num_input_tokens_seen": 94856496, + "router_z_loss_mlp": 0.48339844, + "step": 1140, + "time_per_iteration": 4.993603944778442 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105127, + "balance_loss_mlp": 1.00158358, + "epoch": 0.2195075028857253, + "flos": 592275703296.0, + "grad_norm": 0.039648398816974934, + "language_loss": 0.85201681, + "learning_rate": 0.0009087503580104985, + "loss": 0.86252946, + "num_input_tokens_seen": 94926880, + "router_z_loss_mlp": 0.49560547, + "step": 1141, + "time_per_iteration": 2.6736245155334473 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053862, + "balance_loss_mlp": 1.00436676, + "epoch": 0.21969988457098885, + "flos": 637518776832.0, + "grad_norm": 0.03678403810630545, + "language_loss": 0.8005864, + "learning_rate": 0.0009085708528674728, + "loss": 0.81112504, + "num_input_tokens_seen": 95000528, + "router_z_loss_mlp": 0.49414062, + "step": 1142, + "time_per_iteration": 2.799607038497925 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053259, + "balance_loss_mlp": 1.00362051, + "epoch": 0.2198922662562524, + "flos": 913860903936.0, + "grad_norm": 0.040969430424554455, + "language_loss": 0.86853033, + "learning_rate": 0.0009083911891031745, + "loss": 0.87906301, + "num_input_tokens_seen": 95081040, + "router_z_loss_mlp": 0.49487305, + "step": 1143, + "time_per_iteration": 3.1043601036071777 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010518, + "balance_loss_mlp": 1.00235164, + "epoch": 0.22008464794151597, + "flos": 824495550720.0, + "grad_norm": 0.03475506353694162, + "language_loss": 0.91937912, + "learning_rate": 0.0009082113667873553, + "loss": 0.92989707, + "num_input_tokens_seen": 95167328, + "router_z_loss_mlp": 0.4934082, + "step": 1144, + "time_per_iteration": 3.114678144454956 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055626, + "balance_loss_mlp": 1.00636888, + "epoch": 0.22027702962677953, + "flos": 460619455488.0, + "grad_norm": 0.047183367988671336, + "language_loss": 0.91319406, + "learning_rate": 0.0009080313859898283, + "loss": 0.92375034, + "num_input_tokens_seen": 95230304, + "router_z_loss_mlp": 0.49145508, + "step": 1145, + "time_per_iteration": 2.529627799987793 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01058128, + "balance_loss_mlp": 1.00877535, + "epoch": 0.2204694113120431, + "flos": 532288264704.0, + "grad_norm": 0.034289556826903954, + "language_loss": 0.91988164, + "learning_rate": 0.0009078512467804684, + "loss": 0.93046296, + "num_input_tokens_seen": 95299520, + "router_z_loss_mlp": 0.49243164, + "step": 1146, + "time_per_iteration": 2.692556381225586 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01056493, + "balance_loss_mlp": 1.00737858, + "epoch": 0.22066179299730665, + "flos": 523687385088.0, + "grad_norm": 0.03628724645244133, + "language_loss": 0.91349947, + "learning_rate": 0.0009076709492292119, + "loss": 0.9240644, + "num_input_tokens_seen": 95368912, + "router_z_loss_mlp": 0.49023438, + "step": 1147, + "time_per_iteration": 2.6262857913970947 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01056943, + "balance_loss_mlp": 1.00799513, + "epoch": 0.2208541746825702, + "flos": 547506027264.0, + "grad_norm": 0.0383258843164557, + "language_loss": 0.89899343, + "learning_rate": 0.0009074904934060562, + "loss": 0.90956283, + "num_input_tokens_seen": 95440800, + "router_z_loss_mlp": 0.48901367, + "step": 1148, + "time_per_iteration": 2.710716962814331 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054195, + "balance_loss_mlp": 1.00498509, + "epoch": 0.22104655636783377, + "flos": 710060504064.0, + "grad_norm": 0.034028934421108444, + "language_loss": 0.85814822, + "learning_rate": 0.0009073098793810607, + "loss": 0.86869013, + "num_input_tokens_seen": 95519904, + "router_z_loss_mlp": 0.4909668, + "step": 1149, + "time_per_iteration": 2.986891269683838 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01056627, + "balance_loss_mlp": 1.00758433, + "epoch": 0.22123893805309736, + "flos": 585965021952.0, + "grad_norm": 0.03641392016248804, + "language_loss": 0.88886124, + "learning_rate": 0.000907129107224346, + "loss": 0.89942753, + "num_input_tokens_seen": 95591568, + "router_z_loss_mlp": 0.48999023, + "step": 1150, + "time_per_iteration": 2.7348337173461914 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055224, + "balance_loss_mlp": 1.00601482, + "epoch": 0.22143131973836092, + "flos": 493251859968.0, + "grad_norm": 0.02984339906163832, + "language_loss": 0.89448893, + "learning_rate": 0.0009069481770060939, + "loss": 0.90504116, + "num_input_tokens_seen": 95664480, + "router_z_loss_mlp": 0.49121094, + "step": 1151, + "time_per_iteration": 2.688180685043335 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055578, + "balance_loss_mlp": 1.00593948, + "epoch": 0.22162370142362448, + "flos": 1081469174784.0, + "grad_norm": 0.034516826316188534, + "language_loss": 0.8487525, + "learning_rate": 0.000906767088796548, + "loss": 0.85930824, + "num_input_tokens_seen": 95754400, + "router_z_loss_mlp": 0.49584961, + "step": 1152, + "time_per_iteration": 3.4747724533081055 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01057959, + "balance_loss_mlp": 1.00841522, + "epoch": 0.22181608310888803, + "flos": 493512374784.0, + "grad_norm": 0.03114695536209251, + "language_loss": 0.87880313, + "learning_rate": 0.0009065858426660127, + "loss": 0.88938272, + "num_input_tokens_seen": 95826944, + "router_z_loss_mlp": 0.49462891, + "step": 1153, + "time_per_iteration": 2.6112635135650635 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060109, + "balance_loss_mlp": 1.0103749, + "epoch": 0.2220084647941516, + "flos": 725325898752.0, + "grad_norm": 0.04119971901255946, + "language_loss": 0.85662532, + "learning_rate": 0.0009064044386848543, + "loss": 0.86722642, + "num_input_tokens_seen": 95902688, + "router_z_loss_mlp": 0.49658203, + "step": 1154, + "time_per_iteration": 2.893120288848877 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105547, + "balance_loss_mlp": 1.00564086, + "epoch": 0.22220084647941515, + "flos": 490245245952.0, + "grad_norm": 0.04012578927121656, + "language_loss": 0.89651787, + "learning_rate": 0.0009062228769234997, + "loss": 0.9070726, + "num_input_tokens_seen": 95969952, + "router_z_loss_mlp": 0.49731445, + "step": 1155, + "time_per_iteration": 2.544904947280884 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053369, + "balance_loss_mlp": 1.00344408, + "epoch": 0.2223932281646787, + "flos": 537296371968.0, + "grad_norm": 0.03814815821860503, + "language_loss": 0.82016486, + "learning_rate": 0.0009060411574524376, + "loss": 0.83069855, + "num_input_tokens_seen": 96037344, + "router_z_loss_mlp": 0.49804688, + "step": 1156, + "time_per_iteration": 2.6412572860717773 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01056148, + "balance_loss_mlp": 1.00660419, + "epoch": 0.22258560984994227, + "flos": 932968892160.0, + "grad_norm": 0.0415511709861084, + "language_loss": 0.88770878, + "learning_rate": 0.0009058592803422178, + "loss": 0.89827025, + "num_input_tokens_seen": 96115616, + "router_z_loss_mlp": 0.49462891, + "step": 1157, + "time_per_iteration": 4.623233079910278 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055229, + "balance_loss_mlp": 1.00792694, + "epoch": 0.22277799153520586, + "flos": 1202397638400.0, + "grad_norm": 0.007067436666665483, + "language_loss": 0.78710288, + "learning_rate": 0.0009056772456634512, + "loss": 0.79765517, + "num_input_tokens_seen": 96333600, + "router_z_loss_mlp": 0.47265625, + "step": 1158, + "time_per_iteration": 4.805820465087891 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053661, + "balance_loss_mlp": 1.00397491, + "epoch": 0.22297037322046942, + "flos": 502317388800.0, + "grad_norm": 0.032485949168455416, + "language_loss": 0.91067338, + "learning_rate": 0.00090549505348681, + "loss": 0.92121005, + "num_input_tokens_seen": 96402544, + "router_z_loss_mlp": 0.49633789, + "step": 1159, + "time_per_iteration": 2.5877561569213867 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054105, + "balance_loss_mlp": 1.00427544, + "epoch": 0.22316275490573298, + "flos": 754113764352.0, + "grad_norm": 0.0354615562345569, + "language_loss": 0.84617937, + "learning_rate": 0.0009053127038830275, + "loss": 0.85672045, + "num_input_tokens_seen": 96487600, + "router_z_loss_mlp": 0.49731445, + "step": 1160, + "time_per_iteration": 3.0164098739624023 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01057554, + "balance_loss_mlp": 1.00777233, + "epoch": 0.22335513659099654, + "flos": 515804866560.0, + "grad_norm": 0.03692799991821936, + "language_loss": 0.87995219, + "learning_rate": 0.000905130196922898, + "loss": 0.89052767, + "num_input_tokens_seen": 96554912, + "router_z_loss_mlp": 0.49682617, + "step": 1161, + "time_per_iteration": 2.603769063949585 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01058076, + "balance_loss_mlp": 1.00848484, + "epoch": 0.2235475182762601, + "flos": 485508347136.0, + "grad_norm": 0.031071089964746976, + "language_loss": 0.8758713, + "learning_rate": 0.0009049475326772769, + "loss": 0.88645208, + "num_input_tokens_seen": 96624192, + "router_z_loss_mlp": 0.49511719, + "step": 1162, + "time_per_iteration": 2.6613070964813232 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052796, + "balance_loss_mlp": 1.00334835, + "epoch": 0.22373989996152366, + "flos": 471068238336.0, + "grad_norm": 0.03308636607962537, + "language_loss": 0.83887613, + "learning_rate": 0.0009047647112170811, + "loss": 0.84940416, + "num_input_tokens_seen": 96701040, + "router_z_loss_mlp": 0.49389648, + "step": 1163, + "time_per_iteration": 2.8056106567382812 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105253, + "balance_loss_mlp": 1.00322485, + "epoch": 0.22393228164678722, + "flos": 1273019542272.0, + "grad_norm": 0.035987441954907426, + "language_loss": 0.88180983, + "learning_rate": 0.0009045817326132876, + "loss": 0.89233518, + "num_input_tokens_seen": 96791200, + "router_z_loss_mlp": 0.49243164, + "step": 1164, + "time_per_iteration": 3.7020320892333984 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055012, + "balance_loss_mlp": 1.00575495, + "epoch": 0.22412466333205078, + "flos": 597468503040.0, + "grad_norm": 0.03371692057767332, + "language_loss": 0.84342653, + "learning_rate": 0.0009043985969369357, + "loss": 0.85397661, + "num_input_tokens_seen": 96869360, + "router_z_loss_mlp": 0.49145508, + "step": 1165, + "time_per_iteration": 2.8581626415252686 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052347, + "balance_loss_mlp": 1.00299454, + "epoch": 0.22431704501731436, + "flos": 609632019456.0, + "grad_norm": 0.03010954873673584, + "language_loss": 0.84869868, + "learning_rate": 0.0009042153042591245, + "loss": 0.85922217, + "num_input_tokens_seen": 96945840, + "router_z_loss_mlp": 0.49243164, + "step": 1166, + "time_per_iteration": 2.810300827026367 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054301, + "balance_loss_mlp": 1.0050199, + "epoch": 0.22450942670257792, + "flos": 908108190720.0, + "grad_norm": 0.030118647676053625, + "language_loss": 0.86120874, + "learning_rate": 0.0009040318546510146, + "loss": 0.87175173, + "num_input_tokens_seen": 97029296, + "router_z_loss_mlp": 0.49169922, + "step": 1167, + "time_per_iteration": 3.129802942276001 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01057032, + "balance_loss_mlp": 1.00791764, + "epoch": 0.22470180838784148, + "flos": 566381690880.0, + "grad_norm": 0.035718478093575166, + "language_loss": 0.85780692, + "learning_rate": 0.0009038482481838275, + "loss": 0.86837721, + "num_input_tokens_seen": 97097776, + "router_z_loss_mlp": 0.49047852, + "step": 1168, + "time_per_iteration": 2.674471855163574 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010578, + "balance_loss_mlp": 1.00880456, + "epoch": 0.22489419007310504, + "flos": 835918351872.0, + "grad_norm": 0.03078757560697398, + "language_loss": 0.88093269, + "learning_rate": 0.0009036644849288455, + "loss": 0.89151073, + "num_input_tokens_seen": 97181424, + "router_z_loss_mlp": 0.48925781, + "step": 1169, + "time_per_iteration": 3.126168727874756 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052146, + "balance_loss_mlp": 1.00331759, + "epoch": 0.2250865717583686, + "flos": 582139924992.0, + "grad_norm": 0.03503818002335677, + "language_loss": 0.86431491, + "learning_rate": 0.0009034805649574118, + "loss": 0.87483639, + "num_input_tokens_seen": 97252128, + "router_z_loss_mlp": 0.48779297, + "step": 1170, + "time_per_iteration": 2.6982839107513428 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01056949, + "balance_loss_mlp": 1.0084312, + "epoch": 0.22527895344363216, + "flos": 601671733248.0, + "grad_norm": 0.031992933731526396, + "language_loss": 0.85811341, + "learning_rate": 0.0009032964883409308, + "loss": 0.86868292, + "num_input_tokens_seen": 97326640, + "router_z_loss_mlp": 0.48510742, + "step": 1171, + "time_per_iteration": 2.9468932151794434 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055088, + "balance_loss_mlp": 1.00826263, + "epoch": 0.22547133512889572, + "flos": 1443734537472.0, + "grad_norm": 0.010800983830845337, + "language_loss": 0.73050535, + "learning_rate": 0.000903112255150867, + "loss": 0.7410562, + "num_input_tokens_seen": 97553952, + "router_z_loss_mlp": 0.46777344, + "step": 1172, + "time_per_iteration": 5.044191360473633 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105371, + "balance_loss_mlp": 1.0051204, + "epoch": 0.22566371681415928, + "flos": 491586703872.0, + "grad_norm": 0.034976527569036825, + "language_loss": 0.88142014, + "learning_rate": 0.0009029278654587462, + "loss": 0.89195722, + "num_input_tokens_seen": 97623584, + "router_z_loss_mlp": 0.48583984, + "step": 1173, + "time_per_iteration": 2.5891120433807373 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105611, + "balance_loss_mlp": 1.00749624, + "epoch": 0.22585609849942284, + "flos": 605752487424.0, + "grad_norm": 0.03629905495680353, + "language_loss": 0.82793885, + "learning_rate": 0.0009027433193361548, + "loss": 0.83850002, + "num_input_tokens_seen": 97695952, + "router_z_loss_mlp": 0.48583984, + "step": 1174, + "time_per_iteration": 2.707061290740967 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105926, + "balance_loss_mlp": 1.01064646, + "epoch": 0.22604848018468643, + "flos": 636728484096.0, + "grad_norm": 0.035409171913978986, + "language_loss": 0.87780964, + "learning_rate": 0.00090255861685474, + "loss": 0.88840234, + "num_input_tokens_seen": 97764544, + "router_z_loss_mlp": 0.48608398, + "step": 1175, + "time_per_iteration": 2.7910189628601074 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01056162, + "balance_loss_mlp": 1.00752461, + "epoch": 0.22624086186995, + "flos": 480845325312.0, + "grad_norm": 0.040136392489239156, + "language_loss": 0.91905487, + "learning_rate": 0.0009023737580862095, + "loss": 0.92961645, + "num_input_tokens_seen": 97830976, + "router_z_loss_mlp": 0.48632812, + "step": 1176, + "time_per_iteration": 2.5489909648895264 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054427, + "balance_loss_mlp": 1.00600469, + "epoch": 0.22643324355521355, + "flos": 496807693824.0, + "grad_norm": 0.032828642541270554, + "language_loss": 0.83966863, + "learning_rate": 0.0009021887431023321, + "loss": 0.85021293, + "num_input_tokens_seen": 97898800, + "router_z_loss_mlp": 0.48413086, + "step": 1177, + "time_per_iteration": 2.679046392440796 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060571, + "balance_loss_mlp": 1.01224387, + "epoch": 0.2266256252404771, + "flos": 562684905984.0, + "grad_norm": 0.03431341234676521, + "language_loss": 0.8836711, + "learning_rate": 0.0009020035719749369, + "loss": 0.89427686, + "num_input_tokens_seen": 97974112, + "router_z_loss_mlp": 0.4831543, + "step": 1178, + "time_per_iteration": 2.777273416519165 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053755, + "balance_loss_mlp": 1.00516534, + "epoch": 0.22681800692574067, + "flos": 581033703936.0, + "grad_norm": 0.0422995660898389, + "language_loss": 0.78512251, + "learning_rate": 0.0009018182447759136, + "loss": 0.79566014, + "num_input_tokens_seen": 98056640, + "router_z_loss_mlp": 0.48583984, + "step": 1179, + "time_per_iteration": 2.9779903888702393 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105363, + "balance_loss_mlp": 1.00508785, + "epoch": 0.22701038861100423, + "flos": 741466156800.0, + "grad_norm": 0.03672617722264385, + "language_loss": 0.80683887, + "learning_rate": 0.0009016327615772126, + "loss": 0.81737518, + "num_input_tokens_seen": 98135952, + "router_z_loss_mlp": 0.48535156, + "step": 1180, + "time_per_iteration": 2.953355312347412 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054935, + "balance_loss_mlp": 1.00636911, + "epoch": 0.2272027702962678, + "flos": 578306079744.0, + "grad_norm": 0.03924605706365315, + "language_loss": 0.88551408, + "learning_rate": 0.0009014471224508451, + "loss": 0.89606345, + "num_input_tokens_seen": 98204288, + "router_z_loss_mlp": 0.4855957, + "step": 1181, + "time_per_iteration": 2.7092630863189697 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01056081, + "balance_loss_mlp": 1.00744355, + "epoch": 0.22739515198153135, + "flos": 545291651328.0, + "grad_norm": 0.04038062834310644, + "language_loss": 0.83949769, + "learning_rate": 0.0009012613274688823, + "loss": 0.85005856, + "num_input_tokens_seen": 98269856, + "router_z_loss_mlp": 0.48632812, + "step": 1182, + "time_per_iteration": 2.642143964767456 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055555, + "balance_loss_mlp": 1.00689363, + "epoch": 0.22758753366679493, + "flos": 441092504832.0, + "grad_norm": 0.03566258536478163, + "language_loss": 0.88506091, + "learning_rate": 0.0009010753767034565, + "loss": 0.89561647, + "num_input_tokens_seen": 98335632, + "router_z_loss_mlp": 0.48632812, + "step": 1183, + "time_per_iteration": 2.599167585372925 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053804, + "balance_loss_mlp": 1.00526154, + "epoch": 0.2277799153520585, + "flos": 730824900096.0, + "grad_norm": 0.03354089847275564, + "language_loss": 0.79992342, + "learning_rate": 0.0009008892702267599, + "loss": 0.81046152, + "num_input_tokens_seen": 98420592, + "router_z_loss_mlp": 0.48535156, + "step": 1184, + "time_per_iteration": 2.9798924922943115 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01057068, + "balance_loss_mlp": 1.00855029, + "epoch": 0.22797229703732205, + "flos": 527913947904.0, + "grad_norm": 0.04184098346005727, + "language_loss": 0.89975739, + "learning_rate": 0.0009007030081110457, + "loss": 0.91032803, + "num_input_tokens_seen": 98488096, + "router_z_loss_mlp": 0.48510742, + "step": 1185, + "time_per_iteration": 2.6349968910217285 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01057531, + "balance_loss_mlp": 1.00910807, + "epoch": 0.2281646787225856, + "flos": 536521630464.0, + "grad_norm": 0.03583751901003141, + "language_loss": 0.85487026, + "learning_rate": 0.000900516590428627, + "loss": 0.86544555, + "num_input_tokens_seen": 98561664, + "router_z_loss_mlp": 0.48413086, + "step": 1186, + "time_per_iteration": 2.669015407562256 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054531, + "balance_loss_mlp": 1.00596476, + "epoch": 0.22835706040784917, + "flos": 542478478080.0, + "grad_norm": 0.03191556588332838, + "language_loss": 0.9033947, + "learning_rate": 0.0009003300172518778, + "loss": 0.91394001, + "num_input_tokens_seen": 98634336, + "router_z_loss_mlp": 0.4855957, + "step": 1187, + "time_per_iteration": 2.7164688110351562 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01056755, + "balance_loss_mlp": 1.00804579, + "epoch": 0.22854944209311273, + "flos": 792006042624.0, + "grad_norm": 0.0322044633529041, + "language_loss": 0.85374159, + "learning_rate": 0.0009001432886532321, + "loss": 0.86430913, + "num_input_tokens_seen": 98709600, + "router_z_loss_mlp": 0.48681641, + "step": 1188, + "time_per_iteration": 2.9621965885162354 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054799, + "balance_loss_mlp": 1.00568485, + "epoch": 0.2287418237783763, + "flos": 470216707584.0, + "grad_norm": 0.03536870053258389, + "language_loss": 0.87358034, + "learning_rate": 0.0008999564047051843, + "loss": 0.88412833, + "num_input_tokens_seen": 98775024, + "router_z_loss_mlp": 0.49047852, + "step": 1189, + "time_per_iteration": 2.5233154296875 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01058932, + "balance_loss_mlp": 1.01003218, + "epoch": 0.22893420546363985, + "flos": 469005507072.0, + "grad_norm": 0.030491923293758834, + "language_loss": 0.8554523, + "learning_rate": 0.0008997693654802894, + "loss": 0.86604154, + "num_input_tokens_seen": 98845248, + "router_z_loss_mlp": 0.48852539, + "step": 1190, + "time_per_iteration": 2.6391589641571045 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01058724, + "balance_loss_mlp": 1.00965738, + "epoch": 0.22912658714890344, + "flos": 627402440448.0, + "grad_norm": 0.0331512035559832, + "language_loss": 0.87166977, + "learning_rate": 0.0008995821710511625, + "loss": 0.88225698, + "num_input_tokens_seen": 98913584, + "router_z_loss_mlp": 0.49023438, + "step": 1191, + "time_per_iteration": 2.7549567222595215 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054803, + "balance_loss_mlp": 1.00599909, + "epoch": 0.229318968834167, + "flos": 504021428736.0, + "grad_norm": 0.030936804790582927, + "language_loss": 0.85688579, + "learning_rate": 0.0008993948214904786, + "loss": 0.86743385, + "num_input_tokens_seen": 98978608, + "router_z_loss_mlp": 0.48779297, + "step": 1192, + "time_per_iteration": 2.596224784851074 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061855, + "balance_loss_mlp": 1.01483917, + "epoch": 0.22951135051943056, + "flos": 1377716374272.0, + "grad_norm": 0.008909469382289665, + "language_loss": 0.78422213, + "learning_rate": 0.0008992073168709733, + "loss": 0.79484069, + "num_input_tokens_seen": 99207424, + "router_z_loss_mlp": 0.46972656, + "step": 1193, + "time_per_iteration": 4.853066921234131 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062271, + "balance_loss_mlp": 1.01356232, + "epoch": 0.22970373220469412, + "flos": 645550994688.0, + "grad_norm": 0.0389743097765726, + "language_loss": 0.78935194, + "learning_rate": 0.0008990196572654427, + "loss": 0.79997468, + "num_input_tokens_seen": 99290592, + "router_z_loss_mlp": 0.48681641, + "step": 1194, + "time_per_iteration": 2.8869853019714355 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01056276, + "balance_loss_mlp": 1.00771046, + "epoch": 0.22989611388995768, + "flos": 501273384192.0, + "grad_norm": 0.02988304738122761, + "language_loss": 0.88486552, + "learning_rate": 0.0008988318427467426, + "loss": 0.8954283, + "num_input_tokens_seen": 99366096, + "router_z_loss_mlp": 0.4855957, + "step": 1195, + "time_per_iteration": 2.6931521892547607 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053709, + "balance_loss_mlp": 1.00514269, + "epoch": 0.23008849557522124, + "flos": 1098334596864.0, + "grad_norm": 0.03694163801075408, + "language_loss": 0.87307864, + "learning_rate": 0.0008986438733877887, + "loss": 0.88361579, + "num_input_tokens_seen": 99456768, + "router_z_loss_mlp": 0.4855957, + "step": 1196, + "time_per_iteration": 3.4505865573883057 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053065, + "balance_loss_mlp": 1.00447488, + "epoch": 0.2302808772604848, + "flos": 684993722880.0, + "grad_norm": 0.030674764969734848, + "language_loss": 0.85086071, + "learning_rate": 0.0008984557492615576, + "loss": 0.86139137, + "num_input_tokens_seen": 99539616, + "router_z_loss_mlp": 0.48583984, + "step": 1197, + "time_per_iteration": 2.936891794204712 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01056985, + "balance_loss_mlp": 1.00837183, + "epoch": 0.23047325894574835, + "flos": 529961127936.0, + "grad_norm": 0.03469763625730159, + "language_loss": 0.90249604, + "learning_rate": 0.0008982674704410854, + "loss": 0.91306591, + "num_input_tokens_seen": 99612064, + "router_z_loss_mlp": 0.48608398, + "step": 1198, + "time_per_iteration": 2.6928677558898926 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055103, + "balance_loss_mlp": 1.00653744, + "epoch": 0.23066564063101191, + "flos": 684127607808.0, + "grad_norm": 0.03582939263118032, + "language_loss": 0.78263444, + "learning_rate": 0.0008980790369994682, + "loss": 0.79318547, + "num_input_tokens_seen": 99691040, + "router_z_loss_mlp": 0.4855957, + "step": 1199, + "time_per_iteration": 2.941063642501831 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105528, + "balance_loss_mlp": 1.00692904, + "epoch": 0.2308580223162755, + "flos": 559632605184.0, + "grad_norm": 0.03400437188822284, + "language_loss": 0.87868834, + "learning_rate": 0.000897890449009863, + "loss": 0.88924116, + "num_input_tokens_seen": 99762016, + "router_z_loss_mlp": 0.48339844, + "step": 1200, + "time_per_iteration": 2.6677346229553223 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01058779, + "balance_loss_mlp": 1.01061893, + "epoch": 0.23105040400153906, + "flos": 556730003712.0, + "grad_norm": 0.030515141355108834, + "language_loss": 0.90571141, + "learning_rate": 0.0008977017065454853, + "loss": 0.91629916, + "num_input_tokens_seen": 99835552, + "router_z_loss_mlp": 0.48144531, + "step": 1201, + "time_per_iteration": 2.7204995155334473 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053158, + "balance_loss_mlp": 1.00506902, + "epoch": 0.23124278568680262, + "flos": 706050714624.0, + "grad_norm": 0.034769733982414605, + "language_loss": 0.81452352, + "learning_rate": 0.0008975128096796121, + "loss": 0.82505512, + "num_input_tokens_seen": 99910784, + "router_z_loss_mlp": 0.48071289, + "step": 1202, + "time_per_iteration": 2.861058473587036 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105345, + "balance_loss_mlp": 1.00517035, + "epoch": 0.23143516737206618, + "flos": 613969397760.0, + "grad_norm": 0.03845725381901349, + "language_loss": 0.86815399, + "learning_rate": 0.0008973237584855794, + "loss": 0.87868845, + "num_input_tokens_seen": 99991120, + "router_z_loss_mlp": 0.48266602, + "step": 1203, + "time_per_iteration": 2.907670021057129 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055623, + "balance_loss_mlp": 1.00715244, + "epoch": 0.23162754905732974, + "flos": 390096718080.0, + "grad_norm": 0.03680581416715809, + "language_loss": 0.82972479, + "learning_rate": 0.0008971345530367832, + "loss": 0.84028101, + "num_input_tokens_seen": 100053888, + "router_z_loss_mlp": 0.48461914, + "step": 1204, + "time_per_iteration": 2.4500131607055664 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050803, + "balance_loss_mlp": 1.00190353, + "epoch": 0.2318199307425933, + "flos": 668970116352.0, + "grad_norm": 0.03636020946200237, + "language_loss": 0.86001658, + "learning_rate": 0.0008969451934066799, + "loss": 0.87052464, + "num_input_tokens_seen": 100124176, + "router_z_loss_mlp": 0.48828125, + "step": 1205, + "time_per_iteration": 2.786860704421997 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054481, + "balance_loss_mlp": 1.00558126, + "epoch": 0.23201231242785686, + "flos": 667628658432.0, + "grad_norm": 0.042825772722853955, + "language_loss": 0.80798173, + "learning_rate": 0.0008967556796687854, + "loss": 0.81852657, + "num_input_tokens_seen": 100205296, + "router_z_loss_mlp": 0.48852539, + "step": 1206, + "time_per_iteration": 2.9043900966644287 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106058, + "balance_loss_mlp": 1.01153755, + "epoch": 0.23220469411312042, + "flos": 750095226624.0, + "grad_norm": 0.036226897286377145, + "language_loss": 0.84918714, + "learning_rate": 0.0008965660118966752, + "loss": 0.85979295, + "num_input_tokens_seen": 100279440, + "router_z_loss_mlp": 0.48974609, + "step": 1207, + "time_per_iteration": 2.8989100456237793 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054609, + "balance_loss_mlp": 1.00597119, + "epoch": 0.232397075798384, + "flos": 668262448896.0, + "grad_norm": 0.03230217319227319, + "language_loss": 0.90859735, + "learning_rate": 0.0008963761901639851, + "loss": 0.91914344, + "num_input_tokens_seen": 100354512, + "router_z_loss_mlp": 0.48632812, + "step": 1208, + "time_per_iteration": 2.801715612411499 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050654, + "balance_loss_mlp": 1.00204051, + "epoch": 0.23258945748364757, + "flos": 611346753024.0, + "grad_norm": 0.038379048380249, + "language_loss": 0.83753544, + "learning_rate": 0.0008961862145444103, + "loss": 0.84804195, + "num_input_tokens_seen": 100426848, + "router_z_loss_mlp": 0.48608398, + "step": 1209, + "time_per_iteration": 2.6739237308502197 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105504, + "balance_loss_mlp": 1.00656986, + "epoch": 0.23278183916891113, + "flos": 490672956672.0, + "grad_norm": 0.04093378826068356, + "language_loss": 0.86382735, + "learning_rate": 0.0008959960851117059, + "loss": 0.87437773, + "num_input_tokens_seen": 100496176, + "router_z_loss_mlp": 0.48461914, + "step": 1210, + "time_per_iteration": 2.635650634765625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01056695, + "balance_loss_mlp": 1.00808144, + "epoch": 0.23297422085417469, + "flos": 512674798080.0, + "grad_norm": 0.0354403494585401, + "language_loss": 0.84509313, + "learning_rate": 0.0008958058019396868, + "loss": 0.85566002, + "num_input_tokens_seen": 100575072, + "router_z_loss_mlp": 0.48608398, + "step": 1211, + "time_per_iteration": 2.788318157196045 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105326, + "balance_loss_mlp": 1.00462246, + "epoch": 0.23316660253943824, + "flos": 547532272128.0, + "grad_norm": 0.03263062148431384, + "language_loss": 0.87462825, + "learning_rate": 0.0008956153651022274, + "loss": 0.8851608, + "num_input_tokens_seen": 100648304, + "router_z_loss_mlp": 0.48608398, + "step": 1212, + "time_per_iteration": 2.725313901901245 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105588, + "balance_loss_mlp": 1.00709951, + "epoch": 0.2333589842247018, + "flos": 511289598720.0, + "grad_norm": 0.03371055024816449, + "language_loss": 0.84886169, + "learning_rate": 0.0008954247746732618, + "loss": 0.85942048, + "num_input_tokens_seen": 100717616, + "router_z_loss_mlp": 0.48754883, + "step": 1213, + "time_per_iteration": 2.592165470123291 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01057864, + "balance_loss_mlp": 1.00894058, + "epoch": 0.23355136590996536, + "flos": 664407216384.0, + "grad_norm": 0.030798488974581865, + "language_loss": 0.9124192, + "learning_rate": 0.0008952340307267837, + "loss": 0.92299783, + "num_input_tokens_seen": 100797056, + "router_z_loss_mlp": 0.48876953, + "step": 1214, + "time_per_iteration": 2.887542724609375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051914, + "balance_loss_mlp": 1.00332439, + "epoch": 0.23374374759522892, + "flos": 509465995008.0, + "grad_norm": 0.038631928770240895, + "language_loss": 0.8442086, + "learning_rate": 0.0008950431333368468, + "loss": 0.85472775, + "num_input_tokens_seen": 100863632, + "router_z_loss_mlp": 0.48583984, + "step": 1215, + "time_per_iteration": 2.5713701248168945 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051571, + "balance_loss_mlp": 1.00283849, + "epoch": 0.2339361292804925, + "flos": 1296429915648.0, + "grad_norm": 0.03446682830311694, + "language_loss": 0.8584398, + "learning_rate": 0.0008948520825775634, + "loss": 0.86895549, + "num_input_tokens_seen": 100950272, + "router_z_loss_mlp": 0.48706055, + "step": 1216, + "time_per_iteration": 3.631596565246582 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054055, + "balance_loss_mlp": 1.00541723, + "epoch": 0.23412851096575607, + "flos": 707177344512.0, + "grad_norm": 0.031791306217448204, + "language_loss": 0.84468639, + "learning_rate": 0.0008946608785231067, + "loss": 0.85522687, + "num_input_tokens_seen": 101031008, + "router_z_loss_mlp": 0.48632812, + "step": 1217, + "time_per_iteration": 2.878099203109741 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053689, + "balance_loss_mlp": 1.00517046, + "epoch": 0.23432089265101963, + "flos": 439175582208.0, + "grad_norm": 0.03486793229645632, + "language_loss": 0.85493773, + "learning_rate": 0.0008944695212477084, + "loss": 0.86547458, + "num_input_tokens_seen": 101094688, + "router_z_loss_mlp": 0.48510742, + "step": 1218, + "time_per_iteration": 2.5141704082489014 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053338, + "balance_loss_mlp": 1.00498641, + "epoch": 0.2345132743362832, + "flos": 481915574784.0, + "grad_norm": 0.03047714423600347, + "language_loss": 0.87145793, + "learning_rate": 0.0008942780108256599, + "loss": 0.88199133, + "num_input_tokens_seen": 101163744, + "router_z_loss_mlp": 0.48339844, + "step": 1219, + "time_per_iteration": 2.6020901203155518 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050373, + "balance_loss_mlp": 1.00180733, + "epoch": 0.23470565602154675, + "flos": 412341577728.0, + "grad_norm": 0.03328064907126118, + "language_loss": 0.87382472, + "learning_rate": 0.0008940863473313121, + "loss": 0.88432848, + "num_input_tokens_seen": 101226480, + "router_z_loss_mlp": 0.4855957, + "step": 1220, + "time_per_iteration": 2.4561610221862793 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053637, + "balance_loss_mlp": 1.00483322, + "epoch": 0.2348980377068103, + "flos": 546500906496.0, + "grad_norm": 0.04239569524538178, + "language_loss": 0.88751769, + "learning_rate": 0.0008938945308390756, + "loss": 0.89805412, + "num_input_tokens_seen": 101291824, + "router_z_loss_mlp": 0.48779297, + "step": 1221, + "time_per_iteration": 2.657763719558716 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01057653, + "balance_loss_mlp": 1.00906336, + "epoch": 0.23509041939207387, + "flos": 576843112704.0, + "grad_norm": 0.04482007629740174, + "language_loss": 0.88039029, + "learning_rate": 0.00089370256142342, + "loss": 0.89096677, + "num_input_tokens_seen": 101367216, + "router_z_loss_mlp": 0.48583984, + "step": 1222, + "time_per_iteration": 2.7348928451538086 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054727, + "balance_loss_mlp": 1.00616074, + "epoch": 0.23528280107733743, + "flos": 589948566528.0, + "grad_norm": 0.030112791330182954, + "language_loss": 0.85687798, + "learning_rate": 0.0008935104391588746, + "loss": 0.86742526, + "num_input_tokens_seen": 101438992, + "router_z_loss_mlp": 0.4855957, + "step": 1223, + "time_per_iteration": 2.7620511054992676 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052122, + "balance_loss_mlp": 1.00350857, + "epoch": 0.235475182762601, + "flos": 824858132736.0, + "grad_norm": 0.028710207733723417, + "language_loss": 0.83630896, + "learning_rate": 0.0008933181641200276, + "loss": 0.84683019, + "num_input_tokens_seen": 101534464, + "router_z_loss_mlp": 0.48608398, + "step": 1224, + "time_per_iteration": 3.1587913036346436 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053568, + "balance_loss_mlp": 1.00531197, + "epoch": 0.23566756444786457, + "flos": 681367902720.0, + "grad_norm": 0.03430983930689064, + "language_loss": 0.86561936, + "learning_rate": 0.0008931257363815271, + "loss": 0.87615514, + "num_input_tokens_seen": 101616496, + "router_z_loss_mlp": 0.48242188, + "step": 1225, + "time_per_iteration": 2.9277396202087402 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01056611, + "balance_loss_mlp": 1.00849795, + "epoch": 0.23585994613312813, + "flos": 703135474176.0, + "grad_norm": 0.029906055234585397, + "language_loss": 0.90256047, + "learning_rate": 0.0008929331560180798, + "loss": 0.91312659, + "num_input_tokens_seen": 101694496, + "router_z_loss_mlp": 0.48095703, + "step": 1226, + "time_per_iteration": 2.911451578140259 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055064, + "balance_loss_mlp": 1.00676012, + "epoch": 0.2360523278183917, + "flos": 525196038912.0, + "grad_norm": 0.030679819106685022, + "language_loss": 0.9186613, + "learning_rate": 0.0008927404231044525, + "loss": 0.92921197, + "num_input_tokens_seen": 101766160, + "router_z_loss_mlp": 0.48291016, + "step": 1227, + "time_per_iteration": 2.6848785877227783 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055842, + "balance_loss_mlp": 1.00756276, + "epoch": 0.23624470950365525, + "flos": 525443914752.0, + "grad_norm": 0.030207709240370546, + "language_loss": 0.82286787, + "learning_rate": 0.0008925475377154703, + "loss": 0.83342624, + "num_input_tokens_seen": 101844160, + "router_z_loss_mlp": 0.48266602, + "step": 1228, + "time_per_iteration": 2.7278709411621094 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01058098, + "balance_loss_mlp": 1.00974643, + "epoch": 0.2364370911889188, + "flos": 597961342464.0, + "grad_norm": 0.04301213480645635, + "language_loss": 0.82405227, + "learning_rate": 0.0008923544999260183, + "loss": 0.83463323, + "num_input_tokens_seen": 101917968, + "router_z_loss_mlp": 0.48339844, + "step": 1229, + "time_per_iteration": 2.7282724380493164 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055873, + "balance_loss_mlp": 1.00766432, + "epoch": 0.23662947287418237, + "flos": 758173131264.0, + "grad_norm": 0.03660169780759576, + "language_loss": 0.92488217, + "learning_rate": 0.00089216130981104, + "loss": 0.93544096, + "num_input_tokens_seen": 101996880, + "router_z_loss_mlp": 0.48193359, + "step": 1230, + "time_per_iteration": 3.0333714485168457 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051461, + "balance_loss_mlp": 1.00337219, + "epoch": 0.23682185455944593, + "flos": 547208573952.0, + "grad_norm": 0.03138155314794734, + "language_loss": 0.83336782, + "learning_rate": 0.000891967967445539, + "loss": 0.8438825, + "num_input_tokens_seen": 102067936, + "router_z_loss_mlp": 0.48071289, + "step": 1231, + "time_per_iteration": 2.7093472480773926 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053937, + "balance_loss_mlp": 1.00587165, + "epoch": 0.2370142362447095, + "flos": 663523604736.0, + "grad_norm": 0.02795314572038805, + "language_loss": 0.89439881, + "learning_rate": 0.0008917744729045772, + "loss": 0.90493822, + "num_input_tokens_seen": 102147552, + "router_z_loss_mlp": 0.48046875, + "step": 1232, + "time_per_iteration": 2.8760838508605957 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01057809, + "balance_loss_mlp": 1.00974393, + "epoch": 0.23720661792997308, + "flos": 684913042944.0, + "grad_norm": 0.03460859048974857, + "language_loss": 0.8446126, + "learning_rate": 0.0008915808262632757, + "loss": 0.85519075, + "num_input_tokens_seen": 102224480, + "router_z_loss_mlp": 0.48046875, + "step": 1233, + "time_per_iteration": 2.889141321182251 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01058483, + "balance_loss_mlp": 1.01058459, + "epoch": 0.23739899961523664, + "flos": 560023377408.0, + "grad_norm": 0.03296017154749467, + "language_loss": 0.94079709, + "learning_rate": 0.0008913870275968148, + "loss": 0.95138192, + "num_input_tokens_seen": 102297392, + "router_z_loss_mlp": 0.47875977, + "step": 1234, + "time_per_iteration": 2.7432892322540283 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054598, + "balance_loss_mlp": 1.00655627, + "epoch": 0.2375913813005002, + "flos": 891165000960.0, + "grad_norm": 0.03128077017401229, + "language_loss": 0.88428569, + "learning_rate": 0.0008911930769804342, + "loss": 0.89483166, + "num_input_tokens_seen": 102386032, + "router_z_loss_mlp": 0.48022461, + "step": 1235, + "time_per_iteration": 3.261483669281006 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105528, + "balance_loss_mlp": 1.00692844, + "epoch": 0.23778376298576376, + "flos": 642366491136.0, + "grad_norm": 0.029107844015886564, + "language_loss": 0.91850013, + "learning_rate": 0.0008909989744894318, + "loss": 0.92905295, + "num_input_tokens_seen": 102463504, + "router_z_loss_mlp": 0.48339844, + "step": 1236, + "time_per_iteration": 2.8673832416534424 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061412, + "balance_loss_mlp": 1.01287031, + "epoch": 0.23797614467102732, + "flos": 617946139392.0, + "grad_norm": 0.034095811880077646, + "language_loss": 0.82566786, + "learning_rate": 0.0008908047201991649, + "loss": 0.83628196, + "num_input_tokens_seen": 102529632, + "router_z_loss_mlp": 0.48535156, + "step": 1237, + "time_per_iteration": 2.7810442447662354 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053588, + "balance_loss_mlp": 1.00511789, + "epoch": 0.23816852635629088, + "flos": 625464130560.0, + "grad_norm": 0.032663011960307756, + "language_loss": 0.87081301, + "learning_rate": 0.0008906103141850502, + "loss": 0.88134885, + "num_input_tokens_seen": 102610192, + "router_z_loss_mlp": 0.48461914, + "step": 1238, + "time_per_iteration": 2.880305528640747 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052354, + "balance_loss_mlp": 1.00416911, + "epoch": 0.23836090804155444, + "flos": 522441191424.0, + "grad_norm": 0.03474425243888252, + "language_loss": 0.88862967, + "learning_rate": 0.0008904157565225621, + "loss": 0.89915323, + "num_input_tokens_seen": 102681216, + "router_z_loss_mlp": 0.48168945, + "step": 1239, + "time_per_iteration": 2.648766040802002 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052394, + "balance_loss_mlp": 1.00423324, + "epoch": 0.238553289726818, + "flos": 1155855892992.0, + "grad_norm": 0.034399895266541865, + "language_loss": 0.82445645, + "learning_rate": 0.000890221047287235, + "loss": 0.83498037, + "num_input_tokens_seen": 102777184, + "router_z_loss_mlp": 0.48144531, + "step": 1240, + "time_per_iteration": 3.5001280307769775 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055, + "balance_loss_mlp": 1.00703037, + "epoch": 0.23874567141208156, + "flos": 500910802176.0, + "grad_norm": 0.03306053891413694, + "language_loss": 0.91726851, + "learning_rate": 0.0008900261865546615, + "loss": 0.92781848, + "num_input_tokens_seen": 102845744, + "router_z_loss_mlp": 0.47949219, + "step": 1241, + "time_per_iteration": 2.6465680599212646 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052437, + "balance_loss_mlp": 1.00418115, + "epoch": 0.23893805309734514, + "flos": 558050074368.0, + "grad_norm": 0.0354259641755878, + "language_loss": 0.85598528, + "learning_rate": 0.0008898311744004936, + "loss": 0.86650962, + "num_input_tokens_seen": 102918064, + "router_z_loss_mlp": 0.48242188, + "step": 1242, + "time_per_iteration": 2.7268829345703125 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053623, + "balance_loss_mlp": 1.0055337, + "epoch": 0.2391304347826087, + "flos": 550317255168.0, + "grad_norm": 0.0320494810853186, + "language_loss": 0.87574649, + "learning_rate": 0.0008896360109004414, + "loss": 0.88628268, + "num_input_tokens_seen": 102983920, + "router_z_loss_mlp": 0.48071289, + "step": 1243, + "time_per_iteration": 2.6199252605438232 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050337, + "balance_loss_mlp": 1.00222456, + "epoch": 0.23932281646787226, + "flos": 517079250432.0, + "grad_norm": 0.0302458656306059, + "language_loss": 0.85177696, + "learning_rate": 0.0008894406961302742, + "loss": 0.86228031, + "num_input_tokens_seen": 103053328, + "router_z_loss_mlp": 0.48095703, + "step": 1244, + "time_per_iteration": 2.604508876800537 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052152, + "balance_loss_mlp": 1.00411069, + "epoch": 0.23951519815313582, + "flos": 745002548736.0, + "grad_norm": 0.03429303167053761, + "language_loss": 0.84712255, + "learning_rate": 0.0008892452301658201, + "loss": 0.85764414, + "num_input_tokens_seen": 103128208, + "router_z_loss_mlp": 0.48022461, + "step": 1245, + "time_per_iteration": 2.924288272857666 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054436, + "balance_loss_mlp": 1.00651395, + "epoch": 0.23970757983839938, + "flos": 555175663104.0, + "grad_norm": 0.03219666617279603, + "language_loss": 0.84054452, + "learning_rate": 0.0008890496130829653, + "loss": 0.85108888, + "num_input_tokens_seen": 103197392, + "router_z_loss_mlp": 0.47900391, + "step": 1246, + "time_per_iteration": 2.6700189113616943 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052243, + "balance_loss_mlp": 1.00441635, + "epoch": 0.23989996152366294, + "flos": 481618121472.0, + "grad_norm": 0.033578246726411604, + "language_loss": 0.86002076, + "learning_rate": 0.0008888538449576555, + "loss": 0.87054318, + "num_input_tokens_seen": 103265328, + "router_z_loss_mlp": 0.47802734, + "step": 1247, + "time_per_iteration": 2.6269826889038086 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01057022, + "balance_loss_mlp": 1.00886118, + "epoch": 0.2400923432089265, + "flos": 486281143296.0, + "grad_norm": 0.03580496599340432, + "language_loss": 0.83572984, + "learning_rate": 0.0008886579258658944, + "loss": 0.84630001, + "num_input_tokens_seen": 103331632, + "router_z_loss_mlp": 0.48144531, + "step": 1248, + "time_per_iteration": 2.577885389328003 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054529, + "balance_loss_mlp": 1.0065589, + "epoch": 0.24028472489419006, + "flos": 624793401600.0, + "grad_norm": 0.03296142515540601, + "language_loss": 0.85843956, + "learning_rate": 0.0008884618558837446, + "loss": 0.86898482, + "num_input_tokens_seen": 103405408, + "router_z_loss_mlp": 0.47949219, + "step": 1249, + "time_per_iteration": 2.874666929244995 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01056867, + "balance_loss_mlp": 1.00870681, + "epoch": 0.24047710657945365, + "flos": 602809056768.0, + "grad_norm": 0.033943651692576245, + "language_loss": 0.87474859, + "learning_rate": 0.0008882656350873273, + "loss": 0.88531733, + "num_input_tokens_seen": 103487216, + "router_z_loss_mlp": 0.48144531, + "step": 1250, + "time_per_iteration": 2.8647053241729736 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055385, + "balance_loss_mlp": 1.00748658, + "epoch": 0.2406694882647172, + "flos": 843001829376.0, + "grad_norm": 0.04142560607115463, + "language_loss": 0.87984931, + "learning_rate": 0.0008880692635528219, + "loss": 0.89040315, + "num_input_tokens_seen": 103568640, + "router_z_loss_mlp": 0.47875977, + "step": 1251, + "time_per_iteration": 3.0643107891082764 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105149, + "balance_loss_mlp": 1.00352037, + "epoch": 0.24086186994998077, + "flos": 528135578880.0, + "grad_norm": 0.03337559285192523, + "language_loss": 0.90356189, + "learning_rate": 0.0008878727413564669, + "loss": 0.91407681, + "num_input_tokens_seen": 103640784, + "router_z_loss_mlp": 0.47949219, + "step": 1252, + "time_per_iteration": 2.7680115699768066 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053848, + "balance_loss_mlp": 1.00826263, + "epoch": 0.24105425163524433, + "flos": 1341462028800.0, + "grad_norm": 0.009196650126926217, + "language_loss": 0.80135596, + "learning_rate": 0.0008876760685745588, + "loss": 0.81189448, + "num_input_tokens_seen": 103865824, + "router_z_loss_mlp": 0.45507812, + "step": 1253, + "time_per_iteration": 4.858070135116577 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054767, + "balance_loss_mlp": 1.00698781, + "epoch": 0.24124663332050789, + "flos": 615228230400.0, + "grad_norm": 0.036740782431925904, + "language_loss": 0.79496801, + "learning_rate": 0.0008874792452834528, + "loss": 0.80551577, + "num_input_tokens_seen": 103939872, + "router_z_loss_mlp": 0.47753906, + "step": 1254, + "time_per_iteration": 2.756243944168091 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01057417, + "balance_loss_mlp": 1.00954247, + "epoch": 0.24143901500577145, + "flos": 576593291520.0, + "grad_norm": 0.037714132300224086, + "language_loss": 0.87880921, + "learning_rate": 0.0008872822715595626, + "loss": 0.88938332, + "num_input_tokens_seen": 104011120, + "router_z_loss_mlp": 0.47851562, + "step": 1255, + "time_per_iteration": 2.6718733310699463 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01056118, + "balance_loss_mlp": 1.00812411, + "epoch": 0.241631396691035, + "flos": 496147658496.0, + "grad_norm": 0.038695693582970765, + "language_loss": 0.87873089, + "learning_rate": 0.0008870851474793598, + "loss": 0.88929206, + "num_input_tokens_seen": 104077040, + "router_z_loss_mlp": 0.47973633, + "step": 1256, + "time_per_iteration": 2.6313350200653076 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01058027, + "balance_loss_mlp": 1.009866, + "epoch": 0.24182377837629856, + "flos": 637397267712.0, + "grad_norm": 0.03630749648984725, + "language_loss": 0.904266, + "learning_rate": 0.0008868878731193752, + "loss": 0.9148463, + "num_input_tokens_seen": 104150880, + "router_z_loss_mlp": 0.48144531, + "step": 1257, + "time_per_iteration": 2.820671558380127 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052967, + "balance_loss_mlp": 1.00509274, + "epoch": 0.24201616006156215, + "flos": 516350195712.0, + "grad_norm": 0.04098435374075245, + "language_loss": 0.90631104, + "learning_rate": 0.0008866904485561973, + "loss": 0.91684067, + "num_input_tokens_seen": 104223696, + "router_z_loss_mlp": 0.47851562, + "step": 1258, + "time_per_iteration": 2.712970495223999 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053405, + "balance_loss_mlp": 1.0053165, + "epoch": 0.2422085417468257, + "flos": 616379159808.0, + "grad_norm": 0.03199149634406808, + "language_loss": 0.83463258, + "learning_rate": 0.000886492873866473, + "loss": 0.84516662, + "num_input_tokens_seen": 104301728, + "router_z_loss_mlp": 0.48071289, + "step": 1259, + "time_per_iteration": 2.8250985145568848 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051296, + "balance_loss_mlp": 1.00330269, + "epoch": 0.24240092343208927, + "flos": 586913762304.0, + "grad_norm": 0.03973618931504764, + "language_loss": 0.85183978, + "learning_rate": 0.000886295149126908, + "loss": 0.86235273, + "num_input_tokens_seen": 104374480, + "router_z_loss_mlp": 0.47973633, + "step": 1260, + "time_per_iteration": 2.7110049724578857 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051234, + "balance_loss_mlp": 1.00338328, + "epoch": 0.24259330511735283, + "flos": 763572010752.0, + "grad_norm": 0.03275678482299809, + "language_loss": 0.86485362, + "learning_rate": 0.0008860972744142655, + "loss": 0.87536597, + "num_input_tokens_seen": 104452384, + "router_z_loss_mlp": 0.47827148, + "step": 1261, + "time_per_iteration": 2.9053289890289307 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051398, + "balance_loss_mlp": 1.00361907, + "epoch": 0.2427856868026164, + "flos": 628134407424.0, + "grad_norm": 0.03196094686024711, + "language_loss": 0.82455611, + "learning_rate": 0.0008858992498053671, + "loss": 0.83507007, + "num_input_tokens_seen": 104532576, + "router_z_loss_mlp": 0.47753906, + "step": 1262, + "time_per_iteration": 2.8111376762390137 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054321, + "balance_loss_mlp": 1.00797272, + "epoch": 0.24297806848787995, + "flos": 1514922167808.0, + "grad_norm": 0.010120346862694057, + "language_loss": 0.7658875, + "learning_rate": 0.0008857010753770934, + "loss": 0.77643073, + "num_input_tokens_seen": 104765216, + "router_z_loss_mlp": 0.46289062, + "step": 1263, + "time_per_iteration": 4.84857177734375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052202, + "balance_loss_mlp": 1.00420785, + "epoch": 0.2431704501731435, + "flos": 543073384704.0, + "grad_norm": 0.030775668427347653, + "language_loss": 0.83837479, + "learning_rate": 0.0008855027512063817, + "loss": 0.84889686, + "num_input_tokens_seen": 104836912, + "router_z_loss_mlp": 0.47973633, + "step": 1264, + "time_per_iteration": 2.69954252243042 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055608, + "balance_loss_mlp": 1.0077095, + "epoch": 0.24336283185840707, + "flos": 524879143680.0, + "grad_norm": 0.03906981412635217, + "language_loss": 0.86655742, + "learning_rate": 0.0008853042773702292, + "loss": 0.87711346, + "num_input_tokens_seen": 104909280, + "router_z_loss_mlp": 0.47875977, + "step": 1265, + "time_per_iteration": 2.703227996826172 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053016, + "balance_loss_mlp": 1.00530863, + "epoch": 0.24355521354367063, + "flos": 538206228480.0, + "grad_norm": 0.030917867079500824, + "language_loss": 0.88497615, + "learning_rate": 0.0008851056539456896, + "loss": 0.89550632, + "num_input_tokens_seen": 104982560, + "router_z_loss_mlp": 0.47680664, + "step": 1266, + "time_per_iteration": 2.6844840049743652 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054186, + "balance_loss_mlp": 1.00655031, + "epoch": 0.24374759522893422, + "flos": 932109580032.0, + "grad_norm": 0.032880300158599975, + "language_loss": 0.82697207, + "learning_rate": 0.0008849068810098755, + "loss": 0.83751392, + "num_input_tokens_seen": 105075056, + "router_z_loss_mlp": 0.47607422, + "step": 1267, + "time_per_iteration": 3.274641513824463 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055314, + "balance_loss_mlp": 1.00789249, + "epoch": 0.24393997691419778, + "flos": 428685970176.0, + "grad_norm": 0.04273651221625489, + "language_loss": 0.84108871, + "learning_rate": 0.0008847079586399575, + "loss": 0.85164183, + "num_input_tokens_seen": 105137536, + "router_z_loss_mlp": 0.47387695, + "step": 1268, + "time_per_iteration": 2.475217819213867 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01057264, + "balance_loss_mlp": 1.00993788, + "epoch": 0.24413235859946134, + "flos": 579943045632.0, + "grad_norm": 0.03463136192779687, + "language_loss": 0.86878628, + "learning_rate": 0.0008845088869131641, + "loss": 0.87935889, + "num_input_tokens_seen": 105204848, + "router_z_loss_mlp": 0.47290039, + "step": 1269, + "time_per_iteration": 2.676954746246338 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054221, + "balance_loss_mlp": 1.00689447, + "epoch": 0.2443247402847249, + "flos": 530901120000.0, + "grad_norm": 0.04739098518835349, + "language_loss": 0.8972156, + "learning_rate": 0.0008843096659067818, + "loss": 0.90775776, + "num_input_tokens_seen": 105273456, + "router_z_loss_mlp": 0.47290039, + "step": 1270, + "time_per_iteration": 2.6031625270843506 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01056384, + "balance_loss_mlp": 1.00896251, + "epoch": 0.24451712196998845, + "flos": 697625779200.0, + "grad_norm": 0.03005687387855686, + "language_loss": 0.8676796, + "learning_rate": 0.000884110295698155, + "loss": 0.87824345, + "num_input_tokens_seen": 105355488, + "router_z_loss_mlp": 0.47387695, + "step": 1271, + "time_per_iteration": 2.946385145187378 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052146, + "balance_loss_mlp": 1.00460577, + "epoch": 0.24470950365525201, + "flos": 530864181504.0, + "grad_norm": 0.03542850047119753, + "language_loss": 0.86657912, + "learning_rate": 0.0008839107763646861, + "loss": 0.87710059, + "num_input_tokens_seen": 105421568, + "router_z_loss_mlp": 0.47509766, + "step": 1272, + "time_per_iteration": 2.6175343990325928 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01057047, + "balance_loss_mlp": 1.00955379, + "epoch": 0.24490188534051557, + "flos": 492348806400.0, + "grad_norm": 0.04294337139782129, + "language_loss": 0.9099223, + "learning_rate": 0.0008837111079838353, + "loss": 0.92049271, + "num_input_tokens_seen": 105493072, + "router_z_loss_mlp": 0.47460938, + "step": 1273, + "time_per_iteration": 2.699777126312256 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051504, + "balance_loss_mlp": 1.00393975, + "epoch": 0.24509426702577913, + "flos": 475112054016.0, + "grad_norm": 0.03233839715385124, + "language_loss": 0.90686411, + "learning_rate": 0.000883511290633121, + "loss": 0.91737914, + "num_input_tokens_seen": 105559840, + "router_z_loss_mlp": 0.4753418, + "step": 1274, + "time_per_iteration": 2.5347506999969482 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053736, + "balance_loss_mlp": 1.0061239, + "epoch": 0.24528664871104272, + "flos": 551648019456.0, + "grad_norm": 0.029596958484994024, + "language_loss": 0.9283247, + "learning_rate": 0.000883311324390119, + "loss": 0.93886209, + "num_input_tokens_seen": 105634448, + "router_z_loss_mlp": 0.47583008, + "step": 1275, + "time_per_iteration": 2.7105162143707275 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105458, + "balance_loss_mlp": 1.00703931, + "epoch": 0.24547903039630628, + "flos": 827336914176.0, + "grad_norm": 0.04026092464880397, + "language_loss": 0.8227402, + "learning_rate": 0.0008831112093324629, + "loss": 0.83328599, + "num_input_tokens_seen": 105711936, + "router_z_loss_mlp": 0.47509766, + "step": 1276, + "time_per_iteration": 3.0518436431884766 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052804, + "balance_loss_mlp": 1.00523984, + "epoch": 0.24567141208156984, + "flos": 592694665728.0, + "grad_norm": 0.0350541873914122, + "language_loss": 0.89993191, + "learning_rate": 0.0008829109455378444, + "loss": 0.91045994, + "num_input_tokens_seen": 105780240, + "router_z_loss_mlp": 0.4753418, + "step": 1277, + "time_per_iteration": 2.705888032913208 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053461, + "balance_loss_mlp": 1.00606322, + "epoch": 0.2458637937668334, + "flos": 548930110464.0, + "grad_norm": 0.03225743101348484, + "language_loss": 0.87107539, + "learning_rate": 0.000882710533084013, + "loss": 0.88161004, + "num_input_tokens_seen": 105849840, + "router_z_loss_mlp": 0.47363281, + "step": 1278, + "time_per_iteration": 2.6600000858306885 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051675, + "balance_loss_mlp": 1.00418186, + "epoch": 0.24605617545209696, + "flos": 516912054528.0, + "grad_norm": 0.031446449457072034, + "language_loss": 0.89965951, + "learning_rate": 0.0008825099720487755, + "loss": 0.91017628, + "num_input_tokens_seen": 105921488, + "router_z_loss_mlp": 0.47460938, + "step": 1279, + "time_per_iteration": 2.6381545066833496 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059078, + "balance_loss_mlp": 1.01320648, + "epoch": 0.24624855713736052, + "flos": 1515061173504.0, + "grad_norm": 0.006597619453236458, + "language_loss": 0.7526114, + "learning_rate": 0.0008823092625099967, + "loss": 0.76320213, + "num_input_tokens_seen": 106146816, + "router_z_loss_mlp": 0.45800781, + "step": 1280, + "time_per_iteration": 4.836413621902466 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01056755, + "balance_loss_mlp": 1.0109787, + "epoch": 0.24644093882262408, + "flos": 1530749421312.0, + "grad_norm": 0.006438131933853504, + "language_loss": 0.77944112, + "learning_rate": 0.0008821084045455987, + "loss": 0.79000866, + "num_input_tokens_seen": 106361568, + "router_z_loss_mlp": 0.45703125, + "step": 1281, + "time_per_iteration": 4.763012409210205 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055673, + "balance_loss_mlp": 1.00817966, + "epoch": 0.24663332050788764, + "flos": 660349794816.0, + "grad_norm": 0.03366863359794558, + "language_loss": 0.89743239, + "learning_rate": 0.0008819073982335619, + "loss": 0.90798908, + "num_input_tokens_seen": 106435296, + "router_z_loss_mlp": 0.47460938, + "step": 1282, + "time_per_iteration": 2.830066204071045 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051734, + "balance_loss_mlp": 1.00426519, + "epoch": 0.24682570219315123, + "flos": 542806066944.0, + "grad_norm": 0.034270358372240205, + "language_loss": 0.85323066, + "learning_rate": 0.0008817062436519235, + "loss": 0.86374807, + "num_input_tokens_seen": 106507184, + "router_z_loss_mlp": 0.47436523, + "step": 1283, + "time_per_iteration": 2.6451101303100586 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054467, + "balance_loss_mlp": 1.00680709, + "epoch": 0.24701808387841478, + "flos": 441659221248.0, + "grad_norm": 0.03422998600893363, + "language_loss": 0.90367711, + "learning_rate": 0.0008815049408787788, + "loss": 0.91422176, + "num_input_tokens_seen": 106571472, + "router_z_loss_mlp": 0.47631836, + "step": 1284, + "time_per_iteration": 2.5568699836730957 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054482, + "balance_loss_mlp": 1.00672722, + "epoch": 0.24721046556367834, + "flos": 469033697280.0, + "grad_norm": 0.036620952447016124, + "language_loss": 0.86045629, + "learning_rate": 0.0008813034899922805, + "loss": 0.87100112, + "num_input_tokens_seen": 106638368, + "router_z_loss_mlp": 0.47729492, + "step": 1285, + "time_per_iteration": 2.5571885108947754 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052621, + "balance_loss_mlp": 1.00498545, + "epoch": 0.2474028472489419, + "flos": 505408573440.0, + "grad_norm": 0.03938899634346209, + "language_loss": 0.90811062, + "learning_rate": 0.0008811018910706387, + "loss": 0.91863692, + "num_input_tokens_seen": 106705312, + "router_z_loss_mlp": 0.47607422, + "step": 1286, + "time_per_iteration": 2.5542702674865723 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105262, + "balance_loss_mlp": 1.00496054, + "epoch": 0.24759522893420546, + "flos": 480956140800.0, + "grad_norm": 0.04329385189604929, + "language_loss": 0.82886434, + "learning_rate": 0.0008809001441921211, + "loss": 0.83939052, + "num_input_tokens_seen": 106778624, + "router_z_loss_mlp": 0.47631836, + "step": 1287, + "time_per_iteration": 2.7426302433013916 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01056359, + "balance_loss_mlp": 1.00879443, + "epoch": 0.24778761061946902, + "flos": 534754407168.0, + "grad_norm": 0.03495005483538565, + "language_loss": 0.86372733, + "learning_rate": 0.0008806982494350528, + "loss": 0.87429094, + "num_input_tokens_seen": 106847744, + "router_z_loss_mlp": 0.4753418, + "step": 1288, + "time_per_iteration": 2.6200613975524902 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054037, + "balance_loss_mlp": 1.0063771, + "epoch": 0.24797999230473258, + "flos": 560943927552.0, + "grad_norm": 0.028534619779485338, + "language_loss": 0.90820038, + "learning_rate": 0.0008804962068778161, + "loss": 0.91874075, + "num_input_tokens_seen": 106927584, + "router_z_loss_mlp": 0.47631836, + "step": 1289, + "time_per_iteration": 2.8445866107940674 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050603, + "balance_loss_mlp": 1.00287127, + "epoch": 0.24817237398999614, + "flos": 625481627136.0, + "grad_norm": 0.033144052318390974, + "language_loss": 0.81476247, + "learning_rate": 0.0008802940165988511, + "loss": 0.82526851, + "num_input_tokens_seen": 107006656, + "router_z_loss_mlp": 0.47705078, + "step": 1290, + "time_per_iteration": 2.874469518661499 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052663, + "balance_loss_mlp": 1.00500298, + "epoch": 0.2483647556752597, + "flos": 613485306624.0, + "grad_norm": 0.033485904546120666, + "language_loss": 0.88976955, + "learning_rate": 0.000880091678676655, + "loss": 0.90029621, + "num_input_tokens_seen": 107084352, + "router_z_loss_mlp": 0.47631836, + "step": 1291, + "time_per_iteration": 2.8294923305511475 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049235, + "balance_loss_mlp": 1.00159943, + "epoch": 0.2485571373605233, + "flos": 584688692736.0, + "grad_norm": 0.030875088012072577, + "language_loss": 0.89826584, + "learning_rate": 0.0008798891931897821, + "loss": 0.90875816, + "num_input_tokens_seen": 107158368, + "router_z_loss_mlp": 0.47607422, + "step": 1292, + "time_per_iteration": 2.7068471908569336 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050902, + "balance_loss_mlp": 1.00359952, + "epoch": 0.24874951904578685, + "flos": 495737444352.0, + "grad_norm": 0.03670876005724945, + "language_loss": 0.84959131, + "learning_rate": 0.0008796865602168447, + "loss": 0.86010033, + "num_input_tokens_seen": 107224256, + "router_z_loss_mlp": 0.47265625, + "step": 1293, + "time_per_iteration": 2.550218343734741 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052588, + "balance_loss_mlp": 1.00526226, + "epoch": 0.2489419007310504, + "flos": 457174437120.0, + "grad_norm": 0.03243940706171699, + "language_loss": 0.89144397, + "learning_rate": 0.0008794837798365115, + "loss": 0.90196991, + "num_input_tokens_seen": 107292720, + "router_z_loss_mlp": 0.47290039, + "step": 1294, + "time_per_iteration": 2.6271979808807373 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051557, + "balance_loss_mlp": 1.00420678, + "epoch": 0.24913428241631397, + "flos": 486565957632.0, + "grad_norm": 0.03268946967982851, + "language_loss": 0.89255542, + "learning_rate": 0.0008792808521275089, + "loss": 0.90307105, + "num_input_tokens_seen": 107368576, + "router_z_loss_mlp": 0.47314453, + "step": 1295, + "time_per_iteration": 2.733107566833496 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052983, + "balance_loss_mlp": 1.00544262, + "epoch": 0.24932666410157753, + "flos": 519918668544.0, + "grad_norm": 0.031266052737173484, + "language_loss": 0.88015056, + "learning_rate": 0.0008790777771686206, + "loss": 0.89068043, + "num_input_tokens_seen": 107433856, + "router_z_loss_mlp": 0.47509766, + "step": 1296, + "time_per_iteration": 2.5860161781311035 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053628, + "balance_loss_mlp": 1.0059917, + "epoch": 0.2495190457868411, + "flos": 473557713408.0, + "grad_norm": 0.03428757295266267, + "language_loss": 0.86048388, + "learning_rate": 0.0008788745550386872, + "loss": 0.8710202, + "num_input_tokens_seen": 107500944, + "router_z_loss_mlp": 0.47607422, + "step": 1297, + "time_per_iteration": 2.599851608276367 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055496, + "balance_loss_mlp": 1.00776434, + "epoch": 0.24971142747210465, + "flos": 747199428096.0, + "grad_norm": 0.03345883603952397, + "language_loss": 0.80858141, + "learning_rate": 0.0008786711858166063, + "loss": 0.81913638, + "num_input_tokens_seen": 107580000, + "router_z_loss_mlp": 0.47705078, + "step": 1298, + "time_per_iteration": 2.9357736110687256 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055455, + "balance_loss_mlp": 1.00770009, + "epoch": 0.2499038091573682, + "flos": 750903015936.0, + "grad_norm": 0.03503874681650984, + "language_loss": 0.84951854, + "learning_rate": 0.0008784676695813332, + "loss": 0.86007309, + "num_input_tokens_seen": 107660384, + "router_z_loss_mlp": 0.47729492, + "step": 1299, + "time_per_iteration": 2.955172538757324 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055374, + "balance_loss_mlp": 1.00776184, + "epoch": 0.2500961908426318, + "flos": 746344006656.0, + "grad_norm": 0.032686560936085865, + "language_loss": 0.85840905, + "learning_rate": 0.0008782640064118796, + "loss": 0.86896276, + "num_input_tokens_seen": 107736320, + "router_z_loss_mlp": 0.47583008, + "step": 1300, + "time_per_iteration": 2.897998571395874 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055206, + "balance_loss_mlp": 1.00904846, + "epoch": 0.2502885725278953, + "flos": 1420526353152.0, + "grad_norm": 0.0075534145797937526, + "language_loss": 0.7618475, + "learning_rate": 0.0008780601963873149, + "loss": 0.77239954, + "num_input_tokens_seen": 107972608, + "router_z_loss_mlp": 0.4609375, + "step": 1301, + "time_per_iteration": 5.023081541061401 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105668, + "balance_loss_mlp": 1.00904393, + "epoch": 0.2504809542131589, + "flos": 516232577280.0, + "grad_norm": 0.03748206036604932, + "language_loss": 0.87484509, + "learning_rate": 0.0008778562395867648, + "loss": 0.88541192, + "num_input_tokens_seen": 108043312, + "router_z_loss_mlp": 0.47607422, + "step": 1302, + "time_per_iteration": 2.593972682952881 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105251, + "balance_loss_mlp": 1.00477886, + "epoch": 0.25067333589842244, + "flos": 526852446720.0, + "grad_norm": 0.031223058919554587, + "language_loss": 0.84117836, + "learning_rate": 0.0008776521360894127, + "loss": 0.85170352, + "num_input_tokens_seen": 108114144, + "router_z_loss_mlp": 0.47705078, + "step": 1303, + "time_per_iteration": 2.6153149604797363 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069588, + "balance_loss_mlp": 1.02342987, + "epoch": 0.25086571758368603, + "flos": 1477160146944.0, + "grad_norm": 0.014969332736355754, + "language_loss": 0.78962064, + "learning_rate": 0.0008774478859744984, + "loss": 0.80031657, + "num_input_tokens_seen": 108338720, + "router_z_loss_mlp": 0.4609375, + "step": 1304, + "time_per_iteration": 4.792739629745483 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053769, + "balance_loss_mlp": 1.00649047, + "epoch": 0.2510580992689496, + "flos": 529403159808.0, + "grad_norm": 0.03453306909815573, + "language_loss": 0.91369265, + "learning_rate": 0.0008772434893213186, + "loss": 0.92423034, + "num_input_tokens_seen": 108405456, + "router_z_loss_mlp": 0.47241211, + "step": 1305, + "time_per_iteration": 2.581268072128296 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01056214, + "balance_loss_mlp": 1.00919807, + "epoch": 0.25125048095421315, + "flos": 518466395136.0, + "grad_norm": 0.035319884850533015, + "language_loss": 0.84733635, + "learning_rate": 0.0008770389462092276, + "loss": 0.85789847, + "num_input_tokens_seen": 108474368, + "router_z_loss_mlp": 0.46972656, + "step": 1306, + "time_per_iteration": 2.627317428588867 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01056702, + "balance_loss_mlp": 1.00951862, + "epoch": 0.25144286263947674, + "flos": 621675972096.0, + "grad_norm": 0.03558379494917989, + "language_loss": 0.87486076, + "learning_rate": 0.0008768342567176357, + "loss": 0.88542777, + "num_input_tokens_seen": 108548864, + "router_z_loss_mlp": 0.47143555, + "step": 1307, + "time_per_iteration": 2.787318706512451 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052534, + "balance_loss_mlp": 1.00537527, + "epoch": 0.25163524432474027, + "flos": 504866156544.0, + "grad_norm": 0.03616031366836922, + "language_loss": 0.9109531, + "learning_rate": 0.0008766294209260107, + "loss": 0.92147839, + "num_input_tokens_seen": 108623072, + "router_z_loss_mlp": 0.47119141, + "step": 1308, + "time_per_iteration": 2.6384546756744385 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105248, + "balance_loss_mlp": 1.00510657, + "epoch": 0.25182762601000386, + "flos": 510080343552.0, + "grad_norm": 0.03702737725286332, + "language_loss": 0.92033225, + "learning_rate": 0.0008764244389138767, + "loss": 0.93085706, + "num_input_tokens_seen": 108690128, + "router_z_loss_mlp": 0.47338867, + "step": 1309, + "time_per_iteration": 2.5620551109313965 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053435, + "balance_loss_mlp": 1.006037, + "epoch": 0.2520200076952674, + "flos": 635098321152.0, + "grad_norm": 0.03928250470986306, + "language_loss": 0.83104628, + "learning_rate": 0.000876219310760815, + "loss": 0.84158063, + "num_input_tokens_seen": 108770272, + "router_z_loss_mlp": 0.47363281, + "step": 1310, + "time_per_iteration": 2.886335849761963 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053423, + "balance_loss_mlp": 1.00614405, + "epoch": 0.252212389380531, + "flos": 495652873728.0, + "grad_norm": 0.03544669215118347, + "language_loss": 0.82256365, + "learning_rate": 0.0008760140365464631, + "loss": 0.83309782, + "num_input_tokens_seen": 108840592, + "router_z_loss_mlp": 0.47241211, + "step": 1311, + "time_per_iteration": 2.607191801071167 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053107, + "balance_loss_mlp": 1.00592351, + "epoch": 0.2524047710657945, + "flos": 491530323456.0, + "grad_norm": 0.037974131054051216, + "language_loss": 0.87817502, + "learning_rate": 0.0008758086163505156, + "loss": 0.88870609, + "num_input_tokens_seen": 108910064, + "router_z_loss_mlp": 0.47143555, + "step": 1312, + "time_per_iteration": 2.6121339797973633 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052408, + "balance_loss_mlp": 1.00505757, + "epoch": 0.2525971527510581, + "flos": 648613989120.0, + "grad_norm": 0.03226827566126977, + "language_loss": 0.90228277, + "learning_rate": 0.0008756030502527239, + "loss": 0.91280687, + "num_input_tokens_seen": 108986336, + "router_z_loss_mlp": 0.47314453, + "step": 1313, + "time_per_iteration": 2.8256115913391113 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049721, + "balance_loss_mlp": 1.00234711, + "epoch": 0.2527895344363217, + "flos": 570373983744.0, + "grad_norm": 0.0325160066751772, + "language_loss": 0.907884, + "learning_rate": 0.0008753973383328954, + "loss": 0.91838121, + "num_input_tokens_seen": 109059712, + "router_z_loss_mlp": 0.47338867, + "step": 1314, + "time_per_iteration": 2.722231388092041 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051583, + "balance_loss_mlp": 1.00423265, + "epoch": 0.2529819161215852, + "flos": 515069008896.0, + "grad_norm": 0.040482030139478604, + "language_loss": 0.8500945, + "learning_rate": 0.0008751914806708952, + "loss": 0.86061025, + "num_input_tokens_seen": 109127504, + "router_z_loss_mlp": 0.47314453, + "step": 1315, + "time_per_iteration": 2.593076229095459 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051184, + "balance_loss_mlp": 1.00376213, + "epoch": 0.2531742978068488, + "flos": 532351448064.0, + "grad_norm": 0.03414491036051862, + "language_loss": 0.82694548, + "learning_rate": 0.0008749854773466439, + "loss": 0.8374573, + "num_input_tokens_seen": 109198080, + "router_z_loss_mlp": 0.47387695, + "step": 1316, + "time_per_iteration": 2.660116672515869 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054789, + "balance_loss_mlp": 1.00722456, + "epoch": 0.25336667949211233, + "flos": 597748459776.0, + "grad_norm": 0.03206754273868493, + "language_loss": 0.84984171, + "learning_rate": 0.0008747793284401192, + "loss": 0.86038959, + "num_input_tokens_seen": 109268368, + "router_z_loss_mlp": 0.4753418, + "step": 1317, + "time_per_iteration": 2.692183017730713 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105185, + "balance_loss_mlp": 1.00407052, + "epoch": 0.2535590611773759, + "flos": 603256209408.0, + "grad_norm": 0.034288977750124294, + "language_loss": 0.85941386, + "learning_rate": 0.0008745730340313551, + "loss": 0.86993235, + "num_input_tokens_seen": 109344112, + "router_z_loss_mlp": 0.47753906, + "step": 1318, + "time_per_iteration": 2.7932682037353516 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105299, + "balance_loss_mlp": 1.00525868, + "epoch": 0.25375144286263945, + "flos": 496323602688.0, + "grad_norm": 0.035249055653748196, + "language_loss": 0.8522734, + "learning_rate": 0.0008743665942004422, + "loss": 0.86280334, + "num_input_tokens_seen": 109414112, + "router_z_loss_mlp": 0.47705078, + "step": 1319, + "time_per_iteration": 2.6616318225860596 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052413, + "balance_loss_mlp": 1.00465751, + "epoch": 0.25394382454790304, + "flos": 513477729792.0, + "grad_norm": 0.032623992793633046, + "language_loss": 0.93257391, + "learning_rate": 0.0008741600090275277, + "loss": 0.94309807, + "num_input_tokens_seen": 109484336, + "router_z_loss_mlp": 0.47729492, + "step": 1320, + "time_per_iteration": 2.567985773086548 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051086, + "balance_loss_mlp": 1.00333035, + "epoch": 0.25413620623316663, + "flos": 960856616448.0, + "grad_norm": 0.03465281335593922, + "language_loss": 0.8488484, + "learning_rate": 0.0008739532785928151, + "loss": 0.85935926, + "num_input_tokens_seen": 109590128, + "router_z_loss_mlp": 0.47729492, + "step": 1321, + "time_per_iteration": 3.4506430625915527 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054222, + "balance_loss_mlp": 1.00882721, + "epoch": 0.25432858791843016, + "flos": 1580651625984.0, + "grad_norm": 0.01348888133328934, + "language_loss": 0.74893582, + "learning_rate": 0.0008737464029765639, + "loss": 0.75947809, + "num_input_tokens_seen": 109816592, + "router_z_loss_mlp": 0.453125, + "step": 1322, + "time_per_iteration": 4.819811820983887 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055371, + "balance_loss_mlp": 1.00752044, + "epoch": 0.25452096960369375, + "flos": 584894772480.0, + "grad_norm": 0.03690210205672512, + "language_loss": 0.83839363, + "learning_rate": 0.0008735393822590908, + "loss": 0.84894735, + "num_input_tokens_seen": 109890464, + "router_z_loss_mlp": 0.47827148, + "step": 1323, + "time_per_iteration": 2.680769681930542 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069306, + "balance_loss_mlp": 1.02138364, + "epoch": 0.2547133512889573, + "flos": 509641939200.0, + "grad_norm": 0.03795743442729459, + "language_loss": 0.87760162, + "learning_rate": 0.0008733322165207681, + "loss": 0.8882947, + "num_input_tokens_seen": 109963408, + "router_z_loss_mlp": 0.47900391, + "step": 1324, + "time_per_iteration": 2.6391303539276123 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01056249, + "balance_loss_mlp": 1.00856507, + "epoch": 0.25490573297422087, + "flos": 784037008128.0, + "grad_norm": 0.03625483542623235, + "language_loss": 0.83670151, + "learning_rate": 0.0008731249058420247, + "loss": 0.84726399, + "num_input_tokens_seen": 110048800, + "router_z_loss_mlp": 0.4765625, + "step": 1325, + "time_per_iteration": 3.0179827213287354 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062074, + "balance_loss_mlp": 1.01479542, + "epoch": 0.2550981146594844, + "flos": 510953261568.0, + "grad_norm": 0.03728184694741104, + "language_loss": 0.91373062, + "learning_rate": 0.0008729174503033459, + "loss": 0.92435133, + "num_input_tokens_seen": 110118096, + "router_z_loss_mlp": 0.47241211, + "step": 1326, + "time_per_iteration": 2.644351005554199 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059853, + "balance_loss_mlp": 1.01262248, + "epoch": 0.255290496344748, + "flos": 677931632640.0, + "grad_norm": 0.04262364220636159, + "language_loss": 0.83700824, + "learning_rate": 0.0008727098499852728, + "loss": 0.84760678, + "num_input_tokens_seen": 110190160, + "router_z_loss_mlp": 0.47192383, + "step": 1327, + "time_per_iteration": 2.8393821716308594 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059289, + "balance_loss_mlp": 1.01212943, + "epoch": 0.2554828780300115, + "flos": 538985827584.0, + "grad_norm": 0.0346626903619469, + "language_loss": 0.90499496, + "learning_rate": 0.0008725021049684034, + "loss": 0.91558784, + "num_input_tokens_seen": 110268000, + "router_z_loss_mlp": 0.47119141, + "step": 1328, + "time_per_iteration": 2.74480938911438 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052564, + "balance_loss_mlp": 1.00554764, + "epoch": 0.2556752597152751, + "flos": 825624125952.0, + "grad_norm": 0.0321884383853499, + "language_loss": 0.83690739, + "learning_rate": 0.000872294215333391, + "loss": 0.84743297, + "num_input_tokens_seen": 110354816, + "router_z_loss_mlp": 0.46972656, + "step": 1329, + "time_per_iteration": 3.177448034286499 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066156, + "balance_loss_mlp": 1.01880646, + "epoch": 0.2558676414005387, + "flos": 571891385856.0, + "grad_norm": 0.037080167806849716, + "language_loss": 0.84060931, + "learning_rate": 0.0008720861811609457, + "loss": 0.85127091, + "num_input_tokens_seen": 110427968, + "router_z_loss_mlp": 0.47314453, + "step": 1330, + "time_per_iteration": 2.7320711612701416 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054633, + "balance_loss_mlp": 1.00745046, + "epoch": 0.2560600230858022, + "flos": 487748967936.0, + "grad_norm": 0.03498979971426328, + "language_loss": 0.84052318, + "learning_rate": 0.0008718780025318338, + "loss": 0.85106957, + "num_input_tokens_seen": 110501184, + "router_z_loss_mlp": 0.47143555, + "step": 1331, + "time_per_iteration": 2.7297112941741943 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053924, + "balance_loss_mlp": 1.00705111, + "epoch": 0.2562524047710658, + "flos": 514120268544.0, + "grad_norm": 0.03699782349212247, + "language_loss": 0.84697664, + "learning_rate": 0.0008716696795268771, + "loss": 0.85751587, + "num_input_tokens_seen": 110573008, + "router_z_loss_mlp": 0.46826172, + "step": 1332, + "time_per_iteration": 2.6615397930145264 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054318, + "balance_loss_mlp": 1.00756466, + "epoch": 0.25644478645632934, + "flos": 636110244864.0, + "grad_norm": 0.03600089626817585, + "language_loss": 0.85914254, + "learning_rate": 0.0008714612122269538, + "loss": 0.86968577, + "num_input_tokens_seen": 110646704, + "router_z_loss_mlp": 0.46704102, + "step": 1333, + "time_per_iteration": 2.849813938140869 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01056443, + "balance_loss_mlp": 1.00968957, + "epoch": 0.25663716814159293, + "flos": 437545419264.0, + "grad_norm": 0.03932780780666976, + "language_loss": 0.90516675, + "learning_rate": 0.0008712526007129982, + "loss": 0.91573119, + "num_input_tokens_seen": 110712208, + "router_z_loss_mlp": 0.46704102, + "step": 1334, + "time_per_iteration": 2.520730972290039 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053556, + "balance_loss_mlp": 1.00675464, + "epoch": 0.25682954982685646, + "flos": 499243700736.0, + "grad_norm": 0.03395243638019146, + "language_loss": 0.9133085, + "learning_rate": 0.0008710438450660003, + "loss": 0.9238441, + "num_input_tokens_seen": 110783936, + "router_z_loss_mlp": 0.4675293, + "step": 1335, + "time_per_iteration": 2.6936721801757812 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053065, + "balance_loss_mlp": 1.00590599, + "epoch": 0.25702193151212005, + "flos": 458628655872.0, + "grad_norm": 0.038911849114865095, + "language_loss": 0.8791827, + "learning_rate": 0.0008708349453670064, + "loss": 0.88971329, + "num_input_tokens_seen": 110848560, + "router_z_loss_mlp": 0.47119141, + "step": 1336, + "time_per_iteration": 2.520390510559082 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074594, + "balance_loss_mlp": 1.02733934, + "epoch": 0.2572143131973836, + "flos": 599404867584.0, + "grad_norm": 0.03723585257139378, + "language_loss": 0.92015922, + "learning_rate": 0.0008706259016971185, + "loss": 0.93090516, + "num_input_tokens_seen": 110922672, + "router_z_loss_mlp": 0.47216797, + "step": 1337, + "time_per_iteration": 2.792436361312866 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055792, + "balance_loss_mlp": 1.00872791, + "epoch": 0.25740669488264717, + "flos": 699527150592.0, + "grad_norm": 0.04259016947882448, + "language_loss": 0.8355068, + "learning_rate": 0.0008704167141374944, + "loss": 0.84606469, + "num_input_tokens_seen": 110995456, + "router_z_loss_mlp": 0.47021484, + "step": 1338, + "time_per_iteration": 2.806931972503662 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01056758, + "balance_loss_mlp": 1.01014686, + "epoch": 0.25759907656791076, + "flos": 503378889984.0, + "grad_norm": 0.03686560218677495, + "language_loss": 0.88890558, + "learning_rate": 0.0008702073827693482, + "loss": 0.89947319, + "num_input_tokens_seen": 111069568, + "router_z_loss_mlp": 0.46557617, + "step": 1339, + "time_per_iteration": 2.7613115310668945 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01057595, + "balance_loss_mlp": 1.01112759, + "epoch": 0.2577914582531743, + "flos": 775242687744.0, + "grad_norm": 0.03484469931885578, + "language_loss": 0.89865053, + "learning_rate": 0.0008699979076739494, + "loss": 0.90922654, + "num_input_tokens_seen": 111142608, + "router_z_loss_mlp": 0.46411133, + "step": 1340, + "time_per_iteration": 2.9694418907165527 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052163, + "balance_loss_mlp": 1.00552797, + "epoch": 0.2579838399384379, + "flos": 460610707200.0, + "grad_norm": 0.04216529081594553, + "language_loss": 0.89380765, + "learning_rate": 0.0008697882889326234, + "loss": 0.9043293, + "num_input_tokens_seen": 111206336, + "router_z_loss_mlp": 0.46582031, + "step": 1341, + "time_per_iteration": 2.5050456523895264 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051599, + "balance_loss_mlp": 1.00482166, + "epoch": 0.2581762216237014, + "flos": 570263168256.0, + "grad_norm": 0.03742337984590145, + "language_loss": 0.87203884, + "learning_rate": 0.0008695785266267515, + "loss": 0.88255489, + "num_input_tokens_seen": 111276736, + "router_z_loss_mlp": 0.46728516, + "step": 1342, + "time_per_iteration": 2.677072763442993 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01057516, + "balance_loss_mlp": 1.01069069, + "epoch": 0.258368603308965, + "flos": 605387960064.0, + "grad_norm": 0.035138016776099276, + "language_loss": 0.83827055, + "learning_rate": 0.0008693686208377704, + "loss": 0.84884572, + "num_input_tokens_seen": 111353856, + "router_z_loss_mlp": 0.46777344, + "step": 1343, + "time_per_iteration": 2.826026439666748 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054134, + "balance_loss_mlp": 1.0075947, + "epoch": 0.2585609849942285, + "flos": 492487812096.0, + "grad_norm": 0.03194520317053949, + "language_loss": 0.89379156, + "learning_rate": 0.0008691585716471733, + "loss": 0.90433288, + "num_input_tokens_seen": 111424960, + "router_z_loss_mlp": 0.46484375, + "step": 1344, + "time_per_iteration": 2.6379647254943848 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053033, + "balance_loss_mlp": 1.00646937, + "epoch": 0.2587533666794921, + "flos": 641958222336.0, + "grad_norm": 0.03185107281306307, + "language_loss": 0.86602217, + "learning_rate": 0.0008689483791365079, + "loss": 0.87655246, + "num_input_tokens_seen": 111505248, + "router_z_loss_mlp": 0.46508789, + "step": 1345, + "time_per_iteration": 2.8372344970703125 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105515, + "balance_loss_mlp": 1.00868249, + "epoch": 0.2589457483647557, + "flos": 577995987456.0, + "grad_norm": 0.038033594557881883, + "language_loss": 0.90178049, + "learning_rate": 0.0008687380433873786, + "loss": 0.91233194, + "num_input_tokens_seen": 111581936, + "router_z_loss_mlp": 0.46411133, + "step": 1346, + "time_per_iteration": 2.7660248279571533 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105286, + "balance_loss_mlp": 1.00636888, + "epoch": 0.25913813005001923, + "flos": 536467195392.0, + "grad_norm": 0.03823400300780179, + "language_loss": 0.83192778, + "learning_rate": 0.0008685275644814448, + "loss": 0.8424564, + "num_input_tokens_seen": 111651456, + "router_z_loss_mlp": 0.46435547, + "step": 1347, + "time_per_iteration": 2.6657776832580566 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01058039, + "balance_loss_mlp": 1.01118934, + "epoch": 0.2593305117352828, + "flos": 722347474944.0, + "grad_norm": 0.04308500968206218, + "language_loss": 0.85215819, + "learning_rate": 0.0008683169425004216, + "loss": 0.86273861, + "num_input_tokens_seen": 111731712, + "router_z_loss_mlp": 0.46801758, + "step": 1348, + "time_per_iteration": 2.8938682079315186 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067463, + "balance_loss_mlp": 1.02058995, + "epoch": 0.25952289342054635, + "flos": 711356275200.0, + "grad_norm": 0.04420512127692048, + "language_loss": 0.84604859, + "learning_rate": 0.0008681061775260799, + "loss": 0.85672331, + "num_input_tokens_seen": 111800752, + "router_z_loss_mlp": 0.46826172, + "step": 1349, + "time_per_iteration": 2.8803627490997314 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105274, + "balance_loss_mlp": 1.00634348, + "epoch": 0.25971527510580994, + "flos": 456850738944.0, + "grad_norm": 0.03368144531989068, + "language_loss": 0.92376006, + "learning_rate": 0.0008678952696402458, + "loss": 0.93428755, + "num_input_tokens_seen": 111866752, + "router_z_loss_mlp": 0.46337891, + "step": 1350, + "time_per_iteration": 2.5544798374176025 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054358, + "balance_loss_mlp": 1.00824761, + "epoch": 0.25990765679107347, + "flos": 613754569728.0, + "grad_norm": 0.03011764192417466, + "language_loss": 0.87159944, + "learning_rate": 0.000867684218924801, + "loss": 0.88214302, + "num_input_tokens_seen": 111951328, + "router_z_loss_mlp": 0.46044922, + "step": 1351, + "time_per_iteration": 2.856372833251953 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069405, + "balance_loss_mlp": 1.02496338, + "epoch": 0.26010003847633706, + "flos": 1541407196160.0, + "grad_norm": 0.012951365709411706, + "language_loss": 0.78947091, + "learning_rate": 0.0008674730254616827, + "loss": 0.80016494, + "num_input_tokens_seen": 112182272, + "router_z_loss_mlp": 0.4453125, + "step": 1352, + "time_per_iteration": 4.943616628646851 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01058433, + "balance_loss_mlp": 1.01194191, + "epoch": 0.2602924201616006, + "flos": 717545447424.0, + "grad_norm": 0.029832851456929797, + "language_loss": 0.85926312, + "learning_rate": 0.0008672616893328834, + "loss": 0.86984742, + "num_input_tokens_seen": 112261760, + "router_z_loss_mlp": 0.46435547, + "step": 1353, + "time_per_iteration": 2.913235664367676 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01056755, + "balance_loss_mlp": 1.01012051, + "epoch": 0.2604848018468642, + "flos": 644686824960.0, + "grad_norm": 0.03749633937906014, + "language_loss": 0.91143578, + "learning_rate": 0.0008670502106204512, + "loss": 0.92200339, + "num_input_tokens_seen": 112339136, + "router_z_loss_mlp": 0.46582031, + "step": 1354, + "time_per_iteration": 2.821753978729248 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091569, + "balance_loss_mlp": 1.0442189, + "epoch": 0.26067718353212777, + "flos": 518038684416.0, + "grad_norm": 0.04686611644365056, + "language_loss": 0.82400739, + "learning_rate": 0.0008668385894064892, + "loss": 0.83492303, + "num_input_tokens_seen": 112409872, + "router_z_loss_mlp": 0.47314453, + "step": 1355, + "time_per_iteration": 2.642392158508301 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01056006, + "balance_loss_mlp": 1.00925195, + "epoch": 0.2608695652173913, + "flos": 824226287616.0, + "grad_norm": 0.03313451231790272, + "language_loss": 0.89331532, + "learning_rate": 0.0008666268257731562, + "loss": 0.90387547, + "num_input_tokens_seen": 112495616, + "router_z_loss_mlp": 0.46704102, + "step": 1356, + "time_per_iteration": 3.1127805709838867 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060563, + "balance_loss_mlp": 1.01414335, + "epoch": 0.2610619469026549, + "flos": 1009450422528.0, + "grad_norm": 0.04035878870854939, + "language_loss": 0.86687934, + "learning_rate": 0.0008664149198026662, + "loss": 0.87748504, + "num_input_tokens_seen": 112575168, + "router_z_loss_mlp": 0.46362305, + "step": 1357, + "time_per_iteration": 3.2328455448150635 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106616, + "balance_loss_mlp": 1.01971614, + "epoch": 0.2612543285879184, + "flos": 537826149888.0, + "grad_norm": 0.03943672852684058, + "language_loss": 0.8952527, + "learning_rate": 0.0008662028715772883, + "loss": 0.90591431, + "num_input_tokens_seen": 112648480, + "router_z_loss_mlp": 0.46386719, + "step": 1358, + "time_per_iteration": 2.621894359588623 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01058771, + "balance_loss_mlp": 1.01213586, + "epoch": 0.261446710273182, + "flos": 520439698176.0, + "grad_norm": 0.03590038892764462, + "language_loss": 0.86476588, + "learning_rate": 0.0008659906811793467, + "loss": 0.87535357, + "num_input_tokens_seen": 112719856, + "router_z_loss_mlp": 0.46582031, + "step": 1359, + "time_per_iteration": 2.6540629863739014 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054238, + "balance_loss_mlp": 1.00741243, + "epoch": 0.26163909195844554, + "flos": 584399987712.0, + "grad_norm": 0.03384500135634075, + "language_loss": 0.90458202, + "learning_rate": 0.0008657783486912215, + "loss": 0.91512442, + "num_input_tokens_seen": 112795088, + "router_z_loss_mlp": 0.46777344, + "step": 1360, + "time_per_iteration": 2.71598744392395 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063542, + "balance_loss_mlp": 1.01626348, + "epoch": 0.2618314736437091, + "flos": 960369613056.0, + "grad_norm": 0.03695926115068694, + "language_loss": 0.90376949, + "learning_rate": 0.0008655658741953472, + "loss": 0.91440493, + "num_input_tokens_seen": 112879888, + "router_z_loss_mlp": 0.47241211, + "step": 1361, + "time_per_iteration": 3.233081102371216 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061537, + "balance_loss_mlp": 1.01413929, + "epoch": 0.26202385532897265, + "flos": 575903120640.0, + "grad_norm": 0.032102410789184695, + "language_loss": 0.892542, + "learning_rate": 0.0008653532577742136, + "loss": 0.90315735, + "num_input_tokens_seen": 112952208, + "router_z_loss_mlp": 0.47363281, + "step": 1362, + "time_per_iteration": 2.671513319015503 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053634, + "balance_loss_mlp": 1.00673676, + "epoch": 0.26221623701423624, + "flos": 446398065408.0, + "grad_norm": 0.034188430773875136, + "language_loss": 0.88125902, + "learning_rate": 0.0008651404995103659, + "loss": 0.8917954, + "num_input_tokens_seen": 113017472, + "router_z_loss_mlp": 0.46850586, + "step": 1363, + "time_per_iteration": 2.5599000453948975 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064691, + "balance_loss_mlp": 1.01803255, + "epoch": 0.26240861869949983, + "flos": 536755900416.0, + "grad_norm": 0.03309695956224158, + "language_loss": 0.87925225, + "learning_rate": 0.0008649275994864041, + "loss": 0.88989913, + "num_input_tokens_seen": 113090000, + "router_z_loss_mlp": 0.46606445, + "step": 1364, + "time_per_iteration": 2.68673038482666 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061602, + "balance_loss_mlp": 1.01472914, + "epoch": 0.26260100038476336, + "flos": 566488615680.0, + "grad_norm": 0.0327166713474878, + "language_loss": 0.84653741, + "learning_rate": 0.0008647145577849834, + "loss": 0.85715348, + "num_input_tokens_seen": 113169424, + "router_z_loss_mlp": 0.46826172, + "step": 1365, + "time_per_iteration": 2.8294174671173096 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061641, + "balance_loss_mlp": 1.01471996, + "epoch": 0.26279338207002695, + "flos": 614321286144.0, + "grad_norm": 0.027467777319160957, + "language_loss": 0.83391041, + "learning_rate": 0.0008645013744888139, + "loss": 0.84452683, + "num_input_tokens_seen": 113256752, + "router_z_loss_mlp": 0.46875, + "step": 1366, + "time_per_iteration": 2.845019578933716 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059667, + "balance_loss_mlp": 1.01238823, + "epoch": 0.2629857637552905, + "flos": 523945954560.0, + "grad_norm": 0.034051307399065846, + "language_loss": 0.88423878, + "learning_rate": 0.0008642880496806607, + "loss": 0.89483547, + "num_input_tokens_seen": 113330512, + "router_z_loss_mlp": 0.47241211, + "step": 1367, + "time_per_iteration": 2.7665200233459473 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065935, + "balance_loss_mlp": 1.01832283, + "epoch": 0.26317814544055407, + "flos": 535655515392.0, + "grad_norm": 0.03476637042829631, + "language_loss": 0.85672963, + "learning_rate": 0.0008640745834433437, + "loss": 0.86738896, + "num_input_tokens_seen": 113409088, + "router_z_loss_mlp": 0.47583008, + "step": 1368, + "time_per_iteration": 2.7824857234954834 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105712, + "balance_loss_mlp": 1.00967455, + "epoch": 0.2633705271258176, + "flos": 556780548096.0, + "grad_norm": 0.035052832704740904, + "language_loss": 0.8778615, + "learning_rate": 0.000863860975859738, + "loss": 0.88843262, + "num_input_tokens_seen": 113486624, + "router_z_loss_mlp": 0.47412109, + "step": 1369, + "time_per_iteration": 2.938157796859741 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059209, + "balance_loss_mlp": 1.01214516, + "epoch": 0.2635629088110812, + "flos": 553462874880.0, + "grad_norm": 0.04030614296387141, + "language_loss": 0.89190161, + "learning_rate": 0.0008636472270127733, + "loss": 0.90249372, + "num_input_tokens_seen": 113555776, + "router_z_loss_mlp": 0.47021484, + "step": 1370, + "time_per_iteration": 2.6449878215789795 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105746, + "balance_loss_mlp": 1.0106585, + "epoch": 0.2637552904963448, + "flos": 456915867648.0, + "grad_norm": 0.03827203709322554, + "language_loss": 0.91134202, + "learning_rate": 0.0008634333369854345, + "loss": 0.9219166, + "num_input_tokens_seen": 113624208, + "router_z_loss_mlp": 0.4675293, + "step": 1371, + "time_per_iteration": 2.6090121269226074 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053294, + "balance_loss_mlp": 1.00642049, + "epoch": 0.2639476721816083, + "flos": 614260048128.0, + "grad_norm": 0.03299961926418253, + "language_loss": 0.88250023, + "learning_rate": 0.0008632193058607608, + "loss": 0.89303321, + "num_input_tokens_seen": 113698544, + "router_z_loss_mlp": 0.46826172, + "step": 1372, + "time_per_iteration": 2.6980674266815186 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052684, + "balance_loss_mlp": 1.00562024, + "epoch": 0.2641400538668719, + "flos": 573026764032.0, + "grad_norm": 0.03659842444989107, + "language_loss": 0.81553382, + "learning_rate": 0.0008630051337218466, + "loss": 0.82606065, + "num_input_tokens_seen": 113769024, + "router_z_loss_mlp": 0.47021484, + "step": 1373, + "time_per_iteration": 2.6634395122528076 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01056457, + "balance_loss_mlp": 1.00960791, + "epoch": 0.2643324355521354, + "flos": 583340431872.0, + "grad_norm": 0.03511173854729822, + "language_loss": 0.82885635, + "learning_rate": 0.0008627908206518409, + "loss": 0.83942091, + "num_input_tokens_seen": 113836320, + "router_z_loss_mlp": 0.46801758, + "step": 1374, + "time_per_iteration": 2.6550941467285156 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055428, + "balance_loss_mlp": 1.01022339, + "epoch": 0.264524817237399, + "flos": 1548027969792.0, + "grad_norm": 0.005864236448565476, + "language_loss": 0.75151253, + "learning_rate": 0.0008625763667339472, + "loss": 0.76206684, + "num_input_tokens_seen": 114065040, + "router_z_loss_mlp": 0.45117188, + "step": 1375, + "time_per_iteration": 4.995543718338013 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01058447, + "balance_loss_mlp": 1.01197898, + "epoch": 0.26471719892266254, + "flos": 519043805184.0, + "grad_norm": 0.03321674595186757, + "language_loss": 0.92123759, + "learning_rate": 0.0008623617720514241, + "loss": 0.93182206, + "num_input_tokens_seen": 114133488, + "router_z_loss_mlp": 0.46411133, + "step": 1376, + "time_per_iteration": 2.592569351196289 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061616, + "balance_loss_mlp": 1.0151242, + "epoch": 0.26490958060792613, + "flos": 518205880320.0, + "grad_norm": 0.036665073764434085, + "language_loss": 0.85824203, + "learning_rate": 0.0008621470366875848, + "loss": 0.8688581, + "num_input_tokens_seen": 114200704, + "router_z_loss_mlp": 0.46435547, + "step": 1377, + "time_per_iteration": 2.5636963844299316 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054184, + "balance_loss_mlp": 1.00766897, + "epoch": 0.26510196229318966, + "flos": 597683331072.0, + "grad_norm": 0.03396624681403314, + "language_loss": 0.88501984, + "learning_rate": 0.0008619321607257966, + "loss": 0.8955617, + "num_input_tokens_seen": 114272160, + "router_z_loss_mlp": 0.46459961, + "step": 1378, + "time_per_iteration": 2.687581777572632 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01056779, + "balance_loss_mlp": 1.010144, + "epoch": 0.26529434397845325, + "flos": 687053541888.0, + "grad_norm": 0.031207845572821406, + "language_loss": 0.82550275, + "learning_rate": 0.000861717144249482, + "loss": 0.83607054, + "num_input_tokens_seen": 114347904, + "router_z_loss_mlp": 0.46582031, + "step": 1379, + "time_per_iteration": 2.8333678245544434 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054921, + "balance_loss_mlp": 1.00819123, + "epoch": 0.26548672566371684, + "flos": 425260393728.0, + "grad_norm": 0.03047521662480035, + "language_loss": 0.90854567, + "learning_rate": 0.0008615019873421175, + "loss": 0.91909492, + "num_input_tokens_seen": 114409952, + "router_z_loss_mlp": 0.46679688, + "step": 1380, + "time_per_iteration": 2.47892689704895 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051055, + "balance_loss_mlp": 1.00437295, + "epoch": 0.26567910734898037, + "flos": 490850846208.0, + "grad_norm": 0.03515354974137605, + "language_loss": 0.8636173, + "learning_rate": 0.0008612866900872349, + "loss": 0.87412781, + "num_input_tokens_seen": 114474832, + "router_z_loss_mlp": 0.46630859, + "step": 1381, + "time_per_iteration": 2.558497428894043 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055093, + "balance_loss_mlp": 1.00833893, + "epoch": 0.26587148903424396, + "flos": 535229750016.0, + "grad_norm": 0.033124361732310995, + "language_loss": 0.88441265, + "learning_rate": 0.0008610712525684197, + "loss": 0.89496362, + "num_input_tokens_seen": 114545152, + "router_z_loss_mlp": 0.46704102, + "step": 1382, + "time_per_iteration": 2.6567015647888184 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01056513, + "balance_loss_mlp": 1.00997365, + "epoch": 0.2660638707195075, + "flos": 1019056422912.0, + "grad_norm": 0.038309225150243896, + "language_loss": 0.84641987, + "learning_rate": 0.0008608556748693121, + "loss": 0.85698497, + "num_input_tokens_seen": 114626512, + "router_z_loss_mlp": 0.46484375, + "step": 1383, + "time_per_iteration": 3.266127347946167 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054133, + "balance_loss_mlp": 1.00754607, + "epoch": 0.2662562524047711, + "flos": 525063836160.0, + "grad_norm": 0.03266135396779854, + "language_loss": 0.86478686, + "learning_rate": 0.000860639957073607, + "loss": 0.87532818, + "num_input_tokens_seen": 114701008, + "router_z_loss_mlp": 0.46533203, + "step": 1384, + "time_per_iteration": 2.701979398727417 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052082, + "balance_loss_mlp": 1.00542331, + "epoch": 0.2664486340900346, + "flos": 553480371456.0, + "grad_norm": 0.03507018041250785, + "language_loss": 0.88455647, + "learning_rate": 0.0008604240992650534, + "loss": 0.89507735, + "num_input_tokens_seen": 114771984, + "router_z_loss_mlp": 0.46606445, + "step": 1385, + "time_per_iteration": 2.6528589725494385 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051347, + "balance_loss_mlp": 1.00476038, + "epoch": 0.2666410157752982, + "flos": 471209189376.0, + "grad_norm": 0.03349459525563368, + "language_loss": 0.89804894, + "learning_rate": 0.0008602081015274545, + "loss": 0.90856242, + "num_input_tokens_seen": 114844800, + "router_z_loss_mlp": 0.46533203, + "step": 1386, + "time_per_iteration": 2.7359464168548584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053022, + "balance_loss_mlp": 1.00626826, + "epoch": 0.2668333974605617, + "flos": 571016522496.0, + "grad_norm": 0.027882929979452454, + "language_loss": 0.8367793, + "learning_rate": 0.0008599919639446684, + "loss": 0.84730947, + "num_input_tokens_seen": 114918544, + "router_z_loss_mlp": 0.46704102, + "step": 1387, + "time_per_iteration": 2.72188401222229 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052498, + "balance_loss_mlp": 1.00572038, + "epoch": 0.2670257791458253, + "flos": 399896159232.0, + "grad_norm": 0.038277743086958374, + "language_loss": 0.80995691, + "learning_rate": 0.000859775686600607, + "loss": 0.82048184, + "num_input_tokens_seen": 114984272, + "router_z_loss_mlp": 0.46728516, + "step": 1388, + "time_per_iteration": 2.5220229625701904 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051137, + "balance_loss_mlp": 1.00443089, + "epoch": 0.2672181608310889, + "flos": 516892612608.0, + "grad_norm": 0.03738976993969629, + "language_loss": 0.85769641, + "learning_rate": 0.0008595592695792367, + "loss": 0.86820781, + "num_input_tokens_seen": 115054800, + "router_z_loss_mlp": 0.46655273, + "step": 1389, + "time_per_iteration": 2.7041423320770264 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050805, + "balance_loss_mlp": 1.0042417, + "epoch": 0.26741054251635243, + "flos": 508526002944.0, + "grad_norm": 0.03398026188762752, + "language_loss": 0.91414082, + "learning_rate": 0.0008593427129645778, + "loss": 0.92464888, + "num_input_tokens_seen": 115120928, + "router_z_loss_mlp": 0.46508789, + "step": 1390, + "time_per_iteration": 2.563215732574463 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105357, + "balance_loss_mlp": 1.0067687, + "epoch": 0.267602924201616, + "flos": 577809349632.0, + "grad_norm": 0.03481446530036303, + "language_loss": 0.86254311, + "learning_rate": 0.0008591260168407052, + "loss": 0.87307882, + "num_input_tokens_seen": 115196688, + "router_z_loss_mlp": 0.4675293, + "step": 1391, + "time_per_iteration": 2.788869619369507 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051436, + "balance_loss_mlp": 1.00475395, + "epoch": 0.26779530588687955, + "flos": 525000652800.0, + "grad_norm": 0.029176301882166727, + "language_loss": 0.83413607, + "learning_rate": 0.0008589091812917479, + "loss": 0.84465045, + "num_input_tokens_seen": 115264912, + "router_z_loss_mlp": 0.46630859, + "step": 1392, + "time_per_iteration": 2.6471304893493652 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01057177, + "balance_loss_mlp": 1.0103997, + "epoch": 0.26798768757214314, + "flos": 557828443392.0, + "grad_norm": 0.034011915135398356, + "language_loss": 0.85611916, + "learning_rate": 0.0008586922064018887, + "loss": 0.86669087, + "num_input_tokens_seen": 115334672, + "router_z_loss_mlp": 0.46728516, + "step": 1393, + "time_per_iteration": 2.665710926055908 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051541, + "balance_loss_mlp": 1.00488269, + "epoch": 0.2681800692574067, + "flos": 932095974144.0, + "grad_norm": 0.035119979561623306, + "language_loss": 0.89861763, + "learning_rate": 0.0008584750922553651, + "loss": 0.90913308, + "num_input_tokens_seen": 115420032, + "router_z_loss_mlp": 0.46606445, + "step": 1394, + "time_per_iteration": 3.1556007862091064 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054605, + "balance_loss_mlp": 1.00813687, + "epoch": 0.26837245094267026, + "flos": 702318936576.0, + "grad_norm": 0.034220503648090136, + "language_loss": 0.84388494, + "learning_rate": 0.0008582578389364677, + "loss": 0.85443103, + "num_input_tokens_seen": 115492576, + "router_z_loss_mlp": 0.46411133, + "step": 1395, + "time_per_iteration": 2.8831770420074463 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054667, + "balance_loss_mlp": 1.00824666, + "epoch": 0.26856483262793385, + "flos": 594394814976.0, + "grad_norm": 0.030437239966241224, + "language_loss": 0.92446673, + "learning_rate": 0.0008580404465295422, + "loss": 0.93501341, + "num_input_tokens_seen": 115568368, + "router_z_loss_mlp": 0.46362305, + "step": 1396, + "time_per_iteration": 2.823685884475708 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052372, + "balance_loss_mlp": 1.00578523, + "epoch": 0.2687572143131974, + "flos": 715589640960.0, + "grad_norm": 0.035135728363153845, + "language_loss": 0.88714433, + "learning_rate": 0.0008578229151189876, + "loss": 0.89766812, + "num_input_tokens_seen": 115651536, + "router_z_loss_mlp": 0.46533203, + "step": 1397, + "time_per_iteration": 2.9427757263183594 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105508, + "balance_loss_mlp": 1.00858808, + "epoch": 0.26894959599846097, + "flos": 468671115264.0, + "grad_norm": 0.03944499035247069, + "language_loss": 0.82205743, + "learning_rate": 0.0008576052447892573, + "loss": 0.83260822, + "num_input_tokens_seen": 115715696, + "router_z_loss_mlp": 0.46435547, + "step": 1398, + "time_per_iteration": 2.570364475250244 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053764, + "balance_loss_mlp": 1.00712895, + "epoch": 0.2691419776837245, + "flos": 469630549248.0, + "grad_norm": 0.035560759826370754, + "language_loss": 0.87260717, + "learning_rate": 0.000857387435624858, + "loss": 0.88314486, + "num_input_tokens_seen": 115780928, + "router_z_loss_mlp": 0.46582031, + "step": 1399, + "time_per_iteration": 2.5241427421569824 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053425, + "balance_loss_mlp": 1.00698149, + "epoch": 0.2693343593689881, + "flos": 939286376448.0, + "grad_norm": 0.026228750880396605, + "language_loss": 0.88826966, + "learning_rate": 0.0008571694877103513, + "loss": 0.89880389, + "num_input_tokens_seen": 115874432, + "router_z_loss_mlp": 0.46386719, + "step": 1400, + "time_per_iteration": 3.2871432304382324 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049973, + "balance_loss_mlp": 1.00355244, + "epoch": 0.2695267410542516, + "flos": 578795028480.0, + "grad_norm": 0.031687518811048296, + "language_loss": 0.88370931, + "learning_rate": 0.0008569514011303515, + "loss": 0.89420903, + "num_input_tokens_seen": 115956608, + "router_z_loss_mlp": 0.46362305, + "step": 1401, + "time_per_iteration": 2.8385562896728516 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054148, + "balance_loss_mlp": 1.00763249, + "epoch": 0.2697191227395152, + "flos": 557965503744.0, + "grad_norm": 0.03646210542720766, + "language_loss": 0.89149171, + "learning_rate": 0.0008567331759695277, + "loss": 0.90203321, + "num_input_tokens_seen": 116031728, + "router_z_loss_mlp": 0.46459961, + "step": 1402, + "time_per_iteration": 2.73796010017395 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053059, + "balance_loss_mlp": 1.00663948, + "epoch": 0.26991150442477874, + "flos": 530314961664.0, + "grad_norm": 0.03368837159460442, + "language_loss": 0.86897242, + "learning_rate": 0.0008565148123126023, + "loss": 0.87950301, + "num_input_tokens_seen": 116104288, + "router_z_loss_mlp": 0.46362305, + "step": 1403, + "time_per_iteration": 2.654782772064209 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055763, + "balance_loss_mlp": 1.00970042, + "epoch": 0.2701038861100423, + "flos": 533087305728.0, + "grad_norm": 0.02742415368344255, + "language_loss": 0.86797845, + "learning_rate": 0.0008562963102443516, + "loss": 0.87853605, + "num_input_tokens_seen": 116177920, + "router_z_loss_mlp": 0.45996094, + "step": 1404, + "time_per_iteration": 2.6844303607940674 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01057243, + "balance_loss_mlp": 1.01122797, + "epoch": 0.2702962677953059, + "flos": 736505681664.0, + "grad_norm": 0.03794782730472634, + "language_loss": 0.85607296, + "learning_rate": 0.0008560776698496056, + "loss": 0.86664534, + "num_input_tokens_seen": 116251680, + "router_z_loss_mlp": 0.45947266, + "step": 1405, + "time_per_iteration": 2.9016945362091064 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054152, + "balance_loss_mlp": 1.00806534, + "epoch": 0.27048864948056944, + "flos": 576001297152.0, + "grad_norm": 0.03333453941991407, + "language_loss": 0.8661586, + "learning_rate": 0.0008558588912132481, + "loss": 0.8767001, + "num_input_tokens_seen": 116327664, + "router_z_loss_mlp": 0.46020508, + "step": 1406, + "time_per_iteration": 2.8187410831451416 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074676, + "balance_loss_mlp": 1.03042603, + "epoch": 0.27068103116583303, + "flos": 1426912856832.0, + "grad_norm": 0.025019447230712623, + "language_loss": 0.76458991, + "learning_rate": 0.0008556399744202163, + "loss": 0.77533662, + "num_input_tokens_seen": 116555152, + "router_z_loss_mlp": 0.44335938, + "step": 1407, + "time_per_iteration": 4.91855001449585 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059735, + "balance_loss_mlp": 1.01386356, + "epoch": 0.27087341285109656, + "flos": 533032870656.0, + "grad_norm": 0.03180107690871134, + "language_loss": 0.83613265, + "learning_rate": 0.0008554209195555016, + "loss": 0.84672999, + "num_input_tokens_seen": 116626016, + "router_z_loss_mlp": 0.45800781, + "step": 1408, + "time_per_iteration": 2.7004964351654053 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106761, + "balance_loss_mlp": 1.02188134, + "epoch": 0.27106579453636015, + "flos": 582465568512.0, + "grad_norm": 0.03644580883658202, + "language_loss": 0.89378774, + "learning_rate": 0.0008552017267041483, + "loss": 0.90446383, + "num_input_tokens_seen": 116699152, + "router_z_loss_mlp": 0.45654297, + "step": 1409, + "time_per_iteration": 2.7288694381713867 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067551, + "balance_loss_mlp": 1.0219177, + "epoch": 0.2712581762216237, + "flos": 507881518848.0, + "grad_norm": 0.03188220116364099, + "language_loss": 0.84328783, + "learning_rate": 0.0008549823959512549, + "loss": 0.85396332, + "num_input_tokens_seen": 116770912, + "router_z_loss_mlp": 0.45556641, + "step": 1410, + "time_per_iteration": 2.67370343208313 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060246, + "balance_loss_mlp": 1.01435077, + "epoch": 0.27145055790688727, + "flos": 999143557632.0, + "grad_norm": 0.03419744556224296, + "language_loss": 0.87478781, + "learning_rate": 0.0008547629273819728, + "loss": 0.88539028, + "num_input_tokens_seen": 116863088, + "router_z_loss_mlp": 0.45825195, + "step": 1411, + "time_per_iteration": 3.3728370666503906 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01057063, + "balance_loss_mlp": 1.0104996, + "epoch": 0.2716429395921508, + "flos": 547729603584.0, + "grad_norm": 0.037303619224495106, + "language_loss": 0.84070724, + "learning_rate": 0.0008545433210815074, + "loss": 0.85127789, + "num_input_tokens_seen": 116929504, + "router_z_loss_mlp": 0.46508789, + "step": 1412, + "time_per_iteration": 2.6812539100646973 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062187, + "balance_loss_mlp": 1.01536179, + "epoch": 0.2718353212774144, + "flos": 574311841536.0, + "grad_norm": 0.033089137280770606, + "language_loss": 0.8805269, + "learning_rate": 0.0008543235771351176, + "loss": 0.89114881, + "num_input_tokens_seen": 117004064, + "router_z_loss_mlp": 0.46777344, + "step": 1413, + "time_per_iteration": 2.713487148284912 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01056081, + "balance_loss_mlp": 1.00961292, + "epoch": 0.272027702962678, + "flos": 645585987840.0, + "grad_norm": 0.026077025600286987, + "language_loss": 0.85152733, + "learning_rate": 0.0008541036956281154, + "loss": 0.86208814, + "num_input_tokens_seen": 117081328, + "router_z_loss_mlp": 0.46411133, + "step": 1414, + "time_per_iteration": 2.9018056392669678 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062827, + "balance_loss_mlp": 1.01631117, + "epoch": 0.2722200846479415, + "flos": 654996602112.0, + "grad_norm": 0.04047455719590206, + "language_loss": 0.83293629, + "learning_rate": 0.0008538836766458665, + "loss": 0.84356457, + "num_input_tokens_seen": 117156544, + "router_z_loss_mlp": 0.46459961, + "step": 1415, + "time_per_iteration": 2.84184193611145 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106005, + "balance_loss_mlp": 1.01365411, + "epoch": 0.2724124663332051, + "flos": 580779025152.0, + "grad_norm": 0.0390255284508479, + "language_loss": 0.85920322, + "learning_rate": 0.0008536635202737897, + "loss": 0.86980367, + "num_input_tokens_seen": 117230208, + "router_z_loss_mlp": 0.46337891, + "step": 1416, + "time_per_iteration": 2.814687728881836 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059317, + "balance_loss_mlp": 1.01272988, + "epoch": 0.2726048480184686, + "flos": 538468688640.0, + "grad_norm": 0.03678906161491062, + "language_loss": 0.82951486, + "learning_rate": 0.0008534432265973573, + "loss": 0.8401081, + "num_input_tokens_seen": 117298080, + "router_z_loss_mlp": 0.46533203, + "step": 1417, + "time_per_iteration": 2.641660451889038 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01056276, + "balance_loss_mlp": 1.00930703, + "epoch": 0.2727972297037322, + "flos": 997550333184.0, + "grad_norm": 0.4222293446211692, + "language_loss": 0.88806397, + "learning_rate": 0.000853222795702095, + "loss": 0.89862669, + "num_input_tokens_seen": 117396256, + "router_z_loss_mlp": 0.46923828, + "step": 1418, + "time_per_iteration": 3.3743135929107666 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01181433, + "balance_loss_mlp": 1.1334635, + "epoch": 0.27298961138899575, + "flos": 607335018240.0, + "grad_norm": 0.06715989722341878, + "language_loss": 0.84640503, + "learning_rate": 0.0008530022276735813, + "loss": 0.85821939, + "num_input_tokens_seen": 117467936, + "router_z_loss_mlp": 0.47949219, + "step": 1419, + "time_per_iteration": 2.752645254135132 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069458, + "balance_loss_mlp": 1.02225161, + "epoch": 0.27318199307425933, + "flos": 530397586944.0, + "grad_norm": 0.040820608700474346, + "language_loss": 0.87344372, + "learning_rate": 0.0008527815225974489, + "loss": 0.88413835, + "num_input_tokens_seen": 117538256, + "router_z_loss_mlp": 0.47167969, + "step": 1420, + "time_per_iteration": 2.65108585357666 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085616, + "balance_loss_mlp": 1.03852844, + "epoch": 0.2733743747595229, + "flos": 409912373760.0, + "grad_norm": 0.06690132065136703, + "language_loss": 0.92052042, + "learning_rate": 0.0008525606805593829, + "loss": 0.93137658, + "num_input_tokens_seen": 117599488, + "router_z_loss_mlp": 0.47045898, + "step": 1421, + "time_per_iteration": 2.4201173782348633 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081246, + "balance_loss_mlp": 1.03422987, + "epoch": 0.27356675644478645, + "flos": 517228949760.0, + "grad_norm": 0.05290317096475839, + "language_loss": 0.85793996, + "learning_rate": 0.0008523397016451213, + "loss": 0.86875236, + "num_input_tokens_seen": 117664240, + "router_z_loss_mlp": 0.46972656, + "step": 1422, + "time_per_iteration": 2.632446765899658 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080251, + "balance_loss_mlp": 1.03328276, + "epoch": 0.27375913813005004, + "flos": 1054059705600.0, + "grad_norm": 0.039766191828199446, + "language_loss": 0.90321743, + "learning_rate": 0.0008521185859404564, + "loss": 0.91401994, + "num_input_tokens_seen": 117754768, + "router_z_loss_mlp": 0.46923828, + "step": 1423, + "time_per_iteration": 3.381535291671753 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107676, + "balance_loss_mlp": 1.02998257, + "epoch": 0.27395151981531357, + "flos": 626004602112.0, + "grad_norm": 0.042654551092476074, + "language_loss": 0.92207062, + "learning_rate": 0.0008518973335312326, + "loss": 0.9328382, + "num_input_tokens_seen": 117832816, + "router_z_loss_mlp": 0.46728516, + "step": 1424, + "time_per_iteration": 2.787799596786499 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070757, + "balance_loss_mlp": 1.0240984, + "epoch": 0.27414390150057716, + "flos": 551415694848.0, + "grad_norm": 0.04883209929837253, + "language_loss": 0.85839558, + "learning_rate": 0.0008516759445033477, + "loss": 0.86910313, + "num_input_tokens_seen": 117899168, + "router_z_loss_mlp": 0.46606445, + "step": 1425, + "time_per_iteration": 2.6206350326538086 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065621, + "balance_loss_mlp": 1.01881957, + "epoch": 0.2743362831858407, + "flos": 540952327680.0, + "grad_norm": 0.043467714857121094, + "language_loss": 0.87962419, + "learning_rate": 0.0008514544189427526, + "loss": 0.89028037, + "num_input_tokens_seen": 117972384, + "router_z_loss_mlp": 0.4675293, + "step": 1426, + "time_per_iteration": 2.679623603820801 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01058603, + "balance_loss_mlp": 1.0118494, + "epoch": 0.2745286648711043, + "flos": 469545978624.0, + "grad_norm": 0.04158543868721512, + "language_loss": 0.89037859, + "learning_rate": 0.0008512327569354511, + "loss": 0.90096468, + "num_input_tokens_seen": 118039584, + "router_z_loss_mlp": 0.46704102, + "step": 1427, + "time_per_iteration": 2.5345683097839355 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01057646, + "balance_loss_mlp": 1.01036775, + "epoch": 0.2747210465563678, + "flos": 473872663296.0, + "grad_norm": 0.05094281183667316, + "language_loss": 0.85685182, + "learning_rate": 0.0008510109585675001, + "loss": 0.8674283, + "num_input_tokens_seen": 118108352, + "router_z_loss_mlp": 0.47241211, + "step": 1428, + "time_per_iteration": 2.5991017818450928 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076946, + "balance_loss_mlp": 1.03031158, + "epoch": 0.2749134282416314, + "flos": 1318059436800.0, + "grad_norm": 0.019364160619571847, + "language_loss": 0.81153345, + "learning_rate": 0.0008507890239250093, + "loss": 0.82230288, + "num_input_tokens_seen": 118331120, + "router_z_loss_mlp": 0.46582031, + "step": 1429, + "time_per_iteration": 4.724486351013184 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081662, + "balance_loss_mlp": 1.03459787, + "epoch": 0.275105809926895, + "flos": 972533129472.0, + "grad_norm": 0.05143903496013185, + "language_loss": 0.82696635, + "learning_rate": 0.0008505669530941415, + "loss": 0.83778298, + "num_input_tokens_seen": 118415872, + "router_z_loss_mlp": 0.47021484, + "step": 1430, + "time_per_iteration": 3.3173024654388428 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01058611, + "balance_loss_mlp": 1.01231062, + "epoch": 0.2752981916121585, + "flos": 528369848832.0, + "grad_norm": 0.04649662222604448, + "language_loss": 0.87158883, + "learning_rate": 0.000850344746161112, + "loss": 0.88217485, + "num_input_tokens_seen": 118483008, + "router_z_loss_mlp": 0.46240234, + "step": 1431, + "time_per_iteration": 2.635831356048584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065544, + "balance_loss_mlp": 1.01943398, + "epoch": 0.2754905732974221, + "flos": 454599424512.0, + "grad_norm": 0.04970989937431765, + "language_loss": 0.90776384, + "learning_rate": 0.0008501224032121894, + "loss": 0.91841936, + "num_input_tokens_seen": 118545840, + "router_z_loss_mlp": 0.46044922, + "step": 1432, + "time_per_iteration": 2.531921148300171 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069408, + "balance_loss_mlp": 1.02339363, + "epoch": 0.27568295498268564, + "flos": 498509788416.0, + "grad_norm": 0.04336527805629792, + "language_loss": 0.84821916, + "learning_rate": 0.0008498999243336946, + "loss": 0.85891324, + "num_input_tokens_seen": 118615168, + "router_z_loss_mlp": 0.45947266, + "step": 1433, + "time_per_iteration": 2.6142802238464355 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068976, + "balance_loss_mlp": 1.02298498, + "epoch": 0.2758753366679492, + "flos": 609417191424.0, + "grad_norm": 0.03822636329404569, + "language_loss": 0.8997575, + "learning_rate": 0.0008496773096120021, + "loss": 0.91044724, + "num_input_tokens_seen": 118690384, + "router_z_loss_mlp": 0.45922852, + "step": 1434, + "time_per_iteration": 2.788863182067871 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066122, + "balance_loss_mlp": 1.01977372, + "epoch": 0.27606771835321275, + "flos": 741437966592.0, + "grad_norm": 0.04844453313229188, + "language_loss": 0.86675751, + "learning_rate": 0.0008494545591335381, + "loss": 0.87741876, + "num_input_tokens_seen": 118763024, + "router_z_loss_mlp": 0.46289062, + "step": 1435, + "time_per_iteration": 2.8883180618286133 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061614, + "balance_loss_mlp": 1.01516986, + "epoch": 0.27626010003847634, + "flos": 555749182464.0, + "grad_norm": 0.03304758436240527, + "language_loss": 0.88791698, + "learning_rate": 0.0008492316729847823, + "loss": 0.89853311, + "num_input_tokens_seen": 118845536, + "router_z_loss_mlp": 0.46386719, + "step": 1436, + "time_per_iteration": 2.794938087463379 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054312, + "balance_loss_mlp": 1.0072248, + "epoch": 0.2764524817237399, + "flos": 543696481536.0, + "grad_norm": 0.13725655625344893, + "language_loss": 0.82129836, + "learning_rate": 0.0008490086512522664, + "loss": 0.83184153, + "num_input_tokens_seen": 118919008, + "router_z_loss_mlp": 0.47045898, + "step": 1437, + "time_per_iteration": 2.6979260444641113 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062257, + "balance_loss_mlp": 1.01495445, + "epoch": 0.27664486340900346, + "flos": 407129336064.0, + "grad_norm": 0.04115092615815086, + "language_loss": 0.92702913, + "learning_rate": 0.0008487854940225755, + "loss": 0.93765163, + "num_input_tokens_seen": 118981376, + "router_z_loss_mlp": 0.47265625, + "step": 1438, + "time_per_iteration": 2.4361565113067627 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055116, + "balance_loss_mlp": 1.0080049, + "epoch": 0.27683724509426705, + "flos": 523157607168.0, + "grad_norm": 0.06281356926864295, + "language_loss": 0.92480713, + "learning_rate": 0.0008485622013823466, + "loss": 0.93535829, + "num_input_tokens_seen": 119050560, + "router_z_loss_mlp": 0.47070312, + "step": 1439, + "time_per_iteration": 2.588972568511963 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060631, + "balance_loss_mlp": 1.01332879, + "epoch": 0.2770296267795306, + "flos": 536410814976.0, + "grad_norm": 0.048827385499573994, + "language_loss": 0.8582921, + "learning_rate": 0.00084833877341827, + "loss": 0.86889839, + "num_input_tokens_seen": 119121104, + "router_z_loss_mlp": 0.47265625, + "step": 1440, + "time_per_iteration": 2.6215152740478516 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063403, + "balance_loss_mlp": 1.01648188, + "epoch": 0.27722200846479417, + "flos": 488970862080.0, + "grad_norm": 0.04074125375838667, + "language_loss": 0.82920921, + "learning_rate": 0.000848115210217088, + "loss": 0.83984327, + "num_input_tokens_seen": 119187712, + "router_z_loss_mlp": 0.46875, + "step": 1441, + "time_per_iteration": 2.578479290008545 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059768, + "balance_loss_mlp": 1.01244187, + "epoch": 0.2774143901500577, + "flos": 619444099584.0, + "grad_norm": 0.03981713509883016, + "language_loss": 0.84628934, + "learning_rate": 0.0008478915118655952, + "loss": 0.85688698, + "num_input_tokens_seen": 119259264, + "router_z_loss_mlp": 0.47290039, + "step": 1442, + "time_per_iteration": 2.697610855102539 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055568, + "balance_loss_mlp": 1.0080508, + "epoch": 0.2776067718353213, + "flos": 514845432576.0, + "grad_norm": 0.032345577367045, + "language_loss": 0.88479745, + "learning_rate": 0.0008476676784506393, + "loss": 0.89535314, + "num_input_tokens_seen": 119328304, + "router_z_loss_mlp": 0.47485352, + "step": 1443, + "time_per_iteration": 2.6315112113952637 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01056662, + "balance_loss_mlp": 1.00897789, + "epoch": 0.2777991535205848, + "flos": 1006042342656.0, + "grad_norm": 0.04008629757661371, + "language_loss": 0.8412413, + "learning_rate": 0.0008474437100591201, + "loss": 0.85180795, + "num_input_tokens_seen": 119412352, + "router_z_loss_mlp": 0.4765625, + "step": 1444, + "time_per_iteration": 3.3463656902313232 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051562, + "balance_loss_mlp": 1.00371146, + "epoch": 0.2779915352058484, + "flos": 551376811008.0, + "grad_norm": 0.033834103416723965, + "language_loss": 0.87362587, + "learning_rate": 0.0008472196067779898, + "loss": 0.88414145, + "num_input_tokens_seen": 119484464, + "router_z_loss_mlp": 0.47827148, + "step": 1445, + "time_per_iteration": 2.6647677421569824 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054302, + "balance_loss_mlp": 1.00649953, + "epoch": 0.278183916891112, + "flos": 875217216768.0, + "grad_norm": 0.0457526450580795, + "language_loss": 0.87776953, + "learning_rate": 0.0008469953686942531, + "loss": 0.88831258, + "num_input_tokens_seen": 119557280, + "router_z_loss_mlp": 0.4777832, + "step": 1446, + "time_per_iteration": 3.076035261154175 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01056504, + "balance_loss_mlp": 1.00882006, + "epoch": 0.2783762985763755, + "flos": 625196812800.0, + "grad_norm": 0.042452946668595545, + "language_loss": 0.85090148, + "learning_rate": 0.0008467709958949668, + "loss": 0.86146653, + "num_input_tokens_seen": 119631232, + "router_z_loss_mlp": 0.4765625, + "step": 1447, + "time_per_iteration": 2.744459629058838 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01056121, + "balance_loss_mlp": 1.00850928, + "epoch": 0.2785686802616391, + "flos": 582912721152.0, + "grad_norm": 0.04136143865758397, + "language_loss": 0.87796736, + "learning_rate": 0.0008465464884672403, + "loss": 0.88852853, + "num_input_tokens_seen": 119700224, + "router_z_loss_mlp": 0.47583008, + "step": 1448, + "time_per_iteration": 2.6887707710266113 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049992, + "balance_loss_mlp": 1.00235641, + "epoch": 0.27876106194690264, + "flos": 588540034560.0, + "grad_norm": 0.031263057988026755, + "language_loss": 0.87220562, + "learning_rate": 0.0008463218464982348, + "loss": 0.88270551, + "num_input_tokens_seen": 119781376, + "router_z_loss_mlp": 0.47607422, + "step": 1449, + "time_per_iteration": 2.8354454040527344 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050828, + "balance_loss_mlp": 1.00326335, + "epoch": 0.27895344363216623, + "flos": 877431592704.0, + "grad_norm": 0.03730856956989286, + "language_loss": 0.89626968, + "learning_rate": 0.0008460970700751645, + "loss": 0.90677798, + "num_input_tokens_seen": 119856672, + "router_z_loss_mlp": 0.4753418, + "step": 1450, + "time_per_iteration": 3.12705135345459 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062156, + "balance_loss_mlp": 1.01442492, + "epoch": 0.27914582531742976, + "flos": 605036071680.0, + "grad_norm": 0.0379360607610882, + "language_loss": 0.8910991, + "learning_rate": 0.000845872159285295, + "loss": 0.90172064, + "num_input_tokens_seen": 119929008, + "router_z_loss_mlp": 0.47705078, + "step": 1451, + "time_per_iteration": 2.792448043823242 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065174, + "balance_loss_mlp": 1.02025604, + "epoch": 0.27933820700269335, + "flos": 1501133346048.0, + "grad_norm": 0.01376981107013524, + "language_loss": 0.77766848, + "learning_rate": 0.0008456471142159447, + "loss": 0.7883203, + "num_input_tokens_seen": 120164032, + "router_z_loss_mlp": 0.44921875, + "step": 1452, + "time_per_iteration": 4.966037034988403 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01056615, + "balance_loss_mlp": 1.00921774, + "epoch": 0.2795305886879569, + "flos": 1033518885888.0, + "grad_norm": 0.037040263742322534, + "language_loss": 0.87809932, + "learning_rate": 0.0008454219349544836, + "loss": 0.88866544, + "num_input_tokens_seen": 120246784, + "router_z_loss_mlp": 0.47363281, + "step": 1453, + "time_per_iteration": 3.428589344024658 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055627, + "balance_loss_mlp": 1.00851548, + "epoch": 0.27972297037322047, + "flos": 608227378176.0, + "grad_norm": 0.03307542484781365, + "language_loss": 0.83086669, + "learning_rate": 0.000845196621588334, + "loss": 0.84142298, + "num_input_tokens_seen": 120318208, + "router_z_loss_mlp": 0.47070312, + "step": 1454, + "time_per_iteration": 2.7620909214019775 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053868, + "balance_loss_mlp": 1.00661373, + "epoch": 0.27991535205848406, + "flos": 631561929216.0, + "grad_norm": 0.034345141589198824, + "language_loss": 0.77104861, + "learning_rate": 0.0008449711742049706, + "loss": 0.78158724, + "num_input_tokens_seen": 120393248, + "router_z_loss_mlp": 0.47216797, + "step": 1455, + "time_per_iteration": 2.7629852294921875 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01057076, + "balance_loss_mlp": 1.009655, + "epoch": 0.2801077337437476, + "flos": 550354193664.0, + "grad_norm": 0.03843537360044117, + "language_loss": 0.85426688, + "learning_rate": 0.0008447455928919196, + "loss": 0.86483765, + "num_input_tokens_seen": 120461040, + "router_z_loss_mlp": 0.47387695, + "step": 1456, + "time_per_iteration": 2.672311782836914 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054361, + "balance_loss_mlp": 1.00670111, + "epoch": 0.2803001154290112, + "flos": 487742164992.0, + "grad_norm": 0.03308646323695097, + "language_loss": 0.8834334, + "learning_rate": 0.0008445198777367595, + "loss": 0.89397705, + "num_input_tokens_seen": 120530400, + "router_z_loss_mlp": 0.47631836, + "step": 1457, + "time_per_iteration": 2.5908620357513428 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054094, + "balance_loss_mlp": 1.00633848, + "epoch": 0.2804924971142747, + "flos": 523092478464.0, + "grad_norm": 0.036759152060528134, + "language_loss": 0.82140505, + "learning_rate": 0.0008442940288271208, + "loss": 0.8319459, + "num_input_tokens_seen": 120598304, + "router_z_loss_mlp": 0.47729492, + "step": 1458, + "time_per_iteration": 2.6980724334716797 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01057218, + "balance_loss_mlp": 1.00953484, + "epoch": 0.2806848787995383, + "flos": 528850049280.0, + "grad_norm": 0.03179596299998768, + "language_loss": 0.88266242, + "learning_rate": 0.0008440680462506856, + "loss": 0.89323461, + "num_input_tokens_seen": 120675712, + "router_z_loss_mlp": 0.4765625, + "step": 1459, + "time_per_iteration": 2.818169593811035 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01058111, + "balance_loss_mlp": 1.01047492, + "epoch": 0.2808772604848018, + "flos": 486485277696.0, + "grad_norm": 0.030255628698855237, + "language_loss": 0.87626624, + "learning_rate": 0.0008438419300951883, + "loss": 0.88684738, + "num_input_tokens_seen": 120746544, + "router_z_loss_mlp": 0.47607422, + "step": 1460, + "time_per_iteration": 2.644911527633667 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01056062, + "balance_loss_mlp": 1.00825953, + "epoch": 0.2810696421700654, + "flos": 619340087040.0, + "grad_norm": 0.03597967684758823, + "language_loss": 0.87670606, + "learning_rate": 0.0008436156804484148, + "loss": 0.88726676, + "num_input_tokens_seen": 120823520, + "router_z_loss_mlp": 0.4777832, + "step": 1461, + "time_per_iteration": 2.7725627422332764 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054255, + "balance_loss_mlp": 1.00657165, + "epoch": 0.28126202385532895, + "flos": 455687170560.0, + "grad_norm": 0.0394598317615188, + "language_loss": 0.89263237, + "learning_rate": 0.0008433892973982031, + "loss": 0.90317494, + "num_input_tokens_seen": 120889568, + "router_z_loss_mlp": 0.4765625, + "step": 1462, + "time_per_iteration": 2.5091495513916016 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063928, + "balance_loss_mlp": 1.0156002, + "epoch": 0.28145440554059253, + "flos": 531739044864.0, + "grad_norm": 0.041651284680957995, + "language_loss": 0.866346, + "learning_rate": 0.0008431627810324431, + "loss": 0.87698531, + "num_input_tokens_seen": 120958480, + "router_z_loss_mlp": 0.4831543, + "step": 1463, + "time_per_iteration": 2.6705899238586426 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01056697, + "balance_loss_mlp": 1.00872695, + "epoch": 0.2816467872258561, + "flos": 453164647680.0, + "grad_norm": 0.03544245246238935, + "language_loss": 0.81977493, + "learning_rate": 0.000842936131439076, + "loss": 0.83034194, + "num_input_tokens_seen": 121028032, + "router_z_loss_mlp": 0.47949219, + "step": 1464, + "time_per_iteration": 2.610419511795044 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055364, + "balance_loss_mlp": 1.00763226, + "epoch": 0.28183916891111965, + "flos": 473705467392.0, + "grad_norm": 0.034609246408770326, + "language_loss": 0.89094436, + "learning_rate": 0.0008427093487060951, + "loss": 0.90149802, + "num_input_tokens_seen": 121099280, + "router_z_loss_mlp": 0.47705078, + "step": 1465, + "time_per_iteration": 2.72540283203125 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054608, + "balance_loss_mlp": 1.00656629, + "epoch": 0.28203155059638324, + "flos": 558189080064.0, + "grad_norm": 0.02738603689522664, + "language_loss": 0.8552286, + "learning_rate": 0.000842482432921545, + "loss": 0.86577463, + "num_input_tokens_seen": 121180240, + "router_z_loss_mlp": 0.48022461, + "step": 1466, + "time_per_iteration": 2.8388257026672363 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105456, + "balance_loss_mlp": 1.00654304, + "epoch": 0.28222393228164677, + "flos": 417879462912.0, + "grad_norm": 0.03402242241185157, + "language_loss": 0.88381398, + "learning_rate": 0.0008422553841735225, + "loss": 0.89435959, + "num_input_tokens_seen": 121242736, + "router_z_loss_mlp": 0.47998047, + "step": 1467, + "time_per_iteration": 2.495126485824585 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01057213, + "balance_loss_mlp": 1.00917137, + "epoch": 0.28241631396691036, + "flos": 606041192448.0, + "grad_norm": 0.032675143321136885, + "language_loss": 0.86003613, + "learning_rate": 0.0008420282025501757, + "loss": 0.87060827, + "num_input_tokens_seen": 121319248, + "router_z_loss_mlp": 0.48022461, + "step": 1468, + "time_per_iteration": 2.7908880710601807 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052419, + "balance_loss_mlp": 1.00473487, + "epoch": 0.2826086956521739, + "flos": 574051326720.0, + "grad_norm": 0.03300906221563125, + "language_loss": 0.86686498, + "learning_rate": 0.0008418008881397043, + "loss": 0.87738919, + "num_input_tokens_seen": 121392064, + "router_z_loss_mlp": 0.4765625, + "step": 1469, + "time_per_iteration": 2.7646520137786865 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054478, + "balance_loss_mlp": 1.00693762, + "epoch": 0.2828010773374375, + "flos": 844319954688.0, + "grad_norm": 0.03195966631281891, + "language_loss": 0.84124947, + "learning_rate": 0.0008415734410303595, + "loss": 0.85179424, + "num_input_tokens_seen": 121475984, + "router_z_loss_mlp": 0.47509766, + "step": 1470, + "time_per_iteration": 3.1784656047821045 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059132, + "balance_loss_mlp": 1.01151943, + "epoch": 0.28299345902270107, + "flos": 543772303872.0, + "grad_norm": 0.0307788797974712, + "language_loss": 0.91781342, + "learning_rate": 0.0008413458613104444, + "loss": 0.92840481, + "num_input_tokens_seen": 121551024, + "router_z_loss_mlp": 0.47583008, + "step": 1471, + "time_per_iteration": 2.7000675201416016 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01057543, + "balance_loss_mlp": 1.00995505, + "epoch": 0.2831858407079646, + "flos": 572755555584.0, + "grad_norm": 0.03187726406761503, + "language_loss": 0.84024346, + "learning_rate": 0.0008411181490683129, + "loss": 0.85081899, + "num_input_tokens_seen": 121624528, + "router_z_loss_mlp": 0.47558594, + "step": 1472, + "time_per_iteration": 2.7358603477478027 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105744, + "balance_loss_mlp": 1.00958943, + "epoch": 0.2833782223932282, + "flos": 765172038144.0, + "grad_norm": 0.03258814259190176, + "language_loss": 0.83765668, + "learning_rate": 0.0008408903043923707, + "loss": 0.84823108, + "num_input_tokens_seen": 121706736, + "router_z_loss_mlp": 0.47827148, + "step": 1473, + "time_per_iteration": 3.016690492630005 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060961, + "balance_loss_mlp": 1.01291955, + "epoch": 0.2835706040784917, + "flos": 540088157952.0, + "grad_norm": 0.03783140599229066, + "language_loss": 0.82463539, + "learning_rate": 0.0008406623273710754, + "loss": 0.83524501, + "num_input_tokens_seen": 121773008, + "router_z_loss_mlp": 0.48022461, + "step": 1474, + "time_per_iteration": 2.651932954788208 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055359, + "balance_loss_mlp": 1.00736535, + "epoch": 0.2837629857637553, + "flos": 531654474240.0, + "grad_norm": 0.03425671969493541, + "language_loss": 0.84354198, + "learning_rate": 0.0008404342180929351, + "loss": 0.85409558, + "num_input_tokens_seen": 121840016, + "router_z_loss_mlp": 0.47973633, + "step": 1475, + "time_per_iteration": 2.6064491271972656 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105922, + "balance_loss_mlp": 1.01120257, + "epoch": 0.28395536744901884, + "flos": 541110775296.0, + "grad_norm": 0.03564784056716401, + "language_loss": 0.8245163, + "learning_rate": 0.00084020597664651, + "loss": 0.83510846, + "num_input_tokens_seen": 121915008, + "router_z_loss_mlp": 0.47998047, + "step": 1476, + "time_per_iteration": 2.7597527503967285 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01056946, + "balance_loss_mlp": 1.00890458, + "epoch": 0.2841477491342824, + "flos": 574802735616.0, + "grad_norm": 0.037292940254278956, + "language_loss": 0.8496412, + "learning_rate": 0.0008399776031204111, + "loss": 0.86021066, + "num_input_tokens_seen": 121987456, + "router_z_loss_mlp": 0.48022461, + "step": 1477, + "time_per_iteration": 2.759089231491089 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051956, + "balance_loss_mlp": 1.00412941, + "epoch": 0.28434013081954596, + "flos": 573139524864.0, + "grad_norm": 0.03522410712402375, + "language_loss": 0.80955458, + "learning_rate": 0.0008397490976033009, + "loss": 0.8200742, + "num_input_tokens_seen": 122058720, + "router_z_loss_mlp": 0.47802734, + "step": 1478, + "time_per_iteration": 2.6423845291137695 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01056133, + "balance_loss_mlp": 1.0100708, + "epoch": 0.28453251250480954, + "flos": 1556676481536.0, + "grad_norm": 0.010218347035897045, + "language_loss": 0.77879643, + "learning_rate": 0.000839520460183893, + "loss": 0.78935778, + "num_input_tokens_seen": 122285792, + "router_z_loss_mlp": 0.45996094, + "step": 1479, + "time_per_iteration": 4.732174396514893 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053334, + "balance_loss_mlp": 1.0056026, + "epoch": 0.28472489419007313, + "flos": 750427673088.0, + "grad_norm": 0.028762601306014927, + "language_loss": 0.86263019, + "learning_rate": 0.0008392916909509525, + "loss": 0.87316358, + "num_input_tokens_seen": 122366608, + "router_z_loss_mlp": 0.47705078, + "step": 1480, + "time_per_iteration": 3.0842366218566895 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105593, + "balance_loss_mlp": 1.00817478, + "epoch": 0.28491727587533666, + "flos": 491139551232.0, + "grad_norm": 0.03654292068957682, + "language_loss": 0.86134857, + "learning_rate": 0.0008390627899932954, + "loss": 0.87190789, + "num_input_tokens_seen": 122435536, + "router_z_loss_mlp": 0.47729492, + "step": 1481, + "time_per_iteration": 2.615267753601074 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053309, + "balance_loss_mlp": 1.0055064, + "epoch": 0.28510965756060025, + "flos": 730360250880.0, + "grad_norm": 0.03257927187729683, + "language_loss": 0.89633858, + "learning_rate": 0.000838833757399789, + "loss": 0.90687168, + "num_input_tokens_seen": 122515584, + "router_z_loss_mlp": 0.4777832, + "step": 1482, + "time_per_iteration": 2.9428212642669678 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053301, + "balance_loss_mlp": 1.00528359, + "epoch": 0.2853020392458638, + "flos": 552670636800.0, + "grad_norm": 0.036455185890550544, + "language_loss": 0.82055122, + "learning_rate": 0.0008386045932593515, + "loss": 0.83108419, + "num_input_tokens_seen": 122585552, + "router_z_loss_mlp": 0.47998047, + "step": 1483, + "time_per_iteration": 2.724045991897583 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052204, + "balance_loss_mlp": 1.00416255, + "epoch": 0.28549442093112737, + "flos": 756097761024.0, + "grad_norm": 0.02777472605390161, + "language_loss": 0.8718375, + "learning_rate": 0.0008383752976609525, + "loss": 0.8823595, + "num_input_tokens_seen": 122658928, + "router_z_loss_mlp": 0.48022461, + "step": 1484, + "time_per_iteration": 2.929905891418457 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054156, + "balance_loss_mlp": 1.00618601, + "epoch": 0.2856868026163909, + "flos": 539704188672.0, + "grad_norm": 0.028392575187028035, + "language_loss": 0.8111921, + "learning_rate": 0.0008381458706936123, + "loss": 0.82173365, + "num_input_tokens_seen": 122729056, + "router_z_loss_mlp": 0.47949219, + "step": 1485, + "time_per_iteration": 2.717545986175537 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053651, + "balance_loss_mlp": 1.00563323, + "epoch": 0.2858791843016545, + "flos": 584921017344.0, + "grad_norm": 0.03333139148622456, + "language_loss": 0.88664746, + "learning_rate": 0.0008379163124464025, + "loss": 0.8971839, + "num_input_tokens_seen": 122802832, + "router_z_loss_mlp": 0.47998047, + "step": 1486, + "time_per_iteration": 2.7234747409820557 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054605, + "balance_loss_mlp": 1.00685012, + "epoch": 0.286071565986918, + "flos": 646052582400.0, + "grad_norm": 0.03454926432429506, + "language_loss": 0.77946562, + "learning_rate": 0.0008376866230084452, + "loss": 0.79001164, + "num_input_tokens_seen": 122881328, + "router_z_loss_mlp": 0.47729492, + "step": 1487, + "time_per_iteration": 2.856128692626953 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105205, + "balance_loss_mlp": 1.00408018, + "epoch": 0.2862639476721816, + "flos": 492331309824.0, + "grad_norm": 0.034661288064865674, + "language_loss": 0.87705112, + "learning_rate": 0.000837456802468914, + "loss": 0.88757157, + "num_input_tokens_seen": 122949680, + "router_z_loss_mlp": 0.47949219, + "step": 1488, + "time_per_iteration": 2.57454514503479 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054997, + "balance_loss_mlp": 1.00700366, + "epoch": 0.2864563293574452, + "flos": 522745447680.0, + "grad_norm": 0.035472984165373166, + "language_loss": 0.86247557, + "learning_rate": 0.0008372268509170331, + "loss": 0.87302554, + "num_input_tokens_seen": 123024736, + "router_z_loss_mlp": 0.47973633, + "step": 1489, + "time_per_iteration": 2.661430597305298 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105259, + "balance_loss_mlp": 1.00452483, + "epoch": 0.2866487110427087, + "flos": 548257436160.0, + "grad_norm": 0.03357077125927176, + "language_loss": 0.85950172, + "learning_rate": 0.0008369967684420779, + "loss": 0.8700276, + "num_input_tokens_seen": 123097344, + "router_z_loss_mlp": 0.48046875, + "step": 1490, + "time_per_iteration": 2.703200101852417 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052654, + "balance_loss_mlp": 1.0047555, + "epoch": 0.2868410927279723, + "flos": 483218148864.0, + "grad_norm": 0.03511930922286833, + "language_loss": 0.8567192, + "learning_rate": 0.0008367665551333736, + "loss": 0.86724567, + "num_input_tokens_seen": 123166240, + "router_z_loss_mlp": 0.47875977, + "step": 1491, + "time_per_iteration": 2.6027045249938965 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051173, + "balance_loss_mlp": 1.00334597, + "epoch": 0.28703347441323585, + "flos": 726137578752.0, + "grad_norm": 0.03668604763704844, + "language_loss": 0.86648476, + "learning_rate": 0.0008365362110802977, + "loss": 0.87699652, + "num_input_tokens_seen": 123238160, + "router_z_loss_mlp": 0.47802734, + "step": 1492, + "time_per_iteration": 2.872743606567383 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054184, + "balance_loss_mlp": 1.00630987, + "epoch": 0.28722585609849943, + "flos": 636214257408.0, + "grad_norm": 0.0346446819062503, + "language_loss": 0.83264536, + "learning_rate": 0.0008363057363722773, + "loss": 0.84318721, + "num_input_tokens_seen": 123319504, + "router_z_loss_mlp": 0.47851562, + "step": 1493, + "time_per_iteration": 2.830925941467285 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055811, + "balance_loss_mlp": 1.00827014, + "epoch": 0.28741823778376296, + "flos": 511252660224.0, + "grad_norm": 0.03541460771255837, + "language_loss": 0.8481909, + "learning_rate": 0.0008360751310987906, + "loss": 0.85874903, + "num_input_tokens_seen": 123387008, + "router_z_loss_mlp": 0.47509766, + "step": 1494, + "time_per_iteration": 2.6102633476257324 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055778, + "balance_loss_mlp": 1.00840437, + "epoch": 0.28761061946902655, + "flos": 604932059136.0, + "grad_norm": 0.030521465086419404, + "language_loss": 0.86298919, + "learning_rate": 0.0008358443953493666, + "loss": 0.87354696, + "num_input_tokens_seen": 123471056, + "router_z_loss_mlp": 0.47338867, + "step": 1495, + "time_per_iteration": 2.8808648586273193 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053186, + "balance_loss_mlp": 1.00590765, + "epoch": 0.28780300115429014, + "flos": 408060579840.0, + "grad_norm": 0.03760103829607362, + "language_loss": 0.89352167, + "learning_rate": 0.0008356135292135851, + "loss": 0.90405357, + "num_input_tokens_seen": 123535024, + "router_z_loss_mlp": 0.47241211, + "step": 1496, + "time_per_iteration": 2.5025811195373535 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055101, + "balance_loss_mlp": 1.00794196, + "epoch": 0.28799538283955367, + "flos": 375745070592.0, + "grad_norm": 0.04396673202836768, + "language_loss": 0.93575335, + "learning_rate": 0.0008353825327810758, + "loss": 0.94630432, + "num_input_tokens_seen": 123596224, + "router_z_loss_mlp": 0.47119141, + "step": 1497, + "time_per_iteration": 2.4455389976501465 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053362, + "balance_loss_mlp": 1.00601161, + "epoch": 0.28818776452481726, + "flos": 593020309248.0, + "grad_norm": 0.03575929377279749, + "language_loss": 0.82620615, + "learning_rate": 0.00083515140614152, + "loss": 0.83673978, + "num_input_tokens_seen": 123668640, + "router_z_loss_mlp": 0.47314453, + "step": 1498, + "time_per_iteration": 2.7318496704101562 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059657, + "balance_loss_mlp": 1.01204443, + "epoch": 0.2883801462100808, + "flos": 536104613376.0, + "grad_norm": 0.03408677708994041, + "language_loss": 0.8771323, + "learning_rate": 0.0008349201493846485, + "loss": 0.88772887, + "num_input_tokens_seen": 123740816, + "router_z_loss_mlp": 0.47583008, + "step": 1499, + "time_per_iteration": 2.671473503112793 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105332, + "balance_loss_mlp": 1.00606573, + "epoch": 0.2885725278953444, + "flos": 481077649920.0, + "grad_norm": 0.037679681148910335, + "language_loss": 0.90198493, + "learning_rate": 0.0008346887626002432, + "loss": 0.91251814, + "num_input_tokens_seen": 123805968, + "router_z_loss_mlp": 0.47216797, + "step": 1500, + "time_per_iteration": 2.565556287765503 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050278, + "balance_loss_mlp": 1.00290418, + "epoch": 0.2887649095806079, + "flos": 465030710784.0, + "grad_norm": 0.03453406345592784, + "language_loss": 0.87256986, + "learning_rate": 0.000834457245878137, + "loss": 0.88307267, + "num_input_tokens_seen": 123876576, + "router_z_loss_mlp": 0.47338867, + "step": 1501, + "time_per_iteration": 2.6684980392456055 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051416, + "balance_loss_mlp": 1.00411427, + "epoch": 0.2889572912658715, + "flos": 932641303296.0, + "grad_norm": 0.034149555340210275, + "language_loss": 0.82079703, + "learning_rate": 0.000834225599308212, + "loss": 0.83131123, + "num_input_tokens_seen": 123967664, + "router_z_loss_mlp": 0.47265625, + "step": 1502, + "time_per_iteration": 3.2747607231140137 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052615, + "balance_loss_mlp": 1.00526536, + "epoch": 0.28914967295113503, + "flos": 571257595392.0, + "grad_norm": 0.03426641952710734, + "language_loss": 0.85934782, + "learning_rate": 0.0008339938229804016, + "loss": 0.869874, + "num_input_tokens_seen": 124039680, + "router_z_loss_mlp": 0.47314453, + "step": 1503, + "time_per_iteration": 2.7027056217193604 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062946, + "balance_loss_mlp": 1.01783752, + "epoch": 0.2893420546363986, + "flos": 1489874828544.0, + "grad_norm": 0.016861580481692767, + "language_loss": 0.75434822, + "learning_rate": 0.0008337619169846895, + "loss": 0.76497769, + "num_input_tokens_seen": 124278848, + "router_z_loss_mlp": 0.45019531, + "step": 1504, + "time_per_iteration": 4.9503560066223145 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010625, + "balance_loss_mlp": 1.01536465, + "epoch": 0.2895344363216622, + "flos": 471182944512.0, + "grad_norm": 0.04276572481675365, + "language_loss": 0.8589167, + "learning_rate": 0.0008335298814111094, + "loss": 0.86954165, + "num_input_tokens_seen": 124346736, + "router_z_loss_mlp": 0.47094727, + "step": 1505, + "time_per_iteration": 2.548398017883301 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063653, + "balance_loss_mlp": 1.01654112, + "epoch": 0.28972681800692573, + "flos": 649341098496.0, + "grad_norm": 0.03572405467889404, + "language_loss": 0.89211309, + "learning_rate": 0.0008332977163497455, + "loss": 0.90274966, + "num_input_tokens_seen": 124420816, + "router_z_loss_mlp": 0.47070312, + "step": 1506, + "time_per_iteration": 2.786355972290039 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059474, + "balance_loss_mlp": 1.01241064, + "epoch": 0.2899191996921893, + "flos": 573306720768.0, + "grad_norm": 0.03560254091063293, + "language_loss": 0.84471554, + "learning_rate": 0.0008330654218907325, + "loss": 0.85531026, + "num_input_tokens_seen": 124490480, + "router_z_loss_mlp": 0.47021484, + "step": 1507, + "time_per_iteration": 2.706066131591797 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054224, + "balance_loss_mlp": 1.00701702, + "epoch": 0.29011158137745285, + "flos": 662638047744.0, + "grad_norm": 0.03364876986368613, + "language_loss": 0.82771999, + "learning_rate": 0.0008328329981242548, + "loss": 0.8382622, + "num_input_tokens_seen": 124564960, + "router_z_loss_mlp": 0.47167969, + "step": 1508, + "time_per_iteration": 2.9025378227233887 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053376, + "balance_loss_mlp": 1.00607395, + "epoch": 0.29030396306271644, + "flos": 537403296768.0, + "grad_norm": 0.0314370875382877, + "language_loss": 0.88638061, + "learning_rate": 0.0008326004451405475, + "loss": 0.89691436, + "num_input_tokens_seen": 124637424, + "router_z_loss_mlp": 0.47265625, + "step": 1509, + "time_per_iteration": 2.740288496017456 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091124, + "balance_loss_mlp": 1.04370284, + "epoch": 0.29049634474798, + "flos": 512956700160.0, + "grad_norm": 0.04021928954994292, + "language_loss": 0.83711147, + "learning_rate": 0.0008323677630298957, + "loss": 0.84802264, + "num_input_tokens_seen": 124704832, + "router_z_loss_mlp": 0.47387695, + "step": 1510, + "time_per_iteration": 2.5700840950012207 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01056321, + "balance_loss_mlp": 1.00935256, + "epoch": 0.29068872643324356, + "flos": 614983266816.0, + "grad_norm": 0.03498537298994642, + "language_loss": 0.86212677, + "learning_rate": 0.0008321349518826345, + "loss": 0.87268996, + "num_input_tokens_seen": 124779600, + "router_z_loss_mlp": 0.46923828, + "step": 1511, + "time_per_iteration": 2.7968146800994873 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060609, + "balance_loss_mlp": 1.01364064, + "epoch": 0.2908811081185071, + "flos": 547469088768.0, + "grad_norm": 0.03734404843374857, + "language_loss": 0.95525789, + "learning_rate": 0.0008319020117891491, + "loss": 0.96586394, + "num_input_tokens_seen": 124844128, + "router_z_loss_mlp": 0.46923828, + "step": 1512, + "time_per_iteration": 2.646127939224243 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01058015, + "balance_loss_mlp": 1.01107061, + "epoch": 0.2910734898037707, + "flos": 605902186752.0, + "grad_norm": 0.03463533015087841, + "language_loss": 0.88378417, + "learning_rate": 0.0008316689428398751, + "loss": 0.89436436, + "num_input_tokens_seen": 124915376, + "router_z_loss_mlp": 0.46899414, + "step": 1513, + "time_per_iteration": 2.7310631275177 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01056228, + "balance_loss_mlp": 1.00935447, + "epoch": 0.29126587148903427, + "flos": 575836046592.0, + "grad_norm": 0.028150288904366032, + "language_loss": 0.89498413, + "learning_rate": 0.0008314357451252979, + "loss": 0.90554643, + "num_input_tokens_seen": 124995504, + "router_z_loss_mlp": 0.46826172, + "step": 1514, + "time_per_iteration": 2.8262994289398193 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054124, + "balance_loss_mlp": 1.00727487, + "epoch": 0.2914582531742978, + "flos": 572134404096.0, + "grad_norm": 0.05354948204009119, + "language_loss": 0.89001274, + "learning_rate": 0.0008312024187359527, + "loss": 0.90055394, + "num_input_tokens_seen": 125064192, + "router_z_loss_mlp": 0.46801758, + "step": 1515, + "time_per_iteration": 2.717780590057373 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105823, + "balance_loss_mlp": 1.01109469, + "epoch": 0.2916506348595614, + "flos": 732303418368.0, + "grad_norm": 0.032865630858266236, + "language_loss": 0.8831327, + "learning_rate": 0.000830968963762425, + "loss": 0.89371502, + "num_input_tokens_seen": 125150560, + "router_z_loss_mlp": 0.47094727, + "step": 1516, + "time_per_iteration": 3.080526828765869 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051181, + "balance_loss_mlp": 1.00383127, + "epoch": 0.2918430165448249, + "flos": 511467488256.0, + "grad_norm": 0.032871242995291323, + "language_loss": 0.84882748, + "learning_rate": 0.0008307353802953497, + "loss": 0.85933936, + "num_input_tokens_seen": 125219264, + "router_z_loss_mlp": 0.47314453, + "step": 1517, + "time_per_iteration": 2.744476318359375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084976, + "balance_loss_mlp": 1.03726828, + "epoch": 0.2920353982300885, + "flos": 631607616000.0, + "grad_norm": 0.03594729450056152, + "language_loss": 0.86997348, + "learning_rate": 0.0008305016684254125, + "loss": 0.88082325, + "num_input_tokens_seen": 125301904, + "router_z_loss_mlp": 0.47680664, + "step": 1518, + "time_per_iteration": 2.8340506553649902 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047384, + "balance_loss_mlp": 1.00001049, + "epoch": 0.29222777991535204, + "flos": 502671222528.0, + "grad_norm": 0.03192476620539529, + "language_loss": 0.87901479, + "learning_rate": 0.0008302678282433479, + "loss": 0.88948864, + "num_input_tokens_seen": 125367712, + "router_z_loss_mlp": 0.47338867, + "step": 1519, + "time_per_iteration": 2.5783281326293945 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048912, + "balance_loss_mlp": 1.00177681, + "epoch": 0.2924201616006156, + "flos": 487842286848.0, + "grad_norm": 0.03491462978028735, + "language_loss": 0.85667795, + "learning_rate": 0.0008300338598399411, + "loss": 0.86716712, + "num_input_tokens_seen": 125437648, + "router_z_loss_mlp": 0.47094727, + "step": 1520, + "time_per_iteration": 2.6763737201690674 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105218, + "balance_loss_mlp": 1.0049969, + "epoch": 0.2926125432858792, + "flos": 477411000576.0, + "grad_norm": 0.036990289889529016, + "language_loss": 0.957196, + "learning_rate": 0.0008297997633060263, + "loss": 0.96771777, + "num_input_tokens_seen": 125502432, + "router_z_loss_mlp": 0.47143555, + "step": 1521, + "time_per_iteration": 2.5368785858154297 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055222, + "balance_loss_mlp": 1.00799167, + "epoch": 0.29280492497114274, + "flos": 677868449280.0, + "grad_norm": 0.0362418142607002, + "language_loss": 0.86058486, + "learning_rate": 0.0008295655387324883, + "loss": 0.87113714, + "num_input_tokens_seen": 125575424, + "router_z_loss_mlp": 0.47192383, + "step": 1522, + "time_per_iteration": 2.8447062969207764 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055265, + "balance_loss_mlp": 1.0079869, + "epoch": 0.29299730665640633, + "flos": 459345071616.0, + "grad_norm": 0.03782463739456531, + "language_loss": 0.86245579, + "learning_rate": 0.0008293311862102609, + "loss": 0.87300849, + "num_input_tokens_seen": 125639040, + "router_z_loss_mlp": 0.47241211, + "step": 1523, + "time_per_iteration": 2.5397908687591553 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050568, + "balance_loss_mlp": 1.00328994, + "epoch": 0.29318968834166986, + "flos": 447496505088.0, + "grad_norm": 0.03500221637525105, + "language_loss": 0.90103561, + "learning_rate": 0.0008290967058303275, + "loss": 0.91154128, + "num_input_tokens_seen": 125701712, + "router_z_loss_mlp": 0.47241211, + "step": 1524, + "time_per_iteration": 2.4784419536590576 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081064, + "balance_loss_mlp": 1.03349924, + "epoch": 0.29338207002693345, + "flos": 451256473344.0, + "grad_norm": 0.038529021386844775, + "language_loss": 0.87365985, + "learning_rate": 0.0008288620976837219, + "loss": 0.88447046, + "num_input_tokens_seen": 125765088, + "router_z_loss_mlp": 0.4753418, + "step": 1525, + "time_per_iteration": 2.540762424468994 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054383, + "balance_loss_mlp": 1.00684249, + "epoch": 0.293574451712197, + "flos": 503285571072.0, + "grad_norm": 0.03477645959362119, + "language_loss": 0.8372373, + "learning_rate": 0.000828627361861527, + "loss": 0.84778112, + "num_input_tokens_seen": 125831328, + "router_z_loss_mlp": 0.47509766, + "step": 1526, + "time_per_iteration": 2.583862066268921 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01058639, + "balance_loss_mlp": 1.01124167, + "epoch": 0.29376683339746057, + "flos": 697684104960.0, + "grad_norm": 0.03858140978476568, + "language_loss": 0.85503912, + "learning_rate": 0.0008283924984548752, + "loss": 0.8656255, + "num_input_tokens_seen": 125903664, + "router_z_loss_mlp": 0.47363281, + "step": 1527, + "time_per_iteration": 2.848947525024414 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054231, + "balance_loss_mlp": 1.00680923, + "epoch": 0.2939592150827241, + "flos": 479542751232.0, + "grad_norm": 0.03208252397749005, + "language_loss": 0.8577444, + "learning_rate": 0.0008281575075549485, + "loss": 0.86828673, + "num_input_tokens_seen": 125971856, + "router_z_loss_mlp": 0.47387695, + "step": 1528, + "time_per_iteration": 2.6076998710632324 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063099, + "balance_loss_mlp": 1.01703644, + "epoch": 0.2941515967679877, + "flos": 1488389507328.0, + "grad_norm": 0.010941905571601225, + "language_loss": 0.77352691, + "learning_rate": 0.000827922389252979, + "loss": 0.78415793, + "num_input_tokens_seen": 126183968, + "router_z_loss_mlp": 0.45996094, + "step": 1529, + "time_per_iteration": 4.672811508178711 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01175133, + "balance_loss_mlp": 1.12690103, + "epoch": 0.2943439784532513, + "flos": 675400361472.0, + "grad_norm": 0.05299717257038309, + "language_loss": 0.90924174, + "learning_rate": 0.0008276871436402469, + "loss": 0.92099309, + "num_input_tokens_seen": 126254448, + "router_z_loss_mlp": 0.48217773, + "step": 1530, + "time_per_iteration": 2.8220977783203125 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010581, + "balance_loss_mlp": 1.01096439, + "epoch": 0.2945363601385148, + "flos": 577383584256.0, + "grad_norm": 0.03620573442946411, + "language_loss": 0.88955015, + "learning_rate": 0.000827451770808083, + "loss": 0.90013111, + "num_input_tokens_seen": 126328208, + "router_z_loss_mlp": 0.47094727, + "step": 1531, + "time_per_iteration": 2.6981046199798584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01057368, + "balance_loss_mlp": 1.01013768, + "epoch": 0.2947287418237784, + "flos": 481618121472.0, + "grad_norm": 0.03382548660060083, + "language_loss": 0.84345412, + "learning_rate": 0.0008272162708478674, + "loss": 0.85402787, + "num_input_tokens_seen": 126396464, + "router_z_loss_mlp": 0.47192383, + "step": 1532, + "time_per_iteration": 2.5975306034088135 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01058676, + "balance_loss_mlp": 1.01151645, + "epoch": 0.2949211235090419, + "flos": 559261274880.0, + "grad_norm": 0.03154442800865326, + "language_loss": 0.87544608, + "learning_rate": 0.000826980643851029, + "loss": 0.88603282, + "num_input_tokens_seen": 126468960, + "router_z_loss_mlp": 0.47119141, + "step": 1533, + "time_per_iteration": 2.6889007091522217 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063865, + "balance_loss_mlp": 1.01675379, + "epoch": 0.2951135051943055, + "flos": 484857060096.0, + "grad_norm": 0.03876668067992812, + "language_loss": 0.85914761, + "learning_rate": 0.0008267448899090464, + "loss": 0.86978626, + "num_input_tokens_seen": 126536496, + "router_z_loss_mlp": 0.47070312, + "step": 1534, + "time_per_iteration": 2.5630924701690674 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062291, + "balance_loss_mlp": 1.01498842, + "epoch": 0.29530588687956905, + "flos": 551422497792.0, + "grad_norm": 0.034923849251574525, + "language_loss": 0.81812191, + "learning_rate": 0.0008265090091134473, + "loss": 0.82874477, + "num_input_tokens_seen": 126614048, + "router_z_loss_mlp": 0.47265625, + "step": 1535, + "time_per_iteration": 2.8399465084075928 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105968, + "balance_loss_mlp": 1.01235437, + "epoch": 0.29549826856483263, + "flos": 674310670080.0, + "grad_norm": 0.028029616611284485, + "language_loss": 0.80873084, + "learning_rate": 0.0008262730015558088, + "loss": 0.81932771, + "num_input_tokens_seen": 126697248, + "router_z_loss_mlp": 0.47290039, + "step": 1536, + "time_per_iteration": 2.874537944793701 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059174, + "balance_loss_mlp": 1.01151371, + "epoch": 0.29569065025009617, + "flos": 766136329728.0, + "grad_norm": 0.03177117147053012, + "language_loss": 0.82803708, + "learning_rate": 0.0008260368673277574, + "loss": 0.83862883, + "num_input_tokens_seen": 126782496, + "router_z_loss_mlp": 0.47631836, + "step": 1537, + "time_per_iteration": 3.0976641178131104 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053728, + "balance_loss_mlp": 1.00573432, + "epoch": 0.29588303193535975, + "flos": 544831859712.0, + "grad_norm": 0.031452220479770684, + "language_loss": 0.84814745, + "learning_rate": 0.0008258006065209682, + "loss": 0.85868478, + "num_input_tokens_seen": 126857328, + "router_z_loss_mlp": 0.47973633, + "step": 1538, + "time_per_iteration": 2.7704694271087646 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115804, + "balance_loss_mlp": 1.06735778, + "epoch": 0.29607541362062334, + "flos": 598146034944.0, + "grad_norm": 0.04896094729194987, + "language_loss": 0.81966412, + "learning_rate": 0.0008255642192271657, + "loss": 0.83082211, + "num_input_tokens_seen": 126932608, + "router_z_loss_mlp": 0.484375, + "step": 1539, + "time_per_iteration": 2.774122714996338 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059901, + "balance_loss_mlp": 1.01219356, + "epoch": 0.29626779530588687, + "flos": 611038606080.0, + "grad_norm": 0.02837345788652225, + "language_loss": 0.84628069, + "learning_rate": 0.0008253277055381241, + "loss": 0.85687971, + "num_input_tokens_seen": 127008928, + "router_z_loss_mlp": 0.47680664, + "step": 1540, + "time_per_iteration": 2.837587833404541 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061212, + "balance_loss_mlp": 1.01340961, + "epoch": 0.29646017699115046, + "flos": 868959025152.0, + "grad_norm": 0.03662488769273821, + "language_loss": 0.86757702, + "learning_rate": 0.0008250910655456658, + "loss": 0.87818909, + "num_input_tokens_seen": 127097104, + "router_z_loss_mlp": 0.4777832, + "step": 1541, + "time_per_iteration": 3.123687982559204 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010574, + "balance_loss_mlp": 1.00954938, + "epoch": 0.296652558676414, + "flos": 496881570816.0, + "grad_norm": 0.03318095479066229, + "language_loss": 0.84889704, + "learning_rate": 0.0008248542993416625, + "loss": 0.85947102, + "num_input_tokens_seen": 127165264, + "router_z_loss_mlp": 0.47827148, + "step": 1542, + "time_per_iteration": 2.637747049331665 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068583, + "balance_loss_mlp": 1.02082753, + "epoch": 0.2968449403616776, + "flos": 572627243520.0, + "grad_norm": 0.03443634648546435, + "language_loss": 0.84426934, + "learning_rate": 0.0008246174070180352, + "loss": 0.8549552, + "num_input_tokens_seen": 127238992, + "router_z_loss_mlp": 0.47729492, + "step": 1543, + "time_per_iteration": 2.6872684955596924 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062899, + "balance_loss_mlp": 1.01511967, + "epoch": 0.2970373220469411, + "flos": 795651304704.0, + "grad_norm": 0.035080805136432934, + "language_loss": 0.85198414, + "learning_rate": 0.0008243803886667537, + "loss": 0.86261314, + "num_input_tokens_seen": 127328160, + "router_z_loss_mlp": 0.47753906, + "step": 1544, + "time_per_iteration": 3.13710618019104 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069403, + "balance_loss_mlp": 1.02145684, + "epoch": 0.2972297037322047, + "flos": 662249220864.0, + "grad_norm": 0.04094703338464919, + "language_loss": 0.80137819, + "learning_rate": 0.0008241432443798364, + "loss": 0.81207222, + "num_input_tokens_seen": 127407328, + "router_z_loss_mlp": 0.47924805, + "step": 1545, + "time_per_iteration": 2.841092109680176 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061565, + "balance_loss_mlp": 1.0138818, + "epoch": 0.29742208541746823, + "flos": 598232550912.0, + "grad_norm": 0.028624248431763765, + "language_loss": 0.86072361, + "learning_rate": 0.0008239059742493512, + "loss": 0.87133932, + "num_input_tokens_seen": 127477136, + "router_z_loss_mlp": 0.4765625, + "step": 1546, + "time_per_iteration": 2.7034194469451904 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01349258, + "balance_loss_mlp": 1.29957151, + "epoch": 0.2976144671027318, + "flos": 771339823104.0, + "grad_norm": 0.07377893489124947, + "language_loss": 0.88059306, + "learning_rate": 0.0008236685783674142, + "loss": 0.89408565, + "num_input_tokens_seen": 127565680, + "router_z_loss_mlp": 0.49584961, + "step": 1547, + "time_per_iteration": 3.063077688217163 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071266, + "balance_loss_mlp": 1.02510834, + "epoch": 0.2978068487879954, + "flos": 1487914164480.0, + "grad_norm": 0.01225569795264997, + "language_loss": 0.76221192, + "learning_rate": 0.0008234310568261911, + "loss": 0.7729246, + "num_input_tokens_seen": 127791584, + "router_z_loss_mlp": 0.4609375, + "step": 1548, + "time_per_iteration": 4.894561767578125 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073136, + "balance_loss_mlp": 1.02564275, + "epoch": 0.29799923047325894, + "flos": 476330057472.0, + "grad_norm": 0.041178192237982324, + "language_loss": 0.84313369, + "learning_rate": 0.0008231934097178955, + "loss": 0.85386503, + "num_input_tokens_seen": 127860112, + "router_z_loss_mlp": 0.47460938, + "step": 1549, + "time_per_iteration": 2.630146026611328 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081209, + "balance_loss_mlp": 1.03362012, + "epoch": 0.2981916121585225, + "flos": 761169051648.0, + "grad_norm": 0.037198017460407115, + "language_loss": 0.86745787, + "learning_rate": 0.0008229556371347903, + "loss": 0.87826997, + "num_input_tokens_seen": 127938752, + "router_z_loss_mlp": 0.47558594, + "step": 1550, + "time_per_iteration": 2.9614980220794678 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081757, + "balance_loss_mlp": 1.03416848, + "epoch": 0.29838399384378606, + "flos": 876517845504.0, + "grad_norm": 0.043512769843104544, + "language_loss": 0.80808616, + "learning_rate": 0.0008227177391691874, + "loss": 0.81890368, + "num_input_tokens_seen": 128022192, + "router_z_loss_mlp": 0.47558594, + "step": 1551, + "time_per_iteration": 3.11059832572937 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081803, + "balance_loss_mlp": 1.03445339, + "epoch": 0.29857637552904964, + "flos": 580752780288.0, + "grad_norm": 0.039547132323558824, + "language_loss": 0.90871334, + "learning_rate": 0.0008224797159134463, + "loss": 0.91953135, + "num_input_tokens_seen": 128097776, + "router_z_loss_mlp": 0.47314453, + "step": 1552, + "time_per_iteration": 2.7177717685699463 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077357, + "balance_loss_mlp": 1.03026903, + "epoch": 0.2987687572143132, + "flos": 837809029632.0, + "grad_norm": 0.03288289742732326, + "language_loss": 0.84735203, + "learning_rate": 0.0008222415674599765, + "loss": 0.85812569, + "num_input_tokens_seen": 128179888, + "router_z_loss_mlp": 0.47045898, + "step": 1553, + "time_per_iteration": 3.090768814086914 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072084, + "balance_loss_mlp": 1.02513897, + "epoch": 0.29896113889957676, + "flos": 568168356096.0, + "grad_norm": 0.03857517262144223, + "language_loss": 0.8489393, + "learning_rate": 0.0008220032939012349, + "loss": 0.85966009, + "num_input_tokens_seen": 128251152, + "router_z_loss_mlp": 0.46899414, + "step": 1554, + "time_per_iteration": 2.7050375938415527 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072322, + "balance_loss_mlp": 1.02554476, + "epoch": 0.29915352058484035, + "flos": 499836662016.0, + "grad_norm": 0.03341170745827686, + "language_loss": 0.89154899, + "learning_rate": 0.0008217648953297277, + "loss": 0.90227222, + "num_input_tokens_seen": 128327600, + "router_z_loss_mlp": 0.46728516, + "step": 1555, + "time_per_iteration": 2.8296022415161133 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106052, + "balance_loss_mlp": 1.01376653, + "epoch": 0.2993459022701039, + "flos": 593215695360.0, + "grad_norm": 0.042418434687241845, + "language_loss": 0.79395097, + "learning_rate": 0.0008215263718380095, + "loss": 0.80455619, + "num_input_tokens_seen": 128398432, + "router_z_loss_mlp": 0.46704102, + "step": 1556, + "time_per_iteration": 2.683760643005371 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02541041, + "balance_loss_mlp": 2.4871583, + "epoch": 0.29953828395536747, + "flos": 573473916672.0, + "grad_norm": 0.19828678552993478, + "language_loss": 0.85491472, + "learning_rate": 0.0008212877235186833, + "loss": 0.88032514, + "num_input_tokens_seen": 128469696, + "router_z_loss_mlp": 0.54003906, + "step": 1557, + "time_per_iteration": 2.6963422298431396 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086143, + "balance_loss_mlp": 1.0413208, + "epoch": 0.299730665640631, + "flos": 1508086566144.0, + "grad_norm": 0.015049722833054002, + "language_loss": 0.77737558, + "learning_rate": 0.0008210489504644005, + "loss": 0.78823709, + "num_input_tokens_seen": 128698560, + "router_z_loss_mlp": 0.44824219, + "step": 1558, + "time_per_iteration": 4.971554279327393 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098273, + "balance_loss_mlp": 1.05063736, + "epoch": 0.2999230473258946, + "flos": 514808494080.0, + "grad_norm": 0.04814176942398931, + "language_loss": 0.82249933, + "learning_rate": 0.0008208100527678611, + "loss": 0.83348203, + "num_input_tokens_seen": 128765952, + "router_z_loss_mlp": 0.47607422, + "step": 1559, + "time_per_iteration": 2.6210360527038574 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01130027, + "balance_loss_mlp": 1.08127058, + "epoch": 0.3001154290111581, + "flos": 835855168512.0, + "grad_norm": 0.05333171316141313, + "language_loss": 0.80031002, + "learning_rate": 0.0008205710305218135, + "loss": 0.81161028, + "num_input_tokens_seen": 128840048, + "router_z_loss_mlp": 0.48730469, + "step": 1560, + "time_per_iteration": 3.0021140575408936 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01168816, + "balance_loss_mlp": 1.11898673, + "epoch": 0.3003078106964217, + "flos": 557946061824.0, + "grad_norm": 0.05314988858528354, + "language_loss": 0.91578549, + "learning_rate": 0.0008203318838190541, + "loss": 0.92747366, + "num_input_tokens_seen": 128912496, + "router_z_loss_mlp": 0.49707031, + "step": 1561, + "time_per_iteration": 2.7369065284729004 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153064, + "balance_loss_mlp": 1.10247147, + "epoch": 0.30050019238168524, + "flos": 527169341952.0, + "grad_norm": 0.047834322975263, + "language_loss": 0.86778915, + "learning_rate": 0.0008200926127524281, + "loss": 0.87931979, + "num_input_tokens_seen": 128980624, + "router_z_loss_mlp": 0.50634766, + "step": 1562, + "time_per_iteration": 2.6357791423797607 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01157108, + "balance_loss_mlp": 1.10565686, + "epoch": 0.3006925740669488, + "flos": 578937924864.0, + "grad_norm": 0.04357261617021945, + "language_loss": 0.84502149, + "learning_rate": 0.0008198532174148289, + "loss": 0.85659254, + "num_input_tokens_seen": 129050576, + "router_z_loss_mlp": 0.51513672, + "step": 1563, + "time_per_iteration": 2.7241976261138916 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097195, + "balance_loss_mlp": 1.04941559, + "epoch": 0.3008849557522124, + "flos": 1493613409536.0, + "grad_norm": 0.019627167679756308, + "language_loss": 0.8068617, + "learning_rate": 0.0008196136978991977, + "loss": 0.8178336, + "num_input_tokens_seen": 129278880, + "router_z_loss_mlp": 0.47753906, + "step": 1564, + "time_per_iteration": 4.851420879364014 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122708, + "balance_loss_mlp": 1.07035148, + "epoch": 0.30107733743747594, + "flos": 510824949504.0, + "grad_norm": 0.045341503179798265, + "language_loss": 0.90611446, + "learning_rate": 0.0008193740542985244, + "loss": 0.91734147, + "num_input_tokens_seen": 129346560, + "router_z_loss_mlp": 0.52441406, + "step": 1565, + "time_per_iteration": 2.62724232673645 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113673, + "balance_loss_mlp": 1.06098223, + "epoch": 0.30126971912273953, + "flos": 588821936640.0, + "grad_norm": 0.04014967632238747, + "language_loss": 0.87587321, + "learning_rate": 0.0008191342867058467, + "loss": 0.88700998, + "num_input_tokens_seen": 129420448, + "router_z_loss_mlp": 0.52783203, + "step": 1566, + "time_per_iteration": 2.766045570373535 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01133038, + "balance_loss_mlp": 1.07991791, + "epoch": 0.30146210080800306, + "flos": 603221216256.0, + "grad_norm": 0.039455426947262194, + "language_loss": 0.84397018, + "learning_rate": 0.0008188943952142509, + "loss": 0.85530061, + "num_input_tokens_seen": 129494032, + "router_z_loss_mlp": 0.53222656, + "step": 1567, + "time_per_iteration": 2.798323154449463 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113428, + "balance_loss_mlp": 1.06030834, + "epoch": 0.30165448249326665, + "flos": 919287973632.0, + "grad_norm": 0.03836627098538091, + "language_loss": 0.83653766, + "learning_rate": 0.0008186543799168711, + "loss": 0.84767193, + "num_input_tokens_seen": 129569088, + "router_z_loss_mlp": 0.53222656, + "step": 1568, + "time_per_iteration": 3.1216585636138916 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112139, + "balance_loss_mlp": 1.0594008, + "epoch": 0.3018468641785302, + "flos": 778631325696.0, + "grad_norm": 0.037681015369085746, + "language_loss": 0.89441907, + "learning_rate": 0.0008184142409068892, + "loss": 0.90554047, + "num_input_tokens_seen": 129647968, + "router_z_loss_mlp": 0.52832031, + "step": 1569, + "time_per_iteration": 2.9987363815307617 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087962, + "balance_loss_mlp": 1.03584409, + "epoch": 0.30203924586379377, + "flos": 523389931776.0, + "grad_norm": 0.031063886155947292, + "language_loss": 0.87584674, + "learning_rate": 0.000818173978277536, + "loss": 0.88672638, + "num_input_tokens_seen": 129718928, + "router_z_loss_mlp": 0.52197266, + "step": 1570, + "time_per_iteration": 2.657801389694214 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092945, + "balance_loss_mlp": 1.04125619, + "epoch": 0.3022316275490573, + "flos": 525649994496.0, + "grad_norm": 0.03542742618693904, + "language_loss": 0.8460654, + "learning_rate": 0.000817933592122089, + "loss": 0.85699487, + "num_input_tokens_seen": 129790128, + "router_z_loss_mlp": 0.51757812, + "step": 1571, + "time_per_iteration": 2.699676752090454 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094536, + "balance_loss_mlp": 1.04289424, + "epoch": 0.3024240092343209, + "flos": 480873515520.0, + "grad_norm": 0.03710559119511486, + "language_loss": 0.84148443, + "learning_rate": 0.0008176930825338749, + "loss": 0.85242975, + "num_input_tokens_seen": 129857536, + "router_z_loss_mlp": 0.51708984, + "step": 1572, + "time_per_iteration": 2.560293197631836 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085585, + "balance_loss_mlp": 1.03446782, + "epoch": 0.3026163909195845, + "flos": 688431938304.0, + "grad_norm": 0.03769478699711506, + "language_loss": 0.89810324, + "learning_rate": 0.0008174524496062679, + "loss": 0.90895915, + "num_input_tokens_seen": 129931440, + "router_z_loss_mlp": 0.51171875, + "step": 1573, + "time_per_iteration": 2.9185256958007812 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083791, + "balance_loss_mlp": 1.03334129, + "epoch": 0.302808772604848, + "flos": 544087253760.0, + "grad_norm": 0.033203995249134796, + "language_loss": 0.86450267, + "learning_rate": 0.0008172116934326894, + "loss": 0.87534058, + "num_input_tokens_seen": 130005200, + "router_z_loss_mlp": 0.50488281, + "step": 1574, + "time_per_iteration": 2.77254056930542 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107482, + "balance_loss_mlp": 1.02456117, + "epoch": 0.3030011542901116, + "flos": 476052046080.0, + "grad_norm": 0.03232260410081742, + "language_loss": 0.88820696, + "learning_rate": 0.0008169708141066097, + "loss": 0.89895517, + "num_input_tokens_seen": 130069136, + "router_z_loss_mlp": 0.50268555, + "step": 1575, + "time_per_iteration": 2.5428524017333984 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083713, + "balance_loss_mlp": 1.03402615, + "epoch": 0.30319353597537513, + "flos": 482473542912.0, + "grad_norm": 0.035261838486320786, + "language_loss": 0.91478366, + "learning_rate": 0.0008167298117215465, + "loss": 0.92562079, + "num_input_tokens_seen": 130135456, + "router_z_loss_mlp": 0.49536133, + "step": 1576, + "time_per_iteration": 2.5388023853302 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064287, + "balance_loss_mlp": 1.0151732, + "epoch": 0.3033859176606387, + "flos": 706113897984.0, + "grad_norm": 0.033895137386355495, + "language_loss": 0.89157575, + "learning_rate": 0.0008164886863710649, + "loss": 0.90221858, + "num_input_tokens_seen": 130213712, + "router_z_loss_mlp": 0.49047852, + "step": 1577, + "time_per_iteration": 2.9326250553131104 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072249, + "balance_loss_mlp": 1.02363503, + "epoch": 0.30357829934590225, + "flos": 766110084864.0, + "grad_norm": 0.03320904121402137, + "language_loss": 0.87079322, + "learning_rate": 0.0008162474381487783, + "loss": 0.88151574, + "num_input_tokens_seen": 130290928, + "router_z_loss_mlp": 0.48608398, + "step": 1578, + "time_per_iteration": 3.0217320919036865 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069823, + "balance_loss_mlp": 1.02135277, + "epoch": 0.30377068103116583, + "flos": 533449887744.0, + "grad_norm": 0.035817825196195696, + "language_loss": 0.854909, + "learning_rate": 0.0008160060671483475, + "loss": 0.86560726, + "num_input_tokens_seen": 130362672, + "router_z_loss_mlp": 0.48461914, + "step": 1579, + "time_per_iteration": 2.6730797290802 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074874, + "balance_loss_mlp": 1.02647483, + "epoch": 0.3039630627164294, + "flos": 511224470016.0, + "grad_norm": 0.04566645575365512, + "language_loss": 0.84833682, + "learning_rate": 0.0008157645734634809, + "loss": 0.85908556, + "num_input_tokens_seen": 130428848, + "router_z_loss_mlp": 0.48388672, + "step": 1580, + "time_per_iteration": 2.5822741985321045 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01186287, + "balance_loss_mlp": 1.14089203, + "epoch": 0.30415544440169295, + "flos": 1509190841856.0, + "grad_norm": 0.045615209750242004, + "language_loss": 0.76896489, + "learning_rate": 0.000815522957187935, + "loss": 0.78082776, + "num_input_tokens_seen": 130665440, + "router_z_loss_mlp": 0.453125, + "step": 1581, + "time_per_iteration": 4.900806665420532 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01157879, + "balance_loss_mlp": 1.11257935, + "epoch": 0.30434782608695654, + "flos": 1461789772800.0, + "grad_norm": 0.04177274485031814, + "language_loss": 0.73214495, + "learning_rate": 0.0008152812184155132, + "loss": 0.74372375, + "num_input_tokens_seen": 130895248, + "router_z_loss_mlp": 0.45214844, + "step": 1582, + "time_per_iteration": 4.890560150146484 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071245, + "balance_loss_mlp": 1.02329922, + "epoch": 0.3045402077722201, + "flos": 483535044096.0, + "grad_norm": 0.03665669352532136, + "language_loss": 0.84926951, + "learning_rate": 0.000815039357240067, + "loss": 0.85998201, + "num_input_tokens_seen": 130964544, + "router_z_loss_mlp": 0.47924805, + "step": 1583, + "time_per_iteration": 2.655641555786133 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075238, + "balance_loss_mlp": 1.02695799, + "epoch": 0.30473258945748366, + "flos": 544627725312.0, + "grad_norm": 0.03699880598765725, + "language_loss": 0.86035675, + "learning_rate": 0.0008147973737554952, + "loss": 0.87110913, + "num_input_tokens_seen": 131041744, + "router_z_loss_mlp": 0.48266602, + "step": 1584, + "time_per_iteration": 2.8118185997009277 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066041, + "balance_loss_mlp": 1.01754665, + "epoch": 0.3049249711427472, + "flos": 568122669312.0, + "grad_norm": 0.039919187148179, + "language_loss": 0.86646891, + "learning_rate": 0.000814555268055744, + "loss": 0.87712932, + "num_input_tokens_seen": 131108864, + "router_z_loss_mlp": 0.48486328, + "step": 1585, + "time_per_iteration": 2.618649482727051 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067734, + "balance_loss_mlp": 1.01926374, + "epoch": 0.3051173528280108, + "flos": 529290398976.0, + "grad_norm": 0.034961032963054674, + "language_loss": 0.88066852, + "learning_rate": 0.0008143130402348073, + "loss": 0.89134592, + "num_input_tokens_seen": 131181104, + "router_z_loss_mlp": 0.48461914, + "step": 1586, + "time_per_iteration": 2.6645073890686035 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064545, + "balance_loss_mlp": 1.01593137, + "epoch": 0.3053097345132743, + "flos": 587600042496.0, + "grad_norm": 0.03198607314396223, + "language_loss": 0.79707628, + "learning_rate": 0.0008140706903867265, + "loss": 0.80772173, + "num_input_tokens_seen": 131258704, + "router_z_loss_mlp": 0.48608398, + "step": 1587, + "time_per_iteration": 2.772688150405884 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065913, + "balance_loss_mlp": 1.01706147, + "epoch": 0.3055021161985379, + "flos": 608201133312.0, + "grad_norm": 0.03820330265300666, + "language_loss": 0.90882033, + "learning_rate": 0.0008138282186055897, + "loss": 0.91947937, + "num_input_tokens_seen": 131325712, + "router_z_loss_mlp": 0.48803711, + "step": 1588, + "time_per_iteration": 2.6824429035186768 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106751, + "balance_loss_mlp": 1.01851535, + "epoch": 0.3056944978838015, + "flos": 574963128576.0, + "grad_norm": 0.03364087196891663, + "language_loss": 0.83419842, + "learning_rate": 0.0008135856249855331, + "loss": 0.84487349, + "num_input_tokens_seen": 131397568, + "router_z_loss_mlp": 0.48950195, + "step": 1589, + "time_per_iteration": 2.6829729080200195 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065757, + "balance_loss_mlp": 1.0164994, + "epoch": 0.305886879569065, + "flos": 635072076288.0, + "grad_norm": 0.036524553871552005, + "language_loss": 0.90591866, + "learning_rate": 0.0008133429096207398, + "loss": 0.91657621, + "num_input_tokens_seen": 131467632, + "router_z_loss_mlp": 0.4909668, + "step": 1590, + "time_per_iteration": 2.7734742164611816 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135399, + "balance_loss_mlp": 1.08351898, + "epoch": 0.3060792612543286, + "flos": 1372133769216.0, + "grad_norm": 0.023040785082221134, + "language_loss": 0.75312257, + "learning_rate": 0.0008131000726054403, + "loss": 0.76447666, + "num_input_tokens_seen": 131702224, + "router_z_loss_mlp": 0.51953125, + "step": 1591, + "time_per_iteration": 4.964044094085693 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106727, + "balance_loss_mlp": 1.01806068, + "epoch": 0.30627164293959214, + "flos": 519619269888.0, + "grad_norm": 0.029618090290997726, + "language_loss": 0.87174189, + "learning_rate": 0.0008128571140339123, + "loss": 0.88241458, + "num_input_tokens_seen": 131774608, + "router_z_loss_mlp": 0.49121094, + "step": 1592, + "time_per_iteration": 2.6813180446624756 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068394, + "balance_loss_mlp": 1.01942289, + "epoch": 0.3064640246248557, + "flos": 456533843712.0, + "grad_norm": 0.02963099688993501, + "language_loss": 0.87551641, + "learning_rate": 0.0008126140340004805, + "loss": 0.88620031, + "num_input_tokens_seen": 131841216, + "router_z_loss_mlp": 0.48876953, + "step": 1593, + "time_per_iteration": 2.5293447971343994 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064923, + "balance_loss_mlp": 1.01580834, + "epoch": 0.30665640631011926, + "flos": 851609511936.0, + "grad_norm": 0.028917997945976257, + "language_loss": 0.82855684, + "learning_rate": 0.0008123708325995172, + "loss": 0.8392061, + "num_input_tokens_seen": 131937584, + "router_z_loss_mlp": 0.49023438, + "step": 1594, + "time_per_iteration": 3.1976583003997803 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068322, + "balance_loss_mlp": 1.01937473, + "epoch": 0.30684878799538284, + "flos": 759616656384.0, + "grad_norm": 0.02786640270256765, + "language_loss": 0.80270225, + "learning_rate": 0.0008121275099254414, + "loss": 0.81338549, + "num_input_tokens_seen": 132012656, + "router_z_loss_mlp": 0.48901367, + "step": 1595, + "time_per_iteration": 2.9073448181152344 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105895, + "balance_loss_mlp": 1.01069379, + "epoch": 0.3070411696806464, + "flos": 518596652544.0, + "grad_norm": 0.02828411740511225, + "language_loss": 0.89261508, + "learning_rate": 0.0008118840660727194, + "loss": 0.90320462, + "num_input_tokens_seen": 132083728, + "router_z_loss_mlp": 0.48242188, + "step": 1596, + "time_per_iteration": 2.6137096881866455 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105679, + "balance_loss_mlp": 1.00855815, + "epoch": 0.30723355136590996, + "flos": 845791670016.0, + "grad_norm": 0.02807637717187332, + "language_loss": 0.8853125, + "learning_rate": 0.0008116405011358644, + "loss": 0.89588046, + "num_input_tokens_seen": 132170896, + "router_z_loss_mlp": 0.48217773, + "step": 1597, + "time_per_iteration": 3.1528680324554443 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059967, + "balance_loss_mlp": 1.01163971, + "epoch": 0.30742593305117355, + "flos": 467079836160.0, + "grad_norm": 0.032917462624290315, + "language_loss": 0.80716425, + "learning_rate": 0.0008113968152094369, + "loss": 0.81776392, + "num_input_tokens_seen": 132234592, + "router_z_loss_mlp": 0.4831543, + "step": 1598, + "time_per_iteration": 2.5390987396240234 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059252, + "balance_loss_mlp": 1.011235, + "epoch": 0.3076183147364371, + "flos": 687817589760.0, + "grad_norm": 0.03298344899906339, + "language_loss": 0.830042, + "learning_rate": 0.0008111530083880438, + "loss": 0.84063458, + "num_input_tokens_seen": 132314720, + "router_z_loss_mlp": 0.47998047, + "step": 1599, + "time_per_iteration": 2.904327154159546 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059695, + "balance_loss_mlp": 1.01170099, + "epoch": 0.30781069642170067, + "flos": 615180598272.0, + "grad_norm": 0.03364515132561045, + "language_loss": 0.86925042, + "learning_rate": 0.0008109090807663399, + "loss": 0.87984729, + "num_input_tokens_seen": 132388768, + "router_z_loss_mlp": 0.47973633, + "step": 1600, + "time_per_iteration": 2.794553756713867 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059845, + "balance_loss_mlp": 1.01206601, + "epoch": 0.3080030781069642, + "flos": 591509710080.0, + "grad_norm": 0.029450986393402313, + "language_loss": 0.89288217, + "learning_rate": 0.0008106650324390257, + "loss": 0.90348059, + "num_input_tokens_seen": 132472544, + "router_z_loss_mlp": 0.47753906, + "step": 1601, + "time_per_iteration": 2.825118064880371 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055341, + "balance_loss_mlp": 1.00744271, + "epoch": 0.3081954597922278, + "flos": 563691972096.0, + "grad_norm": 0.03217567830931305, + "language_loss": 0.82333392, + "learning_rate": 0.0008104208635008493, + "loss": 0.83388734, + "num_input_tokens_seen": 132541968, + "router_z_loss_mlp": 0.47875977, + "step": 1602, + "time_per_iteration": 2.7727856636047363 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01057631, + "balance_loss_mlp": 1.0099231, + "epoch": 0.3083878414774913, + "flos": 448762140672.0, + "grad_norm": 0.03928010080840531, + "language_loss": 0.82422024, + "learning_rate": 0.0008101765740466058, + "loss": 0.83479655, + "num_input_tokens_seen": 132606976, + "router_z_loss_mlp": 0.47680664, + "step": 1603, + "time_per_iteration": 2.5764591693878174 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106031, + "balance_loss_mlp": 1.01272202, + "epoch": 0.3085802231627549, + "flos": 494545685760.0, + "grad_norm": 0.03880240670965016, + "language_loss": 0.84925759, + "learning_rate": 0.0008099321641711364, + "loss": 0.85986066, + "num_input_tokens_seen": 132677984, + "router_z_loss_mlp": 0.47558594, + "step": 1604, + "time_per_iteration": 2.6562154293060303 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059833, + "balance_loss_mlp": 1.01262641, + "epoch": 0.3087726048480185, + "flos": 488690905344.0, + "grad_norm": 0.030963234073246262, + "language_loss": 0.84138477, + "learning_rate": 0.0008096876339693295, + "loss": 0.85198307, + "num_input_tokens_seen": 132749136, + "router_z_loss_mlp": 0.47167969, + "step": 1605, + "time_per_iteration": 2.6818747520446777 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01057926, + "balance_loss_mlp": 1.01083875, + "epoch": 0.308964986533282, + "flos": 731888346624.0, + "grad_norm": 0.03606871420254603, + "language_loss": 0.82584137, + "learning_rate": 0.0008094429835361206, + "loss": 0.83642066, + "num_input_tokens_seen": 132823824, + "router_z_loss_mlp": 0.47045898, + "step": 1606, + "time_per_iteration": 2.940202236175537 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059317, + "balance_loss_mlp": 1.01211011, + "epoch": 0.3091573682185456, + "flos": 606516535296.0, + "grad_norm": 0.033324674351776856, + "language_loss": 0.86802429, + "learning_rate": 0.0008091982129664908, + "loss": 0.87861747, + "num_input_tokens_seen": 132895936, + "router_z_loss_mlp": 0.47167969, + "step": 1607, + "time_per_iteration": 2.7152366638183594 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055819, + "balance_loss_mlp": 1.00858819, + "epoch": 0.30934974990380915, + "flos": 461307681024.0, + "grad_norm": 0.0316485976101594, + "language_loss": 0.83554763, + "learning_rate": 0.0008089533223554687, + "loss": 0.84610581, + "num_input_tokens_seen": 132968960, + "router_z_loss_mlp": 0.47192383, + "step": 1608, + "time_per_iteration": 2.73236083984375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054105, + "balance_loss_mlp": 1.00692201, + "epoch": 0.30954213158907273, + "flos": 554568117504.0, + "grad_norm": 0.03240022060424308, + "language_loss": 0.85798776, + "learning_rate": 0.0008087083117981294, + "loss": 0.86852884, + "num_input_tokens_seen": 133048448, + "router_z_loss_mlp": 0.47143555, + "step": 1609, + "time_per_iteration": 2.8992979526519775 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052885, + "balance_loss_mlp": 1.00543988, + "epoch": 0.30973451327433627, + "flos": 554114161920.0, + "grad_norm": 0.03509024741452312, + "language_loss": 0.88937026, + "learning_rate": 0.0008084631813895943, + "loss": 0.89989913, + "num_input_tokens_seen": 133121680, + "router_z_loss_mlp": 0.47412109, + "step": 1610, + "time_per_iteration": 2.8113343715667725 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104915, + "balance_loss_mlp": 1.00168037, + "epoch": 0.30992689495959985, + "flos": 566763714816.0, + "grad_norm": 0.03310460584308608, + "language_loss": 0.8446725, + "learning_rate": 0.0008082179312250315, + "loss": 0.85516399, + "num_input_tokens_seen": 133190176, + "router_z_loss_mlp": 0.47436523, + "step": 1611, + "time_per_iteration": 2.6286494731903076 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146973, + "balance_loss_mlp": 1.09509277, + "epoch": 0.3101192766448634, + "flos": 1445562998784.0, + "grad_norm": 0.022501740699277736, + "language_loss": 0.79855847, + "learning_rate": 0.0008079725613996555, + "loss": 0.8100282, + "num_input_tokens_seen": 133420512, + "router_z_loss_mlp": 0.51953125, + "step": 1612, + "time_per_iteration": 4.877255439758301 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01132568, + "balance_loss_mlp": 1.08087921, + "epoch": 0.31031165833012697, + "flos": 1535130541056.0, + "grad_norm": 0.020576462480935535, + "language_loss": 0.76629329, + "learning_rate": 0.0008077270720087273, + "loss": 0.777619, + "num_input_tokens_seen": 133651984, + "router_z_loss_mlp": 0.51757812, + "step": 1613, + "time_per_iteration": 5.064774751663208 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050941, + "balance_loss_mlp": 1.00363839, + "epoch": 0.31050404001539056, + "flos": 993633862656.0, + "grad_norm": 0.03245007970491877, + "language_loss": 0.83116508, + "learning_rate": 0.0008074814631475545, + "loss": 0.84167451, + "num_input_tokens_seen": 133741648, + "router_z_loss_mlp": 0.47265625, + "step": 1614, + "time_per_iteration": 3.322155714035034 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054745, + "balance_loss_mlp": 1.00741875, + "epoch": 0.3106964217006541, + "flos": 446973530112.0, + "grad_norm": 0.03235075185089818, + "language_loss": 0.80034411, + "learning_rate": 0.0008072357349114907, + "loss": 0.81089151, + "num_input_tokens_seen": 133813344, + "router_z_loss_mlp": 0.47290039, + "step": 1615, + "time_per_iteration": 2.699772596359253 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01056016, + "balance_loss_mlp": 1.00880885, + "epoch": 0.3108888033859177, + "flos": 511495678464.0, + "grad_norm": 0.0340106704308988, + "language_loss": 0.89603639, + "learning_rate": 0.0008069898873959363, + "loss": 0.90659654, + "num_input_tokens_seen": 133884192, + "router_z_loss_mlp": 0.47167969, + "step": 1616, + "time_per_iteration": 2.680640459060669 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051359, + "balance_loss_mlp": 1.0043664, + "epoch": 0.3110811850711812, + "flos": 521779210752.0, + "grad_norm": 0.029395602971080924, + "language_loss": 0.86344647, + "learning_rate": 0.0008067439206963375, + "loss": 0.87396008, + "num_input_tokens_seen": 133954848, + "router_z_loss_mlp": 0.46948242, + "step": 1617, + "time_per_iteration": 2.6484971046447754 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055386, + "balance_loss_mlp": 1.00844121, + "epoch": 0.3112735667564448, + "flos": 687731073792.0, + "grad_norm": 0.03406090033110643, + "language_loss": 0.87673247, + "learning_rate": 0.0008064978349081873, + "loss": 0.88728631, + "num_input_tokens_seen": 134031824, + "router_z_loss_mlp": 0.46899414, + "step": 1618, + "time_per_iteration": 2.92702579498291 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01056578, + "balance_loss_mlp": 1.00965679, + "epoch": 0.31146594844170833, + "flos": 534166303488.0, + "grad_norm": 0.030256910717709223, + "language_loss": 0.87292403, + "learning_rate": 0.0008062516301270245, + "loss": 0.88348979, + "num_input_tokens_seen": 134104480, + "router_z_loss_mlp": 0.46875, + "step": 1619, + "time_per_iteration": 2.7301478385925293 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055492, + "balance_loss_mlp": 1.00859511, + "epoch": 0.3116583301269719, + "flos": 680842982400.0, + "grad_norm": 0.027867683897015817, + "language_loss": 0.88937479, + "learning_rate": 0.0008060053064484343, + "loss": 0.89992964, + "num_input_tokens_seen": 134185632, + "router_z_loss_mlp": 0.46850586, + "step": 1620, + "time_per_iteration": 2.947906017303467 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048804, + "balance_loss_mlp": 1.00202632, + "epoch": 0.31185071181223545, + "flos": 587330779392.0, + "grad_norm": 0.03167203134142694, + "language_loss": 0.86095911, + "learning_rate": 0.0008057588639680482, + "loss": 0.87144709, + "num_input_tokens_seen": 134261600, + "router_z_loss_mlp": 0.46728516, + "step": 1621, + "time_per_iteration": 2.7836551666259766 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104944, + "balance_loss_mlp": 1.00282919, + "epoch": 0.31204309349749904, + "flos": 726658608384.0, + "grad_norm": 0.037979301866738396, + "language_loss": 0.83855367, + "learning_rate": 0.0008055123027815434, + "loss": 0.84904802, + "num_input_tokens_seen": 134334368, + "router_z_loss_mlp": 0.46557617, + "step": 1622, + "time_per_iteration": 2.9263358116149902 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051144, + "balance_loss_mlp": 1.00455689, + "epoch": 0.3122354751827626, + "flos": 577895865600.0, + "grad_norm": 0.032507776226150094, + "language_loss": 0.85607505, + "learning_rate": 0.0008052656229846436, + "loss": 0.86658645, + "num_input_tokens_seen": 134403824, + "router_z_loss_mlp": 0.46533203, + "step": 1623, + "time_per_iteration": 2.662386894226074 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051831, + "balance_loss_mlp": 1.00514877, + "epoch": 0.31242785686802615, + "flos": 577029750528.0, + "grad_norm": 0.03513403942618559, + "language_loss": 0.91195071, + "learning_rate": 0.0008050188246731182, + "loss": 0.92246902, + "num_input_tokens_seen": 134471296, + "router_z_loss_mlp": 0.46630859, + "step": 1624, + "time_per_iteration": 2.710176467895508 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052884, + "balance_loss_mlp": 1.00624907, + "epoch": 0.31262023855328974, + "flos": 738197082624.0, + "grad_norm": 0.0324646036152644, + "language_loss": 0.82931978, + "learning_rate": 0.0008047719079427834, + "loss": 0.83984858, + "num_input_tokens_seen": 134551360, + "router_z_loss_mlp": 0.46582031, + "step": 1625, + "time_per_iteration": 2.970287561416626 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082729, + "balance_loss_mlp": 1.03533173, + "epoch": 0.3128126202385533, + "flos": 1562594445312.0, + "grad_norm": 0.01743050972952843, + "language_loss": 0.74351704, + "learning_rate": 0.0008045248728895, + "loss": 0.75434434, + "num_input_tokens_seen": 134761328, + "router_z_loss_mlp": 0.47363281, + "step": 1626, + "time_per_iteration": 4.816533088684082 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053999, + "balance_loss_mlp": 1.0071733, + "epoch": 0.31300500192381686, + "flos": 515943872256.0, + "grad_norm": 0.030770809254638827, + "language_loss": 0.86711371, + "learning_rate": 0.0008042777196091757, + "loss": 0.87765372, + "num_input_tokens_seen": 134833136, + "router_z_loss_mlp": 0.46777344, + "step": 1627, + "time_per_iteration": 2.7191882133483887 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01057531, + "balance_loss_mlp": 1.01072919, + "epoch": 0.3131973836090804, + "flos": 527662181376.0, + "grad_norm": 0.031150181208545357, + "language_loss": 0.82488692, + "learning_rate": 0.0008040304481977643, + "loss": 0.83546221, + "num_input_tokens_seen": 134904352, + "router_z_loss_mlp": 0.4675293, + "step": 1628, + "time_per_iteration": 2.706782579421997 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01057336, + "balance_loss_mlp": 1.01065385, + "epoch": 0.313389765294344, + "flos": 824210736384.0, + "grad_norm": 0.032636383561425994, + "language_loss": 0.87568998, + "learning_rate": 0.0008037830587512649, + "loss": 0.88626337, + "num_input_tokens_seen": 134984880, + "router_z_loss_mlp": 0.46630859, + "step": 1629, + "time_per_iteration": 3.0928542613983154 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054937, + "balance_loss_mlp": 1.00820696, + "epoch": 0.31358214697960757, + "flos": 394703359488.0, + "grad_norm": 0.03241768310332359, + "language_loss": 0.79631239, + "learning_rate": 0.0008035355513657224, + "loss": 0.80686176, + "num_input_tokens_seen": 135047456, + "router_z_loss_mlp": 0.46679688, + "step": 1630, + "time_per_iteration": 2.449666738510132 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054798, + "balance_loss_mlp": 1.00806797, + "epoch": 0.3137745286648711, + "flos": 573098695680.0, + "grad_norm": 0.0293939817515363, + "language_loss": 0.93494189, + "learning_rate": 0.0008032879261372279, + "loss": 0.94548988, + "num_input_tokens_seen": 135124256, + "router_z_loss_mlp": 0.46679688, + "step": 1631, + "time_per_iteration": 2.766951084136963 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068432, + "balance_loss_mlp": 1.02256012, + "epoch": 0.3139669103501347, + "flos": 1501632021504.0, + "grad_norm": 0.011791019456215185, + "language_loss": 0.79635841, + "learning_rate": 0.0008030401831619178, + "loss": 0.80704272, + "num_input_tokens_seen": 135353024, + "router_z_loss_mlp": 0.45800781, + "step": 1632, + "time_per_iteration": 5.585620403289795 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050843, + "balance_loss_mlp": 1.00425589, + "epoch": 0.3141592920353982, + "flos": 526359607296.0, + "grad_norm": 0.030163528949794682, + "language_loss": 0.87607086, + "learning_rate": 0.0008027923225359748, + "loss": 0.88657928, + "num_input_tokens_seen": 135422464, + "router_z_loss_mlp": 0.46533203, + "step": 1633, + "time_per_iteration": 2.607407808303833 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105517, + "balance_loss_mlp": 1.0084641, + "epoch": 0.3143516737206618, + "flos": 594388012032.0, + "grad_norm": 0.030785944321789945, + "language_loss": 0.88644683, + "learning_rate": 0.0008025443443556267, + "loss": 0.89699847, + "num_input_tokens_seen": 135490928, + "router_z_loss_mlp": 0.46655273, + "step": 1634, + "time_per_iteration": 2.704568862915039 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053981, + "balance_loss_mlp": 1.00756085, + "epoch": 0.31454405540592534, + "flos": 649680347904.0, + "grad_norm": 0.028625636333363444, + "language_loss": 0.88813668, + "learning_rate": 0.000802296248717147, + "loss": 0.89867646, + "num_input_tokens_seen": 135576288, + "router_z_loss_mlp": 0.46362305, + "step": 1635, + "time_per_iteration": 2.914228916168213 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051205, + "balance_loss_mlp": 1.00461841, + "epoch": 0.3147364370911889, + "flos": 644070531072.0, + "grad_norm": 0.032412817231273386, + "language_loss": 0.79727387, + "learning_rate": 0.0008020480357168554, + "loss": 0.80778593, + "num_input_tokens_seen": 135652320, + "router_z_loss_mlp": 0.46533203, + "step": 1636, + "time_per_iteration": 2.8196966648101807 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051741, + "balance_loss_mlp": 1.00505865, + "epoch": 0.31492881877645246, + "flos": 472821855744.0, + "grad_norm": 0.028828485286514015, + "language_loss": 0.88662213, + "learning_rate": 0.0008017997054511165, + "loss": 0.89713949, + "num_input_tokens_seen": 135719632, + "router_z_loss_mlp": 0.46630859, + "step": 1637, + "time_per_iteration": 2.6545960903167725 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051567, + "balance_loss_mlp": 1.00486124, + "epoch": 0.31512120046171604, + "flos": 630630685440.0, + "grad_norm": 0.03463883423234526, + "language_loss": 0.86238796, + "learning_rate": 0.0008015512580163407, + "loss": 0.87290359, + "num_input_tokens_seen": 135796544, + "router_z_loss_mlp": 0.46655273, + "step": 1638, + "time_per_iteration": 2.775726795196533 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050883, + "balance_loss_mlp": 1.00429583, + "epoch": 0.31531358214697963, + "flos": 705054342144.0, + "grad_norm": 0.0328972983749375, + "language_loss": 0.81582069, + "learning_rate": 0.0008013026935089838, + "loss": 0.82632947, + "num_input_tokens_seen": 135871344, + "router_z_loss_mlp": 0.46533203, + "step": 1639, + "time_per_iteration": 2.859405040740967 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048793, + "balance_loss_mlp": 1.00182474, + "epoch": 0.31550596383224316, + "flos": 573632364288.0, + "grad_norm": 0.03266078051512415, + "language_loss": 0.84787768, + "learning_rate": 0.0008010540120255472, + "loss": 0.85836554, + "num_input_tokens_seen": 135944320, + "router_z_loss_mlp": 0.46923828, + "step": 1640, + "time_per_iteration": 2.654087781906128 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051523, + "balance_loss_mlp": 1.00457835, + "epoch": 0.31569834551750675, + "flos": 659513815296.0, + "grad_norm": 0.0373471738494659, + "language_loss": 0.87093472, + "learning_rate": 0.0008008052136625774, + "loss": 0.88144994, + "num_input_tokens_seen": 136019456, + "router_z_loss_mlp": 0.46899414, + "step": 1641, + "time_per_iteration": 2.7806570529937744 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054228, + "balance_loss_mlp": 1.00730693, + "epoch": 0.3158907272027703, + "flos": 567404308224.0, + "grad_norm": 0.028103315573088077, + "language_loss": 0.87394774, + "learning_rate": 0.0008005562985166666, + "loss": 0.88449007, + "num_input_tokens_seen": 136091232, + "router_z_loss_mlp": 0.46875, + "step": 1642, + "time_per_iteration": 2.6866798400878906 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053754, + "balance_loss_mlp": 1.00699973, + "epoch": 0.31608310888803387, + "flos": 537973903872.0, + "grad_norm": 0.024374019828786602, + "language_loss": 0.85555339, + "learning_rate": 0.0008003072666844524, + "loss": 0.86609089, + "num_input_tokens_seen": 136165088, + "router_z_loss_mlp": 0.46704102, + "step": 1643, + "time_per_iteration": 2.684518337249756 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055077, + "balance_loss_mlp": 1.00856149, + "epoch": 0.3162754905732974, + "flos": 487640097792.0, + "grad_norm": 0.037314537224785074, + "language_loss": 0.8350842, + "learning_rate": 0.0008000581182626173, + "loss": 0.84563494, + "num_input_tokens_seen": 136230368, + "router_z_loss_mlp": 0.46459961, + "step": 1644, + "time_per_iteration": 2.5574259757995605 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051572, + "balance_loss_mlp": 1.00481844, + "epoch": 0.316467872258561, + "flos": 531096506112.0, + "grad_norm": 0.03327277300757214, + "language_loss": 0.87005818, + "learning_rate": 0.0007998088533478894, + "loss": 0.88057387, + "num_input_tokens_seen": 136302512, + "router_z_loss_mlp": 0.46704102, + "step": 1645, + "time_per_iteration": 2.6987338066101074 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055532, + "balance_loss_mlp": 1.00894499, + "epoch": 0.3166602539438245, + "flos": 444414068736.0, + "grad_norm": 0.040202418156990175, + "language_loss": 0.85042381, + "learning_rate": 0.000799559472037042, + "loss": 0.8609792, + "num_input_tokens_seen": 136368064, + "router_z_loss_mlp": 0.46533203, + "step": 1646, + "time_per_iteration": 2.6219563484191895 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01056055, + "balance_loss_mlp": 1.00958765, + "epoch": 0.3168526356290881, + "flos": 647103389952.0, + "grad_norm": 0.026601574185044653, + "language_loss": 0.8823331, + "learning_rate": 0.0007993099744268932, + "loss": 0.89289367, + "num_input_tokens_seen": 136451520, + "router_z_loss_mlp": 0.46411133, + "step": 1647, + "time_per_iteration": 2.8902037143707275 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054808, + "balance_loss_mlp": 1.00817358, + "epoch": 0.3170450173143517, + "flos": 587258847744.0, + "grad_norm": 0.03281471441230887, + "language_loss": 0.8855083, + "learning_rate": 0.000799060360614307, + "loss": 0.89605635, + "num_input_tokens_seen": 136521184, + "router_z_loss_mlp": 0.46582031, + "step": 1648, + "time_per_iteration": 2.694293975830078 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055827, + "balance_loss_mlp": 1.00945473, + "epoch": 0.3172373989996152, + "flos": 828574359552.0, + "grad_norm": 0.03046931045185914, + "language_loss": 0.84284711, + "learning_rate": 0.0007988106306961917, + "loss": 0.85340536, + "num_input_tokens_seen": 136612592, + "router_z_loss_mlp": 0.46313477, + "step": 1649, + "time_per_iteration": 3.121788501739502 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01058474, + "balance_loss_mlp": 1.01195896, + "epoch": 0.3174297806848788, + "flos": 528434977536.0, + "grad_norm": 0.03563880571664149, + "language_loss": 0.85299373, + "learning_rate": 0.0007985607847695014, + "loss": 0.8635785, + "num_input_tokens_seen": 136684336, + "router_z_loss_mlp": 0.46459961, + "step": 1650, + "time_per_iteration": 2.625356912612915 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047323, + "balance_loss_mlp": 1.00107014, + "epoch": 0.31762216237014235, + "flos": 714482452992.0, + "grad_norm": 0.030498079123472206, + "language_loss": 0.83133662, + "learning_rate": 0.0007983108229312345, + "loss": 0.84180987, + "num_input_tokens_seen": 136766400, + "router_z_loss_mlp": 0.46191406, + "step": 1651, + "time_per_iteration": 2.894109010696411 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049755, + "balance_loss_mlp": 1.00362098, + "epoch": 0.31781454405540593, + "flos": 484800679680.0, + "grad_norm": 0.03387492306443982, + "language_loss": 0.86931884, + "learning_rate": 0.0007980607452784351, + "loss": 0.87981641, + "num_input_tokens_seen": 136834016, + "router_z_loss_mlp": 0.46069336, + "step": 1652, + "time_per_iteration": 2.5593390464782715 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048407, + "balance_loss_mlp": 1.00236845, + "epoch": 0.31800692574066947, + "flos": 549804973824.0, + "grad_norm": 0.04030851184116312, + "language_loss": 0.90997875, + "learning_rate": 0.0007978105519081919, + "loss": 0.92046285, + "num_input_tokens_seen": 136906288, + "router_z_loss_mlp": 0.4597168, + "step": 1653, + "time_per_iteration": 2.683809995651245 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045957, + "balance_loss_mlp": 0.99982309, + "epoch": 0.31819930742593305, + "flos": 517917175296.0, + "grad_norm": 0.033294821801319624, + "language_loss": 0.88831019, + "learning_rate": 0.0007975602429176385, + "loss": 0.89876974, + "num_input_tokens_seen": 136972416, + "router_z_loss_mlp": 0.46069336, + "step": 1654, + "time_per_iteration": 2.5786075592041016 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104835, + "balance_loss_mlp": 1.00238276, + "epoch": 0.31839168911119664, + "flos": 456970302720.0, + "grad_norm": 0.028947480678153642, + "language_loss": 0.82318926, + "learning_rate": 0.0007973098184039536, + "loss": 0.83367276, + "num_input_tokens_seen": 137044576, + "router_z_loss_mlp": 0.45898438, + "step": 1655, + "time_per_iteration": 2.651188611984253 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010453, + "balance_loss_mlp": 0.99921381, + "epoch": 0.3185840707964602, + "flos": 627296482560.0, + "grad_norm": 0.03276090001573999, + "language_loss": 0.8731916, + "learning_rate": 0.0007970592784643602, + "loss": 0.88364458, + "num_input_tokens_seen": 137125120, + "router_z_loss_mlp": 0.46020508, + "step": 1656, + "time_per_iteration": 2.8683595657348633 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045873, + "balance_loss_mlp": 0.99976265, + "epoch": 0.31877645248172376, + "flos": 568541631744.0, + "grad_norm": 0.035945607337745746, + "language_loss": 0.85986471, + "learning_rate": 0.0007968086231961272, + "loss": 0.87032342, + "num_input_tokens_seen": 137195344, + "router_z_loss_mlp": 0.46044922, + "step": 1657, + "time_per_iteration": 2.642733335494995 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047205, + "balance_loss_mlp": 1.00119007, + "epoch": 0.3189688341669873, + "flos": 490553392896.0, + "grad_norm": 0.04377426906704287, + "language_loss": 0.84065533, + "learning_rate": 0.0007965578526965671, + "loss": 0.85112733, + "num_input_tokens_seen": 137261040, + "router_z_loss_mlp": 0.45947266, + "step": 1658, + "time_per_iteration": 2.5638930797576904 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049099, + "balance_loss_mlp": 1.00291717, + "epoch": 0.3191612158522509, + "flos": 577381638912.0, + "grad_norm": 0.02931224295785387, + "language_loss": 0.86766565, + "learning_rate": 0.0007963069670630377, + "loss": 0.87815666, + "num_input_tokens_seen": 137334400, + "router_z_loss_mlp": 0.46118164, + "step": 1659, + "time_per_iteration": 2.7154479026794434 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051177, + "balance_loss_mlp": 1.00506639, + "epoch": 0.3193535975375144, + "flos": 539193852672.0, + "grad_norm": 0.03496177903686506, + "language_loss": 0.88776976, + "learning_rate": 0.0007960559663929416, + "loss": 0.89828151, + "num_input_tokens_seen": 137405344, + "router_z_loss_mlp": 0.46044922, + "step": 1660, + "time_per_iteration": 2.6322021484375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054675, + "balance_loss_mlp": 1.00868368, + "epoch": 0.319545979222778, + "flos": 735628872960.0, + "grad_norm": 0.030221795014758104, + "language_loss": 0.88154632, + "learning_rate": 0.0007958048507837259, + "loss": 0.89209306, + "num_input_tokens_seen": 137486016, + "router_z_loss_mlp": 0.45922852, + "step": 1661, + "time_per_iteration": 2.9221389293670654 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105245, + "balance_loss_mlp": 1.00648332, + "epoch": 0.31973836090804153, + "flos": 765768890112.0, + "grad_norm": 0.037416739988226255, + "language_loss": 0.87668484, + "learning_rate": 0.0007955536203328822, + "loss": 0.88720942, + "num_input_tokens_seen": 137562304, + "router_z_loss_mlp": 0.45898438, + "step": 1662, + "time_per_iteration": 2.9018445014953613 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048244, + "balance_loss_mlp": 1.00184774, + "epoch": 0.3199307425933051, + "flos": 561742968576.0, + "grad_norm": 0.03025687936293395, + "language_loss": 0.84124553, + "learning_rate": 0.0007953022751379469, + "loss": 0.85172796, + "num_input_tokens_seen": 137639248, + "router_z_loss_mlp": 0.46337891, + "step": 1663, + "time_per_iteration": 2.781562566757202 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085701, + "balance_loss_mlp": 1.03906643, + "epoch": 0.3201231242785687, + "flos": 752672184576.0, + "grad_norm": 0.03881407073457837, + "language_loss": 0.82717097, + "learning_rate": 0.000795050815296501, + "loss": 0.83802795, + "num_input_tokens_seen": 137718256, + "router_z_loss_mlp": 0.46582031, + "step": 1664, + "time_per_iteration": 2.9950287342071533 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050837, + "balance_loss_mlp": 1.00446498, + "epoch": 0.32031550596383224, + "flos": 497385103872.0, + "grad_norm": 0.02713287522590179, + "language_loss": 0.93810016, + "learning_rate": 0.0007947992409061695, + "loss": 0.94860852, + "num_input_tokens_seen": 137785216, + "router_z_loss_mlp": 0.46313477, + "step": 1665, + "time_per_iteration": 2.583118438720703 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01056564, + "balance_loss_mlp": 1.01045382, + "epoch": 0.3205078876490958, + "flos": 732875970816.0, + "grad_norm": 0.03263285268561658, + "language_loss": 0.86165506, + "learning_rate": 0.0007945475520646226, + "loss": 0.8722207, + "num_input_tokens_seen": 137863424, + "router_z_loss_mlp": 0.46044922, + "step": 1666, + "time_per_iteration": 2.903190851211548 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059471, + "balance_loss_mlp": 1.01324141, + "epoch": 0.32070026933435936, + "flos": 550475702784.0, + "grad_norm": 0.03801033406135743, + "language_loss": 0.85650241, + "learning_rate": 0.0007942957488695743, + "loss": 0.86709714, + "num_input_tokens_seen": 137930384, + "router_z_loss_mlp": 0.46166992, + "step": 1667, + "time_per_iteration": 2.661292791366577 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059024, + "balance_loss_mlp": 1.01277089, + "epoch": 0.32089265101962294, + "flos": 746685201408.0, + "grad_norm": 0.031638418068872444, + "language_loss": 0.81749988, + "learning_rate": 0.0007940438314187833, + "loss": 0.82809013, + "num_input_tokens_seen": 138017200, + "router_z_loss_mlp": 0.46191406, + "step": 1668, + "time_per_iteration": 3.0293474197387695 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01057511, + "balance_loss_mlp": 1.01144862, + "epoch": 0.3210850327048865, + "flos": 495196972800.0, + "grad_norm": 0.034120041175176606, + "language_loss": 0.81371748, + "learning_rate": 0.0007937917998100529, + "loss": 0.82429266, + "num_input_tokens_seen": 138084048, + "router_z_loss_mlp": 0.45996094, + "step": 1669, + "time_per_iteration": 2.5822434425354004 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.08258255, + "balance_loss_mlp": 8.0, + "epoch": 0.32127741439015006, + "flos": 531673916160.0, + "grad_norm": 0.043058724234977634, + "language_loss": 0.81425405, + "learning_rate": 0.0007935396541412302, + "loss": 0.89683664, + "num_input_tokens_seen": 138153280, + "router_z_loss_mlp": 2.58203125, + "step": 1670, + "time_per_iteration": 2.5968360900878906 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0830899, + "balance_loss_mlp": 8.0, + "epoch": 0.3214697960754136, + "flos": 502224069888.0, + "grad_norm": 0.0363513778225316, + "language_loss": 0.87401152, + "learning_rate": 0.0007932873945102068, + "loss": 0.9571014, + "num_input_tokens_seen": 138222320, + "router_z_loss_mlp": 3.0859375, + "step": 1671, + "time_per_iteration": 2.582617998123169 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.08312805, + "balance_loss_mlp": 8.0, + "epoch": 0.3216621777606772, + "flos": 1386404736768.0, + "grad_norm": 0.003686648730821959, + "language_loss": 0.75761777, + "learning_rate": 0.0007930350210149188, + "loss": 0.84074581, + "num_input_tokens_seen": 138449488, + "router_z_loss_mlp": 3.125, + "step": 1672, + "time_per_iteration": 4.829998970031738 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.08311279, + "balance_loss_mlp": 8.0, + "epoch": 0.32185455944594077, + "flos": 572635991808.0, + "grad_norm": 0.030782594356869853, + "language_loss": 0.88089788, + "learning_rate": 0.0007927825337533461, + "loss": 0.96401072, + "num_input_tokens_seen": 138522496, + "router_z_loss_mlp": 3.109375, + "step": 1673, + "time_per_iteration": 2.6633598804473877 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.08310516, + "balance_loss_mlp": 8.0, + "epoch": 0.3220469411312043, + "flos": 544937817600.0, + "grad_norm": 0.040711103761993876, + "language_loss": 0.86732781, + "learning_rate": 0.0007925299328235131, + "loss": 0.95043296, + "num_input_tokens_seen": 138590096, + "router_z_loss_mlp": 3.1015625, + "step": 1674, + "time_per_iteration": 2.634169578552246 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.08307083, + "balance_loss_mlp": 8.0, + "epoch": 0.3222393228164679, + "flos": 492162168576.0, + "grad_norm": 0.03938689136463286, + "language_loss": 0.86802006, + "learning_rate": 0.000792277218323488, + "loss": 0.95109081, + "num_input_tokens_seen": 138658224, + "router_z_loss_mlp": 3.06640625, + "step": 1675, + "time_per_iteration": 2.5893990993499756 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.08270843, + "balance_loss_mlp": 8.0, + "epoch": 0.3224317045017314, + "flos": 491363127552.0, + "grad_norm": 0.03386575094399551, + "language_loss": 0.86165106, + "learning_rate": 0.0007920243903513833, + "loss": 0.94435954, + "num_input_tokens_seen": 138722864, + "router_z_loss_mlp": 2.7109375, + "step": 1676, + "time_per_iteration": 2.5602426528930664 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02321873, + "balance_loss_mlp": 2.26942062, + "epoch": 0.322624086186995, + "flos": 576871302912.0, + "grad_norm": 0.12910494226103245, + "language_loss": 0.85448408, + "learning_rate": 0.0007917714490053556, + "loss": 0.87770277, + "num_input_tokens_seen": 138791472, + "router_z_loss_mlp": 0.52539062, + "step": 1677, + "time_per_iteration": 2.6558380126953125 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071536, + "balance_loss_mlp": 1.02492559, + "epoch": 0.32281646787225854, + "flos": 630572359680.0, + "grad_norm": 0.04049679721352166, + "language_loss": 0.87627459, + "learning_rate": 0.0007915183943836055, + "loss": 0.88698995, + "num_input_tokens_seen": 138873424, + "router_z_loss_mlp": 0.46557617, + "step": 1678, + "time_per_iteration": 2.898658037185669 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072761, + "balance_loss_mlp": 1.02631712, + "epoch": 0.3230088495575221, + "flos": 782808311040.0, + "grad_norm": 0.04272749105284559, + "language_loss": 0.85738349, + "learning_rate": 0.0007912652265843773, + "loss": 0.86811107, + "num_input_tokens_seen": 138956880, + "router_z_loss_mlp": 0.46386719, + "step": 1679, + "time_per_iteration": 3.049938917160034 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082142, + "balance_loss_mlp": 1.03557873, + "epoch": 0.3232012312427857, + "flos": 537201107712.0, + "grad_norm": 0.04201967602882564, + "language_loss": 0.83624417, + "learning_rate": 0.0007910119457059597, + "loss": 0.84706557, + "num_input_tokens_seen": 139031296, + "router_z_loss_mlp": 0.46508789, + "step": 1680, + "time_per_iteration": 2.7126853466033936 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108585, + "balance_loss_mlp": 1.03895342, + "epoch": 0.32339361292804925, + "flos": 706233461760.0, + "grad_norm": 0.044345030126194285, + "language_loss": 0.81981564, + "learning_rate": 0.0007907585518466849, + "loss": 0.83067411, + "num_input_tokens_seen": 139109776, + "router_z_loss_mlp": 0.46850586, + "step": 1681, + "time_per_iteration": 2.9758992195129395 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088847, + "balance_loss_mlp": 1.0419023, + "epoch": 0.32358599461331283, + "flos": 453257966592.0, + "grad_norm": 0.04210474159896445, + "language_loss": 0.91257876, + "learning_rate": 0.000790505045104929, + "loss": 0.92346722, + "num_input_tokens_seen": 139174736, + "router_z_loss_mlp": 0.46899414, + "step": 1682, + "time_per_iteration": 2.5105395317077637 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090368, + "balance_loss_mlp": 1.04337561, + "epoch": 0.32377837629857636, + "flos": 602092641024.0, + "grad_norm": 0.04465728550727914, + "language_loss": 0.88834655, + "learning_rate": 0.0007902514255791125, + "loss": 0.89925027, + "num_input_tokens_seen": 139252064, + "router_z_loss_mlp": 0.46948242, + "step": 1683, + "time_per_iteration": 2.7610387802124023 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089116, + "balance_loss_mlp": 1.04190934, + "epoch": 0.32397075798383995, + "flos": 808899654912.0, + "grad_norm": 0.04108658803287063, + "language_loss": 0.89801908, + "learning_rate": 0.0007899976933676986, + "loss": 0.90891027, + "num_input_tokens_seen": 139333328, + "router_z_loss_mlp": 0.47167969, + "step": 1684, + "time_per_iteration": 2.963387966156006 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089307, + "balance_loss_mlp": 1.04205263, + "epoch": 0.3241631396691035, + "flos": 602793505536.0, + "grad_norm": 0.046655842402160155, + "language_loss": 0.89137548, + "learning_rate": 0.0007897438485691955, + "loss": 0.90226853, + "num_input_tokens_seen": 139400976, + "router_z_loss_mlp": 0.47216797, + "step": 1685, + "time_per_iteration": 2.675910711288452 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079467, + "balance_loss_mlp": 1.03195012, + "epoch": 0.32435552135436707, + "flos": 475177182720.0, + "grad_norm": 0.045429866607221585, + "language_loss": 0.84063458, + "learning_rate": 0.0007894898912821542, + "loss": 0.85142922, + "num_input_tokens_seen": 139465664, + "router_z_loss_mlp": 0.47485352, + "step": 1686, + "time_per_iteration": 2.530951976776123 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077585, + "balance_loss_mlp": 1.02980566, + "epoch": 0.3245479030396306, + "flos": 539220097536.0, + "grad_norm": 0.03833008440392265, + "language_loss": 0.88029444, + "learning_rate": 0.0007892358216051695, + "loss": 0.89107037, + "num_input_tokens_seen": 139541984, + "router_z_loss_mlp": 0.47753906, + "step": 1687, + "time_per_iteration": 2.7729742527008057 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067846, + "balance_loss_mlp": 1.01963735, + "epoch": 0.3247402847248942, + "flos": 548697785856.0, + "grad_norm": 0.039082280310976325, + "language_loss": 0.93519121, + "learning_rate": 0.0007889816396368803, + "loss": 0.94586968, + "num_input_tokens_seen": 139607408, + "router_z_loss_mlp": 0.48193359, + "step": 1688, + "time_per_iteration": 2.625795602798462 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062474, + "balance_loss_mlp": 1.01371753, + "epoch": 0.3249326664101578, + "flos": 378992757504.0, + "grad_norm": 0.03548852277095179, + "language_loss": 0.86296374, + "learning_rate": 0.0007887273454759687, + "loss": 0.87358844, + "num_input_tokens_seen": 139670000, + "router_z_loss_mlp": 0.48754883, + "step": 1689, + "time_per_iteration": 2.4798507690429688 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070366, + "balance_loss_mlp": 1.02106154, + "epoch": 0.3251250480954213, + "flos": 529123203072.0, + "grad_norm": 0.03304707654173593, + "language_loss": 0.83602285, + "learning_rate": 0.0007884729392211603, + "loss": 0.84672654, + "num_input_tokens_seen": 139739872, + "router_z_loss_mlp": 0.49194336, + "step": 1690, + "time_per_iteration": 2.6475188732147217 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066964, + "balance_loss_mlp": 1.01732576, + "epoch": 0.3253174297806849, + "flos": 450559499520.0, + "grad_norm": 0.03986808198030794, + "language_loss": 0.86860085, + "learning_rate": 0.0007882184209712245, + "loss": 0.87927043, + "num_input_tokens_seen": 139802032, + "router_z_loss_mlp": 0.49609375, + "step": 1691, + "time_per_iteration": 2.5213029384613037 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089592, + "balance_loss_mlp": 1.03961909, + "epoch": 0.32550981146594843, + "flos": 705490801152.0, + "grad_norm": 0.03183986603149819, + "language_loss": 0.86227143, + "learning_rate": 0.000787963790824974, + "loss": 0.8731674, + "num_input_tokens_seen": 139885648, + "router_z_loss_mlp": 0.49975586, + "step": 1692, + "time_per_iteration": 2.9866673946380615 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086614, + "balance_loss_mlp": 1.03654587, + "epoch": 0.325702193151212, + "flos": 393559233024.0, + "grad_norm": 0.035135222587328305, + "language_loss": 0.90092403, + "learning_rate": 0.0007877090488812651, + "loss": 0.91179013, + "num_input_tokens_seen": 139947920, + "router_z_loss_mlp": 0.50073242, + "step": 1693, + "time_per_iteration": 2.443784475326538 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067298, + "balance_loss_mlp": 1.01708698, + "epoch": 0.32589457483647555, + "flos": 578584091136.0, + "grad_norm": 0.03604448220117138, + "language_loss": 0.84406531, + "learning_rate": 0.0007874541952389973, + "loss": 0.85473824, + "num_input_tokens_seen": 140020048, + "router_z_loss_mlp": 0.50219727, + "step": 1694, + "time_per_iteration": 2.6662275791168213 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069205, + "balance_loss_mlp": 1.01918459, + "epoch": 0.32608695652173914, + "flos": 499330216704.0, + "grad_norm": 0.03462929627838828, + "language_loss": 0.87473089, + "learning_rate": 0.0007871992299971136, + "loss": 0.88542295, + "num_input_tokens_seen": 140085600, + "router_z_loss_mlp": 0.50024414, + "step": 1695, + "time_per_iteration": 2.5501420497894287 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068106, + "balance_loss_mlp": 1.01803839, + "epoch": 0.32627933820700267, + "flos": 592301948160.0, + "grad_norm": 0.0349674772808078, + "language_loss": 0.85830671, + "learning_rate": 0.0007869441532546001, + "loss": 0.86898774, + "num_input_tokens_seen": 140155152, + "router_z_loss_mlp": 0.5, + "step": 1696, + "time_per_iteration": 2.7640528678894043 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065186, + "balance_loss_mlp": 1.01550007, + "epoch": 0.32647171989226625, + "flos": 610274558208.0, + "grad_norm": 0.03448959411295718, + "language_loss": 0.80548751, + "learning_rate": 0.0007866889651104867, + "loss": 0.81613934, + "num_input_tokens_seen": 140228560, + "router_z_loss_mlp": 0.49658203, + "step": 1697, + "time_per_iteration": 2.8403704166412354 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106687, + "balance_loss_mlp": 1.01723123, + "epoch": 0.32666410157752984, + "flos": 478190599680.0, + "grad_norm": 0.0393752309547029, + "language_loss": 0.84585583, + "learning_rate": 0.000786433665663846, + "loss": 0.85652447, + "num_input_tokens_seen": 140297952, + "router_z_loss_mlp": 0.49536133, + "step": 1698, + "time_per_iteration": 2.7460434436798096 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065434, + "balance_loss_mlp": 1.01603401, + "epoch": 0.3268564832627934, + "flos": 719694694656.0, + "grad_norm": 0.03598572558720647, + "language_loss": 0.87469888, + "learning_rate": 0.0007861782550137942, + "loss": 0.88535315, + "num_input_tokens_seen": 140373408, + "router_z_loss_mlp": 0.49291992, + "step": 1699, + "time_per_iteration": 2.922189474105835 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062059, + "balance_loss_mlp": 1.01299262, + "epoch": 0.32704886494805696, + "flos": 770106268416.0, + "grad_norm": 0.033319227910548664, + "language_loss": 0.86952895, + "learning_rate": 0.0007859227332594901, + "loss": 0.88014954, + "num_input_tokens_seen": 140451840, + "router_z_loss_mlp": 0.48999023, + "step": 1700, + "time_per_iteration": 2.8891940116882324 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01056582, + "balance_loss_mlp": 1.00782549, + "epoch": 0.3272412466333205, + "flos": 851405377536.0, + "grad_norm": 0.0384838580126543, + "language_loss": 0.85734528, + "learning_rate": 0.0007856671005001365, + "loss": 0.8679111, + "num_input_tokens_seen": 140537696, + "router_z_loss_mlp": 0.48730469, + "step": 1701, + "time_per_iteration": 3.169032573699951 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105609, + "balance_loss_mlp": 1.00728559, + "epoch": 0.3274336283185841, + "flos": 833041995264.0, + "grad_norm": 0.03605284930108709, + "language_loss": 0.82799482, + "learning_rate": 0.0007854113568349787, + "loss": 0.83855575, + "num_input_tokens_seen": 140623536, + "router_z_loss_mlp": 0.48779297, + "step": 1702, + "time_per_iteration": 3.123967170715332 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060179, + "balance_loss_mlp": 1.0117799, + "epoch": 0.3276260100038476, + "flos": 693253407744.0, + "grad_norm": 0.03564674283827795, + "language_loss": 0.81364781, + "learning_rate": 0.0007851555023633052, + "loss": 0.82424963, + "num_input_tokens_seen": 140700688, + "router_z_loss_mlp": 0.48388672, + "step": 1703, + "time_per_iteration": 2.8430581092834473 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059511, + "balance_loss_mlp": 1.01120698, + "epoch": 0.3278183916891112, + "flos": 436978702848.0, + "grad_norm": 0.03514994366577059, + "language_loss": 0.83518881, + "learning_rate": 0.0007848995371844474, + "loss": 0.84578383, + "num_input_tokens_seen": 140765808, + "router_z_loss_mlp": 0.48291016, + "step": 1704, + "time_per_iteration": 2.552917003631592 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01056561, + "balance_loss_mlp": 1.00861514, + "epoch": 0.3280107733743748, + "flos": 462017293824.0, + "grad_norm": 0.03278124420090015, + "language_loss": 0.81157213, + "learning_rate": 0.0007846434613977801, + "loss": 0.82213771, + "num_input_tokens_seen": 140830512, + "router_z_loss_mlp": 0.47924805, + "step": 1705, + "time_per_iteration": 2.496506929397583 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062567, + "balance_loss_mlp": 1.01483595, + "epoch": 0.3282031550596383, + "flos": 680529977856.0, + "grad_norm": 0.03615486988598079, + "language_loss": 0.79136091, + "learning_rate": 0.0007843872751027203, + "loss": 0.80198663, + "num_input_tokens_seen": 140902816, + "router_z_loss_mlp": 0.47705078, + "step": 1706, + "time_per_iteration": 2.8048393726348877 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048523, + "balance_loss_mlp": 1.00088739, + "epoch": 0.3283955367449019, + "flos": 546255942912.0, + "grad_norm": 0.030185021157442368, + "language_loss": 0.879673, + "learning_rate": 0.0007841309783987287, + "loss": 0.89015824, + "num_input_tokens_seen": 140975488, + "router_z_loss_mlp": 0.47607422, + "step": 1707, + "time_per_iteration": 2.7402358055114746 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053102, + "balance_loss_mlp": 1.00553715, + "epoch": 0.32858791843016544, + "flos": 482241218304.0, + "grad_norm": 0.035416956868504886, + "language_loss": 0.89878803, + "learning_rate": 0.0007838745713853084, + "loss": 0.90931904, + "num_input_tokens_seen": 141043248, + "router_z_loss_mlp": 0.4753418, + "step": 1708, + "time_per_iteration": 2.603816270828247 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054825, + "balance_loss_mlp": 1.00752318, + "epoch": 0.328780300115429, + "flos": 567916589568.0, + "grad_norm": 0.03507338685235107, + "language_loss": 0.84775996, + "learning_rate": 0.0007836180541620053, + "loss": 0.8583082, + "num_input_tokens_seen": 141119408, + "router_z_loss_mlp": 0.47265625, + "step": 1709, + "time_per_iteration": 2.7194666862487793 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054445, + "balance_loss_mlp": 1.00730944, + "epoch": 0.32897268180069256, + "flos": 476992038144.0, + "grad_norm": 0.03621825417570051, + "language_loss": 0.86992389, + "learning_rate": 0.0007833614268284082, + "loss": 0.88046837, + "num_input_tokens_seen": 141184112, + "router_z_loss_mlp": 0.47094727, + "step": 1710, + "time_per_iteration": 2.510921001434326 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01057716, + "balance_loss_mlp": 1.01346588, + "epoch": 0.32916506348595614, + "flos": 1580453327616.0, + "grad_norm": 0.014405511351568959, + "language_loss": 0.74109769, + "learning_rate": 0.0007831046894841489, + "loss": 0.75167489, + "num_input_tokens_seen": 141414960, + "router_z_loss_mlp": 0.44335938, + "step": 1711, + "time_per_iteration": 4.875708341598511 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051331, + "balance_loss_mlp": 1.00429153, + "epoch": 0.3293574451712197, + "flos": 483851939328.0, + "grad_norm": 0.03545808379065215, + "language_loss": 0.7916249, + "learning_rate": 0.0007828478422289016, + "loss": 0.80213821, + "num_input_tokens_seen": 141485744, + "router_z_loss_mlp": 0.4699707, + "step": 1712, + "time_per_iteration": 2.583045721054077 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052818, + "balance_loss_mlp": 1.00582564, + "epoch": 0.32954982685648326, + "flos": 623725097472.0, + "grad_norm": 0.0327870747371716, + "language_loss": 0.89787406, + "learning_rate": 0.0007825908851623833, + "loss": 0.9084022, + "num_input_tokens_seen": 141560592, + "router_z_loss_mlp": 0.46948242, + "step": 1713, + "time_per_iteration": 2.824685573577881 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050866, + "balance_loss_mlp": 1.00396931, + "epoch": 0.32974220854174685, + "flos": 546071250432.0, + "grad_norm": 0.03386258255996434, + "language_loss": 0.85659784, + "learning_rate": 0.0007823338183843533, + "loss": 0.8671065, + "num_input_tokens_seen": 141630400, + "router_z_loss_mlp": 0.46850586, + "step": 1714, + "time_per_iteration": 2.672525644302368 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051399, + "balance_loss_mlp": 1.00459802, + "epoch": 0.3299345902270104, + "flos": 983823727872.0, + "grad_norm": 0.03566876288837857, + "language_loss": 0.82096756, + "learning_rate": 0.0007820766419946141, + "loss": 0.83148158, + "num_input_tokens_seen": 141721552, + "router_z_loss_mlp": 0.4675293, + "step": 1715, + "time_per_iteration": 3.2718288898468018 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051636, + "balance_loss_mlp": 1.00662231, + "epoch": 0.33012697191227397, + "flos": 1406904727296.0, + "grad_norm": 0.0085720970679931, + "language_loss": 0.7967248, + "learning_rate": 0.0007818193560930102, + "loss": 0.80724114, + "num_input_tokens_seen": 141956464, + "router_z_loss_mlp": 0.44921875, + "step": 1716, + "time_per_iteration": 4.983957290649414 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065575, + "balance_loss_mlp": 1.01836789, + "epoch": 0.3303193535975375, + "flos": 506170675968.0, + "grad_norm": 0.038525927315114124, + "language_loss": 0.76583785, + "learning_rate": 0.0007815619607794288, + "loss": 0.77649361, + "num_input_tokens_seen": 142029552, + "router_z_loss_mlp": 0.47167969, + "step": 1717, + "time_per_iteration": 2.6315019130706787 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054249, + "balance_loss_mlp": 1.00713778, + "epoch": 0.3305117352828011, + "flos": 939485653248.0, + "grad_norm": 0.041342276741222116, + "language_loss": 0.83710063, + "learning_rate": 0.0007813044561538001, + "loss": 0.84764308, + "num_input_tokens_seen": 142117344, + "router_z_loss_mlp": 0.47070312, + "step": 1718, + "time_per_iteration": 3.127446174621582 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055308, + "balance_loss_mlp": 1.00814831, + "epoch": 0.3307041169680646, + "flos": 722794627584.0, + "grad_norm": 0.03526572402512133, + "language_loss": 0.88796169, + "learning_rate": 0.0007810468423160958, + "loss": 0.89851475, + "num_input_tokens_seen": 142190096, + "router_z_loss_mlp": 0.47119141, + "step": 1719, + "time_per_iteration": 2.8622305393218994 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054311, + "balance_loss_mlp": 1.00741386, + "epoch": 0.3308964986533282, + "flos": 584817004800.0, + "grad_norm": 0.029883098234782163, + "language_loss": 0.82424414, + "learning_rate": 0.0007807891193663306, + "loss": 0.83478725, + "num_input_tokens_seen": 142265584, + "router_z_loss_mlp": 0.46850586, + "step": 1720, + "time_per_iteration": 2.7917239665985107 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064031, + "balance_loss_mlp": 1.01715815, + "epoch": 0.33108888033859174, + "flos": 474525895680.0, + "grad_norm": 0.040993977150413745, + "language_loss": 0.82757467, + "learning_rate": 0.0007805312874045614, + "loss": 0.83821499, + "num_input_tokens_seen": 142330352, + "router_z_loss_mlp": 0.46826172, + "step": 1721, + "time_per_iteration": 2.516045331954956 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049599, + "balance_loss_mlp": 1.00279772, + "epoch": 0.3312812620238553, + "flos": 386996785152.0, + "grad_norm": 0.03885390252626127, + "language_loss": 0.87709427, + "learning_rate": 0.0007802733465308874, + "loss": 0.88759029, + "num_input_tokens_seen": 142392208, + "router_z_loss_mlp": 0.4675293, + "step": 1722, + "time_per_iteration": 2.4662280082702637 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047652, + "balance_loss_mlp": 1.00108933, + "epoch": 0.3314736437091189, + "flos": 495605241600.0, + "grad_norm": 0.03316625802825005, + "language_loss": 0.85110468, + "learning_rate": 0.0007800152968454501, + "loss": 0.86158121, + "num_input_tokens_seen": 142462112, + "router_z_loss_mlp": 0.46508789, + "step": 1723, + "time_per_iteration": 2.6313533782958984 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105186, + "balance_loss_mlp": 1.00515401, + "epoch": 0.33166602539438245, + "flos": 654931473408.0, + "grad_norm": 0.02722776998075876, + "language_loss": 0.90998107, + "learning_rate": 0.0007797571384484334, + "loss": 0.92049968, + "num_input_tokens_seen": 142539120, + "router_z_loss_mlp": 0.46655273, + "step": 1724, + "time_per_iteration": 2.8411970138549805 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049402, + "balance_loss_mlp": 1.00257659, + "epoch": 0.33185840707964603, + "flos": 521835591168.0, + "grad_norm": 0.03419077024576391, + "language_loss": 0.92796665, + "learning_rate": 0.0007794988714400633, + "loss": 0.93846071, + "num_input_tokens_seen": 142611520, + "router_z_loss_mlp": 0.46777344, + "step": 1725, + "time_per_iteration": 2.5964980125427246 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050523, + "balance_loss_mlp": 1.00367355, + "epoch": 0.33205078876490957, + "flos": 437899252992.0, + "grad_norm": 0.033932075991051254, + "language_loss": 0.86014992, + "learning_rate": 0.0007792404959206079, + "loss": 0.87065518, + "num_input_tokens_seen": 142676064, + "router_z_loss_mlp": 0.46801758, + "step": 1726, + "time_per_iteration": 2.491852283477783 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051801, + "balance_loss_mlp": 1.00497568, + "epoch": 0.33224317045017315, + "flos": 770095574784.0, + "grad_norm": 0.034529473302537826, + "language_loss": 0.82129228, + "learning_rate": 0.0007789820119903774, + "loss": 0.83181036, + "num_input_tokens_seen": 142750944, + "router_z_loss_mlp": 0.46777344, + "step": 1727, + "time_per_iteration": 2.9898605346679688 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01058285, + "balance_loss_mlp": 1.01260376, + "epoch": 0.3324355521354367, + "flos": 1469296103424.0, + "grad_norm": 0.013638873720884416, + "language_loss": 0.78492665, + "learning_rate": 0.0007787234197497242, + "loss": 0.79550946, + "num_input_tokens_seen": 142974032, + "router_z_loss_mlp": 0.45605469, + "step": 1728, + "time_per_iteration": 4.859704971313477 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050307, + "balance_loss_mlp": 1.00343382, + "epoch": 0.3326279338207003, + "flos": 497800175616.0, + "grad_norm": 0.033386991625918766, + "language_loss": 0.84234303, + "learning_rate": 0.0007784647192990428, + "loss": 0.85284609, + "num_input_tokens_seen": 143047280, + "router_z_loss_mlp": 0.46826172, + "step": 1729, + "time_per_iteration": 2.7268624305725098 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050854, + "balance_loss_mlp": 1.00419581, + "epoch": 0.33282031550596386, + "flos": 637054127616.0, + "grad_norm": 0.031138270474946127, + "language_loss": 0.81414318, + "learning_rate": 0.0007782059107387696, + "loss": 0.82465172, + "num_input_tokens_seen": 143124224, + "router_z_loss_mlp": 0.46606445, + "step": 1730, + "time_per_iteration": 2.85831618309021 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054467, + "balance_loss_mlp": 1.00752223, + "epoch": 0.3330126971912274, + "flos": 690722136576.0, + "grad_norm": 0.03556521205278414, + "language_loss": 0.89100444, + "learning_rate": 0.0007779469941693826, + "loss": 0.9015491, + "num_input_tokens_seen": 143194048, + "router_z_loss_mlp": 0.46899414, + "step": 1731, + "time_per_iteration": 2.8736839294433594 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01058267, + "balance_loss_mlp": 1.01168013, + "epoch": 0.333205078876491, + "flos": 567554007552.0, + "grad_norm": 0.03898705252222011, + "language_loss": 0.77083337, + "learning_rate": 0.0007776879696914029, + "loss": 0.78141606, + "num_input_tokens_seen": 143272976, + "router_z_loss_mlp": 0.46533203, + "step": 1732, + "time_per_iteration": 2.84578275680542 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055134, + "balance_loss_mlp": 1.00868976, + "epoch": 0.3333974605617545, + "flos": 642171105024.0, + "grad_norm": 0.028730663384365272, + "language_loss": 0.89631069, + "learning_rate": 0.000777428837405392, + "loss": 0.90686202, + "num_input_tokens_seen": 143346496, + "router_z_loss_mlp": 0.46386719, + "step": 1733, + "time_per_iteration": 2.8595433235168457 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049221, + "balance_loss_mlp": 1.00275302, + "epoch": 0.3335898422470181, + "flos": 462779396352.0, + "grad_norm": 0.03984590801707433, + "language_loss": 0.87746447, + "learning_rate": 0.0007771695974119544, + "loss": 0.88795674, + "num_input_tokens_seen": 143410448, + "router_z_loss_mlp": 0.46411133, + "step": 1734, + "time_per_iteration": 2.5200014114379883 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051768, + "balance_loss_mlp": 1.00537193, + "epoch": 0.33378222393228163, + "flos": 854338114560.0, + "grad_norm": 0.03554719013753984, + "language_loss": 0.76235908, + "learning_rate": 0.0007769102498117359, + "loss": 0.77287674, + "num_input_tokens_seen": 143492416, + "router_z_loss_mlp": 0.46337891, + "step": 1735, + "time_per_iteration": 3.1014633178710938 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052548, + "balance_loss_mlp": 1.00624716, + "epoch": 0.3339746056175452, + "flos": 956310246144.0, + "grad_norm": 0.03187783426815399, + "language_loss": 0.80701965, + "learning_rate": 0.000776650794705424, + "loss": 0.81754518, + "num_input_tokens_seen": 143590096, + "router_z_loss_mlp": 0.46240234, + "step": 1736, + "time_per_iteration": 3.253756046295166 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050739, + "balance_loss_mlp": 1.00434327, + "epoch": 0.33416698730280875, + "flos": 545895306240.0, + "grad_norm": 0.03238990381642275, + "language_loss": 0.83209848, + "learning_rate": 0.0007763912321937483, + "loss": 0.84260583, + "num_input_tokens_seen": 143663344, + "router_z_loss_mlp": 0.46337891, + "step": 1737, + "time_per_iteration": 2.712942361831665 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051632, + "balance_loss_mlp": 1.00525999, + "epoch": 0.33435936898807234, + "flos": 1015876776960.0, + "grad_norm": 0.036470780413058734, + "language_loss": 0.8337301, + "learning_rate": 0.0007761315623774799, + "loss": 0.84424639, + "num_input_tokens_seen": 143753072, + "router_z_loss_mlp": 0.46313477, + "step": 1738, + "time_per_iteration": 3.38946795463562 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053137, + "balance_loss_mlp": 1.00671661, + "epoch": 0.3345517506733359, + "flos": 616372356864.0, + "grad_norm": 0.034452353492031275, + "language_loss": 0.88688117, + "learning_rate": 0.0007758717853574313, + "loss": 0.89741254, + "num_input_tokens_seen": 143827280, + "router_z_loss_mlp": 0.46362305, + "step": 1739, + "time_per_iteration": 2.7438387870788574 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105524, + "balance_loss_mlp": 1.00896263, + "epoch": 0.33474413235859946, + "flos": 495570248448.0, + "grad_norm": 0.03665446817767542, + "language_loss": 0.90973008, + "learning_rate": 0.0007756119012344571, + "loss": 0.92028248, + "num_input_tokens_seen": 143895072, + "router_z_loss_mlp": 0.4621582, + "step": 1740, + "time_per_iteration": 2.5443572998046875 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105165, + "balance_loss_mlp": 1.0052774, + "epoch": 0.33493651404386304, + "flos": 629488504320.0, + "grad_norm": 0.0365358867260097, + "language_loss": 0.85516071, + "learning_rate": 0.0007753519101094535, + "loss": 0.86567724, + "num_input_tokens_seen": 143965728, + "router_z_loss_mlp": 0.46313477, + "step": 1741, + "time_per_iteration": 2.785595417022705 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050508, + "balance_loss_mlp": 1.00396836, + "epoch": 0.3351288957291266, + "flos": 514743365376.0, + "grad_norm": 0.038608286094447275, + "language_loss": 0.87042749, + "learning_rate": 0.0007750918120833575, + "loss": 0.88093251, + "num_input_tokens_seen": 144030272, + "router_z_loss_mlp": 0.46484375, + "step": 1742, + "time_per_iteration": 2.5612564086914062 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054744, + "balance_loss_mlp": 1.00825262, + "epoch": 0.33532127741439016, + "flos": 648483731712.0, + "grad_norm": 0.038902913238311417, + "language_loss": 0.88245445, + "learning_rate": 0.0007748316072571485, + "loss": 0.89300191, + "num_input_tokens_seen": 144104048, + "router_z_loss_mlp": 0.46435547, + "step": 1743, + "time_per_iteration": 2.8040030002593994 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01056064, + "balance_loss_mlp": 1.00969172, + "epoch": 0.3355136590996537, + "flos": 769789373184.0, + "grad_norm": 0.032744002461956113, + "language_loss": 0.80090916, + "learning_rate": 0.0007745712957318467, + "loss": 0.81146979, + "num_input_tokens_seen": 144180432, + "router_z_loss_mlp": 0.46313477, + "step": 1744, + "time_per_iteration": 2.955864429473877 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053057, + "balance_loss_mlp": 1.00656557, + "epoch": 0.3357060407849173, + "flos": 596650020096.0, + "grad_norm": 0.027209343707751667, + "language_loss": 0.86834347, + "learning_rate": 0.0007743108776085141, + "loss": 0.87887406, + "num_input_tokens_seen": 144258704, + "router_z_loss_mlp": 0.46435547, + "step": 1745, + "time_per_iteration": 2.8065922260284424 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059361, + "balance_loss_mlp": 1.01277399, + "epoch": 0.3358984224701808, + "flos": 599802442752.0, + "grad_norm": 0.030632877870575562, + "language_loss": 0.83193165, + "learning_rate": 0.0007740503529882543, + "loss": 0.84252524, + "num_input_tokens_seen": 144335104, + "router_z_loss_mlp": 0.46533203, + "step": 1746, + "time_per_iteration": 2.783057451248169 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01058625, + "balance_loss_mlp": 1.01218116, + "epoch": 0.3360908041554444, + "flos": 579430764288.0, + "grad_norm": 0.03209356344176002, + "language_loss": 0.91440552, + "learning_rate": 0.0007737897219722114, + "loss": 0.92499179, + "num_input_tokens_seen": 144402912, + "router_z_loss_mlp": 0.46386719, + "step": 1747, + "time_per_iteration": 2.6678693294525146 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053588, + "balance_loss_mlp": 1.00723922, + "epoch": 0.336283185840708, + "flos": 514621856256.0, + "grad_norm": 0.02947569275247992, + "language_loss": 0.81706387, + "learning_rate": 0.0007735289846615716, + "loss": 0.82759976, + "num_input_tokens_seen": 144475328, + "router_z_loss_mlp": 0.46289062, + "step": 1748, + "time_per_iteration": 2.664217948913574 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049184, + "balance_loss_mlp": 1.00312185, + "epoch": 0.3364755675259715, + "flos": 526014521856.0, + "grad_norm": 0.03437288512368296, + "language_loss": 0.83148289, + "learning_rate": 0.0007732681411575621, + "loss": 0.84197474, + "num_input_tokens_seen": 144548288, + "router_z_loss_mlp": 0.45996094, + "step": 1749, + "time_per_iteration": 2.679304361343384 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051983, + "balance_loss_mlp": 1.00613475, + "epoch": 0.3366679492112351, + "flos": 555974704128.0, + "grad_norm": 0.040002531784274646, + "language_loss": 0.88002014, + "learning_rate": 0.0007730071915614514, + "loss": 0.89053994, + "num_input_tokens_seen": 144619488, + "router_z_loss_mlp": 0.45776367, + "step": 1750, + "time_per_iteration": 2.6813647747039795 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053165, + "balance_loss_mlp": 1.00734055, + "epoch": 0.33686033089649864, + "flos": 428164940544.0, + "grad_norm": 0.03793638318473741, + "language_loss": 0.88937026, + "learning_rate": 0.0007727461359745489, + "loss": 0.89990187, + "num_input_tokens_seen": 144682560, + "router_z_loss_mlp": 0.45751953, + "step": 1751, + "time_per_iteration": 2.459137439727783 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050224, + "balance_loss_mlp": 1.00425673, + "epoch": 0.3370527125817622, + "flos": 542841060096.0, + "grad_norm": 0.030686532457312277, + "language_loss": 0.86821485, + "learning_rate": 0.0007724849744982056, + "loss": 0.87871712, + "num_input_tokens_seen": 144753328, + "router_z_loss_mlp": 0.45898438, + "step": 1752, + "time_per_iteration": 2.682023525238037 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050119, + "balance_loss_mlp": 1.00412822, + "epoch": 0.33724509426702576, + "flos": 543231832320.0, + "grad_norm": 0.03146587739195435, + "language_loss": 0.82788759, + "learning_rate": 0.0007722237072338131, + "loss": 0.8383888, + "num_input_tokens_seen": 144827312, + "router_z_loss_mlp": 0.45922852, + "step": 1753, + "time_per_iteration": 2.7289977073669434 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053084, + "balance_loss_mlp": 1.00735557, + "epoch": 0.33743747595228935, + "flos": 473753099520.0, + "grad_norm": 0.036309304678759154, + "language_loss": 0.86263937, + "learning_rate": 0.0007719623342828046, + "loss": 0.8731702, + "num_input_tokens_seen": 144893488, + "router_z_loss_mlp": 0.45654297, + "step": 1754, + "time_per_iteration": 2.5323400497436523 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046739, + "balance_loss_mlp": 1.00127256, + "epoch": 0.33762985763755293, + "flos": 470837859072.0, + "grad_norm": 0.037209700878319825, + "language_loss": 0.84580374, + "learning_rate": 0.000771700855746654, + "loss": 0.85627109, + "num_input_tokens_seen": 144961152, + "router_z_loss_mlp": 0.45385742, + "step": 1755, + "time_per_iteration": 2.585667848587036 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049151, + "balance_loss_mlp": 1.00366056, + "epoch": 0.33782223932281646, + "flos": 493251859968.0, + "grad_norm": 0.03059786996599164, + "language_loss": 0.89290714, + "learning_rate": 0.0007714392717268763, + "loss": 0.90339863, + "num_input_tokens_seen": 145030576, + "router_z_loss_mlp": 0.45410156, + "step": 1756, + "time_per_iteration": 2.5836589336395264 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048898, + "balance_loss_mlp": 1.00321686, + "epoch": 0.33801462100808005, + "flos": 466018334976.0, + "grad_norm": 0.035533831964213135, + "language_loss": 0.87473714, + "learning_rate": 0.0007711775823250273, + "loss": 0.88522607, + "num_input_tokens_seen": 145095648, + "router_z_loss_mlp": 0.45605469, + "step": 1757, + "time_per_iteration": 2.5619492530822754 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049668, + "balance_loss_mlp": 1.00417781, + "epoch": 0.3382070026933436, + "flos": 797068584960.0, + "grad_norm": 0.03198873828119691, + "language_loss": 0.84101963, + "learning_rate": 0.0007709157876427039, + "loss": 0.85151625, + "num_input_tokens_seen": 145181248, + "router_z_loss_mlp": 0.45410156, + "step": 1758, + "time_per_iteration": 3.084735870361328 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049654, + "balance_loss_mlp": 1.00414026, + "epoch": 0.33839938437860717, + "flos": 509429056512.0, + "grad_norm": 0.031347294296384644, + "language_loss": 0.86196065, + "learning_rate": 0.0007706538877815439, + "loss": 0.87245721, + "num_input_tokens_seen": 145252944, + "router_z_loss_mlp": 0.4543457, + "step": 1759, + "time_per_iteration": 2.6354048252105713 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049225, + "balance_loss_mlp": 1.00371122, + "epoch": 0.3385917660638707, + "flos": 485274077184.0, + "grad_norm": 0.03028112214235413, + "language_loss": 0.83875918, + "learning_rate": 0.0007703918828432259, + "loss": 0.84925139, + "num_input_tokens_seen": 145323168, + "router_z_loss_mlp": 0.4543457, + "step": 1760, + "time_per_iteration": 2.6017844676971436 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049146, + "balance_loss_mlp": 1.00358403, + "epoch": 0.3387841477491343, + "flos": 546416335872.0, + "grad_norm": 0.033680258429279644, + "language_loss": 0.89293355, + "learning_rate": 0.000770129772929469, + "loss": 0.90342498, + "num_input_tokens_seen": 145395776, + "router_z_loss_mlp": 0.45483398, + "step": 1761, + "time_per_iteration": 2.671287775039673 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048026, + "balance_loss_mlp": 1.00217831, + "epoch": 0.3389765294343978, + "flos": 721064342784.0, + "grad_norm": 0.03497277274463044, + "language_loss": 0.89180952, + "learning_rate": 0.0007698675581420334, + "loss": 0.90228981, + "num_input_tokens_seen": 145470576, + "router_z_loss_mlp": 0.45776367, + "step": 1762, + "time_per_iteration": 2.9236271381378174 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105243, + "balance_loss_mlp": 1.00677264, + "epoch": 0.3391689111196614, + "flos": 701264238336.0, + "grad_norm": 0.034268369898116914, + "language_loss": 0.79778481, + "learning_rate": 0.0007696052385827199, + "loss": 0.80830908, + "num_input_tokens_seen": 145548896, + "router_z_loss_mlp": 0.45581055, + "step": 1763, + "time_per_iteration": 2.9605488777160645 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055868, + "balance_loss_mlp": 1.01018691, + "epoch": 0.339361292804925, + "flos": 628249113600.0, + "grad_norm": 0.03454670185411084, + "language_loss": 0.78905737, + "learning_rate": 0.00076934281435337, + "loss": 0.79961604, + "num_input_tokens_seen": 145617136, + "router_z_loss_mlp": 0.45605469, + "step": 1764, + "time_per_iteration": 2.7454025745391846 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052159, + "balance_loss_mlp": 1.00647831, + "epoch": 0.33955367449018853, + "flos": 610795587840.0, + "grad_norm": 0.03693575970108084, + "language_loss": 0.86892688, + "learning_rate": 0.0007690802855558658, + "loss": 0.87944847, + "num_input_tokens_seen": 145696416, + "router_z_loss_mlp": 0.45605469, + "step": 1765, + "time_per_iteration": 2.8936946392059326 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054825, + "balance_loss_mlp": 1.01057434, + "epoch": 0.3397460561754521, + "flos": 1456589191680.0, + "grad_norm": 0.006269192400269108, + "language_loss": 0.76374954, + "learning_rate": 0.0007688176522921302, + "loss": 0.77429777, + "num_input_tokens_seen": 145919680, + "router_z_loss_mlp": 0.44335938, + "step": 1766, + "time_per_iteration": 4.913206100463867 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054352, + "balance_loss_mlp": 1.00855207, + "epoch": 0.33993843786071565, + "flos": 488291384832.0, + "grad_norm": 0.039386286306125895, + "language_loss": 0.89967024, + "learning_rate": 0.0007685549146641262, + "loss": 0.91021377, + "num_input_tokens_seen": 145984272, + "router_z_loss_mlp": 0.45727539, + "step": 1767, + "time_per_iteration": 2.593353271484375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050967, + "balance_loss_mlp": 1.00554788, + "epoch": 0.34013081954597923, + "flos": 418233296640.0, + "grad_norm": 0.032458575290873634, + "language_loss": 0.89062989, + "learning_rate": 0.0007682920727738579, + "loss": 0.90113962, + "num_input_tokens_seen": 146047248, + "router_z_loss_mlp": 0.45336914, + "step": 1768, + "time_per_iteration": 2.510331392288208 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054131, + "balance_loss_mlp": 1.00835514, + "epoch": 0.34032320123124277, + "flos": 438430976256.0, + "grad_norm": 0.037803385345055784, + "language_loss": 0.85379529, + "learning_rate": 0.000768029126723369, + "loss": 0.86433661, + "num_input_tokens_seen": 146111872, + "router_z_loss_mlp": 0.45703125, + "step": 1769, + "time_per_iteration": 2.5152533054351807 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054345, + "balance_loss_mlp": 1.00852144, + "epoch": 0.34051558291650635, + "flos": 458544085248.0, + "grad_norm": 0.04157155741286578, + "language_loss": 0.82432753, + "learning_rate": 0.0007677660766147447, + "loss": 0.83487099, + "num_input_tokens_seen": 146172608, + "router_z_loss_mlp": 0.45751953, + "step": 1770, + "time_per_iteration": 2.5669522285461426 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052738, + "balance_loss_mlp": 1.00858307, + "epoch": 0.3407079646017699, + "flos": 1562140489728.0, + "grad_norm": 0.006526141838203855, + "language_loss": 0.72470945, + "learning_rate": 0.0007675029225501102, + "loss": 0.73523682, + "num_input_tokens_seen": 146413584, + "router_z_loss_mlp": 0.44238281, + "step": 1771, + "time_per_iteration": 4.953578233718872 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051633, + "balance_loss_mlp": 1.00602317, + "epoch": 0.3409003462870335, + "flos": 493531816704.0, + "grad_norm": 0.043561887450476046, + "language_loss": 0.80659652, + "learning_rate": 0.0007672396646316306, + "loss": 0.81711292, + "num_input_tokens_seen": 146476992, + "router_z_loss_mlp": 0.45532227, + "step": 1772, + "time_per_iteration": 2.5720248222351074 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048985, + "balance_loss_mlp": 1.00356674, + "epoch": 0.34109272797229706, + "flos": 809822150400.0, + "grad_norm": 0.03735237922314452, + "language_loss": 0.80629146, + "learning_rate": 0.000766976302961512, + "loss": 0.81678128, + "num_input_tokens_seen": 146552848, + "router_z_loss_mlp": 0.45336914, + "step": 1773, + "time_per_iteration": 3.0438191890716553 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050599, + "balance_loss_mlp": 1.00513268, + "epoch": 0.3412851096575606, + "flos": 471100319232.0, + "grad_norm": 0.03730121261656314, + "language_loss": 0.82086515, + "learning_rate": 0.0007667128376420003, + "loss": 0.83137119, + "num_input_tokens_seen": 146617504, + "router_z_loss_mlp": 0.45385742, + "step": 1774, + "time_per_iteration": 2.5461959838867188 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052091, + "balance_loss_mlp": 1.00681531, + "epoch": 0.3414774913428242, + "flos": 596771529216.0, + "grad_norm": 0.03978671612524881, + "language_loss": 0.85611963, + "learning_rate": 0.0007664492687753817, + "loss": 0.86664057, + "num_input_tokens_seen": 146691568, + "router_z_loss_mlp": 0.4519043, + "step": 1775, + "time_per_iteration": 2.7454183101654053 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049019, + "balance_loss_mlp": 1.00362372, + "epoch": 0.3416698730280877, + "flos": 528508854528.0, + "grad_norm": 0.03225195621375244, + "language_loss": 0.82109249, + "learning_rate": 0.000766185596463983, + "loss": 0.83158267, + "num_input_tokens_seen": 146764208, + "router_z_loss_mlp": 0.453125, + "step": 1776, + "time_per_iteration": 2.636876106262207 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050456, + "balance_loss_mlp": 1.00513279, + "epoch": 0.3418622547133513, + "flos": 876118324992.0, + "grad_norm": 0.033083928099711564, + "language_loss": 0.77454132, + "learning_rate": 0.0007659218208101706, + "loss": 0.78504586, + "num_input_tokens_seen": 146847744, + "router_z_loss_mlp": 0.45239258, + "step": 1777, + "time_per_iteration": 3.097163677215576 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055706, + "balance_loss_mlp": 1.01031137, + "epoch": 0.34205463639861483, + "flos": 604877624064.0, + "grad_norm": 0.03453483859247358, + "language_loss": 0.86064076, + "learning_rate": 0.0007656579419163515, + "loss": 0.87119782, + "num_input_tokens_seen": 146918336, + "router_z_loss_mlp": 0.453125, + "step": 1778, + "time_per_iteration": 2.7452263832092285 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055225, + "balance_loss_mlp": 1.0096159, + "epoch": 0.3422470180838784, + "flos": 464715760896.0, + "grad_norm": 0.037184345749469765, + "language_loss": 0.77793133, + "learning_rate": 0.0007653939598849724, + "loss": 0.78848356, + "num_input_tokens_seen": 146982496, + "router_z_loss_mlp": 0.45532227, + "step": 1779, + "time_per_iteration": 2.5020663738250732 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01057175, + "balance_loss_mlp": 1.0134964, + "epoch": 0.34243939976914195, + "flos": 1589819222016.0, + "grad_norm": 0.009860928497574006, + "language_loss": 0.82880205, + "learning_rate": 0.0007651298748185204, + "loss": 0.83937383, + "num_input_tokens_seen": 147213600, + "router_z_loss_mlp": 0.4375, + "step": 1780, + "time_per_iteration": 4.958939552307129 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054148, + "balance_loss_mlp": 1.00849116, + "epoch": 0.34263178145440554, + "flos": 874444420608.0, + "grad_norm": 0.034671274665512654, + "language_loss": 0.80890739, + "learning_rate": 0.000764865686819522, + "loss": 0.81944883, + "num_input_tokens_seen": 147287664, + "router_z_loss_mlp": 0.45581055, + "step": 1781, + "time_per_iteration": 3.0468943119049072 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01057142, + "balance_loss_mlp": 1.01148522, + "epoch": 0.3428241631396691, + "flos": 507874715904.0, + "grad_norm": 0.02984044691012994, + "language_loss": 0.86276633, + "learning_rate": 0.0007646013959905449, + "loss": 0.87333775, + "num_input_tokens_seen": 147356800, + "router_z_loss_mlp": 0.45581055, + "step": 1782, + "time_per_iteration": 2.59788179397583 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01056783, + "balance_loss_mlp": 1.01114941, + "epoch": 0.34301654482493266, + "flos": 881525952768.0, + "grad_norm": 0.034646354408830966, + "language_loss": 0.81384498, + "learning_rate": 0.0007643370024341949, + "loss": 0.82441282, + "num_input_tokens_seen": 147432496, + "router_z_loss_mlp": 0.45556641, + "step": 1783, + "time_per_iteration": 3.0783512592315674 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048113, + "balance_loss_mlp": 1.00288546, + "epoch": 0.34320892651019624, + "flos": 432669514752.0, + "grad_norm": 0.031189947688426686, + "language_loss": 0.84145617, + "learning_rate": 0.0007640725062531195, + "loss": 0.85193729, + "num_input_tokens_seen": 147495856, + "router_z_loss_mlp": 0.45141602, + "step": 1784, + "time_per_iteration": 2.5152812004089355 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050969, + "balance_loss_mlp": 1.00559807, + "epoch": 0.3434013081954598, + "flos": 464594251776.0, + "grad_norm": 0.03760163078295718, + "language_loss": 0.86810297, + "learning_rate": 0.0007638079075500047, + "loss": 0.87861264, + "num_input_tokens_seen": 147559632, + "router_z_loss_mlp": 0.45288086, + "step": 1785, + "time_per_iteration": 2.5846633911132812 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045052, + "balance_loss_mlp": 1.0019455, + "epoch": 0.34359368988072336, + "flos": 1560677522688.0, + "grad_norm": 0.003111664808940008, + "language_loss": 0.75180668, + "learning_rate": 0.0007635432064275772, + "loss": 0.76225722, + "num_input_tokens_seen": 147794576, + "router_z_loss_mlp": 0.43164062, + "step": 1786, + "time_per_iteration": 4.94433856010437 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010488, + "balance_loss_mlp": 1.003739, + "epoch": 0.3437860715659869, + "flos": 496573423872.0, + "grad_norm": 0.03208809815455149, + "language_loss": 0.83580017, + "learning_rate": 0.0007632784029886026, + "loss": 0.8462882, + "num_input_tokens_seen": 147866960, + "router_z_loss_mlp": 0.45019531, + "step": 1787, + "time_per_iteration": 2.6222987174987793 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050031, + "balance_loss_mlp": 1.00523186, + "epoch": 0.3439784532512505, + "flos": 719610124032.0, + "grad_norm": 0.03771035877194531, + "language_loss": 0.86448389, + "learning_rate": 0.0007630134973358873, + "loss": 0.87498415, + "num_input_tokens_seen": 147947808, + "router_z_loss_mlp": 0.44799805, + "step": 1788, + "time_per_iteration": 2.9359545707702637 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047985, + "balance_loss_mlp": 1.00320995, + "epoch": 0.34417083493651407, + "flos": 566922162432.0, + "grad_norm": 0.0315223877917514, + "language_loss": 0.8730194, + "learning_rate": 0.0007627484895722763, + "loss": 0.88349926, + "num_input_tokens_seen": 148015936, + "router_z_loss_mlp": 0.44775391, + "step": 1789, + "time_per_iteration": 2.710433006286621 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048608, + "balance_loss_mlp": 1.00397587, + "epoch": 0.3443632166217776, + "flos": 797702375424.0, + "grad_norm": 0.034658336241014505, + "language_loss": 0.80973929, + "learning_rate": 0.0007624833798006552, + "loss": 0.82022536, + "num_input_tokens_seen": 148099776, + "router_z_loss_mlp": 0.4465332, + "step": 1790, + "time_per_iteration": 3.061995506286621 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049093, + "balance_loss_mlp": 1.00419891, + "epoch": 0.3445555983070412, + "flos": 570393425664.0, + "grad_norm": 0.0359941873064626, + "language_loss": 0.84664464, + "learning_rate": 0.0007622181681239483, + "loss": 0.85713559, + "num_input_tokens_seen": 148169616, + "router_z_loss_mlp": 0.44873047, + "step": 1791, + "time_per_iteration": 2.708204984664917 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046554, + "balance_loss_mlp": 1.00192165, + "epoch": 0.3447479799923047, + "flos": 569981266176.0, + "grad_norm": 0.030307911746310208, + "language_loss": 0.85264516, + "learning_rate": 0.0007619528546451202, + "loss": 0.86311066, + "num_input_tokens_seen": 148247824, + "router_z_loss_mlp": 0.4465332, + "step": 1792, + "time_per_iteration": 2.8142476081848145 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047842, + "balance_loss_mlp": 1.00323367, + "epoch": 0.3449403616775683, + "flos": 969333074688.0, + "grad_norm": 0.03266645448260783, + "language_loss": 0.84415537, + "learning_rate": 0.0007616874394671745, + "loss": 0.85463381, + "num_input_tokens_seen": 148333040, + "router_z_loss_mlp": 0.4465332, + "step": 1793, + "time_per_iteration": 3.340257406234741 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048671, + "balance_loss_mlp": 1.00411057, + "epoch": 0.34513274336283184, + "flos": 569677009920.0, + "grad_norm": 0.042713127170940564, + "language_loss": 0.85883492, + "learning_rate": 0.0007614219226931547, + "loss": 0.86932158, + "num_input_tokens_seen": 148401840, + "router_z_loss_mlp": 0.44604492, + "step": 1794, + "time_per_iteration": 2.666299343109131 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047529, + "balance_loss_mlp": 1.00301611, + "epoch": 0.3453251250480954, + "flos": 461858846208.0, + "grad_norm": 0.03409376285864792, + "language_loss": 0.85191298, + "learning_rate": 0.0007611563044261435, + "loss": 0.86238825, + "num_input_tokens_seen": 148466576, + "router_z_loss_mlp": 0.44580078, + "step": 1795, + "time_per_iteration": 2.509730577468872 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047985, + "balance_loss_mlp": 1.00340092, + "epoch": 0.34551750673335896, + "flos": 416520508416.0, + "grad_norm": 0.03871598691360063, + "language_loss": 0.87655377, + "learning_rate": 0.0007608905847692631, + "loss": 0.88703358, + "num_input_tokens_seen": 148530016, + "router_z_loss_mlp": 0.4465332, + "step": 1796, + "time_per_iteration": 2.468144416809082 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045751, + "balance_loss_mlp": 1.0012145, + "epoch": 0.34570988841862255, + "flos": 589115499264.0, + "grad_norm": 0.03133980127061019, + "language_loss": 0.87422049, + "learning_rate": 0.0007606247638256749, + "loss": 0.88467801, + "num_input_tokens_seen": 148610064, + "router_z_loss_mlp": 0.44580078, + "step": 1797, + "time_per_iteration": 2.8401029109954834 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050308, + "balance_loss_mlp": 1.00758362, + "epoch": 0.34590227010388613, + "flos": 1571145747456.0, + "grad_norm": 0.007450888717391324, + "language_loss": 0.78170294, + "learning_rate": 0.0007603588416985798, + "loss": 0.79220599, + "num_input_tokens_seen": 148835872, + "router_z_loss_mlp": 0.42773438, + "step": 1798, + "time_per_iteration": 4.913544178009033 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043892, + "balance_loss_mlp": 1.00097656, + "epoch": 0.34609465178914967, + "flos": 1540930886400.0, + "grad_norm": 0.004797214297707501, + "language_loss": 0.79327202, + "learning_rate": 0.0007600928184912179, + "loss": 0.80371094, + "num_input_tokens_seen": 149066864, + "router_z_loss_mlp": 0.4296875, + "step": 1799, + "time_per_iteration": 4.771878719329834 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049395, + "balance_loss_mlp": 1.00469148, + "epoch": 0.34628703347441325, + "flos": 610517576448.0, + "grad_norm": 0.037119753663607306, + "language_loss": 0.86850703, + "learning_rate": 0.0007598266943068686, + "loss": 0.8790009, + "num_input_tokens_seen": 149141600, + "router_z_loss_mlp": 0.44750977, + "step": 1800, + "time_per_iteration": 2.746819496154785 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050462, + "balance_loss_mlp": 1.00535274, + "epoch": 0.3464794151596768, + "flos": 474265380864.0, + "grad_norm": 0.03436691989893219, + "language_loss": 0.84791839, + "learning_rate": 0.0007595604692488507, + "loss": 0.85842299, + "num_input_tokens_seen": 149205888, + "router_z_loss_mlp": 0.45019531, + "step": 1801, + "time_per_iteration": 2.564328908920288 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050744, + "balance_loss_mlp": 1.00587356, + "epoch": 0.34667179684494037, + "flos": 606822736896.0, + "grad_norm": 0.03808690892272381, + "language_loss": 0.83437663, + "learning_rate": 0.0007592941434205215, + "loss": 0.8448841, + "num_input_tokens_seen": 149281280, + "router_z_loss_mlp": 0.44848633, + "step": 1802, + "time_per_iteration": 2.826420545578003 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059933, + "balance_loss_mlp": 1.016922, + "epoch": 0.3468641785302039, + "flos": 1568362709760.0, + "grad_norm": 0.013636299413791342, + "language_loss": 0.73571062, + "learning_rate": 0.0007590277169252782, + "loss": 0.74630988, + "num_input_tokens_seen": 149525008, + "router_z_loss_mlp": 0.43066406, + "step": 1803, + "time_per_iteration": 5.063625812530518 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050175, + "balance_loss_mlp": 1.00523341, + "epoch": 0.3470565602154675, + "flos": 908724484608.0, + "grad_norm": 0.03942668215130471, + "language_loss": 0.80763334, + "learning_rate": 0.0007587611898665566, + "loss": 0.81813502, + "num_input_tokens_seen": 149600624, + "router_z_loss_mlp": 0.44921875, + "step": 1804, + "time_per_iteration": 3.0834579467773438 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050119, + "balance_loss_mlp": 1.0052247, + "epoch": 0.347248941900731, + "flos": 640060741632.0, + "grad_norm": 0.031209613313051415, + "language_loss": 0.82727098, + "learning_rate": 0.0007584945623478315, + "loss": 0.83777213, + "num_input_tokens_seen": 149674224, + "router_z_loss_mlp": 0.44873047, + "step": 1805, + "time_per_iteration": 2.861560106277466 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051541, + "balance_loss_mlp": 1.00688517, + "epoch": 0.3474413235859946, + "flos": 848782732800.0, + "grad_norm": 0.03633023546687314, + "language_loss": 0.81859386, + "learning_rate": 0.000758227834472617, + "loss": 0.82910925, + "num_input_tokens_seen": 149758688, + "router_z_loss_mlp": 0.44702148, + "step": 1806, + "time_per_iteration": 3.0337021350860596 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052212, + "balance_loss_mlp": 1.00767589, + "epoch": 0.3476337052712582, + "flos": 516697226496.0, + "grad_norm": 0.035243207865769656, + "language_loss": 0.77929807, + "learning_rate": 0.0007579610063444664, + "loss": 0.78982013, + "num_input_tokens_seen": 149831648, + "router_z_loss_mlp": 0.44580078, + "step": 1807, + "time_per_iteration": 2.7339653968811035 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01056132, + "balance_loss_mlp": 1.01154768, + "epoch": 0.34782608695652173, + "flos": 915115845888.0, + "grad_norm": 0.03414685220945043, + "language_loss": 0.88006967, + "learning_rate": 0.0007576940780669712, + "loss": 0.89063108, + "num_input_tokens_seen": 149919440, + "router_z_loss_mlp": 0.4465332, + "step": 1808, + "time_per_iteration": 3.211806058883667 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051889, + "balance_loss_mlp": 1.00756717, + "epoch": 0.3480184686417853, + "flos": 775084240128.0, + "grad_norm": 0.07111913657628408, + "language_loss": 0.84903318, + "learning_rate": 0.0007574270497437624, + "loss": 0.85955209, + "num_input_tokens_seen": 150001632, + "router_z_loss_mlp": 0.4440918, + "step": 1809, + "time_per_iteration": 2.984511375427246 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049455, + "balance_loss_mlp": 1.00518048, + "epoch": 0.34821085032704885, + "flos": 578004735744.0, + "grad_norm": 0.031195535995176178, + "language_loss": 0.88877916, + "learning_rate": 0.000757159921478509, + "loss": 0.89927369, + "num_input_tokens_seen": 150077552, + "router_z_loss_mlp": 0.44360352, + "step": 1810, + "time_per_iteration": 2.778917074203491 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051422, + "balance_loss_mlp": 1.00888824, + "epoch": 0.34840323201231244, + "flos": 1528042205952.0, + "grad_norm": 0.009192534613281171, + "language_loss": 0.74450636, + "learning_rate": 0.0007568926933749201, + "loss": 0.75502062, + "num_input_tokens_seen": 150295328, + "router_z_loss_mlp": 0.42578125, + "step": 1811, + "time_per_iteration": 4.791734218597412 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048241, + "balance_loss_mlp": 1.0040617, + "epoch": 0.34859561369757597, + "flos": 510182410752.0, + "grad_norm": 0.038842956055274956, + "language_loss": 0.88272417, + "learning_rate": 0.0007566253655367423, + "loss": 0.89320654, + "num_input_tokens_seen": 150360496, + "router_z_loss_mlp": 0.44262695, + "step": 1812, + "time_per_iteration": 2.6542506217956543 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050997, + "balance_loss_mlp": 1.00689006, + "epoch": 0.34878799538283956, + "flos": 549757341696.0, + "grad_norm": 0.030689577509801048, + "language_loss": 0.90222162, + "learning_rate": 0.000756357938067762, + "loss": 0.91273159, + "num_input_tokens_seen": 150432064, + "router_z_loss_mlp": 0.44189453, + "step": 1813, + "time_per_iteration": 2.6897120475769043 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047642, + "balance_loss_mlp": 1.00346339, + "epoch": 0.34898037706810314, + "flos": 985195321344.0, + "grad_norm": 0.03422241032564105, + "language_loss": 0.83499646, + "learning_rate": 0.0007560904110718033, + "loss": 0.84547287, + "num_input_tokens_seen": 150512176, + "router_z_loss_mlp": 0.44262695, + "step": 1814, + "time_per_iteration": 3.3129422664642334 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045129, + "balance_loss_mlp": 1.00102115, + "epoch": 0.3491727587533667, + "flos": 682837672704.0, + "grad_norm": 0.03439092984945392, + "language_loss": 0.84187126, + "learning_rate": 0.0007558227846527297, + "loss": 0.85232258, + "num_input_tokens_seen": 150586416, + "router_z_loss_mlp": 0.44189453, + "step": 1815, + "time_per_iteration": 2.8228747844696045 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052765, + "balance_loss_mlp": 1.00880051, + "epoch": 0.34936514043863026, + "flos": 394889997312.0, + "grad_norm": 0.04066201843968592, + "language_loss": 0.84257603, + "learning_rate": 0.0007555550589144429, + "loss": 0.8531037, + "num_input_tokens_seen": 150648944, + "router_z_loss_mlp": 0.44042969, + "step": 1816, + "time_per_iteration": 2.4170055389404297 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053968, + "balance_loss_mlp": 1.01000416, + "epoch": 0.3495575221238938, + "flos": 462340992000.0, + "grad_norm": 0.036355924698056825, + "language_loss": 0.84744954, + "learning_rate": 0.000755287233960883, + "loss": 0.85798925, + "num_input_tokens_seen": 150717200, + "router_z_loss_mlp": 0.44042969, + "step": 1817, + "time_per_iteration": 2.577195405960083 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055263, + "balance_loss_mlp": 1.01115596, + "epoch": 0.3497499038091574, + "flos": 725429911296.0, + "grad_norm": 0.037028935917378006, + "language_loss": 0.78975379, + "learning_rate": 0.0007550193098960292, + "loss": 0.80030644, + "num_input_tokens_seen": 150790368, + "router_z_loss_mlp": 0.44189453, + "step": 1818, + "time_per_iteration": 2.9124276638031006 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050391, + "balance_loss_mlp": 1.00609303, + "epoch": 0.3499422854944209, + "flos": 829197456384.0, + "grad_norm": 0.03031702063556045, + "language_loss": 0.8721534, + "learning_rate": 0.0007547512868238988, + "loss": 0.88265729, + "num_input_tokens_seen": 150879872, + "router_z_loss_mlp": 0.44384766, + "step": 1819, + "time_per_iteration": 3.1275570392608643 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046383, + "balance_loss_mlp": 1.00203693, + "epoch": 0.3501346671796845, + "flos": 494543740416.0, + "grad_norm": 0.03689243892136314, + "language_loss": 0.8434422, + "learning_rate": 0.0007544831648485473, + "loss": 0.85390604, + "num_input_tokens_seen": 150953712, + "router_z_loss_mlp": 0.44433594, + "step": 1820, + "time_per_iteration": 2.6672415733337402 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053354, + "balance_loss_mlp": 1.00917482, + "epoch": 0.35032704886494803, + "flos": 579849726720.0, + "grad_norm": 0.04031883928972686, + "language_loss": 0.8166672, + "learning_rate": 0.0007542149440740694, + "loss": 0.82720077, + "num_input_tokens_seen": 151026192, + "router_z_loss_mlp": 0.44262695, + "step": 1821, + "time_per_iteration": 2.659205436706543 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051466, + "balance_loss_mlp": 1.0069536, + "epoch": 0.3505194305502116, + "flos": 585832819200.0, + "grad_norm": 0.035872862949689145, + "language_loss": 0.86380953, + "learning_rate": 0.000753946624604597, + "loss": 0.8743242, + "num_input_tokens_seen": 151100720, + "router_z_loss_mlp": 0.44604492, + "step": 1822, + "time_per_iteration": 2.748387575149536 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049848, + "balance_loss_mlp": 1.00528705, + "epoch": 0.3507118122354752, + "flos": 527979076608.0, + "grad_norm": 0.036265727976650085, + "language_loss": 0.88431466, + "learning_rate": 0.0007536782065443015, + "loss": 0.89481318, + "num_input_tokens_seen": 151166032, + "router_z_loss_mlp": 0.44628906, + "step": 1823, + "time_per_iteration": 2.608429193496704 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054753, + "balance_loss_mlp": 1.00997818, + "epoch": 0.35090419392073874, + "flos": 512546486016.0, + "grad_norm": 0.039277226542114754, + "language_loss": 0.75647306, + "learning_rate": 0.0007534096899973919, + "loss": 0.76702058, + "num_input_tokens_seen": 151232208, + "router_z_loss_mlp": 0.44799805, + "step": 1824, + "time_per_iteration": 2.702721118927002 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049221, + "balance_loss_mlp": 1.0046134, + "epoch": 0.3510965756060023, + "flos": 565196735232.0, + "grad_norm": 0.031185756782702443, + "language_loss": 0.83427215, + "learning_rate": 0.0007531410750681154, + "loss": 0.84476435, + "num_input_tokens_seen": 151308128, + "router_z_loss_mlp": 0.44677734, + "step": 1825, + "time_per_iteration": 2.7568912506103516 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053165, + "balance_loss_mlp": 1.00831807, + "epoch": 0.35128895729126586, + "flos": 1022254532352.0, + "grad_norm": 0.030666943866844928, + "language_loss": 0.87304175, + "learning_rate": 0.0007528723618607575, + "loss": 0.88357341, + "num_input_tokens_seen": 151402560, + "router_z_loss_mlp": 0.44848633, + "step": 1826, + "time_per_iteration": 3.4575371742248535 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049762, + "balance_loss_mlp": 1.00510669, + "epoch": 0.35148133897652944, + "flos": 589425591552.0, + "grad_norm": 0.04947505148138052, + "language_loss": 0.83428013, + "learning_rate": 0.0007526035504796422, + "loss": 0.84477776, + "num_input_tokens_seen": 151478816, + "router_z_loss_mlp": 0.44702148, + "step": 1827, + "time_per_iteration": 2.7913553714752197 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053021, + "balance_loss_mlp": 1.00838912, + "epoch": 0.351673720661793, + "flos": 496286664192.0, + "grad_norm": 0.03604129919469899, + "language_loss": 0.87358594, + "learning_rate": 0.0007523346410291312, + "loss": 0.88411617, + "num_input_tokens_seen": 151554528, + "router_z_loss_mlp": 0.44702148, + "step": 1828, + "time_per_iteration": 2.769817590713501 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049932, + "balance_loss_mlp": 1.00499058, + "epoch": 0.35186610234705656, + "flos": 763999721472.0, + "grad_norm": 0.036507155273352104, + "language_loss": 0.85486639, + "learning_rate": 0.0007520656336136245, + "loss": 0.86536574, + "num_input_tokens_seen": 151629440, + "router_z_loss_mlp": 0.44921875, + "step": 1829, + "time_per_iteration": 2.960890293121338 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048327, + "balance_loss_mlp": 1.00364745, + "epoch": 0.3520584840323201, + "flos": 627389801472.0, + "grad_norm": 0.0323509050656096, + "language_loss": 0.88885164, + "learning_rate": 0.0007517965283375599, + "loss": 0.89933491, + "num_input_tokens_seen": 151708544, + "router_z_loss_mlp": 0.44702148, + "step": 1830, + "time_per_iteration": 2.868405818939209 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047282, + "balance_loss_mlp": 1.00260293, + "epoch": 0.3522508657175837, + "flos": 538449246720.0, + "grad_norm": 0.03139560131485747, + "language_loss": 0.89993465, + "learning_rate": 0.0007515273253054132, + "loss": 0.91040754, + "num_input_tokens_seen": 151779152, + "router_z_loss_mlp": 0.44726562, + "step": 1831, + "time_per_iteration": 2.6341445446014404 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104766, + "balance_loss_mlp": 1.00298083, + "epoch": 0.35244324740284727, + "flos": 568502747904.0, + "grad_norm": 0.03545868131612223, + "language_loss": 0.83198845, + "learning_rate": 0.0007512580246216988, + "loss": 0.8424651, + "num_input_tokens_seen": 151853216, + "router_z_loss_mlp": 0.44726562, + "step": 1832, + "time_per_iteration": 2.691678524017334 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053022, + "balance_loss_mlp": 1.00860476, + "epoch": 0.3526356290881108, + "flos": 514055139840.0, + "grad_norm": 0.03517539350184397, + "language_loss": 0.85415643, + "learning_rate": 0.000750988626390968, + "loss": 0.86468661, + "num_input_tokens_seen": 151920416, + "router_z_loss_mlp": 0.44506836, + "step": 1833, + "time_per_iteration": 2.6027944087982178 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050095, + "balance_loss_mlp": 1.00577271, + "epoch": 0.3528280107733744, + "flos": 596973718272.0, + "grad_norm": 0.033457257877764275, + "language_loss": 0.85569251, + "learning_rate": 0.0007507191307178108, + "loss": 0.86619347, + "num_input_tokens_seen": 151990848, + "router_z_loss_mlp": 0.4440918, + "step": 1834, + "time_per_iteration": 2.8065004348754883 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054314, + "balance_loss_mlp": 1.00999165, + "epoch": 0.3530203924586379, + "flos": 552299306496.0, + "grad_norm": 0.040042804692427734, + "language_loss": 0.75668854, + "learning_rate": 0.0007504495377068543, + "loss": 0.76723164, + "num_input_tokens_seen": 152064864, + "router_z_loss_mlp": 0.4440918, + "step": 1835, + "time_per_iteration": 2.736536741256714 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052598, + "balance_loss_mlp": 1.00832355, + "epoch": 0.3532127741439015, + "flos": 654306431232.0, + "grad_norm": 0.0387965270782292, + "language_loss": 0.82353514, + "learning_rate": 0.0007501798474627642, + "loss": 0.83406115, + "num_input_tokens_seen": 152150096, + "router_z_loss_mlp": 0.44360352, + "step": 1836, + "time_per_iteration": 2.9019014835357666 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052616, + "balance_loss_mlp": 1.00824583, + "epoch": 0.35340515582916504, + "flos": 724151636736.0, + "grad_norm": 0.03634896017563763, + "language_loss": 0.84383756, + "learning_rate": 0.0007499100600902433, + "loss": 0.85436368, + "num_input_tokens_seen": 152232528, + "router_z_loss_mlp": 0.44458008, + "step": 1837, + "time_per_iteration": 3.0071663856506348 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105242, + "balance_loss_mlp": 1.00812232, + "epoch": 0.35359753751442863, + "flos": 595998733056.0, + "grad_norm": 0.039287132740407786, + "language_loss": 0.853827, + "learning_rate": 0.0007496401756940324, + "loss": 0.86435115, + "num_input_tokens_seen": 152299584, + "router_z_loss_mlp": 0.44384766, + "step": 1838, + "time_per_iteration": 2.6924545764923096 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052318, + "balance_loss_mlp": 1.00780547, + "epoch": 0.3537899191996922, + "flos": 633806440704.0, + "grad_norm": 0.041905435038062475, + "language_loss": 0.83424079, + "learning_rate": 0.0007493701943789098, + "loss": 0.84476393, + "num_input_tokens_seen": 152370368, + "router_z_loss_mlp": 0.44580078, + "step": 1839, + "time_per_iteration": 2.744781970977783 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051713, + "balance_loss_mlp": 1.00727141, + "epoch": 0.35398230088495575, + "flos": 507353686272.0, + "grad_norm": 0.0353986915713622, + "language_loss": 0.8339026, + "learning_rate": 0.000749100116249692, + "loss": 0.84441972, + "num_input_tokens_seen": 152436928, + "router_z_loss_mlp": 0.44506836, + "step": 1840, + "time_per_iteration": 2.5823822021484375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049367, + "balance_loss_mlp": 1.00490189, + "epoch": 0.35417468257021933, + "flos": 509047032576.0, + "grad_norm": 0.03988576427868324, + "language_loss": 0.86907303, + "learning_rate": 0.0007488299414112321, + "loss": 0.87956673, + "num_input_tokens_seen": 152505952, + "router_z_loss_mlp": 0.4453125, + "step": 1841, + "time_per_iteration": 2.6171295642852783 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055735, + "balance_loss_mlp": 1.01126969, + "epoch": 0.35436706425548287, + "flos": 657660076032.0, + "grad_norm": 0.035376771477334756, + "language_loss": 0.78015333, + "learning_rate": 0.0007485596699684215, + "loss": 0.79071069, + "num_input_tokens_seen": 152577408, + "router_z_loss_mlp": 0.44555664, + "step": 1842, + "time_per_iteration": 2.8393046855926514 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070571, + "balance_loss_mlp": 1.02572489, + "epoch": 0.35455944594074645, + "flos": 653889414144.0, + "grad_norm": 0.03498191670442302, + "language_loss": 0.86517459, + "learning_rate": 0.000748289302026189, + "loss": 0.87588024, + "num_input_tokens_seen": 152654480, + "router_z_loss_mlp": 0.44848633, + "step": 1843, + "time_per_iteration": 2.8524656295776367 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060154, + "balance_loss_mlp": 1.01566541, + "epoch": 0.35475182762601, + "flos": 850011429888.0, + "grad_norm": 0.03510464987001869, + "language_loss": 0.86422503, + "learning_rate": 0.0007480188376895004, + "loss": 0.87482655, + "num_input_tokens_seen": 152732304, + "router_z_loss_mlp": 0.4453125, + "step": 1844, + "time_per_iteration": 3.1228320598602295 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048378, + "balance_loss_mlp": 1.00584412, + "epoch": 0.3549442093112736, + "flos": 1524777989376.0, + "grad_norm": 0.00626506088035535, + "language_loss": 0.7381134, + "learning_rate": 0.0007477482770633596, + "loss": 0.74859715, + "num_input_tokens_seen": 152965952, + "router_z_loss_mlp": 0.42578125, + "step": 1845, + "time_per_iteration": 4.8881309032440186 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053267, + "balance_loss_mlp": 1.00906432, + "epoch": 0.3551365909965371, + "flos": 652715152128.0, + "grad_norm": 0.03760423595997357, + "language_loss": 0.78996736, + "learning_rate": 0.0007474776202528074, + "loss": 0.80050004, + "num_input_tokens_seen": 153053088, + "router_z_loss_mlp": 0.44287109, + "step": 1846, + "time_per_iteration": 2.9740474224090576 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055296, + "balance_loss_mlp": 1.01118839, + "epoch": 0.3553289726818007, + "flos": 898923098112.0, + "grad_norm": 0.04404679517400465, + "language_loss": 0.81547415, + "learning_rate": 0.000747206867362922, + "loss": 0.82602704, + "num_input_tokens_seen": 153129216, + "router_z_loss_mlp": 0.44189453, + "step": 1847, + "time_per_iteration": 3.0834994316101074 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052455, + "balance_loss_mlp": 1.00822854, + "epoch": 0.3555213543670643, + "flos": 689734512384.0, + "grad_norm": 0.03965516085145463, + "language_loss": 0.8451193, + "learning_rate": 0.0007469360184988194, + "loss": 0.85564387, + "num_input_tokens_seen": 153199360, + "router_z_loss_mlp": 0.44311523, + "step": 1848, + "time_per_iteration": 2.8074848651885986 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050526, + "balance_loss_mlp": 1.00632286, + "epoch": 0.3557137360523278, + "flos": 539604066816.0, + "grad_norm": 0.033414642983477745, + "language_loss": 0.87585986, + "learning_rate": 0.0007466650737656518, + "loss": 0.88636506, + "num_input_tokens_seen": 153269168, + "router_z_loss_mlp": 0.44287109, + "step": 1849, + "time_per_iteration": 2.604926347732544 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049824, + "balance_loss_mlp": 1.00562072, + "epoch": 0.3559061177375914, + "flos": 403154539776.0, + "grad_norm": 0.03235738057519393, + "language_loss": 0.9068622, + "learning_rate": 0.0007463940332686098, + "loss": 0.91736042, + "num_input_tokens_seen": 153333120, + "router_z_loss_mlp": 0.44287109, + "step": 1850, + "time_per_iteration": 2.4913558959960938 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01056333, + "balance_loss_mlp": 1.01196373, + "epoch": 0.35609849942285493, + "flos": 697895042304.0, + "grad_norm": 0.0320980052654178, + "language_loss": 0.85078359, + "learning_rate": 0.0007461228971129205, + "loss": 0.86134696, + "num_input_tokens_seen": 153407600, + "router_z_loss_mlp": 0.44458008, + "step": 1851, + "time_per_iteration": 2.898726463317871 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059942, + "balance_loss_mlp": 1.01557255, + "epoch": 0.3562908811081185, + "flos": 570002653440.0, + "grad_norm": 0.036011031747473804, + "language_loss": 0.86088216, + "learning_rate": 0.0007458516654038483, + "loss": 0.87148154, + "num_input_tokens_seen": 153477408, + "router_z_loss_mlp": 0.44458008, + "step": 1852, + "time_per_iteration": 2.6340625286102295 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050769, + "balance_loss_mlp": 1.00651896, + "epoch": 0.35648326279338205, + "flos": 683610468864.0, + "grad_norm": 0.03085087761867809, + "language_loss": 0.87196577, + "learning_rate": 0.0007455803382466946, + "loss": 0.88247347, + "num_input_tokens_seen": 153551888, + "router_z_loss_mlp": 0.44335938, + "step": 1853, + "time_per_iteration": 2.7936782836914062 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048744, + "balance_loss_mlp": 1.00468445, + "epoch": 0.35667564447864564, + "flos": 630341980416.0, + "grad_norm": 0.02905562967314866, + "language_loss": 0.8756358, + "learning_rate": 0.0007453089157467979, + "loss": 0.88612318, + "num_input_tokens_seen": 153626912, + "router_z_loss_mlp": 0.44140625, + "step": 1854, + "time_per_iteration": 2.8003768920898438 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053337, + "balance_loss_mlp": 1.00920558, + "epoch": 0.35686802616390917, + "flos": 815505844224.0, + "grad_norm": 0.03187136352260198, + "language_loss": 0.82840991, + "learning_rate": 0.0007450373980095341, + "loss": 0.83894324, + "num_input_tokens_seen": 153711312, + "router_z_loss_mlp": 0.44213867, + "step": 1855, + "time_per_iteration": 3.072218179702759 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052734, + "balance_loss_mlp": 1.00869787, + "epoch": 0.35706040784917276, + "flos": 527206280448.0, + "grad_norm": 0.03314729603592228, + "language_loss": 0.87318838, + "learning_rate": 0.0007447657851403155, + "loss": 0.88371575, + "num_input_tokens_seen": 153780208, + "router_z_loss_mlp": 0.44116211, + "step": 1856, + "time_per_iteration": 2.5849640369415283 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047421, + "balance_loss_mlp": 1.00338531, + "epoch": 0.35725278953443634, + "flos": 513065570304.0, + "grad_norm": 0.033114806318055315, + "language_loss": 0.79136717, + "learning_rate": 0.0007444940772445915, + "loss": 0.80184138, + "num_input_tokens_seen": 153853152, + "router_z_loss_mlp": 0.44116211, + "step": 1857, + "time_per_iteration": 2.729100227355957 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048076, + "balance_loss_mlp": 1.00404048, + "epoch": 0.3574451712196999, + "flos": 488493573888.0, + "grad_norm": 0.030889137628629628, + "language_loss": 0.80389744, + "learning_rate": 0.0007442222744278484, + "loss": 0.81437826, + "num_input_tokens_seen": 153924160, + "router_z_loss_mlp": 0.44116211, + "step": 1858, + "time_per_iteration": 2.673224687576294 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048438, + "balance_loss_mlp": 1.00433075, + "epoch": 0.35763755290496346, + "flos": 551822018304.0, + "grad_norm": 0.029026961526961815, + "language_loss": 0.8481214, + "learning_rate": 0.0007439503767956099, + "loss": 0.8586058, + "num_input_tokens_seen": 153998688, + "router_z_loss_mlp": 0.44189453, + "step": 1859, + "time_per_iteration": 2.7095680236816406 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104821, + "balance_loss_mlp": 1.00567627, + "epoch": 0.357829934590227, + "flos": 1507228232448.0, + "grad_norm": 0.007157576597672099, + "language_loss": 0.79671603, + "learning_rate": 0.0007436783844534352, + "loss": 0.80719817, + "num_input_tokens_seen": 154230960, + "router_z_loss_mlp": 0.42578125, + "step": 1860, + "time_per_iteration": 4.909587383270264 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049455, + "balance_loss_mlp": 1.00549006, + "epoch": 0.3580223162754906, + "flos": 569842260480.0, + "grad_norm": 0.027013738684289513, + "language_loss": 0.86190987, + "learning_rate": 0.000743406297506922, + "loss": 0.87240434, + "num_input_tokens_seen": 154309104, + "router_z_loss_mlp": 0.44042969, + "step": 1861, + "time_per_iteration": 2.7355735301971436 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104917, + "balance_loss_mlp": 1.00518215, + "epoch": 0.3582146979607541, + "flos": 627761131776.0, + "grad_norm": 0.0339710504259095, + "language_loss": 0.84903038, + "learning_rate": 0.0007431341160617031, + "loss": 0.8595221, + "num_input_tokens_seen": 154387424, + "router_z_loss_mlp": 0.44067383, + "step": 1862, + "time_per_iteration": 2.8932178020477295 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054928, + "balance_loss_mlp": 1.01082051, + "epoch": 0.3584070796460177, + "flos": 508319923200.0, + "grad_norm": 0.030700215862736833, + "language_loss": 0.88826722, + "learning_rate": 0.0007428618402234491, + "loss": 0.89881647, + "num_input_tokens_seen": 154459952, + "router_z_loss_mlp": 0.44189453, + "step": 1863, + "time_per_iteration": 2.6574699878692627 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105281, + "balance_loss_mlp": 1.00882196, + "epoch": 0.3585994613312813, + "flos": 607641219840.0, + "grad_norm": 0.030466419719222444, + "language_loss": 0.80836076, + "learning_rate": 0.0007425894700978668, + "loss": 0.8188889, + "num_input_tokens_seen": 154535456, + "router_z_loss_mlp": 0.44067383, + "step": 1864, + "time_per_iteration": 2.7388875484466553 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048686, + "balance_loss_mlp": 1.00467396, + "epoch": 0.3587918430165448, + "flos": 1415089579776.0, + "grad_norm": 0.030441642762586523, + "language_loss": 0.8033703, + "learning_rate": 0.0007423170057906996, + "loss": 0.8138572, + "num_input_tokens_seen": 154627568, + "router_z_loss_mlp": 0.44091797, + "step": 1865, + "time_per_iteration": 3.8431384563446045 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044851, + "balance_loss_mlp": 1.00086236, + "epoch": 0.3589842247018084, + "flos": 479514561024.0, + "grad_norm": 0.03198832631900347, + "language_loss": 0.8674798, + "learning_rate": 0.0007420444474077275, + "loss": 0.87792838, + "num_input_tokens_seen": 154694640, + "router_z_loss_mlp": 0.44067383, + "step": 1866, + "time_per_iteration": 2.5487258434295654 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046851, + "balance_loss_mlp": 1.0028863, + "epoch": 0.35917660638707194, + "flos": 505706026752.0, + "grad_norm": 0.036738697797889144, + "language_loss": 0.90374953, + "learning_rate": 0.0007417717950547671, + "loss": 0.91421801, + "num_input_tokens_seen": 154762048, + "router_z_loss_mlp": 0.44042969, + "step": 1867, + "time_per_iteration": 2.6784894466400146 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052044, + "balance_loss_mlp": 1.00960541, + "epoch": 0.3593689880723355, + "flos": 1495484645376.0, + "grad_norm": 0.0080630279180651, + "language_loss": 0.75996608, + "learning_rate": 0.0007414990488376713, + "loss": 0.77048653, + "num_input_tokens_seen": 154989952, + "router_z_loss_mlp": 0.42480469, + "step": 1868, + "time_per_iteration": 4.930212497711182 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104515, + "balance_loss_mlp": 1.00118589, + "epoch": 0.35956136975759906, + "flos": 529672422912.0, + "grad_norm": 0.03031015371847706, + "language_loss": 0.85577166, + "learning_rate": 0.0007412262088623299, + "loss": 0.86622322, + "num_input_tokens_seen": 155066992, + "router_z_loss_mlp": 0.44042969, + "step": 1869, + "time_per_iteration": 2.73066782951355 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047751, + "balance_loss_mlp": 1.00385797, + "epoch": 0.35975375144286265, + "flos": 536000600832.0, + "grad_norm": 0.03552204952813077, + "language_loss": 0.80084878, + "learning_rate": 0.0007409532752346684, + "loss": 0.81132627, + "num_input_tokens_seen": 155137616, + "router_z_loss_mlp": 0.43969727, + "step": 1870, + "time_per_iteration": 2.6379218101501465 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050376, + "balance_loss_mlp": 1.00638759, + "epoch": 0.3599461331281262, + "flos": 505929603072.0, + "grad_norm": 0.028943079800369927, + "language_loss": 0.8876543, + "learning_rate": 0.0007406802480606491, + "loss": 0.89815807, + "num_input_tokens_seen": 155209248, + "router_z_loss_mlp": 0.44067383, + "step": 1871, + "time_per_iteration": 2.6258225440979004 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049318, + "balance_loss_mlp": 1.00547302, + "epoch": 0.36013851481338977, + "flos": 512537737728.0, + "grad_norm": 0.03609789661305553, + "language_loss": 0.91903639, + "learning_rate": 0.0007404071274462707, + "loss": 0.92952955, + "num_input_tokens_seen": 155274176, + "router_z_loss_mlp": 0.43920898, + "step": 1872, + "time_per_iteration": 2.6111674308776855 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049057, + "balance_loss_mlp": 1.00494921, + "epoch": 0.36033089649865335, + "flos": 548632657152.0, + "grad_norm": 0.03255043761438457, + "language_loss": 0.84506214, + "learning_rate": 0.0007401339134975682, + "loss": 0.85555267, + "num_input_tokens_seen": 155343232, + "router_z_loss_mlp": 0.44189453, + "step": 1873, + "time_per_iteration": 2.6355786323547363 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049816, + "balance_loss_mlp": 1.00575614, + "epoch": 0.3605232781839169, + "flos": 459614334720.0, + "grad_norm": 0.03456024010205507, + "language_loss": 0.84983587, + "learning_rate": 0.0007398606063206122, + "loss": 0.86033404, + "num_input_tokens_seen": 155410080, + "router_z_loss_mlp": 0.44140625, + "step": 1874, + "time_per_iteration": 2.5788064002990723 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049812, + "balance_loss_mlp": 1.00577569, + "epoch": 0.36071565986918047, + "flos": 510564434688.0, + "grad_norm": 0.03262157431229983, + "language_loss": 0.79280519, + "learning_rate": 0.0007395872060215101, + "loss": 0.80330336, + "num_input_tokens_seen": 155476240, + "router_z_loss_mlp": 0.44116211, + "step": 1875, + "time_per_iteration": 2.59242582321167 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051751, + "balance_loss_mlp": 1.00785792, + "epoch": 0.360908041554444, + "flos": 560257647360.0, + "grad_norm": 0.03426029536230158, + "language_loss": 0.89306337, + "learning_rate": 0.0007393137127064056, + "loss": 0.9035809, + "num_input_tokens_seen": 155543392, + "router_z_loss_mlp": 0.43969727, + "step": 1876, + "time_per_iteration": 2.6217613220214844 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049616, + "balance_loss_mlp": 1.00577068, + "epoch": 0.3611004232397076, + "flos": 524879143680.0, + "grad_norm": 0.03313366432597027, + "language_loss": 0.84778088, + "learning_rate": 0.0007390401264814779, + "loss": 0.85827708, + "num_input_tokens_seen": 155613264, + "router_z_loss_mlp": 0.43920898, + "step": 1877, + "time_per_iteration": 2.621366262435913 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051372, + "balance_loss_mlp": 1.00752687, + "epoch": 0.3612928049249711, + "flos": 542033270784.0, + "grad_norm": 0.036139064810301956, + "language_loss": 0.85492337, + "learning_rate": 0.0007387664474529427, + "loss": 0.86543715, + "num_input_tokens_seen": 155683712, + "router_z_loss_mlp": 0.43920898, + "step": 1878, + "time_per_iteration": 2.6200942993164062 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051486, + "balance_loss_mlp": 1.00776029, + "epoch": 0.3614851866102347, + "flos": 553630070784.0, + "grad_norm": 0.03346030230294773, + "language_loss": 0.91826439, + "learning_rate": 0.0007384926757270518, + "loss": 0.92877924, + "num_input_tokens_seen": 155751760, + "router_z_loss_mlp": 0.43798828, + "step": 1879, + "time_per_iteration": 2.6367645263671875 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048472, + "balance_loss_mlp": 1.00481761, + "epoch": 0.36167756829549824, + "flos": 773427832320.0, + "grad_norm": 0.030641441804162946, + "language_loss": 0.80120707, + "learning_rate": 0.0007382188114100924, + "loss": 0.81169182, + "num_input_tokens_seen": 155830464, + "router_z_loss_mlp": 0.43725586, + "step": 1880, + "time_per_iteration": 2.9662272930145264 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048714, + "balance_loss_mlp": 1.0051316, + "epoch": 0.36186994998076183, + "flos": 713188627200.0, + "grad_norm": 0.030233131555612264, + "language_loss": 0.82161707, + "learning_rate": 0.0007379448546083884, + "loss": 0.83210421, + "num_input_tokens_seen": 155906208, + "router_z_loss_mlp": 0.43652344, + "step": 1881, + "time_per_iteration": 2.9433577060699463 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104791, + "balance_loss_mlp": 1.00420797, + "epoch": 0.3620623316660254, + "flos": 748901522688.0, + "grad_norm": 0.028477152913266954, + "language_loss": 0.88624489, + "learning_rate": 0.0007376708054282992, + "loss": 0.89672405, + "num_input_tokens_seen": 155983584, + "router_z_loss_mlp": 0.43774414, + "step": 1882, + "time_per_iteration": 2.9565789699554443 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047818, + "balance_loss_mlp": 1.00425851, + "epoch": 0.36225471335128895, + "flos": 483535044096.0, + "grad_norm": 0.03088815199044137, + "language_loss": 0.84632647, + "learning_rate": 0.0007373966639762201, + "loss": 0.85680467, + "num_input_tokens_seen": 156052464, + "router_z_loss_mlp": 0.4362793, + "step": 1883, + "time_per_iteration": 2.6308107376098633 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051227, + "balance_loss_mlp": 1.00762069, + "epoch": 0.36244709503655254, + "flos": 507911654400.0, + "grad_norm": 0.045291722940018896, + "language_loss": 0.89109468, + "learning_rate": 0.0007371224303585822, + "loss": 0.90160698, + "num_input_tokens_seen": 156121424, + "router_z_loss_mlp": 0.43676758, + "step": 1884, + "time_per_iteration": 2.5738682746887207 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053806, + "balance_loss_mlp": 1.01194, + "epoch": 0.36263947672181607, + "flos": 1397054741760.0, + "grad_norm": 0.007615502937667497, + "language_loss": 0.80357069, + "learning_rate": 0.0007368481046818524, + "loss": 0.81410873, + "num_input_tokens_seen": 156346144, + "router_z_loss_mlp": 0.41894531, + "step": 1885, + "time_per_iteration": 4.7547221183776855 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105859, + "balance_loss_mlp": 1.01500738, + "epoch": 0.36283185840707965, + "flos": 654523204608.0, + "grad_norm": 0.03432185210428161, + "language_loss": 0.83272493, + "learning_rate": 0.0007365736870525335, + "loss": 0.84331077, + "num_input_tokens_seen": 156420880, + "router_z_loss_mlp": 0.43652344, + "step": 1886, + "time_per_iteration": 2.8305654525756836 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049425, + "balance_loss_mlp": 1.00591362, + "epoch": 0.3630242400923432, + "flos": 489845725440.0, + "grad_norm": 0.036050619102321185, + "language_loss": 0.8310129, + "learning_rate": 0.000736299177577164, + "loss": 0.84150714, + "num_input_tokens_seen": 156485616, + "router_z_loss_mlp": 0.43579102, + "step": 1887, + "time_per_iteration": 2.632485866546631 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105207, + "balance_loss_mlp": 1.00853443, + "epoch": 0.3632166217776068, + "flos": 518232125184.0, + "grad_norm": 0.034844830144856315, + "language_loss": 0.84275633, + "learning_rate": 0.0007360245763623174, + "loss": 0.85327709, + "num_input_tokens_seen": 156557840, + "router_z_loss_mlp": 0.43603516, + "step": 1888, + "time_per_iteration": 2.6480350494384766 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049354, + "balance_loss_mlp": 1.00596213, + "epoch": 0.36340900346287036, + "flos": 647348353536.0, + "grad_norm": 0.03423797247490227, + "language_loss": 0.90607542, + "learning_rate": 0.0007357498835146039, + "loss": 0.91656893, + "num_input_tokens_seen": 156632496, + "router_z_loss_mlp": 0.43457031, + "step": 1889, + "time_per_iteration": 2.8152430057525635 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055179, + "balance_loss_mlp": 1.01154852, + "epoch": 0.3636013851481339, + "flos": 554411615232.0, + "grad_norm": 0.0362068794335816, + "language_loss": 0.87730169, + "learning_rate": 0.0007354750991406684, + "loss": 0.8878535, + "num_input_tokens_seen": 156705296, + "router_z_loss_mlp": 0.43701172, + "step": 1890, + "time_per_iteration": 2.71056866645813 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047867, + "balance_loss_mlp": 1.0042125, + "epoch": 0.3637937668333975, + "flos": 547692665088.0, + "grad_norm": 0.03762567530645649, + "language_loss": 0.81321651, + "learning_rate": 0.0007352002233471919, + "loss": 0.82369518, + "num_input_tokens_seen": 156773376, + "router_z_loss_mlp": 0.43725586, + "step": 1891, + "time_per_iteration": 2.6590068340301514 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054921, + "balance_loss_mlp": 1.01098096, + "epoch": 0.363986148518661, + "flos": 539211349248.0, + "grad_norm": 0.036762310622647384, + "language_loss": 0.79772675, + "learning_rate": 0.0007349252562408906, + "loss": 0.808276, + "num_input_tokens_seen": 156844336, + "router_z_loss_mlp": 0.44018555, + "step": 1892, + "time_per_iteration": 2.715721368789673 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0111044, + "balance_loss_mlp": 1.0663805, + "epoch": 0.3641785302039246, + "flos": 661511417856.0, + "grad_norm": 0.04360229312277944, + "language_loss": 0.82000142, + "learning_rate": 0.0007346501979285158, + "loss": 0.83110583, + "num_input_tokens_seen": 156918848, + "router_z_loss_mlp": 0.44140625, + "step": 1893, + "time_per_iteration": 2.927184820175171 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061302, + "balance_loss_mlp": 1.01934052, + "epoch": 0.36437091188918813, + "flos": 1472084965632.0, + "grad_norm": 0.015393341944361743, + "language_loss": 0.80539101, + "learning_rate": 0.0007343750485168551, + "loss": 0.81600404, + "num_input_tokens_seen": 157134736, + "router_z_loss_mlp": 0.41992188, + "step": 1894, + "time_per_iteration": 4.786630868911743 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050162, + "balance_loss_mlp": 1.00648379, + "epoch": 0.3645632935744517, + "flos": 598445433600.0, + "grad_norm": 0.030741456608760154, + "language_loss": 0.86771834, + "learning_rate": 0.0007340998081127308, + "loss": 0.87822002, + "num_input_tokens_seen": 157211920, + "router_z_loss_mlp": 0.4375, + "step": 1895, + "time_per_iteration": 2.7590408325195312 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046448, + "balance_loss_mlp": 1.00284135, + "epoch": 0.36475567525971525, + "flos": 600696748032.0, + "grad_norm": 0.032247737775586885, + "language_loss": 0.91682166, + "learning_rate": 0.0007338244768230007, + "loss": 0.92728615, + "num_input_tokens_seen": 157284224, + "router_z_loss_mlp": 0.43676758, + "step": 1896, + "time_per_iteration": 2.806001663208008 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048582, + "balance_loss_mlp": 1.00502336, + "epoch": 0.36494805694497884, + "flos": 799832180736.0, + "grad_norm": 0.03166243516623692, + "language_loss": 0.89817142, + "learning_rate": 0.0007335490547545578, + "loss": 0.90865725, + "num_input_tokens_seen": 157367920, + "router_z_loss_mlp": 0.4362793, + "step": 1897, + "time_per_iteration": 3.0448927879333496 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049034, + "balance_loss_mlp": 1.00535595, + "epoch": 0.3651404386302424, + "flos": 638478210816.0, + "grad_norm": 0.03536594015703217, + "language_loss": 0.82896376, + "learning_rate": 0.0007332735420143308, + "loss": 0.83945411, + "num_input_tokens_seen": 157438672, + "router_z_loss_mlp": 0.4375, + "step": 1898, + "time_per_iteration": 2.739990234375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047853, + "balance_loss_mlp": 1.00419891, + "epoch": 0.36533282031550596, + "flos": 492563634432.0, + "grad_norm": 0.03491103953335563, + "language_loss": 0.87321162, + "learning_rate": 0.0007329979387092826, + "loss": 0.88369012, + "num_input_tokens_seen": 157505888, + "router_z_loss_mlp": 0.43725586, + "step": 1899, + "time_per_iteration": 2.5661838054656982 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044023, + "balance_loss_mlp": 1.00020182, + "epoch": 0.36552520200076954, + "flos": 857509979136.0, + "grad_norm": 0.025671163998745472, + "language_loss": 0.84557235, + "learning_rate": 0.0007327222449464124, + "loss": 0.85601258, + "num_input_tokens_seen": 157601568, + "router_z_loss_mlp": 0.43896484, + "step": 1900, + "time_per_iteration": 3.2916476726531982 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049138, + "balance_loss_mlp": 1.00545931, + "epoch": 0.3657175836860331, + "flos": 484716109056.0, + "grad_norm": 0.033162883177173925, + "language_loss": 0.89287698, + "learning_rate": 0.0007324464608327538, + "loss": 0.90336835, + "num_input_tokens_seen": 157670992, + "router_z_loss_mlp": 0.4375, + "step": 1901, + "time_per_iteration": 2.6514644622802734 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050151, + "balance_loss_mlp": 1.00647259, + "epoch": 0.36590996537129666, + "flos": 435721815552.0, + "grad_norm": 0.0385016057803441, + "language_loss": 0.88887352, + "learning_rate": 0.0007321705864753758, + "loss": 0.89937502, + "num_input_tokens_seen": 157743616, + "router_z_loss_mlp": 0.4375, + "step": 1902, + "time_per_iteration": 2.6785683631896973 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045565, + "balance_loss_mlp": 1.00171947, + "epoch": 0.3661023470565602, + "flos": 713514270720.0, + "grad_norm": 0.027132815564249787, + "language_loss": 0.85073566, + "learning_rate": 0.0007318946219813823, + "loss": 0.86119133, + "num_input_tokens_seen": 157823520, + "router_z_loss_mlp": 0.43920898, + "step": 1903, + "time_per_iteration": 2.9874324798583984 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104835, + "balance_loss_mlp": 1.00431406, + "epoch": 0.3662947287418238, + "flos": 565823722752.0, + "grad_norm": 0.03452387251033087, + "language_loss": 0.90632051, + "learning_rate": 0.000731618567457912, + "loss": 0.91680402, + "num_input_tokens_seen": 157893248, + "router_z_loss_mlp": 0.44116211, + "step": 1904, + "time_per_iteration": 2.684290885925293 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049154, + "balance_loss_mlp": 1.00516582, + "epoch": 0.3664871104270873, + "flos": 791203110912.0, + "grad_norm": 0.032826620308443535, + "language_loss": 0.87174082, + "learning_rate": 0.000731342423012139, + "loss": 0.88223237, + "num_input_tokens_seen": 157973216, + "router_z_loss_mlp": 0.44067383, + "step": 1905, + "time_per_iteration": 3.0617177486419678 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051542, + "balance_loss_mlp": 1.00750625, + "epoch": 0.3666794921123509, + "flos": 753981561600.0, + "grad_norm": 0.03506961035904521, + "language_loss": 0.83108962, + "learning_rate": 0.0007310661887512722, + "loss": 0.84160507, + "num_input_tokens_seen": 158051088, + "router_z_loss_mlp": 0.44116211, + "step": 1906, + "time_per_iteration": 3.046901226043701 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045363, + "balance_loss_mlp": 1.0011121, + "epoch": 0.3668718737976145, + "flos": 524607935232.0, + "grad_norm": 0.03388484398579531, + "language_loss": 0.82964659, + "learning_rate": 0.0007307898647825549, + "loss": 0.84010023, + "num_input_tokens_seen": 158124368, + "router_z_loss_mlp": 0.44335938, + "step": 1907, + "time_per_iteration": 2.6592161655426025 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051828, + "balance_loss_mlp": 1.00767255, + "epoch": 0.367064255482878, + "flos": 573046205952.0, + "grad_norm": 0.03554957537225944, + "language_loss": 0.8992576, + "learning_rate": 0.0007305134512132659, + "loss": 0.90977585, + "num_input_tokens_seen": 158191472, + "router_z_loss_mlp": 0.44238281, + "step": 1908, + "time_per_iteration": 2.6961183547973633 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055033, + "balance_loss_mlp": 1.01078284, + "epoch": 0.3672566371681416, + "flos": 448054473216.0, + "grad_norm": 0.04018581054394134, + "language_loss": 0.843858, + "learning_rate": 0.0007302369481507183, + "loss": 0.85440832, + "num_input_tokens_seen": 158254384, + "router_z_loss_mlp": 0.44335938, + "step": 1909, + "time_per_iteration": 2.488203763961792 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01056026, + "balance_loss_mlp": 1.01358795, + "epoch": 0.36744901885340514, + "flos": 1543366893312.0, + "grad_norm": 0.00771809390988723, + "language_loss": 0.79961759, + "learning_rate": 0.00072996035570226, + "loss": 0.81017786, + "num_input_tokens_seen": 158486160, + "router_z_loss_mlp": 0.42480469, + "step": 1910, + "time_per_iteration": 4.828088045120239 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059395, + "balance_loss_mlp": 1.01457202, + "epoch": 0.36764140053866873, + "flos": 564762221568.0, + "grad_norm": 0.032014471163266715, + "language_loss": 0.86287534, + "learning_rate": 0.000729683673975274, + "loss": 0.87346923, + "num_input_tokens_seen": 158555616, + "router_z_loss_mlp": 0.44824219, + "step": 1911, + "time_per_iteration": 2.6982359886169434 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01058317, + "balance_loss_mlp": 1.01366162, + "epoch": 0.36783378222393226, + "flos": 1218652614144.0, + "grad_norm": 0.03007186425733569, + "language_loss": 0.8357197, + "learning_rate": 0.0007294069030771774, + "loss": 0.84630299, + "num_input_tokens_seen": 158653984, + "router_z_loss_mlp": 0.44702148, + "step": 1912, + "time_per_iteration": 3.6612210273742676 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049037, + "balance_loss_mlp": 1.0043577, + "epoch": 0.36802616390919585, + "flos": 499720988928.0, + "grad_norm": 0.03131225250708543, + "language_loss": 0.91280997, + "learning_rate": 0.0007291300431154224, + "loss": 0.92330033, + "num_input_tokens_seen": 158719728, + "router_z_loss_mlp": 0.44726562, + "step": 1913, + "time_per_iteration": 2.574129581451416 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053715, + "balance_loss_mlp": 1.01108551, + "epoch": 0.36821854559445943, + "flos": 1585618904064.0, + "grad_norm": 0.006266309435424964, + "language_loss": 0.70389736, + "learning_rate": 0.0007288530941974955, + "loss": 0.7144345, + "num_input_tokens_seen": 158952544, + "router_z_loss_mlp": 0.42675781, + "step": 1914, + "time_per_iteration": 4.960723876953125 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052577, + "balance_loss_mlp": 1.0082792, + "epoch": 0.36841092727972297, + "flos": 837090668544.0, + "grad_norm": 0.03136779226227803, + "language_loss": 0.80375087, + "learning_rate": 0.0007285760564309179, + "loss": 0.81427664, + "num_input_tokens_seen": 159039680, + "router_z_loss_mlp": 0.44384766, + "step": 1915, + "time_per_iteration": 3.0985960960388184 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010541, + "balance_loss_mlp": 1.00965917, + "epoch": 0.36860330896498655, + "flos": 691211085312.0, + "grad_norm": 0.031502418433557444, + "language_loss": 0.85988045, + "learning_rate": 0.0007282989299232448, + "loss": 0.87042141, + "num_input_tokens_seen": 159128128, + "router_z_loss_mlp": 0.4453125, + "step": 1916, + "time_per_iteration": 3.034715175628662 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055093, + "balance_loss_mlp": 1.01065195, + "epoch": 0.3687956906502501, + "flos": 555240791808.0, + "grad_norm": 0.03953946470073971, + "language_loss": 0.84794021, + "learning_rate": 0.0007280217147820668, + "loss": 0.85849106, + "num_input_tokens_seen": 159193248, + "router_z_loss_mlp": 0.4453125, + "step": 1917, + "time_per_iteration": 2.61297869682312 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053797, + "balance_loss_mlp": 1.0093317, + "epoch": 0.3689880723355137, + "flos": 577820043264.0, + "grad_norm": 0.030128455165502346, + "language_loss": 0.7994225, + "learning_rate": 0.0007277444111150079, + "loss": 0.80996048, + "num_input_tokens_seen": 159265824, + "router_z_loss_mlp": 0.44555664, + "step": 1918, + "time_per_iteration": 2.7244873046875 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052755, + "balance_loss_mlp": 1.00845671, + "epoch": 0.3691804540207772, + "flos": 529887250944.0, + "grad_norm": 0.035938670194894204, + "language_loss": 0.84948546, + "learning_rate": 0.0007274670190297272, + "loss": 0.86001301, + "num_input_tokens_seen": 159332992, + "router_z_loss_mlp": 0.44384766, + "step": 1919, + "time_per_iteration": 2.6209609508514404 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048627, + "balance_loss_mlp": 1.0041858, + "epoch": 0.3693728357060408, + "flos": 562181372928.0, + "grad_norm": 0.026922320390231402, + "language_loss": 0.82273662, + "learning_rate": 0.0007271895386339179, + "loss": 0.83322287, + "num_input_tokens_seen": 159409808, + "router_z_loss_mlp": 0.4453125, + "step": 1920, + "time_per_iteration": 2.7952609062194824 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047195, + "balance_loss_mlp": 1.00292087, + "epoch": 0.3695652173913043, + "flos": 580900534272.0, + "grad_norm": 0.03055527362799568, + "language_loss": 0.83712995, + "learning_rate": 0.0007269119700353073, + "loss": 0.84760189, + "num_input_tokens_seen": 159486128, + "router_z_loss_mlp": 0.44360352, + "step": 1921, + "time_per_iteration": 2.808595895767212 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049468, + "balance_loss_mlp": 1.00519335, + "epoch": 0.3697575990765679, + "flos": 514059997440.0, + "grad_norm": 0.029192022992987326, + "language_loss": 0.85655916, + "learning_rate": 0.0007266343133416571, + "loss": 0.86705387, + "num_input_tokens_seen": 159562224, + "router_z_loss_mlp": 0.44360352, + "step": 1922, + "time_per_iteration": 2.7229409217834473 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045757, + "balance_loss_mlp": 1.00255585, + "epoch": 0.3699499807618315, + "flos": 1573906430976.0, + "grad_norm": 0.004633598174219594, + "language_loss": 0.77116919, + "learning_rate": 0.0007263565686607632, + "loss": 0.7816267, + "num_input_tokens_seen": 159784768, + "router_z_loss_mlp": 0.43261719, + "step": 1923, + "time_per_iteration": 4.855220556259155 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049661, + "balance_loss_mlp": 1.00526702, + "epoch": 0.37014236244709503, + "flos": 498325095936.0, + "grad_norm": 0.04063724538866958, + "language_loss": 0.84789312, + "learning_rate": 0.0007260787361004556, + "loss": 0.85838968, + "num_input_tokens_seen": 159848608, + "router_z_loss_mlp": 0.44482422, + "step": 1924, + "time_per_iteration": 2.5634405612945557 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063278, + "balance_loss_mlp": 1.01998138, + "epoch": 0.3703347441323586, + "flos": 1447608233472.0, + "grad_norm": 0.011285785538321925, + "language_loss": 0.73761505, + "learning_rate": 0.0007258008157685987, + "loss": 0.7482478, + "num_input_tokens_seen": 160080928, + "router_z_loss_mlp": 0.43359375, + "step": 1925, + "time_per_iteration": 4.881471157073975 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050604, + "balance_loss_mlp": 1.00601971, + "epoch": 0.37052712581762215, + "flos": 564714589440.0, + "grad_norm": 0.030700116077417884, + "language_loss": 0.87676865, + "learning_rate": 0.0007255228077730903, + "loss": 0.88727468, + "num_input_tokens_seen": 160148976, + "router_z_loss_mlp": 0.44628906, + "step": 1926, + "time_per_iteration": 2.6604056358337402 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048583, + "balance_loss_mlp": 1.00426066, + "epoch": 0.37071950750288574, + "flos": 927571958016.0, + "grad_norm": 0.030848240929213684, + "language_loss": 0.82266426, + "learning_rate": 0.0007252447122218632, + "loss": 0.83315009, + "num_input_tokens_seen": 160233504, + "router_z_loss_mlp": 0.4440918, + "step": 1927, + "time_per_iteration": 3.189232110977173 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048396, + "balance_loss_mlp": 1.00424135, + "epoch": 0.37091188918814927, + "flos": 419201478912.0, + "grad_norm": 0.038028798643346066, + "language_loss": 0.88517463, + "learning_rate": 0.0007249665292228834, + "loss": 0.89565861, + "num_input_tokens_seen": 160299696, + "router_z_loss_mlp": 0.44238281, + "step": 1928, + "time_per_iteration": 2.6051783561706543 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048517, + "balance_loss_mlp": 1.00443351, + "epoch": 0.37110427087341286, + "flos": 464147099136.0, + "grad_norm": 0.03246756835091633, + "language_loss": 0.8426615, + "learning_rate": 0.000724688258884151, + "loss": 0.85314661, + "num_input_tokens_seen": 160367904, + "router_z_loss_mlp": 0.44165039, + "step": 1929, + "time_per_iteration": 2.5537402629852295 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105112, + "balance_loss_mlp": 1.00703681, + "epoch": 0.3712966525586764, + "flos": 851081679360.0, + "grad_norm": 0.026814038228573516, + "language_loss": 0.86998665, + "learning_rate": 0.0007244099013137002, + "loss": 0.88049793, + "num_input_tokens_seen": 160453600, + "router_z_loss_mlp": 0.44165039, + "step": 1930, + "time_per_iteration": 3.091195821762085 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052812, + "balance_loss_mlp": 1.00901484, + "epoch": 0.37148903424394, + "flos": 927559319040.0, + "grad_norm": 0.03484228463474462, + "language_loss": 0.89224607, + "learning_rate": 0.0007241314566195993, + "loss": 0.90277416, + "num_input_tokens_seen": 160543472, + "router_z_loss_mlp": 0.4387207, + "step": 1931, + "time_per_iteration": 3.2276151180267334 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050081, + "balance_loss_mlp": 1.00616395, + "epoch": 0.37168141592920356, + "flos": 520821722112.0, + "grad_norm": 0.033577876196724185, + "language_loss": 0.86394525, + "learning_rate": 0.0007238529249099496, + "loss": 0.87444603, + "num_input_tokens_seen": 160614016, + "router_z_loss_mlp": 0.43994141, + "step": 1932, + "time_per_iteration": 2.6099538803100586 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043194, + "balance_loss_mlp": 1.00075531, + "epoch": 0.3718737976144671, + "flos": 1449062452224.0, + "grad_norm": 0.005805601038449312, + "language_loss": 0.77856874, + "learning_rate": 0.0007235743062928872, + "loss": 0.78900075, + "num_input_tokens_seen": 160828640, + "router_z_loss_mlp": 0.42480469, + "step": 1933, + "time_per_iteration": 4.864013910293579 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051383, + "balance_loss_mlp": 1.00741839, + "epoch": 0.3720661792997307, + "flos": 760954223616.0, + "grad_norm": 0.031651541573232696, + "language_loss": 0.81381935, + "learning_rate": 0.000723295600876581, + "loss": 0.82433319, + "num_input_tokens_seen": 160913088, + "router_z_loss_mlp": 0.44042969, + "step": 1934, + "time_per_iteration": 3.003988742828369 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047402, + "balance_loss_mlp": 1.00353265, + "epoch": 0.3722585609849942, + "flos": 518045487360.0, + "grad_norm": 0.031160015664157277, + "language_loss": 0.88386387, + "learning_rate": 0.0007230168087692344, + "loss": 0.89433783, + "num_input_tokens_seen": 160982960, + "router_z_loss_mlp": 0.43945312, + "step": 1935, + "time_per_iteration": 2.6490824222564697 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045452, + "balance_loss_mlp": 1.00165451, + "epoch": 0.3724509426702578, + "flos": 783869812224.0, + "grad_norm": 0.03743087194604022, + "language_loss": 0.82867873, + "learning_rate": 0.0007227379300790839, + "loss": 0.83913326, + "num_input_tokens_seen": 161066000, + "router_z_loss_mlp": 0.4387207, + "step": 1936, + "time_per_iteration": 3.010700225830078 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044084, + "balance_loss_mlp": 1.00011992, + "epoch": 0.37264332435552133, + "flos": 392599799040.0, + "grad_norm": 0.032423549870759565, + "language_loss": 0.86443603, + "learning_rate": 0.0007224589649143997, + "loss": 0.87487686, + "num_input_tokens_seen": 161131040, + "router_z_loss_mlp": 0.44042969, + "step": 1937, + "time_per_iteration": 2.54010272026062 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044521, + "balance_loss_mlp": 1.00072384, + "epoch": 0.3728357060407849, + "flos": 543913254912.0, + "grad_norm": 0.03387233199209411, + "language_loss": 0.81436574, + "learning_rate": 0.0007221799133834861, + "loss": 0.82481098, + "num_input_tokens_seen": 161201248, + "router_z_loss_mlp": 0.4387207, + "step": 1938, + "time_per_iteration": 2.6355655193328857 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045317, + "balance_loss_mlp": 1.00154293, + "epoch": 0.3730280877260485, + "flos": 434484370176.0, + "grad_norm": 0.03416430777388856, + "language_loss": 0.82122993, + "learning_rate": 0.00072190077559468, + "loss": 0.83168304, + "num_input_tokens_seen": 161266288, + "router_z_loss_mlp": 0.43847656, + "step": 1939, + "time_per_iteration": 2.5033867359161377 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049643, + "balance_loss_mlp": 1.00579786, + "epoch": 0.37322046941131204, + "flos": 532511841024.0, + "grad_norm": 0.031902006564455146, + "language_loss": 0.89473069, + "learning_rate": 0.0007216215516563527, + "loss": 0.90522707, + "num_input_tokens_seen": 161335648, + "router_z_loss_mlp": 0.43920898, + "step": 1940, + "time_per_iteration": 2.685201406478882 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049207, + "balance_loss_mlp": 1.00538588, + "epoch": 0.3734128510965756, + "flos": 532576969728.0, + "grad_norm": 0.03682978505173481, + "language_loss": 0.83770883, + "learning_rate": 0.0007213422416769083, + "loss": 0.84820092, + "num_input_tokens_seen": 161403440, + "router_z_loss_mlp": 0.43896484, + "step": 1941, + "time_per_iteration": 2.5981826782226562 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104851, + "balance_loss_mlp": 1.00454593, + "epoch": 0.37360523278183916, + "flos": 501433777152.0, + "grad_norm": 0.029644951468961563, + "language_loss": 0.75750655, + "learning_rate": 0.0007210628457647849, + "loss": 0.76799166, + "num_input_tokens_seen": 161472864, + "router_z_loss_mlp": 0.44042969, + "step": 1942, + "time_per_iteration": 2.5780391693115234 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047618, + "balance_loss_mlp": 1.00365365, + "epoch": 0.37379761446710275, + "flos": 549112857600.0, + "grad_norm": 0.03283775645447924, + "language_loss": 0.79155779, + "learning_rate": 0.000720783364028453, + "loss": 0.80203396, + "num_input_tokens_seen": 161548096, + "router_z_loss_mlp": 0.44042969, + "step": 1943, + "time_per_iteration": 2.7498555183410645 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052612, + "balance_loss_mlp": 1.0085758, + "epoch": 0.3739899961523663, + "flos": 476740271616.0, + "grad_norm": 0.03229344723146533, + "language_loss": 0.88345349, + "learning_rate": 0.0007205037965764177, + "loss": 0.89397967, + "num_input_tokens_seen": 161615600, + "router_z_loss_mlp": 0.44116211, + "step": 1944, + "time_per_iteration": 2.559565305709839 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049461, + "balance_loss_mlp": 1.00533009, + "epoch": 0.37418237783762986, + "flos": 613077037824.0, + "grad_norm": 0.033726561022773015, + "language_loss": 0.85856438, + "learning_rate": 0.0007202241435172161, + "loss": 0.86905897, + "num_input_tokens_seen": 161687408, + "router_z_loss_mlp": 0.44213867, + "step": 1945, + "time_per_iteration": 2.7495012283325195 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105036, + "balance_loss_mlp": 1.00618136, + "epoch": 0.3743747595228934, + "flos": 767629432320.0, + "grad_norm": 0.030482282234963888, + "language_loss": 0.88839138, + "learning_rate": 0.0007199444049594198, + "loss": 0.89889503, + "num_input_tokens_seen": 161764224, + "router_z_loss_mlp": 0.44262695, + "step": 1946, + "time_per_iteration": 2.927438259124756 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105097, + "balance_loss_mlp": 1.00679135, + "epoch": 0.374567141208157, + "flos": 525491546880.0, + "grad_norm": 0.03274984488565387, + "language_loss": 0.84098482, + "learning_rate": 0.0007196645810116322, + "loss": 0.85149455, + "num_input_tokens_seen": 161835520, + "router_z_loss_mlp": 0.44262695, + "step": 1947, + "time_per_iteration": 2.669954538345337 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051405, + "balance_loss_mlp": 1.00717854, + "epoch": 0.37475952289342057, + "flos": 682614096384.0, + "grad_norm": 0.03500222096290466, + "language_loss": 0.84308642, + "learning_rate": 0.0007193846717824912, + "loss": 0.85360044, + "num_input_tokens_seen": 161912000, + "router_z_loss_mlp": 0.44311523, + "step": 1948, + "time_per_iteration": 2.873595714569092 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054386, + "balance_loss_mlp": 1.01018322, + "epoch": 0.3749519045786841, + "flos": 461216307456.0, + "grad_norm": 0.03758393676626501, + "language_loss": 0.89286113, + "learning_rate": 0.0007191046773806669, + "loss": 0.90340507, + "num_input_tokens_seen": 161977296, + "router_z_loss_mlp": 0.44287109, + "step": 1949, + "time_per_iteration": 2.5632805824279785 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052491, + "balance_loss_mlp": 1.00816894, + "epoch": 0.3751442862639477, + "flos": 956388013824.0, + "grad_norm": 0.04355990755149793, + "language_loss": 0.83803475, + "learning_rate": 0.0007188245979148631, + "loss": 0.84855968, + "num_input_tokens_seen": 162051888, + "router_z_loss_mlp": 0.4440918, + "step": 1950, + "time_per_iteration": 3.153048515319824 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050564, + "balance_loss_mlp": 1.00619411, + "epoch": 0.3753366679492112, + "flos": 528806307840.0, + "grad_norm": 0.034134677221205334, + "language_loss": 0.88437903, + "learning_rate": 0.0007185444334938157, + "loss": 0.89488459, + "num_input_tokens_seen": 162124384, + "router_z_loss_mlp": 0.44458008, + "step": 1951, + "time_per_iteration": 2.77795147895813 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052982, + "balance_loss_mlp": 1.0084213, + "epoch": 0.3755290496344748, + "flos": 522849460224.0, + "grad_norm": 0.03641649118573359, + "language_loss": 0.85489821, + "learning_rate": 0.0007182641842262947, + "loss": 0.86542803, + "num_input_tokens_seen": 162191440, + "router_z_loss_mlp": 0.44628906, + "step": 1952, + "time_per_iteration": 2.6038033962249756 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063178, + "balance_loss_mlp": 1.01852179, + "epoch": 0.37572143131973834, + "flos": 622372945920.0, + "grad_norm": 0.036303705105214745, + "language_loss": 0.78406018, + "learning_rate": 0.0007179838502211022, + "loss": 0.79469192, + "num_input_tokens_seen": 162268480, + "router_z_loss_mlp": 0.44702148, + "step": 1953, + "time_per_iteration": 2.8537991046905518 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050235, + "balance_loss_mlp": 1.00565112, + "epoch": 0.37591381300500193, + "flos": 772274957568.0, + "grad_norm": 0.033405608161133214, + "language_loss": 0.87193865, + "learning_rate": 0.0007177034315870738, + "loss": 0.88244104, + "num_input_tokens_seen": 162346752, + "router_z_loss_mlp": 0.44677734, + "step": 1954, + "time_per_iteration": 2.9944725036621094 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049324, + "balance_loss_mlp": 1.00469208, + "epoch": 0.37610619469026546, + "flos": 521481757440.0, + "grad_norm": 0.05036646851246907, + "language_loss": 0.91552407, + "learning_rate": 0.0007174229284330773, + "loss": 0.92601728, + "num_input_tokens_seen": 162415120, + "router_z_loss_mlp": 0.44702148, + "step": 1955, + "time_per_iteration": 2.607128143310547 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046853, + "balance_loss_mlp": 1.0023644, + "epoch": 0.37629857637552905, + "flos": 599971584000.0, + "grad_norm": 0.029911324472659546, + "language_loss": 0.87468076, + "learning_rate": 0.0007171423408680141, + "loss": 0.88514924, + "num_input_tokens_seen": 162493280, + "router_z_loss_mlp": 0.44555664, + "step": 1956, + "time_per_iteration": 2.8234241008758545 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047093, + "balance_loss_mlp": 1.00272334, + "epoch": 0.37649095806079264, + "flos": 566019108864.0, + "grad_norm": 0.03303955535560464, + "language_loss": 0.90624022, + "learning_rate": 0.0007168616690008176, + "loss": 0.91671115, + "num_input_tokens_seen": 162560736, + "router_z_loss_mlp": 0.44458008, + "step": 1957, + "time_per_iteration": 2.645219326019287 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047066, + "balance_loss_mlp": 1.00271976, + "epoch": 0.37668333974605617, + "flos": 593569529088.0, + "grad_norm": 0.03512927569377508, + "language_loss": 0.86650079, + "learning_rate": 0.0007165809129404545, + "loss": 0.87697142, + "num_input_tokens_seen": 162630688, + "router_z_loss_mlp": 0.44433594, + "step": 1958, + "time_per_iteration": 2.762319564819336 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105165, + "balance_loss_mlp": 1.00742376, + "epoch": 0.37687572143131975, + "flos": 420365047296.0, + "grad_norm": 0.03381206580119959, + "language_loss": 0.8673501, + "learning_rate": 0.0007163000727959239, + "loss": 0.87786663, + "num_input_tokens_seen": 162694304, + "router_z_loss_mlp": 0.44311523, + "step": 1959, + "time_per_iteration": 2.4887454509735107 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047108, + "balance_loss_mlp": 1.00466919, + "epoch": 0.3770681031165833, + "flos": 1360387269888.0, + "grad_norm": 0.007286715675134549, + "language_loss": 0.77959073, + "learning_rate": 0.0007160191486762575, + "loss": 0.79006183, + "num_input_tokens_seen": 162920336, + "router_z_loss_mlp": 0.42480469, + "step": 1960, + "time_per_iteration": 4.844388961791992 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053466, + "balance_loss_mlp": 1.00938201, + "epoch": 0.3772604848018469, + "flos": 646154649600.0, + "grad_norm": 0.030030705089392724, + "language_loss": 0.85244703, + "learning_rate": 0.00071573814069052, + "loss": 0.86298174, + "num_input_tokens_seen": 163000720, + "router_z_loss_mlp": 0.44165039, + "step": 1961, + "time_per_iteration": 2.93870210647583 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043853, + "balance_loss_mlp": 0.99976981, + "epoch": 0.3774528664871104, + "flos": 903202150656.0, + "grad_norm": 0.029467737659617427, + "language_loss": 0.88618672, + "learning_rate": 0.0007154570489478081, + "loss": 0.89662528, + "num_input_tokens_seen": 163085680, + "router_z_loss_mlp": 0.44165039, + "step": 1962, + "time_per_iteration": 3.2101829051971436 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046572, + "balance_loss_mlp": 1.00241697, + "epoch": 0.377645248172374, + "flos": 789464077824.0, + "grad_norm": 0.02894999631439154, + "language_loss": 0.87102842, + "learning_rate": 0.0007151758735572514, + "loss": 0.88149416, + "num_input_tokens_seen": 163162224, + "router_z_loss_mlp": 0.44238281, + "step": 1963, + "time_per_iteration": 3.0217864513397217 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046842, + "balance_loss_mlp": 1.00282979, + "epoch": 0.3778376298576376, + "flos": 587925686016.0, + "grad_norm": 0.035422959183698866, + "language_loss": 0.81287247, + "learning_rate": 0.0007148946146280119, + "loss": 0.82334089, + "num_input_tokens_seen": 163237920, + "router_z_loss_mlp": 0.44091797, + "step": 1964, + "time_per_iteration": 2.9066553115844727 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01056515, + "balance_loss_mlp": 1.01407623, + "epoch": 0.3780300115429011, + "flos": 1399672528896.0, + "grad_norm": 0.012885740561533653, + "language_loss": 0.72192144, + "learning_rate": 0.000714613272269284, + "loss": 0.73248661, + "num_input_tokens_seen": 163455760, + "router_z_loss_mlp": 0.42480469, + "step": 1965, + "time_per_iteration": 4.874085426330566 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055702, + "balance_loss_mlp": 1.01316833, + "epoch": 0.3782223932281647, + "flos": 1360634178816.0, + "grad_norm": 0.008484298942656315, + "language_loss": 0.75341946, + "learning_rate": 0.0007143318465902943, + "loss": 0.76397645, + "num_input_tokens_seen": 163678064, + "router_z_loss_mlp": 0.42578125, + "step": 1966, + "time_per_iteration": 4.964066743850708 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048666, + "balance_loss_mlp": 1.00467777, + "epoch": 0.37841477491342823, + "flos": 705517046016.0, + "grad_norm": 0.02737284959483133, + "language_loss": 0.8436377, + "learning_rate": 0.0007140503377003022, + "loss": 0.85412437, + "num_input_tokens_seen": 163764320, + "router_z_loss_mlp": 0.44067383, + "step": 1967, + "time_per_iteration": 3.014033555984497 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105175, + "balance_loss_mlp": 1.00764298, + "epoch": 0.3786071565986918, + "flos": 530156514048.0, + "grad_norm": 0.03014770490429956, + "language_loss": 0.85294402, + "learning_rate": 0.000713768745708599, + "loss": 0.86346149, + "num_input_tokens_seen": 163831808, + "router_z_loss_mlp": 0.44189453, + "step": 1968, + "time_per_iteration": 2.6359875202178955 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052555, + "balance_loss_mlp": 1.0084002, + "epoch": 0.37879953828395535, + "flos": 994901443584.0, + "grad_norm": 0.03323886334735767, + "language_loss": 0.78270096, + "learning_rate": 0.0007134870707245085, + "loss": 0.79322648, + "num_input_tokens_seen": 163918128, + "router_z_loss_mlp": 0.44238281, + "step": 1969, + "time_per_iteration": 3.276670455932617 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054281, + "balance_loss_mlp": 1.01010239, + "epoch": 0.37899191996921894, + "flos": 627793212672.0, + "grad_norm": 0.033324026165203316, + "language_loss": 0.84867144, + "learning_rate": 0.0007132053128573864, + "loss": 0.85921425, + "num_input_tokens_seen": 163987552, + "router_z_loss_mlp": 0.44262695, + "step": 1970, + "time_per_iteration": 2.747647523880005 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051473, + "balance_loss_mlp": 1.00727034, + "epoch": 0.37918430165448247, + "flos": 687520136448.0, + "grad_norm": 0.034311044198206936, + "language_loss": 0.84702653, + "learning_rate": 0.0007129234722166211, + "loss": 0.85754126, + "num_input_tokens_seen": 164063248, + "router_z_loss_mlp": 0.44287109, + "step": 1971, + "time_per_iteration": 2.8502755165100098 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104953, + "balance_loss_mlp": 1.00535131, + "epoch": 0.37937668333974606, + "flos": 476618762496.0, + "grad_norm": 0.028798969169212138, + "language_loss": 0.91637433, + "learning_rate": 0.0007126415489116328, + "loss": 0.92686969, + "num_input_tokens_seen": 164133776, + "router_z_loss_mlp": 0.44262695, + "step": 1972, + "time_per_iteration": 2.703598737716675 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049775, + "balance_loss_mlp": 1.00559556, + "epoch": 0.37956906502500964, + "flos": 708825004032.0, + "grad_norm": 0.033945121596029554, + "language_loss": 0.81780016, + "learning_rate": 0.0007123595430518736, + "loss": 0.82829797, + "num_input_tokens_seen": 164206672, + "router_z_loss_mlp": 0.44262695, + "step": 1973, + "time_per_iteration": 2.859210252761841 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047681, + "balance_loss_mlp": 1.00345445, + "epoch": 0.3797614467102732, + "flos": 427559340288.0, + "grad_norm": 0.03504063937858188, + "language_loss": 0.86830699, + "learning_rate": 0.0007120774547468282, + "loss": 0.87878382, + "num_input_tokens_seen": 164271968, + "router_z_loss_mlp": 0.44311523, + "step": 1974, + "time_per_iteration": 2.5465054512023926 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105377, + "balance_loss_mlp": 1.00944817, + "epoch": 0.37995382839553676, + "flos": 482881811712.0, + "grad_norm": 0.031503790568027705, + "language_loss": 0.82317638, + "learning_rate": 0.0007117952841060128, + "loss": 0.83371413, + "num_input_tokens_seen": 164342800, + "router_z_loss_mlp": 0.4440918, + "step": 1975, + "time_per_iteration": 2.789965867996216 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053612, + "balance_loss_mlp": 1.00924242, + "epoch": 0.3801462100808003, + "flos": 561671036928.0, + "grad_norm": 0.03572346778222672, + "language_loss": 0.84539783, + "learning_rate": 0.0007115130312389756, + "loss": 0.85593396, + "num_input_tokens_seen": 164414928, + "router_z_loss_mlp": 0.44433594, + "step": 1976, + "time_per_iteration": 2.7104804515838623 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046782, + "balance_loss_mlp": 1.00236499, + "epoch": 0.3803385917660639, + "flos": 465888077568.0, + "grad_norm": 0.03508123942848817, + "language_loss": 0.80071044, + "learning_rate": 0.0007112306962552973, + "loss": 0.81117821, + "num_input_tokens_seen": 164483312, + "router_z_loss_mlp": 0.44506836, + "step": 1977, + "time_per_iteration": 2.644700527191162 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053171, + "balance_loss_mlp": 1.00863445, + "epoch": 0.3805309734513274, + "flos": 522905840640.0, + "grad_norm": 0.0297417361696937, + "language_loss": 0.8625899, + "learning_rate": 0.0007109482792645896, + "loss": 0.87312162, + "num_input_tokens_seen": 164555760, + "router_z_loss_mlp": 0.44580078, + "step": 1978, + "time_per_iteration": 2.736924171447754 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052388, + "balance_loss_mlp": 1.00780404, + "epoch": 0.380723355136591, + "flos": 592553714688.0, + "grad_norm": 0.03207088172149068, + "language_loss": 0.84620887, + "learning_rate": 0.0007106657803764969, + "loss": 0.85673285, + "num_input_tokens_seen": 164626768, + "router_z_loss_mlp": 0.44628906, + "step": 1979, + "time_per_iteration": 2.797027111053467 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053174, + "balance_loss_mlp": 1.00851822, + "epoch": 0.38091573682185453, + "flos": 623855354880.0, + "grad_norm": 0.034228405400289826, + "language_loss": 0.82734859, + "learning_rate": 0.0007103831997006948, + "loss": 0.83788031, + "num_input_tokens_seen": 164698016, + "router_z_loss_mlp": 0.4465332, + "step": 1980, + "time_per_iteration": 2.774831771850586 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050596, + "balance_loss_mlp": 1.00601208, + "epoch": 0.3811081185071181, + "flos": 570176652288.0, + "grad_norm": 0.02916230611543443, + "language_loss": 0.85986841, + "learning_rate": 0.0007101005373468908, + "loss": 0.87037432, + "num_input_tokens_seen": 164780320, + "router_z_loss_mlp": 0.4465332, + "step": 1981, + "time_per_iteration": 2.889430284500122 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051202, + "balance_loss_mlp": 1.00647449, + "epoch": 0.3813005001923817, + "flos": 585991266816.0, + "grad_norm": 0.029260882769569122, + "language_loss": 0.87282979, + "learning_rate": 0.0007098177934248242, + "loss": 0.88334191, + "num_input_tokens_seen": 164854400, + "router_z_loss_mlp": 0.44726562, + "step": 1982, + "time_per_iteration": 2.734011173248291 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049702, + "balance_loss_mlp": 1.00509369, + "epoch": 0.38149288187764524, + "flos": 622811350272.0, + "grad_norm": 0.03279838714755621, + "language_loss": 0.86164075, + "learning_rate": 0.0007095349680442661, + "loss": 0.87213778, + "num_input_tokens_seen": 164932896, + "router_z_loss_mlp": 0.44677734, + "step": 1983, + "time_per_iteration": 2.8532214164733887 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049432, + "balance_loss_mlp": 1.00496709, + "epoch": 0.3816852635629088, + "flos": 571798066944.0, + "grad_norm": 0.03407469020321441, + "language_loss": 0.79342288, + "learning_rate": 0.0007092520613150188, + "loss": 0.80391723, + "num_input_tokens_seen": 165002896, + "router_z_loss_mlp": 0.4453125, + "step": 1984, + "time_per_iteration": 2.6656527519226074 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055001, + "balance_loss_mlp": 1.01058352, + "epoch": 0.38187764524817236, + "flos": 566679144192.0, + "grad_norm": 0.03287674379309895, + "language_loss": 0.81891948, + "learning_rate": 0.0007089690733469165, + "loss": 0.82946956, + "num_input_tokens_seen": 165074704, + "router_z_loss_mlp": 0.44506836, + "step": 1985, + "time_per_iteration": 2.6921868324279785 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104986, + "balance_loss_mlp": 1.00544298, + "epoch": 0.38207002693343595, + "flos": 632399854080.0, + "grad_norm": 0.03591516825864857, + "language_loss": 0.8265506, + "learning_rate": 0.000708686004249825, + "loss": 0.83704919, + "num_input_tokens_seen": 165149136, + "router_z_loss_mlp": 0.44506836, + "step": 1986, + "time_per_iteration": 2.771472454071045 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046713, + "balance_loss_mlp": 1.0026772, + "epoch": 0.3822624086186995, + "flos": 549841912320.0, + "grad_norm": 0.027805852633017242, + "language_loss": 0.91746366, + "learning_rate": 0.0007084028541336413, + "loss": 0.92793083, + "num_input_tokens_seen": 165220864, + "router_z_loss_mlp": 0.44116211, + "step": 1987, + "time_per_iteration": 2.7168381214141846 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049881, + "balance_loss_mlp": 1.00572634, + "epoch": 0.38245479030396307, + "flos": 615067837440.0, + "grad_norm": 0.03052630202850825, + "language_loss": 0.86906445, + "learning_rate": 0.0007081196231082942, + "loss": 0.87956333, + "num_input_tokens_seen": 165301568, + "router_z_loss_mlp": 0.44238281, + "step": 1988, + "time_per_iteration": 2.8021280765533447 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104727, + "balance_loss_mlp": 1.00325835, + "epoch": 0.38264717198922665, + "flos": 669304508160.0, + "grad_norm": 0.03253134732635267, + "language_loss": 0.8090933, + "learning_rate": 0.0007078363112837436, + "loss": 0.81956601, + "num_input_tokens_seen": 165373152, + "router_z_loss_mlp": 0.44091797, + "step": 1989, + "time_per_iteration": 2.812901020050049 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046216, + "balance_loss_mlp": 1.00232375, + "epoch": 0.3828395536744902, + "flos": 455687170560.0, + "grad_norm": 0.03353740504071411, + "language_loss": 0.8610149, + "learning_rate": 0.000707552918769981, + "loss": 0.87147707, + "num_input_tokens_seen": 165439136, + "router_z_loss_mlp": 0.43969727, + "step": 1990, + "time_per_iteration": 2.503817081451416 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047528, + "balance_loss_mlp": 1.0038017, + "epoch": 0.3830319353597538, + "flos": 500483091456.0, + "grad_norm": 0.030831133245435974, + "language_loss": 0.84298265, + "learning_rate": 0.000707269445677029, + "loss": 0.85345787, + "num_input_tokens_seen": 165514624, + "router_z_loss_mlp": 0.43798828, + "step": 1991, + "time_per_iteration": 2.77250599861145 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047533, + "balance_loss_mlp": 1.00373507, + "epoch": 0.3832243170450173, + "flos": 745467197952.0, + "grad_norm": 0.03142895241328533, + "language_loss": 0.85860848, + "learning_rate": 0.0007069858921149416, + "loss": 0.86908376, + "num_input_tokens_seen": 165594512, + "router_z_loss_mlp": 0.4387207, + "step": 1992, + "time_per_iteration": 3.001058578491211 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047239, + "balance_loss_mlp": 1.00363255, + "epoch": 0.3834166987302809, + "flos": 579346193664.0, + "grad_norm": 0.027707623231004064, + "language_loss": 0.86360574, + "learning_rate": 0.0007067022581938043, + "loss": 0.87407815, + "num_input_tokens_seen": 165673968, + "router_z_loss_mlp": 0.43676758, + "step": 1993, + "time_per_iteration": 2.896017551422119 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049302, + "balance_loss_mlp": 1.00579047, + "epoch": 0.3836090804155444, + "flos": 537609376512.0, + "grad_norm": 0.038344647976828676, + "language_loss": 0.83944476, + "learning_rate": 0.0007064185440237334, + "loss": 0.8499378, + "num_input_tokens_seen": 165747664, + "router_z_loss_mlp": 0.43579102, + "step": 1994, + "time_per_iteration": 2.8133461475372314 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051736, + "balance_loss_mlp": 1.00820076, + "epoch": 0.383801462100808, + "flos": 603052075008.0, + "grad_norm": 0.0304270283066245, + "language_loss": 0.85033917, + "learning_rate": 0.0007061347497148764, + "loss": 0.86085653, + "num_input_tokens_seen": 165824624, + "router_z_loss_mlp": 0.43603516, + "step": 1995, + "time_per_iteration": 2.829977035522461 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050646, + "balance_loss_mlp": 1.00694358, + "epoch": 0.38399384378607154, + "flos": 573799560192.0, + "grad_norm": 0.034646706108572276, + "language_loss": 0.86866224, + "learning_rate": 0.0007058508753774122, + "loss": 0.87916863, + "num_input_tokens_seen": 165896304, + "router_z_loss_mlp": 0.43774414, + "step": 1996, + "time_per_iteration": 2.684966564178467 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049901, + "balance_loss_mlp": 1.00629473, + "epoch": 0.38418622547133513, + "flos": 537780463104.0, + "grad_norm": 0.03333459391135046, + "language_loss": 0.87270373, + "learning_rate": 0.0007055669211215505, + "loss": 0.88320273, + "num_input_tokens_seen": 165961312, + "router_z_loss_mlp": 0.43676758, + "step": 1997, + "time_per_iteration": 2.623508930206299 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054497, + "balance_loss_mlp": 1.01079535, + "epoch": 0.3843786071565987, + "flos": 574014388224.0, + "grad_norm": 0.04127067736406929, + "language_loss": 0.78599155, + "learning_rate": 0.0007052828870575322, + "loss": 0.79653656, + "num_input_tokens_seen": 166028064, + "router_z_loss_mlp": 0.43774414, + "step": 1998, + "time_per_iteration": 2.644423723220825 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051343, + "balance_loss_mlp": 1.00761676, + "epoch": 0.38457098884186225, + "flos": 730080294144.0, + "grad_norm": 0.03146347648703673, + "language_loss": 0.87266672, + "learning_rate": 0.0007049987732956291, + "loss": 0.88318008, + "num_input_tokens_seen": 166110272, + "router_z_loss_mlp": 0.43798828, + "step": 1999, + "time_per_iteration": 2.963409185409546 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048157, + "balance_loss_mlp": 1.00447905, + "epoch": 0.38476337052712584, + "flos": 584621618688.0, + "grad_norm": 0.024706606255084192, + "language_loss": 0.83278054, + "learning_rate": 0.0007047145799461439, + "loss": 0.84326208, + "num_input_tokens_seen": 166193088, + "router_z_loss_mlp": 0.4375, + "step": 2000, + "time_per_iteration": 2.86661434173584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048325, + "balance_loss_mlp": 1.00459874, + "epoch": 0.38495575221238937, + "flos": 554159848704.0, + "grad_norm": 0.03147773281119346, + "language_loss": 0.83074015, + "learning_rate": 0.00070443030711941, + "loss": 0.84122348, + "num_input_tokens_seen": 166271776, + "router_z_loss_mlp": 0.43798828, + "step": 2001, + "time_per_iteration": 2.778719425201416 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045384, + "balance_loss_mlp": 1.00175321, + "epoch": 0.38514813389765296, + "flos": 655678024704.0, + "grad_norm": 0.03168685191580143, + "language_loss": 0.82975376, + "learning_rate": 0.0007041459549257924, + "loss": 0.84020758, + "num_input_tokens_seen": 166350000, + "router_z_loss_mlp": 0.43701172, + "step": 2002, + "time_per_iteration": 2.8597054481506348 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046243, + "balance_loss_mlp": 1.00261223, + "epoch": 0.3853405155829165, + "flos": 869647250688.0, + "grad_norm": 0.03552713767777679, + "language_loss": 0.78954732, + "learning_rate": 0.0007038615234756859, + "loss": 0.80000973, + "num_input_tokens_seen": 166434336, + "router_z_loss_mlp": 0.43701172, + "step": 2003, + "time_per_iteration": 3.167647123336792 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050486, + "balance_loss_mlp": 1.00697505, + "epoch": 0.3855328972681801, + "flos": 547469088768.0, + "grad_norm": 0.03596547507231522, + "language_loss": 0.84374714, + "learning_rate": 0.000703577012879517, + "loss": 0.85425198, + "num_input_tokens_seen": 166503952, + "router_z_loss_mlp": 0.43579102, + "step": 2004, + "time_per_iteration": 2.644718885421753 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047488, + "balance_loss_mlp": 1.00397706, + "epoch": 0.3857252789534436, + "flos": 535099492608.0, + "grad_norm": 0.03525407945169758, + "language_loss": 0.89214581, + "learning_rate": 0.0007032924232477423, + "loss": 0.90262067, + "num_input_tokens_seen": 166575168, + "router_z_loss_mlp": 0.43579102, + "step": 2005, + "time_per_iteration": 2.6340301036834717 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053847, + "balance_loss_mlp": 1.01023984, + "epoch": 0.3859176606387072, + "flos": 492767768832.0, + "grad_norm": 0.0325086763316175, + "language_loss": 0.80829036, + "learning_rate": 0.0007030077546908493, + "loss": 0.81882888, + "num_input_tokens_seen": 166647552, + "router_z_loss_mlp": 0.43676758, + "step": 2006, + "time_per_iteration": 2.6427574157714844 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051659, + "balance_loss_mlp": 1.00969696, + "epoch": 0.3861100423239708, + "flos": 1490158675968.0, + "grad_norm": 0.006099468603868092, + "language_loss": 0.83064663, + "learning_rate": 0.0007027230073193561, + "loss": 0.84116316, + "num_input_tokens_seen": 166875088, + "router_z_loss_mlp": 0.41992188, + "step": 2007, + "time_per_iteration": 4.792185068130493 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047511, + "balance_loss_mlp": 1.00383234, + "epoch": 0.3863024240092343, + "flos": 474693091584.0, + "grad_norm": 0.0379943815396184, + "language_loss": 0.79703128, + "learning_rate": 0.0007024381812438117, + "loss": 0.80750644, + "num_input_tokens_seen": 166939344, + "router_z_loss_mlp": 0.4375, + "step": 2008, + "time_per_iteration": 2.6320388317108154 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01058901, + "balance_loss_mlp": 1.0153178, + "epoch": 0.3864948056944979, + "flos": 717979961088.0, + "grad_norm": 0.04179543058298576, + "language_loss": 0.84345418, + "learning_rate": 0.0007021532765747951, + "loss": 0.85404319, + "num_input_tokens_seen": 167014992, + "router_z_loss_mlp": 0.43652344, + "step": 2009, + "time_per_iteration": 3.0408942699432373 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01057193, + "balance_loss_mlp": 1.01370513, + "epoch": 0.38668718737976143, + "flos": 728955609600.0, + "grad_norm": 0.033678441310908816, + "language_loss": 0.80296206, + "learning_rate": 0.0007018682934229162, + "loss": 0.81353402, + "num_input_tokens_seen": 167092096, + "router_z_loss_mlp": 0.43554688, + "step": 2010, + "time_per_iteration": 2.9119958877563477 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053718, + "balance_loss_mlp": 1.01025474, + "epoch": 0.386879569065025, + "flos": 526489864704.0, + "grad_norm": 0.031759350944825356, + "language_loss": 0.83489478, + "learning_rate": 0.0007015832318988152, + "loss": 0.84543192, + "num_input_tokens_seen": 167162144, + "router_z_loss_mlp": 0.43530273, + "step": 2011, + "time_per_iteration": 2.625828981399536 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048202, + "balance_loss_mlp": 1.00643158, + "epoch": 0.38707195075028855, + "flos": 1530727067136.0, + "grad_norm": 0.008010138125144308, + "language_loss": 0.73890078, + "learning_rate": 0.000701298092113163, + "loss": 0.74938273, + "num_input_tokens_seen": 167391536, + "router_z_loss_mlp": 0.41796875, + "step": 2012, + "time_per_iteration": 4.969848155975342 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049038, + "balance_loss_mlp": 1.00555038, + "epoch": 0.38726433243555214, + "flos": 558386411520.0, + "grad_norm": 0.029387859415775444, + "language_loss": 0.84841448, + "learning_rate": 0.0007010128741766604, + "loss": 0.85890484, + "num_input_tokens_seen": 167466000, + "router_z_loss_mlp": 0.43554688, + "step": 2013, + "time_per_iteration": 2.808583974838257 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045643, + "balance_loss_mlp": 1.00205982, + "epoch": 0.38745671412081567, + "flos": 554756700672.0, + "grad_norm": 0.037665143906504196, + "language_loss": 0.84820414, + "learning_rate": 0.0007007275782000391, + "loss": 0.85866058, + "num_input_tokens_seen": 167536144, + "router_z_loss_mlp": 0.43652344, + "step": 2014, + "time_per_iteration": 2.6201975345611572 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051317, + "balance_loss_mlp": 1.00775766, + "epoch": 0.38764909580607926, + "flos": 459345071616.0, + "grad_norm": 0.03590133597746071, + "language_loss": 0.85486585, + "learning_rate": 0.0007004422042940605, + "loss": 0.86537898, + "num_input_tokens_seen": 167600064, + "router_z_loss_mlp": 0.4362793, + "step": 2015, + "time_per_iteration": 2.5167059898376465 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051464, + "balance_loss_mlp": 1.00792837, + "epoch": 0.38784147749134285, + "flos": 523259674368.0, + "grad_norm": 0.036833384765870066, + "language_loss": 0.90223992, + "learning_rate": 0.0007001567525695169, + "loss": 0.9127546, + "num_input_tokens_seen": 167666576, + "router_z_loss_mlp": 0.43603516, + "step": 2016, + "time_per_iteration": 2.663416624069214 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042557, + "balance_loss_mlp": 0.99923599, + "epoch": 0.3880338591766064, + "flos": 667401191424.0, + "grad_norm": 0.027528515382714943, + "language_loss": 0.84397906, + "learning_rate": 0.0006998712231372303, + "loss": 0.85440457, + "num_input_tokens_seen": 167753296, + "router_z_loss_mlp": 0.43383789, + "step": 2017, + "time_per_iteration": 2.982222080230713 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047147, + "balance_loss_mlp": 1.00389743, + "epoch": 0.38822624086186996, + "flos": 595176359424.0, + "grad_norm": 0.028816590459513517, + "language_loss": 0.86776507, + "learning_rate": 0.0006995856161080532, + "loss": 0.87823659, + "num_input_tokens_seen": 167834080, + "router_z_loss_mlp": 0.43310547, + "step": 2018, + "time_per_iteration": 2.8449933528900146 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046566, + "balance_loss_mlp": 1.00300694, + "epoch": 0.3884186225471335, + "flos": 613682638080.0, + "grad_norm": 0.032032500930829794, + "language_loss": 0.82425624, + "learning_rate": 0.0006992999315928679, + "loss": 0.83472192, + "num_input_tokens_seen": 167912368, + "router_z_loss_mlp": 0.4362793, + "step": 2019, + "time_per_iteration": 2.803743362426758 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104734, + "balance_loss_mlp": 1.00401926, + "epoch": 0.3886110042323971, + "flos": 608244874752.0, + "grad_norm": 0.027721707471257077, + "language_loss": 0.86241317, + "learning_rate": 0.0006990141697025871, + "loss": 0.87288654, + "num_input_tokens_seen": 167991968, + "router_z_loss_mlp": 0.43383789, + "step": 2020, + "time_per_iteration": 2.7804739475250244 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046585, + "balance_loss_mlp": 1.00481415, + "epoch": 0.3888033859176606, + "flos": 1531196573952.0, + "grad_norm": 0.004554603876592686, + "language_loss": 0.76359642, + "learning_rate": 0.0006987283305481533, + "loss": 0.77406228, + "num_input_tokens_seen": 168212128, + "router_z_loss_mlp": 0.41796875, + "step": 2021, + "time_per_iteration": 4.76949667930603 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104808, + "balance_loss_mlp": 1.00478315, + "epoch": 0.3889957676029242, + "flos": 693672370176.0, + "grad_norm": 0.038162906437672096, + "language_loss": 0.8292582, + "learning_rate": 0.0006984424142405392, + "loss": 0.83973902, + "num_input_tokens_seen": 168287440, + "router_z_loss_mlp": 0.43359375, + "step": 2022, + "time_per_iteration": 2.7983930110931396 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049484, + "balance_loss_mlp": 1.00599611, + "epoch": 0.3891881492881878, + "flos": 516195638784.0, + "grad_norm": 0.03974199995652067, + "language_loss": 0.82402384, + "learning_rate": 0.0006981564208907474, + "loss": 0.83451867, + "num_input_tokens_seen": 168354704, + "router_z_loss_mlp": 0.43554688, + "step": 2023, + "time_per_iteration": 2.613600730895996 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050586, + "balance_loss_mlp": 1.00707471, + "epoch": 0.3893805309734513, + "flos": 630176729856.0, + "grad_norm": 0.03303002735023947, + "language_loss": 0.90586042, + "learning_rate": 0.0006978703506098102, + "loss": 0.91636622, + "num_input_tokens_seen": 168424272, + "router_z_loss_mlp": 0.43579102, + "step": 2024, + "time_per_iteration": 2.7258403301239014 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050901, + "balance_loss_mlp": 1.00748503, + "epoch": 0.3895729126587149, + "flos": 545207080704.0, + "grad_norm": 0.0334033578711094, + "language_loss": 0.88520938, + "learning_rate": 0.00069758420350879, + "loss": 0.89571834, + "num_input_tokens_seen": 168488912, + "router_z_loss_mlp": 0.43481445, + "step": 2025, + "time_per_iteration": 2.6406970024108887 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047711, + "balance_loss_mlp": 1.00427127, + "epoch": 0.38976529434397844, + "flos": 619407161088.0, + "grad_norm": 0.03600656764113765, + "language_loss": 0.86979783, + "learning_rate": 0.000697297979698779, + "loss": 0.88027489, + "num_input_tokens_seen": 168563248, + "router_z_loss_mlp": 0.43505859, + "step": 2026, + "time_per_iteration": 2.729025363922119 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046507, + "balance_loss_mlp": 1.00297225, + "epoch": 0.38995767602924203, + "flos": 836346062592.0, + "grad_norm": 0.030634369701250594, + "language_loss": 0.84155977, + "learning_rate": 0.0006970116792908992, + "loss": 0.85202479, + "num_input_tokens_seen": 168648272, + "router_z_loss_mlp": 0.43603516, + "step": 2027, + "time_per_iteration": 3.0780837535858154 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054265, + "balance_loss_mlp": 1.01070547, + "epoch": 0.39015005771450556, + "flos": 542647619328.0, + "grad_norm": 0.03376343400122794, + "language_loss": 0.81809974, + "learning_rate": 0.000696725302396302, + "loss": 0.82864237, + "num_input_tokens_seen": 168721760, + "router_z_loss_mlp": 0.4362793, + "step": 2028, + "time_per_iteration": 2.6632442474365234 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046425, + "balance_loss_mlp": 1.00277102, + "epoch": 0.39034243939976915, + "flos": 1009142275584.0, + "grad_norm": 0.030316104633677343, + "language_loss": 0.86213875, + "learning_rate": 0.0006964388491261692, + "loss": 0.872603, + "num_input_tokens_seen": 168803664, + "router_z_loss_mlp": 0.43725586, + "step": 2029, + "time_per_iteration": 3.2410776615142822 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052185, + "balance_loss_mlp": 1.00848317, + "epoch": 0.3905348210850327, + "flos": 680241272832.0, + "grad_norm": 0.03528753395725821, + "language_loss": 0.88294208, + "learning_rate": 0.0006961523195917114, + "loss": 0.89346391, + "num_input_tokens_seen": 168879184, + "router_z_loss_mlp": 0.43774414, + "step": 2030, + "time_per_iteration": 2.8754475116729736 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104739, + "balance_loss_mlp": 1.00375915, + "epoch": 0.39072720277029627, + "flos": 549989666304.0, + "grad_norm": 0.032806843563698423, + "language_loss": 0.78588331, + "learning_rate": 0.0006958657139041696, + "loss": 0.79635721, + "num_input_tokens_seen": 168957808, + "router_z_loss_mlp": 0.43701172, + "step": 2031, + "time_per_iteration": 2.7329561710357666 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047604, + "balance_loss_mlp": 1.00554657, + "epoch": 0.39091958445555985, + "flos": 1551054025728.0, + "grad_norm": 0.008088132411436895, + "language_loss": 0.76712966, + "learning_rate": 0.0006955790321748136, + "loss": 0.77760577, + "num_input_tokens_seen": 169194416, + "router_z_loss_mlp": 0.42089844, + "step": 2032, + "time_per_iteration": 4.958296298980713 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048975, + "balance_loss_mlp": 1.00529635, + "epoch": 0.3911119661408234, + "flos": 505052794368.0, + "grad_norm": 0.03533188094946227, + "language_loss": 0.78901434, + "learning_rate": 0.0006952922745149434, + "loss": 0.7995041, + "num_input_tokens_seen": 169263552, + "router_z_loss_mlp": 0.4375, + "step": 2033, + "time_per_iteration": 2.6192519664764404 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050645, + "balance_loss_mlp": 1.00684798, + "epoch": 0.391304347826087, + "flos": 558330031104.0, + "grad_norm": 0.032114717040763616, + "language_loss": 0.88009661, + "learning_rate": 0.000695005441035888, + "loss": 0.89060307, + "num_input_tokens_seen": 169333696, + "router_z_loss_mlp": 0.4387207, + "step": 2034, + "time_per_iteration": 2.6519060134887695 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045589, + "balance_loss_mlp": 1.00334167, + "epoch": 0.3914967295113505, + "flos": 1502944322304.0, + "grad_norm": 0.004600085335304226, + "language_loss": 0.73723435, + "learning_rate": 0.0006947185318490064, + "loss": 0.7476902, + "num_input_tokens_seen": 169556416, + "router_z_loss_mlp": 0.42285156, + "step": 2035, + "time_per_iteration": 4.875830888748169 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049757, + "balance_loss_mlp": 1.00581694, + "epoch": 0.3916891111966141, + "flos": 708330219264.0, + "grad_norm": 0.02756997110289995, + "language_loss": 0.81809461, + "learning_rate": 0.0006944315470656863, + "loss": 0.82859218, + "num_input_tokens_seen": 169643312, + "router_z_loss_mlp": 0.44018555, + "step": 2036, + "time_per_iteration": 2.9486818313598633 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104941, + "balance_loss_mlp": 1.00537384, + "epoch": 0.3918814928818776, + "flos": 557409480960.0, + "grad_norm": 0.03430912315299504, + "language_loss": 0.91194409, + "learning_rate": 0.000694144486797345, + "loss": 0.92243814, + "num_input_tokens_seen": 169712560, + "router_z_loss_mlp": 0.44116211, + "step": 2037, + "time_per_iteration": 2.661637783050537 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053711, + "balance_loss_mlp": 1.01155853, + "epoch": 0.3920738745671412, + "flos": 1541688131328.0, + "grad_norm": 0.009695617032389551, + "language_loss": 0.79520434, + "learning_rate": 0.0006938573511554296, + "loss": 0.80574143, + "num_input_tokens_seen": 169914912, + "router_z_loss_mlp": 0.421875, + "step": 2038, + "time_per_iteration": 4.676162004470825 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050597, + "balance_loss_mlp": 1.00672829, + "epoch": 0.39226625625240474, + "flos": 499805559552.0, + "grad_norm": 0.03059706599431713, + "language_loss": 0.9011066, + "learning_rate": 0.0006935701402514156, + "loss": 0.91161263, + "num_input_tokens_seen": 169978848, + "router_z_loss_mlp": 0.43945312, + "step": 2039, + "time_per_iteration": 2.5921828746795654 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01040813, + "balance_loss_mlp": 0.99837494, + "epoch": 0.39245863793766833, + "flos": 1350453680640.0, + "grad_norm": 0.0024785612799689367, + "language_loss": 0.73034894, + "learning_rate": 0.0006932828541968083, + "loss": 0.74075705, + "num_input_tokens_seen": 170211488, + "router_z_loss_mlp": 0.42480469, + "step": 2040, + "time_per_iteration": 4.920953273773193 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045626, + "balance_loss_mlp": 1.00180471, + "epoch": 0.3926510196229319, + "flos": 1348115873280.0, + "grad_norm": 0.032003611488688986, + "language_loss": 0.84899294, + "learning_rate": 0.0006929954931031422, + "loss": 0.85944915, + "num_input_tokens_seen": 170298528, + "router_z_loss_mlp": 0.43896484, + "step": 2041, + "time_per_iteration": 3.7454288005828857 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045245, + "balance_loss_mlp": 1.00144792, + "epoch": 0.39284340130819545, + "flos": 500604600576.0, + "grad_norm": 0.027328608847006428, + "language_loss": 0.89267606, + "learning_rate": 0.0006927080570819805, + "loss": 0.9031285, + "num_input_tokens_seen": 170365680, + "router_z_loss_mlp": 0.4387207, + "step": 2042, + "time_per_iteration": 2.6191000938415527 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049281, + "balance_loss_mlp": 1.00565004, + "epoch": 0.39303578299345904, + "flos": 521342751744.0, + "grad_norm": 0.03887631720492337, + "language_loss": 0.81479704, + "learning_rate": 0.0006924205462449161, + "loss": 0.82528985, + "num_input_tokens_seen": 170432224, + "router_z_loss_mlp": 0.43701172, + "step": 2043, + "time_per_iteration": 2.6156415939331055 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048281, + "balance_loss_mlp": 1.00467432, + "epoch": 0.39322816467872257, + "flos": 909539076864.0, + "grad_norm": 0.03230930456366714, + "language_loss": 0.82451463, + "learning_rate": 0.0006921329607035702, + "loss": 0.83499742, + "num_input_tokens_seen": 170517920, + "router_z_loss_mlp": 0.43676758, + "step": 2044, + "time_per_iteration": 3.248239040374756 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050512, + "balance_loss_mlp": 1.0066911, + "epoch": 0.39342054636398616, + "flos": 518642339328.0, + "grad_norm": 0.028076885263619615, + "language_loss": 0.88591248, + "learning_rate": 0.0006918453005695938, + "loss": 0.89641762, + "num_input_tokens_seen": 170589072, + "router_z_loss_mlp": 0.43896484, + "step": 2045, + "time_per_iteration": 2.6417062282562256 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048241, + "balance_loss_mlp": 1.00430059, + "epoch": 0.3936129280492497, + "flos": 549012735744.0, + "grad_norm": 0.027900695924135757, + "language_loss": 0.84910023, + "learning_rate": 0.0006915575659546662, + "loss": 0.85958266, + "num_input_tokens_seen": 170657856, + "router_z_loss_mlp": 0.44018555, + "step": 2046, + "time_per_iteration": 2.6784913539886475 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053966, + "balance_loss_mlp": 1.0100733, + "epoch": 0.3938053097345133, + "flos": 527141151744.0, + "grad_norm": 0.03448231278490725, + "language_loss": 0.81310439, + "learning_rate": 0.0006912697569704959, + "loss": 0.82364404, + "num_input_tokens_seen": 170723696, + "router_z_loss_mlp": 0.43969727, + "step": 2047, + "time_per_iteration": 2.6214752197265625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050494, + "balance_loss_mlp": 1.00679207, + "epoch": 0.39399769141977686, + "flos": 472589531136.0, + "grad_norm": 0.03168334850546869, + "language_loss": 0.87124646, + "learning_rate": 0.0006909818737288205, + "loss": 0.88175148, + "num_input_tokens_seen": 170789536, + "router_z_loss_mlp": 0.43774414, + "step": 2048, + "time_per_iteration": 2.6057982444763184 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051405, + "balance_loss_mlp": 1.00775015, + "epoch": 0.3941900731050404, + "flos": 502727602944.0, + "grad_norm": 0.03501112209435681, + "language_loss": 0.81578481, + "learning_rate": 0.000690693916341406, + "loss": 0.82629883, + "num_input_tokens_seen": 170859232, + "router_z_loss_mlp": 0.43725586, + "step": 2049, + "time_per_iteration": 2.6459243297576904 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052667, + "balance_loss_mlp": 1.00910771, + "epoch": 0.394382454790304, + "flos": 582007722240.0, + "grad_norm": 0.03071224069667877, + "language_loss": 0.83009964, + "learning_rate": 0.0006904058849200475, + "loss": 0.8406263, + "num_input_tokens_seen": 170931568, + "router_z_loss_mlp": 0.4362793, + "step": 2050, + "time_per_iteration": 2.766828775405884 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046395, + "balance_loss_mlp": 1.00243104, + "epoch": 0.3945748364755675, + "flos": 514845432576.0, + "grad_norm": 0.030877215482718844, + "language_loss": 0.85563171, + "learning_rate": 0.0006901177795765683, + "loss": 0.86609566, + "num_input_tokens_seen": 170999856, + "router_z_loss_mlp": 0.44042969, + "step": 2051, + "time_per_iteration": 2.659912109375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051919, + "balance_loss_mlp": 1.00807357, + "epoch": 0.3947672181608311, + "flos": 595058740992.0, + "grad_norm": 0.03343854917241654, + "language_loss": 0.821091, + "learning_rate": 0.0006898296004228213, + "loss": 0.8316102, + "num_input_tokens_seen": 171072320, + "router_z_loss_mlp": 0.43920898, + "step": 2052, + "time_per_iteration": 2.7115862369537354 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046516, + "balance_loss_mlp": 1.00455475, + "epoch": 0.39495959984609463, + "flos": 1551052080384.0, + "grad_norm": 0.003971648916451202, + "language_loss": 0.7812674, + "learning_rate": 0.0006895413475706873, + "loss": 0.79173255, + "num_input_tokens_seen": 171304128, + "router_z_loss_mlp": 0.41992188, + "step": 2053, + "time_per_iteration": 4.894740343093872 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051907, + "balance_loss_mlp": 1.00818145, + "epoch": 0.3951519815313582, + "flos": 497524109568.0, + "grad_norm": 0.03573797234588687, + "language_loss": 0.80267316, + "learning_rate": 0.0006892530211320763, + "loss": 0.81319225, + "num_input_tokens_seen": 171377392, + "router_z_loss_mlp": 0.43798828, + "step": 2054, + "time_per_iteration": 2.767686605453491 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104549, + "balance_loss_mlp": 1.00193131, + "epoch": 0.39534436321662175, + "flos": 532223136000.0, + "grad_norm": 0.03591265467553322, + "language_loss": 0.84680569, + "learning_rate": 0.000688964621218926, + "loss": 0.85726058, + "num_input_tokens_seen": 171447424, + "router_z_loss_mlp": 0.4362793, + "step": 2055, + "time_per_iteration": 2.6054694652557373 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048043, + "balance_loss_mlp": 1.004722, + "epoch": 0.39553674490188534, + "flos": 703725523200.0, + "grad_norm": 0.03424008758122415, + "language_loss": 0.8074584, + "learning_rate": 0.0006886761479432037, + "loss": 0.8179388, + "num_input_tokens_seen": 171519920, + "router_z_loss_mlp": 0.43383789, + "step": 2056, + "time_per_iteration": 2.8390727043151855 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047733, + "balance_loss_mlp": 1.0042696, + "epoch": 0.3957291265871489, + "flos": 410656979712.0, + "grad_norm": 0.03388460034269331, + "language_loss": 0.85256028, + "learning_rate": 0.0006883876014169045, + "loss": 0.86303759, + "num_input_tokens_seen": 171583856, + "router_z_loss_mlp": 0.43530273, + "step": 2057, + "time_per_iteration": 2.554170846939087 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051678, + "balance_loss_mlp": 1.00814319, + "epoch": 0.39592150827241246, + "flos": 619639485696.0, + "grad_norm": 0.03722447028160607, + "language_loss": 0.90694773, + "learning_rate": 0.000688098981752052, + "loss": 0.91746461, + "num_input_tokens_seen": 171656064, + "router_z_loss_mlp": 0.43603516, + "step": 2058, + "time_per_iteration": 2.733053684234619 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049177, + "balance_loss_mlp": 1.00568974, + "epoch": 0.39611388995767605, + "flos": 822721524480.0, + "grad_norm": 0.04279286873756595, + "language_loss": 0.80609208, + "learning_rate": 0.0006878102890606982, + "loss": 0.81658387, + "num_input_tokens_seen": 171738800, + "router_z_loss_mlp": 0.43554688, + "step": 2059, + "time_per_iteration": 3.084789752960205 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047553, + "balance_loss_mlp": 1.00416124, + "epoch": 0.3963062716429396, + "flos": 493214921472.0, + "grad_norm": 0.03961147378322192, + "language_loss": 0.81771576, + "learning_rate": 0.0006875215234549239, + "loss": 0.82819128, + "num_input_tokens_seen": 171803664, + "router_z_loss_mlp": 0.43457031, + "step": 2060, + "time_per_iteration": 2.5823421478271484 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046932, + "balance_loss_mlp": 1.00351596, + "epoch": 0.39649865332820317, + "flos": 585834764544.0, + "grad_norm": 0.03854635921535854, + "language_loss": 0.8654902, + "learning_rate": 0.0006872326850468376, + "loss": 0.87595946, + "num_input_tokens_seen": 171871968, + "router_z_loss_mlp": 0.43481445, + "step": 2061, + "time_per_iteration": 2.705690860748291 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048788, + "balance_loss_mlp": 1.0052762, + "epoch": 0.3966910350134667, + "flos": 459512267520.0, + "grad_norm": 0.037411346592439484, + "language_loss": 0.79843795, + "learning_rate": 0.0006869437739485762, + "loss": 0.80892581, + "num_input_tokens_seen": 171942368, + "router_z_loss_mlp": 0.43579102, + "step": 2062, + "time_per_iteration": 2.5978832244873047 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050469, + "balance_loss_mlp": 1.00710082, + "epoch": 0.3968834166987303, + "flos": 509615694336.0, + "grad_norm": 0.03224635872548594, + "language_loss": 0.93265009, + "learning_rate": 0.0006866547902723053, + "loss": 0.94315481, + "num_input_tokens_seen": 172012336, + "router_z_loss_mlp": 0.43432617, + "step": 2063, + "time_per_iteration": 2.7325148582458496 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048279, + "balance_loss_mlp": 1.00502992, + "epoch": 0.3970757983839938, + "flos": 573743179776.0, + "grad_norm": 0.0353853142482034, + "language_loss": 0.80804694, + "learning_rate": 0.000686365734130218, + "loss": 0.81852973, + "num_input_tokens_seen": 172084640, + "router_z_loss_mlp": 0.43310547, + "step": 2064, + "time_per_iteration": 2.719521999359131 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046826, + "balance_loss_mlp": 1.00350547, + "epoch": 0.3972681800692574, + "flos": 482586303744.0, + "grad_norm": 0.03284702600830507, + "language_loss": 0.8411094, + "learning_rate": 0.000686076605634536, + "loss": 0.8515777, + "num_input_tokens_seen": 172152992, + "router_z_loss_mlp": 0.43383789, + "step": 2065, + "time_per_iteration": 2.6333730220794678 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051782, + "balance_loss_mlp": 1.00822306, + "epoch": 0.397460561754521, + "flos": 488905733376.0, + "grad_norm": 0.0324228687482344, + "language_loss": 0.84781277, + "learning_rate": 0.0006857874048975088, + "loss": 0.85833061, + "num_input_tokens_seen": 172219312, + "router_z_loss_mlp": 0.4362793, + "step": 2066, + "time_per_iteration": 2.5906848907470703 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049991, + "balance_loss_mlp": 1.00659895, + "epoch": 0.3976529434397845, + "flos": 422896318464.0, + "grad_norm": 0.03171433053589848, + "language_loss": 0.8744958, + "learning_rate": 0.0006854981320314142, + "loss": 0.8849957, + "num_input_tokens_seen": 172282112, + "router_z_loss_mlp": 0.43457031, + "step": 2067, + "time_per_iteration": 2.4699788093566895 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045752, + "balance_loss_mlp": 1.00240779, + "epoch": 0.3978453251250481, + "flos": 546622415616.0, + "grad_norm": 0.03563960500295594, + "language_loss": 0.8728829, + "learning_rate": 0.0006852087871485579, + "loss": 0.88334048, + "num_input_tokens_seen": 172347872, + "router_z_loss_mlp": 0.43408203, + "step": 2068, + "time_per_iteration": 2.6414859294891357 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044908, + "balance_loss_mlp": 1.00163472, + "epoch": 0.39803770681031164, + "flos": 652002627072.0, + "grad_norm": 0.03732729296318665, + "language_loss": 0.82978511, + "learning_rate": 0.0006849193703612735, + "loss": 0.84023428, + "num_input_tokens_seen": 172418560, + "router_z_loss_mlp": 0.43334961, + "step": 2069, + "time_per_iteration": 2.791269063949585 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104332, + "balance_loss_mlp": 0.999928, + "epoch": 0.39823008849557523, + "flos": 741427272960.0, + "grad_norm": 0.030595728613543666, + "language_loss": 0.78243995, + "learning_rate": 0.0006846298817819225, + "loss": 0.79287314, + "num_input_tokens_seen": 172497984, + "router_z_loss_mlp": 0.43457031, + "step": 2070, + "time_per_iteration": 2.9561986923217773 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045511, + "balance_loss_mlp": 1.00235701, + "epoch": 0.39842247018083876, + "flos": 385889597184.0, + "grad_norm": 0.036398106493658954, + "language_loss": 0.81909132, + "learning_rate": 0.0006843403215228945, + "loss": 0.82954645, + "num_input_tokens_seen": 172560112, + "router_z_loss_mlp": 0.43212891, + "step": 2071, + "time_per_iteration": 2.4993679523468018 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045222, + "balance_loss_mlp": 1.00218797, + "epoch": 0.39861485186610235, + "flos": 534763155456.0, + "grad_norm": 0.028807086351499752, + "language_loss": 0.8150484, + "learning_rate": 0.0006840506896966065, + "loss": 0.82550067, + "num_input_tokens_seen": 172636192, + "router_z_loss_mlp": 0.4309082, + "step": 2072, + "time_per_iteration": 2.7684881687164307 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049113, + "balance_loss_mlp": 1.00595963, + "epoch": 0.39880723355136594, + "flos": 644413671168.0, + "grad_norm": 0.03625588542647267, + "language_loss": 0.83127856, + "learning_rate": 0.0006837609864155038, + "loss": 0.8417697, + "num_input_tokens_seen": 172715264, + "router_z_loss_mlp": 0.43212891, + "step": 2073, + "time_per_iteration": 2.8514270782470703 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051094, + "balance_loss_mlp": 1.00782108, + "epoch": 0.39899961523662947, + "flos": 516892612608.0, + "grad_norm": 0.031931162968107815, + "language_loss": 0.83936673, + "learning_rate": 0.0006834712117920592, + "loss": 0.84987766, + "num_input_tokens_seen": 172783456, + "router_z_loss_mlp": 0.43334961, + "step": 2074, + "time_per_iteration": 2.6099319458007812 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048921, + "balance_loss_mlp": 1.00583923, + "epoch": 0.39919199692189306, + "flos": 465338857728.0, + "grad_norm": 0.040350277752625376, + "language_loss": 0.86345923, + "learning_rate": 0.0006831813659387729, + "loss": 0.87394845, + "num_input_tokens_seen": 172848928, + "router_z_loss_mlp": 0.43139648, + "step": 2075, + "time_per_iteration": 2.5189003944396973 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047413, + "balance_loss_mlp": 1.00421119, + "epoch": 0.3993843786071566, + "flos": 532679036928.0, + "grad_norm": 0.031639049857806745, + "language_loss": 0.84865057, + "learning_rate": 0.0006828914489681733, + "loss": 0.85912478, + "num_input_tokens_seen": 172921152, + "router_z_loss_mlp": 0.43261719, + "step": 2076, + "time_per_iteration": 2.7052366733551025 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045652, + "balance_loss_mlp": 1.00252223, + "epoch": 0.3995767602924202, + "flos": 505024604160.0, + "grad_norm": 0.02906284980485529, + "language_loss": 0.85967886, + "learning_rate": 0.0006826014609928162, + "loss": 0.87013543, + "num_input_tokens_seen": 172998864, + "router_z_loss_mlp": 0.43188477, + "step": 2077, + "time_per_iteration": 2.7127158641815186 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046635, + "balance_loss_mlp": 1.00514984, + "epoch": 0.3997691419776837, + "flos": 1457473781760.0, + "grad_norm": 0.010869866041652092, + "language_loss": 0.83199388, + "learning_rate": 0.0006823114021252846, + "loss": 0.84246022, + "num_input_tokens_seen": 173219216, + "router_z_loss_mlp": 0.41503906, + "step": 2078, + "time_per_iteration": 4.8602213859558105 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048968, + "balance_loss_mlp": 1.00586236, + "epoch": 0.3999615236629473, + "flos": 531756541440.0, + "grad_norm": 0.03484656463436615, + "language_loss": 0.80513203, + "learning_rate": 0.0006820212724781896, + "loss": 0.81562173, + "num_input_tokens_seen": 173292000, + "router_z_loss_mlp": 0.43164062, + "step": 2079, + "time_per_iteration": 2.6769065856933594 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050357, + "balance_loss_mlp": 1.00732243, + "epoch": 0.4001539053482108, + "flos": 696362088960.0, + "grad_norm": 0.03370335981625205, + "language_loss": 0.84624374, + "learning_rate": 0.0006817310721641694, + "loss": 0.85674727, + "num_input_tokens_seen": 173365568, + "router_z_loss_mlp": 0.4309082, + "step": 2080, + "time_per_iteration": 2.8362321853637695 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049278, + "balance_loss_mlp": 1.00619566, + "epoch": 0.4003462870334744, + "flos": 521379690240.0, + "grad_norm": 0.0372462453928972, + "language_loss": 0.84107649, + "learning_rate": 0.00068144080129589, + "loss": 0.85156924, + "num_input_tokens_seen": 173430144, + "router_z_loss_mlp": 0.43139648, + "step": 2081, + "time_per_iteration": 2.673391342163086 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047044, + "balance_loss_mlp": 1.00400949, + "epoch": 0.400538668718738, + "flos": 493503626496.0, + "grad_norm": 0.03624950820375382, + "language_loss": 0.83452618, + "learning_rate": 0.0006811504599860441, + "loss": 0.84499657, + "num_input_tokens_seen": 173494464, + "router_z_loss_mlp": 0.4309082, + "step": 2082, + "time_per_iteration": 2.5872161388397217 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048557, + "balance_loss_mlp": 1.0056175, + "epoch": 0.40073105040400153, + "flos": 491452555776.0, + "grad_norm": 0.03058886918361784, + "language_loss": 0.86615109, + "learning_rate": 0.0006808600483473526, + "loss": 0.87663668, + "num_input_tokens_seen": 173577168, + "router_z_loss_mlp": 0.42993164, + "step": 2083, + "time_per_iteration": 2.9167916774749756 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044713, + "balance_loss_mlp": 1.00165451, + "epoch": 0.4009234320892651, + "flos": 563540327424.0, + "grad_norm": 0.029579631805043773, + "language_loss": 0.86442864, + "learning_rate": 0.0006805695664925629, + "loss": 0.87487578, + "num_input_tokens_seen": 173655632, + "router_z_loss_mlp": 0.43115234, + "step": 2084, + "time_per_iteration": 2.8129522800445557 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046102, + "balance_loss_mlp": 1.00328159, + "epoch": 0.40111581377452865, + "flos": 426853618176.0, + "grad_norm": 0.03869673141168483, + "language_loss": 0.84653956, + "learning_rate": 0.0006802790145344506, + "loss": 0.85700059, + "num_input_tokens_seen": 173719040, + "router_z_loss_mlp": 0.42871094, + "step": 2085, + "time_per_iteration": 2.4816439151763916 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047549, + "balance_loss_mlp": 1.00480056, + "epoch": 0.40130819545979224, + "flos": 613643754240.0, + "grad_norm": 0.033294901740297575, + "language_loss": 0.87748265, + "learning_rate": 0.0006799883925858176, + "loss": 0.88795811, + "num_input_tokens_seen": 173796704, + "router_z_loss_mlp": 0.42797852, + "step": 2086, + "time_per_iteration": 2.883460760116577 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010467, + "balance_loss_mlp": 1.00397515, + "epoch": 0.40150057714505577, + "flos": 524451432960.0, + "grad_norm": 0.03567087941007639, + "language_loss": 0.85852945, + "learning_rate": 0.0006796977007594933, + "loss": 0.86899644, + "num_input_tokens_seen": 173862352, + "router_z_loss_mlp": 0.42773438, + "step": 2087, + "time_per_iteration": 2.6274635791778564 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049167, + "balance_loss_mlp": 1.00641906, + "epoch": 0.40169295883031936, + "flos": 562554648576.0, + "grad_norm": 0.03237434691106299, + "language_loss": 0.86948609, + "learning_rate": 0.0006794069391683345, + "loss": 0.87997776, + "num_input_tokens_seen": 173935408, + "router_z_loss_mlp": 0.42797852, + "step": 2088, + "time_per_iteration": 2.7452995777130127 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044795, + "balance_loss_mlp": 1.00204611, + "epoch": 0.4018853405155829, + "flos": 520020735744.0, + "grad_norm": 0.03787206100605993, + "language_loss": 0.81785774, + "learning_rate": 0.0006791161079252248, + "loss": 0.82830572, + "num_input_tokens_seen": 174007152, + "router_z_loss_mlp": 0.42797852, + "step": 2089, + "time_per_iteration": 2.7205429077148438 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104968, + "balance_loss_mlp": 1.00683641, + "epoch": 0.4020777222008465, + "flos": 527288905728.0, + "grad_norm": 0.03117280194599123, + "language_loss": 0.83103907, + "learning_rate": 0.0006788252071430747, + "loss": 0.84153581, + "num_input_tokens_seen": 174074976, + "router_z_loss_mlp": 0.42895508, + "step": 2090, + "time_per_iteration": 2.659057378768921 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105285, + "balance_loss_mlp": 1.01000619, + "epoch": 0.40227010388611006, + "flos": 526841753088.0, + "grad_norm": 0.038447003118097976, + "language_loss": 0.86962426, + "learning_rate": 0.0006785342369348222, + "loss": 0.88015276, + "num_input_tokens_seen": 174149392, + "router_z_loss_mlp": 0.42895508, + "step": 2091, + "time_per_iteration": 2.7038679122924805 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046593, + "balance_loss_mlp": 1.00374973, + "epoch": 0.4024624855713736, + "flos": 433227482880.0, + "grad_norm": 0.04129881296644863, + "language_loss": 0.80178273, + "learning_rate": 0.0006782431974134316, + "loss": 0.81224871, + "num_input_tokens_seen": 174214656, + "router_z_loss_mlp": 0.42895508, + "step": 2092, + "time_per_iteration": 2.522822618484497 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044742, + "balance_loss_mlp": 1.00185025, + "epoch": 0.4026548672566372, + "flos": 768092136192.0, + "grad_norm": 0.028161411572745265, + "language_loss": 0.89556634, + "learning_rate": 0.0006779520886918949, + "loss": 0.90601373, + "num_input_tokens_seen": 174296064, + "router_z_loss_mlp": 0.42944336, + "step": 2093, + "time_per_iteration": 3.059269905090332 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051245, + "balance_loss_mlp": 1.00847256, + "epoch": 0.4028472489419007, + "flos": 644118163200.0, + "grad_norm": 0.031871945568835235, + "language_loss": 0.81586826, + "learning_rate": 0.0006776609108832301, + "loss": 0.82638067, + "num_input_tokens_seen": 174370896, + "router_z_loss_mlp": 0.42822266, + "step": 2094, + "time_per_iteration": 2.824986457824707 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050062, + "balance_loss_mlp": 1.00707567, + "epoch": 0.4030396306271643, + "flos": 492824149248.0, + "grad_norm": 0.03027887325873737, + "language_loss": 0.85679066, + "learning_rate": 0.0006773696641004828, + "loss": 0.86729133, + "num_input_tokens_seen": 174438448, + "router_z_loss_mlp": 0.43041992, + "step": 2095, + "time_per_iteration": 2.575521230697632 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050315, + "balance_loss_mlp": 1.00742352, + "epoch": 0.40323201231242783, + "flos": 903195347712.0, + "grad_norm": 0.03549236004367387, + "language_loss": 0.78398442, + "learning_rate": 0.0006770783484567247, + "loss": 0.7944876, + "num_input_tokens_seen": 174525952, + "router_z_loss_mlp": 0.42944336, + "step": 2096, + "time_per_iteration": 3.1476502418518066 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047183, + "balance_loss_mlp": 1.00417244, + "epoch": 0.4034243939976914, + "flos": 571730992896.0, + "grad_norm": 0.04456027219971551, + "language_loss": 0.86790794, + "learning_rate": 0.000676786964065055, + "loss": 0.87837982, + "num_input_tokens_seen": 174607200, + "router_z_loss_mlp": 0.43066406, + "step": 2097, + "time_per_iteration": 2.826936960220337 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049393, + "balance_loss_mlp": 1.00635874, + "epoch": 0.403616775682955, + "flos": 508460874240.0, + "grad_norm": 0.03200015951198879, + "language_loss": 0.79479361, + "learning_rate": 0.0006764955110385986, + "loss": 0.80528748, + "num_input_tokens_seen": 174680976, + "router_z_loss_mlp": 0.4309082, + "step": 2098, + "time_per_iteration": 2.732429027557373 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105173, + "balance_loss_mlp": 1.0086236, + "epoch": 0.40380915736821854, + "flos": 520411507968.0, + "grad_norm": 0.033549102084289066, + "language_loss": 0.81161886, + "learning_rate": 0.0006762039894905083, + "loss": 0.82213616, + "num_input_tokens_seen": 174753152, + "router_z_loss_mlp": 0.43164062, + "step": 2099, + "time_per_iteration": 2.638117790222168 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104845, + "balance_loss_mlp": 1.00524902, + "epoch": 0.40400153905348213, + "flos": 442887918336.0, + "grad_norm": 0.03592642868139018, + "language_loss": 0.80970824, + "learning_rate": 0.000675912399533962, + "loss": 0.82019281, + "num_input_tokens_seen": 174817184, + "router_z_loss_mlp": 0.43261719, + "step": 2100, + "time_per_iteration": 2.58172345161438 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049035, + "balance_loss_mlp": 1.00585735, + "epoch": 0.40419392073874566, + "flos": 773705843712.0, + "grad_norm": 0.032245854328407444, + "language_loss": 0.85358262, + "learning_rate": 0.0006756207412821656, + "loss": 0.86407304, + "num_input_tokens_seen": 174898128, + "router_z_loss_mlp": 0.43237305, + "step": 2101, + "time_per_iteration": 3.0158467292785645 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053218, + "balance_loss_mlp": 1.01006424, + "epoch": 0.40438630242400925, + "flos": 767990068992.0, + "grad_norm": 0.03424537155124627, + "language_loss": 0.81043333, + "learning_rate": 0.0006753290148483505, + "loss": 0.82096547, + "num_input_tokens_seen": 174981872, + "router_z_loss_mlp": 0.43212891, + "step": 2102, + "time_per_iteration": 3.0169148445129395 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050406, + "balance_loss_mlp": 1.0073241, + "epoch": 0.4045786841092728, + "flos": 416129736192.0, + "grad_norm": 0.032341452227877814, + "language_loss": 0.79544723, + "learning_rate": 0.0006750372203457752, + "loss": 0.80595136, + "num_input_tokens_seen": 175044976, + "router_z_loss_mlp": 0.43139648, + "step": 2103, + "time_per_iteration": 2.459439277648926 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104576, + "balance_loss_mlp": 1.00274944, + "epoch": 0.40477106579453637, + "flos": 540309788928.0, + "grad_norm": 0.028365330829485943, + "language_loss": 0.87031502, + "learning_rate": 0.0006747453578877242, + "loss": 0.88077265, + "num_input_tokens_seen": 175121104, + "router_z_loss_mlp": 0.43066406, + "step": 2104, + "time_per_iteration": 2.704583168029785 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047141, + "balance_loss_mlp": 1.00413048, + "epoch": 0.4049634474797999, + "flos": 828092213760.0, + "grad_norm": 0.03564801319951872, + "language_loss": 0.83885705, + "learning_rate": 0.0006744534275875085, + "loss": 0.84932852, + "num_input_tokens_seen": 175194512, + "router_z_loss_mlp": 0.43066406, + "step": 2105, + "time_per_iteration": 3.070952892303467 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049443, + "balance_loss_mlp": 1.00631273, + "epoch": 0.4051558291650635, + "flos": 573753873408.0, + "grad_norm": 0.03321600555114549, + "language_loss": 0.86069483, + "learning_rate": 0.0006741614295584657, + "loss": 0.87118924, + "num_input_tokens_seen": 175264176, + "router_z_loss_mlp": 0.43188477, + "step": 2106, + "time_per_iteration": 2.677860736846924 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051059, + "balance_loss_mlp": 1.00802493, + "epoch": 0.4053482108503271, + "flos": 733245355776.0, + "grad_norm": 0.034313991245887424, + "language_loss": 0.78860825, + "learning_rate": 0.0006738693639139595, + "loss": 0.79911888, + "num_input_tokens_seen": 175347488, + "router_z_loss_mlp": 0.4309082, + "step": 2107, + "time_per_iteration": 3.021329402923584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104746, + "balance_loss_mlp": 1.0043304, + "epoch": 0.4055405925355906, + "flos": 1214950971648.0, + "grad_norm": 0.03202932182515954, + "language_loss": 0.77947468, + "learning_rate": 0.0006735772307673796, + "loss": 0.7899493, + "num_input_tokens_seen": 175438336, + "router_z_loss_mlp": 0.43188477, + "step": 2108, + "time_per_iteration": 3.524618148803711 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104775, + "balance_loss_mlp": 1.00476336, + "epoch": 0.4057329742208542, + "flos": 717108988416.0, + "grad_norm": 0.03284224075250963, + "language_loss": 0.84037805, + "learning_rate": 0.0006732850302321421, + "loss": 0.85085559, + "num_input_tokens_seen": 175510912, + "router_z_loss_mlp": 0.43041992, + "step": 2109, + "time_per_iteration": 2.9528980255126953 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047125, + "balance_loss_mlp": 1.00423336, + "epoch": 0.4059253559061177, + "flos": 565953980160.0, + "grad_norm": 0.033245578967332844, + "language_loss": 0.85031784, + "learning_rate": 0.00067299276242169, + "loss": 0.86078906, + "num_input_tokens_seen": 175583040, + "router_z_loss_mlp": 0.42944336, + "step": 2110, + "time_per_iteration": 2.715207815170288 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046326, + "balance_loss_mlp": 1.00493622, + "epoch": 0.4061177375913813, + "flos": 1597189459200.0, + "grad_norm": 0.00881896921345328, + "language_loss": 0.74382168, + "learning_rate": 0.0006727004274494908, + "loss": 0.75428492, + "num_input_tokens_seen": 175817952, + "router_z_loss_mlp": 0.4140625, + "step": 2111, + "time_per_iteration": 4.921623468399048 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045683, + "balance_loss_mlp": 1.00276768, + "epoch": 0.40631011927664484, + "flos": 616622178048.0, + "grad_norm": 0.03872377126422628, + "language_loss": 0.78301811, + "learning_rate": 0.0006724080254290395, + "loss": 0.79347491, + "num_input_tokens_seen": 175896352, + "router_z_loss_mlp": 0.4296875, + "step": 2112, + "time_per_iteration": 2.7997756004333496 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104896, + "balance_loss_mlp": 1.00606835, + "epoch": 0.40650250096190843, + "flos": 558748993536.0, + "grad_norm": 0.03550284292845091, + "language_loss": 0.90693575, + "learning_rate": 0.0006721155564738566, + "loss": 0.91742539, + "num_input_tokens_seen": 175967152, + "router_z_loss_mlp": 0.42944336, + "step": 2113, + "time_per_iteration": 2.6585686206817627 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01041939, + "balance_loss_mlp": 1.00054932, + "epoch": 0.40669488264717196, + "flos": 1583545479168.0, + "grad_norm": 0.009767435928617773, + "language_loss": 0.78622639, + "learning_rate": 0.0006718230206974884, + "loss": 0.79664576, + "num_input_tokens_seen": 176205248, + "router_z_loss_mlp": 0.4140625, + "step": 2114, + "time_per_iteration": 4.948775053024292 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047453, + "balance_loss_mlp": 1.00460887, + "epoch": 0.40688726433243555, + "flos": 508656260352.0, + "grad_norm": 0.031160170727070474, + "language_loss": 0.86169994, + "learning_rate": 0.0006715304182135078, + "loss": 0.8721745, + "num_input_tokens_seen": 176276208, + "router_z_loss_mlp": 0.42895508, + "step": 2115, + "time_per_iteration": 2.6279850006103516 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047221, + "balance_loss_mlp": 1.00449646, + "epoch": 0.40707964601769914, + "flos": 590352944640.0, + "grad_norm": 0.04782787246513916, + "language_loss": 0.89337373, + "learning_rate": 0.0006712377491355127, + "loss": 0.90384591, + "num_input_tokens_seen": 176355072, + "router_z_loss_mlp": 0.42773438, + "step": 2116, + "time_per_iteration": 2.863960027694702 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047149, + "balance_loss_mlp": 1.00449598, + "epoch": 0.40727202770296267, + "flos": 581651943168.0, + "grad_norm": 0.026696862883813798, + "language_loss": 0.81451207, + "learning_rate": 0.0006709450135771274, + "loss": 0.8249836, + "num_input_tokens_seen": 176444592, + "router_z_loss_mlp": 0.42700195, + "step": 2117, + "time_per_iteration": 2.94854998588562 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104874, + "balance_loss_mlp": 1.00589585, + "epoch": 0.40746440938822626, + "flos": 505109174784.0, + "grad_norm": 0.029498043522937258, + "language_loss": 0.87031925, + "learning_rate": 0.0006706522116520023, + "loss": 0.88080668, + "num_input_tokens_seen": 176516144, + "router_z_loss_mlp": 0.42895508, + "step": 2118, + "time_per_iteration": 2.6655611991882324 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051527, + "balance_loss_mlp": 1.00880289, + "epoch": 0.4076567910734898, + "flos": 606711921408.0, + "grad_norm": 0.03542644850365937, + "language_loss": 0.83226359, + "learning_rate": 0.0006703593434738127, + "loss": 0.84277886, + "num_input_tokens_seen": 176585712, + "router_z_loss_mlp": 0.42773438, + "step": 2119, + "time_per_iteration": 2.7478883266448975 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049424, + "balance_loss_mlp": 1.00662768, + "epoch": 0.4078491727587534, + "flos": 480519681792.0, + "grad_norm": 0.032767120193604775, + "language_loss": 0.788118, + "learning_rate": 0.0006700664091562604, + "loss": 0.79861224, + "num_input_tokens_seen": 176654736, + "router_z_loss_mlp": 0.4284668, + "step": 2120, + "time_per_iteration": 2.532407760620117 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054491, + "balance_loss_mlp": 1.01167095, + "epoch": 0.4080415544440169, + "flos": 511419856128.0, + "grad_norm": 0.031947051498113735, + "language_loss": 0.85428649, + "learning_rate": 0.0006697734088130725, + "loss": 0.86483139, + "num_input_tokens_seen": 176722800, + "router_z_loss_mlp": 0.42871094, + "step": 2121, + "time_per_iteration": 2.6053290367126465 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051862, + "balance_loss_mlp": 1.00899482, + "epoch": 0.4082339361292805, + "flos": 735928271616.0, + "grad_norm": 0.0331707162631359, + "language_loss": 0.86154819, + "learning_rate": 0.0006694803425580018, + "loss": 0.87206686, + "num_input_tokens_seen": 176800320, + "router_z_loss_mlp": 0.42919922, + "step": 2122, + "time_per_iteration": 2.995340585708618 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051457, + "balance_loss_mlp": 1.00863671, + "epoch": 0.4084263178145441, + "flos": 458405079552.0, + "grad_norm": 0.03582566166827548, + "language_loss": 0.85069245, + "learning_rate": 0.0006691872105048268, + "loss": 0.86120701, + "num_input_tokens_seen": 176867440, + "router_z_loss_mlp": 0.42871094, + "step": 2123, + "time_per_iteration": 2.6434147357940674 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049594, + "balance_loss_mlp": 1.00655949, + "epoch": 0.4086186994998076, + "flos": 564026363904.0, + "grad_norm": 0.030981369506813725, + "language_loss": 0.84940457, + "learning_rate": 0.0006688940127673513, + "loss": 0.85990047, + "num_input_tokens_seen": 176942048, + "router_z_loss_mlp": 0.4309082, + "step": 2124, + "time_per_iteration": 2.677267074584961 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051992, + "balance_loss_mlp": 1.00914872, + "epoch": 0.4088110811850712, + "flos": 574894109184.0, + "grad_norm": 0.03166953679677798, + "language_loss": 0.86061293, + "learning_rate": 0.0006686007494594049, + "loss": 0.87113285, + "num_input_tokens_seen": 177025104, + "router_z_loss_mlp": 0.42895508, + "step": 2125, + "time_per_iteration": 2.806321620941162 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051345, + "balance_loss_mlp": 1.00845325, + "epoch": 0.40900346287033473, + "flos": 457847111424.0, + "grad_norm": 0.04138148105998068, + "language_loss": 0.81154513, + "learning_rate": 0.0006683074206948425, + "loss": 0.82205856, + "num_input_tokens_seen": 177089296, + "router_z_loss_mlp": 0.42944336, + "step": 2126, + "time_per_iteration": 2.5422966480255127 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051657, + "balance_loss_mlp": 1.00878966, + "epoch": 0.4091958445555983, + "flos": 618595481088.0, + "grad_norm": 0.03139043933990307, + "language_loss": 0.81871778, + "learning_rate": 0.0006680140265875443, + "loss": 0.82923436, + "num_input_tokens_seen": 177163648, + "router_z_loss_mlp": 0.42919922, + "step": 2127, + "time_per_iteration": 2.8402438163757324 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047483, + "balance_loss_mlp": 1.0048064, + "epoch": 0.40938822624086185, + "flos": 473371075584.0, + "grad_norm": 0.031125843736347292, + "language_loss": 0.96506268, + "learning_rate": 0.0006677205672514162, + "loss": 0.97553754, + "num_input_tokens_seen": 177233856, + "router_z_loss_mlp": 0.42724609, + "step": 2128, + "time_per_iteration": 2.6291539669036865 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047334, + "balance_loss_mlp": 1.00460982, + "epoch": 0.40958060792612544, + "flos": 571118589696.0, + "grad_norm": 0.02838685720934929, + "language_loss": 0.89474666, + "learning_rate": 0.000667427042800389, + "loss": 0.90522003, + "num_input_tokens_seen": 177309824, + "router_z_loss_mlp": 0.42773438, + "step": 2129, + "time_per_iteration": 2.749999761581421 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047174, + "balance_loss_mlp": 1.00435364, + "epoch": 0.40977298961138897, + "flos": 610471889664.0, + "grad_norm": 0.033304274322438925, + "language_loss": 0.8343153, + "learning_rate": 0.0006671334533484192, + "loss": 0.84478706, + "num_input_tokens_seen": 177380592, + "router_z_loss_mlp": 0.42871094, + "step": 2130, + "time_per_iteration": 2.778238296508789 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049285, + "balance_loss_mlp": 1.00636995, + "epoch": 0.40996537129665256, + "flos": 582873837312.0, + "grad_norm": 0.027360354791446346, + "language_loss": 0.83860981, + "learning_rate": 0.0006668397990094881, + "loss": 0.84910274, + "num_input_tokens_seen": 177454720, + "router_z_loss_mlp": 0.4296875, + "step": 2131, + "time_per_iteration": 2.711257219314575 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104432, + "balance_loss_mlp": 1.00145221, + "epoch": 0.41015775298191615, + "flos": 517554593280.0, + "grad_norm": 0.031461982022778785, + "language_loss": 0.85118818, + "learning_rate": 0.0006665460798976027, + "loss": 0.86163139, + "num_input_tokens_seen": 177528224, + "router_z_loss_mlp": 0.42919922, + "step": 2132, + "time_per_iteration": 2.7143847942352295 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046552, + "balance_loss_mlp": 1.00370777, + "epoch": 0.4103501346671797, + "flos": 511446100992.0, + "grad_norm": 0.02874706903740214, + "language_loss": 0.82064044, + "learning_rate": 0.0006662522961267947, + "loss": 0.83110595, + "num_input_tokens_seen": 177598176, + "router_z_loss_mlp": 0.42895508, + "step": 2133, + "time_per_iteration": 2.683544635772705 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104504, + "balance_loss_mlp": 1.00212467, + "epoch": 0.41054251635244327, + "flos": 550927713024.0, + "grad_norm": 0.027003210560574007, + "language_loss": 0.87900901, + "learning_rate": 0.0006659584478111211, + "loss": 0.88945937, + "num_input_tokens_seen": 177675840, + "router_z_loss_mlp": 0.4296875, + "step": 2134, + "time_per_iteration": 2.781217336654663 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104529, + "balance_loss_mlp": 1.00254142, + "epoch": 0.4107348980377068, + "flos": 841299734784.0, + "grad_norm": 0.03651700728131785, + "language_loss": 0.83066756, + "learning_rate": 0.000665664535064664, + "loss": 0.84112048, + "num_input_tokens_seen": 177751376, + "router_z_loss_mlp": 0.42797852, + "step": 2135, + "time_per_iteration": 3.067751169204712 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104661, + "balance_loss_mlp": 1.00390935, + "epoch": 0.4109272797229704, + "flos": 504764089344.0, + "grad_norm": 0.03160666135819327, + "language_loss": 0.83225, + "learning_rate": 0.0006653705580015303, + "loss": 0.84271616, + "num_input_tokens_seen": 177825264, + "router_z_loss_mlp": 0.42749023, + "step": 2136, + "time_per_iteration": 2.6899030208587646 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048088, + "balance_loss_mlp": 1.00521994, + "epoch": 0.4111196614082339, + "flos": 612024284928.0, + "grad_norm": 0.02957451828286975, + "language_loss": 0.87109792, + "learning_rate": 0.0006650765167358523, + "loss": 0.8815788, + "num_input_tokens_seen": 177901680, + "router_z_loss_mlp": 0.42919922, + "step": 2137, + "time_per_iteration": 2.8179140090942383 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048701, + "balance_loss_mlp": 1.00590456, + "epoch": 0.4113120430934975, + "flos": 454104639744.0, + "grad_norm": 0.033800673848535426, + "language_loss": 0.91012341, + "learning_rate": 0.0006647824113817864, + "loss": 0.92061043, + "num_input_tokens_seen": 177965264, + "router_z_loss_mlp": 0.4284668, + "step": 2138, + "time_per_iteration": 2.5263419151306152 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049295, + "balance_loss_mlp": 1.00635624, + "epoch": 0.41150442477876104, + "flos": 542710802688.0, + "grad_norm": 0.028316546184043286, + "language_loss": 0.818874, + "learning_rate": 0.000664488242053515, + "loss": 0.82936704, + "num_input_tokens_seen": 178039712, + "router_z_loss_mlp": 0.42993164, + "step": 2139, + "time_per_iteration": 2.770169258117676 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046579, + "balance_loss_mlp": 1.0037353, + "epoch": 0.4116968064640246, + "flos": 577392332544.0, + "grad_norm": 0.027329597632332964, + "language_loss": 0.84529692, + "learning_rate": 0.0006641940088652445, + "loss": 0.8557626, + "num_input_tokens_seen": 178114080, + "router_z_loss_mlp": 0.42895508, + "step": 2140, + "time_per_iteration": 2.761660575866699 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046986, + "balance_loss_mlp": 1.00416613, + "epoch": 0.4118891881492882, + "flos": 497150833920.0, + "grad_norm": 0.03165424709394261, + "language_loss": 0.82833397, + "learning_rate": 0.0006638997119312065, + "loss": 0.83880383, + "num_input_tokens_seen": 178188032, + "router_z_loss_mlp": 0.42871094, + "step": 2141, + "time_per_iteration": 2.6978652477264404 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071571, + "balance_loss_mlp": 1.02980042, + "epoch": 0.41208156983455174, + "flos": 1541573425152.0, + "grad_norm": 0.013007961614308571, + "language_loss": 0.75063306, + "learning_rate": 0.0006636053513656568, + "loss": 0.76134878, + "num_input_tokens_seen": 178395328, + "router_z_loss_mlp": 0.41796875, + "step": 2142, + "time_per_iteration": 4.915013551712036 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048885, + "balance_loss_mlp": 1.00611305, + "epoch": 0.41227395151981533, + "flos": 586058340864.0, + "grad_norm": 0.033991757131589403, + "language_loss": 0.85150123, + "learning_rate": 0.000663310927282877, + "loss": 0.86199009, + "num_input_tokens_seen": 178471952, + "router_z_loss_mlp": 0.42822266, + "step": 2143, + "time_per_iteration": 2.7552297115325928 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049148, + "balance_loss_mlp": 1.00635242, + "epoch": 0.41246633320507886, + "flos": 443893039104.0, + "grad_norm": 0.031026250164357557, + "language_loss": 0.8627826, + "learning_rate": 0.000663016439797172, + "loss": 0.87327409, + "num_input_tokens_seen": 178542192, + "router_z_loss_mlp": 0.4284668, + "step": 2144, + "time_per_iteration": 2.627795934677124 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048727, + "balance_loss_mlp": 1.00593042, + "epoch": 0.41265871489034245, + "flos": 581095920384.0, + "grad_norm": 0.032902127624834396, + "language_loss": 0.81700695, + "learning_rate": 0.0006627218890228724, + "loss": 0.82749426, + "num_input_tokens_seen": 178622736, + "router_z_loss_mlp": 0.4284668, + "step": 2145, + "time_per_iteration": 2.7726335525512695 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051442, + "balance_loss_mlp": 1.00852692, + "epoch": 0.412851096575606, + "flos": 762529951488.0, + "grad_norm": 0.03700396426728773, + "language_loss": 0.8427214, + "learning_rate": 0.0006624272750743326, + "loss": 0.85323578, + "num_input_tokens_seen": 178705808, + "router_z_loss_mlp": 0.4296875, + "step": 2146, + "time_per_iteration": 3.047786235809326 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051555, + "balance_loss_mlp": 1.00854468, + "epoch": 0.41304347826086957, + "flos": 556521978624.0, + "grad_norm": 0.0279029176228374, + "language_loss": 0.83148611, + "learning_rate": 0.0006621325980659322, + "loss": 0.84200168, + "num_input_tokens_seen": 178781200, + "router_z_loss_mlp": 0.43066406, + "step": 2147, + "time_per_iteration": 2.7805261611938477 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105086, + "balance_loss_mlp": 1.00796807, + "epoch": 0.41323585994613315, + "flos": 666894746112.0, + "grad_norm": 0.03289726182172815, + "language_loss": 0.82395911, + "learning_rate": 0.000661837858112075, + "loss": 0.83446777, + "num_input_tokens_seen": 178855072, + "router_z_loss_mlp": 0.42944336, + "step": 2148, + "time_per_iteration": 2.8236329555511475 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044478, + "balance_loss_mlp": 1.00153887, + "epoch": 0.4134282416313967, + "flos": 549785531904.0, + "grad_norm": 0.03194652549549522, + "language_loss": 0.89158356, + "learning_rate": 0.0006615430553271888, + "loss": 0.90202832, + "num_input_tokens_seen": 178927936, + "router_z_loss_mlp": 0.42993164, + "step": 2149, + "time_per_iteration": 2.7931926250457764 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043808, + "balance_loss_mlp": 1.00101149, + "epoch": 0.4136206233166603, + "flos": 647513604096.0, + "grad_norm": 0.02946183128139913, + "language_loss": 0.8604427, + "learning_rate": 0.0006612481898257264, + "loss": 0.87088078, + "num_input_tokens_seen": 179007792, + "router_z_loss_mlp": 0.4284668, + "step": 2150, + "time_per_iteration": 2.853116512298584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045658, + "balance_loss_mlp": 1.00279057, + "epoch": 0.4138130050019238, + "flos": 518364327936.0, + "grad_norm": 0.034556300996824205, + "language_loss": 0.85756087, + "learning_rate": 0.000660953261722165, + "loss": 0.86801755, + "num_input_tokens_seen": 179075200, + "router_z_loss_mlp": 0.42919922, + "step": 2151, + "time_per_iteration": 2.5899548530578613 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048523, + "balance_loss_mlp": 1.00575054, + "epoch": 0.4140053866871874, + "flos": 610369822464.0, + "grad_norm": 0.032804683798420206, + "language_loss": 0.83155799, + "learning_rate": 0.0006606582711310055, + "loss": 0.84204322, + "num_input_tokens_seen": 179144448, + "router_z_loss_mlp": 0.42822266, + "step": 2152, + "time_per_iteration": 2.7591912746429443 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045382, + "balance_loss_mlp": 1.00258613, + "epoch": 0.4141977683724509, + "flos": 580846099200.0, + "grad_norm": 0.031179869336458114, + "language_loss": 0.84146237, + "learning_rate": 0.0006603632181667736, + "loss": 0.85191619, + "num_input_tokens_seen": 179215776, + "router_z_loss_mlp": 0.4284668, + "step": 2153, + "time_per_iteration": 2.661051034927368 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045906, + "balance_loss_mlp": 1.00470734, + "epoch": 0.4143901500577145, + "flos": 1310178863616.0, + "grad_norm": 0.005957353398288201, + "language_loss": 0.78943324, + "learning_rate": 0.0006600681029440187, + "loss": 0.79989231, + "num_input_tokens_seen": 179436688, + "router_z_loss_mlp": 0.41210938, + "step": 2154, + "time_per_iteration": 4.908870458602905 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046845, + "balance_loss_mlp": 1.00416827, + "epoch": 0.41458253174297804, + "flos": 461122988544.0, + "grad_norm": 0.03503771604154275, + "language_loss": 0.82412434, + "learning_rate": 0.0006597729255773153, + "loss": 0.83459282, + "num_input_tokens_seen": 179503264, + "router_z_loss_mlp": 0.42724609, + "step": 2155, + "time_per_iteration": 2.51566481590271 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048208, + "balance_loss_mlp": 1.00531614, + "epoch": 0.41477491342824163, + "flos": 554439805440.0, + "grad_norm": 0.033219020360443, + "language_loss": 0.82733047, + "learning_rate": 0.0006594776861812608, + "loss": 0.83781254, + "num_input_tokens_seen": 179574864, + "router_z_loss_mlp": 0.42944336, + "step": 2156, + "time_per_iteration": 2.7139203548431396 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047832, + "balance_loss_mlp": 1.00501156, + "epoch": 0.4149672951135052, + "flos": 699086800896.0, + "grad_norm": 0.029687792529517126, + "language_loss": 0.87240821, + "learning_rate": 0.0006591823848704776, + "loss": 0.88288647, + "num_input_tokens_seen": 179658208, + "router_z_loss_mlp": 0.42871094, + "step": 2157, + "time_per_iteration": 2.950136661529541 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104915, + "balance_loss_mlp": 1.00647271, + "epoch": 0.41515967679876875, + "flos": 566837591808.0, + "grad_norm": 0.02753963183350331, + "language_loss": 0.82045114, + "learning_rate": 0.0006588870217596117, + "loss": 0.83094263, + "num_input_tokens_seen": 179732320, + "router_z_loss_mlp": 0.42724609, + "step": 2158, + "time_per_iteration": 2.742954730987549 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047178, + "balance_loss_mlp": 1.00440574, + "epoch": 0.41535205848403234, + "flos": 502178383104.0, + "grad_norm": 0.03782519840746282, + "language_loss": 0.86309534, + "learning_rate": 0.0006585915969633334, + "loss": 0.8735671, + "num_input_tokens_seen": 179801616, + "router_z_loss_mlp": 0.42822266, + "step": 2159, + "time_per_iteration": 2.6314492225646973 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048815, + "balance_loss_mlp": 1.00599504, + "epoch": 0.41554444016929587, + "flos": 608702721024.0, + "grad_norm": 0.03160589415450587, + "language_loss": 0.8965854, + "learning_rate": 0.0006582961105963366, + "loss": 0.90707356, + "num_input_tokens_seen": 179876112, + "router_z_loss_mlp": 0.42871094, + "step": 2160, + "time_per_iteration": 2.779524564743042 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052466, + "balance_loss_mlp": 1.0094316, + "epoch": 0.41573682185455946, + "flos": 530156514048.0, + "grad_norm": 0.0316987683946157, + "language_loss": 0.78011453, + "learning_rate": 0.0006580005627733395, + "loss": 0.79063922, + "num_input_tokens_seen": 179949936, + "router_z_loss_mlp": 0.4309082, + "step": 2161, + "time_per_iteration": 2.655961275100708 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053795, + "balance_loss_mlp": 1.01095116, + "epoch": 0.415929203539823, + "flos": 506038473216.0, + "grad_norm": 0.030200496407476712, + "language_loss": 0.82344484, + "learning_rate": 0.0006577049536090838, + "loss": 0.83398283, + "num_input_tokens_seen": 180023184, + "router_z_loss_mlp": 0.42895508, + "step": 2162, + "time_per_iteration": 2.734727144241333 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048283, + "balance_loss_mlp": 1.00536776, + "epoch": 0.4161215852250866, + "flos": 583824523008.0, + "grad_norm": 0.03528478058898885, + "language_loss": 0.86106777, + "learning_rate": 0.000657409283218335, + "loss": 0.87155068, + "num_input_tokens_seen": 180091728, + "router_z_loss_mlp": 0.4296875, + "step": 2163, + "time_per_iteration": 2.659733533859253 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051194, + "balance_loss_mlp": 1.00844538, + "epoch": 0.4163139669103501, + "flos": 491760702720.0, + "grad_norm": 0.03176725688202085, + "language_loss": 0.81183624, + "learning_rate": 0.0006571135517158829, + "loss": 0.82234824, + "num_input_tokens_seen": 180162096, + "router_z_loss_mlp": 0.42797852, + "step": 2164, + "time_per_iteration": 2.639364004135132 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104361, + "balance_loss_mlp": 1.00241089, + "epoch": 0.4165063485956137, + "flos": 1291023243264.0, + "grad_norm": 0.009317160244550511, + "language_loss": 0.76764059, + "learning_rate": 0.0006568177592165404, + "loss": 0.77807671, + "num_input_tokens_seen": 180380912, + "router_z_loss_mlp": 0.41210938, + "step": 2165, + "time_per_iteration": 4.755609750747681 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048778, + "balance_loss_mlp": 1.00600576, + "epoch": 0.4166987302808773, + "flos": 496258473984.0, + "grad_norm": 0.03907979296448248, + "language_loss": 0.83556676, + "learning_rate": 0.0006565219058351444, + "loss": 0.84605455, + "num_input_tokens_seen": 180447424, + "router_z_loss_mlp": 0.42822266, + "step": 2166, + "time_per_iteration": 2.549835443496704 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043838, + "balance_loss_mlp": 1.00087476, + "epoch": 0.4168911119661408, + "flos": 465067649280.0, + "grad_norm": 0.0316582334519174, + "language_loss": 0.83126116, + "learning_rate": 0.0006562259916865553, + "loss": 0.8416996, + "num_input_tokens_seen": 180516336, + "router_z_loss_mlp": 0.43017578, + "step": 2167, + "time_per_iteration": 2.577807664871216 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045446, + "balance_loss_mlp": 1.00253069, + "epoch": 0.4170834936514044, + "flos": 537943768320.0, + "grad_norm": 0.03263228805326442, + "language_loss": 0.79910517, + "learning_rate": 0.0006559300168856573, + "loss": 0.8095597, + "num_input_tokens_seen": 180589824, + "router_z_loss_mlp": 0.4296875, + "step": 2168, + "time_per_iteration": 2.716322898864746 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051109, + "balance_loss_mlp": 1.00819373, + "epoch": 0.41727587533666793, + "flos": 551750086656.0, + "grad_norm": 0.029704951266317694, + "language_loss": 0.86753178, + "learning_rate": 0.0006556339815473577, + "loss": 0.87804294, + "num_input_tokens_seen": 180661296, + "router_z_loss_mlp": 0.4296875, + "step": 2169, + "time_per_iteration": 2.627387762069702 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044935, + "balance_loss_mlp": 1.00204313, + "epoch": 0.4174682570219315, + "flos": 632378466816.0, + "grad_norm": 0.03018462927838879, + "language_loss": 0.86615288, + "learning_rate": 0.000655337885786588, + "loss": 0.87660229, + "num_input_tokens_seen": 180744896, + "router_z_loss_mlp": 0.42944336, + "step": 2170, + "time_per_iteration": 2.8836913108825684 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045073, + "balance_loss_mlp": 1.00211012, + "epoch": 0.41766063870719505, + "flos": 520756593408.0, + "grad_norm": 0.03274558076895909, + "language_loss": 0.85911119, + "learning_rate": 0.0006550417297183025, + "loss": 0.86956197, + "num_input_tokens_seen": 180813008, + "router_z_loss_mlp": 0.43017578, + "step": 2171, + "time_per_iteration": 2.6085855960845947 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054021, + "balance_loss_mlp": 1.0111295, + "epoch": 0.41785302039245864, + "flos": 559055195136.0, + "grad_norm": 0.03215226267597247, + "language_loss": 0.82142568, + "learning_rate": 0.0006547455134574793, + "loss": 0.83196592, + "num_input_tokens_seen": 180886480, + "router_z_loss_mlp": 0.42944336, + "step": 2172, + "time_per_iteration": 2.7207438945770264 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048636, + "balance_loss_mlp": 1.0057919, + "epoch": 0.41804540207772223, + "flos": 790028848896.0, + "grad_norm": 0.03152263917705172, + "language_loss": 0.84573895, + "learning_rate": 0.0006544492371191198, + "loss": 0.85622525, + "num_input_tokens_seen": 180973776, + "router_z_loss_mlp": 0.42895508, + "step": 2173, + "time_per_iteration": 3.1091549396514893 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050974, + "balance_loss_mlp": 1.00791526, + "epoch": 0.41823778376298576, + "flos": 905891869440.0, + "grad_norm": 0.03158772894298815, + "language_loss": 0.83616948, + "learning_rate": 0.0006541529008182485, + "loss": 0.84667921, + "num_input_tokens_seen": 181062768, + "router_z_loss_mlp": 0.43115234, + "step": 2174, + "time_per_iteration": 3.1934547424316406 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050617, + "balance_loss_mlp": 1.0074867, + "epoch": 0.41843016544824935, + "flos": 512574676224.0, + "grad_norm": 0.036197783568866736, + "language_loss": 0.87799633, + "learning_rate": 0.0006538565046699136, + "loss": 0.88850248, + "num_input_tokens_seen": 181129872, + "router_z_loss_mlp": 0.43188477, + "step": 2175, + "time_per_iteration": 2.6156668663024902 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043414, + "balance_loss_mlp": 1.00047445, + "epoch": 0.4186225471335129, + "flos": 654290880000.0, + "grad_norm": 0.03486733903162065, + "language_loss": 0.81864989, + "learning_rate": 0.0006535600487891862, + "loss": 0.82908404, + "num_input_tokens_seen": 181208112, + "router_z_loss_mlp": 0.42993164, + "step": 2176, + "time_per_iteration": 2.7715044021606445 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050534, + "balance_loss_mlp": 1.00778568, + "epoch": 0.41881492881877647, + "flos": 570226229760.0, + "grad_norm": 0.03182850960977162, + "language_loss": 0.89874047, + "learning_rate": 0.0006532635332911603, + "loss": 0.90924585, + "num_input_tokens_seen": 181278736, + "router_z_loss_mlp": 0.42797852, + "step": 2177, + "time_per_iteration": 2.714635133743286 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046295, + "balance_loss_mlp": 1.00352275, + "epoch": 0.41900731050404, + "flos": 913485682944.0, + "grad_norm": 0.031061931256926825, + "language_loss": 0.81313407, + "learning_rate": 0.0006529669582909541, + "loss": 0.82359695, + "num_input_tokens_seen": 181362512, + "router_z_loss_mlp": 0.42822266, + "step": 2178, + "time_per_iteration": 3.2592601776123047 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052105, + "balance_loss_mlp": 1.00923753, + "epoch": 0.4191996921893036, + "flos": 536784090624.0, + "grad_norm": 0.03590517964257674, + "language_loss": 0.86468148, + "learning_rate": 0.0006526703239037077, + "loss": 0.87520254, + "num_input_tokens_seen": 181432080, + "router_z_loss_mlp": 0.42919922, + "step": 2179, + "time_per_iteration": 2.6636452674865723 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045539, + "balance_loss_mlp": 1.00259995, + "epoch": 0.4193920738745671, + "flos": 583731204096.0, + "grad_norm": 0.030716470700417473, + "language_loss": 0.86737585, + "learning_rate": 0.0006523736302445851, + "loss": 0.87783122, + "num_input_tokens_seen": 181507296, + "router_z_loss_mlp": 0.42993164, + "step": 2180, + "time_per_iteration": 2.801374673843384 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048274, + "balance_loss_mlp": 1.00535846, + "epoch": 0.4195844555598307, + "flos": 1337802205440.0, + "grad_norm": 0.03692120158624074, + "language_loss": 0.77735525, + "learning_rate": 0.0006520768774287728, + "loss": 0.78783798, + "num_input_tokens_seen": 181599408, + "router_z_loss_mlp": 0.4296875, + "step": 2181, + "time_per_iteration": 3.781163454055786 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048528, + "balance_loss_mlp": 1.00568438, + "epoch": 0.4197768372450943, + "flos": 599997828864.0, + "grad_norm": 0.02986751846873145, + "language_loss": 0.85868645, + "learning_rate": 0.0006517800655714806, + "loss": 0.86917174, + "num_input_tokens_seen": 181674944, + "router_z_loss_mlp": 0.42895508, + "step": 2182, + "time_per_iteration": 2.8340775966644287 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047266, + "balance_loss_mlp": 1.00454116, + "epoch": 0.4199692189303578, + "flos": 736597055232.0, + "grad_norm": 0.031915917751050384, + "language_loss": 0.8544265, + "learning_rate": 0.0006514831947879407, + "loss": 0.86489916, + "num_input_tokens_seen": 181756704, + "router_z_loss_mlp": 0.42773438, + "step": 2183, + "time_per_iteration": 2.943141460418701 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048279, + "balance_loss_mlp": 1.005602, + "epoch": 0.4201616006156214, + "flos": 751663173120.0, + "grad_norm": 0.03318909585917556, + "language_loss": 0.78676963, + "learning_rate": 0.0006511862651934091, + "loss": 0.79725242, + "num_input_tokens_seen": 181837952, + "router_z_loss_mlp": 0.42724609, + "step": 2184, + "time_per_iteration": 3.0779521465301514 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049999, + "balance_loss_mlp": 1.00713122, + "epoch": 0.42035398230088494, + "flos": 548092185600.0, + "grad_norm": 0.030200903128349884, + "language_loss": 0.82675183, + "learning_rate": 0.0006508892769031638, + "loss": 0.83725178, + "num_input_tokens_seen": 181906896, + "router_z_loss_mlp": 0.42919922, + "step": 2185, + "time_per_iteration": 2.6862621307373047 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052696, + "balance_loss_mlp": 1.0098995, + "epoch": 0.42054636398614853, + "flos": 618048206592.0, + "grad_norm": 0.035053166321698394, + "language_loss": 0.87309551, + "learning_rate": 0.000650592230032506, + "loss": 0.88362241, + "num_input_tokens_seen": 181974976, + "router_z_loss_mlp": 0.4284668, + "step": 2186, + "time_per_iteration": 2.7250919342041016 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051072, + "balance_loss_mlp": 1.00813246, + "epoch": 0.42073874567141206, + "flos": 641667571968.0, + "grad_norm": 0.033545410607481084, + "language_loss": 0.85750729, + "learning_rate": 0.0006502951246967595, + "loss": 0.86801797, + "num_input_tokens_seen": 182054704, + "router_z_loss_mlp": 0.42993164, + "step": 2187, + "time_per_iteration": 2.8897902965545654 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051911, + "balance_loss_mlp": 1.00911534, + "epoch": 0.42093112735667565, + "flos": 494823697152.0, + "grad_norm": 0.02963421973388752, + "language_loss": 0.87416923, + "learning_rate": 0.0006499979610112706, + "loss": 0.88468838, + "num_input_tokens_seen": 182129696, + "router_z_loss_mlp": 0.4284668, + "step": 2188, + "time_per_iteration": 2.690762519836426 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044871, + "balance_loss_mlp": 1.00219369, + "epoch": 0.4211235090419392, + "flos": 543437912064.0, + "grad_norm": 0.03405892185917734, + "language_loss": 0.84498167, + "learning_rate": 0.000649700739091409, + "loss": 0.85543036, + "num_input_tokens_seen": 182203792, + "router_z_loss_mlp": 0.42724609, + "step": 2189, + "time_per_iteration": 2.7150561809539795 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050289, + "balance_loss_mlp": 1.00918579, + "epoch": 0.42131589072720277, + "flos": 1535391055872.0, + "grad_norm": 0.006162303642849888, + "language_loss": 0.73836273, + "learning_rate": 0.0006494034590525657, + "loss": 0.7488656, + "num_input_tokens_seen": 182432080, + "router_z_loss_mlp": 0.41113281, + "step": 2190, + "time_per_iteration": 4.829074382781982 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047388, + "balance_loss_mlp": 1.00466371, + "epoch": 0.42150827241246636, + "flos": 567936031488.0, + "grad_norm": 0.029782751851152003, + "language_loss": 0.85824835, + "learning_rate": 0.0006491061210101557, + "loss": 0.8687222, + "num_input_tokens_seen": 182500256, + "router_z_loss_mlp": 0.42773438, + "step": 2191, + "time_per_iteration": 2.7018613815307617 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044672, + "balance_loss_mlp": 1.00197124, + "epoch": 0.4217006540977299, + "flos": 708842500608.0, + "grad_norm": 0.03166528206992478, + "language_loss": 0.84430063, + "learning_rate": 0.0006488087250796157, + "loss": 0.85474735, + "num_input_tokens_seen": 182582912, + "router_z_loss_mlp": 0.42749023, + "step": 2192, + "time_per_iteration": 2.907424211502075 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045117, + "balance_loss_mlp": 1.00236881, + "epoch": 0.4218930357829935, + "flos": 628562118144.0, + "grad_norm": 0.02920565844268777, + "language_loss": 0.82024074, + "learning_rate": 0.0006485112713764049, + "loss": 0.83069193, + "num_input_tokens_seen": 182670304, + "router_z_loss_mlp": 0.42797852, + "step": 2193, + "time_per_iteration": 2.9393887519836426 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047298, + "balance_loss_mlp": 1.00435925, + "epoch": 0.422085417468257, + "flos": 461290184448.0, + "grad_norm": 0.02925244938415649, + "language_loss": 0.84264457, + "learning_rate": 0.0006482137600160051, + "loss": 0.85311759, + "num_input_tokens_seen": 182735024, + "router_z_loss_mlp": 0.42993164, + "step": 2194, + "time_per_iteration": 2.549301862716675 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050742, + "balance_loss_mlp": 1.00780332, + "epoch": 0.4222777991535206, + "flos": 474981796608.0, + "grad_norm": 0.030629871462955913, + "language_loss": 0.85158336, + "learning_rate": 0.0006479161911139206, + "loss": 0.86209077, + "num_input_tokens_seen": 182805024, + "router_z_loss_mlp": 0.42993164, + "step": 2195, + "time_per_iteration": 2.6384336948394775 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105418, + "balance_loss_mlp": 1.01116967, + "epoch": 0.4224701808387841, + "flos": 471844925184.0, + "grad_norm": 0.03651823295441523, + "language_loss": 0.8580153, + "learning_rate": 0.0006476185647856778, + "loss": 0.8685571, + "num_input_tokens_seen": 182871360, + "router_z_loss_mlp": 0.43066406, + "step": 2196, + "time_per_iteration": 2.61171817779541 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050335, + "balance_loss_mlp": 1.00737166, + "epoch": 0.4226625625240477, + "flos": 678823992576.0, + "grad_norm": 0.03269819945270571, + "language_loss": 0.81914455, + "learning_rate": 0.0006473208811468255, + "loss": 0.8296479, + "num_input_tokens_seen": 182952912, + "router_z_loss_mlp": 0.43017578, + "step": 2197, + "time_per_iteration": 2.892245292663574 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049097, + "balance_loss_mlp": 1.00611031, + "epoch": 0.4228549442093113, + "flos": 504559954944.0, + "grad_norm": 0.030930986611316814, + "language_loss": 0.84766257, + "learning_rate": 0.0006470231403129347, + "loss": 0.85815352, + "num_input_tokens_seen": 183022016, + "router_z_loss_mlp": 0.43041992, + "step": 2198, + "time_per_iteration": 2.64943265914917 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104433, + "balance_loss_mlp": 1.00119996, + "epoch": 0.42304732589457483, + "flos": 613075092480.0, + "grad_norm": 0.027263393707605364, + "language_loss": 0.81978631, + "learning_rate": 0.0006467253423995988, + "loss": 0.83022958, + "num_input_tokens_seen": 183101776, + "router_z_loss_mlp": 0.43188477, + "step": 2199, + "time_per_iteration": 2.8850364685058594 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048401, + "balance_loss_mlp": 1.00527155, + "epoch": 0.4232397075798384, + "flos": 516649594368.0, + "grad_norm": 0.03785502815659436, + "language_loss": 0.79452145, + "learning_rate": 0.000646427487522433, + "loss": 0.80500549, + "num_input_tokens_seen": 183171392, + "router_z_loss_mlp": 0.43188477, + "step": 2200, + "time_per_iteration": 2.694916009902954 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050341, + "balance_loss_mlp": 1.00713968, + "epoch": 0.42343208926510195, + "flos": 590934245376.0, + "grad_norm": 0.030735047123199966, + "language_loss": 0.83900952, + "learning_rate": 0.0006461295757970749, + "loss": 0.84951293, + "num_input_tokens_seen": 183253936, + "router_z_loss_mlp": 0.43261719, + "step": 2201, + "time_per_iteration": 2.835726737976074 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046215, + "balance_loss_mlp": 1.00320446, + "epoch": 0.42362447095036554, + "flos": 641819216640.0, + "grad_norm": 0.03465447846020762, + "language_loss": 0.82287079, + "learning_rate": 0.0006458316073391839, + "loss": 0.83333296, + "num_input_tokens_seen": 183333744, + "router_z_loss_mlp": 0.43066406, + "step": 2202, + "time_per_iteration": 2.8503153324127197 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045543, + "balance_loss_mlp": 1.00241327, + "epoch": 0.42381685263562907, + "flos": 513718802688.0, + "grad_norm": 0.030503622319833546, + "language_loss": 0.88278598, + "learning_rate": 0.0006455335822644422, + "loss": 0.89324141, + "num_input_tokens_seen": 183401904, + "router_z_loss_mlp": 0.43188477, + "step": 2203, + "time_per_iteration": 2.6294915676116943 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050025, + "balance_loss_mlp": 1.00689554, + "epoch": 0.42400923432089266, + "flos": 547822922496.0, + "grad_norm": 0.03601428124518316, + "language_loss": 0.78504658, + "learning_rate": 0.0006452355006885527, + "loss": 0.79554689, + "num_input_tokens_seen": 183471312, + "router_z_loss_mlp": 0.43188477, + "step": 2204, + "time_per_iteration": 2.7194669246673584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050575, + "balance_loss_mlp": 1.00756454, + "epoch": 0.4242016160061562, + "flos": 623288638464.0, + "grad_norm": 0.038292152226624715, + "language_loss": 0.88211453, + "learning_rate": 0.0006449373627272412, + "loss": 0.89262021, + "num_input_tokens_seen": 183539184, + "router_z_loss_mlp": 0.43066406, + "step": 2205, + "time_per_iteration": 2.760643243789673 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048111, + "balance_loss_mlp": 1.00495708, + "epoch": 0.4243939976914198, + "flos": 572972328960.0, + "grad_norm": 0.03657249930928273, + "language_loss": 0.83085704, + "learning_rate": 0.0006446391684962553, + "loss": 0.84133816, + "num_input_tokens_seen": 183607504, + "router_z_loss_mlp": 0.43212891, + "step": 2206, + "time_per_iteration": 2.656205892562866 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050624, + "balance_loss_mlp": 1.00766063, + "epoch": 0.42458637937668336, + "flos": 449665194240.0, + "grad_norm": 0.03531472123955245, + "language_loss": 0.83588743, + "learning_rate": 0.000644340918111364, + "loss": 0.84639364, + "num_input_tokens_seen": 183674720, + "router_z_loss_mlp": 0.43017578, + "step": 2207, + "time_per_iteration": 2.563599109649658 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047594, + "balance_loss_mlp": 1.00460744, + "epoch": 0.4247787610619469, + "flos": 436336164096.0, + "grad_norm": 0.035922125926704504, + "language_loss": 0.8567791, + "learning_rate": 0.0006440426116883585, + "loss": 0.86725497, + "num_input_tokens_seen": 183740448, + "router_z_loss_mlp": 0.43041992, + "step": 2208, + "time_per_iteration": 2.5554726123809814 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050276, + "balance_loss_mlp": 1.00743186, + "epoch": 0.4249711427472105, + "flos": 497122643712.0, + "grad_norm": 0.02878008588010938, + "language_loss": 0.86522639, + "learning_rate": 0.0006437442493430519, + "loss": 0.87572914, + "num_input_tokens_seen": 183812640, + "router_z_loss_mlp": 0.42895508, + "step": 2209, + "time_per_iteration": 2.698664426803589 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046212, + "balance_loss_mlp": 1.00334466, + "epoch": 0.425163524432474, + "flos": 657108910848.0, + "grad_norm": 0.03332162137783894, + "language_loss": 0.87084454, + "learning_rate": 0.000643445831191278, + "loss": 0.88130671, + "num_input_tokens_seen": 183895312, + "router_z_loss_mlp": 0.42919922, + "step": 2210, + "time_per_iteration": 2.919759750366211 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104789, + "balance_loss_mlp": 1.00526094, + "epoch": 0.4253559061177376, + "flos": 651779050752.0, + "grad_norm": 0.0360276634161647, + "language_loss": 0.82163692, + "learning_rate": 0.0006431473573488937, + "loss": 0.83211577, + "num_input_tokens_seen": 183966384, + "router_z_loss_mlp": 0.42675781, + "step": 2211, + "time_per_iteration": 2.7520995140075684 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051038, + "balance_loss_mlp": 1.00836086, + "epoch": 0.42554828780300114, + "flos": 555203853312.0, + "grad_norm": 0.03839138543396186, + "language_loss": 0.85743141, + "learning_rate": 0.0006428488279317765, + "loss": 0.86794186, + "num_input_tokens_seen": 184031728, + "router_z_loss_mlp": 0.42724609, + "step": 2212, + "time_per_iteration": 2.6509060859680176 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046098, + "balance_loss_mlp": 1.00356376, + "epoch": 0.4257406694882647, + "flos": 515422842624.0, + "grad_norm": 0.03572196481521071, + "language_loss": 0.88174772, + "learning_rate": 0.0006425502430558259, + "loss": 0.89220864, + "num_input_tokens_seen": 184096160, + "router_z_loss_mlp": 0.42578125, + "step": 2213, + "time_per_iteration": 2.6220855712890625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104908, + "balance_loss_mlp": 1.00623667, + "epoch": 0.42593305117352825, + "flos": 516705974784.0, + "grad_norm": 0.03258136107598633, + "language_loss": 0.85395515, + "learning_rate": 0.0006422516028369628, + "loss": 0.86444604, + "num_input_tokens_seen": 184169664, + "router_z_loss_mlp": 0.42895508, + "step": 2214, + "time_per_iteration": 2.6463093757629395 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043634, + "balance_loss_mlp": 1.00069499, + "epoch": 0.42612543285879184, + "flos": 589238953728.0, + "grad_norm": 0.0291937048711678, + "language_loss": 0.83896095, + "learning_rate": 0.0006419529073911296, + "loss": 0.8493973, + "num_input_tokens_seen": 184249152, + "router_z_loss_mlp": 0.42993164, + "step": 2215, + "time_per_iteration": 2.910792112350464 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048088, + "balance_loss_mlp": 1.0052923, + "epoch": 0.42631781454405543, + "flos": 636752783616.0, + "grad_norm": 0.03192715722055512, + "language_loss": 0.86142385, + "learning_rate": 0.0006416541568342901, + "loss": 0.87190473, + "num_input_tokens_seen": 184326816, + "router_z_loss_mlp": 0.4284668, + "step": 2216, + "time_per_iteration": 2.846374750137329 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046508, + "balance_loss_mlp": 1.00366437, + "epoch": 0.42651019622931896, + "flos": 542246153472.0, + "grad_norm": 0.029068811164029314, + "language_loss": 0.84547782, + "learning_rate": 0.0006413553512824297, + "loss": 0.8559429, + "num_input_tokens_seen": 184404336, + "router_z_loss_mlp": 0.42895508, + "step": 2217, + "time_per_iteration": 2.7738640308380127 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047446, + "balance_loss_mlp": 1.00467396, + "epoch": 0.42670257791458255, + "flos": 559224336384.0, + "grad_norm": 0.03125487953761627, + "language_loss": 0.85257965, + "learning_rate": 0.0006410564908515549, + "loss": 0.86305416, + "num_input_tokens_seen": 184472320, + "router_z_loss_mlp": 0.42822266, + "step": 2218, + "time_per_iteration": 2.654423713684082 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050321, + "balance_loss_mlp": 1.00757229, + "epoch": 0.4268949595998461, + "flos": 622450713600.0, + "grad_norm": 0.03350458888486861, + "language_loss": 0.85655409, + "learning_rate": 0.0006407575756576935, + "loss": 0.86705726, + "num_input_tokens_seen": 184544704, + "router_z_loss_mlp": 0.42797852, + "step": 2219, + "time_per_iteration": 2.7789905071258545 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047517, + "balance_loss_mlp": 1.00479233, + "epoch": 0.42708734128510967, + "flos": 539015963136.0, + "grad_norm": 0.029341516559542476, + "language_loss": 0.87978554, + "learning_rate": 0.0006404586058168951, + "loss": 0.8902607, + "num_input_tokens_seen": 184622544, + "router_z_loss_mlp": 0.42773438, + "step": 2220, + "time_per_iteration": 2.7526872158050537 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047218, + "balance_loss_mlp": 1.00456524, + "epoch": 0.4272797229703732, + "flos": 503862981120.0, + "grad_norm": 0.03177497968579407, + "language_loss": 0.87384629, + "learning_rate": 0.0006401595814452296, + "loss": 0.88431847, + "num_input_tokens_seen": 184692544, + "router_z_loss_mlp": 0.42700195, + "step": 2221, + "time_per_iteration": 2.620292901992798 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045405, + "balance_loss_mlp": 1.00282323, + "epoch": 0.4274721046556368, + "flos": 493438497792.0, + "grad_norm": 0.03138650703960668, + "language_loss": 0.81104958, + "learning_rate": 0.000639860502658789, + "loss": 0.82150364, + "num_input_tokens_seen": 184760480, + "router_z_loss_mlp": 0.42626953, + "step": 2222, + "time_per_iteration": 2.6335668563842773 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052846, + "balance_loss_mlp": 1.01007414, + "epoch": 0.4276644863409004, + "flos": 569462181888.0, + "grad_norm": 0.029337527326174825, + "language_loss": 0.84956491, + "learning_rate": 0.0006395613695736853, + "loss": 0.86009336, + "num_input_tokens_seen": 184834080, + "router_z_loss_mlp": 0.42822266, + "step": 2223, + "time_per_iteration": 2.69158935546875 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053205, + "balance_loss_mlp": 1.01059997, + "epoch": 0.4278568680261639, + "flos": 608563715328.0, + "grad_norm": 0.03527650476558936, + "language_loss": 0.8254534, + "learning_rate": 0.0006392621823060529, + "loss": 0.83598542, + "num_input_tokens_seen": 184905872, + "router_z_loss_mlp": 0.42651367, + "step": 2224, + "time_per_iteration": 2.7607972621917725 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042531, + "balance_loss_mlp": 0.99978256, + "epoch": 0.4280492497114275, + "flos": 561579663360.0, + "grad_norm": 0.03854840542263403, + "language_loss": 0.8576616, + "learning_rate": 0.0006389629409720465, + "loss": 0.86808693, + "num_input_tokens_seen": 184972320, + "router_z_loss_mlp": 0.42797852, + "step": 2225, + "time_per_iteration": 2.675492525100708 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046106, + "balance_loss_mlp": 1.00333333, + "epoch": 0.428241631396691, + "flos": 721902267648.0, + "grad_norm": 0.035169952304445494, + "language_loss": 0.89023572, + "learning_rate": 0.0006386636456878417, + "loss": 0.90069675, + "num_input_tokens_seen": 185051040, + "router_z_loss_mlp": 0.42822266, + "step": 2226, + "time_per_iteration": 2.8786110877990723 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046721, + "balance_loss_mlp": 1.00397301, + "epoch": 0.4284340130819546, + "flos": 430370568192.0, + "grad_norm": 0.04053005061098929, + "language_loss": 0.92206526, + "learning_rate": 0.0006383642965696353, + "loss": 0.93253243, + "num_input_tokens_seen": 185113552, + "router_z_loss_mlp": 0.42797852, + "step": 2227, + "time_per_iteration": 2.468848705291748 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048091, + "balance_loss_mlp": 1.00519955, + "epoch": 0.42862639476721814, + "flos": 526160330496.0, + "grad_norm": 0.0312355764309364, + "language_loss": 0.83643448, + "learning_rate": 0.000638064893733645, + "loss": 0.84691536, + "num_input_tokens_seen": 185185056, + "router_z_loss_mlp": 0.42944336, + "step": 2228, + "time_per_iteration": 2.7273313999176025 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048787, + "balance_loss_mlp": 1.0059433, + "epoch": 0.42881877645248173, + "flos": 466378971648.0, + "grad_norm": 0.033088247906643435, + "language_loss": 0.90412128, + "learning_rate": 0.000637765437296109, + "loss": 0.91460913, + "num_input_tokens_seen": 185257248, + "router_z_loss_mlp": 0.42895508, + "step": 2229, + "time_per_iteration": 2.6459994316101074 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104348, + "balance_loss_mlp": 1.00051713, + "epoch": 0.42901115813774526, + "flos": 561356087040.0, + "grad_norm": 0.033851055909267555, + "language_loss": 0.85812581, + "learning_rate": 0.000637465927373287, + "loss": 0.86856055, + "num_input_tokens_seen": 185324800, + "router_z_loss_mlp": 0.43017578, + "step": 2230, + "time_per_iteration": 2.6650984287261963 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051305, + "balance_loss_mlp": 1.00843728, + "epoch": 0.42920353982300885, + "flos": 562528403712.0, + "grad_norm": 0.03941473686966497, + "language_loss": 0.79439276, + "learning_rate": 0.000637166364081459, + "loss": 0.80490577, + "num_input_tokens_seen": 185393408, + "router_z_loss_mlp": 0.42919922, + "step": 2231, + "time_per_iteration": 2.6497089862823486 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045172, + "balance_loss_mlp": 1.00242341, + "epoch": 0.42939592150827244, + "flos": 557316162048.0, + "grad_norm": 0.0345529023969128, + "language_loss": 0.84757453, + "learning_rate": 0.0006368667475369256, + "loss": 0.85802627, + "num_input_tokens_seen": 185467968, + "router_z_loss_mlp": 0.42797852, + "step": 2232, + "time_per_iteration": 2.7934672832489014 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048443, + "balance_loss_mlp": 1.00753021, + "epoch": 0.42958830319353597, + "flos": 1524945185280.0, + "grad_norm": 0.006396251355867503, + "language_loss": 0.78527778, + "learning_rate": 0.0006365670778560084, + "loss": 0.79576218, + "num_input_tokens_seen": 185705232, + "router_z_loss_mlp": 0.40917969, + "step": 2233, + "time_per_iteration": 6.342620372772217 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045082, + "balance_loss_mlp": 1.0040741, + "epoch": 0.42978068487879956, + "flos": 1498872316416.0, + "grad_norm": 0.003657386104401554, + "language_loss": 0.78895426, + "learning_rate": 0.0006362673551550494, + "loss": 0.7994051, + "num_input_tokens_seen": 185932672, + "router_z_loss_mlp": 0.41015625, + "step": 2234, + "time_per_iteration": 4.862509250640869 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044747, + "balance_loss_mlp": 1.00209367, + "epoch": 0.4299730665640631, + "flos": 548063995392.0, + "grad_norm": 0.029617650166464796, + "language_loss": 0.86346197, + "learning_rate": 0.0006359675795504112, + "loss": 0.87390947, + "num_input_tokens_seen": 186006288, + "router_z_loss_mlp": 0.42700195, + "step": 2235, + "time_per_iteration": 2.747687339782715 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044967, + "balance_loss_mlp": 1.0022428, + "epoch": 0.4301654482493267, + "flos": 1131116700672.0, + "grad_norm": 0.034530900471349386, + "language_loss": 0.74852663, + "learning_rate": 0.0006356677511584775, + "loss": 0.75897634, + "num_input_tokens_seen": 186097168, + "router_z_loss_mlp": 0.42773438, + "step": 2236, + "time_per_iteration": 3.4453399181365967 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104557, + "balance_loss_mlp": 1.00291729, + "epoch": 0.4303578299345902, + "flos": 496742565120.0, + "grad_norm": 0.03572959525697719, + "language_loss": 0.8668766, + "learning_rate": 0.0006353678700956511, + "loss": 0.87733233, + "num_input_tokens_seen": 186163904, + "router_z_loss_mlp": 0.42700195, + "step": 2237, + "time_per_iteration": 2.562898874282837 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044768, + "balance_loss_mlp": 1.00228131, + "epoch": 0.4305502116198538, + "flos": 616930324992.0, + "grad_norm": 0.03185512314906856, + "language_loss": 0.84350532, + "learning_rate": 0.0006350679364783569, + "loss": 0.853953, + "num_input_tokens_seen": 186233888, + "router_z_loss_mlp": 0.42529297, + "step": 2238, + "time_per_iteration": 2.7968668937683105 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044024, + "balance_loss_mlp": 1.00139523, + "epoch": 0.4307425933051173, + "flos": 560322776064.0, + "grad_norm": 0.03209283293682184, + "language_loss": 0.85997605, + "learning_rate": 0.0006347679504230393, + "loss": 0.87041628, + "num_input_tokens_seen": 186301168, + "router_z_loss_mlp": 0.42675781, + "step": 2239, + "time_per_iteration": 2.634075880050659 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042998, + "balance_loss_mlp": 1.00039279, + "epoch": 0.4309349749903809, + "flos": 973818206976.0, + "grad_norm": 0.03253096283776471, + "language_loss": 0.77016532, + "learning_rate": 0.0006344679120461632, + "loss": 0.7805953, + "num_input_tokens_seen": 186392096, + "router_z_loss_mlp": 0.42651367, + "step": 2240, + "time_per_iteration": 3.334874153137207 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044857, + "balance_loss_mlp": 1.00222731, + "epoch": 0.4311273566756445, + "flos": 542973262848.0, + "grad_norm": 0.034862997803941254, + "language_loss": 0.8043505, + "learning_rate": 0.0006341678214642134, + "loss": 0.81479907, + "num_input_tokens_seen": 186458000, + "router_z_loss_mlp": 0.42675781, + "step": 2241, + "time_per_iteration": 2.6504814624786377 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046418, + "balance_loss_mlp": 1.00386059, + "epoch": 0.43131973836090803, + "flos": 763112219136.0, + "grad_norm": 0.032836493574204505, + "language_loss": 0.83329326, + "learning_rate": 0.0006338676787936963, + "loss": 0.84375745, + "num_input_tokens_seen": 186544992, + "router_z_loss_mlp": 0.42602539, + "step": 2242, + "time_per_iteration": 3.0819406509399414 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049013, + "balance_loss_mlp": 1.0064075, + "epoch": 0.4315121200461716, + "flos": 555603373824.0, + "grad_norm": 0.03474898353682057, + "language_loss": 0.8436116, + "learning_rate": 0.0006335674841511367, + "loss": 0.85410172, + "num_input_tokens_seen": 186614960, + "router_z_loss_mlp": 0.42651367, + "step": 2243, + "time_per_iteration": 2.688323974609375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044865, + "balance_loss_mlp": 1.00395203, + "epoch": 0.43170450173143515, + "flos": 1488689872896.0, + "grad_norm": 0.005657229041031833, + "language_loss": 0.7918117, + "learning_rate": 0.000633267237653081, + "loss": 0.80226028, + "num_input_tokens_seen": 186854288, + "router_z_loss_mlp": 0.40917969, + "step": 2244, + "time_per_iteration": 5.0437562465667725 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01041748, + "balance_loss_mlp": 1.00093079, + "epoch": 0.43189688341669874, + "flos": 1476910325760.0, + "grad_norm": 0.004174711640612148, + "language_loss": 0.77365553, + "learning_rate": 0.0006329669394160953, + "loss": 0.784073, + "num_input_tokens_seen": 187090272, + "router_z_loss_mlp": 0.40820312, + "step": 2245, + "time_per_iteration": 4.930269002914429 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105129, + "balance_loss_mlp": 1.00870872, + "epoch": 0.43208926510196227, + "flos": 493985772288.0, + "grad_norm": 0.03367129883883542, + "language_loss": 0.83325648, + "learning_rate": 0.0006326665895567652, + "loss": 0.84376937, + "num_input_tokens_seen": 187157584, + "router_z_loss_mlp": 0.42626953, + "step": 2246, + "time_per_iteration": 2.6496520042419434 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045175, + "balance_loss_mlp": 1.0025456, + "epoch": 0.43228164678722586, + "flos": 521303867904.0, + "grad_norm": 0.0373506965449987, + "language_loss": 0.88340402, + "learning_rate": 0.0006323661881916976, + "loss": 0.89385581, + "num_input_tokens_seen": 187229408, + "router_z_loss_mlp": 0.42675781, + "step": 2247, + "time_per_iteration": 2.7220535278320312 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104444, + "balance_loss_mlp": 1.00188208, + "epoch": 0.4324740284724894, + "flos": 797396173824.0, + "grad_norm": 0.03547023876634794, + "language_loss": 0.8184936, + "learning_rate": 0.0006320657354375179, + "loss": 0.82893801, + "num_input_tokens_seen": 187304384, + "router_z_loss_mlp": 0.42602539, + "step": 2248, + "time_per_iteration": 2.939730405807495 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047221, + "balance_loss_mlp": 1.00463986, + "epoch": 0.432666410157753, + "flos": 483098585088.0, + "grad_norm": 0.03653679675435745, + "language_loss": 0.87333679, + "learning_rate": 0.0006317652314108726, + "loss": 0.88380903, + "num_input_tokens_seen": 187368064, + "router_z_loss_mlp": 0.42626953, + "step": 2249, + "time_per_iteration": 2.554605007171631 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104756, + "balance_loss_mlp": 1.00512183, + "epoch": 0.43285879184301657, + "flos": 501210200832.0, + "grad_norm": 0.035110898136686476, + "language_loss": 0.91870761, + "learning_rate": 0.0006314646762284277, + "loss": 0.92918324, + "num_input_tokens_seen": 187436320, + "router_z_loss_mlp": 0.42480469, + "step": 2250, + "time_per_iteration": 2.6592071056365967 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051212, + "balance_loss_mlp": 1.01029968, + "epoch": 0.4330511735282801, + "flos": 1513793592576.0, + "grad_norm": 0.004753866691066904, + "language_loss": 0.75425828, + "learning_rate": 0.0006311640700068691, + "loss": 0.76477039, + "num_input_tokens_seen": 187670912, + "router_z_loss_mlp": 0.40917969, + "step": 2251, + "time_per_iteration": 4.880429267883301 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050666, + "balance_loss_mlp": 1.00837088, + "epoch": 0.4332435552135437, + "flos": 700838472960.0, + "grad_norm": 0.03213295924784481, + "language_loss": 0.77973437, + "learning_rate": 0.0006308634128629022, + "loss": 0.790241, + "num_input_tokens_seen": 187746432, + "router_z_loss_mlp": 0.42333984, + "step": 2252, + "time_per_iteration": 2.882138729095459 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048462, + "balance_loss_mlp": 1.00621462, + "epoch": 0.4334359368988072, + "flos": 593483013120.0, + "grad_norm": 0.03310670466815904, + "language_loss": 0.87855673, + "learning_rate": 0.0006305627049132531, + "loss": 0.8890413, + "num_input_tokens_seen": 187820032, + "router_z_loss_mlp": 0.42285156, + "step": 2253, + "time_per_iteration": 2.756601095199585 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052718, + "balance_loss_mlp": 1.01049364, + "epoch": 0.4336283185840708, + "flos": 844276213248.0, + "grad_norm": 0.028181128656308053, + "language_loss": 0.86222875, + "learning_rate": 0.0006302619462746662, + "loss": 0.87275594, + "num_input_tokens_seen": 187904400, + "router_z_loss_mlp": 0.42260742, + "step": 2254, + "time_per_iteration": 3.1384341716766357 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049827, + "balance_loss_mlp": 1.00748384, + "epoch": 0.43382070026933434, + "flos": 627402440448.0, + "grad_norm": 0.031912731462448586, + "language_loss": 0.90840006, + "learning_rate": 0.0006299611370639069, + "loss": 0.91889828, + "num_input_tokens_seen": 187973264, + "router_z_loss_mlp": 0.42382812, + "step": 2255, + "time_per_iteration": 2.712411642074585 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049827, + "balance_loss_mlp": 1.00746036, + "epoch": 0.4340130819545979, + "flos": 592210574592.0, + "grad_norm": 0.034079381595113686, + "language_loss": 0.79521996, + "learning_rate": 0.0006296602773977593, + "loss": 0.80571818, + "num_input_tokens_seen": 188039984, + "router_z_loss_mlp": 0.42407227, + "step": 2256, + "time_per_iteration": 2.714035987854004 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044192, + "balance_loss_mlp": 1.00182462, + "epoch": 0.4342054636398615, + "flos": 491956088832.0, + "grad_norm": 0.031173748742501443, + "language_loss": 0.88170785, + "learning_rate": 0.0006293593673930277, + "loss": 0.89214981, + "num_input_tokens_seen": 188113456, + "router_z_loss_mlp": 0.42407227, + "step": 2257, + "time_per_iteration": 2.6403400897979736 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050154, + "balance_loss_mlp": 1.00771534, + "epoch": 0.43439784532512504, + "flos": 700261062912.0, + "grad_norm": 0.031956889919079245, + "language_loss": 0.79138076, + "learning_rate": 0.0006290584071665358, + "loss": 0.80188227, + "num_input_tokens_seen": 188192480, + "router_z_loss_mlp": 0.42480469, + "step": 2258, + "time_per_iteration": 2.88726544380188 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051787, + "balance_loss_mlp": 1.00942004, + "epoch": 0.43459022701038863, + "flos": 486802172928.0, + "grad_norm": 0.03220669099915263, + "language_loss": 0.82764459, + "learning_rate": 0.0006287573968351266, + "loss": 0.83816242, + "num_input_tokens_seen": 188258784, + "router_z_loss_mlp": 0.42407227, + "step": 2259, + "time_per_iteration": 2.556873083114624 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045397, + "balance_loss_mlp": 1.00314939, + "epoch": 0.43478260869565216, + "flos": 644267862528.0, + "grad_norm": 0.0421666552527836, + "language_loss": 0.83019865, + "learning_rate": 0.0006284563365156626, + "loss": 0.84065259, + "num_input_tokens_seen": 188331312, + "router_z_loss_mlp": 0.42285156, + "step": 2260, + "time_per_iteration": 2.7845253944396973 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044012, + "balance_loss_mlp": 1.0014782, + "epoch": 0.43497499038091575, + "flos": 427010120448.0, + "grad_norm": 0.03632893260701325, + "language_loss": 0.87946701, + "learning_rate": 0.0006281552263250261, + "loss": 0.88990712, + "num_input_tokens_seen": 188393712, + "router_z_loss_mlp": 0.42578125, + "step": 2261, + "time_per_iteration": 2.4605414867401123 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050743, + "balance_loss_mlp": 1.00973511, + "epoch": 0.4351673720661793, + "flos": 1541527738368.0, + "grad_norm": 0.007050141628338806, + "language_loss": 0.80691534, + "learning_rate": 0.000627854066380118, + "loss": 0.81742275, + "num_input_tokens_seen": 188621152, + "router_z_loss_mlp": 0.41015625, + "step": 2262, + "time_per_iteration": 4.901712656021118 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105392, + "balance_loss_mlp": 1.01160097, + "epoch": 0.43535975375144287, + "flos": 750466556928.0, + "grad_norm": 0.036118497785784055, + "language_loss": 0.8206706, + "learning_rate": 0.0006275528567978593, + "loss": 0.83120978, + "num_input_tokens_seen": 188697120, + "router_z_loss_mlp": 0.42358398, + "step": 2263, + "time_per_iteration": 2.9023561477661133 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049817, + "balance_loss_mlp": 1.00749719, + "epoch": 0.4355521354367064, + "flos": 862752356352.0, + "grad_norm": 0.037575674234966834, + "language_loss": 0.82972687, + "learning_rate": 0.0006272515976951898, + "loss": 0.84022498, + "num_input_tokens_seen": 188778480, + "router_z_loss_mlp": 0.42358398, + "step": 2264, + "time_per_iteration": 3.062626361846924 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043084, + "balance_loss_mlp": 1.00086057, + "epoch": 0.43574451712197, + "flos": 735843700992.0, + "grad_norm": 0.027621901281680974, + "language_loss": 0.7971707, + "learning_rate": 0.0006269502891890687, + "loss": 0.80760157, + "num_input_tokens_seen": 188863616, + "router_z_loss_mlp": 0.42260742, + "step": 2265, + "time_per_iteration": 3.006544351577759 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047735, + "balance_loss_mlp": 1.00548732, + "epoch": 0.4359368988072336, + "flos": 571713496320.0, + "grad_norm": 0.03795602123750952, + "language_loss": 0.88080567, + "learning_rate": 0.0006266489313964743, + "loss": 0.89128304, + "num_input_tokens_seen": 188933984, + "router_z_loss_mlp": 0.42285156, + "step": 2266, + "time_per_iteration": 2.7217609882354736 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048608, + "balance_loss_mlp": 1.00633645, + "epoch": 0.4361292804924971, + "flos": 556671677952.0, + "grad_norm": 0.02985944883667051, + "language_loss": 0.86046827, + "learning_rate": 0.0006263475244344041, + "loss": 0.87095433, + "num_input_tokens_seen": 189012976, + "router_z_loss_mlp": 0.4230957, + "step": 2267, + "time_per_iteration": 2.844616651535034 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048894, + "balance_loss_mlp": 1.00688469, + "epoch": 0.4363216621777607, + "flos": 558349473024.0, + "grad_norm": 0.03645132335916721, + "language_loss": 0.84930134, + "learning_rate": 0.0006260460684198746, + "loss": 0.85979033, + "num_input_tokens_seen": 189079664, + "router_z_loss_mlp": 0.42041016, + "step": 2268, + "time_per_iteration": 2.6209938526153564 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046517, + "balance_loss_mlp": 1.00457883, + "epoch": 0.4365140438630242, + "flos": 479197665792.0, + "grad_norm": 0.03681259693925087, + "language_loss": 0.84888554, + "learning_rate": 0.0006257445634699213, + "loss": 0.85935068, + "num_input_tokens_seen": 189144688, + "router_z_loss_mlp": 0.41967773, + "step": 2269, + "time_per_iteration": 2.5371193885803223 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048717, + "balance_loss_mlp": 1.00675571, + "epoch": 0.4367064255482878, + "flos": 580008174336.0, + "grad_norm": 0.03379370609735099, + "language_loss": 0.83707798, + "learning_rate": 0.0006254430097015993, + "loss": 0.84756517, + "num_input_tokens_seen": 189213984, + "router_z_loss_mlp": 0.41992188, + "step": 2270, + "time_per_iteration": 2.663670539855957 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053223, + "balance_loss_mlp": 1.01278687, + "epoch": 0.43689880723355135, + "flos": 1462274830848.0, + "grad_norm": 0.005499517712732893, + "language_loss": 0.76479089, + "learning_rate": 0.0006251414072319815, + "loss": 0.77532315, + "num_input_tokens_seen": 189434416, + "router_z_loss_mlp": 0.40429688, + "step": 2271, + "time_per_iteration": 4.872848033905029 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051249, + "balance_loss_mlp": 1.00945389, + "epoch": 0.43709118891881493, + "flos": 668874852096.0, + "grad_norm": 0.028346757116800847, + "language_loss": 0.85555887, + "learning_rate": 0.0006248397561781609, + "loss": 0.86607134, + "num_input_tokens_seen": 189513248, + "router_z_loss_mlp": 0.41821289, + "step": 2272, + "time_per_iteration": 2.8525848388671875 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052054, + "balance_loss_mlp": 1.01004434, + "epoch": 0.43728357060407846, + "flos": 545914748160.0, + "grad_norm": 0.03971939435737374, + "language_loss": 0.86681366, + "learning_rate": 0.0006245380566572482, + "loss": 0.87733418, + "num_input_tokens_seen": 189585392, + "router_z_loss_mlp": 0.42041016, + "step": 2273, + "time_per_iteration": 2.65950608253479 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052441, + "balance_loss_mlp": 1.01047897, + "epoch": 0.43747595228934205, + "flos": 748185106944.0, + "grad_norm": 0.03474296828051499, + "language_loss": 0.764799, + "learning_rate": 0.0006242363087863744, + "loss": 0.77532339, + "num_input_tokens_seen": 189667552, + "router_z_loss_mlp": 0.41992188, + "step": 2274, + "time_per_iteration": 3.009678363800049 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044115, + "balance_loss_mlp": 1.00212932, + "epoch": 0.43766833397460564, + "flos": 632530111488.0, + "grad_norm": 0.043644038275203835, + "language_loss": 0.86733937, + "learning_rate": 0.0006239345126826878, + "loss": 0.87778056, + "num_input_tokens_seen": 189742048, + "router_z_loss_mlp": 0.42016602, + "step": 2275, + "time_per_iteration": 2.7913572788238525 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042942, + "balance_loss_mlp": 1.00093269, + "epoch": 0.43786071565986917, + "flos": 532099681536.0, + "grad_norm": 0.03488456741245989, + "language_loss": 0.84520668, + "learning_rate": 0.0006236326684633561, + "loss": 0.85563612, + "num_input_tokens_seen": 189817968, + "router_z_loss_mlp": 0.42041016, + "step": 2276, + "time_per_iteration": 2.868460178375244 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047564, + "balance_loss_mlp": 1.00567341, + "epoch": 0.43805309734513276, + "flos": 539558380032.0, + "grad_norm": 0.04090877877929134, + "language_loss": 0.75841373, + "learning_rate": 0.0006233307762455658, + "loss": 0.76888937, + "num_input_tokens_seen": 189882608, + "router_z_loss_mlp": 0.41918945, + "step": 2277, + "time_per_iteration": 2.675471782684326 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047689, + "balance_loss_mlp": 1.00563169, + "epoch": 0.4382454790303963, + "flos": 865965050112.0, + "grad_norm": 0.057141626101515054, + "language_loss": 0.83989596, + "learning_rate": 0.0006230288361465216, + "loss": 0.85037291, + "num_input_tokens_seen": 189960608, + "router_z_loss_mlp": 0.42089844, + "step": 2278, + "time_per_iteration": 3.0322673320770264 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047559, + "balance_loss_mlp": 1.005216, + "epoch": 0.4384378607156599, + "flos": 766802201088.0, + "grad_norm": 0.03709867443192191, + "language_loss": 0.85241038, + "learning_rate": 0.0006227268482834473, + "loss": 0.86288601, + "num_input_tokens_seen": 190035472, + "router_z_loss_mlp": 0.42382812, + "step": 2279, + "time_per_iteration": 2.900203227996826 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044865, + "balance_loss_mlp": 1.0024029, + "epoch": 0.4386302424009234, + "flos": 669797347584.0, + "grad_norm": 0.03112976006735108, + "language_loss": 0.87510288, + "learning_rate": 0.000622424812773585, + "loss": 0.88555157, + "num_input_tokens_seen": 190109312, + "router_z_loss_mlp": 0.42504883, + "step": 2280, + "time_per_iteration": 2.8384146690368652 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048428, + "balance_loss_mlp": 1.00591767, + "epoch": 0.438822624086187, + "flos": 486150885888.0, + "grad_norm": 0.037274279546085635, + "language_loss": 0.8020004, + "learning_rate": 0.000622122729734195, + "loss": 0.81248468, + "num_input_tokens_seen": 190174176, + "router_z_loss_mlp": 0.42553711, + "step": 2281, + "time_per_iteration": 2.6004860401153564 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048001, + "balance_loss_mlp": 1.00549114, + "epoch": 0.4390150057714506, + "flos": 500259515136.0, + "grad_norm": 0.032261530197162686, + "language_loss": 0.88006121, + "learning_rate": 0.0006218205992825566, + "loss": 0.8905412, + "num_input_tokens_seen": 190243888, + "router_z_loss_mlp": 0.42553711, + "step": 2282, + "time_per_iteration": 2.619781494140625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049893, + "balance_loss_mlp": 1.00745404, + "epoch": 0.4392073874567141, + "flos": 559352648448.0, + "grad_norm": 0.035010140104523226, + "language_loss": 0.8217926, + "learning_rate": 0.0006215184215359671, + "loss": 0.83229148, + "num_input_tokens_seen": 190317504, + "router_z_loss_mlp": 0.42480469, + "step": 2283, + "time_per_iteration": 2.7295265197753906 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104749, + "balance_loss_mlp": 1.00495577, + "epoch": 0.4393997691419777, + "flos": 606423216384.0, + "grad_norm": 0.031848598857185544, + "language_loss": 0.86998332, + "learning_rate": 0.0006212161966117425, + "loss": 0.88045812, + "num_input_tokens_seen": 190390160, + "router_z_loss_mlp": 0.42578125, + "step": 2284, + "time_per_iteration": 2.718440532684326 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048609, + "balance_loss_mlp": 1.00607538, + "epoch": 0.43959215082724123, + "flos": 805484772096.0, + "grad_norm": 0.035712970592664255, + "language_loss": 0.82239711, + "learning_rate": 0.0006209139246272164, + "loss": 0.83288318, + "num_input_tokens_seen": 190467600, + "router_z_loss_mlp": 0.42578125, + "step": 2285, + "time_per_iteration": 2.9688222408294678 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050536, + "balance_loss_mlp": 1.00793087, + "epoch": 0.4397845325125048, + "flos": 488608280064.0, + "grad_norm": 0.03687327973299051, + "language_loss": 0.82202113, + "learning_rate": 0.0006206116056997421, + "loss": 0.8325265, + "num_input_tokens_seen": 190534192, + "router_z_loss_mlp": 0.42651367, + "step": 2286, + "time_per_iteration": 2.5476558208465576 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048309, + "balance_loss_mlp": 1.00579894, + "epoch": 0.43997691419776835, + "flos": 481785317376.0, + "grad_norm": 0.030160303580515496, + "language_loss": 0.8299154, + "learning_rate": 0.0006203092399466892, + "loss": 0.84039849, + "num_input_tokens_seen": 190601440, + "router_z_loss_mlp": 0.42553711, + "step": 2287, + "time_per_iteration": 2.5308852195739746 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047877, + "balance_loss_mlp": 1.00539064, + "epoch": 0.44016929588303194, + "flos": 484129950720.0, + "grad_norm": 0.02729114822665251, + "language_loss": 0.85650307, + "learning_rate": 0.0006200068274854473, + "loss": 0.8669818, + "num_input_tokens_seen": 190672528, + "router_z_loss_mlp": 0.42529297, + "step": 2288, + "time_per_iteration": 2.6596133708953857 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045912, + "balance_loss_mlp": 1.00361645, + "epoch": 0.4403616775682955, + "flos": 573024818688.0, + "grad_norm": 0.028573956325372987, + "language_loss": 0.86632061, + "learning_rate": 0.0006197043684334229, + "loss": 0.87677968, + "num_input_tokens_seen": 190750704, + "router_z_loss_mlp": 0.42333984, + "step": 2289, + "time_per_iteration": 2.773327350616455 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047356, + "balance_loss_mlp": 1.00496542, + "epoch": 0.44055405925355906, + "flos": 632000333568.0, + "grad_norm": 0.03542319310998882, + "language_loss": 0.80357343, + "learning_rate": 0.0006194018629080411, + "loss": 0.81404698, + "num_input_tokens_seen": 190821664, + "router_z_loss_mlp": 0.42431641, + "step": 2290, + "time_per_iteration": 2.7465741634368896 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046814, + "balance_loss_mlp": 1.00444698, + "epoch": 0.44074644093882265, + "flos": 537826149888.0, + "grad_norm": 0.033710926441732514, + "language_loss": 0.82429153, + "learning_rate": 0.0006190993110267451, + "loss": 0.83475971, + "num_input_tokens_seen": 190893888, + "router_z_loss_mlp": 0.42407227, + "step": 2291, + "time_per_iteration": 2.734936237335205 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104638, + "balance_loss_mlp": 1.00401258, + "epoch": 0.4409388226240862, + "flos": 464166541056.0, + "grad_norm": 0.03677198311176373, + "language_loss": 0.84841394, + "learning_rate": 0.0006187967129069958, + "loss": 0.85887772, + "num_input_tokens_seen": 190956800, + "router_z_loss_mlp": 0.42407227, + "step": 2292, + "time_per_iteration": 2.491478443145752 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048193, + "balance_loss_mlp": 1.00604105, + "epoch": 0.44113120430934977, + "flos": 567161289984.0, + "grad_norm": 0.027373577802651455, + "language_loss": 0.87309539, + "learning_rate": 0.0006184940686662722, + "loss": 0.88357735, + "num_input_tokens_seen": 191032048, + "router_z_loss_mlp": 0.421875, + "step": 2293, + "time_per_iteration": 2.7358779907226562 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045965, + "balance_loss_mlp": 1.00371683, + "epoch": 0.4413235859946133, + "flos": 544675357440.0, + "grad_norm": 0.03072432375615432, + "language_loss": 0.9056381, + "learning_rate": 0.0006181913784220714, + "loss": 0.91609776, + "num_input_tokens_seen": 191099952, + "router_z_loss_mlp": 0.42285156, + "step": 2294, + "time_per_iteration": 2.6358015537261963 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045582, + "balance_loss_mlp": 1.00485992, + "epoch": 0.4415159676798769, + "flos": 1573305688320.0, + "grad_norm": 0.007789835090792861, + "language_loss": 0.80553782, + "learning_rate": 0.0006178886422919078, + "loss": 0.81599367, + "num_input_tokens_seen": 191335968, + "router_z_loss_mlp": 0.40722656, + "step": 2295, + "time_per_iteration": 4.902246713638306 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044044, + "balance_loss_mlp": 1.00181961, + "epoch": 0.4417083493651404, + "flos": 660013457664.0, + "grad_norm": 0.029698143477661094, + "language_loss": 0.80193049, + "learning_rate": 0.0006175858603933146, + "loss": 0.8123709, + "num_input_tokens_seen": 191410112, + "router_z_loss_mlp": 0.42260742, + "step": 2296, + "time_per_iteration": 2.8894712924957275 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010469, + "balance_loss_mlp": 1.00477171, + "epoch": 0.441900731050404, + "flos": 741818045184.0, + "grad_norm": 0.03343125158047759, + "language_loss": 0.81235009, + "learning_rate": 0.0006172830328438416, + "loss": 0.82281911, + "num_input_tokens_seen": 191491552, + "router_z_loss_mlp": 0.42163086, + "step": 2297, + "time_per_iteration": 3.03363299369812 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043079, + "balance_loss_mlp": 1.00080705, + "epoch": 0.44209311273566754, + "flos": 540596548608.0, + "grad_norm": 0.03516131163144532, + "language_loss": 0.87775767, + "learning_rate": 0.0006169801597610572, + "loss": 0.88818848, + "num_input_tokens_seen": 191567872, + "router_z_loss_mlp": 0.4230957, + "step": 2298, + "time_per_iteration": 2.7615511417388916 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047937, + "balance_loss_mlp": 1.00580859, + "epoch": 0.4422854944209311, + "flos": 622730670336.0, + "grad_norm": 0.03691263796350213, + "language_loss": 0.90342188, + "learning_rate": 0.0006166772412625469, + "loss": 0.91390121, + "num_input_tokens_seen": 191638032, + "router_z_loss_mlp": 0.42163086, + "step": 2299, + "time_per_iteration": 2.757885456085205 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044546, + "balance_loss_mlp": 1.00208378, + "epoch": 0.4424778761061947, + "flos": 660061089792.0, + "grad_norm": 0.03315959572172903, + "language_loss": 0.82509053, + "learning_rate": 0.0006163742774659141, + "loss": 0.835536, + "num_input_tokens_seen": 191709104, + "router_z_loss_mlp": 0.42504883, + "step": 2300, + "time_per_iteration": 2.8489365577697754 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045508, + "balance_loss_mlp": 1.00316477, + "epoch": 0.44267025779145824, + "flos": 569703254784.0, + "grad_norm": 0.02877714461404429, + "language_loss": 0.86486191, + "learning_rate": 0.0006160712684887801, + "loss": 0.87531698, + "num_input_tokens_seen": 191787072, + "router_z_loss_mlp": 0.42382812, + "step": 2301, + "time_per_iteration": 2.783581495285034 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043512, + "balance_loss_mlp": 1.00126386, + "epoch": 0.44286263947672183, + "flos": 497819617536.0, + "grad_norm": 0.032325076823307486, + "language_loss": 0.82883227, + "learning_rate": 0.0006157682144487832, + "loss": 0.83926737, + "num_input_tokens_seen": 191863040, + "router_z_loss_mlp": 0.42285156, + "step": 2302, + "time_per_iteration": 2.8138058185577393 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046294, + "balance_loss_mlp": 1.00395119, + "epoch": 0.44305502116198536, + "flos": 610608950016.0, + "grad_norm": 0.032307808069359366, + "language_loss": 0.83262819, + "learning_rate": 0.0006154651154635793, + "loss": 0.84309107, + "num_input_tokens_seen": 191940352, + "router_z_loss_mlp": 0.42382812, + "step": 2303, + "time_per_iteration": 2.9065494537353516 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045793, + "balance_loss_mlp": 1.00349796, + "epoch": 0.44324740284724895, + "flos": 471742857984.0, + "grad_norm": 0.03422426159351285, + "language_loss": 0.85742319, + "learning_rate": 0.0006151619716508421, + "loss": 0.86788118, + "num_input_tokens_seen": 192006896, + "router_z_loss_mlp": 0.42333984, + "step": 2304, + "time_per_iteration": 2.5973682403564453 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104609, + "balance_loss_mlp": 1.00377095, + "epoch": 0.4434397845325125, + "flos": 579812788224.0, + "grad_norm": 0.032225909976612614, + "language_loss": 0.87212336, + "learning_rate": 0.0006148587831282625, + "loss": 0.88258433, + "num_input_tokens_seen": 192075312, + "router_z_loss_mlp": 0.42358398, + "step": 2305, + "time_per_iteration": 2.6349332332611084 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046734, + "balance_loss_mlp": 1.00563049, + "epoch": 0.44363216621777607, + "flos": 1499997967872.0, + "grad_norm": 0.0072841640427745245, + "language_loss": 0.79176068, + "learning_rate": 0.0006145555500135483, + "loss": 0.80222803, + "num_input_tokens_seen": 192304816, + "router_z_loss_mlp": 0.41113281, + "step": 2306, + "time_per_iteration": 4.920953989028931 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047952, + "balance_loss_mlp": 1.00565624, + "epoch": 0.44382454790303966, + "flos": 478285863936.0, + "grad_norm": 0.035350800366555836, + "language_loss": 0.87850344, + "learning_rate": 0.0006142522724244255, + "loss": 0.88898295, + "num_input_tokens_seen": 192369232, + "router_z_loss_mlp": 0.42333984, + "step": 2307, + "time_per_iteration": 2.5206384658813477 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044529, + "balance_loss_mlp": 1.00361633, + "epoch": 0.4440169295883032, + "flos": 1547306696448.0, + "grad_norm": 0.0037013242818687312, + "language_loss": 0.76484716, + "learning_rate": 0.0006139489504786368, + "loss": 0.77529252, + "num_input_tokens_seen": 192600176, + "router_z_loss_mlp": 0.40917969, + "step": 2308, + "time_per_iteration": 4.906585454940796 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047856, + "balance_loss_mlp": 1.00570333, + "epoch": 0.4442093112735668, + "flos": 592291254528.0, + "grad_norm": 0.03559804859588436, + "language_loss": 0.78114909, + "learning_rate": 0.000613645584293942, + "loss": 0.79162765, + "num_input_tokens_seen": 192675424, + "router_z_loss_mlp": 0.421875, + "step": 2309, + "time_per_iteration": 2.9084970951080322 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049948, + "balance_loss_mlp": 1.00767648, + "epoch": 0.4444016929588303, + "flos": 531328830720.0, + "grad_norm": 0.036447190975963356, + "language_loss": 0.83448339, + "learning_rate": 0.0006133421739881185, + "loss": 0.84498286, + "num_input_tokens_seen": 192747552, + "router_z_loss_mlp": 0.4230957, + "step": 2310, + "time_per_iteration": 2.652672052383423 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044821, + "balance_loss_mlp": 1.0026927, + "epoch": 0.4445940746440939, + "flos": 621389212416.0, + "grad_norm": 0.035906278639006764, + "language_loss": 0.83511341, + "learning_rate": 0.0006130387196789605, + "loss": 0.84556162, + "num_input_tokens_seen": 192819984, + "router_z_loss_mlp": 0.42163086, + "step": 2311, + "time_per_iteration": 2.747197151184082 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045555, + "balance_loss_mlp": 1.00328362, + "epoch": 0.4447864563293574, + "flos": 630376973568.0, + "grad_norm": 0.027043038636915952, + "language_loss": 0.84677482, + "learning_rate": 0.0006127352214842795, + "loss": 0.85723037, + "num_input_tokens_seen": 192906080, + "router_z_loss_mlp": 0.4230957, + "step": 2312, + "time_per_iteration": 3.0515668392181396 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045637, + "balance_loss_mlp": 1.00327015, + "epoch": 0.444978838014621, + "flos": 652002627072.0, + "grad_norm": 0.034195517498726076, + "language_loss": 0.85929281, + "learning_rate": 0.0006124316795219041, + "loss": 0.86974919, + "num_input_tokens_seen": 192972336, + "router_z_loss_mlp": 0.42407227, + "step": 2313, + "time_per_iteration": 2.778184652328491 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050022, + "balance_loss_mlp": 1.00786984, + "epoch": 0.44517121969988455, + "flos": 613589319168.0, + "grad_norm": 0.029604729226228255, + "language_loss": 0.82924336, + "learning_rate": 0.0006121280939096794, + "loss": 0.83974361, + "num_input_tokens_seen": 193045744, + "router_z_loss_mlp": 0.421875, + "step": 2314, + "time_per_iteration": 2.7615392208099365 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045707, + "balance_loss_mlp": 1.00350666, + "epoch": 0.44536360138514813, + "flos": 489715468032.0, + "grad_norm": 0.036472505020621125, + "language_loss": 0.8826952, + "learning_rate": 0.000611824464765468, + "loss": 0.89315224, + "num_input_tokens_seen": 193115248, + "router_z_loss_mlp": 0.42236328, + "step": 2315, + "time_per_iteration": 2.67606782913208 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01058411, + "balance_loss_mlp": 1.01759338, + "epoch": 0.4455559830704117, + "flos": 1519056390144.0, + "grad_norm": 0.01193419136680653, + "language_loss": 0.78594941, + "learning_rate": 0.0006115207922071492, + "loss": 0.79653352, + "num_input_tokens_seen": 193330816, + "router_z_loss_mlp": 0.40820312, + "step": 2316, + "time_per_iteration": 4.725375652313232 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045978, + "balance_loss_mlp": 1.00384891, + "epoch": 0.44574836475567525, + "flos": 616817564160.0, + "grad_norm": 0.032139423648612636, + "language_loss": 0.85745513, + "learning_rate": 0.000611217076352619, + "loss": 0.86791497, + "num_input_tokens_seen": 193407616, + "router_z_loss_mlp": 0.42163086, + "step": 2317, + "time_per_iteration": 2.8277692794799805 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046443, + "balance_loss_mlp": 1.00429094, + "epoch": 0.44594074644093884, + "flos": 507434366208.0, + "grad_norm": 0.030845694350894858, + "language_loss": 0.83782113, + "learning_rate": 0.0006109133173197905, + "loss": 0.84828556, + "num_input_tokens_seen": 193482624, + "router_z_loss_mlp": 0.421875, + "step": 2318, + "time_per_iteration": 2.740814685821533 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044287, + "balance_loss_mlp": 1.0021348, + "epoch": 0.44613312812620237, + "flos": 728313070848.0, + "grad_norm": 0.03532114030566384, + "language_loss": 0.86011016, + "learning_rate": 0.0006106095152265935, + "loss": 0.87055302, + "num_input_tokens_seen": 193555952, + "router_z_loss_mlp": 0.421875, + "step": 2319, + "time_per_iteration": 2.982090473175049 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048334, + "balance_loss_mlp": 1.00615764, + "epoch": 0.44632550981146596, + "flos": 637058985216.0, + "grad_norm": 0.029959494040304766, + "language_loss": 0.85331011, + "learning_rate": 0.0006103056701909739, + "loss": 0.86379343, + "num_input_tokens_seen": 193636672, + "router_z_loss_mlp": 0.42211914, + "step": 2320, + "time_per_iteration": 2.911764621734619 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050916, + "balance_loss_mlp": 1.00878716, + "epoch": 0.4465178914967295, + "flos": 828618100992.0, + "grad_norm": 0.026414177364328564, + "language_loss": 0.83389866, + "learning_rate": 0.0006100017823308956, + "loss": 0.8444078, + "num_input_tokens_seen": 193721728, + "router_z_loss_mlp": 0.42163086, + "step": 2321, + "time_per_iteration": 3.166370153427124 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048729, + "balance_loss_mlp": 1.00672007, + "epoch": 0.4467102731819931, + "flos": 667033751808.0, + "grad_norm": 0.03675396641442824, + "language_loss": 0.80177474, + "learning_rate": 0.0006096978517643377, + "loss": 0.81226206, + "num_input_tokens_seen": 193795456, + "router_z_loss_mlp": 0.42041016, + "step": 2322, + "time_per_iteration": 2.7839677333831787 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049863, + "balance_loss_mlp": 1.00780618, + "epoch": 0.4469026548672566, + "flos": 513970569216.0, + "grad_norm": 0.036357166954029595, + "language_loss": 0.84299958, + "learning_rate": 0.0006093938786092968, + "loss": 0.85349822, + "num_input_tokens_seen": 193865520, + "router_z_loss_mlp": 0.42089844, + "step": 2323, + "time_per_iteration": 2.6366002559661865 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052625, + "balance_loss_mlp": 1.01054394, + "epoch": 0.4470950365525202, + "flos": 685286318592.0, + "grad_norm": 0.03621901423501995, + "language_loss": 0.9042533, + "learning_rate": 0.0006090898629837857, + "loss": 0.91477954, + "num_input_tokens_seen": 193935040, + "router_z_loss_mlp": 0.42114258, + "step": 2324, + "time_per_iteration": 2.8338427543640137 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047182, + "balance_loss_mlp": 1.00514829, + "epoch": 0.4472874182377838, + "flos": 628535873280.0, + "grad_norm": 0.028780974393906523, + "language_loss": 0.87792349, + "learning_rate": 0.0006087858050058337, + "loss": 0.88839531, + "num_input_tokens_seen": 194009120, + "router_z_loss_mlp": 0.4206543, + "step": 2325, + "time_per_iteration": 2.7868492603302 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047425, + "balance_loss_mlp": 1.00534439, + "epoch": 0.4474797999230473, + "flos": 548241884928.0, + "grad_norm": 0.03362424978515615, + "language_loss": 0.83227015, + "learning_rate": 0.0006084817047934866, + "loss": 0.84274435, + "num_input_tokens_seen": 194076672, + "router_z_loss_mlp": 0.42114258, + "step": 2326, + "time_per_iteration": 2.6603922843933105 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105345, + "balance_loss_mlp": 1.01144028, + "epoch": 0.4476721816083109, + "flos": 456757420032.0, + "grad_norm": 0.033869443234677665, + "language_loss": 0.90294945, + "learning_rate": 0.0006081775624648066, + "loss": 0.91348392, + "num_input_tokens_seen": 194142320, + "router_z_loss_mlp": 0.42041016, + "step": 2327, + "time_per_iteration": 2.563965082168579 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049502, + "balance_loss_mlp": 1.00730181, + "epoch": 0.44786456329357444, + "flos": 482501733120.0, + "grad_norm": 0.03973119590818811, + "language_loss": 0.83093679, + "learning_rate": 0.0006078733781378721, + "loss": 0.8414318, + "num_input_tokens_seen": 194208560, + "router_z_loss_mlp": 0.42236328, + "step": 2328, + "time_per_iteration": 2.5500621795654297 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01056217, + "balance_loss_mlp": 1.01401651, + "epoch": 0.448056944978838, + "flos": 553237353216.0, + "grad_norm": 0.0336771809947293, + "language_loss": 0.82818258, + "learning_rate": 0.0006075691519307781, + "loss": 0.83874476, + "num_input_tokens_seen": 194288080, + "router_z_loss_mlp": 0.42236328, + "step": 2329, + "time_per_iteration": 2.8369436264038086 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053357, + "balance_loss_mlp": 1.01125205, + "epoch": 0.44824932666410156, + "flos": 551917282560.0, + "grad_norm": 0.03290883990888194, + "language_loss": 0.81853932, + "learning_rate": 0.0006072648839616356, + "loss": 0.82907289, + "num_input_tokens_seen": 194358464, + "router_z_loss_mlp": 0.42138672, + "step": 2330, + "time_per_iteration": 2.707853078842163 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050692, + "balance_loss_mlp": 1.00861132, + "epoch": 0.44844170834936514, + "flos": 990273414912.0, + "grad_norm": 0.029288900679948552, + "language_loss": 0.83132529, + "learning_rate": 0.0006069605743485718, + "loss": 0.84183216, + "num_input_tokens_seen": 194456112, + "router_z_loss_mlp": 0.42114258, + "step": 2331, + "time_per_iteration": 3.347529649734497 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053351, + "balance_loss_mlp": 1.011127, + "epoch": 0.44863409003462873, + "flos": 592451647488.0, + "grad_norm": 0.033148459483392366, + "language_loss": 0.84139442, + "learning_rate": 0.0006066562232097303, + "loss": 0.85192794, + "num_input_tokens_seen": 194526880, + "router_z_loss_mlp": 0.42260742, + "step": 2332, + "time_per_iteration": 2.7059993743896484 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048256, + "balance_loss_mlp": 1.00600874, + "epoch": 0.44882647171989226, + "flos": 725985934080.0, + "grad_norm": 0.033171968523288915, + "language_loss": 0.86700636, + "learning_rate": 0.0006063518306632708, + "loss": 0.87748891, + "num_input_tokens_seen": 194606800, + "router_z_loss_mlp": 0.42285156, + "step": 2333, + "time_per_iteration": 2.9296460151672363 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048294, + "balance_loss_mlp": 1.00607038, + "epoch": 0.44901885340515585, + "flos": 535991852544.0, + "grad_norm": 0.03657763323068719, + "language_loss": 0.83056581, + "learning_rate": 0.0006060473968273688, + "loss": 0.84104872, + "num_input_tokens_seen": 194679856, + "router_z_loss_mlp": 0.42260742, + "step": 2334, + "time_per_iteration": 2.6368448734283447 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104747, + "balance_loss_mlp": 1.0070343, + "epoch": 0.4492112350904194, + "flos": 1558693526016.0, + "grad_norm": 0.008278759352477436, + "language_loss": 0.77879542, + "learning_rate": 0.000605742921820216, + "loss": 0.7892701, + "num_input_tokens_seen": 194906320, + "router_z_loss_mlp": 0.40429688, + "step": 2335, + "time_per_iteration": 4.866518497467041 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050045, + "balance_loss_mlp": 1.00951385, + "epoch": 0.44940361677568297, + "flos": 1526703660288.0, + "grad_norm": 0.009772749846677187, + "language_loss": 0.81005216, + "learning_rate": 0.0006054384057600202, + "loss": 0.82055259, + "num_input_tokens_seen": 195129152, + "router_z_loss_mlp": 0.40527344, + "step": 2336, + "time_per_iteration": 4.832434892654419 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049508, + "balance_loss_mlp": 1.00759399, + "epoch": 0.4495959984609465, + "flos": 383321387520.0, + "grad_norm": 0.039418428301582195, + "language_loss": 0.88819385, + "learning_rate": 0.0006051338487650047, + "loss": 0.89868897, + "num_input_tokens_seen": 195189792, + "router_z_loss_mlp": 0.41943359, + "step": 2337, + "time_per_iteration": 2.4261343479156494 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104738, + "balance_loss_mlp": 1.00537109, + "epoch": 0.4497883801462101, + "flos": 498883064064.0, + "grad_norm": 0.03829280299631375, + "language_loss": 0.83062887, + "learning_rate": 0.0006048292509534095, + "loss": 0.84110272, + "num_input_tokens_seen": 195258640, + "router_z_loss_mlp": 0.42041016, + "step": 2338, + "time_per_iteration": 2.5792438983917236 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046244, + "balance_loss_mlp": 1.00425851, + "epoch": 0.4499807618314736, + "flos": 615590812416.0, + "grad_norm": 0.03236488600067343, + "language_loss": 0.78186011, + "learning_rate": 0.0006045246124434895, + "loss": 0.79232258, + "num_input_tokens_seen": 195327984, + "router_z_loss_mlp": 0.42016602, + "step": 2339, + "time_per_iteration": 2.736332654953003 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049178, + "balance_loss_mlp": 1.00704992, + "epoch": 0.4501731435167372, + "flos": 1007068850688.0, + "grad_norm": 0.0336222564343559, + "language_loss": 0.8735106, + "learning_rate": 0.0006042199333535162, + "loss": 0.88400233, + "num_input_tokens_seen": 195409504, + "router_z_loss_mlp": 0.42163086, + "step": 2340, + "time_per_iteration": 3.3217411041259766 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048828, + "balance_loss_mlp": 1.0066278, + "epoch": 0.4503655252020008, + "flos": 822328806912.0, + "grad_norm": 0.031746848330129245, + "language_loss": 0.8445214, + "learning_rate": 0.0006039152138017763, + "loss": 0.85500968, + "num_input_tokens_seen": 195489424, + "router_z_loss_mlp": 0.42236328, + "step": 2341, + "time_per_iteration": 3.027831792831421 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046938, + "balance_loss_mlp": 1.00464213, + "epoch": 0.4505579068872643, + "flos": 487414576128.0, + "grad_norm": 0.03971234339866032, + "language_loss": 0.84330553, + "learning_rate": 0.0006036104539065726, + "loss": 0.85377491, + "num_input_tokens_seen": 195562128, + "router_z_loss_mlp": 0.42333984, + "step": 2342, + "time_per_iteration": 2.6650640964508057 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042622, + "balance_loss_mlp": 1.00030267, + "epoch": 0.4507502885725279, + "flos": 886336728576.0, + "grad_norm": 0.030953760348096254, + "language_loss": 0.8473978, + "learning_rate": 0.000603305653786223, + "loss": 0.85782403, + "num_input_tokens_seen": 195646800, + "router_z_loss_mlp": 0.42358398, + "step": 2343, + "time_per_iteration": 3.146728277206421 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045238, + "balance_loss_mlp": 1.00284708, + "epoch": 0.45094267025779144, + "flos": 579422016000.0, + "grad_norm": 0.032254310776320565, + "language_loss": 0.84862161, + "learning_rate": 0.0006030008135590622, + "loss": 0.859074, + "num_input_tokens_seen": 195719648, + "router_z_loss_mlp": 0.42431641, + "step": 2344, + "time_per_iteration": 2.716326951980591 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046244, + "balance_loss_mlp": 1.00387657, + "epoch": 0.45113505194305503, + "flos": 526442232576.0, + "grad_norm": 0.029625683171065443, + "language_loss": 0.81110835, + "learning_rate": 0.0006026959333434387, + "loss": 0.82157081, + "num_input_tokens_seen": 195794800, + "router_z_loss_mlp": 0.42407227, + "step": 2345, + "time_per_iteration": 2.757293939590454 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046277, + "balance_loss_mlp": 1.00379133, + "epoch": 0.45132743362831856, + "flos": 503116429824.0, + "grad_norm": 0.029442245536271623, + "language_loss": 0.77997512, + "learning_rate": 0.0006023910132577181, + "loss": 0.79043788, + "num_input_tokens_seen": 195866848, + "router_z_loss_mlp": 0.42529297, + "step": 2346, + "time_per_iteration": 2.6643226146698 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044307, + "balance_loss_mlp": 1.00201178, + "epoch": 0.45151981531358215, + "flos": 432836710656.0, + "grad_norm": 0.03508285710405181, + "language_loss": 0.85304409, + "learning_rate": 0.0006020860534202806, + "loss": 0.86348718, + "num_input_tokens_seen": 195930640, + "router_z_loss_mlp": 0.42333984, + "step": 2347, + "time_per_iteration": 2.508922815322876 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046743, + "balance_loss_mlp": 1.00444722, + "epoch": 0.4517121969988457, + "flos": 713494828800.0, + "grad_norm": 0.031320840574665956, + "language_loss": 0.81720173, + "learning_rate": 0.0006017810539495224, + "loss": 0.8276692, + "num_input_tokens_seen": 196014240, + "router_z_loss_mlp": 0.42333984, + "step": 2348, + "time_per_iteration": 2.916851282119751 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046985, + "balance_loss_mlp": 1.00459409, + "epoch": 0.45190457868410927, + "flos": 580557394176.0, + "grad_norm": 0.03199810496833265, + "language_loss": 0.82887936, + "learning_rate": 0.0006014760149638547, + "loss": 0.83934915, + "num_input_tokens_seen": 196083296, + "router_z_loss_mlp": 0.42431641, + "step": 2349, + "time_per_iteration": 2.6583147048950195 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044305, + "balance_loss_mlp": 1.00189018, + "epoch": 0.45209696036937286, + "flos": 483628363008.0, + "grad_norm": 0.034942038630734404, + "language_loss": 0.89322019, + "learning_rate": 0.000601170936581704, + "loss": 0.90366322, + "num_input_tokens_seen": 196147840, + "router_z_loss_mlp": 0.42456055, + "step": 2350, + "time_per_iteration": 2.5171234607696533 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051553, + "balance_loss_mlp": 1.00906706, + "epoch": 0.4522893420546364, + "flos": 541260474624.0, + "grad_norm": 0.03828852417675836, + "language_loss": 0.85383743, + "learning_rate": 0.0006008658189215121, + "loss": 0.86435294, + "num_input_tokens_seen": 196219008, + "router_z_loss_mlp": 0.42529297, + "step": 2351, + "time_per_iteration": 2.6463332176208496 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049763, + "balance_loss_mlp": 1.00725281, + "epoch": 0.4524817237399, + "flos": 497691305472.0, + "grad_norm": 0.039190213199739796, + "language_loss": 0.80507791, + "learning_rate": 0.0006005606621017366, + "loss": 0.81557548, + "num_input_tokens_seen": 196287792, + "router_z_loss_mlp": 0.42553711, + "step": 2352, + "time_per_iteration": 2.5637879371643066 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048153, + "balance_loss_mlp": 1.00597668, + "epoch": 0.4526741054251635, + "flos": 653841782016.0, + "grad_norm": 0.04275245206988235, + "language_loss": 0.80476063, + "learning_rate": 0.0006002554662408496, + "loss": 0.81524217, + "num_input_tokens_seen": 196371776, + "router_z_loss_mlp": 0.42211914, + "step": 2353, + "time_per_iteration": 2.8951141834259033 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047375, + "balance_loss_mlp": 1.00500786, + "epoch": 0.4528664871104271, + "flos": 572004146688.0, + "grad_norm": 0.03654890079235127, + "language_loss": 0.91683698, + "learning_rate": 0.0005999502314573388, + "loss": 0.92731076, + "num_input_tokens_seen": 196441840, + "router_z_loss_mlp": 0.42407227, + "step": 2354, + "time_per_iteration": 2.64512300491333 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051522, + "balance_loss_mlp": 1.00927448, + "epoch": 0.45305886879569063, + "flos": 459679463424.0, + "grad_norm": 0.03675635166201985, + "language_loss": 0.86984789, + "learning_rate": 0.0005996449578697066, + "loss": 0.88036311, + "num_input_tokens_seen": 196510464, + "router_z_loss_mlp": 0.42285156, + "step": 2355, + "time_per_iteration": 2.6577048301696777 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048974, + "balance_loss_mlp": 1.0069412, + "epoch": 0.4532512504809542, + "flos": 506207614464.0, + "grad_norm": 0.033984488129296754, + "language_loss": 0.81732345, + "learning_rate": 0.0005993396455964709, + "loss": 0.82781321, + "num_input_tokens_seen": 196583888, + "router_z_loss_mlp": 0.4206543, + "step": 2356, + "time_per_iteration": 2.7086563110351562 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048885, + "balance_loss_mlp": 1.0067569, + "epoch": 0.4534436321662178, + "flos": 583312241664.0, + "grad_norm": 0.03467705138292274, + "language_loss": 0.82385033, + "learning_rate": 0.0005990342947561647, + "loss": 0.8343392, + "num_input_tokens_seen": 196652816, + "router_z_loss_mlp": 0.42163086, + "step": 2357, + "time_per_iteration": 2.6705219745635986 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047018, + "balance_loss_mlp": 1.00484145, + "epoch": 0.45363601385148133, + "flos": 550773156096.0, + "grad_norm": 0.03186226313127573, + "language_loss": 0.78742826, + "learning_rate": 0.0005987289054673351, + "loss": 0.79789847, + "num_input_tokens_seen": 196720208, + "router_z_loss_mlp": 0.42211914, + "step": 2358, + "time_per_iteration": 2.6073710918426514 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105212, + "balance_loss_mlp": 1.01063538, + "epoch": 0.4538283955367449, + "flos": 1477793937408.0, + "grad_norm": 0.008894510659601113, + "language_loss": 0.76575738, + "learning_rate": 0.0005984234778485451, + "loss": 0.77627861, + "num_input_tokens_seen": 196947696, + "router_z_loss_mlp": 0.41503906, + "step": 2359, + "time_per_iteration": 4.796559810638428 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044324, + "balance_loss_mlp": 1.00245762, + "epoch": 0.45402077722200845, + "flos": 585797826048.0, + "grad_norm": 0.043889208643714143, + "language_loss": 0.91937214, + "learning_rate": 0.0005981180120183722, + "loss": 0.92981529, + "num_input_tokens_seen": 197015712, + "router_z_loss_mlp": 0.41894531, + "step": 2360, + "time_per_iteration": 2.6962461471557617 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104759, + "balance_loss_mlp": 1.00584316, + "epoch": 0.45421315890727204, + "flos": 532889974272.0, + "grad_norm": 0.05191452902852925, + "language_loss": 0.85740328, + "learning_rate": 0.0005978125080954089, + "loss": 0.86787915, + "num_input_tokens_seen": 197094880, + "router_z_loss_mlp": 0.41772461, + "step": 2361, + "time_per_iteration": 2.777160882949829 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049759, + "balance_loss_mlp": 1.00794065, + "epoch": 0.4544055405925356, + "flos": 786552728064.0, + "grad_norm": 0.0404371323010207, + "language_loss": 0.77941048, + "learning_rate": 0.000597506966198262, + "loss": 0.78990805, + "num_input_tokens_seen": 197176448, + "router_z_loss_mlp": 0.41845703, + "step": 2362, + "time_per_iteration": 2.9561667442321777 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048479, + "balance_loss_mlp": 1.00663614, + "epoch": 0.45459792227779916, + "flos": 519202252800.0, + "grad_norm": 0.0386377549927772, + "language_loss": 0.84570003, + "learning_rate": 0.0005972013864455536, + "loss": 0.85618478, + "num_input_tokens_seen": 197243520, + "router_z_loss_mlp": 0.41870117, + "step": 2363, + "time_per_iteration": 2.577075958251953 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049274, + "balance_loss_mlp": 1.00757432, + "epoch": 0.4547903039630627, + "flos": 538598946048.0, + "grad_norm": 0.03734609962487706, + "language_loss": 0.86156821, + "learning_rate": 0.0005968957689559203, + "loss": 0.87206089, + "num_input_tokens_seen": 197311536, + "router_z_loss_mlp": 0.41723633, + "step": 2364, + "time_per_iteration": 2.663912773132324 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047351, + "balance_loss_mlp": 1.00543737, + "epoch": 0.4549826856483263, + "flos": 529691864832.0, + "grad_norm": 0.03600076061776594, + "language_loss": 0.89443278, + "learning_rate": 0.0005965901138480131, + "loss": 0.90490627, + "num_input_tokens_seen": 197382752, + "router_z_loss_mlp": 0.41943359, + "step": 2365, + "time_per_iteration": 2.635735034942627 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048292, + "balance_loss_mlp": 1.00633037, + "epoch": 0.45517506733358987, + "flos": 521983345152.0, + "grad_norm": 0.04096543812015268, + "language_loss": 0.87860775, + "learning_rate": 0.0005962844212404982, + "loss": 0.88909072, + "num_input_tokens_seen": 197456592, + "router_z_loss_mlp": 0.41992188, + "step": 2366, + "time_per_iteration": 2.675039291381836 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049904, + "balance_loss_mlp": 1.00799048, + "epoch": 0.4553674490188534, + "flos": 452009827584.0, + "grad_norm": 0.02917585056549172, + "language_loss": 0.88090932, + "learning_rate": 0.0005959786912520558, + "loss": 0.89140838, + "num_input_tokens_seen": 197525408, + "router_z_loss_mlp": 0.41943359, + "step": 2367, + "time_per_iteration": 2.605693817138672 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046028, + "balance_loss_mlp": 1.00399494, + "epoch": 0.455559830704117, + "flos": 547745154816.0, + "grad_norm": 0.029185999772899627, + "language_loss": 0.84459692, + "learning_rate": 0.0005956729240013806, + "loss": 0.85505724, + "num_input_tokens_seen": 197608480, + "router_z_loss_mlp": 0.4206543, + "step": 2368, + "time_per_iteration": 2.792929172515869 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104665, + "balance_loss_mlp": 1.00447345, + "epoch": 0.4557522123893805, + "flos": 584866582272.0, + "grad_norm": 0.02991931447914949, + "language_loss": 0.92050606, + "learning_rate": 0.0005953671196071824, + "loss": 0.93097258, + "num_input_tokens_seen": 197678416, + "router_z_loss_mlp": 0.42211914, + "step": 2369, + "time_per_iteration": 2.7024593353271484 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052055, + "balance_loss_mlp": 1.00992644, + "epoch": 0.4559445940746441, + "flos": 527484291840.0, + "grad_norm": 0.03299201390628513, + "language_loss": 0.80723774, + "learning_rate": 0.0005950612781881846, + "loss": 0.81775832, + "num_input_tokens_seen": 197753424, + "router_z_loss_mlp": 0.42163086, + "step": 2370, + "time_per_iteration": 2.7288575172424316 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048274, + "balance_loss_mlp": 1.0061928, + "epoch": 0.45613697575990764, + "flos": 653368384512.0, + "grad_norm": 0.034012751150725565, + "language_loss": 0.76432264, + "learning_rate": 0.0005947553998631259, + "loss": 0.77480543, + "num_input_tokens_seen": 197832080, + "router_z_loss_mlp": 0.42114258, + "step": 2371, + "time_per_iteration": 2.865060567855835 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051777, + "balance_loss_mlp": 1.00976777, + "epoch": 0.4563293574451712, + "flos": 868624633344.0, + "grad_norm": 0.02789239974176414, + "language_loss": 0.79458821, + "learning_rate": 0.000594449484750758, + "loss": 0.80510592, + "num_input_tokens_seen": 197919536, + "router_z_loss_mlp": 0.42041016, + "step": 2372, + "time_per_iteration": 3.147550344467163 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044503, + "balance_loss_mlp": 1.00242209, + "epoch": 0.45652173913043476, + "flos": 499132885248.0, + "grad_norm": 0.03342359133343608, + "language_loss": 0.83513892, + "learning_rate": 0.0005941435329698484, + "loss": 0.84558398, + "num_input_tokens_seen": 197991872, + "router_z_loss_mlp": 0.42114258, + "step": 2373, + "time_per_iteration": 2.6924219131469727 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046399, + "balance_loss_mlp": 1.00441325, + "epoch": 0.45671412081569834, + "flos": 561959741952.0, + "grad_norm": 0.03267163379038315, + "language_loss": 0.83796972, + "learning_rate": 0.0005938375446391778, + "loss": 0.84843373, + "num_input_tokens_seen": 198063392, + "router_z_loss_mlp": 0.42016602, + "step": 2374, + "time_per_iteration": 2.731687307357788 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044896, + "balance_loss_mlp": 1.00281477, + "epoch": 0.45690650250096193, + "flos": 504123495936.0, + "grad_norm": 0.03711297965033783, + "language_loss": 0.89367199, + "learning_rate": 0.0005935315198775415, + "loss": 0.90412098, + "num_input_tokens_seen": 198131232, + "router_z_loss_mlp": 0.42114258, + "step": 2375, + "time_per_iteration": 2.679049015045166 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046184, + "balance_loss_mlp": 1.0040555, + "epoch": 0.45709888418622546, + "flos": 431599265280.0, + "grad_norm": 0.033405413713201326, + "language_loss": 0.87559128, + "learning_rate": 0.0005932254588037486, + "loss": 0.88605309, + "num_input_tokens_seen": 198194944, + "router_z_loss_mlp": 0.42163086, + "step": 2376, + "time_per_iteration": 2.5139987468719482 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045834, + "balance_loss_mlp": 1.00384891, + "epoch": 0.45729126587148905, + "flos": 526693999104.0, + "grad_norm": 0.034118342932564036, + "language_loss": 0.86638731, + "learning_rate": 0.000592919361536623, + "loss": 0.87684566, + "num_input_tokens_seen": 198265728, + "router_z_loss_mlp": 0.42016602, + "step": 2377, + "time_per_iteration": 2.652921438217163 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047367, + "balance_loss_mlp": 1.00545263, + "epoch": 0.4574836475567526, + "flos": 639148939776.0, + "grad_norm": 0.03214355149845838, + "language_loss": 0.89487022, + "learning_rate": 0.0005926132281950017, + "loss": 0.90534389, + "num_input_tokens_seen": 198336640, + "router_z_loss_mlp": 0.41943359, + "step": 2378, + "time_per_iteration": 2.7740533351898193 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050302, + "balance_loss_mlp": 1.00819683, + "epoch": 0.45767602924201617, + "flos": 650791426560.0, + "grad_norm": 0.03291422707035226, + "language_loss": 0.85368007, + "learning_rate": 0.0005923070588977367, + "loss": 0.86418307, + "num_input_tokens_seen": 198413552, + "router_z_loss_mlp": 0.42138672, + "step": 2379, + "time_per_iteration": 2.8456881046295166 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050793, + "balance_loss_mlp": 1.00873554, + "epoch": 0.4578684109272797, + "flos": 747963475968.0, + "grad_norm": 0.03509802642472786, + "language_loss": 0.86739749, + "learning_rate": 0.0005920008537636931, + "loss": 0.87790543, + "num_input_tokens_seen": 198490864, + "router_z_loss_mlp": 0.42089844, + "step": 2380, + "time_per_iteration": 2.910720109939575 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048858, + "balance_loss_mlp": 1.00692058, + "epoch": 0.4580607926125433, + "flos": 642729073152.0, + "grad_norm": 0.029242782263759974, + "language_loss": 0.87235177, + "learning_rate": 0.0005916946129117504, + "loss": 0.88284034, + "num_input_tokens_seen": 198571200, + "router_z_loss_mlp": 0.41967773, + "step": 2381, + "time_per_iteration": 2.8813161849975586 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051516, + "balance_loss_mlp": 1.00948262, + "epoch": 0.4582531742978069, + "flos": 803240260608.0, + "grad_norm": 0.03239264438363608, + "language_loss": 0.81130052, + "learning_rate": 0.0005913883364608017, + "loss": 0.82181567, + "num_input_tokens_seen": 198658624, + "router_z_loss_mlp": 0.4206543, + "step": 2382, + "time_per_iteration": 3.062751531600952 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105103, + "balance_loss_mlp": 1.00914025, + "epoch": 0.4584455559830704, + "flos": 685518643200.0, + "grad_norm": 0.031797549541833704, + "language_loss": 0.88895178, + "learning_rate": 0.0005910820245297542, + "loss": 0.8994621, + "num_input_tokens_seen": 198731312, + "router_z_loss_mlp": 0.41918945, + "step": 2383, + "time_per_iteration": 2.8653757572174072 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045812, + "balance_loss_mlp": 1.00387442, + "epoch": 0.458637937668334, + "flos": 519282932736.0, + "grad_norm": 0.03550111139800055, + "language_loss": 0.80986464, + "learning_rate": 0.000590775677237529, + "loss": 0.82032269, + "num_input_tokens_seen": 198805296, + "router_z_loss_mlp": 0.41967773, + "step": 2384, + "time_per_iteration": 2.7324440479278564 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046116, + "balance_loss_mlp": 1.0042969, + "epoch": 0.4588303193535975, + "flos": 506533257984.0, + "grad_norm": 0.03366806840699952, + "language_loss": 0.80683196, + "learning_rate": 0.0005904692947030601, + "loss": 0.81729311, + "num_input_tokens_seen": 198872112, + "router_z_loss_mlp": 0.41845703, + "step": 2385, + "time_per_iteration": 2.5837819576263428 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043605, + "balance_loss_mlp": 1.00176287, + "epoch": 0.4590227010388611, + "flos": 496909761024.0, + "grad_norm": 0.03855013464211847, + "language_loss": 0.89966094, + "learning_rate": 0.0005901628770452963, + "loss": 0.91009706, + "num_input_tokens_seen": 198938480, + "router_z_loss_mlp": 0.41870117, + "step": 2386, + "time_per_iteration": 2.60300350189209 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043124, + "balance_loss_mlp": 1.00132906, + "epoch": 0.45921508272412465, + "flos": 494602066176.0, + "grad_norm": 0.034718704885035666, + "language_loss": 0.87768519, + "learning_rate": 0.000589856424383199, + "loss": 0.88811642, + "num_input_tokens_seen": 199008608, + "router_z_loss_mlp": 0.41821289, + "step": 2387, + "time_per_iteration": 2.6108267307281494 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044169, + "balance_loss_mlp": 1.00232685, + "epoch": 0.45940746440938823, + "flos": 692593372416.0, + "grad_norm": 0.03330437261727838, + "language_loss": 0.83652228, + "learning_rate": 0.000589549936835744, + "loss": 0.846964, + "num_input_tokens_seen": 199084592, + "router_z_loss_mlp": 0.41870117, + "step": 2388, + "time_per_iteration": 2.8968546390533447 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104723, + "balance_loss_mlp": 1.00545883, + "epoch": 0.45959984609465176, + "flos": 504737844480.0, + "grad_norm": 0.03238722342606361, + "language_loss": 0.79404306, + "learning_rate": 0.0005892434145219202, + "loss": 0.80451536, + "num_input_tokens_seen": 199151504, + "router_z_loss_mlp": 0.41796875, + "step": 2389, + "time_per_iteration": 2.6019601821899414 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045231, + "balance_loss_mlp": 1.00350749, + "epoch": 0.45979222777991535, + "flos": 677840259072.0, + "grad_norm": 0.03571192687498619, + "language_loss": 0.83136904, + "learning_rate": 0.0005889368575607303, + "loss": 0.84182131, + "num_input_tokens_seen": 199224528, + "router_z_loss_mlp": 0.41748047, + "step": 2390, + "time_per_iteration": 2.8418307304382324 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042843, + "balance_loss_mlp": 1.00107241, + "epoch": 0.45998460946517894, + "flos": 779039594496.0, + "grad_norm": 0.031212653964934608, + "language_loss": 0.79287618, + "learning_rate": 0.00058863026607119, + "loss": 0.80330467, + "num_input_tokens_seen": 199312512, + "router_z_loss_mlp": 0.41796875, + "step": 2391, + "time_per_iteration": 3.0931389331817627 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045182, + "balance_loss_mlp": 1.00333977, + "epoch": 0.46017699115044247, + "flos": 853022901504.0, + "grad_norm": 0.035796836390277, + "language_loss": 0.80142331, + "learning_rate": 0.0005883236401723287, + "loss": 0.8118751, + "num_input_tokens_seen": 199397216, + "router_z_loss_mlp": 0.41870117, + "step": 2392, + "time_per_iteration": 3.170374631881714 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044116, + "balance_loss_mlp": 1.00222623, + "epoch": 0.46036937283570606, + "flos": 576964621824.0, + "grad_norm": 0.03330985308732758, + "language_loss": 0.84980971, + "learning_rate": 0.0005880169799831893, + "loss": 0.86025083, + "num_input_tokens_seen": 199464288, + "router_z_loss_mlp": 0.41918945, + "step": 2393, + "time_per_iteration": 2.693976879119873 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048284, + "balance_loss_mlp": 1.00641727, + "epoch": 0.4605617545209696, + "flos": 613120779264.0, + "grad_norm": 0.03386951364717573, + "language_loss": 0.82288468, + "learning_rate": 0.0005877102856228278, + "loss": 0.83336759, + "num_input_tokens_seen": 199538096, + "router_z_loss_mlp": 0.41894531, + "step": 2394, + "time_per_iteration": 2.8137876987457275 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104836, + "balance_loss_mlp": 1.0063504, + "epoch": 0.4607541362062332, + "flos": 534159500544.0, + "grad_norm": 0.06543347642857557, + "language_loss": 0.85095239, + "learning_rate": 0.0005874035572103133, + "loss": 0.86143595, + "num_input_tokens_seen": 199609504, + "router_z_loss_mlp": 0.42041016, + "step": 2395, + "time_per_iteration": 2.6604816913604736 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046268, + "balance_loss_mlp": 1.0043298, + "epoch": 0.4609465178914967, + "flos": 648474983424.0, + "grad_norm": 0.04503809754512356, + "language_loss": 0.83026469, + "learning_rate": 0.0005870967948647288, + "loss": 0.84072733, + "num_input_tokens_seen": 199678960, + "router_z_loss_mlp": 0.41967773, + "step": 2396, + "time_per_iteration": 2.8022336959838867 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047306, + "balance_loss_mlp": 1.00658417, + "epoch": 0.4611388995767603, + "flos": 1469501204736.0, + "grad_norm": 0.004136605290049959, + "language_loss": 0.743083, + "learning_rate": 0.0005867899987051693, + "loss": 0.75355613, + "num_input_tokens_seen": 199903568, + "router_z_loss_mlp": 0.40722656, + "step": 2397, + "time_per_iteration": 5.5826334953308105 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045565, + "balance_loss_mlp": 1.00350773, + "epoch": 0.46133128126202383, + "flos": 724477280256.0, + "grad_norm": 0.03194619056097999, + "language_loss": 0.86316049, + "learning_rate": 0.0005864831688507443, + "loss": 0.8736161, + "num_input_tokens_seen": 199988672, + "router_z_loss_mlp": 0.42089844, + "step": 2398, + "time_per_iteration": 3.0160725116729736 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051143, + "balance_loss_mlp": 1.00903809, + "epoch": 0.4615236629472874, + "flos": 549114802944.0, + "grad_norm": 0.0336665595141197, + "language_loss": 0.75746781, + "learning_rate": 0.0005861763054205754, + "loss": 0.76797926, + "num_input_tokens_seen": 200062304, + "router_z_loss_mlp": 0.42138672, + "step": 2399, + "time_per_iteration": 2.7720346450805664 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052341, + "balance_loss_mlp": 1.01011705, + "epoch": 0.461716044632551, + "flos": 603460343808.0, + "grad_norm": 0.030278987672658065, + "language_loss": 0.80694187, + "learning_rate": 0.0005858694085337976, + "loss": 0.81746531, + "num_input_tokens_seen": 200138464, + "router_z_loss_mlp": 0.42260742, + "step": 2400, + "time_per_iteration": 2.790825366973877 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049426, + "balance_loss_mlp": 1.00722611, + "epoch": 0.46190842631781454, + "flos": 475437697536.0, + "grad_norm": 0.03561782978750914, + "language_loss": 0.83960855, + "learning_rate": 0.0005855624783095589, + "loss": 0.85010278, + "num_input_tokens_seen": 200205728, + "router_z_loss_mlp": 0.42236328, + "step": 2401, + "time_per_iteration": 2.5512595176696777 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051216, + "balance_loss_mlp": 1.00930238, + "epoch": 0.4621008080030781, + "flos": 438402786048.0, + "grad_norm": 0.034731386600305836, + "language_loss": 0.85895813, + "learning_rate": 0.00058525551486702, + "loss": 0.86947024, + "num_input_tokens_seen": 200269824, + "router_z_loss_mlp": 0.41943359, + "step": 2402, + "time_per_iteration": 2.5168349742889404 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049789, + "balance_loss_mlp": 1.0077796, + "epoch": 0.46229318968834165, + "flos": 526498612992.0, + "grad_norm": 0.03903258697063272, + "language_loss": 0.81848848, + "learning_rate": 0.0005849485183253548, + "loss": 0.82898641, + "num_input_tokens_seen": 200341264, + "router_z_loss_mlp": 0.42041016, + "step": 2403, + "time_per_iteration": 2.640596389770508 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043904, + "balance_loss_mlp": 1.00213277, + "epoch": 0.46248557137360524, + "flos": 440534536704.0, + "grad_norm": 0.0318215105397156, + "language_loss": 0.87703103, + "learning_rate": 0.0005846414888037501, + "loss": 0.88747007, + "num_input_tokens_seen": 200405632, + "router_z_loss_mlp": 0.41796875, + "step": 2404, + "time_per_iteration": 2.4814634323120117 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046869, + "balance_loss_mlp": 1.00516927, + "epoch": 0.4626779530588688, + "flos": 618773370624.0, + "grad_norm": 0.036713203920182555, + "language_loss": 0.8266353, + "learning_rate": 0.0005843344264214049, + "loss": 0.83710396, + "num_input_tokens_seen": 200479312, + "router_z_loss_mlp": 0.41723633, + "step": 2405, + "time_per_iteration": 2.7493507862091064 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046289, + "balance_loss_mlp": 1.00461316, + "epoch": 0.46287033474413236, + "flos": 671360436480.0, + "grad_norm": 0.031131832431387497, + "language_loss": 0.85281026, + "learning_rate": 0.0005840273312975317, + "loss": 0.86327314, + "num_input_tokens_seen": 200552976, + "router_z_loss_mlp": 0.41699219, + "step": 2406, + "time_per_iteration": 2.8235156536102295 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045049, + "balance_loss_mlp": 1.00332618, + "epoch": 0.46306271642939595, + "flos": 481199159040.0, + "grad_norm": 0.037353418102982906, + "language_loss": 0.90573472, + "learning_rate": 0.0005837202035513555, + "loss": 0.91618526, + "num_input_tokens_seen": 200621088, + "router_z_loss_mlp": 0.41748047, + "step": 2407, + "time_per_iteration": 2.5672457218170166 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043171, + "balance_loss_mlp": 1.001472, + "epoch": 0.4632550981146595, + "flos": 581858022912.0, + "grad_norm": 0.03272683029516706, + "language_loss": 0.81903768, + "learning_rate": 0.0005834130433021136, + "loss": 0.82946944, + "num_input_tokens_seen": 200698400, + "router_z_loss_mlp": 0.41723633, + "step": 2408, + "time_per_iteration": 4.229294538497925 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042173, + "balance_loss_mlp": 1.00044954, + "epoch": 0.46344747979992307, + "flos": 525018149376.0, + "grad_norm": 0.030754893265702864, + "language_loss": 0.73835284, + "learning_rate": 0.0005831058506690563, + "loss": 0.74877453, + "num_input_tokens_seen": 200767264, + "router_z_loss_mlp": 0.41748047, + "step": 2409, + "time_per_iteration": 2.614616632461548 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043301, + "balance_loss_mlp": 1.00183976, + "epoch": 0.4636398614851866, + "flos": 747813776640.0, + "grad_norm": 0.03608107183813509, + "language_loss": 0.86105043, + "learning_rate": 0.0005827986257714464, + "loss": 0.87148345, + "num_input_tokens_seen": 200841440, + "router_z_loss_mlp": 0.41479492, + "step": 2410, + "time_per_iteration": 2.953162670135498 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051055, + "balance_loss_mlp": 1.00935507, + "epoch": 0.4638322431704502, + "flos": 597646392576.0, + "grad_norm": 0.032192415237476964, + "language_loss": 0.89042687, + "learning_rate": 0.0005824913687285591, + "loss": 0.90093744, + "num_input_tokens_seen": 200911296, + "router_z_loss_mlp": 0.41723633, + "step": 2411, + "time_per_iteration": 2.685081958770752 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045225, + "balance_loss_mlp": 1.00357294, + "epoch": 0.4640246248557137, + "flos": 540533365248.0, + "grad_norm": 0.03324810257023632, + "language_loss": 0.82180583, + "learning_rate": 0.0005821840796596821, + "loss": 0.83225811, + "num_input_tokens_seen": 200981920, + "router_z_loss_mlp": 0.41674805, + "step": 2412, + "time_per_iteration": 2.7183375358581543 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045715, + "balance_loss_mlp": 1.00403953, + "epoch": 0.4642170065409773, + "flos": 563809590528.0, + "grad_norm": 0.030050486484180242, + "language_loss": 0.80926406, + "learning_rate": 0.0005818767586841158, + "loss": 0.81972128, + "num_input_tokens_seen": 201059392, + "router_z_loss_mlp": 0.41699219, + "step": 2413, + "time_per_iteration": 2.7701165676116943 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050594, + "balance_loss_mlp": 1.00884688, + "epoch": 0.46440938822624084, + "flos": 532062743040.0, + "grad_norm": 0.027541485530404662, + "language_loss": 0.86138541, + "learning_rate": 0.0005815694059211726, + "loss": 0.87189138, + "num_input_tokens_seen": 201130192, + "router_z_loss_mlp": 0.41772461, + "step": 2414, + "time_per_iteration": 2.668760061264038 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104752, + "balance_loss_mlp": 1.00717926, + "epoch": 0.4646017699115044, + "flos": 1529627649024.0, + "grad_norm": 0.008676045744997887, + "language_loss": 0.80873632, + "learning_rate": 0.0005812620214901778, + "loss": 0.81921148, + "num_input_tokens_seen": 201354720, + "router_z_loss_mlp": 0.40332031, + "step": 2415, + "time_per_iteration": 4.801916599273682 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054371, + "balance_loss_mlp": 1.01403046, + "epoch": 0.464794151596768, + "flos": 1544174682624.0, + "grad_norm": 0.009441918844152984, + "language_loss": 0.7694506, + "learning_rate": 0.000580954605510468, + "loss": 0.77999437, + "num_input_tokens_seen": 201592096, + "router_z_loss_mlp": 0.40332031, + "step": 2416, + "time_per_iteration": 4.990759372711182 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045947, + "balance_loss_mlp": 1.0040803, + "epoch": 0.46498653328203154, + "flos": 502539019776.0, + "grad_norm": 0.03083676606802021, + "language_loss": 0.86654723, + "learning_rate": 0.0005806471581013931, + "loss": 0.87700671, + "num_input_tokens_seen": 201666160, + "router_z_loss_mlp": 0.41894531, + "step": 2417, + "time_per_iteration": 2.6697516441345215 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046587, + "balance_loss_mlp": 1.00452995, + "epoch": 0.46517891496729513, + "flos": 677301732864.0, + "grad_norm": 0.03671323650301262, + "language_loss": 0.79226685, + "learning_rate": 0.0005803396793823146, + "loss": 0.80273271, + "num_input_tokens_seen": 201733552, + "router_z_loss_mlp": 0.42089844, + "step": 2418, + "time_per_iteration": 2.8375697135925293 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054201, + "balance_loss_mlp": 1.01212037, + "epoch": 0.46537129665255866, + "flos": 586512296448.0, + "grad_norm": 0.037063881541601694, + "language_loss": 0.86435425, + "learning_rate": 0.0005800321694726065, + "loss": 0.87489623, + "num_input_tokens_seen": 201806128, + "router_z_loss_mlp": 0.42114258, + "step": 2419, + "time_per_iteration": 2.7743778228759766 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053515, + "balance_loss_mlp": 1.01136279, + "epoch": 0.46556367833782225, + "flos": 588821936640.0, + "grad_norm": 0.0340005426894483, + "language_loss": 0.87128568, + "learning_rate": 0.0005797246284916545, + "loss": 0.8818208, + "num_input_tokens_seen": 201874224, + "router_z_loss_mlp": 0.421875, + "step": 2420, + "time_per_iteration": 2.6835851669311523 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049313, + "balance_loss_mlp": 1.00878143, + "epoch": 0.4657560600230858, + "flos": 1488584893440.0, + "grad_norm": 0.006163961209168608, + "language_loss": 0.77505189, + "learning_rate": 0.0005794170565588569, + "loss": 0.78554499, + "num_input_tokens_seen": 202111648, + "router_z_loss_mlp": 0.40527344, + "step": 2421, + "time_per_iteration": 4.943193197250366 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047739, + "balance_loss_mlp": 1.00570607, + "epoch": 0.46594844170834937, + "flos": 581393373696.0, + "grad_norm": 0.03388172676180004, + "language_loss": 0.8850925, + "learning_rate": 0.0005791094537936233, + "loss": 0.89556992, + "num_input_tokens_seen": 202183344, + "router_z_loss_mlp": 0.4206543, + "step": 2422, + "time_per_iteration": 2.694913148880005 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047655, + "balance_loss_mlp": 1.00559843, + "epoch": 0.4661408233936129, + "flos": 513571048704.0, + "grad_norm": 0.036220885297141736, + "language_loss": 0.82194817, + "learning_rate": 0.0005788018203153762, + "loss": 0.83242476, + "num_input_tokens_seen": 202252512, + "router_z_loss_mlp": 0.42089844, + "step": 2423, + "time_per_iteration": 2.582130193710327 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104833, + "balance_loss_mlp": 1.006392, + "epoch": 0.4663332050788765, + "flos": 492033856512.0, + "grad_norm": 0.03516767090589214, + "language_loss": 0.86157548, + "learning_rate": 0.000578494156243549, + "loss": 0.87205875, + "num_input_tokens_seen": 202320096, + "router_z_loss_mlp": 0.41967773, + "step": 2424, + "time_per_iteration": 2.569465160369873 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047846, + "balance_loss_mlp": 1.0060271, + "epoch": 0.4665255867641401, + "flos": 513708109056.0, + "grad_norm": 0.03097112252036683, + "language_loss": 0.89247042, + "learning_rate": 0.0005781864616975878, + "loss": 0.90294886, + "num_input_tokens_seen": 202391552, + "router_z_loss_mlp": 0.41845703, + "step": 2425, + "time_per_iteration": 2.6580159664154053 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043502, + "balance_loss_mlp": 1.00175464, + "epoch": 0.4667179684494036, + "flos": 425707546368.0, + "grad_norm": 0.0331787429652153, + "language_loss": 0.84786129, + "learning_rate": 0.0005778787367969502, + "loss": 0.85829628, + "num_input_tokens_seen": 202457328, + "router_z_loss_mlp": 0.41772461, + "step": 2426, + "time_per_iteration": 2.577146291732788 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046987, + "balance_loss_mlp": 1.00526416, + "epoch": 0.4669103501346672, + "flos": 709224524544.0, + "grad_norm": 0.030186535385466236, + "language_loss": 0.81415391, + "learning_rate": 0.0005775709816611053, + "loss": 0.82462376, + "num_input_tokens_seen": 202535888, + "router_z_loss_mlp": 0.41748047, + "step": 2427, + "time_per_iteration": 2.946763515472412 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044811, + "balance_loss_mlp": 1.00294447, + "epoch": 0.4671027318199307, + "flos": 555946513920.0, + "grad_norm": 0.029160974795623382, + "language_loss": 0.83887118, + "learning_rate": 0.0005772631964095346, + "loss": 0.84931928, + "num_input_tokens_seen": 202608400, + "router_z_loss_mlp": 0.41894531, + "step": 2428, + "time_per_iteration": 2.7246575355529785 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047997, + "balance_loss_mlp": 1.0062499, + "epoch": 0.4672951135051943, + "flos": 568196546304.0, + "grad_norm": 0.03470882192857659, + "language_loss": 0.86100912, + "learning_rate": 0.000576955381161731, + "loss": 0.87148911, + "num_input_tokens_seen": 202677712, + "router_z_loss_mlp": 0.41772461, + "step": 2429, + "time_per_iteration": 2.6618916988372803 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051296, + "balance_loss_mlp": 1.00959647, + "epoch": 0.46748749519045785, + "flos": 425418841344.0, + "grad_norm": 0.034295751127670006, + "language_loss": 0.86858582, + "learning_rate": 0.0005766475360371985, + "loss": 0.87909877, + "num_input_tokens_seen": 202743824, + "router_z_loss_mlp": 0.41723633, + "step": 2430, + "time_per_iteration": 2.6010043621063232 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048678, + "balance_loss_mlp": 1.00697899, + "epoch": 0.46767987687572143, + "flos": 539371742208.0, + "grad_norm": 0.034969896754344705, + "language_loss": 0.85521102, + "learning_rate": 0.0005763396611554536, + "loss": 0.86569786, + "num_input_tokens_seen": 202813072, + "router_z_loss_mlp": 0.41723633, + "step": 2431, + "time_per_iteration": 2.6345412731170654 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045821, + "balance_loss_mlp": 1.00409806, + "epoch": 0.467872258560985, + "flos": 825076851456.0, + "grad_norm": 0.03589185796451142, + "language_loss": 0.80950278, + "learning_rate": 0.0005760317566360237, + "loss": 0.81996095, + "num_input_tokens_seen": 202886576, + "router_z_loss_mlp": 0.41748047, + "step": 2432, + "time_per_iteration": 3.0410006046295166 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050036, + "balance_loss_mlp": 1.0083127, + "epoch": 0.46806464024624855, + "flos": 662854821120.0, + "grad_norm": 0.03375923289076794, + "language_loss": 0.86271471, + "learning_rate": 0.000575723822598448, + "loss": 0.87321508, + "num_input_tokens_seen": 202956736, + "router_z_loss_mlp": 0.41748047, + "step": 2433, + "time_per_iteration": 2.7712388038635254 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044282, + "balance_loss_mlp": 1.00251079, + "epoch": 0.46825702193151214, + "flos": 757055249664.0, + "grad_norm": 0.029730946872360612, + "language_loss": 0.82302332, + "learning_rate": 0.0005754158591622773, + "loss": 0.83346617, + "num_input_tokens_seen": 203036432, + "router_z_loss_mlp": 0.41796875, + "step": 2434, + "time_per_iteration": 2.9708468914031982 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049984, + "balance_loss_mlp": 1.00818896, + "epoch": 0.4684494036167757, + "flos": 440310960384.0, + "grad_norm": 0.03563934149764459, + "language_loss": 0.83011699, + "learning_rate": 0.0005751078664470732, + "loss": 0.84061682, + "num_input_tokens_seen": 203101904, + "router_z_loss_mlp": 0.41821289, + "step": 2435, + "time_per_iteration": 2.5696167945861816 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046457, + "balance_loss_mlp": 1.00468564, + "epoch": 0.46864178530203926, + "flos": 533749286400.0, + "grad_norm": 0.031914354194682755, + "language_loss": 0.86557531, + "learning_rate": 0.0005747998445724094, + "loss": 0.87603986, + "num_input_tokens_seen": 203170272, + "router_z_loss_mlp": 0.41796875, + "step": 2436, + "time_per_iteration": 2.6336376667022705 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047101, + "balance_loss_mlp": 1.00535429, + "epoch": 0.4688341669873028, + "flos": 577826846208.0, + "grad_norm": 0.03221336233810001, + "language_loss": 0.89470494, + "learning_rate": 0.0005744917936578707, + "loss": 0.90517592, + "num_input_tokens_seen": 203243920, + "router_z_loss_mlp": 0.41772461, + "step": 2437, + "time_per_iteration": 2.7748000621795654 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054309, + "balance_loss_mlp": 1.0126332, + "epoch": 0.4690265486725664, + "flos": 540718057728.0, + "grad_norm": 0.029623138174113085, + "language_loss": 0.84520715, + "learning_rate": 0.0005741837138230526, + "loss": 0.85575026, + "num_input_tokens_seen": 203321760, + "router_z_loss_mlp": 0.41699219, + "step": 2438, + "time_per_iteration": 2.717194080352783 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047928, + "balance_loss_mlp": 1.0061574, + "epoch": 0.4692189303578299, + "flos": 771882240000.0, + "grad_norm": 0.03250588789777806, + "language_loss": 0.86937356, + "learning_rate": 0.0005738756051875627, + "loss": 0.87985283, + "num_input_tokens_seen": 203409088, + "router_z_loss_mlp": 0.41796875, + "step": 2439, + "time_per_iteration": 3.0656278133392334 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050138, + "balance_loss_mlp": 1.00846255, + "epoch": 0.4694113120430935, + "flos": 572514482688.0, + "grad_norm": 0.03167805631394848, + "language_loss": 0.84031767, + "learning_rate": 0.0005735674678710192, + "loss": 0.85081905, + "num_input_tokens_seen": 203481680, + "router_z_loss_mlp": 0.41699219, + "step": 2440, + "time_per_iteration": 2.6962802410125732 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010485, + "balance_loss_mlp": 1.00675285, + "epoch": 0.4696036937283571, + "flos": 750095226624.0, + "grad_norm": 0.037443971636707395, + "language_loss": 0.82144701, + "learning_rate": 0.0005732593019930517, + "loss": 0.83193195, + "num_input_tokens_seen": 203554848, + "router_z_loss_mlp": 0.41772461, + "step": 2441, + "time_per_iteration": 2.9041428565979004 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050874, + "balance_loss_mlp": 1.00915074, + "epoch": 0.4697960754136206, + "flos": 494443618560.0, + "grad_norm": 0.033679899008564836, + "language_loss": 0.87957233, + "learning_rate": 0.0005729511076733008, + "loss": 0.89008105, + "num_input_tokens_seen": 203624816, + "router_z_loss_mlp": 0.41748047, + "step": 2442, + "time_per_iteration": 2.6734514236450195 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01056181, + "balance_loss_mlp": 1.01433861, + "epoch": 0.4699884570988842, + "flos": 726361155072.0, + "grad_norm": 0.036289078656904894, + "language_loss": 0.85521489, + "learning_rate": 0.000572642885031418, + "loss": 0.86577672, + "num_input_tokens_seen": 203698256, + "router_z_loss_mlp": 0.41870117, + "step": 2443, + "time_per_iteration": 2.9099576473236084 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052417, + "balance_loss_mlp": 1.01062274, + "epoch": 0.47018083878414774, + "flos": 556578359040.0, + "grad_norm": 0.03125880297204364, + "language_loss": 0.81027329, + "learning_rate": 0.0005723346341870662, + "loss": 0.82079738, + "num_input_tokens_seen": 203772672, + "router_z_loss_mlp": 0.41821289, + "step": 2444, + "time_per_iteration": 2.7017409801483154 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046603, + "balance_loss_mlp": 1.00480783, + "epoch": 0.4703732204694113, + "flos": 424962940416.0, + "grad_norm": 0.03329454905005034, + "language_loss": 0.86812586, + "learning_rate": 0.0005720263552599188, + "loss": 0.8785919, + "num_input_tokens_seen": 203835904, + "router_z_loss_mlp": 0.41821289, + "step": 2445, + "time_per_iteration": 2.462155818939209 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044259, + "balance_loss_mlp": 1.00239313, + "epoch": 0.47056560215467486, + "flos": 704756888832.0, + "grad_norm": 0.03166905827629482, + "language_loss": 0.80339378, + "learning_rate": 0.0005717180483696604, + "loss": 0.81383634, + "num_input_tokens_seen": 203914704, + "router_z_loss_mlp": 0.41894531, + "step": 2446, + "time_per_iteration": 2.8927905559539795 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043265, + "balance_loss_mlp": 1.00115991, + "epoch": 0.47075798383993844, + "flos": 556013587968.0, + "grad_norm": 0.03197533000624638, + "language_loss": 0.8331126, + "learning_rate": 0.0005714097136359862, + "loss": 0.8435452, + "num_input_tokens_seen": 203985072, + "router_z_loss_mlp": 0.42138672, + "step": 2447, + "time_per_iteration": 2.632544994354248 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043755, + "balance_loss_mlp": 1.00169826, + "epoch": 0.470950365525202, + "flos": 565494188544.0, + "grad_norm": 0.028044805803111937, + "language_loss": 0.87163484, + "learning_rate": 0.0005711013511786027, + "loss": 0.88207239, + "num_input_tokens_seen": 204061904, + "router_z_loss_mlp": 0.42089844, + "step": 2448, + "time_per_iteration": 2.781325578689575 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049475, + "balance_loss_mlp": 1.00768065, + "epoch": 0.47114274721046556, + "flos": 535499013120.0, + "grad_norm": 0.029728682222295192, + "language_loss": 0.84444499, + "learning_rate": 0.0005707929611172263, + "loss": 0.8549397, + "num_input_tokens_seen": 204137392, + "router_z_loss_mlp": 0.41821289, + "step": 2449, + "time_per_iteration": 2.704754114151001 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104782, + "balance_loss_mlp": 1.00576317, + "epoch": 0.47133512889572915, + "flos": 474078743040.0, + "grad_norm": 0.03341999970225476, + "language_loss": 0.84505057, + "learning_rate": 0.000570484543571585, + "loss": 0.85552877, + "num_input_tokens_seen": 204202752, + "router_z_loss_mlp": 0.42089844, + "step": 2450, + "time_per_iteration": 2.56648850440979 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043207, + "balance_loss_mlp": 1.00129259, + "epoch": 0.4715275105809927, + "flos": 459968168448.0, + "grad_norm": 0.03640704052870178, + "language_loss": 0.83504367, + "learning_rate": 0.0005701760986614171, + "loss": 0.84547579, + "num_input_tokens_seen": 204266960, + "router_z_loss_mlp": 0.41943359, + "step": 2451, + "time_per_iteration": 2.5392374992370605 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047118, + "balance_loss_mlp": 1.00522745, + "epoch": 0.47171989226625627, + "flos": 422887570176.0, + "grad_norm": 0.0300201122524448, + "language_loss": 0.87997985, + "learning_rate": 0.0005698676265064714, + "loss": 0.89045107, + "num_input_tokens_seen": 204331216, + "router_z_loss_mlp": 0.41918945, + "step": 2452, + "time_per_iteration": 2.501518487930298 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045823, + "balance_loss_mlp": 1.00378954, + "epoch": 0.4719122739515198, + "flos": 458376889344.0, + "grad_norm": 0.036567202146268483, + "language_loss": 0.89326543, + "learning_rate": 0.0005695591272265074, + "loss": 0.90372366, + "num_input_tokens_seen": 204397216, + "router_z_loss_mlp": 0.4206543, + "step": 2453, + "time_per_iteration": 2.5203113555908203 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049066, + "balance_loss_mlp": 1.00703239, + "epoch": 0.4721046556367834, + "flos": 516017749248.0, + "grad_norm": 0.03590555599096038, + "language_loss": 0.82296801, + "learning_rate": 0.0005692506009412954, + "loss": 0.83345866, + "num_input_tokens_seen": 204469952, + "router_z_loss_mlp": 0.4206543, + "step": 2454, + "time_per_iteration": 2.703277826309204 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050072, + "balance_loss_mlp": 1.00982666, + "epoch": 0.4722970373220469, + "flos": 1575706702080.0, + "grad_norm": 0.007700978657663942, + "language_loss": 0.7755127, + "learning_rate": 0.0005689420477706156, + "loss": 0.78601336, + "num_input_tokens_seen": 204701152, + "router_z_loss_mlp": 0.40234375, + "step": 2455, + "time_per_iteration": 4.935078859329224 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045884, + "balance_loss_mlp": 1.00380278, + "epoch": 0.4724894190073105, + "flos": 587395908096.0, + "grad_norm": 0.032995428661028114, + "language_loss": 0.90020776, + "learning_rate": 0.0005686334678342593, + "loss": 0.91066664, + "num_input_tokens_seen": 204778144, + "router_z_loss_mlp": 0.42114258, + "step": 2456, + "time_per_iteration": 2.913954019546509 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104492, + "balance_loss_mlp": 1.00291097, + "epoch": 0.4726818006925741, + "flos": 869073731328.0, + "grad_norm": 0.0323844824027511, + "language_loss": 0.82033843, + "learning_rate": 0.0005683248612520274, + "loss": 0.83078766, + "num_input_tokens_seen": 204853376, + "router_z_loss_mlp": 0.42041016, + "step": 2457, + "time_per_iteration": 4.4027345180511475 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104796, + "balance_loss_mlp": 1.0055697, + "epoch": 0.4728741823778376, + "flos": 754228470528.0, + "grad_norm": 0.03548497467281451, + "language_loss": 0.84315181, + "learning_rate": 0.0005680162281437321, + "loss": 0.85363138, + "num_input_tokens_seen": 204925280, + "router_z_loss_mlp": 0.42431641, + "step": 2458, + "time_per_iteration": 2.8824384212493896 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048551, + "balance_loss_mlp": 1.00649393, + "epoch": 0.4730665640631012, + "flos": 539658501888.0, + "grad_norm": 0.029540383226657484, + "language_loss": 0.85216498, + "learning_rate": 0.000567707568629195, + "loss": 0.86265045, + "num_input_tokens_seen": 205000592, + "router_z_loss_mlp": 0.42089844, + "step": 2459, + "time_per_iteration": 2.7024879455566406 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105311, + "balance_loss_mlp": 1.01088595, + "epoch": 0.47325894574836475, + "flos": 492683198208.0, + "grad_norm": 0.02914158825310119, + "language_loss": 0.8318013, + "learning_rate": 0.0005673988828282486, + "loss": 0.84233236, + "num_input_tokens_seen": 205073968, + "router_z_loss_mlp": 0.42260742, + "step": 2460, + "time_per_iteration": 2.680508852005005 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045398, + "balance_loss_mlp": 1.00341213, + "epoch": 0.47345132743362833, + "flos": 765832073472.0, + "grad_norm": 0.11223827549321637, + "language_loss": 0.8158704, + "learning_rate": 0.0005670901708607352, + "loss": 0.82632446, + "num_input_tokens_seen": 205153536, + "router_z_loss_mlp": 0.42016602, + "step": 2461, + "time_per_iteration": 2.963573455810547 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105079, + "balance_loss_mlp": 1.00873268, + "epoch": 0.47364370911889186, + "flos": 541169101056.0, + "grad_norm": 0.03621241484942453, + "language_loss": 0.84821182, + "learning_rate": 0.0005667814328465076, + "loss": 0.85871977, + "num_input_tokens_seen": 205220944, + "router_z_loss_mlp": 0.42089844, + "step": 2462, + "time_per_iteration": 2.623180389404297 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052459, + "balance_loss_mlp": 1.01042545, + "epoch": 0.47383609080415545, + "flos": 407092397568.0, + "grad_norm": 0.0408736366196423, + "language_loss": 0.82667732, + "learning_rate": 0.0005664726689054285, + "loss": 0.83720195, + "num_input_tokens_seen": 205282688, + "router_z_loss_mlp": 0.4206543, + "step": 2463, + "time_per_iteration": 2.463602304458618 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054545, + "balance_loss_mlp": 1.01253569, + "epoch": 0.474028472489419, + "flos": 454439031552.0, + "grad_norm": 0.030418063351129263, + "language_loss": 0.81695265, + "learning_rate": 0.0005661638791573704, + "loss": 0.82749808, + "num_input_tokens_seen": 205357360, + "router_z_loss_mlp": 0.42041016, + "step": 2464, + "time_per_iteration": 2.736748695373535 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048425, + "balance_loss_mlp": 1.00651097, + "epoch": 0.47422085417468257, + "flos": 493195479552.0, + "grad_norm": 0.029840540723241396, + "language_loss": 0.87200695, + "learning_rate": 0.0005658550637222164, + "loss": 0.88249123, + "num_input_tokens_seen": 205424352, + "router_z_loss_mlp": 0.41943359, + "step": 2465, + "time_per_iteration": 2.618978261947632 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047266, + "balance_loss_mlp": 1.00532842, + "epoch": 0.47441323585994616, + "flos": 740126644224.0, + "grad_norm": 0.027711669007488924, + "language_loss": 0.82591414, + "learning_rate": 0.0005655462227198592, + "loss": 0.8363868, + "num_input_tokens_seen": 205502912, + "router_z_loss_mlp": 0.41967773, + "step": 2466, + "time_per_iteration": 2.9003212451934814 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045592, + "balance_loss_mlp": 1.00363016, + "epoch": 0.4746056175452097, + "flos": 485675543040.0, + "grad_norm": 0.03086334809399425, + "language_loss": 0.84889436, + "learning_rate": 0.0005652373562702016, + "loss": 0.85935026, + "num_input_tokens_seen": 205571168, + "router_z_loss_mlp": 0.41992188, + "step": 2467, + "time_per_iteration": 2.635524272918701 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050978, + "balance_loss_mlp": 1.00913572, + "epoch": 0.4747979992304733, + "flos": 462006600192.0, + "grad_norm": 0.030700027016666232, + "language_loss": 0.89103687, + "learning_rate": 0.000564928464493156, + "loss": 0.9015466, + "num_input_tokens_seen": 205639648, + "router_z_loss_mlp": 0.41870117, + "step": 2468, + "time_per_iteration": 2.5902397632598877 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050963, + "balance_loss_mlp": 1.00900185, + "epoch": 0.4749903809157368, + "flos": 865880479488.0, + "grad_norm": 0.04027391649848807, + "language_loss": 0.82258296, + "learning_rate": 0.000564619547508645, + "loss": 0.83309263, + "num_input_tokens_seen": 205721536, + "router_z_loss_mlp": 0.41992188, + "step": 2469, + "time_per_iteration": 3.071483850479126 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050762, + "balance_loss_mlp": 1.00877666, + "epoch": 0.4751827626010004, + "flos": 506552699904.0, + "grad_norm": 0.03439249398490307, + "language_loss": 0.83728659, + "learning_rate": 0.0005643106054366008, + "loss": 0.84779418, + "num_input_tokens_seen": 205788512, + "router_z_loss_mlp": 0.42016602, + "step": 2470, + "time_per_iteration": 2.5717906951904297 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054243, + "balance_loss_mlp": 1.01240063, + "epoch": 0.47537514428626393, + "flos": 560453033472.0, + "grad_norm": 0.030831302101538484, + "language_loss": 0.80302799, + "learning_rate": 0.000564001638396965, + "loss": 0.81357038, + "num_input_tokens_seen": 205863104, + "router_z_loss_mlp": 0.41870117, + "step": 2471, + "time_per_iteration": 2.807666540145874 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010519, + "balance_loss_mlp": 1.01008177, + "epoch": 0.4755675259715275, + "flos": 835677278976.0, + "grad_norm": 0.03000607606640632, + "language_loss": 0.82444054, + "learning_rate": 0.0005636926465096897, + "loss": 0.83495951, + "num_input_tokens_seen": 205940688, + "router_z_loss_mlp": 0.41845703, + "step": 2472, + "time_per_iteration": 3.0930862426757812 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052858, + "balance_loss_mlp": 1.01106381, + "epoch": 0.47575990765679105, + "flos": 509233670400.0, + "grad_norm": 0.03423576863830587, + "language_loss": 0.88083971, + "learning_rate": 0.0005633836298947363, + "loss": 0.89136827, + "num_input_tokens_seen": 206008352, + "router_z_loss_mlp": 0.41821289, + "step": 2473, + "time_per_iteration": 2.5820775032043457 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050107, + "balance_loss_mlp": 1.00819325, + "epoch": 0.47595228934205464, + "flos": 592963928832.0, + "grad_norm": 0.03298724569498326, + "language_loss": 0.71285135, + "learning_rate": 0.000563074588672075, + "loss": 0.72335243, + "num_input_tokens_seen": 206078240, + "router_z_loss_mlp": 0.41943359, + "step": 2474, + "time_per_iteration": 2.693268299102783 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054207, + "balance_loss_mlp": 1.01231647, + "epoch": 0.4761446710273182, + "flos": 581684024064.0, + "grad_norm": 0.03213378714772974, + "language_loss": 0.85775197, + "learning_rate": 0.0005627655229616868, + "loss": 0.86829406, + "num_input_tokens_seen": 206148896, + "router_z_loss_mlp": 0.41918945, + "step": 2475, + "time_per_iteration": 2.719207286834717 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051223, + "balance_loss_mlp": 1.00933242, + "epoch": 0.47633705271258175, + "flos": 674080290816.0, + "grad_norm": 0.026991444464169446, + "language_loss": 0.9029963, + "learning_rate": 0.0005624564328835616, + "loss": 0.91350853, + "num_input_tokens_seen": 206223792, + "router_z_loss_mlp": 0.41918945, + "step": 2476, + "time_per_iteration": 2.793189764022827 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054365, + "balance_loss_mlp": 1.0125705, + "epoch": 0.47652943439784534, + "flos": 542971317504.0, + "grad_norm": 0.02962321585608733, + "language_loss": 0.84663439, + "learning_rate": 0.0005621473185576986, + "loss": 0.85717803, + "num_input_tokens_seen": 206299376, + "router_z_loss_mlp": 0.41821289, + "step": 2477, + "time_per_iteration": 2.7773327827453613 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050086, + "balance_loss_mlp": 1.00822008, + "epoch": 0.4767218160831089, + "flos": 525847325952.0, + "grad_norm": 0.03556533386707064, + "language_loss": 0.87709439, + "learning_rate": 0.0005618381801041068, + "loss": 0.8875953, + "num_input_tokens_seen": 206367936, + "router_z_loss_mlp": 0.41894531, + "step": 2478, + "time_per_iteration": 2.6155920028686523 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053209, + "balance_loss_mlp": 1.0111047, + "epoch": 0.47691419776837246, + "flos": 569127790080.0, + "grad_norm": 0.035286823129286084, + "language_loss": 0.83750623, + "learning_rate": 0.0005615290176428044, + "loss": 0.84803832, + "num_input_tokens_seen": 206438864, + "router_z_loss_mlp": 0.42138672, + "step": 2479, + "time_per_iteration": 2.6538074016571045 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049988, + "balance_loss_mlp": 1.00802612, + "epoch": 0.477106579453636, + "flos": 532025804544.0, + "grad_norm": 0.0314839310376407, + "language_loss": 0.85928833, + "learning_rate": 0.0005612198312938187, + "loss": 0.86978817, + "num_input_tokens_seen": 206516656, + "router_z_loss_mlp": 0.41992188, + "step": 2480, + "time_per_iteration": 2.781107187271118 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051481, + "balance_loss_mlp": 1.00937629, + "epoch": 0.4772989611388996, + "flos": 595502002944.0, + "grad_norm": 0.03185012593036433, + "language_loss": 0.79825139, + "learning_rate": 0.0005609106211771868, + "loss": 0.80876625, + "num_input_tokens_seen": 206595040, + "router_z_loss_mlp": 0.42138672, + "step": 2481, + "time_per_iteration": 2.854200839996338 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049319, + "balance_loss_mlp": 1.00702322, + "epoch": 0.4774913428241631, + "flos": 545708668416.0, + "grad_norm": 0.032298555104441296, + "language_loss": 0.89798552, + "learning_rate": 0.0005606013874129543, + "loss": 0.90847874, + "num_input_tokens_seen": 206670192, + "router_z_loss_mlp": 0.42333984, + "step": 2482, + "time_per_iteration": 2.8364884853363037 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044489, + "balance_loss_mlp": 1.00214577, + "epoch": 0.4776837245094267, + "flos": 541130217216.0, + "grad_norm": 0.031860038244933726, + "language_loss": 0.8004725, + "learning_rate": 0.0005602921301211768, + "loss": 0.81091738, + "num_input_tokens_seen": 206746992, + "router_z_loss_mlp": 0.42382812, + "step": 2483, + "time_per_iteration": 2.719606399536133 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044245, + "balance_loss_mlp": 1.00185454, + "epoch": 0.4778761061946903, + "flos": 472756727040.0, + "grad_norm": 0.037639636071959574, + "language_loss": 0.82567894, + "learning_rate": 0.0005599828494219185, + "loss": 0.83612138, + "num_input_tokens_seen": 206813584, + "router_z_loss_mlp": 0.42431641, + "step": 2484, + "time_per_iteration": 2.5541560649871826 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047753, + "balance_loss_mlp": 1.00548136, + "epoch": 0.4780684878799538, + "flos": 727338085632.0, + "grad_norm": 0.033674716450053835, + "language_loss": 0.89748895, + "learning_rate": 0.0005596735454352527, + "loss": 0.90796649, + "num_input_tokens_seen": 206885840, + "router_z_loss_mlp": 0.4230957, + "step": 2485, + "time_per_iteration": 2.9516124725341797 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051411, + "balance_loss_mlp": 1.00921071, + "epoch": 0.4782608695652174, + "flos": 549954673152.0, + "grad_norm": 0.03622289239904689, + "language_loss": 0.86092174, + "learning_rate": 0.0005593642182812619, + "loss": 0.87143582, + "num_input_tokens_seen": 206955104, + "router_z_loss_mlp": 0.42236328, + "step": 2486, + "time_per_iteration": 2.643221139907837 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054193, + "balance_loss_mlp": 1.01192153, + "epoch": 0.47845325125048094, + "flos": 831403084032.0, + "grad_norm": 0.035916445699024475, + "language_loss": 0.84163451, + "learning_rate": 0.0005590548680800378, + "loss": 0.85217643, + "num_input_tokens_seen": 207039792, + "router_z_loss_mlp": 0.4230957, + "step": 2487, + "time_per_iteration": 3.1013588905334473 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105581, + "balance_loss_mlp": 1.01356208, + "epoch": 0.4786456329357445, + "flos": 515271197952.0, + "grad_norm": 0.032399463516541584, + "language_loss": 0.76797146, + "learning_rate": 0.0005587454949516804, + "loss": 0.77852952, + "num_input_tokens_seen": 207115632, + "router_z_loss_mlp": 0.42285156, + "step": 2488, + "time_per_iteration": 2.7681314945220947 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105205, + "balance_loss_mlp": 1.00992179, + "epoch": 0.47883801462100806, + "flos": 565730403840.0, + "grad_norm": 0.034669501918414815, + "language_loss": 0.88538134, + "learning_rate": 0.0005584360990162993, + "loss": 0.89590186, + "num_input_tokens_seen": 207184336, + "router_z_loss_mlp": 0.42163086, + "step": 2489, + "time_per_iteration": 2.6323490142822266 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105419, + "balance_loss_mlp": 1.01196563, + "epoch": 0.47903039630627164, + "flos": 580705148160.0, + "grad_norm": 0.028676455513171533, + "language_loss": 0.85944891, + "learning_rate": 0.0005581266803940124, + "loss": 0.86999071, + "num_input_tokens_seen": 207258720, + "router_z_loss_mlp": 0.42260742, + "step": 2490, + "time_per_iteration": 2.758180856704712 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051095, + "balance_loss_mlp": 1.00891864, + "epoch": 0.47922277799153523, + "flos": 620086638336.0, + "grad_norm": 0.029629924190795385, + "language_loss": 0.8824507, + "learning_rate": 0.0005578172392049471, + "loss": 0.89296162, + "num_input_tokens_seen": 207329216, + "router_z_loss_mlp": 0.42211914, + "step": 2491, + "time_per_iteration": 2.733055353164673 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049931, + "balance_loss_mlp": 1.00787377, + "epoch": 0.47941515967679876, + "flos": 640859782656.0, + "grad_norm": 0.03401187912624355, + "language_loss": 0.84927547, + "learning_rate": 0.0005575077755692386, + "loss": 0.85977477, + "num_input_tokens_seen": 207403712, + "router_z_loss_mlp": 0.42089844, + "step": 2492, + "time_per_iteration": 2.7897393703460693 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051779, + "balance_loss_mlp": 1.00988865, + "epoch": 0.47960754136206235, + "flos": 520876157184.0, + "grad_norm": 0.02611914925979928, + "language_loss": 0.8632732, + "learning_rate": 0.0005571982896070316, + "loss": 0.87379098, + "num_input_tokens_seen": 207477120, + "router_z_loss_mlp": 0.41918945, + "step": 2493, + "time_per_iteration": 2.667999744415283 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051996, + "balance_loss_mlp": 1.01010633, + "epoch": 0.4797999230473259, + "flos": 476032604160.0, + "grad_norm": 0.03441931276085345, + "language_loss": 0.90227294, + "learning_rate": 0.0005568887814384792, + "loss": 0.9127928, + "num_input_tokens_seen": 207544592, + "router_z_loss_mlp": 0.41918945, + "step": 2494, + "time_per_iteration": 2.5400681495666504 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105198, + "balance_loss_mlp": 1.01023245, + "epoch": 0.47999230473258947, + "flos": 533069809152.0, + "grad_norm": 0.031194267436751296, + "language_loss": 0.87632048, + "learning_rate": 0.000556579251183743, + "loss": 0.88684028, + "num_input_tokens_seen": 207613808, + "router_z_loss_mlp": 0.41772461, + "step": 2495, + "time_per_iteration": 2.662360906600952 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047828, + "balance_loss_mlp": 1.00615287, + "epoch": 0.480184686417853, + "flos": 602606867712.0, + "grad_norm": 0.03455941378420467, + "language_loss": 0.8073976, + "learning_rate": 0.0005562696989629936, + "loss": 0.81787586, + "num_input_tokens_seen": 207684464, + "router_z_loss_mlp": 0.41699219, + "step": 2496, + "time_per_iteration": 2.677384614944458 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049959, + "balance_loss_mlp": 1.00837922, + "epoch": 0.4803770681031166, + "flos": 529262208768.0, + "grad_norm": 0.02987635047659329, + "language_loss": 0.83264202, + "learning_rate": 0.0005559601248964095, + "loss": 0.84314156, + "num_input_tokens_seen": 207754016, + "router_z_loss_mlp": 0.41601562, + "step": 2497, + "time_per_iteration": 2.629697322845459 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052806, + "balance_loss_mlp": 1.01132119, + "epoch": 0.4805694497883801, + "flos": 512229590784.0, + "grad_norm": 0.031958617017597245, + "language_loss": 0.86286914, + "learning_rate": 0.0005556505291041783, + "loss": 0.87339711, + "num_input_tokens_seen": 207827104, + "router_z_loss_mlp": 0.41503906, + "step": 2498, + "time_per_iteration": 2.6821835041046143 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105189, + "balance_loss_mlp": 1.0103811, + "epoch": 0.4807618314736437, + "flos": 601606604544.0, + "grad_norm": 0.02993690761083535, + "language_loss": 0.84804475, + "learning_rate": 0.0005553409117064954, + "loss": 0.85856366, + "num_input_tokens_seen": 207907824, + "router_z_loss_mlp": 0.4152832, + "step": 2499, + "time_per_iteration": 2.868149518966675 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047722, + "balance_loss_mlp": 1.00626087, + "epoch": 0.4809542131589073, + "flos": 570030843648.0, + "grad_norm": 0.03218775088546566, + "language_loss": 0.85501659, + "learning_rate": 0.0005550312728235654, + "loss": 0.86549377, + "num_input_tokens_seen": 207975632, + "router_z_loss_mlp": 0.41479492, + "step": 2500, + "time_per_iteration": 2.6775684356689453 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049066, + "balance_loss_mlp": 1.00767624, + "epoch": 0.4811465948441708, + "flos": 577166810880.0, + "grad_norm": 0.03560315442462447, + "language_loss": 0.84339613, + "learning_rate": 0.0005547216125756003, + "loss": 0.85388672, + "num_input_tokens_seen": 208048000, + "router_z_loss_mlp": 0.4140625, + "step": 2501, + "time_per_iteration": 2.730938196182251 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051501, + "balance_loss_mlp": 1.01011145, + "epoch": 0.4813389765294344, + "flos": 825298482432.0, + "grad_norm": 0.030150461655227775, + "language_loss": 0.82324314, + "learning_rate": 0.0005544119310828211, + "loss": 0.83375812, + "num_input_tokens_seen": 208132592, + "router_z_loss_mlp": 0.4140625, + "step": 2502, + "time_per_iteration": 3.113402843475342 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053515, + "balance_loss_mlp": 1.01203024, + "epoch": 0.48153135821469795, + "flos": 636700293888.0, + "grad_norm": 0.03404405348604493, + "language_loss": 0.85394537, + "learning_rate": 0.0005541022284654568, + "loss": 0.8644805, + "num_input_tokens_seen": 208215824, + "router_z_loss_mlp": 0.41503906, + "step": 2503, + "time_per_iteration": 2.946800708770752 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055542, + "balance_loss_mlp": 1.01393807, + "epoch": 0.48172373989996153, + "flos": 504709654272.0, + "grad_norm": 0.029988445312160498, + "language_loss": 0.84392428, + "learning_rate": 0.0005537925048437446, + "loss": 0.85447979, + "num_input_tokens_seen": 208284304, + "router_z_loss_mlp": 0.41625977, + "step": 2504, + "time_per_iteration": 2.5928125381469727 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053543, + "balance_loss_mlp": 1.0131073, + "epoch": 0.48191612158522507, + "flos": 1535568945408.0, + "grad_norm": 0.009640282548559968, + "language_loss": 0.75751472, + "learning_rate": 0.00055348276033793, + "loss": 0.76805007, + "num_input_tokens_seen": 208510224, + "router_z_loss_mlp": 0.40429688, + "step": 2505, + "time_per_iteration": 4.956170320510864 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105109, + "balance_loss_mlp": 1.00936711, + "epoch": 0.48210850327048865, + "flos": 703813006080.0, + "grad_norm": 0.02927379087328487, + "language_loss": 0.88880217, + "learning_rate": 0.0005531729950682664, + "loss": 0.89931303, + "num_input_tokens_seen": 208596816, + "router_z_loss_mlp": 0.41748047, + "step": 2506, + "time_per_iteration": 2.9935836791992188 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052672, + "balance_loss_mlp": 1.01106763, + "epoch": 0.4823008849557522, + "flos": 440701732608.0, + "grad_norm": 0.04047033106809228, + "language_loss": 0.85417378, + "learning_rate": 0.000552863209155015, + "loss": 0.86470056, + "num_input_tokens_seen": 208659616, + "router_z_loss_mlp": 0.41625977, + "step": 2507, + "time_per_iteration": 2.4729647636413574 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053204, + "balance_loss_mlp": 1.01157653, + "epoch": 0.48249326664101577, + "flos": 472813107456.0, + "grad_norm": 0.04603508602748786, + "language_loss": 0.82726657, + "learning_rate": 0.0005525534027184461, + "loss": 0.8377986, + "num_input_tokens_seen": 208728080, + "router_z_loss_mlp": 0.41650391, + "step": 2508, + "time_per_iteration": 2.5513370037078857 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055774, + "balance_loss_mlp": 1.01421785, + "epoch": 0.48268564832627936, + "flos": 564315068928.0, + "grad_norm": 0.02879273586569962, + "language_loss": 0.83137357, + "learning_rate": 0.0005522435758788365, + "loss": 0.84193128, + "num_input_tokens_seen": 208803376, + "router_z_loss_mlp": 0.41577148, + "step": 2509, + "time_per_iteration": 2.753450393676758 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055715, + "balance_loss_mlp": 1.01415896, + "epoch": 0.4828780300115429, + "flos": 630843568128.0, + "grad_norm": 0.03460020680283242, + "language_loss": 0.80409563, + "learning_rate": 0.0005519337287564721, + "loss": 0.8146528, + "num_input_tokens_seen": 208876656, + "router_z_loss_mlp": 0.41577148, + "step": 2510, + "time_per_iteration": 2.790820360183716 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051713, + "balance_loss_mlp": 1.01020396, + "epoch": 0.4830704116968065, + "flos": 633005454336.0, + "grad_norm": 0.032398618840687954, + "language_loss": 0.83713245, + "learning_rate": 0.000551623861471646, + "loss": 0.84764957, + "num_input_tokens_seen": 208950224, + "router_z_loss_mlp": 0.4152832, + "step": 2511, + "time_per_iteration": 2.750471353530884 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01056118, + "balance_loss_mlp": 1.01596832, + "epoch": 0.48326279338207, + "flos": 1572619408128.0, + "grad_norm": 0.008656675131842123, + "language_loss": 0.78818834, + "learning_rate": 0.0005513139741446594, + "loss": 0.79874945, + "num_input_tokens_seen": 209173984, + "router_z_loss_mlp": 0.40136719, + "step": 2512, + "time_per_iteration": 4.832056999206543 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048019, + "balance_loss_mlp": 1.00636733, + "epoch": 0.4834551750673336, + "flos": 510238791168.0, + "grad_norm": 0.030652937711335218, + "language_loss": 0.87039137, + "learning_rate": 0.0005510040668958211, + "loss": 0.88087165, + "num_input_tokens_seen": 209242832, + "router_z_loss_mlp": 0.41674805, + "step": 2513, + "time_per_iteration": 2.593559741973877 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053741, + "balance_loss_mlp": 1.0134964, + "epoch": 0.48364755675259713, + "flos": 1531828419072.0, + "grad_norm": 0.007806244380112886, + "language_loss": 0.77760583, + "learning_rate": 0.0005506941398454483, + "loss": 0.78814328, + "num_input_tokens_seen": 209473520, + "router_z_loss_mlp": 0.40234375, + "step": 2514, + "time_per_iteration": 4.834583282470703 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049855, + "balance_loss_mlp": 1.00810826, + "epoch": 0.4838399384378607, + "flos": 566047299072.0, + "grad_norm": 0.0392841259920432, + "language_loss": 0.83837014, + "learning_rate": 0.0005503841931138645, + "loss": 0.84886873, + "num_input_tokens_seen": 209544208, + "router_z_loss_mlp": 0.41772461, + "step": 2515, + "time_per_iteration": 2.704660177230835 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049181, + "balance_loss_mlp": 1.00741005, + "epoch": 0.4840323201231243, + "flos": 388542377472.0, + "grad_norm": 0.03590543250931975, + "language_loss": 0.82853907, + "learning_rate": 0.0005500742268214025, + "loss": 0.83903086, + "num_input_tokens_seen": 209607408, + "router_z_loss_mlp": 0.41796875, + "step": 2516, + "time_per_iteration": 2.4684557914733887 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048399, + "balance_loss_mlp": 1.00662851, + "epoch": 0.48422470180838784, + "flos": 632176277760.0, + "grad_norm": 0.031370714323768, + "language_loss": 0.8605336, + "learning_rate": 0.0005497642410884014, + "loss": 0.87101769, + "num_input_tokens_seen": 209683392, + "router_z_loss_mlp": 0.41796875, + "step": 2517, + "time_per_iteration": 2.7523274421691895 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049853, + "balance_loss_mlp": 1.00808239, + "epoch": 0.4844170834936514, + "flos": 500313950208.0, + "grad_norm": 0.02829147010426611, + "language_loss": 0.85602349, + "learning_rate": 0.0005494542360352085, + "loss": 0.86652207, + "num_input_tokens_seen": 209753184, + "router_z_loss_mlp": 0.41796875, + "step": 2518, + "time_per_iteration": 2.635472059249878 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050712, + "balance_loss_mlp": 1.00882208, + "epoch": 0.48460946517891496, + "flos": 552195293952.0, + "grad_norm": 0.029973626664194793, + "language_loss": 0.86134493, + "learning_rate": 0.0005491442117821783, + "loss": 0.87185204, + "num_input_tokens_seen": 209829568, + "router_z_loss_mlp": 0.41918945, + "step": 2519, + "time_per_iteration": 2.686150550842285 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050979, + "balance_loss_mlp": 1.00916088, + "epoch": 0.48480184686417854, + "flos": 530462715648.0, + "grad_norm": 0.03547836116600895, + "language_loss": 0.87863553, + "learning_rate": 0.0005488341684496732, + "loss": 0.88914526, + "num_input_tokens_seen": 209902176, + "router_z_loss_mlp": 0.41845703, + "step": 2520, + "time_per_iteration": 2.6380345821380615 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053374, + "balance_loss_mlp": 1.01155508, + "epoch": 0.4849942285494421, + "flos": 533048421888.0, + "grad_norm": 0.030317982530802673, + "language_loss": 0.92374247, + "learning_rate": 0.0005485241061580624, + "loss": 0.93427622, + "num_input_tokens_seen": 209969168, + "router_z_loss_mlp": 0.41845703, + "step": 2521, + "time_per_iteration": 2.7106375694274902 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048164, + "balance_loss_mlp": 1.00639331, + "epoch": 0.48518661023470566, + "flos": 723973747200.0, + "grad_norm": 0.029300799536016952, + "language_loss": 0.85061228, + "learning_rate": 0.0005482140250277228, + "loss": 0.86109388, + "num_input_tokens_seen": 210049616, + "router_z_loss_mlp": 0.41796875, + "step": 2522, + "time_per_iteration": 2.998014450073242 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050292, + "balance_loss_mlp": 1.00859261, + "epoch": 0.4853789919199692, + "flos": 507156354816.0, + "grad_norm": 0.033835684591452045, + "language_loss": 0.87858051, + "learning_rate": 0.0005479039251790387, + "loss": 0.88908345, + "num_input_tokens_seen": 210118512, + "router_z_loss_mlp": 0.41723633, + "step": 2523, + "time_per_iteration": 2.6554031372070312 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046448, + "balance_loss_mlp": 1.00470078, + "epoch": 0.4855713736052328, + "flos": 661700001024.0, + "grad_norm": 0.033801552668461764, + "language_loss": 0.85375023, + "learning_rate": 0.0005475938067324014, + "loss": 0.86421466, + "num_input_tokens_seen": 210193728, + "router_z_loss_mlp": 0.41772461, + "step": 2524, + "time_per_iteration": 2.8294761180877686 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105012, + "balance_loss_mlp": 1.00839663, + "epoch": 0.48576375529049637, + "flos": 437890504704.0, + "grad_norm": 0.03215141471545655, + "language_loss": 0.84198898, + "learning_rate": 0.0005472836698082098, + "loss": 0.85249019, + "num_input_tokens_seen": 210258832, + "router_z_loss_mlp": 0.41748047, + "step": 2525, + "time_per_iteration": 2.553400754928589 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050288, + "balance_loss_mlp": 1.00858843, + "epoch": 0.4859561369757599, + "flos": 582845647104.0, + "grad_norm": 0.029048493067812663, + "language_loss": 0.84421259, + "learning_rate": 0.0005469735145268694, + "loss": 0.85471547, + "num_input_tokens_seen": 210335280, + "router_z_loss_mlp": 0.41723633, + "step": 2526, + "time_per_iteration": 2.741071939468384 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052939, + "balance_loss_mlp": 1.01121581, + "epoch": 0.4861485186610235, + "flos": 488933923584.0, + "grad_norm": 0.035658567470948505, + "language_loss": 0.81546867, + "learning_rate": 0.0005466633410087933, + "loss": 0.82599807, + "num_input_tokens_seen": 210407072, + "router_z_loss_mlp": 0.41748047, + "step": 2527, + "time_per_iteration": 2.7008073329925537 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01057697, + "balance_loss_mlp": 1.01735687, + "epoch": 0.486340900346287, + "flos": 1561113981696.0, + "grad_norm": 0.006481424575109751, + "language_loss": 0.77260822, + "learning_rate": 0.0005463531493744017, + "loss": 0.78318518, + "num_input_tokens_seen": 210644544, + "router_z_loss_mlp": 0.40332031, + "step": 2528, + "time_per_iteration": 4.889545679092407 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048632, + "balance_loss_mlp": 1.00719464, + "epoch": 0.4865332820315506, + "flos": 483990945024.0, + "grad_norm": 0.029120047594960542, + "language_loss": 0.88662624, + "learning_rate": 0.0005460429397441214, + "loss": 0.89711249, + "num_input_tokens_seen": 210711760, + "router_z_loss_mlp": 0.41455078, + "step": 2529, + "time_per_iteration": 4.04598331451416 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048668, + "balance_loss_mlp": 1.00706387, + "epoch": 0.48672566371681414, + "flos": 536857967616.0, + "grad_norm": 0.030816613356667605, + "language_loss": 0.87420261, + "learning_rate": 0.0005457327122383866, + "loss": 0.88468921, + "num_input_tokens_seen": 210783040, + "router_z_loss_mlp": 0.41625977, + "step": 2530, + "time_per_iteration": 2.613560676574707 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055492, + "balance_loss_mlp": 1.01515198, + "epoch": 0.4869180454020777, + "flos": 1415833195776.0, + "grad_norm": 0.0094125035005948, + "language_loss": 0.74636483, + "learning_rate": 0.0005454224669776385, + "loss": 0.75691986, + "num_input_tokens_seen": 211002128, + "router_z_loss_mlp": 0.40332031, + "step": 2531, + "time_per_iteration": 4.826287269592285 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104985, + "balance_loss_mlp": 1.00831711, + "epoch": 0.48711042708734126, + "flos": 574227270912.0, + "grad_norm": 0.03266780624208146, + "language_loss": 0.76332569, + "learning_rate": 0.0005451122040823244, + "loss": 0.77382421, + "num_input_tokens_seen": 211080080, + "router_z_loss_mlp": 0.41552734, + "step": 2532, + "time_per_iteration": 2.805912494659424 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046061, + "balance_loss_mlp": 1.00438511, + "epoch": 0.48730280877260485, + "flos": 627817512192.0, + "grad_norm": 0.03502227574741412, + "language_loss": 0.77874511, + "learning_rate": 0.0005448019236728997, + "loss": 0.78920573, + "num_input_tokens_seen": 211162944, + "router_z_loss_mlp": 0.41699219, + "step": 2533, + "time_per_iteration": 2.865936040878296 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048209, + "balance_loss_mlp": 1.00670052, + "epoch": 0.48749519045786843, + "flos": 513468981504.0, + "grad_norm": 0.035197852276093636, + "language_loss": 0.85303891, + "learning_rate": 0.0005444916258698255, + "loss": 0.86352104, + "num_input_tokens_seen": 211230448, + "router_z_loss_mlp": 0.4152832, + "step": 2534, + "time_per_iteration": 2.6375105381011963 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045312, + "balance_loss_mlp": 1.00399435, + "epoch": 0.48768757214313196, + "flos": 526479171072.0, + "grad_norm": 0.030578272272676787, + "language_loss": 0.86534977, + "learning_rate": 0.0005441813107935704, + "loss": 0.87580293, + "num_input_tokens_seen": 211301248, + "router_z_loss_mlp": 0.41333008, + "step": 2535, + "time_per_iteration": 2.6708908081054688 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044819, + "balance_loss_mlp": 1.0033108, + "epoch": 0.48787995382839555, + "flos": 506031670272.0, + "grad_norm": 0.03128667529665633, + "language_loss": 0.86385322, + "learning_rate": 0.0005438709785646091, + "loss": 0.87430143, + "num_input_tokens_seen": 211369888, + "router_z_loss_mlp": 0.4152832, + "step": 2536, + "time_per_iteration": 2.587376117706299 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047364, + "balance_loss_mlp": 1.00599802, + "epoch": 0.4880723355136591, + "flos": 576248206080.0, + "grad_norm": 0.031424284702784445, + "language_loss": 0.87241846, + "learning_rate": 0.0005435606293034234, + "loss": 0.88289213, + "num_input_tokens_seen": 211441808, + "router_z_loss_mlp": 0.41381836, + "step": 2537, + "time_per_iteration": 2.6678061485290527 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045759, + "balance_loss_mlp": 1.00425005, + "epoch": 0.48826471719892267, + "flos": 562537152000.0, + "grad_norm": 0.03574143188627203, + "language_loss": 0.85282528, + "learning_rate": 0.0005432502631305016, + "loss": 0.8632828, + "num_input_tokens_seen": 211511216, + "router_z_loss_mlp": 0.4152832, + "step": 2538, + "time_per_iteration": 2.7138583660125732 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104917, + "balance_loss_mlp": 1.00763726, + "epoch": 0.4884570988841862, + "flos": 727549022976.0, + "grad_norm": 0.02708673321136359, + "language_loss": 0.84024864, + "learning_rate": 0.0005429398801663386, + "loss": 0.85074031, + "num_input_tokens_seen": 211589264, + "router_z_loss_mlp": 0.41552734, + "step": 2539, + "time_per_iteration": 2.964188814163208 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049407, + "balance_loss_mlp": 1.00797033, + "epoch": 0.4886494805694498, + "flos": 431924908800.0, + "grad_norm": 0.037537890597472735, + "language_loss": 0.83715379, + "learning_rate": 0.0005426294805314355, + "loss": 0.84764791, + "num_input_tokens_seen": 211652928, + "router_z_loss_mlp": 0.41455078, + "step": 2540, + "time_per_iteration": 2.5386080741882324 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044162, + "balance_loss_mlp": 1.00251019, + "epoch": 0.4888418622547134, + "flos": 674345663232.0, + "grad_norm": 0.02795943805212824, + "language_loss": 0.80757105, + "learning_rate": 0.0005423190643463003, + "loss": 0.81801265, + "num_input_tokens_seen": 211741664, + "router_z_loss_mlp": 0.41674805, + "step": 2541, + "time_per_iteration": 3.0026512145996094 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043307, + "balance_loss_mlp": 1.00182211, + "epoch": 0.4890342439399769, + "flos": 542936324352.0, + "grad_norm": 0.03490297591946719, + "language_loss": 0.83297753, + "learning_rate": 0.0005420086317314473, + "loss": 0.84341061, + "num_input_tokens_seen": 211809136, + "router_z_loss_mlp": 0.41503906, + "step": 2542, + "time_per_iteration": 2.713738441467285 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104604, + "balance_loss_mlp": 1.00457919, + "epoch": 0.4892266256252405, + "flos": 591863543808.0, + "grad_norm": 0.03220316860335889, + "language_loss": 0.81509852, + "learning_rate": 0.0005416981828073971, + "loss": 0.8255589, + "num_input_tokens_seen": 211883136, + "router_z_loss_mlp": 0.41479492, + "step": 2543, + "time_per_iteration": 2.833582639694214 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049698, + "balance_loss_mlp": 1.00983429, + "epoch": 0.48941900731050403, + "flos": 1519657121280.0, + "grad_norm": 0.011925691275285389, + "language_loss": 0.77115011, + "learning_rate": 0.0005413877176946765, + "loss": 0.78164709, + "num_input_tokens_seen": 212117488, + "router_z_loss_mlp": 0.3984375, + "step": 2544, + "time_per_iteration": 4.825795412063599 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044942, + "balance_loss_mlp": 1.00319445, + "epoch": 0.4896113889957676, + "flos": 471519281664.0, + "grad_norm": 0.035595787649594084, + "language_loss": 0.85265428, + "learning_rate": 0.000541077236513819, + "loss": 0.86310375, + "num_input_tokens_seen": 212181952, + "router_z_loss_mlp": 0.41772461, + "step": 2545, + "time_per_iteration": 2.5318596363067627 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046977, + "balance_loss_mlp": 1.00515878, + "epoch": 0.48980377068103115, + "flos": 497552299776.0, + "grad_norm": 0.029954814135253697, + "language_loss": 0.8290776, + "learning_rate": 0.0005407667393853638, + "loss": 0.8395474, + "num_input_tokens_seen": 212252608, + "router_z_loss_mlp": 0.41845703, + "step": 2546, + "time_per_iteration": 2.6808276176452637 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049449, + "balance_loss_mlp": 1.00765431, + "epoch": 0.48999615236629473, + "flos": 694108829184.0, + "grad_norm": 0.033072726692276254, + "language_loss": 0.83875388, + "learning_rate": 0.0005404562264298569, + "loss": 0.84924835, + "num_input_tokens_seen": 212328560, + "router_z_loss_mlp": 0.41821289, + "step": 2547, + "time_per_iteration": 2.8665168285369873 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105088, + "balance_loss_mlp": 1.00894189, + "epoch": 0.49018853405155827, + "flos": 542749686528.0, + "grad_norm": 0.0323259245637504, + "language_loss": 0.84166187, + "learning_rate": 0.0005401456977678498, + "loss": 0.85217071, + "num_input_tokens_seen": 212399616, + "router_z_loss_mlp": 0.41967773, + "step": 2548, + "time_per_iteration": 2.646385431289673 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054184, + "balance_loss_mlp": 1.01248467, + "epoch": 0.49038091573682185, + "flos": 697109607168.0, + "grad_norm": 0.03434023749691101, + "language_loss": 0.7811271, + "learning_rate": 0.0005398351535199008, + "loss": 0.79166895, + "num_input_tokens_seen": 212482352, + "router_z_loss_mlp": 0.41723633, + "step": 2549, + "time_per_iteration": 3.0581490993499756 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01056036, + "balance_loss_mlp": 1.01443195, + "epoch": 0.49057329742208544, + "flos": 598063409664.0, + "grad_norm": 0.032237778563639685, + "language_loss": 0.84733725, + "learning_rate": 0.0005395245938065735, + "loss": 0.85789764, + "num_input_tokens_seen": 212559504, + "router_z_loss_mlp": 0.41625977, + "step": 2550, + "time_per_iteration": 2.7877790927886963 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052801, + "balance_loss_mlp": 1.01105404, + "epoch": 0.490765679107349, + "flos": 514417721856.0, + "grad_norm": 0.03812364840268788, + "language_loss": 0.82968283, + "learning_rate": 0.0005392140187484379, + "loss": 0.84021086, + "num_input_tokens_seen": 212625664, + "router_z_loss_mlp": 0.41772461, + "step": 2551, + "time_per_iteration": 2.59513521194458 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052602, + "balance_loss_mlp": 1.01097441, + "epoch": 0.49095806079261256, + "flos": 630843568128.0, + "grad_norm": 0.028435741934699065, + "language_loss": 0.8977747, + "learning_rate": 0.0005389034284660701, + "loss": 0.90830076, + "num_input_tokens_seen": 212702000, + "router_z_loss_mlp": 0.41650391, + "step": 2552, + "time_per_iteration": 2.8811471462249756 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051565, + "balance_loss_mlp": 1.00979364, + "epoch": 0.4911504424778761, + "flos": 916793640960.0, + "grad_norm": 0.038088038632412044, + "language_loss": 0.82567823, + "learning_rate": 0.000538592823080052, + "loss": 0.83619392, + "num_input_tokens_seen": 212785376, + "router_z_loss_mlp": 0.41796875, + "step": 2553, + "time_per_iteration": 3.147981882095337 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104899, + "balance_loss_mlp": 1.00736189, + "epoch": 0.4913428241631397, + "flos": 439855059456.0, + "grad_norm": 0.03635352086596181, + "language_loss": 0.85271204, + "learning_rate": 0.000538282202710971, + "loss": 0.86320198, + "num_input_tokens_seen": 212848176, + "router_z_loss_mlp": 0.41650391, + "step": 2554, + "time_per_iteration": 2.5295345783233643 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050306, + "balance_loss_mlp": 1.00865471, + "epoch": 0.4915352058484032, + "flos": 637240765440.0, + "grad_norm": 0.03576310950851386, + "language_loss": 0.82746387, + "learning_rate": 0.000537971567479421, + "loss": 0.83796692, + "num_input_tokens_seen": 212917888, + "router_z_loss_mlp": 0.41674805, + "step": 2555, + "time_per_iteration": 2.7715530395507812 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047308, + "balance_loss_mlp": 1.00567997, + "epoch": 0.4917275875336668, + "flos": 505510640640.0, + "grad_norm": 0.03586911519664752, + "language_loss": 0.88338435, + "learning_rate": 0.0005376609175060011, + "loss": 0.89385736, + "num_input_tokens_seen": 212986288, + "router_z_loss_mlp": 0.41650391, + "step": 2556, + "time_per_iteration": 2.6225156784057617 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044015, + "balance_loss_mlp": 1.00252998, + "epoch": 0.49191996921893033, + "flos": 655734405120.0, + "grad_norm": 0.03188042342455107, + "language_loss": 0.80798948, + "learning_rate": 0.0005373502529113162, + "loss": 0.81842965, + "num_input_tokens_seen": 213059504, + "router_z_loss_mlp": 0.41503906, + "step": 2557, + "time_per_iteration": 2.809008836746216 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046392, + "balance_loss_mlp": 1.00485921, + "epoch": 0.4921123509041939, + "flos": 493399613952.0, + "grad_norm": 0.03491285747037794, + "language_loss": 0.8216666, + "learning_rate": 0.0005370395738159773, + "loss": 0.83213049, + "num_input_tokens_seen": 213129984, + "router_z_loss_mlp": 0.41552734, + "step": 2558, + "time_per_iteration": 2.6442172527313232 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047723, + "balance_loss_mlp": 1.00619018, + "epoch": 0.4923047325894575, + "flos": 547208573952.0, + "grad_norm": 0.0376599347248576, + "language_loss": 0.83764005, + "learning_rate": 0.0005367288803406003, + "loss": 0.84811723, + "num_input_tokens_seen": 213199184, + "router_z_loss_mlp": 0.41552734, + "step": 2559, + "time_per_iteration": 2.6496431827545166 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044601, + "balance_loss_mlp": 1.00299704, + "epoch": 0.49249711427472104, + "flos": 597590012160.0, + "grad_norm": 0.034513710641845094, + "language_loss": 0.81748044, + "learning_rate": 0.0005364181726058073, + "loss": 0.8279264, + "num_input_tokens_seen": 213272480, + "router_z_loss_mlp": 0.41625977, + "step": 2560, + "time_per_iteration": 2.677976608276367 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046275, + "balance_loss_mlp": 1.0049566, + "epoch": 0.4926894959599846, + "flos": 498809187072.0, + "grad_norm": 0.0360523922041074, + "language_loss": 0.83156157, + "learning_rate": 0.0005361074507322261, + "loss": 0.84202433, + "num_input_tokens_seen": 213338704, + "router_z_loss_mlp": 0.41333008, + "step": 2561, + "time_per_iteration": 2.5902929306030273 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047121, + "balance_loss_mlp": 1.00575542, + "epoch": 0.49288187764524816, + "flos": 537183611136.0, + "grad_norm": 0.03594243708601782, + "language_loss": 0.81942439, + "learning_rate": 0.000535796714840489, + "loss": 0.82989568, + "num_input_tokens_seen": 213406016, + "router_z_loss_mlp": 0.41381836, + "step": 2562, + "time_per_iteration": 2.6181418895721436 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047955, + "balance_loss_mlp": 1.00658977, + "epoch": 0.49307425933051174, + "flos": 642713521920.0, + "grad_norm": 0.03700989683335547, + "language_loss": 0.84345794, + "learning_rate": 0.0005354859650512348, + "loss": 0.85393751, + "num_input_tokens_seen": 213474016, + "router_z_loss_mlp": 0.41381836, + "step": 2563, + "time_per_iteration": 2.7921204566955566 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048021, + "balance_loss_mlp": 1.00670326, + "epoch": 0.4932666410157753, + "flos": 517265888256.0, + "grad_norm": 0.0348037560143354, + "language_loss": 0.8771596, + "learning_rate": 0.0005351752014851074, + "loss": 0.88763982, + "num_input_tokens_seen": 213539696, + "router_z_loss_mlp": 0.41333008, + "step": 2564, + "time_per_iteration": 2.602555990219116 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048335, + "balance_loss_mlp": 1.00694537, + "epoch": 0.49345902270103886, + "flos": 602652554496.0, + "grad_norm": 0.04115766537624956, + "language_loss": 0.83900678, + "learning_rate": 0.0005348644242627553, + "loss": 0.84949011, + "num_input_tokens_seen": 213609504, + "router_z_loss_mlp": 0.4140625, + "step": 2565, + "time_per_iteration": 2.7332029342651367 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010522, + "balance_loss_mlp": 1.01195526, + "epoch": 0.49365140438630245, + "flos": 1496984550912.0, + "grad_norm": 0.005471138804527184, + "language_loss": 0.75286627, + "learning_rate": 0.0005345536335048336, + "loss": 0.76338828, + "num_input_tokens_seen": 213846064, + "router_z_loss_mlp": 0.40234375, + "step": 2566, + "time_per_iteration": 4.974903583526611 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051209, + "balance_loss_mlp": 1.00991523, + "epoch": 0.493843786071566, + "flos": 630789133056.0, + "grad_norm": 0.031108020693620165, + "language_loss": 0.8259182, + "learning_rate": 0.0005342428293320013, + "loss": 0.83643031, + "num_input_tokens_seen": 213923216, + "router_z_loss_mlp": 0.41308594, + "step": 2567, + "time_per_iteration": 2.774355173110962 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054719, + "balance_loss_mlp": 1.01332963, + "epoch": 0.49403616775682957, + "flos": 618690745344.0, + "grad_norm": 0.04042101882964004, + "language_loss": 0.84698522, + "learning_rate": 0.0005339320118649238, + "loss": 0.85753244, + "num_input_tokens_seen": 213994096, + "router_z_loss_mlp": 0.4140625, + "step": 2568, + "time_per_iteration": 2.7593345642089844 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050518, + "balance_loss_mlp": 1.0091759, + "epoch": 0.4942285494420931, + "flos": 578814470400.0, + "grad_norm": 0.03306097920847627, + "language_loss": 0.87056893, + "learning_rate": 0.000533621181224271, + "loss": 0.88107407, + "num_input_tokens_seen": 214069104, + "router_z_loss_mlp": 0.41357422, + "step": 2569, + "time_per_iteration": 2.815171957015991 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045042, + "balance_loss_mlp": 1.00358069, + "epoch": 0.4944209311273567, + "flos": 631466664960.0, + "grad_norm": 0.04400973771206172, + "language_loss": 0.82116252, + "learning_rate": 0.0005333103375307182, + "loss": 0.83161294, + "num_input_tokens_seen": 214150368, + "router_z_loss_mlp": 0.41479492, + "step": 2570, + "time_per_iteration": 2.86649227142334 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048831, + "balance_loss_mlp": 1.00751352, + "epoch": 0.4946133128126202, + "flos": 588719869440.0, + "grad_norm": 0.030724614795269025, + "language_loss": 0.86645854, + "learning_rate": 0.0005329994809049451, + "loss": 0.87694681, + "num_input_tokens_seen": 214220112, + "router_z_loss_mlp": 0.41333008, + "step": 2571, + "time_per_iteration": 2.717759847640991 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044319, + "balance_loss_mlp": 1.00297725, + "epoch": 0.4948056944978838, + "flos": 584847140352.0, + "grad_norm": 0.02937251460087377, + "language_loss": 0.88108343, + "learning_rate": 0.0005326886114676375, + "loss": 0.89152658, + "num_input_tokens_seen": 214294480, + "router_z_loss_mlp": 0.41357422, + "step": 2572, + "time_per_iteration": 2.767547369003296 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043439, + "balance_loss_mlp": 1.00207376, + "epoch": 0.49499807618314734, + "flos": 482781689856.0, + "grad_norm": 0.032763972727654474, + "language_loss": 0.88217831, + "learning_rate": 0.0005323777293394854, + "loss": 0.8926127, + "num_input_tokens_seen": 214359568, + "router_z_loss_mlp": 0.41381836, + "step": 2573, + "time_per_iteration": 2.557117223739624 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044617, + "balance_loss_mlp": 1.00318027, + "epoch": 0.4951904578684109, + "flos": 520038232320.0, + "grad_norm": 0.044201740478413694, + "language_loss": 0.82535017, + "learning_rate": 0.000532066834641184, + "loss": 0.83579636, + "num_input_tokens_seen": 214432032, + "router_z_loss_mlp": 0.41455078, + "step": 2574, + "time_per_iteration": 2.6565427780151367 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043705, + "balance_loss_mlp": 1.00202954, + "epoch": 0.4953828395536745, + "flos": 536578010880.0, + "grad_norm": 0.03171877270725238, + "language_loss": 0.85277009, + "learning_rate": 0.0005317559274934334, + "loss": 0.8632071, + "num_input_tokens_seen": 214504096, + "router_z_loss_mlp": 0.41699219, + "step": 2575, + "time_per_iteration": 2.720740795135498 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048187, + "balance_loss_mlp": 1.00653565, + "epoch": 0.49557522123893805, + "flos": 529607294208.0, + "grad_norm": 0.03640176927698583, + "language_loss": 0.81348443, + "learning_rate": 0.0005314450080169382, + "loss": 0.82396632, + "num_input_tokens_seen": 214575920, + "router_z_loss_mlp": 0.41674805, + "step": 2576, + "time_per_iteration": 2.6694118976593018 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048974, + "balance_loss_mlp": 1.00729847, + "epoch": 0.49576760292420163, + "flos": 428918294784.0, + "grad_norm": 0.03343170538339807, + "language_loss": 0.81225574, + "learning_rate": 0.0005311340763324083, + "loss": 0.82274544, + "num_input_tokens_seen": 214641664, + "router_z_loss_mlp": 0.41699219, + "step": 2577, + "time_per_iteration": 2.5676074028015137 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050384, + "balance_loss_mlp": 1.00866091, + "epoch": 0.49595998460946517, + "flos": 566316562176.0, + "grad_norm": 0.031028578783915843, + "language_loss": 0.83262658, + "learning_rate": 0.0005308231325605578, + "loss": 0.84313035, + "num_input_tokens_seen": 214711744, + "router_z_loss_mlp": 0.41748047, + "step": 2578, + "time_per_iteration": 2.6750431060791016 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050534, + "balance_loss_mlp": 1.00893033, + "epoch": 0.49615236629472875, + "flos": 703814951424.0, + "grad_norm": 0.16493684193156796, + "language_loss": 0.7742933, + "learning_rate": 0.0005305121768221061, + "loss": 0.78479862, + "num_input_tokens_seen": 214802256, + "router_z_loss_mlp": 0.41625977, + "step": 2579, + "time_per_iteration": 3.083477020263672 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047222, + "balance_loss_mlp": 1.00688171, + "epoch": 0.4963447479799923, + "flos": 1444755209472.0, + "grad_norm": 0.004557610476670616, + "language_loss": 0.75038326, + "learning_rate": 0.000530201209237777, + "loss": 0.76085544, + "num_input_tokens_seen": 215023648, + "router_z_loss_mlp": 0.40332031, + "step": 2580, + "time_per_iteration": 4.820146083831787 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047965, + "balance_loss_mlp": 1.00602686, + "epoch": 0.49653712966525587, + "flos": 538664074752.0, + "grad_norm": 0.031551785699882776, + "language_loss": 0.92325974, + "learning_rate": 0.0005298902299282984, + "loss": 0.93373942, + "num_input_tokens_seen": 215094080, + "router_z_loss_mlp": 0.41967773, + "step": 2581, + "time_per_iteration": 2.619842529296875 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050513, + "balance_loss_mlp": 1.00840831, + "epoch": 0.4967295113505194, + "flos": 608396519424.0, + "grad_norm": 0.03377113658216861, + "language_loss": 0.8488903, + "learning_rate": 0.0005295792390144033, + "loss": 0.8593955, + "num_input_tokens_seen": 215165456, + "router_z_loss_mlp": 0.42138672, + "step": 2582, + "time_per_iteration": 2.722321033477783 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050969, + "balance_loss_mlp": 1.00872111, + "epoch": 0.496921893035783, + "flos": 475531016448.0, + "grad_norm": 0.04081472802053015, + "language_loss": 0.84166956, + "learning_rate": 0.0005292682366168294, + "loss": 0.85217929, + "num_input_tokens_seen": 215229344, + "router_z_loss_mlp": 0.42285156, + "step": 2583, + "time_per_iteration": 2.5314435958862305 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104609, + "balance_loss_mlp": 1.00393724, + "epoch": 0.4971142747210466, + "flos": 598603881216.0, + "grad_norm": 0.03300753756436905, + "language_loss": 0.80573511, + "learning_rate": 0.0005289572228563181, + "loss": 0.81619596, + "num_input_tokens_seen": 215305616, + "router_z_loss_mlp": 0.421875, + "step": 2584, + "time_per_iteration": 2.7332074642181396 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050666, + "balance_loss_mlp": 1.00846612, + "epoch": 0.4973066564063101, + "flos": 600735631872.0, + "grad_norm": 0.03199938195942058, + "language_loss": 0.83498567, + "learning_rate": 0.000528646197853616, + "loss": 0.8454923, + "num_input_tokens_seen": 215378128, + "router_z_loss_mlp": 0.42236328, + "step": 2585, + "time_per_iteration": 2.748955249786377 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051581, + "balance_loss_mlp": 1.00938058, + "epoch": 0.4974990380915737, + "flos": 650770039296.0, + "grad_norm": 0.03327645798274956, + "language_loss": 0.86559486, + "learning_rate": 0.0005283351617294735, + "loss": 0.87611067, + "num_input_tokens_seen": 215453536, + "router_z_loss_mlp": 0.42236328, + "step": 2586, + "time_per_iteration": 2.9175055027008057 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051655, + "balance_loss_mlp": 1.01093292, + "epoch": 0.49769141977683723, + "flos": 1532442767616.0, + "grad_norm": 0.005920405298637117, + "language_loss": 0.7663666, + "learning_rate": 0.0005280241146046456, + "loss": 0.77688324, + "num_input_tokens_seen": 215689440, + "router_z_loss_mlp": 0.40722656, + "step": 2587, + "time_per_iteration": 4.992246627807617 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051641, + "balance_loss_mlp": 1.00936949, + "epoch": 0.4978838014621008, + "flos": 537398439168.0, + "grad_norm": 0.03485872476270145, + "language_loss": 0.87171799, + "learning_rate": 0.0005277130565998916, + "loss": 0.88223433, + "num_input_tokens_seen": 215759600, + "router_z_loss_mlp": 0.4230957, + "step": 2588, + "time_per_iteration": 2.7742838859558105 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048839, + "balance_loss_mlp": 1.00666261, + "epoch": 0.49807618314736435, + "flos": 540746247936.0, + "grad_norm": 0.02719767735149213, + "language_loss": 0.82424593, + "learning_rate": 0.0005274019878359748, + "loss": 0.83473426, + "num_input_tokens_seen": 215833920, + "router_z_loss_mlp": 0.42211914, + "step": 2589, + "time_per_iteration": 2.7111029624938965 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049239, + "balance_loss_mlp": 1.00699103, + "epoch": 0.49826856483262794, + "flos": 543522482688.0, + "grad_norm": 0.03488772819740132, + "language_loss": 0.87582624, + "learning_rate": 0.0005270909084336628, + "loss": 0.88631868, + "num_input_tokens_seen": 215903616, + "router_z_loss_mlp": 0.42285156, + "step": 2590, + "time_per_iteration": 2.6801702976226807 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051383, + "balance_loss_mlp": 1.00911105, + "epoch": 0.4984609465178915, + "flos": 523361741568.0, + "grad_norm": 0.03538182267925601, + "language_loss": 0.89689445, + "learning_rate": 0.0005267798185137276, + "loss": 0.90740824, + "num_input_tokens_seen": 215974832, + "router_z_loss_mlp": 0.4230957, + "step": 2591, + "time_per_iteration": 2.673933506011963 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048091, + "balance_loss_mlp": 1.00577164, + "epoch": 0.49865332820315506, + "flos": 575705789184.0, + "grad_norm": 0.03191547825845594, + "language_loss": 0.90023857, + "learning_rate": 0.0005264687181969444, + "loss": 0.91071951, + "num_input_tokens_seen": 216045024, + "router_z_loss_mlp": 0.42358398, + "step": 2592, + "time_per_iteration": 2.729825735092163 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047144, + "balance_loss_mlp": 1.00484908, + "epoch": 0.49884570988841864, + "flos": 1015211884032.0, + "grad_norm": 0.03571151562514848, + "language_loss": 0.75975507, + "learning_rate": 0.0005261576076040937, + "loss": 0.77022654, + "num_input_tokens_seen": 216129024, + "router_z_loss_mlp": 0.42333984, + "step": 2593, + "time_per_iteration": 3.284675359725952 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047205, + "balance_loss_mlp": 1.00502849, + "epoch": 0.4990380915736822, + "flos": 560648419584.0, + "grad_norm": 0.032935336602121515, + "language_loss": 0.84734505, + "learning_rate": 0.0005258464868559591, + "loss": 0.85781705, + "num_input_tokens_seen": 216197648, + "router_z_loss_mlp": 0.42211914, + "step": 2594, + "time_per_iteration": 2.638974905014038 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049735, + "balance_loss_mlp": 1.00772595, + "epoch": 0.49923047325894576, + "flos": 499944565248.0, + "grad_norm": 0.031535831762229155, + "language_loss": 0.89198703, + "learning_rate": 0.0005255353560733284, + "loss": 0.90248442, + "num_input_tokens_seen": 216263904, + "router_z_loss_mlp": 0.42041016, + "step": 2595, + "time_per_iteration": 2.5665078163146973 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044674, + "balance_loss_mlp": 1.00414276, + "epoch": 0.4994228549442093, + "flos": 1499790921216.0, + "grad_norm": 0.005502914482473529, + "language_loss": 0.75578642, + "learning_rate": 0.0005252242153769931, + "loss": 0.76623321, + "num_input_tokens_seen": 216493152, + "router_z_loss_mlp": 0.40527344, + "step": 2596, + "time_per_iteration": 4.774062395095825 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050249, + "balance_loss_mlp": 1.0082401, + "epoch": 0.4996152366294729, + "flos": 558514723584.0, + "grad_norm": 0.032060383149289634, + "language_loss": 0.83672047, + "learning_rate": 0.0005249130648877492, + "loss": 0.84722298, + "num_input_tokens_seen": 216567216, + "router_z_loss_mlp": 0.42041016, + "step": 2597, + "time_per_iteration": 2.7558000087738037 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051517, + "balance_loss_mlp": 1.00950754, + "epoch": 0.4998076183147364, + "flos": 416483569920.0, + "grad_norm": 0.036130927396763525, + "language_loss": 0.85007888, + "learning_rate": 0.0005246019047263953, + "loss": 0.86059409, + "num_input_tokens_seen": 216630624, + "router_z_loss_mlp": 0.42041016, + "step": 2598, + "time_per_iteration": 2.4761478900909424 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045154, + "balance_loss_mlp": 1.00300181, + "epoch": 0.5, + "flos": 468326029824.0, + "grad_norm": 0.035928472301153966, + "language_loss": 0.83319026, + "learning_rate": 0.0005242907350137353, + "loss": 0.84364176, + "num_input_tokens_seen": 216696576, + "router_z_loss_mlp": 0.421875, + "step": 2599, + "time_per_iteration": 2.551312208175659 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046402, + "balance_loss_mlp": 1.00439322, + "epoch": 0.5001923816852636, + "flos": 483756675072.0, + "grad_norm": 0.03511658446114867, + "language_loss": 0.79463625, + "learning_rate": 0.0005239795558705754, + "loss": 0.80510032, + "num_input_tokens_seen": 216767584, + "router_z_loss_mlp": 0.42041016, + "step": 2600, + "time_per_iteration": 2.6441214084625244 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044749, + "balance_loss_mlp": 1.00278771, + "epoch": 0.5003847633705272, + "flos": 534856474368.0, + "grad_norm": 0.03015144944524051, + "language_loss": 0.89835393, + "learning_rate": 0.0005236683674177264, + "loss": 0.90880144, + "num_input_tokens_seen": 216834320, + "router_z_loss_mlp": 0.41992188, + "step": 2601, + "time_per_iteration": 2.669487953186035 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049517, + "balance_loss_mlp": 1.00746012, + "epoch": 0.5005771450557907, + "flos": 739056394752.0, + "grad_norm": 0.03236196452732128, + "language_loss": 0.82869333, + "learning_rate": 0.0005233571697760021, + "loss": 0.83918852, + "num_input_tokens_seen": 216907312, + "router_z_loss_mlp": 0.42089844, + "step": 2602, + "time_per_iteration": 2.85748028755188 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044698, + "balance_loss_mlp": 1.00264096, + "epoch": 0.5007695267410542, + "flos": 780307175424.0, + "grad_norm": 0.03720253600362933, + "language_loss": 0.83658135, + "learning_rate": 0.0005230459630662203, + "loss": 0.84702832, + "num_input_tokens_seen": 216979872, + "router_z_loss_mlp": 0.42089844, + "step": 2603, + "time_per_iteration": 2.9300596714019775 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045575, + "balance_loss_mlp": 1.00358939, + "epoch": 0.5009619084263178, + "flos": 624619402752.0, + "grad_norm": 0.038089595528021734, + "language_loss": 0.82175541, + "learning_rate": 0.0005227347474092022, + "loss": 0.83221114, + "num_input_tokens_seen": 217054000, + "router_z_loss_mlp": 0.42016602, + "step": 2604, + "time_per_iteration": 2.7056775093078613 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048274, + "balance_loss_mlp": 1.00621724, + "epoch": 0.5011542901115814, + "flos": 532193000448.0, + "grad_norm": 0.026542730624890497, + "language_loss": 0.84019673, + "learning_rate": 0.0005224235229257724, + "loss": 0.85067946, + "num_input_tokens_seen": 217126784, + "router_z_loss_mlp": 0.42089844, + "step": 2605, + "time_per_iteration": 2.6953065395355225 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048861, + "balance_loss_mlp": 1.00680435, + "epoch": 0.5013466717968449, + "flos": 528628418304.0, + "grad_norm": 0.028335807962849974, + "language_loss": 0.87261045, + "learning_rate": 0.0005221122897367589, + "loss": 0.88309902, + "num_input_tokens_seen": 217203056, + "router_z_loss_mlp": 0.42089844, + "step": 2606, + "time_per_iteration": 2.7901618480682373 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051336, + "balance_loss_mlp": 1.00939834, + "epoch": 0.5015390534821085, + "flos": 567089358336.0, + "grad_norm": 0.03672669743645021, + "language_loss": 0.81618142, + "learning_rate": 0.0005218010479629932, + "loss": 0.82669473, + "num_input_tokens_seen": 217273280, + "router_z_loss_mlp": 0.41967773, + "step": 2607, + "time_per_iteration": 2.6298229694366455 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047474, + "balance_loss_mlp": 1.00551248, + "epoch": 0.5017314351673721, + "flos": 567768835584.0, + "grad_norm": 0.038374388481505664, + "language_loss": 0.82467473, + "learning_rate": 0.0005214897977253102, + "loss": 0.83514941, + "num_input_tokens_seen": 217345568, + "router_z_loss_mlp": 0.41992188, + "step": 2608, + "time_per_iteration": 2.6571240425109863 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044229, + "balance_loss_mlp": 1.00231516, + "epoch": 0.5019238168526357, + "flos": 523387986432.0, + "grad_norm": 0.030375370520194293, + "language_loss": 0.84678638, + "learning_rate": 0.0005211785391445473, + "loss": 0.85722864, + "num_input_tokens_seen": 217422848, + "router_z_loss_mlp": 0.41943359, + "step": 2609, + "time_per_iteration": 2.7354485988616943 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045007, + "balance_loss_mlp": 1.00309336, + "epoch": 0.5021161985378992, + "flos": 642637699584.0, + "grad_norm": 0.0345609683707489, + "language_loss": 0.80034763, + "learning_rate": 0.0005208672723415467, + "loss": 0.81079769, + "num_input_tokens_seen": 217502896, + "router_z_loss_mlp": 0.41943359, + "step": 2610, + "time_per_iteration": 2.8003506660461426 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104625, + "balance_loss_mlp": 1.00431252, + "epoch": 0.5023085802231627, + "flos": 592423457280.0, + "grad_norm": 0.034384432252957974, + "language_loss": 0.79919124, + "learning_rate": 0.0005205559974371525, + "loss": 0.8096537, + "num_input_tokens_seen": 217575072, + "router_z_loss_mlp": 0.41967773, + "step": 2611, + "time_per_iteration": 2.801931142807007 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044798, + "balance_loss_mlp": 1.00283635, + "epoch": 0.5025009619084263, + "flos": 473334137088.0, + "grad_norm": 0.0314075616675113, + "language_loss": 0.83085155, + "learning_rate": 0.0005202447145522123, + "loss": 0.84129953, + "num_input_tokens_seen": 217644976, + "router_z_loss_mlp": 0.41992188, + "step": 2612, + "time_per_iteration": 2.7084405422210693 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104663, + "balance_loss_mlp": 1.00476372, + "epoch": 0.5026933435936899, + "flos": 456077942784.0, + "grad_norm": 0.03248187925620893, + "language_loss": 0.79969329, + "learning_rate": 0.0005199334238075769, + "loss": 0.81015956, + "num_input_tokens_seen": 217712816, + "router_z_loss_mlp": 0.41894531, + "step": 2613, + "time_per_iteration": 2.5416245460510254 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045641, + "balance_loss_mlp": 1.00367975, + "epoch": 0.5028857252789535, + "flos": 492722082048.0, + "grad_norm": 0.030734349084793038, + "language_loss": 0.92369366, + "learning_rate": 0.0005196221253241, + "loss": 0.93415004, + "num_input_tokens_seen": 217780256, + "router_z_loss_mlp": 0.41992188, + "step": 2614, + "time_per_iteration": 2.5504183769226074 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045477, + "balance_loss_mlp": 1.00344431, + "epoch": 0.503078106964217, + "flos": 626731711488.0, + "grad_norm": 0.0333228394962432, + "language_loss": 0.83482671, + "learning_rate": 0.0005193108192226383, + "loss": 0.84528148, + "num_input_tokens_seen": 217848496, + "router_z_loss_mlp": 0.4206543, + "step": 2615, + "time_per_iteration": 2.7415342330932617 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046371, + "balance_loss_mlp": 1.00445676, + "epoch": 0.5032704886494805, + "flos": 580138431744.0, + "grad_norm": 0.028161477664975402, + "language_loss": 0.87796414, + "learning_rate": 0.000518999505624052, + "loss": 0.88842779, + "num_input_tokens_seen": 217919216, + "router_z_loss_mlp": 0.41943359, + "step": 2616, + "time_per_iteration": 2.703958749771118 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044834, + "balance_loss_mlp": 1.00289583, + "epoch": 0.5034628703347441, + "flos": 472846155264.0, + "grad_norm": 0.026579731156649716, + "language_loss": 0.83874726, + "learning_rate": 0.000518688184649203, + "loss": 0.84919554, + "num_input_tokens_seen": 217996096, + "router_z_loss_mlp": 0.41967773, + "step": 2617, + "time_per_iteration": 2.7804102897644043 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046877, + "balance_loss_mlp": 1.00501108, + "epoch": 0.5036552520200077, + "flos": 490813907712.0, + "grad_norm": 0.028739225931260208, + "language_loss": 0.84081781, + "learning_rate": 0.0005183768564189577, + "loss": 0.85128659, + "num_input_tokens_seen": 218063072, + "router_z_loss_mlp": 0.41894531, + "step": 2618, + "time_per_iteration": 2.559967517852783 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049251, + "balance_loss_mlp": 1.00724185, + "epoch": 0.5038476337052713, + "flos": 495216414720.0, + "grad_norm": 0.040417435174145346, + "language_loss": 0.82122672, + "learning_rate": 0.0005180655210541838, + "loss": 0.83171928, + "num_input_tokens_seen": 218131056, + "router_z_loss_mlp": 0.42041016, + "step": 2619, + "time_per_iteration": 2.569495677947998 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046465, + "balance_loss_mlp": 1.00471759, + "epoch": 0.5040400153905348, + "flos": 601740752640.0, + "grad_norm": 0.03616333015321602, + "language_loss": 0.83923668, + "learning_rate": 0.0005177541786757527, + "loss": 0.84970129, + "num_input_tokens_seen": 218203536, + "router_z_loss_mlp": 0.41772461, + "step": 2620, + "time_per_iteration": 2.7744040489196777 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048715, + "balance_loss_mlp": 1.0068723, + "epoch": 0.5042323970757984, + "flos": 812920137984.0, + "grad_norm": 0.03309299686066053, + "language_loss": 0.83304209, + "learning_rate": 0.000517442829404538, + "loss": 0.84352922, + "num_input_tokens_seen": 218283008, + "router_z_loss_mlp": 0.41870117, + "step": 2621, + "time_per_iteration": 2.97257137298584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048042, + "balance_loss_mlp": 1.00610471, + "epoch": 0.504424778761062, + "flos": 628607804928.0, + "grad_norm": 0.035914844760130495, + "language_loss": 0.87778026, + "learning_rate": 0.0005171314733614166, + "loss": 0.88826072, + "num_input_tokens_seen": 218362096, + "router_z_loss_mlp": 0.41967773, + "step": 2622, + "time_per_iteration": 2.8732259273529053 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051056, + "balance_loss_mlp": 1.0091418, + "epoch": 0.5046171604463255, + "flos": 516957741312.0, + "grad_norm": 0.03505567711141955, + "language_loss": 0.79205, + "learning_rate": 0.0005168201106672671, + "loss": 0.80256051, + "num_input_tokens_seen": 218439440, + "router_z_loss_mlp": 0.41943359, + "step": 2623, + "time_per_iteration": 2.773688316345215 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047941, + "balance_loss_mlp": 1.00590754, + "epoch": 0.504809542131589, + "flos": 528853939968.0, + "grad_norm": 0.0377301000829576, + "language_loss": 0.8564831, + "learning_rate": 0.0005165087414429717, + "loss": 0.86696255, + "num_input_tokens_seen": 218505936, + "router_z_loss_mlp": 0.4206543, + "step": 2624, + "time_per_iteration": 2.6755454540252686 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051892, + "balance_loss_mlp": 1.0100261, + "epoch": 0.5050019238168526, + "flos": 555175663104.0, + "grad_norm": 0.03350143092818485, + "language_loss": 0.83751678, + "learning_rate": 0.0005161973658094144, + "loss": 0.84803575, + "num_input_tokens_seen": 218573824, + "router_z_loss_mlp": 0.41894531, + "step": 2625, + "time_per_iteration": 2.6260385513305664 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105232, + "balance_loss_mlp": 1.01057339, + "epoch": 0.5051943055021162, + "flos": 575929365504.0, + "grad_norm": 0.030667351452066165, + "language_loss": 0.83093894, + "learning_rate": 0.000515885983887482, + "loss": 0.84146214, + "num_input_tokens_seen": 218648016, + "router_z_loss_mlp": 0.41772461, + "step": 2626, + "time_per_iteration": 2.7437500953674316 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104859, + "balance_loss_mlp": 1.00686646, + "epoch": 0.5053866871873798, + "flos": 497682557184.0, + "grad_norm": 0.033924054159163435, + "language_loss": 0.84715843, + "learning_rate": 0.0005155745957980636, + "loss": 0.85764432, + "num_input_tokens_seen": 218714128, + "router_z_loss_mlp": 0.41748047, + "step": 2627, + "time_per_iteration": 2.625260353088379 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048009, + "balance_loss_mlp": 1.00638068, + "epoch": 0.5055790688726434, + "flos": 503220442368.0, + "grad_norm": 0.03037314022037546, + "language_loss": 0.89067703, + "learning_rate": 0.000515263201662051, + "loss": 0.90115714, + "num_input_tokens_seen": 218784800, + "router_z_loss_mlp": 0.41650391, + "step": 2628, + "time_per_iteration": 2.68068265914917 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047259, + "balance_loss_mlp": 1.00565541, + "epoch": 0.5057714505579068, + "flos": 846768600576.0, + "grad_norm": 0.031311962044338205, + "language_loss": 0.83074951, + "learning_rate": 0.0005149518016003378, + "loss": 0.84122205, + "num_input_tokens_seen": 218868256, + "router_z_loss_mlp": 0.41625977, + "step": 2629, + "time_per_iteration": 3.208085060119629 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048667, + "balance_loss_mlp": 1.00720644, + "epoch": 0.5059638322431704, + "flos": 498809187072.0, + "grad_norm": 0.03517894489413756, + "language_loss": 0.82677329, + "learning_rate": 0.0005146403957338206, + "loss": 0.83725995, + "num_input_tokens_seen": 218932496, + "router_z_loss_mlp": 0.41479492, + "step": 2630, + "time_per_iteration": 2.5591788291931152 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045923, + "balance_loss_mlp": 1.0044378, + "epoch": 0.506156213928434, + "flos": 619114565376.0, + "grad_norm": 0.029747387185900163, + "language_loss": 0.82375658, + "learning_rate": 0.0005143289841833975, + "loss": 0.83421576, + "num_input_tokens_seen": 219010672, + "router_z_loss_mlp": 0.41503906, + "step": 2631, + "time_per_iteration": 2.8919997215270996 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045386, + "balance_loss_mlp": 1.00394928, + "epoch": 0.5063485956136976, + "flos": 425790171648.0, + "grad_norm": 0.040524041139339724, + "language_loss": 0.82811654, + "learning_rate": 0.0005140175670699696, + "loss": 0.83857036, + "num_input_tokens_seen": 219077104, + "router_z_loss_mlp": 0.41455078, + "step": 2632, + "time_per_iteration": 2.6062378883361816 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045222, + "balance_loss_mlp": 1.0038327, + "epoch": 0.5065409772989612, + "flos": 571070957568.0, + "grad_norm": 0.026263595366118216, + "language_loss": 0.83201623, + "learning_rate": 0.0005137061445144395, + "loss": 0.84246838, + "num_input_tokens_seen": 219164880, + "router_z_loss_mlp": 0.4140625, + "step": 2633, + "time_per_iteration": 2.9138190746307373 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044282, + "balance_loss_mlp": 1.00282133, + "epoch": 0.5067333589842247, + "flos": 629970650112.0, + "grad_norm": 0.032671607566671305, + "language_loss": 0.87714005, + "learning_rate": 0.000513394716637712, + "loss": 0.8875829, + "num_input_tokens_seen": 219237376, + "router_z_loss_mlp": 0.41479492, + "step": 2634, + "time_per_iteration": 2.7618257999420166 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044567, + "balance_loss_mlp": 1.00422668, + "epoch": 0.5069257406694883, + "flos": 1451098938624.0, + "grad_norm": 0.004578936312393245, + "language_loss": 0.79191709, + "learning_rate": 0.0005130832835606946, + "loss": 0.8023628, + "num_input_tokens_seen": 219467632, + "router_z_loss_mlp": 0.40332031, + "step": 2635, + "time_per_iteration": 4.85358738899231 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050199, + "balance_loss_mlp": 1.00869, + "epoch": 0.5071181223547518, + "flos": 640058796288.0, + "grad_norm": 0.03342633817994969, + "language_loss": 0.81428993, + "learning_rate": 0.0005127718454042958, + "loss": 0.82479185, + "num_input_tokens_seen": 219545392, + "router_z_loss_mlp": 0.4152832, + "step": 2636, + "time_per_iteration": 2.8021318912506104 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044553, + "balance_loss_mlp": 1.00304461, + "epoch": 0.5073105040400154, + "flos": 714873225216.0, + "grad_norm": 0.031182962990379204, + "language_loss": 0.85094464, + "learning_rate": 0.0005124604022894269, + "loss": 0.86139023, + "num_input_tokens_seen": 219623104, + "router_z_loss_mlp": 0.4152832, + "step": 2637, + "time_per_iteration": 2.934414863586426 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047203, + "balance_loss_mlp": 1.00676727, + "epoch": 0.5075028857252789, + "flos": 1439614899456.0, + "grad_norm": 0.007557162842452459, + "language_loss": 0.77188224, + "learning_rate": 0.000512148954337001, + "loss": 0.7823543, + "num_input_tokens_seen": 219853328, + "router_z_loss_mlp": 0.40429688, + "step": 2638, + "time_per_iteration": 4.820345878601074 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104601, + "balance_loss_mlp": 1.00435817, + "epoch": 0.5076952674105425, + "flos": 572308402944.0, + "grad_norm": 0.03427455588588844, + "language_loss": 0.83839953, + "learning_rate": 0.0005118375016679325, + "loss": 0.84885961, + "num_input_tokens_seen": 219925024, + "router_z_loss_mlp": 0.41674805, + "step": 2639, + "time_per_iteration": 2.753891706466675 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104483, + "balance_loss_mlp": 1.00327373, + "epoch": 0.5078876490958061, + "flos": 517713040896.0, + "grad_norm": 0.0397313189962262, + "language_loss": 0.81205344, + "learning_rate": 0.0005115260444031382, + "loss": 0.82250178, + "num_input_tokens_seen": 219992752, + "router_z_loss_mlp": 0.41577148, + "step": 2640, + "time_per_iteration": 2.5884034633636475 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104464, + "balance_loss_mlp": 1.0042038, + "epoch": 0.5080800307810697, + "flos": 1587622342656.0, + "grad_norm": 0.00452780467183982, + "language_loss": 0.78731823, + "learning_rate": 0.000511214582663537, + "loss": 0.79776466, + "num_input_tokens_seen": 220224160, + "router_z_loss_mlp": 0.40429688, + "step": 2641, + "time_per_iteration": 5.021141290664673 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048765, + "balance_loss_mlp": 1.0071131, + "epoch": 0.5082724124663333, + "flos": 486187824384.0, + "grad_norm": 0.03665123216497768, + "language_loss": 0.87927556, + "learning_rate": 0.0005109031165700483, + "loss": 0.88976324, + "num_input_tokens_seen": 220289504, + "router_z_loss_mlp": 0.41674805, + "step": 2642, + "time_per_iteration": 2.564768075942993 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044813, + "balance_loss_mlp": 1.00313723, + "epoch": 0.5084647941515967, + "flos": 683443272960.0, + "grad_norm": 0.03222315683418769, + "language_loss": 0.84105259, + "learning_rate": 0.0005105916462435945, + "loss": 0.85150075, + "num_input_tokens_seen": 220361376, + "router_z_loss_mlp": 0.41699219, + "step": 2643, + "time_per_iteration": 2.8432576656341553 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046727, + "balance_loss_mlp": 1.0049082, + "epoch": 0.5086571758368603, + "flos": 549813722112.0, + "grad_norm": 0.031341979306324576, + "language_loss": 0.85911554, + "learning_rate": 0.0005102801718050989, + "loss": 0.86958289, + "num_input_tokens_seen": 220434720, + "router_z_loss_mlp": 0.41845703, + "step": 2644, + "time_per_iteration": 2.7012667655944824 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104828, + "balance_loss_mlp": 1.00658011, + "epoch": 0.5088495575221239, + "flos": 565079116800.0, + "grad_norm": 0.03553781912080262, + "language_loss": 0.89604807, + "learning_rate": 0.0005099686933754867, + "loss": 0.90653086, + "num_input_tokens_seen": 220506208, + "router_z_loss_mlp": 0.41723633, + "step": 2645, + "time_per_iteration": 2.774092197418213 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047355, + "balance_loss_mlp": 1.00551212, + "epoch": 0.5090419392073875, + "flos": 552512189184.0, + "grad_norm": 0.03374447512064937, + "language_loss": 0.84807706, + "learning_rate": 0.0005096572110756845, + "loss": 0.85855055, + "num_input_tokens_seen": 220577456, + "router_z_loss_mlp": 0.41870117, + "step": 2646, + "time_per_iteration": 2.691534996032715 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050517, + "balance_loss_mlp": 1.00857961, + "epoch": 0.509234320892651, + "flos": 568884771840.0, + "grad_norm": 0.0280586539552875, + "language_loss": 0.86222303, + "learning_rate": 0.0005093457250266205, + "loss": 0.87272823, + "num_input_tokens_seen": 220649648, + "router_z_loss_mlp": 0.41967773, + "step": 2647, + "time_per_iteration": 2.669032573699951 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049346, + "balance_loss_mlp": 1.00750375, + "epoch": 0.5094267025779146, + "flos": 583694265600.0, + "grad_norm": 0.03456739808544309, + "language_loss": 0.83707237, + "learning_rate": 0.000509034235349224, + "loss": 0.84756589, + "num_input_tokens_seen": 220721168, + "router_z_loss_mlp": 0.41870117, + "step": 2648, + "time_per_iteration": 2.7174429893493652 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048368, + "balance_loss_mlp": 1.00657344, + "epoch": 0.5096190842631781, + "flos": 593139873024.0, + "grad_norm": 0.03190176036185227, + "language_loss": 0.81830442, + "learning_rate": 0.0005087227421644266, + "loss": 0.82878816, + "num_input_tokens_seen": 220796464, + "router_z_loss_mlp": 0.41821289, + "step": 2649, + "time_per_iteration": 2.730527877807617 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104451, + "balance_loss_mlp": 1.00278723, + "epoch": 0.5098114659484417, + "flos": 514584917760.0, + "grad_norm": 0.03166339002539628, + "language_loss": 0.86503744, + "learning_rate": 0.0005084112455931602, + "loss": 0.87548256, + "num_input_tokens_seen": 220862976, + "router_z_loss_mlp": 0.41748047, + "step": 2650, + "time_per_iteration": 2.588543176651001 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048046, + "balance_loss_mlp": 1.00627494, + "epoch": 0.5100038476337053, + "flos": 485601666048.0, + "grad_norm": 0.03514605484852806, + "language_loss": 0.85810292, + "learning_rate": 0.0005080997457563586, + "loss": 0.86858344, + "num_input_tokens_seen": 220926432, + "router_z_loss_mlp": 0.41796875, + "step": 2651, + "time_per_iteration": 2.547510862350464 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053822, + "balance_loss_mlp": 1.01214612, + "epoch": 0.5101962293189688, + "flos": 462555820032.0, + "grad_norm": 0.03981395249249623, + "language_loss": 0.79794431, + "learning_rate": 0.0005077882427749569, + "loss": 0.80848241, + "num_input_tokens_seen": 220993008, + "router_z_loss_mlp": 0.41699219, + "step": 2652, + "time_per_iteration": 2.5867154598236084 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052855, + "balance_loss_mlp": 1.0111798, + "epoch": 0.5103886110042324, + "flos": 588133711104.0, + "grad_norm": 0.03576387090025985, + "language_loss": 0.8527801, + "learning_rate": 0.0005074767367698913, + "loss": 0.86330867, + "num_input_tokens_seen": 221059248, + "router_z_loss_mlp": 0.41699219, + "step": 2653, + "time_per_iteration": 2.668619155883789 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052633, + "balance_loss_mlp": 1.01083803, + "epoch": 0.510580992689496, + "flos": 846679172352.0, + "grad_norm": 0.03324234024932545, + "language_loss": 0.84336531, + "learning_rate": 0.0005071652278620988, + "loss": 0.85389161, + "num_input_tokens_seen": 221133712, + "router_z_loss_mlp": 0.41821289, + "step": 2654, + "time_per_iteration": 3.0502736568450928 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052226, + "balance_loss_mlp": 1.01043141, + "epoch": 0.5107733743747596, + "flos": 659811268608.0, + "grad_norm": 0.033221976859431776, + "language_loss": 0.83371234, + "learning_rate": 0.0005068537161725186, + "loss": 0.84423465, + "num_input_tokens_seen": 221202192, + "router_z_loss_mlp": 0.41821289, + "step": 2655, + "time_per_iteration": 2.7732832431793213 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049493, + "balance_loss_mlp": 1.00784123, + "epoch": 0.510965756060023, + "flos": 702961475328.0, + "grad_norm": 0.03652104464060243, + "language_loss": 0.84970605, + "learning_rate": 0.0005065422018220893, + "loss": 0.860201, + "num_input_tokens_seen": 221277104, + "router_z_loss_mlp": 0.41674805, + "step": 2656, + "time_per_iteration": 2.8670201301574707 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045888, + "balance_loss_mlp": 1.00430822, + "epoch": 0.5111581377452866, + "flos": 560941982208.0, + "grad_norm": 0.03459233510222537, + "language_loss": 0.80690587, + "learning_rate": 0.0005062306849317521, + "loss": 0.81736469, + "num_input_tokens_seen": 221352320, + "router_z_loss_mlp": 0.41601562, + "step": 2657, + "time_per_iteration": 2.8002302646636963 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043485, + "balance_loss_mlp": 1.00202358, + "epoch": 0.5113505194305502, + "flos": 610146246144.0, + "grad_norm": 0.03554743150534212, + "language_loss": 0.83936596, + "learning_rate": 0.0005059191656224487, + "loss": 0.84980083, + "num_input_tokens_seen": 221421056, + "router_z_loss_mlp": 0.41479492, + "step": 2658, + "time_per_iteration": 2.716935157775879 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045233, + "balance_loss_mlp": 1.0037955, + "epoch": 0.5115429011158138, + "flos": 535535951616.0, + "grad_norm": 0.03199868953010379, + "language_loss": 0.89635181, + "learning_rate": 0.0005056076440151212, + "loss": 0.90680414, + "num_input_tokens_seen": 221492064, + "router_z_loss_mlp": 0.41455078, + "step": 2659, + "time_per_iteration": 2.6661012172698975 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042381, + "balance_loss_mlp": 1.0019455, + "epoch": 0.5117352828010774, + "flos": 1365275813376.0, + "grad_norm": 0.005851878799964376, + "language_loss": 0.76288116, + "learning_rate": 0.0005052961202307133, + "loss": 0.773305, + "num_input_tokens_seen": 221724672, + "router_z_loss_mlp": 0.40429688, + "step": 2660, + "time_per_iteration": 4.8821775913238525 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047021, + "balance_loss_mlp": 1.00551248, + "epoch": 0.5119276644863409, + "flos": 634931125248.0, + "grad_norm": 0.030472593638878876, + "language_loss": 0.87624103, + "learning_rate": 0.0005049845943901691, + "loss": 0.88671124, + "num_input_tokens_seen": 221800144, + "router_z_loss_mlp": 0.4152832, + "step": 2661, + "time_per_iteration": 2.868314743041992 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045807, + "balance_loss_mlp": 1.00434649, + "epoch": 0.5121200461716044, + "flos": 586781559552.0, + "grad_norm": 0.035240788892260635, + "language_loss": 0.87104362, + "learning_rate": 0.0005046730666144338, + "loss": 0.88150167, + "num_input_tokens_seen": 221877168, + "router_z_loss_mlp": 0.41479492, + "step": 2662, + "time_per_iteration": 2.7716057300567627 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044648, + "balance_loss_mlp": 1.00323498, + "epoch": 0.512312427856868, + "flos": 1034224608000.0, + "grad_norm": 0.027938837780362106, + "language_loss": 0.8826527, + "learning_rate": 0.0005043615370244532, + "loss": 0.89309919, + "num_input_tokens_seen": 221964208, + "router_z_loss_mlp": 0.41430664, + "step": 2663, + "time_per_iteration": 3.4280622005462646 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046261, + "balance_loss_mlp": 1.00611115, + "epoch": 0.5125048095421316, + "flos": 1540901729280.0, + "grad_norm": 0.006786755652655265, + "language_loss": 0.78244388, + "learning_rate": 0.0005040500057411736, + "loss": 0.7929064, + "num_input_tokens_seen": 222179264, + "router_z_loss_mlp": 0.40136719, + "step": 2664, + "time_per_iteration": 4.68994140625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045195, + "balance_loss_mlp": 1.003901, + "epoch": 0.5126971912273951, + "flos": 592328193024.0, + "grad_norm": 0.02608573212926663, + "language_loss": 0.86075294, + "learning_rate": 0.0005037384728855425, + "loss": 0.87120485, + "num_input_tokens_seen": 222259504, + "router_z_loss_mlp": 0.41308594, + "step": 2665, + "time_per_iteration": 2.7917027473449707 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046893, + "balance_loss_mlp": 1.00552762, + "epoch": 0.5128895729126587, + "flos": 552718268928.0, + "grad_norm": 0.03821611985083245, + "language_loss": 0.85252321, + "learning_rate": 0.0005034269385785075, + "loss": 0.86299217, + "num_input_tokens_seen": 222330512, + "router_z_loss_mlp": 0.41381836, + "step": 2666, + "time_per_iteration": 2.63472318649292 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047467, + "balance_loss_mlp": 1.00605392, + "epoch": 0.5130819545979223, + "flos": 482232470016.0, + "grad_norm": 0.03834683208397515, + "language_loss": 0.85133517, + "learning_rate": 0.0005031154029410168, + "loss": 0.86180985, + "num_input_tokens_seen": 222394000, + "router_z_loss_mlp": 0.41430664, + "step": 2667, + "time_per_iteration": 2.517110824584961 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049696, + "balance_loss_mlp": 1.00837803, + "epoch": 0.5132743362831859, + "flos": 476768461824.0, + "grad_norm": 0.033096203996997774, + "language_loss": 0.87656248, + "learning_rate": 0.0005028038660940197, + "loss": 0.88705945, + "num_input_tokens_seen": 222459344, + "router_z_loss_mlp": 0.41333008, + "step": 2668, + "time_per_iteration": 2.5096347332000732 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105007, + "balance_loss_mlp": 1.00870478, + "epoch": 0.5134667179684494, + "flos": 504903095040.0, + "grad_norm": 0.028882778070319505, + "language_loss": 0.84998578, + "learning_rate": 0.0005024923281584648, + "loss": 0.86048645, + "num_input_tokens_seen": 222528912, + "router_z_loss_mlp": 0.41381836, + "step": 2669, + "time_per_iteration": 2.6474804878234863 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048807, + "balance_loss_mlp": 1.0076561, + "epoch": 0.5136590996537129, + "flos": 505005162240.0, + "grad_norm": 0.03165719334287126, + "language_loss": 0.8319236, + "learning_rate": 0.0005021807892553026, + "loss": 0.84241164, + "num_input_tokens_seen": 222604704, + "router_z_loss_mlp": 0.41162109, + "step": 2670, + "time_per_iteration": 2.7183725833892822 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044063, + "balance_loss_mlp": 1.00269723, + "epoch": 0.5138514813389765, + "flos": 625800467712.0, + "grad_norm": 0.030310171756311025, + "language_loss": 0.85420138, + "learning_rate": 0.0005018692495054828, + "loss": 0.86464202, + "num_input_tokens_seen": 222677888, + "router_z_loss_mlp": 0.41381836, + "step": 2671, + "time_per_iteration": 2.772813081741333 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043444, + "balance_loss_mlp": 1.00224543, + "epoch": 0.5140438630242401, + "flos": 584634257664.0, + "grad_norm": 0.030896406933945995, + "language_loss": 0.80988181, + "learning_rate": 0.0005015577090299561, + "loss": 0.82031626, + "num_input_tokens_seen": 222751936, + "router_z_loss_mlp": 0.41210938, + "step": 2672, + "time_per_iteration": 2.6667463779449463 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049805, + "balance_loss_mlp": 1.00858212, + "epoch": 0.5142362447095037, + "flos": 488905733376.0, + "grad_norm": 0.032429697018958814, + "language_loss": 0.87124586, + "learning_rate": 0.0005012461679496729, + "loss": 0.88174391, + "num_input_tokens_seen": 222819616, + "router_z_loss_mlp": 0.41235352, + "step": 2673, + "time_per_iteration": 2.6442089080810547 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104556, + "balance_loss_mlp": 1.00431406, + "epoch": 0.5144286263947672, + "flos": 527885757696.0, + "grad_norm": 0.03122591363863073, + "language_loss": 0.88052714, + "learning_rate": 0.0005009346263855848, + "loss": 0.89098281, + "num_input_tokens_seen": 222888448, + "router_z_loss_mlp": 0.41259766, + "step": 2674, + "time_per_iteration": 2.602527379989624 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048679, + "balance_loss_mlp": 1.00736094, + "epoch": 0.5146210080800308, + "flos": 487590520320.0, + "grad_norm": 0.029060606816111258, + "language_loss": 0.84209937, + "learning_rate": 0.0005006230844586422, + "loss": 0.85258621, + "num_input_tokens_seen": 222964736, + "router_z_loss_mlp": 0.41333008, + "step": 2675, + "time_per_iteration": 2.8685102462768555 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043387, + "balance_loss_mlp": 1.00216484, + "epoch": 0.5148133897652943, + "flos": 516975237888.0, + "grad_norm": 0.028587045609365692, + "language_loss": 0.79492688, + "learning_rate": 0.0005003115422897968, + "loss": 0.80536079, + "num_input_tokens_seen": 223040944, + "router_z_loss_mlp": 0.41235352, + "step": 2676, + "time_per_iteration": 2.765714168548584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01041586, + "balance_loss_mlp": 1.00024414, + "epoch": 0.5150057714505579, + "flos": 512212094208.0, + "grad_norm": 0.033131913333961045, + "language_loss": 0.87827182, + "learning_rate": 0.0005, + "loss": 0.88868773, + "num_input_tokens_seen": 223109632, + "router_z_loss_mlp": 0.41357422, + "step": 2677, + "time_per_iteration": 2.705502986907959 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047696, + "balance_loss_mlp": 1.00623488, + "epoch": 0.5151981531358215, + "flos": 912391133952.0, + "grad_norm": 0.03328612222334398, + "language_loss": 0.79844034, + "learning_rate": 0.0004996884577102033, + "loss": 0.80891728, + "num_input_tokens_seen": 223191648, + "router_z_loss_mlp": 0.41479492, + "step": 2678, + "time_per_iteration": 3.112602949142456 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049572, + "balance_loss_mlp": 1.00801528, + "epoch": 0.515390534821085, + "flos": 472930725888.0, + "grad_norm": 0.03414850275815592, + "language_loss": 0.85192269, + "learning_rate": 0.000499376915541358, + "loss": 0.86241841, + "num_input_tokens_seen": 223265920, + "router_z_loss_mlp": 0.41577148, + "step": 2679, + "time_per_iteration": 2.732088565826416 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046331, + "balance_loss_mlp": 1.00475073, + "epoch": 0.5155829165063486, + "flos": 651358142976.0, + "grad_norm": 0.0316115868451719, + "language_loss": 0.81490767, + "learning_rate": 0.0004990653736144155, + "loss": 0.82537097, + "num_input_tokens_seen": 223340688, + "router_z_loss_mlp": 0.41601562, + "step": 2680, + "time_per_iteration": 2.9006052017211914 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104584, + "balance_loss_mlp": 1.00425994, + "epoch": 0.5157752981916122, + "flos": 415161553920.0, + "grad_norm": 0.034873868180568895, + "language_loss": 0.86566359, + "learning_rate": 0.0004987538320503271, + "loss": 0.876122, + "num_input_tokens_seen": 223404064, + "router_z_loss_mlp": 0.41601562, + "step": 2681, + "time_per_iteration": 2.5385584831237793 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049108, + "balance_loss_mlp": 1.00750434, + "epoch": 0.5159676798768758, + "flos": 554932644864.0, + "grad_norm": 0.03448939758068617, + "language_loss": 0.83127022, + "learning_rate": 0.0004984422909700442, + "loss": 0.84176129, + "num_input_tokens_seen": 223476784, + "router_z_loss_mlp": 0.41625977, + "step": 2682, + "time_per_iteration": 2.7167794704437256 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105107, + "balance_loss_mlp": 1.00944197, + "epoch": 0.5161600615621393, + "flos": 587621429760.0, + "grad_norm": 0.033752660754493145, + "language_loss": 0.84206975, + "learning_rate": 0.0004981307504945173, + "loss": 0.85258043, + "num_input_tokens_seen": 223542832, + "router_z_loss_mlp": 0.41650391, + "step": 2683, + "time_per_iteration": 2.6896650791168213 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050267, + "balance_loss_mlp": 1.00856805, + "epoch": 0.5163524432474028, + "flos": 589948566528.0, + "grad_norm": 0.03498305011402451, + "language_loss": 0.90086776, + "learning_rate": 0.0004978192107446976, + "loss": 0.9113704, + "num_input_tokens_seen": 223617968, + "router_z_loss_mlp": 0.41723633, + "step": 2684, + "time_per_iteration": 2.7550315856933594 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046286, + "balance_loss_mlp": 1.00456297, + "epoch": 0.5165448249326664, + "flos": 504905040384.0, + "grad_norm": 0.03233825392148911, + "language_loss": 0.87956327, + "learning_rate": 0.0004975076718415353, + "loss": 0.89002615, + "num_input_tokens_seen": 223689504, + "router_z_loss_mlp": 0.41748047, + "step": 2685, + "time_per_iteration": 2.5969831943511963 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046859, + "balance_loss_mlp": 1.00515938, + "epoch": 0.51673720661793, + "flos": 417647138304.0, + "grad_norm": 0.0327603501643271, + "language_loss": 0.91275072, + "learning_rate": 0.0004971961339059806, + "loss": 0.9232192, + "num_input_tokens_seen": 223752288, + "router_z_loss_mlp": 0.41723633, + "step": 2686, + "time_per_iteration": 2.488780975341797 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048798, + "balance_loss_mlp": 1.00714636, + "epoch": 0.5169295883031936, + "flos": 600075596544.0, + "grad_norm": 0.03249247039046824, + "language_loss": 0.84663117, + "learning_rate": 0.0004968845970589832, + "loss": 0.8571192, + "num_input_tokens_seen": 223822304, + "router_z_loss_mlp": 0.41674805, + "step": 2687, + "time_per_iteration": 2.7266340255737305 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047672, + "balance_loss_mlp": 1.00597274, + "epoch": 0.517121969988457, + "flos": 557911068672.0, + "grad_norm": 0.03510688251477249, + "language_loss": 0.85442108, + "learning_rate": 0.0004965730614214926, + "loss": 0.86489779, + "num_input_tokens_seen": 223888592, + "router_z_loss_mlp": 0.41723633, + "step": 2688, + "time_per_iteration": 2.669203758239746 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048775, + "balance_loss_mlp": 1.00721848, + "epoch": 0.5173143516737206, + "flos": 470375155200.0, + "grad_norm": 0.031768698442390816, + "language_loss": 0.85484231, + "learning_rate": 0.0004962615271144576, + "loss": 0.86533004, + "num_input_tokens_seen": 223952880, + "router_z_loss_mlp": 0.41577148, + "step": 2689, + "time_per_iteration": 2.508864164352417 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047195, + "balance_loss_mlp": 1.00578225, + "epoch": 0.5175067333589842, + "flos": 721379292672.0, + "grad_norm": 0.036604011276375, + "language_loss": 0.83442801, + "learning_rate": 0.0004959499942588264, + "loss": 0.84490001, + "num_input_tokens_seen": 224030000, + "router_z_loss_mlp": 0.41430664, + "step": 2690, + "time_per_iteration": 2.937147617340088 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054985, + "balance_loss_mlp": 1.01473999, + "epoch": 0.5176991150442478, + "flos": 1469344702464.0, + "grad_norm": 0.008104040921495323, + "language_loss": 0.78200024, + "learning_rate": 0.0004956384629755469, + "loss": 0.79255009, + "num_input_tokens_seen": 224252384, + "router_z_loss_mlp": 0.40234375, + "step": 2691, + "time_per_iteration": 4.793481111526489 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047518, + "balance_loss_mlp": 1.00593746, + "epoch": 0.5178914967295114, + "flos": 613784705280.0, + "grad_norm": 0.029651978346564224, + "language_loss": 0.85819978, + "learning_rate": 0.0004953269333855661, + "loss": 0.86867493, + "num_input_tokens_seen": 224324640, + "router_z_loss_mlp": 0.41601562, + "step": 2692, + "time_per_iteration": 2.7456183433532715 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054253, + "balance_loss_mlp": 1.01293516, + "epoch": 0.5180838784147749, + "flos": 501981051648.0, + "grad_norm": 0.03275547277888071, + "language_loss": 0.85017627, + "learning_rate": 0.0004950154056098309, + "loss": 0.86071873, + "num_input_tokens_seen": 224398368, + "router_z_loss_mlp": 0.41333008, + "step": 2693, + "time_per_iteration": 2.710204839706421 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052407, + "balance_loss_mlp": 1.01108897, + "epoch": 0.5182762601000385, + "flos": 690042659328.0, + "grad_norm": 0.03430000909694698, + "language_loss": 0.84476924, + "learning_rate": 0.0004947038797692867, + "loss": 0.85529327, + "num_input_tokens_seen": 224465456, + "router_z_loss_mlp": 0.41333008, + "step": 2694, + "time_per_iteration": 2.846104860305786 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053132, + "balance_loss_mlp": 1.01169479, + "epoch": 0.518468641785302, + "flos": 666801427200.0, + "grad_norm": 0.031372779584062496, + "language_loss": 0.77936417, + "learning_rate": 0.0004943923559848789, + "loss": 0.78989553, + "num_input_tokens_seen": 224540960, + "router_z_loss_mlp": 0.41455078, + "step": 2695, + "time_per_iteration": 2.780346155166626 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054261, + "balance_loss_mlp": 1.01303816, + "epoch": 0.5186610234705656, + "flos": 567814522368.0, + "grad_norm": 0.025403978054072948, + "language_loss": 0.9097802, + "learning_rate": 0.0004940808343775515, + "loss": 0.92032284, + "num_input_tokens_seen": 224613200, + "router_z_loss_mlp": 0.41235352, + "step": 2696, + "time_per_iteration": 2.6940221786499023 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052298, + "balance_loss_mlp": 1.01093256, + "epoch": 0.5188534051558291, + "flos": 429793158144.0, + "grad_norm": 0.033988353521974116, + "language_loss": 0.8254481, + "learning_rate": 0.0004937693150682479, + "loss": 0.83597112, + "num_input_tokens_seen": 224677456, + "router_z_loss_mlp": 0.41381836, + "step": 2697, + "time_per_iteration": 2.5146913528442383 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048575, + "balance_loss_mlp": 1.00725734, + "epoch": 0.5190457868410927, + "flos": 547412708352.0, + "grad_norm": 0.031596370266791504, + "language_loss": 0.77111042, + "learning_rate": 0.0004934577981779107, + "loss": 0.78159618, + "num_input_tokens_seen": 224745600, + "router_z_loss_mlp": 0.41333008, + "step": 2698, + "time_per_iteration": 2.6567137241363525 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044661, + "balance_loss_mlp": 1.00327134, + "epoch": 0.5192381685263563, + "flos": 549746648064.0, + "grad_norm": 0.029705122804042017, + "language_loss": 0.81764138, + "learning_rate": 0.0004931462838274817, + "loss": 0.82808805, + "num_input_tokens_seen": 224826944, + "router_z_loss_mlp": 0.4140625, + "step": 2699, + "time_per_iteration": 2.817087173461914 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050227, + "balance_loss_mlp": 1.00895715, + "epoch": 0.5194305502116199, + "flos": 576350273280.0, + "grad_norm": 0.03619468074242637, + "language_loss": 0.84569639, + "learning_rate": 0.0004928347721379011, + "loss": 0.85619867, + "num_input_tokens_seen": 224895280, + "router_z_loss_mlp": 0.4128418, + "step": 2700, + "time_per_iteration": 2.6439361572265625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049587, + "balance_loss_mlp": 1.00831699, + "epoch": 0.5196229318968835, + "flos": 435218282496.0, + "grad_norm": 0.03299749227833017, + "language_loss": 0.82266027, + "learning_rate": 0.0004925232632301089, + "loss": 0.83315617, + "num_input_tokens_seen": 224961632, + "router_z_loss_mlp": 0.4128418, + "step": 2701, + "time_per_iteration": 2.5564098358154297 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045313, + "balance_loss_mlp": 1.00409007, + "epoch": 0.5198153135821469, + "flos": 559986438912.0, + "grad_norm": 0.03181007655018395, + "language_loss": 0.79940033, + "learning_rate": 0.0004922117572250431, + "loss": 0.80985349, + "num_input_tokens_seen": 225032816, + "router_z_loss_mlp": 0.41235352, + "step": 2702, + "time_per_iteration": 2.651662826538086 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048783, + "balance_loss_mlp": 1.00763237, + "epoch": 0.5200076952674105, + "flos": 566835646464.0, + "grad_norm": 0.030877309828348475, + "language_loss": 0.81538028, + "learning_rate": 0.0004919002542436414, + "loss": 0.82586813, + "num_input_tokens_seen": 225112736, + "router_z_loss_mlp": 0.41162109, + "step": 2703, + "time_per_iteration": 2.829218864440918 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051433, + "balance_loss_mlp": 1.01028192, + "epoch": 0.5202000769526741, + "flos": 572273409792.0, + "grad_norm": 0.031996161034096735, + "language_loss": 0.81638157, + "learning_rate": 0.0004915887544068399, + "loss": 0.82689589, + "num_input_tokens_seen": 225182672, + "router_z_loss_mlp": 0.41162109, + "step": 2704, + "time_per_iteration": 2.6583306789398193 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052276, + "balance_loss_mlp": 1.01110101, + "epoch": 0.5203924586379377, + "flos": 695467783680.0, + "grad_norm": 0.03456723160752419, + "language_loss": 0.7851603, + "learning_rate": 0.0004912772578355736, + "loss": 0.79568309, + "num_input_tokens_seen": 225260272, + "router_z_loss_mlp": 0.41186523, + "step": 2705, + "time_per_iteration": 2.9061107635498047 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051355, + "balance_loss_mlp": 1.01010871, + "epoch": 0.5205848403232012, + "flos": 567691067904.0, + "grad_norm": 0.03253184462937942, + "language_loss": 0.83445644, + "learning_rate": 0.000490965764650776, + "loss": 0.84497005, + "num_input_tokens_seen": 225337120, + "router_z_loss_mlp": 0.41259766, + "step": 2706, + "time_per_iteration": 2.8724799156188965 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051791, + "balance_loss_mlp": 1.01042521, + "epoch": 0.5207772220084648, + "flos": 1216205913600.0, + "grad_norm": 0.03130848752928153, + "language_loss": 0.83192623, + "learning_rate": 0.0004906542749733798, + "loss": 0.84244412, + "num_input_tokens_seen": 225433984, + "router_z_loss_mlp": 0.41381836, + "step": 2707, + "time_per_iteration": 3.6585958003997803 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049141, + "balance_loss_mlp": 1.00770402, + "epoch": 0.5209696036937284, + "flos": 594032232960.0, + "grad_norm": 0.02732760694007456, + "language_loss": 0.85709697, + "learning_rate": 0.0004903427889243156, + "loss": 0.86758834, + "num_input_tokens_seen": 225512112, + "router_z_loss_mlp": 0.41455078, + "step": 2708, + "time_per_iteration": 2.871150016784668 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044527, + "balance_loss_mlp": 1.00294721, + "epoch": 0.5211619853789919, + "flos": 523956648192.0, + "grad_norm": 0.03352920522422817, + "language_loss": 0.85979593, + "learning_rate": 0.0004900313066245134, + "loss": 0.87024117, + "num_input_tokens_seen": 225586944, + "router_z_loss_mlp": 0.41601562, + "step": 2709, + "time_per_iteration": 2.6438417434692383 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104781, + "balance_loss_mlp": 1.00632536, + "epoch": 0.5213543670642555, + "flos": 503861035776.0, + "grad_norm": 0.03205745002268137, + "language_loss": 0.81327069, + "learning_rate": 0.0004897198281949012, + "loss": 0.82374883, + "num_input_tokens_seen": 225657184, + "router_z_loss_mlp": 0.41503906, + "step": 2710, + "time_per_iteration": 2.693906307220459 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049613, + "balance_loss_mlp": 1.00800931, + "epoch": 0.521546748749519, + "flos": 587072209920.0, + "grad_norm": 0.036857631666753196, + "language_loss": 0.78204525, + "learning_rate": 0.0004894083537564057, + "loss": 0.79254138, + "num_input_tokens_seen": 225729968, + "router_z_loss_mlp": 0.41625977, + "step": 2711, + "time_per_iteration": 2.7300491333007812 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045008, + "balance_loss_mlp": 1.00333273, + "epoch": 0.5217391304347826, + "flos": 571266343680.0, + "grad_norm": 0.030696577254243577, + "language_loss": 0.81681752, + "learning_rate": 0.0004890968834299519, + "loss": 0.82726759, + "num_input_tokens_seen": 225801808, + "router_z_loss_mlp": 0.41699219, + "step": 2712, + "time_per_iteration": 2.746556043624878 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049825, + "balance_loss_mlp": 1.00831652, + "epoch": 0.5219315121200462, + "flos": 543920057856.0, + "grad_norm": 0.028956363679279982, + "language_loss": 0.79082847, + "learning_rate": 0.0004887854173364633, + "loss": 0.80132675, + "num_input_tokens_seen": 225878576, + "router_z_loss_mlp": 0.4152832, + "step": 2713, + "time_per_iteration": 2.733306884765625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051945, + "balance_loss_mlp": 1.01045978, + "epoch": 0.5221238938053098, + "flos": 551531367936.0, + "grad_norm": 0.030815907554272836, + "language_loss": 0.82228422, + "learning_rate": 0.0004884739555968617, + "loss": 0.83280361, + "num_input_tokens_seen": 225960096, + "router_z_loss_mlp": 0.41503906, + "step": 2714, + "time_per_iteration": 2.815034866333008 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054211, + "balance_loss_mlp": 1.01425171, + "epoch": 0.5223162754905732, + "flos": 1358392579584.0, + "grad_norm": 0.009025254493072253, + "language_loss": 0.78977054, + "learning_rate": 0.0004881624983320676, + "loss": 0.80031264, + "num_input_tokens_seen": 226184960, + "router_z_loss_mlp": 0.39941406, + "step": 2715, + "time_per_iteration": 5.005860090255737 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047232, + "balance_loss_mlp": 1.00550854, + "epoch": 0.5225086571758368, + "flos": 568974200064.0, + "grad_norm": 0.030755982791586634, + "language_loss": 0.87142956, + "learning_rate": 0.0004878510456629992, + "loss": 0.88190192, + "num_input_tokens_seen": 226271328, + "router_z_loss_mlp": 0.41748047, + "step": 2716, + "time_per_iteration": 2.9582624435424805 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048687, + "balance_loss_mlp": 1.00713038, + "epoch": 0.5227010388611004, + "flos": 501136323840.0, + "grad_norm": 0.03155972783921746, + "language_loss": 0.85419679, + "learning_rate": 0.00048753959771057314, + "loss": 0.86468375, + "num_input_tokens_seen": 226340080, + "router_z_loss_mlp": 0.41577148, + "step": 2717, + "time_per_iteration": 2.623081684112549 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104979, + "balance_loss_mlp": 1.00832856, + "epoch": 0.522893420546364, + "flos": 598799267328.0, + "grad_norm": 0.035176839616525644, + "language_loss": 0.83230948, + "learning_rate": 0.0004872281545957044, + "loss": 0.84280741, + "num_input_tokens_seen": 226415120, + "router_z_loss_mlp": 0.41479492, + "step": 2718, + "time_per_iteration": 2.7231285572052 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059587, + "balance_loss_mlp": 1.01800716, + "epoch": 0.5230858022316276, + "flos": 665922673152.0, + "grad_norm": 0.03224340083556492, + "language_loss": 0.86415994, + "learning_rate": 0.0004869167164393055, + "loss": 0.8747558, + "num_input_tokens_seen": 226501200, + "router_z_loss_mlp": 0.41601562, + "step": 2719, + "time_per_iteration": 2.9305646419525146 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054772, + "balance_loss_mlp": 1.0132159, + "epoch": 0.5232781839168911, + "flos": 605034126336.0, + "grad_norm": 0.0287825993415993, + "language_loss": 0.89917624, + "learning_rate": 0.00048660528336228793, + "loss": 0.909724, + "num_input_tokens_seen": 226582064, + "router_z_loss_mlp": 0.41577148, + "step": 2720, + "time_per_iteration": 2.788072347640991 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049748, + "balance_loss_mlp": 1.0080725, + "epoch": 0.5234705656021547, + "flos": 551841460224.0, + "grad_norm": 0.02763684671666484, + "language_loss": 0.90116215, + "learning_rate": 0.0004862938554855606, + "loss": 0.91165972, + "num_input_tokens_seen": 226656448, + "router_z_loss_mlp": 0.41699219, + "step": 2721, + "time_per_iteration": 2.775818109512329 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051445, + "balance_loss_mlp": 1.00965011, + "epoch": 0.5236629472874182, + "flos": 505295812608.0, + "grad_norm": 0.03601660428487822, + "language_loss": 0.86817378, + "learning_rate": 0.0004859824329300304, + "loss": 0.87868822, + "num_input_tokens_seen": 226725568, + "router_z_loss_mlp": 0.41821289, + "step": 2722, + "time_per_iteration": 2.587228536605835 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053208, + "balance_loss_mlp": 1.01138973, + "epoch": 0.5238553289726818, + "flos": 548697785856.0, + "grad_norm": 0.03170706554102953, + "language_loss": 0.83958352, + "learning_rate": 0.00048567101581660244, + "loss": 0.85011566, + "num_input_tokens_seen": 226795728, + "router_z_loss_mlp": 0.41845703, + "step": 2723, + "time_per_iteration": 2.6208062171936035 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050325, + "balance_loss_mlp": 1.00843501, + "epoch": 0.5240477106579453, + "flos": 533004680448.0, + "grad_norm": 0.03335820140898581, + "language_loss": 0.87488234, + "learning_rate": 0.00048535960426617956, + "loss": 0.88538557, + "num_input_tokens_seen": 226865344, + "router_z_loss_mlp": 0.41918945, + "step": 2724, + "time_per_iteration": 2.5951199531555176 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050726, + "balance_loss_mlp": 1.00883543, + "epoch": 0.5242400923432089, + "flos": 619090265856.0, + "grad_norm": 0.03212273913620546, + "language_loss": 0.8244487, + "learning_rate": 0.0004850481983996621, + "loss": 0.83495593, + "num_input_tokens_seen": 226936800, + "router_z_loss_mlp": 0.41918945, + "step": 2725, + "time_per_iteration": 2.747008800506592 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049694, + "balance_loss_mlp": 1.00785124, + "epoch": 0.5244324740284725, + "flos": 417590757888.0, + "grad_norm": 0.03280670580990367, + "language_loss": 0.88229245, + "learning_rate": 0.0004847367983379492, + "loss": 0.89278936, + "num_input_tokens_seen": 226998448, + "router_z_loss_mlp": 0.41870117, + "step": 2726, + "time_per_iteration": 2.437721014022827 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049333, + "balance_loss_mlp": 1.00770533, + "epoch": 0.5246248557137361, + "flos": 627732941568.0, + "grad_norm": 0.03120006141405487, + "language_loss": 0.79435945, + "learning_rate": 0.00048442540420193643, + "loss": 0.80485278, + "num_input_tokens_seen": 227081872, + "router_z_loss_mlp": 0.41650391, + "step": 2727, + "time_per_iteration": 2.927518844604492 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105055, + "balance_loss_mlp": 1.00911331, + "epoch": 0.5248172373989997, + "flos": 1250403352320.0, + "grad_norm": 0.03663625191481743, + "language_loss": 0.7991612, + "learning_rate": 0.0004841140161125182, + "loss": 0.80966663, + "num_input_tokens_seen": 227167744, + "router_z_loss_mlp": 0.41455078, + "step": 2728, + "time_per_iteration": 3.574690818786621 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053819, + "balance_loss_mlp": 1.01250064, + "epoch": 0.5250096190842631, + "flos": 507883464192.0, + "grad_norm": 0.03360211420143325, + "language_loss": 0.85387456, + "learning_rate": 0.0004838026341905857, + "loss": 0.86441278, + "num_input_tokens_seen": 227239136, + "router_z_loss_mlp": 0.41333008, + "step": 2729, + "time_per_iteration": 2.7263481616973877 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046844, + "balance_loss_mlp": 1.00547838, + "epoch": 0.5252020007695267, + "flos": 612508376064.0, + "grad_norm": 0.029211194306351093, + "language_loss": 0.85320604, + "learning_rate": 0.00048349125855702844, + "loss": 0.86367452, + "num_input_tokens_seen": 227311968, + "router_z_loss_mlp": 0.41381836, + "step": 2730, + "time_per_iteration": 2.775851011276245 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047856, + "balance_loss_mlp": 1.00649071, + "epoch": 0.5253943824547903, + "flos": 540292292352.0, + "grad_norm": 0.02938539212610817, + "language_loss": 0.81675971, + "learning_rate": 0.00048317988933273287, + "loss": 0.82723826, + "num_input_tokens_seen": 227385248, + "router_z_loss_mlp": 0.41381836, + "step": 2731, + "time_per_iteration": 2.7763831615448 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047356, + "balance_loss_mlp": 1.00613368, + "epoch": 0.5255867641400539, + "flos": 699338567424.0, + "grad_norm": 0.033934632058623626, + "language_loss": 0.82549971, + "learning_rate": 0.00048286852663858367, + "loss": 0.83597326, + "num_input_tokens_seen": 227464640, + "router_z_loss_mlp": 0.41235352, + "step": 2732, + "time_per_iteration": 2.96213698387146 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052513, + "balance_loss_mlp": 1.01131439, + "epoch": 0.5257791458253175, + "flos": 668549208576.0, + "grad_norm": 0.03297641476237434, + "language_loss": 0.84432375, + "learning_rate": 0.000482557170595462, + "loss": 0.85484892, + "num_input_tokens_seen": 227542192, + "router_z_loss_mlp": 0.41210938, + "step": 2733, + "time_per_iteration": 2.840514659881592 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050533, + "balance_loss_mlp": 1.00943005, + "epoch": 0.525971527510581, + "flos": 484605293568.0, + "grad_norm": 0.032410991276381265, + "language_loss": 0.88272679, + "learning_rate": 0.0004822458213242475, + "loss": 0.89323211, + "num_input_tokens_seen": 227606096, + "router_z_loss_mlp": 0.41113281, + "step": 2734, + "time_per_iteration": 2.560474157333374 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047172, + "balance_loss_mlp": 1.00613987, + "epoch": 0.5261639091958445, + "flos": 831348648960.0, + "grad_norm": 0.03341440797603734, + "language_loss": 0.86630881, + "learning_rate": 0.00048193447894581627, + "loss": 0.87678051, + "num_input_tokens_seen": 227689552, + "router_z_loss_mlp": 0.41040039, + "step": 2735, + "time_per_iteration": 3.1240243911743164 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105002, + "balance_loss_mlp": 1.00886869, + "epoch": 0.5263562908811081, + "flos": 521733523968.0, + "grad_norm": 0.03226346413051534, + "language_loss": 0.88327318, + "learning_rate": 0.00048162314358104243, + "loss": 0.89377338, + "num_input_tokens_seen": 227760784, + "router_z_loss_mlp": 0.41162109, + "step": 2736, + "time_per_iteration": 2.599510669708252 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047062, + "balance_loss_mlp": 1.00581563, + "epoch": 0.5265486725663717, + "flos": 576098506752.0, + "grad_norm": 0.03477073688653673, + "language_loss": 0.84006953, + "learning_rate": 0.0004813118153507969, + "loss": 0.85054016, + "num_input_tokens_seen": 227834304, + "router_z_loss_mlp": 0.41259766, + "step": 2737, + "time_per_iteration": 2.7309916019439697 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01057175, + "balance_loss_mlp": 1.0173111, + "epoch": 0.5267410542516352, + "flos": 1550561186304.0, + "grad_norm": 0.008968329145720436, + "language_loss": 0.82447124, + "learning_rate": 0.0004810004943759482, + "loss": 0.83504307, + "num_input_tokens_seen": 228057232, + "router_z_loss_mlp": 0.3984375, + "step": 2738, + "time_per_iteration": 4.815824747085571 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054104, + "balance_loss_mlp": 1.01311994, + "epoch": 0.5269334359368988, + "flos": 931462183680.0, + "grad_norm": 0.03276977156640091, + "language_loss": 0.84196591, + "learning_rate": 0.00048068918077736163, + "loss": 0.85250694, + "num_input_tokens_seen": 228140816, + "router_z_loss_mlp": 0.40991211, + "step": 2739, + "time_per_iteration": 3.2470173835754395 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051244, + "balance_loss_mlp": 1.01004505, + "epoch": 0.5271258176221624, + "flos": 656635513344.0, + "grad_norm": 0.03436954846361053, + "language_loss": 0.82138938, + "learning_rate": 0.0004803778746759001, + "loss": 0.83190179, + "num_input_tokens_seen": 228216208, + "router_z_loss_mlp": 0.41210938, + "step": 2740, + "time_per_iteration": 2.920330286026001 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051176, + "balance_loss_mlp": 1.01007247, + "epoch": 0.527318199307426, + "flos": 544062954240.0, + "grad_norm": 0.045913237701965745, + "language_loss": 0.82631075, + "learning_rate": 0.00048006657619242317, + "loss": 0.83682251, + "num_input_tokens_seen": 228283184, + "router_z_loss_mlp": 0.41113281, + "step": 2741, + "time_per_iteration": 2.612001419067383 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045962, + "balance_loss_mlp": 1.00462067, + "epoch": 0.5275105809926895, + "flos": 448899201024.0, + "grad_norm": 0.036563153452021165, + "language_loss": 0.78434455, + "learning_rate": 0.00047975528544778775, + "loss": 0.7948041, + "num_input_tokens_seen": 228351328, + "router_z_loss_mlp": 0.41357422, + "step": 2742, + "time_per_iteration": 2.590146064758301 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042619, + "balance_loss_mlp": 1.00130069, + "epoch": 0.527702962677953, + "flos": 580053861120.0, + "grad_norm": 0.038221984800347206, + "language_loss": 0.89132345, + "learning_rate": 0.00047944400256284754, + "loss": 0.90174961, + "num_input_tokens_seen": 228423632, + "router_z_loss_mlp": 0.41333008, + "step": 2743, + "time_per_iteration": 2.691096305847168 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046686, + "balance_loss_mlp": 1.00548708, + "epoch": 0.5278953443632166, + "flos": 654010923264.0, + "grad_norm": 0.03476413811576821, + "language_loss": 0.80653423, + "learning_rate": 0.0004791327276584532, + "loss": 0.8170011, + "num_input_tokens_seen": 228498736, + "router_z_loss_mlp": 0.41210938, + "step": 2744, + "time_per_iteration": 2.8089282512664795 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048783, + "balance_loss_mlp": 1.00753677, + "epoch": 0.5280877260484802, + "flos": 515049566976.0, + "grad_norm": 0.03187296499214836, + "language_loss": 0.81036532, + "learning_rate": 0.00047882146085545264, + "loss": 0.82085317, + "num_input_tokens_seen": 228569056, + "router_z_loss_mlp": 0.41259766, + "step": 2745, + "time_per_iteration": 2.646883010864258 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055283, + "balance_loss_mlp": 1.01541901, + "epoch": 0.5282801077337438, + "flos": 1448715421440.0, + "grad_norm": 0.006687794222264933, + "language_loss": 0.75402379, + "learning_rate": 0.00047851020227469, + "loss": 0.76457667, + "num_input_tokens_seen": 228800560, + "router_z_loss_mlp": 0.3984375, + "step": 2746, + "time_per_iteration": 4.967897653579712 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048206, + "balance_loss_mlp": 1.00703144, + "epoch": 0.5284724894190073, + "flos": 605967315456.0, + "grad_norm": 0.03667028691338261, + "language_loss": 0.80105197, + "learning_rate": 0.00047819895203700684, + "loss": 0.81153399, + "num_input_tokens_seen": 228869216, + "router_z_loss_mlp": 0.41186523, + "step": 2747, + "time_per_iteration": 2.7146098613739014 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105164, + "balance_loss_mlp": 1.01187134, + "epoch": 0.5286648711042709, + "flos": 1498106323200.0, + "grad_norm": 0.006729060992495368, + "language_loss": 0.75512433, + "learning_rate": 0.0004778877102632412, + "loss": 0.76564074, + "num_input_tokens_seen": 229085520, + "router_z_loss_mlp": 0.39746094, + "step": 2748, + "time_per_iteration": 4.6327197551727295 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045337, + "balance_loss_mlp": 1.00416195, + "epoch": 0.5288572527895344, + "flos": 598834260480.0, + "grad_norm": 0.03692084834433464, + "language_loss": 0.89385319, + "learning_rate": 0.0004775764770742277, + "loss": 0.90430653, + "num_input_tokens_seen": 229160912, + "router_z_loss_mlp": 0.41186523, + "step": 2749, + "time_per_iteration": 2.807567834854126 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045141, + "balance_loss_mlp": 1.00394237, + "epoch": 0.529049634474798, + "flos": 558440846592.0, + "grad_norm": 0.03911259999059639, + "language_loss": 0.87067056, + "learning_rate": 0.00047726525259079777, + "loss": 0.88112199, + "num_input_tokens_seen": 229235792, + "router_z_loss_mlp": 0.41210938, + "step": 2750, + "time_per_iteration": 2.7838735580444336 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044638, + "balance_loss_mlp": 1.00348663, + "epoch": 0.5292420161600616, + "flos": 582435432960.0, + "grad_norm": 0.03406590895995427, + "language_loss": 0.89342177, + "learning_rate": 0.0004769540369337798, + "loss": 0.9038682, + "num_input_tokens_seen": 229309984, + "router_z_loss_mlp": 0.41162109, + "step": 2751, + "time_per_iteration": 2.716430902481079 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010453, + "balance_loss_mlp": 1.00405395, + "epoch": 0.5294343978453251, + "flos": 609564945408.0, + "grad_norm": 0.0303004693379624, + "language_loss": 0.8646909, + "learning_rate": 0.00047664283022399794, + "loss": 0.87514395, + "num_input_tokens_seen": 229394000, + "router_z_loss_mlp": 0.41259766, + "step": 2752, + "time_per_iteration": 2.8746426105499268 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048518, + "balance_loss_mlp": 1.00736678, + "epoch": 0.5296267795305887, + "flos": 647710935552.0, + "grad_norm": 0.032209809873809676, + "language_loss": 0.81781971, + "learning_rate": 0.00047633163258227376, + "loss": 0.82830489, + "num_input_tokens_seen": 229474320, + "router_z_loss_mlp": 0.41162109, + "step": 2753, + "time_per_iteration": 2.859628677368164 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048376, + "balance_loss_mlp": 1.0070343, + "epoch": 0.5298191612158523, + "flos": 560806867200.0, + "grad_norm": 0.034095977821307535, + "language_loss": 0.85918152, + "learning_rate": 0.0004760204441294247, + "loss": 0.86966527, + "num_input_tokens_seen": 229543072, + "router_z_loss_mlp": 0.41357422, + "step": 2754, + "time_per_iteration": 2.642761707305908 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049622, + "balance_loss_mlp": 1.00842357, + "epoch": 0.5300115429011159, + "flos": 515132192256.0, + "grad_norm": 0.03324074908377848, + "language_loss": 0.86806327, + "learning_rate": 0.00047570926498626486, + "loss": 0.87855953, + "num_input_tokens_seen": 229615296, + "router_z_loss_mlp": 0.41210938, + "step": 2755, + "time_per_iteration": 2.688204765319824 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048264, + "balance_loss_mlp": 1.00699341, + "epoch": 0.5302039245863793, + "flos": 674050155264.0, + "grad_norm": 0.032282959747224574, + "language_loss": 0.82332271, + "learning_rate": 0.00047539809527360474, + "loss": 0.83380532, + "num_input_tokens_seen": 229693728, + "router_z_loss_mlp": 0.4128418, + "step": 2756, + "time_per_iteration": 2.891369104385376 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051726, + "balance_loss_mlp": 1.01052761, + "epoch": 0.5303963062716429, + "flos": 732157609728.0, + "grad_norm": 0.027910460797545535, + "language_loss": 0.82830453, + "learning_rate": 0.0004750869351122511, + "loss": 0.83882177, + "num_input_tokens_seen": 229772144, + "router_z_loss_mlp": 0.41210938, + "step": 2757, + "time_per_iteration": 2.9782614707946777 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051325, + "balance_loss_mlp": 1.01015055, + "epoch": 0.5305886879569065, + "flos": 574552914432.0, + "grad_norm": 0.03118318769242836, + "language_loss": 0.82440865, + "learning_rate": 0.00047477578462300685, + "loss": 0.83492196, + "num_input_tokens_seen": 229847024, + "router_z_loss_mlp": 0.41186523, + "step": 2758, + "time_per_iteration": 2.7210254669189453 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104635, + "balance_loss_mlp": 1.00498474, + "epoch": 0.5307810696421701, + "flos": 696729528576.0, + "grad_norm": 0.03181982217221047, + "language_loss": 0.79867083, + "learning_rate": 0.0004744646439266718, + "loss": 0.8091343, + "num_input_tokens_seen": 229932416, + "router_z_loss_mlp": 0.41381836, + "step": 2759, + "time_per_iteration": 2.997299909591675 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046488, + "balance_loss_mlp": 1.005265, + "epoch": 0.5309734513274337, + "flos": 650203322880.0, + "grad_norm": 0.04897119780065821, + "language_loss": 0.92728293, + "learning_rate": 0.000474153513144041, + "loss": 0.93774784, + "num_input_tokens_seen": 230010976, + "router_z_loss_mlp": 0.41235352, + "step": 2760, + "time_per_iteration": 2.9030909538269043 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047525, + "balance_loss_mlp": 1.00618315, + "epoch": 0.5311658330126972, + "flos": 606056743680.0, + "grad_norm": 0.03383323202633534, + "language_loss": 0.87311566, + "learning_rate": 0.00047384239239590633, + "loss": 0.88359094, + "num_input_tokens_seen": 230093344, + "router_z_loss_mlp": 0.41357422, + "step": 2761, + "time_per_iteration": 2.8522770404815674 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049914, + "balance_loss_mlp": 1.00859571, + "epoch": 0.5313582146979607, + "flos": 559317655296.0, + "grad_norm": 0.03320129260812799, + "language_loss": 0.89026552, + "learning_rate": 0.0004735312818030556, + "loss": 0.90076458, + "num_input_tokens_seen": 230165520, + "router_z_loss_mlp": 0.41333008, + "step": 2762, + "time_per_iteration": 2.6917500495910645 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045201, + "balance_loss_mlp": 1.00390708, + "epoch": 0.5315505963832243, + "flos": 509446553088.0, + "grad_norm": 0.032512052220750494, + "language_loss": 0.8324827, + "learning_rate": 0.0004732201814862727, + "loss": 0.84293473, + "num_input_tokens_seen": 230237808, + "router_z_loss_mlp": 0.41308594, + "step": 2763, + "time_per_iteration": 2.7620086669921875 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045981, + "balance_loss_mlp": 1.00461555, + "epoch": 0.5317429780684879, + "flos": 627669758208.0, + "grad_norm": 0.03302669202039023, + "language_loss": 0.81508183, + "learning_rate": 0.0004729090915663373, + "loss": 0.82554156, + "num_input_tokens_seen": 230321568, + "router_z_loss_mlp": 0.41381836, + "step": 2764, + "time_per_iteration": 2.827430248260498 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044748, + "balance_loss_mlp": 1.00333464, + "epoch": 0.5319353597537514, + "flos": 477699705600.0, + "grad_norm": 0.039772813062738895, + "language_loss": 0.85676539, + "learning_rate": 0.00047259801216402534, + "loss": 0.86721289, + "num_input_tokens_seen": 230385376, + "router_z_loss_mlp": 0.41430664, + "step": 2765, + "time_per_iteration": 2.5082104206085205 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104674, + "balance_loss_mlp": 1.00535059, + "epoch": 0.532127741439015, + "flos": 502634284032.0, + "grad_norm": 0.03926492526470634, + "language_loss": 0.86841261, + "learning_rate": 0.00047228694340010845, + "loss": 0.87888008, + "num_input_tokens_seen": 230449760, + "router_z_loss_mlp": 0.4140625, + "step": 2766, + "time_per_iteration": 2.549739360809326 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047042, + "balance_loss_mlp": 1.00555718, + "epoch": 0.5323201231242786, + "flos": 1166484510720.0, + "grad_norm": 0.033303639033777616, + "language_loss": 0.86118937, + "learning_rate": 0.0004719758853953544, + "loss": 0.87165976, + "num_input_tokens_seen": 230536592, + "router_z_loss_mlp": 0.41503906, + "step": 2767, + "time_per_iteration": 3.5872445106506348 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050417, + "balance_loss_mlp": 1.00888503, + "epoch": 0.5325125048095422, + "flos": 379541977344.0, + "grad_norm": 0.045646551162954616, + "language_loss": 0.84812796, + "learning_rate": 0.00047166483827052645, + "loss": 0.85863209, + "num_input_tokens_seen": 230596688, + "router_z_loss_mlp": 0.41552734, + "step": 2768, + "time_per_iteration": 2.4177846908569336 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01057545, + "balance_loss_mlp": 1.01796722, + "epoch": 0.5327048864948057, + "flos": 1544750147328.0, + "grad_norm": 0.015563445131555704, + "language_loss": 0.77078491, + "learning_rate": 0.00047135380214638413, + "loss": 0.78136033, + "num_input_tokens_seen": 230829408, + "router_z_loss_mlp": 0.39550781, + "step": 2769, + "time_per_iteration": 4.974437236785889 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045958, + "balance_loss_mlp": 1.00447309, + "epoch": 0.5328972681800692, + "flos": 912862586112.0, + "grad_norm": 0.03252924413682995, + "language_loss": 0.84066141, + "learning_rate": 0.000471042777143682, + "loss": 0.85112101, + "num_input_tokens_seen": 230912528, + "router_z_loss_mlp": 0.41503906, + "step": 2770, + "time_per_iteration": 3.204782724380493 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104834, + "balance_loss_mlp": 1.00680697, + "epoch": 0.5330896498653328, + "flos": 474851539200.0, + "grad_norm": 0.03462661973501109, + "language_loss": 0.80093729, + "learning_rate": 0.0004707317633831707, + "loss": 0.81142068, + "num_input_tokens_seen": 230979424, + "router_z_loss_mlp": 0.41552734, + "step": 2771, + "time_per_iteration": 2.566772699356079 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049499, + "balance_loss_mlp": 1.00789511, + "epoch": 0.5332820315505964, + "flos": 502634284032.0, + "grad_norm": 0.03484250248812788, + "language_loss": 0.78787035, + "learning_rate": 0.00047042076098559673, + "loss": 0.79836535, + "num_input_tokens_seen": 231046416, + "router_z_loss_mlp": 0.41625977, + "step": 2772, + "time_per_iteration": 2.5929906368255615 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046414, + "balance_loss_mlp": 1.00454724, + "epoch": 0.53347441323586, + "flos": 926033168640.0, + "grad_norm": 0.038112679556298976, + "language_loss": 0.74248701, + "learning_rate": 0.00047010977007170174, + "loss": 0.75295115, + "num_input_tokens_seen": 231136064, + "router_z_loss_mlp": 0.41894531, + "step": 2773, + "time_per_iteration": 3.221947193145752 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051169, + "balance_loss_mlp": 1.00956452, + "epoch": 0.5336667949211235, + "flos": 575540538624.0, + "grad_norm": 0.03388488907034337, + "language_loss": 0.83005095, + "learning_rate": 0.00046979879076222334, + "loss": 0.8405627, + "num_input_tokens_seen": 231203616, + "router_z_loss_mlp": 0.41625977, + "step": 2774, + "time_per_iteration": 2.7014822959899902 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049211, + "balance_loss_mlp": 1.00767875, + "epoch": 0.533859176606387, + "flos": 1066392363264.0, + "grad_norm": 0.03095569704566717, + "language_loss": 0.85300922, + "learning_rate": 0.0004694878231778939, + "loss": 0.86350143, + "num_input_tokens_seen": 231287008, + "router_z_loss_mlp": 0.41552734, + "step": 2775, + "time_per_iteration": 3.368795156478882 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048588, + "balance_loss_mlp": 1.00700808, + "epoch": 0.5340515582916506, + "flos": 747907095552.0, + "grad_norm": 0.030429614039409136, + "language_loss": 0.84799051, + "learning_rate": 0.0004691768674394423, + "loss": 0.8584764, + "num_input_tokens_seen": 231365296, + "router_z_loss_mlp": 0.41601562, + "step": 2776, + "time_per_iteration": 2.958280324935913 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052456, + "balance_loss_mlp": 1.01230621, + "epoch": 0.5342439399769142, + "flos": 1448821379328.0, + "grad_norm": 0.012202915272427423, + "language_loss": 0.84484011, + "learning_rate": 0.0004688659236675918, + "loss": 0.85536468, + "num_input_tokens_seen": 231579040, + "router_z_loss_mlp": 0.40136719, + "step": 2777, + "time_per_iteration": 4.774897575378418 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049236, + "balance_loss_mlp": 1.00908661, + "epoch": 0.5344363216621778, + "flos": 1430699069952.0, + "grad_norm": 0.005918596107012712, + "language_loss": 0.76653534, + "learning_rate": 0.00046855499198306187, + "loss": 0.77702767, + "num_input_tokens_seen": 231812736, + "router_z_loss_mlp": 0.40136719, + "step": 2778, + "time_per_iteration": 4.978635549545288 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051694, + "balance_loss_mlp": 1.01039958, + "epoch": 0.5346287033474413, + "flos": 528676050432.0, + "grad_norm": 0.029867236989907914, + "language_loss": 0.79874206, + "learning_rate": 0.00046824407250656676, + "loss": 0.80925894, + "num_input_tokens_seen": 231883840, + "router_z_loss_mlp": 0.41308594, + "step": 2779, + "time_per_iteration": 2.610321044921875 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049124, + "balance_loss_mlp": 1.00790143, + "epoch": 0.5348210850327049, + "flos": 511756193280.0, + "grad_norm": 0.03028632537310572, + "language_loss": 0.83974576, + "learning_rate": 0.0004679331653588161, + "loss": 0.85023701, + "num_input_tokens_seen": 231955360, + "router_z_loss_mlp": 0.41235352, + "step": 2780, + "time_per_iteration": 2.641401529312134 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046669, + "balance_loss_mlp": 1.00530362, + "epoch": 0.5350134667179685, + "flos": 463626069504.0, + "grad_norm": 0.032724184133620285, + "language_loss": 0.86073065, + "learning_rate": 0.0004676222706605147, + "loss": 0.87119734, + "num_input_tokens_seen": 232027088, + "router_z_loss_mlp": 0.41381836, + "step": 2781, + "time_per_iteration": 2.6093719005584717 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046994, + "balance_loss_mlp": 1.005795, + "epoch": 0.535205848403232, + "flos": 710118829824.0, + "grad_norm": 0.033538440780340566, + "language_loss": 0.85521388, + "learning_rate": 0.0004673113885323626, + "loss": 0.86568379, + "num_input_tokens_seen": 232099472, + "router_z_loss_mlp": 0.41210938, + "step": 2782, + "time_per_iteration": 2.8278369903564453 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044719, + "balance_loss_mlp": 1.00337684, + "epoch": 0.5353982300884956, + "flos": 895793029632.0, + "grad_norm": 0.03115315889801346, + "language_loss": 0.79367262, + "learning_rate": 0.00046700051909505494, + "loss": 0.80411977, + "num_input_tokens_seen": 232182528, + "router_z_loss_mlp": 0.41357422, + "step": 2783, + "time_per_iteration": 3.181025743484497 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045329, + "balance_loss_mlp": 1.00410628, + "epoch": 0.5355906117737591, + "flos": 537025163520.0, + "grad_norm": 0.03272022966866855, + "language_loss": 0.84359205, + "learning_rate": 0.000466689662469282, + "loss": 0.85404533, + "num_input_tokens_seen": 232253344, + "router_z_loss_mlp": 0.41235352, + "step": 2784, + "time_per_iteration": 2.623128890991211 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045278, + "balance_loss_mlp": 1.00419891, + "epoch": 0.5357829934590227, + "flos": 870328673280.0, + "grad_norm": 0.0344669350963294, + "language_loss": 0.84610772, + "learning_rate": 0.00046637881877572917, + "loss": 0.85656047, + "num_input_tokens_seen": 232337232, + "router_z_loss_mlp": 0.41088867, + "step": 2785, + "time_per_iteration": 3.079174757003784 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010433, + "balance_loss_mlp": 1.00229168, + "epoch": 0.5359753751442863, + "flos": 554446608384.0, + "grad_norm": 0.028858393123854686, + "language_loss": 0.85135722, + "learning_rate": 0.0004660679881350764, + "loss": 0.86179018, + "num_input_tokens_seen": 232412864, + "router_z_loss_mlp": 0.41015625, + "step": 2786, + "time_per_iteration": 2.7473020553588867 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01041367, + "balance_loss_mlp": 1.00150299, + "epoch": 0.5361677568295499, + "flos": 1483759533312.0, + "grad_norm": 0.0067453290840893895, + "language_loss": 0.75608146, + "learning_rate": 0.0004657571706679988, + "loss": 0.76649511, + "num_input_tokens_seen": 232639888, + "router_z_loss_mlp": 0.3984375, + "step": 2787, + "time_per_iteration": 5.041473627090454 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043788, + "balance_loss_mlp": 1.0027802, + "epoch": 0.5363601385148133, + "flos": 807642767616.0, + "grad_norm": 0.03504389904677532, + "language_loss": 0.78613555, + "learning_rate": 0.0004654463664951667, + "loss": 0.79657346, + "num_input_tokens_seen": 232719248, + "router_z_loss_mlp": 0.41015625, + "step": 2788, + "time_per_iteration": 2.9798529148101807 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048738, + "balance_loss_mlp": 1.00775349, + "epoch": 0.5365525202000769, + "flos": 508879836672.0, + "grad_norm": 0.03320853792290129, + "language_loss": 0.8327626, + "learning_rate": 0.0004651355757372447, + "loss": 0.84325004, + "num_input_tokens_seen": 232788464, + "router_z_loss_mlp": 0.40991211, + "step": 2789, + "time_per_iteration": 2.643827438354492 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048286, + "balance_loss_mlp": 1.00720656, + "epoch": 0.5367449018853405, + "flos": 530015563008.0, + "grad_norm": 0.032066447391342436, + "language_loss": 0.8626231, + "learning_rate": 0.00046482479851489274, + "loss": 0.87310588, + "num_input_tokens_seen": 232859792, + "router_z_loss_mlp": 0.41088867, + "step": 2790, + "time_per_iteration": 2.7637765407562256 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046715, + "balance_loss_mlp": 1.0056597, + "epoch": 0.5369372835706041, + "flos": 651217191936.0, + "grad_norm": 0.038515792328953954, + "language_loss": 0.78515691, + "learning_rate": 0.00046451403494876525, + "loss": 0.79562402, + "num_input_tokens_seen": 232941472, + "router_z_loss_mlp": 0.41064453, + "step": 2791, + "time_per_iteration": 2.9090025424957275 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046127, + "balance_loss_mlp": 1.00504696, + "epoch": 0.5371296652558677, + "flos": 585628684800.0, + "grad_norm": 0.03231753899308558, + "language_loss": 0.84747189, + "learning_rate": 0.0004642032851595111, + "loss": 0.85793316, + "num_input_tokens_seen": 233017120, + "router_z_loss_mlp": 0.41088867, + "step": 2792, + "time_per_iteration": 2.775444507598877 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048329, + "balance_loss_mlp": 1.00717819, + "epoch": 0.5373220469411312, + "flos": 597084533760.0, + "grad_norm": 0.03483653357210067, + "language_loss": 0.85361469, + "learning_rate": 0.00046389254926777404, + "loss": 0.86409795, + "num_input_tokens_seen": 233095408, + "router_z_loss_mlp": 0.41162109, + "step": 2793, + "time_per_iteration": 2.8168118000030518 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045027, + "balance_loss_mlp": 1.00378096, + "epoch": 0.5375144286263948, + "flos": 1116279016704.0, + "grad_norm": 0.03171846878783484, + "language_loss": 0.78282589, + "learning_rate": 0.0004635818273941926, + "loss": 0.79327619, + "num_input_tokens_seen": 233191056, + "router_z_loss_mlp": 0.41259766, + "step": 2794, + "time_per_iteration": 3.5206284523010254 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044308, + "balance_loss_mlp": 1.00301409, + "epoch": 0.5377068103116583, + "flos": 596769583872.0, + "grad_norm": 0.0416500636560626, + "language_loss": 0.82705241, + "learning_rate": 0.0004632711196593997, + "loss": 0.83749551, + "num_input_tokens_seen": 233265536, + "router_z_loss_mlp": 0.41308594, + "step": 2795, + "time_per_iteration": 2.81925892829895 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010512, + "balance_loss_mlp": 1.0100255, + "epoch": 0.5378991919969219, + "flos": 885650448384.0, + "grad_norm": 0.03764518727969069, + "language_loss": 0.85939819, + "learning_rate": 0.00046296042618402297, + "loss": 0.86991024, + "num_input_tokens_seen": 233348224, + "router_z_loss_mlp": 0.41186523, + "step": 2796, + "time_per_iteration": 3.076819658279419 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047791, + "balance_loss_mlp": 1.00666356, + "epoch": 0.5380915736821854, + "flos": 711951181824.0, + "grad_norm": 0.02842771896049368, + "language_loss": 0.79539001, + "learning_rate": 0.0004626497470886839, + "loss": 0.80586791, + "num_input_tokens_seen": 233429344, + "router_z_loss_mlp": 0.41137695, + "step": 2797, + "time_per_iteration": 2.9846107959747314 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049616, + "balance_loss_mlp": 1.00844073, + "epoch": 0.538283955367449, + "flos": 558115203072.0, + "grad_norm": 0.029565541443496178, + "language_loss": 0.82388103, + "learning_rate": 0.00046233908249399897, + "loss": 0.83437717, + "num_input_tokens_seen": 233504944, + "router_z_loss_mlp": 0.41186523, + "step": 2798, + "time_per_iteration": 2.7782254219055176 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053826, + "balance_loss_mlp": 1.01255548, + "epoch": 0.5384763370527126, + "flos": 514482850560.0, + "grad_norm": 0.03320479864481119, + "language_loss": 0.78804994, + "learning_rate": 0.00046202843252057905, + "loss": 0.79858828, + "num_input_tokens_seen": 233573072, + "router_z_loss_mlp": 0.4128418, + "step": 2799, + "time_per_iteration": 2.60296368598938 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051127, + "balance_loss_mlp": 1.00985634, + "epoch": 0.5386687187379762, + "flos": 490720588800.0, + "grad_norm": 0.036707180351256564, + "language_loss": 0.84230787, + "learning_rate": 0.00046171779728902896, + "loss": 0.8528192, + "num_input_tokens_seen": 233640896, + "router_z_loss_mlp": 0.4128418, + "step": 2800, + "time_per_iteration": 2.5585505962371826 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046392, + "balance_loss_mlp": 1.00514555, + "epoch": 0.5388611004232398, + "flos": 483628363008.0, + "grad_norm": 0.04683117604826235, + "language_loss": 0.86678994, + "learning_rate": 0.000461407176919948, + "loss": 0.87725389, + "num_input_tokens_seen": 233703904, + "router_z_loss_mlp": 0.41259766, + "step": 2801, + "time_per_iteration": 2.5158677101135254 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045703, + "balance_loss_mlp": 1.00440919, + "epoch": 0.5390534821085032, + "flos": 562089999360.0, + "grad_norm": 0.033429611400543416, + "language_loss": 0.85806906, + "learning_rate": 0.00046109657153392997, + "loss": 0.8685261, + "num_input_tokens_seen": 233779248, + "router_z_loss_mlp": 0.41308594, + "step": 2802, + "time_per_iteration": 2.685462236404419 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047109, + "balance_loss_mlp": 1.00591016, + "epoch": 0.5392458637937668, + "flos": 489361634304.0, + "grad_norm": 0.036955437438287664, + "language_loss": 0.83497781, + "learning_rate": 0.0004607859812515622, + "loss": 0.84544891, + "num_input_tokens_seen": 233847520, + "router_z_loss_mlp": 0.41210938, + "step": 2803, + "time_per_iteration": 2.6187045574188232 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054182, + "balance_loss_mlp": 1.01300752, + "epoch": 0.5394382454790304, + "flos": 513050019072.0, + "grad_norm": 0.03744234433888121, + "language_loss": 0.88279247, + "learning_rate": 0.00046047540619342667, + "loss": 0.89333427, + "num_input_tokens_seen": 233911328, + "router_z_loss_mlp": 0.41186523, + "step": 2804, + "time_per_iteration": 2.5895795822143555 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046381, + "balance_loss_mlp": 1.00525355, + "epoch": 0.539630627164294, + "flos": 568689385728.0, + "grad_norm": 0.033797229327163864, + "language_loss": 0.80605161, + "learning_rate": 0.00046016484648009933, + "loss": 0.81651545, + "num_input_tokens_seen": 233987104, + "router_z_loss_mlp": 0.41137695, + "step": 2805, + "time_per_iteration": 2.691092014312744 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047278, + "balance_loss_mlp": 1.00612748, + "epoch": 0.5398230088495575, + "flos": 527503733760.0, + "grad_norm": 0.03721333567310717, + "language_loss": 0.8141259, + "learning_rate": 0.0004598543022321501, + "loss": 0.82459861, + "num_input_tokens_seen": 234057216, + "router_z_loss_mlp": 0.41162109, + "step": 2806, + "time_per_iteration": 2.6083474159240723 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044352, + "balance_loss_mlp": 1.00312901, + "epoch": 0.5400153905348211, + "flos": 539853888000.0, + "grad_norm": 0.03209862982455251, + "language_loss": 0.80560988, + "learning_rate": 0.0004595437735701433, + "loss": 0.81605339, + "num_input_tokens_seen": 234129984, + "router_z_loss_mlp": 0.41235352, + "step": 2807, + "time_per_iteration": 2.688770055770874 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104467, + "balance_loss_mlp": 1.00354242, + "epoch": 0.5402077722200846, + "flos": 514665597696.0, + "grad_norm": 0.03651112385557252, + "language_loss": 0.83778703, + "learning_rate": 0.00045923326061463623, + "loss": 0.84823376, + "num_input_tokens_seen": 234203920, + "router_z_loss_mlp": 0.41137695, + "step": 2808, + "time_per_iteration": 2.761165142059326 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046543, + "balance_loss_mlp": 1.00534451, + "epoch": 0.5404001539053482, + "flos": 677567105280.0, + "grad_norm": 0.031915220360544935, + "language_loss": 0.81941223, + "learning_rate": 0.00045892276348618113, + "loss": 0.82987767, + "num_input_tokens_seen": 234285440, + "router_z_loss_mlp": 0.41210938, + "step": 2809, + "time_per_iteration": 2.9716503620147705 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105954, + "balance_loss_mlp": 1.01948547, + "epoch": 0.5405925355906118, + "flos": 1558191938304.0, + "grad_norm": 0.009079850654737754, + "language_loss": 0.78260827, + "learning_rate": 0.0004586122823053235, + "loss": 0.79320371, + "num_input_tokens_seen": 234521424, + "router_z_loss_mlp": 0.40039062, + "step": 2810, + "time_per_iteration": 4.989593029022217 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051922, + "balance_loss_mlp": 1.01069915, + "epoch": 0.5407849172758753, + "flos": 648538166784.0, + "grad_norm": 0.030063831285765737, + "language_loss": 0.81372178, + "learning_rate": 0.000458301817192603, + "loss": 0.82424104, + "num_input_tokens_seen": 234601632, + "router_z_loss_mlp": 0.41235352, + "step": 2811, + "time_per_iteration": 2.855461359024048 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063999, + "balance_loss_mlp": 1.02404022, + "epoch": 0.5409772989611389, + "flos": 1410483893760.0, + "grad_norm": 0.010433444863556941, + "language_loss": 0.8084178, + "learning_rate": 0.00045799136826855263, + "loss": 0.81905782, + "num_input_tokens_seen": 234825776, + "router_z_loss_mlp": 0.39941406, + "step": 2812, + "time_per_iteration": 4.82320761680603 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048585, + "balance_loss_mlp": 1.00748193, + "epoch": 0.5411696806464025, + "flos": 555545048064.0, + "grad_norm": 0.0337189850887645, + "language_loss": 0.87703073, + "learning_rate": 0.00045768093565369983, + "loss": 0.88751662, + "num_input_tokens_seen": 234901504, + "router_z_loss_mlp": 0.41113281, + "step": 2813, + "time_per_iteration": 2.7693569660186768 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047899, + "balance_loss_mlp": 1.00660491, + "epoch": 0.5413620623316661, + "flos": 529205828352.0, + "grad_norm": 0.032417929995103685, + "language_loss": 0.82523155, + "learning_rate": 0.0004573705194685646, + "loss": 0.83571053, + "num_input_tokens_seen": 234970288, + "router_z_loss_mlp": 0.41308594, + "step": 2814, + "time_per_iteration": 2.6525402069091797 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047552, + "balance_loss_mlp": 1.00637758, + "epoch": 0.5415544440169295, + "flos": 599852020224.0, + "grad_norm": 0.03532378336462207, + "language_loss": 0.85743833, + "learning_rate": 0.00045706011983366157, + "loss": 0.86791384, + "num_input_tokens_seen": 235039984, + "router_z_loss_mlp": 0.41186523, + "step": 2815, + "time_per_iteration": 2.67850661277771 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049782, + "balance_loss_mlp": 1.0085113, + "epoch": 0.5417468257021931, + "flos": 471714667776.0, + "grad_norm": 0.039926593194372036, + "language_loss": 0.83561838, + "learning_rate": 0.00045674973686949847, + "loss": 0.84611619, + "num_input_tokens_seen": 235105232, + "router_z_loss_mlp": 0.4128418, + "step": 2816, + "time_per_iteration": 2.56265926361084 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049575, + "balance_loss_mlp": 1.00839996, + "epoch": 0.5419392073874567, + "flos": 682191243264.0, + "grad_norm": 0.04027281254885066, + "language_loss": 0.85790694, + "learning_rate": 0.0004564393706965766, + "loss": 0.86840272, + "num_input_tokens_seen": 235192560, + "router_z_loss_mlp": 0.41186523, + "step": 2817, + "time_per_iteration": 2.955655574798584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048253, + "balance_loss_mlp": 1.00700641, + "epoch": 0.5421315890727203, + "flos": 463337364480.0, + "grad_norm": 0.033241337033607515, + "language_loss": 0.82050943, + "learning_rate": 0.00045612902143539116, + "loss": 0.83099198, + "num_input_tokens_seen": 235258448, + "router_z_loss_mlp": 0.41259766, + "step": 2818, + "time_per_iteration": 2.546567440032959 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043229, + "balance_loss_mlp": 1.0021013, + "epoch": 0.5423239707579839, + "flos": 437890504704.0, + "grad_norm": 0.03727551718578137, + "language_loss": 0.82264733, + "learning_rate": 0.00045581868920642986, + "loss": 0.83307964, + "num_input_tokens_seen": 235322176, + "router_z_loss_mlp": 0.41137695, + "step": 2819, + "time_per_iteration": 2.4746038913726807 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043245, + "balance_loss_mlp": 1.00197434, + "epoch": 0.5425163524432474, + "flos": 459306187776.0, + "grad_norm": 0.035271404401503774, + "language_loss": 0.80009091, + "learning_rate": 0.00045550837413017457, + "loss": 0.81052339, + "num_input_tokens_seen": 235390960, + "router_z_loss_mlp": 0.4128418, + "step": 2820, + "time_per_iteration": 2.598879098892212 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044733, + "balance_loss_mlp": 1.00348616, + "epoch": 0.542708734128511, + "flos": 420410734080.0, + "grad_norm": 0.029285477013781286, + "language_loss": 0.8579312, + "learning_rate": 0.0004551980763271005, + "loss": 0.86837852, + "num_input_tokens_seen": 235460976, + "router_z_loss_mlp": 0.41259766, + "step": 2821, + "time_per_iteration": 2.650609254837036 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050376, + "balance_loss_mlp": 1.00917685, + "epoch": 0.5429011158137745, + "flos": 679709549568.0, + "grad_norm": 0.038877958454501954, + "language_loss": 0.84286433, + "learning_rate": 0.0004548877959176756, + "loss": 0.8533681, + "num_input_tokens_seen": 235540912, + "router_z_loss_mlp": 0.41210938, + "step": 2822, + "time_per_iteration": 2.831773042678833 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049844, + "balance_loss_mlp": 1.00857341, + "epoch": 0.5430934974990381, + "flos": 541968142080.0, + "grad_norm": 0.03541809911924704, + "language_loss": 0.8707608, + "learning_rate": 0.00045457753302236166, + "loss": 0.8812592, + "num_input_tokens_seen": 235608736, + "router_z_loss_mlp": 0.4128418, + "step": 2823, + "time_per_iteration": 2.609090805053711 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048491, + "balance_loss_mlp": 1.00726891, + "epoch": 0.5432858791843016, + "flos": 659644072704.0, + "grad_norm": 0.03671475643697152, + "language_loss": 0.87739956, + "learning_rate": 0.00045426728776161353, + "loss": 0.8878845, + "num_input_tokens_seen": 235678720, + "router_z_loss_mlp": 0.41235352, + "step": 2824, + "time_per_iteration": 2.802915334701538 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046992, + "balance_loss_mlp": 1.00574553, + "epoch": 0.5434782608695652, + "flos": 532967741952.0, + "grad_norm": 0.03427907044877429, + "language_loss": 0.82057846, + "learning_rate": 0.00045395706025587863, + "loss": 0.83104837, + "num_input_tokens_seen": 235748704, + "router_z_loss_mlp": 0.41259766, + "step": 2825, + "time_per_iteration": 2.6308939456939697 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043048, + "balance_loss_mlp": 1.00194418, + "epoch": 0.5436706425548288, + "flos": 609633964800.0, + "grad_norm": 0.034616126048734014, + "language_loss": 0.8290934, + "learning_rate": 0.00045364685062559843, + "loss": 0.83952391, + "num_input_tokens_seen": 235828224, + "router_z_loss_mlp": 0.41113281, + "step": 2826, + "time_per_iteration": 2.8231375217437744 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047689, + "balance_loss_mlp": 1.006657, + "epoch": 0.5438630242400924, + "flos": 706773933312.0, + "grad_norm": 0.03098010756730768, + "language_loss": 0.92170852, + "learning_rate": 0.0004533366589912067, + "loss": 0.93218541, + "num_input_tokens_seen": 235909392, + "router_z_loss_mlp": 0.41040039, + "step": 2827, + "time_per_iteration": 2.9529805183410645 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105042, + "balance_loss_mlp": 1.00912547, + "epoch": 0.544055405925356, + "flos": 857839513344.0, + "grad_norm": 0.036966152235284246, + "language_loss": 0.78087002, + "learning_rate": 0.0004530264854731306, + "loss": 0.79137421, + "num_input_tokens_seen": 235983888, + "router_z_loss_mlp": 0.41308594, + "step": 2828, + "time_per_iteration": 3.0584123134613037 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050357, + "balance_loss_mlp": 1.00913441, + "epoch": 0.5442477876106194, + "flos": 572968438272.0, + "grad_norm": 0.03388858680916364, + "language_loss": 0.84792554, + "learning_rate": 0.00045271633019179034, + "loss": 0.85842907, + "num_input_tokens_seen": 236063056, + "router_z_loss_mlp": 0.41235352, + "step": 2829, + "time_per_iteration": 2.827160596847534 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046647, + "balance_loss_mlp": 1.00532901, + "epoch": 0.544440169295883, + "flos": 626803643136.0, + "grad_norm": 0.02947280635893411, + "language_loss": 0.88373405, + "learning_rate": 0.0004524061932675986, + "loss": 0.89420056, + "num_input_tokens_seen": 236141104, + "router_z_loss_mlp": 0.41333008, + "step": 2830, + "time_per_iteration": 2.8206188678741455 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048863, + "balance_loss_mlp": 1.00768852, + "epoch": 0.5446325509811466, + "flos": 837641833728.0, + "grad_norm": 0.03760239902604625, + "language_loss": 0.87454915, + "learning_rate": 0.00045209607482096125, + "loss": 0.88503784, + "num_input_tokens_seen": 236220320, + "router_z_loss_mlp": 0.41186523, + "step": 2831, + "time_per_iteration": 3.0359649658203125 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047225, + "balance_loss_mlp": 1.00600255, + "epoch": 0.5448249326664102, + "flos": 484390465536.0, + "grad_norm": 0.03560900416786153, + "language_loss": 0.8480038, + "learning_rate": 0.0004517859749722772, + "loss": 0.85847604, + "num_input_tokens_seen": 236288208, + "router_z_loss_mlp": 0.41235352, + "step": 2832, + "time_per_iteration": 2.689295768737793 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050991, + "balance_loss_mlp": 1.00972044, + "epoch": 0.5450173143516738, + "flos": 562346623488.0, + "grad_norm": 0.03426430427633819, + "language_loss": 0.79531574, + "learning_rate": 0.0004514758938419376, + "loss": 0.80582559, + "num_input_tokens_seen": 236366864, + "router_z_loss_mlp": 0.4128418, + "step": 2833, + "time_per_iteration": 2.8727176189422607 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049419, + "balance_loss_mlp": 1.00965118, + "epoch": 0.5452096960369373, + "flos": 1473588761856.0, + "grad_norm": 0.014550980978032766, + "language_loss": 0.76920587, + "learning_rate": 0.0004511658315503268, + "loss": 0.77970004, + "num_input_tokens_seen": 236597120, + "router_z_loss_mlp": 0.39746094, + "step": 2834, + "time_per_iteration": 4.9399590492248535 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046862, + "balance_loss_mlp": 1.00556791, + "epoch": 0.5454020777222008, + "flos": 466018334976.0, + "grad_norm": 0.03248736316688099, + "language_loss": 0.84558713, + "learning_rate": 0.00045085578821782175, + "loss": 0.85605574, + "num_input_tokens_seen": 236664192, + "router_z_loss_mlp": 0.41308594, + "step": 2835, + "time_per_iteration": 2.5900182723999023 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01057518, + "balance_loss_mlp": 1.01784515, + "epoch": 0.5455944594074644, + "flos": 1472617667328.0, + "grad_norm": 0.013168056581512213, + "language_loss": 0.76134741, + "learning_rate": 0.0004505457639647917, + "loss": 0.77192259, + "num_input_tokens_seen": 236888784, + "router_z_loss_mlp": 0.39648438, + "step": 2836, + "time_per_iteration": 4.910645961761475 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052223, + "balance_loss_mlp": 1.01100063, + "epoch": 0.545786841092728, + "flos": 534305309184.0, + "grad_norm": 0.02738620901632673, + "language_loss": 0.81102663, + "learning_rate": 0.00045023575891159866, + "loss": 0.82154894, + "num_input_tokens_seen": 236962528, + "router_z_loss_mlp": 0.41235352, + "step": 2837, + "time_per_iteration": 2.7457492351531982 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046684, + "balance_loss_mlp": 1.00682068, + "epoch": 0.5459792227779915, + "flos": 1355428740096.0, + "grad_norm": 0.008010480990562174, + "language_loss": 0.74763811, + "learning_rate": 0.00044992577317859764, + "loss": 0.75810492, + "num_input_tokens_seen": 237179360, + "router_z_loss_mlp": 0.3984375, + "step": 2838, + "time_per_iteration": 4.94202995300293 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048729, + "balance_loss_mlp": 1.00748277, + "epoch": 0.5461716044632551, + "flos": 639073117440.0, + "grad_norm": 0.02877585305336934, + "language_loss": 0.78956163, + "learning_rate": 0.0004496158068861354, + "loss": 0.80004895, + "num_input_tokens_seen": 237256240, + "router_z_loss_mlp": 0.41259766, + "step": 2839, + "time_per_iteration": 2.808370590209961 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047651, + "balance_loss_mlp": 1.00642872, + "epoch": 0.5463639861485187, + "flos": 603926938368.0, + "grad_norm": 0.03433602558833516, + "language_loss": 0.81297666, + "learning_rate": 0.00044930586015455207, + "loss": 0.82345319, + "num_input_tokens_seen": 237334272, + "router_z_loss_mlp": 0.41235352, + "step": 2840, + "time_per_iteration": 2.782735824584961 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048126, + "balance_loss_mlp": 1.00695133, + "epoch": 0.5465563678337823, + "flos": 643753635840.0, + "grad_norm": 0.02662038136573285, + "language_loss": 0.89087546, + "learning_rate": 0.000448995933104179, + "loss": 0.9013567, + "num_input_tokens_seen": 237415408, + "router_z_loss_mlp": 0.41186523, + "step": 2841, + "time_per_iteration": 2.869476318359375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050304, + "balance_loss_mlp": 1.0090816, + "epoch": 0.5467487495190458, + "flos": 615365290752.0, + "grad_norm": 0.03719587304070891, + "language_loss": 0.80725658, + "learning_rate": 0.00044868602585534077, + "loss": 0.81775963, + "num_input_tokens_seen": 237493232, + "router_z_loss_mlp": 0.41235352, + "step": 2842, + "time_per_iteration": 2.843027353286743 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046698, + "balance_loss_mlp": 1.00552344, + "epoch": 0.5469411312043093, + "flos": 462128109312.0, + "grad_norm": 0.03959126806850753, + "language_loss": 0.89450765, + "learning_rate": 0.0004483761385283541, + "loss": 0.90497464, + "num_input_tokens_seen": 237556624, + "router_z_loss_mlp": 0.41186523, + "step": 2843, + "time_per_iteration": 2.5162315368652344 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044431, + "balance_loss_mlp": 1.00332797, + "epoch": 0.5471335128895729, + "flos": 562267888896.0, + "grad_norm": 0.03475490738980998, + "language_loss": 0.82207608, + "learning_rate": 0.0004480662712435281, + "loss": 0.83252037, + "num_input_tokens_seen": 237632048, + "router_z_loss_mlp": 0.41113281, + "step": 2844, + "time_per_iteration": 2.7367589473724365 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045579, + "balance_loss_mlp": 1.0045476, + "epoch": 0.5473258945748365, + "flos": 519686343936.0, + "grad_norm": 0.032685207895773144, + "language_loss": 0.8903448, + "learning_rate": 0.0004477564241211635, + "loss": 0.90080059, + "num_input_tokens_seen": 237699840, + "router_z_loss_mlp": 0.41040039, + "step": 2845, + "time_per_iteration": 2.6059961318969727 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047529, + "balance_loss_mlp": 1.00640178, + "epoch": 0.5475182762601001, + "flos": 434744884992.0, + "grad_norm": 0.035185291050346845, + "language_loss": 0.87463105, + "learning_rate": 0.0004474465972815541, + "loss": 0.88510644, + "num_input_tokens_seen": 237762560, + "router_z_loss_mlp": 0.41137695, + "step": 2846, + "time_per_iteration": 2.5159108638763428 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049143, + "balance_loss_mlp": 1.00808775, + "epoch": 0.5477106579453636, + "flos": 512574676224.0, + "grad_norm": 0.03033857724648134, + "language_loss": 0.88145, + "learning_rate": 0.000447136790844985, + "loss": 0.89194143, + "num_input_tokens_seen": 237837152, + "router_z_loss_mlp": 0.41064453, + "step": 2847, + "time_per_iteration": 2.7494916915893555 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049923, + "balance_loss_mlp": 1.00889075, + "epoch": 0.5479030396306271, + "flos": 677141339904.0, + "grad_norm": 0.030728657632270156, + "language_loss": 0.81529921, + "learning_rate": 0.00044682700493173385, + "loss": 0.82579845, + "num_input_tokens_seen": 237909488, + "router_z_loss_mlp": 0.41040039, + "step": 2848, + "time_per_iteration": 2.8558499813079834 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043616, + "balance_loss_mlp": 1.00260758, + "epoch": 0.5480954213158907, + "flos": 877579346688.0, + "grad_norm": 0.03576262257130289, + "language_loss": 0.80969125, + "learning_rate": 0.00044651723966207004, + "loss": 0.82012743, + "num_input_tokens_seen": 237991056, + "router_z_loss_mlp": 0.41015625, + "step": 2849, + "time_per_iteration": 3.1599223613739014 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048693, + "balance_loss_mlp": 1.00768459, + "epoch": 0.5482878030011543, + "flos": 623175877632.0, + "grad_norm": 0.0450385792128453, + "language_loss": 0.79220605, + "learning_rate": 0.00044620749515625536, + "loss": 0.80269301, + "num_input_tokens_seen": 238064576, + "router_z_loss_mlp": 0.41015625, + "step": 2850, + "time_per_iteration": 2.816164255142212 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044849, + "balance_loss_mlp": 1.00376952, + "epoch": 0.5484801846864179, + "flos": 498258021888.0, + "grad_norm": 0.033687612572946876, + "language_loss": 0.85353971, + "learning_rate": 0.00044589777153454334, + "loss": 0.86398828, + "num_input_tokens_seen": 238136464, + "router_z_loss_mlp": 0.41088867, + "step": 2851, + "time_per_iteration": 2.767086982727051 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042735, + "balance_loss_mlp": 1.00158429, + "epoch": 0.5486725663716814, + "flos": 443354512896.0, + "grad_norm": 0.032917884516517996, + "language_loss": 0.84102762, + "learning_rate": 0.00044558806891717895, + "loss": 0.85145497, + "num_input_tokens_seen": 238198912, + "router_z_loss_mlp": 0.41162109, + "step": 2852, + "time_per_iteration": 2.4791274070739746 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046594, + "balance_loss_mlp": 1.00560999, + "epoch": 0.548864948056945, + "flos": 656348753664.0, + "grad_norm": 0.02926310360240776, + "language_loss": 0.80048501, + "learning_rate": 0.0004452783874243998, + "loss": 0.81095093, + "num_input_tokens_seen": 238275184, + "router_z_loss_mlp": 0.40991211, + "step": 2853, + "time_per_iteration": 2.8510489463806152 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051891, + "balance_loss_mlp": 1.01100183, + "epoch": 0.5490573297422086, + "flos": 547141499904.0, + "grad_norm": 0.035598285504377866, + "language_loss": 0.85552013, + "learning_rate": 0.00044496872717643475, + "loss": 0.86603898, + "num_input_tokens_seen": 238348496, + "router_z_loss_mlp": 0.40893555, + "step": 2854, + "time_per_iteration": 2.6640069484710693 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107375, + "balance_loss_mlp": 1.03398132, + "epoch": 0.5492497114274721, + "flos": 1593763882752.0, + "grad_norm": 0.015003928091872471, + "language_loss": 0.77089292, + "learning_rate": 0.00044465908829350453, + "loss": 0.7816304, + "num_input_tokens_seen": 238578464, + "router_z_loss_mlp": 0.39746094, + "step": 2855, + "time_per_iteration": 4.924941778182983 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048775, + "balance_loss_mlp": 1.00791013, + "epoch": 0.5494420931127356, + "flos": 752270718720.0, + "grad_norm": 0.03382110809465603, + "language_loss": 0.82668245, + "learning_rate": 0.0004443494708958217, + "loss": 0.83717024, + "num_input_tokens_seen": 238660256, + "router_z_loss_mlp": 0.40869141, + "step": 2856, + "time_per_iteration": 2.9736838340759277 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049194, + "balance_loss_mlp": 1.00837672, + "epoch": 0.5496344747979992, + "flos": 627305230848.0, + "grad_norm": 0.02827813290363101, + "language_loss": 0.81289691, + "learning_rate": 0.0004440398751035906, + "loss": 0.82338881, + "num_input_tokens_seen": 238745856, + "router_z_loss_mlp": 0.40820312, + "step": 2857, + "time_per_iteration": 2.943936347961426 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053668, + "balance_loss_mlp": 1.01289868, + "epoch": 0.5498268564832628, + "flos": 524125789440.0, + "grad_norm": 0.04150845511788398, + "language_loss": 0.8407867, + "learning_rate": 0.00044373030103700645, + "loss": 0.85132337, + "num_input_tokens_seen": 238813888, + "router_z_loss_mlp": 0.40771484, + "step": 2858, + "time_per_iteration": 2.5977840423583984 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047985, + "balance_loss_mlp": 1.00719178, + "epoch": 0.5500192381685264, + "flos": 605778732288.0, + "grad_norm": 0.03313045470580536, + "language_loss": 0.80440414, + "learning_rate": 0.000443420748816257, + "loss": 0.81488407, + "num_input_tokens_seen": 238885440, + "router_z_loss_mlp": 0.40795898, + "step": 2859, + "time_per_iteration": 2.7645347118377686 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049935, + "balance_loss_mlp": 1.00914145, + "epoch": 0.55021161985379, + "flos": 521655756288.0, + "grad_norm": 0.037659665058523445, + "language_loss": 0.79047614, + "learning_rate": 0.0004431112185615208, + "loss": 0.8009755, + "num_input_tokens_seen": 238960944, + "router_z_loss_mlp": 0.40795898, + "step": 2860, + "time_per_iteration": 2.7862706184387207 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043913, + "balance_loss_mlp": 1.00302446, + "epoch": 0.5504040015390534, + "flos": 490655460096.0, + "grad_norm": 0.03348154415794888, + "language_loss": 0.8037793, + "learning_rate": 0.00044280171039296845, + "loss": 0.8142184, + "num_input_tokens_seen": 239030592, + "router_z_loss_mlp": 0.40893555, + "step": 2861, + "time_per_iteration": 2.6561086177825928 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052487, + "balance_loss_mlp": 1.01166964, + "epoch": 0.550596383224317, + "flos": 576862554624.0, + "grad_norm": 0.03513860333112342, + "language_loss": 0.88868964, + "learning_rate": 0.0004424922244307616, + "loss": 0.89921451, + "num_input_tokens_seen": 239097440, + "router_z_loss_mlp": 0.40820312, + "step": 2862, + "time_per_iteration": 2.7066099643707275 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053826, + "balance_loss_mlp": 1.01298499, + "epoch": 0.5507887649095806, + "flos": 643634072064.0, + "grad_norm": 0.03653258974946179, + "language_loss": 0.82663441, + "learning_rate": 0.00044218276079505315, + "loss": 0.83717263, + "num_input_tokens_seen": 239179872, + "router_z_loss_mlp": 0.40844727, + "step": 2863, + "time_per_iteration": 2.87058162689209 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049384, + "balance_loss_mlp": 1.00856698, + "epoch": 0.5509811465948442, + "flos": 532865674752.0, + "grad_norm": 0.034931125724459874, + "language_loss": 0.75083911, + "learning_rate": 0.0004418733196059876, + "loss": 0.76133299, + "num_input_tokens_seen": 239251264, + "router_z_loss_mlp": 0.40820312, + "step": 2864, + "time_per_iteration": 2.690927743911743 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048154, + "balance_loss_mlp": 1.00719357, + "epoch": 0.5511735282801077, + "flos": 655984226304.0, + "grad_norm": 0.03582782743987034, + "language_loss": 0.80482149, + "learning_rate": 0.0004415639009837008, + "loss": 0.81530309, + "num_input_tokens_seen": 239326688, + "router_z_loss_mlp": 0.40966797, + "step": 2865, + "time_per_iteration": 2.8515002727508545 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050322, + "balance_loss_mlp": 1.00948107, + "epoch": 0.5513659099653713, + "flos": 530610469632.0, + "grad_norm": 0.03216902856467023, + "language_loss": 0.82250589, + "learning_rate": 0.00044125450504831955, + "loss": 0.83300906, + "num_input_tokens_seen": 239401248, + "router_z_loss_mlp": 0.40844727, + "step": 2866, + "time_per_iteration": 2.743833303451538 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053087, + "balance_loss_mlp": 1.01229346, + "epoch": 0.5515582916506349, + "flos": 555974704128.0, + "grad_norm": 0.03636447949545943, + "language_loss": 0.827411, + "learning_rate": 0.0004409451319199622, + "loss": 0.83794183, + "num_input_tokens_seen": 239471600, + "router_z_loss_mlp": 0.40795898, + "step": 2867, + "time_per_iteration": 2.654466390609741 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045403, + "balance_loss_mlp": 1.00439477, + "epoch": 0.5517506733358984, + "flos": 736772999424.0, + "grad_norm": 0.03752588301556939, + "language_loss": 0.85160595, + "learning_rate": 0.0004406357817187381, + "loss": 0.86206001, + "num_input_tokens_seen": 239548592, + "router_z_loss_mlp": 0.41015625, + "step": 2868, + "time_per_iteration": 2.9610273838043213 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051499, + "balance_loss_mlp": 1.01065779, + "epoch": 0.551943055021162, + "flos": 1117190818560.0, + "grad_norm": 0.028811275091252902, + "language_loss": 0.81857193, + "learning_rate": 0.0004403264545647474, + "loss": 0.8290869, + "num_input_tokens_seen": 239644432, + "router_z_loss_mlp": 0.40844727, + "step": 2869, + "time_per_iteration": 3.511462450027466 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043012, + "balance_loss_mlp": 1.00195587, + "epoch": 0.5521354367064255, + "flos": 545502588672.0, + "grad_norm": 0.03184831617373855, + "language_loss": 0.85004073, + "learning_rate": 0.00044001715057808154, + "loss": 0.86047089, + "num_input_tokens_seen": 239723392, + "router_z_loss_mlp": 0.41064453, + "step": 2870, + "time_per_iteration": 2.744248390197754 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048495, + "balance_loss_mlp": 1.00746286, + "epoch": 0.5523278183916891, + "flos": 937872986880.0, + "grad_norm": 0.03348956391566461, + "language_loss": 0.81933939, + "learning_rate": 0.0004397078698788232, + "loss": 0.82982433, + "num_input_tokens_seen": 239806896, + "router_z_loss_mlp": 0.41040039, + "step": 2871, + "time_per_iteration": 3.193040132522583 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052254, + "balance_loss_mlp": 1.01277161, + "epoch": 0.5525202000769527, + "flos": 1469101684224.0, + "grad_norm": 0.00853782264427079, + "language_loss": 0.80442369, + "learning_rate": 0.0004393986125870456, + "loss": 0.81494617, + "num_input_tokens_seen": 240037824, + "router_z_loss_mlp": 0.39453125, + "step": 2872, + "time_per_iteration": 4.887877702713013 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050207, + "balance_loss_mlp": 1.00917542, + "epoch": 0.5527125817622163, + "flos": 490785717504.0, + "grad_norm": 0.036240955421061, + "language_loss": 0.78392744, + "learning_rate": 0.00043908937882281343, + "loss": 0.79442948, + "num_input_tokens_seen": 240107952, + "router_z_loss_mlp": 0.41040039, + "step": 2873, + "time_per_iteration": 2.5992209911346436 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045201, + "balance_loss_mlp": 1.00414526, + "epoch": 0.5529049634474797, + "flos": 636149128704.0, + "grad_norm": 0.03461125376652938, + "language_loss": 0.82969832, + "learning_rate": 0.0004387801687061814, + "loss": 0.84015036, + "num_input_tokens_seen": 240183824, + "router_z_loss_mlp": 0.41064453, + "step": 2874, + "time_per_iteration": 2.8166332244873047 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045216, + "balance_loss_mlp": 1.00408852, + "epoch": 0.5530973451327433, + "flos": 582435432960.0, + "grad_norm": 0.031639900781256135, + "language_loss": 0.81371784, + "learning_rate": 0.0004384709823571958, + "loss": 0.82416999, + "num_input_tokens_seen": 240259296, + "router_z_loss_mlp": 0.41137695, + "step": 2875, + "time_per_iteration": 2.7777786254882812 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045687, + "balance_loss_mlp": 1.00458348, + "epoch": 0.5532897268180069, + "flos": 1124330676480.0, + "grad_norm": 0.03430168550584483, + "language_loss": 0.83714402, + "learning_rate": 0.0004381618198958932, + "loss": 0.84760094, + "num_input_tokens_seen": 240346768, + "router_z_loss_mlp": 0.41113281, + "step": 2876, + "time_per_iteration": 3.517432451248169 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046562, + "balance_loss_mlp": 1.00536335, + "epoch": 0.5534821085032705, + "flos": 638513203968.0, + "grad_norm": 0.03082674119581989, + "language_loss": 0.83886576, + "learning_rate": 0.00043785268144230137, + "loss": 0.84933138, + "num_input_tokens_seen": 240429344, + "router_z_loss_mlp": 0.41210938, + "step": 2877, + "time_per_iteration": 2.9488272666931152 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048169, + "balance_loss_mlp": 1.0069226, + "epoch": 0.5536744901885341, + "flos": 572217029376.0, + "grad_norm": 0.037462471463683845, + "language_loss": 0.8303535, + "learning_rate": 0.00043754356711643837, + "loss": 0.84083521, + "num_input_tokens_seen": 240497008, + "router_z_loss_mlp": 0.41259766, + "step": 2878, + "time_per_iteration": 2.669304370880127 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045925, + "balance_loss_mlp": 1.00479829, + "epoch": 0.5538668718737976, + "flos": 596917337856.0, + "grad_norm": 0.03146432649645385, + "language_loss": 0.84558415, + "learning_rate": 0.0004372344770383132, + "loss": 0.8560434, + "num_input_tokens_seen": 240578432, + "router_z_loss_mlp": 0.41137695, + "step": 2879, + "time_per_iteration": 2.855231761932373 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050888, + "balance_loss_mlp": 1.0097847, + "epoch": 0.5540592535590612, + "flos": 533719150848.0, + "grad_norm": 0.0358528854453713, + "language_loss": 0.83432066, + "learning_rate": 0.00043692541132792507, + "loss": 0.84482956, + "num_input_tokens_seen": 240649136, + "router_z_loss_mlp": 0.41113281, + "step": 2880, + "time_per_iteration": 2.662008047103882 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051879, + "balance_loss_mlp": 1.01070428, + "epoch": 0.5542516352443247, + "flos": 413505146112.0, + "grad_norm": 0.035032849721931915, + "language_loss": 0.83894408, + "learning_rate": 0.00043661637010526384, + "loss": 0.84946287, + "num_input_tokens_seen": 240714240, + "router_z_loss_mlp": 0.41186523, + "step": 2881, + "time_per_iteration": 2.507699489593506 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104607, + "balance_loss_mlp": 1.00484717, + "epoch": 0.5544440169295883, + "flos": 548678343936.0, + "grad_norm": 0.03314086611141918, + "language_loss": 0.83246458, + "learning_rate": 0.00043630735349031025, + "loss": 0.84292531, + "num_input_tokens_seen": 240786928, + "router_z_loss_mlp": 0.41235352, + "step": 2882, + "time_per_iteration": 2.70409893989563 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047409, + "balance_loss_mlp": 1.00623393, + "epoch": 0.5546363986148518, + "flos": 623034926592.0, + "grad_norm": 0.03282028788454341, + "language_loss": 0.82495463, + "learning_rate": 0.00043599836160303495, + "loss": 0.83542871, + "num_input_tokens_seen": 240865328, + "router_z_loss_mlp": 0.41186523, + "step": 2883, + "time_per_iteration": 2.900757312774658 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046682, + "balance_loss_mlp": 1.00550687, + "epoch": 0.5548287803001154, + "flos": 706580492544.0, + "grad_norm": 0.029978122278870225, + "language_loss": 0.78110325, + "learning_rate": 0.0004356893945633995, + "loss": 0.79157007, + "num_input_tokens_seen": 240945680, + "router_z_loss_mlp": 0.41186523, + "step": 2884, + "time_per_iteration": 2.975062608718872 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046147, + "balance_loss_mlp": 1.00501966, + "epoch": 0.555021161985379, + "flos": 505184997120.0, + "grad_norm": 0.033025085572570244, + "language_loss": 0.82143605, + "learning_rate": 0.0004353804524913551, + "loss": 0.83189756, + "num_input_tokens_seen": 241010800, + "router_z_loss_mlp": 0.41137695, + "step": 2885, + "time_per_iteration": 2.6369645595550537 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046205, + "balance_loss_mlp": 1.00512528, + "epoch": 0.5552135436706426, + "flos": 617210281728.0, + "grad_norm": 0.0369840001422722, + "language_loss": 0.82350749, + "learning_rate": 0.0004350715355068441, + "loss": 0.83396947, + "num_input_tokens_seen": 241085328, + "router_z_loss_mlp": 0.41088867, + "step": 2886, + "time_per_iteration": 2.727186441421509 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044964, + "balance_loss_mlp": 1.00393176, + "epoch": 0.5554059253559062, + "flos": 464817828096.0, + "grad_norm": 0.043659618464352824, + "language_loss": 0.80073905, + "learning_rate": 0.00043476264372979847, + "loss": 0.8111887, + "num_input_tokens_seen": 241149600, + "router_z_loss_mlp": 0.41040039, + "step": 2887, + "time_per_iteration": 2.5368049144744873 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044535, + "balance_loss_mlp": 1.00357509, + "epoch": 0.5555983070411696, + "flos": 1564876885248.0, + "grad_norm": 0.03408551435207337, + "language_loss": 0.79322737, + "learning_rate": 0.0004344537772801408, + "loss": 0.80367273, + "num_input_tokens_seen": 241244832, + "router_z_loss_mlp": 0.40966797, + "step": 2888, + "time_per_iteration": 3.869920015335083 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01057491, + "balance_loss_mlp": 1.01791382, + "epoch": 0.5557906887264332, + "flos": 1471229544192.0, + "grad_norm": 0.014769088101488215, + "language_loss": 0.73422456, + "learning_rate": 0.0004341449362777836, + "loss": 0.74479944, + "num_input_tokens_seen": 241479728, + "router_z_loss_mlp": 0.39550781, + "step": 2889, + "time_per_iteration": 4.936699867248535 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047136, + "balance_loss_mlp": 1.00608003, + "epoch": 0.5559830704116968, + "flos": 530864181504.0, + "grad_norm": 0.0376436874687178, + "language_loss": 0.83696067, + "learning_rate": 0.0004338361208426298, + "loss": 0.84743202, + "num_input_tokens_seen": 241545616, + "router_z_loss_mlp": 0.41064453, + "step": 2890, + "time_per_iteration": 2.6094541549682617 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051706, + "balance_loss_mlp": 1.01069844, + "epoch": 0.5561754520969604, + "flos": 652519766016.0, + "grad_norm": 0.029226912064567154, + "language_loss": 0.81876659, + "learning_rate": 0.00043352733109457164, + "loss": 0.82928365, + "num_input_tokens_seen": 241629040, + "router_z_loss_mlp": 0.41015625, + "step": 2891, + "time_per_iteration": 2.8833718299865723 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050883, + "balance_loss_mlp": 1.00985098, + "epoch": 0.556367833782224, + "flos": 735620124672.0, + "grad_norm": 0.029092214279724596, + "language_loss": 0.84975475, + "learning_rate": 0.00043321856715349244, + "loss": 0.86026359, + "num_input_tokens_seen": 241706272, + "router_z_loss_mlp": 0.41040039, + "step": 2892, + "time_per_iteration": 2.9798240661621094 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046881, + "balance_loss_mlp": 1.00575387, + "epoch": 0.5565602154674875, + "flos": 673641886464.0, + "grad_norm": 0.03553967461394851, + "language_loss": 0.81101406, + "learning_rate": 0.00043290982913926466, + "loss": 0.8214829, + "num_input_tokens_seen": 241782304, + "router_z_loss_mlp": 0.41137695, + "step": 2893, + "time_per_iteration": 2.8139491081237793 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045687, + "balance_loss_mlp": 1.00463176, + "epoch": 0.556752597152751, + "flos": 587504778240.0, + "grad_norm": 0.036653967015968944, + "language_loss": 0.84921324, + "learning_rate": 0.0004326011171717514, + "loss": 0.85967016, + "num_input_tokens_seen": 241868576, + "router_z_loss_mlp": 0.41064453, + "step": 2894, + "time_per_iteration": 2.9087953567504883 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046707, + "balance_loss_mlp": 1.00555551, + "epoch": 0.5569449788380146, + "flos": 438691491072.0, + "grad_norm": 0.03515530628910635, + "language_loss": 0.81422639, + "learning_rate": 0.0004322924313708051, + "loss": 0.82469344, + "num_input_tokens_seen": 241933696, + "router_z_loss_mlp": 0.41162109, + "step": 2895, + "time_per_iteration": 2.529937505722046 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051552, + "balance_loss_mlp": 1.01054382, + "epoch": 0.5571373605232782, + "flos": 503248632576.0, + "grad_norm": 0.03724847922393753, + "language_loss": 0.84896851, + "learning_rate": 0.0004319837718562681, + "loss": 0.85948396, + "num_input_tokens_seen": 242003056, + "router_z_loss_mlp": 0.41015625, + "step": 2896, + "time_per_iteration": 2.6142115592956543 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047151, + "balance_loss_mlp": 1.00599957, + "epoch": 0.5573297422085417, + "flos": 578590894080.0, + "grad_norm": 0.04905398235042313, + "language_loss": 0.83417499, + "learning_rate": 0.0004316751387479726, + "loss": 0.84464645, + "num_input_tokens_seen": 242076368, + "router_z_loss_mlp": 0.41162109, + "step": 2897, + "time_per_iteration": 2.7738893032073975 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046687, + "balance_loss_mlp": 1.00555933, + "epoch": 0.5575221238938053, + "flos": 1346049251328.0, + "grad_norm": 0.03588075887117774, + "language_loss": 0.82779884, + "learning_rate": 0.0004313665321657409, + "loss": 0.83826572, + "num_input_tokens_seen": 242161600, + "router_z_loss_mlp": 0.41137695, + "step": 2898, + "time_per_iteration": 3.725510835647583 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047713, + "balance_loss_mlp": 1.00672877, + "epoch": 0.5577145055790689, + "flos": 603099707136.0, + "grad_norm": 0.03720848090960627, + "language_loss": 0.80283779, + "learning_rate": 0.00043105795222938436, + "loss": 0.81331486, + "num_input_tokens_seen": 242237904, + "router_z_loss_mlp": 0.40991211, + "step": 2899, + "time_per_iteration": 2.7282700538635254 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049139, + "balance_loss_mlp": 1.00829744, + "epoch": 0.5579068872643325, + "flos": 563691972096.0, + "grad_norm": 0.03568825250494595, + "language_loss": 0.79214776, + "learning_rate": 0.00043074939905870467, + "loss": 0.80263913, + "num_input_tokens_seen": 242306736, + "router_z_loss_mlp": 0.40844727, + "step": 2900, + "time_per_iteration": 2.696354389190674 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104484, + "balance_loss_mlp": 1.00399923, + "epoch": 0.558099268949596, + "flos": 545589104640.0, + "grad_norm": 0.04035642488371941, + "language_loss": 0.81151342, + "learning_rate": 0.0004304408727734927, + "loss": 0.82196188, + "num_input_tokens_seen": 242376000, + "router_z_loss_mlp": 0.40844727, + "step": 2901, + "time_per_iteration": 2.6394877433776855 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044507, + "balance_loss_mlp": 1.00366592, + "epoch": 0.5582916506348595, + "flos": 553853647104.0, + "grad_norm": 0.036813902208390564, + "language_loss": 0.89428526, + "learning_rate": 0.0004301323734935288, + "loss": 0.90473032, + "num_input_tokens_seen": 242447056, + "router_z_loss_mlp": 0.40844727, + "step": 2902, + "time_per_iteration": 2.659945249557495 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047193, + "balance_loss_mlp": 1.00635207, + "epoch": 0.5584840323201231, + "flos": 544425536256.0, + "grad_norm": 0.03290970227186249, + "language_loss": 0.87933898, + "learning_rate": 0.000429823901338583, + "loss": 0.88981086, + "num_input_tokens_seen": 242514400, + "router_z_loss_mlp": 0.40844727, + "step": 2903, + "time_per_iteration": 2.643388032913208 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045236, + "balance_loss_mlp": 1.00432324, + "epoch": 0.5586764140053867, + "flos": 817023246336.0, + "grad_norm": 0.03162840926526219, + "language_loss": 0.87249023, + "learning_rate": 0.00042951545642841513, + "loss": 0.88294262, + "num_input_tokens_seen": 242601616, + "router_z_loss_mlp": 0.40917969, + "step": 2904, + "time_per_iteration": 3.0901763439178467 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047509, + "balance_loss_mlp": 1.00642967, + "epoch": 0.5588687956906503, + "flos": 487416521472.0, + "grad_norm": 0.02951660315659268, + "language_loss": 0.87151515, + "learning_rate": 0.0004292070388827737, + "loss": 0.88199031, + "num_input_tokens_seen": 242669648, + "router_z_loss_mlp": 0.41088867, + "step": 2905, + "time_per_iteration": 2.6241614818573 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050702, + "balance_loss_mlp": 1.00967062, + "epoch": 0.5590611773759138, + "flos": 453069383424.0, + "grad_norm": 0.03428125950398782, + "language_loss": 0.81863332, + "learning_rate": 0.00042889864882139753, + "loss": 0.82914031, + "num_input_tokens_seen": 242737456, + "router_z_loss_mlp": 0.41040039, + "step": 2906, + "time_per_iteration": 2.6295247077941895 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051332, + "balance_loss_mlp": 1.01025224, + "epoch": 0.5592535590611774, + "flos": 521957100288.0, + "grad_norm": 0.03203389874594117, + "language_loss": 0.82458705, + "learning_rate": 0.0004285902863640139, + "loss": 0.83510035, + "num_input_tokens_seen": 242807008, + "router_z_loss_mlp": 0.41088867, + "step": 2907, + "time_per_iteration": 2.6310994625091553 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044647, + "balance_loss_mlp": 1.00366294, + "epoch": 0.5594459407464409, + "flos": 553601880576.0, + "grad_norm": 0.029509403523767207, + "language_loss": 0.86282808, + "learning_rate": 0.00042828195163033966, + "loss": 0.87327456, + "num_input_tokens_seen": 242877328, + "router_z_loss_mlp": 0.40991211, + "step": 2908, + "time_per_iteration": 2.720059871673584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104389, + "balance_loss_mlp": 1.00285828, + "epoch": 0.5596383224317045, + "flos": 485788303872.0, + "grad_norm": 0.032784621074408576, + "language_loss": 0.796462, + "learning_rate": 0.0004279736447400812, + "loss": 0.80690086, + "num_input_tokens_seen": 242943152, + "router_z_loss_mlp": 0.41040039, + "step": 2909, + "time_per_iteration": 2.562958240509033 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044175, + "balance_loss_mlp": 1.00323904, + "epoch": 0.5598307041169681, + "flos": 612380064000.0, + "grad_norm": 0.03125271468065307, + "language_loss": 0.78822809, + "learning_rate": 0.00042766536581293385, + "loss": 0.79866982, + "num_input_tokens_seen": 243014656, + "router_z_loss_mlp": 0.40942383, + "step": 2910, + "time_per_iteration": 2.742727041244507 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104403, + "balance_loss_mlp": 1.00297463, + "epoch": 0.5600230858022316, + "flos": 489917657088.0, + "grad_norm": 0.033084161668713065, + "language_loss": 0.80192208, + "learning_rate": 0.0004273571149685819, + "loss": 0.81236243, + "num_input_tokens_seen": 243089040, + "router_z_loss_mlp": 0.41064453, + "step": 2911, + "time_per_iteration": 2.7333109378814697 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01041877, + "balance_loss_mlp": 1.00091636, + "epoch": 0.5602154674874952, + "flos": 599982277632.0, + "grad_norm": 0.033670817346998394, + "language_loss": 0.84396589, + "learning_rate": 0.00042704889232669937, + "loss": 0.8543846, + "num_input_tokens_seen": 243162480, + "router_z_loss_mlp": 0.40966797, + "step": 2912, + "time_per_iteration": 2.7085225582122803 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044153, + "balance_loss_mlp": 1.00336003, + "epoch": 0.5604078491727588, + "flos": 587063461632.0, + "grad_norm": 0.043754524068974454, + "language_loss": 0.8611334, + "learning_rate": 0.0004267406980069484, + "loss": 0.87157494, + "num_input_tokens_seen": 243232880, + "router_z_loss_mlp": 0.40795898, + "step": 2913, + "time_per_iteration": 2.747812271118164 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043762, + "balance_loss_mlp": 1.00275385, + "epoch": 0.5606002308580224, + "flos": 542328778752.0, + "grad_norm": 0.02876490223829942, + "language_loss": 0.7993964, + "learning_rate": 0.0004264325321289808, + "loss": 0.80983406, + "num_input_tokens_seen": 243309168, + "router_z_loss_mlp": 0.41015625, + "step": 2914, + "time_per_iteration": 2.8028316497802734 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043869, + "balance_loss_mlp": 1.0028609, + "epoch": 0.5607926125432858, + "flos": 585079464960.0, + "grad_norm": 0.03419971609404561, + "language_loss": 0.86714381, + "learning_rate": 0.00042612439481243736, + "loss": 0.87758255, + "num_input_tokens_seen": 243382064, + "router_z_loss_mlp": 0.41015625, + "step": 2915, + "time_per_iteration": 2.7691102027893066 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045259, + "balance_loss_mlp": 1.00417948, + "epoch": 0.5609849942285494, + "flos": 628631137536.0, + "grad_norm": 0.0372312942186238, + "language_loss": 0.90099525, + "learning_rate": 0.00042581628617694735, + "loss": 0.91144788, + "num_input_tokens_seen": 243452064, + "router_z_loss_mlp": 0.41088867, + "step": 2916, + "time_per_iteration": 2.7420172691345215 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043547, + "balance_loss_mlp": 1.00261009, + "epoch": 0.561177375913813, + "flos": 589455727104.0, + "grad_norm": 0.03338895186153077, + "language_loss": 0.82208467, + "learning_rate": 0.0004255082063421296, + "loss": 0.83252013, + "num_input_tokens_seen": 243525600, + "router_z_loss_mlp": 0.40942383, + "step": 2917, + "time_per_iteration": 2.673243999481201 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042752, + "balance_loss_mlp": 1.0016005, + "epoch": 0.5613697575990766, + "flos": 528144327168.0, + "grad_norm": 0.03066260992789867, + "language_loss": 0.85543269, + "learning_rate": 0.00042520015542759065, + "loss": 0.86586022, + "num_input_tokens_seen": 243605536, + "router_z_loss_mlp": 0.41162109, + "step": 2918, + "time_per_iteration": 2.879850387573242 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104302, + "balance_loss_mlp": 1.00201178, + "epoch": 0.5615621392843402, + "flos": 643875144960.0, + "grad_norm": 0.028477148441929827, + "language_loss": 0.88382292, + "learning_rate": 0.00042489213355292687, + "loss": 0.89425319, + "num_input_tokens_seen": 243684208, + "router_z_loss_mlp": 0.41015625, + "step": 2919, + "time_per_iteration": 2.9279518127441406 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044234, + "balance_loss_mlp": 1.00315475, + "epoch": 0.5617545209696037, + "flos": 428657779968.0, + "grad_norm": 0.03756668389237789, + "language_loss": 0.81703657, + "learning_rate": 0.00042458414083772276, + "loss": 0.82747889, + "num_input_tokens_seen": 243749376, + "router_z_loss_mlp": 0.41088867, + "step": 2920, + "time_per_iteration": 2.5474023818969727 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044671, + "balance_loss_mlp": 1.00371051, + "epoch": 0.5619469026548672, + "flos": 569590493952.0, + "grad_norm": 0.029467937694277743, + "language_loss": 0.85509026, + "learning_rate": 0.000424276177401552, + "loss": 0.86553693, + "num_input_tokens_seen": 243828096, + "router_z_loss_mlp": 0.40966797, + "step": 2921, + "time_per_iteration": 2.797123670578003 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043566, + "balance_loss_mlp": 1.00260556, + "epoch": 0.5621392843401308, + "flos": 506244552960.0, + "grad_norm": 0.03575401527758356, + "language_loss": 0.86372185, + "learning_rate": 0.0004239682433639763, + "loss": 0.87415743, + "num_input_tokens_seen": 243896752, + "router_z_loss_mlp": 0.40966797, + "step": 2922, + "time_per_iteration": 2.6631922721862793 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043748, + "balance_loss_mlp": 1.00281191, + "epoch": 0.5623316660253944, + "flos": 518010494208.0, + "grad_norm": 0.03518251960287723, + "language_loss": 0.86062789, + "learning_rate": 0.0004236603388445467, + "loss": 0.87106532, + "num_input_tokens_seen": 243964592, + "router_z_loss_mlp": 0.40942383, + "step": 2923, + "time_per_iteration": 2.60380482673645 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044966, + "balance_loss_mlp": 1.00410116, + "epoch": 0.5625240477106579, + "flos": 607139632128.0, + "grad_norm": 0.03089029411800112, + "language_loss": 0.82301855, + "learning_rate": 0.00042335246396280166, + "loss": 0.8334682, + "num_input_tokens_seen": 244036656, + "router_z_loss_mlp": 0.40869141, + "step": 2924, + "time_per_iteration": 2.7605555057525635 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045584, + "balance_loss_mlp": 1.00462389, + "epoch": 0.5627164293959215, + "flos": 451341043968.0, + "grad_norm": 0.04701230911743114, + "language_loss": 0.91272092, + "learning_rate": 0.0004230446188382693, + "loss": 0.92317677, + "num_input_tokens_seen": 244102704, + "router_z_loss_mlp": 0.40966797, + "step": 2925, + "time_per_iteration": 2.5571765899658203 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042518, + "balance_loss_mlp": 1.00158191, + "epoch": 0.5629088110811851, + "flos": 743437514496.0, + "grad_norm": 0.0349005963329915, + "language_loss": 0.81125653, + "learning_rate": 0.0004227368035904654, + "loss": 0.82168174, + "num_input_tokens_seen": 244186640, + "router_z_loss_mlp": 0.40942383, + "step": 2926, + "time_per_iteration": 3.0334270000457764 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043071, + "balance_loss_mlp": 1.00211096, + "epoch": 0.5631011927664487, + "flos": 497980010496.0, + "grad_norm": 0.0467260030557379, + "language_loss": 0.83361161, + "learning_rate": 0.00042242901833889474, + "loss": 0.84404236, + "num_input_tokens_seen": 244257680, + "router_z_loss_mlp": 0.40966797, + "step": 2927, + "time_per_iteration": 2.6271822452545166 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042546, + "balance_loss_mlp": 1.00153816, + "epoch": 0.5632935744517122, + "flos": 887595561216.0, + "grad_norm": 0.03653524957968277, + "language_loss": 0.8629514, + "learning_rate": 0.0004221212632030501, + "loss": 0.87337685, + "num_input_tokens_seen": 244331248, + "router_z_loss_mlp": 0.41015625, + "step": 2928, + "time_per_iteration": 3.1174416542053223 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046315, + "balance_loss_mlp": 1.00542605, + "epoch": 0.5634859561369757, + "flos": 605902186752.0, + "grad_norm": 0.04110669316721802, + "language_loss": 0.80746865, + "learning_rate": 0.0004218135383024124, + "loss": 0.81793177, + "num_input_tokens_seen": 244403920, + "router_z_loss_mlp": 0.40893555, + "step": 2929, + "time_per_iteration": 2.705615758895874 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01041411, + "balance_loss_mlp": 1.00056946, + "epoch": 0.5636783378222393, + "flos": 454903680768.0, + "grad_norm": 0.0339470495466753, + "language_loss": 0.85614669, + "learning_rate": 0.0004215058437564511, + "loss": 0.86656082, + "num_input_tokens_seen": 244470464, + "router_z_loss_mlp": 0.40844727, + "step": 2930, + "time_per_iteration": 2.5682146549224854 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01040814, + "balance_loss_mlp": 1.00006831, + "epoch": 0.5638707195075029, + "flos": 519462767616.0, + "grad_norm": 0.03372410984042782, + "language_loss": 0.82691574, + "learning_rate": 0.00042119817968462397, + "loss": 0.83732378, + "num_input_tokens_seen": 244536864, + "router_z_loss_mlp": 0.4074707, + "step": 2931, + "time_per_iteration": 2.6308341026306152 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105222, + "balance_loss_mlp": 1.01135468, + "epoch": 0.5640631011927665, + "flos": 565845110016.0, + "grad_norm": 0.03794773284405352, + "language_loss": 0.87544155, + "learning_rate": 0.0004208905462063766, + "loss": 0.88596374, + "num_input_tokens_seen": 244603344, + "router_z_loss_mlp": 0.40869141, + "step": 2932, + "time_per_iteration": 2.6615707874298096 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049556, + "balance_loss_mlp": 1.00866711, + "epoch": 0.56425548287803, + "flos": 518038684416.0, + "grad_norm": 0.03232798556838129, + "language_loss": 0.84722394, + "learning_rate": 0.00042058294344114315, + "loss": 0.85771948, + "num_input_tokens_seen": 244671984, + "router_z_loss_mlp": 0.40893555, + "step": 2933, + "time_per_iteration": 2.6182868480682373 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049116, + "balance_loss_mlp": 1.0083226, + "epoch": 0.5644478645632935, + "flos": 855670824192.0, + "grad_norm": 0.03170317888214056, + "language_loss": 0.78432804, + "learning_rate": 0.0004202753715083456, + "loss": 0.79481918, + "num_input_tokens_seen": 244754000, + "router_z_loss_mlp": 0.40795898, + "step": 2934, + "time_per_iteration": 3.0613481998443604 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045, + "balance_loss_mlp": 1.00420666, + "epoch": 0.5646402462485571, + "flos": 554496185856.0, + "grad_norm": 0.03929055225526713, + "language_loss": 0.81611717, + "learning_rate": 0.0004199678305273936, + "loss": 0.82656717, + "num_input_tokens_seen": 244820896, + "router_z_loss_mlp": 0.40795898, + "step": 2935, + "time_per_iteration": 2.634765386581421 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046418, + "balance_loss_mlp": 1.00552905, + "epoch": 0.5648326279338207, + "flos": 687312111360.0, + "grad_norm": 0.02956036273454178, + "language_loss": 0.8172124, + "learning_rate": 0.0004196603206176854, + "loss": 0.82767659, + "num_input_tokens_seen": 244904464, + "router_z_loss_mlp": 0.40893555, + "step": 2936, + "time_per_iteration": 2.9358084201812744 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048743, + "balance_loss_mlp": 1.00783014, + "epoch": 0.5650250096190843, + "flos": 804683785728.0, + "grad_norm": 0.03257366451462874, + "language_loss": 0.84142041, + "learning_rate": 0.000419352841898607, + "loss": 0.85190785, + "num_input_tokens_seen": 244983760, + "router_z_loss_mlp": 0.40917969, + "step": 2937, + "time_per_iteration": 2.9652152061462402 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049809, + "balance_loss_mlp": 1.00891984, + "epoch": 0.5652173913043478, + "flos": 583145045760.0, + "grad_norm": 0.037245032295536384, + "language_loss": 0.7792089, + "learning_rate": 0.000419045394489532, + "loss": 0.78970701, + "num_input_tokens_seen": 245053184, + "router_z_loss_mlp": 0.40893555, + "step": 2938, + "time_per_iteration": 2.6814448833465576 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048464, + "balance_loss_mlp": 1.00752795, + "epoch": 0.5654097729896114, + "flos": 822168413952.0, + "grad_norm": 0.03166469527574581, + "language_loss": 0.76863134, + "learning_rate": 0.0004187379785098224, + "loss": 0.77911597, + "num_input_tokens_seen": 245137408, + "router_z_loss_mlp": 0.40942383, + "step": 2939, + "time_per_iteration": 3.1437690258026123 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049709, + "balance_loss_mlp": 1.00881994, + "epoch": 0.565602154674875, + "flos": 785482478592.0, + "grad_norm": 0.035451368889273006, + "language_loss": 0.84531581, + "learning_rate": 0.00041843059407882744, + "loss": 0.85581291, + "num_input_tokens_seen": 245215504, + "router_z_loss_mlp": 0.40893555, + "step": 2940, + "time_per_iteration": 2.9561386108398438 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046361, + "balance_loss_mlp": 1.00554383, + "epoch": 0.5657945363601385, + "flos": 550744965888.0, + "grad_norm": 0.033205673863039784, + "language_loss": 0.83385015, + "learning_rate": 0.0004181232413158842, + "loss": 0.84431374, + "num_input_tokens_seen": 245286032, + "router_z_loss_mlp": 0.40820312, + "step": 2941, + "time_per_iteration": 2.6476027965545654 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047072, + "balance_loss_mlp": 1.0061357, + "epoch": 0.5659869180454021, + "flos": 669332698368.0, + "grad_norm": 0.03636978251075169, + "language_loss": 0.83073509, + "learning_rate": 0.0004178159203403179, + "loss": 0.84120584, + "num_input_tokens_seen": 245359040, + "router_z_loss_mlp": 0.40942383, + "step": 2942, + "time_per_iteration": 2.835840940475464 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049418, + "balance_loss_mlp": 1.00862479, + "epoch": 0.5661792997306656, + "flos": 500949686016.0, + "grad_norm": 0.030415094414242012, + "language_loss": 0.8213833, + "learning_rate": 0.0004175086312714409, + "loss": 0.83187747, + "num_input_tokens_seen": 245426384, + "router_z_loss_mlp": 0.40795898, + "step": 2943, + "time_per_iteration": 2.6258370876312256 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104981, + "balance_loss_mlp": 1.00911188, + "epoch": 0.5663716814159292, + "flos": 602363849472.0, + "grad_norm": 0.030374801338140925, + "language_loss": 0.84196591, + "learning_rate": 0.00041720137422855366, + "loss": 0.85246402, + "num_input_tokens_seen": 245501216, + "router_z_loss_mlp": 0.40698242, + "step": 2944, + "time_per_iteration": 2.753483772277832 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050206, + "balance_loss_mlp": 1.00948393, + "epoch": 0.5665640631011928, + "flos": 542033270784.0, + "grad_norm": 0.0327328941542846, + "language_loss": 0.79511452, + "learning_rate": 0.00041689414933094383, + "loss": 0.80561656, + "num_input_tokens_seen": 245571600, + "router_z_loss_mlp": 0.40722656, + "step": 2945, + "time_per_iteration": 2.6251614093780518 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047738, + "balance_loss_mlp": 1.00701642, + "epoch": 0.5667564447864564, + "flos": 603062768640.0, + "grad_norm": 0.03650681858880775, + "language_loss": 0.81631696, + "learning_rate": 0.00041658695669788653, + "loss": 0.82679439, + "num_input_tokens_seen": 245645632, + "router_z_loss_mlp": 0.40722656, + "step": 2946, + "time_per_iteration": 2.7196879386901855 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045027, + "balance_loss_mlp": 1.00432932, + "epoch": 0.5669488264717198, + "flos": 660723070464.0, + "grad_norm": 0.039783949444703086, + "language_loss": 0.82089484, + "learning_rate": 0.00041627979644864453, + "loss": 0.83134508, + "num_input_tokens_seen": 245715776, + "router_z_loss_mlp": 0.40698242, + "step": 2947, + "time_per_iteration": 2.8414080142974854 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043086, + "balance_loss_mlp": 1.00243521, + "epoch": 0.5671412081569834, + "flos": 486383210496.0, + "grad_norm": 0.029571262892964766, + "language_loss": 0.81883216, + "learning_rate": 0.0004159726687024683, + "loss": 0.82926297, + "num_input_tokens_seen": 245785328, + "router_z_loss_mlp": 0.40649414, + "step": 2948, + "time_per_iteration": 2.6365981101989746 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043624, + "balance_loss_mlp": 1.0029496, + "epoch": 0.567333589842247, + "flos": 731061115392.0, + "grad_norm": 0.03568675680792695, + "language_loss": 0.79577011, + "learning_rate": 0.00041566557357859506, + "loss": 0.80620635, + "num_input_tokens_seen": 245858000, + "router_z_loss_mlp": 0.40673828, + "step": 2949, + "time_per_iteration": 2.8660199642181396 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046952, + "balance_loss_mlp": 1.00618231, + "epoch": 0.5675259715275106, + "flos": 970559826432.0, + "grad_norm": 0.03148848509964497, + "language_loss": 0.79963183, + "learning_rate": 0.0004153585111962502, + "loss": 0.81010127, + "num_input_tokens_seen": 245950640, + "router_z_loss_mlp": 0.40771484, + "step": 2950, + "time_per_iteration": 3.284973382949829 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049042, + "balance_loss_mlp": 1.00824845, + "epoch": 0.5677183532127742, + "flos": 566214494976.0, + "grad_norm": 0.035222224981726044, + "language_loss": 0.84893769, + "learning_rate": 0.0004150514816746453, + "loss": 0.85942811, + "num_input_tokens_seen": 246019568, + "router_z_loss_mlp": 0.40795898, + "step": 2951, + "time_per_iteration": 2.688965082168579 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053398, + "balance_loss_mlp": 1.0126282, + "epoch": 0.5679107348980377, + "flos": 552746459136.0, + "grad_norm": 0.03211470229094595, + "language_loss": 0.86231828, + "learning_rate": 0.0004147444851329802, + "loss": 0.87285221, + "num_input_tokens_seen": 246089520, + "router_z_loss_mlp": 0.40771484, + "step": 2952, + "time_per_iteration": 2.654975175857544 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050037, + "balance_loss_mlp": 1.00929093, + "epoch": 0.5681031165833013, + "flos": 820841540352.0, + "grad_norm": 0.031520082579240216, + "language_loss": 0.86395264, + "learning_rate": 0.00041443752169044126, + "loss": 0.87445295, + "num_input_tokens_seen": 246165920, + "router_z_loss_mlp": 0.4074707, + "step": 2953, + "time_per_iteration": 2.9978690147399902 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044738, + "balance_loss_mlp": 1.00384951, + "epoch": 0.5682954982685648, + "flos": 619146646272.0, + "grad_norm": 0.031195671435834585, + "language_loss": 0.85214126, + "learning_rate": 0.0004141305914662025, + "loss": 0.86258864, + "num_input_tokens_seen": 246238672, + "router_z_loss_mlp": 0.40893555, + "step": 2954, + "time_per_iteration": 2.7177786827087402 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052939, + "balance_loss_mlp": 1.01214588, + "epoch": 0.5684878799538284, + "flos": 649252637184.0, + "grad_norm": 0.03230481359903608, + "language_loss": 0.81020069, + "learning_rate": 0.0004138236945794246, + "loss": 0.82073009, + "num_input_tokens_seen": 246320208, + "router_z_loss_mlp": 0.40795898, + "step": 2955, + "time_per_iteration": 2.8862104415893555 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051516, + "balance_loss_mlp": 1.01065099, + "epoch": 0.5686802616390919, + "flos": 807354062592.0, + "grad_norm": 0.038353041221636526, + "language_loss": 0.84374332, + "learning_rate": 0.00041351683114925576, + "loss": 0.85425854, + "num_input_tokens_seen": 246406464, + "router_z_loss_mlp": 0.40869141, + "step": 2956, + "time_per_iteration": 3.0500295162200928 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052157, + "balance_loss_mlp": 1.01126814, + "epoch": 0.5688726433243555, + "flos": 548176756224.0, + "grad_norm": 0.03189027766628176, + "language_loss": 0.87115657, + "learning_rate": 0.0004132100012948308, + "loss": 0.8816781, + "num_input_tokens_seen": 246477456, + "router_z_loss_mlp": 0.40893555, + "step": 2957, + "time_per_iteration": 2.6317861080169678 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104687, + "balance_loss_mlp": 1.00593376, + "epoch": 0.5690650250096191, + "flos": 487546778880.0, + "grad_norm": 0.03605588885155363, + "language_loss": 0.84833193, + "learning_rate": 0.00041290320513527145, + "loss": 0.85880065, + "num_input_tokens_seen": 246541744, + "router_z_loss_mlp": 0.40942383, + "step": 2958, + "time_per_iteration": 2.567070960998535 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010482, + "balance_loss_mlp": 1.00733471, + "epoch": 0.5692574066948827, + "flos": 578555900928.0, + "grad_norm": 0.030752617047449367, + "language_loss": 0.85344827, + "learning_rate": 0.0004125964427896867, + "loss": 0.86393028, + "num_input_tokens_seen": 246611440, + "router_z_loss_mlp": 0.40869141, + "step": 2959, + "time_per_iteration": 2.672534704208374 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047545, + "balance_loss_mlp": 1.00663245, + "epoch": 0.5694497883801463, + "flos": 455220576000.0, + "grad_norm": 0.04229544295686443, + "language_loss": 0.79680836, + "learning_rate": 0.0004122897143771723, + "loss": 0.80728376, + "num_input_tokens_seen": 246676496, + "router_z_loss_mlp": 0.40917969, + "step": 2960, + "time_per_iteration": 2.545262575149536 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046303, + "balance_loss_mlp": 1.00534308, + "epoch": 0.5696421700654097, + "flos": 560583290880.0, + "grad_norm": 0.03127363894209499, + "language_loss": 0.82077289, + "learning_rate": 0.0004119830200168109, + "loss": 0.83123589, + "num_input_tokens_seen": 246746464, + "router_z_loss_mlp": 0.40966797, + "step": 2961, + "time_per_iteration": 2.663581609725952 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045878, + "balance_loss_mlp": 1.00510836, + "epoch": 0.5698345517506733, + "flos": 466502426112.0, + "grad_norm": 0.0350478630821908, + "language_loss": 0.89062726, + "learning_rate": 0.0004116763598276714, + "loss": 0.90108603, + "num_input_tokens_seen": 246811808, + "router_z_loss_mlp": 0.40771484, + "step": 2962, + "time_per_iteration": 2.521552801132202 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047207, + "balance_loss_mlp": 1.00641382, + "epoch": 0.5700269334359369, + "flos": 607192121856.0, + "grad_norm": 0.031424704719117534, + "language_loss": 0.81706619, + "learning_rate": 0.00041136973392881017, + "loss": 0.82753831, + "num_input_tokens_seen": 246890432, + "router_z_loss_mlp": 0.40795898, + "step": 2963, + "time_per_iteration": 2.91904878616333 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043706, + "balance_loss_mlp": 1.00296056, + "epoch": 0.5702193151212005, + "flos": 563857222656.0, + "grad_norm": 0.03326860309508315, + "language_loss": 0.82831907, + "learning_rate": 0.00041106314243926983, + "loss": 0.83875614, + "num_input_tokens_seen": 246959616, + "router_z_loss_mlp": 0.4074707, + "step": 2964, + "time_per_iteration": 2.7399420738220215 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044055, + "balance_loss_mlp": 1.00340486, + "epoch": 0.570411696806464, + "flos": 524310481920.0, + "grad_norm": 0.03332690132244082, + "language_loss": 0.8800739, + "learning_rate": 0.0004107565854780798, + "loss": 0.89051443, + "num_input_tokens_seen": 247030656, + "router_z_loss_mlp": 0.40649414, + "step": 2965, + "time_per_iteration": 2.6200034618377686 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046373, + "balance_loss_mlp": 1.00565064, + "epoch": 0.5706040784917276, + "flos": 719473063680.0, + "grad_norm": 0.03436086388372073, + "language_loss": 0.81524932, + "learning_rate": 0.000410450063164256, + "loss": 0.82571304, + "num_input_tokens_seen": 247105872, + "router_z_loss_mlp": 0.40722656, + "step": 2966, + "time_per_iteration": 2.8336212635040283 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048641, + "balance_loss_mlp": 1.00787103, + "epoch": 0.5707964601769911, + "flos": 477671515392.0, + "grad_norm": 0.03782244517116874, + "language_loss": 0.82540762, + "learning_rate": 0.00041014357561680115, + "loss": 0.83589399, + "num_input_tokens_seen": 247170448, + "router_z_loss_mlp": 0.40771484, + "step": 2967, + "time_per_iteration": 2.5143654346466064 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047818, + "balance_loss_mlp": 1.00714386, + "epoch": 0.5709888418622547, + "flos": 581217429504.0, + "grad_norm": 0.030421169355448613, + "language_loss": 0.86193347, + "learning_rate": 0.0004098371229547039, + "loss": 0.87241161, + "num_input_tokens_seen": 247240400, + "router_z_loss_mlp": 0.40673828, + "step": 2968, + "time_per_iteration": 2.6610617637634277 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01057983, + "balance_loss_mlp": 1.01869202, + "epoch": 0.5711812235475183, + "flos": 1583195536128.0, + "grad_norm": 0.0076189717983582966, + "language_loss": 0.80010808, + "learning_rate": 0.0004095307052969399, + "loss": 0.8106879, + "num_input_tokens_seen": 247469136, + "router_z_loss_mlp": 0.39257812, + "step": 2969, + "time_per_iteration": 4.76263952255249 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048674, + "balance_loss_mlp": 1.00790465, + "epoch": 0.5713736052327818, + "flos": 469498346496.0, + "grad_norm": 0.03484927048715074, + "language_loss": 0.80634308, + "learning_rate": 0.00040922432276247107, + "loss": 0.81682986, + "num_input_tokens_seen": 247537712, + "router_z_loss_mlp": 0.40771484, + "step": 2970, + "time_per_iteration": 2.5514628887176514 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046224, + "balance_loss_mlp": 1.0054065, + "epoch": 0.5715659869180454, + "flos": 538755448320.0, + "grad_norm": 0.029079861926461517, + "language_loss": 0.84918243, + "learning_rate": 0.0004089179754702457, + "loss": 0.85964465, + "num_input_tokens_seen": 247613872, + "router_z_loss_mlp": 0.40820312, + "step": 2971, + "time_per_iteration": 2.749539613723755 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044804, + "balance_loss_mlp": 1.00396252, + "epoch": 0.571758368603309, + "flos": 657251807232.0, + "grad_norm": 0.03418066993480882, + "language_loss": 0.80556142, + "learning_rate": 0.00040861166353919843, + "loss": 0.81600946, + "num_input_tokens_seen": 247686064, + "router_z_loss_mlp": 0.40844727, + "step": 2972, + "time_per_iteration": 2.814680814743042 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052102, + "balance_loss_mlp": 1.011356, + "epoch": 0.5719507502885726, + "flos": 669100373760.0, + "grad_norm": 0.031053974574008693, + "language_loss": 0.82602715, + "learning_rate": 0.00040830538708824983, + "loss": 0.83654815, + "num_input_tokens_seen": 247760384, + "router_z_loss_mlp": 0.4074707, + "step": 2973, + "time_per_iteration": 2.904085636138916 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050783, + "balance_loss_mlp": 1.01018071, + "epoch": 0.572143131973836, + "flos": 477280743168.0, + "grad_norm": 0.03419925971016847, + "language_loss": 0.82092619, + "learning_rate": 0.000407999146236307, + "loss": 0.83143401, + "num_input_tokens_seen": 247824768, + "router_z_loss_mlp": 0.40600586, + "step": 2974, + "time_per_iteration": 2.549262046813965 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051203, + "balance_loss_mlp": 1.01062381, + "epoch": 0.5723355136590996, + "flos": 540535310592.0, + "grad_norm": 0.03597856382327793, + "language_loss": 0.83747095, + "learning_rate": 0.0004076929411022634, + "loss": 0.847983, + "num_input_tokens_seen": 247894448, + "router_z_loss_mlp": 0.40576172, + "step": 2975, + "time_per_iteration": 2.602869987487793 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053058, + "balance_loss_mlp": 1.01235974, + "epoch": 0.5725278953443632, + "flos": 825650370816.0, + "grad_norm": 0.037415312483521146, + "language_loss": 0.8006742, + "learning_rate": 0.0004073867718049982, + "loss": 0.81120479, + "num_input_tokens_seen": 247976432, + "router_z_loss_mlp": 0.40698242, + "step": 2976, + "time_per_iteration": 3.139498472213745 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050411, + "balance_loss_mlp": 1.00966477, + "epoch": 0.5727202770296268, + "flos": 588570170112.0, + "grad_norm": 0.037681082671355684, + "language_loss": 0.83124882, + "learning_rate": 0.00040708063846337704, + "loss": 0.84175301, + "num_input_tokens_seen": 248048800, + "router_z_loss_mlp": 0.4074707, + "step": 2977, + "time_per_iteration": 2.7134242057800293 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050315, + "balance_loss_mlp": 1.00937819, + "epoch": 0.5729126587148904, + "flos": 447941712384.0, + "grad_norm": 0.03249864108633733, + "language_loss": 0.81268066, + "learning_rate": 0.00040677454119625143, + "loss": 0.82318383, + "num_input_tokens_seen": 248116496, + "router_z_loss_mlp": 0.40942383, + "step": 2978, + "time_per_iteration": 2.5775671005249023 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049596, + "balance_loss_mlp": 1.00870752, + "epoch": 0.5731050404001539, + "flos": 520467888384.0, + "grad_norm": 0.034012599703189976, + "language_loss": 0.83670664, + "learning_rate": 0.0004064684801224587, + "loss": 0.84720254, + "num_input_tokens_seen": 248184960, + "router_z_loss_mlp": 0.40893555, + "step": 2979, + "time_per_iteration": 2.6424074172973633 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047576, + "balance_loss_mlp": 1.00675905, + "epoch": 0.5732974220854175, + "flos": 505771155456.0, + "grad_norm": 0.032486782592384814, + "language_loss": 0.80872238, + "learning_rate": 0.00040616245536082224, + "loss": 0.81919813, + "num_input_tokens_seen": 248252208, + "router_z_loss_mlp": 0.40820312, + "step": 2980, + "time_per_iteration": 2.57401704788208 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050822, + "balance_loss_mlp": 1.01000464, + "epoch": 0.573489803770681, + "flos": 593678399232.0, + "grad_norm": 0.028956426653120197, + "language_loss": 0.82143462, + "learning_rate": 0.00040585646703015165, + "loss": 0.8319428, + "num_input_tokens_seen": 248333312, + "router_z_loss_mlp": 0.40820312, + "step": 2981, + "time_per_iteration": 2.828683614730835 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050941, + "balance_loss_mlp": 1.01010036, + "epoch": 0.5736821854559446, + "flos": 490870288128.0, + "grad_norm": 0.04412597729133787, + "language_loss": 0.78605878, + "learning_rate": 0.0004055505152492419, + "loss": 0.79656816, + "num_input_tokens_seen": 248403808, + "router_z_loss_mlp": 0.40844727, + "step": 2982, + "time_per_iteration": 2.640928268432617 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048392, + "balance_loss_mlp": 1.00747919, + "epoch": 0.5738745671412081, + "flos": 459202175232.0, + "grad_norm": 0.034256342510568284, + "language_loss": 0.74769032, + "learning_rate": 0.00040524460013687425, + "loss": 0.7581743, + "num_input_tokens_seen": 248477184, + "router_z_loss_mlp": 0.40917969, + "step": 2983, + "time_per_iteration": 2.7067794799804688 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105312, + "balance_loss_mlp": 1.0123024, + "epoch": 0.5740669488264717, + "flos": 581621807616.0, + "grad_norm": 0.029467935021435916, + "language_loss": 0.81554836, + "learning_rate": 0.0004049387218118155, + "loss": 0.82607955, + "num_input_tokens_seen": 248565552, + "router_z_loss_mlp": 0.40820312, + "step": 2984, + "time_per_iteration": 2.9581944942474365 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045523, + "balance_loss_mlp": 1.00468242, + "epoch": 0.5742593305117353, + "flos": 525574172160.0, + "grad_norm": 0.03631391131249333, + "language_loss": 0.85729742, + "learning_rate": 0.00040463288039281777, + "loss": 0.86775261, + "num_input_tokens_seen": 248635456, + "router_z_loss_mlp": 0.40844727, + "step": 2985, + "time_per_iteration": 2.7224113941192627 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01056683, + "balance_loss_mlp": 1.01729584, + "epoch": 0.5744517121969989, + "flos": 1557269442816.0, + "grad_norm": 0.010841110534864203, + "language_loss": 0.77876419, + "learning_rate": 0.0004043270759986194, + "loss": 0.78933102, + "num_input_tokens_seen": 248870160, + "router_z_loss_mlp": 0.39355469, + "step": 2986, + "time_per_iteration": 5.064981698989868 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051641, + "balance_loss_mlp": 1.01089525, + "epoch": 0.5746440938822625, + "flos": 753203907840.0, + "grad_norm": 0.045288596232844924, + "language_loss": 0.82885808, + "learning_rate": 0.0004040213087479444, + "loss": 0.83937448, + "num_input_tokens_seen": 248946960, + "router_z_loss_mlp": 0.4074707, + "step": 2987, + "time_per_iteration": 2.98020601272583 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043481, + "balance_loss_mlp": 1.00266409, + "epoch": 0.5748364755675259, + "flos": 502857860352.0, + "grad_norm": 0.036149920431262125, + "language_loss": 0.85748988, + "learning_rate": 0.0004037155787595018, + "loss": 0.86792469, + "num_input_tokens_seen": 249014128, + "router_z_loss_mlp": 0.40820312, + "step": 2988, + "time_per_iteration": 2.5745627880096436 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051033, + "balance_loss_mlp": 1.01026356, + "epoch": 0.5750288572527895, + "flos": 505198603008.0, + "grad_norm": 0.03371383384616788, + "language_loss": 0.81460357, + "learning_rate": 0.000403409886151987, + "loss": 0.82511389, + "num_input_tokens_seen": 249090016, + "router_z_loss_mlp": 0.40771484, + "step": 2989, + "time_per_iteration": 2.9434561729431152 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045067, + "balance_loss_mlp": 1.00558472, + "epoch": 0.5752212389380531, + "flos": 1544678215680.0, + "grad_norm": 0.006920775411585041, + "language_loss": 0.81999105, + "learning_rate": 0.0004031042310440799, + "loss": 0.83044171, + "num_input_tokens_seen": 249305552, + "router_z_loss_mlp": 0.39453125, + "step": 2990, + "time_per_iteration": 4.784885406494141 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104316, + "balance_loss_mlp": 1.00367737, + "epoch": 0.5754136206233167, + "flos": 1570674295296.0, + "grad_norm": 0.003743957088283973, + "language_loss": 0.781986, + "learning_rate": 0.00040279861355444656, + "loss": 0.79241765, + "num_input_tokens_seen": 249523408, + "router_z_loss_mlp": 0.39453125, + "step": 2991, + "time_per_iteration": 4.776461362838745 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049264, + "balance_loss_mlp": 1.00842321, + "epoch": 0.5756060023085803, + "flos": 799562917632.0, + "grad_norm": 0.03045005809397815, + "language_loss": 0.77561808, + "learning_rate": 0.00040249303380173807, + "loss": 0.78611076, + "num_input_tokens_seen": 249616624, + "router_z_loss_mlp": 0.40844727, + "step": 2992, + "time_per_iteration": 3.0843074321746826 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104555, + "balance_loss_mlp": 1.00451803, + "epoch": 0.5757983839938438, + "flos": 589034819328.0, + "grad_norm": 0.034529184723129894, + "language_loss": 0.79738832, + "learning_rate": 0.00040218749190459126, + "loss": 0.8078438, + "num_input_tokens_seen": 249689936, + "router_z_loss_mlp": 0.41040039, + "step": 2993, + "time_per_iteration": 2.7403366565704346 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045195, + "balance_loss_mlp": 1.00428283, + "epoch": 0.5759907656791073, + "flos": 517852046592.0, + "grad_norm": 0.035278528612120996, + "language_loss": 0.82955313, + "learning_rate": 0.00040188198798162775, + "loss": 0.84000504, + "num_input_tokens_seen": 249759984, + "router_z_loss_mlp": 0.40917969, + "step": 2994, + "time_per_iteration": 2.6673707962036133 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048075, + "balance_loss_mlp": 1.00718617, + "epoch": 0.5761831473643709, + "flos": 588290213376.0, + "grad_norm": 0.029287821677584636, + "language_loss": 0.85980493, + "learning_rate": 0.000401576522151455, + "loss": 0.87028569, + "num_input_tokens_seen": 249837888, + "router_z_loss_mlp": 0.40893555, + "step": 2995, + "time_per_iteration": 2.788686513900757 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049047, + "balance_loss_mlp": 1.00815868, + "epoch": 0.5763755290496345, + "flos": 545009749248.0, + "grad_norm": 0.03018415670660867, + "language_loss": 0.8281709, + "learning_rate": 0.0004012710945326651, + "loss": 0.83866143, + "num_input_tokens_seen": 249913584, + "router_z_loss_mlp": 0.40893555, + "step": 2996, + "time_per_iteration": 2.7784581184387207 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047721, + "balance_loss_mlp": 1.00685585, + "epoch": 0.576567910734898, + "flos": 627428685312.0, + "grad_norm": 0.030965553916741433, + "language_loss": 0.81781155, + "learning_rate": 0.0004009657052438355, + "loss": 0.82828873, + "num_input_tokens_seen": 249992144, + "router_z_loss_mlp": 0.40869141, + "step": 2997, + "time_per_iteration": 2.787832498550415 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046757, + "balance_loss_mlp": 1.00593948, + "epoch": 0.5767602924201616, + "flos": 539278423296.0, + "grad_norm": 0.0362963808148575, + "language_loss": 0.86264056, + "learning_rate": 0.00040066035440352904, + "loss": 0.87310815, + "num_input_tokens_seen": 250060736, + "router_z_loss_mlp": 0.40820312, + "step": 2998, + "time_per_iteration": 2.6896724700927734 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045353, + "balance_loss_mlp": 1.00558472, + "epoch": 0.5769526741054252, + "flos": 1563026046720.0, + "grad_norm": 0.005169215201186531, + "language_loss": 0.79293132, + "learning_rate": 0.0004003550421302934, + "loss": 0.8033849, + "num_input_tokens_seen": 250296864, + "router_z_loss_mlp": 0.39746094, + "step": 2999, + "time_per_iteration": 4.891216039657593 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043976, + "balance_loss_mlp": 1.00318265, + "epoch": 0.5771450557906888, + "flos": 469172702976.0, + "grad_norm": 0.037596514401195116, + "language_loss": 0.7668246, + "learning_rate": 0.00040004976854266145, + "loss": 0.77726436, + "num_input_tokens_seen": 250362528, + "router_z_loss_mlp": 0.40795898, + "step": 3000, + "time_per_iteration": 2.51895809173584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045702, + "balance_loss_mlp": 1.00478971, + "epoch": 0.5773374374759523, + "flos": 575633857536.0, + "grad_norm": 0.03248080927364981, + "language_loss": 0.81750363, + "learning_rate": 0.0003997445337591505, + "loss": 0.82796073, + "num_input_tokens_seen": 250432768, + "router_z_loss_mlp": 0.40917969, + "step": 3001, + "time_per_iteration": 2.692239999771118 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048668, + "balance_loss_mlp": 1.0079695, + "epoch": 0.5775298191612158, + "flos": 529505227008.0, + "grad_norm": 0.031913043384180086, + "language_loss": 0.74606609, + "learning_rate": 0.0003994393378982635, + "loss": 0.75655282, + "num_input_tokens_seen": 250501504, + "router_z_loss_mlp": 0.40698242, + "step": 3002, + "time_per_iteration": 2.665146589279175 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053272, + "balance_loss_mlp": 1.01369476, + "epoch": 0.5777222008464794, + "flos": 1306899095808.0, + "grad_norm": 0.010106387724362367, + "language_loss": 0.79538, + "learning_rate": 0.00039913418107848786, + "loss": 0.80591273, + "num_input_tokens_seen": 250733632, + "router_z_loss_mlp": 0.39550781, + "step": 3003, + "time_per_iteration": 4.803764581680298 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104489, + "balance_loss_mlp": 1.00409698, + "epoch": 0.577914582531743, + "flos": 604793053440.0, + "grad_norm": 0.0386937293491606, + "language_loss": 0.88557941, + "learning_rate": 0.0003988290634182961, + "loss": 0.89602828, + "num_input_tokens_seen": 250809152, + "router_z_loss_mlp": 0.40795898, + "step": 3004, + "time_per_iteration": 2.7506465911865234 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050043, + "balance_loss_mlp": 1.00943995, + "epoch": 0.5781069642170066, + "flos": 487833538560.0, + "grad_norm": 0.034765884683499934, + "language_loss": 0.81038988, + "learning_rate": 0.0003985239850361453, + "loss": 0.82089031, + "num_input_tokens_seen": 250879152, + "router_z_loss_mlp": 0.40600586, + "step": 3005, + "time_per_iteration": 2.5988621711730957 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047258, + "balance_loss_mlp": 1.00653589, + "epoch": 0.5782993459022701, + "flos": 507414924288.0, + "grad_norm": 0.036479253397917216, + "language_loss": 0.85073388, + "learning_rate": 0.0003982189460504777, + "loss": 0.86120641, + "num_input_tokens_seen": 250949904, + "router_z_loss_mlp": 0.40722656, + "step": 3006, + "time_per_iteration": 2.694517135620117 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104981, + "balance_loss_mlp": 1.00913548, + "epoch": 0.5784917275875336, + "flos": 603295093248.0, + "grad_norm": 0.03899121610040523, + "language_loss": 0.79739761, + "learning_rate": 0.00039791394657971935, + "loss": 0.80789566, + "num_input_tokens_seen": 251020976, + "router_z_loss_mlp": 0.40673828, + "step": 3007, + "time_per_iteration": 2.694913387298584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044533, + "balance_loss_mlp": 1.00376368, + "epoch": 0.5786841092727972, + "flos": 522588945408.0, + "grad_norm": 0.03653808704233678, + "language_loss": 0.84952617, + "learning_rate": 0.00039760898674228205, + "loss": 0.85997152, + "num_input_tokens_seen": 251093280, + "router_z_loss_mlp": 0.40771484, + "step": 3008, + "time_per_iteration": 2.6486122608184814 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045443, + "balance_loss_mlp": 1.00476897, + "epoch": 0.5788764909580608, + "flos": 768836742144.0, + "grad_norm": 0.02798603221606654, + "language_loss": 0.81355041, + "learning_rate": 0.0003973040666565613, + "loss": 0.82400489, + "num_input_tokens_seen": 251181376, + "router_z_loss_mlp": 0.40673828, + "step": 3009, + "time_per_iteration": 3.029721975326538 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046651, + "balance_loss_mlp": 1.00590491, + "epoch": 0.5790688726433244, + "flos": 600332220672.0, + "grad_norm": 0.03710521046969438, + "language_loss": 0.82796824, + "learning_rate": 0.000396999186440938, + "loss": 0.8384347, + "num_input_tokens_seen": 251256176, + "router_z_loss_mlp": 0.4074707, + "step": 3010, + "time_per_iteration": 2.866637945175171 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048008, + "balance_loss_mlp": 1.00711966, + "epoch": 0.5792612543285879, + "flos": 524106347520.0, + "grad_norm": 0.03822457095680595, + "language_loss": 0.85752803, + "learning_rate": 0.000396694346213777, + "loss": 0.86800808, + "num_input_tokens_seen": 251325344, + "router_z_loss_mlp": 0.40893555, + "step": 3011, + "time_per_iteration": 2.6125171184539795 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045119, + "balance_loss_mlp": 1.00430202, + "epoch": 0.5794536360138515, + "flos": 878080934400.0, + "grad_norm": 0.030461633114119882, + "language_loss": 0.8396455, + "learning_rate": 0.0003963895460934276, + "loss": 0.8500967, + "num_input_tokens_seen": 251406656, + "router_z_loss_mlp": 0.40820312, + "step": 3012, + "time_per_iteration": 3.1341123580932617 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047321, + "balance_loss_mlp": 1.00631309, + "epoch": 0.5796460176991151, + "flos": 402299118336.0, + "grad_norm": 0.04162907217084141, + "language_loss": 0.85323715, + "learning_rate": 0.00039608478619822376, + "loss": 0.86371034, + "num_input_tokens_seen": 251467760, + "router_z_loss_mlp": 0.41015625, + "step": 3013, + "time_per_iteration": 2.45570969581604 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045447, + "balance_loss_mlp": 1.00448704, + "epoch": 0.5798383993843786, + "flos": 619676424192.0, + "grad_norm": 0.02973237056850944, + "language_loss": 0.8328954, + "learning_rate": 0.00039578006664648394, + "loss": 0.84334981, + "num_input_tokens_seen": 251542272, + "router_z_loss_mlp": 0.40966797, + "step": 3014, + "time_per_iteration": 2.796370506286621 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044429, + "balance_loss_mlp": 1.00351644, + "epoch": 0.5800307810696421, + "flos": 845793615360.0, + "grad_norm": 0.037256106488294125, + "language_loss": 0.81995672, + "learning_rate": 0.0003954753875565105, + "loss": 0.83040106, + "num_input_tokens_seen": 251625584, + "router_z_loss_mlp": 0.40917969, + "step": 3015, + "time_per_iteration": 3.0796241760253906 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045336, + "balance_loss_mlp": 1.00442326, + "epoch": 0.5802231627549057, + "flos": 570365235456.0, + "grad_norm": 0.0302253929683373, + "language_loss": 0.82961631, + "learning_rate": 0.00039517074904659057, + "loss": 0.84006965, + "num_input_tokens_seen": 251696704, + "router_z_loss_mlp": 0.40917969, + "step": 3016, + "time_per_iteration": 2.6984057426452637 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105015, + "balance_loss_mlp": 1.00921345, + "epoch": 0.5804155444401693, + "flos": 661663062528.0, + "grad_norm": 0.033398230079863866, + "language_loss": 0.85268873, + "learning_rate": 0.00039486615123499535, + "loss": 0.86319029, + "num_input_tokens_seen": 251774784, + "router_z_loss_mlp": 0.40942383, + "step": 3017, + "time_per_iteration": 2.8348796367645264 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051377, + "balance_loss_mlp": 1.01022601, + "epoch": 0.5806079261254329, + "flos": 515058315264.0, + "grad_norm": 0.030637451118741787, + "language_loss": 0.85653043, + "learning_rate": 0.00039456159423997996, + "loss": 0.86704421, + "num_input_tokens_seen": 251844768, + "router_z_loss_mlp": 0.41162109, + "step": 3018, + "time_per_iteration": 2.6296215057373047 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048366, + "balance_loss_mlp": 1.00740576, + "epoch": 0.5808003078106965, + "flos": 529718109696.0, + "grad_norm": 0.03062870911456177, + "language_loss": 0.90210342, + "learning_rate": 0.00039425707817978406, + "loss": 0.91258705, + "num_input_tokens_seen": 251912736, + "router_z_loss_mlp": 0.40966797, + "step": 3019, + "time_per_iteration": 2.631979465484619 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048286, + "balance_loss_mlp": 1.00720644, + "epoch": 0.58099268949596, + "flos": 477997158912.0, + "grad_norm": 0.03679030272618613, + "language_loss": 0.84110886, + "learning_rate": 0.00039395260317263124, + "loss": 0.85159171, + "num_input_tokens_seen": 251979328, + "router_z_loss_mlp": 0.41088867, + "step": 3020, + "time_per_iteration": 2.584413528442383 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050357, + "balance_loss_mlp": 1.00930095, + "epoch": 0.5811850711812235, + "flos": 518688026112.0, + "grad_norm": 0.03473628129951431, + "language_loss": 0.85378569, + "learning_rate": 0.0003936481693367291, + "loss": 0.86428928, + "num_input_tokens_seen": 252050928, + "router_z_loss_mlp": 0.41064453, + "step": 3021, + "time_per_iteration": 2.6612508296966553 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049465, + "balance_loss_mlp": 1.00833774, + "epoch": 0.5813774528664871, + "flos": 617627298816.0, + "grad_norm": 0.037803518868136904, + "language_loss": 0.88371962, + "learning_rate": 0.0003933437767902697, + "loss": 0.89421427, + "num_input_tokens_seen": 252126496, + "router_z_loss_mlp": 0.41137695, + "step": 3022, + "time_per_iteration": 2.7910103797912598 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045958, + "balance_loss_mlp": 1.00499725, + "epoch": 0.5815698345517507, + "flos": 568604815104.0, + "grad_norm": 0.03314052138705104, + "language_loss": 0.78534555, + "learning_rate": 0.00039303942565142825, + "loss": 0.7958051, + "num_input_tokens_seen": 252203008, + "router_z_loss_mlp": 0.40966797, + "step": 3023, + "time_per_iteration": 2.7066261768341064 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046075, + "balance_loss_mlp": 1.00525796, + "epoch": 0.5817622162370142, + "flos": 564304375296.0, + "grad_norm": 0.034500169077956666, + "language_loss": 0.76946682, + "learning_rate": 0.0003927351160383644, + "loss": 0.77992761, + "num_input_tokens_seen": 252283440, + "router_z_loss_mlp": 0.40820312, + "step": 3024, + "time_per_iteration": 2.785215377807617 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044544, + "balance_loss_mlp": 1.00370252, + "epoch": 0.5819545979222778, + "flos": 460154806272.0, + "grad_norm": 0.03482271460519531, + "language_loss": 0.78468955, + "learning_rate": 0.000392430848069222, + "loss": 0.79513502, + "num_input_tokens_seen": 252351760, + "router_z_loss_mlp": 0.40844727, + "step": 3025, + "time_per_iteration": 2.5435454845428467 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104326, + "balance_loss_mlp": 1.00244236, + "epoch": 0.5821469796075414, + "flos": 542517361920.0, + "grad_norm": 0.03539348008973476, + "language_loss": 0.83090204, + "learning_rate": 0.00039212662186212795, + "loss": 0.8413347, + "num_input_tokens_seen": 252418480, + "router_z_loss_mlp": 0.40820312, + "step": 3026, + "time_per_iteration": 2.6203463077545166 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046114, + "balance_loss_mlp": 1.00534403, + "epoch": 0.582339361292805, + "flos": 553341365760.0, + "grad_norm": 0.030591419392928903, + "language_loss": 0.77452922, + "learning_rate": 0.0003918224375351934, + "loss": 0.78499031, + "num_input_tokens_seen": 252493712, + "router_z_loss_mlp": 0.40771484, + "step": 3027, + "time_per_iteration": 2.700643301010132 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047214, + "balance_loss_mlp": 1.00646877, + "epoch": 0.5825317429780685, + "flos": 497448287232.0, + "grad_norm": 0.03355698207676345, + "language_loss": 0.79253477, + "learning_rate": 0.0003915182952065135, + "loss": 0.80300689, + "num_input_tokens_seen": 252566096, + "router_z_loss_mlp": 0.4074707, + "step": 3028, + "time_per_iteration": 2.693223714828491 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043151, + "balance_loss_mlp": 1.00247645, + "epoch": 0.582724124663332, + "flos": 565255060992.0, + "grad_norm": 0.03374091506860629, + "language_loss": 0.88055015, + "learning_rate": 0.0003912141949941664, + "loss": 0.89098167, + "num_input_tokens_seen": 252639424, + "router_z_loss_mlp": 0.40673828, + "step": 3029, + "time_per_iteration": 2.674584150314331 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043287, + "balance_loss_mlp": 1.00249338, + "epoch": 0.5829165063485956, + "flos": 493112854272.0, + "grad_norm": 0.039605660090179254, + "language_loss": 0.83319384, + "learning_rate": 0.0003909101370162143, + "loss": 0.84362668, + "num_input_tokens_seen": 252706672, + "router_z_loss_mlp": 0.40795898, + "step": 3030, + "time_per_iteration": 2.592111587524414 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046574, + "balance_loss_mlp": 1.00718689, + "epoch": 0.5831088880338592, + "flos": 1531879941888.0, + "grad_norm": 0.006346134957791291, + "language_loss": 0.72433889, + "learning_rate": 0.00039060612139070326, + "loss": 0.73480463, + "num_input_tokens_seen": 252932464, + "router_z_loss_mlp": 0.39355469, + "step": 3031, + "time_per_iteration": 4.929339170455933 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047591, + "balance_loss_mlp": 1.00686908, + "epoch": 0.5833012697191228, + "flos": 619209829632.0, + "grad_norm": 0.03163493287885039, + "language_loss": 0.83241516, + "learning_rate": 0.0003903021482356622, + "loss": 0.8428911, + "num_input_tokens_seen": 253011920, + "router_z_loss_mlp": 0.40722656, + "step": 3032, + "time_per_iteration": 2.7828269004821777 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045762, + "balance_loss_mlp": 1.00508761, + "epoch": 0.5834936514043862, + "flos": 769294588416.0, + "grad_norm": 0.028764675594544035, + "language_loss": 0.83318806, + "learning_rate": 0.00038999821766910465, + "loss": 0.84364575, + "num_input_tokens_seen": 253091552, + "router_z_loss_mlp": 0.40673828, + "step": 3033, + "time_per_iteration": 2.976440906524658 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046889, + "balance_loss_mlp": 1.00616705, + "epoch": 0.5836860330896498, + "flos": 459316881408.0, + "grad_norm": 0.03570453873198092, + "language_loss": 0.86074644, + "learning_rate": 0.00038969432980902606, + "loss": 0.87121534, + "num_input_tokens_seen": 253158608, + "router_z_loss_mlp": 0.40722656, + "step": 3034, + "time_per_iteration": 2.5605523586273193 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049232, + "balance_loss_mlp": 1.00975037, + "epoch": 0.5838784147749134, + "flos": 1364198760960.0, + "grad_norm": 0.006741388763220325, + "language_loss": 0.79784501, + "learning_rate": 0.0003893904847734068, + "loss": 0.80833733, + "num_input_tokens_seen": 253381184, + "router_z_loss_mlp": 0.39453125, + "step": 3035, + "time_per_iteration": 4.870011329650879 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046432, + "balance_loss_mlp": 1.00566232, + "epoch": 0.584070796460177, + "flos": 568289865216.0, + "grad_norm": 0.0320953374409888, + "language_loss": 0.82746142, + "learning_rate": 0.00038908668268020953, + "loss": 0.83792579, + "num_input_tokens_seen": 253452880, + "router_z_loss_mlp": 0.40771484, + "step": 3036, + "time_per_iteration": 2.6482043266296387 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046786, + "balance_loss_mlp": 1.00582528, + "epoch": 0.5842631781454406, + "flos": 612666823680.0, + "grad_norm": 0.032158289179941596, + "language_loss": 0.85682309, + "learning_rate": 0.00038878292364738097, + "loss": 0.86729091, + "num_input_tokens_seen": 253530000, + "router_z_loss_mlp": 0.40966797, + "step": 3037, + "time_per_iteration": 2.7571158409118652 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104852, + "balance_loss_mlp": 1.00758314, + "epoch": 0.5844555598307041, + "flos": 464333736960.0, + "grad_norm": 0.037716829310632, + "language_loss": 0.87422657, + "learning_rate": 0.0003884792077928508, + "loss": 0.88471174, + "num_input_tokens_seen": 253593504, + "router_z_loss_mlp": 0.40942383, + "step": 3038, + "time_per_iteration": 2.5060815811157227 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046212, + "balance_loss_mlp": 1.00522828, + "epoch": 0.5846479415159677, + "flos": 411058445568.0, + "grad_norm": 0.036592459093467214, + "language_loss": 0.77285695, + "learning_rate": 0.0003881755352345322, + "loss": 0.78331912, + "num_input_tokens_seen": 253657904, + "router_z_loss_mlp": 0.40991211, + "step": 3039, + "time_per_iteration": 2.558833360671997 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049516, + "balance_loss_mlp": 1.0084126, + "epoch": 0.5848403232012312, + "flos": 492266181120.0, + "grad_norm": 0.028436591435814704, + "language_loss": 0.87703776, + "learning_rate": 0.0003878719060903207, + "loss": 0.88753295, + "num_input_tokens_seen": 253725280, + "router_z_loss_mlp": 0.41113281, + "step": 3040, + "time_per_iteration": 2.563680410385132 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048574, + "balance_loss_mlp": 1.0073278, + "epoch": 0.5850327048864948, + "flos": 585509121024.0, + "grad_norm": 0.03942000109029475, + "language_loss": 0.8397156, + "learning_rate": 0.0003875683204780961, + "loss": 0.85020131, + "num_input_tokens_seen": 253795040, + "router_z_loss_mlp": 0.41259766, + "step": 3041, + "time_per_iteration": 2.707235336303711 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046188, + "balance_loss_mlp": 1.00506115, + "epoch": 0.5852250865717584, + "flos": 652719042816.0, + "grad_norm": 0.03661913957485838, + "language_loss": 0.85946143, + "learning_rate": 0.00038726477851572043, + "loss": 0.86992323, + "num_input_tokens_seen": 253866384, + "router_z_loss_mlp": 0.41137695, + "step": 3042, + "time_per_iteration": 2.7779452800750732 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048593, + "balance_loss_mlp": 1.00753701, + "epoch": 0.5854174682570219, + "flos": 535620522240.0, + "grad_norm": 0.03519010087747146, + "language_loss": 0.80754662, + "learning_rate": 0.0003869612803210395, + "loss": 0.81803256, + "num_input_tokens_seen": 253935712, + "router_z_loss_mlp": 0.41064453, + "step": 3043, + "time_per_iteration": 2.64778733253479 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051479, + "balance_loss_mlp": 1.01044726, + "epoch": 0.5856098499422855, + "flos": 510759820800.0, + "grad_norm": 0.03494290194274924, + "language_loss": 0.83645654, + "learning_rate": 0.0003866578260118817, + "loss": 0.84697139, + "num_input_tokens_seen": 254003152, + "router_z_loss_mlp": 0.41040039, + "step": 3044, + "time_per_iteration": 2.596379041671753 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049295, + "balance_loss_mlp": 1.00828719, + "epoch": 0.5858022316275491, + "flos": 594993612288.0, + "grad_norm": 0.03849486234726574, + "language_loss": 0.83826196, + "learning_rate": 0.0003863544157060581, + "loss": 0.84875488, + "num_input_tokens_seen": 254072816, + "router_z_loss_mlp": 0.41015625, + "step": 3045, + "time_per_iteration": 2.6666998863220215 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049733, + "balance_loss_mlp": 1.0086298, + "epoch": 0.5859946133128127, + "flos": 560318885376.0, + "grad_norm": 0.02876341489298987, + "language_loss": 0.82639688, + "learning_rate": 0.0003860510495213634, + "loss": 0.83689421, + "num_input_tokens_seen": 254152800, + "router_z_loss_mlp": 0.41113281, + "step": 3046, + "time_per_iteration": 2.865504264831543 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049402, + "balance_loss_mlp": 1.00827503, + "epoch": 0.5861869949980761, + "flos": 554756700672.0, + "grad_norm": 0.0396946944562825, + "language_loss": 0.78689963, + "learning_rate": 0.0003857477275755746, + "loss": 0.79739368, + "num_input_tokens_seen": 254224384, + "router_z_loss_mlp": 0.41137695, + "step": 3047, + "time_per_iteration": 2.624819278717041 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049982, + "balance_loss_mlp": 1.00887823, + "epoch": 0.5863793766833397, + "flos": 720055331328.0, + "grad_norm": 0.02972376125592825, + "language_loss": 0.84339547, + "learning_rate": 0.00038544444998645167, + "loss": 0.85389531, + "num_input_tokens_seen": 254310960, + "router_z_loss_mlp": 0.41113281, + "step": 3048, + "time_per_iteration": 2.990790367126465 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048538, + "balance_loss_mlp": 1.00750625, + "epoch": 0.5865717583686033, + "flos": 473286504960.0, + "grad_norm": 0.034605288898392046, + "language_loss": 0.82032233, + "learning_rate": 0.00038514121687173767, + "loss": 0.83080769, + "num_input_tokens_seen": 254378336, + "router_z_loss_mlp": 0.41040039, + "step": 3049, + "time_per_iteration": 2.596529960632324 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049884, + "balance_loss_mlp": 1.0088284, + "epoch": 0.5867641400538669, + "flos": 814847754240.0, + "grad_norm": 0.03903750410866887, + "language_loss": 0.82380903, + "learning_rate": 0.00038483802834915807, + "loss": 0.83430791, + "num_input_tokens_seen": 254454352, + "router_z_loss_mlp": 0.41064453, + "step": 3050, + "time_per_iteration": 2.9996161460876465 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045889, + "balance_loss_mlp": 1.00480914, + "epoch": 0.5869565217391305, + "flos": 487518588672.0, + "grad_norm": 0.0350404565928551, + "language_loss": 0.79904723, + "learning_rate": 0.00038453488453642074, + "loss": 0.80950606, + "num_input_tokens_seen": 254526352, + "router_z_loss_mlp": 0.41088867, + "step": 3051, + "time_per_iteration": 2.7099759578704834 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047395, + "balance_loss_mlp": 1.00626779, + "epoch": 0.587148903424394, + "flos": 570512989440.0, + "grad_norm": 0.03324549798167153, + "language_loss": 0.8786602, + "learning_rate": 0.00038423178555121697, + "loss": 0.88913417, + "num_input_tokens_seen": 254598720, + "router_z_loss_mlp": 0.41137695, + "step": 3052, + "time_per_iteration": 2.684868097305298 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044815, + "balance_loss_mlp": 1.00359285, + "epoch": 0.5873412851096576, + "flos": 748695442944.0, + "grad_norm": 0.0344494509074348, + "language_loss": 0.86014688, + "learning_rate": 0.00038392873151121994, + "loss": 0.87059504, + "num_input_tokens_seen": 254683664, + "router_z_loss_mlp": 0.41235352, + "step": 3053, + "time_per_iteration": 3.073838949203491 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042019, + "balance_loss_mlp": 1.00079656, + "epoch": 0.5875336667949211, + "flos": 529188331776.0, + "grad_norm": 0.03507235034672983, + "language_loss": 0.83636832, + "learning_rate": 0.0003836257225340859, + "loss": 0.84678853, + "num_input_tokens_seen": 254754688, + "router_z_loss_mlp": 0.41235352, + "step": 3054, + "time_per_iteration": 2.6333680152893066 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104176, + "balance_loss_mlp": 1.000633, + "epoch": 0.5877260484801847, + "flos": 825641622528.0, + "grad_norm": 0.032727897026981576, + "language_loss": 0.82534069, + "learning_rate": 0.00038332275873745336, + "loss": 0.83575833, + "num_input_tokens_seen": 254838976, + "router_z_loss_mlp": 0.41137695, + "step": 3055, + "time_per_iteration": 3.051757335662842 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044126, + "balance_loss_mlp": 1.00292683, + "epoch": 0.5879184301654482, + "flos": 592694665728.0, + "grad_norm": 0.030899230424817493, + "language_loss": 0.83323562, + "learning_rate": 0.0003830198402389431, + "loss": 0.84367692, + "num_input_tokens_seen": 254912912, + "router_z_loss_mlp": 0.41210938, + "step": 3056, + "time_per_iteration": 2.6873278617858887 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043037, + "balance_loss_mlp": 1.00317383, + "epoch": 0.5881108118507118, + "flos": 1549226531328.0, + "grad_norm": 0.008859615514711313, + "language_loss": 0.77348936, + "learning_rate": 0.0003827169671561585, + "loss": 0.78391969, + "num_input_tokens_seen": 255151488, + "router_z_loss_mlp": 0.3984375, + "step": 3057, + "time_per_iteration": 5.044417142868042 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045675, + "balance_loss_mlp": 1.00461972, + "epoch": 0.5883031935359754, + "flos": 490599079680.0, + "grad_norm": 0.03687508634060279, + "language_loss": 0.83287209, + "learning_rate": 0.0003824141396066855, + "loss": 0.84332883, + "num_input_tokens_seen": 255218896, + "router_z_loss_mlp": 0.41064453, + "step": 3058, + "time_per_iteration": 2.57017183303833 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045783, + "balance_loss_mlp": 1.00458455, + "epoch": 0.588495575221239, + "flos": 583981025280.0, + "grad_norm": 0.03543871049956236, + "language_loss": 0.83470112, + "learning_rate": 0.000382111357708092, + "loss": 0.84515893, + "num_input_tokens_seen": 255287408, + "router_z_loss_mlp": 0.41210938, + "step": 3059, + "time_per_iteration": 2.710636615753174 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046053, + "balance_loss_mlp": 1.00492609, + "epoch": 0.5886879569065026, + "flos": 662240472576.0, + "grad_norm": 0.03467029745908185, + "language_loss": 0.84034348, + "learning_rate": 0.00038180862157792864, + "loss": 0.85080403, + "num_input_tokens_seen": 255358432, + "router_z_loss_mlp": 0.41137695, + "step": 3060, + "time_per_iteration": 2.765730619430542 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045299, + "balance_loss_mlp": 1.00429142, + "epoch": 0.588880338591766, + "flos": 563720162304.0, + "grad_norm": 0.034528332603885874, + "language_loss": 0.82661986, + "learning_rate": 0.0003815059313337279, + "loss": 0.83707285, + "num_input_tokens_seen": 255425744, + "router_z_loss_mlp": 0.41015625, + "step": 3061, + "time_per_iteration": 2.6512649059295654 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044327, + "balance_loss_mlp": 1.00339055, + "epoch": 0.5890727202770296, + "flos": 555853195008.0, + "grad_norm": 0.028645191608940447, + "language_loss": 0.78527474, + "learning_rate": 0.00038120328709300436, + "loss": 0.79571807, + "num_input_tokens_seen": 255505808, + "router_z_loss_mlp": 0.40942383, + "step": 3062, + "time_per_iteration": 2.839588165283203 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044224, + "balance_loss_mlp": 1.00321651, + "epoch": 0.5892651019622932, + "flos": 656702587392.0, + "grad_norm": 0.03868775593308096, + "language_loss": 0.83858323, + "learning_rate": 0.0003809006889732549, + "loss": 0.84902555, + "num_input_tokens_seen": 255580160, + "router_z_loss_mlp": 0.41015625, + "step": 3063, + "time_per_iteration": 2.80668306350708 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044985, + "balance_loss_mlp": 1.00395334, + "epoch": 0.5894574836475568, + "flos": 454132829952.0, + "grad_norm": 0.034675820144419535, + "language_loss": 0.8846643, + "learning_rate": 0.0003805981370919589, + "loss": 0.89511412, + "num_input_tokens_seen": 255644016, + "router_z_loss_mlp": 0.41040039, + "step": 3064, + "time_per_iteration": 2.4926044940948486 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046275, + "balance_loss_mlp": 1.00509965, + "epoch": 0.5896498653328203, + "flos": 520112109312.0, + "grad_norm": 0.03109338069781882, + "language_loss": 0.843858, + "learning_rate": 0.0003802956315665771, + "loss": 0.85432076, + "num_input_tokens_seen": 255718192, + "router_z_loss_mlp": 0.41186523, + "step": 3065, + "time_per_iteration": 2.6821701526641846 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046507, + "balance_loss_mlp": 1.00530875, + "epoch": 0.5898422470180839, + "flos": 550084930560.0, + "grad_norm": 0.039548358411626815, + "language_loss": 0.82298601, + "learning_rate": 0.0003799931725145529, + "loss": 0.83345109, + "num_input_tokens_seen": 255787696, + "router_z_loss_mlp": 0.41210938, + "step": 3066, + "time_per_iteration": 2.6161272525787354 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046312, + "balance_loss_mlp": 1.00532758, + "epoch": 0.5900346287033474, + "flos": 525380731392.0, + "grad_norm": 0.034195441532662435, + "language_loss": 0.86171907, + "learning_rate": 0.00037969076005331083, + "loss": 0.87218219, + "num_input_tokens_seen": 255862992, + "router_z_loss_mlp": 0.40991211, + "step": 3067, + "time_per_iteration": 2.769503116607666 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046067, + "balance_loss_mlp": 1.00515461, + "epoch": 0.590227010388611, + "flos": 568215988224.0, + "grad_norm": 0.03443045458348014, + "language_loss": 0.88715112, + "learning_rate": 0.00037938839430025817, + "loss": 0.8976118, + "num_input_tokens_seen": 255931872, + "router_z_loss_mlp": 0.40917969, + "step": 3068, + "time_per_iteration": 2.626838207244873 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046372, + "balance_loss_mlp": 1.00557816, + "epoch": 0.5904193920738746, + "flos": 584456368128.0, + "grad_norm": 0.03106221395948033, + "language_loss": 0.86157519, + "learning_rate": 0.0003790860753727835, + "loss": 0.8720389, + "num_input_tokens_seen": 256004656, + "router_z_loss_mlp": 0.40795898, + "step": 3069, + "time_per_iteration": 2.825906991958618 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044904, + "balance_loss_mlp": 1.0041821, + "epoch": 0.5906117737591381, + "flos": 530797107456.0, + "grad_norm": 0.033655572520404166, + "language_loss": 0.83318973, + "learning_rate": 0.00037878380338825766, + "loss": 0.84363884, + "num_input_tokens_seen": 256076944, + "router_z_loss_mlp": 0.40722656, + "step": 3070, + "time_per_iteration": 2.6605753898620605 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043416, + "balance_loss_mlp": 1.00264668, + "epoch": 0.5908041554444017, + "flos": 685516697856.0, + "grad_norm": 0.032255816781200916, + "language_loss": 0.81519401, + "learning_rate": 0.00037848157846403287, + "loss": 0.82562816, + "num_input_tokens_seen": 256154768, + "router_z_loss_mlp": 0.40771484, + "step": 3071, + "time_per_iteration": 2.8913676738739014 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047866, + "balance_loss_mlp": 1.00712073, + "epoch": 0.5909965371296653, + "flos": 551133792768.0, + "grad_norm": 0.033304308768315895, + "language_loss": 0.83666503, + "learning_rate": 0.0003781794007174435, + "loss": 0.84714377, + "num_input_tokens_seen": 256230896, + "router_z_loss_mlp": 0.4074707, + "step": 3072, + "time_per_iteration": 2.7170376777648926 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044559, + "balance_loss_mlp": 1.00498199, + "epoch": 0.5911889188149289, + "flos": 1495645038336.0, + "grad_norm": 0.0062576164066865435, + "language_loss": 0.74074531, + "learning_rate": 0.0003778772702658051, + "loss": 0.7511909, + "num_input_tokens_seen": 256462336, + "router_z_loss_mlp": 0.39550781, + "step": 3073, + "time_per_iteration": 4.848031282424927 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053183, + "balance_loss_mlp": 1.01248538, + "epoch": 0.5913813005001923, + "flos": 488886291456.0, + "grad_norm": 0.03164327687157731, + "language_loss": 0.81542623, + "learning_rate": 0.0003775751872264152, + "loss": 0.82595801, + "num_input_tokens_seen": 256539376, + "router_z_loss_mlp": 0.40698242, + "step": 3074, + "time_per_iteration": 2.7835612297058105 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047821, + "balance_loss_mlp": 1.00721872, + "epoch": 0.5915736821854559, + "flos": 574522778880.0, + "grad_norm": 0.03137518576611995, + "language_loss": 0.87806273, + "learning_rate": 0.0003772731517165527, + "loss": 0.88854092, + "num_input_tokens_seen": 256617728, + "router_z_loss_mlp": 0.40600586, + "step": 3075, + "time_per_iteration": 2.7984819412231445 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045123, + "balance_loss_mlp": 1.00451982, + "epoch": 0.5917660638707195, + "flos": 790861916160.0, + "grad_norm": 0.03467745447845496, + "language_loss": 0.83953345, + "learning_rate": 0.0003769711638534784, + "loss": 0.84998471, + "num_input_tokens_seen": 256696032, + "router_z_loss_mlp": 0.40600586, + "step": 3076, + "time_per_iteration": 2.9498283863067627 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045943, + "balance_loss_mlp": 1.0053643, + "epoch": 0.5919584455559831, + "flos": 529756993536.0, + "grad_norm": 0.038274807826461636, + "language_loss": 0.7910676, + "learning_rate": 0.00037666922375443446, + "loss": 0.80152702, + "num_input_tokens_seen": 256767360, + "router_z_loss_mlp": 0.40576172, + "step": 3077, + "time_per_iteration": 2.595907211303711 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043771, + "balance_loss_mlp": 1.00312054, + "epoch": 0.5921508272412467, + "flos": 561753662208.0, + "grad_norm": 0.037448898185008676, + "language_loss": 0.82402956, + "learning_rate": 0.00037636733153664396, + "loss": 0.83446729, + "num_input_tokens_seen": 256844848, + "router_z_loss_mlp": 0.40649414, + "step": 3078, + "time_per_iteration": 2.8082337379455566 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050449, + "balance_loss_mlp": 1.00984669, + "epoch": 0.5923432089265102, + "flos": 564334510848.0, + "grad_norm": 0.04535413457726027, + "language_loss": 0.80388999, + "learning_rate": 0.0003760654873173124, + "loss": 0.81439447, + "num_input_tokens_seen": 256916688, + "router_z_loss_mlp": 0.40600586, + "step": 3079, + "time_per_iteration": 2.6586430072784424 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048929, + "balance_loss_mlp": 1.00832665, + "epoch": 0.5925355906117737, + "flos": 496751313408.0, + "grad_norm": 0.032303837876808815, + "language_loss": 0.82224989, + "learning_rate": 0.00037576369121362566, + "loss": 0.83273923, + "num_input_tokens_seen": 256985520, + "router_z_loss_mlp": 0.40600586, + "step": 3080, + "time_per_iteration": 2.5874335765838623 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049072, + "balance_loss_mlp": 1.00846922, + "epoch": 0.5927279722970373, + "flos": 567493736448.0, + "grad_norm": 0.03169427730059961, + "language_loss": 0.82085633, + "learning_rate": 0.0003754619433427516, + "loss": 0.83134699, + "num_input_tokens_seen": 257067552, + "router_z_loss_mlp": 0.40600586, + "step": 3081, + "time_per_iteration": 2.9037671089172363 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044705, + "balance_loss_mlp": 1.00400662, + "epoch": 0.5929203539823009, + "flos": 668160381696.0, + "grad_norm": 0.04430970694991959, + "language_loss": 0.78507918, + "learning_rate": 0.0003751602438218392, + "loss": 0.79552627, + "num_input_tokens_seen": 257138896, + "router_z_loss_mlp": 0.40698242, + "step": 3082, + "time_per_iteration": 2.77486252784729 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042632, + "balance_loss_mlp": 1.00195801, + "epoch": 0.5931127356675644, + "flos": 556786384128.0, + "grad_norm": 0.03446517582568327, + "language_loss": 0.84122735, + "learning_rate": 0.0003748585927680186, + "loss": 0.8516537, + "num_input_tokens_seen": 257210592, + "router_z_loss_mlp": 0.40673828, + "step": 3083, + "time_per_iteration": 2.6401243209838867 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047685, + "balance_loss_mlp": 1.00698733, + "epoch": 0.593305117352828, + "flos": 536243619072.0, + "grad_norm": 0.03379156982252967, + "language_loss": 0.83284605, + "learning_rate": 0.00037455699029840086, + "loss": 0.84332293, + "num_input_tokens_seen": 257276208, + "router_z_loss_mlp": 0.40698242, + "step": 3084, + "time_per_iteration": 2.6359477043151855 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047934, + "balance_loss_mlp": 1.00723624, + "epoch": 0.5934974990380916, + "flos": 595058740992.0, + "grad_norm": 0.03375272766067447, + "language_loss": 0.84866869, + "learning_rate": 0.0003742554365300787, + "loss": 0.85914803, + "num_input_tokens_seen": 257351920, + "router_z_loss_mlp": 0.40698242, + "step": 3085, + "time_per_iteration": 2.7629523277282715 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047831, + "balance_loss_mlp": 1.00727594, + "epoch": 0.5936898807233552, + "flos": 714015858432.0, + "grad_norm": 0.08464198739198994, + "language_loss": 0.79301089, + "learning_rate": 0.0003739539315801255, + "loss": 0.80348921, + "num_input_tokens_seen": 257430016, + "router_z_loss_mlp": 0.40551758, + "step": 3086, + "time_per_iteration": 2.9152019023895264 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105055, + "balance_loss_mlp": 1.01004303, + "epoch": 0.5938822624086187, + "flos": 392749498368.0, + "grad_norm": 0.03659508144201786, + "language_loss": 0.92428821, + "learning_rate": 0.000373652475565596, + "loss": 0.93479371, + "num_input_tokens_seen": 257492224, + "router_z_loss_mlp": 0.4050293, + "step": 3087, + "time_per_iteration": 2.4702134132385254 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050448, + "balance_loss_mlp": 1.00982189, + "epoch": 0.5940746440938822, + "flos": 481336219392.0, + "grad_norm": 0.034289442552625136, + "language_loss": 0.81692433, + "learning_rate": 0.00037335106860352587, + "loss": 0.82742882, + "num_input_tokens_seen": 257567824, + "router_z_loss_mlp": 0.40625, + "step": 3088, + "time_per_iteration": 2.675694704055786 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043994, + "balance_loss_mlp": 1.00322449, + "epoch": 0.5942670257791458, + "flos": 484307840256.0, + "grad_norm": 0.03351872550432346, + "language_loss": 0.8348605, + "learning_rate": 0.00037304971081093146, + "loss": 0.84530044, + "num_input_tokens_seen": 257635488, + "router_z_loss_mlp": 0.40771484, + "step": 3089, + "time_per_iteration": 2.5974292755126953 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042488, + "balance_loss_mlp": 1.00181389, + "epoch": 0.5944594074644094, + "flos": 549058422528.0, + "grad_norm": 0.03144984032595776, + "language_loss": 0.81257939, + "learning_rate": 0.00037274840230481024, + "loss": 0.82300425, + "num_input_tokens_seen": 257709552, + "router_z_loss_mlp": 0.40673828, + "step": 3090, + "time_per_iteration": 2.7465951442718506 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104328, + "balance_loss_mlp": 1.00262976, + "epoch": 0.594651789149673, + "flos": 450129843456.0, + "grad_norm": 0.0354227551067568, + "language_loss": 0.79578584, + "learning_rate": 0.00037244714320214077, + "loss": 0.80621862, + "num_input_tokens_seen": 257775520, + "router_z_loss_mlp": 0.40649414, + "step": 3091, + "time_per_iteration": 2.532076597213745 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045549, + "balance_loss_mlp": 1.00489831, + "epoch": 0.5948441708349365, + "flos": 597466557696.0, + "grad_norm": 0.033875543124705955, + "language_loss": 0.83456963, + "learning_rate": 0.000372145933619882, + "loss": 0.84502512, + "num_input_tokens_seen": 257858560, + "router_z_loss_mlp": 0.40649414, + "step": 3092, + "time_per_iteration": 2.888296127319336 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045702, + "balance_loss_mlp": 1.00502765, + "epoch": 0.5950365525202, + "flos": 549581397504.0, + "grad_norm": 0.03918584024885415, + "language_loss": 0.83476591, + "learning_rate": 0.000371844773674974, + "loss": 0.84522295, + "num_input_tokens_seen": 257928048, + "router_z_loss_mlp": 0.40673828, + "step": 3093, + "time_per_iteration": 2.641191244125366 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042191, + "balance_loss_mlp": 1.00146902, + "epoch": 0.5952289342054636, + "flos": 655964784384.0, + "grad_norm": 0.03345437818943746, + "language_loss": 0.82307684, + "learning_rate": 0.0003715436634843375, + "loss": 0.83349872, + "num_input_tokens_seen": 258003088, + "router_z_loss_mlp": 0.40722656, + "step": 3094, + "time_per_iteration": 2.8391387462615967 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042413, + "balance_loss_mlp": 1.00185752, + "epoch": 0.5954213158907272, + "flos": 604604470272.0, + "grad_norm": 0.028714859262846556, + "language_loss": 0.8123939, + "learning_rate": 0.00037124260316487355, + "loss": 0.82281804, + "num_input_tokens_seen": 258084880, + "router_z_loss_mlp": 0.40551758, + "step": 3095, + "time_per_iteration": 2.8300905227661133 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048221, + "balance_loss_mlp": 1.00742722, + "epoch": 0.5956136975759908, + "flos": 487268767488.0, + "grad_norm": 0.03390156256560374, + "language_loss": 0.89901024, + "learning_rate": 0.0003709415928334643, + "loss": 0.90949249, + "num_input_tokens_seen": 258152032, + "router_z_loss_mlp": 0.40795898, + "step": 3096, + "time_per_iteration": 2.594320297241211 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104465, + "balance_loss_mlp": 1.00376081, + "epoch": 0.5958060792612543, + "flos": 660041647872.0, + "grad_norm": 0.036547009459556086, + "language_loss": 0.8143428, + "learning_rate": 0.00037064063260697233, + "loss": 0.82478929, + "num_input_tokens_seen": 258228896, + "router_z_loss_mlp": 0.40893555, + "step": 3097, + "time_per_iteration": 2.853452205657959 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044795, + "balance_loss_mlp": 1.00397766, + "epoch": 0.5959984609465179, + "flos": 724996364544.0, + "grad_norm": 0.03336502037481855, + "language_loss": 0.78911316, + "learning_rate": 0.0003703397226022407, + "loss": 0.79956114, + "num_input_tokens_seen": 258311152, + "router_z_loss_mlp": 0.40820312, + "step": 3098, + "time_per_iteration": 3.0299534797668457 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050957, + "balance_loss_mlp": 1.01147461, + "epoch": 0.5961908426317815, + "flos": 1523221703424.0, + "grad_norm": 0.010872658804754508, + "language_loss": 0.75499874, + "learning_rate": 0.00037003886293609335, + "loss": 0.76550829, + "num_input_tokens_seen": 258540656, + "router_z_loss_mlp": 0.39453125, + "step": 3099, + "time_per_iteration": 4.950707674026489 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044673, + "balance_loss_mlp": 1.00387943, + "epoch": 0.596383224317045, + "flos": 533647219200.0, + "grad_norm": 0.033784299285581076, + "language_loss": 0.84084308, + "learning_rate": 0.0003697380537253339, + "loss": 0.85128987, + "num_input_tokens_seen": 258608960, + "router_z_loss_mlp": 0.40795898, + "step": 3100, + "time_per_iteration": 2.6651411056518555 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044743, + "balance_loss_mlp": 1.00394928, + "epoch": 0.5965756060023086, + "flos": 592367076864.0, + "grad_norm": 0.032025449945388196, + "language_loss": 0.82004619, + "learning_rate": 0.0003694372950867471, + "loss": 0.83049357, + "num_input_tokens_seen": 258684304, + "router_z_loss_mlp": 0.40795898, + "step": 3101, + "time_per_iteration": 2.7825992107391357 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044062, + "balance_loss_mlp": 1.00341129, + "epoch": 0.5967679876875721, + "flos": 863470717440.0, + "grad_norm": 0.0338522286072748, + "language_loss": 0.78029126, + "learning_rate": 0.0003691365871370976, + "loss": 0.79073191, + "num_input_tokens_seen": 258769472, + "router_z_loss_mlp": 0.40649414, + "step": 3102, + "time_per_iteration": 3.0174319744110107 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044179, + "balance_loss_mlp": 1.00340927, + "epoch": 0.5969603693728357, + "flos": 554878209792.0, + "grad_norm": 0.03201933469342105, + "language_loss": 0.85875535, + "learning_rate": 0.00036883592999313093, + "loss": 0.86919713, + "num_input_tokens_seen": 258841696, + "router_z_loss_mlp": 0.40771484, + "step": 3103, + "time_per_iteration": 2.683260679244995 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043823, + "balance_loss_mlp": 1.00314891, + "epoch": 0.5971527510580993, + "flos": 719937712896.0, + "grad_norm": 0.039464615758245, + "language_loss": 0.79932439, + "learning_rate": 0.0003685353237715722, + "loss": 0.80976272, + "num_input_tokens_seen": 258915616, + "router_z_loss_mlp": 0.40673828, + "step": 3104, + "time_per_iteration": 2.8593432903289795 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043868, + "balance_loss_mlp": 1.00312221, + "epoch": 0.5973451327433629, + "flos": 648863810304.0, + "grad_norm": 0.031062495288944163, + "language_loss": 0.82383978, + "learning_rate": 0.0003682347685891274, + "loss": 0.83427846, + "num_input_tokens_seen": 258994080, + "router_z_loss_mlp": 0.4074707, + "step": 3105, + "time_per_iteration": 2.840812921524048 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045744, + "balance_loss_mlp": 1.00504565, + "epoch": 0.5975375144286263, + "flos": 723090135552.0, + "grad_norm": 0.03430317325592521, + "language_loss": 0.81334996, + "learning_rate": 0.0003679342645624822, + "loss": 0.82380736, + "num_input_tokens_seen": 259075968, + "router_z_loss_mlp": 0.40698242, + "step": 3106, + "time_per_iteration": 2.961186408996582 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045947, + "balance_loss_mlp": 1.00520086, + "epoch": 0.5977298961138899, + "flos": 752344595712.0, + "grad_norm": 0.03201923744385334, + "language_loss": 0.82261443, + "learning_rate": 0.0003676338118083025, + "loss": 0.83307385, + "num_input_tokens_seen": 259162512, + "router_z_loss_mlp": 0.4074707, + "step": 3107, + "time_per_iteration": 2.9809908866882324 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105353, + "balance_loss_mlp": 1.01264107, + "epoch": 0.5979222777991535, + "flos": 531999559680.0, + "grad_norm": 0.03643788911431517, + "language_loss": 0.79681456, + "learning_rate": 0.0003673334104432347, + "loss": 0.8073498, + "num_input_tokens_seen": 259228752, + "router_z_loss_mlp": 0.40893555, + "step": 3108, + "time_per_iteration": 2.5879976749420166 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052396, + "balance_loss_mlp": 1.01157844, + "epoch": 0.5981146594844171, + "flos": 622915362816.0, + "grad_norm": 0.031178647905512342, + "language_loss": 0.84073299, + "learning_rate": 0.0003670330605839048, + "loss": 0.85125697, + "num_input_tokens_seen": 259303440, + "router_z_loss_mlp": 0.40820312, + "step": 3109, + "time_per_iteration": 2.843069314956665 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049611, + "balance_loss_mlp": 1.00877023, + "epoch": 0.5983070411696807, + "flos": 604710428160.0, + "grad_norm": 0.03611015998230635, + "language_loss": 0.77344596, + "learning_rate": 0.0003667327623469191, + "loss": 0.7839421, + "num_input_tokens_seen": 259378752, + "router_z_loss_mlp": 0.40844727, + "step": 3110, + "time_per_iteration": 2.7326698303222656 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045229, + "balance_loss_mlp": 1.00438774, + "epoch": 0.5984994228549442, + "flos": 634670610432.0, + "grad_norm": 0.03877534508876671, + "language_loss": 0.78326917, + "learning_rate": 0.00036643251584886333, + "loss": 0.79372144, + "num_input_tokens_seen": 259454336, + "router_z_loss_mlp": 0.40844727, + "step": 3111, + "time_per_iteration": 2.784482717514038 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105463, + "balance_loss_mlp": 1.01369393, + "epoch": 0.5986918045402078, + "flos": 526294478592.0, + "grad_norm": 0.03280596002015671, + "language_loss": 0.82781613, + "learning_rate": 0.00036613232120630393, + "loss": 0.83836246, + "num_input_tokens_seen": 259518960, + "router_z_loss_mlp": 0.40942383, + "step": 3112, + "time_per_iteration": 2.5862860679626465 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105133, + "balance_loss_mlp": 1.0103699, + "epoch": 0.5988841862254713, + "flos": 484140644352.0, + "grad_norm": 0.03859230842611924, + "language_loss": 0.80514455, + "learning_rate": 0.00036583217853578643, + "loss": 0.81565785, + "num_input_tokens_seen": 259584352, + "router_z_loss_mlp": 0.40966797, + "step": 3113, + "time_per_iteration": 2.565713405609131 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048995, + "balance_loss_mlp": 1.00805807, + "epoch": 0.5990765679107349, + "flos": 1142123451648.0, + "grad_norm": 0.034390898471739054, + "language_loss": 0.77730286, + "learning_rate": 0.000365532087953837, + "loss": 0.78779286, + "num_input_tokens_seen": 259693152, + "router_z_loss_mlp": 0.40942383, + "step": 3114, + "time_per_iteration": 3.646124839782715 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049853, + "balance_loss_mlp": 1.00889242, + "epoch": 0.5992689495959984, + "flos": 518019242496.0, + "grad_norm": 0.033850887819700186, + "language_loss": 0.89597213, + "learning_rate": 0.00036523204957696065, + "loss": 0.90647066, + "num_input_tokens_seen": 259762048, + "router_z_loss_mlp": 0.40966797, + "step": 3115, + "time_per_iteration": 2.594458818435669 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050487, + "balance_loss_mlp": 1.00952673, + "epoch": 0.599461331281262, + "flos": 745942540800.0, + "grad_norm": 0.044244117222237124, + "language_loss": 0.81526911, + "learning_rate": 0.00036493206352164324, + "loss": 0.82577395, + "num_input_tokens_seen": 259843184, + "router_z_loss_mlp": 0.40966797, + "step": 3116, + "time_per_iteration": 2.9088714122772217 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046863, + "balance_loss_mlp": 1.0058552, + "epoch": 0.5996537129665256, + "flos": 593484958464.0, + "grad_norm": 0.034019953192927346, + "language_loss": 0.85863578, + "learning_rate": 0.000364632129904349, + "loss": 0.8691045, + "num_input_tokens_seen": 259912720, + "router_z_loss_mlp": 0.41015625, + "step": 3117, + "time_per_iteration": 2.7059812545776367 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055187, + "balance_loss_mlp": 1.01415479, + "epoch": 0.5998460946517892, + "flos": 560116696320.0, + "grad_norm": 0.0363455836603733, + "language_loss": 0.78243721, + "learning_rate": 0.00036433224884152283, + "loss": 0.79298902, + "num_input_tokens_seen": 259985472, + "router_z_loss_mlp": 0.41040039, + "step": 3118, + "time_per_iteration": 2.7368576526641846 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049846, + "balance_loss_mlp": 1.00879073, + "epoch": 0.6000384763370528, + "flos": 485536537344.0, + "grad_norm": 0.037553840644260136, + "language_loss": 0.78583586, + "learning_rate": 0.00036403242044958875, + "loss": 0.79633433, + "num_input_tokens_seen": 260050336, + "router_z_loss_mlp": 0.41064453, + "step": 3119, + "time_per_iteration": 2.5575714111328125 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105366, + "balance_loss_mlp": 1.01267588, + "epoch": 0.6002308580223162, + "flos": 597878717184.0, + "grad_norm": 0.03820222884564333, + "language_loss": 0.91700655, + "learning_rate": 0.0003637326448449507, + "loss": 0.9275431, + "num_input_tokens_seen": 260120304, + "router_z_loss_mlp": 0.40991211, + "step": 3120, + "time_per_iteration": 2.742879629135132 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044338, + "balance_loss_mlp": 1.00335419, + "epoch": 0.6004232397075798, + "flos": 546220949760.0, + "grad_norm": 0.03312076086842182, + "language_loss": 0.86720824, + "learning_rate": 0.00036343292214399177, + "loss": 0.87765157, + "num_input_tokens_seen": 260198304, + "router_z_loss_mlp": 0.40991211, + "step": 3121, + "time_per_iteration": 2.827937364578247 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048688, + "balance_loss_mlp": 1.00777555, + "epoch": 0.6006156213928434, + "flos": 631151715072.0, + "grad_norm": 0.0990751082853954, + "language_loss": 0.77571696, + "learning_rate": 0.00036313325246307456, + "loss": 0.78620386, + "num_input_tokens_seen": 260277664, + "router_z_loss_mlp": 0.40917969, + "step": 3122, + "time_per_iteration": 2.844771146774292 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044272, + "balance_loss_mlp": 1.00347829, + "epoch": 0.600808003078107, + "flos": 583405560576.0, + "grad_norm": 0.0330511855915857, + "language_loss": 0.87869143, + "learning_rate": 0.0003628336359185411, + "loss": 0.88913417, + "num_input_tokens_seen": 260350096, + "router_z_loss_mlp": 0.40795898, + "step": 3123, + "time_per_iteration": 2.728536367416382 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048661, + "balance_loss_mlp": 1.00810611, + "epoch": 0.6010003847633705, + "flos": 636439779072.0, + "grad_norm": 0.035612142743683524, + "language_loss": 0.75946915, + "learning_rate": 0.000362534072626713, + "loss": 0.76995575, + "num_input_tokens_seen": 260421888, + "router_z_loss_mlp": 0.40551758, + "step": 3124, + "time_per_iteration": 2.7660484313964844 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049761, + "balance_loss_mlp": 1.00915837, + "epoch": 0.6011927664486341, + "flos": 720031031808.0, + "grad_norm": 0.034873879848328126, + "language_loss": 0.81774855, + "learning_rate": 0.00036223456270389093, + "loss": 0.82824624, + "num_input_tokens_seen": 260499616, + "router_z_loss_mlp": 0.40600586, + "step": 3125, + "time_per_iteration": 2.943265676498413 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050049, + "balance_loss_mlp": 1.00939894, + "epoch": 0.6013851481338977, + "flos": 500055380736.0, + "grad_norm": 0.03349756434082021, + "language_loss": 0.81548929, + "learning_rate": 0.00036193510626635517, + "loss": 0.82598984, + "num_input_tokens_seen": 260572048, + "router_z_loss_mlp": 0.40649414, + "step": 3126, + "time_per_iteration": 2.7160630226135254 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049829, + "balance_loss_mlp": 1.00929773, + "epoch": 0.6015775298191612, + "flos": 750876771072.0, + "grad_norm": 0.03275922867012815, + "language_loss": 0.81968188, + "learning_rate": 0.0003616357034303649, + "loss": 0.83018017, + "num_input_tokens_seen": 260644720, + "router_z_loss_mlp": 0.40527344, + "step": 3127, + "time_per_iteration": 2.9286913871765137 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047883, + "balance_loss_mlp": 1.00725627, + "epoch": 0.6017699115044248, + "flos": 594264557568.0, + "grad_norm": 0.02908266373706377, + "language_loss": 0.79201299, + "learning_rate": 0.0003613363543121584, + "loss": 0.80249178, + "num_input_tokens_seen": 260724864, + "router_z_loss_mlp": 0.40625, + "step": 3128, + "time_per_iteration": 2.917598009109497 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046286, + "balance_loss_mlp": 1.00568318, + "epoch": 0.6019622931896883, + "flos": 516202441728.0, + "grad_norm": 0.031364349484999776, + "language_loss": 0.85277975, + "learning_rate": 0.00036103705902795357, + "loss": 0.86324257, + "num_input_tokens_seen": 260800896, + "router_z_loss_mlp": 0.40600586, + "step": 3129, + "time_per_iteration": 2.7694129943847656 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051152, + "balance_loss_mlp": 1.01047814, + "epoch": 0.6021546748749519, + "flos": 491473943040.0, + "grad_norm": 0.0392414269589035, + "language_loss": 0.80161059, + "learning_rate": 0.0003607378176939471, + "loss": 0.81212205, + "num_input_tokens_seen": 260872736, + "router_z_loss_mlp": 0.40673828, + "step": 3130, + "time_per_iteration": 2.622267961502075 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055595, + "balance_loss_mlp": 1.01494503, + "epoch": 0.6023470565602155, + "flos": 542115896064.0, + "grad_norm": 0.037876950900112984, + "language_loss": 0.82781708, + "learning_rate": 0.00036043863042631465, + "loss": 0.83837301, + "num_input_tokens_seen": 260943264, + "router_z_loss_mlp": 0.40649414, + "step": 3131, + "time_per_iteration": 2.7120039463043213 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052284, + "balance_loss_mlp": 1.01163399, + "epoch": 0.6025394382454791, + "flos": 846464344320.0, + "grad_norm": 0.039947813860245845, + "language_loss": 0.76966566, + "learning_rate": 0.00036013949734121133, + "loss": 0.78018856, + "num_input_tokens_seen": 261030064, + "router_z_loss_mlp": 0.40649414, + "step": 3132, + "time_per_iteration": 3.127255916595459 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050529, + "balance_loss_mlp": 1.00990224, + "epoch": 0.6027318199307425, + "flos": 578258447616.0, + "grad_norm": 0.03419044123662342, + "language_loss": 0.8313787, + "learning_rate": 0.00035984041855477043, + "loss": 0.84188402, + "num_input_tokens_seen": 261106496, + "router_z_loss_mlp": 0.40625, + "step": 3133, + "time_per_iteration": 2.7259347438812256 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051548, + "balance_loss_mlp": 1.01216125, + "epoch": 0.6029242016160061, + "flos": 1474255600128.0, + "grad_norm": 0.0070819988580959, + "language_loss": 0.78709894, + "learning_rate": 0.00035954139418310495, + "loss": 0.79761446, + "num_input_tokens_seen": 261343248, + "router_z_loss_mlp": 0.39355469, + "step": 3134, + "time_per_iteration": 4.934648513793945 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052223, + "balance_loss_mlp": 1.01171601, + "epoch": 0.6031165833012697, + "flos": 481783372032.0, + "grad_norm": 0.03833547758664617, + "language_loss": 0.80612588, + "learning_rate": 0.00035924242434230637, + "loss": 0.81664813, + "num_input_tokens_seen": 261416704, + "router_z_loss_mlp": 0.4050293, + "step": 3135, + "time_per_iteration": 2.691655397415161 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105043, + "balance_loss_mlp": 1.00985157, + "epoch": 0.6033089649865333, + "flos": 500465594880.0, + "grad_norm": 0.04302606138210952, + "language_loss": 0.79556847, + "learning_rate": 0.00035894350914844516, + "loss": 0.80607277, + "num_input_tokens_seen": 261486688, + "router_z_loss_mlp": 0.40576172, + "step": 3136, + "time_per_iteration": 2.6602935791015625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048879, + "balance_loss_mlp": 1.00827622, + "epoch": 0.6035013466717969, + "flos": 557724430848.0, + "grad_norm": 0.03619946216792389, + "language_loss": 0.83608747, + "learning_rate": 0.0003586446487175703, + "loss": 0.84657621, + "num_input_tokens_seen": 261557344, + "router_z_loss_mlp": 0.40600586, + "step": 3137, + "time_per_iteration": 2.7028918266296387 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050046, + "balance_loss_mlp": 1.00944352, + "epoch": 0.6036937283570604, + "flos": 595996787712.0, + "grad_norm": 0.03316873106558702, + "language_loss": 0.8565768, + "learning_rate": 0.0003583458431657099, + "loss": 0.86707723, + "num_input_tokens_seen": 261626240, + "router_z_loss_mlp": 0.40600586, + "step": 3138, + "time_per_iteration": 2.730760097503662 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051204, + "balance_loss_mlp": 1.01048255, + "epoch": 0.603886110042324, + "flos": 542059515648.0, + "grad_norm": 0.041412274215224906, + "language_loss": 0.83086127, + "learning_rate": 0.00035804709260887056, + "loss": 0.84137332, + "num_input_tokens_seen": 261696368, + "router_z_loss_mlp": 0.40722656, + "step": 3139, + "time_per_iteration": 2.6989586353302 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049269, + "balance_loss_mlp": 1.00852323, + "epoch": 0.6040784917275875, + "flos": 519656208384.0, + "grad_norm": 0.031983597535220364, + "language_loss": 0.89732921, + "learning_rate": 0.0003577483971630373, + "loss": 0.90782189, + "num_input_tokens_seen": 261769104, + "router_z_loss_mlp": 0.4074707, + "step": 3140, + "time_per_iteration": 2.697202205657959 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049635, + "balance_loss_mlp": 1.00888968, + "epoch": 0.6042708734128511, + "flos": 662014950912.0, + "grad_norm": 0.02881540865080385, + "language_loss": 0.85653752, + "learning_rate": 0.00035744975694417414, + "loss": 0.86703384, + "num_input_tokens_seen": 261844880, + "router_z_loss_mlp": 0.4074707, + "step": 3141, + "time_per_iteration": 2.853609085083008 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049159, + "balance_loss_mlp": 1.00838912, + "epoch": 0.6044632550981146, + "flos": 573517658112.0, + "grad_norm": 0.037282810981105224, + "language_loss": 0.83199489, + "learning_rate": 0.00035715117206822344, + "loss": 0.8424865, + "num_input_tokens_seen": 261923280, + "router_z_loss_mlp": 0.40771484, + "step": 3142, + "time_per_iteration": 2.778184175491333 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104897, + "balance_loss_mlp": 1.00812936, + "epoch": 0.6046556367833782, + "flos": 547729603584.0, + "grad_norm": 0.035085942615977306, + "language_loss": 0.81379992, + "learning_rate": 0.0003568526426511065, + "loss": 0.82428956, + "num_input_tokens_seen": 261990832, + "router_z_loss_mlp": 0.40844727, + "step": 3143, + "time_per_iteration": 2.626789093017578 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047012, + "balance_loss_mlp": 1.00612307, + "epoch": 0.6048480184686418, + "flos": 778175424768.0, + "grad_norm": 0.035762108913210126, + "language_loss": 0.83504343, + "learning_rate": 0.000356554168808722, + "loss": 0.84551358, + "num_input_tokens_seen": 262063760, + "router_z_loss_mlp": 0.40893555, + "step": 3144, + "time_per_iteration": 2.987703323364258 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043207, + "balance_loss_mlp": 1.00229406, + "epoch": 0.6050404001539054, + "flos": 658376491776.0, + "grad_norm": 0.03425886740508031, + "language_loss": 0.85222483, + "learning_rate": 0.00035625575065694837, + "loss": 0.86265695, + "num_input_tokens_seen": 262137968, + "router_z_loss_mlp": 0.40917969, + "step": 3145, + "time_per_iteration": 2.8534908294677734 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044606, + "balance_loss_mlp": 1.00359786, + "epoch": 0.605232781839169, + "flos": 550082985216.0, + "grad_norm": 0.03070859084954421, + "language_loss": 0.78136766, + "learning_rate": 0.0003559573883116415, + "loss": 0.79181373, + "num_input_tokens_seen": 262211264, + "router_z_loss_mlp": 0.41015625, + "step": 3146, + "time_per_iteration": 2.701352119445801 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044167, + "balance_loss_mlp": 1.00323021, + "epoch": 0.6054251635244324, + "flos": 606642902016.0, + "grad_norm": 0.029138241099590467, + "language_loss": 0.8591851, + "learning_rate": 0.00035565908188863604, + "loss": 0.8696267, + "num_input_tokens_seen": 262289648, + "router_z_loss_mlp": 0.40942383, + "step": 3147, + "time_per_iteration": 2.919374465942383 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047433, + "balance_loss_mlp": 1.00640118, + "epoch": 0.605617545209696, + "flos": 614809267968.0, + "grad_norm": 0.029609984696998014, + "language_loss": 0.8021152, + "learning_rate": 0.00035536083150374464, + "loss": 0.81258953, + "num_input_tokens_seen": 262362704, + "router_z_loss_mlp": 0.41040039, + "step": 3148, + "time_per_iteration": 2.7596092224121094 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053307, + "balance_loss_mlp": 1.01382446, + "epoch": 0.6058099268949596, + "flos": 1501610634240.0, + "grad_norm": 0.006207951084567088, + "language_loss": 0.74747956, + "learning_rate": 0.00035506263727275893, + "loss": 0.75801259, + "num_input_tokens_seen": 262596864, + "router_z_loss_mlp": 0.39453125, + "step": 3149, + "time_per_iteration": 4.876317739486694 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051295, + "balance_loss_mlp": 1.01014411, + "epoch": 0.6060023085802232, + "flos": 671705521920.0, + "grad_norm": 0.034498143829504634, + "language_loss": 0.86414444, + "learning_rate": 0.0003547644993114475, + "loss": 0.87465739, + "num_input_tokens_seen": 262671088, + "router_z_loss_mlp": 0.41162109, + "step": 3150, + "time_per_iteration": 2.845522403717041 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052323, + "balance_loss_mlp": 1.01110053, + "epoch": 0.6061946902654868, + "flos": 607306828032.0, + "grad_norm": 0.035670233665724194, + "language_loss": 0.80287176, + "learning_rate": 0.00035446641773555806, + "loss": 0.81339502, + "num_input_tokens_seen": 262743888, + "router_z_loss_mlp": 0.41235352, + "step": 3151, + "time_per_iteration": 2.7565760612487793 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053632, + "balance_loss_mlp": 1.01236176, + "epoch": 0.6063870719507503, + "flos": 558953127936.0, + "grad_norm": 0.031088575801094406, + "language_loss": 0.8789348, + "learning_rate": 0.000354168392660816, + "loss": 0.88947117, + "num_input_tokens_seen": 262819616, + "router_z_loss_mlp": 0.4128418, + "step": 3152, + "time_per_iteration": 2.747297525405884 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049898, + "balance_loss_mlp": 1.00865126, + "epoch": 0.6065794536360138, + "flos": 558282398976.0, + "grad_norm": 0.032072657791302916, + "language_loss": 0.83342856, + "learning_rate": 0.0003538704242029252, + "loss": 0.84392756, + "num_input_tokens_seen": 262893984, + "router_z_loss_mlp": 0.41259766, + "step": 3153, + "time_per_iteration": 2.7606263160705566 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050833, + "balance_loss_mlp": 1.0096823, + "epoch": 0.6067718353212774, + "flos": 691382171904.0, + "grad_norm": 0.035512545115511426, + "language_loss": 0.78534603, + "learning_rate": 0.0003535725124775672, + "loss": 0.79585433, + "num_input_tokens_seen": 262969648, + "router_z_loss_mlp": 0.41162109, + "step": 3154, + "time_per_iteration": 2.832683801651001 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046236, + "balance_loss_mlp": 1.00510859, + "epoch": 0.606964217006541, + "flos": 522903895296.0, + "grad_norm": 0.031701324925560485, + "language_loss": 0.87189692, + "learning_rate": 0.00035327465760040126, + "loss": 0.88235927, + "num_input_tokens_seen": 263042048, + "router_z_loss_mlp": 0.41137695, + "step": 3155, + "time_per_iteration": 2.6946585178375244 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047466, + "balance_loss_mlp": 1.00643396, + "epoch": 0.6071565986918045, + "flos": 642713521920.0, + "grad_norm": 0.0351469249432502, + "language_loss": 0.85231131, + "learning_rate": 0.00035297685968706526, + "loss": 0.86278605, + "num_input_tokens_seen": 263108032, + "router_z_loss_mlp": 0.41040039, + "step": 3156, + "time_per_iteration": 2.7586491107940674 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045389, + "balance_loss_mlp": 1.00416672, + "epoch": 0.6073489803770681, + "flos": 561653540352.0, + "grad_norm": 0.03543028352480344, + "language_loss": 0.83488154, + "learning_rate": 0.00035267911885317454, + "loss": 0.84533542, + "num_input_tokens_seen": 263175184, + "router_z_loss_mlp": 0.41235352, + "step": 3157, + "time_per_iteration": 2.678812026977539 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051077, + "balance_loss_mlp": 1.00997388, + "epoch": 0.6075413620623317, + "flos": 587202467328.0, + "grad_norm": 0.03110064511501168, + "language_loss": 0.81796658, + "learning_rate": 0.0003523814352143222, + "loss": 0.82847732, + "num_input_tokens_seen": 263252768, + "router_z_loss_mlp": 0.41113281, + "step": 3158, + "time_per_iteration": 2.8277432918548584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052557, + "balance_loss_mlp": 1.01128709, + "epoch": 0.6077337437475953, + "flos": 631972143360.0, + "grad_norm": 0.03468149601951464, + "language_loss": 0.9173736, + "learning_rate": 0.00035208380888607937, + "loss": 0.92789918, + "num_input_tokens_seen": 263328720, + "router_z_loss_mlp": 0.4128418, + "step": 3159, + "time_per_iteration": 2.787712574005127 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052471, + "balance_loss_mlp": 1.01289368, + "epoch": 0.6079261254328588, + "flos": 1471626152448.0, + "grad_norm": 0.014144477200468554, + "language_loss": 0.79461986, + "learning_rate": 0.000351786239983995, + "loss": 0.80514455, + "num_input_tokens_seen": 263554656, + "router_z_loss_mlp": 0.39550781, + "step": 3160, + "time_per_iteration": 4.879680871963501 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053925, + "balance_loss_mlp": 1.01444244, + "epoch": 0.6081185071181223, + "flos": 1526205963264.0, + "grad_norm": 0.006801374803666016, + "language_loss": 0.7569223, + "learning_rate": 0.00035148872862359517, + "loss": 0.76746154, + "num_input_tokens_seen": 263791600, + "router_z_loss_mlp": 0.39453125, + "step": 3161, + "time_per_iteration": 5.0031373500823975 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051684, + "balance_loss_mlp": 1.0106045, + "epoch": 0.6083108888033859, + "flos": 557435725824.0, + "grad_norm": 0.030142563258654227, + "language_loss": 0.82224369, + "learning_rate": 0.00035119127492038446, + "loss": 0.83276057, + "num_input_tokens_seen": 263869744, + "router_z_loss_mlp": 0.41088867, + "step": 3162, + "time_per_iteration": 2.80432391166687 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053395, + "balance_loss_mlp": 1.01229131, + "epoch": 0.6085032704886495, + "flos": 842556622080.0, + "grad_norm": 0.03512464115253957, + "language_loss": 0.83202064, + "learning_rate": 0.00035089387898984436, + "loss": 0.84255463, + "num_input_tokens_seen": 263946624, + "router_z_loss_mlp": 0.41113281, + "step": 3163, + "time_per_iteration": 3.1297876834869385 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049274, + "balance_loss_mlp": 1.008147, + "epoch": 0.6086956521739131, + "flos": 685993986048.0, + "grad_norm": 0.03637672327155598, + "language_loss": 0.82543135, + "learning_rate": 0.0003505965409474343, + "loss": 0.83592415, + "num_input_tokens_seen": 264022064, + "router_z_loss_mlp": 0.41137695, + "step": 3164, + "time_per_iteration": 2.9028842449188232 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044903, + "balance_loss_mlp": 1.00382376, + "epoch": 0.6088880338591766, + "flos": 536866715904.0, + "grad_norm": 0.035078655431856474, + "language_loss": 0.86721897, + "learning_rate": 0.0003502992609085913, + "loss": 0.87766796, + "num_input_tokens_seen": 264089520, + "router_z_loss_mlp": 0.41088867, + "step": 3165, + "time_per_iteration": 2.752734422683716 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045246, + "balance_loss_mlp": 1.0041908, + "epoch": 0.6090804155444401, + "flos": 732882773760.0, + "grad_norm": 0.030998406489771316, + "language_loss": 0.82771933, + "learning_rate": 0.00035000203898872954, + "loss": 0.83817178, + "num_input_tokens_seen": 264173056, + "router_z_loss_mlp": 0.41064453, + "step": 3166, + "time_per_iteration": 2.9903385639190674 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104524, + "balance_loss_mlp": 1.00420833, + "epoch": 0.6092727972297037, + "flos": 700243566336.0, + "grad_norm": 0.03412494871544842, + "language_loss": 0.85219544, + "learning_rate": 0.0003497048753032406, + "loss": 0.86264783, + "num_input_tokens_seen": 264250912, + "router_z_loss_mlp": 0.41040039, + "step": 3167, + "time_per_iteration": 2.8939006328582764 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052068, + "balance_loss_mlp": 1.01117909, + "epoch": 0.6094651789149673, + "flos": 1053677681664.0, + "grad_norm": 0.032839303584214885, + "language_loss": 0.81472063, + "learning_rate": 0.000349407769967494, + "loss": 0.82524133, + "num_input_tokens_seen": 264342800, + "router_z_loss_mlp": 0.40893555, + "step": 3168, + "time_per_iteration": 3.384226083755493 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044648, + "balance_loss_mlp": 1.0035919, + "epoch": 0.6096575606002309, + "flos": 504095305728.0, + "grad_norm": 0.03315731648792901, + "language_loss": 0.85102254, + "learning_rate": 0.0003491107230968361, + "loss": 0.86146903, + "num_input_tokens_seen": 264413664, + "router_z_loss_mlp": 0.41064453, + "step": 3169, + "time_per_iteration": 2.6621110439300537 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104943, + "balance_loss_mlp": 1.00837409, + "epoch": 0.6098499422854944, + "flos": 586864184832.0, + "grad_norm": 0.02773637180026576, + "language_loss": 0.82196522, + "learning_rate": 0.00034881373480659085, + "loss": 0.83245957, + "num_input_tokens_seen": 264494944, + "router_z_loss_mlp": 0.41064453, + "step": 3170, + "time_per_iteration": 2.8139965534210205 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048985, + "balance_loss_mlp": 1.00795305, + "epoch": 0.610042323970758, + "flos": 470160327168.0, + "grad_norm": 0.03906179499333773, + "language_loss": 0.78314018, + "learning_rate": 0.0003485168052120594, + "loss": 0.79363, + "num_input_tokens_seen": 264561664, + "router_z_loss_mlp": 0.41040039, + "step": 3171, + "time_per_iteration": 2.5483758449554443 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052409, + "balance_loss_mlp": 1.01142442, + "epoch": 0.6102347056560216, + "flos": 515199266304.0, + "grad_norm": 0.03618411847150492, + "language_loss": 0.80390579, + "learning_rate": 0.00034821993442851973, + "loss": 0.81442988, + "num_input_tokens_seen": 264626256, + "router_z_loss_mlp": 0.40991211, + "step": 3172, + "time_per_iteration": 2.590830087661743 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049707, + "balance_loss_mlp": 1.00884163, + "epoch": 0.6104270873412851, + "flos": 469964941056.0, + "grad_norm": 0.03897584044245514, + "language_loss": 0.82572639, + "learning_rate": 0.00034792312257122735, + "loss": 0.83622348, + "num_input_tokens_seen": 264692768, + "router_z_loss_mlp": 0.40869141, + "step": 3173, + "time_per_iteration": 2.594754457473755 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049236, + "balance_loss_mlp": 1.00834739, + "epoch": 0.6106194690265486, + "flos": 550940352000.0, + "grad_norm": 0.03632239406226319, + "language_loss": 0.81349075, + "learning_rate": 0.00034762636975541506, + "loss": 0.82398319, + "num_input_tokens_seen": 264764816, + "router_z_loss_mlp": 0.40893555, + "step": 3174, + "time_per_iteration": 2.6291897296905518 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046645, + "balance_loss_mlp": 1.00563669, + "epoch": 0.6108118507118122, + "flos": 473881411584.0, + "grad_norm": 0.03249903592127121, + "language_loss": 0.81528097, + "learning_rate": 0.0003473296760962923, + "loss": 0.82574743, + "num_input_tokens_seen": 264837968, + "router_z_loss_mlp": 0.41015625, + "step": 3175, + "time_per_iteration": 2.6912500858306885 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052322, + "balance_loss_mlp": 1.01264954, + "epoch": 0.6110042323970758, + "flos": 1448182731264.0, + "grad_norm": 0.007043978800011362, + "language_loss": 0.78533739, + "learning_rate": 0.00034703304170904617, + "loss": 0.79586065, + "num_input_tokens_seen": 265058336, + "router_z_loss_mlp": 0.39648438, + "step": 3176, + "time_per_iteration": 4.679258108139038 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104727, + "balance_loss_mlp": 1.00635707, + "epoch": 0.6111966140823394, + "flos": 795542434560.0, + "grad_norm": 0.03450548999539666, + "language_loss": 0.81482762, + "learning_rate": 0.00034673646670883976, + "loss": 0.82530034, + "num_input_tokens_seen": 265135920, + "router_z_loss_mlp": 0.40917969, + "step": 3177, + "time_per_iteration": 2.9776415824890137 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104406, + "balance_loss_mlp": 1.0043869, + "epoch": 0.611388995767603, + "flos": 1561066349568.0, + "grad_norm": 0.006895739494838764, + "language_loss": 0.75715023, + "learning_rate": 0.0003464399512108141, + "loss": 0.76759082, + "num_input_tokens_seen": 265374464, + "router_z_loss_mlp": 0.39648438, + "step": 3178, + "time_per_iteration": 4.9859678745269775 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046014, + "balance_loss_mlp": 1.00512564, + "epoch": 0.6115813774528664, + "flos": 713486080512.0, + "grad_norm": 0.037712756689321836, + "language_loss": 0.81948996, + "learning_rate": 0.0003461434953300865, + "loss": 0.82995009, + "num_input_tokens_seen": 265450112, + "router_z_loss_mlp": 0.40893555, + "step": 3179, + "time_per_iteration": 2.9206619262695312 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046107, + "balance_loss_mlp": 1.0051471, + "epoch": 0.61177375913813, + "flos": 685690696704.0, + "grad_norm": 0.02737860550975636, + "language_loss": 0.81828141, + "learning_rate": 0.0003458470991817515, + "loss": 0.8287425, + "num_input_tokens_seen": 265534336, + "router_z_loss_mlp": 0.40966797, + "step": 3180, + "time_per_iteration": 3.0038623809814453 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046359, + "balance_loss_mlp": 1.00537503, + "epoch": 0.6119661408233936, + "flos": 512667995136.0, + "grad_norm": 0.03551722244255775, + "language_loss": 0.85187316, + "learning_rate": 0.0003455507628808802, + "loss": 0.86233675, + "num_input_tokens_seen": 265604480, + "router_z_loss_mlp": 0.40991211, + "step": 3181, + "time_per_iteration": 2.623522996902466 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048614, + "balance_loss_mlp": 1.0076772, + "epoch": 0.6121585225086572, + "flos": 557856633600.0, + "grad_norm": 0.04043393522454786, + "language_loss": 0.85139406, + "learning_rate": 0.00034525448654252076, + "loss": 0.86188018, + "num_input_tokens_seen": 265670848, + "router_z_loss_mlp": 0.40942383, + "step": 3182, + "time_per_iteration": 2.701493501663208 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053271, + "balance_loss_mlp": 1.0125016, + "epoch": 0.6123509041939207, + "flos": 562910427648.0, + "grad_norm": 0.044342295152579134, + "language_loss": 0.83549857, + "learning_rate": 0.0003449582702816976, + "loss": 0.84603125, + "num_input_tokens_seen": 265739584, + "router_z_loss_mlp": 0.40771484, + "step": 3183, + "time_per_iteration": 2.6956191062927246 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050719, + "balance_loss_mlp": 1.00980616, + "epoch": 0.6125432858791843, + "flos": 559131017472.0, + "grad_norm": 0.0337797622344846, + "language_loss": 0.833462, + "learning_rate": 0.0003446621142134122, + "loss": 0.84396923, + "num_input_tokens_seen": 265810368, + "router_z_loss_mlp": 0.40917969, + "step": 3184, + "time_per_iteration": 2.639289379119873 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049865, + "balance_loss_mlp": 1.0089761, + "epoch": 0.6127356675644479, + "flos": 415897411584.0, + "grad_norm": 0.038637283425345254, + "language_loss": 0.84757721, + "learning_rate": 0.0003443660184526424, + "loss": 0.85807586, + "num_input_tokens_seen": 265871616, + "router_z_loss_mlp": 0.40893555, + "step": 3185, + "time_per_iteration": 2.4257092475891113 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049417, + "balance_loss_mlp": 1.00855243, + "epoch": 0.6129280492497114, + "flos": 605034126336.0, + "grad_norm": 0.03183522344564459, + "language_loss": 0.86949629, + "learning_rate": 0.0003440699831143429, + "loss": 0.87999046, + "num_input_tokens_seen": 265946672, + "router_z_loss_mlp": 0.40869141, + "step": 3186, + "time_per_iteration": 2.775930404663086 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051794, + "balance_loss_mlp": 1.01092947, + "epoch": 0.613120430934975, + "flos": 520865463552.0, + "grad_norm": 0.03426856833524134, + "language_loss": 0.82819283, + "learning_rate": 0.0003437740083134449, + "loss": 0.83871073, + "num_input_tokens_seen": 266020640, + "router_z_loss_mlp": 0.40869141, + "step": 3187, + "time_per_iteration": 2.696072816848755 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049174, + "balance_loss_mlp": 1.00835705, + "epoch": 0.6133128126202385, + "flos": 512081836800.0, + "grad_norm": 0.03992475023697304, + "language_loss": 0.84158587, + "learning_rate": 0.00034347809416485574, + "loss": 0.8520776, + "num_input_tokens_seen": 266085776, + "router_z_loss_mlp": 0.40820312, + "step": 3188, + "time_per_iteration": 2.6222550868988037 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052376, + "balance_loss_mlp": 1.01158273, + "epoch": 0.6135051943055021, + "flos": 608757156096.0, + "grad_norm": 0.032577275408737616, + "language_loss": 0.82338852, + "learning_rate": 0.0003431822407834597, + "loss": 0.83391231, + "num_input_tokens_seen": 266157104, + "router_z_loss_mlp": 0.40795898, + "step": 3189, + "time_per_iteration": 2.818133592605591 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050392, + "balance_loss_mlp": 1.00959849, + "epoch": 0.6136975759907657, + "flos": 1162010072064.0, + "grad_norm": 0.04434341362834108, + "language_loss": 0.84634304, + "learning_rate": 0.00034288644828411706, + "loss": 0.85684693, + "num_input_tokens_seen": 266244144, + "router_z_loss_mlp": 0.40795898, + "step": 3190, + "time_per_iteration": 3.4801251888275146 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049956, + "balance_loss_mlp": 1.00911534, + "epoch": 0.6138899576760293, + "flos": 708173716992.0, + "grad_norm": 0.03680261410998276, + "language_loss": 0.76343262, + "learning_rate": 0.0003425907167816649, + "loss": 0.77393216, + "num_input_tokens_seen": 266319040, + "router_z_loss_mlp": 0.40844727, + "step": 3191, + "time_per_iteration": 2.859435558319092 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049983, + "balance_loss_mlp": 1.00914156, + "epoch": 0.6140823393612928, + "flos": 587619484416.0, + "grad_norm": 0.036153352426406216, + "language_loss": 0.85233247, + "learning_rate": 0.00034229504639091623, + "loss": 0.86283231, + "num_input_tokens_seen": 266390784, + "router_z_loss_mlp": 0.40844727, + "step": 3192, + "time_per_iteration": 2.7828218936920166 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047312, + "balance_loss_mlp": 1.00656581, + "epoch": 0.6142747210465563, + "flos": 805619887104.0, + "grad_norm": 0.035035162625632645, + "language_loss": 0.80565524, + "learning_rate": 0.0003419994372266606, + "loss": 0.81612837, + "num_input_tokens_seen": 266483216, + "router_z_loss_mlp": 0.4074707, + "step": 3193, + "time_per_iteration": 3.1529080867767334 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046771, + "balance_loss_mlp": 1.00593019, + "epoch": 0.6144671027318199, + "flos": 530545340928.0, + "grad_norm": 0.02881776150326524, + "language_loss": 0.82229221, + "learning_rate": 0.00034170388940366335, + "loss": 0.83275998, + "num_input_tokens_seen": 266557344, + "router_z_loss_mlp": 0.40844727, + "step": 3194, + "time_per_iteration": 2.733793258666992 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046878, + "balance_loss_mlp": 1.00598967, + "epoch": 0.6146594844170835, + "flos": 806913712896.0, + "grad_norm": 0.03443984664399312, + "language_loss": 0.8074832, + "learning_rate": 0.0003414084030366667, + "loss": 0.81795198, + "num_input_tokens_seen": 266639488, + "router_z_loss_mlp": 0.40893555, + "step": 3195, + "time_per_iteration": 3.1194753646850586 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049907, + "balance_loss_mlp": 1.00906587, + "epoch": 0.6148518661023471, + "flos": 502762596096.0, + "grad_norm": 0.03247725998101352, + "language_loss": 0.83429492, + "learning_rate": 0.0003411129782403883, + "loss": 0.84479403, + "num_input_tokens_seen": 266711168, + "router_z_loss_mlp": 0.40844727, + "step": 3196, + "time_per_iteration": 2.701995849609375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048743, + "balance_loss_mlp": 1.00785387, + "epoch": 0.6150442477876106, + "flos": 511699812864.0, + "grad_norm": 0.05177418573029483, + "language_loss": 0.85667449, + "learning_rate": 0.0003408176151295225, + "loss": 0.86716187, + "num_input_tokens_seen": 266777632, + "router_z_loss_mlp": 0.40893555, + "step": 3197, + "time_per_iteration": 2.6645357608795166 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046183, + "balance_loss_mlp": 1.0052464, + "epoch": 0.6152366294728742, + "flos": 527998518528.0, + "grad_norm": 0.03939493376677649, + "language_loss": 0.7823236, + "learning_rate": 0.00034052231381873944, + "loss": 0.79278541, + "num_input_tokens_seen": 266842880, + "router_z_loss_mlp": 0.40942383, + "step": 3198, + "time_per_iteration": 2.6415092945098877 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049656, + "balance_loss_mlp": 1.00881481, + "epoch": 0.6154290111581378, + "flos": 474282877440.0, + "grad_norm": 0.04031967856737408, + "language_loss": 0.85886127, + "learning_rate": 0.00034022707442268494, + "loss": 0.86935782, + "num_input_tokens_seen": 266909504, + "router_z_loss_mlp": 0.40844727, + "step": 3199, + "time_per_iteration": 2.5885183811187744 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050537, + "balance_loss_mlp": 1.00976777, + "epoch": 0.6156213928434013, + "flos": 551934779136.0, + "grad_norm": 0.028515598642512706, + "language_loss": 0.82251477, + "learning_rate": 0.0003399318970559813, + "loss": 0.83302015, + "num_input_tokens_seen": 266988880, + "router_z_loss_mlp": 0.40771484, + "step": 3200, + "time_per_iteration": 2.819209337234497 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050709, + "balance_loss_mlp": 1.00998724, + "epoch": 0.6158137745286649, + "flos": 752362092288.0, + "grad_norm": 0.030934752464501728, + "language_loss": 0.84934688, + "learning_rate": 0.00033963678183322656, + "loss": 0.85985398, + "num_input_tokens_seen": 267074512, + "router_z_loss_mlp": 0.40722656, + "step": 3201, + "time_per_iteration": 3.0306894779205322 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051067, + "balance_loss_mlp": 1.01027346, + "epoch": 0.6160061562139284, + "flos": 556905947904.0, + "grad_norm": 0.03121820045207164, + "language_loss": 0.83180207, + "learning_rate": 0.0003393417288689945, + "loss": 0.84231275, + "num_input_tokens_seen": 267147952, + "router_z_loss_mlp": 0.40795898, + "step": 3202, + "time_per_iteration": 2.748361587524414 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050686, + "balance_loss_mlp": 1.00989294, + "epoch": 0.616198537899192, + "flos": 743467650048.0, + "grad_norm": 0.04116101332214976, + "language_loss": 0.76590461, + "learning_rate": 0.00033904673827783504, + "loss": 0.77641141, + "num_input_tokens_seen": 267224368, + "router_z_loss_mlp": 0.40795898, + "step": 3203, + "time_per_iteration": 2.9209775924682617 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052852, + "balance_loss_mlp": 1.01193893, + "epoch": 0.6163909195844556, + "flos": 479775075840.0, + "grad_norm": 0.031654400686770015, + "language_loss": 0.82428539, + "learning_rate": 0.00033875181017427357, + "loss": 0.83481383, + "num_input_tokens_seen": 267292688, + "router_z_loss_mlp": 0.40917969, + "step": 3204, + "time_per_iteration": 2.6138155460357666 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104652, + "balance_loss_mlp": 1.00551248, + "epoch": 0.6165833012697192, + "flos": 532666397952.0, + "grad_norm": 0.03324868864618939, + "language_loss": 0.81742775, + "learning_rate": 0.00033845694467281133, + "loss": 0.82789296, + "num_input_tokens_seen": 267371888, + "router_z_loss_mlp": 0.41015625, + "step": 3205, + "time_per_iteration": 2.8665361404418945 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045816, + "balance_loss_mlp": 1.0049988, + "epoch": 0.6167756829549826, + "flos": 809295284736.0, + "grad_norm": 0.03418345099687322, + "language_loss": 0.83676243, + "learning_rate": 0.00033816214188792516, + "loss": 0.8472206, + "num_input_tokens_seen": 267458784, + "router_z_loss_mlp": 0.40820312, + "step": 3206, + "time_per_iteration": 3.176194190979004 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104581, + "balance_loss_mlp": 1.00504088, + "epoch": 0.6169680646402462, + "flos": 489910854144.0, + "grad_norm": 0.03420383958613512, + "language_loss": 0.8597641, + "learning_rate": 0.00033786740193406784, + "loss": 0.87022221, + "num_input_tokens_seen": 267528528, + "router_z_loss_mlp": 0.40771484, + "step": 3207, + "time_per_iteration": 2.60602068901062 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046077, + "balance_loss_mlp": 1.00528312, + "epoch": 0.6171604463255098, + "flos": 620204256768.0, + "grad_norm": 0.033645733240054064, + "language_loss": 0.81914175, + "learning_rate": 0.00033757272492566736, + "loss": 0.82960248, + "num_input_tokens_seen": 267611152, + "router_z_loss_mlp": 0.40795898, + "step": 3208, + "time_per_iteration": 2.929311990737915 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045887, + "balance_loss_mlp": 1.00502181, + "epoch": 0.6173528280107734, + "flos": 529895999232.0, + "grad_norm": 0.030436054236508022, + "language_loss": 0.87530887, + "learning_rate": 0.0003372781109771278, + "loss": 0.8857677, + "num_input_tokens_seen": 267681520, + "router_z_loss_mlp": 0.40869141, + "step": 3209, + "time_per_iteration": 2.725886821746826 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044721, + "balance_loss_mlp": 1.00390351, + "epoch": 0.617545209696037, + "flos": 597737766144.0, + "grad_norm": 0.031193081131094685, + "language_loss": 0.77093422, + "learning_rate": 0.0003369835602028281, + "loss": 0.78138143, + "num_input_tokens_seen": 267758768, + "router_z_loss_mlp": 0.40820312, + "step": 3210, + "time_per_iteration": 2.7928357124328613 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042714, + "balance_loss_mlp": 1.00196826, + "epoch": 0.6177375913813005, + "flos": 476106481152.0, + "grad_norm": 0.036241731553070825, + "language_loss": 0.80260098, + "learning_rate": 0.0003366890727171232, + "loss": 0.81302822, + "num_input_tokens_seen": 267831056, + "router_z_loss_mlp": 0.4074707, + "step": 3211, + "time_per_iteration": 2.688157558441162 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046468, + "balance_loss_mlp": 1.00565052, + "epoch": 0.617929973066564, + "flos": 530881678080.0, + "grad_norm": 0.03703049785450956, + "language_loss": 0.7920953, + "learning_rate": 0.00033639464863434313, + "loss": 0.80255997, + "num_input_tokens_seen": 267898416, + "router_z_loss_mlp": 0.40820312, + "step": 3212, + "time_per_iteration": 2.6376640796661377 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105003, + "balance_loss_mlp": 1.01045227, + "epoch": 0.6181223547518276, + "flos": 1422835026432.0, + "grad_norm": 0.010124003783497993, + "language_loss": 0.78442466, + "learning_rate": 0.00033610028806879363, + "loss": 0.79492497, + "num_input_tokens_seen": 268112864, + "router_z_loss_mlp": 0.39550781, + "step": 3213, + "time_per_iteration": 4.704723596572876 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047001, + "balance_loss_mlp": 1.00618351, + "epoch": 0.6183147364370912, + "flos": 741696536064.0, + "grad_norm": 0.03266398965494079, + "language_loss": 0.79975474, + "learning_rate": 0.00033580599113475543, + "loss": 0.81022477, + "num_input_tokens_seen": 268198368, + "router_z_loss_mlp": 0.40820312, + "step": 3214, + "time_per_iteration": 2.9861807823181152 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049402, + "balance_loss_mlp": 1.00875139, + "epoch": 0.6185071181223547, + "flos": 382483462656.0, + "grad_norm": 0.034946308334165094, + "language_loss": 0.86866862, + "learning_rate": 0.00033551175794648507, + "loss": 0.87916261, + "num_input_tokens_seen": 268260704, + "router_z_loss_mlp": 0.40649414, + "step": 3215, + "time_per_iteration": 2.462238311767578 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050777, + "balance_loss_mlp": 1.01005554, + "epoch": 0.6186994998076183, + "flos": 464305546752.0, + "grad_norm": 0.05487149837237803, + "language_loss": 0.82309055, + "learning_rate": 0.00033521758861821365, + "loss": 0.83359838, + "num_input_tokens_seen": 268328256, + "router_z_loss_mlp": 0.40722656, + "step": 3216, + "time_per_iteration": 2.6265599727630615 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048602, + "balance_loss_mlp": 1.00778484, + "epoch": 0.6188918814928819, + "flos": 486252953088.0, + "grad_norm": 0.035787768578127474, + "language_loss": 0.89356089, + "learning_rate": 0.0003349234832641479, + "loss": 0.90404689, + "num_input_tokens_seen": 268394016, + "router_z_loss_mlp": 0.40820312, + "step": 3217, + "time_per_iteration": 2.600252628326416 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105038, + "balance_loss_mlp": 1.00956285, + "epoch": 0.6190842631781455, + "flos": 658598122752.0, + "grad_norm": 0.04394177664040498, + "language_loss": 0.81214905, + "learning_rate": 0.00033462944199846975, + "loss": 0.82265282, + "num_input_tokens_seen": 268478512, + "router_z_loss_mlp": 0.40820312, + "step": 3218, + "time_per_iteration": 3.059032917022705 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048303, + "balance_loss_mlp": 1.00748599, + "epoch": 0.619276644863409, + "flos": 404467807488.0, + "grad_norm": 0.03662586595942604, + "language_loss": 0.87058449, + "learning_rate": 0.00033433546493533606, + "loss": 0.88106751, + "num_input_tokens_seen": 268540304, + "router_z_loss_mlp": 0.40820312, + "step": 3219, + "time_per_iteration": 2.464569091796875 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049492, + "balance_loss_mlp": 1.00876999, + "epoch": 0.6194690265486725, + "flos": 584241540096.0, + "grad_norm": 0.03704236392673744, + "language_loss": 0.8459326, + "learning_rate": 0.00033404155218887897, + "loss": 0.85642755, + "num_input_tokens_seen": 268611136, + "router_z_loss_mlp": 0.40722656, + "step": 3220, + "time_per_iteration": 2.717883825302124 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048251, + "balance_loss_mlp": 1.00745773, + "epoch": 0.6196614082339361, + "flos": 505385240832.0, + "grad_norm": 0.03422152158197648, + "language_loss": 0.87844843, + "learning_rate": 0.00033374770387320534, + "loss": 0.88893092, + "num_input_tokens_seen": 268684992, + "router_z_loss_mlp": 0.40795898, + "step": 3221, + "time_per_iteration": 2.7630932331085205 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050081, + "balance_loss_mlp": 1.00921607, + "epoch": 0.6198537899191997, + "flos": 576526217472.0, + "grad_norm": 0.03373583765668511, + "language_loss": 0.85412097, + "learning_rate": 0.00033345392010239737, + "loss": 0.86462182, + "num_input_tokens_seen": 268758096, + "router_z_loss_mlp": 0.40869141, + "step": 3222, + "time_per_iteration": 2.7410025596618652 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050416, + "balance_loss_mlp": 1.00952721, + "epoch": 0.6200461716044633, + "flos": 594303441408.0, + "grad_norm": 0.03547804945622036, + "language_loss": 0.82924426, + "learning_rate": 0.0003331602009905118, + "loss": 0.83974844, + "num_input_tokens_seen": 268834432, + "router_z_loss_mlp": 0.40893555, + "step": 3223, + "time_per_iteration": 2.8037710189819336 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051712, + "balance_loss_mlp": 1.01098979, + "epoch": 0.6202385532897268, + "flos": 667411885056.0, + "grad_norm": 0.03269956620721502, + "language_loss": 0.84572297, + "learning_rate": 0.00033286654665158085, + "loss": 0.85624015, + "num_input_tokens_seen": 268921168, + "router_z_loss_mlp": 0.40722656, + "step": 3224, + "time_per_iteration": 2.948554754257202 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050192, + "balance_loss_mlp": 1.00939882, + "epoch": 0.6204309349749904, + "flos": 485927309568.0, + "grad_norm": 0.03423910891288116, + "language_loss": 0.88386071, + "learning_rate": 0.0003325729571996109, + "loss": 0.89436263, + "num_input_tokens_seen": 268991440, + "router_z_loss_mlp": 0.40795898, + "step": 3225, + "time_per_iteration": 2.6549041271209717 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049912, + "balance_loss_mlp": 1.00914264, + "epoch": 0.6206233166602539, + "flos": 585218470656.0, + "grad_norm": 0.03260898019544377, + "language_loss": 0.84271944, + "learning_rate": 0.000332279432748584, + "loss": 0.85321862, + "num_input_tokens_seen": 269061024, + "router_z_loss_mlp": 0.40771484, + "step": 3226, + "time_per_iteration": 2.716174840927124 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048998, + "balance_loss_mlp": 1.00827563, + "epoch": 0.6208156983455175, + "flos": 477912588288.0, + "grad_norm": 0.031713525688758036, + "language_loss": 0.87778246, + "learning_rate": 0.00033198597341245576, + "loss": 0.88827246, + "num_input_tokens_seen": 269130560, + "router_z_loss_mlp": 0.40722656, + "step": 3227, + "time_per_iteration": 2.596343994140625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046541, + "balance_loss_mlp": 1.00591445, + "epoch": 0.6210080800307811, + "flos": 790469198592.0, + "grad_norm": 0.02931098854288103, + "language_loss": 0.82211602, + "learning_rate": 0.00033169257930515763, + "loss": 0.8325814, + "num_input_tokens_seen": 269213280, + "router_z_loss_mlp": 0.40625, + "step": 3228, + "time_per_iteration": 3.0495920181274414 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050916, + "balance_loss_mlp": 1.01036096, + "epoch": 0.6212004617160446, + "flos": 608917549056.0, + "grad_norm": 0.05193251609129224, + "language_loss": 0.83099496, + "learning_rate": 0.0003313992505405951, + "loss": 0.8415041, + "num_input_tokens_seen": 269286384, + "router_z_loss_mlp": 0.40551758, + "step": 3229, + "time_per_iteration": 2.7221577167510986 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049521, + "balance_loss_mlp": 1.00896585, + "epoch": 0.6213928434013082, + "flos": 587612681472.0, + "grad_norm": 0.04085502918766405, + "language_loss": 0.81571418, + "learning_rate": 0.0003311059872326487, + "loss": 0.82620943, + "num_input_tokens_seen": 269353296, + "router_z_loss_mlp": 0.40551758, + "step": 3230, + "time_per_iteration": 2.6938486099243164 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051014, + "balance_loss_mlp": 1.0103395, + "epoch": 0.6215852250865718, + "flos": 537109734144.0, + "grad_norm": 0.03319484231219387, + "language_loss": 0.79486078, + "learning_rate": 0.0003308127894951734, + "loss": 0.80537093, + "num_input_tokens_seen": 269422304, + "router_z_loss_mlp": 0.40673828, + "step": 3231, + "time_per_iteration": 2.6565897464752197 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047093, + "balance_loss_mlp": 1.00634694, + "epoch": 0.6217776067718354, + "flos": 619313842176.0, + "grad_norm": 0.044149605083951216, + "language_loss": 0.8665247, + "learning_rate": 0.00033051965744199834, + "loss": 0.87699568, + "num_input_tokens_seen": 269498784, + "router_z_loss_mlp": 0.4074707, + "step": 3232, + "time_per_iteration": 2.7405452728271484 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104641, + "balance_loss_mlp": 1.00575984, + "epoch": 0.6219699884570988, + "flos": 547100670720.0, + "grad_norm": 0.03240939524045973, + "language_loss": 0.90891719, + "learning_rate": 0.0003302265911869276, + "loss": 0.91938138, + "num_input_tokens_seen": 269581264, + "router_z_loss_mlp": 0.40649414, + "step": 3233, + "time_per_iteration": 2.9264018535614014 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048831, + "balance_loss_mlp": 1.00827634, + "epoch": 0.6221623701423624, + "flos": 482156647680.0, + "grad_norm": 0.04042837420673253, + "language_loss": 0.8472892, + "learning_rate": 0.0003299335908437397, + "loss": 0.85777748, + "num_input_tokens_seen": 269649408, + "router_z_loss_mlp": 0.40551758, + "step": 3234, + "time_per_iteration": 2.6122491359710693 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104817, + "balance_loss_mlp": 1.00751972, + "epoch": 0.622354751827626, + "flos": 380872741632.0, + "grad_norm": 0.045523891323386655, + "language_loss": 0.80743796, + "learning_rate": 0.0003296406565261873, + "loss": 0.81791961, + "num_input_tokens_seen": 269711648, + "router_z_loss_mlp": 0.40649414, + "step": 3235, + "time_per_iteration": 2.4912121295928955 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052853, + "balance_loss_mlp": 1.01241732, + "epoch": 0.6225471335128896, + "flos": 669072183552.0, + "grad_norm": 0.032252040846456206, + "language_loss": 0.85526693, + "learning_rate": 0.0003293477883479978, + "loss": 0.86579549, + "num_input_tokens_seen": 269787376, + "router_z_loss_mlp": 0.40429688, + "step": 3236, + "time_per_iteration": 2.8378734588623047 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049915, + "balance_loss_mlp": 1.00943148, + "epoch": 0.6227395151981532, + "flos": 772628791296.0, + "grad_norm": 0.03861340277154514, + "language_loss": 0.80045772, + "learning_rate": 0.0003290549864228727, + "loss": 0.81095684, + "num_input_tokens_seen": 269863008, + "router_z_loss_mlp": 0.40478516, + "step": 3237, + "time_per_iteration": 2.9996402263641357 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044656, + "balance_loss_mlp": 1.00410068, + "epoch": 0.6229318968834167, + "flos": 485358647808.0, + "grad_norm": 0.03163121059903129, + "language_loss": 0.87001842, + "learning_rate": 0.0003287622508644875, + "loss": 0.88046503, + "num_input_tokens_seen": 269939552, + "router_z_loss_mlp": 0.40551758, + "step": 3238, + "time_per_iteration": 2.8210766315460205 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051288, + "balance_loss_mlp": 1.01082802, + "epoch": 0.6231242785686802, + "flos": 463877836032.0, + "grad_norm": 0.03974001893419822, + "language_loss": 0.87119055, + "learning_rate": 0.0003284695817864923, + "loss": 0.88170344, + "num_input_tokens_seen": 270002752, + "router_z_loss_mlp": 0.40454102, + "step": 3239, + "time_per_iteration": 2.4931445121765137 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048406, + "balance_loss_mlp": 1.00773168, + "epoch": 0.6233166602539438, + "flos": 610211374848.0, + "grad_norm": 0.03997150810707431, + "language_loss": 0.84201944, + "learning_rate": 0.0003281769793025116, + "loss": 0.85250354, + "num_input_tokens_seen": 270075696, + "router_z_loss_mlp": 0.40673828, + "step": 3240, + "time_per_iteration": 2.71476674079895 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050596, + "balance_loss_mlp": 1.00999331, + "epoch": 0.6235090419392074, + "flos": 440115574272.0, + "grad_norm": 0.053967997241239116, + "language_loss": 0.9023276, + "learning_rate": 0.00032788444352614346, + "loss": 0.91283357, + "num_input_tokens_seen": 270139872, + "router_z_loss_mlp": 0.40600586, + "step": 3241, + "time_per_iteration": 2.5143325328826904 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048966, + "balance_loss_mlp": 1.00826836, + "epoch": 0.6237014236244709, + "flos": 505901412864.0, + "grad_norm": 0.03953535493242474, + "language_loss": 0.81586522, + "learning_rate": 0.0003275919745709606, + "loss": 0.82635486, + "num_input_tokens_seen": 270206752, + "router_z_loss_mlp": 0.40698242, + "step": 3242, + "time_per_iteration": 2.6041946411132812 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052265, + "balance_loss_mlp": 1.01171017, + "epoch": 0.6238938053097345, + "flos": 513996814080.0, + "grad_norm": 0.03348358487194809, + "language_loss": 0.82661837, + "learning_rate": 0.00032729957255050936, + "loss": 0.83714104, + "num_input_tokens_seen": 270275472, + "router_z_loss_mlp": 0.40551758, + "step": 3243, + "time_per_iteration": 2.6362357139587402 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053144, + "balance_loss_mlp": 1.01263702, + "epoch": 0.6240861869949981, + "flos": 738023083776.0, + "grad_norm": 0.04011709848771047, + "language_loss": 0.82433391, + "learning_rate": 0.0003270072375783102, + "loss": 0.83486533, + "num_input_tokens_seen": 270348336, + "router_z_loss_mlp": 0.4050293, + "step": 3244, + "time_per_iteration": 2.890136241912842 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048944, + "balance_loss_mlp": 1.00855565, + "epoch": 0.6242785686802617, + "flos": 495709254144.0, + "grad_norm": 0.03469894111823996, + "language_loss": 0.80177683, + "learning_rate": 0.00032671496976785774, + "loss": 0.81226623, + "num_input_tokens_seen": 270416496, + "router_z_loss_mlp": 0.40380859, + "step": 3245, + "time_per_iteration": 2.6587681770324707 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051304, + "balance_loss_mlp": 1.01091611, + "epoch": 0.6244709503655252, + "flos": 747234421248.0, + "grad_norm": 0.03291682412434118, + "language_loss": 0.76093823, + "learning_rate": 0.0003264227692326205, + "loss": 0.77145123, + "num_input_tokens_seen": 270501680, + "router_z_loss_mlp": 0.40380859, + "step": 3246, + "time_per_iteration": 3.0954296588897705 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050368, + "balance_loss_mlp": 1.00995624, + "epoch": 0.6246633320507887, + "flos": 493551258624.0, + "grad_norm": 0.036876384824843206, + "language_loss": 0.86561215, + "learning_rate": 0.00032613063608604055, + "loss": 0.8761158, + "num_input_tokens_seen": 270568656, + "router_z_loss_mlp": 0.40405273, + "step": 3247, + "time_per_iteration": 2.632049560546875 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043396, + "balance_loss_mlp": 1.00296032, + "epoch": 0.6248557137360523, + "flos": 518392518144.0, + "grad_norm": 0.03391504049871655, + "language_loss": 0.84063625, + "learning_rate": 0.0003258385704415343, + "loss": 0.85107023, + "num_input_tokens_seen": 270636160, + "router_z_loss_mlp": 0.40429688, + "step": 3248, + "time_per_iteration": 2.580336809158325 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043586, + "balance_loss_mlp": 1.00317442, + "epoch": 0.6250480954213159, + "flos": 520429004544.0, + "grad_norm": 0.028687824097281916, + "language_loss": 0.83734399, + "learning_rate": 0.0003255465724124915, + "loss": 0.84777981, + "num_input_tokens_seen": 270708816, + "router_z_loss_mlp": 0.40405273, + "step": 3249, + "time_per_iteration": 2.699963331222534 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046143, + "balance_loss_mlp": 1.00580287, + "epoch": 0.6252404771065795, + "flos": 517070502144.0, + "grad_norm": 0.03444404266219843, + "language_loss": 0.83187747, + "learning_rate": 0.00032525464211227587, + "loss": 0.84233886, + "num_input_tokens_seen": 270778016, + "router_z_loss_mlp": 0.40332031, + "step": 3250, + "time_per_iteration": 2.590261697769165 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045897, + "balance_loss_mlp": 1.0055331, + "epoch": 0.6254328587918431, + "flos": 577997932800.0, + "grad_norm": 0.03271100856558234, + "language_loss": 0.86164498, + "learning_rate": 0.0003249627796542249, + "loss": 0.87210405, + "num_input_tokens_seen": 270847072, + "router_z_loss_mlp": 0.40356445, + "step": 3251, + "time_per_iteration": 2.706554412841797 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046601, + "balance_loss_mlp": 1.006284, + "epoch": 0.6256252404771065, + "flos": 599105468928.0, + "grad_norm": 0.035746905542485746, + "language_loss": 0.84805512, + "learning_rate": 0.00032467098515164943, + "loss": 0.8585211, + "num_input_tokens_seen": 270926320, + "router_z_loss_mlp": 0.40307617, + "step": 3252, + "time_per_iteration": 2.870948076248169 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044339, + "balance_loss_mlp": 1.00411773, + "epoch": 0.6258176221623701, + "flos": 509361982464.0, + "grad_norm": 0.036795712439313615, + "language_loss": 0.84738171, + "learning_rate": 0.00032437925871783456, + "loss": 0.85782516, + "num_input_tokens_seen": 270997904, + "router_z_loss_mlp": 0.40209961, + "step": 3253, + "time_per_iteration": 2.6761369705200195 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104486, + "balance_loss_mlp": 1.00468659, + "epoch": 0.6260100038476337, + "flos": 640805347584.0, + "grad_norm": 0.03851108593477808, + "language_loss": 0.85338682, + "learning_rate": 0.00032408760046603803, + "loss": 0.86383539, + "num_input_tokens_seen": 271074256, + "router_z_loss_mlp": 0.40161133, + "step": 3254, + "time_per_iteration": 2.8586931228637695 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043618, + "balance_loss_mlp": 1.00344408, + "epoch": 0.6262023855328973, + "flos": 842452609536.0, + "grad_norm": 0.03391057824911436, + "language_loss": 0.78393734, + "learning_rate": 0.00032379601050949193, + "loss": 0.79437345, + "num_input_tokens_seen": 271155152, + "router_z_loss_mlp": 0.40161133, + "step": 3255, + "time_per_iteration": 3.0973715782165527 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046535, + "balance_loss_mlp": 1.00629032, + "epoch": 0.6263947672181608, + "flos": 523157607168.0, + "grad_norm": 0.03422589562212714, + "language_loss": 0.8863821, + "learning_rate": 0.0003235044889614013, + "loss": 0.89684743, + "num_input_tokens_seen": 271224784, + "router_z_loss_mlp": 0.40234375, + "step": 3256, + "time_per_iteration": 2.643688917160034 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046154, + "balance_loss_mlp": 1.00593293, + "epoch": 0.6265871489034244, + "flos": 608290561536.0, + "grad_norm": 0.06509285278700487, + "language_loss": 0.84065372, + "learning_rate": 0.0003232130359349451, + "loss": 0.85111523, + "num_input_tokens_seen": 271303584, + "router_z_loss_mlp": 0.40209961, + "step": 3257, + "time_per_iteration": 2.859252452850342 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047074, + "balance_loss_mlp": 1.00690067, + "epoch": 0.626779530588688, + "flos": 589594732800.0, + "grad_norm": 0.03191133097735202, + "language_loss": 0.82224607, + "learning_rate": 0.0003229216515432751, + "loss": 0.83271682, + "num_input_tokens_seen": 271379632, + "router_z_loss_mlp": 0.40161133, + "step": 3258, + "time_per_iteration": 2.7475619316101074 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046523, + "balance_loss_mlp": 1.00625372, + "epoch": 0.6269719122739515, + "flos": 439538164224.0, + "grad_norm": 0.04023600043450841, + "language_loss": 0.80242079, + "learning_rate": 0.0003226303358995174, + "loss": 0.81288606, + "num_input_tokens_seen": 271447808, + "router_z_loss_mlp": 0.40258789, + "step": 3259, + "time_per_iteration": 2.5837466716766357 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104682, + "balance_loss_mlp": 1.00647962, + "epoch": 0.6271642939592151, + "flos": 564015670272.0, + "grad_norm": 0.027274694738231114, + "language_loss": 0.88901317, + "learning_rate": 0.00032233908911677, + "loss": 0.89948136, + "num_input_tokens_seen": 271526768, + "router_z_loss_mlp": 0.40332031, + "step": 3260, + "time_per_iteration": 2.825246810913086 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044855, + "balance_loss_mlp": 1.00465786, + "epoch": 0.6273566756444786, + "flos": 515653221888.0, + "grad_norm": 0.03753718779185775, + "language_loss": 0.81557947, + "learning_rate": 0.0003220479113081053, + "loss": 0.82602805, + "num_input_tokens_seen": 271597840, + "router_z_loss_mlp": 0.40185547, + "step": 3261, + "time_per_iteration": 2.7426939010620117 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046001, + "balance_loss_mlp": 1.00566065, + "epoch": 0.6275490573297422, + "flos": 586588118784.0, + "grad_norm": 0.04387524863401932, + "language_loss": 0.79368806, + "learning_rate": 0.00032175680258656836, + "loss": 0.80414808, + "num_input_tokens_seen": 271668352, + "router_z_loss_mlp": 0.40332031, + "step": 3262, + "time_per_iteration": 2.704888343811035 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047409, + "balance_loss_mlp": 1.007092, + "epoch": 0.6277414390150058, + "flos": 560544407040.0, + "grad_norm": 0.03394703934758085, + "language_loss": 0.80846763, + "learning_rate": 0.00032146576306517794, + "loss": 0.81894171, + "num_input_tokens_seen": 271743936, + "router_z_loss_mlp": 0.40307617, + "step": 3263, + "time_per_iteration": 2.744232654571533 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045775, + "balance_loss_mlp": 1.00529134, + "epoch": 0.6279338207002694, + "flos": 613841085696.0, + "grad_norm": 0.03564897241316152, + "language_loss": 0.81241357, + "learning_rate": 0.0003211747928569255, + "loss": 0.82287127, + "num_input_tokens_seen": 271817008, + "router_z_loss_mlp": 0.40478516, + "step": 3264, + "time_per_iteration": 2.7210609912872314 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047934, + "balance_loss_mlp": 1.00754583, + "epoch": 0.6281262023855329, + "flos": 626933900544.0, + "grad_norm": 0.03587918693245657, + "language_loss": 0.81859601, + "learning_rate": 0.0003208838920747754, + "loss": 0.82907528, + "num_input_tokens_seen": 271896960, + "router_z_loss_mlp": 0.40380859, + "step": 3265, + "time_per_iteration": 2.828963041305542 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044178, + "balance_loss_mlp": 1.00379026, + "epoch": 0.6283185840707964, + "flos": 1125420367872.0, + "grad_norm": 0.03507856752255015, + "language_loss": 0.77222586, + "learning_rate": 0.0003205930608316656, + "loss": 0.78266764, + "num_input_tokens_seen": 271985008, + "router_z_loss_mlp": 0.40380859, + "step": 3266, + "time_per_iteration": 3.4536292552948 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104288, + "balance_loss_mlp": 1.00251615, + "epoch": 0.62851096575606, + "flos": 516332699136.0, + "grad_norm": 0.05679261767260983, + "language_loss": 0.85571408, + "learning_rate": 0.00032030229924050673, + "loss": 0.86614287, + "num_input_tokens_seen": 272056368, + "router_z_loss_mlp": 0.40356445, + "step": 3267, + "time_per_iteration": 2.669522285461426 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048611, + "balance_loss_mlp": 1.00815153, + "epoch": 0.6287033474413236, + "flos": 405062714112.0, + "grad_norm": 0.035560546659782886, + "language_loss": 0.80196536, + "learning_rate": 0.00032001160741418247, + "loss": 0.81245148, + "num_input_tokens_seen": 272123424, + "router_z_loss_mlp": 0.40454102, + "step": 3268, + "time_per_iteration": 2.6049489974975586 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044655, + "balance_loss_mlp": 1.00421953, + "epoch": 0.6288957291265872, + "flos": 526759127808.0, + "grad_norm": 0.05710921395997567, + "language_loss": 0.8274591, + "learning_rate": 0.0003197209854655494, + "loss": 0.83790565, + "num_input_tokens_seen": 272193008, + "router_z_loss_mlp": 0.40429688, + "step": 3269, + "time_per_iteration": 2.6551384925842285 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043687, + "balance_loss_mlp": 1.00313175, + "epoch": 0.6290881108118507, + "flos": 604958304000.0, + "grad_norm": 0.03774804220071916, + "language_loss": 0.75090307, + "learning_rate": 0.0003194304335074371, + "loss": 0.7613399, + "num_input_tokens_seen": 272275328, + "router_z_loss_mlp": 0.40551758, + "step": 3270, + "time_per_iteration": 2.851900577545166 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049729, + "balance_loss_mlp": 1.0093174, + "epoch": 0.6292804924971143, + "flos": 438598172160.0, + "grad_norm": 0.03683695296075174, + "language_loss": 0.89063656, + "learning_rate": 0.0003191399516526475, + "loss": 0.90113389, + "num_input_tokens_seen": 272339328, + "router_z_loss_mlp": 0.40405273, + "step": 3271, + "time_per_iteration": 2.5034451484680176 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045325, + "balance_loss_mlp": 1.00488937, + "epoch": 0.6294728741823779, + "flos": 607845354240.0, + "grad_norm": 0.03066213341534494, + "language_loss": 0.79802763, + "learning_rate": 0.0003188495400139559, + "loss": 0.80848092, + "num_input_tokens_seen": 272416336, + "router_z_loss_mlp": 0.40429688, + "step": 3272, + "time_per_iteration": 2.780644178390503 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045156, + "balance_loss_mlp": 1.00486362, + "epoch": 0.6296652558676414, + "flos": 702774837504.0, + "grad_norm": 0.038362375592622004, + "language_loss": 0.85288656, + "learning_rate": 0.00031855919870411013, + "loss": 0.86333817, + "num_input_tokens_seen": 272490368, + "router_z_loss_mlp": 0.40283203, + "step": 3273, + "time_per_iteration": 2.8482918739318848 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048015, + "balance_loss_mlp": 1.00769854, + "epoch": 0.6298576375529049, + "flos": 524944272384.0, + "grad_norm": 0.03395775035270535, + "language_loss": 0.85278755, + "learning_rate": 0.0003182689278358305, + "loss": 0.86326772, + "num_input_tokens_seen": 272562992, + "router_z_loss_mlp": 0.40307617, + "step": 3274, + "time_per_iteration": 2.7457242012023926 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046085, + "balance_loss_mlp": 1.00567281, + "epoch": 0.6300500192381685, + "flos": 476926909440.0, + "grad_norm": 0.036436552387549975, + "language_loss": 0.80145383, + "learning_rate": 0.0003179787275218105, + "loss": 0.81191462, + "num_input_tokens_seen": 272629456, + "router_z_loss_mlp": 0.40405273, + "step": 3275, + "time_per_iteration": 2.567723274230957 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044022, + "balance_loss_mlp": 1.00372946, + "epoch": 0.6302424009234321, + "flos": 521891971584.0, + "grad_norm": 0.03333768301867296, + "language_loss": 0.84862459, + "learning_rate": 0.0003176885978747155, + "loss": 0.85906482, + "num_input_tokens_seen": 272697440, + "router_z_loss_mlp": 0.40283203, + "step": 3276, + "time_per_iteration": 2.6513776779174805 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046077, + "balance_loss_mlp": 1.00587988, + "epoch": 0.6304347826086957, + "flos": 695858555904.0, + "grad_norm": 0.03467401587057451, + "language_loss": 0.83325267, + "learning_rate": 0.0003173985390071839, + "loss": 0.84371352, + "num_input_tokens_seen": 272774080, + "router_z_loss_mlp": 0.40185547, + "step": 3277, + "time_per_iteration": 2.876150131225586 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054981, + "balance_loss_mlp": 1.01578522, + "epoch": 0.6306271642939593, + "flos": 1470032928000.0, + "grad_norm": 0.010139969116537896, + "language_loss": 0.77900457, + "learning_rate": 0.00031710855103182675, + "loss": 0.78955436, + "num_input_tokens_seen": 272998512, + "router_z_loss_mlp": 0.39160156, + "step": 3278, + "time_per_iteration": 4.770167589187622 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045902, + "balance_loss_mlp": 1.00548971, + "epoch": 0.6308195459792227, + "flos": 602930565888.0, + "grad_norm": 0.03526553994141675, + "language_loss": 0.81487232, + "learning_rate": 0.00031681863406122704, + "loss": 0.82533133, + "num_input_tokens_seen": 273074672, + "router_z_loss_mlp": 0.40405273, + "step": 3279, + "time_per_iteration": 2.7587971687316895 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043917, + "balance_loss_mlp": 1.0036478, + "epoch": 0.6310119276644863, + "flos": 728237248512.0, + "grad_norm": 0.034493081934242914, + "language_loss": 0.85473228, + "learning_rate": 0.00031652878820794087, + "loss": 0.86517143, + "num_input_tokens_seen": 273157904, + "router_z_loss_mlp": 0.40258789, + "step": 3280, + "time_per_iteration": 2.9854700565338135 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045188, + "balance_loss_mlp": 1.00484729, + "epoch": 0.6312043093497499, + "flos": 520819776768.0, + "grad_norm": 0.037869406847462164, + "language_loss": 0.8647517, + "learning_rate": 0.00031623901358449627, + "loss": 0.87520361, + "num_input_tokens_seen": 273228160, + "router_z_loss_mlp": 0.40332031, + "step": 3281, + "time_per_iteration": 2.626267910003662 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044292, + "balance_loss_mlp": 1.00399899, + "epoch": 0.6313966910350135, + "flos": 532223136000.0, + "grad_norm": 0.03407480500665165, + "language_loss": 0.88792193, + "learning_rate": 0.0003159493103033936, + "loss": 0.89836484, + "num_input_tokens_seen": 273295872, + "router_z_loss_mlp": 0.40283203, + "step": 3282, + "time_per_iteration": 2.574249505996704 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046448, + "balance_loss_mlp": 1.00734711, + "epoch": 0.631589072720277, + "flos": 1382996656896.0, + "grad_norm": 0.01146599852639075, + "language_loss": 0.79919052, + "learning_rate": 0.00031565967847710564, + "loss": 0.80965501, + "num_input_tokens_seen": 273524320, + "router_z_loss_mlp": 0.390625, + "step": 3283, + "time_per_iteration": 4.8656487464904785 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047306, + "balance_loss_mlp": 1.00684595, + "epoch": 0.6317814544055406, + "flos": 625874344704.0, + "grad_norm": 0.030628800549983924, + "language_loss": 0.83010268, + "learning_rate": 0.0003153701182180776, + "loss": 0.84057581, + "num_input_tokens_seen": 273598544, + "router_z_loss_mlp": 0.40454102, + "step": 3284, + "time_per_iteration": 2.803232431411743 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047972, + "balance_loss_mlp": 1.00751245, + "epoch": 0.6319738360908042, + "flos": 499097892096.0, + "grad_norm": 0.036572578748274465, + "language_loss": 0.82564306, + "learning_rate": 0.00031508062963872655, + "loss": 0.83612275, + "num_input_tokens_seen": 273666000, + "router_z_loss_mlp": 0.40454102, + "step": 3285, + "time_per_iteration": 2.5559017658233643 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046554, + "balance_loss_mlp": 1.00602329, + "epoch": 0.6321662177760677, + "flos": 580909282560.0, + "grad_norm": 0.041327466784405305, + "language_loss": 0.80268341, + "learning_rate": 0.0003147912128514423, + "loss": 0.81314898, + "num_input_tokens_seen": 273742672, + "router_z_loss_mlp": 0.40527344, + "step": 3286, + "time_per_iteration": 2.7093169689178467 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044263, + "balance_loss_mlp": 1.00380301, + "epoch": 0.6323585994613313, + "flos": 602606867712.0, + "grad_norm": 0.0363944042801657, + "language_loss": 0.87847489, + "learning_rate": 0.0003145018679685859, + "loss": 0.88891751, + "num_input_tokens_seen": 273813984, + "router_z_loss_mlp": 0.40454102, + "step": 3287, + "time_per_iteration": 2.741680145263672 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047425, + "balance_loss_mlp": 1.00691795, + "epoch": 0.6325509811465948, + "flos": 529633539072.0, + "grad_norm": 0.02715728015284293, + "language_loss": 0.88303924, + "learning_rate": 0.00031421259510249134, + "loss": 0.89351344, + "num_input_tokens_seen": 273892848, + "router_z_loss_mlp": 0.4050293, + "step": 3288, + "time_per_iteration": 2.793593406677246 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050359, + "balance_loss_mlp": 1.00975657, + "epoch": 0.6327433628318584, + "flos": 575345152512.0, + "grad_norm": 0.03790719604682011, + "language_loss": 0.8176173, + "learning_rate": 0.00031392339436546414, + "loss": 0.82812083, + "num_input_tokens_seen": 273971696, + "router_z_loss_mlp": 0.40600586, + "step": 3289, + "time_per_iteration": 2.806328773498535 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105028, + "balance_loss_mlp": 1.00960624, + "epoch": 0.632935744517122, + "flos": 518112561408.0, + "grad_norm": 0.04130029787255878, + "language_loss": 0.84016752, + "learning_rate": 0.00031363426586978205, + "loss": 0.85067028, + "num_input_tokens_seen": 274048096, + "router_z_loss_mlp": 0.40673828, + "step": 3290, + "time_per_iteration": 2.815406322479248 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104727, + "balance_loss_mlp": 1.00676227, + "epoch": 0.6331281262023856, + "flos": 618597426432.0, + "grad_norm": 0.031083560389852355, + "language_loss": 0.85119176, + "learning_rate": 0.0003133452097276947, + "loss": 0.86166441, + "num_input_tokens_seen": 274122848, + "router_z_loss_mlp": 0.4050293, + "step": 3291, + "time_per_iteration": 2.7325408458709717 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104523, + "balance_loss_mlp": 1.00465119, + "epoch": 0.633320507887649, + "flos": 594116803584.0, + "grad_norm": 0.03244834687463976, + "language_loss": 0.84650022, + "learning_rate": 0.0003130562260514238, + "loss": 0.85695255, + "num_input_tokens_seen": 274198320, + "router_z_loss_mlp": 0.40576172, + "step": 3292, + "time_per_iteration": 2.7858352661132812 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046449, + "balance_loss_mlp": 1.00582266, + "epoch": 0.6335128895729126, + "flos": 583496934144.0, + "grad_norm": 0.03053589669397976, + "language_loss": 0.8217054, + "learning_rate": 0.0003127673149531626, + "loss": 0.83216989, + "num_input_tokens_seen": 274274944, + "router_z_loss_mlp": 0.40625, + "step": 3293, + "time_per_iteration": 2.755866050720215 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045667, + "balance_loss_mlp": 1.00506401, + "epoch": 0.6337052712581762, + "flos": 453974382336.0, + "grad_norm": 0.03437959175785583, + "language_loss": 0.83448106, + "learning_rate": 0.0003124784765450762, + "loss": 0.84493768, + "num_input_tokens_seen": 274342384, + "router_z_loss_mlp": 0.40600586, + "step": 3294, + "time_per_iteration": 2.555196762084961 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045908, + "balance_loss_mlp": 1.00535333, + "epoch": 0.6338976529434398, + "flos": 574515975936.0, + "grad_norm": 0.03647562664134654, + "language_loss": 0.810781, + "learning_rate": 0.0003121897109393017, + "loss": 0.82124007, + "num_input_tokens_seen": 274417568, + "router_z_loss_mlp": 0.40551758, + "step": 3295, + "time_per_iteration": 2.726447582244873 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044921, + "balance_loss_mlp": 1.00441372, + "epoch": 0.6340900346287034, + "flos": 509809135104.0, + "grad_norm": 0.0325303094953836, + "language_loss": 0.89509195, + "learning_rate": 0.0003119010182479481, + "loss": 0.90554118, + "num_input_tokens_seen": 274488960, + "router_z_loss_mlp": 0.4050293, + "step": 3296, + "time_per_iteration": 2.6128556728363037 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044733, + "balance_loss_mlp": 1.00422609, + "epoch": 0.6342824163139669, + "flos": 480715067904.0, + "grad_norm": 0.036682379732438104, + "language_loss": 0.8339026, + "learning_rate": 0.00031161239858309563, + "loss": 0.84434992, + "num_input_tokens_seen": 274556880, + "router_z_loss_mlp": 0.4050293, + "step": 3297, + "time_per_iteration": 2.571183443069458 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043714, + "balance_loss_mlp": 1.00323093, + "epoch": 0.6344747979992305, + "flos": 573111334656.0, + "grad_norm": 0.03822576874130642, + "language_loss": 0.83954668, + "learning_rate": 0.0003113238520567964, + "loss": 0.84998387, + "num_input_tokens_seen": 274624944, + "router_z_loss_mlp": 0.40478516, + "step": 3298, + "time_per_iteration": 2.677607297897339 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01041846, + "balance_loss_mlp": 1.00143397, + "epoch": 0.634667179684494, + "flos": 607046313216.0, + "grad_norm": 0.03748382415323519, + "language_loss": 0.818299, + "learning_rate": 0.00031103537878107403, + "loss": 0.82871747, + "num_input_tokens_seen": 274695152, + "router_z_loss_mlp": 0.40405273, + "step": 3299, + "time_per_iteration": 2.731858730316162 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01041231, + "balance_loss_mlp": 1.0007478, + "epoch": 0.6348595613697576, + "flos": 648129897984.0, + "grad_norm": 0.036818455755728355, + "language_loss": 0.80712759, + "learning_rate": 0.0003107469788679238, + "loss": 0.81753987, + "num_input_tokens_seen": 274767840, + "router_z_loss_mlp": 0.40478516, + "step": 3300, + "time_per_iteration": 2.811863660812378 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01041324, + "balance_loss_mlp": 1.00088787, + "epoch": 0.6350519430550212, + "flos": 640273624320.0, + "grad_norm": 0.03493243312285999, + "language_loss": 0.872877, + "learning_rate": 0.00031045865242931267, + "loss": 0.88329029, + "num_input_tokens_seen": 274839312, + "router_z_loss_mlp": 0.40429688, + "step": 3301, + "time_per_iteration": 2.7718210220336914 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042406, + "balance_loss_mlp": 1.00206506, + "epoch": 0.6352443247402847, + "flos": 687831195648.0, + "grad_norm": 0.031178821676135258, + "language_loss": 0.83354819, + "learning_rate": 0.00031017039957717877, + "loss": 0.84397227, + "num_input_tokens_seen": 274922704, + "router_z_loss_mlp": 0.40332031, + "step": 3302, + "time_per_iteration": 3.0323870182037354 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050725, + "balance_loss_mlp": 1.01028883, + "epoch": 0.6354367064255483, + "flos": 560526910464.0, + "grad_norm": 0.03426704048429257, + "language_loss": 0.89209497, + "learning_rate": 0.0003098822204234318, + "loss": 0.9026022, + "num_input_tokens_seen": 274992848, + "router_z_loss_mlp": 0.40429688, + "step": 3303, + "time_per_iteration": 2.688183069229126 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048749, + "balance_loss_mlp": 1.00831378, + "epoch": 0.6356290881108119, + "flos": 981062077440.0, + "grad_norm": 0.05617774198225317, + "language_loss": 0.88024724, + "learning_rate": 0.00030959411507995273, + "loss": 0.89073473, + "num_input_tokens_seen": 275071456, + "router_z_loss_mlp": 0.40429688, + "step": 3304, + "time_per_iteration": 3.2071332931518555 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050985, + "balance_loss_mlp": 1.01050138, + "epoch": 0.6358214697960755, + "flos": 529373024256.0, + "grad_norm": 0.04089277764533041, + "language_loss": 0.81679875, + "learning_rate": 0.00030930608365859407, + "loss": 0.82730865, + "num_input_tokens_seen": 275140512, + "router_z_loss_mlp": 0.40478516, + "step": 3305, + "time_per_iteration": 2.6791036128997803 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052235, + "balance_loss_mlp": 1.01184678, + "epoch": 0.6360138514813389, + "flos": 517869543168.0, + "grad_norm": 0.03251934179180288, + "language_loss": 0.88227487, + "learning_rate": 0.00030901812627117943, + "loss": 0.89279723, + "num_input_tokens_seen": 275210896, + "router_z_loss_mlp": 0.40380859, + "step": 3306, + "time_per_iteration": 2.643564462661743 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047687, + "balance_loss_mlp": 1.00720358, + "epoch": 0.6362062331666025, + "flos": 467470608384.0, + "grad_norm": 0.0425448547397637, + "language_loss": 0.85627687, + "learning_rate": 0.000308730243029504, + "loss": 0.8667537, + "num_input_tokens_seen": 275279888, + "router_z_loss_mlp": 0.40478516, + "step": 3307, + "time_per_iteration": 2.5909810066223145 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049053, + "balance_loss_mlp": 1.00854588, + "epoch": 0.6363986148518661, + "flos": 550773156096.0, + "grad_norm": 0.03484330169343757, + "language_loss": 0.80282146, + "learning_rate": 0.0003084424340453339, + "loss": 0.81331193, + "num_input_tokens_seen": 275357056, + "router_z_loss_mlp": 0.4050293, + "step": 3308, + "time_per_iteration": 2.84796142578125 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048791, + "balance_loss_mlp": 1.00830781, + "epoch": 0.6365909965371297, + "flos": 584158914816.0, + "grad_norm": 0.03632736574425893, + "language_loss": 0.82740968, + "learning_rate": 0.0003081546994304064, + "loss": 0.83789754, + "num_input_tokens_seen": 275428240, + "router_z_loss_mlp": 0.40478516, + "step": 3309, + "time_per_iteration": 2.7956221103668213 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105091, + "balance_loss_mlp": 1.01052189, + "epoch": 0.6367833782223933, + "flos": 532288264704.0, + "grad_norm": 0.03383722740926899, + "language_loss": 0.83152783, + "learning_rate": 0.0003078670392964298, + "loss": 0.8420369, + "num_input_tokens_seen": 275497568, + "router_z_loss_mlp": 0.40380859, + "step": 3310, + "time_per_iteration": 2.6194021701812744 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049103, + "balance_loss_mlp": 1.00883412, + "epoch": 0.6369757599076568, + "flos": 570588811776.0, + "grad_norm": 0.03520180951361345, + "language_loss": 0.83487624, + "learning_rate": 0.00030757945375508406, + "loss": 0.84536731, + "num_input_tokens_seen": 275569616, + "router_z_loss_mlp": 0.40258789, + "step": 3311, + "time_per_iteration": 2.636317729949951 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046751, + "balance_loss_mlp": 1.00614858, + "epoch": 0.6371681415929203, + "flos": 541054394880.0, + "grad_norm": 0.03810911352031966, + "language_loss": 0.81548536, + "learning_rate": 0.00030729194291801944, + "loss": 0.82595289, + "num_input_tokens_seen": 275641408, + "router_z_loss_mlp": 0.40600586, + "step": 3312, + "time_per_iteration": 2.6490793228149414 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045052, + "balance_loss_mlp": 1.00452065, + "epoch": 0.6373605232781839, + "flos": 484531416576.0, + "grad_norm": 0.03667535496624994, + "language_loss": 0.77428758, + "learning_rate": 0.00030700450689685787, + "loss": 0.78473806, + "num_input_tokens_seen": 275706608, + "router_z_loss_mlp": 0.40527344, + "step": 3313, + "time_per_iteration": 2.535402774810791 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046144, + "balance_loss_mlp": 1.00575566, + "epoch": 0.6375529049634475, + "flos": 579817645824.0, + "grad_norm": 0.03891693330572632, + "language_loss": 0.85701913, + "learning_rate": 0.00030671714580319186, + "loss": 0.86748058, + "num_input_tokens_seen": 275785952, + "router_z_loss_mlp": 0.40380859, + "step": 3314, + "time_per_iteration": 2.8058876991271973 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044916, + "balance_loss_mlp": 1.00433683, + "epoch": 0.637745286648711, + "flos": 683480211456.0, + "grad_norm": 0.11702238081113171, + "language_loss": 0.83888423, + "learning_rate": 0.0003064298597485846, + "loss": 0.84933341, + "num_input_tokens_seen": 275866240, + "router_z_loss_mlp": 0.40576172, + "step": 3315, + "time_per_iteration": 2.8778491020202637 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045329, + "balance_loss_mlp": 1.00489366, + "epoch": 0.6379376683339746, + "flos": 505649646336.0, + "grad_norm": 0.05211428291246213, + "language_loss": 0.84419525, + "learning_rate": 0.00030614264884457054, + "loss": 0.85464859, + "num_input_tokens_seen": 275936176, + "router_z_loss_mlp": 0.40429688, + "step": 3316, + "time_per_iteration": 2.624901533126831 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050301, + "balance_loss_mlp": 1.00977015, + "epoch": 0.6381300500192382, + "flos": 503025056256.0, + "grad_norm": 0.0426813784455398, + "language_loss": 0.77854991, + "learning_rate": 0.000305855513202655, + "loss": 0.7890529, + "num_input_tokens_seen": 276004608, + "router_z_loss_mlp": 0.40527344, + "step": 3317, + "time_per_iteration": 2.5690500736236572 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048316, + "balance_loss_mlp": 1.0077374, + "epoch": 0.6383224317045018, + "flos": 401367874560.0, + "grad_norm": 0.04267134147869369, + "language_loss": 0.78333461, + "learning_rate": 0.0003055684529343138, + "loss": 0.79381788, + "num_input_tokens_seen": 276066688, + "router_z_loss_mlp": 0.40576172, + "step": 3318, + "time_per_iteration": 2.4513895511627197 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054177, + "balance_loss_mlp": 1.01378846, + "epoch": 0.6385148133897653, + "flos": 500363527680.0, + "grad_norm": 0.0362987336754338, + "language_loss": 0.78882575, + "learning_rate": 0.00030528146815099374, + "loss": 0.79936755, + "num_input_tokens_seen": 276140000, + "router_z_loss_mlp": 0.40380859, + "step": 3319, + "time_per_iteration": 2.6613929271698 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01058923, + "balance_loss_mlp": 1.01851058, + "epoch": 0.6387071950750288, + "flos": 528695492352.0, + "grad_norm": 0.033070910188452485, + "language_loss": 0.72438365, + "learning_rate": 0.00030499455896411203, + "loss": 0.73497283, + "num_input_tokens_seen": 276209840, + "router_z_loss_mlp": 0.40405273, + "step": 3320, + "time_per_iteration": 2.641817092895508 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062973, + "balance_loss_mlp": 1.02330017, + "epoch": 0.6388995767602924, + "flos": 1459106856960.0, + "grad_norm": 0.013037560040261834, + "language_loss": 0.76300812, + "learning_rate": 0.0003047077254850568, + "loss": 0.77363789, + "num_input_tokens_seen": 276444784, + "router_z_loss_mlp": 0.39648438, + "step": 3321, + "time_per_iteration": 4.960562705993652 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048303, + "balance_loss_mlp": 1.00777197, + "epoch": 0.639091958445556, + "flos": 605171186688.0, + "grad_norm": 0.03633146914450565, + "language_loss": 0.77279496, + "learning_rate": 0.0003044209678251865, + "loss": 0.78327799, + "num_input_tokens_seen": 276522768, + "router_z_loss_mlp": 0.40527344, + "step": 3322, + "time_per_iteration": 2.875474691390991 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048137, + "balance_loss_mlp": 1.00762939, + "epoch": 0.6392843401308196, + "flos": 585665623296.0, + "grad_norm": 0.031694233880752425, + "language_loss": 0.85324746, + "learning_rate": 0.0003041342860958306, + "loss": 0.86372876, + "num_input_tokens_seen": 276597104, + "router_z_loss_mlp": 0.4050293, + "step": 3323, + "time_per_iteration": 2.7719669342041016 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049921, + "balance_loss_mlp": 1.00939035, + "epoch": 0.6394767218160831, + "flos": 515729044224.0, + "grad_norm": 0.03911936056883103, + "language_loss": 0.91999781, + "learning_rate": 0.00030384768040828857, + "loss": 0.93049705, + "num_input_tokens_seen": 276670256, + "router_z_loss_mlp": 0.40527344, + "step": 3324, + "time_per_iteration": 2.6998729705810547 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046675, + "balance_loss_mlp": 1.00607193, + "epoch": 0.6396691035013466, + "flos": 542777876736.0, + "grad_norm": 0.04757896669484628, + "language_loss": 0.86295962, + "learning_rate": 0.00030356115087383094, + "loss": 0.87342638, + "num_input_tokens_seen": 276737680, + "router_z_loss_mlp": 0.40600586, + "step": 3325, + "time_per_iteration": 2.701478958129883 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050797, + "balance_loss_mlp": 1.01033795, + "epoch": 0.6398614851866102, + "flos": 526554993408.0, + "grad_norm": 0.04173120766563636, + "language_loss": 0.85232729, + "learning_rate": 0.00030327469760369803, + "loss": 0.86283523, + "num_input_tokens_seen": 276803808, + "router_z_loss_mlp": 0.40454102, + "step": 3326, + "time_per_iteration": 2.5700113773345947 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048429, + "balance_loss_mlp": 1.0079217, + "epoch": 0.6400538668718738, + "flos": 624135311616.0, + "grad_norm": 0.07319214553535336, + "language_loss": 0.85706425, + "learning_rate": 0.0003029883207091009, + "loss": 0.86754858, + "num_input_tokens_seen": 276874752, + "router_z_loss_mlp": 0.4050293, + "step": 3327, + "time_per_iteration": 2.7076821327209473 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044376, + "balance_loss_mlp": 1.00391674, + "epoch": 0.6402462485571374, + "flos": 504455942400.0, + "grad_norm": 0.03613290239480707, + "language_loss": 0.78819323, + "learning_rate": 0.00030270202030122095, + "loss": 0.79863703, + "num_input_tokens_seen": 276947200, + "router_z_loss_mlp": 0.40454102, + "step": 3328, + "time_per_iteration": 2.7022666931152344 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043909, + "balance_loss_mlp": 1.00337768, + "epoch": 0.6404386302424009, + "flos": 820663650816.0, + "grad_norm": 0.036325579184177476, + "language_loss": 0.8635475, + "learning_rate": 0.00030241579649121, + "loss": 0.8739866, + "num_input_tokens_seen": 277025712, + "router_z_loss_mlp": 0.40527344, + "step": 3329, + "time_per_iteration": 2.985426902770996 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048537, + "balance_loss_mlp": 1.0080061, + "epoch": 0.6406310119276645, + "flos": 472793665536.0, + "grad_norm": 0.03267380509371782, + "language_loss": 0.80188096, + "learning_rate": 0.00030212964939018994, + "loss": 0.81236637, + "num_input_tokens_seen": 277091264, + "router_z_loss_mlp": 0.40527344, + "step": 3330, + "time_per_iteration": 2.550344228744507 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048063, + "balance_loss_mlp": 1.00753188, + "epoch": 0.6408233936129281, + "flos": 426489090816.0, + "grad_norm": 0.03827308355906826, + "language_loss": 0.86015689, + "learning_rate": 0.0003018435791092527, + "loss": 0.87063748, + "num_input_tokens_seen": 277154608, + "router_z_loss_mlp": 0.40527344, + "step": 3331, + "time_per_iteration": 2.4880104064941406 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042416, + "balance_loss_mlp": 1.00186062, + "epoch": 0.6410157752981916, + "flos": 550838284800.0, + "grad_norm": 0.0342671152523666, + "language_loss": 0.81525755, + "learning_rate": 0.00030155758575946083, + "loss": 0.82568169, + "num_input_tokens_seen": 277222176, + "router_z_loss_mlp": 0.40551758, + "step": 3332, + "time_per_iteration": 2.6726834774017334 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043186, + "balance_loss_mlp": 1.00267851, + "epoch": 0.6412081569834551, + "flos": 476861780736.0, + "grad_norm": 0.03538778522895548, + "language_loss": 0.84473503, + "learning_rate": 0.0003012716694518467, + "loss": 0.85516679, + "num_input_tokens_seen": 277289600, + "router_z_loss_mlp": 0.4050293, + "step": 3333, + "time_per_iteration": 2.5853443145751953 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042688, + "balance_loss_mlp": 1.00206196, + "epoch": 0.6414005386687187, + "flos": 542031325440.0, + "grad_norm": 0.03182184712742977, + "language_loss": 0.85642707, + "learning_rate": 0.000300985830297413, + "loss": 0.86685395, + "num_input_tokens_seen": 277362784, + "router_z_loss_mlp": 0.40625, + "step": 3334, + "time_per_iteration": 2.699078321456909 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042335, + "balance_loss_mlp": 1.00170887, + "epoch": 0.6415929203539823, + "flos": 1042957690368.0, + "grad_norm": 0.0341924045479309, + "language_loss": 0.88431525, + "learning_rate": 0.00030070006840713205, + "loss": 0.89473861, + "num_input_tokens_seen": 277449728, + "router_z_loss_mlp": 0.40625, + "step": 3335, + "time_per_iteration": 3.373852014541626 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046743, + "balance_loss_mlp": 1.0060693, + "epoch": 0.6417853020392459, + "flos": 649580226048.0, + "grad_norm": 0.035751052988779126, + "language_loss": 0.74186742, + "learning_rate": 0.000300414383891947, + "loss": 0.75233489, + "num_input_tokens_seen": 277527552, + "router_z_loss_mlp": 0.40673828, + "step": 3336, + "time_per_iteration": 2.86029314994812 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104222, + "balance_loss_mlp": 1.0014739, + "epoch": 0.6419776837245095, + "flos": 501944113152.0, + "grad_norm": 0.02988455094961003, + "language_loss": 0.89225817, + "learning_rate": 0.00030012877686276973, + "loss": 0.90268028, + "num_input_tokens_seen": 277603568, + "router_z_loss_mlp": 0.4074707, + "step": 3337, + "time_per_iteration": 2.72491455078125 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046417, + "balance_loss_mlp": 1.00569534, + "epoch": 0.642170065409773, + "flos": 621779984640.0, + "grad_norm": 0.03237702044621704, + "language_loss": 0.87225235, + "learning_rate": 0.0002998432474304832, + "loss": 0.88271654, + "num_input_tokens_seen": 277679696, + "router_z_loss_mlp": 0.40722656, + "step": 3338, + "time_per_iteration": 2.7576870918273926 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051331, + "balance_loss_mlp": 1.01165771, + "epoch": 0.6423624470950365, + "flos": 1426641648384.0, + "grad_norm": 0.016568770215616015, + "language_loss": 0.79237342, + "learning_rate": 0.0002995577957059395, + "loss": 0.80288672, + "num_input_tokens_seen": 277913056, + "router_z_loss_mlp": 0.39648438, + "step": 3339, + "time_per_iteration": 4.923727035522461 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01041967, + "balance_loss_mlp": 1.00143564, + "epoch": 0.6425548287803001, + "flos": 563440205568.0, + "grad_norm": 0.03881466361138169, + "language_loss": 0.890571, + "learning_rate": 0.00029927242179996107, + "loss": 0.90099066, + "num_input_tokens_seen": 277983168, + "router_z_loss_mlp": 0.40527344, + "step": 3340, + "time_per_iteration": 2.7034361362457275 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042658, + "balance_loss_mlp": 1.00212634, + "epoch": 0.6427472104655637, + "flos": 586614363648.0, + "grad_norm": 0.030378234734855056, + "language_loss": 0.83618605, + "learning_rate": 0.0002989871258233398, + "loss": 0.84661257, + "num_input_tokens_seen": 278057600, + "router_z_loss_mlp": 0.40527344, + "step": 3341, + "time_per_iteration": 2.7497901916503906 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042745, + "balance_loss_mlp": 1.00211823, + "epoch": 0.6429395921508272, + "flos": 405147284736.0, + "grad_norm": 0.03870957855804831, + "language_loss": 0.83240426, + "learning_rate": 0.0002987019078868373, + "loss": 0.84283173, + "num_input_tokens_seen": 278119232, + "router_z_loss_mlp": 0.40625, + "step": 3342, + "time_per_iteration": 2.425215005874634 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044277, + "balance_loss_mlp": 1.00362682, + "epoch": 0.6431319738360908, + "flos": 549833164032.0, + "grad_norm": 0.031726413731120486, + "language_loss": 0.82255763, + "learning_rate": 0.00029841676810118484, + "loss": 0.83300042, + "num_input_tokens_seen": 278187456, + "router_z_loss_mlp": 0.40649414, + "step": 3343, + "time_per_iteration": 2.693652629852295 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044358, + "balance_loss_mlp": 1.00368381, + "epoch": 0.6433243555213544, + "flos": 794706455040.0, + "grad_norm": 0.03684738873998065, + "language_loss": 0.87695611, + "learning_rate": 0.0002981317065770839, + "loss": 0.88739967, + "num_input_tokens_seen": 278262176, + "router_z_loss_mlp": 0.40673828, + "step": 3344, + "time_per_iteration": 3.0393459796905518 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104227, + "balance_loss_mlp": 1.00147617, + "epoch": 0.643516737206618, + "flos": 584113228032.0, + "grad_norm": 0.0395181937617663, + "language_loss": 0.81428736, + "learning_rate": 0.00029784672342520493, + "loss": 0.82471007, + "num_input_tokens_seen": 278328816, + "router_z_loss_mlp": 0.40795898, + "step": 3345, + "time_per_iteration": 2.6979730129241943 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045514, + "balance_loss_mlp": 1.00479162, + "epoch": 0.6437091188918815, + "flos": 519751472640.0, + "grad_norm": 0.07302138379312399, + "language_loss": 0.8401407, + "learning_rate": 0.00029756181875618834, + "loss": 0.85059583, + "num_input_tokens_seen": 278395824, + "router_z_loss_mlp": 0.40722656, + "step": 3346, + "time_per_iteration": 2.609215497970581 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046536, + "balance_loss_mlp": 1.00588584, + "epoch": 0.643901500577145, + "flos": 385787529984.0, + "grad_norm": 0.039174224295971255, + "language_loss": 0.83988988, + "learning_rate": 0.0002972769926806439, + "loss": 0.85035521, + "num_input_tokens_seen": 278457696, + "router_z_loss_mlp": 0.40649414, + "step": 3347, + "time_per_iteration": 2.4672152996063232 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044082, + "balance_loss_mlp": 1.00345576, + "epoch": 0.6440938822624086, + "flos": 484698612480.0, + "grad_norm": 0.03574243057214525, + "language_loss": 0.88977337, + "learning_rate": 0.0002969922453091508, + "loss": 0.9002142, + "num_input_tokens_seen": 278526992, + "router_z_loss_mlp": 0.40625, + "step": 3348, + "time_per_iteration": 2.615544557571411 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044846, + "balance_loss_mlp": 1.00414753, + "epoch": 0.6442862639476722, + "flos": 541638607872.0, + "grad_norm": 0.030177655617681567, + "language_loss": 0.85437477, + "learning_rate": 0.00029670757675225777, + "loss": 0.86482322, + "num_input_tokens_seen": 278601120, + "router_z_loss_mlp": 0.40698242, + "step": 3349, + "time_per_iteration": 2.7615771293640137 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047798, + "balance_loss_mlp": 1.0071243, + "epoch": 0.6444786456329358, + "flos": 527959634688.0, + "grad_norm": 0.036762953036999044, + "language_loss": 0.79762578, + "learning_rate": 0.0002964229871204831, + "loss": 0.8081038, + "num_input_tokens_seen": 278668208, + "router_z_loss_mlp": 0.40673828, + "step": 3350, + "time_per_iteration": 2.6479439735412598 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048493, + "balance_loss_mlp": 1.00781858, + "epoch": 0.6446710273181993, + "flos": 699162623232.0, + "grad_norm": 0.0356496056156774, + "language_loss": 0.84474576, + "learning_rate": 0.00029613847652431403, + "loss": 0.85523063, + "num_input_tokens_seen": 278742832, + "router_z_loss_mlp": 0.40673828, + "step": 3351, + "time_per_iteration": 2.852724313735962 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045888, + "balance_loss_mlp": 1.00514281, + "epoch": 0.6448634090034628, + "flos": 626300110080.0, + "grad_norm": 0.031569039076812924, + "language_loss": 0.79828554, + "learning_rate": 0.0002958540450742078, + "loss": 0.80874443, + "num_input_tokens_seen": 278829744, + "router_z_loss_mlp": 0.4074707, + "step": 3352, + "time_per_iteration": 2.943434238433838 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104848, + "balance_loss_mlp": 1.0077343, + "epoch": 0.6450557906887264, + "flos": 602166518016.0, + "grad_norm": 0.03244355782647549, + "language_loss": 0.7780689, + "learning_rate": 0.0002955696928805901, + "loss": 0.78855366, + "num_input_tokens_seen": 278908592, + "router_z_loss_mlp": 0.4074707, + "step": 3353, + "time_per_iteration": 2.9107890129089355 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046337, + "balance_loss_mlp": 1.0057348, + "epoch": 0.64524817237399, + "flos": 647385292032.0, + "grad_norm": 0.03305835241833302, + "language_loss": 0.86728162, + "learning_rate": 0.0002952854200538563, + "loss": 0.87774503, + "num_input_tokens_seen": 278986960, + "router_z_loss_mlp": 0.40600586, + "step": 3354, + "time_per_iteration": 2.8001787662506104 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104505, + "balance_loss_mlp": 1.00430393, + "epoch": 0.6454405540592536, + "flos": 474367448064.0, + "grad_norm": 0.03406107124883384, + "language_loss": 0.8233161, + "learning_rate": 0.000295001226704371, + "loss": 0.83376658, + "num_input_tokens_seen": 279054896, + "router_z_loss_mlp": 0.4074707, + "step": 3355, + "time_per_iteration": 2.6213538646698 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044402, + "balance_loss_mlp": 1.00372756, + "epoch": 0.6456329357445171, + "flos": 613020657408.0, + "grad_norm": 0.03542934708236725, + "language_loss": 0.82853353, + "learning_rate": 0.00029471711294246783, + "loss": 0.83897758, + "num_input_tokens_seen": 279126816, + "router_z_loss_mlp": 0.40673828, + "step": 3356, + "time_per_iteration": 2.790909767150879 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044152, + "balance_loss_mlp": 1.00362051, + "epoch": 0.6458253174297807, + "flos": 732932351232.0, + "grad_norm": 0.03702752169183614, + "language_loss": 0.82778573, + "learning_rate": 0.0002944330788784494, + "loss": 0.83822721, + "num_input_tokens_seen": 279197552, + "router_z_loss_mlp": 0.40527344, + "step": 3357, + "time_per_iteration": 2.8837075233459473 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044846, + "balance_loss_mlp": 1.00424361, + "epoch": 0.6460176991150443, + "flos": 571555048704.0, + "grad_norm": 0.04139380130769849, + "language_loss": 0.84656543, + "learning_rate": 0.00029414912462258786, + "loss": 0.85701388, + "num_input_tokens_seen": 279275440, + "router_z_loss_mlp": 0.40600586, + "step": 3358, + "time_per_iteration": 2.8205137252807617 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046066, + "balance_loss_mlp": 1.00543988, + "epoch": 0.6462100808003078, + "flos": 584243485440.0, + "grad_norm": 0.03729295118772339, + "language_loss": 0.81916165, + "learning_rate": 0.00029386525028512366, + "loss": 0.82962239, + "num_input_tokens_seen": 279349168, + "router_z_loss_mlp": 0.40625, + "step": 3359, + "time_per_iteration": 2.7342734336853027 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044545, + "balance_loss_mlp": 1.00391877, + "epoch": 0.6464024624855714, + "flos": 485011617024.0, + "grad_norm": 0.03542298422939795, + "language_loss": 0.87396795, + "learning_rate": 0.0002935814559762666, + "loss": 0.88441336, + "num_input_tokens_seen": 279427600, + "router_z_loss_mlp": 0.40625, + "step": 3360, + "time_per_iteration": 2.7663137912750244 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044184, + "balance_loss_mlp": 1.00362873, + "epoch": 0.6465948441708349, + "flos": 528843246336.0, + "grad_norm": 0.034215531166731795, + "language_loss": 0.80432177, + "learning_rate": 0.0002932977418061957, + "loss": 0.81476361, + "num_input_tokens_seen": 279496608, + "router_z_loss_mlp": 0.40551758, + "step": 3361, + "time_per_iteration": 2.680459976196289 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043549, + "balance_loss_mlp": 1.00299382, + "epoch": 0.6467872258560985, + "flos": 670626524160.0, + "grad_norm": 0.03987324070915456, + "language_loss": 0.81433517, + "learning_rate": 0.00029301410788505833, + "loss": 0.82477069, + "num_input_tokens_seen": 279568448, + "router_z_loss_mlp": 0.40551758, + "step": 3362, + "time_per_iteration": 2.772834539413452 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042359, + "balance_loss_mlp": 1.00178003, + "epoch": 0.6469796075413621, + "flos": 433040845056.0, + "grad_norm": 0.046274531894689615, + "language_loss": 0.81467456, + "learning_rate": 0.00029273055432297126, + "loss": 0.82509816, + "num_input_tokens_seen": 279631952, + "router_z_loss_mlp": 0.40576172, + "step": 3363, + "time_per_iteration": 2.49839448928833 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042492, + "balance_loss_mlp": 1.00188959, + "epoch": 0.6471719892266257, + "flos": 805102748160.0, + "grad_norm": 0.03834251982821679, + "language_loss": 0.81200004, + "learning_rate": 0.00029244708123001917, + "loss": 0.82242495, + "num_input_tokens_seen": 279706880, + "router_z_loss_mlp": 0.40600586, + "step": 3364, + "time_per_iteration": 2.968705177307129 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042662, + "balance_loss_mlp": 1.00215495, + "epoch": 0.6473643709118891, + "flos": 578349821184.0, + "grad_norm": 0.036932041933641975, + "language_loss": 0.84809864, + "learning_rate": 0.0002921636887162565, + "loss": 0.85852528, + "num_input_tokens_seen": 279778864, + "router_z_loss_mlp": 0.4050293, + "step": 3365, + "time_per_iteration": 2.7454428672790527 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044497, + "balance_loss_mlp": 1.00398982, + "epoch": 0.6475567525971527, + "flos": 762788520960.0, + "grad_norm": 0.046091211557592264, + "language_loss": 0.8445828, + "learning_rate": 0.00029188037689170595, + "loss": 0.85502779, + "num_input_tokens_seen": 279853328, + "router_z_loss_mlp": 0.4050293, + "step": 3366, + "time_per_iteration": 2.9878523349761963 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043297, + "balance_loss_mlp": 1.00274241, + "epoch": 0.6477491342824163, + "flos": 844501734912.0, + "grad_norm": 0.04252046587739173, + "language_loss": 0.84425056, + "learning_rate": 0.0002915971458663586, + "loss": 0.85468358, + "num_input_tokens_seen": 279928464, + "router_z_loss_mlp": 0.40551758, + "step": 3367, + "time_per_iteration": 3.052515745162964 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050585, + "balance_loss_mlp": 1.01003003, + "epoch": 0.6479415159676799, + "flos": 886382415360.0, + "grad_norm": 0.03864645902049365, + "language_loss": 0.82315862, + "learning_rate": 0.00029131399575017494, + "loss": 0.83366442, + "num_input_tokens_seen": 280015680, + "router_z_loss_mlp": 0.40551758, + "step": 3368, + "time_per_iteration": 3.1613588333129883 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050945, + "balance_loss_mlp": 1.01034212, + "epoch": 0.6481338976529435, + "flos": 616724245248.0, + "grad_norm": 0.06720988527624061, + "language_loss": 0.86632174, + "learning_rate": 0.0002910309266530836, + "loss": 0.87683117, + "num_input_tokens_seen": 280093904, + "router_z_loss_mlp": 0.40600586, + "step": 3369, + "time_per_iteration": 2.800647497177124 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051276, + "balance_loss_mlp": 1.01067364, + "epoch": 0.648326279338207, + "flos": 511020335616.0, + "grad_norm": 0.03423893349875194, + "language_loss": 0.85872662, + "learning_rate": 0.0002907479386849814, + "loss": 0.86923945, + "num_input_tokens_seen": 280161584, + "router_z_loss_mlp": 0.40600586, + "step": 3370, + "time_per_iteration": 2.6336069107055664 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105095, + "balance_loss_mlp": 1.0103476, + "epoch": 0.6485186610234706, + "flos": 703869386496.0, + "grad_norm": 0.03204560465373447, + "language_loss": 0.80689716, + "learning_rate": 0.0002904650319557339, + "loss": 0.81740665, + "num_input_tokens_seen": 280248016, + "router_z_loss_mlp": 0.40600586, + "step": 3371, + "time_per_iteration": 2.9737660884857178 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054287, + "balance_loss_mlp": 1.01349366, + "epoch": 0.6487110427087341, + "flos": 561746859264.0, + "grad_norm": 0.039912158099113866, + "language_loss": 0.81825972, + "learning_rate": 0.0002901822065751758, + "loss": 0.82880259, + "num_input_tokens_seen": 280319024, + "router_z_loss_mlp": 0.40795898, + "step": 3372, + "time_per_iteration": 2.678905487060547 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054182, + "balance_loss_mlp": 1.01341212, + "epoch": 0.6489034243939977, + "flos": 681302774016.0, + "grad_norm": 0.03214296467255679, + "language_loss": 0.86033392, + "learning_rate": 0.0002898994626531093, + "loss": 0.87087572, + "num_input_tokens_seen": 280393200, + "router_z_loss_mlp": 0.40771484, + "step": 3373, + "time_per_iteration": 2.9144790172576904 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047579, + "balance_loss_mlp": 1.00688112, + "epoch": 0.6490958060792612, + "flos": 475372568832.0, + "grad_norm": 0.03458153211721296, + "language_loss": 0.88523054, + "learning_rate": 0.00028961680029930526, + "loss": 0.8957063, + "num_input_tokens_seen": 280456944, + "router_z_loss_mlp": 0.40698242, + "step": 3374, + "time_per_iteration": 2.5657663345336914 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048477, + "balance_loss_mlp": 1.00794625, + "epoch": 0.6492881877645248, + "flos": 590003001600.0, + "grad_norm": 0.03430965952422358, + "language_loss": 0.77826953, + "learning_rate": 0.00028933421962350317, + "loss": 0.78875428, + "num_input_tokens_seen": 280534352, + "router_z_loss_mlp": 0.40527344, + "step": 3375, + "time_per_iteration": 2.782069683074951 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053456, + "balance_loss_mlp": 1.0128299, + "epoch": 0.6494805694497884, + "flos": 643588385280.0, + "grad_norm": 0.03575939394791191, + "language_loss": 0.84478199, + "learning_rate": 0.0002890517207354104, + "loss": 0.85531658, + "num_input_tokens_seen": 280608912, + "router_z_loss_mlp": 0.40625, + "step": 3376, + "time_per_iteration": 2.837724447250366 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047454, + "balance_loss_mlp": 1.00689936, + "epoch": 0.649672951135052, + "flos": 532837484544.0, + "grad_norm": 0.034227306744160566, + "language_loss": 0.82481575, + "learning_rate": 0.0002887693037447029, + "loss": 0.83529025, + "num_input_tokens_seen": 280678848, + "router_z_loss_mlp": 0.40551758, + "step": 3377, + "time_per_iteration": 2.579442262649536 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104723, + "balance_loss_mlp": 1.00662696, + "epoch": 0.6498653328203156, + "flos": 548446019328.0, + "grad_norm": 0.03719565127882316, + "language_loss": 0.82554042, + "learning_rate": 0.00028848696876102443, + "loss": 0.83601272, + "num_input_tokens_seen": 280750224, + "router_z_loss_mlp": 0.40600586, + "step": 3378, + "time_per_iteration": 2.6242425441741943 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047252, + "balance_loss_mlp": 1.00650632, + "epoch": 0.650057714505579, + "flos": 463161420288.0, + "grad_norm": 0.037917560954429594, + "language_loss": 0.8430717, + "learning_rate": 0.00028820471589398723, + "loss": 0.85354424, + "num_input_tokens_seen": 280817488, + "router_z_loss_mlp": 0.4074707, + "step": 3379, + "time_per_iteration": 2.5716495513916016 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046426, + "balance_loss_mlp": 1.00565684, + "epoch": 0.6502500961908426, + "flos": 511241966592.0, + "grad_norm": 0.04232947369873583, + "language_loss": 0.78428495, + "learning_rate": 0.00028792254525317196, + "loss": 0.79474926, + "num_input_tokens_seen": 280887440, + "router_z_loss_mlp": 0.40771484, + "step": 3380, + "time_per_iteration": 2.6657466888427734 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104445, + "balance_loss_mlp": 1.00377584, + "epoch": 0.6504424778761062, + "flos": 580911227904.0, + "grad_norm": 0.0355389042104645, + "language_loss": 0.8194313, + "learning_rate": 0.00028764045694812645, + "loss": 0.82987577, + "num_input_tokens_seen": 280959072, + "router_z_loss_mlp": 0.40673828, + "step": 3381, + "time_per_iteration": 2.75962233543396 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047874, + "balance_loss_mlp": 1.00727105, + "epoch": 0.6506348595613698, + "flos": 520467888384.0, + "grad_norm": 0.04062665752895993, + "language_loss": 0.76926279, + "learning_rate": 0.0002873584510883671, + "loss": 0.77974153, + "num_input_tokens_seen": 281025376, + "router_z_loss_mlp": 0.40600586, + "step": 3382, + "time_per_iteration": 2.5889906883239746 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049006, + "balance_loss_mlp": 1.00837946, + "epoch": 0.6508272412466333, + "flos": 511363475712.0, + "grad_norm": 0.029998580027972052, + "language_loss": 0.86699784, + "learning_rate": 0.0002870765277833788, + "loss": 0.8774879, + "num_input_tokens_seen": 281097616, + "router_z_loss_mlp": 0.40625, + "step": 3383, + "time_per_iteration": 2.6930124759674072 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049139, + "balance_loss_mlp": 1.00863218, + "epoch": 0.6510196229318969, + "flos": 626805588480.0, + "grad_norm": 0.03382855215234118, + "language_loss": 0.80910194, + "learning_rate": 0.00028679468714261347, + "loss": 0.81959337, + "num_input_tokens_seen": 281170192, + "router_z_loss_mlp": 0.4050293, + "step": 3384, + "time_per_iteration": 2.793992280960083 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048064, + "balance_loss_mlp": 1.00750864, + "epoch": 0.6512120046171604, + "flos": 475670022144.0, + "grad_norm": 0.034347459077756264, + "language_loss": 0.77632761, + "learning_rate": 0.0002865129292754918, + "loss": 0.78680825, + "num_input_tokens_seen": 281238832, + "router_z_loss_mlp": 0.40551758, + "step": 3385, + "time_per_iteration": 2.5745677947998047 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051635, + "balance_loss_mlp": 1.01115131, + "epoch": 0.651404386302424, + "flos": 553031273472.0, + "grad_norm": 0.0319561697529533, + "language_loss": 0.82687205, + "learning_rate": 0.00028623125429140105, + "loss": 0.8373884, + "num_input_tokens_seen": 281319472, + "router_z_loss_mlp": 0.40478516, + "step": 3386, + "time_per_iteration": 2.8197057247161865 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049254, + "balance_loss_mlp": 1.00874698, + "epoch": 0.6515967679876876, + "flos": 524375610624.0, + "grad_norm": 0.03843989341560043, + "language_loss": 0.87771493, + "learning_rate": 0.00028594966229969785, + "loss": 0.8882075, + "num_input_tokens_seen": 281391168, + "router_z_loss_mlp": 0.4050293, + "step": 3387, + "time_per_iteration": 2.6713032722473145 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049331, + "balance_loss_mlp": 1.00899053, + "epoch": 0.6517891496729511, + "flos": 575017563648.0, + "grad_norm": 0.03692798161206562, + "language_loss": 0.8182978, + "learning_rate": 0.00028566815340970577, + "loss": 0.82879114, + "num_input_tokens_seen": 281465664, + "router_z_loss_mlp": 0.40332031, + "step": 3388, + "time_per_iteration": 2.7321841716766357 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048844, + "balance_loss_mlp": 1.0084554, + "epoch": 0.6519815313582147, + "flos": 556990518528.0, + "grad_norm": 0.03423866481728588, + "language_loss": 0.81470537, + "learning_rate": 0.0002853867277307162, + "loss": 0.82519382, + "num_input_tokens_seen": 281532928, + "router_z_loss_mlp": 0.40380859, + "step": 3389, + "time_per_iteration": 2.7031924724578857 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049788, + "balance_loss_mlp": 1.00937581, + "epoch": 0.6521739130434783, + "flos": 481522857216.0, + "grad_norm": 0.03513339122298917, + "language_loss": 0.82942468, + "learning_rate": 0.00028510538537198824, + "loss": 0.83992255, + "num_input_tokens_seen": 281601680, + "router_z_loss_mlp": 0.40405273, + "step": 3390, + "time_per_iteration": 2.703963279724121 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050398, + "balance_loss_mlp": 1.00993848, + "epoch": 0.6523662947287419, + "flos": 667021112832.0, + "grad_norm": 0.03209400617836455, + "language_loss": 0.86939168, + "learning_rate": 0.00028482412644274867, + "loss": 0.87989569, + "num_input_tokens_seen": 281679488, + "router_z_loss_mlp": 0.40454102, + "step": 3391, + "time_per_iteration": 2.9381484985351562 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049716, + "balance_loss_mlp": 1.00920916, + "epoch": 0.6525586764140053, + "flos": 549702906624.0, + "grad_norm": 0.03739783573884853, + "language_loss": 0.75139832, + "learning_rate": 0.00028454295105219207, + "loss": 0.76189548, + "num_input_tokens_seen": 281751056, + "router_z_loss_mlp": 0.4050293, + "step": 3392, + "time_per_iteration": 2.658132314682007 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047552, + "balance_loss_mlp": 1.00706887, + "epoch": 0.6527510580992689, + "flos": 804391190016.0, + "grad_norm": 0.02478431190679109, + "language_loss": 0.79875654, + "learning_rate": 0.0002842618593094802, + "loss": 0.80923212, + "num_input_tokens_seen": 281841008, + "router_z_loss_mlp": 0.40478516, + "step": 3393, + "time_per_iteration": 3.1278936862945557 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046198, + "balance_loss_mlp": 1.00571501, + "epoch": 0.6529434397845325, + "flos": 672376250880.0, + "grad_norm": 0.04113995840272075, + "language_loss": 0.80790162, + "learning_rate": 0.00028398085132374243, + "loss": 0.81836367, + "num_input_tokens_seen": 281908016, + "router_z_loss_mlp": 0.40478516, + "step": 3394, + "time_per_iteration": 2.8653299808502197 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046221, + "balance_loss_mlp": 1.00571322, + "epoch": 0.6531358214697961, + "flos": 829876933632.0, + "grad_norm": 0.032703635981260123, + "language_loss": 0.85031712, + "learning_rate": 0.0002836999272040761, + "loss": 0.86077929, + "num_input_tokens_seen": 281989072, + "router_z_loss_mlp": 0.4050293, + "step": 3395, + "time_per_iteration": 3.131331205368042 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050486, + "balance_loss_mlp": 1.01005006, + "epoch": 0.6533282031550597, + "flos": 488393452032.0, + "grad_norm": 0.04317230929037854, + "language_loss": 0.84511197, + "learning_rate": 0.00028341908705954575, + "loss": 0.85561681, + "num_input_tokens_seen": 282053152, + "router_z_loss_mlp": 0.40429688, + "step": 3396, + "time_per_iteration": 2.5415916442871094 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048904, + "balance_loss_mlp": 1.0094223, + "epoch": 0.6535205848403232, + "flos": 1561105233408.0, + "grad_norm": 0.006364223174853702, + "language_loss": 0.81761813, + "learning_rate": 0.00028313833099918265, + "loss": 0.82810712, + "num_input_tokens_seen": 282283984, + "router_z_loss_mlp": 0.39453125, + "step": 3397, + "time_per_iteration": 4.924402236938477 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047528, + "balance_loss_mlp": 1.0069257, + "epoch": 0.6537129665255867, + "flos": 494704133376.0, + "grad_norm": 0.03394309019693363, + "language_loss": 0.78847253, + "learning_rate": 0.00028285765913198604, + "loss": 0.79894781, + "num_input_tokens_seen": 282353008, + "router_z_loss_mlp": 0.40600586, + "step": 3398, + "time_per_iteration": 2.595367193222046 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046629, + "balance_loss_mlp": 1.00595522, + "epoch": 0.6539053482108503, + "flos": 606143259648.0, + "grad_norm": 0.03316024353093433, + "language_loss": 0.82683516, + "learning_rate": 0.0002825770715669227, + "loss": 0.83730143, + "num_input_tokens_seen": 282427648, + "router_z_loss_mlp": 0.40673828, + "step": 3399, + "time_per_iteration": 2.7097129821777344 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048676, + "balance_loss_mlp": 1.00807345, + "epoch": 0.6540977298961139, + "flos": 578881544448.0, + "grad_norm": 0.0428136910892252, + "language_loss": 0.81872654, + "learning_rate": 0.00028229656841292634, + "loss": 0.82921332, + "num_input_tokens_seen": 282502128, + "router_z_loss_mlp": 0.40600586, + "step": 3400, + "time_per_iteration": 2.6833486557006836 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045032, + "balance_loss_mlp": 1.00442982, + "epoch": 0.6542901115813774, + "flos": 512770062336.0, + "grad_norm": 0.04250142071298369, + "language_loss": 0.76713872, + "learning_rate": 0.0002820161497788979, + "loss": 0.77758902, + "num_input_tokens_seen": 282569360, + "router_z_loss_mlp": 0.40600586, + "step": 3401, + "time_per_iteration": 2.626732349395752 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048894, + "balance_loss_mlp": 1.00838673, + "epoch": 0.654482493266641, + "flos": 626675331072.0, + "grad_norm": 0.03960445373110503, + "language_loss": 0.87829405, + "learning_rate": 0.00028173581577370545, + "loss": 0.88878298, + "num_input_tokens_seen": 282645472, + "router_z_loss_mlp": 0.4050293, + "step": 3402, + "time_per_iteration": 2.7741096019744873 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048023, + "balance_loss_mlp": 1.00753999, + "epoch": 0.6546748749519046, + "flos": 525063836160.0, + "grad_norm": 0.03167040591829995, + "language_loss": 0.79177642, + "learning_rate": 0.0002814555665061844, + "loss": 0.80225664, + "num_input_tokens_seen": 282717568, + "router_z_loss_mlp": 0.40478516, + "step": 3403, + "time_per_iteration": 2.664350986480713 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047128, + "balance_loss_mlp": 1.00664401, + "epoch": 0.6548672566371682, + "flos": 480274718208.0, + "grad_norm": 0.036729511728986385, + "language_loss": 0.78224975, + "learning_rate": 0.00028117540208513715, + "loss": 0.79272103, + "num_input_tokens_seen": 282791408, + "router_z_loss_mlp": 0.40478516, + "step": 3404, + "time_per_iteration": 2.6802027225494385 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043621, + "balance_loss_mlp": 1.00306582, + "epoch": 0.6550596383224317, + "flos": 617136404736.0, + "grad_norm": 0.034100585633273374, + "language_loss": 0.85354125, + "learning_rate": 0.00028089532261933313, + "loss": 0.86397743, + "num_input_tokens_seen": 282862992, + "router_z_loss_mlp": 0.40551758, + "step": 3405, + "time_per_iteration": 2.7186086177825928 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046281, + "balance_loss_mlp": 1.00567865, + "epoch": 0.6552520200076952, + "flos": 489808786944.0, + "grad_norm": 0.041360786835332355, + "language_loss": 0.86205178, + "learning_rate": 0.0002806153282175087, + "loss": 0.87251461, + "num_input_tokens_seen": 282930448, + "router_z_loss_mlp": 0.40600586, + "step": 3406, + "time_per_iteration": 2.5789847373962402 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046471, + "balance_loss_mlp": 1.00584447, + "epoch": 0.6554444016929588, + "flos": 688859649024.0, + "grad_norm": 0.034986799312927766, + "language_loss": 0.8358103, + "learning_rate": 0.0002803354189883679, + "loss": 0.84627509, + "num_input_tokens_seen": 283010864, + "router_z_loss_mlp": 0.40625, + "step": 3407, + "time_per_iteration": 2.837360382080078 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050644, + "balance_loss_mlp": 1.01023173, + "epoch": 0.6556367833782224, + "flos": 544171824384.0, + "grad_norm": 0.032399307772020214, + "language_loss": 0.86254793, + "learning_rate": 0.00028005559504058053, + "loss": 0.87305439, + "num_input_tokens_seen": 283082240, + "router_z_loss_mlp": 0.40405273, + "step": 3408, + "time_per_iteration": 2.7328412532806396 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047247, + "balance_loss_mlp": 1.00673985, + "epoch": 0.655829165063486, + "flos": 674731577856.0, + "grad_norm": 0.033393765710147245, + "language_loss": 0.77549541, + "learning_rate": 0.0002797758564827838, + "loss": 0.78596783, + "num_input_tokens_seen": 283156656, + "router_z_loss_mlp": 0.4050293, + "step": 3409, + "time_per_iteration": 2.8037917613983154 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048239, + "balance_loss_mlp": 1.00761223, + "epoch": 0.6560215467487496, + "flos": 532837484544.0, + "grad_norm": 0.037569861592142095, + "language_loss": 0.83625042, + "learning_rate": 0.0002794962034235824, + "loss": 0.84673285, + "num_input_tokens_seen": 283223584, + "router_z_loss_mlp": 0.40625, + "step": 3410, + "time_per_iteration": 2.660435676574707 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048327, + "balance_loss_mlp": 1.00789142, + "epoch": 0.656213928434013, + "flos": 592460395776.0, + "grad_norm": 0.035927702009128905, + "language_loss": 0.75148469, + "learning_rate": 0.00027921663597154695, + "loss": 0.76196802, + "num_input_tokens_seen": 283297680, + "router_z_loss_mlp": 0.40429688, + "step": 3411, + "time_per_iteration": 2.7516040802001953 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050596, + "balance_loss_mlp": 1.01015997, + "epoch": 0.6564063101192766, + "flos": 416678956032.0, + "grad_norm": 0.07901014031845595, + "language_loss": 0.81708795, + "learning_rate": 0.00027893715423521525, + "loss": 0.82759392, + "num_input_tokens_seen": 283359744, + "router_z_loss_mlp": 0.40429688, + "step": 3412, + "time_per_iteration": 2.4704418182373047 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045819, + "balance_loss_mlp": 1.00547826, + "epoch": 0.6565986918045402, + "flos": 454271835648.0, + "grad_norm": 0.03411050033810387, + "language_loss": 0.84291053, + "learning_rate": 0.00027865775832309163, + "loss": 0.85336864, + "num_input_tokens_seen": 283430688, + "router_z_loss_mlp": 0.40332031, + "step": 3413, + "time_per_iteration": 2.6385068893432617 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048788, + "balance_loss_mlp": 1.00854325, + "epoch": 0.6567910734898038, + "flos": 548799853056.0, + "grad_norm": 0.036374593364126635, + "language_loss": 0.86917508, + "learning_rate": 0.00027837844834364733, + "loss": 0.87966299, + "num_input_tokens_seen": 283498048, + "router_z_loss_mlp": 0.40234375, + "step": 3414, + "time_per_iteration": 2.6444642543792725 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048805, + "balance_loss_mlp": 1.00860763, + "epoch": 0.6569834551750673, + "flos": 656765770752.0, + "grad_norm": 0.03225713211671443, + "language_loss": 0.87055808, + "learning_rate": 0.00027809922440532, + "loss": 0.88104612, + "num_input_tokens_seen": 283573040, + "router_z_loss_mlp": 0.40185547, + "step": 3415, + "time_per_iteration": 2.847615957260132 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051486, + "balance_loss_mlp": 1.01114511, + "epoch": 0.6571758368603309, + "flos": 540811376640.0, + "grad_norm": 0.035988230545184526, + "language_loss": 0.81540048, + "learning_rate": 0.00027782008661651406, + "loss": 0.82591534, + "num_input_tokens_seen": 283651696, + "router_z_loss_mlp": 0.40332031, + "step": 3416, + "time_per_iteration": 2.767226457595825 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049379, + "balance_loss_mlp": 1.00906193, + "epoch": 0.6573682185455945, + "flos": 498379531008.0, + "grad_norm": 0.03451446989535273, + "language_loss": 0.87885237, + "learning_rate": 0.00027754103508560013, + "loss": 0.88934618, + "num_input_tokens_seen": 283721824, + "router_z_loss_mlp": 0.40307617, + "step": 3417, + "time_per_iteration": 2.6277449131011963 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045088, + "balance_loss_mlp": 1.00481939, + "epoch": 0.657560600230858, + "flos": 448353871872.0, + "grad_norm": 0.03502749433462501, + "language_loss": 0.8376503, + "learning_rate": 0.0002772620699209163, + "loss": 0.8481012, + "num_input_tokens_seen": 283786960, + "router_z_loss_mlp": 0.40258789, + "step": 3418, + "time_per_iteration": 2.603851318359375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01041901, + "balance_loss_mlp": 1.00168002, + "epoch": 0.6577529819161216, + "flos": 482920695552.0, + "grad_norm": 0.033924516533442195, + "language_loss": 0.80503142, + "learning_rate": 0.0002769831912307658, + "loss": 0.81545043, + "num_input_tokens_seen": 283853808, + "router_z_loss_mlp": 0.40209961, + "step": 3419, + "time_per_iteration": 2.567737340927124 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010435, + "balance_loss_mlp": 1.00313556, + "epoch": 0.6579453636013851, + "flos": 531860553984.0, + "grad_norm": 0.04823961507786352, + "language_loss": 0.80877286, + "learning_rate": 0.00027670439912341917, + "loss": 0.81920785, + "num_input_tokens_seen": 283920960, + "router_z_loss_mlp": 0.40356445, + "step": 3420, + "time_per_iteration": 2.639587163925171 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043978, + "balance_loss_mlp": 1.00354195, + "epoch": 0.6581377452866487, + "flos": 629243540736.0, + "grad_norm": 0.032258458979824364, + "language_loss": 0.84138131, + "learning_rate": 0.0002764256937071129, + "loss": 0.85182106, + "num_input_tokens_seen": 283992416, + "router_z_loss_mlp": 0.40429688, + "step": 3421, + "time_per_iteration": 2.793288469314575 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043983, + "balance_loss_mlp": 1.00347602, + "epoch": 0.6583301269719123, + "flos": 549674716416.0, + "grad_norm": 0.033092634832732, + "language_loss": 0.87840796, + "learning_rate": 0.00027614707509005036, + "loss": 0.88884783, + "num_input_tokens_seen": 284061760, + "router_z_loss_mlp": 0.4050293, + "step": 3422, + "time_per_iteration": 2.672691822052002 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044325, + "balance_loss_mlp": 1.0038892, + "epoch": 0.6585225086571759, + "flos": 428397265152.0, + "grad_norm": 0.041046610709459384, + "language_loss": 0.7990576, + "learning_rate": 0.0002758685433804008, + "loss": 0.80950087, + "num_input_tokens_seen": 284124848, + "router_z_loss_mlp": 0.40429688, + "step": 3423, + "time_per_iteration": 2.5028507709503174 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104497, + "balance_loss_mlp": 1.00448632, + "epoch": 0.6587148903424394, + "flos": 861050261760.0, + "grad_norm": 0.040364444047634805, + "language_loss": 0.7997486, + "learning_rate": 0.00027559009868630005, + "loss": 0.81019825, + "num_input_tokens_seen": 284206272, + "router_z_loss_mlp": 0.40478516, + "step": 3424, + "time_per_iteration": 3.1220815181732178 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047075, + "balance_loss_mlp": 1.00671124, + "epoch": 0.6589072720277029, + "flos": 807037167360.0, + "grad_norm": 0.05893519085395252, + "language_loss": 0.80930316, + "learning_rate": 0.0002753117411158491, + "loss": 0.81977397, + "num_input_tokens_seen": 284293696, + "router_z_loss_mlp": 0.40356445, + "step": 3425, + "time_per_iteration": 3.0889339447021484 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047077, + "balance_loss_mlp": 1.00676, + "epoch": 0.6590996537129665, + "flos": 549674716416.0, + "grad_norm": 0.03274381739097603, + "language_loss": 0.90609264, + "learning_rate": 0.0002750334707771168, + "loss": 0.91656339, + "num_input_tokens_seen": 284360192, + "router_z_loss_mlp": 0.40307617, + "step": 3426, + "time_per_iteration": 2.6541290283203125 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046695, + "balance_loss_mlp": 1.00647414, + "epoch": 0.6592920353982301, + "flos": 455109760512.0, + "grad_norm": 0.03777224687776173, + "language_loss": 0.81529361, + "learning_rate": 0.0002747552877781369, + "loss": 0.82576048, + "num_input_tokens_seen": 284423680, + "router_z_loss_mlp": 0.40209961, + "step": 3427, + "time_per_iteration": 2.5356411933898926 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048756, + "balance_loss_mlp": 1.00858271, + "epoch": 0.6594844170834937, + "flos": 568261675008.0, + "grad_norm": 0.03735814383850805, + "language_loss": 0.82849789, + "learning_rate": 0.0002744771922269097, + "loss": 0.83898544, + "num_input_tokens_seen": 284495712, + "router_z_loss_mlp": 0.40161133, + "step": 3428, + "time_per_iteration": 2.7781617641448975 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047717, + "balance_loss_mlp": 1.00761461, + "epoch": 0.6596767987687572, + "flos": 1189755878400.0, + "grad_norm": 0.035375644925624505, + "language_loss": 0.82642734, + "learning_rate": 0.0002741991842314015, + "loss": 0.83690447, + "num_input_tokens_seen": 284583440, + "router_z_loss_mlp": 0.40087891, + "step": 3429, + "time_per_iteration": 3.484401226043701 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050876, + "balance_loss_mlp": 1.01070201, + "epoch": 0.6598691804540208, + "flos": 504468581376.0, + "grad_norm": 0.033809257581419436, + "language_loss": 0.86197507, + "learning_rate": 0.0002739212638995445, + "loss": 0.87248385, + "num_input_tokens_seen": 284649168, + "router_z_loss_mlp": 0.40161133, + "step": 3430, + "time_per_iteration": 2.557008743286133 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104672, + "balance_loss_mlp": 1.00654662, + "epoch": 0.6600615621392844, + "flos": 532399080192.0, + "grad_norm": 0.03652926945024374, + "language_loss": 0.83438206, + "learning_rate": 0.00027364343133923696, + "loss": 0.84484929, + "num_input_tokens_seen": 284723136, + "router_z_loss_mlp": 0.40161133, + "step": 3431, + "time_per_iteration": 2.662047863006592 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010455, + "balance_loss_mlp": 1.00534999, + "epoch": 0.6602539438245479, + "flos": 566557635072.0, + "grad_norm": 0.03543857868011933, + "language_loss": 0.83350068, + "learning_rate": 0.0002733656866583431, + "loss": 0.84395564, + "num_input_tokens_seen": 284792752, + "router_z_loss_mlp": 0.40136719, + "step": 3432, + "time_per_iteration": 2.676973581314087 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045709, + "balance_loss_mlp": 1.00558269, + "epoch": 0.6604463255098114, + "flos": 858592867584.0, + "grad_norm": 0.037899677341019365, + "language_loss": 0.83285594, + "learning_rate": 0.0002730880299646927, + "loss": 0.8433131, + "num_input_tokens_seen": 284871008, + "router_z_loss_mlp": 0.40112305, + "step": 3433, + "time_per_iteration": 3.0207436084747314 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045936, + "balance_loss_mlp": 1.00585747, + "epoch": 0.660638707195075, + "flos": 675680318208.0, + "grad_norm": 0.03767896728200409, + "language_loss": 0.85914338, + "learning_rate": 0.0002728104613660821, + "loss": 0.8696028, + "num_input_tokens_seen": 284945184, + "router_z_loss_mlp": 0.40063477, + "step": 3434, + "time_per_iteration": 2.847806215286255 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043278, + "balance_loss_mlp": 1.0032711, + "epoch": 0.6608310888803386, + "flos": 890524407552.0, + "grad_norm": 0.03485230588781084, + "language_loss": 0.8359797, + "learning_rate": 0.0002725329809702729, + "loss": 0.84641242, + "num_input_tokens_seen": 285029296, + "router_z_loss_mlp": 0.39990234, + "step": 3435, + "time_per_iteration": 3.1851022243499756 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043009, + "balance_loss_mlp": 1.0028832, + "epoch": 0.6610234705656022, + "flos": 1138108804608.0, + "grad_norm": 0.04206643775716819, + "language_loss": 0.76903141, + "learning_rate": 0.0002722555888849921, + "loss": 0.7794615, + "num_input_tokens_seen": 285124720, + "router_z_loss_mlp": 0.40112305, + "step": 3436, + "time_per_iteration": 3.453571081161499 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044798, + "balance_loss_mlp": 1.00474417, + "epoch": 0.6612158522508658, + "flos": 468959820288.0, + "grad_norm": 0.03417683071505001, + "language_loss": 0.80971491, + "learning_rate": 0.00027197828521793334, + "loss": 0.82016289, + "num_input_tokens_seen": 285191360, + "router_z_loss_mlp": 0.40039062, + "step": 3437, + "time_per_iteration": 2.5737972259521484 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049055, + "balance_loss_mlp": 1.00892961, + "epoch": 0.6614082339361292, + "flos": 572774997504.0, + "grad_norm": 0.03444646564186984, + "language_loss": 0.85238397, + "learning_rate": 0.0002717010700767552, + "loss": 0.86287451, + "num_input_tokens_seen": 285262624, + "router_z_loss_mlp": 0.40112305, + "step": 3438, + "time_per_iteration": 2.6816329956054688 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047082, + "balance_loss_mlp": 1.00700414, + "epoch": 0.6616006156213928, + "flos": 499460474112.0, + "grad_norm": 0.039408018339583364, + "language_loss": 0.7639091, + "learning_rate": 0.00027142394356908226, + "loss": 0.77437991, + "num_input_tokens_seen": 285328512, + "router_z_loss_mlp": 0.40063477, + "step": 3439, + "time_per_iteration": 2.6397507190704346 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01056341, + "balance_loss_mlp": 1.01604831, + "epoch": 0.6617929973066564, + "flos": 603610043136.0, + "grad_norm": 0.03512262783038589, + "language_loss": 0.85516727, + "learning_rate": 0.00027114690580250456, + "loss": 0.8657307, + "num_input_tokens_seen": 285406128, + "router_z_loss_mlp": 0.40283203, + "step": 3440, + "time_per_iteration": 2.8226699829101562 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046554, + "balance_loss_mlp": 1.00607085, + "epoch": 0.66198537899192, + "flos": 523995532032.0, + "grad_norm": 0.03484935524221126, + "language_loss": 0.87502497, + "learning_rate": 0.0002708699568845776, + "loss": 0.88549048, + "num_input_tokens_seen": 285474704, + "router_z_loss_mlp": 0.40478516, + "step": 3441, + "time_per_iteration": 2.666151762008667 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054733, + "balance_loss_mlp": 1.01563263, + "epoch": 0.6621777606771835, + "flos": 1569612794112.0, + "grad_norm": 0.008720086595697616, + "language_loss": 0.79287779, + "learning_rate": 0.00027059309692282265, + "loss": 0.80342519, + "num_input_tokens_seen": 285698704, + "router_z_loss_mlp": 0.390625, + "step": 3442, + "time_per_iteration": 4.902445316314697 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043667, + "balance_loss_mlp": 1.00320721, + "epoch": 0.6623701423624471, + "flos": 527690371584.0, + "grad_norm": 0.04147844177514617, + "language_loss": 0.83753407, + "learning_rate": 0.0002703163260247261, + "loss": 0.84797072, + "num_input_tokens_seen": 285767936, + "router_z_loss_mlp": 0.40454102, + "step": 3443, + "time_per_iteration": 2.6544172763824463 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042135, + "balance_loss_mlp": 1.00157988, + "epoch": 0.6625625240477107, + "flos": 529216521984.0, + "grad_norm": 0.040243971726719965, + "language_loss": 0.82285839, + "learning_rate": 0.0002700396442977399, + "loss": 0.83327973, + "num_input_tokens_seen": 285839456, + "router_z_loss_mlp": 0.40551758, + "step": 3444, + "time_per_iteration": 2.659823179244995 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046923, + "balance_loss_mlp": 1.00648713, + "epoch": 0.6627549057329742, + "flos": 474196361472.0, + "grad_norm": 0.03873462944333031, + "language_loss": 0.84804982, + "learning_rate": 0.0002697630518492817, + "loss": 0.85851908, + "num_input_tokens_seen": 285905904, + "router_z_loss_mlp": 0.40429688, + "step": 3445, + "time_per_iteration": 2.6407060623168945 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042694, + "balance_loss_mlp": 1.00218678, + "epoch": 0.6629472874182378, + "flos": 529012387584.0, + "grad_norm": 0.03365832032426446, + "language_loss": 0.86288029, + "learning_rate": 0.0002694865487867343, + "loss": 0.87330723, + "num_input_tokens_seen": 285975520, + "router_z_loss_mlp": 0.4050293, + "step": 3446, + "time_per_iteration": 2.6234817504882812 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01040416, + "balance_loss_mlp": 0.99986076, + "epoch": 0.6631396691035013, + "flos": 614379611904.0, + "grad_norm": 0.029868994053189296, + "language_loss": 0.85050064, + "learning_rate": 0.0002692101352174453, + "loss": 0.86090481, + "num_input_tokens_seen": 286050320, + "router_z_loss_mlp": 0.40551758, + "step": 3447, + "time_per_iteration": 2.7610418796539307 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054226, + "balance_loss_mlp": 1.01357543, + "epoch": 0.6633320507887649, + "flos": 610434951168.0, + "grad_norm": 0.03566276224507284, + "language_loss": 0.85075617, + "learning_rate": 0.00026893381124872787, + "loss": 0.86129844, + "num_input_tokens_seen": 286120672, + "router_z_loss_mlp": 0.40649414, + "step": 3448, + "time_per_iteration": 2.7092947959899902 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104727, + "balance_loss_mlp": 1.0067625, + "epoch": 0.6635244324740285, + "flos": 751142143488.0, + "grad_norm": 0.03834758690665688, + "language_loss": 0.81510758, + "learning_rate": 0.00026865757698786097, + "loss": 0.82558024, + "num_input_tokens_seen": 286201152, + "router_z_loss_mlp": 0.4050293, + "step": 3449, + "time_per_iteration": 3.0252504348754883 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050032, + "balance_loss_mlp": 1.00952482, + "epoch": 0.6637168141592921, + "flos": 665748674304.0, + "grad_norm": 0.03495621172774381, + "language_loss": 0.82439375, + "learning_rate": 0.000268381432542088, + "loss": 0.83489406, + "num_input_tokens_seen": 286274512, + "router_z_loss_mlp": 0.4050293, + "step": 3450, + "time_per_iteration": 2.847905397415161 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046417, + "balance_loss_mlp": 1.00598156, + "epoch": 0.6639091958445555, + "flos": 607921176576.0, + "grad_norm": 0.03480028422588226, + "language_loss": 0.80330265, + "learning_rate": 0.00026810537801861807, + "loss": 0.8137669, + "num_input_tokens_seen": 286349808, + "router_z_loss_mlp": 0.40429688, + "step": 3451, + "time_per_iteration": 2.8109076023101807 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044982, + "balance_loss_mlp": 1.00442719, + "epoch": 0.6641015775298191, + "flos": 477680263680.0, + "grad_norm": 0.03370448580538907, + "language_loss": 0.81616271, + "learning_rate": 0.0002678294135246243, + "loss": 0.82661253, + "num_input_tokens_seen": 286422912, + "router_z_loss_mlp": 0.40551758, + "step": 3452, + "time_per_iteration": 2.77632999420166 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043861, + "balance_loss_mlp": 1.00337756, + "epoch": 0.6642939592150827, + "flos": 905596361472.0, + "grad_norm": 0.035596990972813804, + "language_loss": 0.87064171, + "learning_rate": 0.0002675535391672463, + "loss": 0.88108027, + "num_input_tokens_seen": 286501072, + "router_z_loss_mlp": 0.40478516, + "step": 3453, + "time_per_iteration": 3.1011788845062256 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046586, + "balance_loss_mlp": 1.00610256, + "epoch": 0.6644863409003463, + "flos": 582938966016.0, + "grad_norm": 0.03233314445792202, + "language_loss": 0.86734712, + "learning_rate": 0.0002672777550535877, + "loss": 0.87781298, + "num_input_tokens_seen": 286580480, + "router_z_loss_mlp": 0.40478516, + "step": 3454, + "time_per_iteration": 2.799320936203003 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047695, + "balance_loss_mlp": 1.00714028, + "epoch": 0.6646787225856099, + "flos": 479970461952.0, + "grad_norm": 0.04849178662998588, + "language_loss": 0.85994661, + "learning_rate": 0.00026700206129071747, + "loss": 0.87042361, + "num_input_tokens_seen": 286646208, + "router_z_loss_mlp": 0.40551758, + "step": 3455, + "time_per_iteration": 2.5544278621673584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044362, + "balance_loss_mlp": 1.00371206, + "epoch": 0.6648711042708734, + "flos": 450828762624.0, + "grad_norm": 0.04059200209413719, + "language_loss": 0.89189559, + "learning_rate": 0.00026672645798566925, + "loss": 0.90233922, + "num_input_tokens_seen": 286710624, + "router_z_loss_mlp": 0.40649414, + "step": 3456, + "time_per_iteration": 2.501304864883423 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047273, + "balance_loss_mlp": 1.00669408, + "epoch": 0.665063485956137, + "flos": 860597273088.0, + "grad_norm": 0.0398485152985426, + "language_loss": 0.7998091, + "learning_rate": 0.00026645094524544225, + "loss": 0.81028181, + "num_input_tokens_seen": 286799472, + "router_z_loss_mlp": 0.40576172, + "step": 3457, + "time_per_iteration": 3.276411294937134 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043384, + "balance_loss_mlp": 1.00266171, + "epoch": 0.6652558676414005, + "flos": 605472530688.0, + "grad_norm": 0.027841742129180558, + "language_loss": 0.75740635, + "learning_rate": 0.00026617552317699945, + "loss": 0.76784027, + "num_input_tokens_seen": 286874752, + "router_z_loss_mlp": 0.40722656, + "step": 3458, + "time_per_iteration": 2.801248550415039 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046474, + "balance_loss_mlp": 1.00591886, + "epoch": 0.6654482493266641, + "flos": 511411107840.0, + "grad_norm": 0.036000642082667296, + "language_loss": 0.87457603, + "learning_rate": 0.0002659001918872693, + "loss": 0.88504076, + "num_input_tokens_seen": 286943312, + "router_z_loss_mlp": 0.40551758, + "step": 3459, + "time_per_iteration": 2.6388814449310303 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050084, + "balance_loss_mlp": 1.00948107, + "epoch": 0.6656406310119277, + "flos": 566661647616.0, + "grad_norm": 0.03405161677383315, + "language_loss": 0.81573474, + "learning_rate": 0.0002656249514831449, + "loss": 0.82623559, + "num_input_tokens_seen": 287010000, + "router_z_loss_mlp": 0.40600586, + "step": 3460, + "time_per_iteration": 2.6583993434906006 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052206, + "balance_loss_mlp": 1.01155555, + "epoch": 0.6658330126971912, + "flos": 1026060187392.0, + "grad_norm": 0.03356522396560915, + "language_loss": 0.87476516, + "learning_rate": 0.00026534980207148416, + "loss": 0.88528717, + "num_input_tokens_seen": 287101456, + "router_z_loss_mlp": 0.40649414, + "step": 3461, + "time_per_iteration": 3.4255144596099854 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050463, + "balance_loss_mlp": 1.00962222, + "epoch": 0.6660253943824548, + "flos": 818234446848.0, + "grad_norm": 0.03543783293435262, + "language_loss": 0.74157602, + "learning_rate": 0.0002650747437591097, + "loss": 0.75208062, + "num_input_tokens_seen": 287182848, + "router_z_loss_mlp": 0.40844727, + "step": 3462, + "time_per_iteration": 2.99372935295105 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048088, + "balance_loss_mlp": 1.00879669, + "epoch": 0.6662177760677184, + "flos": 1499533318656.0, + "grad_norm": 0.007196146037648728, + "language_loss": 0.8187958, + "learning_rate": 0.00026479977665280806, + "loss": 0.82927668, + "num_input_tokens_seen": 287417920, + "router_z_loss_mlp": 0.39257812, + "step": 3463, + "time_per_iteration": 5.021228075027466 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047386, + "balance_loss_mlp": 1.00668836, + "epoch": 0.666410157752982, + "flos": 501108133632.0, + "grad_norm": 0.0343393236578971, + "language_loss": 0.8738476, + "learning_rate": 0.00026452490085933155, + "loss": 0.88432145, + "num_input_tokens_seen": 287483776, + "router_z_loss_mlp": 0.40698242, + "step": 3464, + "time_per_iteration": 2.5860917568206787 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048288, + "balance_loss_mlp": 1.00747108, + "epoch": 0.6666025394382454, + "flos": 482139151104.0, + "grad_norm": 0.04334646456147875, + "language_loss": 0.90236807, + "learning_rate": 0.00026425011648539614, + "loss": 0.91285098, + "num_input_tokens_seen": 287548176, + "router_z_loss_mlp": 0.40820312, + "step": 3465, + "time_per_iteration": 2.5441110134124756 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049485, + "balance_loss_mlp": 1.00864422, + "epoch": 0.666794921123509, + "flos": 547692665088.0, + "grad_norm": 0.03397954120439615, + "language_loss": 0.83244991, + "learning_rate": 0.00026397542363768267, + "loss": 0.84294474, + "num_input_tokens_seen": 287618496, + "router_z_loss_mlp": 0.40844727, + "step": 3466, + "time_per_iteration": 2.74092698097229 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051821, + "balance_loss_mlp": 1.01097989, + "epoch": 0.6669873028087726, + "flos": 472943364864.0, + "grad_norm": 0.036434069598551014, + "language_loss": 0.82217574, + "learning_rate": 0.0002637008224228362, + "loss": 0.83269393, + "num_input_tokens_seen": 287684032, + "router_z_loss_mlp": 0.40844727, + "step": 3467, + "time_per_iteration": 2.5710275173187256 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048674, + "balance_loss_mlp": 1.00785649, + "epoch": 0.6671796844940362, + "flos": 548500454400.0, + "grad_norm": 0.030766968440674072, + "language_loss": 0.8512944, + "learning_rate": 0.00026342631294746653, + "loss": 0.86178112, + "num_input_tokens_seen": 287757680, + "router_z_loss_mlp": 0.40820312, + "step": 3468, + "time_per_iteration": 2.7195847034454346 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045875, + "balance_loss_mlp": 1.00493824, + "epoch": 0.6673720661792998, + "flos": 1072123689216.0, + "grad_norm": 0.03165025767658557, + "language_loss": 0.81300414, + "learning_rate": 0.0002631518953181476, + "loss": 0.8234629, + "num_input_tokens_seen": 287848992, + "router_z_loss_mlp": 0.40942383, + "step": 3469, + "time_per_iteration": 3.4572696685791016 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045563, + "balance_loss_mlp": 1.00588989, + "epoch": 0.6675644478645633, + "flos": 1527113874432.0, + "grad_norm": 0.008139756237930116, + "language_loss": 0.76325285, + "learning_rate": 0.000262877569641418, + "loss": 0.77370852, + "num_input_tokens_seen": 288085680, + "router_z_loss_mlp": 0.39648438, + "step": 3470, + "time_per_iteration": 4.91265869140625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044307, + "balance_loss_mlp": 1.00341797, + "epoch": 0.6677568295498268, + "flos": 580844153856.0, + "grad_norm": 0.03268114077515944, + "language_loss": 0.80885828, + "learning_rate": 0.00026260333602377985, + "loss": 0.81930137, + "num_input_tokens_seen": 288161568, + "router_z_loss_mlp": 0.40893555, + "step": 3471, + "time_per_iteration": 2.7573940753936768 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043519, + "balance_loss_mlp": 1.00277328, + "epoch": 0.6679492112350904, + "flos": 384791157504.0, + "grad_norm": 0.03558012533984873, + "language_loss": 0.87711406, + "learning_rate": 0.0002623291945717007, + "loss": 0.88754922, + "num_input_tokens_seen": 288224032, + "router_z_loss_mlp": 0.4074707, + "step": 3472, + "time_per_iteration": 2.442338466644287 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044342, + "balance_loss_mlp": 1.00364411, + "epoch": 0.668141592920354, + "flos": 1152616954368.0, + "grad_norm": 0.0328139503917561, + "language_loss": 0.84606934, + "learning_rate": 0.00026205514539161175, + "loss": 0.85651278, + "num_input_tokens_seen": 288312912, + "router_z_loss_mlp": 0.40698242, + "step": 3473, + "time_per_iteration": 3.503469705581665 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044285, + "balance_loss_mlp": 1.00353932, + "epoch": 0.6683339746056175, + "flos": 562292188416.0, + "grad_norm": 0.030626159125144124, + "language_loss": 0.84382141, + "learning_rate": 0.00026178118858990773, + "loss": 0.85426426, + "num_input_tokens_seen": 288394224, + "router_z_loss_mlp": 0.4074707, + "step": 3474, + "time_per_iteration": 2.8285627365112305 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043984, + "balance_loss_mlp": 1.00330997, + "epoch": 0.6685263562908811, + "flos": 515329523712.0, + "grad_norm": 0.030456650520625777, + "language_loss": 0.8459208, + "learning_rate": 0.0002615073242729483, + "loss": 0.85636061, + "num_input_tokens_seen": 288462976, + "router_z_loss_mlp": 0.40673828, + "step": 3475, + "time_per_iteration": 2.637474775314331 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047916, + "balance_loss_mlp": 1.00726593, + "epoch": 0.6687187379761447, + "flos": 631002015744.0, + "grad_norm": 0.030827527571606016, + "language_loss": 0.85137111, + "learning_rate": 0.0002612335525470573, + "loss": 0.86185026, + "num_input_tokens_seen": 288542032, + "router_z_loss_mlp": 0.40649414, + "step": 3476, + "time_per_iteration": 2.823110342025757 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048175, + "balance_loss_mlp": 1.00759649, + "epoch": 0.6689111196614083, + "flos": 536688826368.0, + "grad_norm": 0.0342797401257031, + "language_loss": 0.78870076, + "learning_rate": 0.0002609598735185221, + "loss": 0.79918253, + "num_input_tokens_seen": 288610704, + "router_z_loss_mlp": 0.40576172, + "step": 3477, + "time_per_iteration": 2.6825544834136963 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048062, + "balance_loss_mlp": 1.00736415, + "epoch": 0.6691035013466718, + "flos": 604161208320.0, + "grad_norm": 0.031585406138604756, + "language_loss": 0.83722425, + "learning_rate": 0.00026068628729359445, + "loss": 0.84770489, + "num_input_tokens_seen": 288686080, + "router_z_loss_mlp": 0.40698242, + "step": 3478, + "time_per_iteration": 2.77055287361145 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049609, + "balance_loss_mlp": 1.00893426, + "epoch": 0.6692958830319353, + "flos": 634128193536.0, + "grad_norm": 0.03192222919752024, + "language_loss": 0.76639205, + "learning_rate": 0.00026041279397848996, + "loss": 0.77688813, + "num_input_tokens_seen": 288764944, + "router_z_loss_mlp": 0.40673828, + "step": 3479, + "time_per_iteration": 2.8836774826049805 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046574, + "balance_loss_mlp": 1.00585258, + "epoch": 0.6694882647171989, + "flos": 646749556224.0, + "grad_norm": 0.03482378260676791, + "language_loss": 0.83261842, + "learning_rate": 0.00026013939367938797, + "loss": 0.84308422, + "num_input_tokens_seen": 288847856, + "router_z_loss_mlp": 0.40722656, + "step": 3480, + "time_per_iteration": 2.8915905952453613 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044065, + "balance_loss_mlp": 1.00339055, + "epoch": 0.6696806464024625, + "flos": 570762810624.0, + "grad_norm": 0.033098295415039676, + "language_loss": 0.81370211, + "learning_rate": 0.00025986608650243204, + "loss": 0.82414275, + "num_input_tokens_seen": 288929360, + "router_z_loss_mlp": 0.40673828, + "step": 3481, + "time_per_iteration": 2.785128116607666 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01041793, + "balance_loss_mlp": 1.00114262, + "epoch": 0.6698730280877261, + "flos": 623964225024.0, + "grad_norm": 0.029494842151893377, + "language_loss": 0.79968995, + "learning_rate": 0.0002595928725537293, + "loss": 0.81010795, + "num_input_tokens_seen": 289010160, + "router_z_loss_mlp": 0.40649414, + "step": 3482, + "time_per_iteration": 2.862269639968872 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044098, + "balance_loss_mlp": 1.00361502, + "epoch": 0.6700654097729896, + "flos": 503509147392.0, + "grad_norm": 0.04687738924835003, + "language_loss": 0.88447571, + "learning_rate": 0.0002593197519393509, + "loss": 0.89491665, + "num_input_tokens_seen": 289077392, + "router_z_loss_mlp": 0.40478516, + "step": 3483, + "time_per_iteration": 2.5955467224121094 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044048, + "balance_loss_mlp": 1.00356483, + "epoch": 0.6702577914582531, + "flos": 625119045120.0, + "grad_norm": 0.03040614525342857, + "language_loss": 0.79865301, + "learning_rate": 0.00025904672476533165, + "loss": 0.80909348, + "num_input_tokens_seen": 289157248, + "router_z_loss_mlp": 0.40478516, + "step": 3484, + "time_per_iteration": 2.83461594581604 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047488, + "balance_loss_mlp": 1.00695693, + "epoch": 0.6704501731435167, + "flos": 457213320960.0, + "grad_norm": 0.03431199864252877, + "language_loss": 0.83164477, + "learning_rate": 0.0002587737911376704, + "loss": 0.84211963, + "num_input_tokens_seen": 289224864, + "router_z_loss_mlp": 0.40527344, + "step": 3485, + "time_per_iteration": 2.6094586849212646 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043694, + "balance_loss_mlp": 1.00313866, + "epoch": 0.6706425548287803, + "flos": 544258340352.0, + "grad_norm": 0.0329892912769069, + "language_loss": 0.84059811, + "learning_rate": 0.00025850095116232885, + "loss": 0.851035, + "num_input_tokens_seen": 289293488, + "router_z_loss_mlp": 0.40551758, + "step": 3486, + "time_per_iteration": 2.686342477798462 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043806, + "balance_loss_mlp": 1.00332236, + "epoch": 0.6708349365140439, + "flos": 635180946432.0, + "grad_norm": 0.03091711657706004, + "language_loss": 0.78076321, + "learning_rate": 0.000258228204945233, + "loss": 0.79120129, + "num_input_tokens_seen": 289370560, + "router_z_loss_mlp": 0.40478516, + "step": 3487, + "time_per_iteration": 2.9295520782470703 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044942, + "balance_loss_mlp": 1.0044347, + "epoch": 0.6710273181993074, + "flos": 641903787264.0, + "grad_norm": 0.032938145156071165, + "language_loss": 0.85185027, + "learning_rate": 0.00025795555259227254, + "loss": 0.86229968, + "num_input_tokens_seen": 289440096, + "router_z_loss_mlp": 0.4050293, + "step": 3488, + "time_per_iteration": 2.79502534866333 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046195, + "balance_loss_mlp": 1.00561631, + "epoch": 0.671219699884571, + "flos": 555025963776.0, + "grad_norm": 0.02894865619678765, + "language_loss": 0.84055519, + "learning_rate": 0.00025768299420930046, + "loss": 0.85101712, + "num_input_tokens_seen": 289515808, + "router_z_loss_mlp": 0.40576172, + "step": 3489, + "time_per_iteration": 2.779972553253174 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046589, + "balance_loss_mlp": 1.00622523, + "epoch": 0.6714120815698346, + "flos": 732782651904.0, + "grad_norm": 0.0327604861643189, + "language_loss": 0.8377071, + "learning_rate": 0.0002574105299021332, + "loss": 0.84817296, + "num_input_tokens_seen": 289591344, + "router_z_loss_mlp": 0.40356445, + "step": 3490, + "time_per_iteration": 2.893480062484741 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044671, + "balance_loss_mlp": 1.00433028, + "epoch": 0.6716044632550981, + "flos": 689947395072.0, + "grad_norm": 0.03209886664090861, + "language_loss": 0.8471486, + "learning_rate": 0.00025713815977655084, + "loss": 0.85759532, + "num_input_tokens_seen": 289672032, + "router_z_loss_mlp": 0.40332031, + "step": 3491, + "time_per_iteration": 2.957084894180298 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044489, + "balance_loss_mlp": 1.0041728, + "epoch": 0.6717968449403616, + "flos": 461587637760.0, + "grad_norm": 0.0366727841184643, + "language_loss": 0.85291302, + "learning_rate": 0.0002568658839382969, + "loss": 0.8633579, + "num_input_tokens_seen": 289738304, + "router_z_loss_mlp": 0.40307617, + "step": 3492, + "time_per_iteration": 2.5661098957061768 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054708, + "balance_loss_mlp": 1.01429558, + "epoch": 0.6719892266256252, + "flos": 502597345536.0, + "grad_norm": 0.0394893912508571, + "language_loss": 0.8491143, + "learning_rate": 0.00025659370249307814, + "loss": 0.85966134, + "num_input_tokens_seen": 289804304, + "router_z_loss_mlp": 0.40405273, + "step": 3493, + "time_per_iteration": 2.6122422218322754 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051838, + "balance_loss_mlp": 1.01144993, + "epoch": 0.6721816083108888, + "flos": 684737098752.0, + "grad_norm": 0.033378667785843884, + "language_loss": 0.85795897, + "learning_rate": 0.00025632161554656473, + "loss": 0.86847734, + "num_input_tokens_seen": 289877696, + "router_z_loss_mlp": 0.40380859, + "step": 3494, + "time_per_iteration": 2.8829426765441895 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049339, + "balance_loss_mlp": 1.00897467, + "epoch": 0.6723739899961524, + "flos": 586896265728.0, + "grad_norm": 0.03541855963970859, + "language_loss": 0.8296122, + "learning_rate": 0.00025604962320439017, + "loss": 0.84010559, + "num_input_tokens_seen": 289947296, + "router_z_loss_mlp": 0.40356445, + "step": 3495, + "time_per_iteration": 2.7043375968933105 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104451, + "balance_loss_mlp": 1.00421739, + "epoch": 0.672566371681416, + "flos": 507740567808.0, + "grad_norm": 0.03528245901985063, + "language_loss": 0.82875669, + "learning_rate": 0.0002557777255721516, + "loss": 0.83920175, + "num_input_tokens_seen": 290020080, + "router_z_loss_mlp": 0.40283203, + "step": 3496, + "time_per_iteration": 2.719181776046753 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045131, + "balance_loss_mlp": 1.00479066, + "epoch": 0.6727587533666795, + "flos": 536736458496.0, + "grad_norm": 0.036828443855142154, + "language_loss": 0.81081581, + "learning_rate": 0.0002555059227554087, + "loss": 0.82126713, + "num_input_tokens_seen": 290094544, + "router_z_loss_mlp": 0.40332031, + "step": 3497, + "time_per_iteration": 2.7057156562805176 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045175, + "balance_loss_mlp": 1.0047394, + "epoch": 0.672951135051943, + "flos": 604037753856.0, + "grad_norm": 0.0344810885559189, + "language_loss": 0.78363037, + "learning_rate": 0.00025523421485968453, + "loss": 0.79408205, + "num_input_tokens_seen": 290173520, + "router_z_loss_mlp": 0.40429688, + "step": 3498, + "time_per_iteration": 2.8460867404937744 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043336, + "balance_loss_mlp": 1.00299513, + "epoch": 0.6731435167372066, + "flos": 812679065088.0, + "grad_norm": 0.0462085280228462, + "language_loss": 0.8591696, + "learning_rate": 0.00025496260199046585, + "loss": 0.86960292, + "num_input_tokens_seen": 290248240, + "router_z_loss_mlp": 0.40332031, + "step": 3499, + "time_per_iteration": 2.971506357192993 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043184, + "balance_loss_mlp": 1.0028913, + "epoch": 0.6733358984224702, + "flos": 612751394304.0, + "grad_norm": 0.03556230846218865, + "language_loss": 0.84967017, + "learning_rate": 0.000254691084253202, + "loss": 0.86010194, + "num_input_tokens_seen": 290326288, + "router_z_loss_mlp": 0.40283203, + "step": 3500, + "time_per_iteration": 2.8486316204071045 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043459, + "balance_loss_mlp": 1.00318992, + "epoch": 0.6735282801077337, + "flos": 559968942336.0, + "grad_norm": 0.24449978816738047, + "language_loss": 0.77738857, + "learning_rate": 0.00025441966175330567, + "loss": 0.7878232, + "num_input_tokens_seen": 290395984, + "router_z_loss_mlp": 0.40258789, + "step": 3501, + "time_per_iteration": 2.631596803665161 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104251, + "balance_loss_mlp": 1.0023365, + "epoch": 0.6737206617929973, + "flos": 673633138176.0, + "grad_norm": 0.03266233971307438, + "language_loss": 0.80253637, + "learning_rate": 0.00025414833459615183, + "loss": 0.81296146, + "num_input_tokens_seen": 290470224, + "router_z_loss_mlp": 0.40161133, + "step": 3502, + "time_per_iteration": 2.822633981704712 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044113, + "balance_loss_mlp": 1.00398731, + "epoch": 0.6739130434782609, + "flos": 634642420224.0, + "grad_norm": 0.03194426719542878, + "language_loss": 0.80720419, + "learning_rate": 0.0002538771028870796, + "loss": 0.81764531, + "num_input_tokens_seen": 290542864, + "router_z_loss_mlp": 0.40112305, + "step": 3503, + "time_per_iteration": 2.8278305530548096 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044086, + "balance_loss_mlp": 1.00376928, + "epoch": 0.6741054251635245, + "flos": 532546834176.0, + "grad_norm": 0.03505319293998398, + "language_loss": 0.82144141, + "learning_rate": 0.0002536059667313903, + "loss": 0.8318823, + "num_input_tokens_seen": 290617248, + "router_z_loss_mlp": 0.40307617, + "step": 3504, + "time_per_iteration": 2.772728443145752 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045901, + "balance_loss_mlp": 1.00548911, + "epoch": 0.674297806848788, + "flos": 543652740096.0, + "grad_norm": 0.033634031590092824, + "language_loss": 0.89796269, + "learning_rate": 0.0002533349262343483, + "loss": 0.90842175, + "num_input_tokens_seen": 290690112, + "router_z_loss_mlp": 0.40405273, + "step": 3505, + "time_per_iteration": 2.6931023597717285 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048869, + "balance_loss_mlp": 1.00845683, + "epoch": 0.6744901885340515, + "flos": 464455246080.0, + "grad_norm": 0.03724604036951252, + "language_loss": 0.82972419, + "learning_rate": 0.0002530639815011807, + "loss": 0.84021288, + "num_input_tokens_seen": 290756352, + "router_z_loss_mlp": 0.40405273, + "step": 3506, + "time_per_iteration": 2.5213606357574463 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104646, + "balance_loss_mlp": 1.00604796, + "epoch": 0.6746825702193151, + "flos": 633022950912.0, + "grad_norm": 0.0353973537861221, + "language_loss": 0.85602045, + "learning_rate": 0.0002527931326370781, + "loss": 0.866485, + "num_input_tokens_seen": 290829776, + "router_z_loss_mlp": 0.40405273, + "step": 3507, + "time_per_iteration": 2.7929484844207764 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046008, + "balance_loss_mlp": 1.00554848, + "epoch": 0.6748749519045787, + "flos": 672393747456.0, + "grad_norm": 0.038454630804936565, + "language_loss": 0.83645785, + "learning_rate": 0.00025252237974719276, + "loss": 0.84691793, + "num_input_tokens_seen": 290900736, + "router_z_loss_mlp": 0.40454102, + "step": 3508, + "time_per_iteration": 2.8264431953430176 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044153, + "balance_loss_mlp": 1.00374067, + "epoch": 0.6750673335898423, + "flos": 768493602048.0, + "grad_norm": 0.034781380834319586, + "language_loss": 0.81037247, + "learning_rate": 0.00025225172293664056, + "loss": 0.82081401, + "num_input_tokens_seen": 290981696, + "router_z_loss_mlp": 0.40405273, + "step": 3509, + "time_per_iteration": 2.988295078277588 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045361, + "balance_loss_mlp": 1.00597382, + "epoch": 0.6752597152751059, + "flos": 1515907846656.0, + "grad_norm": 0.0075717383905430985, + "language_loss": 0.76933134, + "learning_rate": 0.00025198116231049954, + "loss": 0.77978498, + "num_input_tokens_seen": 291217888, + "router_z_loss_mlp": 0.39355469, + "step": 3510, + "time_per_iteration": 4.925229787826538 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043762, + "balance_loss_mlp": 1.00339806, + "epoch": 0.6754520969603693, + "flos": 688534005504.0, + "grad_norm": 0.03671107253105254, + "language_loss": 0.85454929, + "learning_rate": 0.00025171069797381106, + "loss": 0.8649869, + "num_input_tokens_seen": 291287856, + "router_z_loss_mlp": 0.40356445, + "step": 3511, + "time_per_iteration": 2.8605566024780273 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042968, + "balance_loss_mlp": 1.00265193, + "epoch": 0.6756444786456329, + "flos": 501618469632.0, + "grad_norm": 0.03363257909810701, + "language_loss": 0.82468766, + "learning_rate": 0.00025144033003157864, + "loss": 0.83511734, + "num_input_tokens_seen": 291354912, + "router_z_loss_mlp": 0.40307617, + "step": 3512, + "time_per_iteration": 2.6560440063476562 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043999, + "balance_loss_mlp": 1.00382507, + "epoch": 0.6758368603308965, + "flos": 493660128768.0, + "grad_norm": 0.04010660433283205, + "language_loss": 0.79292786, + "learning_rate": 0.00025117005858876806, + "loss": 0.80336791, + "num_input_tokens_seen": 291426816, + "router_z_loss_mlp": 0.40161133, + "step": 3513, + "time_per_iteration": 2.6984188556671143 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044124, + "balance_loss_mlp": 1.00392663, + "epoch": 0.6760292420161601, + "flos": 557044953600.0, + "grad_norm": 0.035892201444293004, + "language_loss": 0.86103761, + "learning_rate": 0.000250899883750308, + "loss": 0.8714788, + "num_input_tokens_seen": 291497648, + "router_z_loss_mlp": 0.40185547, + "step": 3514, + "time_per_iteration": 2.7181315422058105 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046252, + "balance_loss_mlp": 1.00600672, + "epoch": 0.6762216237014236, + "flos": 608722162944.0, + "grad_norm": 0.033450458947787066, + "language_loss": 0.81925356, + "learning_rate": 0.00025062980562109006, + "loss": 0.82971609, + "num_input_tokens_seen": 291568080, + "router_z_loss_mlp": 0.40234375, + "step": 3515, + "time_per_iteration": 2.78231143951416 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043998, + "balance_loss_mlp": 1.00377643, + "epoch": 0.6764140053866872, + "flos": 534928406016.0, + "grad_norm": 0.037161832732059044, + "language_loss": 0.83539182, + "learning_rate": 0.0002503598243059677, + "loss": 0.84583181, + "num_input_tokens_seen": 291644896, + "router_z_loss_mlp": 0.40209961, + "step": 3516, + "time_per_iteration": 2.7860419750213623 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046044, + "balance_loss_mlp": 1.00584662, + "epoch": 0.6766063870719508, + "flos": 505862529024.0, + "grad_norm": 0.041409918101289474, + "language_loss": 0.80496907, + "learning_rate": 0.0002500899399097568, + "loss": 0.81542951, + "num_input_tokens_seen": 291716864, + "router_z_loss_mlp": 0.40185547, + "step": 3517, + "time_per_iteration": 2.6418778896331787 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01041487, + "balance_loss_mlp": 1.00131381, + "epoch": 0.6767987687572143, + "flos": 514194145536.0, + "grad_norm": 0.03808875517476391, + "language_loss": 0.86208284, + "learning_rate": 0.0002498201525372359, + "loss": 0.87249774, + "num_input_tokens_seen": 291786000, + "router_z_loss_mlp": 0.40161133, + "step": 3518, + "time_per_iteration": 2.569801092147827 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01041708, + "balance_loss_mlp": 1.00148714, + "epoch": 0.6769911504424779, + "flos": 526079650560.0, + "grad_norm": 0.03452143000851854, + "language_loss": 0.83818328, + "learning_rate": 0.00024955046229314584, + "loss": 0.84860039, + "num_input_tokens_seen": 291854768, + "router_z_loss_mlp": 0.40209961, + "step": 3519, + "time_per_iteration": 2.602756977081299 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043706, + "balance_loss_mlp": 1.00353205, + "epoch": 0.6771835321277414, + "flos": 450837510912.0, + "grad_norm": 0.03417107794198843, + "language_loss": 0.87895727, + "learning_rate": 0.00024928086928218947, + "loss": 0.8893944, + "num_input_tokens_seen": 291918096, + "router_z_loss_mlp": 0.40161133, + "step": 3520, + "time_per_iteration": 2.4941091537475586 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044656, + "balance_loss_mlp": 1.00443459, + "epoch": 0.677375913813005, + "flos": 710674852608.0, + "grad_norm": 0.03642632041664857, + "language_loss": 0.76859355, + "learning_rate": 0.00024901137360903216, + "loss": 0.7790401, + "num_input_tokens_seen": 291998752, + "router_z_loss_mlp": 0.40209961, + "step": 3521, + "time_per_iteration": 2.985905408859253 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044359, + "balance_loss_mlp": 1.00404227, + "epoch": 0.6775682954982686, + "flos": 429346005504.0, + "grad_norm": 0.039972484461639736, + "language_loss": 0.81834614, + "learning_rate": 0.00024874197537830115, + "loss": 0.82878971, + "num_input_tokens_seen": 292065056, + "router_z_loss_mlp": 0.40307617, + "step": 3522, + "time_per_iteration": 2.525432586669922 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045522, + "balance_loss_mlp": 1.0052768, + "epoch": 0.6777606771835322, + "flos": 438821748480.0, + "grad_norm": 0.0378942066794791, + "language_loss": 0.83926749, + "learning_rate": 0.00024847267469458684, + "loss": 0.84972268, + "num_input_tokens_seen": 292129248, + "router_z_loss_mlp": 0.40234375, + "step": 3523, + "time_per_iteration": 2.519306182861328 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045594, + "balance_loss_mlp": 1.0053252, + "epoch": 0.6779530588687956, + "flos": 776788280064.0, + "grad_norm": 0.03620909543605363, + "language_loss": 0.78424889, + "learning_rate": 0.00024820347166244034, + "loss": 0.79470479, + "num_input_tokens_seen": 292206080, + "router_z_loss_mlp": 0.40258789, + "step": 3524, + "time_per_iteration": 3.016852378845215 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045119, + "balance_loss_mlp": 1.00494587, + "epoch": 0.6781454405540592, + "flos": 572905254912.0, + "grad_norm": 0.03295614224458047, + "language_loss": 0.85541701, + "learning_rate": 0.0002479343663863755, + "loss": 0.86586821, + "num_input_tokens_seen": 292280192, + "router_z_loss_mlp": 0.40161133, + "step": 3525, + "time_per_iteration": 2.7807812690734863 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046011, + "balance_loss_mlp": 1.00586104, + "epoch": 0.6783378222393228, + "flos": 485983689984.0, + "grad_norm": 0.034679626335120894, + "language_loss": 0.77479804, + "learning_rate": 0.00024766535897086876, + "loss": 0.78525817, + "num_input_tokens_seen": 292347792, + "router_z_loss_mlp": 0.40136719, + "step": 3526, + "time_per_iteration": 2.599513530731201 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047237, + "balance_loss_mlp": 1.0070163, + "epoch": 0.6785302039245864, + "flos": 483832497408.0, + "grad_norm": 0.03442955801383442, + "language_loss": 0.797737, + "learning_rate": 0.0002473964495203578, + "loss": 0.80820936, + "num_input_tokens_seen": 292420032, + "router_z_loss_mlp": 0.40209961, + "step": 3527, + "time_per_iteration": 2.6847755908966064 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046675, + "balance_loss_mlp": 1.00640571, + "epoch": 0.67872258560985, + "flos": 525862877184.0, + "grad_norm": 0.03305823044562006, + "language_loss": 0.861408, + "learning_rate": 0.0002471276381392425, + "loss": 0.87187475, + "num_input_tokens_seen": 292497792, + "router_z_loss_mlp": 0.40258789, + "step": 3528, + "time_per_iteration": 4.207594156265259 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043282, + "balance_loss_mlp": 1.00437164, + "epoch": 0.6789149672951135, + "flos": 1555894937088.0, + "grad_norm": 0.004731891717640295, + "language_loss": 0.78188634, + "learning_rate": 0.0002468589249318848, + "loss": 0.79231918, + "num_input_tokens_seen": 292726704, + "router_z_loss_mlp": 0.38867188, + "step": 3529, + "time_per_iteration": 4.977165222167969 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046125, + "balance_loss_mlp": 1.00583255, + "epoch": 0.6791073489803771, + "flos": 742686105600.0, + "grad_norm": 0.033652850666290056, + "language_loss": 0.84582424, + "learning_rate": 0.00024659031000260826, + "loss": 0.85628551, + "num_input_tokens_seen": 292802320, + "router_z_loss_mlp": 0.40283203, + "step": 3530, + "time_per_iteration": 2.9048852920532227 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104749, + "balance_loss_mlp": 1.00703049, + "epoch": 0.6792997306656406, + "flos": 577448712960.0, + "grad_norm": 0.040150019342018534, + "language_loss": 0.81559235, + "learning_rate": 0.0002463217934556985, + "loss": 0.82606721, + "num_input_tokens_seen": 292870480, + "router_z_loss_mlp": 0.40454102, + "step": 3531, + "time_per_iteration": 2.6925132274627686 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046124, + "balance_loss_mlp": 1.00692749, + "epoch": 0.6794921123509042, + "flos": 1506546809856.0, + "grad_norm": 0.009705737357192788, + "language_loss": 0.7653209, + "learning_rate": 0.000246053375395403, + "loss": 0.77578211, + "num_input_tokens_seen": 293100752, + "router_z_loss_mlp": 0.39160156, + "step": 3532, + "time_per_iteration": 4.747551202774048 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01041824, + "balance_loss_mlp": 1.00138783, + "epoch": 0.6796844940361677, + "flos": 700141499136.0, + "grad_norm": 0.03694517286226913, + "language_loss": 0.84159917, + "learning_rate": 0.0002457850559259306, + "loss": 0.8520174, + "num_input_tokens_seen": 293178192, + "router_z_loss_mlp": 0.40429688, + "step": 3533, + "time_per_iteration": 2.8468008041381836 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043634, + "balance_loss_mlp": 1.00326967, + "epoch": 0.6798768757214313, + "flos": 553816708608.0, + "grad_norm": 0.03486714477103508, + "language_loss": 0.82139623, + "learning_rate": 0.00024551683515145275, + "loss": 0.83183265, + "num_input_tokens_seen": 293246368, + "router_z_loss_mlp": 0.40356445, + "step": 3534, + "time_per_iteration": 2.6637539863586426 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043023, + "balance_loss_mlp": 1.00261092, + "epoch": 0.6800692574066949, + "flos": 523976090112.0, + "grad_norm": 0.03293406934357783, + "language_loss": 0.87167442, + "learning_rate": 0.0002452487131761014, + "loss": 0.88210464, + "num_input_tokens_seen": 293320656, + "router_z_loss_mlp": 0.40405273, + "step": 3535, + "time_per_iteration": 2.719104051589966 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01041768, + "balance_loss_mlp": 1.00128436, + "epoch": 0.6802616390919585, + "flos": 575130324480.0, + "grad_norm": 0.03513185710250464, + "language_loss": 0.80471444, + "learning_rate": 0.00024498069010397093, + "loss": 0.81513214, + "num_input_tokens_seen": 293388592, + "router_z_loss_mlp": 0.40478516, + "step": 3536, + "time_per_iteration": 2.656780242919922 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042968, + "balance_loss_mlp": 1.00250804, + "epoch": 0.6804540207772221, + "flos": 489129309696.0, + "grad_norm": 0.03285150643596687, + "language_loss": 0.85294282, + "learning_rate": 0.00024471276603911697, + "loss": 0.86337245, + "num_input_tokens_seen": 293453936, + "router_z_loss_mlp": 0.40454102, + "step": 3537, + "time_per_iteration": 2.5711469650268555 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046239, + "balance_loss_mlp": 1.00566006, + "epoch": 0.6806464024624855, + "flos": 579745714176.0, + "grad_norm": 0.0319685563784025, + "language_loss": 0.79588819, + "learning_rate": 0.0002444449410855572, + "loss": 0.80635059, + "num_input_tokens_seen": 293527664, + "router_z_loss_mlp": 0.40576172, + "step": 3538, + "time_per_iteration": 2.7366206645965576 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048213, + "balance_loss_mlp": 1.00777721, + "epoch": 0.6808387841477491, + "flos": 554793639168.0, + "grad_norm": 0.028008178154431115, + "language_loss": 0.8488512, + "learning_rate": 0.00024417721534727033, + "loss": 0.85933334, + "num_input_tokens_seen": 293599344, + "router_z_loss_mlp": 0.40429688, + "step": 3539, + "time_per_iteration": 2.6501903533935547 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047596, + "balance_loss_mlp": 1.00716043, + "epoch": 0.6810311658330127, + "flos": 427754726400.0, + "grad_norm": 0.0434584868230971, + "language_loss": 0.83537716, + "learning_rate": 0.00024390958892819687, + "loss": 0.84585309, + "num_input_tokens_seen": 293663088, + "router_z_loss_mlp": 0.40429688, + "step": 3540, + "time_per_iteration": 2.5052664279937744 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046344, + "balance_loss_mlp": 1.00571704, + "epoch": 0.6812235475182763, + "flos": 573461277696.0, + "grad_norm": 0.03693481574756638, + "language_loss": 0.81626362, + "learning_rate": 0.0002436420619322381, + "loss": 0.82672703, + "num_input_tokens_seen": 293741296, + "router_z_loss_mlp": 0.40625, + "step": 3541, + "time_per_iteration": 2.832705497741699 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049574, + "balance_loss_mlp": 1.00901949, + "epoch": 0.6814159292035398, + "flos": 502994920704.0, + "grad_norm": 0.03366403266770877, + "language_loss": 0.83297849, + "learning_rate": 0.0002433746344632577, + "loss": 0.84347427, + "num_input_tokens_seen": 293815840, + "router_z_loss_mlp": 0.40551758, + "step": 3542, + "time_per_iteration": 2.672311782836914 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050212, + "balance_loss_mlp": 1.00972831, + "epoch": 0.6816083108888034, + "flos": 766956758016.0, + "grad_norm": 0.03487918397791305, + "language_loss": 0.80590951, + "learning_rate": 0.00024310730662508006, + "loss": 0.81641161, + "num_input_tokens_seen": 293896368, + "router_z_loss_mlp": 0.40478516, + "step": 3543, + "time_per_iteration": 3.086225986480713 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051043, + "balance_loss_mlp": 1.0106312, + "epoch": 0.681800692574067, + "flos": 480480797952.0, + "grad_norm": 0.03000398684674813, + "language_loss": 0.88137174, + "learning_rate": 0.0002428400785214911, + "loss": 0.89188218, + "num_input_tokens_seen": 293963344, + "router_z_loss_mlp": 0.40405273, + "step": 3544, + "time_per_iteration": 2.5797877311706543 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050788, + "balance_loss_mlp": 1.01030433, + "epoch": 0.6819930742593305, + "flos": 692834445312.0, + "grad_norm": 0.03498907792035314, + "language_loss": 0.83317804, + "learning_rate": 0.00024257295025623794, + "loss": 0.84368593, + "num_input_tokens_seen": 294035440, + "router_z_loss_mlp": 0.40478516, + "step": 3545, + "time_per_iteration": 2.817002534866333 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049009, + "balance_loss_mlp": 1.00852597, + "epoch": 0.6821854559445941, + "flos": 679355715840.0, + "grad_norm": 0.03355065924517062, + "language_loss": 0.81087142, + "learning_rate": 0.00024230592193302892, + "loss": 0.82136154, + "num_input_tokens_seen": 294116944, + "router_z_loss_mlp": 0.40478516, + "step": 3546, + "time_per_iteration": 2.9010307788848877 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045229, + "balance_loss_mlp": 1.00469804, + "epoch": 0.6823778376298576, + "flos": 463133230080.0, + "grad_norm": 0.04387981272485442, + "language_loss": 0.85039532, + "learning_rate": 0.00024203899365553372, + "loss": 0.86084759, + "num_input_tokens_seen": 294178976, + "router_z_loss_mlp": 0.40527344, + "step": 3547, + "time_per_iteration": 2.5003862380981445 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045105, + "balance_loss_mlp": 1.00543213, + "epoch": 0.6825702193151212, + "flos": 1478176939776.0, + "grad_norm": 0.005965966657319216, + "language_loss": 0.76734358, + "learning_rate": 0.00024177216552738302, + "loss": 0.7777946, + "num_input_tokens_seen": 294384960, + "router_z_loss_mlp": 0.39648438, + "step": 3548, + "time_per_iteration": 4.51382303237915 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043331, + "balance_loss_mlp": 1.00299025, + "epoch": 0.6827626010003848, + "flos": 724414096896.0, + "grad_norm": 0.03369751046554337, + "language_loss": 0.83353454, + "learning_rate": 0.00024150543765216848, + "loss": 0.84396785, + "num_input_tokens_seen": 294461408, + "router_z_loss_mlp": 0.40332031, + "step": 3549, + "time_per_iteration": 2.868882179260254 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043901, + "balance_loss_mlp": 1.00348902, + "epoch": 0.6829549826856484, + "flos": 559940752128.0, + "grad_norm": 0.03314347093854088, + "language_loss": 0.83934271, + "learning_rate": 0.00024123881013344352, + "loss": 0.84978169, + "num_input_tokens_seen": 294530624, + "router_z_loss_mlp": 0.40405273, + "step": 3550, + "time_per_iteration": 2.673149347305298 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043918, + "balance_loss_mlp": 1.00352979, + "epoch": 0.6831473643709118, + "flos": 626134859520.0, + "grad_norm": 0.03193969534774964, + "language_loss": 0.80188608, + "learning_rate": 0.00024097228307472202, + "loss": 0.81232524, + "num_input_tokens_seen": 294606784, + "router_z_loss_mlp": 0.40380859, + "step": 3551, + "time_per_iteration": 2.783318519592285 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044109, + "balance_loss_mlp": 1.00367296, + "epoch": 0.6833397460561754, + "flos": 715098746880.0, + "grad_norm": 0.03508880753124507, + "language_loss": 0.82590389, + "learning_rate": 0.00024070585657947846, + "loss": 0.83634502, + "num_input_tokens_seen": 294686960, + "router_z_loss_mlp": 0.40429688, + "step": 3552, + "time_per_iteration": 2.87227725982666 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044585, + "balance_loss_mlp": 1.00414932, + "epoch": 0.683532127741439, + "flos": 465727684608.0, + "grad_norm": 0.028861941577793874, + "language_loss": 0.86039191, + "learning_rate": 0.00024043953075114934, + "loss": 0.87083775, + "num_input_tokens_seen": 294759712, + "router_z_loss_mlp": 0.40429688, + "step": 3553, + "time_per_iteration": 2.685239315032959 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044014, + "balance_loss_mlp": 1.00353038, + "epoch": 0.6837245094267026, + "flos": 583340431872.0, + "grad_norm": 0.03309577009255294, + "language_loss": 0.89582229, + "learning_rate": 0.00024017330569313128, + "loss": 0.9062624, + "num_input_tokens_seen": 294830592, + "router_z_loss_mlp": 0.40478516, + "step": 3554, + "time_per_iteration": 2.738507032394409 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044038, + "balance_loss_mlp": 1.00345898, + "epoch": 0.6839168911119662, + "flos": 795524937984.0, + "grad_norm": 0.03513613894761906, + "language_loss": 0.75376379, + "learning_rate": 0.0002399071815087821, + "loss": 0.7642042, + "num_input_tokens_seen": 294907504, + "router_z_loss_mlp": 0.40576172, + "step": 3555, + "time_per_iteration": 3.038098096847534 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049451, + "balance_loss_mlp": 1.00908649, + "epoch": 0.6841092727972297, + "flos": 581115362304.0, + "grad_norm": 0.037584614918211315, + "language_loss": 0.84306592, + "learning_rate": 0.00023964115830142025, + "loss": 0.85356045, + "num_input_tokens_seen": 294977600, + "router_z_loss_mlp": 0.40356445, + "step": 3556, + "time_per_iteration": 2.6664743423461914 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047993, + "balance_loss_mlp": 1.0076046, + "epoch": 0.6843016544824932, + "flos": 384595771392.0, + "grad_norm": 0.04136622286730017, + "language_loss": 0.88220561, + "learning_rate": 0.00023937523617432522, + "loss": 0.89268553, + "num_input_tokens_seen": 295039408, + "router_z_loss_mlp": 0.40380859, + "step": 3557, + "time_per_iteration": 2.429532289505005 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104626, + "balance_loss_mlp": 1.00582457, + "epoch": 0.6844940361677568, + "flos": 1441289793792.0, + "grad_norm": 0.032795620592968935, + "language_loss": 0.87315959, + "learning_rate": 0.00023910941523073705, + "loss": 0.88362217, + "num_input_tokens_seen": 295142928, + "router_z_loss_mlp": 0.40429688, + "step": 3558, + "time_per_iteration": 3.8917641639709473 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104589, + "balance_loss_mlp": 1.00550175, + "epoch": 0.6846864178530204, + "flos": 521900719872.0, + "grad_norm": 0.03199772830475091, + "language_loss": 0.86959422, + "learning_rate": 0.0002388436955738566, + "loss": 0.8800531, + "num_input_tokens_seen": 295215504, + "router_z_loss_mlp": 0.40380859, + "step": 3559, + "time_per_iteration": 2.6707799434661865 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045488, + "balance_loss_mlp": 1.00514805, + "epoch": 0.6848787995382839, + "flos": 719230045440.0, + "grad_norm": 0.030101152384031323, + "language_loss": 0.81828642, + "learning_rate": 0.00023857807730684523, + "loss": 0.82874131, + "num_input_tokens_seen": 295291024, + "router_z_loss_mlp": 0.40332031, + "step": 3560, + "time_per_iteration": 2.8835229873657227 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044745, + "balance_loss_mlp": 1.00440454, + "epoch": 0.6850711812235475, + "flos": 512162516736.0, + "grad_norm": 0.03806744815323664, + "language_loss": 0.83236831, + "learning_rate": 0.00023831256053282547, + "loss": 0.84281576, + "num_input_tokens_seen": 295363248, + "router_z_loss_mlp": 0.40332031, + "step": 3561, + "time_per_iteration": 2.723851203918457 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104642, + "balance_loss_mlp": 1.0061748, + "epoch": 0.6852635629088111, + "flos": 669432820224.0, + "grad_norm": 0.034115256160246236, + "language_loss": 0.78766859, + "learning_rate": 0.00023804714535488003, + "loss": 0.79813278, + "num_input_tokens_seen": 295442032, + "router_z_loss_mlp": 0.40234375, + "step": 3562, + "time_per_iteration": 2.862870454788208 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048828, + "balance_loss_mlp": 1.00953674, + "epoch": 0.6854559445940747, + "flos": 1526367323136.0, + "grad_norm": 0.0075236953863810525, + "language_loss": 0.7980963, + "learning_rate": 0.0002377818318760519, + "loss": 0.80858457, + "num_input_tokens_seen": 295680560, + "router_z_loss_mlp": 0.39257812, + "step": 3563, + "time_per_iteration": 4.951240539550781 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045746, + "balance_loss_mlp": 1.00550103, + "epoch": 0.6856483262793382, + "flos": 455137950720.0, + "grad_norm": 0.03558245087854763, + "language_loss": 0.81134826, + "learning_rate": 0.00023751662019934488, + "loss": 0.82180572, + "num_input_tokens_seen": 295745712, + "router_z_loss_mlp": 0.40234375, + "step": 3564, + "time_per_iteration": 2.5381388664245605 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043955, + "balance_loss_mlp": 1.00378108, + "epoch": 0.6858407079646017, + "flos": 616689252096.0, + "grad_norm": 0.034154017668987145, + "language_loss": 0.79535556, + "learning_rate": 0.00023725151042772364, + "loss": 0.80579513, + "num_input_tokens_seen": 295815104, + "router_z_loss_mlp": 0.40161133, + "step": 3565, + "time_per_iteration": 2.8012731075286865 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044161, + "balance_loss_mlp": 1.00394011, + "epoch": 0.6860330896498653, + "flos": 467095387392.0, + "grad_norm": 0.03227163562068172, + "language_loss": 0.83989513, + "learning_rate": 0.00023698650266411276, + "loss": 0.85033673, + "num_input_tokens_seen": 295882928, + "router_z_loss_mlp": 0.40209961, + "step": 3566, + "time_per_iteration": 2.6114397048950195 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045444, + "balance_loss_mlp": 1.00527024, + "epoch": 0.6862254713351289, + "flos": 865839650304.0, + "grad_norm": 0.03269984364116833, + "language_loss": 0.83511543, + "learning_rate": 0.00023672159701139755, + "loss": 0.84556985, + "num_input_tokens_seen": 295970960, + "router_z_loss_mlp": 0.40161133, + "step": 3567, + "time_per_iteration": 3.2268896102905273 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045242, + "balance_loss_mlp": 1.00504482, + "epoch": 0.6864178530203925, + "flos": 448091411712.0, + "grad_norm": 0.03951724412418829, + "language_loss": 0.86782575, + "learning_rate": 0.00023645679357242296, + "loss": 0.87827814, + "num_input_tokens_seen": 296036128, + "router_z_loss_mlp": 0.40185547, + "step": 3568, + "time_per_iteration": 2.5142667293548584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045229, + "balance_loss_mlp": 1.00512683, + "epoch": 0.6866102347056561, + "flos": 425212761600.0, + "grad_norm": 0.04100777191651884, + "language_loss": 0.84717417, + "learning_rate": 0.00023619209244999534, + "loss": 0.85762644, + "num_input_tokens_seen": 296101440, + "router_z_loss_mlp": 0.40087891, + "step": 3569, + "time_per_iteration": 2.506850004196167 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043496, + "balance_loss_mlp": 1.00339389, + "epoch": 0.6868026163909196, + "flos": 473334137088.0, + "grad_norm": 0.0410478225777228, + "language_loss": 0.85724694, + "learning_rate": 0.0002359274937468806, + "loss": 0.86768192, + "num_input_tokens_seen": 296165504, + "router_z_loss_mlp": 0.40087891, + "step": 3570, + "time_per_iteration": 2.5271074771881104 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044524, + "balance_loss_mlp": 1.00446975, + "epoch": 0.6869949980761831, + "flos": 465206654976.0, + "grad_norm": 0.037625801670490476, + "language_loss": 0.78364801, + "learning_rate": 0.00023566299756580512, + "loss": 0.79409337, + "num_input_tokens_seen": 296236880, + "router_z_loss_mlp": 0.40039062, + "step": 3571, + "time_per_iteration": 2.641204595565796 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047048, + "balance_loss_mlp": 1.00682652, + "epoch": 0.6871873797614467, + "flos": 427131629568.0, + "grad_norm": 0.03563606510751839, + "language_loss": 0.78681505, + "learning_rate": 0.0002353986040094551, + "loss": 0.79728556, + "num_input_tokens_seen": 296299776, + "router_z_loss_mlp": 0.40209961, + "step": 3572, + "time_per_iteration": 2.508169412612915 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051965, + "balance_loss_mlp": 1.01188707, + "epoch": 0.6873797614467103, + "flos": 444555019776.0, + "grad_norm": 0.03726905033743987, + "language_loss": 0.79780114, + "learning_rate": 0.00023513431318047796, + "loss": 0.80832076, + "num_input_tokens_seen": 296365408, + "router_z_loss_mlp": 0.40063477, + "step": 3573, + "time_per_iteration": 2.524447441101074 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049651, + "balance_loss_mlp": 1.00952482, + "epoch": 0.6875721431319738, + "flos": 993915764736.0, + "grad_norm": 0.03636326410660492, + "language_loss": 0.77452493, + "learning_rate": 0.00023487012518147977, + "loss": 0.78502142, + "num_input_tokens_seen": 296445488, + "router_z_loss_mlp": 0.40112305, + "step": 3574, + "time_per_iteration": 3.220405340194702 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050375, + "balance_loss_mlp": 1.0102489, + "epoch": 0.6877645248172374, + "flos": 1287448957440.0, + "grad_norm": 0.03573540340682003, + "language_loss": 0.8513974, + "learning_rate": 0.00023460604011502772, + "loss": 0.86190116, + "num_input_tokens_seen": 296529936, + "router_z_loss_mlp": 0.40112305, + "step": 3575, + "time_per_iteration": 3.642275094985962 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050219, + "balance_loss_mlp": 1.01016474, + "epoch": 0.687956906502501, + "flos": 878230633728.0, + "grad_norm": 0.03712322767152043, + "language_loss": 0.86061072, + "learning_rate": 0.00023434205808364845, + "loss": 0.87111294, + "num_input_tokens_seen": 296607488, + "router_z_loss_mlp": 0.40039062, + "step": 3576, + "time_per_iteration": 3.093545436859131 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047023, + "balance_loss_mlp": 1.00680172, + "epoch": 0.6881492881877646, + "flos": 564471571200.0, + "grad_norm": 0.039318035109250464, + "language_loss": 0.86179203, + "learning_rate": 0.00023407817918982932, + "loss": 0.87226224, + "num_input_tokens_seen": 296678672, + "router_z_loss_mlp": 0.40209961, + "step": 3577, + "time_per_iteration": 2.755629777908325 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104676, + "balance_loss_mlp": 1.00656283, + "epoch": 0.6883416698730281, + "flos": 796510616832.0, + "grad_norm": 0.03470611198905491, + "language_loss": 0.79102242, + "learning_rate": 0.00023381440353601718, + "loss": 0.80149001, + "num_input_tokens_seen": 296758896, + "router_z_loss_mlp": 0.40185547, + "step": 3578, + "time_per_iteration": 2.990251302719116 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045579, + "balance_loss_mlp": 1.00540566, + "epoch": 0.6885340515582916, + "flos": 724880691456.0, + "grad_norm": 0.04272273427793483, + "language_loss": 0.86559987, + "learning_rate": 0.00023355073122461822, + "loss": 0.87605572, + "num_input_tokens_seen": 296830736, + "router_z_loss_mlp": 0.40161133, + "step": 3579, + "time_per_iteration": 2.9245500564575195 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045443, + "balance_loss_mlp": 1.00522161, + "epoch": 0.6887264332435552, + "flos": 1012522165248.0, + "grad_norm": 0.033292192645982856, + "language_loss": 0.83352244, + "learning_rate": 0.00023328716235799973, + "loss": 0.84397686, + "num_input_tokens_seen": 296911504, + "router_z_loss_mlp": 0.40209961, + "step": 3580, + "time_per_iteration": 3.2759361267089844 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049535, + "balance_loss_mlp": 1.00936127, + "epoch": 0.6889188149288188, + "flos": 586347045888.0, + "grad_norm": 0.03483646378728446, + "language_loss": 0.84317255, + "learning_rate": 0.00023302369703848803, + "loss": 0.85366791, + "num_input_tokens_seen": 296981488, + "router_z_loss_mlp": 0.40161133, + "step": 3581, + "time_per_iteration": 2.692676544189453 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047528, + "balance_loss_mlp": 1.00718749, + "epoch": 0.6891111966140824, + "flos": 637277703936.0, + "grad_norm": 0.03603221184194459, + "language_loss": 0.80829328, + "learning_rate": 0.00023276033536836937, + "loss": 0.81876856, + "num_input_tokens_seen": 297054896, + "router_z_loss_mlp": 0.40332031, + "step": 3582, + "time_per_iteration": 2.7863240242004395 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043146, + "balance_loss_mlp": 1.00297284, + "epoch": 0.6893035782993459, + "flos": 496312909056.0, + "grad_norm": 0.032647536159740746, + "language_loss": 0.85196984, + "learning_rate": 0.00023249707744988984, + "loss": 0.86240131, + "num_input_tokens_seen": 297128224, + "router_z_loss_mlp": 0.40161133, + "step": 3583, + "time_per_iteration": 2.6404688358306885 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104516, + "balance_loss_mlp": 1.00486732, + "epoch": 0.6894959599846094, + "flos": 459149685504.0, + "grad_norm": 0.038319027803205424, + "language_loss": 0.82998735, + "learning_rate": 0.00023223392338525529, + "loss": 0.84043896, + "num_input_tokens_seen": 297191312, + "router_z_loss_mlp": 0.40283203, + "step": 3584, + "time_per_iteration": 2.526021957397461 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050689, + "balance_loss_mlp": 1.01030123, + "epoch": 0.689688341669873, + "flos": 506057915136.0, + "grad_norm": 0.03433951849080314, + "language_loss": 0.79221714, + "learning_rate": 0.00023197087327663107, + "loss": 0.802724, + "num_input_tokens_seen": 297261904, + "router_z_loss_mlp": 0.40380859, + "step": 3585, + "time_per_iteration": 2.6632885932922363 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044933, + "balance_loss_mlp": 1.00449693, + "epoch": 0.6898807233551366, + "flos": 765219670272.0, + "grad_norm": 0.036720139480463, + "language_loss": 0.81855822, + "learning_rate": 0.00023170792722614243, + "loss": 0.82900751, + "num_input_tokens_seen": 297338352, + "router_z_loss_mlp": 0.40429688, + "step": 3586, + "time_per_iteration": 2.8943870067596436 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046135, + "balance_loss_mlp": 1.00577044, + "epoch": 0.6900731050404002, + "flos": 584573986560.0, + "grad_norm": 0.03037103532376505, + "language_loss": 0.84293818, + "learning_rate": 0.00023144508533587377, + "loss": 0.85339952, + "num_input_tokens_seen": 297416688, + "router_z_loss_mlp": 0.40356445, + "step": 3587, + "time_per_iteration": 2.826327085494995 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046611, + "balance_loss_mlp": 1.00622261, + "epoch": 0.6902654867256637, + "flos": 713206123776.0, + "grad_norm": 0.03728824809581911, + "language_loss": 0.79222, + "learning_rate": 0.0002311823477078698, + "loss": 0.8026861, + "num_input_tokens_seen": 297499968, + "router_z_loss_mlp": 0.40380859, + "step": 3588, + "time_per_iteration": 2.9109723567962646 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046452, + "balance_loss_mlp": 1.00611138, + "epoch": 0.6904578684109273, + "flos": 598304482560.0, + "grad_norm": 0.034163579129476235, + "language_loss": 0.85722661, + "learning_rate": 0.00023091971444413428, + "loss": 0.8676911, + "num_input_tokens_seen": 297574480, + "router_z_loss_mlp": 0.40332031, + "step": 3589, + "time_per_iteration": 4.1711201667785645 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044858, + "balance_loss_mlp": 1.00454128, + "epoch": 0.6906502500961909, + "flos": 586177904640.0, + "grad_norm": 0.030860818872724436, + "language_loss": 0.82910645, + "learning_rate": 0.00023065718564663012, + "loss": 0.83955508, + "num_input_tokens_seen": 297645360, + "router_z_loss_mlp": 0.40307617, + "step": 3590, + "time_per_iteration": 2.7104885578155518 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044712, + "balance_loss_mlp": 1.00551605, + "epoch": 0.6908426317814544, + "flos": 1591143183360.0, + "grad_norm": 0.007096149350185522, + "language_loss": 0.73911589, + "learning_rate": 0.00023039476141728011, + "loss": 0.74956298, + "num_input_tokens_seen": 297879472, + "router_z_loss_mlp": 0.39160156, + "step": 3591, + "time_per_iteration": 5.0011866092681885 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045736, + "balance_loss_mlp": 1.0053004, + "epoch": 0.6910350134667179, + "flos": 501805107456.0, + "grad_norm": 0.029643353067264133, + "language_loss": 0.81368697, + "learning_rate": 0.0002301324418579666, + "loss": 0.82414436, + "num_input_tokens_seen": 297950672, + "router_z_loss_mlp": 0.40429688, + "step": 3592, + "time_per_iteration": 2.710340738296509 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050949, + "balance_loss_mlp": 1.01184845, + "epoch": 0.6912273951519815, + "flos": 1412135443968.0, + "grad_norm": 0.014289812000501409, + "language_loss": 0.78688473, + "learning_rate": 0.00022987022707053107, + "loss": 0.79739422, + "num_input_tokens_seen": 298171728, + "router_z_loss_mlp": 0.390625, + "step": 3593, + "time_per_iteration": 4.750281810760498 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044393, + "balance_loss_mlp": 1.00402915, + "epoch": 0.6914197768372451, + "flos": 636557397504.0, + "grad_norm": 0.0367015241777211, + "language_loss": 0.8129431, + "learning_rate": 0.00022960811715677415, + "loss": 0.82338709, + "num_input_tokens_seen": 298250304, + "router_z_loss_mlp": 0.40356445, + "step": 3594, + "time_per_iteration": 2.846938133239746 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045248, + "balance_loss_mlp": 1.00485945, + "epoch": 0.6916121585225087, + "flos": 559202949120.0, + "grad_norm": 0.030135543775537642, + "language_loss": 0.82059658, + "learning_rate": 0.00022934611221845608, + "loss": 0.83104908, + "num_input_tokens_seen": 298328000, + "router_z_loss_mlp": 0.40380859, + "step": 3595, + "time_per_iteration": 2.8187928199768066 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049786, + "balance_loss_mlp": 1.00925434, + "epoch": 0.6918045402077723, + "flos": 530293574400.0, + "grad_norm": 0.0337393790819551, + "language_loss": 0.78598142, + "learning_rate": 0.00022908421235729609, + "loss": 0.79647928, + "num_input_tokens_seen": 298406832, + "router_z_loss_mlp": 0.40527344, + "step": 3596, + "time_per_iteration": 2.7116031646728516 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104948, + "balance_loss_mlp": 1.00894892, + "epoch": 0.6919969218930357, + "flos": 571426736640.0, + "grad_norm": 0.033365686577519565, + "language_loss": 0.8572033, + "learning_rate": 0.0002288224176749728, + "loss": 0.86769807, + "num_input_tokens_seen": 298477584, + "router_z_loss_mlp": 0.40527344, + "step": 3597, + "time_per_iteration": 2.6345982551574707 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053776, + "balance_loss_mlp": 1.01334012, + "epoch": 0.6921893035782993, + "flos": 684504774144.0, + "grad_norm": 0.03882210113784689, + "language_loss": 0.79009509, + "learning_rate": 0.00022856072827312385, + "loss": 0.80063289, + "num_input_tokens_seen": 298551872, + "router_z_loss_mlp": 0.40429688, + "step": 3598, + "time_per_iteration": 2.7988228797912598 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055733, + "balance_loss_mlp": 1.01532125, + "epoch": 0.6923816852635629, + "flos": 547794732288.0, + "grad_norm": 0.03734800797345761, + "language_loss": 0.77726078, + "learning_rate": 0.00022829914425334598, + "loss": 0.78781813, + "num_input_tokens_seen": 298619680, + "router_z_loss_mlp": 0.40405273, + "step": 3599, + "time_per_iteration": 2.628700017929077 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053695, + "balance_loss_mlp": 1.01318777, + "epoch": 0.6925740669488265, + "flos": 511057274112.0, + "grad_norm": 0.04268943868915618, + "language_loss": 0.81083095, + "learning_rate": 0.0002280376657171956, + "loss": 0.82136786, + "num_input_tokens_seen": 298690080, + "router_z_loss_mlp": 0.4050293, + "step": 3600, + "time_per_iteration": 2.6388540267944336 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047669, + "balance_loss_mlp": 1.00723314, + "epoch": 0.69276644863409, + "flos": 870914831616.0, + "grad_norm": 0.03151516530710953, + "language_loss": 0.76992857, + "learning_rate": 0.00022777629276618706, + "loss": 0.78040528, + "num_input_tokens_seen": 298777712, + "router_z_loss_mlp": 0.40429688, + "step": 3601, + "time_per_iteration": 3.086951732635498 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048126, + "balance_loss_mlp": 1.00776148, + "epoch": 0.6929588303193536, + "flos": 626918349312.0, + "grad_norm": 0.03515773513290382, + "language_loss": 0.77888995, + "learning_rate": 0.0002275150255017947, + "loss": 0.78937119, + "num_input_tokens_seen": 298854368, + "router_z_loss_mlp": 0.40356445, + "step": 3602, + "time_per_iteration": 2.8207640647888184 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049511, + "balance_loss_mlp": 1.01031494, + "epoch": 0.6931512120046172, + "flos": 1548807568896.0, + "grad_norm": 0.008023369975758985, + "language_loss": 0.75732672, + "learning_rate": 0.0002272538640254511, + "loss": 0.76782179, + "num_input_tokens_seen": 299091664, + "router_z_loss_mlp": 0.39160156, + "step": 3603, + "time_per_iteration": 5.031715631484985 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046665, + "balance_loss_mlp": 1.00765991, + "epoch": 0.6933435936898807, + "flos": 1451326405632.0, + "grad_norm": 0.006067349506475221, + "language_loss": 0.75127101, + "learning_rate": 0.0002269928084385487, + "loss": 0.7617377, + "num_input_tokens_seen": 299312656, + "router_z_loss_mlp": 0.38964844, + "step": 3604, + "time_per_iteration": 4.695592641830444 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045297, + "balance_loss_mlp": 1.00505233, + "epoch": 0.6935359753751443, + "flos": 541931203584.0, + "grad_norm": 0.03296159343177749, + "language_loss": 0.85254478, + "learning_rate": 0.0002267318588424379, + "loss": 0.86299777, + "num_input_tokens_seen": 299381136, + "router_z_loss_mlp": 0.40234375, + "step": 3605, + "time_per_iteration": 2.62107253074646 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043618, + "balance_loss_mlp": 1.00330186, + "epoch": 0.6937283570604078, + "flos": 720691067136.0, + "grad_norm": 0.03433808415235627, + "language_loss": 0.87899154, + "learning_rate": 0.00022647101533842845, + "loss": 0.88942766, + "num_input_tokens_seen": 299455216, + "router_z_loss_mlp": 0.40307617, + "step": 3606, + "time_per_iteration": 2.9330387115478516 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043345, + "balance_loss_mlp": 1.00302887, + "epoch": 0.6939207387456714, + "flos": 523194545664.0, + "grad_norm": 0.042523396404585766, + "language_loss": 0.76967436, + "learning_rate": 0.00022621027802778872, + "loss": 0.7801078, + "num_input_tokens_seen": 299524352, + "router_z_loss_mlp": 0.40307617, + "step": 3607, + "time_per_iteration": 2.6252737045288086 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044527, + "balance_loss_mlp": 1.00430596, + "epoch": 0.694113120430935, + "flos": 536402066688.0, + "grad_norm": 0.03600646931475283, + "language_loss": 0.7913326, + "learning_rate": 0.00022594964701174586, + "loss": 0.80177784, + "num_input_tokens_seen": 299594960, + "router_z_loss_mlp": 0.40209961, + "step": 3608, + "time_per_iteration": 2.674360513687134 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044111, + "balance_loss_mlp": 1.00391352, + "epoch": 0.6943055021161986, + "flos": 524395052544.0, + "grad_norm": 0.03489608183841533, + "language_loss": 0.85372239, + "learning_rate": 0.00022568912239148586, + "loss": 0.86416358, + "num_input_tokens_seen": 299662560, + "router_z_loss_mlp": 0.40185547, + "step": 3609, + "time_per_iteration": 2.610682964324951 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043745, + "balance_loss_mlp": 1.0034523, + "epoch": 0.694497883801462, + "flos": 485971051008.0, + "grad_norm": 0.03140889244124769, + "language_loss": 0.81940842, + "learning_rate": 0.00022542870426815344, + "loss": 0.82984591, + "num_input_tokens_seen": 299734896, + "router_z_loss_mlp": 0.40283203, + "step": 3610, + "time_per_iteration": 2.7095394134521484 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043278, + "balance_loss_mlp": 1.00303328, + "epoch": 0.6946902654867256, + "flos": 462425562624.0, + "grad_norm": 0.03725802111731568, + "language_loss": 0.86767513, + "learning_rate": 0.00022516839274285173, + "loss": 0.87810791, + "num_input_tokens_seen": 299799424, + "router_z_loss_mlp": 0.40234375, + "step": 3611, + "time_per_iteration": 2.5144243240356445 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042525, + "balance_loss_mlp": 1.00223184, + "epoch": 0.6948826471719892, + "flos": 513868502016.0, + "grad_norm": 0.03700002274884872, + "language_loss": 0.75493568, + "learning_rate": 0.00022490818791664265, + "loss": 0.76536095, + "num_input_tokens_seen": 299868272, + "router_z_loss_mlp": 0.40283203, + "step": 3612, + "time_per_iteration": 2.5950610637664795 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044496, + "balance_loss_mlp": 1.00429881, + "epoch": 0.6950750288572528, + "flos": 558256154112.0, + "grad_norm": 0.03078051424242557, + "language_loss": 0.86039829, + "learning_rate": 0.00022464808989054676, + "loss": 0.87084323, + "num_input_tokens_seen": 299939136, + "router_z_loss_mlp": 0.40185547, + "step": 3613, + "time_per_iteration": 2.6489851474761963 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104585, + "balance_loss_mlp": 1.00567627, + "epoch": 0.6952674105425164, + "flos": 543522482688.0, + "grad_norm": 0.037582150054456365, + "language_loss": 0.76400638, + "learning_rate": 0.00022438809876554284, + "loss": 0.77446485, + "num_input_tokens_seen": 300009472, + "router_z_loss_mlp": 0.40161133, + "step": 3614, + "time_per_iteration": 2.666472911834717 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046875, + "balance_loss_mlp": 1.00672579, + "epoch": 0.6954597922277799, + "flos": 547857915648.0, + "grad_norm": 0.03577219625118018, + "language_loss": 0.81085944, + "learning_rate": 0.00022412821464256873, + "loss": 0.82132822, + "num_input_tokens_seen": 300081008, + "router_z_loss_mlp": 0.40136719, + "step": 3615, + "time_per_iteration": 2.6799051761627197 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010449, + "balance_loss_mlp": 1.00479829, + "epoch": 0.6956521739130435, + "flos": 520541765376.0, + "grad_norm": 0.03709092288517812, + "language_loss": 0.82944018, + "learning_rate": 0.00022386843762252023, + "loss": 0.83988917, + "num_input_tokens_seen": 300149856, + "router_z_loss_mlp": 0.40087891, + "step": 3616, + "time_per_iteration": 2.600679397583008 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045093, + "balance_loss_mlp": 1.00496709, + "epoch": 0.695844555598307, + "flos": 467264528640.0, + "grad_norm": 0.03687910314272662, + "language_loss": 0.8069849, + "learning_rate": 0.00022360876780625193, + "loss": 0.81743586, + "num_input_tokens_seen": 300217344, + "router_z_loss_mlp": 0.40112305, + "step": 3617, + "time_per_iteration": 2.5893161296844482 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044723, + "balance_loss_mlp": 1.00462067, + "epoch": 0.6960369372835706, + "flos": 601932248064.0, + "grad_norm": 0.02883770808166936, + "language_loss": 0.80609798, + "learning_rate": 0.00022334920529457604, + "loss": 0.81654525, + "num_input_tokens_seen": 300305584, + "router_z_loss_mlp": 0.40087891, + "step": 3618, + "time_per_iteration": 2.8958587646484375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045417, + "balance_loss_mlp": 1.00538695, + "epoch": 0.6962293189688342, + "flos": 645466424064.0, + "grad_norm": 0.029378827731847603, + "language_loss": 0.88201439, + "learning_rate": 0.00022308975018826423, + "loss": 0.89246857, + "num_input_tokens_seen": 300386480, + "router_z_loss_mlp": 0.40014648, + "step": 3619, + "time_per_iteration": 2.849514961242676 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104494, + "balance_loss_mlp": 1.00476694, + "epoch": 0.6964217006540977, + "flos": 639958674432.0, + "grad_norm": 0.03836514772463411, + "language_loss": 0.84951282, + "learning_rate": 0.00022283040258804564, + "loss": 0.85996217, + "num_input_tokens_seen": 300461840, + "router_z_loss_mlp": 0.40161133, + "step": 3620, + "time_per_iteration": 2.755397319793701 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042358, + "balance_loss_mlp": 1.00220859, + "epoch": 0.6966140823393613, + "flos": 653387826432.0, + "grad_norm": 0.036503412775040926, + "language_loss": 0.84546065, + "learning_rate": 0.00022257116259460802, + "loss": 0.85588425, + "num_input_tokens_seen": 300540400, + "router_z_loss_mlp": 0.40136719, + "step": 3621, + "time_per_iteration": 2.8644983768463135 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043603, + "balance_loss_mlp": 1.00342941, + "epoch": 0.6968064640246249, + "flos": 705825192960.0, + "grad_norm": 0.030665085995137797, + "language_loss": 0.81856084, + "learning_rate": 0.00022231203030859725, + "loss": 0.82899684, + "num_input_tokens_seen": 300624240, + "router_z_loss_mlp": 0.40161133, + "step": 3622, + "time_per_iteration": 3.017775297164917 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048188, + "balance_loss_mlp": 1.00803852, + "epoch": 0.6969988457098885, + "flos": 493531816704.0, + "grad_norm": 0.04078314735210094, + "language_loss": 0.8408944, + "learning_rate": 0.00022205300583061737, + "loss": 0.85137624, + "num_input_tokens_seen": 300689728, + "router_z_loss_mlp": 0.40136719, + "step": 3623, + "time_per_iteration": 2.5776522159576416 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045162, + "balance_loss_mlp": 1.00615692, + "epoch": 0.6971912273951519, + "flos": 1355615377920.0, + "grad_norm": 0.00769674903149883, + "language_loss": 0.82838202, + "learning_rate": 0.00022179408926123063, + "loss": 0.83883369, + "num_input_tokens_seen": 300913152, + "router_z_loss_mlp": 0.38964844, + "step": 3624, + "time_per_iteration": 4.895683288574219 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046564, + "balance_loss_mlp": 1.00653315, + "epoch": 0.6973836090804155, + "flos": 603575049984.0, + "grad_norm": 0.03550964133883238, + "language_loss": 0.77939522, + "learning_rate": 0.00022153528070095735, + "loss": 0.7898609, + "num_input_tokens_seen": 300985824, + "router_z_loss_mlp": 0.40014648, + "step": 3625, + "time_per_iteration": 2.73093581199646 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046221, + "balance_loss_mlp": 1.00628626, + "epoch": 0.6975759907656791, + "flos": 525111468288.0, + "grad_norm": 0.03728439171184861, + "language_loss": 0.88488603, + "learning_rate": 0.00022127658025027568, + "loss": 0.89534825, + "num_input_tokens_seen": 301058048, + "router_z_loss_mlp": 0.39916992, + "step": 3626, + "time_per_iteration": 2.645886182785034 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047821, + "balance_loss_mlp": 1.00771928, + "epoch": 0.6977683724509427, + "flos": 481878636288.0, + "grad_norm": 0.032998889272974, + "language_loss": 0.85482383, + "learning_rate": 0.00022101798800962258, + "loss": 0.86530197, + "num_input_tokens_seen": 301127472, + "router_z_loss_mlp": 0.40087891, + "step": 3627, + "time_per_iteration": 2.6026127338409424 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048765, + "balance_loss_mlp": 1.00875843, + "epoch": 0.6979607541362063, + "flos": 523641698304.0, + "grad_norm": 0.041862603089362516, + "language_loss": 0.79471421, + "learning_rate": 0.00022075950407939227, + "loss": 0.80520177, + "num_input_tokens_seen": 301193920, + "router_z_loss_mlp": 0.39990234, + "step": 3628, + "time_per_iteration": 2.61621356010437 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045103, + "balance_loss_mlp": 1.00514364, + "epoch": 0.6981531358214698, + "flos": 549116748288.0, + "grad_norm": 0.03728815941445965, + "language_loss": 0.83285969, + "learning_rate": 0.0002205011285599367, + "loss": 0.84331071, + "num_input_tokens_seen": 301264256, + "router_z_loss_mlp": 0.39941406, + "step": 3629, + "time_per_iteration": 2.6081953048706055 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01041921, + "balance_loss_mlp": 1.00200999, + "epoch": 0.6983455175067333, + "flos": 701276877312.0, + "grad_norm": 0.05573052179945255, + "language_loss": 0.80735791, + "learning_rate": 0.00022024286155156658, + "loss": 0.81777716, + "num_input_tokens_seen": 301337696, + "router_z_loss_mlp": 0.39892578, + "step": 3630, + "time_per_iteration": 2.828234910964966 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01041029, + "balance_loss_mlp": 1.00106966, + "epoch": 0.6985378991919969, + "flos": 486120750336.0, + "grad_norm": 0.034934255505656486, + "language_loss": 0.86530191, + "learning_rate": 0.00021998470315454994, + "loss": 0.87571216, + "num_input_tokens_seen": 301407776, + "router_z_loss_mlp": 0.39941406, + "step": 3631, + "time_per_iteration": 2.689331293106079 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01040234, + "balance_loss_mlp": 1.00034702, + "epoch": 0.6987302808772605, + "flos": 559893120000.0, + "grad_norm": 0.03380510665243889, + "language_loss": 0.86876583, + "learning_rate": 0.00021972665346911275, + "loss": 0.87916821, + "num_input_tokens_seen": 301475120, + "router_z_loss_mlp": 0.39868164, + "step": 3632, + "time_per_iteration": 2.689023017883301 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01040303, + "balance_loss_mlp": 1.00032043, + "epoch": 0.698922662562524, + "flos": 484568355072.0, + "grad_norm": 0.03644538242957212, + "language_loss": 0.80445158, + "learning_rate": 0.00021946871259543877, + "loss": 0.81485462, + "num_input_tokens_seen": 301542416, + "router_z_loss_mlp": 0.3996582, + "step": 3633, + "time_per_iteration": 2.584099292755127 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01040446, + "balance_loss_mlp": 1.00048685, + "epoch": 0.6991150442477876, + "flos": 720206976000.0, + "grad_norm": 0.03286124329654603, + "language_loss": 0.83436686, + "learning_rate": 0.00021921088063366957, + "loss": 0.84477133, + "num_input_tokens_seen": 301620672, + "router_z_loss_mlp": 0.39941406, + "step": 3634, + "time_per_iteration": 2.9156620502471924 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043215, + "balance_loss_mlp": 1.00328004, + "epoch": 0.6993074259330512, + "flos": 490160675328.0, + "grad_norm": 0.03268452893811677, + "language_loss": 0.82517856, + "learning_rate": 0.00021895315768390435, + "loss": 0.83561075, + "num_input_tokens_seen": 301688016, + "router_z_loss_mlp": 0.39916992, + "step": 3635, + "time_per_iteration": 2.5866551399230957 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051152, + "balance_loss_mlp": 1.01126432, + "epoch": 0.6994998076183148, + "flos": 719469172992.0, + "grad_norm": 0.02932000302360117, + "language_loss": 0.88269186, + "learning_rate": 0.00021869554384619999, + "loss": 0.89320338, + "num_input_tokens_seen": 301771184, + "router_z_loss_mlp": 0.39868164, + "step": 3636, + "time_per_iteration": 2.971536159515381 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050583, + "balance_loss_mlp": 1.01057589, + "epoch": 0.6996921893035783, + "flos": 580164676608.0, + "grad_norm": 0.03639524799705141, + "language_loss": 0.81240088, + "learning_rate": 0.00021843803922057115, + "loss": 0.82290673, + "num_input_tokens_seen": 301844528, + "router_z_loss_mlp": 0.39990234, + "step": 3637, + "time_per_iteration": 2.725170612335205 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050538, + "balance_loss_mlp": 1.01060295, + "epoch": 0.6998845709888418, + "flos": 519675650304.0, + "grad_norm": 0.03468807829141317, + "language_loss": 0.82837808, + "learning_rate": 0.00021818064390698977, + "loss": 0.83888352, + "num_input_tokens_seen": 301914960, + "router_z_loss_mlp": 0.39916992, + "step": 3638, + "time_per_iteration": 2.633237838745117 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043144, + "balance_loss_mlp": 1.00311363, + "epoch": 0.7000769526741054, + "flos": 622096879872.0, + "grad_norm": 0.03453806338856074, + "language_loss": 0.87273943, + "learning_rate": 0.0002179233580053861, + "loss": 0.8831709, + "num_input_tokens_seen": 301986352, + "router_z_loss_mlp": 0.40014648, + "step": 3639, + "time_per_iteration": 2.7544472217559814 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043096, + "balance_loss_mlp": 1.0029943, + "epoch": 0.700269334359369, + "flos": 561056688384.0, + "grad_norm": 0.033530662596956085, + "language_loss": 0.85948008, + "learning_rate": 0.00021766618161564688, + "loss": 0.86991107, + "num_input_tokens_seen": 302060544, + "router_z_loss_mlp": 0.40087891, + "step": 3640, + "time_per_iteration": 2.7110095024108887 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01041375, + "balance_loss_mlp": 1.00132048, + "epoch": 0.7004617160446326, + "flos": 484362275328.0, + "grad_norm": 0.03557696097422109, + "language_loss": 0.87556666, + "learning_rate": 0.00021740911483761677, + "loss": 0.88598037, + "num_input_tokens_seen": 302127232, + "router_z_loss_mlp": 0.40039062, + "step": 3641, + "time_per_iteration": 2.5866122245788574 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042265, + "balance_loss_mlp": 1.00230658, + "epoch": 0.7006540977298961, + "flos": 698322753024.0, + "grad_norm": 0.029252813269696705, + "language_loss": 0.92278117, + "learning_rate": 0.00021715215777109837, + "loss": 0.93320382, + "num_input_tokens_seen": 302207056, + "router_z_loss_mlp": 0.39941406, + "step": 3642, + "time_per_iteration": 2.9658164978027344 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055988, + "balance_loss_mlp": 1.01600468, + "epoch": 0.7008464794151597, + "flos": 505771155456.0, + "grad_norm": 0.0370639666427534, + "language_loss": 0.84983593, + "learning_rate": 0.00021689531051585103, + "loss": 0.86039579, + "num_input_tokens_seen": 302275632, + "router_z_loss_mlp": 0.3996582, + "step": 3643, + "time_per_iteration": 2.605422019958496 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055864, + "balance_loss_mlp": 1.01583362, + "epoch": 0.7010388611004232, + "flos": 538273302528.0, + "grad_norm": 0.03585337078400258, + "language_loss": 0.8111937, + "learning_rate": 0.00021663857317159196, + "loss": 0.82175231, + "num_input_tokens_seen": 302343600, + "router_z_loss_mlp": 0.40014648, + "step": 3644, + "time_per_iteration": 2.601376533508301 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053049, + "balance_loss_mlp": 1.01304281, + "epoch": 0.7012312427856868, + "flos": 548315761920.0, + "grad_norm": 0.03435070912909032, + "language_loss": 0.82316148, + "learning_rate": 0.00021638194583799487, + "loss": 0.83369195, + "num_input_tokens_seen": 302414656, + "router_z_loss_mlp": 0.39990234, + "step": 3645, + "time_per_iteration": 2.686854839324951 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046541, + "balance_loss_mlp": 1.00636709, + "epoch": 0.7014236244709504, + "flos": 942974413056.0, + "grad_norm": 0.03710405842133189, + "language_loss": 0.83184248, + "learning_rate": 0.00021612542861469176, + "loss": 0.84230787, + "num_input_tokens_seen": 302495120, + "router_z_loss_mlp": 0.40161133, + "step": 3646, + "time_per_iteration": 3.2522597312927246 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01056268, + "balance_loss_mlp": 1.01623774, + "epoch": 0.7016160061562139, + "flos": 526209907968.0, + "grad_norm": 0.03458129081843451, + "language_loss": 0.82967472, + "learning_rate": 0.00021586902160127135, + "loss": 0.84023744, + "num_input_tokens_seen": 302563024, + "router_z_loss_mlp": 0.40014648, + "step": 3647, + "time_per_iteration": 2.592898368835449 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054208, + "balance_loss_mlp": 1.01410604, + "epoch": 0.7018083878414775, + "flos": 374245165056.0, + "grad_norm": 0.045887676858618894, + "language_loss": 0.74931926, + "learning_rate": 0.00021561272489727974, + "loss": 0.75986135, + "num_input_tokens_seen": 302624544, + "router_z_loss_mlp": 0.40087891, + "step": 3648, + "time_per_iteration": 2.46085786819458 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047959, + "balance_loss_mlp": 1.00785732, + "epoch": 0.7020007695267411, + "flos": 528834498048.0, + "grad_norm": 0.03554324535987718, + "language_loss": 0.81039417, + "learning_rate": 0.0002153565386022199, + "loss": 0.82087374, + "num_input_tokens_seen": 302697856, + "router_z_loss_mlp": 0.40087891, + "step": 3649, + "time_per_iteration": 2.695328712463379 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104917, + "balance_loss_mlp": 1.00897217, + "epoch": 0.7021931512120047, + "flos": 691373423616.0, + "grad_norm": 0.035617603587249046, + "language_loss": 0.82844687, + "learning_rate": 0.00021510046281555262, + "loss": 0.83893853, + "num_input_tokens_seen": 302771984, + "router_z_loss_mlp": 0.40185547, + "step": 3650, + "time_per_iteration": 2.8195676803588867 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047858, + "balance_loss_mlp": 1.00761259, + "epoch": 0.7023855328972681, + "flos": 640926856704.0, + "grad_norm": 0.042051655567710275, + "language_loss": 0.82163751, + "learning_rate": 0.0002148444976366949, + "loss": 0.83211613, + "num_input_tokens_seen": 302838832, + "router_z_loss_mlp": 0.40234375, + "step": 3651, + "time_per_iteration": 2.7910640239715576 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049797, + "balance_loss_mlp": 1.00969481, + "epoch": 0.7025779145825317, + "flos": 562007374080.0, + "grad_norm": 0.03669409965522196, + "language_loss": 0.8294403, + "learning_rate": 0.00021458864316502136, + "loss": 0.83993822, + "num_input_tokens_seen": 302909952, + "router_z_loss_mlp": 0.40087891, + "step": 3652, + "time_per_iteration": 2.7377076148986816 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050304, + "balance_loss_mlp": 1.01020181, + "epoch": 0.7027702962677953, + "flos": 448371368448.0, + "grad_norm": 0.037398832167444995, + "language_loss": 0.87441307, + "learning_rate": 0.0002143328994998634, + "loss": 0.88491613, + "num_input_tokens_seen": 302973056, + "router_z_loss_mlp": 0.40087891, + "step": 3653, + "time_per_iteration": 2.510070323944092 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048411, + "balance_loss_mlp": 1.00833249, + "epoch": 0.7029626779530589, + "flos": 623714403840.0, + "grad_norm": 0.0361167635185571, + "language_loss": 0.78985465, + "learning_rate": 0.00021407726674050982, + "loss": 0.80033875, + "num_input_tokens_seen": 303054656, + "router_z_loss_mlp": 0.40063477, + "step": 3654, + "time_per_iteration": 2.8577005863189697 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049092, + "balance_loss_mlp": 1.00903809, + "epoch": 0.7031550596383225, + "flos": 630734697984.0, + "grad_norm": 0.031984411751134825, + "language_loss": 0.87403131, + "learning_rate": 0.0002138217449862061, + "loss": 0.88452226, + "num_input_tokens_seen": 303124256, + "router_z_loss_mlp": 0.40039062, + "step": 3655, + "time_per_iteration": 2.731257915496826 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051587, + "balance_loss_mlp": 1.01160455, + "epoch": 0.703347441323586, + "flos": 531860553984.0, + "grad_norm": 0.032014026327257146, + "language_loss": 0.7905367, + "learning_rate": 0.00021356633433615403, + "loss": 0.80105257, + "num_input_tokens_seen": 303192720, + "router_z_loss_mlp": 0.3996582, + "step": 3656, + "time_per_iteration": 2.6462786197662354 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051956, + "balance_loss_mlp": 1.01192546, + "epoch": 0.7035398230088495, + "flos": 694916618496.0, + "grad_norm": 0.025544718758457735, + "language_loss": 0.83906752, + "learning_rate": 0.0002133110348895133, + "loss": 0.84958708, + "num_input_tokens_seen": 303275968, + "router_z_loss_mlp": 0.40014648, + "step": 3657, + "time_per_iteration": 2.968036413192749 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051549, + "balance_loss_mlp": 1.01158977, + "epoch": 0.7037322046941131, + "flos": 969667466496.0, + "grad_norm": 0.030163391429171182, + "language_loss": 0.85463339, + "learning_rate": 0.0002130558467453999, + "loss": 0.8651489, + "num_input_tokens_seen": 303367296, + "router_z_loss_mlp": 0.39941406, + "step": 3658, + "time_per_iteration": 3.3951528072357178 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047875, + "balance_loss_mlp": 1.00789237, + "epoch": 0.7039245863793767, + "flos": 503926164480.0, + "grad_norm": 0.029582354045105844, + "language_loss": 0.84755009, + "learning_rate": 0.0002128007700028865, + "loss": 0.85802877, + "num_input_tokens_seen": 303442768, + "router_z_loss_mlp": 0.3996582, + "step": 3659, + "time_per_iteration": 2.754249334335327 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044711, + "balance_loss_mlp": 1.00460947, + "epoch": 0.7041169680646402, + "flos": 466938885120.0, + "grad_norm": 0.03694565934757681, + "language_loss": 0.8474158, + "learning_rate": 0.00021254580476100276, + "loss": 0.85786295, + "num_input_tokens_seen": 303508304, + "router_z_loss_mlp": 0.40087891, + "step": 3660, + "time_per_iteration": 2.576219081878662 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043606, + "balance_loss_mlp": 1.00359917, + "epoch": 0.7043093497499038, + "flos": 633322349568.0, + "grad_norm": 0.037641747763634714, + "language_loss": 0.79470807, + "learning_rate": 0.00021229095111873497, + "loss": 0.80514407, + "num_input_tokens_seen": 303579312, + "router_z_loss_mlp": 0.39990234, + "step": 3661, + "time_per_iteration": 2.7775161266326904 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043368, + "balance_loss_mlp": 1.00333726, + "epoch": 0.7045017314351674, + "flos": 544096002048.0, + "grad_norm": 0.03023690962448049, + "language_loss": 0.86693418, + "learning_rate": 0.0002120362091750261, + "loss": 0.87736779, + "num_input_tokens_seen": 303658384, + "router_z_loss_mlp": 0.40014648, + "step": 3662, + "time_per_iteration": 2.815168857574463 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042656, + "balance_loss_mlp": 1.00267351, + "epoch": 0.704694113120431, + "flos": 429141871104.0, + "grad_norm": 0.036907150984541, + "language_loss": 0.87510955, + "learning_rate": 0.00021178157902877566, + "loss": 0.88553607, + "num_input_tokens_seen": 303721136, + "router_z_loss_mlp": 0.3996582, + "step": 3663, + "time_per_iteration": 2.458578109741211 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104365, + "balance_loss_mlp": 1.00373864, + "epoch": 0.7048864948056945, + "flos": 651713922048.0, + "grad_norm": 0.04106624653226338, + "language_loss": 0.87760627, + "learning_rate": 0.0002115270607788397, + "loss": 0.88804281, + "num_input_tokens_seen": 303792368, + "router_z_loss_mlp": 0.39892578, + "step": 3664, + "time_per_iteration": 2.756804943084717 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044342, + "balance_loss_mlp": 1.00445461, + "epoch": 0.705078876490958, + "flos": 413494452480.0, + "grad_norm": 0.03442797785772838, + "language_loss": 0.86509478, + "learning_rate": 0.00021127265452403133, + "loss": 0.87553817, + "num_input_tokens_seen": 303856336, + "router_z_loss_mlp": 0.39868164, + "step": 3665, + "time_per_iteration": 2.534076690673828 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043503, + "balance_loss_mlp": 1.0045929, + "epoch": 0.7052712581762216, + "flos": 1423150943232.0, + "grad_norm": 0.008458198264264957, + "language_loss": 0.84091628, + "learning_rate": 0.0002110183603631199, + "loss": 0.85135132, + "num_input_tokens_seen": 304089856, + "router_z_loss_mlp": 0.38867188, + "step": 3666, + "time_per_iteration": 4.859800815582275 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042605, + "balance_loss_mlp": 1.00266969, + "epoch": 0.7054636398614852, + "flos": 494070342912.0, + "grad_norm": 0.037128971718994215, + "language_loss": 0.833794, + "learning_rate": 0.00021076417839483065, + "loss": 0.84422016, + "num_input_tokens_seen": 304164752, + "router_z_loss_mlp": 0.39916992, + "step": 3667, + "time_per_iteration": 2.7798430919647217 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042765, + "balance_loss_mlp": 1.00283003, + "epoch": 0.7056560215467488, + "flos": 451377982464.0, + "grad_norm": 0.031014936324499143, + "language_loss": 0.85416818, + "learning_rate": 0.00021051010871784589, + "loss": 0.86459577, + "num_input_tokens_seen": 304229568, + "router_z_loss_mlp": 0.39916992, + "step": 3668, + "time_per_iteration": 2.560455560684204 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043031, + "balance_loss_mlp": 1.00304842, + "epoch": 0.7058484032320124, + "flos": 566818149888.0, + "grad_norm": 0.030353159640158514, + "language_loss": 0.79448986, + "learning_rate": 0.0002102561514308045, + "loss": 0.8049202, + "num_input_tokens_seen": 304299408, + "router_z_loss_mlp": 0.3996582, + "step": 3669, + "time_per_iteration": 2.7246358394622803 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042421, + "balance_loss_mlp": 1.00234294, + "epoch": 0.7060407849172758, + "flos": 568103227392.0, + "grad_norm": 0.03405380367536788, + "language_loss": 0.82700998, + "learning_rate": 0.00021000230663230135, + "loss": 0.83743417, + "num_input_tokens_seen": 304367936, + "router_z_loss_mlp": 0.40063477, + "step": 3670, + "time_per_iteration": 2.6809375286102295 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104297, + "balance_loss_mlp": 1.00293946, + "epoch": 0.7062331666025394, + "flos": 469713174528.0, + "grad_norm": 0.035705889445470915, + "language_loss": 0.83772206, + "learning_rate": 0.00020974857442088762, + "loss": 0.8481518, + "num_input_tokens_seen": 304438368, + "router_z_loss_mlp": 0.40014648, + "step": 3671, + "time_per_iteration": 2.6487808227539062 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043458, + "balance_loss_mlp": 1.00330818, + "epoch": 0.706425548287803, + "flos": 596417695488.0, + "grad_norm": 0.03583731061026118, + "language_loss": 0.89143217, + "learning_rate": 0.00020949495489507104, + "loss": 0.90186673, + "num_input_tokens_seen": 304508720, + "router_z_loss_mlp": 0.40136719, + "step": 3672, + "time_per_iteration": 2.704887628555298 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043397, + "balance_loss_mlp": 1.00331914, + "epoch": 0.7066179299730666, + "flos": 476814148608.0, + "grad_norm": 0.034102097435369114, + "language_loss": 0.84997833, + "learning_rate": 0.00020924144815331525, + "loss": 0.86041224, + "num_input_tokens_seen": 304576128, + "router_z_loss_mlp": 0.40063477, + "step": 3673, + "time_per_iteration": 2.5945112705230713 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042664, + "balance_loss_mlp": 1.0026809, + "epoch": 0.7068103116583301, + "flos": 507436311552.0, + "grad_norm": 0.033684521411270194, + "language_loss": 0.83985698, + "learning_rate": 0.00020898805429404044, + "loss": 0.85028362, + "num_input_tokens_seen": 304642416, + "router_z_loss_mlp": 0.3996582, + "step": 3674, + "time_per_iteration": 2.5818920135498047 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042668, + "balance_loss_mlp": 1.00266123, + "epoch": 0.7070026933435937, + "flos": 680575664640.0, + "grad_norm": 0.03512873001655734, + "language_loss": 0.78734016, + "learning_rate": 0.0002087347734156228, + "loss": 0.7977668, + "num_input_tokens_seen": 304719312, + "router_z_loss_mlp": 0.39990234, + "step": 3675, + "time_per_iteration": 2.8316643238067627 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044552, + "balance_loss_mlp": 1.00447345, + "epoch": 0.7071950750288573, + "flos": 473166941184.0, + "grad_norm": 0.03289895415072129, + "language_loss": 0.79907787, + "learning_rate": 0.00020848160561639452, + "loss": 0.8095234, + "num_input_tokens_seen": 304789296, + "router_z_loss_mlp": 0.40063477, + "step": 3676, + "time_per_iteration": 2.662691354751587 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043714, + "balance_loss_mlp": 1.00358856, + "epoch": 0.7073874567141208, + "flos": 474684343296.0, + "grad_norm": 0.031178438211795275, + "language_loss": 0.86372793, + "learning_rate": 0.0002082285509946445, + "loss": 0.87416512, + "num_input_tokens_seen": 304854320, + "router_z_loss_mlp": 0.40112305, + "step": 3677, + "time_per_iteration": 2.54286789894104 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043907, + "balance_loss_mlp": 1.0038054, + "epoch": 0.7075798383993844, + "flos": 547037487360.0, + "grad_norm": 0.033007214142821914, + "language_loss": 0.83766264, + "learning_rate": 0.00020797560964861683, + "loss": 0.84810174, + "num_input_tokens_seen": 304932784, + "router_z_loss_mlp": 0.40087891, + "step": 3678, + "time_per_iteration": 2.7636282444000244 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043429, + "balance_loss_mlp": 1.00335097, + "epoch": 0.7077722200846479, + "flos": 663391401984.0, + "grad_norm": 0.033779282823635445, + "language_loss": 0.81209165, + "learning_rate": 0.0002077227816765122, + "loss": 0.82252598, + "num_input_tokens_seen": 305018080, + "router_z_loss_mlp": 0.40063477, + "step": 3679, + "time_per_iteration": 3.0056393146514893 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047443, + "balance_loss_mlp": 1.00824738, + "epoch": 0.7079646017699115, + "flos": 1533303046656.0, + "grad_norm": 0.005266739458106997, + "language_loss": 0.76447725, + "learning_rate": 0.0002074700671764869, + "loss": 0.7749517, + "num_input_tokens_seen": 305241216, + "router_z_loss_mlp": 0.39160156, + "step": 3680, + "time_per_iteration": 4.76727819442749 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104493, + "balance_loss_mlp": 1.00492287, + "epoch": 0.7081569834551751, + "flos": 622646099712.0, + "grad_norm": 0.03129589389619307, + "language_loss": 0.78969026, + "learning_rate": 0.00020721746624665383, + "loss": 0.80013955, + "num_input_tokens_seen": 305311376, + "router_z_loss_mlp": 0.39990234, + "step": 3681, + "time_per_iteration": 2.72866153717041 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044317, + "balance_loss_mlp": 1.00419092, + "epoch": 0.7083493651404387, + "flos": 796035273984.0, + "grad_norm": 0.031303476473040825, + "language_loss": 0.80593359, + "learning_rate": 0.00020696497898508114, + "loss": 0.81637675, + "num_input_tokens_seen": 305392736, + "router_z_loss_mlp": 0.40112305, + "step": 3682, + "time_per_iteration": 3.041132926940918 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044378, + "balance_loss_mlp": 1.00425231, + "epoch": 0.7085417468257021, + "flos": 815162704128.0, + "grad_norm": 0.03799512363441117, + "language_loss": 0.78282857, + "learning_rate": 0.00020671260548979316, + "loss": 0.79327232, + "num_input_tokens_seen": 305470896, + "router_z_loss_mlp": 0.40112305, + "step": 3683, + "time_per_iteration": 2.980470895767212 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046933, + "balance_loss_mlp": 1.00675917, + "epoch": 0.7087341285109657, + "flos": 701797906944.0, + "grad_norm": 0.03765603647775186, + "language_loss": 0.85959506, + "learning_rate": 0.00020646034585876982, + "loss": 0.87006438, + "num_input_tokens_seen": 305547072, + "router_z_loss_mlp": 0.40161133, + "step": 3684, + "time_per_iteration": 2.83225417137146 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043586, + "balance_loss_mlp": 1.00350797, + "epoch": 0.7089265101962293, + "flos": 597735820800.0, + "grad_norm": 0.030001144776417084, + "language_loss": 0.8503226, + "learning_rate": 0.00020620820018994718, + "loss": 0.86075842, + "num_input_tokens_seen": 305624512, + "router_z_loss_mlp": 0.40063477, + "step": 3685, + "time_per_iteration": 2.808814287185669 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043428, + "balance_loss_mlp": 1.00334978, + "epoch": 0.7091188918814929, + "flos": 488167930368.0, + "grad_norm": 0.039691244265052834, + "language_loss": 0.82984829, + "learning_rate": 0.00020595616858121675, + "loss": 0.84028256, + "num_input_tokens_seen": 305695088, + "router_z_loss_mlp": 0.40063477, + "step": 3686, + "time_per_iteration": 2.696423292160034 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104324, + "balance_loss_mlp": 1.00316179, + "epoch": 0.7093112735667565, + "flos": 601256661504.0, + "grad_norm": 0.03416651463344776, + "language_loss": 0.81164849, + "learning_rate": 0.00020570425113042586, + "loss": 0.82208097, + "num_input_tokens_seen": 305763680, + "router_z_loss_mlp": 0.40063477, + "step": 3687, + "time_per_iteration": 2.735722303390503 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042849, + "balance_loss_mlp": 1.00281906, + "epoch": 0.70950365525202, + "flos": 506850153216.0, + "grad_norm": 0.03675476987666338, + "language_loss": 0.86545879, + "learning_rate": 0.0002054524479353776, + "loss": 0.87588727, + "num_input_tokens_seen": 305835008, + "router_z_loss_mlp": 0.40014648, + "step": 3688, + "time_per_iteration": 2.6537790298461914 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042758, + "balance_loss_mlp": 1.0026803, + "epoch": 0.7096960369372836, + "flos": 733425190656.0, + "grad_norm": 0.03699911632186226, + "language_loss": 0.81610233, + "learning_rate": 0.00020520075909383063, + "loss": 0.82652992, + "num_input_tokens_seen": 305909072, + "router_z_loss_mlp": 0.40063477, + "step": 3689, + "time_per_iteration": 2.8920962810516357 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045653, + "balance_loss_mlp": 1.00576556, + "epoch": 0.7098884186225471, + "flos": 973652956416.0, + "grad_norm": 0.0320857463001868, + "language_loss": 0.811288, + "learning_rate": 0.00020494918470349916, + "loss": 0.82174444, + "num_input_tokens_seen": 305994752, + "router_z_loss_mlp": 0.39868164, + "step": 3690, + "time_per_iteration": 3.3136045932769775 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045022, + "balance_loss_mlp": 1.00513482, + "epoch": 0.7100808003078107, + "flos": 505258874112.0, + "grad_norm": 0.03898509483209187, + "language_loss": 0.86111224, + "learning_rate": 0.00020469772486205297, + "loss": 0.87156248, + "num_input_tokens_seen": 306062960, + "router_z_loss_mlp": 0.39868164, + "step": 3691, + "time_per_iteration": 2.6186795234680176 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047063, + "balance_loss_mlp": 1.00715148, + "epoch": 0.7102731819930742, + "flos": 541390732032.0, + "grad_norm": 0.07359850513533242, + "language_loss": 0.81684911, + "learning_rate": 0.0002044463796671177, + "loss": 0.82731974, + "num_input_tokens_seen": 306134224, + "router_z_loss_mlp": 0.39892578, + "step": 3692, + "time_per_iteration": 2.7347307205200195 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047542, + "balance_loss_mlp": 1.00767887, + "epoch": 0.7104655636783378, + "flos": 621628339968.0, + "grad_norm": 0.03494472731168418, + "language_loss": 0.80876124, + "learning_rate": 0.00020419514921627408, + "loss": 0.8192367, + "num_input_tokens_seen": 306214512, + "router_z_loss_mlp": 0.3984375, + "step": 3693, + "time_per_iteration": 2.9353420734405518 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045573, + "balance_loss_mlp": 1.00568545, + "epoch": 0.7106579453636014, + "flos": 558377663232.0, + "grad_norm": 0.034076048259573104, + "language_loss": 0.77580255, + "learning_rate": 0.00020394403360705855, + "loss": 0.78625828, + "num_input_tokens_seen": 306283232, + "router_z_loss_mlp": 0.39868164, + "step": 3694, + "time_per_iteration": 2.7425014972686768 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01041714, + "balance_loss_mlp": 1.00187469, + "epoch": 0.710850327048865, + "flos": 514063888128.0, + "grad_norm": 0.03425732262265505, + "language_loss": 0.88495499, + "learning_rate": 0.00020369303293696228, + "loss": 0.89537215, + "num_input_tokens_seen": 306351536, + "router_z_loss_mlp": 0.39819336, + "step": 3695, + "time_per_iteration": 2.6524975299835205 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01041228, + "balance_loss_mlp": 1.00138831, + "epoch": 0.7110427087341286, + "flos": 424507039488.0, + "grad_norm": 0.03544655381873144, + "language_loss": 0.78715348, + "learning_rate": 0.00020344214730343304, + "loss": 0.79756576, + "num_input_tokens_seen": 306419040, + "router_z_loss_mlp": 0.39819336, + "step": 3696, + "time_per_iteration": 2.5949435234069824 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044579, + "balance_loss_mlp": 1.0046916, + "epoch": 0.711235090419392, + "flos": 578654077440.0, + "grad_norm": 0.028723552959570162, + "language_loss": 0.79433203, + "learning_rate": 0.00020319137680387296, + "loss": 0.80477786, + "num_input_tokens_seen": 306503248, + "router_z_loss_mlp": 0.39868164, + "step": 3697, + "time_per_iteration": 2.9308555126190186 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044656, + "balance_loss_mlp": 1.00476861, + "epoch": 0.7114274721046556, + "flos": 448985716992.0, + "grad_norm": 0.03974363326367457, + "language_loss": 0.81048799, + "learning_rate": 0.0002029407215356398, + "loss": 0.82093453, + "num_input_tokens_seen": 306566288, + "router_z_loss_mlp": 0.39868164, + "step": 3698, + "time_per_iteration": 2.51846981048584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047508, + "balance_loss_mlp": 1.00747764, + "epoch": 0.7116198537899192, + "flos": 623093252352.0, + "grad_norm": 0.03573092214562991, + "language_loss": 0.83794999, + "learning_rate": 0.00020269018159604663, + "loss": 0.84842503, + "num_input_tokens_seen": 306633344, + "router_z_loss_mlp": 0.40014648, + "step": 3699, + "time_per_iteration": 2.7074286937713623 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047537, + "balance_loss_mlp": 1.00760162, + "epoch": 0.7118122354751828, + "flos": 499720988928.0, + "grad_norm": 0.03677211843520988, + "language_loss": 0.82181633, + "learning_rate": 0.00020243975708236162, + "loss": 0.83229172, + "num_input_tokens_seen": 306701328, + "router_z_loss_mlp": 0.39916992, + "step": 3700, + "time_per_iteration": 2.564375877380371 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046517, + "balance_loss_mlp": 1.00660574, + "epoch": 0.7120046171604463, + "flos": 573845246976.0, + "grad_norm": 0.03454353277878698, + "language_loss": 0.86407083, + "learning_rate": 0.00020218944809180818, + "loss": 0.87453598, + "num_input_tokens_seen": 306773168, + "router_z_loss_mlp": 0.39892578, + "step": 3701, + "time_per_iteration": 2.7084884643554688 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046652, + "balance_loss_mlp": 1.00657344, + "epoch": 0.7121969988457099, + "flos": 573771369984.0, + "grad_norm": 0.03303682180607054, + "language_loss": 0.8533892, + "learning_rate": 0.00020193925472156493, + "loss": 0.86385572, + "num_input_tokens_seen": 306845312, + "router_z_loss_mlp": 0.40063477, + "step": 3702, + "time_per_iteration": 2.7079381942749023 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044945, + "balance_loss_mlp": 1.00603485, + "epoch": 0.7123893805309734, + "flos": 1526823224064.0, + "grad_norm": 0.008337798105396301, + "language_loss": 0.74289167, + "learning_rate": 0.00020168917706876537, + "loss": 0.75334108, + "num_input_tokens_seen": 307079216, + "router_z_loss_mlp": 0.38867188, + "step": 3703, + "time_per_iteration": 4.8932740688323975 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01057166, + "balance_loss_mlp": 1.01696837, + "epoch": 0.712581762216237, + "flos": 616414152960.0, + "grad_norm": 0.03156423949245577, + "language_loss": 0.84361899, + "learning_rate": 0.00020143921523049863, + "loss": 0.85419071, + "num_input_tokens_seen": 307163568, + "router_z_loss_mlp": 0.40185547, + "step": 3704, + "time_per_iteration": 2.9233312606811523 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052935, + "balance_loss_mlp": 1.01285696, + "epoch": 0.7127741439015006, + "flos": 598875089664.0, + "grad_norm": 0.03941549169831495, + "language_loss": 0.84401309, + "learning_rate": 0.00020118936930380837, + "loss": 0.85454243, + "num_input_tokens_seen": 307232800, + "router_z_loss_mlp": 0.40063477, + "step": 3705, + "time_per_iteration": 2.7015953063964844 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047968, + "balance_loss_mlp": 1.00774693, + "epoch": 0.7129665255867641, + "flos": 538440498432.0, + "grad_norm": 0.03692779593562928, + "language_loss": 0.81897098, + "learning_rate": 0.0002009396393856932, + "loss": 0.82945073, + "num_input_tokens_seen": 307307216, + "router_z_loss_mlp": 0.40209961, + "step": 3706, + "time_per_iteration": 2.649216890335083 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047729, + "balance_loss_mlp": 1.00746036, + "epoch": 0.7131589072720277, + "flos": 527521230336.0, + "grad_norm": 0.035672100544370096, + "language_loss": 0.82740968, + "learning_rate": 0.00020069002557310673, + "loss": 0.83788699, + "num_input_tokens_seen": 307377472, + "router_z_loss_mlp": 0.40258789, + "step": 3707, + "time_per_iteration": 2.670691967010498 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043206, + "balance_loss_mlp": 1.00281823, + "epoch": 0.7133512889572913, + "flos": 532097736192.0, + "grad_norm": 0.0323096227749812, + "language_loss": 0.77545685, + "learning_rate": 0.00020044052796295807, + "loss": 0.78588891, + "num_input_tokens_seen": 307456880, + "router_z_loss_mlp": 0.40380859, + "step": 3708, + "time_per_iteration": 2.791064500808716 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048487, + "balance_loss_mlp": 1.00821805, + "epoch": 0.7135436706425549, + "flos": 504551206656.0, + "grad_norm": 0.04325770643622515, + "language_loss": 0.82374418, + "learning_rate": 0.00020019114665211063, + "loss": 0.83422899, + "num_input_tokens_seen": 307524784, + "router_z_loss_mlp": 0.40258789, + "step": 3709, + "time_per_iteration": 2.6297860145568848 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046781, + "balance_loss_mlp": 1.00648808, + "epoch": 0.7137360523278183, + "flos": 516968434944.0, + "grad_norm": 0.035345949050593885, + "language_loss": 0.81970435, + "learning_rate": 0.00019994188173738276, + "loss": 0.83017212, + "num_input_tokens_seen": 307591408, + "router_z_loss_mlp": 0.40283203, + "step": 3710, + "time_per_iteration": 2.6330204010009766 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047924, + "balance_loss_mlp": 1.00755966, + "epoch": 0.7139284340130819, + "flos": 511537474560.0, + "grad_norm": 0.03739330083001905, + "language_loss": 0.81062478, + "learning_rate": 0.0001996927333155477, + "loss": 0.82110405, + "num_input_tokens_seen": 307662912, + "router_z_loss_mlp": 0.40356445, + "step": 3711, + "time_per_iteration": 2.74644136428833 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049541, + "balance_loss_mlp": 1.0092001, + "epoch": 0.7141208156983455, + "flos": 891800736768.0, + "grad_norm": 0.03143322017513776, + "language_loss": 0.85805249, + "learning_rate": 0.00019944370148333346, + "loss": 0.86854792, + "num_input_tokens_seen": 307752256, + "router_z_loss_mlp": 0.40332031, + "step": 3712, + "time_per_iteration": 3.1481471061706543 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049102, + "balance_loss_mlp": 1.00871384, + "epoch": 0.7143131973836091, + "flos": 536884212480.0, + "grad_norm": 0.034489718193939395, + "language_loss": 0.80643392, + "learning_rate": 0.00019919478633742278, + "loss": 0.81692493, + "num_input_tokens_seen": 307821504, + "router_z_loss_mlp": 0.40380859, + "step": 3713, + "time_per_iteration": 2.6485395431518555 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048925, + "balance_loss_mlp": 1.00848949, + "epoch": 0.7145055790688727, + "flos": 474627962880.0, + "grad_norm": 0.04039016318386717, + "language_loss": 0.85767764, + "learning_rate": 0.00019894598797445302, + "loss": 0.86816686, + "num_input_tokens_seen": 307886464, + "router_z_loss_mlp": 0.40429688, + "step": 3714, + "time_per_iteration": 2.5401811599731445 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050713, + "balance_loss_mlp": 1.01037288, + "epoch": 0.7146979607541362, + "flos": 571702802688.0, + "grad_norm": 0.03221862991626059, + "language_loss": 0.82471192, + "learning_rate": 0.00019869730649101615, + "loss": 0.83521909, + "num_input_tokens_seen": 307962736, + "router_z_loss_mlp": 0.40332031, + "step": 3715, + "time_per_iteration": 2.75704288482666 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105074, + "balance_loss_mlp": 1.0103991, + "epoch": 0.7148903424393998, + "flos": 841139341824.0, + "grad_norm": 0.03811132383920714, + "language_loss": 0.72900105, + "learning_rate": 0.00019844874198365943, + "loss": 0.73950851, + "num_input_tokens_seen": 308046592, + "router_z_loss_mlp": 0.40332031, + "step": 3716, + "time_per_iteration": 3.115915536880493 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049607, + "balance_loss_mlp": 1.00921834, + "epoch": 0.7150827241246633, + "flos": 542879943936.0, + "grad_norm": 0.037838986549668586, + "language_loss": 0.84377575, + "learning_rate": 0.00019820029454888362, + "loss": 0.85427183, + "num_input_tokens_seen": 308119920, + "router_z_loss_mlp": 0.40380859, + "step": 3717, + "time_per_iteration": 2.7640199661254883 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052212, + "balance_loss_mlp": 1.01282501, + "epoch": 0.7152751058099269, + "flos": 1587190741248.0, + "grad_norm": 0.009155096775058921, + "language_loss": 0.74521267, + "learning_rate": 0.00019795196428314455, + "loss": 0.7557348, + "num_input_tokens_seen": 308361024, + "router_z_loss_mlp": 0.39355469, + "step": 3718, + "time_per_iteration": 5.020099639892578 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043306, + "balance_loss_mlp": 1.00296581, + "epoch": 0.7154674874951905, + "flos": 518429456640.0, + "grad_norm": 0.0370915875215028, + "language_loss": 0.80511153, + "learning_rate": 0.0001977037512828529, + "loss": 0.81554461, + "num_input_tokens_seen": 308429808, + "router_z_loss_mlp": 0.40332031, + "step": 3719, + "time_per_iteration": 2.593027114868164 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043585, + "balance_loss_mlp": 1.00326824, + "epoch": 0.715659869180454, + "flos": 603640178688.0, + "grad_norm": 0.03300286270545162, + "language_loss": 0.86582744, + "learning_rate": 0.0001974556556443734, + "loss": 0.87626332, + "num_input_tokens_seen": 308501888, + "router_z_loss_mlp": 0.40307617, + "step": 3720, + "time_per_iteration": 2.725634813308716 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047164, + "balance_loss_mlp": 1.0068953, + "epoch": 0.7158522508657176, + "flos": 532770410496.0, + "grad_norm": 0.029643200911988788, + "language_loss": 0.89179665, + "learning_rate": 0.00019720767746402547, + "loss": 0.90226829, + "num_input_tokens_seen": 308576368, + "router_z_loss_mlp": 0.40258789, + "step": 3721, + "time_per_iteration": 2.727351188659668 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061105, + "balance_loss_mlp": 1.02069271, + "epoch": 0.7160446325509812, + "flos": 558646926336.0, + "grad_norm": 0.03644218382348141, + "language_loss": 0.80571723, + "learning_rate": 0.00019695981683808222, + "loss": 0.81632823, + "num_input_tokens_seen": 308651936, + "router_z_loss_mlp": 0.40405273, + "step": 3722, + "time_per_iteration": 2.7068886756896973 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01056867, + "balance_loss_mlp": 1.01662219, + "epoch": 0.7162370142362448, + "flos": 692283280128.0, + "grad_norm": 0.03246359808294338, + "language_loss": 0.85348076, + "learning_rate": 0.00019671207386277225, + "loss": 0.86404943, + "num_input_tokens_seen": 308737264, + "router_z_loss_mlp": 0.40234375, + "step": 3723, + "time_per_iteration": 2.9236690998077393 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046407, + "balance_loss_mlp": 1.00611401, + "epoch": 0.7164293959215082, + "flos": 795459809280.0, + "grad_norm": 0.035040971125857495, + "language_loss": 0.78785622, + "learning_rate": 0.0001964644486342777, + "loss": 0.79832029, + "num_input_tokens_seen": 308811776, + "router_z_loss_mlp": 0.40283203, + "step": 3724, + "time_per_iteration": 2.9631621837615967 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045406, + "balance_loss_mlp": 1.00506568, + "epoch": 0.7166217776067718, + "flos": 495205721088.0, + "grad_norm": 0.03180638125163834, + "language_loss": 0.86850977, + "learning_rate": 0.00019621694124873524, + "loss": 0.87896389, + "num_input_tokens_seen": 308886704, + "router_z_loss_mlp": 0.40332031, + "step": 3725, + "time_per_iteration": 2.6598877906799316 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049065, + "balance_loss_mlp": 1.00958252, + "epoch": 0.7168141592920354, + "flos": 1403964220416.0, + "grad_norm": 0.007874165171020433, + "language_loss": 0.76540077, + "learning_rate": 0.00019596955180223557, + "loss": 0.77589142, + "num_input_tokens_seen": 309113456, + "router_z_loss_mlp": 0.39453125, + "step": 3726, + "time_per_iteration": 4.864764451980591 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049401, + "balance_loss_mlp": 1.00922716, + "epoch": 0.717006540977299, + "flos": 794600497152.0, + "grad_norm": 0.03337333426789978, + "language_loss": 0.77893984, + "learning_rate": 0.00019572228039082428, + "loss": 0.78943384, + "num_input_tokens_seen": 309198768, + "router_z_loss_mlp": 0.40161133, + "step": 3727, + "time_per_iteration": 3.107271432876587 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050786, + "balance_loss_mlp": 1.01066017, + "epoch": 0.7171989226625626, + "flos": 555964010496.0, + "grad_norm": 0.028215345270395674, + "language_loss": 0.84187287, + "learning_rate": 0.0001954751271105002, + "loss": 0.85238069, + "num_input_tokens_seen": 309279680, + "router_z_loss_mlp": 0.40112305, + "step": 3728, + "time_per_iteration": 2.8074874877929688 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049671, + "balance_loss_mlp": 1.00940251, + "epoch": 0.717391304347826, + "flos": 557062450176.0, + "grad_norm": 0.03474148956732634, + "language_loss": 0.81498766, + "learning_rate": 0.00019522809205721687, + "loss": 0.8254844, + "num_input_tokens_seen": 309359152, + "router_z_loss_mlp": 0.40258789, + "step": 3729, + "time_per_iteration": 2.736825704574585 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048977, + "balance_loss_mlp": 1.00885069, + "epoch": 0.7175836860330896, + "flos": 539955955200.0, + "grad_norm": 0.033940302209900526, + "language_loss": 0.83540523, + "learning_rate": 0.0001949811753268816, + "loss": 0.84589505, + "num_input_tokens_seen": 309432800, + "router_z_loss_mlp": 0.40112305, + "step": 3730, + "time_per_iteration": 2.732431173324585 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047309, + "balance_loss_mlp": 1.00720656, + "epoch": 0.7177760677183532, + "flos": 516651539712.0, + "grad_norm": 0.04023515024908783, + "language_loss": 0.83238113, + "learning_rate": 0.00019473437701535634, + "loss": 0.8428542, + "num_input_tokens_seen": 309499456, + "router_z_loss_mlp": 0.40087891, + "step": 3731, + "time_per_iteration": 2.608720064163208 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044599, + "balance_loss_mlp": 1.00444937, + "epoch": 0.7179684494036168, + "flos": 675940833024.0, + "grad_norm": 0.03223034722468918, + "language_loss": 0.90125024, + "learning_rate": 0.00019448769721845677, + "loss": 0.9116962, + "num_input_tokens_seen": 309571056, + "router_z_loss_mlp": 0.40136719, + "step": 3732, + "time_per_iteration": 2.8010287284851074 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043647, + "balance_loss_mlp": 1.00342619, + "epoch": 0.7181608310888803, + "flos": 470876742912.0, + "grad_norm": 0.03459418465075036, + "language_loss": 0.86262, + "learning_rate": 0.00019424113603195203, + "loss": 0.87305647, + "num_input_tokens_seen": 309635040, + "router_z_loss_mlp": 0.40209961, + "step": 3733, + "time_per_iteration": 2.5431971549987793 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044053, + "balance_loss_mlp": 1.0037843, + "epoch": 0.7183532127741439, + "flos": 595185107712.0, + "grad_norm": 0.037144823365086815, + "language_loss": 0.8025893, + "learning_rate": 0.0001939946935515657, + "loss": 0.81302989, + "num_input_tokens_seen": 309713696, + "router_z_loss_mlp": 0.40258789, + "step": 3734, + "time_per_iteration": 2.8843894004821777 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045582, + "balance_loss_mlp": 1.00533688, + "epoch": 0.7185455944594075, + "flos": 499916375040.0, + "grad_norm": 0.03883855208122221, + "language_loss": 0.8098954, + "learning_rate": 0.0001937483698729755, + "loss": 0.82035124, + "num_input_tokens_seen": 309782864, + "router_z_loss_mlp": 0.40234375, + "step": 3735, + "time_per_iteration": 2.6381587982177734 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042706, + "balance_loss_mlp": 1.00243688, + "epoch": 0.718737976144671, + "flos": 816308775936.0, + "grad_norm": 0.032230667359085925, + "language_loss": 0.82948256, + "learning_rate": 0.0001935021650918128, + "loss": 0.83990961, + "num_input_tokens_seen": 309867056, + "router_z_loss_mlp": 0.40258789, + "step": 3736, + "time_per_iteration": 3.0015594959259033 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043327, + "balance_loss_mlp": 1.00332057, + "epoch": 0.7189303578299346, + "flos": 439240710912.0, + "grad_norm": 0.03694442625738843, + "language_loss": 0.87466842, + "learning_rate": 0.0001932560793036625, + "loss": 0.88510168, + "num_input_tokens_seen": 309929744, + "router_z_loss_mlp": 0.39990234, + "step": 3737, + "time_per_iteration": 2.522517204284668 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043371, + "balance_loss_mlp": 1.00341213, + "epoch": 0.7191227395151981, + "flos": 550447512576.0, + "grad_norm": 0.0396546540063306, + "language_loss": 0.86941743, + "learning_rate": 0.00019301011260406382, + "loss": 0.87985116, + "num_input_tokens_seen": 309998128, + "router_z_loss_mlp": 0.39941406, + "step": 3738, + "time_per_iteration": 2.6374080181121826 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046219, + "balance_loss_mlp": 1.00616467, + "epoch": 0.7193151212004617, + "flos": 628081917696.0, + "grad_norm": 0.032473190286521646, + "language_loss": 0.80187446, + "learning_rate": 0.00019276426508850936, + "loss": 0.81233668, + "num_input_tokens_seen": 310065472, + "router_z_loss_mlp": 0.40039062, + "step": 3739, + "time_per_iteration": 2.7331862449645996 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046281, + "balance_loss_mlp": 1.00620306, + "epoch": 0.7195075028857253, + "flos": 742441142016.0, + "grad_norm": 0.03365291152671841, + "language_loss": 0.80674922, + "learning_rate": 0.00019251853685244564, + "loss": 0.8172121, + "num_input_tokens_seen": 310152960, + "router_z_loss_mlp": 0.40063477, + "step": 3740, + "time_per_iteration": 3.040309429168701 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044377, + "balance_loss_mlp": 1.00410771, + "epoch": 0.7196998845709889, + "flos": 804291068160.0, + "grad_norm": 0.03612611127551407, + "language_loss": 0.81356812, + "learning_rate": 0.00019227292799127283, + "loss": 0.82401186, + "num_input_tokens_seen": 310234080, + "router_z_loss_mlp": 0.40258789, + "step": 3741, + "time_per_iteration": 3.0432052612304688 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044387, + "balance_loss_mlp": 1.00416589, + "epoch": 0.7198922662562524, + "flos": 926777774592.0, + "grad_norm": 0.036362359760093145, + "language_loss": 0.79752231, + "learning_rate": 0.00019202743860034454, + "loss": 0.80796617, + "num_input_tokens_seen": 310330208, + "router_z_loss_mlp": 0.40209961, + "step": 3742, + "time_per_iteration": 3.223635196685791 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049155, + "balance_loss_mlp": 1.0088625, + "epoch": 0.7200846479415159, + "flos": 581208681216.0, + "grad_norm": 0.0348094997574978, + "language_loss": 0.84359837, + "learning_rate": 0.00019178206877496873, + "loss": 0.85408992, + "num_input_tokens_seen": 310402960, + "router_z_loss_mlp": 0.40283203, + "step": 3743, + "time_per_iteration": 2.6937367916107178 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045555, + "balance_loss_mlp": 1.0053103, + "epoch": 0.7202770296267795, + "flos": 558840367104.0, + "grad_norm": 0.028995122197605715, + "language_loss": 0.85587943, + "learning_rate": 0.0001915368186104059, + "loss": 0.86633497, + "num_input_tokens_seen": 310479776, + "router_z_loss_mlp": 0.40234375, + "step": 3744, + "time_per_iteration": 2.737929582595825 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047401, + "balance_loss_mlp": 1.00722742, + "epoch": 0.7204694113120431, + "flos": 673772143872.0, + "grad_norm": 0.03601847406415609, + "language_loss": 0.81636101, + "learning_rate": 0.0001912916882018706, + "loss": 0.82683504, + "num_input_tokens_seen": 310555952, + "router_z_loss_mlp": 0.40161133, + "step": 3745, + "time_per_iteration": 2.8627820014953613 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010475, + "balance_loss_mlp": 1.00727844, + "epoch": 0.7206617929973067, + "flos": 800596228608.0, + "grad_norm": 0.04088395220221656, + "language_loss": 0.80132556, + "learning_rate": 0.00019104667764453125, + "loss": 0.8118006, + "num_input_tokens_seen": 310634784, + "router_z_loss_mlp": 0.40209961, + "step": 3746, + "time_per_iteration": 3.0283303260803223 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050376, + "balance_loss_mlp": 1.01020253, + "epoch": 0.7208541746825702, + "flos": 532939551744.0, + "grad_norm": 0.030159350032508997, + "language_loss": 0.80461586, + "learning_rate": 0.00019080178703350926, + "loss": 0.81511962, + "num_input_tokens_seen": 310703216, + "router_z_loss_mlp": 0.40161133, + "step": 3747, + "time_per_iteration": 2.6268179416656494 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049649, + "balance_loss_mlp": 1.00945175, + "epoch": 0.7210465563678338, + "flos": 536169742080.0, + "grad_norm": 0.034039887094515435, + "language_loss": 0.83305407, + "learning_rate": 0.00019055701646387952, + "loss": 0.84355056, + "num_input_tokens_seen": 310776816, + "router_z_loss_mlp": 0.40185547, + "step": 3748, + "time_per_iteration": 2.642871618270874 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050407, + "balance_loss_mlp": 1.0114975, + "epoch": 0.7212389380530974, + "flos": 1537249652736.0, + "grad_norm": 0.008513050614024542, + "language_loss": 0.80472684, + "learning_rate": 0.00019031236603067042, + "loss": 0.81523097, + "num_input_tokens_seen": 310987056, + "router_z_loss_mlp": 0.38867188, + "step": 3749, + "time_per_iteration": 4.767102003097534 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046475, + "balance_loss_mlp": 1.00627732, + "epoch": 0.7214313197383609, + "flos": 462453752832.0, + "grad_norm": 0.03442724668025846, + "language_loss": 0.86840045, + "learning_rate": 0.00019006783582886368, + "loss": 0.87886518, + "num_input_tokens_seen": 311051648, + "router_z_loss_mlp": 0.40185547, + "step": 3750, + "time_per_iteration": 2.5307884216308594 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044143, + "balance_loss_mlp": 1.00384998, + "epoch": 0.7216237014236244, + "flos": 1038913874688.0, + "grad_norm": 0.03633272884659257, + "language_loss": 0.83278096, + "learning_rate": 0.00018982342595339437, + "loss": 0.84322238, + "num_input_tokens_seen": 311146272, + "router_z_loss_mlp": 0.40283203, + "step": 3751, + "time_per_iteration": 3.5147032737731934 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044645, + "balance_loss_mlp": 1.00437641, + "epoch": 0.721816083108888, + "flos": 897451382784.0, + "grad_norm": 0.033868816355573705, + "language_loss": 0.82631296, + "learning_rate": 0.00018957913649915076, + "loss": 0.83675945, + "num_input_tokens_seen": 311223760, + "router_z_loss_mlp": 0.40258789, + "step": 3752, + "time_per_iteration": 3.1239399909973145 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044756, + "balance_loss_mlp": 1.00446343, + "epoch": 0.7220084647941516, + "flos": 524312427264.0, + "grad_norm": 0.03748349952969219, + "language_loss": 0.80553722, + "learning_rate": 0.00018933496756097428, + "loss": 0.81598485, + "num_input_tokens_seen": 311290336, + "router_z_loss_mlp": 0.40283203, + "step": 3753, + "time_per_iteration": 2.6250908374786377 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045195, + "balance_loss_mlp": 1.00487828, + "epoch": 0.7222008464794152, + "flos": 817472344320.0, + "grad_norm": 0.035953196977106826, + "language_loss": 0.82196552, + "learning_rate": 0.0001890909192336603, + "loss": 0.83241749, + "num_input_tokens_seen": 311366240, + "router_z_loss_mlp": 0.40307617, + "step": 3754, + "time_per_iteration": 3.015929698944092 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104444, + "balance_loss_mlp": 1.00417137, + "epoch": 0.7223932281646788, + "flos": 750373238016.0, + "grad_norm": 0.03340807501662783, + "language_loss": 0.70701879, + "learning_rate": 0.00018884699161195623, + "loss": 0.7174632, + "num_input_tokens_seen": 311445184, + "router_z_loss_mlp": 0.40258789, + "step": 3755, + "time_per_iteration": 2.934309959411621 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043188, + "balance_loss_mlp": 1.00279963, + "epoch": 0.7225856098499422, + "flos": 746989457664.0, + "grad_norm": 0.03539660333033103, + "language_loss": 0.77625644, + "learning_rate": 0.00018860318479056327, + "loss": 0.78668833, + "num_input_tokens_seen": 311527280, + "router_z_loss_mlp": 0.40380859, + "step": 3756, + "time_per_iteration": 3.092843770980835 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045659, + "balance_loss_mlp": 1.00541389, + "epoch": 0.7227779915352058, + "flos": 548435325696.0, + "grad_norm": 0.03162886339795087, + "language_loss": 0.84069121, + "learning_rate": 0.00018835949886413555, + "loss": 0.85114777, + "num_input_tokens_seen": 311601552, + "router_z_loss_mlp": 0.40234375, + "step": 3757, + "time_per_iteration": 2.697178602218628 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047884, + "balance_loss_mlp": 1.00756705, + "epoch": 0.7229703732204694, + "flos": 531506720256.0, + "grad_norm": 0.03673346832571833, + "language_loss": 0.78688115, + "learning_rate": 0.0001881159339272806, + "loss": 0.79735994, + "num_input_tokens_seen": 311670736, + "router_z_loss_mlp": 0.40307617, + "step": 3758, + "time_per_iteration": 2.672168731689453 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046216, + "balance_loss_mlp": 1.00597107, + "epoch": 0.723162754905733, + "flos": 529366221312.0, + "grad_norm": 0.03397833212706175, + "language_loss": 0.79266065, + "learning_rate": 0.00018787249007455858, + "loss": 0.80312276, + "num_input_tokens_seen": 311736800, + "router_z_loss_mlp": 0.40234375, + "step": 3759, + "time_per_iteration": 2.587975025177002 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046442, + "balance_loss_mlp": 1.00629199, + "epoch": 0.7233551365909965, + "flos": 656060048640.0, + "grad_norm": 0.03524788149604232, + "language_loss": 0.71597099, + "learning_rate": 0.00018762916740048302, + "loss": 0.72643542, + "num_input_tokens_seen": 311806064, + "router_z_loss_mlp": 0.40136719, + "step": 3760, + "time_per_iteration": 2.7926323413848877 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047118, + "balance_loss_mlp": 1.00701594, + "epoch": 0.7235475182762601, + "flos": 523444366848.0, + "grad_norm": 0.0316872797389574, + "language_loss": 0.86490506, + "learning_rate": 0.0001873859659995195, + "loss": 0.87537622, + "num_input_tokens_seen": 311881280, + "router_z_loss_mlp": 0.40087891, + "step": 3761, + "time_per_iteration": 2.7313694953918457 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047248, + "balance_loss_mlp": 1.00721729, + "epoch": 0.7237398999615237, + "flos": 610322190336.0, + "grad_norm": 0.03701947835091587, + "language_loss": 0.84027237, + "learning_rate": 0.0001871428859660878, + "loss": 0.85074484, + "num_input_tokens_seen": 311953696, + "router_z_loss_mlp": 0.40014648, + "step": 3762, + "time_per_iteration": 2.724437952041626 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047082, + "balance_loss_mlp": 1.00707483, + "epoch": 0.7239322816467872, + "flos": 660282720768.0, + "grad_norm": 0.032017946801170455, + "language_loss": 0.82444721, + "learning_rate": 0.00018689992739455975, + "loss": 0.83491802, + "num_input_tokens_seen": 312032752, + "router_z_loss_mlp": 0.39990234, + "step": 3763, + "time_per_iteration": 2.8985331058502197 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045695, + "balance_loss_mlp": 1.00566471, + "epoch": 0.7241246633320508, + "flos": 970941850368.0, + "grad_norm": 0.0325077929756691, + "language_loss": 0.8663789, + "learning_rate": 0.00018665709037926027, + "loss": 0.87683582, + "num_input_tokens_seen": 312120800, + "router_z_loss_mlp": 0.40014648, + "step": 3764, + "time_per_iteration": 3.3307945728302 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043215, + "balance_loss_mlp": 1.00323248, + "epoch": 0.7243170450173143, + "flos": 516000252672.0, + "grad_norm": 0.037062443743513, + "language_loss": 0.85301733, + "learning_rate": 0.00018641437501446694, + "loss": 0.86344957, + "num_input_tokens_seen": 312188416, + "router_z_loss_mlp": 0.3996582, + "step": 3765, + "time_per_iteration": 2.57521915435791 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01041755, + "balance_loss_mlp": 1.00170028, + "epoch": 0.7245094267025779, + "flos": 560806867200.0, + "grad_norm": 0.03616258332607596, + "language_loss": 0.82752323, + "learning_rate": 0.0001861717813944104, + "loss": 0.83794075, + "num_input_tokens_seen": 312257792, + "router_z_loss_mlp": 0.40039062, + "step": 3766, + "time_per_iteration": 2.6512858867645264 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042886, + "balance_loss_mlp": 1.00287974, + "epoch": 0.7247018083878415, + "flos": 613775956992.0, + "grad_norm": 0.03625673893536532, + "language_loss": 0.79743433, + "learning_rate": 0.00018592930961327365, + "loss": 0.80786318, + "num_input_tokens_seen": 312328544, + "router_z_loss_mlp": 0.39990234, + "step": 3767, + "time_per_iteration": 2.704402208328247 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045962, + "balance_loss_mlp": 1.00588417, + "epoch": 0.7248941900731051, + "flos": 635871117312.0, + "grad_norm": 0.03196657989519071, + "language_loss": 0.88960397, + "learning_rate": 0.00018568695976519273, + "loss": 0.90006363, + "num_input_tokens_seen": 312405888, + "router_z_loss_mlp": 0.40063477, + "step": 3768, + "time_per_iteration": 2.764528751373291 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046055, + "balance_loss_mlp": 1.0059768, + "epoch": 0.7250865717583687, + "flos": 425837803776.0, + "grad_norm": 0.0390622861553884, + "language_loss": 0.80584097, + "learning_rate": 0.00018544473194425593, + "loss": 0.81630147, + "num_input_tokens_seen": 312469552, + "router_z_loss_mlp": 0.40063477, + "step": 3769, + "time_per_iteration": 2.4841666221618652 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043424, + "balance_loss_mlp": 1.00329816, + "epoch": 0.7252789534436321, + "flos": 636398949888.0, + "grad_norm": 0.04244308423853245, + "language_loss": 0.79393184, + "learning_rate": 0.00018520262624450485, + "loss": 0.80436611, + "num_input_tokens_seen": 312548848, + "router_z_loss_mlp": 0.40112305, + "step": 3770, + "time_per_iteration": 2.8432021141052246 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046388, + "balance_loss_mlp": 1.00638103, + "epoch": 0.7254713351288957, + "flos": 618354408192.0, + "grad_norm": 0.03205335937009439, + "language_loss": 0.87801862, + "learning_rate": 0.00018496064275993324, + "loss": 0.88848257, + "num_input_tokens_seen": 312622016, + "router_z_loss_mlp": 0.39990234, + "step": 3771, + "time_per_iteration": 2.753740072250366 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046271, + "balance_loss_mlp": 1.00612164, + "epoch": 0.7256637168141593, + "flos": 768291412992.0, + "grad_norm": 0.038084131410306525, + "language_loss": 0.82372004, + "learning_rate": 0.00018471878158448686, + "loss": 0.83418274, + "num_input_tokens_seen": 312696960, + "router_z_loss_mlp": 0.40136719, + "step": 3772, + "time_per_iteration": 2.917302370071411 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048364, + "balance_loss_mlp": 1.0082382, + "epoch": 0.7258560984994229, + "flos": 496727980800.0, + "grad_norm": 0.02992069132066452, + "language_loss": 0.84553695, + "learning_rate": 0.00018447704281206512, + "loss": 0.85602057, + "num_input_tokens_seen": 312774352, + "router_z_loss_mlp": 0.40112305, + "step": 3773, + "time_per_iteration": 2.843857765197754 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048932, + "balance_loss_mlp": 1.00878251, + "epoch": 0.7260484801846864, + "flos": 531142192896.0, + "grad_norm": 0.03465658020099934, + "language_loss": 0.83523774, + "learning_rate": 0.0001842354265365191, + "loss": 0.84572709, + "num_input_tokens_seen": 312849600, + "router_z_loss_mlp": 0.40136719, + "step": 3774, + "time_per_iteration": 2.6899774074554443 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046049, + "balance_loss_mlp": 1.00592351, + "epoch": 0.72624086186995, + "flos": 626108614656.0, + "grad_norm": 0.036794080035960464, + "language_loss": 0.81133199, + "learning_rate": 0.0001839939328516526, + "loss": 0.82179248, + "num_input_tokens_seen": 312922688, + "router_z_loss_mlp": 0.40112305, + "step": 3775, + "time_per_iteration": 2.75508451461792 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104583, + "balance_loss_mlp": 1.0056808, + "epoch": 0.7264332435552135, + "flos": 717805962240.0, + "grad_norm": 0.03611168561837021, + "language_loss": 0.82141531, + "learning_rate": 0.0001837525618512218, + "loss": 0.83187354, + "num_input_tokens_seen": 312997728, + "router_z_loss_mlp": 0.40136719, + "step": 3776, + "time_per_iteration": 2.8697876930236816 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047141, + "balance_loss_mlp": 1.0069201, + "epoch": 0.7266256252404771, + "flos": 682242766080.0, + "grad_norm": 0.036803325831150785, + "language_loss": 0.83319986, + "learning_rate": 0.00018351131362893519, + "loss": 0.84367126, + "num_input_tokens_seen": 313067168, + "router_z_loss_mlp": 0.40209961, + "step": 3777, + "time_per_iteration": 2.7980828285217285 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046144, + "balance_loss_mlp": 1.00580347, + "epoch": 0.7268180069257407, + "flos": 519918668544.0, + "grad_norm": 0.038913474879357805, + "language_loss": 0.81077832, + "learning_rate": 0.00018327018827845364, + "loss": 0.82123971, + "num_input_tokens_seen": 313134688, + "router_z_loss_mlp": 0.40332031, + "step": 3778, + "time_per_iteration": 2.610944986343384 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045641, + "balance_loss_mlp": 1.00537205, + "epoch": 0.7270103886110042, + "flos": 513673115904.0, + "grad_norm": 0.03821848161600015, + "language_loss": 0.88036418, + "learning_rate": 0.00018302918589339036, + "loss": 0.89082056, + "num_input_tokens_seen": 313204816, + "router_z_loss_mlp": 0.40258789, + "step": 3779, + "time_per_iteration": 2.6776628494262695 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044448, + "balance_loss_mlp": 1.00413156, + "epoch": 0.7272027702962678, + "flos": 547692665088.0, + "grad_norm": 0.03543573147287282, + "language_loss": 0.90566671, + "learning_rate": 0.00018278830656731054, + "loss": 0.91611117, + "num_input_tokens_seen": 313274288, + "router_z_loss_mlp": 0.40307617, + "step": 3780, + "time_per_iteration": 2.6612467765808105 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043906, + "balance_loss_mlp": 1.003613, + "epoch": 0.7273951519815314, + "flos": 594155687424.0, + "grad_norm": 0.02879348395383923, + "language_loss": 0.86881804, + "learning_rate": 0.00018254755039373222, + "loss": 0.87925708, + "num_input_tokens_seen": 313344800, + "router_z_loss_mlp": 0.40283203, + "step": 3781, + "time_per_iteration": 2.724158763885498 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045168, + "balance_loss_mlp": 1.00482738, + "epoch": 0.727587533666795, + "flos": 607139632128.0, + "grad_norm": 0.03859798712496429, + "language_loss": 0.84525704, + "learning_rate": 0.0001823069174661252, + "loss": 0.85570872, + "num_input_tokens_seen": 313417840, + "router_z_loss_mlp": 0.40332031, + "step": 3782, + "time_per_iteration": 2.7668051719665527 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044246, + "balance_loss_mlp": 1.00395322, + "epoch": 0.7277799153520584, + "flos": 514026949632.0, + "grad_norm": 0.03650439895450689, + "language_loss": 0.78873003, + "learning_rate": 0.00018206640787791112, + "loss": 0.79917252, + "num_input_tokens_seen": 313485936, + "router_z_loss_mlp": 0.40283203, + "step": 3783, + "time_per_iteration": 2.649040699005127 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042952, + "balance_loss_mlp": 1.00268304, + "epoch": 0.727972297037322, + "flos": 538794332160.0, + "grad_norm": 0.03501392489574684, + "language_loss": 0.86669183, + "learning_rate": 0.00018182602172246416, + "loss": 0.87712133, + "num_input_tokens_seen": 313553136, + "router_z_loss_mlp": 0.40258789, + "step": 3784, + "time_per_iteration": 2.603267192840576 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045035, + "balance_loss_mlp": 1.00474274, + "epoch": 0.7281646787225856, + "flos": 536076423168.0, + "grad_norm": 0.037923852732183974, + "language_loss": 0.77186882, + "learning_rate": 0.00018158575909311075, + "loss": 0.78231919, + "num_input_tokens_seen": 313620128, + "router_z_loss_mlp": 0.40283203, + "step": 3785, + "time_per_iteration": 2.6864423751831055 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045231, + "balance_loss_mlp": 1.00489032, + "epoch": 0.7283570604078492, + "flos": 626210681856.0, + "grad_norm": 0.0363846490797151, + "language_loss": 0.80090117, + "learning_rate": 0.000181345620083129, + "loss": 0.81135345, + "num_input_tokens_seen": 313696432, + "router_z_loss_mlp": 0.40332031, + "step": 3786, + "time_per_iteration": 2.7992641925811768 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045844, + "balance_loss_mlp": 1.00548017, + "epoch": 0.7285494420931128, + "flos": 535255994880.0, + "grad_norm": 0.04682580138791378, + "language_loss": 0.86931181, + "learning_rate": 0.00018110560478574927, + "loss": 0.87977034, + "num_input_tokens_seen": 313768416, + "router_z_loss_mlp": 0.40356445, + "step": 3787, + "time_per_iteration": 2.680211305618286 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043183, + "balance_loss_mlp": 1.00277114, + "epoch": 0.7287418237783763, + "flos": 667741419264.0, + "grad_norm": 0.04795946543380901, + "language_loss": 0.80688787, + "learning_rate": 0.0001808657132941533, + "loss": 0.81731963, + "num_input_tokens_seen": 313839888, + "router_z_loss_mlp": 0.40405273, + "step": 3788, + "time_per_iteration": 2.793989658355713 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104313, + "balance_loss_mlp": 1.00286126, + "epoch": 0.7289342054636399, + "flos": 551639271168.0, + "grad_norm": 0.04788875018667363, + "language_loss": 0.83400464, + "learning_rate": 0.00018062594570147572, + "loss": 0.84443599, + "num_input_tokens_seen": 313908832, + "router_z_loss_mlp": 0.40258789, + "step": 3789, + "time_per_iteration": 2.5800626277923584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043681, + "balance_loss_mlp": 1.00331712, + "epoch": 0.7291265871489034, + "flos": 689139605760.0, + "grad_norm": 0.0306016583616733, + "language_loss": 0.86152685, + "learning_rate": 0.00018038630210080243, + "loss": 0.87196368, + "num_input_tokens_seen": 313982672, + "router_z_loss_mlp": 0.40356445, + "step": 3790, + "time_per_iteration": 2.791778326034546 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01041726, + "balance_loss_mlp": 1.00133801, + "epoch": 0.729318968834167, + "flos": 573771369984.0, + "grad_norm": 0.03320164846736232, + "language_loss": 0.8504535, + "learning_rate": 0.0001801467825851712, + "loss": 0.86087084, + "num_input_tokens_seen": 314057184, + "router_z_loss_mlp": 0.40380859, + "step": 3791, + "time_per_iteration": 2.7736573219299316 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043265, + "balance_loss_mlp": 1.00278151, + "epoch": 0.7295113505194305, + "flos": 587165528832.0, + "grad_norm": 0.039500127545913186, + "language_loss": 0.79190361, + "learning_rate": 0.00017990738724757172, + "loss": 0.80233628, + "num_input_tokens_seen": 314137344, + "router_z_loss_mlp": 0.40478516, + "step": 3792, + "time_per_iteration": 2.8482463359832764 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043868, + "balance_loss_mlp": 1.00345612, + "epoch": 0.7297037322046941, + "flos": 708442980096.0, + "grad_norm": 0.03263259511522569, + "language_loss": 0.82787073, + "learning_rate": 0.00017966811618094598, + "loss": 0.83830941, + "num_input_tokens_seen": 314214464, + "router_z_loss_mlp": 0.40405273, + "step": 3793, + "time_per_iteration": 2.889319658279419 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044424, + "balance_loss_mlp": 1.0039407, + "epoch": 0.7298961138899577, + "flos": 488308881408.0, + "grad_norm": 0.03689917900491825, + "language_loss": 0.85408473, + "learning_rate": 0.00017942896947818664, + "loss": 0.86452901, + "num_input_tokens_seen": 314280432, + "router_z_loss_mlp": 0.40478516, + "step": 3794, + "time_per_iteration": 2.550274133682251 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043034, + "balance_loss_mlp": 1.00383759, + "epoch": 0.7300884955752213, + "flos": 1368624600576.0, + "grad_norm": 0.005828351386569188, + "language_loss": 0.74825054, + "learning_rate": 0.000179189947232139, + "loss": 0.75868088, + "num_input_tokens_seen": 314497152, + "router_z_loss_mlp": 0.39160156, + "step": 3795, + "time_per_iteration": 4.89626932144165 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042163, + "balance_loss_mlp": 1.00179839, + "epoch": 0.7302808772604849, + "flos": 532837484544.0, + "grad_norm": 0.04171921070399138, + "language_loss": 0.85686743, + "learning_rate": 0.00017895104953559947, + "loss": 0.86728907, + "num_input_tokens_seen": 314565488, + "router_z_loss_mlp": 0.40356445, + "step": 3796, + "time_per_iteration": 2.57736873626709 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042422, + "balance_loss_mlp": 1.00203419, + "epoch": 0.7304732589457483, + "flos": 437063273472.0, + "grad_norm": 0.04046264333697194, + "language_loss": 0.90178061, + "learning_rate": 0.00017871227648131672, + "loss": 0.91220486, + "num_input_tokens_seen": 314627392, + "router_z_loss_mlp": 0.40380859, + "step": 3797, + "time_per_iteration": 2.474209785461426 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104528, + "balance_loss_mlp": 1.00489223, + "epoch": 0.7306656406310119, + "flos": 452604734208.0, + "grad_norm": 0.029697022991301388, + "language_loss": 0.82934296, + "learning_rate": 0.0001784736281619907, + "loss": 0.83979571, + "num_input_tokens_seen": 314695440, + "router_z_loss_mlp": 0.40380859, + "step": 3798, + "time_per_iteration": 2.5923726558685303 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044562, + "balance_loss_mlp": 1.00407827, + "epoch": 0.7308580223162755, + "flos": 513030577152.0, + "grad_norm": 0.032710497654363443, + "language_loss": 0.75410861, + "learning_rate": 0.00017823510467027232, + "loss": 0.7645542, + "num_input_tokens_seen": 314772592, + "router_z_loss_mlp": 0.40478516, + "step": 3799, + "time_per_iteration": 2.7622478008270264 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045673, + "balance_loss_mlp": 1.00521374, + "epoch": 0.7310504040015391, + "flos": 376283596800.0, + "grad_norm": 0.039904062723008, + "language_loss": 0.79136682, + "learning_rate": 0.00017799670609876516, + "loss": 0.80182356, + "num_input_tokens_seen": 314836192, + "router_z_loss_mlp": 0.40454102, + "step": 3800, + "time_per_iteration": 2.493797540664673 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042729, + "balance_loss_mlp": 1.00222194, + "epoch": 0.7312427856868026, + "flos": 550382383872.0, + "grad_norm": 0.0325229913216085, + "language_loss": 0.89329851, + "learning_rate": 0.00017775843254002366, + "loss": 0.90372574, + "num_input_tokens_seen": 314908400, + "router_z_loss_mlp": 0.4050293, + "step": 3801, + "time_per_iteration": 2.7277941703796387 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047641, + "balance_loss_mlp": 1.00727654, + "epoch": 0.7314351673720662, + "flos": 768678294528.0, + "grad_norm": 0.03330924575668911, + "language_loss": 0.84167385, + "learning_rate": 0.00017752028408655367, + "loss": 0.8521502, + "num_input_tokens_seen": 314995280, + "router_z_loss_mlp": 0.40356445, + "step": 3802, + "time_per_iteration": 3.040632486343384 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104856, + "balance_loss_mlp": 1.00824392, + "epoch": 0.7316275490573297, + "flos": 487705226496.0, + "grad_norm": 0.03826862590336393, + "language_loss": 0.8564449, + "learning_rate": 0.00017728226083081272, + "loss": 0.86693048, + "num_input_tokens_seen": 315063056, + "router_z_loss_mlp": 0.40307617, + "step": 3803, + "time_per_iteration": 2.5550501346588135 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048554, + "balance_loss_mlp": 1.00833249, + "epoch": 0.7318199307425933, + "flos": 474413134848.0, + "grad_norm": 0.03815942500131441, + "language_loss": 0.82039976, + "learning_rate": 0.00017704436286520965, + "loss": 0.83088529, + "num_input_tokens_seen": 315128896, + "router_z_loss_mlp": 0.40209961, + "step": 3804, + "time_per_iteration": 2.58294677734375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048179, + "balance_loss_mlp": 1.00793362, + "epoch": 0.7320123124278569, + "flos": 550512641280.0, + "grad_norm": 0.03634721787215332, + "language_loss": 0.8514055, + "learning_rate": 0.0001768065902821046, + "loss": 0.86188722, + "num_input_tokens_seen": 315198464, + "router_z_loss_mlp": 0.40234375, + "step": 3805, + "time_per_iteration": 2.6493990421295166 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046919, + "balance_loss_mlp": 1.00665021, + "epoch": 0.7322046941131204, + "flos": 571900134144.0, + "grad_norm": 0.03447588355898286, + "language_loss": 0.82488358, + "learning_rate": 0.00017656894317380907, + "loss": 0.83535278, + "num_input_tokens_seen": 315270240, + "router_z_loss_mlp": 0.40258789, + "step": 3806, + "time_per_iteration": 2.7446413040161133 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043594, + "balance_loss_mlp": 1.00468445, + "epoch": 0.732397075798384, + "flos": 1472503928064.0, + "grad_norm": 0.008037479366224719, + "language_loss": 0.76031268, + "learning_rate": 0.00017633142163258565, + "loss": 0.77074862, + "num_input_tokens_seen": 315502448, + "router_z_loss_mlp": 0.38867188, + "step": 3807, + "time_per_iteration": 5.015838623046875 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044453, + "balance_loss_mlp": 1.00413609, + "epoch": 0.7325894574836476, + "flos": 465831697152.0, + "grad_norm": 0.038585998350043275, + "language_loss": 0.84359336, + "learning_rate": 0.00017609402575064875, + "loss": 0.85403788, + "num_input_tokens_seen": 315569472, + "router_z_loss_mlp": 0.40307617, + "step": 3808, + "time_per_iteration": 2.5619466304779053 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044203, + "balance_loss_mlp": 1.00398183, + "epoch": 0.7327818391689112, + "flos": 496482050304.0, + "grad_norm": 0.03775450514575077, + "language_loss": 0.81649804, + "learning_rate": 0.00017585675562016367, + "loss": 0.82694006, + "num_input_tokens_seen": 315637632, + "router_z_loss_mlp": 0.40209961, + "step": 3809, + "time_per_iteration": 2.5793349742889404 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044159, + "balance_loss_mlp": 1.00524902, + "epoch": 0.7329742208541746, + "flos": 1436682162432.0, + "grad_norm": 0.007309956802170158, + "language_loss": 0.77212846, + "learning_rate": 0.0001756196113332465, + "loss": 0.78257012, + "num_input_tokens_seen": 315863648, + "router_z_loss_mlp": 0.38867188, + "step": 3810, + "time_per_iteration": 4.810467720031738 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043162, + "balance_loss_mlp": 1.00303602, + "epoch": 0.7331666025394382, + "flos": 497869195008.0, + "grad_norm": 0.0392578744691446, + "language_loss": 0.85801327, + "learning_rate": 0.00017538259298196474, + "loss": 0.86844486, + "num_input_tokens_seen": 315930752, + "router_z_loss_mlp": 0.40112305, + "step": 3811, + "time_per_iteration": 2.5858519077301025 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046746, + "balance_loss_mlp": 1.00657213, + "epoch": 0.7333589842247018, + "flos": 539639059968.0, + "grad_norm": 0.03309973691359967, + "language_loss": 0.82286286, + "learning_rate": 0.00017514570065833745, + "loss": 0.83333039, + "num_input_tokens_seen": 316006400, + "router_z_loss_mlp": 0.40161133, + "step": 3812, + "time_per_iteration": 2.693704843521118 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045338, + "balance_loss_mlp": 1.00525999, + "epoch": 0.7335513659099654, + "flos": 492042604800.0, + "grad_norm": 0.03925978819405336, + "language_loss": 0.81363267, + "learning_rate": 0.00017490893445433426, + "loss": 0.82408601, + "num_input_tokens_seen": 316075824, + "router_z_loss_mlp": 0.40063477, + "step": 3813, + "time_per_iteration": 2.608065128326416 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044158, + "balance_loss_mlp": 1.00384104, + "epoch": 0.733743747595229, + "flos": 563253567744.0, + "grad_norm": 0.033972583106890976, + "language_loss": 0.82267326, + "learning_rate": 0.00017467229446187587, + "loss": 0.83311474, + "num_input_tokens_seen": 316148336, + "router_z_loss_mlp": 0.40307617, + "step": 3814, + "time_per_iteration": 2.6955394744873047 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043196, + "balance_loss_mlp": 1.00290346, + "epoch": 0.7339361292804925, + "flos": 539649753600.0, + "grad_norm": 0.03487524168244714, + "language_loss": 0.81803584, + "learning_rate": 0.00017443578077283424, + "loss": 0.82846785, + "num_input_tokens_seen": 316220960, + "router_z_loss_mlp": 0.40283203, + "step": 3815, + "time_per_iteration": 2.6844675540924072 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047596, + "balance_loss_mlp": 1.00742269, + "epoch": 0.734128510965756, + "flos": 549561955584.0, + "grad_norm": 0.03210943726156845, + "language_loss": 0.85443103, + "learning_rate": 0.0001741993934790319, + "loss": 0.86490697, + "num_input_tokens_seen": 316295824, + "router_z_loss_mlp": 0.40161133, + "step": 3816, + "time_per_iteration": 2.754804849624634 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104727, + "balance_loss_mlp": 1.0069536, + "epoch": 0.7343208926510196, + "flos": 541202148864.0, + "grad_norm": 0.03979674876858525, + "language_loss": 0.84579813, + "learning_rate": 0.00017396313267224273, + "loss": 0.85627079, + "num_input_tokens_seen": 316368064, + "router_z_loss_mlp": 0.40307617, + "step": 3817, + "time_per_iteration": 2.7152209281921387 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046673, + "balance_loss_mlp": 1.00638032, + "epoch": 0.7345132743362832, + "flos": 572171342592.0, + "grad_norm": 0.03405657916649516, + "language_loss": 0.88968074, + "learning_rate": 0.0001737269984441912, + "loss": 0.9001475, + "num_input_tokens_seen": 316437440, + "router_z_loss_mlp": 0.40283203, + "step": 3818, + "time_per_iteration": 2.63198184967041 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049333, + "balance_loss_mlp": 1.00906372, + "epoch": 0.7347056560215467, + "flos": 546481464576.0, + "grad_norm": 0.04751068267806247, + "language_loss": 0.85475308, + "learning_rate": 0.00017349099088655263, + "loss": 0.86524642, + "num_input_tokens_seen": 316511936, + "router_z_loss_mlp": 0.40258789, + "step": 3819, + "time_per_iteration": 2.796168804168701 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046287, + "balance_loss_mlp": 1.0060178, + "epoch": 0.7348980377068103, + "flos": 597077730816.0, + "grad_norm": 0.03129969376285051, + "language_loss": 0.81227374, + "learning_rate": 0.00017325511009095375, + "loss": 0.82273662, + "num_input_tokens_seen": 316584304, + "router_z_loss_mlp": 0.40258789, + "step": 3820, + "time_per_iteration": 2.7165210247039795 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046487, + "balance_loss_mlp": 1.00621831, + "epoch": 0.7350904193920739, + "flos": 539612815104.0, + "grad_norm": 0.03503609859827407, + "language_loss": 0.84185189, + "learning_rate": 0.00017301935614897113, + "loss": 0.8523168, + "num_input_tokens_seen": 316659024, + "router_z_loss_mlp": 0.40258789, + "step": 3821, + "time_per_iteration": 2.7012970447540283 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046737, + "balance_loss_mlp": 1.00656378, + "epoch": 0.7352828010773375, + "flos": 514061942784.0, + "grad_norm": 0.02996543941139594, + "language_loss": 0.8232463, + "learning_rate": 0.00017278372915213274, + "loss": 0.83371365, + "num_input_tokens_seen": 316732544, + "router_z_loss_mlp": 0.40161133, + "step": 3822, + "time_per_iteration": 2.646228313446045 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105101, + "balance_loss_mlp": 1.01171875, + "epoch": 0.735475182762601, + "flos": 1557258749184.0, + "grad_norm": 0.004879497460224864, + "language_loss": 0.79893845, + "learning_rate": 0.00017254822919191693, + "loss": 0.80944854, + "num_input_tokens_seen": 316967104, + "router_z_loss_mlp": 0.39257812, + "step": 3823, + "time_per_iteration": 5.001528024673462 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046562, + "balance_loss_mlp": 1.00636435, + "epoch": 0.7356675644478645, + "flos": 682612151040.0, + "grad_norm": 0.0358517187113506, + "language_loss": 0.8115629, + "learning_rate": 0.00017231285635975314, + "loss": 0.82202852, + "num_input_tokens_seen": 317048304, + "router_z_loss_mlp": 0.40185547, + "step": 3824, + "time_per_iteration": 2.916127920150757 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046328, + "balance_loss_mlp": 1.00615466, + "epoch": 0.7358599461331281, + "flos": 516232577280.0, + "grad_norm": 0.05204398731861849, + "language_loss": 0.83695984, + "learning_rate": 0.00017207761074702115, + "loss": 0.8474232, + "num_input_tokens_seen": 317115968, + "router_z_loss_mlp": 0.40161133, + "step": 3825, + "time_per_iteration": 2.62750506401062 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104673, + "balance_loss_mlp": 1.00662804, + "epoch": 0.7360523278183917, + "flos": 444917601792.0, + "grad_norm": 0.03194798623104488, + "language_loss": 0.84528393, + "learning_rate": 0.0001718424924450514, + "loss": 0.85575122, + "num_input_tokens_seen": 317185680, + "router_z_loss_mlp": 0.40087891, + "step": 3826, + "time_per_iteration": 2.61261248588562 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046993, + "balance_loss_mlp": 1.00684357, + "epoch": 0.7362447095036553, + "flos": 604551980544.0, + "grad_norm": 0.028984397633237662, + "language_loss": 0.86482602, + "learning_rate": 0.00017160750154512482, + "loss": 0.875296, + "num_input_tokens_seen": 317258800, + "router_z_loss_mlp": 0.40136719, + "step": 3827, + "time_per_iteration": 2.6998865604400635 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043184, + "balance_loss_mlp": 1.00305784, + "epoch": 0.7364370911889189, + "flos": 554251222272.0, + "grad_norm": 0.040234447169501614, + "language_loss": 0.8371399, + "learning_rate": 0.0001713726381384731, + "loss": 0.84757173, + "num_input_tokens_seen": 317334608, + "router_z_loss_mlp": 0.40112305, + "step": 3828, + "time_per_iteration": 2.746196746826172 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01041809, + "balance_loss_mlp": 1.00163531, + "epoch": 0.7366294728741823, + "flos": 449990837760.0, + "grad_norm": 0.03659096604544618, + "language_loss": 0.81686258, + "learning_rate": 0.00017113790231627812, + "loss": 0.82728064, + "num_input_tokens_seen": 317397504, + "router_z_loss_mlp": 0.40161133, + "step": 3829, + "time_per_iteration": 2.5232386589050293 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043747, + "balance_loss_mlp": 1.00445557, + "epoch": 0.7368218545594459, + "flos": 1538705816832.0, + "grad_norm": 0.007725694552394297, + "language_loss": 0.79258227, + "learning_rate": 0.0001709032941696726, + "loss": 0.80301964, + "num_input_tokens_seen": 317611472, + "router_z_loss_mlp": 0.39257812, + "step": 3830, + "time_per_iteration": 4.843308448791504 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044253, + "balance_loss_mlp": 1.00405526, + "epoch": 0.7370142362447095, + "flos": 516473650176.0, + "grad_norm": 0.03681023024701871, + "language_loss": 0.82271254, + "learning_rate": 0.00017066881378973936, + "loss": 0.83315504, + "num_input_tokens_seen": 317681328, + "router_z_loss_mlp": 0.40185547, + "step": 3831, + "time_per_iteration": 2.684302806854248 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045812, + "balance_loss_mlp": 1.00578201, + "epoch": 0.7372066179299731, + "flos": 501905229312.0, + "grad_norm": 0.03287634093560934, + "language_loss": 0.83259964, + "learning_rate": 0.00017043446126751189, + "loss": 0.84305775, + "num_input_tokens_seen": 317752336, + "router_z_loss_mlp": 0.40014648, + "step": 3832, + "time_per_iteration": 2.710259199142456 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044175, + "balance_loss_mlp": 1.00409663, + "epoch": 0.7373989996152366, + "flos": 559167955968.0, + "grad_norm": 0.03638251388363948, + "language_loss": 0.76960367, + "learning_rate": 0.00017020023669397376, + "loss": 0.78004539, + "num_input_tokens_seen": 317824112, + "router_z_loss_mlp": 0.40063477, + "step": 3833, + "time_per_iteration": 2.735877752304077 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050598, + "balance_loss_mlp": 1.01054347, + "epoch": 0.7375913813005002, + "flos": 507781396992.0, + "grad_norm": 0.059668100448601574, + "language_loss": 0.82237148, + "learning_rate": 0.0001699661401600589, + "loss": 0.8328774, + "num_input_tokens_seen": 317889120, + "router_z_loss_mlp": 0.40039062, + "step": 3834, + "time_per_iteration": 2.579663038253784 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047411, + "balance_loss_mlp": 1.007357, + "epoch": 0.7377837629857638, + "flos": 487156006656.0, + "grad_norm": 0.03637906521459096, + "language_loss": 0.78828633, + "learning_rate": 0.00016973217175665205, + "loss": 0.79876041, + "num_input_tokens_seen": 317953792, + "router_z_loss_mlp": 0.40039062, + "step": 3835, + "time_per_iteration": 2.6623384952545166 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046513, + "balance_loss_mlp": 1.00731659, + "epoch": 0.7379761446710273, + "flos": 1417880375808.0, + "grad_norm": 0.007661340220520439, + "language_loss": 0.8116616, + "learning_rate": 0.00016949833157458755, + "loss": 0.82212675, + "num_input_tokens_seen": 318184848, + "router_z_loss_mlp": 0.39160156, + "step": 3836, + "time_per_iteration": 4.928514003753662 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046331, + "balance_loss_mlp": 1.00634801, + "epoch": 0.7381685263562909, + "flos": 630910642176.0, + "grad_norm": 0.035800200298820535, + "language_loss": 0.84820634, + "learning_rate": 0.00016926461970465047, + "loss": 0.85866964, + "num_input_tokens_seen": 318259296, + "router_z_loss_mlp": 0.3996582, + "step": 3837, + "time_per_iteration": 2.762173891067505 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043931, + "balance_loss_mlp": 1.00382948, + "epoch": 0.7383609080415544, + "flos": 740652531456.0, + "grad_norm": 0.029602535209274302, + "language_loss": 0.84896356, + "learning_rate": 0.00016903103623757516, + "loss": 0.85940289, + "num_input_tokens_seen": 318344704, + "router_z_loss_mlp": 0.40087891, + "step": 3838, + "time_per_iteration": 3.0506296157836914 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045591, + "balance_loss_mlp": 1.00541735, + "epoch": 0.738553289726818, + "flos": 551257247232.0, + "grad_norm": 0.038121042805401205, + "language_loss": 0.807634, + "learning_rate": 0.00016879758126404738, + "loss": 0.8180899, + "num_input_tokens_seen": 318416128, + "router_z_loss_mlp": 0.40161133, + "step": 3839, + "time_per_iteration": 2.715830087661743 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104385, + "balance_loss_mlp": 1.00372398, + "epoch": 0.7387456714120816, + "flos": 911776785408.0, + "grad_norm": 0.03920302310428291, + "language_loss": 0.80385631, + "learning_rate": 0.00016856425487470216, + "loss": 0.81429482, + "num_input_tokens_seen": 318498128, + "router_z_loss_mlp": 0.40112305, + "step": 3840, + "time_per_iteration": 3.1212151050567627 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044355, + "balance_loss_mlp": 1.00422895, + "epoch": 0.7389380530973452, + "flos": 854197163520.0, + "grad_norm": 0.035349098992081385, + "language_loss": 0.79466581, + "learning_rate": 0.00016833105716012486, + "loss": 0.80510932, + "num_input_tokens_seen": 318578048, + "router_z_loss_mlp": 0.40112305, + "step": 3841, + "time_per_iteration": 3.1690988540649414 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044188, + "balance_loss_mlp": 1.0040617, + "epoch": 0.7391304347826086, + "flos": 818421084672.0, + "grad_norm": 0.0368177293104177, + "language_loss": 0.85204184, + "learning_rate": 0.00016809798821085088, + "loss": 0.86248374, + "num_input_tokens_seen": 318654784, + "router_z_loss_mlp": 0.40112305, + "step": 3842, + "time_per_iteration": 3.033186435699463 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104542, + "balance_loss_mlp": 1.00536537, + "epoch": 0.7393228164678722, + "flos": 573938565888.0, + "grad_norm": 0.03389595177699646, + "language_loss": 0.89421487, + "learning_rate": 0.00016786504811736565, + "loss": 0.90466905, + "num_input_tokens_seen": 318727680, + "router_z_loss_mlp": 0.40039062, + "step": 3843, + "time_per_iteration": 2.723698616027832 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104513, + "balance_loss_mlp": 1.00500441, + "epoch": 0.7395151981531358, + "flos": 686576253696.0, + "grad_norm": 0.0300135100261375, + "language_loss": 0.83072603, + "learning_rate": 0.00016763223697010442, + "loss": 0.84117734, + "num_input_tokens_seen": 318807568, + "router_z_loss_mlp": 0.40112305, + "step": 3844, + "time_per_iteration": 2.975797414779663 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043993, + "balance_loss_mlp": 1.00389111, + "epoch": 0.7397075798383994, + "flos": 557455167744.0, + "grad_norm": 0.04240767697887406, + "language_loss": 0.84802914, + "learning_rate": 0.00016739955485945256, + "loss": 0.85846901, + "num_input_tokens_seen": 318881792, + "router_z_loss_mlp": 0.40087891, + "step": 3845, + "time_per_iteration": 2.720717191696167 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044705, + "balance_loss_mlp": 1.00448394, + "epoch": 0.739899961523663, + "flos": 547822922496.0, + "grad_norm": 0.04053063595065812, + "language_loss": 0.86230588, + "learning_rate": 0.00016716700187574513, + "loss": 0.87275296, + "num_input_tokens_seen": 318951552, + "router_z_loss_mlp": 0.40209961, + "step": 3846, + "time_per_iteration": 2.703578472137451 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045075, + "balance_loss_mlp": 1.00492537, + "epoch": 0.7400923432089265, + "flos": 610304693760.0, + "grad_norm": 0.03543720475620032, + "language_loss": 0.84347486, + "learning_rate": 0.0001669345781092675, + "loss": 0.85392559, + "num_input_tokens_seen": 319022304, + "router_z_loss_mlp": 0.40136719, + "step": 3847, + "time_per_iteration": 2.7703194618225098 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044797, + "balance_loss_mlp": 1.00455177, + "epoch": 0.7402847248941901, + "flos": 592180439040.0, + "grad_norm": 0.0397830502127856, + "language_loss": 0.87809312, + "learning_rate": 0.0001667022836502546, + "loss": 0.8885411, + "num_input_tokens_seen": 319093200, + "router_z_loss_mlp": 0.40234375, + "step": 3848, + "time_per_iteration": 2.760023355484009 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046535, + "balance_loss_mlp": 1.00629044, + "epoch": 0.7404771065794536, + "flos": 478305305856.0, + "grad_norm": 0.03878201132992699, + "language_loss": 0.83579338, + "learning_rate": 0.00016647011858889077, + "loss": 0.84625876, + "num_input_tokens_seen": 319159712, + "router_z_loss_mlp": 0.40234375, + "step": 3849, + "time_per_iteration": 2.566498041152954 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044055, + "balance_loss_mlp": 1.00385714, + "epoch": 0.7406694882647172, + "flos": 497467729152.0, + "grad_norm": 0.04044358723064945, + "language_loss": 0.86492926, + "learning_rate": 0.00016623808301531056, + "loss": 0.87536979, + "num_input_tokens_seen": 319230544, + "router_z_loss_mlp": 0.40185547, + "step": 3850, + "time_per_iteration": 2.659444808959961 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043918, + "balance_loss_mlp": 1.00367308, + "epoch": 0.7408618699499807, + "flos": 563327444736.0, + "grad_norm": 0.04103255616090965, + "language_loss": 0.79822052, + "learning_rate": 0.00016600617701959842, + "loss": 0.80865979, + "num_input_tokens_seen": 319305440, + "router_z_loss_mlp": 0.40234375, + "step": 3851, + "time_per_iteration": 2.7590160369873047 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044136, + "balance_loss_mlp": 1.0050354, + "epoch": 0.7410542516352443, + "flos": 1391472136704.0, + "grad_norm": 0.004180276378427017, + "language_loss": 0.78843814, + "learning_rate": 0.00016577440069178811, + "loss": 0.7988795, + "num_input_tokens_seen": 319534384, + "router_z_loss_mlp": 0.390625, + "step": 3852, + "time_per_iteration": 4.960458040237427 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047168, + "balance_loss_mlp": 1.00699425, + "epoch": 0.7412466333205079, + "flos": 671212682496.0, + "grad_norm": 0.032734679500117485, + "language_loss": 0.81693292, + "learning_rate": 0.00016554275412186315, + "loss": 0.82740462, + "num_input_tokens_seen": 319610960, + "router_z_loss_mlp": 0.40161133, + "step": 3853, + "time_per_iteration": 2.8345468044281006 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045922, + "balance_loss_mlp": 1.00579631, + "epoch": 0.7414390150057715, + "flos": 490319122944.0, + "grad_norm": 0.03898197484032271, + "language_loss": 0.81142187, + "learning_rate": 0.0001653112373997568, + "loss": 0.82188106, + "num_input_tokens_seen": 319683872, + "router_z_loss_mlp": 0.40112305, + "step": 3854, + "time_per_iteration": 2.6750757694244385 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048137, + "balance_loss_mlp": 1.00786769, + "epoch": 0.7416313966910351, + "flos": 600494558976.0, + "grad_norm": 0.046812555930759385, + "language_loss": 0.75529599, + "learning_rate": 0.0001650798506153517, + "loss": 0.76577735, + "num_input_tokens_seen": 319750032, + "router_z_loss_mlp": 0.40258789, + "step": 3855, + "time_per_iteration": 2.7398931980133057 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044745, + "balance_loss_mlp": 1.00440431, + "epoch": 0.7418237783762985, + "flos": 543587611392.0, + "grad_norm": 0.04165043457756402, + "language_loss": 0.84612322, + "learning_rate": 0.00016484859385848023, + "loss": 0.85657072, + "num_input_tokens_seen": 319818864, + "router_z_loss_mlp": 0.40332031, + "step": 3856, + "time_per_iteration": 2.6185436248779297 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047216, + "balance_loss_mlp": 1.00692356, + "epoch": 0.7420161600615621, + "flos": 545224577280.0, + "grad_norm": 0.03738954086230496, + "language_loss": 0.77780879, + "learning_rate": 0.0001646174672189243, + "loss": 0.78828102, + "num_input_tokens_seen": 319888816, + "router_z_loss_mlp": 0.40283203, + "step": 3857, + "time_per_iteration": 2.689188241958618 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046862, + "balance_loss_mlp": 1.00661707, + "epoch": 0.7422085417468257, + "flos": 528211401216.0, + "grad_norm": 0.03526154422012509, + "language_loss": 0.80570501, + "learning_rate": 0.00016438647078641488, + "loss": 0.81617367, + "num_input_tokens_seen": 319956176, + "router_z_loss_mlp": 0.40234375, + "step": 3858, + "time_per_iteration": 2.5922017097473145 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042793, + "balance_loss_mlp": 1.00247657, + "epoch": 0.7424009234320893, + "flos": 509761502976.0, + "grad_norm": 0.033547873778652565, + "language_loss": 0.83657616, + "learning_rate": 0.00016415560465063344, + "loss": 0.84700406, + "num_input_tokens_seen": 320028560, + "router_z_loss_mlp": 0.40307617, + "step": 3859, + "time_per_iteration": 2.7559196949005127 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042486, + "balance_loss_mlp": 1.00216925, + "epoch": 0.7425933051173528, + "flos": 513607987200.0, + "grad_norm": 0.0418042544684692, + "language_loss": 0.79894865, + "learning_rate": 0.0001639248689012095, + "loss": 0.8093735, + "num_input_tokens_seen": 320096112, + "router_z_loss_mlp": 0.40307617, + "step": 3860, + "time_per_iteration": 2.5863146781921387 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042581, + "balance_loss_mlp": 1.00235939, + "epoch": 0.7427856868026164, + "flos": 459378119424.0, + "grad_norm": 0.03937431006783476, + "language_loss": 0.88026142, + "learning_rate": 0.00016369426362772271, + "loss": 0.89068723, + "num_input_tokens_seen": 320168992, + "router_z_loss_mlp": 0.40209961, + "step": 3861, + "time_per_iteration": 2.761857271194458 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046937, + "balance_loss_mlp": 1.00681162, + "epoch": 0.74297806848788, + "flos": 606188946432.0, + "grad_norm": 0.03201159100602054, + "language_loss": 0.80801797, + "learning_rate": 0.00016346378891970233, + "loss": 0.81848741, + "num_input_tokens_seen": 320247264, + "router_z_loss_mlp": 0.40112305, + "step": 3862, + "time_per_iteration": 2.8071773052215576 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047986, + "balance_loss_mlp": 1.00797915, + "epoch": 0.7431704501731435, + "flos": 893071229952.0, + "grad_norm": 0.0336740145247338, + "language_loss": 0.81989479, + "learning_rate": 0.00016323344486662633, + "loss": 0.8303746, + "num_input_tokens_seen": 320338992, + "router_z_loss_mlp": 0.39990234, + "step": 3863, + "time_per_iteration": 3.324979066848755 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048133, + "balance_loss_mlp": 1.0081737, + "epoch": 0.7433628318584071, + "flos": 593352755712.0, + "grad_norm": 0.03174757765296807, + "language_loss": 0.78870291, + "learning_rate": 0.00016300323155792247, + "loss": 0.7991842, + "num_input_tokens_seen": 320422096, + "router_z_loss_mlp": 0.39941406, + "step": 3864, + "time_per_iteration": 2.9272854328155518 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052134, + "balance_loss_mlp": 1.01215136, + "epoch": 0.7435552135436706, + "flos": 478190599680.0, + "grad_norm": 0.033980491156459056, + "language_loss": 0.89128578, + "learning_rate": 0.00016277314908296687, + "loss": 0.90180707, + "num_input_tokens_seen": 320492640, + "router_z_loss_mlp": 0.3996582, + "step": 3865, + "time_per_iteration": 2.6214301586151123 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050851, + "balance_loss_mlp": 1.01086855, + "epoch": 0.7437475952289342, + "flos": 674432179200.0, + "grad_norm": 0.04325039484001494, + "language_loss": 0.76593798, + "learning_rate": 0.00016254319753108604, + "loss": 0.77644652, + "num_input_tokens_seen": 320565264, + "router_z_loss_mlp": 0.3996582, + "step": 3866, + "time_per_iteration": 2.899153232574463 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047117, + "balance_loss_mlp": 1.00706291, + "epoch": 0.7439399769141978, + "flos": 771771424512.0, + "grad_norm": 0.03836259627327615, + "language_loss": 0.77282906, + "learning_rate": 0.00016231337699155492, + "loss": 0.78330016, + "num_input_tokens_seen": 320647584, + "router_z_loss_mlp": 0.40039062, + "step": 3867, + "time_per_iteration": 3.037646532058716 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046954, + "balance_loss_mlp": 1.00680459, + "epoch": 0.7441323585994614, + "flos": 649039754496.0, + "grad_norm": 0.035166098424979836, + "language_loss": 0.78786439, + "learning_rate": 0.0001620836875535977, + "loss": 0.79833388, + "num_input_tokens_seen": 320722752, + "router_z_loss_mlp": 0.40136719, + "step": 3868, + "time_per_iteration": 2.850342273712158 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044105, + "balance_loss_mlp": 1.00385988, + "epoch": 0.7443247402847248, + "flos": 566501254656.0, + "grad_norm": 0.03170658148117992, + "language_loss": 0.81203747, + "learning_rate": 0.00016185412930638766, + "loss": 0.82247853, + "num_input_tokens_seen": 320802496, + "router_z_loss_mlp": 0.40234375, + "step": 3869, + "time_per_iteration": 2.845094680786133 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042983, + "balance_loss_mlp": 1.00283337, + "epoch": 0.7445171219699884, + "flos": 579680585472.0, + "grad_norm": 0.03566273998402668, + "language_loss": 0.8328712, + "learning_rate": 0.00016162470233904765, + "loss": 0.843301, + "num_input_tokens_seen": 320872496, + "router_z_loss_mlp": 0.40136719, + "step": 3870, + "time_per_iteration": 2.720104217529297 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043223, + "balance_loss_mlp": 1.00304985, + "epoch": 0.744709503655252, + "flos": 620030257920.0, + "grad_norm": 0.03479057330030947, + "language_loss": 0.82728422, + "learning_rate": 0.00016139540674064856, + "loss": 0.83771646, + "num_input_tokens_seen": 320944992, + "router_z_loss_mlp": 0.40161133, + "step": 3871, + "time_per_iteration": 2.7673120498657227 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042095, + "balance_loss_mlp": 1.00208843, + "epoch": 0.7449018853405156, + "flos": 529681171200.0, + "grad_norm": 0.03196452770059439, + "language_loss": 0.78282529, + "learning_rate": 0.00016116624260021113, + "loss": 0.79324627, + "num_input_tokens_seen": 321020208, + "router_z_loss_mlp": 0.39990234, + "step": 3872, + "time_per_iteration": 2.7602975368499756 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042056, + "balance_loss_mlp": 1.00197768, + "epoch": 0.7450942670257792, + "flos": 434223855360.0, + "grad_norm": 0.03942691463996184, + "language_loss": 0.84282726, + "learning_rate": 0.0001609372100067046, + "loss": 0.85324788, + "num_input_tokens_seen": 321085984, + "router_z_loss_mlp": 0.40063477, + "step": 3873, + "time_per_iteration": 2.557443618774414 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043349, + "balance_loss_mlp": 1.00324678, + "epoch": 0.7452866487110427, + "flos": 698166250752.0, + "grad_norm": 0.03979606180562333, + "language_loss": 0.85209823, + "learning_rate": 0.0001607083090490475, + "loss": 0.86253166, + "num_input_tokens_seen": 321163200, + "router_z_loss_mlp": 0.40087891, + "step": 3874, + "time_per_iteration": 2.9215829372406006 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042214, + "balance_loss_mlp": 1.00213587, + "epoch": 0.7454790303963063, + "flos": 513280398336.0, + "grad_norm": 0.038948732221191794, + "language_loss": 0.80756831, + "learning_rate": 0.00016047953981610714, + "loss": 0.81799042, + "num_input_tokens_seen": 321237328, + "router_z_loss_mlp": 0.40063477, + "step": 3875, + "time_per_iteration": 2.7751615047454834 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042316, + "balance_loss_mlp": 1.00331116, + "epoch": 0.7456714120815698, + "flos": 1328876637696.0, + "grad_norm": 0.007608844356592571, + "language_loss": 0.7972964, + "learning_rate": 0.00016025090239669916, + "loss": 0.80771959, + "num_input_tokens_seen": 321456192, + "router_z_loss_mlp": 0.38964844, + "step": 3876, + "time_per_iteration": 4.963236331939697 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104273, + "balance_loss_mlp": 1.00258029, + "epoch": 0.7458637937668334, + "flos": 722972517120.0, + "grad_norm": 0.03405336651276997, + "language_loss": 0.81492639, + "learning_rate": 0.0001600223968795889, + "loss": 0.82535368, + "num_input_tokens_seen": 321530560, + "router_z_loss_mlp": 0.40136719, + "step": 3877, + "time_per_iteration": 2.910365581512451 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01040028, + "balance_loss_mlp": 1.00102234, + "epoch": 0.746056175452097, + "flos": 1504869014784.0, + "grad_norm": 0.004565558570820898, + "language_loss": 0.75696075, + "learning_rate": 0.00015979402335349004, + "loss": 0.76736104, + "num_input_tokens_seen": 321760928, + "router_z_loss_mlp": 0.38964844, + "step": 3878, + "time_per_iteration": 4.932594060897827 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042019, + "balance_loss_mlp": 1.00196517, + "epoch": 0.7462485571373605, + "flos": 521295119616.0, + "grad_norm": 0.03746689938213739, + "language_loss": 0.82366681, + "learning_rate": 0.00015956578190706483, + "loss": 0.83408701, + "num_input_tokens_seen": 321833248, + "router_z_loss_mlp": 0.40039062, + "step": 3879, + "time_per_iteration": 2.6971168518066406 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043092, + "balance_loss_mlp": 1.00282276, + "epoch": 0.7464409388226241, + "flos": 482167341312.0, + "grad_norm": 0.03527801182694915, + "language_loss": 0.76289219, + "learning_rate": 0.00015933767262892468, + "loss": 0.77332312, + "num_input_tokens_seen": 321905904, + "router_z_loss_mlp": 0.40258789, + "step": 3880, + "time_per_iteration": 2.739508628845215 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043029, + "balance_loss_mlp": 1.00290275, + "epoch": 0.7466333205078877, + "flos": 487742164992.0, + "grad_norm": 0.04213099092543845, + "language_loss": 0.82585847, + "learning_rate": 0.00015910969560762927, + "loss": 0.83628881, + "num_input_tokens_seen": 321971920, + "router_z_loss_mlp": 0.40112305, + "step": 3881, + "time_per_iteration": 2.562812089920044 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01041177, + "balance_loss_mlp": 1.00102758, + "epoch": 0.7468257021931513, + "flos": 612408254208.0, + "grad_norm": 0.03436500005268551, + "language_loss": 0.83349586, + "learning_rate": 0.00015888185093168727, + "loss": 0.84390759, + "num_input_tokens_seen": 322041904, + "router_z_loss_mlp": 0.40136719, + "step": 3882, + "time_per_iteration": 2.775710105895996 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044594, + "balance_loss_mlp": 1.00434875, + "epoch": 0.7470180838784147, + "flos": 534485144064.0, + "grad_norm": 0.033392076126709996, + "language_loss": 0.81580567, + "learning_rate": 0.00015865413868955581, + "loss": 0.82625163, + "num_input_tokens_seen": 322110816, + "router_z_loss_mlp": 0.40234375, + "step": 3883, + "time_per_iteration": 2.641209125518799 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042889, + "balance_loss_mlp": 1.00276339, + "epoch": 0.7472104655636783, + "flos": 740673918720.0, + "grad_norm": 0.03165690169757385, + "language_loss": 0.83215499, + "learning_rate": 0.00015842655896964054, + "loss": 0.84258389, + "num_input_tokens_seen": 322192704, + "router_z_loss_mlp": 0.40112305, + "step": 3884, + "time_per_iteration": 3.0401206016540527 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042132, + "balance_loss_mlp": 1.00191045, + "epoch": 0.7474028472489419, + "flos": 641502321408.0, + "grad_norm": 0.03740320780985832, + "language_loss": 0.74281669, + "learning_rate": 0.00015819911186029567, + "loss": 0.75323802, + "num_input_tokens_seen": 322263888, + "router_z_loss_mlp": 0.40209961, + "step": 3885, + "time_per_iteration": 2.7730581760406494 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104359, + "balance_loss_mlp": 1.00332129, + "epoch": 0.7475952289342055, + "flos": 591326962944.0, + "grad_norm": 0.03361665798046632, + "language_loss": 0.8701033, + "learning_rate": 0.00015797179744982443, + "loss": 0.88053918, + "num_input_tokens_seen": 322331936, + "router_z_loss_mlp": 0.40258789, + "step": 3886, + "time_per_iteration": 2.708472967147827 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043352, + "balance_loss_mlp": 1.00303507, + "epoch": 0.7477876106194691, + "flos": 489220683264.0, + "grad_norm": 0.029904604338816032, + "language_loss": 0.79095513, + "learning_rate": 0.00015774461582647765, + "loss": 0.80138862, + "num_input_tokens_seen": 322402032, + "router_z_loss_mlp": 0.40307617, + "step": 3887, + "time_per_iteration": 2.619105100631714 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044379, + "balance_loss_mlp": 1.00406253, + "epoch": 0.7479799923047326, + "flos": 555790011648.0, + "grad_norm": 0.036783241933874694, + "language_loss": 0.81563759, + "learning_rate": 0.00015751756707845505, + "loss": 0.82608134, + "num_input_tokens_seen": 322472512, + "router_z_loss_mlp": 0.40307617, + "step": 3888, + "time_per_iteration": 2.639768123626709 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01041973, + "balance_loss_mlp": 1.00170422, + "epoch": 0.7481723739899961, + "flos": 768791055360.0, + "grad_norm": 0.03246382733666718, + "language_loss": 0.88938636, + "learning_rate": 0.00015729065129390502, + "loss": 0.89980614, + "num_input_tokens_seen": 322555104, + "router_z_loss_mlp": 0.40258789, + "step": 3889, + "time_per_iteration": 3.0039196014404297 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01041937, + "balance_loss_mlp": 1.00162077, + "epoch": 0.7483647556752597, + "flos": 497161527552.0, + "grad_norm": 0.037416161983298064, + "language_loss": 0.82518947, + "learning_rate": 0.0001570638685609241, + "loss": 0.83560884, + "num_input_tokens_seen": 322621904, + "router_z_loss_mlp": 0.40307617, + "step": 3890, + "time_per_iteration": 2.6009106636047363 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042747, + "balance_loss_mlp": 1.00238311, + "epoch": 0.7485571373605233, + "flos": 473826976512.0, + "grad_norm": 0.0374886975546847, + "language_loss": 0.80841064, + "learning_rate": 0.00015683721896755693, + "loss": 0.81883812, + "num_input_tokens_seen": 322688928, + "router_z_loss_mlp": 0.40356445, + "step": 3891, + "time_per_iteration": 2.5633225440979004 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050194, + "balance_loss_mlp": 1.0109024, + "epoch": 0.7487495190457868, + "flos": 1557901287936.0, + "grad_norm": 0.009107033640044568, + "language_loss": 0.82210493, + "learning_rate": 0.00015661070260179682, + "loss": 0.83260679, + "num_input_tokens_seen": 322928464, + "router_z_loss_mlp": 0.39257812, + "step": 3892, + "time_per_iteration": 4.94974160194397 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046825, + "balance_loss_mlp": 1.00665128, + "epoch": 0.7489419007310504, + "flos": 582967156224.0, + "grad_norm": 0.04143959916189291, + "language_loss": 0.85828441, + "learning_rate": 0.00015638431955158528, + "loss": 0.8687526, + "num_input_tokens_seen": 323002672, + "router_z_loss_mlp": 0.40161133, + "step": 3893, + "time_per_iteration": 2.6816978454589844 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047314, + "balance_loss_mlp": 1.0072118, + "epoch": 0.749134282416314, + "flos": 568698134016.0, + "grad_norm": 0.030135437984765083, + "language_loss": 0.81634343, + "learning_rate": 0.00015615806990481186, + "loss": 0.82681662, + "num_input_tokens_seen": 323076480, + "router_z_loss_mlp": 0.40087891, + "step": 3894, + "time_per_iteration": 2.7294962406158447 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046959, + "balance_loss_mlp": 1.0068568, + "epoch": 0.7493266641015776, + "flos": 534166303488.0, + "grad_norm": 0.0348465154646137, + "language_loss": 0.84720361, + "learning_rate": 0.00015593195374931452, + "loss": 0.85767317, + "num_input_tokens_seen": 323151840, + "router_z_loss_mlp": 0.40087891, + "step": 3895, + "time_per_iteration": 2.7430076599121094 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047, + "balance_loss_mlp": 1.00685048, + "epoch": 0.7495190457868411, + "flos": 524718750720.0, + "grad_norm": 0.040656951694221274, + "language_loss": 0.80276871, + "learning_rate": 0.00015570597117287922, + "loss": 0.81323874, + "num_input_tokens_seen": 323223376, + "router_z_loss_mlp": 0.40136719, + "step": 3896, + "time_per_iteration": 2.6507298946380615 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01041154, + "balance_loss_mlp": 1.00107622, + "epoch": 0.7497114274721046, + "flos": 515190518016.0, + "grad_norm": 0.03462966662761621, + "language_loss": 0.78418148, + "learning_rate": 0.0001554801222632406, + "loss": 0.79459298, + "num_input_tokens_seen": 323290288, + "router_z_loss_mlp": 0.40063477, + "step": 3897, + "time_per_iteration": 2.595093250274658 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042035, + "balance_loss_mlp": 1.00186181, + "epoch": 0.7499038091573682, + "flos": 495997959168.0, + "grad_norm": 0.03336183376647943, + "language_loss": 0.85394609, + "learning_rate": 0.00015525440710808052, + "loss": 0.86436647, + "num_input_tokens_seen": 323359568, + "router_z_loss_mlp": 0.40161133, + "step": 3898, + "time_per_iteration": 2.643571376800537 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043313, + "balance_loss_mlp": 1.00302041, + "epoch": 0.7500961908426318, + "flos": 738989320704.0, + "grad_norm": 0.03519199778666105, + "language_loss": 0.78480381, + "learning_rate": 0.00015502882579502953, + "loss": 0.79523695, + "num_input_tokens_seen": 323436688, + "router_z_loss_mlp": 0.40283203, + "step": 3899, + "time_per_iteration": 2.97965669631958 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043279, + "balance_loss_mlp": 1.00303352, + "epoch": 0.7502885725278954, + "flos": 534537633792.0, + "grad_norm": 0.03091865582012727, + "language_loss": 0.85061979, + "learning_rate": 0.00015480337841166592, + "loss": 0.86105257, + "num_input_tokens_seen": 323510032, + "router_z_loss_mlp": 0.40234375, + "step": 3900, + "time_per_iteration": 2.7444653511047363 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043426, + "balance_loss_mlp": 1.00322855, + "epoch": 0.7504809542131589, + "flos": 590559024384.0, + "grad_norm": 0.034641340110691664, + "language_loss": 0.83055896, + "learning_rate": 0.00015457806504551647, + "loss": 0.84099317, + "num_input_tokens_seen": 323588896, + "router_z_loss_mlp": 0.40185547, + "step": 3901, + "time_per_iteration": 2.847846269607544 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047237, + "balance_loss_mlp": 1.0071348, + "epoch": 0.7506733358984224, + "flos": 512583424512.0, + "grad_norm": 0.03350221131006084, + "language_loss": 0.78925437, + "learning_rate": 0.0001543528857840554, + "loss": 0.79972672, + "num_input_tokens_seen": 323661280, + "router_z_loss_mlp": 0.40087891, + "step": 3902, + "time_per_iteration": 2.6609957218170166 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047342, + "balance_loss_mlp": 1.00728762, + "epoch": 0.750865717583686, + "flos": 540383665920.0, + "grad_norm": 0.03644816467758723, + "language_loss": 0.80910051, + "learning_rate": 0.000154127840714705, + "loss": 0.81957394, + "num_input_tokens_seen": 323739200, + "router_z_loss_mlp": 0.40039062, + "step": 3903, + "time_per_iteration": 2.778198003768921 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048232, + "balance_loss_mlp": 1.00810659, + "epoch": 0.7510580992689496, + "flos": 477541257984.0, + "grad_norm": 0.040090358516612946, + "language_loss": 0.8254571, + "learning_rate": 0.00015390292992483557, + "loss": 0.83593941, + "num_input_tokens_seen": 323802816, + "router_z_loss_mlp": 0.40112305, + "step": 3904, + "time_per_iteration": 2.5382485389709473 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047868, + "balance_loss_mlp": 1.0078609, + "epoch": 0.7512504809542132, + "flos": 580201615104.0, + "grad_norm": 0.03358602757025677, + "language_loss": 0.84426451, + "learning_rate": 0.00015367815350176523, + "loss": 0.85474312, + "num_input_tokens_seen": 323879488, + "router_z_loss_mlp": 0.39990234, + "step": 3905, + "time_per_iteration": 2.741651773452759 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104803, + "balance_loss_mlp": 1.00804722, + "epoch": 0.7514428626394767, + "flos": 419564060928.0, + "grad_norm": 0.03247714739847641, + "language_loss": 0.83377486, + "learning_rate": 0.00015345351153275987, + "loss": 0.84425521, + "num_input_tokens_seen": 323944512, + "router_z_loss_mlp": 0.3996582, + "step": 3906, + "time_per_iteration": 2.5285587310791016 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01041564, + "balance_loss_mlp": 1.0013901, + "epoch": 0.7516352443247403, + "flos": 642255675648.0, + "grad_norm": 0.03199624670716249, + "language_loss": 0.81475991, + "learning_rate": 0.00015322900410503332, + "loss": 0.82517552, + "num_input_tokens_seen": 324020688, + "router_z_loss_mlp": 0.40161133, + "step": 3907, + "time_per_iteration": 2.814133405685425 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043835, + "balance_loss_mlp": 1.00366187, + "epoch": 0.7518276260100039, + "flos": 582192414720.0, + "grad_norm": 0.03412627966929826, + "language_loss": 0.77873939, + "learning_rate": 0.00015300463130574703, + "loss": 0.78917778, + "num_input_tokens_seen": 324098080, + "router_z_loss_mlp": 0.40161133, + "step": 3908, + "time_per_iteration": 2.909247875213623 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042204, + "balance_loss_mlp": 1.00210238, + "epoch": 0.7520200076952674, + "flos": 688616630784.0, + "grad_norm": 0.028908861637072923, + "language_loss": 0.82461572, + "learning_rate": 0.00015278039322201033, + "loss": 0.83503771, + "num_input_tokens_seen": 324183968, + "router_z_loss_mlp": 0.40087891, + "step": 3909, + "time_per_iteration": 2.9831857681274414 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044247, + "balance_loss_mlp": 1.00419319, + "epoch": 0.7522123893805309, + "flos": 487416521472.0, + "grad_norm": 0.03727501857461446, + "language_loss": 0.8023864, + "learning_rate": 0.00015255628994088004, + "loss": 0.8128289, + "num_input_tokens_seen": 324249568, + "router_z_loss_mlp": 0.40039062, + "step": 3910, + "time_per_iteration": 2.5681653022766113 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104353, + "balance_loss_mlp": 1.00352335, + "epoch": 0.7524047710657945, + "flos": 820592686080.0, + "grad_norm": 0.03692479601662457, + "language_loss": 0.75641394, + "learning_rate": 0.00015233232154936082, + "loss": 0.76684928, + "num_input_tokens_seen": 324345312, + "router_z_loss_mlp": 0.39990234, + "step": 3911, + "time_per_iteration": 3.284299612045288 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046246, + "balance_loss_mlp": 1.00616753, + "epoch": 0.7525971527510581, + "flos": 700782092544.0, + "grad_norm": 0.03573003611692562, + "language_loss": 0.76908588, + "learning_rate": 0.0001521084881344048, + "loss": 0.77954835, + "num_input_tokens_seen": 324419056, + "router_z_loss_mlp": 0.40063477, + "step": 3912, + "time_per_iteration": 2.8574602603912354 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01041534, + "balance_loss_mlp": 1.00155079, + "epoch": 0.7527895344363217, + "flos": 634950567168.0, + "grad_norm": 0.03264325310237669, + "language_loss": 0.8679074, + "learning_rate": 0.00015188478978291208, + "loss": 0.87832272, + "num_input_tokens_seen": 324490848, + "router_z_loss_mlp": 0.3996582, + "step": 3913, + "time_per_iteration": 2.7522592544555664 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01041792, + "balance_loss_mlp": 1.00173748, + "epoch": 0.7529819161215853, + "flos": 563933044992.0, + "grad_norm": 0.03193556827495635, + "language_loss": 0.86971831, + "learning_rate": 0.00015166122658173014, + "loss": 0.88013625, + "num_input_tokens_seen": 324565648, + "router_z_loss_mlp": 0.40039062, + "step": 3914, + "time_per_iteration": 2.8044931888580322 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042313, + "balance_loss_mlp": 1.00230658, + "epoch": 0.7531742978068487, + "flos": 691957636608.0, + "grad_norm": 0.032939092122736, + "language_loss": 0.89373708, + "learning_rate": 0.00015143779861765332, + "loss": 0.90416014, + "num_input_tokens_seen": 324642832, + "router_z_loss_mlp": 0.39990234, + "step": 3915, + "time_per_iteration": 2.895873546600342 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042622, + "balance_loss_mlp": 1.00266302, + "epoch": 0.7533666794921123, + "flos": 682307894784.0, + "grad_norm": 0.030283450917942635, + "language_loss": 0.81763279, + "learning_rate": 0.00015121450597742458, + "loss": 0.82805902, + "num_input_tokens_seen": 324718336, + "router_z_loss_mlp": 0.39941406, + "step": 3916, + "time_per_iteration": 2.8187012672424316 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042765, + "balance_loss_mlp": 1.00275862, + "epoch": 0.7535590611773759, + "flos": 624814788864.0, + "grad_norm": 0.03530069245734392, + "language_loss": 0.79033458, + "learning_rate": 0.00015099134874773369, + "loss": 0.80076224, + "num_input_tokens_seen": 324787744, + "router_z_loss_mlp": 0.39990234, + "step": 3917, + "time_per_iteration": 2.729224443435669 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042185, + "balance_loss_mlp": 1.0022496, + "epoch": 0.7537514428626395, + "flos": 520494133248.0, + "grad_norm": 0.030735782054698856, + "language_loss": 0.80733752, + "learning_rate": 0.00015076832701521793, + "loss": 0.81775939, + "num_input_tokens_seen": 324863280, + "router_z_loss_mlp": 0.39916992, + "step": 3918, + "time_per_iteration": 2.7341344356536865 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104636, + "balance_loss_mlp": 1.00632989, + "epoch": 0.753943824547903, + "flos": 725035248384.0, + "grad_norm": 0.03833991263993651, + "language_loss": 0.82337809, + "learning_rate": 0.000150545440866462, + "loss": 0.83384174, + "num_input_tokens_seen": 324949600, + "router_z_loss_mlp": 0.40014648, + "step": 3919, + "time_per_iteration": 2.988217353820801 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045499, + "balance_loss_mlp": 1.00544465, + "epoch": 0.7541362062331666, + "flos": 438467914752.0, + "grad_norm": 0.03907672700659196, + "language_loss": 0.78807712, + "learning_rate": 0.000150322690387998, + "loss": 0.79853213, + "num_input_tokens_seen": 325013808, + "router_z_loss_mlp": 0.40039062, + "step": 3920, + "time_per_iteration": 2.503204107284546 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048491, + "balance_loss_mlp": 1.00841296, + "epoch": 0.7543285879184302, + "flos": 566344752384.0, + "grad_norm": 0.03511209658305934, + "language_loss": 0.7581147, + "learning_rate": 0.00015010007566630535, + "loss": 0.76859963, + "num_input_tokens_seen": 325084832, + "router_z_loss_mlp": 0.40063477, + "step": 3921, + "time_per_iteration": 2.785719633102417 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046349, + "balance_loss_mlp": 1.00624704, + "epoch": 0.7545209696036937, + "flos": 522059167488.0, + "grad_norm": 0.043005780548435554, + "language_loss": 0.81968284, + "learning_rate": 0.00014987759678781077, + "loss": 0.83014631, + "num_input_tokens_seen": 325155120, + "router_z_loss_mlp": 0.40087891, + "step": 3922, + "time_per_iteration": 2.611830711364746 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045351, + "balance_loss_mlp": 1.00524938, + "epoch": 0.7547133512889573, + "flos": 617210281728.0, + "grad_norm": 0.034097045182419745, + "language_loss": 0.82924581, + "learning_rate": 0.00014965525383888795, + "loss": 0.83969939, + "num_input_tokens_seen": 325235632, + "router_z_loss_mlp": 0.40087891, + "step": 3923, + "time_per_iteration": 2.7791478633880615 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104536, + "balance_loss_mlp": 1.00532925, + "epoch": 0.7549057329742208, + "flos": 752142406656.0, + "grad_norm": 0.03232128162967594, + "language_loss": 0.72664821, + "learning_rate": 0.00014943304690585851, + "loss": 0.73710179, + "num_input_tokens_seen": 325309696, + "router_z_loss_mlp": 0.40014648, + "step": 3924, + "time_per_iteration": 2.8950600624084473 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047725, + "balance_loss_mlp": 1.00767088, + "epoch": 0.7550981146594844, + "flos": 515451032832.0, + "grad_norm": 0.03846404623424841, + "language_loss": 0.79993105, + "learning_rate": 0.0001492109760749908, + "loss": 0.81040823, + "num_input_tokens_seen": 325375744, + "router_z_loss_mlp": 0.40039062, + "step": 3925, + "time_per_iteration": 2.582379102706909 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047733, + "balance_loss_mlp": 1.00763071, + "epoch": 0.755290496344748, + "flos": 523027349760.0, + "grad_norm": 0.03160852953683284, + "language_loss": 0.80470473, + "learning_rate": 0.00014898904143250002, + "loss": 0.81518203, + "num_input_tokens_seen": 325448384, + "router_z_loss_mlp": 0.40087891, + "step": 3926, + "time_per_iteration": 2.642066240310669 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047142, + "balance_loss_mlp": 1.00804138, + "epoch": 0.7554828780300116, + "flos": 1417706376960.0, + "grad_norm": 0.005903328707274883, + "language_loss": 0.75755203, + "learning_rate": 0.00014876724306454886, + "loss": 0.76802349, + "num_input_tokens_seen": 325678672, + "router_z_loss_mlp": 0.390625, + "step": 3927, + "time_per_iteration": 4.909573793411255 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045265, + "balance_loss_mlp": 1.00513911, + "epoch": 0.7556752597152752, + "flos": 557986891008.0, + "grad_norm": 0.0318859682760306, + "language_loss": 0.80794632, + "learning_rate": 0.0001485455810572474, + "loss": 0.81839895, + "num_input_tokens_seen": 325746656, + "router_z_loss_mlp": 0.40112305, + "step": 3928, + "time_per_iteration": 2.635267734527588 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044887, + "balance_loss_mlp": 1.00466609, + "epoch": 0.7558676414005386, + "flos": 564742779648.0, + "grad_norm": 0.029085057110465686, + "language_loss": 0.84313619, + "learning_rate": 0.00014832405549665236, + "loss": 0.853585, + "num_input_tokens_seen": 325820304, + "router_z_loss_mlp": 0.40209961, + "step": 3929, + "time_per_iteration": 2.7366552352905273 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104529, + "balance_loss_mlp": 1.00514007, + "epoch": 0.7560600230858022, + "flos": 562535206656.0, + "grad_norm": 0.03398651483995001, + "language_loss": 0.79036754, + "learning_rate": 0.00014810266646876746, + "loss": 0.80082047, + "num_input_tokens_seen": 325895584, + "router_z_loss_mlp": 0.40136719, + "step": 3930, + "time_per_iteration": 2.748523712158203 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045678, + "balance_loss_mlp": 1.00550461, + "epoch": 0.7562524047710658, + "flos": 720958384896.0, + "grad_norm": 0.03398387115243252, + "language_loss": 0.78128892, + "learning_rate": 0.00014788141405954364, + "loss": 0.79174572, + "num_input_tokens_seen": 325976752, + "router_z_loss_mlp": 0.40161133, + "step": 3931, + "time_per_iteration": 3.0010688304901123 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046674, + "balance_loss_mlp": 1.0064286, + "epoch": 0.7564447864563294, + "flos": 544397346048.0, + "grad_norm": 0.04087931734394053, + "language_loss": 0.85259515, + "learning_rate": 0.00014766029835487865, + "loss": 0.8630619, + "num_input_tokens_seen": 326047152, + "router_z_loss_mlp": 0.40234375, + "step": 3932, + "time_per_iteration": 2.7051644325256348 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045407, + "balance_loss_mlp": 1.00528109, + "epoch": 0.7566371681415929, + "flos": 727095067392.0, + "grad_norm": 0.040524003150174424, + "language_loss": 0.80254388, + "learning_rate": 0.0001474393194406173, + "loss": 0.812998, + "num_input_tokens_seen": 326119056, + "router_z_loss_mlp": 0.40112305, + "step": 3933, + "time_per_iteration": 2.88698410987854 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045593, + "balance_loss_mlp": 1.00546694, + "epoch": 0.7568295498268565, + "flos": 577807404288.0, + "grad_norm": 0.03205492443871288, + "language_loss": 0.80140668, + "learning_rate": 0.00014721847740255112, + "loss": 0.81186259, + "num_input_tokens_seen": 326196736, + "router_z_loss_mlp": 0.40112305, + "step": 3934, + "time_per_iteration": 2.8201425075531006 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042671, + "balance_loss_mlp": 1.00357056, + "epoch": 0.75702193151212, + "flos": 1523218791168.0, + "grad_norm": 0.006266777740012466, + "language_loss": 0.73911923, + "learning_rate": 0.00014699777232641853, + "loss": 0.74954593, + "num_input_tokens_seen": 326404752, + "router_z_loss_mlp": 0.390625, + "step": 3935, + "time_per_iteration": 4.622663736343384 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01041056, + "balance_loss_mlp": 1.00100183, + "epoch": 0.7572143131973836, + "flos": 526489864704.0, + "grad_norm": 0.04266541401518767, + "language_loss": 0.78904128, + "learning_rate": 0.00014677720429790526, + "loss": 0.79945183, + "num_input_tokens_seen": 326472832, + "router_z_loss_mlp": 0.40039062, + "step": 3936, + "time_per_iteration": 2.5691311359405518 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104174, + "balance_loss_mlp": 1.00159049, + "epoch": 0.7574066948826472, + "flos": 551823963648.0, + "grad_norm": 0.029232134856981343, + "language_loss": 0.85000217, + "learning_rate": 0.0001465567734026429, + "loss": 0.86041951, + "num_input_tokens_seen": 326546976, + "router_z_loss_mlp": 0.40136719, + "step": 3937, + "time_per_iteration": 2.6958813667297363 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045039, + "balance_loss_mlp": 1.00488961, + "epoch": 0.7575990765679107, + "flos": 396769981440.0, + "grad_norm": 0.04157992306337891, + "language_loss": 0.82874024, + "learning_rate": 0.00014633647972621034, + "loss": 0.83919066, + "num_input_tokens_seen": 326609296, + "router_z_loss_mlp": 0.40136719, + "step": 3938, + "time_per_iteration": 2.443800449371338 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045648, + "balance_loss_mlp": 1.00556958, + "epoch": 0.7577914582531743, + "flos": 586186652928.0, + "grad_norm": 0.031504909373110845, + "language_loss": 0.86987495, + "learning_rate": 0.00014611632335413354, + "loss": 0.8803314, + "num_input_tokens_seen": 326687168, + "router_z_loss_mlp": 0.40063477, + "step": 3939, + "time_per_iteration": 2.7657620906829834 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043556, + "balance_loss_mlp": 1.00357366, + "epoch": 0.7579838399384379, + "flos": 822485309184.0, + "grad_norm": 0.033895333971604005, + "language_loss": 0.83048445, + "learning_rate": 0.00014589630437188456, + "loss": 0.84091997, + "num_input_tokens_seen": 326777760, + "router_z_loss_mlp": 0.3996582, + "step": 3940, + "time_per_iteration": 3.1827540397644043 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045396, + "balance_loss_mlp": 1.00527036, + "epoch": 0.7581762216237015, + "flos": 444806786304.0, + "grad_norm": 0.03886523682666057, + "language_loss": 0.78962266, + "learning_rate": 0.00014567642286488253, + "loss": 0.8000766, + "num_input_tokens_seen": 326843952, + "router_z_loss_mlp": 0.40112305, + "step": 3941, + "time_per_iteration": 2.5701324939727783 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045219, + "balance_loss_mlp": 1.00506926, + "epoch": 0.7583686033089649, + "flos": 541939951872.0, + "grad_norm": 0.03861315862447661, + "language_loss": 0.79739159, + "learning_rate": 0.00014545667891849258, + "loss": 0.8078438, + "num_input_tokens_seen": 326911296, + "router_z_loss_mlp": 0.40136719, + "step": 3942, + "time_per_iteration": 2.6185083389282227 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046879, + "balance_loss_mlp": 1.00675285, + "epoch": 0.7585609849942285, + "flos": 523613508096.0, + "grad_norm": 0.03344324045472487, + "language_loss": 0.82940769, + "learning_rate": 0.00014523707261802733, + "loss": 0.83987653, + "num_input_tokens_seen": 326977776, + "router_z_loss_mlp": 0.40112305, + "step": 3943, + "time_per_iteration": 2.615499973297119 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045818, + "balance_loss_mlp": 1.00564396, + "epoch": 0.7587533666794921, + "flos": 542908134144.0, + "grad_norm": 0.03989389594451329, + "language_loss": 0.81696534, + "learning_rate": 0.00014501760404874527, + "loss": 0.82742351, + "num_input_tokens_seen": 327050240, + "router_z_loss_mlp": 0.40161133, + "step": 3944, + "time_per_iteration": 2.7015254497528076 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047263, + "balance_loss_mlp": 1.00713706, + "epoch": 0.7589457483647557, + "flos": 607521656064.0, + "grad_norm": 0.037013243760391015, + "language_loss": 0.86645532, + "learning_rate": 0.00014479827329585176, + "loss": 0.87692797, + "num_input_tokens_seen": 327119952, + "router_z_loss_mlp": 0.40112305, + "step": 3945, + "time_per_iteration": 2.707260847091675 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048064, + "balance_loss_mlp": 1.008057, + "epoch": 0.7591381300500193, + "flos": 556252715520.0, + "grad_norm": 0.030362278965781222, + "language_loss": 0.85217047, + "learning_rate": 0.00014457908044449846, + "loss": 0.86265111, + "num_input_tokens_seen": 327192640, + "router_z_loss_mlp": 0.39990234, + "step": 3946, + "time_per_iteration": 2.7425830364227295 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048648, + "balance_loss_mlp": 1.00868881, + "epoch": 0.7593305117352828, + "flos": 530814604032.0, + "grad_norm": 0.0320699776647955, + "language_loss": 0.83156931, + "learning_rate": 0.00014436002557978371, + "loss": 0.8420558, + "num_input_tokens_seen": 327271008, + "router_z_loss_mlp": 0.39941406, + "step": 3947, + "time_per_iteration": 2.852153778076172 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052135, + "balance_loss_mlp": 1.01313019, + "epoch": 0.7595228934205464, + "flos": 1505925658368.0, + "grad_norm": 0.007143494000939788, + "language_loss": 0.76643145, + "learning_rate": 0.00014414110878675201, + "loss": 0.77695286, + "num_input_tokens_seen": 327505392, + "router_z_loss_mlp": 0.38964844, + "step": 3948, + "time_per_iteration": 4.8901238441467285 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043417, + "balance_loss_mlp": 1.00338674, + "epoch": 0.7597152751058099, + "flos": 456468715008.0, + "grad_norm": 0.03356126441084979, + "language_loss": 0.80132592, + "learning_rate": 0.0001439223301503945, + "loss": 0.81176007, + "num_input_tokens_seen": 327569392, + "router_z_loss_mlp": 0.40014648, + "step": 3949, + "time_per_iteration": 2.5245442390441895 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042713, + "balance_loss_mlp": 1.0028255, + "epoch": 0.7599076567910735, + "flos": 686799830016.0, + "grad_norm": 0.04215278284699455, + "language_loss": 0.76435691, + "learning_rate": 0.00014370368975564834, + "loss": 0.77478409, + "num_input_tokens_seen": 327648304, + "router_z_loss_mlp": 0.39868164, + "step": 3950, + "time_per_iteration": 3.002926826477051 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042825, + "balance_loss_mlp": 1.0027709, + "epoch": 0.760100038476337, + "flos": 533495574528.0, + "grad_norm": 0.03911832457042585, + "language_loss": 0.84080267, + "learning_rate": 0.00014348518768739766, + "loss": 0.85123098, + "num_input_tokens_seen": 327725600, + "router_z_loss_mlp": 0.40039062, + "step": 3951, + "time_per_iteration": 2.7287793159484863 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046905, + "balance_loss_mlp": 1.00780487, + "epoch": 0.7602924201616006, + "flos": 1474919526144.0, + "grad_norm": 0.009800306556812065, + "language_loss": 0.7672804, + "learning_rate": 0.00014326682403047243, + "loss": 0.77774942, + "num_input_tokens_seen": 327954048, + "router_z_loss_mlp": 0.390625, + "step": 3952, + "time_per_iteration": 4.851192951202393 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01041682, + "balance_loss_mlp": 1.00179482, + "epoch": 0.7604848018468642, + "flos": 776041728768.0, + "grad_norm": 0.043428396505350506, + "language_loss": 0.86555135, + "learning_rate": 0.00014304859886964867, + "loss": 0.87596822, + "num_input_tokens_seen": 328034656, + "router_z_loss_mlp": 0.39868164, + "step": 3953, + "time_per_iteration": 3.0201337337493896 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044206, + "balance_loss_mlp": 1.00415182, + "epoch": 0.7606771835321278, + "flos": 559261274880.0, + "grad_norm": 0.03249370950181494, + "language_loss": 0.8406316, + "learning_rate": 0.00014283051228964878, + "loss": 0.85107362, + "num_input_tokens_seen": 328107264, + "router_z_loss_mlp": 0.40039062, + "step": 3954, + "time_per_iteration": 2.6745314598083496 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046758, + "balance_loss_mlp": 1.00687051, + "epoch": 0.7608695652173914, + "flos": 526433484288.0, + "grad_norm": 0.03436460979792566, + "language_loss": 0.83105361, + "learning_rate": 0.00014261256437514197, + "loss": 0.84152114, + "num_input_tokens_seen": 328177168, + "router_z_loss_mlp": 0.39868164, + "step": 3955, + "time_per_iteration": 2.6607260704040527 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046091, + "balance_loss_mlp": 1.0060848, + "epoch": 0.7610619469026548, + "flos": 616168222464.0, + "grad_norm": 0.03814764574124358, + "language_loss": 0.82773203, + "learning_rate": 0.0001423947552107428, + "loss": 0.83819294, + "num_input_tokens_seen": 328245360, + "router_z_loss_mlp": 0.39990234, + "step": 3956, + "time_per_iteration": 2.731502056121826 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01040395, + "balance_loss_mlp": 1.00053155, + "epoch": 0.7612543285879184, + "flos": 864818978304.0, + "grad_norm": 0.03440554152429829, + "language_loss": 0.77563798, + "learning_rate": 0.00014217708488101243, + "loss": 0.78604192, + "num_input_tokens_seen": 328326560, + "router_z_loss_mlp": 0.3984375, + "step": 3957, + "time_per_iteration": 3.0592825412750244 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01040698, + "balance_loss_mlp": 1.00076258, + "epoch": 0.761446710273182, + "flos": 554728510464.0, + "grad_norm": 0.045631291273616384, + "language_loss": 0.77730322, + "learning_rate": 0.0001419595534704579, + "loss": 0.78771019, + "num_input_tokens_seen": 328395760, + "router_z_loss_mlp": 0.39916992, + "step": 3958, + "time_per_iteration": 2.693791389465332 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01041415, + "balance_loss_mlp": 1.00143242, + "epoch": 0.7616390919584456, + "flos": 468326029824.0, + "grad_norm": 0.03770259597334161, + "language_loss": 0.81622386, + "learning_rate": 0.00014174216106353237, + "loss": 0.82663804, + "num_input_tokens_seen": 328464560, + "router_z_loss_mlp": 0.3996582, + "step": 3959, + "time_per_iteration": 2.6240234375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043866, + "balance_loss_mlp": 1.00385952, + "epoch": 0.7618314736437091, + "flos": 499432283904.0, + "grad_norm": 0.036732960604225574, + "language_loss": 0.76590341, + "learning_rate": 0.00014152490774463512, + "loss": 0.77634203, + "num_input_tokens_seen": 328532640, + "router_z_loss_mlp": 0.39990234, + "step": 3960, + "time_per_iteration": 2.6385769844055176 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042369, + "balance_loss_mlp": 1.00236273, + "epoch": 0.7620238553289727, + "flos": 435452552448.0, + "grad_norm": 0.04258907673967457, + "language_loss": 0.87829125, + "learning_rate": 0.00014130779359811135, + "loss": 0.88871497, + "num_input_tokens_seen": 328595392, + "router_z_loss_mlp": 0.39990234, + "step": 3961, + "time_per_iteration": 2.530336380004883 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046467, + "balance_loss_mlp": 1.00657988, + "epoch": 0.7622162370142362, + "flos": 665542594560.0, + "grad_norm": 0.03171084912805384, + "language_loss": 0.86222768, + "learning_rate": 0.0001410908187082521, + "loss": 0.87269235, + "num_input_tokens_seen": 328676368, + "router_z_loss_mlp": 0.39868164, + "step": 3962, + "time_per_iteration": 2.8736419677734375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047801, + "balance_loss_mlp": 1.0077945, + "epoch": 0.7624086186994998, + "flos": 559028950272.0, + "grad_norm": 0.03864138857233312, + "language_loss": 0.84107929, + "learning_rate": 0.0001408739831592949, + "loss": 0.85155731, + "num_input_tokens_seen": 328745136, + "router_z_loss_mlp": 0.39990234, + "step": 3963, + "time_per_iteration": 2.639000415802002 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048183, + "balance_loss_mlp": 1.00829566, + "epoch": 0.7626010003847634, + "flos": 630287545344.0, + "grad_norm": 0.04234358402280358, + "language_loss": 0.77802932, + "learning_rate": 0.0001406572870354224, + "loss": 0.78851116, + "num_input_tokens_seen": 328820384, + "router_z_loss_mlp": 0.39868164, + "step": 3964, + "time_per_iteration": 2.855811834335327 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045894, + "balance_loss_mlp": 1.00591099, + "epoch": 0.7627933820700269, + "flos": 438849938688.0, + "grad_norm": 0.03234706292902695, + "language_loss": 0.87125206, + "learning_rate": 0.00014044073042076337, + "loss": 0.88171101, + "num_input_tokens_seen": 328884976, + "router_z_loss_mlp": 0.3996582, + "step": 3965, + "time_per_iteration": 2.5181050300598145 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045874, + "balance_loss_mlp": 1.00586748, + "epoch": 0.7629857637552905, + "flos": 533794973184.0, + "grad_norm": 0.028534394430764273, + "language_loss": 0.89329129, + "learning_rate": 0.00014022431339939302, + "loss": 0.90375006, + "num_input_tokens_seen": 328957792, + "router_z_loss_mlp": 0.39990234, + "step": 3966, + "time_per_iteration": 2.671855926513672 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046871, + "balance_loss_mlp": 1.00679314, + "epoch": 0.7631781454405541, + "flos": 681237645312.0, + "grad_norm": 0.04110089752084587, + "language_loss": 0.78748721, + "learning_rate": 0.00014000803605533163, + "loss": 0.79795587, + "num_input_tokens_seen": 329034960, + "router_z_loss_mlp": 0.40063477, + "step": 3967, + "time_per_iteration": 2.8315372467041016 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104216, + "balance_loss_mlp": 1.00203407, + "epoch": 0.7633705271258177, + "flos": 508489064448.0, + "grad_norm": 0.04146307364785201, + "language_loss": 0.8433795, + "learning_rate": 0.00013979189847254553, + "loss": 0.85380107, + "num_input_tokens_seen": 329100848, + "router_z_loss_mlp": 0.40112305, + "step": 3968, + "time_per_iteration": 2.601447582244873 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044622, + "balance_loss_mlp": 1.00454402, + "epoch": 0.7635629088110811, + "flos": 620039006208.0, + "grad_norm": 0.03458604771119312, + "language_loss": 0.81047332, + "learning_rate": 0.00013957590073494674, + "loss": 0.82091957, + "num_input_tokens_seen": 329181120, + "router_z_loss_mlp": 0.40063477, + "step": 3969, + "time_per_iteration": 2.8777170181274414 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044405, + "balance_loss_mlp": 1.00430274, + "epoch": 0.7637552904963447, + "flos": 639567902208.0, + "grad_norm": 0.03961564196889536, + "language_loss": 0.79463089, + "learning_rate": 0.0001393600429263931, + "loss": 0.80507493, + "num_input_tokens_seen": 329249888, + "router_z_loss_mlp": 0.40087891, + "step": 3970, + "time_per_iteration": 2.7422754764556885 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042656, + "balance_loss_mlp": 1.0035553, + "epoch": 0.7639476721816083, + "flos": 1566686860032.0, + "grad_norm": 0.00740169880788124, + "language_loss": 0.74744886, + "learning_rate": 0.00013914432513068792, + "loss": 0.75787538, + "num_input_tokens_seen": 329483824, + "router_z_loss_mlp": 0.390625, + "step": 3971, + "time_per_iteration": 4.935492038726807 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01040469, + "balance_loss_mlp": 1.00043809, + "epoch": 0.7641400538668719, + "flos": 497020576512.0, + "grad_norm": 0.032719762183458435, + "language_loss": 0.81907034, + "learning_rate": 0.0001389287474315804, + "loss": 0.82947505, + "num_input_tokens_seen": 329553536, + "router_z_loss_mlp": 0.40014648, + "step": 3972, + "time_per_iteration": 2.630120038986206 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046444, + "balance_loss_mlp": 1.00638986, + "epoch": 0.7643324355521355, + "flos": 579515334912.0, + "grad_norm": 0.03140885431358122, + "language_loss": 0.80818957, + "learning_rate": 0.00013871330991276505, + "loss": 0.81865394, + "num_input_tokens_seen": 329621856, + "router_z_loss_mlp": 0.40039062, + "step": 3973, + "time_per_iteration": 2.685450553894043 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042624, + "balance_loss_mlp": 1.00257015, + "epoch": 0.764524817237399, + "flos": 786233887488.0, + "grad_norm": 0.035934794543156075, + "language_loss": 0.81384689, + "learning_rate": 0.00013849801265788247, + "loss": 0.82427323, + "num_input_tokens_seen": 329708192, + "router_z_loss_mlp": 0.40039062, + "step": 3974, + "time_per_iteration": 3.039971113204956 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042366, + "balance_loss_mlp": 1.00235903, + "epoch": 0.7647171989226625, + "flos": 527299599360.0, + "grad_norm": 0.03568861441891304, + "language_loss": 0.83377182, + "learning_rate": 0.00013828285575051818, + "loss": 0.84419549, + "num_input_tokens_seen": 329774704, + "router_z_loss_mlp": 0.39990234, + "step": 3975, + "time_per_iteration": 2.6113204956054688 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042461, + "balance_loss_mlp": 1.00243056, + "epoch": 0.7649095806079261, + "flos": 556029139200.0, + "grad_norm": 0.03438397238975277, + "language_loss": 0.84555364, + "learning_rate": 0.0001380678392742035, + "loss": 0.85597825, + "num_input_tokens_seen": 329846432, + "router_z_loss_mlp": 0.40014648, + "step": 3976, + "time_per_iteration": 2.702728509902954 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042138, + "balance_loss_mlp": 1.0021317, + "epoch": 0.7651019622931897, + "flos": 650389960704.0, + "grad_norm": 0.02964586673443437, + "language_loss": 0.84697402, + "learning_rate": 0.00013785296331241526, + "loss": 0.85739541, + "num_input_tokens_seen": 329926336, + "router_z_loss_mlp": 0.39990234, + "step": 3977, + "time_per_iteration": 2.8500404357910156 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042805, + "balance_loss_mlp": 1.00282192, + "epoch": 0.7652943439784533, + "flos": 1048113551616.0, + "grad_norm": 0.03693742198159439, + "language_loss": 0.8784855, + "learning_rate": 0.00013763822794857583, + "loss": 0.88891351, + "num_input_tokens_seen": 330009536, + "router_z_loss_mlp": 0.3996582, + "step": 3978, + "time_per_iteration": 3.2964861392974854 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042764, + "balance_loss_mlp": 1.00266194, + "epoch": 0.7654867256637168, + "flos": 505415376384.0, + "grad_norm": 0.03301663266188199, + "language_loss": 0.9032107, + "learning_rate": 0.00013742363326605278, + "loss": 0.91363835, + "num_input_tokens_seen": 330083264, + "router_z_loss_mlp": 0.40087891, + "step": 3979, + "time_per_iteration": 2.7543904781341553 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042974, + "balance_loss_mlp": 1.00289607, + "epoch": 0.7656791073489804, + "flos": 575864236800.0, + "grad_norm": 0.031055895405363115, + "language_loss": 0.78887016, + "learning_rate": 0.00013720917934815935, + "loss": 0.79929984, + "num_input_tokens_seen": 330157120, + "router_z_loss_mlp": 0.40063477, + "step": 3980, + "time_per_iteration": 2.757488489151001 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043263, + "balance_loss_mlp": 1.0031848, + "epoch": 0.765871489034244, + "flos": 493792331520.0, + "grad_norm": 0.04115022529331337, + "language_loss": 0.83214378, + "learning_rate": 0.00013699486627815344, + "loss": 0.84257638, + "num_input_tokens_seen": 330224560, + "router_z_loss_mlp": 0.40063477, + "step": 3981, + "time_per_iteration": 2.6013007164001465 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043462, + "balance_loss_mlp": 1.00347948, + "epoch": 0.7660638707195075, + "flos": 487051994112.0, + "grad_norm": 0.036811021847235705, + "language_loss": 0.83011079, + "learning_rate": 0.00013678069413923928, + "loss": 0.84054542, + "num_input_tokens_seen": 330292000, + "router_z_loss_mlp": 0.3996582, + "step": 3982, + "time_per_iteration": 2.647836208343506 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042258, + "balance_loss_mlp": 1.00225163, + "epoch": 0.766256252404771, + "flos": 445243245312.0, + "grad_norm": 0.03517202501681349, + "language_loss": 0.8304435, + "learning_rate": 0.00013656666301456555, + "loss": 0.84086609, + "num_input_tokens_seen": 330357472, + "router_z_loss_mlp": 0.39990234, + "step": 3983, + "time_per_iteration": 2.5181782245635986 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045473, + "balance_loss_mlp": 1.00541902, + "epoch": 0.7664486340900346, + "flos": 486214069248.0, + "grad_norm": 0.03304538519441237, + "language_loss": 0.84839791, + "learning_rate": 0.0001363527729872267, + "loss": 0.85885262, + "num_input_tokens_seen": 330427792, + "router_z_loss_mlp": 0.40039062, + "step": 3984, + "time_per_iteration": 2.7154600620269775 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104531, + "balance_loss_mlp": 1.00527942, + "epoch": 0.7666410157752982, + "flos": 647385292032.0, + "grad_norm": 0.036051539426371945, + "language_loss": 0.77239299, + "learning_rate": 0.00013613902414026207, + "loss": 0.78284609, + "num_input_tokens_seen": 330500320, + "router_z_loss_mlp": 0.40014648, + "step": 3985, + "time_per_iteration": 2.793349027633667 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042297, + "balance_loss_mlp": 1.00238621, + "epoch": 0.7668333974605618, + "flos": 775661650176.0, + "grad_norm": 0.03427802042896287, + "language_loss": 0.82765865, + "learning_rate": 0.00013592541655665642, + "loss": 0.83808166, + "num_input_tokens_seen": 330581696, + "router_z_loss_mlp": 0.39892578, + "step": 3986, + "time_per_iteration": 2.9631149768829346 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042605, + "balance_loss_mlp": 1.00257468, + "epoch": 0.7670257791458254, + "flos": 614513760000.0, + "grad_norm": 0.03630429655058752, + "language_loss": 0.85794669, + "learning_rate": 0.00013571195031933947, + "loss": 0.86837274, + "num_input_tokens_seen": 330648000, + "router_z_loss_mlp": 0.40014648, + "step": 3987, + "time_per_iteration": 2.684053659439087 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01040779, + "balance_loss_mlp": 1.0018692, + "epoch": 0.7672181608310888, + "flos": 1488365207808.0, + "grad_norm": 0.004720848952888087, + "language_loss": 0.80481339, + "learning_rate": 0.00013549862551118626, + "loss": 0.81522119, + "num_input_tokens_seen": 330873872, + "router_z_loss_mlp": 0.38867188, + "step": 3988, + "time_per_iteration": 4.726950168609619 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042981, + "balance_loss_mlp": 1.0030216, + "epoch": 0.7674105425163524, + "flos": 611867782656.0, + "grad_norm": 0.03766281507369906, + "language_loss": 0.85887635, + "learning_rate": 0.00013528544221501655, + "loss": 0.86930621, + "num_input_tokens_seen": 330945760, + "router_z_loss_mlp": 0.39941406, + "step": 3989, + "time_per_iteration": 2.710402011871338 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043109, + "balance_loss_mlp": 1.00315046, + "epoch": 0.767602924201616, + "flos": 846605295360.0, + "grad_norm": 0.0329376529812033, + "language_loss": 0.82137692, + "learning_rate": 0.00013507240051359586, + "loss": 0.83180797, + "num_input_tokens_seen": 331025584, + "router_z_loss_mlp": 0.39941406, + "step": 3990, + "time_per_iteration": 3.0520286560058594 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043386, + "balance_loss_mlp": 1.00342703, + "epoch": 0.7677953058868796, + "flos": 528146272512.0, + "grad_norm": 0.038347091036525886, + "language_loss": 0.8687346, + "learning_rate": 0.00013485950048963425, + "loss": 0.87916845, + "num_input_tokens_seen": 331093008, + "router_z_loss_mlp": 0.39941406, + "step": 3991, + "time_per_iteration": 2.597275495529175 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105357, + "balance_loss_mlp": 1.01365852, + "epoch": 0.7679876875721431, + "flos": 925112618496.0, + "grad_norm": 0.036512387474733066, + "language_loss": 0.83205199, + "learning_rate": 0.00013464674222578643, + "loss": 0.84258771, + "num_input_tokens_seen": 331177120, + "router_z_loss_mlp": 0.39892578, + "step": 3992, + "time_per_iteration": 3.1764492988586426 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053875, + "balance_loss_mlp": 1.01389194, + "epoch": 0.7681800692574067, + "flos": 459019428096.0, + "grad_norm": 0.03635515300980307, + "language_loss": 0.83761203, + "learning_rate": 0.00013443412580465292, + "loss": 0.84815073, + "num_input_tokens_seen": 331245424, + "router_z_loss_mlp": 0.3996582, + "step": 3993, + "time_per_iteration": 2.583146810531616 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053421, + "balance_loss_mlp": 1.01348555, + "epoch": 0.7683724509426703, + "flos": 659733500928.0, + "grad_norm": 0.040381204925205964, + "language_loss": 0.84726322, + "learning_rate": 0.00013422165130877857, + "loss": 0.85779738, + "num_input_tokens_seen": 331327504, + "router_z_loss_mlp": 0.39916992, + "step": 3994, + "time_per_iteration": 2.8877995014190674 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044809, + "balance_loss_mlp": 1.00473106, + "epoch": 0.7685648326279338, + "flos": 556339231488.0, + "grad_norm": 0.052990639724004036, + "language_loss": 0.80869007, + "learning_rate": 0.00013400931882065327, + "loss": 0.81913817, + "num_input_tokens_seen": 331398464, + "router_z_loss_mlp": 0.40063477, + "step": 3995, + "time_per_iteration": 2.693859815597534 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043655, + "balance_loss_mlp": 1.00367188, + "epoch": 0.7687572143131974, + "flos": 688744942848.0, + "grad_norm": 0.032666888186809864, + "language_loss": 0.81219018, + "learning_rate": 0.0001337971284227118, + "loss": 0.82262671, + "num_input_tokens_seen": 331484592, + "router_z_loss_mlp": 0.3996582, + "step": 3996, + "time_per_iteration": 3.0207791328430176 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01041985, + "balance_loss_mlp": 1.00307465, + "epoch": 0.7689495959984609, + "flos": 1492668559872.0, + "grad_norm": 0.00690868544016345, + "language_loss": 0.76118422, + "learning_rate": 0.00013358508019733388, + "loss": 0.77160406, + "num_input_tokens_seen": 331721360, + "router_z_loss_mlp": 0.38867188, + "step": 3997, + "time_per_iteration": 4.991567134857178 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044617, + "balance_loss_mlp": 1.00463462, + "epoch": 0.7691419776837245, + "flos": 571500613632.0, + "grad_norm": 0.032008326579370035, + "language_loss": 0.80634248, + "learning_rate": 0.0001333731742268438, + "loss": 0.81678867, + "num_input_tokens_seen": 331794240, + "router_z_loss_mlp": 0.3996582, + "step": 3998, + "time_per_iteration": 2.698580026626587 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01040606, + "balance_loss_mlp": 1.00064719, + "epoch": 0.7693343593689881, + "flos": 521191107072.0, + "grad_norm": 0.03337650423069263, + "language_loss": 0.85920131, + "learning_rate": 0.0001331614105935109, + "loss": 0.86960733, + "num_input_tokens_seen": 331866496, + "router_z_loss_mlp": 0.39941406, + "step": 3999, + "time_per_iteration": 2.693692684173584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044961, + "balance_loss_mlp": 1.00495398, + "epoch": 0.7695267410542517, + "flos": 661552247040.0, + "grad_norm": 0.031590911772699855, + "language_loss": 0.84561241, + "learning_rate": 0.00013294978937954883, + "loss": 0.85606205, + "num_input_tokens_seen": 331936592, + "router_z_loss_mlp": 0.39990234, + "step": 4000, + "time_per_iteration": 2.7991349697113037 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045492, + "balance_loss_mlp": 1.00548565, + "epoch": 0.7697191227395151, + "flos": 547859860992.0, + "grad_norm": 0.04547292617376322, + "language_loss": 0.8583228, + "learning_rate": 0.00013273831066711655, + "loss": 0.86877775, + "num_input_tokens_seen": 332003536, + "router_z_loss_mlp": 0.39990234, + "step": 4001, + "time_per_iteration": 2.640451192855835 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010435, + "balance_loss_mlp": 1.00354123, + "epoch": 0.7699115044247787, + "flos": 541696933632.0, + "grad_norm": 0.030960933943813315, + "language_loss": 0.80473912, + "learning_rate": 0.00013252697453831747, + "loss": 0.8151741, + "num_input_tokens_seen": 332075248, + "router_z_loss_mlp": 0.39941406, + "step": 4002, + "time_per_iteration": 2.709754467010498 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044178, + "balance_loss_mlp": 1.00417161, + "epoch": 0.7701038861100423, + "flos": 564143982336.0, + "grad_norm": 0.03227531523104023, + "language_loss": 0.82851601, + "learning_rate": 0.00013231578107519916, + "loss": 0.83895779, + "num_input_tokens_seen": 332158944, + "router_z_loss_mlp": 0.39990234, + "step": 4003, + "time_per_iteration": 2.914151191711426 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043647, + "balance_loss_mlp": 1.0037117, + "epoch": 0.7702962677953059, + "flos": 482734057728.0, + "grad_norm": 0.0383418204368582, + "language_loss": 0.83275282, + "learning_rate": 0.00013210473035975422, + "loss": 0.84318936, + "num_input_tokens_seen": 332226368, + "router_z_loss_mlp": 0.39916992, + "step": 4004, + "time_per_iteration": 2.605908155441284 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043327, + "balance_loss_mlp": 1.0033915, + "epoch": 0.7704886494805695, + "flos": 771806417664.0, + "grad_norm": 0.03621639997578191, + "language_loss": 0.85901195, + "learning_rate": 0.0001318938224739201, + "loss": 0.8694452, + "num_input_tokens_seen": 332314784, + "router_z_loss_mlp": 0.39916992, + "step": 4005, + "time_per_iteration": 3.059812545776367 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044187, + "balance_loss_mlp": 1.00441921, + "epoch": 0.770681031165833, + "flos": 602318162688.0, + "grad_norm": 0.030887976595528478, + "language_loss": 0.84163052, + "learning_rate": 0.00013168305749957843, + "loss": 0.85207236, + "num_input_tokens_seen": 332387952, + "router_z_loss_mlp": 0.39746094, + "step": 4006, + "time_per_iteration": 2.730853796005249 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01040448, + "balance_loss_mlp": 1.00063193, + "epoch": 0.7708734128510966, + "flos": 497096398848.0, + "grad_norm": 0.03317085046195358, + "language_loss": 0.83013129, + "learning_rate": 0.00013147243551855532, + "loss": 0.84053576, + "num_input_tokens_seen": 332456352, + "router_z_loss_mlp": 0.39794922, + "step": 4007, + "time_per_iteration": 2.6102261543273926 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01040123, + "balance_loss_mlp": 1.00025964, + "epoch": 0.7710657945363601, + "flos": 568455115776.0, + "grad_norm": 0.02959339881613439, + "language_loss": 0.81033671, + "learning_rate": 0.00013126195661262148, + "loss": 0.82073796, + "num_input_tokens_seen": 332534288, + "router_z_loss_mlp": 0.3984375, + "step": 4008, + "time_per_iteration": 2.8038330078125 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042071, + "balance_loss_mlp": 1.00230277, + "epoch": 0.7712581762216237, + "flos": 605750542080.0, + "grad_norm": 0.030762375032726955, + "language_loss": 0.8689748, + "learning_rate": 0.00013105162086349216, + "loss": 0.87939554, + "num_input_tokens_seen": 332615440, + "router_z_loss_mlp": 0.39746094, + "step": 4009, + "time_per_iteration": 2.8229057788848877 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045827, + "balance_loss_mlp": 1.00593925, + "epoch": 0.7714505579068872, + "flos": 531997614336.0, + "grad_norm": 0.03203683238249966, + "language_loss": 0.86152643, + "learning_rate": 0.00013084142835282687, + "loss": 0.87198472, + "num_input_tokens_seen": 332687360, + "router_z_loss_mlp": 0.39868164, + "step": 4010, + "time_per_iteration": 2.6913058757781982 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104528, + "balance_loss_mlp": 1.00637054, + "epoch": 0.7716429395921508, + "flos": 1425382815744.0, + "grad_norm": 0.007782218935032237, + "language_loss": 0.79884362, + "learning_rate": 0.00013063137916222956, + "loss": 0.80929649, + "num_input_tokens_seen": 332919936, + "router_z_loss_mlp": 0.38867188, + "step": 4011, + "time_per_iteration": 4.785134315490723 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043373, + "balance_loss_mlp": 1.00353301, + "epoch": 0.7718353212774144, + "flos": 579587266560.0, + "grad_norm": 0.03553512598849003, + "language_loss": 0.89913195, + "learning_rate": 0.0001304214733732485, + "loss": 0.90956569, + "num_input_tokens_seen": 332990096, + "router_z_loss_mlp": 0.39819336, + "step": 4012, + "time_per_iteration": 4.228041648864746 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104725, + "balance_loss_mlp": 1.00733876, + "epoch": 0.772027702962678, + "flos": 511773689856.0, + "grad_norm": 0.036769707264373286, + "language_loss": 0.83085632, + "learning_rate": 0.00013021171106737672, + "loss": 0.8413288, + "num_input_tokens_seen": 333063616, + "router_z_loss_mlp": 0.39892578, + "step": 4013, + "time_per_iteration": 2.6609246730804443 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043913, + "balance_loss_mlp": 1.00402582, + "epoch": 0.7722200846479416, + "flos": 526748434176.0, + "grad_norm": 0.0322565513109964, + "language_loss": 0.80160201, + "learning_rate": 0.00013000209232605071, + "loss": 0.81204116, + "num_input_tokens_seen": 333136368, + "router_z_loss_mlp": 0.39868164, + "step": 4014, + "time_per_iteration": 2.6655430793762207 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042333, + "balance_loss_mlp": 1.00244582, + "epoch": 0.772412466333205, + "flos": 480602307072.0, + "grad_norm": 0.033386370052076744, + "language_loss": 0.80578887, + "learning_rate": 0.0001297926172306519, + "loss": 0.81621224, + "num_input_tokens_seen": 333207136, + "router_z_loss_mlp": 0.39868164, + "step": 4015, + "time_per_iteration": 2.6234195232391357 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042218, + "balance_loss_mlp": 1.00240195, + "epoch": 0.7726048480184686, + "flos": 907314007296.0, + "grad_norm": 0.032763935043036714, + "language_loss": 0.79579479, + "learning_rate": 0.0001295832858625055, + "loss": 0.8062169, + "num_input_tokens_seen": 333291920, + "router_z_loss_mlp": 0.39794922, + "step": 4016, + "time_per_iteration": 3.251309394836426 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042332, + "balance_loss_mlp": 1.0024209, + "epoch": 0.7727972297037322, + "flos": 632567049984.0, + "grad_norm": 0.031482852227098596, + "language_loss": 0.70049077, + "learning_rate": 0.00012937409830288154, + "loss": 0.71091413, + "num_input_tokens_seen": 333369824, + "router_z_loss_mlp": 0.39892578, + "step": 4017, + "time_per_iteration": 2.821953296661377 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043688, + "balance_loss_mlp": 1.00377691, + "epoch": 0.7729896113889958, + "flos": 415673835264.0, + "grad_norm": 0.04152117908534408, + "language_loss": 0.85515797, + "learning_rate": 0.00012916505463299362, + "loss": 0.86559486, + "num_input_tokens_seen": 333434192, + "router_z_loss_mlp": 0.39892578, + "step": 4018, + "time_per_iteration": 2.5182814598083496 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043716, + "balance_loss_mlp": 1.00375712, + "epoch": 0.7731819930742593, + "flos": 670105494528.0, + "grad_norm": 0.03808310048663825, + "language_loss": 0.78672588, + "learning_rate": 0.00012895615493399972, + "loss": 0.79716301, + "num_input_tokens_seen": 333509696, + "router_z_loss_mlp": 0.39941406, + "step": 4019, + "time_per_iteration": 2.8195626735687256 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042435, + "balance_loss_mlp": 1.00249946, + "epoch": 0.7733743747595229, + "flos": 490859594496.0, + "grad_norm": 0.04130359033653859, + "language_loss": 0.83203042, + "learning_rate": 0.00012874739928700192, + "loss": 0.84245479, + "num_input_tokens_seen": 333575184, + "router_z_loss_mlp": 0.39916992, + "step": 4020, + "time_per_iteration": 2.561997652053833 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042542, + "balance_loss_mlp": 1.00260687, + "epoch": 0.7735667564447865, + "flos": 660888321024.0, + "grad_norm": 0.03933419760406932, + "language_loss": 0.8045736, + "learning_rate": 0.00012853878777304624, + "loss": 0.81499898, + "num_input_tokens_seen": 333651568, + "router_z_loss_mlp": 0.39916992, + "step": 4021, + "time_per_iteration": 2.866426706314087 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104584, + "balance_loss_mlp": 1.00597656, + "epoch": 0.77375913813005, + "flos": 534491947008.0, + "grad_norm": 0.03165766491354683, + "language_loss": 0.84674478, + "learning_rate": 0.000128330320473123, + "loss": 0.85720319, + "num_input_tokens_seen": 333726400, + "router_z_loss_mlp": 0.3984375, + "step": 4022, + "time_per_iteration": 2.689208984375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053924, + "balance_loss_mlp": 1.01501465, + "epoch": 0.7739515198153136, + "flos": 1523382096384.0, + "grad_norm": 0.014118198615631215, + "language_loss": 0.783319, + "learning_rate": 0.00012812199746816628, + "loss": 0.79385823, + "num_input_tokens_seen": 333960224, + "router_z_loss_mlp": 0.38867188, + "step": 4023, + "time_per_iteration": 4.873432874679565 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046542, + "balance_loss_mlp": 1.00660741, + "epoch": 0.7741439015005771, + "flos": 641252500224.0, + "grad_norm": 0.03719222986938954, + "language_loss": 0.8204658, + "learning_rate": 0.0001279138188390543, + "loss": 0.83093119, + "num_input_tokens_seen": 334033904, + "router_z_loss_mlp": 0.39916992, + "step": 4024, + "time_per_iteration": 2.7891459465026855 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042489, + "balance_loss_mlp": 1.00267303, + "epoch": 0.7743362831858407, + "flos": 667025003520.0, + "grad_norm": 0.033177934187398395, + "language_loss": 0.86806941, + "learning_rate": 0.00012770578466660915, + "loss": 0.87849432, + "num_input_tokens_seen": 334107904, + "router_z_loss_mlp": 0.39794922, + "step": 4025, + "time_per_iteration": 2.8528504371643066 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01041267, + "balance_loss_mlp": 1.00135612, + "epoch": 0.7745286648711043, + "flos": 563994283008.0, + "grad_norm": 0.03246135787328845, + "language_loss": 0.82025433, + "learning_rate": 0.0001274978950315968, + "loss": 0.83066702, + "num_input_tokens_seen": 334184048, + "router_z_loss_mlp": 0.39892578, + "step": 4026, + "time_per_iteration": 2.8042469024658203 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104306, + "balance_loss_mlp": 1.00322056, + "epoch": 0.7747210465563679, + "flos": 517962862080.0, + "grad_norm": 0.03718030635862113, + "language_loss": 0.83308971, + "learning_rate": 0.00012729015001472716, + "loss": 0.84352028, + "num_input_tokens_seen": 334257152, + "router_z_loss_mlp": 0.39819336, + "step": 4027, + "time_per_iteration": 2.6950860023498535 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042837, + "balance_loss_mlp": 1.00294983, + "epoch": 0.7749134282416313, + "flos": 635369529600.0, + "grad_norm": 0.032368305886577194, + "language_loss": 0.81846035, + "learning_rate": 0.00012708254969665418, + "loss": 0.82888865, + "num_input_tokens_seen": 334331312, + "router_z_loss_mlp": 0.39868164, + "step": 4028, + "time_per_iteration": 2.7516164779663086 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043156, + "balance_loss_mlp": 1.00326848, + "epoch": 0.7751058099268949, + "flos": 496351792896.0, + "grad_norm": 0.03964938582220019, + "language_loss": 0.83793879, + "learning_rate": 0.00012687509415797526, + "loss": 0.84837031, + "num_input_tokens_seen": 334397344, + "router_z_loss_mlp": 0.39868164, + "step": 4029, + "time_per_iteration": 2.5878894329071045 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104315, + "balance_loss_mlp": 1.003286, + "epoch": 0.7752981916121585, + "flos": 511363475712.0, + "grad_norm": 0.03526859549006244, + "language_loss": 0.8169387, + "learning_rate": 0.00012666778347923208, + "loss": 0.82737017, + "num_input_tokens_seen": 334467872, + "router_z_loss_mlp": 0.3984375, + "step": 4030, + "time_per_iteration": 2.717336893081665 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104185, + "balance_loss_mlp": 1.00208211, + "epoch": 0.7754905732974221, + "flos": 498566168832.0, + "grad_norm": 0.03835604282300423, + "language_loss": 0.84299457, + "learning_rate": 0.0001264606177409092, + "loss": 0.85341311, + "num_input_tokens_seen": 334539088, + "router_z_loss_mlp": 0.39746094, + "step": 4031, + "time_per_iteration": 2.6836674213409424 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01041813, + "balance_loss_mlp": 1.00206888, + "epoch": 0.7756829549826857, + "flos": 481783372032.0, + "grad_norm": 0.032423363351723834, + "language_loss": 0.86526835, + "learning_rate": 0.00012625359702343609, + "loss": 0.87568641, + "num_input_tokens_seen": 334612576, + "router_z_loss_mlp": 0.3972168, + "step": 4032, + "time_per_iteration": 2.74953031539917 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042791, + "balance_loss_mlp": 1.00316608, + "epoch": 0.7758753366679492, + "flos": 553686451200.0, + "grad_norm": 0.036449679892663574, + "language_loss": 0.85421842, + "learning_rate": 0.00012604672140718504, + "loss": 0.86464632, + "num_input_tokens_seen": 334677824, + "router_z_loss_mlp": 0.39599609, + "step": 4033, + "time_per_iteration": 2.6570351123809814 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042981, + "balance_loss_mlp": 1.0033319, + "epoch": 0.7760677183532128, + "flos": 705065035776.0, + "grad_norm": 0.035522641284568106, + "language_loss": 0.78343493, + "learning_rate": 0.00012583999097247233, + "loss": 0.79386473, + "num_input_tokens_seen": 334751456, + "router_z_loss_mlp": 0.39624023, + "step": 4034, + "time_per_iteration": 2.828260660171509 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042433, + "balance_loss_mlp": 1.00273633, + "epoch": 0.7762601000384763, + "flos": 524479623168.0, + "grad_norm": 0.037193057814734455, + "language_loss": 0.80287337, + "learning_rate": 0.0001256334057995578, + "loss": 0.81329775, + "num_input_tokens_seen": 334823008, + "router_z_loss_mlp": 0.39672852, + "step": 4035, + "time_per_iteration": 2.694047689437866 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042576, + "balance_loss_mlp": 1.00292659, + "epoch": 0.7764524817237399, + "flos": 558618736128.0, + "grad_norm": 0.03306447256493109, + "language_loss": 0.85536653, + "learning_rate": 0.000125426965968645, + "loss": 0.86579227, + "num_input_tokens_seen": 334896048, + "router_z_loss_mlp": 0.39624023, + "step": 4036, + "time_per_iteration": 2.668351173400879 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042205, + "balance_loss_mlp": 1.00257981, + "epoch": 0.7766448634090035, + "flos": 580817908992.0, + "grad_norm": 0.03814563398191554, + "language_loss": 0.83102942, + "learning_rate": 0.00012522067155988092, + "loss": 0.84145141, + "num_input_tokens_seen": 334964416, + "router_z_loss_mlp": 0.39599609, + "step": 4037, + "time_per_iteration": 2.738696336746216 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01041497, + "balance_loss_mlp": 1.00182426, + "epoch": 0.776837245094267, + "flos": 636819857664.0, + "grad_norm": 0.03633176837238025, + "language_loss": 0.75620854, + "learning_rate": 0.00012501452265335617, + "loss": 0.76662356, + "num_input_tokens_seen": 335043360, + "router_z_loss_mlp": 0.39648438, + "step": 4038, + "time_per_iteration": 2.8050642013549805 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01041765, + "balance_loss_mlp": 1.00211596, + "epoch": 0.7770296267795306, + "flos": 615814388736.0, + "grad_norm": 0.05953534229047703, + "language_loss": 0.82882428, + "learning_rate": 0.0001248085193291047, + "loss": 0.83924192, + "num_input_tokens_seen": 335113216, + "router_z_loss_mlp": 0.39624023, + "step": 4039, + "time_per_iteration": 2.7656314373016357 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104134, + "balance_loss_mlp": 1.00166762, + "epoch": 0.7772220084647942, + "flos": 880297255680.0, + "grad_norm": 0.03559940349726857, + "language_loss": 0.82936066, + "learning_rate": 0.00012460266166710443, + "loss": 0.83977401, + "num_input_tokens_seen": 335195824, + "router_z_loss_mlp": 0.39648438, + "step": 4040, + "time_per_iteration": 3.203681468963623 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01041686, + "balance_loss_mlp": 1.00201309, + "epoch": 0.7774143901500578, + "flos": 841039219968.0, + "grad_norm": 0.03667780998396207, + "language_loss": 0.78218615, + "learning_rate": 0.00012439694974727633, + "loss": 0.79260302, + "num_input_tokens_seen": 335269712, + "router_z_loss_mlp": 0.39648438, + "step": 4041, + "time_per_iteration": 3.1048929691314697 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042796, + "balance_loss_mlp": 1.00305152, + "epoch": 0.7776067718353212, + "flos": 569229857280.0, + "grad_norm": 0.03323606563363869, + "language_loss": 0.80526865, + "learning_rate": 0.00012419138364948458, + "loss": 0.8156966, + "num_input_tokens_seen": 335343408, + "router_z_loss_mlp": 0.3972168, + "step": 4042, + "time_per_iteration": 2.759185791015625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104225, + "balance_loss_mlp": 1.00255311, + "epoch": 0.7777991535205848, + "flos": 747210121728.0, + "grad_norm": 0.04016086024334982, + "language_loss": 0.83042264, + "learning_rate": 0.00012398596345353702, + "loss": 0.84084511, + "num_input_tokens_seen": 335415360, + "router_z_loss_mlp": 0.39672852, + "step": 4043, + "time_per_iteration": 2.8885416984558105 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055285, + "balance_loss_mlp": 1.01556456, + "epoch": 0.7779915352058484, + "flos": 539183159040.0, + "grad_norm": 0.05452361141280675, + "language_loss": 0.8397001, + "learning_rate": 0.0001237806892391851, + "loss": 0.85025299, + "num_input_tokens_seen": 335491568, + "router_z_loss_mlp": 0.39697266, + "step": 4044, + "time_per_iteration": 2.6936380863189697 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051551, + "balance_loss_mlp": 1.01173472, + "epoch": 0.778183916891112, + "flos": 635955687936.0, + "grad_norm": 0.03830611533598255, + "language_loss": 0.81336337, + "learning_rate": 0.0001235755610861233, + "loss": 0.82387888, + "num_input_tokens_seen": 335567200, + "router_z_loss_mlp": 0.39794922, + "step": 4045, + "time_per_iteration": 2.8001327514648438 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051224, + "balance_loss_mlp": 1.01140773, + "epoch": 0.7783762985763756, + "flos": 589790118912.0, + "grad_norm": 0.03835840399941748, + "language_loss": 0.85962141, + "learning_rate": 0.0001233705790739893, + "loss": 0.87013364, + "num_input_tokens_seen": 335640512, + "router_z_loss_mlp": 0.39794922, + "step": 4046, + "time_per_iteration": 2.7678940296173096 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046919, + "balance_loss_mlp": 1.00712717, + "epoch": 0.7785686802616391, + "flos": 932241782784.0, + "grad_norm": 0.03816222734497192, + "language_loss": 0.75308621, + "learning_rate": 0.0001231657432823643, + "loss": 0.76355535, + "num_input_tokens_seen": 335726016, + "router_z_loss_mlp": 0.39770508, + "step": 4047, + "time_per_iteration": 3.2008919715881348 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104796, + "balance_loss_mlp": 1.00816751, + "epoch": 0.7787610619469026, + "flos": 498956941056.0, + "grad_norm": 0.03863312039595469, + "language_loss": 0.79339081, + "learning_rate": 0.0001229610537907725, + "loss": 0.80387038, + "num_input_tokens_seen": 335794864, + "router_z_loss_mlp": 0.39770508, + "step": 4048, + "time_per_iteration": 2.6606078147888184 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047514, + "balance_loss_mlp": 1.00776947, + "epoch": 0.7789534436321662, + "flos": 516651539712.0, + "grad_norm": 0.03926321418096956, + "language_loss": 0.91044021, + "learning_rate": 0.00012275651067868143, + "loss": 0.92091531, + "num_input_tokens_seen": 335860928, + "router_z_loss_mlp": 0.3972168, + "step": 4049, + "time_per_iteration": 2.5831098556518555 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048801, + "balance_loss_mlp": 1.00903308, + "epoch": 0.7791458253174298, + "flos": 990062477568.0, + "grad_norm": 0.03241253923352413, + "language_loss": 0.80757916, + "learning_rate": 0.00012255211402550182, + "loss": 0.81806719, + "num_input_tokens_seen": 335945728, + "router_z_loss_mlp": 0.39746094, + "step": 4050, + "time_per_iteration": 3.227099657058716 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045853, + "balance_loss_mlp": 1.00596547, + "epoch": 0.7793382070026933, + "flos": 630185478144.0, + "grad_norm": 0.040685190405043196, + "language_loss": 0.77082014, + "learning_rate": 0.00012234786391058727, + "loss": 0.78127873, + "num_input_tokens_seen": 336014848, + "router_z_loss_mlp": 0.39868164, + "step": 4051, + "time_per_iteration": 2.803809881210327 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044261, + "balance_loss_mlp": 1.00439727, + "epoch": 0.7795305886879569, + "flos": 532763607552.0, + "grad_norm": 0.14552341545352887, + "language_loss": 0.85931647, + "learning_rate": 0.0001221437604132352, + "loss": 0.86975908, + "num_input_tokens_seen": 336080096, + "router_z_loss_mlp": 0.3984375, + "step": 4052, + "time_per_iteration": 2.6521799564361572 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044071, + "balance_loss_mlp": 1.00425482, + "epoch": 0.7797229703732205, + "flos": 613142166528.0, + "grad_norm": 0.03707443730916314, + "language_loss": 0.81622672, + "learning_rate": 0.0001219398036126852, + "loss": 0.82666743, + "num_input_tokens_seen": 336154640, + "router_z_loss_mlp": 0.39794922, + "step": 4053, + "time_per_iteration": 2.7498905658721924 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043695, + "balance_loss_mlp": 1.00385571, + "epoch": 0.7799153520584841, + "flos": 873796045824.0, + "grad_norm": 0.03756906141902607, + "language_loss": 0.78616834, + "learning_rate": 0.00012173599358812027, + "loss": 0.79660529, + "num_input_tokens_seen": 336244160, + "router_z_loss_mlp": 0.39819336, + "step": 4054, + "time_per_iteration": 3.2531538009643555 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010442, + "balance_loss_mlp": 1.00424063, + "epoch": 0.7801077337437476, + "flos": 584745073152.0, + "grad_norm": 0.034551857273689666, + "language_loss": 0.83048439, + "learning_rate": 0.0001215323304186668, + "loss": 0.84092641, + "num_input_tokens_seen": 336317936, + "router_z_loss_mlp": 0.39941406, + "step": 4055, + "time_per_iteration": 2.802626371383667 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104311, + "balance_loss_mlp": 1.00329399, + "epoch": 0.7803001154290111, + "flos": 602281224192.0, + "grad_norm": 0.03735081367855325, + "language_loss": 0.87971795, + "learning_rate": 0.00012132881418339364, + "loss": 0.890149, + "num_input_tokens_seen": 336389504, + "router_z_loss_mlp": 0.39794922, + "step": 4056, + "time_per_iteration": 2.779559850692749 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043118, + "balance_loss_mlp": 1.00430298, + "epoch": 0.7804924971142747, + "flos": 1482928411392.0, + "grad_norm": 0.004870984594471592, + "language_loss": 0.77517563, + "learning_rate": 0.00012112544496131306, + "loss": 0.7856068, + "num_input_tokens_seen": 336615536, + "router_z_loss_mlp": 0.38769531, + "step": 4057, + "time_per_iteration": 4.857725620269775 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043294, + "balance_loss_mlp": 1.00350153, + "epoch": 0.7806848787995383, + "flos": 631516242432.0, + "grad_norm": 0.03794339503679321, + "language_loss": 0.77468872, + "learning_rate": 0.00012092222283137944, + "loss": 0.78512168, + "num_input_tokens_seen": 336686400, + "router_z_loss_mlp": 0.39770508, + "step": 4058, + "time_per_iteration": 2.742105722427368 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045322, + "balance_loss_mlp": 1.00650787, + "epoch": 0.7808772604848019, + "flos": 1420747984128.0, + "grad_norm": 0.008365987604131462, + "language_loss": 0.7890631, + "learning_rate": 0.00012071914787249111, + "loss": 0.79951632, + "num_input_tokens_seen": 336912704, + "router_z_loss_mlp": 0.38769531, + "step": 4059, + "time_per_iteration": 4.828707695007324 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043702, + "balance_loss_mlp": 1.00390983, + "epoch": 0.7810696421700654, + "flos": 733104404736.0, + "grad_norm": 0.03231348100854236, + "language_loss": 0.83930951, + "learning_rate": 0.00012051622016348856, + "loss": 0.84974658, + "num_input_tokens_seen": 336997040, + "router_z_loss_mlp": 0.39770508, + "step": 4060, + "time_per_iteration": 2.9990715980529785 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043011, + "balance_loss_mlp": 1.00324297, + "epoch": 0.781262023855329, + "flos": 425837803776.0, + "grad_norm": 0.036166261595935334, + "language_loss": 0.84719038, + "learning_rate": 0.00012031343978315539, + "loss": 0.85762048, + "num_input_tokens_seen": 337059760, + "router_z_loss_mlp": 0.39746094, + "step": 4061, + "time_per_iteration": 2.4627366065979004 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042268, + "balance_loss_mlp": 1.00247562, + "epoch": 0.7814544055405925, + "flos": 502074370560.0, + "grad_norm": 0.0342232602285917, + "language_loss": 0.83237028, + "learning_rate": 0.00012011080681021774, + "loss": 0.84279293, + "num_input_tokens_seen": 337128528, + "router_z_loss_mlp": 0.39770508, + "step": 4062, + "time_per_iteration": 2.689143657684326 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104349, + "balance_loss_mlp": 1.00372207, + "epoch": 0.7816467872258561, + "flos": 463393744896.0, + "grad_norm": 0.03454181235361348, + "language_loss": 0.86497313, + "learning_rate": 0.00011990832132334512, + "loss": 0.87540805, + "num_input_tokens_seen": 337194112, + "router_z_loss_mlp": 0.39746094, + "step": 4063, + "time_per_iteration": 2.554494619369507 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042941, + "balance_loss_mlp": 1.00317287, + "epoch": 0.7818391689111197, + "flos": 742108695552.0, + "grad_norm": 0.04030756572766353, + "language_loss": 0.82932305, + "learning_rate": 0.00011970598340114897, + "loss": 0.8397525, + "num_input_tokens_seen": 337270416, + "router_z_loss_mlp": 0.39746094, + "step": 4064, + "time_per_iteration": 2.970621109008789 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042161, + "balance_loss_mlp": 1.00241697, + "epoch": 0.7820315505963832, + "flos": 548806656000.0, + "grad_norm": 0.039516882872222964, + "language_loss": 0.84180045, + "learning_rate": 0.00011950379312218396, + "loss": 0.85222203, + "num_input_tokens_seen": 337343024, + "router_z_loss_mlp": 0.3972168, + "step": 4065, + "time_per_iteration": 2.7288360595703125 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104416, + "balance_loss_mlp": 1.00446284, + "epoch": 0.7822239322816468, + "flos": 730260129024.0, + "grad_norm": 0.03113922880228995, + "language_loss": 0.86965168, + "learning_rate": 0.00011930175056494719, + "loss": 0.88009328, + "num_input_tokens_seen": 337417232, + "router_z_loss_mlp": 0.39672852, + "step": 4066, + "time_per_iteration": 2.9733567237854004 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043531, + "balance_loss_mlp": 1.00383461, + "epoch": 0.7824163139669104, + "flos": 452986758144.0, + "grad_norm": 0.03027995654836667, + "language_loss": 0.76300895, + "learning_rate": 0.00011909985580787885, + "loss": 0.77344429, + "num_input_tokens_seen": 337488224, + "router_z_loss_mlp": 0.39672852, + "step": 4067, + "time_per_iteration": 2.63247013092041 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042284, + "balance_loss_mlp": 1.0023967, + "epoch": 0.782608695652174, + "flos": 541621111296.0, + "grad_norm": 0.030067199560216216, + "language_loss": 0.81511915, + "learning_rate": 0.00011889810892936137, + "loss": 0.82554203, + "num_input_tokens_seen": 337564928, + "router_z_loss_mlp": 0.39868164, + "step": 4068, + "time_per_iteration": 2.725503444671631 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042501, + "balance_loss_mlp": 1.00256538, + "epoch": 0.7828010773374374, + "flos": 501429886464.0, + "grad_norm": 0.036639479010935665, + "language_loss": 0.77685481, + "learning_rate": 0.00011869651000771959, + "loss": 0.78727984, + "num_input_tokens_seen": 337641632, + "router_z_loss_mlp": 0.39916992, + "step": 4069, + "time_per_iteration": 2.831753730773926 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042656, + "balance_loss_mlp": 1.00274503, + "epoch": 0.782993459022701, + "flos": 601918642176.0, + "grad_norm": 0.036456028329252196, + "language_loss": 0.83725941, + "learning_rate": 0.00011849505912122117, + "loss": 0.84768599, + "num_input_tokens_seen": 337711968, + "router_z_loss_mlp": 0.39892578, + "step": 4070, + "time_per_iteration": 2.7105395793914795 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042326, + "balance_loss_mlp": 1.00246227, + "epoch": 0.7831858407079646, + "flos": 811476612864.0, + "grad_norm": 0.03866218742365993, + "language_loss": 0.78222632, + "learning_rate": 0.00011829375634807654, + "loss": 0.79264963, + "num_input_tokens_seen": 337795792, + "router_z_loss_mlp": 0.3984375, + "step": 4071, + "time_per_iteration": 3.082258939743042 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043026, + "balance_loss_mlp": 1.00321043, + "epoch": 0.7833782223932282, + "flos": 808014097920.0, + "grad_norm": 0.03240130540030076, + "language_loss": 0.81343973, + "learning_rate": 0.00011809260176643821, + "loss": 0.82386994, + "num_input_tokens_seen": 337875584, + "router_z_loss_mlp": 0.39794922, + "step": 4072, + "time_per_iteration": 3.0537989139556885 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042829, + "balance_loss_mlp": 1.00296557, + "epoch": 0.7835706040784918, + "flos": 521900719872.0, + "grad_norm": 0.03900176982337939, + "language_loss": 0.84087825, + "learning_rate": 0.00011789159545440131, + "loss": 0.85130656, + "num_input_tokens_seen": 337942304, + "router_z_loss_mlp": 0.3984375, + "step": 4073, + "time_per_iteration": 2.628188133239746 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042917, + "balance_loss_mlp": 1.00314867, + "epoch": 0.7837629857637552, + "flos": 506744195328.0, + "grad_norm": 0.031003851704209363, + "language_loss": 0.82853079, + "learning_rate": 0.00011769073749000348, + "loss": 0.83895999, + "num_input_tokens_seen": 338020864, + "router_z_loss_mlp": 0.39746094, + "step": 4074, + "time_per_iteration": 2.7814579010009766 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043359, + "balance_loss_mlp": 1.00359035, + "epoch": 0.7839553674490188, + "flos": 517135630848.0, + "grad_norm": 0.03896088374638199, + "language_loss": 0.76594853, + "learning_rate": 0.0001174900279512246, + "loss": 0.77638209, + "num_input_tokens_seen": 338089584, + "router_z_loss_mlp": 0.39746094, + "step": 4075, + "time_per_iteration": 2.5712804794311523 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043376, + "balance_loss_mlp": 1.00363171, + "epoch": 0.7841477491342824, + "flos": 507651139584.0, + "grad_norm": 0.03246431097284687, + "language_loss": 0.82211149, + "learning_rate": 0.00011728946691598707, + "loss": 0.83254528, + "num_input_tokens_seen": 338159568, + "router_z_loss_mlp": 0.3972168, + "step": 4076, + "time_per_iteration": 2.604954242706299 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043762, + "balance_loss_mlp": 1.00401795, + "epoch": 0.784340130819546, + "flos": 720905895168.0, + "grad_norm": 0.038070904406741414, + "language_loss": 0.76823437, + "learning_rate": 0.00011708905446215561, + "loss": 0.77867198, + "num_input_tokens_seen": 338233952, + "router_z_loss_mlp": 0.3972168, + "step": 4077, + "time_per_iteration": 2.8703718185424805 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043664, + "balance_loss_mlp": 1.00389552, + "epoch": 0.7845325125048095, + "flos": 515514216192.0, + "grad_norm": 0.030616823376727165, + "language_loss": 0.80449855, + "learning_rate": 0.00011688879066753711, + "loss": 0.81493515, + "num_input_tokens_seen": 338309568, + "router_z_loss_mlp": 0.39746094, + "step": 4078, + "time_per_iteration": 2.693617582321167 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042875, + "balance_loss_mlp": 1.00313067, + "epoch": 0.7847248941900731, + "flos": 467051645952.0, + "grad_norm": 0.040474516708916684, + "language_loss": 0.87913537, + "learning_rate": 0.00011668867560988122, + "loss": 0.88956416, + "num_input_tokens_seen": 338375920, + "router_z_loss_mlp": 0.3972168, + "step": 4079, + "time_per_iteration": 2.5590639114379883 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104555, + "balance_loss_mlp": 1.00582922, + "epoch": 0.7849172758753367, + "flos": 504084612096.0, + "grad_norm": 0.03640725809465974, + "language_loss": 0.84891224, + "learning_rate": 0.00011648870936687916, + "loss": 0.85936773, + "num_input_tokens_seen": 338452208, + "router_z_loss_mlp": 0.39697266, + "step": 4080, + "time_per_iteration": 2.7692296504974365 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046484, + "balance_loss_mlp": 1.00678754, + "epoch": 0.7851096575606002, + "flos": 533032870656.0, + "grad_norm": 0.04308382250768319, + "language_loss": 0.79184526, + "learning_rate": 0.00011628889201616461, + "loss": 0.80231011, + "num_input_tokens_seen": 338522864, + "router_z_loss_mlp": 0.39672852, + "step": 4081, + "time_per_iteration": 2.6643264293670654 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043861, + "balance_loss_mlp": 1.00411689, + "epoch": 0.7853020392458638, + "flos": 571044712704.0, + "grad_norm": 0.03315243630239655, + "language_loss": 0.82372963, + "learning_rate": 0.00011608922363531393, + "loss": 0.8341682, + "num_input_tokens_seen": 338591024, + "router_z_loss_mlp": 0.3972168, + "step": 4082, + "time_per_iteration": 2.6805782318115234 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043731, + "balance_loss_mlp": 1.00403428, + "epoch": 0.7854944209311273, + "flos": 833992680960.0, + "grad_norm": 0.03684416800395475, + "language_loss": 0.83803403, + "learning_rate": 0.00011588970430184504, + "loss": 0.84847128, + "num_input_tokens_seen": 338669616, + "router_z_loss_mlp": 0.39672852, + "step": 4083, + "time_per_iteration": 3.0843493938446045 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043694, + "balance_loss_mlp": 1.00404453, + "epoch": 0.7856868026163909, + "flos": 561011001600.0, + "grad_norm": 0.030260484959858683, + "language_loss": 0.82344627, + "learning_rate": 0.00011569033409321822, + "loss": 0.83388317, + "num_input_tokens_seen": 338740416, + "router_z_loss_mlp": 0.39624023, + "step": 4084, + "time_per_iteration": 2.692643165588379 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044727, + "balance_loss_mlp": 1.0050776, + "epoch": 0.7858791843016545, + "flos": 546268581888.0, + "grad_norm": 0.039325334071154384, + "language_loss": 0.73417258, + "learning_rate": 0.00011549111308683591, + "loss": 0.74461985, + "num_input_tokens_seen": 338807664, + "router_z_loss_mlp": 0.39624023, + "step": 4085, + "time_per_iteration": 2.6917884349823 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044945, + "balance_loss_mlp": 1.00529623, + "epoch": 0.7860715659869181, + "flos": 381840923904.0, + "grad_norm": 0.042614016338838545, + "language_loss": 0.8128258, + "learning_rate": 0.00011529204136004251, + "loss": 0.82327527, + "num_input_tokens_seen": 338869472, + "router_z_loss_mlp": 0.39624023, + "step": 4086, + "time_per_iteration": 2.4572253227233887 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044585, + "balance_loss_mlp": 1.0049361, + "epoch": 0.7862639476721817, + "flos": 568513441536.0, + "grad_norm": 0.03346159984299651, + "language_loss": 0.84931922, + "learning_rate": 0.00011509311899012459, + "loss": 0.85976499, + "num_input_tokens_seen": 338941312, + "router_z_loss_mlp": 0.39624023, + "step": 4087, + "time_per_iteration": 2.763591766357422 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043283, + "balance_loss_mlp": 1.00353885, + "epoch": 0.7864563293574451, + "flos": 546323016960.0, + "grad_norm": 0.03949651761127577, + "language_loss": 0.78551108, + "learning_rate": 0.00011489434605431053, + "loss": 0.79594392, + "num_input_tokens_seen": 339010208, + "router_z_loss_mlp": 0.3972168, + "step": 4088, + "time_per_iteration": 2.6439645290374756 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042959, + "balance_loss_mlp": 1.00321484, + "epoch": 0.7866487110427087, + "flos": 564649460736.0, + "grad_norm": 0.036592949661453156, + "language_loss": 0.81577885, + "learning_rate": 0.0001146957226297708, + "loss": 0.82620847, + "num_input_tokens_seen": 339081232, + "router_z_loss_mlp": 0.3972168, + "step": 4089, + "time_per_iteration": 2.679487705230713 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042934, + "balance_loss_mlp": 1.00321376, + "epoch": 0.7868410927279723, + "flos": 729559264512.0, + "grad_norm": 0.030545920555930417, + "language_loss": 0.76902366, + "learning_rate": 0.00011449724879361827, + "loss": 0.77945304, + "num_input_tokens_seen": 339161040, + "router_z_loss_mlp": 0.39697266, + "step": 4090, + "time_per_iteration": 2.9623334407806396 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043691, + "balance_loss_mlp": 1.00404155, + "epoch": 0.7870334744132359, + "flos": 522447994368.0, + "grad_norm": 0.042680254244296036, + "language_loss": 0.74582481, + "learning_rate": 0.00011429892462290687, + "loss": 0.75626171, + "num_input_tokens_seen": 339233984, + "router_z_loss_mlp": 0.39624023, + "step": 4091, + "time_per_iteration": 2.718287229537964 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104412, + "balance_loss_mlp": 1.00435197, + "epoch": 0.7872258560984994, + "flos": 452363661312.0, + "grad_norm": 0.033106880677710115, + "language_loss": 0.83571684, + "learning_rate": 0.00011410075019463295, + "loss": 0.84615809, + "num_input_tokens_seen": 339303168, + "router_z_loss_mlp": 0.39746094, + "step": 4092, + "time_per_iteration": 2.627462148666382 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043831, + "balance_loss_mlp": 1.00413382, + "epoch": 0.787418237783763, + "flos": 516250073856.0, + "grad_norm": 0.03274569080250533, + "language_loss": 0.80842328, + "learning_rate": 0.00011390272558573461, + "loss": 0.8188616, + "num_input_tokens_seen": 339374512, + "router_z_loss_mlp": 0.39672852, + "step": 4093, + "time_per_iteration": 2.678356409072876 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044441, + "balance_loss_mlp": 1.00474429, + "epoch": 0.7876106194690266, + "flos": 486057566976.0, + "grad_norm": 0.03217400572636969, + "language_loss": 0.80303454, + "learning_rate": 0.00011370485087309202, + "loss": 0.81347895, + "num_input_tokens_seen": 339442720, + "router_z_loss_mlp": 0.39672852, + "step": 4094, + "time_per_iteration": 2.6190710067749023 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044167, + "balance_loss_mlp": 1.00449383, + "epoch": 0.7878030011542901, + "flos": 543930751488.0, + "grad_norm": 0.036296400111331464, + "language_loss": 0.79175836, + "learning_rate": 0.00011350712613352688, + "loss": 0.80220002, + "num_input_tokens_seen": 339508800, + "router_z_loss_mlp": 0.39648438, + "step": 4095, + "time_per_iteration": 2.705301284790039 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044306, + "balance_loss_mlp": 1.00463307, + "epoch": 0.7879953828395537, + "flos": 517749979392.0, + "grad_norm": 0.042475497231540135, + "language_loss": 0.79742056, + "learning_rate": 0.00011330955144380283, + "loss": 0.80786359, + "num_input_tokens_seen": 339578048, + "router_z_loss_mlp": 0.39648438, + "step": 4096, + "time_per_iteration": 2.592628240585327 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043089, + "balance_loss_mlp": 1.00336826, + "epoch": 0.7881877645248172, + "flos": 583377370368.0, + "grad_norm": 0.033751498450810845, + "language_loss": 0.86674351, + "learning_rate": 0.00011311212688062483, + "loss": 0.87717438, + "num_input_tokens_seen": 339650176, + "router_z_loss_mlp": 0.39697266, + "step": 4097, + "time_per_iteration": 2.8006155490875244 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043341, + "balance_loss_mlp": 1.0035969, + "epoch": 0.7883801462100808, + "flos": 590328645120.0, + "grad_norm": 0.0369008039403456, + "language_loss": 0.78409964, + "learning_rate": 0.0001129148525206402, + "loss": 0.79453301, + "num_input_tokens_seen": 339727312, + "router_z_loss_mlp": 0.3972168, + "step": 4098, + "time_per_iteration": 2.824293375015259 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043459, + "balance_loss_mlp": 1.00373876, + "epoch": 0.7885725278953444, + "flos": 482742806016.0, + "grad_norm": 0.04185353422422626, + "language_loss": 0.86687458, + "learning_rate": 0.00011271772844043759, + "loss": 0.87730914, + "num_input_tokens_seen": 339801344, + "router_z_loss_mlp": 0.39697266, + "step": 4099, + "time_per_iteration": 2.6993777751922607 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043421, + "balance_loss_mlp": 1.00372386, + "epoch": 0.788764909580608, + "flos": 758099254272.0, + "grad_norm": 0.0413483333130522, + "language_loss": 0.76537859, + "learning_rate": 0.00011252075471654727, + "loss": 0.77581275, + "num_input_tokens_seen": 339877840, + "router_z_loss_mlp": 0.39672852, + "step": 4100, + "time_per_iteration": 2.9176177978515625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043485, + "balance_loss_mlp": 1.00374079, + "epoch": 0.7889572912658714, + "flos": 703880080128.0, + "grad_norm": 0.0322415537049841, + "language_loss": 0.7816056, + "learning_rate": 0.00011232393142544133, + "loss": 0.79204047, + "num_input_tokens_seen": 339959568, + "router_z_loss_mlp": 0.3972168, + "step": 4101, + "time_per_iteration": 2.9494380950927734 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044086, + "balance_loss_mlp": 1.00436497, + "epoch": 0.789149672951135, + "flos": 737841303552.0, + "grad_norm": 0.03312890995407851, + "language_loss": 0.83342379, + "learning_rate": 0.00011212725864353323, + "loss": 0.84386468, + "num_input_tokens_seen": 340043600, + "router_z_loss_mlp": 0.39697266, + "step": 4102, + "time_per_iteration": 3.066310405731201 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104599, + "balance_loss_mlp": 1.00727081, + "epoch": 0.7893420546363986, + "flos": 1484490533376.0, + "grad_norm": 0.0037033448465983686, + "language_loss": 0.76335925, + "learning_rate": 0.00011193073644717822, + "loss": 0.77381915, + "num_input_tokens_seen": 340270608, + "router_z_loss_mlp": 0.38671875, + "step": 4103, + "time_per_iteration": 4.842837810516357 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043191, + "balance_loss_mlp": 1.00349379, + "epoch": 0.7895344363216622, + "flos": 510080343552.0, + "grad_norm": 0.04492862133379161, + "language_loss": 0.75946063, + "learning_rate": 0.00011173436491267291, + "loss": 0.76989251, + "num_input_tokens_seen": 340338784, + "router_z_loss_mlp": 0.39672852, + "step": 4104, + "time_per_iteration": 2.6089494228363037 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043204, + "balance_loss_mlp": 1.00348377, + "epoch": 0.7897268180069258, + "flos": 543038391552.0, + "grad_norm": 0.035594569075133434, + "language_loss": 0.82524866, + "learning_rate": 0.0001115381441162554, + "loss": 0.83568072, + "num_input_tokens_seen": 340407744, + "router_z_loss_mlp": 0.39697266, + "step": 4105, + "time_per_iteration": 2.610574245452881 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043671, + "balance_loss_mlp": 1.00495148, + "epoch": 0.7899191996921893, + "flos": 1415752515840.0, + "grad_norm": 0.004244579927016686, + "language_loss": 0.73583722, + "learning_rate": 0.00011134207413410557, + "loss": 0.74627399, + "num_input_tokens_seen": 340635824, + "router_z_loss_mlp": 0.38671875, + "step": 4106, + "time_per_iteration": 4.910478830337524 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042244, + "balance_loss_mlp": 1.00254726, + "epoch": 0.7901115813774529, + "flos": 624022550784.0, + "grad_norm": 0.03217840063053149, + "language_loss": 0.855353, + "learning_rate": 0.00011114615504234465, + "loss": 0.86577547, + "num_input_tokens_seen": 340710928, + "router_z_loss_mlp": 0.39672852, + "step": 4107, + "time_per_iteration": 2.746295690536499 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044215, + "balance_loss_mlp": 1.0045898, + "epoch": 0.7903039630627164, + "flos": 646805936640.0, + "grad_norm": 0.033942053342870586, + "language_loss": 0.81416857, + "learning_rate": 0.00011095038691703468, + "loss": 0.82461071, + "num_input_tokens_seen": 340786128, + "router_z_loss_mlp": 0.39599609, + "step": 4108, + "time_per_iteration": 2.8708901405334473 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104442, + "balance_loss_mlp": 1.00479472, + "epoch": 0.79049634474798, + "flos": 595612818432.0, + "grad_norm": 0.037550083801842486, + "language_loss": 0.83416122, + "learning_rate": 0.00011075476983417998, + "loss": 0.84460539, + "num_input_tokens_seen": 340861616, + "router_z_loss_mlp": 0.39599609, + "step": 4109, + "time_per_iteration": 2.8592021465301514 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043584, + "balance_loss_mlp": 1.00393546, + "epoch": 0.7906887264332435, + "flos": 717332564736.0, + "grad_norm": 0.03806568849711228, + "language_loss": 0.7824564, + "learning_rate": 0.00011055930386972579, + "loss": 0.79289222, + "num_input_tokens_seen": 340934480, + "router_z_loss_mlp": 0.39624023, + "step": 4110, + "time_per_iteration": 2.860257625579834 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01041295, + "balance_loss_mlp": 1.00162232, + "epoch": 0.7908811081185071, + "flos": 791261436672.0, + "grad_norm": 0.034643176312320036, + "language_loss": 0.78703582, + "learning_rate": 0.00011036398909955863, + "loss": 0.79744881, + "num_input_tokens_seen": 341014912, + "router_z_loss_mlp": 0.39648438, + "step": 4111, + "time_per_iteration": 2.9770195484161377 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043101, + "balance_loss_mlp": 1.0034523, + "epoch": 0.7910734898037707, + "flos": 643076103936.0, + "grad_norm": 0.033380496511460814, + "language_loss": 0.8228001, + "learning_rate": 0.00011016882559950648, + "loss": 0.83323109, + "num_input_tokens_seen": 341090608, + "router_z_loss_mlp": 0.39624023, + "step": 4112, + "time_per_iteration": 2.8614118099212646 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043994, + "balance_loss_mlp": 1.00446427, + "epoch": 0.7912658714890343, + "flos": 670561395456.0, + "grad_norm": 0.037601887407010925, + "language_loss": 0.80818218, + "learning_rate": 0.00010997381344533853, + "loss": 0.81862211, + "num_input_tokens_seen": 341160992, + "router_z_loss_mlp": 0.39501953, + "step": 4113, + "time_per_iteration": 2.806915521621704 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045873, + "balance_loss_mlp": 1.00639081, + "epoch": 0.7914582531742979, + "flos": 558887999232.0, + "grad_norm": 0.03473923170116899, + "language_loss": 0.81077361, + "learning_rate": 0.00010977895271276517, + "loss": 0.82123232, + "num_input_tokens_seen": 341232032, + "router_z_loss_mlp": 0.39453125, + "step": 4114, + "time_per_iteration": 2.710866928100586 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046255, + "balance_loss_mlp": 1.00667739, + "epoch": 0.7916506348595613, + "flos": 571192466688.0, + "grad_norm": 0.03381455786010569, + "language_loss": 0.80545115, + "learning_rate": 0.00010958424347743807, + "loss": 0.81591368, + "num_input_tokens_seen": 341303888, + "router_z_loss_mlp": 0.39550781, + "step": 4115, + "time_per_iteration": 2.720463991165161 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044883, + "balance_loss_mlp": 1.00528204, + "epoch": 0.7918430165448249, + "flos": 719647062528.0, + "grad_norm": 0.03312205517130564, + "language_loss": 0.8089326, + "learning_rate": 0.00010938968581494991, + "loss": 0.81938136, + "num_input_tokens_seen": 341385616, + "router_z_loss_mlp": 0.39575195, + "step": 4116, + "time_per_iteration": 2.9487526416778564 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044455, + "balance_loss_mlp": 1.004758, + "epoch": 0.7920353982300885, + "flos": 554737258752.0, + "grad_norm": 0.04353090133720626, + "language_loss": 0.79680514, + "learning_rate": 0.000109195279800835, + "loss": 0.80724961, + "num_input_tokens_seen": 341460976, + "router_z_loss_mlp": 0.39672852, + "step": 4117, + "time_per_iteration": 2.7193853855133057 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046231, + "balance_loss_mlp": 1.0065577, + "epoch": 0.7922277799153521, + "flos": 811541741568.0, + "grad_norm": 0.051618169063903374, + "language_loss": 0.76734924, + "learning_rate": 0.00010900102551056834, + "loss": 0.77781159, + "num_input_tokens_seen": 341537328, + "router_z_loss_mlp": 0.39648438, + "step": 4118, + "time_per_iteration": 3.0203771591186523 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046142, + "balance_loss_mlp": 1.00644493, + "epoch": 0.7924201616006156, + "flos": 422245031424.0, + "grad_norm": 0.03727479456025455, + "language_loss": 0.84903586, + "learning_rate": 0.00010880692301956601, + "loss": 0.85949719, + "num_input_tokens_seen": 341600272, + "router_z_loss_mlp": 0.39672852, + "step": 4119, + "time_per_iteration": 2.5143675804138184 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104384, + "balance_loss_mlp": 1.00416684, + "epoch": 0.7926125432858792, + "flos": 619105817088.0, + "grad_norm": 0.030768589003691713, + "language_loss": 0.86626256, + "learning_rate": 0.00010861297240318518, + "loss": 0.876701, + "num_input_tokens_seen": 341682096, + "router_z_loss_mlp": 0.39648438, + "step": 4120, + "time_per_iteration": 2.870023250579834 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045785, + "balance_loss_mlp": 1.00611162, + "epoch": 0.7928049249711427, + "flos": 603611988480.0, + "grad_norm": 0.0348759372841926, + "language_loss": 0.8754127, + "learning_rate": 0.00010841917373672444, + "loss": 0.88587052, + "num_input_tokens_seen": 341754912, + "router_z_loss_mlp": 0.39648438, + "step": 4121, + "time_per_iteration": 2.7993838787078857 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045406, + "balance_loss_mlp": 1.00568569, + "epoch": 0.7929973066564063, + "flos": 657232365312.0, + "grad_norm": 0.04825872036668382, + "language_loss": 0.79469776, + "learning_rate": 0.00010822552709542293, + "loss": 0.80515188, + "num_input_tokens_seen": 341831152, + "router_z_loss_mlp": 0.39697266, + "step": 4122, + "time_per_iteration": 2.8277747631073 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104501, + "balance_loss_mlp": 1.00526559, + "epoch": 0.7931896883416699, + "flos": 537435377664.0, + "grad_norm": 0.033652478318624945, + "language_loss": 0.86540711, + "learning_rate": 0.0001080320325544612, + "loss": 0.87585717, + "num_input_tokens_seen": 341903552, + "router_z_loss_mlp": 0.3972168, + "step": 4123, + "time_per_iteration": 2.7195277214050293 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043876, + "balance_loss_mlp": 1.00394118, + "epoch": 0.7933820700269334, + "flos": 499069701888.0, + "grad_norm": 0.034451341323961555, + "language_loss": 0.83510745, + "learning_rate": 0.00010783869018895997, + "loss": 0.84554619, + "num_input_tokens_seen": 341972256, + "router_z_loss_mlp": 0.39916992, + "step": 4124, + "time_per_iteration": 2.577709197998047 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044871, + "balance_loss_mlp": 1.00495958, + "epoch": 0.793574451712197, + "flos": 538496878848.0, + "grad_norm": 0.03367415266088285, + "language_loss": 0.84549522, + "learning_rate": 0.00010764550007398189, + "loss": 0.85594392, + "num_input_tokens_seen": 342040496, + "router_z_loss_mlp": 0.39892578, + "step": 4125, + "time_per_iteration": 4.054261207580566 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045425, + "balance_loss_mlp": 1.00556159, + "epoch": 0.7937668333974606, + "flos": 489259567104.0, + "grad_norm": 0.03475053715190497, + "language_loss": 0.82054108, + "learning_rate": 0.00010745246228452982, + "loss": 0.83099532, + "num_input_tokens_seen": 342108512, + "router_z_loss_mlp": 0.3984375, + "step": 4126, + "time_per_iteration": 2.5979418754577637 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045362, + "balance_loss_mlp": 1.0054512, + "epoch": 0.7939592150827242, + "flos": 528480664320.0, + "grad_norm": 0.03444144820805524, + "language_loss": 0.8203451, + "learning_rate": 0.00010725957689554771, + "loss": 0.83079869, + "num_input_tokens_seen": 342183568, + "router_z_loss_mlp": 0.39892578, + "step": 4127, + "time_per_iteration": 2.7990803718566895 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043644, + "balance_loss_mlp": 1.0037564, + "epoch": 0.7941515967679876, + "flos": 542804121600.0, + "grad_norm": 0.027974353873713647, + "language_loss": 0.84939337, + "learning_rate": 0.00010706684398192013, + "loss": 0.85982978, + "num_input_tokens_seen": 342259920, + "router_z_loss_mlp": 0.39868164, + "step": 4128, + "time_per_iteration": 2.6992971897125244 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043335, + "balance_loss_mlp": 1.00342357, + "epoch": 0.7943439784532512, + "flos": 519524005632.0, + "grad_norm": 0.0378035902598828, + "language_loss": 0.82137024, + "learning_rate": 0.00010687426361847313, + "loss": 0.83180356, + "num_input_tokens_seen": 342330192, + "router_z_loss_mlp": 0.39892578, + "step": 4129, + "time_per_iteration": 2.7055931091308594 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043645, + "balance_loss_mlp": 1.00368559, + "epoch": 0.7945363601385148, + "flos": 510060901632.0, + "grad_norm": 0.033194408400906726, + "language_loss": 0.86515343, + "learning_rate": 0.00010668183587997254, + "loss": 0.87558991, + "num_input_tokens_seen": 342398944, + "router_z_loss_mlp": 0.39941406, + "step": 4130, + "time_per_iteration": 2.6280934810638428 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043077, + "balance_loss_mlp": 1.00318933, + "epoch": 0.7947287418237784, + "flos": 652402147584.0, + "grad_norm": 0.029896706291295146, + "language_loss": 0.77920771, + "learning_rate": 0.0001064895608411256, + "loss": 0.78963846, + "num_input_tokens_seen": 342474000, + "router_z_loss_mlp": 0.39868164, + "step": 4131, + "time_per_iteration": 2.8259942531585693 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042645, + "balance_loss_mlp": 1.00282872, + "epoch": 0.794921123509042, + "flos": 697374012672.0, + "grad_norm": 0.04755906636232369, + "language_loss": 0.80848777, + "learning_rate": 0.00010629743857657998, + "loss": 0.81891429, + "num_input_tokens_seen": 342549184, + "router_z_loss_mlp": 0.39794922, + "step": 4132, + "time_per_iteration": 2.8961074352264404 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047832, + "balance_loss_mlp": 1.00901794, + "epoch": 0.7951135051943055, + "flos": 1406079441408.0, + "grad_norm": 0.006864430064478978, + "language_loss": 0.70598668, + "learning_rate": 0.0001061054691609244, + "loss": 0.716465, + "num_input_tokens_seen": 342767376, + "router_z_loss_mlp": 0.38769531, + "step": 4133, + "time_per_iteration": 4.614002704620361 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045085, + "balance_loss_mlp": 1.00524557, + "epoch": 0.795305886879569, + "flos": 811450368000.0, + "grad_norm": 0.03507425862831722, + "language_loss": 0.82587457, + "learning_rate": 0.00010591365266868802, + "loss": 0.83632547, + "num_input_tokens_seen": 342845024, + "router_z_loss_mlp": 0.39819336, + "step": 4134, + "time_per_iteration": 2.9641194343566895 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044941, + "balance_loss_mlp": 1.0061264, + "epoch": 0.7954982685648326, + "flos": 1429216660992.0, + "grad_norm": 0.005948416138120475, + "language_loss": 0.75511783, + "learning_rate": 0.00010572198917434018, + "loss": 0.76556724, + "num_input_tokens_seen": 343072496, + "router_z_loss_mlp": 0.38769531, + "step": 4135, + "time_per_iteration": 4.960731029510498 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045363, + "balance_loss_mlp": 1.00557125, + "epoch": 0.7956906502500962, + "flos": 390748005120.0, + "grad_norm": 0.05367501121915611, + "language_loss": 0.80196106, + "learning_rate": 0.00010553047875229166, + "loss": 0.81241471, + "num_input_tokens_seen": 343136928, + "router_z_loss_mlp": 0.39770508, + "step": 4136, + "time_per_iteration": 2.5680596828460693 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045778, + "balance_loss_mlp": 1.00596261, + "epoch": 0.7958830319353598, + "flos": 516586411008.0, + "grad_norm": 0.03268572059370949, + "language_loss": 0.83743113, + "learning_rate": 0.00010533912147689328, + "loss": 0.84788889, + "num_input_tokens_seen": 343207440, + "router_z_loss_mlp": 0.39794922, + "step": 4137, + "time_per_iteration": 2.6882131099700928 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044712, + "balance_loss_mlp": 1.00492001, + "epoch": 0.7960754136206233, + "flos": 494927709696.0, + "grad_norm": 0.03240268195496617, + "language_loss": 0.82921439, + "learning_rate": 0.00010514791742243656, + "loss": 0.83966154, + "num_input_tokens_seen": 343273744, + "router_z_loss_mlp": 0.39770508, + "step": 4138, + "time_per_iteration": 2.5695807933807373 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044621, + "balance_loss_mlp": 1.0049001, + "epoch": 0.7962677953058869, + "flos": 657006843648.0, + "grad_norm": 0.03902501447603489, + "language_loss": 0.83096194, + "learning_rate": 0.00010495686666315341, + "loss": 0.84140819, + "num_input_tokens_seen": 343357648, + "router_z_loss_mlp": 0.39697266, + "step": 4139, + "time_per_iteration": 2.8975212574005127 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044322, + "balance_loss_mlp": 1.00443506, + "epoch": 0.7964601769911505, + "flos": 543420415488.0, + "grad_norm": 0.04091295752087844, + "language_loss": 0.777354, + "learning_rate": 0.00010476596927321635, + "loss": 0.78779727, + "num_input_tokens_seen": 343425344, + "router_z_loss_mlp": 0.39868164, + "step": 4140, + "time_per_iteration": 2.654552459716797 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047667, + "balance_loss_mlp": 1.00785065, + "epoch": 0.796652558676414, + "flos": 538827379968.0, + "grad_norm": 0.03162317226196635, + "language_loss": 0.80818027, + "learning_rate": 0.00010457522532673835, + "loss": 0.81865692, + "num_input_tokens_seen": 343504960, + "router_z_loss_mlp": 0.39794922, + "step": 4141, + "time_per_iteration": 2.842707633972168 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044598, + "balance_loss_mlp": 1.00480628, + "epoch": 0.7968449403616775, + "flos": 476052046080.0, + "grad_norm": 0.03609806445163449, + "language_loss": 0.83603644, + "learning_rate": 0.00010438463489777272, + "loss": 0.8464824, + "num_input_tokens_seen": 343570832, + "router_z_loss_mlp": 0.39770508, + "step": 4142, + "time_per_iteration": 2.5717051029205322 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042671, + "balance_loss_mlp": 1.00287914, + "epoch": 0.7970373220469411, + "flos": 568726324224.0, + "grad_norm": 0.03529843245430609, + "language_loss": 0.7784009, + "learning_rate": 0.00010419419806031316, + "loss": 0.78882766, + "num_input_tokens_seen": 343639808, + "router_z_loss_mlp": 0.39770508, + "step": 4143, + "time_per_iteration": 2.6530473232269287 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044491, + "balance_loss_mlp": 1.00467539, + "epoch": 0.7972297037322047, + "flos": 557351155200.0, + "grad_norm": 0.03335474096113663, + "language_loss": 0.84457743, + "learning_rate": 0.00010400391488829403, + "loss": 0.85502243, + "num_input_tokens_seen": 343715232, + "router_z_loss_mlp": 0.39794922, + "step": 4144, + "time_per_iteration": 2.832122564315796 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044556, + "balance_loss_mlp": 1.00471592, + "epoch": 0.7974220854174683, + "flos": 577307761920.0, + "grad_norm": 0.030245112607884015, + "language_loss": 0.87015516, + "learning_rate": 0.00010381378545558984, + "loss": 0.88060075, + "num_input_tokens_seen": 343787168, + "router_z_loss_mlp": 0.39819336, + "step": 4145, + "time_per_iteration": 2.6970877647399902 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104421, + "balance_loss_mlp": 1.00434661, + "epoch": 0.7976144671027319, + "flos": 484056073728.0, + "grad_norm": 0.03356319241102144, + "language_loss": 0.8495326, + "learning_rate": 0.00010362380983601505, + "loss": 0.85997462, + "num_input_tokens_seen": 343853600, + "router_z_loss_mlp": 0.3984375, + "step": 4146, + "time_per_iteration": 2.5355587005615234 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044045, + "balance_loss_mlp": 1.00420487, + "epoch": 0.7978068487879953, + "flos": 1079654319360.0, + "grad_norm": 0.028459484935127146, + "language_loss": 0.79190552, + "learning_rate": 0.00010343398810332477, + "loss": 0.80234593, + "num_input_tokens_seen": 343942816, + "router_z_loss_mlp": 0.39819336, + "step": 4147, + "time_per_iteration": 3.4484007358551025 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043591, + "balance_loss_mlp": 1.00370336, + "epoch": 0.7979992304732589, + "flos": 735016469760.0, + "grad_norm": 0.038421904097834796, + "language_loss": 0.84714222, + "learning_rate": 0.00010324432033121467, + "loss": 0.8575781, + "num_input_tokens_seen": 344021232, + "router_z_loss_mlp": 0.39868164, + "step": 4148, + "time_per_iteration": 2.8759710788726807 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044627, + "balance_loss_mlp": 1.00476301, + "epoch": 0.7981916121585225, + "flos": 416750887680.0, + "grad_norm": 0.03692074531599656, + "language_loss": 0.84042895, + "learning_rate": 0.00010305480659332005, + "loss": 0.85087514, + "num_input_tokens_seen": 344089616, + "router_z_loss_mlp": 0.3984375, + "step": 4149, + "time_per_iteration": 2.6903555393218994 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044845, + "balance_loss_mlp": 1.00493348, + "epoch": 0.7983839938437861, + "flos": 466213721088.0, + "grad_norm": 0.03398705424173267, + "language_loss": 0.84049666, + "learning_rate": 0.00010286544696321682, + "loss": 0.85094512, + "num_input_tokens_seen": 344154992, + "router_z_loss_mlp": 0.39892578, + "step": 4150, + "time_per_iteration": 2.5223419666290283 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047155, + "balance_loss_mlp": 1.00731564, + "epoch": 0.7985763755290496, + "flos": 511623990528.0, + "grad_norm": 0.03850329476813429, + "language_loss": 0.80184937, + "learning_rate": 0.00010267624151442073, + "loss": 0.81232083, + "num_input_tokens_seen": 344225232, + "router_z_loss_mlp": 0.39819336, + "step": 4151, + "time_per_iteration": 2.6372790336608887 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045244, + "balance_loss_mlp": 1.00545216, + "epoch": 0.7987687572143132, + "flos": 1012279147008.0, + "grad_norm": 0.036156953147693155, + "language_loss": 0.81612265, + "learning_rate": 0.000102487190320388, + "loss": 0.8265751, + "num_input_tokens_seen": 344309120, + "router_z_loss_mlp": 0.39770508, + "step": 4152, + "time_per_iteration": 3.3100497722625732 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046289, + "balance_loss_mlp": 1.00644886, + "epoch": 0.7989611388995768, + "flos": 1022749317120.0, + "grad_norm": 0.0483734968534093, + "language_loss": 0.80480343, + "learning_rate": 0.00010229829345451475, + "loss": 0.81526625, + "num_input_tokens_seen": 344394112, + "router_z_loss_mlp": 0.39819336, + "step": 4153, + "time_per_iteration": 3.305338144302368 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048343, + "balance_loss_mlp": 1.00855112, + "epoch": 0.7991535205848403, + "flos": 1103038447872.0, + "grad_norm": 0.03770888532142324, + "language_loss": 0.80308628, + "learning_rate": 0.00010210955099013724, + "loss": 0.81356978, + "num_input_tokens_seen": 344476512, + "router_z_loss_mlp": 0.39770508, + "step": 4154, + "time_per_iteration": 3.409900188446045 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047808, + "balance_loss_mlp": 1.00789726, + "epoch": 0.7993459022701039, + "flos": 836280933888.0, + "grad_norm": 0.04128229855953485, + "language_loss": 0.77654159, + "learning_rate": 0.00010192096300053167, + "loss": 0.78701961, + "num_input_tokens_seen": 344561088, + "router_z_loss_mlp": 0.39892578, + "step": 4155, + "time_per_iteration": 3.1075351238250732 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043815, + "balance_loss_mlp": 1.00387979, + "epoch": 0.7995382839553674, + "flos": 523770010368.0, + "grad_norm": 0.043215230874116634, + "language_loss": 0.85791343, + "learning_rate": 0.00010173252955891477, + "loss": 0.86835158, + "num_input_tokens_seen": 344639424, + "router_z_loss_mlp": 0.39916992, + "step": 4156, + "time_per_iteration": 2.741454839706421 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104347, + "balance_loss_mlp": 1.00358212, + "epoch": 0.799730665640631, + "flos": 538859460864.0, + "grad_norm": 0.0402681416401722, + "language_loss": 0.73719215, + "learning_rate": 0.00010154425073844253, + "loss": 0.74762684, + "num_input_tokens_seen": 344710048, + "router_z_loss_mlp": 0.39868164, + "step": 4157, + "time_per_iteration": 2.709291458129883 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043712, + "balance_loss_mlp": 1.00384808, + "epoch": 0.7999230473258946, + "flos": 506068608768.0, + "grad_norm": 0.03223966585630621, + "language_loss": 0.82729542, + "learning_rate": 0.00010135612661221138, + "loss": 0.83773249, + "num_input_tokens_seen": 344776832, + "router_z_loss_mlp": 0.3984375, + "step": 4158, + "time_per_iteration": 2.557003974914551 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043464, + "balance_loss_mlp": 1.00357628, + "epoch": 0.8001154290111582, + "flos": 1028977373184.0, + "grad_norm": 0.03912877230354993, + "language_loss": 0.82057023, + "learning_rate": 0.00010116815725325751, + "loss": 0.83100486, + "num_input_tokens_seen": 344864928, + "router_z_loss_mlp": 0.39868164, + "step": 4159, + "time_per_iteration": 3.304746389389038 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043882, + "balance_loss_mlp": 1.00401807, + "epoch": 0.8003078106964217, + "flos": 752270718720.0, + "grad_norm": 0.03707561964119669, + "language_loss": 0.81281012, + "learning_rate": 0.00010098034273455725, + "loss": 0.82324892, + "num_input_tokens_seen": 344944048, + "router_z_loss_mlp": 0.3984375, + "step": 4160, + "time_per_iteration": 2.93477463722229 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043012, + "balance_loss_mlp": 1.00317216, + "epoch": 0.8005001923816852, + "flos": 489526884864.0, + "grad_norm": 0.03420748582066261, + "language_loss": 0.80276787, + "learning_rate": 0.00010079268312902662, + "loss": 0.81319797, + "num_input_tokens_seen": 345015392, + "router_z_loss_mlp": 0.39819336, + "step": 4161, + "time_per_iteration": 2.6929373741149902 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044581, + "balance_loss_mlp": 1.00474119, + "epoch": 0.8006925740669488, + "flos": 514313709312.0, + "grad_norm": 0.033458859540608864, + "language_loss": 0.82609326, + "learning_rate": 0.0001006051785095215, + "loss": 0.83653903, + "num_input_tokens_seen": 345086640, + "router_z_loss_mlp": 0.39819336, + "step": 4162, + "time_per_iteration": 2.734436511993408 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046338, + "balance_loss_mlp": 1.00642645, + "epoch": 0.8008849557522124, + "flos": 579680585472.0, + "grad_norm": 0.03667202832039182, + "language_loss": 0.79988742, + "learning_rate": 0.0001004178289488376, + "loss": 0.81035084, + "num_input_tokens_seen": 345159616, + "router_z_loss_mlp": 0.39892578, + "step": 4163, + "time_per_iteration": 2.767613410949707 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046056, + "balance_loss_mlp": 1.00626385, + "epoch": 0.801077337437476, + "flos": 479681756928.0, + "grad_norm": 0.03506615543600683, + "language_loss": 0.84141004, + "learning_rate": 0.0001002306345197106, + "loss": 0.8518706, + "num_input_tokens_seen": 345225536, + "router_z_loss_mlp": 0.39770508, + "step": 4164, + "time_per_iteration": 2.631165027618408 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104532, + "balance_loss_mlp": 1.00552833, + "epoch": 0.8012697191227395, + "flos": 677968571136.0, + "grad_norm": 0.045614046534047034, + "language_loss": 0.80445755, + "learning_rate": 0.00010004359529481571, + "loss": 0.81491077, + "num_input_tokens_seen": 345302960, + "router_z_loss_mlp": 0.39770508, + "step": 4165, + "time_per_iteration": 2.901402473449707 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045546, + "balance_loss_mlp": 1.00572991, + "epoch": 0.8014621008080031, + "flos": 1297172576256.0, + "grad_norm": 0.042609498676076864, + "language_loss": 0.83314872, + "learning_rate": 9.985671134676804e-05, + "loss": 0.84360421, + "num_input_tokens_seen": 345397792, + "router_z_loss_mlp": 0.39794922, + "step": 4166, + "time_per_iteration": 3.69667911529541 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046315, + "balance_loss_mlp": 1.00647485, + "epoch": 0.8016544824932667, + "flos": 512826442752.0, + "grad_norm": 0.041586651320783194, + "language_loss": 0.83886582, + "learning_rate": 9.966998274812234e-05, + "loss": 0.84932899, + "num_input_tokens_seen": 345465440, + "router_z_loss_mlp": 0.39819336, + "step": 4167, + "time_per_iteration": 2.5852413177490234 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043819, + "balance_loss_mlp": 1.0039798, + "epoch": 0.8018468641785302, + "flos": 536718961920.0, + "grad_norm": 0.04260031626477269, + "language_loss": 0.82111335, + "learning_rate": 9.948340957137308e-05, + "loss": 0.83155155, + "num_input_tokens_seen": 345533072, + "router_z_loss_mlp": 0.39819336, + "step": 4168, + "time_per_iteration": 2.6272878646850586 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044233, + "balance_loss_mlp": 1.00434577, + "epoch": 0.8020392458637937, + "flos": 1025058957312.0, + "grad_norm": 0.03898276528172633, + "language_loss": 0.80097771, + "learning_rate": 9.929699188895447e-05, + "loss": 0.81142002, + "num_input_tokens_seen": 345622208, + "router_z_loss_mlp": 0.39868164, + "step": 4169, + "time_per_iteration": 3.2593564987182617 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01041039, + "balance_loss_mlp": 1.00231934, + "epoch": 0.8022316275490573, + "flos": 1565073226752.0, + "grad_norm": 0.005727989546887444, + "language_loss": 0.78054404, + "learning_rate": 9.911072977324009e-05, + "loss": 0.79095441, + "num_input_tokens_seen": 345852544, + "router_z_loss_mlp": 0.38671875, + "step": 4170, + "time_per_iteration": 4.9659693241119385 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047428, + "balance_loss_mlp": 1.0074693, + "epoch": 0.8024240092343209, + "flos": 421602492672.0, + "grad_norm": 0.0363857175356543, + "language_loss": 0.83789802, + "learning_rate": 9.89246232965435e-05, + "loss": 0.84837228, + "num_input_tokens_seen": 345917328, + "router_z_loss_mlp": 0.39941406, + "step": 4171, + "time_per_iteration": 2.524991989135742 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047654, + "balance_loss_mlp": 1.0077666, + "epoch": 0.8026163909195845, + "flos": 765163289856.0, + "grad_norm": 0.03859277730807378, + "language_loss": 0.79318523, + "learning_rate": 9.873867253111762e-05, + "loss": 0.8036617, + "num_input_tokens_seen": 345995936, + "router_z_loss_mlp": 0.39868164, + "step": 4172, + "time_per_iteration": 2.919250726699829 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047932, + "balance_loss_mlp": 1.00911713, + "epoch": 0.8028087726048481, + "flos": 1522144651008.0, + "grad_norm": 0.012840205003212634, + "language_loss": 0.80264562, + "learning_rate": 9.855287754915503e-05, + "loss": 0.81312495, + "num_input_tokens_seen": 346232720, + "router_z_loss_mlp": 0.38769531, + "step": 4173, + "time_per_iteration": 4.94536828994751 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046693, + "balance_loss_mlp": 1.00673401, + "epoch": 0.8030011542901115, + "flos": 518830922496.0, + "grad_norm": 0.03954362492691932, + "language_loss": 0.89050967, + "learning_rate": 9.836723842278733e-05, + "loss": 0.90097666, + "num_input_tokens_seen": 346298208, + "router_z_loss_mlp": 0.39941406, + "step": 4174, + "time_per_iteration": 2.5619349479675293 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046949, + "balance_loss_mlp": 1.00696588, + "epoch": 0.8031935359753751, + "flos": 546659354112.0, + "grad_norm": 0.036526547211400404, + "language_loss": 0.78600073, + "learning_rate": 9.818175522408646e-05, + "loss": 0.79647022, + "num_input_tokens_seen": 346370080, + "router_z_loss_mlp": 0.3996582, + "step": 4175, + "time_per_iteration": 2.650787115097046 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046695, + "balance_loss_mlp": 1.00668824, + "epoch": 0.8033859176606387, + "flos": 604736673024.0, + "grad_norm": 0.03137628778903353, + "language_loss": 0.85134256, + "learning_rate": 9.79964280250632e-05, + "loss": 0.86180949, + "num_input_tokens_seen": 346442432, + "router_z_loss_mlp": 0.39990234, + "step": 4176, + "time_per_iteration": 2.7565901279449463 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046784, + "balance_loss_mlp": 1.00675344, + "epoch": 0.8035782993459023, + "flos": 566985345792.0, + "grad_norm": 0.038450106349373375, + "language_loss": 0.82200831, + "learning_rate": 9.781125689766795e-05, + "loss": 0.83247614, + "num_input_tokens_seen": 346513088, + "router_z_loss_mlp": 0.40014648, + "step": 4177, + "time_per_iteration": 2.7695512771606445 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046088, + "balance_loss_mlp": 1.00593793, + "epoch": 0.8037706810311658, + "flos": 539473809408.0, + "grad_norm": 0.047429405417763595, + "language_loss": 0.85172522, + "learning_rate": 9.762624191379054e-05, + "loss": 0.86218613, + "num_input_tokens_seen": 346581376, + "router_z_loss_mlp": 0.40136719, + "step": 4178, + "time_per_iteration": 2.6202852725982666 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046139, + "balance_loss_mlp": 1.00601351, + "epoch": 0.8039630627164294, + "flos": 516195638784.0, + "grad_norm": 0.036004712534776565, + "language_loss": 0.79951143, + "learning_rate": 9.744138314526014e-05, + "loss": 0.80997288, + "num_input_tokens_seen": 346653328, + "router_z_loss_mlp": 0.40112305, + "step": 4179, + "time_per_iteration": 2.6379129886627197 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046989, + "balance_loss_mlp": 1.00769806, + "epoch": 0.804155444401693, + "flos": 1481939820288.0, + "grad_norm": 0.005350241122210075, + "language_loss": 0.74733561, + "learning_rate": 9.725668066384535e-05, + "loss": 0.75780553, + "num_input_tokens_seen": 346873264, + "router_z_loss_mlp": 0.39257812, + "step": 4180, + "time_per_iteration": 4.866838693618774 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045581, + "balance_loss_mlp": 1.00545502, + "epoch": 0.8043478260869565, + "flos": 522189424896.0, + "grad_norm": 0.036924486045392176, + "language_loss": 0.77422369, + "learning_rate": 9.707213454125396e-05, + "loss": 0.78467953, + "num_input_tokens_seen": 346946272, + "router_z_loss_mlp": 0.40112305, + "step": 4181, + "time_per_iteration": 2.67852783203125 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045883, + "balance_loss_mlp": 1.00568521, + "epoch": 0.8045402077722201, + "flos": 546564089856.0, + "grad_norm": 0.03189319138496175, + "language_loss": 0.80777282, + "learning_rate": 9.688774484913298e-05, + "loss": 0.81823158, + "num_input_tokens_seen": 347024048, + "router_z_loss_mlp": 0.40185547, + "step": 4182, + "time_per_iteration": 2.8362486362457275 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047051, + "balance_loss_mlp": 1.00687742, + "epoch": 0.8047325894574836, + "flos": 679707604224.0, + "grad_norm": 0.03606539068582512, + "language_loss": 0.74659956, + "learning_rate": 9.670351165906921e-05, + "loss": 0.75707006, + "num_input_tokens_seen": 347108736, + "router_z_loss_mlp": 0.40161133, + "step": 4183, + "time_per_iteration": 2.9409847259521484 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049374, + "balance_loss_mlp": 1.0092963, + "epoch": 0.8049249711427472, + "flos": 588329097216.0, + "grad_norm": 0.03633262888197943, + "language_loss": 0.79051793, + "learning_rate": 9.65194350425882e-05, + "loss": 0.80101168, + "num_input_tokens_seen": 347184192, + "router_z_loss_mlp": 0.40063477, + "step": 4184, + "time_per_iteration": 2.737316131591797 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105025, + "balance_loss_mlp": 1.01019537, + "epoch": 0.8051173528280108, + "flos": 815681788416.0, + "grad_norm": 0.04554718693460932, + "language_loss": 0.78688985, + "learning_rate": 9.633551507115452e-05, + "loss": 0.79739237, + "num_input_tokens_seen": 347282336, + "router_z_loss_mlp": 0.40039062, + "step": 4185, + "time_per_iteration": 3.1375975608825684 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049726, + "balance_loss_mlp": 1.00974321, + "epoch": 0.8053097345132744, + "flos": 726956061696.0, + "grad_norm": 0.034010858312542885, + "language_loss": 0.7827903, + "learning_rate": 9.615175181617259e-05, + "loss": 0.79328752, + "num_input_tokens_seen": 347364800, + "router_z_loss_mlp": 0.3996582, + "step": 4186, + "time_per_iteration": 2.971076011657715 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050316, + "balance_loss_mlp": 1.01026201, + "epoch": 0.805502116198538, + "flos": 749431300608.0, + "grad_norm": 0.03715615744266424, + "language_loss": 0.8164562, + "learning_rate": 9.596814534898552e-05, + "loss": 0.82695937, + "num_input_tokens_seen": 347443328, + "router_z_loss_mlp": 0.40039062, + "step": 4187, + "time_per_iteration": 2.964796304702759 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104701, + "balance_loss_mlp": 1.00690746, + "epoch": 0.8056944978838014, + "flos": 641482879488.0, + "grad_norm": 0.04269261124509272, + "language_loss": 0.88006115, + "learning_rate": 9.578469574087561e-05, + "loss": 0.8905313, + "num_input_tokens_seen": 347522064, + "router_z_loss_mlp": 0.40087891, + "step": 4188, + "time_per_iteration": 2.8772683143615723 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045441, + "balance_loss_mlp": 1.00536251, + "epoch": 0.805886879569065, + "flos": 645785264640.0, + "grad_norm": 0.037205213078360604, + "language_loss": 0.78592306, + "learning_rate": 9.560140306306436e-05, + "loss": 0.79637742, + "num_input_tokens_seen": 347597200, + "router_z_loss_mlp": 0.40063477, + "step": 4189, + "time_per_iteration": 2.9584858417510986 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048365, + "balance_loss_mlp": 1.00831056, + "epoch": 0.8060792612543286, + "flos": 662444606976.0, + "grad_norm": 0.03414135318032402, + "language_loss": 0.82343107, + "learning_rate": 9.541826738671233e-05, + "loss": 0.8339147, + "num_input_tokens_seen": 347676928, + "router_z_loss_mlp": 0.40039062, + "step": 4190, + "time_per_iteration": 2.850748300552368 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048736, + "balance_loss_mlp": 1.00861061, + "epoch": 0.8062716429395922, + "flos": 456012814080.0, + "grad_norm": 0.039964771774069895, + "language_loss": 0.83061063, + "learning_rate": 9.523528878291904e-05, + "loss": 0.84109795, + "num_input_tokens_seen": 347741552, + "router_z_loss_mlp": 0.40112305, + "step": 4191, + "time_per_iteration": 2.526196002960205 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045876, + "balance_loss_mlp": 1.00572634, + "epoch": 0.8064640246248557, + "flos": 527429856768.0, + "grad_norm": 0.04221200524238329, + "language_loss": 0.85863805, + "learning_rate": 9.50524673227231e-05, + "loss": 0.86909676, + "num_input_tokens_seen": 347807008, + "router_z_loss_mlp": 0.40136719, + "step": 4192, + "time_per_iteration": 2.6026053428649902 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044221, + "balance_loss_mlp": 1.00407135, + "epoch": 0.8066564063101193, + "flos": 866677575168.0, + "grad_norm": 0.03011720621266792, + "language_loss": 0.82612681, + "learning_rate": 9.486980307710208e-05, + "loss": 0.83656895, + "num_input_tokens_seen": 347895728, + "router_z_loss_mlp": 0.40136719, + "step": 4193, + "time_per_iteration": 3.2554736137390137 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044791, + "balance_loss_mlp": 1.00473714, + "epoch": 0.8068487879953828, + "flos": 531643780608.0, + "grad_norm": 0.04047187421116328, + "language_loss": 0.8236109, + "learning_rate": 9.468729611697246e-05, + "loss": 0.83405876, + "num_input_tokens_seen": 347970368, + "router_z_loss_mlp": 0.40039062, + "step": 4194, + "time_per_iteration": 2.708320379257202 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044539, + "balance_loss_mlp": 1.00450861, + "epoch": 0.8070411696806464, + "flos": 567247805952.0, + "grad_norm": 0.032246940295438974, + "language_loss": 0.82059777, + "learning_rate": 9.450494651319003e-05, + "loss": 0.83104318, + "num_input_tokens_seen": 348039040, + "router_z_loss_mlp": 0.40014648, + "step": 4195, + "time_per_iteration": 2.6272635459899902 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044359, + "balance_loss_mlp": 1.00428092, + "epoch": 0.80723355136591, + "flos": 988254425088.0, + "grad_norm": 0.037033582993084305, + "language_loss": 0.79562807, + "learning_rate": 9.432275433654885e-05, + "loss": 0.80607164, + "num_input_tokens_seen": 348126064, + "router_z_loss_mlp": 0.40063477, + "step": 4196, + "time_per_iteration": 3.2852365970611572 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044577, + "balance_loss_mlp": 1.0044992, + "epoch": 0.8074259330511735, + "flos": 568083785472.0, + "grad_norm": 0.04553692157756435, + "language_loss": 0.83215851, + "learning_rate": 9.414071965778221e-05, + "loss": 0.84260428, + "num_input_tokens_seen": 348205888, + "router_z_loss_mlp": 0.40063477, + "step": 4197, + "time_per_iteration": 2.825437545776367 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104441, + "balance_loss_mlp": 1.00442731, + "epoch": 0.8076183147364371, + "flos": 495752995584.0, + "grad_norm": 0.033139849122030246, + "language_loss": 0.80485344, + "learning_rate": 9.395884254756242e-05, + "loss": 0.8152976, + "num_input_tokens_seen": 348278608, + "router_z_loss_mlp": 0.3996582, + "step": 4198, + "time_per_iteration": 2.748180389404297 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044625, + "balance_loss_mlp": 1.00461829, + "epoch": 0.8078106964217007, + "flos": 420868580352.0, + "grad_norm": 0.042710700295185595, + "language_loss": 0.8034749, + "learning_rate": 9.377712307650044e-05, + "loss": 0.81392121, + "num_input_tokens_seen": 348341312, + "router_z_loss_mlp": 0.39990234, + "step": 4199, + "time_per_iteration": 2.4973928928375244 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044825, + "balance_loss_mlp": 1.00481844, + "epoch": 0.8080030781069643, + "flos": 528565234944.0, + "grad_norm": 0.036109062708885115, + "language_loss": 0.83559549, + "learning_rate": 9.359556131514602e-05, + "loss": 0.84604371, + "num_input_tokens_seen": 348409184, + "router_z_loss_mlp": 0.39990234, + "step": 4200, + "time_per_iteration": 2.6258418560028076 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043115, + "balance_loss_mlp": 1.00298929, + "epoch": 0.8081954597922277, + "flos": 545152645632.0, + "grad_norm": 0.040903251277153094, + "language_loss": 0.81791621, + "learning_rate": 9.341415733398733e-05, + "loss": 0.82834733, + "num_input_tokens_seen": 348480832, + "router_z_loss_mlp": 0.40112305, + "step": 4201, + "time_per_iteration": 2.6441001892089844 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043567, + "balance_loss_mlp": 1.00339341, + "epoch": 0.8083878414774913, + "flos": 642134166528.0, + "grad_norm": 0.11239758637794657, + "language_loss": 0.76130128, + "learning_rate": 9.323291120345207e-05, + "loss": 0.77173698, + "num_input_tokens_seen": 348559232, + "router_z_loss_mlp": 0.40161133, + "step": 4202, + "time_per_iteration": 2.8298070430755615 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044967, + "balance_loss_mlp": 1.00474536, + "epoch": 0.8085802231627549, + "flos": 706906136064.0, + "grad_norm": 0.03893641453034792, + "language_loss": 0.73079675, + "learning_rate": 9.305182299390614e-05, + "loss": 0.74124646, + "num_input_tokens_seen": 348638960, + "router_z_loss_mlp": 0.40209961, + "step": 4203, + "time_per_iteration": 2.8911709785461426 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043357, + "balance_loss_mlp": 1.00325489, + "epoch": 0.8087726048480185, + "flos": 420662500608.0, + "grad_norm": 0.03792090932692915, + "language_loss": 0.89067852, + "learning_rate": 9.287089277565409e-05, + "loss": 0.90111208, + "num_input_tokens_seen": 348704816, + "router_z_loss_mlp": 0.40087891, + "step": 4204, + "time_per_iteration": 2.5835013389587402 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043379, + "balance_loss_mlp": 1.00327671, + "epoch": 0.8089649865332821, + "flos": 509863570176.0, + "grad_norm": 0.028595163425198668, + "language_loss": 0.87236726, + "learning_rate": 9.269012061893922e-05, + "loss": 0.88280106, + "num_input_tokens_seen": 348783504, + "router_z_loss_mlp": 0.40087891, + "step": 4205, + "time_per_iteration": 2.761137008666992 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044131, + "balance_loss_mlp": 1.00391018, + "epoch": 0.8091573682185456, + "flos": 458262183168.0, + "grad_norm": 0.031965150246737496, + "language_loss": 0.85574394, + "learning_rate": 9.250950659394386e-05, + "loss": 0.86618531, + "num_input_tokens_seen": 348858272, + "router_z_loss_mlp": 0.40209961, + "step": 4206, + "time_per_iteration": 2.7414729595184326 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044993, + "balance_loss_mlp": 1.00491476, + "epoch": 0.8093497499038091, + "flos": 526375158528.0, + "grad_norm": 0.03398258533246476, + "language_loss": 0.77492428, + "learning_rate": 9.232905077078824e-05, + "loss": 0.78537422, + "num_input_tokens_seen": 348934432, + "router_z_loss_mlp": 0.40063477, + "step": 4207, + "time_per_iteration": 2.759403705596924 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044343, + "balance_loss_mlp": 1.00426447, + "epoch": 0.8095421315890727, + "flos": 490581583104.0, + "grad_norm": 0.040026247360877884, + "language_loss": 0.77545118, + "learning_rate": 9.214875321953164e-05, + "loss": 0.78589457, + "num_input_tokens_seen": 349003856, + "router_z_loss_mlp": 0.40063477, + "step": 4208, + "time_per_iteration": 2.59245228767395 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044626, + "balance_loss_mlp": 1.00459588, + "epoch": 0.8097345132743363, + "flos": 626284558848.0, + "grad_norm": 0.03435821442590694, + "language_loss": 0.81210512, + "learning_rate": 9.196861401017164e-05, + "loss": 0.82255137, + "num_input_tokens_seen": 349080544, + "router_z_loss_mlp": 0.40014648, + "step": 4209, + "time_per_iteration": 2.7602193355560303 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104591, + "balance_loss_mlp": 1.00590312, + "epoch": 0.8099268949595998, + "flos": 616873944576.0, + "grad_norm": 0.03716472832486093, + "language_loss": 0.79843062, + "learning_rate": 9.178863321264475e-05, + "loss": 0.80888975, + "num_input_tokens_seen": 349159072, + "router_z_loss_mlp": 0.39990234, + "step": 4210, + "time_per_iteration": 2.8025641441345215 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043825, + "balance_loss_mlp": 1.00381863, + "epoch": 0.8101192766448634, + "flos": 480684932352.0, + "grad_norm": 0.0329328024402014, + "language_loss": 0.80138117, + "learning_rate": 9.160881089682566e-05, + "loss": 0.81181943, + "num_input_tokens_seen": 349230176, + "router_z_loss_mlp": 0.39990234, + "step": 4211, + "time_per_iteration": 2.6329565048217773 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044763, + "balance_loss_mlp": 1.00482833, + "epoch": 0.810311658330127, + "flos": 518327389440.0, + "grad_norm": 0.03337868417274248, + "language_loss": 0.86965179, + "learning_rate": 9.142914713252725e-05, + "loss": 0.88009942, + "num_input_tokens_seen": 349299760, + "router_z_loss_mlp": 0.39916992, + "step": 4212, + "time_per_iteration": 2.6201486587524414 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042658, + "balance_loss_mlp": 1.00274634, + "epoch": 0.8105040400153906, + "flos": 576988921344.0, + "grad_norm": 0.02936598323523461, + "language_loss": 0.84615433, + "learning_rate": 9.124964198950159e-05, + "loss": 0.85658085, + "num_input_tokens_seen": 349379712, + "router_z_loss_mlp": 0.39892578, + "step": 4213, + "time_per_iteration": 2.7945985794067383 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043338, + "balance_loss_mlp": 1.00345039, + "epoch": 0.8106964217006541, + "flos": 640189053696.0, + "grad_norm": 0.03756812888703321, + "language_loss": 0.8566975, + "learning_rate": 9.107029553743862e-05, + "loss": 0.86713088, + "num_input_tokens_seen": 349460320, + "router_z_loss_mlp": 0.39868164, + "step": 4214, + "time_per_iteration": 2.809328317642212 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044171, + "balance_loss_mlp": 1.00428391, + "epoch": 0.8108888033859176, + "flos": 580585584384.0, + "grad_norm": 0.0376800294588735, + "language_loss": 0.81953692, + "learning_rate": 9.089110784596672e-05, + "loss": 0.82997859, + "num_input_tokens_seen": 349527648, + "router_z_loss_mlp": 0.39868164, + "step": 4215, + "time_per_iteration": 2.652230978012085 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042383, + "balance_loss_mlp": 1.00242364, + "epoch": 0.8110811850711812, + "flos": 561091681536.0, + "grad_norm": 0.03446014791580575, + "language_loss": 0.83807087, + "learning_rate": 9.071207898465284e-05, + "loss": 0.84849465, + "num_input_tokens_seen": 349606912, + "router_z_loss_mlp": 0.39941406, + "step": 4216, + "time_per_iteration": 2.785763740539551 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047291, + "balance_loss_mlp": 1.00790405, + "epoch": 0.8112735667564448, + "flos": 1521069543936.0, + "grad_norm": 0.007281864777553036, + "language_loss": 0.77260417, + "learning_rate": 9.053320902300205e-05, + "loss": 0.78307706, + "num_input_tokens_seen": 349827040, + "router_z_loss_mlp": 0.39355469, + "step": 4217, + "time_per_iteration": 4.774747848510742 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043843, + "balance_loss_mlp": 1.00388443, + "epoch": 0.8114659484417084, + "flos": 617516483328.0, + "grad_norm": 0.03805694470042781, + "language_loss": 0.86072737, + "learning_rate": 9.035449803045792e-05, + "loss": 0.87116575, + "num_input_tokens_seen": 349900080, + "router_z_loss_mlp": 0.39941406, + "step": 4218, + "time_per_iteration": 2.7768678665161133 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104246, + "balance_loss_mlp": 1.00259662, + "epoch": 0.8116583301269719, + "flos": 651262878720.0, + "grad_norm": 0.030415189633352945, + "language_loss": 0.79453695, + "learning_rate": 9.017594607640211e-05, + "loss": 0.80496156, + "num_input_tokens_seen": 349983568, + "router_z_loss_mlp": 0.3984375, + "step": 4219, + "time_per_iteration": 2.9257187843322754 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042621, + "balance_loss_mlp": 1.00273395, + "epoch": 0.8118507118122354, + "flos": 554196787200.0, + "grad_norm": 0.03896243269868023, + "language_loss": 0.80358791, + "learning_rate": 8.999755323015463e-05, + "loss": 0.81401414, + "num_input_tokens_seen": 350054928, + "router_z_loss_mlp": 0.39868164, + "step": 4220, + "time_per_iteration": 2.715939521789551 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044268, + "balance_loss_mlp": 1.00440466, + "epoch": 0.812043093497499, + "flos": 545178890496.0, + "grad_norm": 0.04144683362187673, + "language_loss": 0.87743199, + "learning_rate": 8.981931956097384e-05, + "loss": 0.88787466, + "num_input_tokens_seen": 350127872, + "router_z_loss_mlp": 0.3984375, + "step": 4221, + "time_per_iteration": 2.6466968059539795 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104378, + "balance_loss_mlp": 1.0038445, + "epoch": 0.8122354751827626, + "flos": 584575931904.0, + "grad_norm": 0.0323625870343774, + "language_loss": 0.84112966, + "learning_rate": 8.964124513805628e-05, + "loss": 0.85156739, + "num_input_tokens_seen": 350206592, + "router_z_loss_mlp": 0.39916992, + "step": 4222, + "time_per_iteration": 2.7797539234161377 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046749, + "balance_loss_mlp": 1.00736237, + "epoch": 0.8124278568680262, + "flos": 1533862960128.0, + "grad_norm": 0.005674804601180374, + "language_loss": 0.78250074, + "learning_rate": 8.94633300305363e-05, + "loss": 0.79296821, + "num_input_tokens_seen": 350436048, + "router_z_loss_mlp": 0.39355469, + "step": 4223, + "time_per_iteration": 4.989461660385132 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104717, + "balance_loss_mlp": 1.00716376, + "epoch": 0.8126202385532897, + "flos": 433767954432.0, + "grad_norm": 0.03852069724209621, + "language_loss": 0.80558193, + "learning_rate": 8.928557430748668e-05, + "loss": 0.81605363, + "num_input_tokens_seen": 350501376, + "router_z_loss_mlp": 0.39990234, + "step": 4224, + "time_per_iteration": 2.5844249725341797 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048344, + "balance_loss_mlp": 1.00914764, + "epoch": 0.8128126202385533, + "flos": 1551149289984.0, + "grad_norm": 0.0078100354933002825, + "language_loss": 0.76495624, + "learning_rate": 8.910797803791854e-05, + "loss": 0.77543974, + "num_input_tokens_seen": 350735232, + "router_z_loss_mlp": 0.39160156, + "step": 4225, + "time_per_iteration": 4.842838287353516 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044264, + "balance_loss_mlp": 1.00437629, + "epoch": 0.8130050019238169, + "flos": 529338031104.0, + "grad_norm": 0.033874870691741325, + "language_loss": 0.89490134, + "learning_rate": 8.893054129078077e-05, + "loss": 0.90534395, + "num_input_tokens_seen": 350805088, + "router_z_loss_mlp": 0.39868164, + "step": 4226, + "time_per_iteration": 2.643038749694824 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046361, + "balance_loss_mlp": 1.00647342, + "epoch": 0.8131973836090804, + "flos": 544228204800.0, + "grad_norm": 0.040352466131287415, + "language_loss": 0.80754006, + "learning_rate": 8.875326413496037e-05, + "loss": 0.81800371, + "num_input_tokens_seen": 350876896, + "router_z_loss_mlp": 0.39868164, + "step": 4227, + "time_per_iteration": 2.749776601791382 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046115, + "balance_loss_mlp": 1.00613213, + "epoch": 0.8133897652943439, + "flos": 577578970368.0, + "grad_norm": 0.03996024422287757, + "language_loss": 0.82968926, + "learning_rate": 8.857614663928249e-05, + "loss": 0.84015042, + "num_input_tokens_seen": 350948400, + "router_z_loss_mlp": 0.3996582, + "step": 4228, + "time_per_iteration": 2.7195777893066406 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045285, + "balance_loss_mlp": 1.00532568, + "epoch": 0.8135821469796075, + "flos": 580351314432.0, + "grad_norm": 0.039368139599927306, + "language_loss": 0.79510874, + "learning_rate": 8.839918887251025e-05, + "loss": 0.8055616, + "num_input_tokens_seen": 351023328, + "router_z_loss_mlp": 0.39941406, + "step": 4229, + "time_per_iteration": 2.764267921447754 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010458, + "balance_loss_mlp": 1.00579298, + "epoch": 0.8137745286648711, + "flos": 651644902656.0, + "grad_norm": 0.033713073313620584, + "language_loss": 0.84232569, + "learning_rate": 8.822239090334472e-05, + "loss": 0.85278368, + "num_input_tokens_seen": 351108672, + "router_z_loss_mlp": 0.39990234, + "step": 4230, + "time_per_iteration": 2.923346757888794 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045567, + "balance_loss_mlp": 1.00553608, + "epoch": 0.8139669103501347, + "flos": 703128671232.0, + "grad_norm": 0.036115570851931435, + "language_loss": 0.76082253, + "learning_rate": 8.804575280042493e-05, + "loss": 0.77127826, + "num_input_tokens_seen": 351185056, + "router_z_loss_mlp": 0.40014648, + "step": 4231, + "time_per_iteration": 2.955892562866211 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044741, + "balance_loss_mlp": 1.00483012, + "epoch": 0.8141592920353983, + "flos": 651388278528.0, + "grad_norm": 0.04294462477319246, + "language_loss": 0.83589506, + "learning_rate": 8.786927463232774e-05, + "loss": 0.84634244, + "num_input_tokens_seen": 351255856, + "router_z_loss_mlp": 0.39892578, + "step": 4232, + "time_per_iteration": 2.7943365573883057 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044397, + "balance_loss_mlp": 1.0044142, + "epoch": 0.8143516737206618, + "flos": 537845591808.0, + "grad_norm": 0.03939422640128119, + "language_loss": 0.816208, + "learning_rate": 8.769295646756853e-05, + "loss": 0.82665199, + "num_input_tokens_seen": 351322336, + "router_z_loss_mlp": 0.3996582, + "step": 4233, + "time_per_iteration": 2.6279783248901367 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104504, + "balance_loss_mlp": 1.00510478, + "epoch": 0.8145440554059253, + "flos": 509363927808.0, + "grad_norm": 0.0369441276850866, + "language_loss": 0.83043873, + "learning_rate": 8.751679837459963e-05, + "loss": 0.84088916, + "num_input_tokens_seen": 351387440, + "router_z_loss_mlp": 0.39916992, + "step": 4234, + "time_per_iteration": 2.5817222595214844 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045045, + "balance_loss_mlp": 1.00513363, + "epoch": 0.8147364370911889, + "flos": 636288134400.0, + "grad_norm": 0.034229096047118546, + "language_loss": 0.8689273, + "learning_rate": 8.734080042181181e-05, + "loss": 0.87937772, + "num_input_tokens_seen": 351464192, + "router_z_loss_mlp": 0.39892578, + "step": 4235, + "time_per_iteration": 2.8325142860412598 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045233, + "balance_loss_mlp": 1.0052501, + "epoch": 0.8149288187764525, + "flos": 423706053120.0, + "grad_norm": 0.037533460123593716, + "language_loss": 0.78556967, + "learning_rate": 8.716496267753343e-05, + "loss": 0.79602206, + "num_input_tokens_seen": 351528016, + "router_z_loss_mlp": 0.3996582, + "step": 4236, + "time_per_iteration": 2.4753267765045166 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045064, + "balance_loss_mlp": 1.0051285, + "epoch": 0.8151212004617161, + "flos": 598621377792.0, + "grad_norm": 0.03507677024776033, + "language_loss": 0.82292378, + "learning_rate": 8.698928521003097e-05, + "loss": 0.83337444, + "num_input_tokens_seen": 351601648, + "router_z_loss_mlp": 0.39916992, + "step": 4237, + "time_per_iteration": 2.8309948444366455 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042789, + "balance_loss_mlp": 1.00378418, + "epoch": 0.8153135821469796, + "flos": 1482415163136.0, + "grad_norm": 0.004372930675089117, + "language_loss": 0.77852845, + "learning_rate": 8.681376808750835e-05, + "loss": 0.78895634, + "num_input_tokens_seen": 351826720, + "router_z_loss_mlp": 0.38964844, + "step": 4238, + "time_per_iteration": 5.034019231796265 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043236, + "balance_loss_mlp": 1.00327706, + "epoch": 0.8155059638322432, + "flos": 438012013824.0, + "grad_norm": 0.05133658449911632, + "language_loss": 0.83284819, + "learning_rate": 8.663841137810741e-05, + "loss": 0.84328049, + "num_input_tokens_seen": 351891760, + "router_z_loss_mlp": 0.39941406, + "step": 4239, + "time_per_iteration": 2.4995291233062744 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043057, + "balance_loss_mlp": 1.0030508, + "epoch": 0.8156983455175068, + "flos": 795820445952.0, + "grad_norm": 0.03774301364361203, + "language_loss": 0.85754836, + "learning_rate": 8.646321514990763e-05, + "loss": 0.86797893, + "num_input_tokens_seen": 351977504, + "router_z_loss_mlp": 0.39990234, + "step": 4240, + "time_per_iteration": 3.0334153175354004 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044039, + "balance_loss_mlp": 1.00405586, + "epoch": 0.8158907272027703, + "flos": 687194492928.0, + "grad_norm": 0.0362603344870212, + "language_loss": 0.82029748, + "learning_rate": 8.628817947092616e-05, + "loss": 0.83073783, + "num_input_tokens_seen": 352050176, + "router_z_loss_mlp": 0.3996582, + "step": 4241, + "time_per_iteration": 2.873093843460083 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010462, + "balance_loss_mlp": 1.00631261, + "epoch": 0.8160831088880338, + "flos": 488030870016.0, + "grad_norm": 0.04708907661610768, + "language_loss": 0.84995806, + "learning_rate": 8.611330440911797e-05, + "loss": 0.86041999, + "num_input_tokens_seen": 352116848, + "router_z_loss_mlp": 0.39868164, + "step": 4242, + "time_per_iteration": 2.61018967628479 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043886, + "balance_loss_mlp": 1.00390291, + "epoch": 0.8162754905732974, + "flos": 465822948864.0, + "grad_norm": 0.0364863486615585, + "language_loss": 0.80885643, + "learning_rate": 8.593859003237558e-05, + "loss": 0.81929529, + "num_input_tokens_seen": 352185056, + "router_z_loss_mlp": 0.3996582, + "step": 4243, + "time_per_iteration": 2.594784736633301 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043095, + "balance_loss_mlp": 1.00408936, + "epoch": 0.816467872258561, + "flos": 1242145601280.0, + "grad_norm": 0.0055455657933980934, + "language_loss": 0.75285125, + "learning_rate": 8.576403640852904e-05, + "loss": 0.76328218, + "num_input_tokens_seen": 352397648, + "router_z_loss_mlp": 0.38964844, + "step": 4244, + "time_per_iteration": 4.718213081359863 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104301, + "balance_loss_mlp": 1.00290811, + "epoch": 0.8166602539438246, + "flos": 688403748096.0, + "grad_norm": 0.03188594993660783, + "language_loss": 0.8693856, + "learning_rate": 8.558964360534615e-05, + "loss": 0.8798157, + "num_input_tokens_seen": 352478272, + "router_z_loss_mlp": 0.40087891, + "step": 4245, + "time_per_iteration": 2.9532947540283203 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043255, + "balance_loss_mlp": 1.00424957, + "epoch": 0.8168526356290882, + "flos": 1493919611136.0, + "grad_norm": 0.0050365971652065996, + "language_loss": 0.72974741, + "learning_rate": 8.541541169053219e-05, + "loss": 0.74017996, + "num_input_tokens_seen": 352707104, + "router_z_loss_mlp": 0.38964844, + "step": 4246, + "time_per_iteration": 4.9726879596710205 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044717, + "balance_loss_mlp": 1.00471079, + "epoch": 0.8170450173143516, + "flos": 579300506880.0, + "grad_norm": 0.03921262861389307, + "language_loss": 0.84971178, + "learning_rate": 8.524134073172984e-05, + "loss": 0.86015892, + "num_input_tokens_seen": 352779248, + "router_z_loss_mlp": 0.39990234, + "step": 4247, + "time_per_iteration": 2.737804651260376 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044991, + "balance_loss_mlp": 1.00498414, + "epoch": 0.8172373989996152, + "flos": 572438660352.0, + "grad_norm": 0.034223737538548314, + "language_loss": 0.85163987, + "learning_rate": 8.506743079651974e-05, + "loss": 0.86208975, + "num_input_tokens_seen": 352856784, + "router_z_loss_mlp": 0.39990234, + "step": 4248, + "time_per_iteration": 2.743518352508545 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045023, + "balance_loss_mlp": 1.00501621, + "epoch": 0.8174297806848788, + "flos": 529859060736.0, + "grad_norm": 0.04399953494353778, + "language_loss": 0.8105247, + "learning_rate": 8.489368195241948e-05, + "loss": 0.82097489, + "num_input_tokens_seen": 352926496, + "router_z_loss_mlp": 0.39990234, + "step": 4249, + "time_per_iteration": 2.6323330402374268 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044479, + "balance_loss_mlp": 1.00451958, + "epoch": 0.8176221623701424, + "flos": 570269971200.0, + "grad_norm": 0.038692902180605414, + "language_loss": 0.79434025, + "learning_rate": 8.47200942668846e-05, + "loss": 0.80478501, + "num_input_tokens_seen": 353005312, + "router_z_loss_mlp": 0.39941406, + "step": 4250, + "time_per_iteration": 2.815521240234375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045712, + "balance_loss_mlp": 1.00572968, + "epoch": 0.8178145440554059, + "flos": 657707708160.0, + "grad_norm": 0.043415035680912505, + "language_loss": 0.80942428, + "learning_rate": 8.454666780730735e-05, + "loss": 0.81988138, + "num_input_tokens_seen": 353085120, + "router_z_loss_mlp": 0.3996582, + "step": 4251, + "time_per_iteration": 2.8530821800231934 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045042, + "balance_loss_mlp": 1.00503504, + "epoch": 0.8180069257406695, + "flos": 547056929280.0, + "grad_norm": 0.03736687564854558, + "language_loss": 0.88220131, + "learning_rate": 8.437340264101828e-05, + "loss": 0.8926518, + "num_input_tokens_seen": 353160992, + "router_z_loss_mlp": 0.39990234, + "step": 4252, + "time_per_iteration": 2.708724021911621 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044126, + "balance_loss_mlp": 1.004143, + "epoch": 0.818199307425933, + "flos": 620412281856.0, + "grad_norm": 0.03556485769365952, + "language_loss": 0.85256195, + "learning_rate": 8.420029883528474e-05, + "loss": 0.86300319, + "num_input_tokens_seen": 353233328, + "router_z_loss_mlp": 0.3996582, + "step": 4253, + "time_per_iteration": 2.7210686206817627 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044408, + "balance_loss_mlp": 1.00437737, + "epoch": 0.8183916891111966, + "flos": 648935741952.0, + "grad_norm": 0.0363052045214381, + "language_loss": 0.77293622, + "learning_rate": 8.402735645731157e-05, + "loss": 0.78338039, + "num_input_tokens_seen": 353310592, + "router_z_loss_mlp": 0.40014648, + "step": 4254, + "time_per_iteration": 2.8884494304656982 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044364, + "balance_loss_mlp": 1.00440454, + "epoch": 0.8185840707964602, + "flos": 500103012864.0, + "grad_norm": 0.03824906610181014, + "language_loss": 0.78647155, + "learning_rate": 8.385457557424098e-05, + "loss": 0.79691517, + "num_input_tokens_seen": 353376544, + "router_z_loss_mlp": 0.39941406, + "step": 4255, + "time_per_iteration": 2.5975873470306396 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045347, + "balance_loss_mlp": 1.00541151, + "epoch": 0.8187764524817237, + "flos": 787612283904.0, + "grad_norm": 0.03247097866724817, + "language_loss": 0.8011173, + "learning_rate": 8.368195625315251e-05, + "loss": 0.81157076, + "num_input_tokens_seen": 353461200, + "router_z_loss_mlp": 0.39916992, + "step": 4256, + "time_per_iteration": 3.1064188480377197 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044034, + "balance_loss_mlp": 1.00409889, + "epoch": 0.8189688341669873, + "flos": 551787025152.0, + "grad_norm": 0.03028491007701996, + "language_loss": 0.81150901, + "learning_rate": 8.350949856106283e-05, + "loss": 0.82194936, + "num_input_tokens_seen": 353538608, + "router_z_loss_mlp": 0.39916992, + "step": 4257, + "time_per_iteration": 2.832043409347534 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047611, + "balance_loss_mlp": 1.00860596, + "epoch": 0.8191612158522509, + "flos": 1354883410944.0, + "grad_norm": 0.005783385057534148, + "language_loss": 0.71149343, + "learning_rate": 8.333720256492599e-05, + "loss": 0.72196954, + "num_input_tokens_seen": 353766960, + "router_z_loss_mlp": 0.38964844, + "step": 4258, + "time_per_iteration": 4.853669881820679 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045252, + "balance_loss_mlp": 1.00529289, + "epoch": 0.8193535975375145, + "flos": 545300399616.0, + "grad_norm": 0.04084284494867611, + "language_loss": 0.84286118, + "learning_rate": 8.316506833163318e-05, + "loss": 0.85331368, + "num_input_tokens_seen": 353833552, + "router_z_loss_mlp": 0.39941406, + "step": 4259, + "time_per_iteration": 2.628735065460205 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045378, + "balance_loss_mlp": 1.0054425, + "epoch": 0.8195459792227779, + "flos": 867228740352.0, + "grad_norm": 0.029318190080420886, + "language_loss": 0.8587026, + "learning_rate": 8.299309592801297e-05, + "loss": 0.86915636, + "num_input_tokens_seen": 353915520, + "router_z_loss_mlp": 0.39916992, + "step": 4260, + "time_per_iteration": 3.0968639850616455 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043987, + "balance_loss_mlp": 1.00400376, + "epoch": 0.8197383609080415, + "flos": 570410922240.0, + "grad_norm": 0.03757495975364379, + "language_loss": 0.82012129, + "learning_rate": 8.282128542083101e-05, + "loss": 0.83056116, + "num_input_tokens_seen": 353992048, + "router_z_loss_mlp": 0.3996582, + "step": 4261, + "time_per_iteration": 2.7136785984039307 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044401, + "balance_loss_mlp": 1.0044421, + "epoch": 0.8199307425933051, + "flos": 531886798848.0, + "grad_norm": 0.03681727702304477, + "language_loss": 0.85360086, + "learning_rate": 8.264963687678978e-05, + "loss": 0.8640449, + "num_input_tokens_seen": 354064848, + "router_z_loss_mlp": 0.39941406, + "step": 4262, + "time_per_iteration": 2.646632432937622 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045024, + "balance_loss_mlp": 1.00508881, + "epoch": 0.8201231242785687, + "flos": 568231539456.0, + "grad_norm": 0.034428735556058375, + "language_loss": 0.85892022, + "learning_rate": 8.247815036252921e-05, + "loss": 0.86937046, + "num_input_tokens_seen": 354138848, + "router_z_loss_mlp": 0.39916992, + "step": 4263, + "time_per_iteration": 2.7189278602600098 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044973, + "balance_loss_mlp": 1.00503826, + "epoch": 0.8203155059638323, + "flos": 1232385055488.0, + "grad_norm": 0.043350247910763196, + "language_loss": 0.83505571, + "learning_rate": 8.230682594462652e-05, + "loss": 0.84550548, + "num_input_tokens_seen": 354227696, + "router_z_loss_mlp": 0.39916992, + "step": 4264, + "time_per_iteration": 3.5374419689178467 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044594, + "balance_loss_mlp": 1.00456357, + "epoch": 0.8205078876490958, + "flos": 575280023808.0, + "grad_norm": 0.03192227347796584, + "language_loss": 0.80383801, + "learning_rate": 8.213566368959558e-05, + "loss": 0.81428391, + "num_input_tokens_seen": 354298400, + "router_z_loss_mlp": 0.40014648, + "step": 4265, + "time_per_iteration": 2.681304931640625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044169, + "balance_loss_mlp": 1.00418675, + "epoch": 0.8207002693343594, + "flos": 932986388736.0, + "grad_norm": 0.03621668430838832, + "language_loss": 0.79055989, + "learning_rate": 8.196466366388744e-05, + "loss": 0.80100161, + "num_input_tokens_seen": 354385024, + "router_z_loss_mlp": 0.3996582, + "step": 4266, + "time_per_iteration": 3.221090316772461 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045676, + "balance_loss_mlp": 1.00578809, + "epoch": 0.8208926510196229, + "flos": 550660395264.0, + "grad_norm": 0.04514159408391212, + "language_loss": 0.81006944, + "learning_rate": 8.179382593389029e-05, + "loss": 0.82052624, + "num_input_tokens_seen": 354456384, + "router_z_loss_mlp": 0.39868164, + "step": 4267, + "time_per_iteration": 2.64736270904541 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045126, + "balance_loss_mlp": 1.0050478, + "epoch": 0.8210850327048865, + "flos": 649413030144.0, + "grad_norm": 0.03228047800877003, + "language_loss": 0.82577145, + "learning_rate": 8.162315056592918e-05, + "loss": 0.83622265, + "num_input_tokens_seen": 354531296, + "router_z_loss_mlp": 0.40063477, + "step": 4268, + "time_per_iteration": 2.82000994682312 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046586, + "balance_loss_mlp": 1.00669813, + "epoch": 0.82127741439015, + "flos": 602698241280.0, + "grad_norm": 0.03546193069409799, + "language_loss": 0.81945211, + "learning_rate": 8.145263762626615e-05, + "loss": 0.82991797, + "num_input_tokens_seen": 354605680, + "router_z_loss_mlp": 0.39868164, + "step": 4269, + "time_per_iteration": 2.735588550567627 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046582, + "balance_loss_mlp": 1.00664651, + "epoch": 0.8214697960754136, + "flos": 475854714624.0, + "grad_norm": 0.033877079052907766, + "language_loss": 0.84021544, + "learning_rate": 8.128228718110015e-05, + "loss": 0.85068125, + "num_input_tokens_seen": 354678160, + "router_z_loss_mlp": 0.39916992, + "step": 4270, + "time_per_iteration": 2.667363166809082 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045275, + "balance_loss_mlp": 1.00529253, + "epoch": 0.8216621777606772, + "flos": 905094773760.0, + "grad_norm": 0.03887576075130339, + "language_loss": 0.85440713, + "learning_rate": 8.11120992965671e-05, + "loss": 0.86485988, + "num_input_tokens_seen": 354751024, + "router_z_loss_mlp": 0.3996582, + "step": 4271, + "time_per_iteration": 3.068880558013916 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044467, + "balance_loss_mlp": 1.00446057, + "epoch": 0.8218545594459408, + "flos": 515496719616.0, + "grad_norm": 0.035519082612497935, + "language_loss": 0.82364231, + "learning_rate": 8.094207403873998e-05, + "loss": 0.83408695, + "num_input_tokens_seen": 354819408, + "router_z_loss_mlp": 0.39990234, + "step": 4272, + "time_per_iteration": 2.6724655628204346 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044288, + "balance_loss_mlp": 1.00425744, + "epoch": 0.8220469411312044, + "flos": 495559554816.0, + "grad_norm": 0.0323215818844035, + "language_loss": 0.86376536, + "learning_rate": 8.077221147362829e-05, + "loss": 0.87420821, + "num_input_tokens_seen": 354887376, + "router_z_loss_mlp": 0.40014648, + "step": 4273, + "time_per_iteration": 2.6066205501556396 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044867, + "balance_loss_mlp": 1.00486076, + "epoch": 0.8222393228164678, + "flos": 387276741888.0, + "grad_norm": 0.04119506633036295, + "language_loss": 0.90363312, + "learning_rate": 8.060251166717835e-05, + "loss": 0.91408181, + "num_input_tokens_seen": 354948288, + "router_z_loss_mlp": 0.39990234, + "step": 4274, + "time_per_iteration": 2.4332804679870605 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104479, + "balance_loss_mlp": 1.00485444, + "epoch": 0.8224317045017314, + "flos": 537630763776.0, + "grad_norm": 0.03442772169242134, + "language_loss": 0.87371385, + "learning_rate": 8.043297468527383e-05, + "loss": 0.88416171, + "num_input_tokens_seen": 355016912, + "router_z_loss_mlp": 0.39916992, + "step": 4275, + "time_per_iteration": 2.6473186016082764 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043826, + "balance_loss_mlp": 1.00384283, + "epoch": 0.822624086186995, + "flos": 555948459264.0, + "grad_norm": 0.03742835179571848, + "language_loss": 0.8278271, + "learning_rate": 8.02636005937346e-05, + "loss": 0.8382653, + "num_input_tokens_seen": 355085936, + "router_z_loss_mlp": 0.3996582, + "step": 4276, + "time_per_iteration": 2.646347999572754 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044148, + "balance_loss_mlp": 1.00418925, + "epoch": 0.8228164678722586, + "flos": 540718057728.0, + "grad_norm": 0.032194648588505737, + "language_loss": 0.80336571, + "learning_rate": 8.009438945831771e-05, + "loss": 0.81380719, + "num_input_tokens_seen": 355161984, + "router_z_loss_mlp": 0.39941406, + "step": 4277, + "time_per_iteration": 2.7481751441955566 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045066, + "balance_loss_mlp": 1.00517821, + "epoch": 0.8230088495575221, + "flos": 474263435520.0, + "grad_norm": 0.04222743851278786, + "language_loss": 0.79908466, + "learning_rate": 7.992534134471641e-05, + "loss": 0.80953538, + "num_input_tokens_seen": 355234544, + "router_z_loss_mlp": 0.39868164, + "step": 4278, + "time_per_iteration": 2.649730920791626 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045006, + "balance_loss_mlp": 1.00504684, + "epoch": 0.8232012312427857, + "flos": 592751046144.0, + "grad_norm": 0.03975207751077369, + "language_loss": 0.83853042, + "learning_rate": 7.975645631856127e-05, + "loss": 0.84898043, + "num_input_tokens_seen": 355302896, + "router_z_loss_mlp": 0.39941406, + "step": 4279, + "time_per_iteration": 2.6517245769500732 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104457, + "balance_loss_mlp": 1.00475395, + "epoch": 0.8233936129280492, + "flos": 573788866560.0, + "grad_norm": 0.034088399185727584, + "language_loss": 0.75156295, + "learning_rate": 7.958773444541916e-05, + "loss": 0.76200867, + "num_input_tokens_seen": 355377040, + "router_z_loss_mlp": 0.39794922, + "step": 4280, + "time_per_iteration": 2.790695905685425 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044815, + "balance_loss_mlp": 1.0049988, + "epoch": 0.8235859946133128, + "flos": 732750571008.0, + "grad_norm": 0.030832979934739466, + "language_loss": 0.78604949, + "learning_rate": 7.941917579079383e-05, + "loss": 0.79649758, + "num_input_tokens_seen": 355461616, + "router_z_loss_mlp": 0.39794922, + "step": 4281, + "time_per_iteration": 2.999879837036133 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043865, + "balance_loss_mlp": 1.0039773, + "epoch": 0.8237783762985764, + "flos": 571398546432.0, + "grad_norm": 0.03880920292514566, + "language_loss": 0.82002759, + "learning_rate": 7.92507804201253e-05, + "loss": 0.83046621, + "num_input_tokens_seen": 355532480, + "router_z_loss_mlp": 0.39868164, + "step": 4282, + "time_per_iteration": 2.677208423614502 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104578, + "balance_loss_mlp": 1.0067749, + "epoch": 0.8239707579838399, + "flos": 1469427327744.0, + "grad_norm": 0.006396202661854135, + "language_loss": 0.75297678, + "learning_rate": 7.908254839879092e-05, + "loss": 0.76343453, + "num_input_tokens_seen": 355768752, + "router_z_loss_mlp": 0.38964844, + "step": 4283, + "time_per_iteration": 4.968418121337891 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010434, + "balance_loss_mlp": 1.00370288, + "epoch": 0.8241631396691035, + "flos": 468297839616.0, + "grad_norm": 0.046097230790764596, + "language_loss": 0.8120932, + "learning_rate": 7.89144797921037e-05, + "loss": 0.82252717, + "num_input_tokens_seen": 355838800, + "router_z_loss_mlp": 0.39672852, + "step": 4284, + "time_per_iteration": 2.6612024307250977 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044239, + "balance_loss_mlp": 1.0054245, + "epoch": 0.8243555213543671, + "flos": 1542552301056.0, + "grad_norm": 0.004349611814925143, + "language_loss": 0.77934271, + "learning_rate": 7.874657466531388e-05, + "loss": 0.78978509, + "num_input_tokens_seen": 356069280, + "router_z_loss_mlp": 0.38769531, + "step": 4285, + "time_per_iteration": 4.981449842453003 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045614, + "balance_loss_mlp": 1.00584543, + "epoch": 0.8245479030396307, + "flos": 798863998464.0, + "grad_norm": 0.03989950535073509, + "language_loss": 0.83088112, + "learning_rate": 7.85788330836078e-05, + "loss": 0.8413372, + "num_input_tokens_seen": 356164528, + "router_z_loss_mlp": 0.39746094, + "step": 4286, + "time_per_iteration": 3.1188526153564453 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045534, + "balance_loss_mlp": 1.00571775, + "epoch": 0.8247402847248941, + "flos": 647400843264.0, + "grad_norm": 0.03590906041018328, + "language_loss": 0.76881772, + "learning_rate": 7.841125511210878e-05, + "loss": 0.77927309, + "num_input_tokens_seen": 356243600, + "router_z_loss_mlp": 0.39794922, + "step": 4287, + "time_per_iteration": 2.9174938201904297 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044954, + "balance_loss_mlp": 1.00511408, + "epoch": 0.8249326664101577, + "flos": 605620284672.0, + "grad_norm": 0.03219362212927226, + "language_loss": 0.80230033, + "learning_rate": 7.824384081587637e-05, + "loss": 0.81274986, + "num_input_tokens_seen": 356320320, + "router_z_loss_mlp": 0.39819336, + "step": 4288, + "time_per_iteration": 2.795452833175659 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044781, + "balance_loss_mlp": 1.00496483, + "epoch": 0.8251250480954213, + "flos": 825828260352.0, + "grad_norm": 0.041963910969405445, + "language_loss": 0.86787474, + "learning_rate": 7.807659025990637e-05, + "loss": 0.87832254, + "num_input_tokens_seen": 356406928, + "router_z_loss_mlp": 0.39794922, + "step": 4289, + "time_per_iteration": 3.134443759918213 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042434, + "balance_loss_mlp": 1.00264227, + "epoch": 0.8253174297806849, + "flos": 758676664320.0, + "grad_norm": 0.039060289997601944, + "language_loss": 0.78858769, + "learning_rate": 7.790950350913112e-05, + "loss": 0.79901201, + "num_input_tokens_seen": 356481456, + "router_z_loss_mlp": 0.39770508, + "step": 4290, + "time_per_iteration": 2.944941520690918 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042771, + "balance_loss_mlp": 1.00305033, + "epoch": 0.8255098114659485, + "flos": 795994444800.0, + "grad_norm": 0.037141819973277965, + "language_loss": 0.87861943, + "learning_rate": 7.774258062841971e-05, + "loss": 0.88904715, + "num_input_tokens_seen": 356568736, + "router_z_loss_mlp": 0.39697266, + "step": 4291, + "time_per_iteration": 3.1870129108428955 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043682, + "balance_loss_mlp": 1.0038898, + "epoch": 0.825702193151212, + "flos": 711681918720.0, + "grad_norm": 0.035866698346178935, + "language_loss": 0.7753849, + "learning_rate": 7.757582168257731e-05, + "loss": 0.7858218, + "num_input_tokens_seen": 356643328, + "router_z_loss_mlp": 0.39770508, + "step": 4292, + "time_per_iteration": 2.85308575630188 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043056, + "balance_loss_mlp": 1.00321639, + "epoch": 0.8258945748364755, + "flos": 684670024704.0, + "grad_norm": 0.03268721539583558, + "language_loss": 0.81239599, + "learning_rate": 7.740922673634537e-05, + "loss": 0.82282656, + "num_input_tokens_seen": 356723824, + "router_z_loss_mlp": 0.39819336, + "step": 4293, + "time_per_iteration": 2.9332666397094727 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043306, + "balance_loss_mlp": 1.00339425, + "epoch": 0.8260869565217391, + "flos": 595681837824.0, + "grad_norm": 0.03866521927101234, + "language_loss": 0.79496479, + "learning_rate": 7.724279585440186e-05, + "loss": 0.80539787, + "num_input_tokens_seen": 356796512, + "router_z_loss_mlp": 0.39892578, + "step": 4294, + "time_per_iteration": 2.710196018218994 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044652, + "balance_loss_mlp": 1.00466919, + "epoch": 0.8262793382070027, + "flos": 652653914112.0, + "grad_norm": 0.035640366708924454, + "language_loss": 0.85982841, + "learning_rate": 7.707652910136098e-05, + "loss": 0.8702749, + "num_input_tokens_seen": 356868624, + "router_z_loss_mlp": 0.3996582, + "step": 4295, + "time_per_iteration": 2.7833869457244873 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046294, + "balance_loss_mlp": 1.00628734, + "epoch": 0.8264717198922663, + "flos": 539957900544.0, + "grad_norm": 0.03542923648415416, + "language_loss": 0.84949446, + "learning_rate": 7.691042654177315e-05, + "loss": 0.85995746, + "num_input_tokens_seen": 356934368, + "router_z_loss_mlp": 0.39990234, + "step": 4296, + "time_per_iteration": 2.6374382972717285 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044455, + "balance_loss_mlp": 1.00437677, + "epoch": 0.8266641015775298, + "flos": 539994839040.0, + "grad_norm": 0.04217853595107177, + "language_loss": 0.76282918, + "learning_rate": 7.674448824012514e-05, + "loss": 0.77327377, + "num_input_tokens_seen": 357005536, + "router_z_loss_mlp": 0.40063477, + "step": 4297, + "time_per_iteration": 2.7391257286071777 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046487, + "balance_loss_mlp": 1.0064801, + "epoch": 0.8268564832627934, + "flos": 586503548160.0, + "grad_norm": 0.03264457137254003, + "language_loss": 0.84539366, + "learning_rate": 7.657871426083979e-05, + "loss": 0.8558585, + "num_input_tokens_seen": 357082160, + "router_z_loss_mlp": 0.39990234, + "step": 4298, + "time_per_iteration": 2.7992489337921143 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045276, + "balance_loss_mlp": 1.00526941, + "epoch": 0.827048864948057, + "flos": 431571075072.0, + "grad_norm": 0.03940875322434759, + "language_loss": 0.84735167, + "learning_rate": 7.641310466827667e-05, + "loss": 0.85780442, + "num_input_tokens_seen": 357146928, + "router_z_loss_mlp": 0.39990234, + "step": 4299, + "time_per_iteration": 2.4731128215789795 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045385, + "balance_loss_mlp": 1.00542581, + "epoch": 0.8272412466333205, + "flos": 1390502032128.0, + "grad_norm": 0.03504583315652351, + "language_loss": 0.8553803, + "learning_rate": 7.624765952673069e-05, + "loss": 0.86583406, + "num_input_tokens_seen": 357236768, + "router_z_loss_mlp": 0.39941406, + "step": 4300, + "time_per_iteration": 3.7725141048431396 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044083, + "balance_loss_mlp": 1.00410068, + "epoch": 0.827433628318584, + "flos": 539350354944.0, + "grad_norm": 0.03755336156042174, + "language_loss": 0.83188915, + "learning_rate": 7.608237890043335e-05, + "loss": 0.84232998, + "num_input_tokens_seen": 357307568, + "router_z_loss_mlp": 0.3996582, + "step": 4301, + "time_per_iteration": 2.6834564208984375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043264, + "balance_loss_mlp": 1.00330544, + "epoch": 0.8276260100038476, + "flos": 732064290816.0, + "grad_norm": 0.03864569373591978, + "language_loss": 0.78056109, + "learning_rate": 7.59172628535526e-05, + "loss": 0.79099381, + "num_input_tokens_seen": 357387712, + "router_z_loss_mlp": 0.39941406, + "step": 4302, + "time_per_iteration": 2.933929920196533 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043683, + "balance_loss_mlp": 1.00372398, + "epoch": 0.8278183916891112, + "flos": 872662612992.0, + "grad_norm": 0.033669788377383415, + "language_loss": 0.82804894, + "learning_rate": 7.575231145019196e-05, + "loss": 0.83848584, + "num_input_tokens_seen": 357473360, + "router_z_loss_mlp": 0.39941406, + "step": 4303, + "time_per_iteration": 3.2120118141174316 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045387, + "balance_loss_mlp": 1.00554717, + "epoch": 0.8280107733743748, + "flos": 595699334400.0, + "grad_norm": 0.033210409698881456, + "language_loss": 0.78002685, + "learning_rate": 7.558752475439134e-05, + "loss": 0.79048073, + "num_input_tokens_seen": 357548432, + "router_z_loss_mlp": 0.39819336, + "step": 4304, + "time_per_iteration": 2.777714490890503 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104443, + "balance_loss_mlp": 1.00468552, + "epoch": 0.8282031550596384, + "flos": 770028500736.0, + "grad_norm": 0.03499833227551203, + "language_loss": 0.84353423, + "learning_rate": 7.542290283012653e-05, + "loss": 0.85397851, + "num_input_tokens_seen": 357625968, + "router_z_loss_mlp": 0.3972168, + "step": 4305, + "time_per_iteration": 3.0219714641571045 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045284, + "balance_loss_mlp": 1.00551581, + "epoch": 0.8283955367449019, + "flos": 697447889664.0, + "grad_norm": 0.03481801368106837, + "language_loss": 0.78346533, + "learning_rate": 7.525844574130947e-05, + "loss": 0.79391819, + "num_input_tokens_seen": 357705824, + "router_z_loss_mlp": 0.39746094, + "step": 4306, + "time_per_iteration": 2.9294331073760986 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045461, + "balance_loss_mlp": 1.00566852, + "epoch": 0.8285879184301654, + "flos": 661939128576.0, + "grad_norm": 0.04153040782978192, + "language_loss": 0.83166927, + "learning_rate": 7.509415355178806e-05, + "loss": 0.84212393, + "num_input_tokens_seen": 357787040, + "router_z_loss_mlp": 0.39770508, + "step": 4307, + "time_per_iteration": 2.9238927364349365 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010458, + "balance_loss_mlp": 1.0061034, + "epoch": 0.828780300115429, + "flos": 559773556224.0, + "grad_norm": 0.04048281455959126, + "language_loss": 0.78020167, + "learning_rate": 7.493002632534618e-05, + "loss": 0.79065967, + "num_input_tokens_seen": 357856960, + "router_z_loss_mlp": 0.39672852, + "step": 4308, + "time_per_iteration": 2.6687874794006348 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045287, + "balance_loss_mlp": 1.00547111, + "epoch": 0.8289726818006926, + "flos": 832373211648.0, + "grad_norm": 0.03463570750325123, + "language_loss": 0.8228085, + "learning_rate": 7.476606412570352e-05, + "loss": 0.83326137, + "num_input_tokens_seen": 357937760, + "router_z_loss_mlp": 0.39794922, + "step": 4309, + "time_per_iteration": 3.030407667160034 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045141, + "balance_loss_mlp": 1.00534868, + "epoch": 0.8291650634859561, + "flos": 733555448064.0, + "grad_norm": 0.0357732652689289, + "language_loss": 0.81524992, + "learning_rate": 7.460226701651624e-05, + "loss": 0.82570136, + "num_input_tokens_seen": 358012480, + "router_z_loss_mlp": 0.39770508, + "step": 4310, + "time_per_iteration": 2.9470043182373047 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104542, + "balance_loss_mlp": 1.00555694, + "epoch": 0.8293574451712197, + "flos": 862470454272.0, + "grad_norm": 0.03315775834141588, + "language_loss": 0.81509542, + "learning_rate": 7.443863506137566e-05, + "loss": 0.8255496, + "num_input_tokens_seen": 358100720, + "router_z_loss_mlp": 0.3984375, + "step": 4311, + "time_per_iteration": 3.262594223022461 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045102, + "balance_loss_mlp": 1.0053333, + "epoch": 0.8295498268564833, + "flos": 496291521792.0, + "grad_norm": 0.030294085038389356, + "language_loss": 0.82037485, + "learning_rate": 7.427516832380948e-05, + "loss": 0.83082587, + "num_input_tokens_seen": 358180496, + "router_z_loss_mlp": 0.39746094, + "step": 4312, + "time_per_iteration": 2.9078259468078613 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045472, + "balance_loss_mlp": 1.00570416, + "epoch": 0.8297422085417469, + "flos": 555655863552.0, + "grad_norm": 0.029980290167267002, + "language_loss": 0.78229713, + "learning_rate": 7.4111866867281e-05, + "loss": 0.79275185, + "num_input_tokens_seen": 358261104, + "router_z_loss_mlp": 0.39746094, + "step": 4313, + "time_per_iteration": 2.8011112213134766 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044992, + "balance_loss_mlp": 1.00529504, + "epoch": 0.8299345902270104, + "flos": 1249489605120.0, + "grad_norm": 0.0352855921199785, + "language_loss": 0.7777639, + "learning_rate": 7.39487307551896e-05, + "loss": 0.78821379, + "num_input_tokens_seen": 358356368, + "router_z_loss_mlp": 0.39672852, + "step": 4314, + "time_per_iteration": 3.6537117958068848 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045045, + "balance_loss_mlp": 1.0053004, + "epoch": 0.8301269719122739, + "flos": 586410229248.0, + "grad_norm": 0.035731998635991455, + "language_loss": 0.83257413, + "learning_rate": 7.378576005087034e-05, + "loss": 0.84302461, + "num_input_tokens_seen": 358429104, + "router_z_loss_mlp": 0.3972168, + "step": 4315, + "time_per_iteration": 2.726372003555298 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043971, + "balance_loss_mlp": 1.00410736, + "epoch": 0.8303193535975375, + "flos": 510777317376.0, + "grad_norm": 0.03881103976705545, + "language_loss": 0.85426903, + "learning_rate": 7.362295481759412e-05, + "loss": 0.86470878, + "num_input_tokens_seen": 358501344, + "router_z_loss_mlp": 0.3984375, + "step": 4316, + "time_per_iteration": 2.6592206954956055 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010447, + "balance_loss_mlp": 1.00502753, + "epoch": 0.8305117352828011, + "flos": 581766649344.0, + "grad_norm": 0.036149535286214125, + "language_loss": 0.84061778, + "learning_rate": 7.346031511856722e-05, + "loss": 0.8510648, + "num_input_tokens_seen": 358575584, + "router_z_loss_mlp": 0.39648438, + "step": 4317, + "time_per_iteration": 2.6798949241638184 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044887, + "balance_loss_mlp": 1.00519013, + "epoch": 0.8307041169680647, + "flos": 482649487104.0, + "grad_norm": 0.03503303248285494, + "language_loss": 0.79233706, + "learning_rate": 7.329784101693232e-05, + "loss": 0.80278593, + "num_input_tokens_seen": 358644304, + "router_z_loss_mlp": 0.39672852, + "step": 4318, + "time_per_iteration": 2.6182241439819336 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045375, + "balance_loss_mlp": 1.00567794, + "epoch": 0.8308964986533282, + "flos": 625754780928.0, + "grad_norm": 0.03813638637537738, + "language_loss": 0.83587325, + "learning_rate": 7.313553257576727e-05, + "loss": 0.84632701, + "num_input_tokens_seen": 358712384, + "router_z_loss_mlp": 0.39672852, + "step": 4319, + "time_per_iteration": 2.722752571105957 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104435, + "balance_loss_mlp": 1.0046773, + "epoch": 0.8310888803385917, + "flos": 828706562304.0, + "grad_norm": 0.03928829475188373, + "language_loss": 0.79520625, + "learning_rate": 7.297338985808589e-05, + "loss": 0.80564976, + "num_input_tokens_seen": 358789264, + "router_z_loss_mlp": 0.39648438, + "step": 4320, + "time_per_iteration": 3.013678789138794 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045641, + "balance_loss_mlp": 1.00594461, + "epoch": 0.8312812620238553, + "flos": 584947262208.0, + "grad_norm": 0.03143191383809492, + "language_loss": 0.82371467, + "learning_rate": 7.281141292683746e-05, + "loss": 0.83417112, + "num_input_tokens_seen": 358868976, + "router_z_loss_mlp": 0.39672852, + "step": 4321, + "time_per_iteration": 2.7931981086730957 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044244, + "balance_loss_mlp": 1.00459492, + "epoch": 0.8314736437091189, + "flos": 1117370653440.0, + "grad_norm": 0.03967418669212243, + "language_loss": 0.75441504, + "learning_rate": 7.26496018449071e-05, + "loss": 0.76485747, + "num_input_tokens_seen": 358953600, + "router_z_loss_mlp": 0.39624023, + "step": 4322, + "time_per_iteration": 3.406388759613037 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044382, + "balance_loss_mlp": 1.00473273, + "epoch": 0.8316660253943825, + "flos": 518559714048.0, + "grad_norm": 0.037215858057632896, + "language_loss": 0.82538068, + "learning_rate": 7.248795667511543e-05, + "loss": 0.83582449, + "num_input_tokens_seen": 359028768, + "router_z_loss_mlp": 0.39624023, + "step": 4323, + "time_per_iteration": 2.790639877319336 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044045, + "balance_loss_mlp": 1.00444412, + "epoch": 0.831858407079646, + "flos": 796697254656.0, + "grad_norm": 0.03541243399585954, + "language_loss": 0.78485489, + "learning_rate": 7.232647748021864e-05, + "loss": 0.79529536, + "num_input_tokens_seen": 359116208, + "router_z_loss_mlp": 0.39575195, + "step": 4324, + "time_per_iteration": 2.975816249847412 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043202, + "balance_loss_mlp": 1.00360107, + "epoch": 0.8320507887649096, + "flos": 551042419200.0, + "grad_norm": 0.03671170113151978, + "language_loss": 0.83109081, + "learning_rate": 7.216516432290843e-05, + "loss": 0.84152287, + "num_input_tokens_seen": 359189552, + "router_z_loss_mlp": 0.39575195, + "step": 4325, + "time_per_iteration": 2.658466339111328 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043112, + "balance_loss_mlp": 1.00355887, + "epoch": 0.8322431704501732, + "flos": 480352485888.0, + "grad_norm": 0.03962331915706713, + "language_loss": 0.82560384, + "learning_rate": 7.20040172658123e-05, + "loss": 0.83603495, + "num_input_tokens_seen": 359253008, + "router_z_loss_mlp": 0.39526367, + "step": 4326, + "time_per_iteration": 2.514432907104492 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043175, + "balance_loss_mlp": 1.0036217, + "epoch": 0.8324355521354367, + "flos": 573547793664.0, + "grad_norm": 0.031905797463172826, + "language_loss": 0.85702962, + "learning_rate": 7.184303637149308e-05, + "loss": 0.86746132, + "num_input_tokens_seen": 359326368, + "router_z_loss_mlp": 0.39526367, + "step": 4327, + "time_per_iteration": 2.6735494136810303 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043577, + "balance_loss_mlp": 1.00399959, + "epoch": 0.8326279338207002, + "flos": 504440391168.0, + "grad_norm": 0.03284253573925844, + "language_loss": 0.82986456, + "learning_rate": 7.168222170244888e-05, + "loss": 0.84030032, + "num_input_tokens_seen": 359394192, + "router_z_loss_mlp": 0.39550781, + "step": 4328, + "time_per_iteration": 2.5880331993103027 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044698, + "balance_loss_mlp": 1.00509644, + "epoch": 0.8328203155059638, + "flos": 606951048960.0, + "grad_norm": 0.032340535352269904, + "language_loss": 0.81377709, + "learning_rate": 7.152157332111364e-05, + "loss": 0.82422405, + "num_input_tokens_seen": 359476016, + "router_z_loss_mlp": 0.39575195, + "step": 4329, + "time_per_iteration": 2.9070374965667725 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044592, + "balance_loss_mlp": 1.0050385, + "epoch": 0.8330126971912274, + "flos": 699123739392.0, + "grad_norm": 0.03346059950715292, + "language_loss": 0.86209023, + "learning_rate": 7.136109128985663e-05, + "loss": 0.87253612, + "num_input_tokens_seen": 359554048, + "router_z_loss_mlp": 0.39526367, + "step": 4330, + "time_per_iteration": 2.884406328201294 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044072, + "balance_loss_mlp": 1.00442338, + "epoch": 0.833205078876491, + "flos": 495021028608.0, + "grad_norm": 0.03393420593048976, + "language_loss": 0.87050742, + "learning_rate": 7.120077567098249e-05, + "loss": 0.88094813, + "num_input_tokens_seen": 359621440, + "router_z_loss_mlp": 0.39624023, + "step": 4331, + "time_per_iteration": 2.5215518474578857 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043983, + "balance_loss_mlp": 1.00428641, + "epoch": 0.8333974605617546, + "flos": 483795558912.0, + "grad_norm": 0.03242489973707072, + "language_loss": 0.83629441, + "learning_rate": 7.104062652673115e-05, + "loss": 0.84673423, + "num_input_tokens_seen": 359690320, + "router_z_loss_mlp": 0.39672852, + "step": 4332, + "time_per_iteration": 2.589259147644043 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042521, + "balance_loss_mlp": 1.00282395, + "epoch": 0.833589842247018, + "flos": 688041166080.0, + "grad_norm": 0.05811234258631201, + "language_loss": 0.83496767, + "learning_rate": 7.088064391927818e-05, + "loss": 0.84539282, + "num_input_tokens_seen": 359759888, + "router_z_loss_mlp": 0.39672852, + "step": 4333, + "time_per_iteration": 2.803917646408081 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055734, + "balance_loss_mlp": 1.01606107, + "epoch": 0.8337822239322816, + "flos": 883193054208.0, + "grad_norm": 0.035373002810175205, + "language_loss": 0.83053595, + "learning_rate": 7.072082791073419e-05, + "loss": 0.8410933, + "num_input_tokens_seen": 359836544, + "router_z_loss_mlp": 0.39648438, + "step": 4334, + "time_per_iteration": 3.102529525756836 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105553, + "balance_loss_mlp": 1.01578557, + "epoch": 0.8339746056175452, + "flos": 498157900032.0, + "grad_norm": 0.03915786312504082, + "language_loss": 0.83033496, + "learning_rate": 7.056117856314531e-05, + "loss": 0.84089029, + "num_input_tokens_seen": 359903024, + "router_z_loss_mlp": 0.3972168, + "step": 4335, + "time_per_iteration": 2.581801652908325 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054483, + "balance_loss_mlp": 1.01480997, + "epoch": 0.8341669873028088, + "flos": 511504426752.0, + "grad_norm": 0.03653377774815775, + "language_loss": 0.86941135, + "learning_rate": 7.040169593849289e-05, + "loss": 0.87995613, + "num_input_tokens_seen": 359971200, + "router_z_loss_mlp": 0.39648438, + "step": 4336, + "time_per_iteration": 2.5874557495117188 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053038, + "balance_loss_mlp": 1.0132935, + "epoch": 0.8343593689880723, + "flos": 693542112768.0, + "grad_norm": 0.042823891239838895, + "language_loss": 0.84834361, + "learning_rate": 7.024238009869366e-05, + "loss": 0.85887402, + "num_input_tokens_seen": 360042560, + "router_z_loss_mlp": 0.3972168, + "step": 4337, + "time_per_iteration": 2.831866979598999 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049647, + "balance_loss_mlp": 1.00987899, + "epoch": 0.8345517506733359, + "flos": 553517309952.0, + "grad_norm": 0.03514600102944956, + "language_loss": 0.78723717, + "learning_rate": 7.008323110559956e-05, + "loss": 0.79773366, + "num_input_tokens_seen": 360118048, + "router_z_loss_mlp": 0.39746094, + "step": 4338, + "time_per_iteration": 2.805140495300293 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049491, + "balance_loss_mlp": 1.00981832, + "epoch": 0.8347441323585995, + "flos": 593268185088.0, + "grad_norm": 0.03562490658718948, + "language_loss": 0.76787317, + "learning_rate": 6.992424902099754e-05, + "loss": 0.77836812, + "num_input_tokens_seen": 360192528, + "router_z_loss_mlp": 0.39648438, + "step": 4339, + "time_per_iteration": 2.823744535446167 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050135, + "balance_loss_mlp": 1.01043832, + "epoch": 0.834936514043863, + "flos": 616092400128.0, + "grad_norm": 0.03219895087392271, + "language_loss": 0.85226107, + "learning_rate": 6.976543390660983e-05, + "loss": 0.86276239, + "num_input_tokens_seen": 360266880, + "router_z_loss_mlp": 0.39672852, + "step": 4340, + "time_per_iteration": 2.7585527896881104 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049678, + "balance_loss_mlp": 1.00998175, + "epoch": 0.8351288957291266, + "flos": 468864556032.0, + "grad_norm": 0.03659606336677384, + "language_loss": 0.80055946, + "learning_rate": 6.960678582409424e-05, + "loss": 0.81105626, + "num_input_tokens_seen": 360336336, + "router_z_loss_mlp": 0.39672852, + "step": 4341, + "time_per_iteration": 2.6036171913146973 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044054, + "balance_loss_mlp": 1.00428581, + "epoch": 0.8353212774143901, + "flos": 510349606656.0, + "grad_norm": 0.03169243784279604, + "language_loss": 0.79527783, + "learning_rate": 6.944830483504328e-05, + "loss": 0.8057183, + "num_input_tokens_seen": 360409776, + "router_z_loss_mlp": 0.39746094, + "step": 4342, + "time_per_iteration": 2.655421018600464 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043856, + "balance_loss_mlp": 1.00411177, + "epoch": 0.8355136590996537, + "flos": 689018096640.0, + "grad_norm": 0.03436088909183797, + "language_loss": 0.81165028, + "learning_rate": 6.928999100098483e-05, + "loss": 0.82208884, + "num_input_tokens_seen": 360486800, + "router_z_loss_mlp": 0.3972168, + "step": 4343, + "time_per_iteration": 2.826841115951538 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044821, + "balance_loss_mlp": 1.00512445, + "epoch": 0.8357060407849173, + "flos": 985976865792.0, + "grad_norm": 0.04054182242673009, + "language_loss": 0.84078169, + "learning_rate": 6.913184438338138e-05, + "loss": 0.85122991, + "num_input_tokens_seen": 360568624, + "router_z_loss_mlp": 0.39672852, + "step": 4344, + "time_per_iteration": 3.2107176780700684 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044039, + "balance_loss_mlp": 1.00431824, + "epoch": 0.8358984224701809, + "flos": 844508537856.0, + "grad_norm": 0.030813015519650187, + "language_loss": 0.85689914, + "learning_rate": 6.89738650436313e-05, + "loss": 0.86733955, + "num_input_tokens_seen": 360652384, + "router_z_loss_mlp": 0.39697266, + "step": 4345, + "time_per_iteration": 3.163668155670166 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043059, + "balance_loss_mlp": 1.00331438, + "epoch": 0.8360908041554445, + "flos": 627419937024.0, + "grad_norm": 0.03291313388233015, + "language_loss": 0.82514834, + "learning_rate": 6.881605304306748e-05, + "loss": 0.83557892, + "num_input_tokens_seen": 360723200, + "router_z_loss_mlp": 0.3972168, + "step": 4346, + "time_per_iteration": 2.7415146827697754 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043733, + "balance_loss_mlp": 1.00408411, + "epoch": 0.8362831858407079, + "flos": 577223191296.0, + "grad_norm": 0.0302998467328884, + "language_loss": 0.85529792, + "learning_rate": 6.865840844295796e-05, + "loss": 0.86573529, + "num_input_tokens_seen": 360798240, + "router_z_loss_mlp": 0.39624023, + "step": 4347, + "time_per_iteration": 2.7346980571746826 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043903, + "balance_loss_mlp": 1.00418234, + "epoch": 0.8364755675259715, + "flos": 835184439552.0, + "grad_norm": 0.03946015330884712, + "language_loss": 0.81571031, + "learning_rate": 6.850093130450569e-05, + "loss": 0.82614934, + "num_input_tokens_seen": 360873552, + "router_z_loss_mlp": 0.39697266, + "step": 4348, + "time_per_iteration": 3.08042573928833 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104361, + "balance_loss_mlp": 1.00388896, + "epoch": 0.8366679492112351, + "flos": 583564008192.0, + "grad_norm": 0.038362808412147696, + "language_loss": 0.86538804, + "learning_rate": 6.834362168884912e-05, + "loss": 0.87582415, + "num_input_tokens_seen": 360940800, + "router_z_loss_mlp": 0.39697266, + "step": 4349, + "time_per_iteration": 2.7079648971557617 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043699, + "balance_loss_mlp": 1.00400245, + "epoch": 0.8368603308964987, + "flos": 612881651712.0, + "grad_norm": 0.03780805961056685, + "language_loss": 0.87590146, + "learning_rate": 6.818647965706076e-05, + "loss": 0.88633847, + "num_input_tokens_seen": 361014368, + "router_z_loss_mlp": 0.39672852, + "step": 4350, + "time_per_iteration": 2.783841609954834 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042866, + "balance_loss_mlp": 1.00321651, + "epoch": 0.8370527125817622, + "flos": 508265488128.0, + "grad_norm": 0.031770468246229984, + "language_loss": 0.85780954, + "learning_rate": 6.802950527014884e-05, + "loss": 0.86823821, + "num_input_tokens_seen": 361087184, + "router_z_loss_mlp": 0.39624023, + "step": 4351, + "time_per_iteration": 2.735769748687744 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042672, + "balance_loss_mlp": 1.00302303, + "epoch": 0.8372450942670258, + "flos": 772283705856.0, + "grad_norm": 0.03277928493486849, + "language_loss": 0.82987893, + "learning_rate": 6.787269858905603e-05, + "loss": 0.84030557, + "num_input_tokens_seen": 361160720, + "router_z_loss_mlp": 0.39624023, + "step": 4352, + "time_per_iteration": 2.9203648567199707 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043841, + "balance_loss_mlp": 1.00428724, + "epoch": 0.8374374759522893, + "flos": 580362008064.0, + "grad_norm": 0.034034807171666244, + "language_loss": 0.85397196, + "learning_rate": 6.771605967466033e-05, + "loss": 0.8644104, + "num_input_tokens_seen": 361234432, + "router_z_loss_mlp": 0.39526367, + "step": 4353, + "time_per_iteration": 2.720546007156372 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01041294, + "balance_loss_mlp": 1.00159764, + "epoch": 0.8376298576375529, + "flos": 789529206528.0, + "grad_norm": 0.04449117180273345, + "language_loss": 0.82807922, + "learning_rate": 6.755958858777434e-05, + "loss": 0.83849216, + "num_input_tokens_seen": 361309376, + "router_z_loss_mlp": 0.39672852, + "step": 4354, + "time_per_iteration": 3.007485866546631 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042862, + "balance_loss_mlp": 1.00309372, + "epoch": 0.8378222393228165, + "flos": 578723096832.0, + "grad_norm": 0.032911278141950814, + "language_loss": 0.81327355, + "learning_rate": 6.74032853891452e-05, + "loss": 0.82370222, + "num_input_tokens_seen": 361386768, + "router_z_loss_mlp": 0.39746094, + "step": 4355, + "time_per_iteration": 2.750502824783325 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042418, + "balance_loss_mlp": 1.00272167, + "epoch": 0.83801462100808, + "flos": 481859194368.0, + "grad_norm": 0.03244725858098954, + "language_loss": 0.82614964, + "learning_rate": 6.724715013945548e-05, + "loss": 0.83657384, + "num_input_tokens_seen": 361456704, + "router_z_loss_mlp": 0.39672852, + "step": 4356, + "time_per_iteration": 2.648829936981201 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042528, + "balance_loss_mlp": 1.00283134, + "epoch": 0.8382070026933436, + "flos": 551997962496.0, + "grad_norm": 0.03119989334307816, + "language_loss": 0.8965174, + "learning_rate": 6.709118289932226e-05, + "loss": 0.90694273, + "num_input_tokens_seen": 361533648, + "router_z_loss_mlp": 0.39672852, + "step": 4357, + "time_per_iteration": 2.776762008666992 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104287, + "balance_loss_mlp": 1.00310183, + "epoch": 0.8383993843786072, + "flos": 626226233088.0, + "grad_norm": 0.0387641353530007, + "language_loss": 0.82408631, + "learning_rate": 6.693538372929725e-05, + "loss": 0.83451504, + "num_input_tokens_seen": 361614256, + "router_z_loss_mlp": 0.39746094, + "step": 4358, + "time_per_iteration": 2.8670356273651123 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044667, + "balance_loss_mlp": 1.00497019, + "epoch": 0.8385917660638708, + "flos": 492135923712.0, + "grad_norm": 0.03605120183669825, + "language_loss": 0.86969417, + "learning_rate": 6.677975268986719e-05, + "loss": 0.88014084, + "num_input_tokens_seen": 361679008, + "router_z_loss_mlp": 0.39672852, + "step": 4359, + "time_per_iteration": 2.556107759475708 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044534, + "balance_loss_mlp": 1.00478995, + "epoch": 0.8387841477491342, + "flos": 467870128896.0, + "grad_norm": 0.036141032042788915, + "language_loss": 0.875561, + "learning_rate": 6.662428984145336e-05, + "loss": 0.88600636, + "num_input_tokens_seen": 361747600, + "router_z_loss_mlp": 0.3972168, + "step": 4360, + "time_per_iteration": 2.5896832942962646 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046318, + "balance_loss_mlp": 1.00740814, + "epoch": 0.8389765294343978, + "flos": 1567600607232.0, + "grad_norm": 0.006654770635082277, + "language_loss": 0.71780187, + "learning_rate": 6.646899524441175e-05, + "loss": 0.72826505, + "num_input_tokens_seen": 361983104, + "router_z_loss_mlp": 0.38867188, + "step": 4361, + "time_per_iteration": 5.005736351013184 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044889, + "balance_loss_mlp": 1.0051924, + "epoch": 0.8391689111196614, + "flos": 603412711680.0, + "grad_norm": 0.02981143994477007, + "language_loss": 0.83398944, + "learning_rate": 6.631386895903308e-05, + "loss": 0.84443831, + "num_input_tokens_seen": 362065824, + "router_z_loss_mlp": 0.39672852, + "step": 4362, + "time_per_iteration": 2.860398530960083 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046999, + "balance_loss_mlp": 1.0072782, + "epoch": 0.839361292804925, + "flos": 443968861440.0, + "grad_norm": 0.03966524931271206, + "language_loss": 0.80562806, + "learning_rate": 6.615891104554261e-05, + "loss": 0.81609803, + "num_input_tokens_seen": 362128240, + "router_z_loss_mlp": 0.39697266, + "step": 4363, + "time_per_iteration": 2.480076789855957 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046837, + "balance_loss_mlp": 1.0070926, + "epoch": 0.8395536744901886, + "flos": 595299813888.0, + "grad_norm": 0.04057022644943622, + "language_loss": 0.83120066, + "learning_rate": 6.600412156410057e-05, + "loss": 0.84166896, + "num_input_tokens_seen": 362198256, + "router_z_loss_mlp": 0.3972168, + "step": 4364, + "time_per_iteration": 2.7753520011901855 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047198, + "balance_loss_mlp": 1.00743032, + "epoch": 0.8397460561754521, + "flos": 891336087552.0, + "grad_norm": 0.03526735316823496, + "language_loss": 0.85782105, + "learning_rate": 6.58495005748016e-05, + "loss": 0.86829305, + "num_input_tokens_seen": 362279792, + "router_z_loss_mlp": 0.39746094, + "step": 4365, + "time_per_iteration": 3.16624116897583 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045668, + "balance_loss_mlp": 1.00580478, + "epoch": 0.8399384378607156, + "flos": 554561314560.0, + "grad_norm": 0.03367711275726433, + "language_loss": 0.89177513, + "learning_rate": 6.569504813767463e-05, + "loss": 0.90223181, + "num_input_tokens_seen": 362351712, + "router_z_loss_mlp": 0.3984375, + "step": 4366, + "time_per_iteration": 2.6069655418395996 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044733, + "balance_loss_mlp": 1.00489306, + "epoch": 0.8401308195459792, + "flos": 519964355328.0, + "grad_norm": 0.03275930619956067, + "language_loss": 0.83950633, + "learning_rate": 6.554076431268341e-05, + "loss": 0.84995365, + "num_input_tokens_seen": 362423424, + "router_z_loss_mlp": 0.39819336, + "step": 4367, + "time_per_iteration": 2.6613383293151855 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045197, + "balance_loss_mlp": 1.00540471, + "epoch": 0.8403232012312428, + "flos": 686296296960.0, + "grad_norm": 0.036430309250403296, + "language_loss": 0.8146503, + "learning_rate": 6.538664915972648e-05, + "loss": 0.82510233, + "num_input_tokens_seen": 362514704, + "router_z_loss_mlp": 0.39770508, + "step": 4368, + "time_per_iteration": 2.995043992996216 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045982, + "balance_loss_mlp": 1.00618947, + "epoch": 0.8405155829165063, + "flos": 578670607104.0, + "grad_norm": 0.03783265596862743, + "language_loss": 0.78067476, + "learning_rate": 6.523270273863652e-05, + "loss": 0.79113454, + "num_input_tokens_seen": 362581296, + "router_z_loss_mlp": 0.39770508, + "step": 4369, + "time_per_iteration": 2.6606693267822266 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046734, + "balance_loss_mlp": 1.00694239, + "epoch": 0.8407079646017699, + "flos": 457567154688.0, + "grad_norm": 0.04104173911837978, + "language_loss": 0.88549638, + "learning_rate": 6.507892510918079e-05, + "loss": 0.89596373, + "num_input_tokens_seen": 362648304, + "router_z_loss_mlp": 0.39770508, + "step": 4370, + "time_per_iteration": 2.5601558685302734 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047037, + "balance_loss_mlp": 1.00726855, + "epoch": 0.8409003462870335, + "flos": 536000600832.0, + "grad_norm": 0.03952335471853486, + "language_loss": 0.82247508, + "learning_rate": 6.492531633106114e-05, + "loss": 0.83294547, + "num_input_tokens_seen": 362721264, + "router_z_loss_mlp": 0.39746094, + "step": 4371, + "time_per_iteration": 2.771491527557373 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053168, + "balance_loss_mlp": 1.01339984, + "epoch": 0.8410927279722971, + "flos": 557900375040.0, + "grad_norm": 0.03882537955263822, + "language_loss": 0.7860809, + "learning_rate": 6.477187646391374e-05, + "loss": 0.79661262, + "num_input_tokens_seen": 362795312, + "router_z_loss_mlp": 0.39746094, + "step": 4372, + "time_per_iteration": 2.7046260833740234 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059166, + "balance_loss_mlp": 1.02044678, + "epoch": 0.8412851096575606, + "flos": 1552929152256.0, + "grad_norm": 0.010019431510238475, + "language_loss": 0.77679121, + "learning_rate": 6.461860556730925e-05, + "loss": 0.78738284, + "num_input_tokens_seen": 363026272, + "router_z_loss_mlp": 0.38671875, + "step": 4373, + "time_per_iteration": 4.874579668045044 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01056957, + "balance_loss_mlp": 1.01718879, + "epoch": 0.8414774913428241, + "flos": 553109041152.0, + "grad_norm": 0.03716925944029823, + "language_loss": 0.79357052, + "learning_rate": 6.446550370075271e-05, + "loss": 0.80414009, + "num_input_tokens_seen": 363098384, + "router_z_loss_mlp": 0.39746094, + "step": 4374, + "time_per_iteration": 2.6877481937408447 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050893, + "balance_loss_mlp": 1.01098144, + "epoch": 0.8416698730280877, + "flos": 574070768640.0, + "grad_norm": 0.03440342527742133, + "language_loss": 0.78264368, + "learning_rate": 6.431257092368336e-05, + "loss": 0.79315263, + "num_input_tokens_seen": 363170960, + "router_z_loss_mlp": 0.39892578, + "step": 4375, + "time_per_iteration": 2.6643028259277344 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050239, + "balance_loss_mlp": 1.01030397, + "epoch": 0.8418622547133513, + "flos": 760044367104.0, + "grad_norm": 0.03936876955125197, + "language_loss": 0.805475, + "learning_rate": 6.415980729547543e-05, + "loss": 0.81597739, + "num_input_tokens_seen": 363242000, + "router_z_loss_mlp": 0.39916992, + "step": 4376, + "time_per_iteration": 2.8879754543304443 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01057887, + "balance_loss_mlp": 1.01795137, + "epoch": 0.8420546363986149, + "flos": 1075922541312.0, + "grad_norm": 0.04195291281382577, + "language_loss": 0.73367876, + "learning_rate": 6.40072128754366e-05, + "loss": 0.74425763, + "num_input_tokens_seen": 363340288, + "router_z_loss_mlp": 0.39916992, + "step": 4377, + "time_per_iteration": 3.3982491493225098 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01056552, + "balance_loss_mlp": 1.01659334, + "epoch": 0.8422470180838784, + "flos": 527017697280.0, + "grad_norm": 0.03613137527395126, + "language_loss": 0.83433902, + "learning_rate": 6.385478772280933e-05, + "loss": 0.84490454, + "num_input_tokens_seen": 363416208, + "router_z_loss_mlp": 0.39941406, + "step": 4378, + "time_per_iteration": 2.7835891246795654 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053667, + "balance_loss_mlp": 1.01370764, + "epoch": 0.842439399769142, + "flos": 601964328960.0, + "grad_norm": 0.037358019375431845, + "language_loss": 0.82734925, + "learning_rate": 6.370253189677038e-05, + "loss": 0.83788586, + "num_input_tokens_seen": 363492864, + "router_z_loss_mlp": 0.39941406, + "step": 4379, + "time_per_iteration": 2.748546600341797 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01058941, + "balance_loss_mlp": 1.01895809, + "epoch": 0.8426317814544055, + "flos": 553376358912.0, + "grad_norm": 0.03714135398668163, + "language_loss": 0.87016404, + "learning_rate": 6.355044545643073e-05, + "loss": 0.88075352, + "num_input_tokens_seen": 363572000, + "router_z_loss_mlp": 0.3996582, + "step": 4380, + "time_per_iteration": 2.800340414047241 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059186, + "balance_loss_mlp": 1.01922655, + "epoch": 0.8428241631396691, + "flos": 680045886720.0, + "grad_norm": 0.03742137043660712, + "language_loss": 0.78399694, + "learning_rate": 6.33985284608356e-05, + "loss": 0.7945888, + "num_input_tokens_seen": 363646480, + "router_z_loss_mlp": 0.39941406, + "step": 4381, + "time_per_iteration": 2.8040478229522705 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054594, + "balance_loss_mlp": 1.01473093, + "epoch": 0.8430165448249327, + "flos": 755199565056.0, + "grad_norm": 0.027180883037501744, + "language_loss": 0.80385518, + "learning_rate": 6.324678096896435e-05, + "loss": 0.81440109, + "num_input_tokens_seen": 363737552, + "router_z_loss_mlp": 0.3984375, + "step": 4382, + "time_per_iteration": 3.0603692531585693 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052676, + "balance_loss_mlp": 1.01266921, + "epoch": 0.8432089265101962, + "flos": 700437007104.0, + "grad_norm": 0.036252263967316525, + "language_loss": 0.81660181, + "learning_rate": 6.30952030397306e-05, + "loss": 0.82712859, + "num_input_tokens_seen": 363816016, + "router_z_loss_mlp": 0.39990234, + "step": 4383, + "time_per_iteration": 2.8923966884613037 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052373, + "balance_loss_mlp": 1.01243746, + "epoch": 0.8434013081954598, + "flos": 486791479296.0, + "grad_norm": 0.043909328594933086, + "language_loss": 0.85683775, + "learning_rate": 6.294379473198208e-05, + "loss": 0.86736149, + "num_input_tokens_seen": 363888192, + "router_z_loss_mlp": 0.39916992, + "step": 4384, + "time_per_iteration": 2.7198166847229004 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053711, + "balance_loss_mlp": 1.01380002, + "epoch": 0.8435936898807234, + "flos": 521631456768.0, + "grad_norm": 0.03666745946070383, + "language_loss": 0.85839081, + "learning_rate": 6.279255610450068e-05, + "loss": 0.86892796, + "num_input_tokens_seen": 363953904, + "router_z_loss_mlp": 0.39892578, + "step": 4385, + "time_per_iteration": 2.6209969520568848 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052098, + "balance_loss_mlp": 1.01199555, + "epoch": 0.843786071565987, + "flos": 787314830592.0, + "grad_norm": 0.035531415028739466, + "language_loss": 0.81066084, + "learning_rate": 6.264148721600254e-05, + "loss": 0.82118183, + "num_input_tokens_seen": 364031552, + "router_z_loss_mlp": 0.40087891, + "step": 4386, + "time_per_iteration": 3.0166499614715576 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059635, + "balance_loss_mlp": 1.02062988, + "epoch": 0.8439784532512504, + "flos": 1449516407808.0, + "grad_norm": 0.007889310542636748, + "language_loss": 0.75836509, + "learning_rate": 6.24905881251378e-05, + "loss": 0.76896149, + "num_input_tokens_seen": 364256480, + "router_z_loss_mlp": 0.38964844, + "step": 4387, + "time_per_iteration": 4.9381585121154785 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052807, + "balance_loss_mlp": 1.01287234, + "epoch": 0.844170834936514, + "flos": 709969130496.0, + "grad_norm": 0.0430882842289394, + "language_loss": 0.82951272, + "learning_rate": 6.23398588904906e-05, + "loss": 0.8400408, + "num_input_tokens_seen": 364329696, + "router_z_loss_mlp": 0.39916992, + "step": 4388, + "time_per_iteration": 2.8580751419067383 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049455, + "balance_loss_mlp": 1.00940061, + "epoch": 0.8443632166217776, + "flos": 484409907456.0, + "grad_norm": 0.03543864011776059, + "language_loss": 0.80285496, + "learning_rate": 6.218929957057922e-05, + "loss": 0.81334955, + "num_input_tokens_seen": 364400944, + "router_z_loss_mlp": 0.40039062, + "step": 4389, + "time_per_iteration": 2.6879217624664307 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048845, + "balance_loss_mlp": 1.00886238, + "epoch": 0.8445555983070412, + "flos": 679924377600.0, + "grad_norm": 0.03469492088540733, + "language_loss": 0.80821919, + "learning_rate": 6.2038910223856e-05, + "loss": 0.81870764, + "num_input_tokens_seen": 364475744, + "router_z_loss_mlp": 0.3996582, + "step": 4390, + "time_per_iteration": 2.8265607357025146 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048945, + "balance_loss_mlp": 1.00898623, + "epoch": 0.8447479799923048, + "flos": 742860104448.0, + "grad_norm": 0.03377233556180916, + "language_loss": 0.74732709, + "learning_rate": 6.18886909087073e-05, + "loss": 0.75781655, + "num_input_tokens_seen": 364557248, + "router_z_loss_mlp": 0.39941406, + "step": 4391, + "time_per_iteration": 3.059424638748169 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048643, + "balance_loss_mlp": 1.00866032, + "epoch": 0.8449403616775683, + "flos": 954951291648.0, + "grad_norm": 0.03403484020500879, + "language_loss": 0.8089571, + "learning_rate": 6.173864168345344e-05, + "loss": 0.81944358, + "num_input_tokens_seen": 364647856, + "router_z_loss_mlp": 0.3996582, + "step": 4392, + "time_per_iteration": 3.283132791519165 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048397, + "balance_loss_mlp": 1.0083667, + "epoch": 0.8451327433628318, + "flos": 658608816384.0, + "grad_norm": 0.037595076995326535, + "language_loss": 0.72948486, + "learning_rate": 6.158876260634871e-05, + "loss": 0.73996878, + "num_input_tokens_seen": 364728848, + "router_z_loss_mlp": 0.40014648, + "step": 4393, + "time_per_iteration": 2.8895535469055176 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049246, + "balance_loss_mlp": 1.00933516, + "epoch": 0.8453251250480954, + "flos": 447049352448.0, + "grad_norm": 0.037490105164885316, + "language_loss": 0.83802319, + "learning_rate": 6.143905373558112e-05, + "loss": 0.84851563, + "num_input_tokens_seen": 364794032, + "router_z_loss_mlp": 0.39892578, + "step": 4394, + "time_per_iteration": 2.600051164627075 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044816, + "balance_loss_mlp": 1.00488043, + "epoch": 0.845517506733359, + "flos": 543874371072.0, + "grad_norm": 0.04600609812131957, + "language_loss": 0.72015631, + "learning_rate": 6.128951512927305e-05, + "loss": 0.73060441, + "num_input_tokens_seen": 364868624, + "router_z_loss_mlp": 0.39916992, + "step": 4395, + "time_per_iteration": 2.669736623764038 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104476, + "balance_loss_mlp": 1.0048008, + "epoch": 0.8457098884186226, + "flos": 503507202048.0, + "grad_norm": 0.03225054285185889, + "language_loss": 0.84712255, + "learning_rate": 6.114014684548046e-05, + "loss": 0.85757017, + "num_input_tokens_seen": 364938208, + "router_z_loss_mlp": 0.39941406, + "step": 4396, + "time_per_iteration": 2.6642940044403076 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043537, + "balance_loss_mlp": 1.00355434, + "epoch": 0.8459022701038861, + "flos": 449895573504.0, + "grad_norm": 0.03459696576725407, + "language_loss": 0.80533981, + "learning_rate": 6.099094894219326e-05, + "loss": 0.81577528, + "num_input_tokens_seen": 365009440, + "router_z_loss_mlp": 0.3996582, + "step": 4397, + "time_per_iteration": 2.6888678073883057 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043514, + "balance_loss_mlp": 1.00350761, + "epoch": 0.8460946517891497, + "flos": 744472770816.0, + "grad_norm": 0.03316086560733144, + "language_loss": 0.75788116, + "learning_rate": 6.0841921477335194e-05, + "loss": 0.76831627, + "num_input_tokens_seen": 365085904, + "router_z_loss_mlp": 0.39990234, + "step": 4398, + "time_per_iteration": 2.957204580307007 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043897, + "balance_loss_mlp": 1.00393856, + "epoch": 0.8462870334744133, + "flos": 554327044608.0, + "grad_norm": 0.035598785998024324, + "language_loss": 0.80453002, + "learning_rate": 6.069306450876389e-05, + "loss": 0.81496894, + "num_input_tokens_seen": 365163600, + "router_z_loss_mlp": 0.39941406, + "step": 4399, + "time_per_iteration": 2.7827699184417725 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047981, + "balance_loss_mlp": 1.00907135, + "epoch": 0.8464794151596768, + "flos": 1568271336192.0, + "grad_norm": 0.006109463060419775, + "language_loss": 0.81708568, + "learning_rate": 6.054437809427071e-05, + "loss": 0.82756555, + "num_input_tokens_seen": 365384528, + "router_z_loss_mlp": 0.38867188, + "step": 4400, + "time_per_iteration": 4.881330966949463 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047433, + "balance_loss_mlp": 1.00759327, + "epoch": 0.8466717968449403, + "flos": 551265995520.0, + "grad_norm": 0.03379372899576281, + "language_loss": 0.80214202, + "learning_rate": 6.039586229158084e-05, + "loss": 0.81261623, + "num_input_tokens_seen": 365453760, + "router_z_loss_mlp": 0.39819336, + "step": 4401, + "time_per_iteration": 2.7047319412231445 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047774, + "balance_loss_mlp": 1.00788701, + "epoch": 0.8468641785302039, + "flos": 553096402176.0, + "grad_norm": 0.036798324054331616, + "language_loss": 0.849747, + "learning_rate": 6.024751715835314e-05, + "loss": 0.86022472, + "num_input_tokens_seen": 365532416, + "router_z_loss_mlp": 0.39868164, + "step": 4402, + "time_per_iteration": 2.815459966659546 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049428, + "balance_loss_mlp": 1.00966001, + "epoch": 0.8470565602154675, + "flos": 573825805056.0, + "grad_norm": 0.037110384516023706, + "language_loss": 0.87787378, + "learning_rate": 6.009934275218049e-05, + "loss": 0.88836807, + "num_input_tokens_seen": 365603776, + "router_z_loss_mlp": 0.39746094, + "step": 4403, + "time_per_iteration": 2.737457275390625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045007, + "balance_loss_mlp": 1.00511968, + "epoch": 0.8472489419007311, + "flos": 473781289728.0, + "grad_norm": 0.0392549286911581, + "language_loss": 0.84440565, + "learning_rate": 5.995133913058936e-05, + "loss": 0.85485572, + "num_input_tokens_seen": 365670432, + "router_z_loss_mlp": 0.39868164, + "step": 4404, + "time_per_iteration": 2.568206787109375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104604, + "balance_loss_mlp": 1.00615251, + "epoch": 0.8474413235859947, + "flos": 799378225152.0, + "grad_norm": 0.035625221282266174, + "language_loss": 0.80429268, + "learning_rate": 5.980350635103954e-05, + "loss": 0.81475306, + "num_input_tokens_seen": 365741584, + "router_z_loss_mlp": 0.39868164, + "step": 4405, + "time_per_iteration": 2.9909889698028564 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046099, + "balance_loss_mlp": 1.00616419, + "epoch": 0.8476337052712581, + "flos": 503378889984.0, + "grad_norm": 0.054053938180127596, + "language_loss": 0.80838627, + "learning_rate": 5.9655844470924866e-05, + "loss": 0.81884724, + "num_input_tokens_seen": 365805344, + "router_z_loss_mlp": 0.39916992, + "step": 4406, + "time_per_iteration": 2.5926382541656494 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046099, + "balance_loss_mlp": 1.00623512, + "epoch": 0.8478260869565217, + "flos": 933518112000.0, + "grad_norm": 0.029539714928688187, + "language_loss": 0.83627093, + "learning_rate": 5.9508353547573e-05, + "loss": 0.8467319, + "num_input_tokens_seen": 365890976, + "router_z_loss_mlp": 0.3984375, + "step": 4407, + "time_per_iteration": 3.2023520469665527 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043476, + "balance_loss_mlp": 1.00358844, + "epoch": 0.8480184686417853, + "flos": 710053701120.0, + "grad_norm": 0.03753136985020684, + "language_loss": 0.81381404, + "learning_rate": 5.9361033638244855e-05, + "loss": 0.82424879, + "num_input_tokens_seen": 365968912, + "router_z_loss_mlp": 0.39868164, + "step": 4408, + "time_per_iteration": 2.9116780757904053 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044002, + "balance_loss_mlp": 1.0040431, + "epoch": 0.8482108503270489, + "flos": 615599560704.0, + "grad_norm": 0.029096278628502316, + "language_loss": 0.82911217, + "learning_rate": 5.9213884800135066e-05, + "loss": 0.83955222, + "num_input_tokens_seen": 366047680, + "router_z_loss_mlp": 0.39941406, + "step": 4409, + "time_per_iteration": 2.8034651279449463 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044369, + "balance_loss_mlp": 1.00445783, + "epoch": 0.8484032320123124, + "flos": 532073436672.0, + "grad_norm": 0.03490278429304404, + "language_loss": 0.82839775, + "learning_rate": 5.906690709037194e-05, + "loss": 0.83884144, + "num_input_tokens_seen": 366118720, + "router_z_loss_mlp": 0.39892578, + "step": 4410, + "time_per_iteration": 2.6078169345855713 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046001, + "balance_loss_mlp": 1.00699615, + "epoch": 0.848595613697576, + "flos": 1546174230528.0, + "grad_norm": 0.008492225160972254, + "language_loss": 0.76296914, + "learning_rate": 5.892010056601726e-05, + "loss": 0.77342916, + "num_input_tokens_seen": 366346928, + "router_z_loss_mlp": 0.38964844, + "step": 4411, + "time_per_iteration": 4.888483762741089 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047746, + "balance_loss_mlp": 1.00788236, + "epoch": 0.8487879953828396, + "flos": 678619858176.0, + "grad_norm": 0.034815423917737294, + "language_loss": 0.74149477, + "learning_rate": 5.877346528406635e-05, + "loss": 0.75197226, + "num_input_tokens_seen": 366422848, + "router_z_loss_mlp": 0.3984375, + "step": 4412, + "time_per_iteration": 2.861584186553955 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049124, + "balance_loss_mlp": 1.00928438, + "epoch": 0.8489803770681031, + "flos": 504672715776.0, + "grad_norm": 0.03763166001316411, + "language_loss": 0.80223823, + "learning_rate": 5.8627001301448105e-05, + "loss": 0.81272948, + "num_input_tokens_seen": 366492016, + "router_z_loss_mlp": 0.39819336, + "step": 4413, + "time_per_iteration": 2.607466459274292 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049111, + "balance_loss_mlp": 1.00919974, + "epoch": 0.8491727587533667, + "flos": 564350062080.0, + "grad_norm": 0.038784570505553576, + "language_loss": 0.77884841, + "learning_rate": 5.84807086750247e-05, + "loss": 0.78933948, + "num_input_tokens_seen": 366566400, + "router_z_loss_mlp": 0.39892578, + "step": 4414, + "time_per_iteration": 2.721888303756714 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044515, + "balance_loss_mlp": 1.0045321, + "epoch": 0.8493651404386302, + "flos": 460749712896.0, + "grad_norm": 0.04126295336032692, + "language_loss": 0.78762388, + "learning_rate": 5.833458746159243e-05, + "loss": 0.79806906, + "num_input_tokens_seen": 366634016, + "router_z_loss_mlp": 0.3996582, + "step": 4415, + "time_per_iteration": 2.544360399246216 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043733, + "balance_loss_mlp": 1.00367904, + "epoch": 0.8495575221238938, + "flos": 462145605888.0, + "grad_norm": 0.04113715838668957, + "language_loss": 0.81992638, + "learning_rate": 5.818863771788013e-05, + "loss": 0.83036369, + "num_input_tokens_seen": 366704384, + "router_z_loss_mlp": 0.40039062, + "step": 4416, + "time_per_iteration": 2.621957302093506 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044228, + "balance_loss_mlp": 1.00426936, + "epoch": 0.8497499038091574, + "flos": 872154222336.0, + "grad_norm": 0.035688143834842465, + "language_loss": 0.81738758, + "learning_rate": 5.8042859500550604e-05, + "loss": 0.82782984, + "num_input_tokens_seen": 366785456, + "router_z_loss_mlp": 0.39941406, + "step": 4417, + "time_per_iteration": 3.095893383026123 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044868, + "balance_loss_mlp": 1.00490916, + "epoch": 0.849942285494421, + "flos": 780975959040.0, + "grad_norm": 0.036021778410362866, + "language_loss": 0.7830838, + "learning_rate": 5.789725286620018e-05, + "loss": 0.79353249, + "num_input_tokens_seen": 366862848, + "router_z_loss_mlp": 0.39941406, + "step": 4418, + "time_per_iteration": 2.994999885559082 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104352, + "balance_loss_mlp": 1.00348961, + "epoch": 0.8501346671796844, + "flos": 514908615936.0, + "grad_norm": 0.03606911304712284, + "language_loss": 0.85473448, + "learning_rate": 5.775181787135819e-05, + "loss": 0.86516964, + "num_input_tokens_seen": 366934800, + "router_z_loss_mlp": 0.40014648, + "step": 4419, + "time_per_iteration": 2.667847156524658 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043246, + "balance_loss_mlp": 1.00323963, + "epoch": 0.850327048864948, + "flos": 622635406080.0, + "grad_norm": 0.03298723952459495, + "language_loss": 0.84261787, + "learning_rate": 5.76065545724877e-05, + "loss": 0.85305035, + "num_input_tokens_seen": 367015152, + "router_z_loss_mlp": 0.39990234, + "step": 4420, + "time_per_iteration": 2.828456401824951 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043242, + "balance_loss_mlp": 1.0032357, + "epoch": 0.8505194305502116, + "flos": 775550834688.0, + "grad_norm": 0.038283262853593514, + "language_loss": 0.80220652, + "learning_rate": 5.746146302598454e-05, + "loss": 0.812639, + "num_input_tokens_seen": 367092192, + "router_z_loss_mlp": 0.39990234, + "step": 4421, + "time_per_iteration": 3.011596202850342 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045026, + "balance_loss_mlp": 1.00501943, + "epoch": 0.8507118122354752, + "flos": 466213721088.0, + "grad_norm": 0.03411940614930077, + "language_loss": 0.8696543, + "learning_rate": 5.731654328817859e-05, + "loss": 0.88010454, + "num_input_tokens_seen": 367159744, + "router_z_loss_mlp": 0.39990234, + "step": 4422, + "time_per_iteration": 2.5618016719818115 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046503, + "balance_loss_mlp": 1.00654387, + "epoch": 0.8509041939207388, + "flos": 535470822912.0, + "grad_norm": 0.03503428991987823, + "language_loss": 0.85501492, + "learning_rate": 5.717179541533257e-05, + "loss": 0.86547995, + "num_input_tokens_seen": 367226384, + "router_z_loss_mlp": 0.39941406, + "step": 4423, + "time_per_iteration": 2.668095111846924 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046424, + "balance_loss_mlp": 1.00636971, + "epoch": 0.8510965756060023, + "flos": 584829643776.0, + "grad_norm": 0.037852967205166614, + "language_loss": 0.8484906, + "learning_rate": 5.702721946364264e-05, + "loss": 0.85895479, + "num_input_tokens_seen": 367294768, + "router_z_loss_mlp": 0.40039062, + "step": 4424, + "time_per_iteration": 2.7089426517486572 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046459, + "balance_loss_mlp": 1.0064522, + "epoch": 0.8512889572912659, + "flos": 602018764032.0, + "grad_norm": 0.0370000397469383, + "language_loss": 0.77723837, + "learning_rate": 5.688281548923796e-05, + "loss": 0.78770298, + "num_input_tokens_seen": 367372368, + "router_z_loss_mlp": 0.39990234, + "step": 4425, + "time_per_iteration": 2.759779930114746 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045288, + "balance_loss_mlp": 1.00530505, + "epoch": 0.8514813389765294, + "flos": 656066851584.0, + "grad_norm": 0.0349446752760593, + "language_loss": 0.79018658, + "learning_rate": 5.673858354818151e-05, + "loss": 0.80063945, + "num_input_tokens_seen": 367452656, + "router_z_loss_mlp": 0.3996582, + "step": 4426, + "time_per_iteration": 2.8464009761810303 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045118, + "balance_loss_mlp": 1.00515938, + "epoch": 0.851673720661793, + "flos": 430659273216.0, + "grad_norm": 0.040120645560250315, + "language_loss": 0.78890347, + "learning_rate": 5.6594523696468726e-05, + "loss": 0.79935461, + "num_input_tokens_seen": 367517808, + "router_z_loss_mlp": 0.39941406, + "step": 4427, + "time_per_iteration": 2.5440762042999268 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045694, + "balance_loss_mlp": 1.00573528, + "epoch": 0.8518661023470565, + "flos": 642759208704.0, + "grad_norm": 0.04399407608255305, + "language_loss": 0.80127829, + "learning_rate": 5.645063599002875e-05, + "loss": 0.81173521, + "num_input_tokens_seen": 367591728, + "router_z_loss_mlp": 0.39941406, + "step": 4428, + "time_per_iteration": 2.8217527866363525 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045635, + "balance_loss_mlp": 1.00565207, + "epoch": 0.8520584840323201, + "flos": 563199132672.0, + "grad_norm": 0.036053827771286885, + "language_loss": 0.80072153, + "learning_rate": 5.630692048472363e-05, + "loss": 0.81117785, + "num_input_tokens_seen": 367664496, + "router_z_loss_mlp": 0.3996582, + "step": 4429, + "time_per_iteration": 2.688634157180786 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045506, + "balance_loss_mlp": 1.00557125, + "epoch": 0.8522508657175837, + "flos": 528081143808.0, + "grad_norm": 0.038575296001451785, + "language_loss": 0.79170716, + "learning_rate": 5.61633772363489e-05, + "loss": 0.80216217, + "num_input_tokens_seen": 367735584, + "router_z_loss_mlp": 0.39916992, + "step": 4430, + "time_per_iteration": 2.594003915786743 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045227, + "balance_loss_mlp": 1.00538754, + "epoch": 0.8524432474028473, + "flos": 500103012864.0, + "grad_norm": 0.03514704462668056, + "language_loss": 0.81136119, + "learning_rate": 5.602000630063298e-05, + "loss": 0.82181346, + "num_input_tokens_seen": 367801136, + "router_z_loss_mlp": 0.39819336, + "step": 4431, + "time_per_iteration": 2.6524744033813477 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045858, + "balance_loss_mlp": 1.00585186, + "epoch": 0.8526356290881109, + "flos": 422216841216.0, + "grad_norm": 0.043916999307345196, + "language_loss": 0.80055523, + "learning_rate": 5.587680773323706e-05, + "loss": 0.81101382, + "num_input_tokens_seen": 367865312, + "router_z_loss_mlp": 0.39990234, + "step": 4432, + "time_per_iteration": 2.482304334640503 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045542, + "balance_loss_mlp": 1.00560737, + "epoch": 0.8528280107733743, + "flos": 508330616832.0, + "grad_norm": 0.03493847122451932, + "language_loss": 0.81211418, + "learning_rate": 5.5733781589756115e-05, + "loss": 0.82256961, + "num_input_tokens_seen": 367931104, + "router_z_loss_mlp": 0.39916992, + "step": 4433, + "time_per_iteration": 2.595567464828491 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045928, + "balance_loss_mlp": 1.00606406, + "epoch": 0.8530203924586379, + "flos": 446817027840.0, + "grad_norm": 0.037902023727573, + "language_loss": 0.83218634, + "learning_rate": 5.5590927925717684e-05, + "loss": 0.84264565, + "num_input_tokens_seen": 367995520, + "router_z_loss_mlp": 0.3984375, + "step": 4434, + "time_per_iteration": 2.583287477493286 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045846, + "balance_loss_mlp": 1.00593507, + "epoch": 0.8532127741439015, + "flos": 658990840320.0, + "grad_norm": 0.0366688600257907, + "language_loss": 0.84032941, + "learning_rate": 5.54482467965825e-05, + "loss": 0.85078788, + "num_input_tokens_seen": 368073664, + "router_z_loss_mlp": 0.39892578, + "step": 4435, + "time_per_iteration": 2.818974494934082 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045811, + "balance_loss_mlp": 1.00604296, + "epoch": 0.8534051558291651, + "flos": 537099040512.0, + "grad_norm": 0.030704738666311435, + "language_loss": 0.8344785, + "learning_rate": 5.5305738257744264e-05, + "loss": 0.84493661, + "num_input_tokens_seen": 368147536, + "router_z_loss_mlp": 0.39746094, + "step": 4436, + "time_per_iteration": 2.724040985107422 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045453, + "balance_loss_mlp": 1.00566089, + "epoch": 0.8535975375144286, + "flos": 534037991424.0, + "grad_norm": 0.044664222013351275, + "language_loss": 0.79138994, + "learning_rate": 5.5163402364529655e-05, + "loss": 0.80184448, + "num_input_tokens_seen": 368218672, + "router_z_loss_mlp": 0.39770508, + "step": 4437, + "time_per_iteration": 2.639385223388672 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044153, + "balance_loss_mlp": 1.00428987, + "epoch": 0.8537899191996922, + "flos": 575269330176.0, + "grad_norm": 0.04474802449890559, + "language_loss": 0.82764935, + "learning_rate": 5.502123917219848e-05, + "loss": 0.8380909, + "num_input_tokens_seen": 368287056, + "router_z_loss_mlp": 0.3984375, + "step": 4438, + "time_per_iteration": 2.7381479740142822 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044071, + "balance_loss_mlp": 1.00425565, + "epoch": 0.8539823008849557, + "flos": 466007641344.0, + "grad_norm": 0.03412606398220342, + "language_loss": 0.83686745, + "learning_rate": 5.48792487359433e-05, + "loss": 0.84730822, + "num_input_tokens_seen": 368358400, + "router_z_loss_mlp": 0.39794922, + "step": 4439, + "time_per_iteration": 2.7366132736206055 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044057, + "balance_loss_mlp": 1.00421679, + "epoch": 0.8541746825702193, + "flos": 555807508224.0, + "grad_norm": 0.03647803217669747, + "language_loss": 0.82074428, + "learning_rate": 5.4737431110889745e-05, + "loss": 0.83118486, + "num_input_tokens_seen": 368427168, + "router_z_loss_mlp": 0.39819336, + "step": 4440, + "time_per_iteration": 2.6492371559143066 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044386, + "balance_loss_mlp": 1.00459361, + "epoch": 0.8543670642554829, + "flos": 547558516992.0, + "grad_norm": 0.03717660212080705, + "language_loss": 0.78091979, + "learning_rate": 5.4595786352096165e-05, + "loss": 0.79136366, + "num_input_tokens_seen": 368503584, + "router_z_loss_mlp": 0.39770508, + "step": 4441, + "time_per_iteration": 2.7436399459838867 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044282, + "balance_loss_mlp": 1.00444269, + "epoch": 0.8545594459407464, + "flos": 513076263936.0, + "grad_norm": 0.033320650973310266, + "language_loss": 0.82524663, + "learning_rate": 5.4454314514554236e-05, + "loss": 0.83568943, + "num_input_tokens_seen": 368576976, + "router_z_loss_mlp": 0.39819336, + "step": 4442, + "time_per_iteration": 2.6414926052093506 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044386, + "balance_loss_mlp": 1.00454664, + "epoch": 0.85475182762601, + "flos": 422086583808.0, + "grad_norm": 0.03678224160115087, + "language_loss": 0.82031214, + "learning_rate": 5.431301565318786e-05, + "loss": 0.83075607, + "num_input_tokens_seen": 368641664, + "router_z_loss_mlp": 0.39819336, + "step": 4443, + "time_per_iteration": 2.5231664180755615 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043416, + "balance_loss_mlp": 1.00352824, + "epoch": 0.8549442093112736, + "flos": 390292104192.0, + "grad_norm": 0.043585312806385154, + "language_loss": 0.78223205, + "learning_rate": 5.41718898228542e-05, + "loss": 0.7926662, + "num_input_tokens_seen": 368705616, + "router_z_loss_mlp": 0.39868164, + "step": 4444, + "time_per_iteration": 2.4840567111968994 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043554, + "balance_loss_mlp": 1.00369012, + "epoch": 0.8551365909965372, + "flos": 607155183360.0, + "grad_norm": 0.037333626651253705, + "language_loss": 0.79457009, + "learning_rate": 5.403093707834334e-05, + "loss": 0.80500567, + "num_input_tokens_seen": 368779664, + "router_z_loss_mlp": 0.3984375, + "step": 4445, + "time_per_iteration": 2.793098211288452 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043537, + "balance_loss_mlp": 1.00355375, + "epoch": 0.8553289726818007, + "flos": 505156806912.0, + "grad_norm": 0.036563885282109194, + "language_loss": 0.79314804, + "learning_rate": 5.3890157474377865e-05, + "loss": 0.80358338, + "num_input_tokens_seen": 368846656, + "router_z_loss_mlp": 0.3996582, + "step": 4446, + "time_per_iteration": 2.5899364948272705 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046395, + "balance_loss_mlp": 1.00650752, + "epoch": 0.8555213543670642, + "flos": 558106454784.0, + "grad_norm": 0.03639099586734523, + "language_loss": 0.76389134, + "learning_rate": 5.374955106561324e-05, + "loss": 0.77435529, + "num_input_tokens_seen": 368923712, + "router_z_loss_mlp": 0.39868164, + "step": 4447, + "time_per_iteration": 2.7353360652923584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045537, + "balance_loss_mlp": 1.00555396, + "epoch": 0.8557137360523278, + "flos": 549153686784.0, + "grad_norm": 0.03500470093076036, + "language_loss": 0.75183821, + "learning_rate": 5.360911790663775e-05, + "loss": 0.76229358, + "num_input_tokens_seen": 368994496, + "router_z_loss_mlp": 0.3996582, + "step": 4448, + "time_per_iteration": 2.6300766468048096 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104387, + "balance_loss_mlp": 1.00393462, + "epoch": 0.8559061177375914, + "flos": 729504829440.0, + "grad_norm": 0.036315489674909586, + "language_loss": 0.79067165, + "learning_rate": 5.346885805197238e-05, + "loss": 0.80111039, + "num_input_tokens_seen": 369077088, + "router_z_loss_mlp": 0.39916992, + "step": 4449, + "time_per_iteration": 2.997072219848633 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043823, + "balance_loss_mlp": 1.00391161, + "epoch": 0.856098499422855, + "flos": 536977531392.0, + "grad_norm": 0.03929943737892077, + "language_loss": 0.83804214, + "learning_rate": 5.332877155607085e-05, + "loss": 0.8484804, + "num_input_tokens_seen": 369147680, + "router_z_loss_mlp": 0.39892578, + "step": 4450, + "time_per_iteration": 2.7184524536132812 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044891, + "balance_loss_mlp": 1.00493169, + "epoch": 0.8562908811081185, + "flos": 574776490752.0, + "grad_norm": 0.03344530380286612, + "language_loss": 0.83789825, + "learning_rate": 5.3188858473319504e-05, + "loss": 0.84834719, + "num_input_tokens_seen": 369224320, + "router_z_loss_mlp": 0.39941406, + "step": 4451, + "time_per_iteration": 2.730924367904663 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043477, + "balance_loss_mlp": 1.00354207, + "epoch": 0.856483262793382, + "flos": 783216579840.0, + "grad_norm": 0.03711590961781359, + "language_loss": 0.80795747, + "learning_rate": 5.3049118858037426e-05, + "loss": 0.81839228, + "num_input_tokens_seen": 369315744, + "router_z_loss_mlp": 0.39916992, + "step": 4452, + "time_per_iteration": 3.097334146499634 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104226, + "balance_loss_mlp": 1.0023483, + "epoch": 0.8566756444786456, + "flos": 456757420032.0, + "grad_norm": 0.03247549077414915, + "language_loss": 0.85096687, + "learning_rate": 5.290955276447651e-05, + "loss": 0.8613894, + "num_input_tokens_seen": 369382800, + "router_z_loss_mlp": 0.39892578, + "step": 4453, + "time_per_iteration": 2.5756120681762695 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042269, + "balance_loss_mlp": 1.00226271, + "epoch": 0.8568680261639092, + "flos": 450316481280.0, + "grad_norm": 0.03777805349232298, + "language_loss": 0.84773082, + "learning_rate": 5.277016024682091e-05, + "loss": 0.85815352, + "num_input_tokens_seen": 369447312, + "router_z_loss_mlp": 0.39990234, + "step": 4454, + "time_per_iteration": 2.5053937435150146 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042181, + "balance_loss_mlp": 1.00222218, + "epoch": 0.8570604078491728, + "flos": 480938644224.0, + "grad_norm": 0.046064792472179546, + "language_loss": 0.82946479, + "learning_rate": 5.2630941359187665e-05, + "loss": 0.83988655, + "num_input_tokens_seen": 369512800, + "router_z_loss_mlp": 0.39941406, + "step": 4455, + "time_per_iteration": 2.5223333835601807 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042026, + "balance_loss_mlp": 1.00209057, + "epoch": 0.8572527895344363, + "flos": 506934723840.0, + "grad_norm": 0.035455469317855655, + "language_loss": 0.85363388, + "learning_rate": 5.249189615562627e-05, + "loss": 0.86405408, + "num_input_tokens_seen": 369580720, + "router_z_loss_mlp": 0.39916992, + "step": 4456, + "time_per_iteration": 2.575731039047241 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042121, + "balance_loss_mlp": 1.0021379, + "epoch": 0.8574451712196999, + "flos": 788476453632.0, + "grad_norm": 0.04614360974080898, + "language_loss": 0.8365714, + "learning_rate": 5.235302469011905e-05, + "loss": 0.84699261, + "num_input_tokens_seen": 369672544, + "router_z_loss_mlp": 0.3996582, + "step": 4457, + "time_per_iteration": 3.0536177158355713 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042775, + "balance_loss_mlp": 1.00283957, + "epoch": 0.8576375529049635, + "flos": 510347661312.0, + "grad_norm": 0.03326548366354344, + "language_loss": 0.7548418, + "learning_rate": 5.2214327016580575e-05, + "loss": 0.76526952, + "num_input_tokens_seen": 369745776, + "router_z_loss_mlp": 0.39916992, + "step": 4458, + "time_per_iteration": 2.69887375831604 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043285, + "balance_loss_mlp": 1.00447083, + "epoch": 0.857829934590227, + "flos": 1463891387904.0, + "grad_norm": 0.004297416027187635, + "language_loss": 0.84767288, + "learning_rate": 5.207580318885802e-05, + "loss": 0.85810578, + "num_input_tokens_seen": 369975200, + "router_z_loss_mlp": 0.38769531, + "step": 4459, + "time_per_iteration": 4.931604623794556 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010427, + "balance_loss_mlp": 1.00283611, + "epoch": 0.8580223162754905, + "flos": 480259166976.0, + "grad_norm": 0.03161384875669954, + "language_loss": 0.89644599, + "learning_rate": 5.193745326073118e-05, + "loss": 0.90687293, + "num_input_tokens_seen": 370043296, + "router_z_loss_mlp": 0.3984375, + "step": 4460, + "time_per_iteration": 2.6447529792785645 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042107, + "balance_loss_mlp": 1.00214815, + "epoch": 0.8582146979607541, + "flos": 707457301248.0, + "grad_norm": 0.03987085543559285, + "language_loss": 0.79637587, + "learning_rate": 5.179927728591227e-05, + "loss": 0.80679691, + "num_input_tokens_seen": 370111152, + "router_z_loss_mlp": 0.39941406, + "step": 4461, + "time_per_iteration": 2.836300849914551 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043313, + "balance_loss_mlp": 1.00325847, + "epoch": 0.8584070796460177, + "flos": 766494054144.0, + "grad_norm": 0.03752152487734852, + "language_loss": 0.8312273, + "learning_rate": 5.1661275318045874e-05, + "loss": 0.84166038, + "num_input_tokens_seen": 370190272, + "router_z_loss_mlp": 0.40039062, + "step": 4462, + "time_per_iteration": 3.0077900886535645 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104119, + "balance_loss_mlp": 1.00120723, + "epoch": 0.8585994613312813, + "flos": 588010256640.0, + "grad_norm": 0.035369500112361965, + "language_loss": 0.86055285, + "learning_rate": 5.152344741070919e-05, + "loss": 0.87096477, + "num_input_tokens_seen": 370267056, + "router_z_loss_mlp": 0.3996582, + "step": 4463, + "time_per_iteration": 2.777745008468628 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01041881, + "balance_loss_mlp": 1.0019455, + "epoch": 0.8587918430165449, + "flos": 609510510336.0, + "grad_norm": 0.03358316134119744, + "language_loss": 0.79521871, + "learning_rate": 5.138579361741169e-05, + "loss": 0.80563754, + "num_input_tokens_seen": 370344176, + "router_z_loss_mlp": 0.39916992, + "step": 4464, + "time_per_iteration": 2.8097100257873535 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042564, + "balance_loss_mlp": 1.00265265, + "epoch": 0.8589842247018084, + "flos": 590070075648.0, + "grad_norm": 0.037330813544588, + "language_loss": 0.81354475, + "learning_rate": 5.124831399159535e-05, + "loss": 0.82397044, + "num_input_tokens_seen": 370414224, + "router_z_loss_mlp": 0.39892578, + "step": 4465, + "time_per_iteration": 2.6861515045166016 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043586, + "balance_loss_mlp": 1.00357902, + "epoch": 0.8591766063870719, + "flos": 544964062464.0, + "grad_norm": 0.047961179507573816, + "language_loss": 0.79156327, + "learning_rate": 5.1111008586634475e-05, + "loss": 0.80199909, + "num_input_tokens_seen": 370484736, + "router_z_loss_mlp": 0.39990234, + "step": 4466, + "time_per_iteration": 2.645440101623535 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043306, + "balance_loss_mlp": 1.00329959, + "epoch": 0.8593689880723355, + "flos": 494786758656.0, + "grad_norm": 0.038918895611797386, + "language_loss": 0.81233793, + "learning_rate": 5.0973877455835816e-05, + "loss": 0.82277095, + "num_input_tokens_seen": 370556512, + "router_z_loss_mlp": 0.39990234, + "step": 4467, + "time_per_iteration": 2.674589157104492 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104568, + "balance_loss_mlp": 1.00550628, + "epoch": 0.8595613697575991, + "flos": 534941044992.0, + "grad_norm": 0.04226975076032596, + "language_loss": 0.84279299, + "learning_rate": 5.083692065243822e-05, + "loss": 0.85324979, + "num_input_tokens_seen": 370622880, + "router_z_loss_mlp": 0.40161133, + "step": 4468, + "time_per_iteration": 2.6663825511932373 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104559, + "balance_loss_mlp": 1.0055114, + "epoch": 0.8597537514428626, + "flos": 618755874048.0, + "grad_norm": 0.03926324289956361, + "language_loss": 0.76639593, + "learning_rate": 5.070013822961328e-05, + "loss": 0.77685177, + "num_input_tokens_seen": 370691632, + "router_z_loss_mlp": 0.40063477, + "step": 4469, + "time_per_iteration": 2.7248737812042236 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046057, + "balance_loss_mlp": 1.00585926, + "epoch": 0.8599461331281262, + "flos": 609857541120.0, + "grad_norm": 0.03895417100249646, + "language_loss": 0.84123969, + "learning_rate": 5.056353024046462e-05, + "loss": 0.85170031, + "num_input_tokens_seen": 370764848, + "router_z_loss_mlp": 0.40185547, + "step": 4470, + "time_per_iteration": 2.754119396209717 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045902, + "balance_loss_mlp": 1.00572836, + "epoch": 0.8601385148133898, + "flos": 552344993280.0, + "grad_norm": 0.037895908280551775, + "language_loss": 0.83561713, + "learning_rate": 5.042709673802786e-05, + "loss": 0.84607613, + "num_input_tokens_seen": 370832496, + "router_z_loss_mlp": 0.40161133, + "step": 4471, + "time_per_iteration": 2.6509101390838623 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045785, + "balance_loss_mlp": 1.00565875, + "epoch": 0.8603308964986534, + "flos": 582379052544.0, + "grad_norm": 0.03060970457898105, + "language_loss": 0.81397867, + "learning_rate": 5.0290837775271494e-05, + "loss": 0.82443655, + "num_input_tokens_seen": 370917104, + "router_z_loss_mlp": 0.40112305, + "step": 4472, + "time_per_iteration": 2.8442461490631104 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045958, + "balance_loss_mlp": 1.00585616, + "epoch": 0.8605232781839169, + "flos": 630148539648.0, + "grad_norm": 0.0379081889526199, + "language_loss": 0.75537038, + "learning_rate": 5.0154753405095846e-05, + "loss": 0.76582998, + "num_input_tokens_seen": 370984512, + "router_z_loss_mlp": 0.40087891, + "step": 4473, + "time_per_iteration": 2.7697927951812744 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042775, + "balance_loss_mlp": 1.00264955, + "epoch": 0.8607156598691804, + "flos": 469090077696.0, + "grad_norm": 0.03841128602003186, + "language_loss": 0.77358502, + "learning_rate": 5.0018843680333604e-05, + "loss": 0.78401279, + "num_input_tokens_seen": 371049664, + "router_z_loss_mlp": 0.40112305, + "step": 4474, + "time_per_iteration": 2.5194132328033447 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044527, + "balance_loss_mlp": 1.00430596, + "epoch": 0.860908041554444, + "flos": 489407321088.0, + "grad_norm": 0.035006828141935564, + "language_loss": 0.82981098, + "learning_rate": 4.988310865374945e-05, + "loss": 0.84025621, + "num_input_tokens_seen": 371120704, + "router_z_loss_mlp": 0.40209961, + "step": 4475, + "time_per_iteration": 2.652743339538574 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043859, + "balance_loss_mlp": 1.00361395, + "epoch": 0.8611004232397076, + "flos": 593170008576.0, + "grad_norm": 0.039966577780763526, + "language_loss": 0.80588216, + "learning_rate": 4.974754837804057e-05, + "loss": 0.81632078, + "num_input_tokens_seen": 371189376, + "router_z_loss_mlp": 0.40234375, + "step": 4476, + "time_per_iteration": 2.666529655456543 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043525, + "balance_loss_mlp": 1.00332773, + "epoch": 0.8612928049249712, + "flos": 775622766336.0, + "grad_norm": 0.03341468834325145, + "language_loss": 0.86407369, + "learning_rate": 4.9612162905836036e-05, + "loss": 0.87450892, + "num_input_tokens_seen": 371275184, + "router_z_loss_mlp": 0.40185547, + "step": 4477, + "time_per_iteration": 3.053422212600708 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043463, + "balance_loss_mlp": 1.00321817, + "epoch": 0.8614851866102347, + "flos": 538607694336.0, + "grad_norm": 0.051459370752897714, + "language_loss": 0.82765687, + "learning_rate": 4.947695228969718e-05, + "loss": 0.83809155, + "num_input_tokens_seen": 371347920, + "router_z_loss_mlp": 0.40234375, + "step": 4478, + "time_per_iteration": 2.6455395221710205 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104286, + "balance_loss_mlp": 1.00275767, + "epoch": 0.8616775682954982, + "flos": 566996039424.0, + "grad_norm": 0.03431517223967173, + "language_loss": 0.79459572, + "learning_rate": 4.934191658211729e-05, + "loss": 0.80502427, + "num_input_tokens_seen": 371419728, + "router_z_loss_mlp": 0.40087891, + "step": 4479, + "time_per_iteration": 2.7187790870666504 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01041966, + "balance_loss_mlp": 1.0017215, + "epoch": 0.8618699499807618, + "flos": 482558113536.0, + "grad_norm": 0.0381793142401585, + "language_loss": 0.82083333, + "learning_rate": 4.92070558355221e-05, + "loss": 0.83125293, + "num_input_tokens_seen": 371488768, + "router_z_loss_mlp": 0.40234375, + "step": 4480, + "time_per_iteration": 2.6569716930389404 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042011, + "balance_loss_mlp": 1.00176573, + "epoch": 0.8620623316660254, + "flos": 650680611072.0, + "grad_norm": 0.043394655514730936, + "language_loss": 0.74778575, + "learning_rate": 4.9072370102269226e-05, + "loss": 0.75820589, + "num_input_tokens_seen": 371560144, + "router_z_loss_mlp": 0.40234375, + "step": 4481, + "time_per_iteration": 2.7786409854888916 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042981, + "balance_loss_mlp": 1.00283086, + "epoch": 0.862254713351289, + "flos": 753082398720.0, + "grad_norm": 0.0457468668200295, + "language_loss": 0.8622297, + "learning_rate": 4.893785943464801e-05, + "loss": 0.87265956, + "num_input_tokens_seen": 371635920, + "router_z_loss_mlp": 0.40136719, + "step": 4482, + "time_per_iteration": 3.0409467220306396 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043593, + "balance_loss_mlp": 1.00346696, + "epoch": 0.8624470950365525, + "flos": 843136944384.0, + "grad_norm": 0.07263887982948322, + "language_loss": 0.7833854, + "learning_rate": 4.880352388488024e-05, + "loss": 0.79382133, + "num_input_tokens_seen": 371727664, + "router_z_loss_mlp": 0.40112305, + "step": 4483, + "time_per_iteration": 3.2473502159118652 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043317, + "balance_loss_mlp": 1.00316727, + "epoch": 0.8626394767218161, + "flos": 756089012736.0, + "grad_norm": 0.03708175872595014, + "language_loss": 0.83609211, + "learning_rate": 4.866936350511969e-05, + "loss": 0.84652531, + "num_input_tokens_seen": 371800832, + "router_z_loss_mlp": 0.40136719, + "step": 4484, + "time_per_iteration": 2.9040815830230713 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044262, + "balance_loss_mlp": 1.00415993, + "epoch": 0.8628318584070797, + "flos": 704858956032.0, + "grad_norm": 0.03939072014015938, + "language_loss": 0.8295635, + "learning_rate": 4.853537834745203e-05, + "loss": 0.84000611, + "num_input_tokens_seen": 371871472, + "router_z_loss_mlp": 0.40087891, + "step": 4485, + "time_per_iteration": 2.871338367462158 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043684, + "balance_loss_mlp": 1.00367761, + "epoch": 0.8630242400923432, + "flos": 472198758912.0, + "grad_norm": 0.03510006759017659, + "language_loss": 0.77971268, + "learning_rate": 4.840156846389487e-05, + "loss": 0.79014951, + "num_input_tokens_seen": 371936512, + "router_z_loss_mlp": 0.39990234, + "step": 4486, + "time_per_iteration": 2.571122169494629 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043761, + "balance_loss_mlp": 1.00370646, + "epoch": 0.8632166217776067, + "flos": 965963878656.0, + "grad_norm": 0.04035538480745229, + "language_loss": 0.77694219, + "learning_rate": 4.826793390639783e-05, + "loss": 0.78737986, + "num_input_tokens_seen": 372018032, + "router_z_loss_mlp": 0.40039062, + "step": 4487, + "time_per_iteration": 3.206270456314087 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044627, + "balance_loss_mlp": 1.00469244, + "epoch": 0.8634090034628703, + "flos": 769240153344.0, + "grad_norm": 0.03791528570619557, + "language_loss": 0.79186964, + "learning_rate": 4.813447472684246e-05, + "loss": 0.80231589, + "num_input_tokens_seen": 372092176, + "router_z_loss_mlp": 0.39916992, + "step": 4488, + "time_per_iteration": 2.956378936767578 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045738, + "balance_loss_mlp": 1.00585032, + "epoch": 0.8636013851481339, + "flos": 521720884992.0, + "grad_norm": 0.03486742602328962, + "language_loss": 0.83548576, + "learning_rate": 4.800119097704214e-05, + "loss": 0.84594309, + "num_input_tokens_seen": 372166880, + "router_z_loss_mlp": 0.39868164, + "step": 4489, + "time_per_iteration": 2.7456369400024414 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044161, + "balance_loss_mlp": 1.00417769, + "epoch": 0.8637937668333975, + "flos": 633294159360.0, + "grad_norm": 0.03761878159838688, + "language_loss": 0.81188381, + "learning_rate": 4.7868082708742324e-05, + "loss": 0.82232535, + "num_input_tokens_seen": 372234608, + "router_z_loss_mlp": 0.3996582, + "step": 4490, + "time_per_iteration": 2.7456276416778564 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104453, + "balance_loss_mlp": 1.00466633, + "epoch": 0.8639861485186611, + "flos": 857522618112.0, + "grad_norm": 0.032138271837822946, + "language_loss": 0.76775849, + "learning_rate": 4.773514997362e-05, + "loss": 0.77820385, + "num_input_tokens_seen": 372314704, + "router_z_loss_mlp": 0.3984375, + "step": 4491, + "time_per_iteration": 3.0699753761291504 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049712, + "balance_loss_mlp": 1.00980031, + "epoch": 0.8641785302039245, + "flos": 482241218304.0, + "grad_norm": 0.04135190383528018, + "language_loss": 0.78167886, + "learning_rate": 4.7602392823284605e-05, + "loss": 0.79217601, + "num_input_tokens_seen": 372374848, + "router_z_loss_mlp": 0.39892578, + "step": 4492, + "time_per_iteration": 2.5359408855438232 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049852, + "balance_loss_mlp": 1.00989342, + "epoch": 0.8643709118891881, + "flos": 505649646336.0, + "grad_norm": 0.037459806999943016, + "language_loss": 0.80912906, + "learning_rate": 4.746981130927675e-05, + "loss": 0.81962758, + "num_input_tokens_seen": 372442432, + "router_z_loss_mlp": 0.39941406, + "step": 4493, + "time_per_iteration": 2.5917551517486572 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049635, + "balance_loss_mlp": 1.00972354, + "epoch": 0.8645632935744517, + "flos": 553552303104.0, + "grad_norm": 0.03733364468795558, + "language_loss": 0.82919705, + "learning_rate": 4.733740548306908e-05, + "loss": 0.83969343, + "num_input_tokens_seen": 372520048, + "router_z_loss_mlp": 0.39892578, + "step": 4494, + "time_per_iteration": 2.78363299369812 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043104, + "balance_loss_mlp": 1.00309765, + "epoch": 0.8647556752597153, + "flos": 525736510464.0, + "grad_norm": 0.0378697936627377, + "language_loss": 0.84654057, + "learning_rate": 4.7205175396066336e-05, + "loss": 0.85697162, + "num_input_tokens_seen": 372587968, + "router_z_loss_mlp": 0.39990234, + "step": 4495, + "time_per_iteration": 2.5666584968566895 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042802, + "balance_loss_mlp": 1.00284255, + "epoch": 0.8649480569449788, + "flos": 789238556160.0, + "grad_norm": 0.04501425991105197, + "language_loss": 0.83136499, + "learning_rate": 4.707312109960471e-05, + "loss": 0.841793, + "num_input_tokens_seen": 372672544, + "router_z_loss_mlp": 0.39941406, + "step": 4496, + "time_per_iteration": 3.103167772293091 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043233, + "balance_loss_mlp": 1.00332153, + "epoch": 0.8651404386302424, + "flos": 765200228352.0, + "grad_norm": 0.038819843203582616, + "language_loss": 0.77151841, + "learning_rate": 4.694124264495225e-05, + "loss": 0.78195071, + "num_input_tokens_seen": 372751296, + "router_z_loss_mlp": 0.39892578, + "step": 4497, + "time_per_iteration": 3.0549564361572266 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104286, + "balance_loss_mlp": 1.002949, + "epoch": 0.865332820315506, + "flos": 540989266176.0, + "grad_norm": 0.06154975091588949, + "language_loss": 0.82805288, + "learning_rate": 4.680954008330851e-05, + "loss": 0.83848143, + "num_input_tokens_seen": 372825264, + "router_z_loss_mlp": 0.39892578, + "step": 4498, + "time_per_iteration": 2.718996286392212 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043068, + "balance_loss_mlp": 1.00396729, + "epoch": 0.8655252020007695, + "flos": 1479679757568.0, + "grad_norm": 0.0038323013026693355, + "language_loss": 0.79174447, + "learning_rate": 4.667801346580519e-05, + "loss": 0.80217516, + "num_input_tokens_seen": 373052000, + "router_z_loss_mlp": 0.390625, + "step": 4499, + "time_per_iteration": 4.773655652999878 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01041418, + "balance_loss_mlp": 1.00148308, + "epoch": 0.8657175836860331, + "flos": 518473198080.0, + "grad_norm": 0.03250129983959287, + "language_loss": 0.82903546, + "learning_rate": 4.6546662843505396e-05, + "loss": 0.83944964, + "num_input_tokens_seen": 373124128, + "router_z_loss_mlp": 0.39916992, + "step": 4500, + "time_per_iteration": 2.7259538173675537 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01041689, + "balance_loss_mlp": 1.00168264, + "epoch": 0.8659099653712966, + "flos": 591633164544.0, + "grad_norm": 0.06794339939961025, + "language_loss": 0.80461693, + "learning_rate": 4.641548826740394e-05, + "loss": 0.81503385, + "num_input_tokens_seen": 373195472, + "router_z_loss_mlp": 0.39990234, + "step": 4501, + "time_per_iteration": 2.7145915031433105 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042956, + "balance_loss_mlp": 1.00302076, + "epoch": 0.8661023470565602, + "flos": 591576784128.0, + "grad_norm": 0.031149828188743837, + "language_loss": 0.88302559, + "learning_rate": 4.628448978842731e-05, + "loss": 0.89345515, + "num_input_tokens_seen": 373273504, + "router_z_loss_mlp": 0.39916992, + "step": 4502, + "time_per_iteration": 2.8617639541625977 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045801, + "balance_loss_mlp": 1.00586581, + "epoch": 0.8662947287418238, + "flos": 568737017856.0, + "grad_norm": 0.03468810910304343, + "language_loss": 0.80155271, + "learning_rate": 4.61536674574336e-05, + "loss": 0.81201071, + "num_input_tokens_seen": 373346032, + "router_z_loss_mlp": 0.39916992, + "step": 4503, + "time_per_iteration": 2.7440474033355713 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046514, + "balance_loss_mlp": 1.00653136, + "epoch": 0.8664871104270874, + "flos": 517003428096.0, + "grad_norm": 0.031791485322302415, + "language_loss": 0.82510114, + "learning_rate": 4.6023021325212636e-05, + "loss": 0.83556628, + "num_input_tokens_seen": 373419968, + "router_z_loss_mlp": 0.3996582, + "step": 4504, + "time_per_iteration": 2.792924642562866 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043534, + "balance_loss_mlp": 1.00352764, + "epoch": 0.866679492112351, + "flos": 558430152960.0, + "grad_norm": 0.03528793241244482, + "language_loss": 0.7860105, + "learning_rate": 4.589255144248561e-05, + "loss": 0.79644579, + "num_input_tokens_seen": 373502448, + "router_z_loss_mlp": 0.39990234, + "step": 4505, + "time_per_iteration": 2.8408970832824707 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043023, + "balance_loss_mlp": 1.00308836, + "epoch": 0.8668718737976144, + "flos": 723662688000.0, + "grad_norm": 0.04842825321665466, + "language_loss": 0.82218444, + "learning_rate": 4.57622578599054e-05, + "loss": 0.83261466, + "num_input_tokens_seen": 373581184, + "router_z_loss_mlp": 0.39916992, + "step": 4506, + "time_per_iteration": 2.9215447902679443 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043183, + "balance_loss_mlp": 1.00320053, + "epoch": 0.867064255482878, + "flos": 601834071552.0, + "grad_norm": 0.0362899031078603, + "language_loss": 0.84903908, + "learning_rate": 4.5632140628056705e-05, + "loss": 0.8594709, + "num_input_tokens_seen": 373652272, + "router_z_loss_mlp": 0.3996582, + "step": 4507, + "time_per_iteration": 2.7223806381225586 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042871, + "balance_loss_mlp": 1.00286424, + "epoch": 0.8672566371681416, + "flos": 804933606912.0, + "grad_norm": 0.038978815297564966, + "language_loss": 0.76347792, + "learning_rate": 4.550219979745529e-05, + "loss": 0.77390665, + "num_input_tokens_seen": 373734896, + "router_z_loss_mlp": 0.39990234, + "step": 4508, + "time_per_iteration": 3.089169979095459 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042127, + "balance_loss_mlp": 1.00223958, + "epoch": 0.8674490188534052, + "flos": 628555315200.0, + "grad_norm": 0.03358541173243661, + "language_loss": 0.84039915, + "learning_rate": 4.5372435418548905e-05, + "loss": 0.85082042, + "num_input_tokens_seen": 373806960, + "router_z_loss_mlp": 0.39868164, + "step": 4509, + "time_per_iteration": 2.7640254497528076 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042143, + "balance_loss_mlp": 1.0022794, + "epoch": 0.8676414005386687, + "flos": 729205430784.0, + "grad_norm": 0.03339457562318333, + "language_loss": 0.86446714, + "learning_rate": 4.524284754171615e-05, + "loss": 0.87488854, + "num_input_tokens_seen": 373888352, + "router_z_loss_mlp": 0.3984375, + "step": 4510, + "time_per_iteration": 2.9625110626220703 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042812, + "balance_loss_mlp": 1.0029006, + "epoch": 0.8678337822239323, + "flos": 541163265024.0, + "grad_norm": 0.03671096945461231, + "language_loss": 0.81317061, + "learning_rate": 4.5113436217267765e-05, + "loss": 0.82359874, + "num_input_tokens_seen": 373962112, + "router_z_loss_mlp": 0.39892578, + "step": 4511, + "time_per_iteration": 2.7604947090148926 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042756, + "balance_loss_mlp": 1.00279653, + "epoch": 0.8680261639091958, + "flos": 508526002944.0, + "grad_norm": 0.03890609352738196, + "language_loss": 0.7946341, + "learning_rate": 4.4984201495445744e-05, + "loss": 0.8050617, + "num_input_tokens_seen": 374028256, + "router_z_loss_mlp": 0.39941406, + "step": 4512, + "time_per_iteration": 2.556802272796631 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044096, + "balance_loss_mlp": 1.00420845, + "epoch": 0.8682185455944594, + "flos": 488150433792.0, + "grad_norm": 0.035041700942467814, + "language_loss": 0.81525004, + "learning_rate": 4.4855143426423275e-05, + "loss": 0.82569093, + "num_input_tokens_seen": 374100080, + "router_z_loss_mlp": 0.39868164, + "step": 4513, + "time_per_iteration": 2.6287481784820557 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043961, + "balance_loss_mlp": 1.00412118, + "epoch": 0.868410927279723, + "flos": 604803747072.0, + "grad_norm": 0.03776631207974779, + "language_loss": 0.81658906, + "learning_rate": 4.472626206030528e-05, + "loss": 0.82702863, + "num_input_tokens_seen": 374174368, + "router_z_loss_mlp": 0.39819336, + "step": 4514, + "time_per_iteration": 2.7213940620422363 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01041695, + "balance_loss_mlp": 1.00180769, + "epoch": 0.8686033089649865, + "flos": 1120722352896.0, + "grad_norm": 0.03739432356503297, + "language_loss": 0.85135609, + "learning_rate": 4.4597557447127846e-05, + "loss": 0.86177301, + "num_input_tokens_seen": 374257328, + "router_z_loss_mlp": 0.39868164, + "step": 4515, + "time_per_iteration": 3.3755922317504883 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01041832, + "balance_loss_mlp": 1.0019449, + "epoch": 0.8687956906502501, + "flos": 569099599872.0, + "grad_norm": 0.03900935559178445, + "language_loss": 0.84242064, + "learning_rate": 4.446902963685862e-05, + "loss": 0.85283899, + "num_input_tokens_seen": 374327936, + "router_z_loss_mlp": 0.39868164, + "step": 4516, + "time_per_iteration": 2.6541643142700195 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01041671, + "balance_loss_mlp": 1.00171173, + "epoch": 0.8689880723355137, + "flos": 545411215104.0, + "grad_norm": 0.037327297055917835, + "language_loss": 0.84681803, + "learning_rate": 4.4340678679396454e-05, + "loss": 0.85723472, + "num_input_tokens_seen": 374400496, + "router_z_loss_mlp": 0.39941406, + "step": 4517, + "time_per_iteration": 2.646094799041748 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01041033, + "balance_loss_mlp": 1.00119364, + "epoch": 0.8691804540207773, + "flos": 458385637632.0, + "grad_norm": 0.037344798435976774, + "language_loss": 0.86974192, + "learning_rate": 4.4212504624571495e-05, + "loss": 0.88015229, + "num_input_tokens_seen": 374470528, + "router_z_loss_mlp": 0.39819336, + "step": 4518, + "time_per_iteration": 2.580050468444824 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01040987, + "balance_loss_mlp": 1.00109982, + "epoch": 0.8693728357060407, + "flos": 593000867328.0, + "grad_norm": 0.04024352256498578, + "language_loss": 0.80694568, + "learning_rate": 4.40845075221456e-05, + "loss": 0.81735557, + "num_input_tokens_seen": 374542656, + "router_z_loss_mlp": 0.39868164, + "step": 4519, + "time_per_iteration": 2.6886699199676514 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104283, + "balance_loss_mlp": 1.00299013, + "epoch": 0.8695652173913043, + "flos": 681524404992.0, + "grad_norm": 0.038580823232518636, + "language_loss": 0.80304897, + "learning_rate": 4.395668742181164e-05, + "loss": 0.81347722, + "num_input_tokens_seen": 374617232, + "router_z_loss_mlp": 0.39819336, + "step": 4520, + "time_per_iteration": 2.8930678367614746 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042873, + "balance_loss_mlp": 1.00298536, + "epoch": 0.8697575990765679, + "flos": 493336430592.0, + "grad_norm": 0.04547224074564954, + "language_loss": 0.78913867, + "learning_rate": 4.38290443731934e-05, + "loss": 0.79956746, + "num_input_tokens_seen": 374681888, + "router_z_loss_mlp": 0.39868164, + "step": 4521, + "time_per_iteration": 2.5533957481384277 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042067, + "balance_loss_mlp": 1.00215554, + "epoch": 0.8699499807618315, + "flos": 527987824896.0, + "grad_norm": 0.032020821735607795, + "language_loss": 0.82212275, + "learning_rate": 4.370157842584671e-05, + "loss": 0.83254337, + "num_input_tokens_seen": 374750464, + "router_z_loss_mlp": 0.39892578, + "step": 4522, + "time_per_iteration": 2.6833107471466064 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042365, + "balance_loss_mlp": 1.00240612, + "epoch": 0.8701423624470951, + "flos": 815794549248.0, + "grad_norm": 0.04802472888774931, + "language_loss": 0.80982125, + "learning_rate": 4.357428962925808e-05, + "loss": 0.82024491, + "num_input_tokens_seen": 374836064, + "router_z_loss_mlp": 0.39941406, + "step": 4523, + "time_per_iteration": 3.1088523864746094 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042816, + "balance_loss_mlp": 1.00285709, + "epoch": 0.8703347441323586, + "flos": 557874130176.0, + "grad_norm": 0.034879545908552134, + "language_loss": 0.89101827, + "learning_rate": 4.344717803284542e-05, + "loss": 0.90144646, + "num_input_tokens_seen": 374903392, + "router_z_loss_mlp": 0.39941406, + "step": 4524, + "time_per_iteration": 2.664231538772583 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042699, + "balance_loss_mlp": 1.00281131, + "epoch": 0.8705271258176221, + "flos": 586614363648.0, + "grad_norm": 0.03362644335681585, + "language_loss": 0.84724236, + "learning_rate": 4.3320243685957825e-05, + "loss": 0.85766935, + "num_input_tokens_seen": 374985904, + "router_z_loss_mlp": 0.39868164, + "step": 4525, + "time_per_iteration": 2.838411808013916 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044069, + "balance_loss_mlp": 1.0041815, + "epoch": 0.8707195075028857, + "flos": 670503069696.0, + "grad_norm": 0.033221924916940926, + "language_loss": 0.85867798, + "learning_rate": 4.3193486637875536e-05, + "loss": 0.86911869, + "num_input_tokens_seen": 375062992, + "router_z_loss_mlp": 0.39868164, + "step": 4526, + "time_per_iteration": 2.8975462913513184 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044019, + "balance_loss_mlp": 1.00410795, + "epoch": 0.8709118891881493, + "flos": 521471063808.0, + "grad_norm": 0.03631625832210608, + "language_loss": 0.84302342, + "learning_rate": 4.306690693781007e-05, + "loss": 0.85346365, + "num_input_tokens_seen": 375139296, + "router_z_loss_mlp": 0.39892578, + "step": 4527, + "time_per_iteration": 2.7671144008636475 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044859, + "balance_loss_mlp": 1.00504327, + "epoch": 0.8711042708734128, + "flos": 554272609536.0, + "grad_norm": 0.0374848177192315, + "language_loss": 0.82055509, + "learning_rate": 4.294050463490401e-05, + "loss": 0.83100373, + "num_input_tokens_seen": 375206576, + "router_z_loss_mlp": 0.39794922, + "step": 4528, + "time_per_iteration": 2.6653261184692383 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044793, + "balance_loss_mlp": 1.00497687, + "epoch": 0.8712966525586764, + "flos": 503237938944.0, + "grad_norm": 0.039647791126880064, + "language_loss": 0.82525837, + "learning_rate": 4.281427977823094e-05, + "loss": 0.83570629, + "num_input_tokens_seen": 375279008, + "router_z_loss_mlp": 0.39794922, + "step": 4529, + "time_per_iteration": 2.712507486343384 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044425, + "balance_loss_mlp": 1.00460875, + "epoch": 0.87148903424394, + "flos": 805528513536.0, + "grad_norm": 0.034217964706317425, + "language_loss": 0.74154443, + "learning_rate": 4.268823241679593e-05, + "loss": 0.75198865, + "num_input_tokens_seen": 375368512, + "router_z_loss_mlp": 0.39794922, + "step": 4530, + "time_per_iteration": 3.035536050796509 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043593, + "balance_loss_mlp": 1.00382435, + "epoch": 0.8716814159292036, + "flos": 774841221888.0, + "grad_norm": 0.03641439178250716, + "language_loss": 0.86728388, + "learning_rate": 4.256236259953489e-05, + "loss": 0.87771976, + "num_input_tokens_seen": 375450528, + "router_z_loss_mlp": 0.39746094, + "step": 4531, + "time_per_iteration": 2.9879214763641357 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043781, + "balance_loss_mlp": 1.00408423, + "epoch": 0.8718737976144671, + "flos": 487798545408.0, + "grad_norm": 0.04080829111532849, + "language_loss": 0.85192716, + "learning_rate": 4.243667037531468e-05, + "loss": 0.86236501, + "num_input_tokens_seen": 375518256, + "router_z_loss_mlp": 0.39672852, + "step": 4532, + "time_per_iteration": 2.563370943069458 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042257, + "balance_loss_mlp": 1.00256038, + "epoch": 0.8720661792997306, + "flos": 585220416000.0, + "grad_norm": 0.041574006136382645, + "language_loss": 0.79068941, + "learning_rate": 4.2311155792933264e-05, + "loss": 0.80111194, + "num_input_tokens_seen": 375588112, + "router_z_loss_mlp": 0.39672852, + "step": 4533, + "time_per_iteration": 2.7681236267089844 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042343, + "balance_loss_mlp": 1.00343323, + "epoch": 0.8722585609849942, + "flos": 1499002573824.0, + "grad_norm": 0.003968417299446405, + "language_loss": 0.80966806, + "learning_rate": 4.2185818901119946e-05, + "loss": 0.82009149, + "num_input_tokens_seen": 375814496, + "router_z_loss_mlp": 0.38867188, + "step": 4534, + "time_per_iteration": 4.783138751983643 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104316, + "balance_loss_mlp": 1.00343895, + "epoch": 0.8724509426702578, + "flos": 597310055424.0, + "grad_norm": 0.03373273111678506, + "language_loss": 0.87846303, + "learning_rate": 4.206065974853479e-05, + "loss": 0.88889456, + "num_input_tokens_seen": 375885440, + "router_z_loss_mlp": 0.39697266, + "step": 4535, + "time_per_iteration": 2.7677438259124756 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104255, + "balance_loss_mlp": 1.00268626, + "epoch": 0.8726433243555214, + "flos": 444546271488.0, + "grad_norm": 0.05771786559784756, + "language_loss": 0.81709069, + "learning_rate": 4.193567838376888e-05, + "loss": 0.8275162, + "num_input_tokens_seen": 375952640, + "router_z_loss_mlp": 0.3984375, + "step": 4536, + "time_per_iteration": 2.616766929626465 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042442, + "balance_loss_mlp": 1.00272167, + "epoch": 0.8728357060407849, + "flos": 554235671040.0, + "grad_norm": 0.037464761752317666, + "language_loss": 0.82218051, + "learning_rate": 4.181087485534402e-05, + "loss": 0.83260494, + "num_input_tokens_seen": 376021648, + "router_z_loss_mlp": 0.39697266, + "step": 4537, + "time_per_iteration": 2.6978237628936768 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042626, + "balance_loss_mlp": 1.00276279, + "epoch": 0.8730280877260485, + "flos": 629019964416.0, + "grad_norm": 0.03588843210953962, + "language_loss": 0.78862292, + "learning_rate": 4.16862492117136e-05, + "loss": 0.79904926, + "num_input_tokens_seen": 376102304, + "router_z_loss_mlp": 0.3984375, + "step": 4538, + "time_per_iteration": 2.8252694606781006 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042261, + "balance_loss_mlp": 1.00244486, + "epoch": 0.873220469411312, + "flos": 536502188544.0, + "grad_norm": 0.03884922184031476, + "language_loss": 0.80336553, + "learning_rate": 4.156180150126143e-05, + "loss": 0.81378818, + "num_input_tokens_seen": 376177072, + "router_z_loss_mlp": 0.39794922, + "step": 4539, + "time_per_iteration": 2.75636625289917 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043096, + "balance_loss_mlp": 1.00337529, + "epoch": 0.8734128510965756, + "flos": 563001801216.0, + "grad_norm": 0.03313154117432453, + "language_loss": 0.84412879, + "learning_rate": 4.143753177230242e-05, + "loss": 0.85455978, + "num_input_tokens_seen": 376251376, + "router_z_loss_mlp": 0.39697266, + "step": 4540, + "time_per_iteration": 2.7357964515686035 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042748, + "balance_loss_mlp": 1.00307477, + "epoch": 0.8736052327818392, + "flos": 687804950784.0, + "grad_norm": 0.03579433558259156, + "language_loss": 0.79879081, + "learning_rate": 4.131344007308224e-05, + "loss": 0.80921829, + "num_input_tokens_seen": 376337104, + "router_z_loss_mlp": 0.39648438, + "step": 4541, + "time_per_iteration": 3.0047316551208496 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042403, + "balance_loss_mlp": 1.00275385, + "epoch": 0.8737976144671027, + "flos": 532833593856.0, + "grad_norm": 0.03592025812495919, + "language_loss": 0.8180542, + "learning_rate": 4.1189526451777816e-05, + "loss": 0.82847822, + "num_input_tokens_seen": 376415456, + "router_z_loss_mlp": 0.39624023, + "step": 4542, + "time_per_iteration": 2.797070026397705 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043195, + "balance_loss_mlp": 1.00356984, + "epoch": 0.8739899961523663, + "flos": 576730351872.0, + "grad_norm": 0.0332051158629443, + "language_loss": 0.82107216, + "learning_rate": 4.106579095649649e-05, + "loss": 0.83150411, + "num_input_tokens_seen": 376494880, + "router_z_loss_mlp": 0.39599609, + "step": 4543, + "time_per_iteration": 2.84247088432312 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043126, + "balance_loss_mlp": 1.0034529, + "epoch": 0.8741823778376299, + "flos": 732632952576.0, + "grad_norm": 0.04335551998495939, + "language_loss": 0.76707387, + "learning_rate": 4.094223363527666e-05, + "loss": 0.7775051, + "num_input_tokens_seen": 376571760, + "router_z_loss_mlp": 0.39648438, + "step": 4544, + "time_per_iteration": 2.920760154724121 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042177, + "balance_loss_mlp": 1.00245607, + "epoch": 0.8743747595228935, + "flos": 568222791168.0, + "grad_norm": 0.03891736625399162, + "language_loss": 0.84551966, + "learning_rate": 4.081885453608747e-05, + "loss": 0.85594141, + "num_input_tokens_seen": 376644464, + "router_z_loss_mlp": 0.39697266, + "step": 4545, + "time_per_iteration": 2.758371114730835 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01041918, + "balance_loss_mlp": 1.0021975, + "epoch": 0.8745671412081569, + "flos": 494395986432.0, + "grad_norm": 0.03573114845896075, + "language_loss": 0.82429254, + "learning_rate": 4.0695653706829095e-05, + "loss": 0.83471167, + "num_input_tokens_seen": 376709584, + "router_z_loss_mlp": 0.39697266, + "step": 4546, + "time_per_iteration": 2.593362808227539 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052857, + "balance_loss_mlp": 1.01320839, + "epoch": 0.8747595228934205, + "flos": 525167848704.0, + "grad_norm": 0.03540911918032387, + "language_loss": 0.83888662, + "learning_rate": 4.057263119533233e-05, + "loss": 0.84941518, + "num_input_tokens_seen": 376779472, + "router_z_loss_mlp": 0.39624023, + "step": 4547, + "time_per_iteration": 2.723700761795044 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050964, + "balance_loss_mlp": 1.01117158, + "epoch": 0.8749519045786841, + "flos": 745753957632.0, + "grad_norm": 0.043747965680807695, + "language_loss": 0.80443823, + "learning_rate": 4.044978704935853e-05, + "loss": 0.81494784, + "num_input_tokens_seen": 376863408, + "router_z_loss_mlp": 0.39770508, + "step": 4548, + "time_per_iteration": 3.02632474899292 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105381, + "balance_loss_mlp": 1.01404202, + "epoch": 0.8751442862639477, + "flos": 595384384512.0, + "grad_norm": 0.035676199240782205, + "language_loss": 0.80288255, + "learning_rate": 4.032712131660027e-05, + "loss": 0.81342065, + "num_input_tokens_seen": 376942080, + "router_z_loss_mlp": 0.39746094, + "step": 4549, + "time_per_iteration": 2.870236873626709 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052162, + "balance_loss_mlp": 1.01229811, + "epoch": 0.8753366679492113, + "flos": 497515361280.0, + "grad_norm": 0.03757184698075675, + "language_loss": 0.79065835, + "learning_rate": 4.020463404468055e-05, + "loss": 0.80118001, + "num_input_tokens_seen": 377015696, + "router_z_loss_mlp": 0.3984375, + "step": 4550, + "time_per_iteration": 2.7937049865722656 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047134, + "balance_loss_mlp": 1.0073179, + "epoch": 0.8755290496344748, + "flos": 490850846208.0, + "grad_norm": 0.03757399856308613, + "language_loss": 0.82482672, + "learning_rate": 4.0082325281153074e-05, + "loss": 0.83529806, + "num_input_tokens_seen": 377081424, + "router_z_loss_mlp": 0.39794922, + "step": 4551, + "time_per_iteration": 2.5715842247009277 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045971, + "balance_loss_mlp": 1.00596392, + "epoch": 0.8757214313197383, + "flos": 593072798976.0, + "grad_norm": 0.038733451202642565, + "language_loss": 0.8219583, + "learning_rate": 3.9960195073502345e-05, + "loss": 0.83241796, + "num_input_tokens_seen": 377159360, + "router_z_loss_mlp": 0.39990234, + "step": 4552, + "time_per_iteration": 2.8051164150238037 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043153, + "balance_loss_mlp": 1.00321829, + "epoch": 0.8759138130050019, + "flos": 978400548864.0, + "grad_norm": 0.039917821896877995, + "language_loss": 0.78999895, + "learning_rate": 3.9838243469143555e-05, + "loss": 0.80043048, + "num_input_tokens_seen": 377240704, + "router_z_loss_mlp": 0.39916992, + "step": 4553, + "time_per_iteration": 3.2010762691497803 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042774, + "balance_loss_mlp": 1.00286233, + "epoch": 0.8761061946902655, + "flos": 804206497536.0, + "grad_norm": 0.030968637089522598, + "language_loss": 0.78100884, + "learning_rate": 3.971647051542243e-05, + "loss": 0.79143655, + "num_input_tokens_seen": 377324176, + "router_z_loss_mlp": 0.39892578, + "step": 4554, + "time_per_iteration": 3.0976600646972656 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010449, + "balance_loss_mlp": 1.0050838, + "epoch": 0.8762985763755291, + "flos": 699848903424.0, + "grad_norm": 0.03651037459653682, + "language_loss": 0.75311875, + "learning_rate": 3.95948762596155e-05, + "loss": 0.76356781, + "num_input_tokens_seen": 377403440, + "router_z_loss_mlp": 0.39794922, + "step": 4555, + "time_per_iteration": 2.9630236625671387 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044972, + "balance_loss_mlp": 1.00518036, + "epoch": 0.8764909580607926, + "flos": 630928138752.0, + "grad_norm": 0.0411717836105296, + "language_loss": 0.80529356, + "learning_rate": 3.9473460748929765e-05, + "loss": 0.81574327, + "num_input_tokens_seen": 377483440, + "router_z_loss_mlp": 0.39770508, + "step": 4556, + "time_per_iteration": 2.9351165294647217 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044616, + "balance_loss_mlp": 1.00482428, + "epoch": 0.8766833397460562, + "flos": 482538671616.0, + "grad_norm": 0.03468954168211097, + "language_loss": 0.80526578, + "learning_rate": 3.935222403050304e-05, + "loss": 0.81571198, + "num_input_tokens_seen": 377554688, + "router_z_loss_mlp": 0.39770508, + "step": 4557, + "time_per_iteration": 2.673027276992798 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047126, + "balance_loss_mlp": 1.00738144, + "epoch": 0.8768757214313198, + "flos": 408618547968.0, + "grad_norm": 0.04259075794478707, + "language_loss": 0.78827333, + "learning_rate": 3.923116615140354e-05, + "loss": 0.79874456, + "num_input_tokens_seen": 377617616, + "router_z_loss_mlp": 0.3972168, + "step": 4558, + "time_per_iteration": 2.4583702087402344 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047121, + "balance_loss_mlp": 1.00742388, + "epoch": 0.8770681031165833, + "flos": 583657327104.0, + "grad_norm": 0.03927894736445276, + "language_loss": 0.82466614, + "learning_rate": 3.9110287158630076e-05, + "loss": 0.83513731, + "num_input_tokens_seen": 377685888, + "router_z_loss_mlp": 0.39672852, + "step": 4559, + "time_per_iteration": 2.6960582733154297 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045889, + "balance_loss_mlp": 1.00597811, + "epoch": 0.8772604848018468, + "flos": 509689571328.0, + "grad_norm": 0.052770592517426745, + "language_loss": 0.8140527, + "learning_rate": 3.8989587099111875e-05, + "loss": 0.82451165, + "num_input_tokens_seen": 377755744, + "router_z_loss_mlp": 0.39892578, + "step": 4560, + "time_per_iteration": 2.642340898513794 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046128, + "balance_loss_mlp": 1.00635993, + "epoch": 0.8774528664871104, + "flos": 409716987648.0, + "grad_norm": 0.05406487435943987, + "language_loss": 0.85231709, + "learning_rate": 3.886906601970913e-05, + "loss": 0.86277837, + "num_input_tokens_seen": 377818880, + "router_z_loss_mlp": 0.39746094, + "step": 4561, + "time_per_iteration": 2.5048179626464844 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049445, + "balance_loss_mlp": 1.00965309, + "epoch": 0.877645248172374, + "flos": 501870236160.0, + "grad_norm": 0.032221115056965136, + "language_loss": 0.83807063, + "learning_rate": 3.8748723967212184e-05, + "loss": 0.8485651, + "num_input_tokens_seen": 377893280, + "router_z_loss_mlp": 0.39770508, + "step": 4562, + "time_per_iteration": 2.7065954208374023 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104761, + "balance_loss_mlp": 1.00777078, + "epoch": 0.8778376298576376, + "flos": 634299280128.0, + "grad_norm": 0.05217941639464927, + "language_loss": 0.78634655, + "learning_rate": 3.862856098834189e-05, + "loss": 0.79682273, + "num_input_tokens_seen": 377972912, + "router_z_loss_mlp": 0.39819336, + "step": 4563, + "time_per_iteration": 2.9042325019836426 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046783, + "balance_loss_mlp": 1.00696707, + "epoch": 0.8780300115429012, + "flos": 535115043840.0, + "grad_norm": 0.03350020734303954, + "language_loss": 0.80624408, + "learning_rate": 3.850857712974976e-05, + "loss": 0.8167119, + "num_input_tokens_seen": 378054000, + "router_z_loss_mlp": 0.39794922, + "step": 4564, + "time_per_iteration": 2.8995606899261475 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046172, + "balance_loss_mlp": 1.00638008, + "epoch": 0.8782223932281646, + "flos": 512667995136.0, + "grad_norm": 0.03721004489901225, + "language_loss": 0.77783936, + "learning_rate": 3.838877243801758e-05, + "loss": 0.78830111, + "num_input_tokens_seen": 378120336, + "router_z_loss_mlp": 0.39770508, + "step": 4565, + "time_per_iteration": 2.6816246509552 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045267, + "balance_loss_mlp": 1.0053556, + "epoch": 0.8784147749134282, + "flos": 782246452224.0, + "grad_norm": 0.03741234984768557, + "language_loss": 0.70484185, + "learning_rate": 3.826914695965766e-05, + "loss": 0.71529448, + "num_input_tokens_seen": 378216672, + "router_z_loss_mlp": 0.39892578, + "step": 4566, + "time_per_iteration": 3.1608734130859375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045831, + "balance_loss_mlp": 1.00596738, + "epoch": 0.8786071565986918, + "flos": 562072502784.0, + "grad_norm": 0.04196008602434955, + "language_loss": 0.76287764, + "learning_rate": 3.814970074111279e-05, + "loss": 0.77333593, + "num_input_tokens_seen": 378287536, + "router_z_loss_mlp": 0.3984375, + "step": 4567, + "time_per_iteration": 2.685582160949707 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045936, + "balance_loss_mlp": 1.00612044, + "epoch": 0.8787995382839554, + "flos": 604652102400.0, + "grad_norm": 0.03250790005833066, + "language_loss": 0.7786507, + "learning_rate": 3.8030433828755926e-05, + "loss": 0.78911006, + "num_input_tokens_seen": 378362128, + "router_z_loss_mlp": 0.39794922, + "step": 4568, + "time_per_iteration": 2.825204372406006 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045572, + "balance_loss_mlp": 1.00573194, + "epoch": 0.8789919199692189, + "flos": 561290958336.0, + "grad_norm": 0.030274761831549164, + "language_loss": 0.85748357, + "learning_rate": 3.7911346268890924e-05, + "loss": 0.86793929, + "num_input_tokens_seen": 378435696, + "router_z_loss_mlp": 0.39819336, + "step": 4569, + "time_per_iteration": 2.700979471206665 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044534, + "balance_loss_mlp": 1.00483751, + "epoch": 0.8791843016544825, + "flos": 540153286656.0, + "grad_norm": 0.04159016368425986, + "language_loss": 0.83041501, + "learning_rate": 3.7792438107751405e-05, + "loss": 0.84086037, + "num_input_tokens_seen": 378505664, + "router_z_loss_mlp": 0.39672852, + "step": 4570, + "time_per_iteration": 2.6693015098571777 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045599, + "balance_loss_mlp": 1.00590241, + "epoch": 0.8793766833397461, + "flos": 1010405965824.0, + "grad_norm": 0.03564202822554104, + "language_loss": 0.79841989, + "learning_rate": 3.767370939150167e-05, + "loss": 0.80887592, + "num_input_tokens_seen": 378598016, + "router_z_loss_mlp": 0.39672852, + "step": 4571, + "time_per_iteration": 3.366255044937134 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045423, + "balance_loss_mlp": 1.00577366, + "epoch": 0.8795690650250096, + "flos": 679913683968.0, + "grad_norm": 0.033369767003960105, + "language_loss": 0.8118791, + "learning_rate": 3.755516016623628e-05, + "loss": 0.82233334, + "num_input_tokens_seen": 378676176, + "router_z_loss_mlp": 0.39624023, + "step": 4572, + "time_per_iteration": 2.8579459190368652 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048747, + "balance_loss_mlp": 1.00914609, + "epoch": 0.8797614467102732, + "flos": 454356406272.0, + "grad_norm": 0.03796242692369374, + "language_loss": 0.88904488, + "learning_rate": 3.7436790477980157e-05, + "loss": 0.89953238, + "num_input_tokens_seen": 378737952, + "router_z_loss_mlp": 0.39575195, + "step": 4573, + "time_per_iteration": 2.5628530979156494 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043011, + "balance_loss_mlp": 1.00333774, + "epoch": 0.8799538283955367, + "flos": 551973662976.0, + "grad_norm": 0.034330484826967635, + "language_loss": 0.85011613, + "learning_rate": 3.7318600372688526e-05, + "loss": 0.86054623, + "num_input_tokens_seen": 378806704, + "router_z_loss_mlp": 0.39648438, + "step": 4574, + "time_per_iteration": 2.699843645095825 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043294, + "balance_loss_mlp": 1.00357342, + "epoch": 0.8801462100808003, + "flos": 808860771072.0, + "grad_norm": 0.033758040666438734, + "language_loss": 0.84705424, + "learning_rate": 3.720058989624681e-05, + "loss": 0.8574872, + "num_input_tokens_seen": 378887616, + "router_z_loss_mlp": 0.39697266, + "step": 4575, + "time_per_iteration": 3.105905294418335 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042413, + "balance_loss_mlp": 1.00271654, + "epoch": 0.8803385917660639, + "flos": 770012949504.0, + "grad_norm": 0.03384904792749063, + "language_loss": 0.84867811, + "learning_rate": 3.708275909447079e-05, + "loss": 0.85910225, + "num_input_tokens_seen": 378964656, + "router_z_loss_mlp": 0.39672852, + "step": 4576, + "time_per_iteration": 2.9150800704956055 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042278, + "balance_loss_mlp": 1.00258136, + "epoch": 0.8805309734513275, + "flos": 568420122624.0, + "grad_norm": 0.03302719749229089, + "language_loss": 0.81569564, + "learning_rate": 3.696510801310632e-05, + "loss": 0.82611847, + "num_input_tokens_seen": 379036752, + "router_z_loss_mlp": 0.39672852, + "step": 4577, + "time_per_iteration": 2.7670326232910156 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104256, + "balance_loss_mlp": 1.00286317, + "epoch": 0.880723355136591, + "flos": 680977130496.0, + "grad_norm": 0.03426362906907379, + "language_loss": 0.81833982, + "learning_rate": 3.6847636697829755e-05, + "loss": 0.82876545, + "num_input_tokens_seen": 379106480, + "router_z_loss_mlp": 0.39672852, + "step": 4578, + "time_per_iteration": 2.8875744342803955 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042717, + "balance_loss_mlp": 1.0029968, + "epoch": 0.8809157368218545, + "flos": 566761769472.0, + "grad_norm": 0.03161907542086704, + "language_loss": 0.79385138, + "learning_rate": 3.673034519424734e-05, + "loss": 0.80427855, + "num_input_tokens_seen": 379182544, + "router_z_loss_mlp": 0.39697266, + "step": 4579, + "time_per_iteration": 2.7719390392303467 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043228, + "balance_loss_mlp": 1.00348318, + "epoch": 0.8811081185071181, + "flos": 516427963392.0, + "grad_norm": 0.031290606513753615, + "language_loss": 0.76370418, + "learning_rate": 3.661323354789586e-05, + "loss": 0.77413642, + "num_input_tokens_seen": 379255856, + "router_z_loss_mlp": 0.3972168, + "step": 4580, + "time_per_iteration": 2.720790147781372 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01041771, + "balance_loss_mlp": 1.00212181, + "epoch": 0.8813005001923817, + "flos": 595449513216.0, + "grad_norm": 0.038529988459892694, + "language_loss": 0.81993824, + "learning_rate": 3.649630180424191e-05, + "loss": 0.830356, + "num_input_tokens_seen": 379322704, + "router_z_loss_mlp": 0.39624023, + "step": 4581, + "time_per_iteration": 2.7015504837036133 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042754, + "balance_loss_mlp": 1.00310469, + "epoch": 0.8814928818776453, + "flos": 668186626560.0, + "grad_norm": 0.0360319379044657, + "language_loss": 0.79632461, + "learning_rate": 3.637955000868254e-05, + "loss": 0.8067522, + "num_input_tokens_seen": 379395008, + "router_z_loss_mlp": 0.39624023, + "step": 4582, + "time_per_iteration": 2.83176589012146 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042337, + "balance_loss_mlp": 1.00271213, + "epoch": 0.8816852635629088, + "flos": 610276503552.0, + "grad_norm": 0.03398118072297745, + "language_loss": 0.86405253, + "learning_rate": 3.626297820654467e-05, + "loss": 0.8744759, + "num_input_tokens_seen": 379465824, + "router_z_loss_mlp": 0.39599609, + "step": 4583, + "time_per_iteration": 2.741544485092163 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042297, + "balance_loss_mlp": 1.00260055, + "epoch": 0.8818776452481724, + "flos": 481375103232.0, + "grad_norm": 0.04470719780683464, + "language_loss": 0.82497907, + "learning_rate": 3.614658644308572e-05, + "loss": 0.83540201, + "num_input_tokens_seen": 379534960, + "router_z_loss_mlp": 0.39672852, + "step": 4584, + "time_per_iteration": 2.652020215988159 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042011, + "balance_loss_mlp": 1.00231421, + "epoch": 0.882070026933436, + "flos": 1047034553856.0, + "grad_norm": 0.037631390647256145, + "language_loss": 0.73706174, + "learning_rate": 3.60303747634928e-05, + "loss": 0.74748188, + "num_input_tokens_seen": 379617456, + "router_z_loss_mlp": 0.39672852, + "step": 4585, + "time_per_iteration": 3.3303396701812744 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045991, + "balance_loss_mlp": 1.00622249, + "epoch": 0.8822624086186995, + "flos": 475435752192.0, + "grad_norm": 0.034149923660639965, + "language_loss": 0.80189967, + "learning_rate": 3.591434321288345e-05, + "loss": 0.81235957, + "num_input_tokens_seen": 379687792, + "router_z_loss_mlp": 0.39746094, + "step": 4586, + "time_per_iteration": 2.7292559146881104 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045973, + "balance_loss_mlp": 1.00634825, + "epoch": 0.882454790303963, + "flos": 655222123776.0, + "grad_norm": 0.04063008203109671, + "language_loss": 0.82156307, + "learning_rate": 3.579849183630485e-05, + "loss": 0.83202279, + "num_input_tokens_seen": 379761120, + "router_z_loss_mlp": 0.39599609, + "step": 4587, + "time_per_iteration": 2.8225555419921875 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045855, + "balance_loss_mlp": 1.00613487, + "epoch": 0.8826471719892266, + "flos": 471304453632.0, + "grad_norm": 0.03549940663075914, + "language_loss": 0.78996181, + "learning_rate": 3.568282067873468e-05, + "loss": 0.80042034, + "num_input_tokens_seen": 379829008, + "router_z_loss_mlp": 0.39697266, + "step": 4588, + "time_per_iteration": 2.6043946743011475 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046048, + "balance_loss_mlp": 1.00637496, + "epoch": 0.8828395536744902, + "flos": 469767609600.0, + "grad_norm": 0.035767632266805204, + "language_loss": 0.84442842, + "learning_rate": 3.556732978508048e-05, + "loss": 0.8548888, + "num_input_tokens_seen": 379899584, + "router_z_loss_mlp": 0.39648438, + "step": 4589, + "time_per_iteration": 2.695120334625244 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045935, + "balance_loss_mlp": 1.00631011, + "epoch": 0.8830319353597538, + "flos": 722718805248.0, + "grad_norm": 0.03454304562764615, + "language_loss": 0.81774867, + "learning_rate": 3.545201920017971e-05, + "loss": 0.82820797, + "num_input_tokens_seen": 379979440, + "router_z_loss_mlp": 0.39599609, + "step": 4590, + "time_per_iteration": 2.9288313388824463 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043285, + "balance_loss_mlp": 1.00356412, + "epoch": 0.8832243170450174, + "flos": 444192437760.0, + "grad_norm": 0.03744601205071845, + "language_loss": 0.82025963, + "learning_rate": 3.5336888968799996e-05, + "loss": 0.83069241, + "num_input_tokens_seen": 380046944, + "router_z_loss_mlp": 0.39697266, + "step": 4591, + "time_per_iteration": 2.6343138217926025 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042563, + "balance_loss_mlp": 1.00284278, + "epoch": 0.8834166987302808, + "flos": 567747448320.0, + "grad_norm": 0.039220305101438216, + "language_loss": 0.82777518, + "learning_rate": 3.5221939135638756e-05, + "loss": 0.83820081, + "num_input_tokens_seen": 380118048, + "router_z_loss_mlp": 0.39697266, + "step": 4592, + "time_per_iteration": 2.7458250522613525 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042574, + "balance_loss_mlp": 1.00282991, + "epoch": 0.8836090804155444, + "flos": 610498134528.0, + "grad_norm": 0.03429970098026536, + "language_loss": 0.82727444, + "learning_rate": 3.510716974532352e-05, + "loss": 0.83770013, + "num_input_tokens_seen": 380192416, + "router_z_loss_mlp": 0.3972168, + "step": 4593, + "time_per_iteration": 2.82002592086792 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042679, + "balance_loss_mlp": 1.00288653, + "epoch": 0.883801462100808, + "flos": 558117148416.0, + "grad_norm": 0.03473522225274468, + "language_loss": 0.80918574, + "learning_rate": 3.4992580842411745e-05, + "loss": 0.81961256, + "num_input_tokens_seen": 380264432, + "router_z_loss_mlp": 0.39770508, + "step": 4594, + "time_per_iteration": 2.668245315551758 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042836, + "balance_loss_mlp": 1.00302017, + "epoch": 0.8839938437860716, + "flos": 517200759552.0, + "grad_norm": 0.041162800652707915, + "language_loss": 0.77860659, + "learning_rate": 3.487817247139064e-05, + "loss": 0.78903496, + "num_input_tokens_seen": 380334192, + "router_z_loss_mlp": 0.39794922, + "step": 4595, + "time_per_iteration": 2.6603221893310547 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104516, + "balance_loss_mlp": 1.00541556, + "epoch": 0.8841862254713351, + "flos": 714940299264.0, + "grad_norm": 0.038114639972197946, + "language_loss": 0.79199928, + "learning_rate": 3.47639446766777e-05, + "loss": 0.8024509, + "num_input_tokens_seen": 380407504, + "router_z_loss_mlp": 0.3972168, + "step": 4596, + "time_per_iteration": 2.84773588180542 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045345, + "balance_loss_mlp": 1.00562418, + "epoch": 0.8843786071565987, + "flos": 835379825664.0, + "grad_norm": 0.03386878842029177, + "language_loss": 0.83102214, + "learning_rate": 3.4649897502620095e-05, + "loss": 0.84147561, + "num_input_tokens_seen": 380486272, + "router_z_loss_mlp": 0.39697266, + "step": 4597, + "time_per_iteration": 3.067197322845459 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048234, + "balance_loss_mlp": 1.00858498, + "epoch": 0.8845709888418622, + "flos": 658179160320.0, + "grad_norm": 0.030983419501464857, + "language_loss": 0.83426988, + "learning_rate": 3.453603099349462e-05, + "loss": 0.84475219, + "num_input_tokens_seen": 380568480, + "router_z_loss_mlp": 0.39624023, + "step": 4598, + "time_per_iteration": 2.8990516662597656 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043971, + "balance_loss_mlp": 1.00432217, + "epoch": 0.8847633705271258, + "flos": 524484480768.0, + "grad_norm": 0.03200381379307614, + "language_loss": 0.81129134, + "learning_rate": 3.442234519350823e-05, + "loss": 0.82173103, + "num_input_tokens_seen": 380643088, + "router_z_loss_mlp": 0.39624023, + "step": 4599, + "time_per_iteration": 2.739694118499756 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043927, + "balance_loss_mlp": 1.00425434, + "epoch": 0.8849557522123894, + "flos": 549637777920.0, + "grad_norm": 0.03709178353655612, + "language_loss": 0.84963202, + "learning_rate": 3.430884014679786e-05, + "loss": 0.86007124, + "num_input_tokens_seen": 380714512, + "router_z_loss_mlp": 0.39648438, + "step": 4600, + "time_per_iteration": 2.6870527267456055 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044969, + "balance_loss_mlp": 1.00534427, + "epoch": 0.8851481338976529, + "flos": 623584146432.0, + "grad_norm": 0.03445259220220357, + "language_loss": 0.83782369, + "learning_rate": 3.4195515897429974e-05, + "loss": 0.84827334, + "num_input_tokens_seen": 380789168, + "router_z_loss_mlp": 0.39599609, + "step": 4601, + "time_per_iteration": 2.778261423110962 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043724, + "balance_loss_mlp": 1.00400329, + "epoch": 0.8853405155829165, + "flos": 445308374016.0, + "grad_norm": 0.0338712411878003, + "language_loss": 0.81439084, + "learning_rate": 3.408237248940088e-05, + "loss": 0.82482803, + "num_input_tokens_seen": 380856992, + "router_z_loss_mlp": 0.39697266, + "step": 4602, + "time_per_iteration": 2.5989644527435303 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043327, + "balance_loss_mlp": 1.00365448, + "epoch": 0.8855328972681801, + "flos": 731749340928.0, + "grad_norm": 0.035815900220076725, + "language_loss": 0.78796673, + "learning_rate": 3.396940996663683e-05, + "loss": 0.79839998, + "num_input_tokens_seen": 380930480, + "router_z_loss_mlp": 0.39648438, + "step": 4603, + "time_per_iteration": 2.893944025039673 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042283, + "balance_loss_mlp": 1.00261056, + "epoch": 0.8857252789534437, + "flos": 488356513536.0, + "grad_norm": 0.035459193509833585, + "language_loss": 0.7936362, + "learning_rate": 3.385662837299375e-05, + "loss": 0.80405909, + "num_input_tokens_seen": 380994192, + "router_z_loss_mlp": 0.39648438, + "step": 4604, + "time_per_iteration": 2.5552213191986084 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104218, + "balance_loss_mlp": 1.00250709, + "epoch": 0.8859176606387072, + "flos": 509622497280.0, + "grad_norm": 0.043634730763989146, + "language_loss": 0.82432818, + "learning_rate": 3.374402775225727e-05, + "loss": 0.83474994, + "num_input_tokens_seen": 381066848, + "router_z_loss_mlp": 0.39648438, + "step": 4605, + "time_per_iteration": 2.6891160011291504 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01041456, + "balance_loss_mlp": 1.00173521, + "epoch": 0.8861100423239707, + "flos": 517665408768.0, + "grad_norm": 0.034049805393931584, + "language_loss": 0.86205089, + "learning_rate": 3.3631608148142925e-05, + "loss": 0.87246549, + "num_input_tokens_seen": 381138816, + "router_z_loss_mlp": 0.39697266, + "step": 4606, + "time_per_iteration": 2.65995192527771 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01041919, + "balance_loss_mlp": 1.00222278, + "epoch": 0.8863024240092343, + "flos": 628110107904.0, + "grad_norm": 0.03497175764411169, + "language_loss": 0.79680067, + "learning_rate": 3.3519369604295746e-05, + "loss": 0.80721992, + "num_input_tokens_seen": 381208448, + "router_z_loss_mlp": 0.39672852, + "step": 4607, + "time_per_iteration": 2.763823986053467 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01041821, + "balance_loss_mlp": 1.00214839, + "epoch": 0.8864948056944979, + "flos": 768298215936.0, + "grad_norm": 0.03212118258381081, + "language_loss": 0.83579981, + "learning_rate": 3.340731216429083e-05, + "loss": 0.84621805, + "num_input_tokens_seen": 381289712, + "router_z_loss_mlp": 0.39648438, + "step": 4608, + "time_per_iteration": 2.987773895263672 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052631, + "balance_loss_mlp": 1.01381683, + "epoch": 0.8866871873797615, + "flos": 1505668055808.0, + "grad_norm": 0.013420842661803037, + "language_loss": 0.78830957, + "learning_rate": 3.329543587163253e-05, + "loss": 0.79883587, + "num_input_tokens_seen": 381520848, + "router_z_loss_mlp": 0.38769531, + "step": 4609, + "time_per_iteration": 4.8158485889434814 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051288, + "balance_loss_mlp": 1.01142478, + "epoch": 0.886879569065025, + "flos": 812928886272.0, + "grad_norm": 0.035689455717370554, + "language_loss": 0.8246606, + "learning_rate": 3.3183740769755e-05, + "loss": 0.83517349, + "num_input_tokens_seen": 381603008, + "router_z_loss_mlp": 0.3984375, + "step": 4610, + "time_per_iteration": 3.1036856174468994 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052158, + "balance_loss_mlp": 1.01334381, + "epoch": 0.8870719507502886, + "flos": 1586226449664.0, + "grad_norm": 0.009608568330461998, + "language_loss": 0.7691083, + "learning_rate": 3.307222690202238e-05, + "loss": 0.77962995, + "num_input_tokens_seen": 381844336, + "router_z_loss_mlp": 0.38769531, + "step": 4611, + "time_per_iteration": 4.96184229850769 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104835, + "balance_loss_mlp": 1.00853372, + "epoch": 0.8872643324355521, + "flos": 635165395200.0, + "grad_norm": 0.035989932069672784, + "language_loss": 0.75603622, + "learning_rate": 3.296089431172811e-05, + "loss": 0.76651973, + "num_input_tokens_seen": 381918576, + "router_z_loss_mlp": 0.39794922, + "step": 4612, + "time_per_iteration": 2.736562967300415 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104857, + "balance_loss_mlp": 1.00877833, + "epoch": 0.8874567141208157, + "flos": 536784090624.0, + "grad_norm": 0.038218282698344784, + "language_loss": 0.83649206, + "learning_rate": 3.284974304209532e-05, + "loss": 0.84697771, + "num_input_tokens_seen": 381987296, + "router_z_loss_mlp": 0.39770508, + "step": 4613, + "time_per_iteration": 2.6362390518188477 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048764, + "balance_loss_mlp": 1.00890064, + "epoch": 0.8876490958060793, + "flos": 1568719478784.0, + "grad_norm": 0.032946530708496874, + "language_loss": 0.79828751, + "learning_rate": 3.27387731362766e-05, + "loss": 0.80877519, + "num_input_tokens_seen": 382091744, + "router_z_loss_mlp": 0.3984375, + "step": 4614, + "time_per_iteration": 3.8524417877197266 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048994, + "balance_loss_mlp": 1.00917864, + "epoch": 0.8878414774913428, + "flos": 637798733568.0, + "grad_norm": 0.034427393755430906, + "language_loss": 0.85503703, + "learning_rate": 3.2627984637354444e-05, + "loss": 0.86552697, + "num_input_tokens_seen": 382169600, + "router_z_loss_mlp": 0.39794922, + "step": 4615, + "time_per_iteration": 2.8285470008850098 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048838, + "balance_loss_mlp": 1.0090456, + "epoch": 0.8880338591766064, + "flos": 497422042368.0, + "grad_norm": 0.04825958458994101, + "language_loss": 0.81953943, + "learning_rate": 3.251737758834084e-05, + "loss": 0.83002782, + "num_input_tokens_seen": 382238336, + "router_z_loss_mlp": 0.39770508, + "step": 4616, + "time_per_iteration": 2.6075775623321533 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104902, + "balance_loss_mlp": 1.00918043, + "epoch": 0.88822624086187, + "flos": 543913254912.0, + "grad_norm": 0.03727530825330057, + "language_loss": 0.80842733, + "learning_rate": 3.2406952032177086e-05, + "loss": 0.81891757, + "num_input_tokens_seen": 382308560, + "router_z_loss_mlp": 0.39819336, + "step": 4617, + "time_per_iteration": 2.636955976486206 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048789, + "balance_loss_mlp": 1.00897348, + "epoch": 0.8884186225471336, + "flos": 552876716544.0, + "grad_norm": 0.04548062549247532, + "language_loss": 0.84271151, + "learning_rate": 3.229670801173418e-05, + "loss": 0.85319942, + "num_input_tokens_seen": 382377504, + "router_z_loss_mlp": 0.39794922, + "step": 4618, + "time_per_iteration": 2.6750118732452393 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050083, + "balance_loss_mlp": 1.01126862, + "epoch": 0.888611004232397, + "flos": 1568662108416.0, + "grad_norm": 0.005691053324610859, + "language_loss": 0.78512192, + "learning_rate": 3.218664556981288e-05, + "loss": 0.79562283, + "num_input_tokens_seen": 382615728, + "router_z_loss_mlp": 0.38769531, + "step": 4619, + "time_per_iteration": 5.008893013000488 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049586, + "balance_loss_mlp": 1.0098182, + "epoch": 0.8888033859176606, + "flos": 768437221632.0, + "grad_norm": 0.03094273540620185, + "language_loss": 0.82804537, + "learning_rate": 3.207676474914301e-05, + "loss": 0.83854127, + "num_input_tokens_seen": 382695552, + "router_z_loss_mlp": 0.39746094, + "step": 4620, + "time_per_iteration": 3.0561673641204834 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010472, + "balance_loss_mlp": 1.00743198, + "epoch": 0.8889957676029242, + "flos": 935649862656.0, + "grad_norm": 0.04954559323619426, + "language_loss": 0.84685308, + "learning_rate": 3.1967065592384105e-05, + "loss": 0.85732502, + "num_input_tokens_seen": 382775824, + "router_z_loss_mlp": 0.39746094, + "step": 4621, + "time_per_iteration": 3.136061906814575 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046932, + "balance_loss_mlp": 1.00704467, + "epoch": 0.8891881492881878, + "flos": 590793294336.0, + "grad_norm": 0.03559696116277189, + "language_loss": 0.82163578, + "learning_rate": 3.1857548142125104e-05, + "loss": 0.8321051, + "num_input_tokens_seen": 382854464, + "router_z_loss_mlp": 0.39868164, + "step": 4622, + "time_per_iteration": 2.76167893409729 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046684, + "balance_loss_mlp": 1.00677288, + "epoch": 0.8893805309734514, + "flos": 541844687616.0, + "grad_norm": 0.03943600500029392, + "language_loss": 0.82868516, + "learning_rate": 3.174821244088466e-05, + "loss": 0.83915204, + "num_input_tokens_seen": 382925088, + "router_z_loss_mlp": 0.39892578, + "step": 4623, + "time_per_iteration": 2.6835789680480957 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046671, + "balance_loss_mlp": 1.00680768, + "epoch": 0.8895729126587149, + "flos": 561169449216.0, + "grad_norm": 0.037543227247628215, + "language_loss": 0.82209378, + "learning_rate": 3.163905853111054e-05, + "loss": 0.83256048, + "num_input_tokens_seen": 382998640, + "router_z_loss_mlp": 0.3984375, + "step": 4624, + "time_per_iteration": 2.6866161823272705 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048683, + "balance_loss_mlp": 1.00870013, + "epoch": 0.8897652943439784, + "flos": 611281624320.0, + "grad_norm": 0.03433233415002547, + "language_loss": 0.81767857, + "learning_rate": 3.153008645517996e-05, + "loss": 0.82816535, + "num_input_tokens_seen": 383076000, + "router_z_loss_mlp": 0.3996582, + "step": 4625, + "time_per_iteration": 2.724318027496338 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048524, + "balance_loss_mlp": 1.00863683, + "epoch": 0.889957676029242, + "flos": 919425033984.0, + "grad_norm": 0.04012941431513975, + "language_loss": 0.77691996, + "learning_rate": 3.142129625539969e-05, + "loss": 0.78740519, + "num_input_tokens_seen": 383166640, + "router_z_loss_mlp": 0.39868164, + "step": 4626, + "time_per_iteration": 3.186610698699951 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104633, + "balance_loss_mlp": 1.00651431, + "epoch": 0.8901500577145056, + "flos": 489687277824.0, + "grad_norm": 0.03850246921489707, + "language_loss": 0.81163925, + "learning_rate": 3.131268797400588e-05, + "loss": 0.82210255, + "num_input_tokens_seen": 383232928, + "router_z_loss_mlp": 0.39794922, + "step": 4627, + "time_per_iteration": 2.555154800415039 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046203, + "balance_loss_mlp": 1.0062921, + "epoch": 0.8903424393997691, + "flos": 734914402560.0, + "grad_norm": 0.03734341343229868, + "language_loss": 0.81121147, + "learning_rate": 3.120426165316398e-05, + "loss": 0.82167351, + "num_input_tokens_seen": 383314352, + "router_z_loss_mlp": 0.39892578, + "step": 4628, + "time_per_iteration": 2.997708797454834 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046433, + "balance_loss_mlp": 1.00654614, + "epoch": 0.8905348210850327, + "flos": 520884905472.0, + "grad_norm": 0.0340998125038295, + "language_loss": 0.82300949, + "learning_rate": 3.109601733496881e-05, + "loss": 0.83347386, + "num_input_tokens_seen": 383384848, + "router_z_loss_mlp": 0.39868164, + "step": 4629, + "time_per_iteration": 2.6674559116363525 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046772, + "balance_loss_mlp": 1.00690854, + "epoch": 0.8907272027702963, + "flos": 580199669760.0, + "grad_norm": 0.03316770819427237, + "language_loss": 0.80315387, + "learning_rate": 3.098795506144458e-05, + "loss": 0.81362164, + "num_input_tokens_seen": 383463360, + "router_z_loss_mlp": 0.3984375, + "step": 4630, + "time_per_iteration": 2.819411039352417 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047037, + "balance_loss_mlp": 1.0072211, + "epoch": 0.8909195844555599, + "flos": 895115497728.0, + "grad_norm": 0.035411813999275225, + "language_loss": 0.79863322, + "learning_rate": 3.088007487454475e-05, + "loss": 0.80910361, + "num_input_tokens_seen": 383542080, + "router_z_loss_mlp": 0.39794922, + "step": 4631, + "time_per_iteration": 3.109464406967163 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047287, + "balance_loss_mlp": 1.00747073, + "epoch": 0.8911119661408234, + "flos": 550949100288.0, + "grad_norm": 0.03590303860632242, + "language_loss": 0.84888053, + "learning_rate": 3.077237681615208e-05, + "loss": 0.85935342, + "num_input_tokens_seen": 383613056, + "router_z_loss_mlp": 0.39794922, + "step": 4632, + "time_per_iteration": 2.631802558898926 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104734, + "balance_loss_mlp": 1.00747597, + "epoch": 0.8913043478260869, + "flos": 482165395968.0, + "grad_norm": 0.04494839545328405, + "language_loss": 0.84571874, + "learning_rate": 3.066486092807874e-05, + "loss": 0.85619211, + "num_input_tokens_seen": 383683280, + "router_z_loss_mlp": 0.3984375, + "step": 4633, + "time_per_iteration": 2.6622323989868164 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049593, + "balance_loss_mlp": 1.0098244, + "epoch": 0.8914967295113505, + "flos": 485645407488.0, + "grad_norm": 0.03485029378747491, + "language_loss": 0.85605252, + "learning_rate": 3.055752725206601e-05, + "loss": 0.86654842, + "num_input_tokens_seen": 383754624, + "router_z_loss_mlp": 0.39746094, + "step": 4634, + "time_per_iteration": 2.649674892425537 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049227, + "balance_loss_mlp": 1.00950658, + "epoch": 0.8916891111966141, + "flos": 446593451520.0, + "grad_norm": 0.03644338611466098, + "language_loss": 0.81789589, + "learning_rate": 3.0450375829784714e-05, + "loss": 0.82838821, + "num_input_tokens_seen": 383821984, + "router_z_loss_mlp": 0.39697266, + "step": 4635, + "time_per_iteration": 2.5484728813171387 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048488, + "balance_loss_mlp": 1.00864804, + "epoch": 0.8918814928818777, + "flos": 565079116800.0, + "grad_norm": 0.03346108733120089, + "language_loss": 0.788185, + "learning_rate": 3.034340670283453e-05, + "loss": 0.79866982, + "num_input_tokens_seen": 383890880, + "router_z_loss_mlp": 0.39819336, + "step": 4636, + "time_per_iteration": 2.728586435317993 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048649, + "balance_loss_mlp": 1.00878584, + "epoch": 0.8920738745671412, + "flos": 577029750528.0, + "grad_norm": 0.03483769902924038, + "language_loss": 0.81798345, + "learning_rate": 3.0236619912744513e-05, + "loss": 0.82846999, + "num_input_tokens_seen": 383962480, + "router_z_loss_mlp": 0.3984375, + "step": 4637, + "time_per_iteration": 2.662823438644409 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104713, + "balance_loss_mlp": 1.00724292, + "epoch": 0.8922662562524047, + "flos": 621315335424.0, + "grad_norm": 0.0330933366062548, + "language_loss": 0.84552616, + "learning_rate": 3.0130015500973163e-05, + "loss": 0.85599744, + "num_input_tokens_seen": 384033616, + "router_z_loss_mlp": 0.39868164, + "step": 4638, + "time_per_iteration": 2.7474730014801025 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044649, + "balance_loss_mlp": 1.00476134, + "epoch": 0.8924586379376683, + "flos": 584808256512.0, + "grad_norm": 0.03843989347022111, + "language_loss": 0.79864812, + "learning_rate": 3.0023593508907877e-05, + "loss": 0.80909455, + "num_input_tokens_seen": 384108848, + "router_z_loss_mlp": 0.39868164, + "step": 4639, + "time_per_iteration": 2.7433924674987793 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044951, + "balance_loss_mlp": 1.00508726, + "epoch": 0.8926510196229319, + "flos": 526201159680.0, + "grad_norm": 0.04548508479610925, + "language_loss": 0.82137775, + "learning_rate": 2.991735397786538e-05, + "loss": 0.83182728, + "num_input_tokens_seen": 384185728, + "router_z_loss_mlp": 0.3984375, + "step": 4640, + "time_per_iteration": 2.745567798614502 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045023, + "balance_loss_mlp": 1.00511181, + "epoch": 0.8928434013081955, + "flos": 487640097792.0, + "grad_norm": 0.04163390678432403, + "language_loss": 0.81369799, + "learning_rate": 2.981129694909146e-05, + "loss": 0.8241483, + "num_input_tokens_seen": 384251552, + "router_z_loss_mlp": 0.39892578, + "step": 4641, + "time_per_iteration": 2.5342392921447754 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049755, + "balance_loss_mlp": 1.01094055, + "epoch": 0.893035782993459, + "flos": 1451201984256.0, + "grad_norm": 0.009146057104072083, + "language_loss": 0.80330861, + "learning_rate": 2.970542246376118e-05, + "loss": 0.81380612, + "num_input_tokens_seen": 384472176, + "router_z_loss_mlp": 0.38769531, + "step": 4642, + "time_per_iteration": 4.708850860595703 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045829, + "balance_loss_mlp": 1.00601351, + "epoch": 0.8932281646787226, + "flos": 612445192704.0, + "grad_norm": 0.03907380581076919, + "language_loss": 0.81196648, + "learning_rate": 2.95997305629786e-05, + "loss": 0.82242477, + "num_input_tokens_seen": 384544224, + "router_z_loss_mlp": 0.39794922, + "step": 4643, + "time_per_iteration": 2.782482385635376 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045296, + "balance_loss_mlp": 1.00548053, + "epoch": 0.8934205463639862, + "flos": 566828843520.0, + "grad_norm": 0.03557987256456931, + "language_loss": 0.84996665, + "learning_rate": 2.9494221287776957e-05, + "loss": 0.86041963, + "num_input_tokens_seen": 384611728, + "router_z_loss_mlp": 0.39794922, + "step": 4644, + "time_per_iteration": 2.632826089859009 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042235, + "balance_loss_mlp": 1.00230014, + "epoch": 0.8936129280492497, + "flos": 489435511296.0, + "grad_norm": 0.041094605368718444, + "language_loss": 0.78782856, + "learning_rate": 2.9388894679118484e-05, + "loss": 0.79825091, + "num_input_tokens_seen": 384678048, + "router_z_loss_mlp": 0.39916992, + "step": 4645, + "time_per_iteration": 2.564196825027466 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042291, + "balance_loss_mlp": 1.00242722, + "epoch": 0.8938053097345132, + "flos": 888075761664.0, + "grad_norm": 0.03418718196027921, + "language_loss": 0.8146354, + "learning_rate": 2.9283750777894912e-05, + "loss": 0.82505834, + "num_input_tokens_seen": 384766768, + "router_z_loss_mlp": 0.3984375, + "step": 4646, + "time_per_iteration": 3.25028920173645 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042378, + "balance_loss_mlp": 1.00253797, + "epoch": 0.8939976914197768, + "flos": 594433698816.0, + "grad_norm": 0.036470108367040274, + "language_loss": 0.84610659, + "learning_rate": 2.9178789624926427e-05, + "loss": 0.85653043, + "num_input_tokens_seen": 384842352, + "router_z_loss_mlp": 0.39819336, + "step": 4647, + "time_per_iteration": 2.7234268188476562 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046732, + "balance_loss_mlp": 1.00689232, + "epoch": 0.8941900731050404, + "flos": 524310481920.0, + "grad_norm": 0.04108443023943927, + "language_loss": 0.81452197, + "learning_rate": 2.9074011260962706e-05, + "loss": 0.82498932, + "num_input_tokens_seen": 384912048, + "router_z_loss_mlp": 0.39819336, + "step": 4648, + "time_per_iteration": 2.629329204559326 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046685, + "balance_loss_mlp": 1.00691688, + "epoch": 0.894382454790304, + "flos": 801928938240.0, + "grad_norm": 0.03703495030211325, + "language_loss": 0.81576788, + "learning_rate": 2.8969415726682158e-05, + "loss": 0.82623482, + "num_input_tokens_seen": 384986560, + "router_z_loss_mlp": 0.39746094, + "step": 4649, + "time_per_iteration": 2.995352268218994 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046385, + "balance_loss_mlp": 1.00668859, + "epoch": 0.8945748364755676, + "flos": 480061835520.0, + "grad_norm": 0.03242427770731825, + "language_loss": 0.85382026, + "learning_rate": 2.8865003062692517e-05, + "loss": 0.86428416, + "num_input_tokens_seen": 385057376, + "router_z_loss_mlp": 0.39672852, + "step": 4650, + "time_per_iteration": 2.583432674407959 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042358, + "balance_loss_mlp": 1.00242329, + "epoch": 0.894767218160831, + "flos": 509854821888.0, + "grad_norm": 0.037913827599498295, + "language_loss": 0.83491743, + "learning_rate": 2.876077330953042e-05, + "loss": 0.84534097, + "num_input_tokens_seen": 385130880, + "router_z_loss_mlp": 0.39916992, + "step": 4651, + "time_per_iteration": 2.6782608032226562 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042301, + "balance_loss_mlp": 1.00243759, + "epoch": 0.8949595998460946, + "flos": 687064235520.0, + "grad_norm": 0.036736331203873075, + "language_loss": 0.82394856, + "learning_rate": 2.8656726507661378e-05, + "loss": 0.83437157, + "num_input_tokens_seen": 385205808, + "router_z_loss_mlp": 0.3984375, + "step": 4652, + "time_per_iteration": 2.8787331581115723 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042316, + "balance_loss_mlp": 1.00242865, + "epoch": 0.8951519815313582, + "flos": 801295147776.0, + "grad_norm": 0.038252435154000265, + "language_loss": 0.7746408, + "learning_rate": 2.855286269747981e-05, + "loss": 0.78506398, + "num_input_tokens_seen": 385283616, + "router_z_loss_mlp": 0.39868164, + "step": 4653, + "time_per_iteration": 3.003697156906128 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042051, + "balance_loss_mlp": 1.00218761, + "epoch": 0.8953443632166218, + "flos": 667936805376.0, + "grad_norm": 0.038098195849768056, + "language_loss": 0.86601067, + "learning_rate": 2.8449181919309398e-05, + "loss": 0.87643117, + "num_input_tokens_seen": 385357488, + "router_z_loss_mlp": 0.3984375, + "step": 4654, + "time_per_iteration": 2.7735435962677 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043081, + "balance_loss_mlp": 1.00328851, + "epoch": 0.8955367449018854, + "flos": 646211030016.0, + "grad_norm": 0.03546786074364338, + "language_loss": 0.8345741, + "learning_rate": 2.8345684213402556e-05, + "loss": 0.84500492, + "num_input_tokens_seen": 385431280, + "router_z_loss_mlp": 0.39770508, + "step": 4655, + "time_per_iteration": 2.831127166748047 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104265, + "balance_loss_mlp": 1.00278592, + "epoch": 0.8957291265871489, + "flos": 810163345152.0, + "grad_norm": 0.0374122133806325, + "language_loss": 0.77918243, + "learning_rate": 2.8242369619940644e-05, + "loss": 0.78960896, + "num_input_tokens_seen": 385509840, + "router_z_loss_mlp": 0.3984375, + "step": 4656, + "time_per_iteration": 3.0841567516326904 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043156, + "balance_loss_mlp": 1.00331628, + "epoch": 0.8959215082724125, + "flos": 519964355328.0, + "grad_norm": 0.0357969961625874, + "language_loss": 0.77545249, + "learning_rate": 2.813923817903391e-05, + "loss": 0.78588402, + "num_input_tokens_seen": 385580384, + "router_z_loss_mlp": 0.39819336, + "step": 4657, + "time_per_iteration": 2.6084063053131104 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043013, + "balance_loss_mlp": 1.00317335, + "epoch": 0.896113889957676, + "flos": 477912588288.0, + "grad_norm": 0.03521425072666777, + "language_loss": 0.77677613, + "learning_rate": 2.8036289930721603e-05, + "loss": 0.78720629, + "num_input_tokens_seen": 385649184, + "router_z_loss_mlp": 0.39819336, + "step": 4658, + "time_per_iteration": 2.5991733074188232 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043145, + "balance_loss_mlp": 1.00330484, + "epoch": 0.8963062716429396, + "flos": 519174062592.0, + "grad_norm": 0.03519101773550154, + "language_loss": 0.83259702, + "learning_rate": 2.7933524914971697e-05, + "loss": 0.84302849, + "num_input_tokens_seen": 385717072, + "router_z_loss_mlp": 0.39819336, + "step": 4659, + "time_per_iteration": 2.615996837615967 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042122, + "balance_loss_mlp": 1.00221038, + "epoch": 0.8964986533282031, + "flos": 509502933504.0, + "grad_norm": 0.038458776381427445, + "language_loss": 0.82031912, + "learning_rate": 2.7830943171681113e-05, + "loss": 0.83074033, + "num_input_tokens_seen": 385788880, + "router_z_loss_mlp": 0.39892578, + "step": 4660, + "time_per_iteration": 2.6992716789245605 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042516, + "balance_loss_mlp": 1.00260508, + "epoch": 0.8966910350134667, + "flos": 537109734144.0, + "grad_norm": 0.043189430842781595, + "language_loss": 0.81613052, + "learning_rate": 2.77285447406756e-05, + "loss": 0.82655567, + "num_input_tokens_seen": 385854240, + "router_z_loss_mlp": 0.39892578, + "step": 4661, + "time_per_iteration": 2.6292388439178467 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042305, + "balance_loss_mlp": 1.00244117, + "epoch": 0.8968834166987303, + "flos": 724498667520.0, + "grad_norm": 0.03837822461358079, + "language_loss": 0.84645671, + "learning_rate": 2.7626329661709914e-05, + "loss": 0.85687977, + "num_input_tokens_seen": 385926080, + "router_z_loss_mlp": 0.3984375, + "step": 4662, + "time_per_iteration": 2.8729970455169678 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042136, + "balance_loss_mlp": 1.00231993, + "epoch": 0.8970757983839939, + "flos": 682948488192.0, + "grad_norm": 0.03094905267903294, + "language_loss": 0.84279275, + "learning_rate": 2.7524297974467372e-05, + "loss": 0.85321409, + "num_input_tokens_seen": 386005696, + "router_z_loss_mlp": 0.39794922, + "step": 4663, + "time_per_iteration": 2.906511068344116 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104132, + "balance_loss_mlp": 1.00143254, + "epoch": 0.8972681800692575, + "flos": 614157980928.0, + "grad_norm": 0.04620542900915945, + "language_loss": 0.75984728, + "learning_rate": 2.742244971856006e-05, + "loss": 0.77026045, + "num_input_tokens_seen": 386073248, + "router_z_loss_mlp": 0.39868164, + "step": 4664, + "time_per_iteration": 2.7156736850738525 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01040999, + "balance_loss_mlp": 1.00118363, + "epoch": 0.8974605617545209, + "flos": 573500161536.0, + "grad_norm": 0.03609164505863249, + "language_loss": 0.83621204, + "learning_rate": 2.732078493352913e-05, + "loss": 0.84662199, + "num_input_tokens_seen": 386148528, + "router_z_loss_mlp": 0.39794922, + "step": 4665, + "time_per_iteration": 2.752067804336548 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01041893, + "balance_loss_mlp": 1.0020771, + "epoch": 0.8976529434397845, + "flos": 521508002304.0, + "grad_norm": 0.0665418568893367, + "language_loss": 0.87346292, + "learning_rate": 2.721930365884434e-05, + "loss": 0.88388181, + "num_input_tokens_seen": 386218528, + "router_z_loss_mlp": 0.39794922, + "step": 4666, + "time_per_iteration": 2.628237247467041 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01041533, + "balance_loss_mlp": 1.00166953, + "epoch": 0.8978453251250481, + "flos": 472283329536.0, + "grad_norm": 0.0312359439189668, + "language_loss": 0.83060151, + "learning_rate": 2.7118005933904176e-05, + "loss": 0.84101683, + "num_input_tokens_seen": 386284704, + "router_z_loss_mlp": 0.3984375, + "step": 4667, + "time_per_iteration": 2.632169008255005 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01041569, + "balance_loss_mlp": 1.00170541, + "epoch": 0.8980377068103117, + "flos": 592822977792.0, + "grad_norm": 0.030629591551058093, + "language_loss": 0.82659423, + "learning_rate": 2.7016891798035904e-05, + "loss": 0.83700991, + "num_input_tokens_seen": 386356128, + "router_z_loss_mlp": 0.3984375, + "step": 4668, + "time_per_iteration": 2.767726421356201 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045376, + "balance_loss_mlp": 1.0056082, + "epoch": 0.8982300884955752, + "flos": 768951448320.0, + "grad_norm": 0.036651937719319225, + "language_loss": 0.83371913, + "learning_rate": 2.691596129049556e-05, + "loss": 0.84417284, + "num_input_tokens_seen": 386434048, + "router_z_loss_mlp": 0.39746094, + "step": 4669, + "time_per_iteration": 2.9341423511505127 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045744, + "balance_loss_mlp": 1.00602329, + "epoch": 0.8984224701808388, + "flos": 846126061824.0, + "grad_norm": 0.036907430461080686, + "language_loss": 0.77928305, + "learning_rate": 2.681521445046775e-05, + "loss": 0.7897405, + "num_input_tokens_seen": 386532384, + "router_z_loss_mlp": 0.39697266, + "step": 4670, + "time_per_iteration": 3.222792625427246 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045725, + "balance_loss_mlp": 1.00593269, + "epoch": 0.8986148518661023, + "flos": 759100484352.0, + "grad_norm": 0.03395344580727902, + "language_loss": 0.76753604, + "learning_rate": 2.6714651317065963e-05, + "loss": 0.77799332, + "num_input_tokens_seen": 386627120, + "router_z_loss_mlp": 0.39770508, + "step": 4671, + "time_per_iteration": 3.153036594390869 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045308, + "balance_loss_mlp": 1.00551569, + "epoch": 0.8988072335513659, + "flos": 564147873024.0, + "grad_norm": 0.0448307164275886, + "language_loss": 0.76783341, + "learning_rate": 2.6614271929332133e-05, + "loss": 0.77828646, + "num_input_tokens_seen": 386700192, + "router_z_loss_mlp": 0.39770508, + "step": 4672, + "time_per_iteration": 2.7044925689697266 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046752, + "balance_loss_mlp": 1.00707936, + "epoch": 0.8989996152366295, + "flos": 493662074112.0, + "grad_norm": 0.04081416561791378, + "language_loss": 0.87663758, + "learning_rate": 2.6514076326237147e-05, + "loss": 0.88710511, + "num_input_tokens_seen": 386764256, + "router_z_loss_mlp": 0.39648438, + "step": 4673, + "time_per_iteration": 2.5325169563293457 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046908, + "balance_loss_mlp": 1.00721145, + "epoch": 0.899191996921893, + "flos": 543624549888.0, + "grad_norm": 0.04404130619875364, + "language_loss": 0.76413238, + "learning_rate": 2.6414064546680438e-05, + "loss": 0.77460146, + "num_input_tokens_seen": 386835792, + "router_z_loss_mlp": 0.39672852, + "step": 4674, + "time_per_iteration": 2.7047698497772217 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046859, + "balance_loss_mlp": 1.00709105, + "epoch": 0.8993843786071566, + "flos": 472309574400.0, + "grad_norm": 0.03723607547134794, + "language_loss": 0.79863477, + "learning_rate": 2.631423662948984e-05, + "loss": 0.80910337, + "num_input_tokens_seen": 386904368, + "router_z_loss_mlp": 0.39746094, + "step": 4675, + "time_per_iteration": 2.5789737701416016 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047065, + "balance_loss_mlp": 1.00727308, + "epoch": 0.8995767602924202, + "flos": 527818683648.0, + "grad_norm": 0.03740066278427069, + "language_loss": 0.82893097, + "learning_rate": 2.621459261342196e-05, + "loss": 0.8394016, + "num_input_tokens_seen": 386977872, + "router_z_loss_mlp": 0.39770508, + "step": 4676, + "time_per_iteration": 2.7864742279052734 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042225, + "balance_loss_mlp": 1.00233746, + "epoch": 0.8997691419776838, + "flos": 558712055040.0, + "grad_norm": 0.03256175332090757, + "language_loss": 0.85054183, + "learning_rate": 2.6115132537162245e-05, + "loss": 0.86096412, + "num_input_tokens_seen": 387052080, + "router_z_loss_mlp": 0.39868164, + "step": 4677, + "time_per_iteration": 2.677170753479004 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01041992, + "balance_loss_mlp": 1.00210464, + "epoch": 0.8999615236629472, + "flos": 640254182400.0, + "grad_norm": 0.03334384672588689, + "language_loss": 0.8101427, + "learning_rate": 2.601585643932436e-05, + "loss": 0.8205626, + "num_input_tokens_seen": 387129712, + "router_z_loss_mlp": 0.39868164, + "step": 4678, + "time_per_iteration": 2.851458787918091 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01041386, + "balance_loss_mlp": 1.00238037, + "epoch": 0.9001539053482108, + "flos": 1434591240960.0, + "grad_norm": 0.005247251501394147, + "language_loss": 0.85784018, + "learning_rate": 2.5916764358450862e-05, + "loss": 0.86825407, + "num_input_tokens_seen": 387356560, + "router_z_loss_mlp": 0.38964844, + "step": 4679, + "time_per_iteration": 4.843084812164307 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042195, + "balance_loss_mlp": 1.00230718, + "epoch": 0.9003462870334744, + "flos": 568036153344.0, + "grad_norm": 0.037250893678606165, + "language_loss": 0.80231905, + "learning_rate": 2.5817856333012425e-05, + "loss": 0.81274104, + "num_input_tokens_seen": 387438640, + "router_z_loss_mlp": 0.39868164, + "step": 4680, + "time_per_iteration": 2.8753278255462646 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042225, + "balance_loss_mlp": 1.00228965, + "epoch": 0.900538668718738, + "flos": 539706134016.0, + "grad_norm": 0.03919596922604473, + "language_loss": 0.78784555, + "learning_rate": 2.5719132401408883e-05, + "loss": 0.79826784, + "num_input_tokens_seen": 387507088, + "router_z_loss_mlp": 0.39916992, + "step": 4681, + "time_per_iteration": 2.638622283935547 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01041942, + "balance_loss_mlp": 1.00200713, + "epoch": 0.9007310504040016, + "flos": 489352886016.0, + "grad_norm": 0.03766496369551758, + "language_loss": 0.86354792, + "learning_rate": 2.5620592601968028e-05, + "loss": 0.87396729, + "num_input_tokens_seen": 387574160, + "router_z_loss_mlp": 0.39916992, + "step": 4682, + "time_per_iteration": 2.5297749042510986 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042236, + "balance_loss_mlp": 1.00232458, + "epoch": 0.9009234320892651, + "flos": 654141180672.0, + "grad_norm": 0.0372575142760268, + "language_loss": 0.79257679, + "learning_rate": 2.5522236972946532e-05, + "loss": 0.80299914, + "num_input_tokens_seen": 387652528, + "router_z_loss_mlp": 0.39892578, + "step": 4683, + "time_per_iteration": 2.843735694885254 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01041805, + "balance_loss_mlp": 1.00201344, + "epoch": 0.9011158137745287, + "flos": 546639912192.0, + "grad_norm": 0.03287971107611015, + "language_loss": 0.85687071, + "learning_rate": 2.5424065552529295e-05, + "loss": 0.86728871, + "num_input_tokens_seen": 387723520, + "router_z_loss_mlp": 0.39770508, + "step": 4684, + "time_per_iteration": 2.6854100227355957 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01041901, + "balance_loss_mlp": 1.00210893, + "epoch": 0.9013081954597922, + "flos": 560787425280.0, + "grad_norm": 0.03642552885134523, + "language_loss": 0.83117259, + "learning_rate": 2.532607837883011e-05, + "loss": 0.8415916, + "num_input_tokens_seen": 387793664, + "router_z_loss_mlp": 0.39770508, + "step": 4685, + "time_per_iteration": 2.6633992195129395 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104193, + "balance_loss_mlp": 1.0021615, + "epoch": 0.9015005771450558, + "flos": 729943233792.0, + "grad_norm": 0.03348030905856602, + "language_loss": 0.81757379, + "learning_rate": 2.5228275489890706e-05, + "loss": 0.82799315, + "num_input_tokens_seen": 387871008, + "router_z_loss_mlp": 0.39746094, + "step": 4686, + "time_per_iteration": 2.9701757431030273 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01041608, + "balance_loss_mlp": 1.00181627, + "epoch": 0.9016929588303193, + "flos": 518492640000.0, + "grad_norm": 0.03548126030039495, + "language_loss": 0.81336671, + "learning_rate": 2.5130656923681605e-05, + "loss": 0.8237828, + "num_input_tokens_seen": 387950832, + "router_z_loss_mlp": 0.39770508, + "step": 4687, + "time_per_iteration": 2.8197410106658936 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044246, + "balance_loss_mlp": 1.00445437, + "epoch": 0.9018853405155829, + "flos": 623555956224.0, + "grad_norm": 0.03200626373887687, + "language_loss": 0.86246824, + "learning_rate": 2.503322271810171e-05, + "loss": 0.87291074, + "num_input_tokens_seen": 388029792, + "router_z_loss_mlp": 0.39770508, + "step": 4688, + "time_per_iteration": 2.8695383071899414 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042269, + "balance_loss_mlp": 1.0023576, + "epoch": 0.9020777222008465, + "flos": 524338672128.0, + "grad_norm": 0.03434760886532221, + "language_loss": 0.78137249, + "learning_rate": 2.4935972910978378e-05, + "loss": 0.79179519, + "num_input_tokens_seen": 388095872, + "router_z_loss_mlp": 0.39892578, + "step": 4689, + "time_per_iteration": 2.6241252422332764 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01041799, + "balance_loss_mlp": 1.00188804, + "epoch": 0.9022701038861101, + "flos": 634894186752.0, + "grad_norm": 0.02955589406897841, + "language_loss": 0.82295549, + "learning_rate": 2.4838907540067346e-05, + "loss": 0.83337349, + "num_input_tokens_seen": 388171632, + "router_z_loss_mlp": 0.39892578, + "step": 4690, + "time_per_iteration": 2.8230092525482178 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104175, + "balance_loss_mlp": 1.00188613, + "epoch": 0.9024624855713737, + "flos": 514333151232.0, + "grad_norm": 0.032923690811660945, + "language_loss": 0.84795076, + "learning_rate": 2.474202664305253e-05, + "loss": 0.85836828, + "num_input_tokens_seen": 388242240, + "router_z_loss_mlp": 0.3984375, + "step": 4691, + "time_per_iteration": 2.6084606647491455 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044208, + "balance_loss_mlp": 1.00432038, + "epoch": 0.9026548672566371, + "flos": 478451114496.0, + "grad_norm": 0.03359361438657751, + "language_loss": 0.8643924, + "learning_rate": 2.464533025754673e-05, + "loss": 0.87483442, + "num_input_tokens_seen": 388310960, + "router_z_loss_mlp": 0.39868164, + "step": 4692, + "time_per_iteration": 2.624701738357544 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044579, + "balance_loss_mlp": 1.00469148, + "epoch": 0.9028472489419007, + "flos": 663171716352.0, + "grad_norm": 0.0375528831642029, + "language_loss": 0.74257243, + "learning_rate": 2.454881842109058e-05, + "loss": 0.75301814, + "num_input_tokens_seen": 388387280, + "router_z_loss_mlp": 0.39868164, + "step": 4693, + "time_per_iteration": 2.8537118434906006 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044968, + "balance_loss_mlp": 1.0050807, + "epoch": 0.9030396306271643, + "flos": 535620522240.0, + "grad_norm": 0.03845768429563665, + "language_loss": 0.82445383, + "learning_rate": 2.4452491171153445e-05, + "loss": 0.83490348, + "num_input_tokens_seen": 388456992, + "router_z_loss_mlp": 0.39868164, + "step": 4694, + "time_per_iteration": 2.632303237915039 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044581, + "balance_loss_mlp": 1.00474119, + "epoch": 0.9032320123124279, + "flos": 802384839168.0, + "grad_norm": 0.03517692965609126, + "language_loss": 0.82984042, + "learning_rate": 2.43563485451328e-05, + "loss": 0.84028614, + "num_input_tokens_seen": 388534896, + "router_z_loss_mlp": 0.39819336, + "step": 4695, + "time_per_iteration": 2.9748177528381348 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044714, + "balance_loss_mlp": 1.00484991, + "epoch": 0.9034243939976914, + "flos": 555025963776.0, + "grad_norm": 0.03909379859761359, + "language_loss": 0.77136493, + "learning_rate": 2.426039058035451e-05, + "loss": 0.78181207, + "num_input_tokens_seen": 388606640, + "router_z_loss_mlp": 0.3984375, + "step": 4696, + "time_per_iteration": 2.6545913219451904 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044716, + "balance_loss_mlp": 1.00485229, + "epoch": 0.903616775682955, + "flos": 504896292096.0, + "grad_norm": 0.03656588091717966, + "language_loss": 0.83326173, + "learning_rate": 2.4164617314072823e-05, + "loss": 0.84370893, + "num_input_tokens_seen": 388675920, + "router_z_loss_mlp": 0.3984375, + "step": 4697, + "time_per_iteration": 2.6118416786193848 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046982, + "balance_loss_mlp": 1.00716567, + "epoch": 0.9038091573682185, + "flos": 437256714240.0, + "grad_norm": 0.03454643781931151, + "language_loss": 0.79546142, + "learning_rate": 2.406902878347017e-05, + "loss": 0.80593121, + "num_input_tokens_seen": 388743968, + "router_z_loss_mlp": 0.39794922, + "step": 4698, + "time_per_iteration": 2.627512216567993 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044356, + "balance_loss_mlp": 1.00444448, + "epoch": 0.9040015390534821, + "flos": 533990359296.0, + "grad_norm": 0.04424640859343309, + "language_loss": 0.82207114, + "learning_rate": 2.3973625025657253e-05, + "loss": 0.8325147, + "num_input_tokens_seen": 388810784, + "router_z_loss_mlp": 0.39892578, + "step": 4699, + "time_per_iteration": 2.639582872390747 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042857, + "balance_loss_mlp": 1.00299335, + "epoch": 0.9041939207387457, + "flos": 565431005184.0, + "grad_norm": 0.053462163472176805, + "language_loss": 0.80744809, + "learning_rate": 2.3878406077673275e-05, + "loss": 0.8178767, + "num_input_tokens_seen": 388885072, + "router_z_loss_mlp": 0.3984375, + "step": 4700, + "time_per_iteration": 2.7810487747192383 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042912, + "balance_loss_mlp": 1.00300074, + "epoch": 0.9043863024240092, + "flos": 516521282304.0, + "grad_norm": 0.05013592196212244, + "language_loss": 0.78212988, + "learning_rate": 2.3783371976485447e-05, + "loss": 0.79255903, + "num_input_tokens_seen": 388951184, + "router_z_loss_mlp": 0.39892578, + "step": 4701, + "time_per_iteration": 2.604733943939209 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042545, + "balance_loss_mlp": 1.00373077, + "epoch": 0.9045786841092728, + "flos": 1280785397760.0, + "grad_norm": 0.004379437455403695, + "language_loss": 0.72929788, + "learning_rate": 2.368852275898914e-05, + "loss": 0.73972332, + "num_input_tokens_seen": 389170752, + "router_z_loss_mlp": 0.38769531, + "step": 4702, + "time_per_iteration": 5.015188455581665 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042071, + "balance_loss_mlp": 1.00230312, + "epoch": 0.9047710657945364, + "flos": 586933204224.0, + "grad_norm": 0.15726784027551832, + "language_loss": 0.83084238, + "learning_rate": 2.3593858462008178e-05, + "loss": 0.84126312, + "num_input_tokens_seen": 389239600, + "router_z_loss_mlp": 0.39746094, + "step": 4703, + "time_per_iteration": 2.6818783283233643 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01041715, + "balance_loss_mlp": 1.001899, + "epoch": 0.9049634474798, + "flos": 573072450816.0, + "grad_norm": 0.03926404752899239, + "language_loss": 0.79924166, + "learning_rate": 2.3499379122294495e-05, + "loss": 0.80965883, + "num_input_tokens_seen": 389316032, + "router_z_loss_mlp": 0.39794922, + "step": 4704, + "time_per_iteration": 2.7599122524261475 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01041497, + "balance_loss_mlp": 1.00168145, + "epoch": 0.9051558291650635, + "flos": 573688744704.0, + "grad_norm": 0.041200102084881625, + "language_loss": 0.7489146, + "learning_rate": 2.3405084776528307e-05, + "loss": 0.75932956, + "num_input_tokens_seen": 389383504, + "router_z_loss_mlp": 0.39794922, + "step": 4705, + "time_per_iteration": 2.6949710845947266 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01041663, + "balance_loss_mlp": 1.00184762, + "epoch": 0.905348210850327, + "flos": 541577369856.0, + "grad_norm": 0.03988349489111128, + "language_loss": 0.7962532, + "learning_rate": 2.331097546131783e-05, + "loss": 0.80666983, + "num_input_tokens_seen": 389454592, + "router_z_loss_mlp": 0.39794922, + "step": 4706, + "time_per_iteration": 2.6960105895996094 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01041736, + "balance_loss_mlp": 1.00196815, + "epoch": 0.9055405925355906, + "flos": 517396145664.0, + "grad_norm": 0.03894790437085165, + "language_loss": 0.81917119, + "learning_rate": 2.321705121319956e-05, + "loss": 0.82958859, + "num_input_tokens_seen": 389519696, + "router_z_loss_mlp": 0.39746094, + "step": 4707, + "time_per_iteration": 2.579784393310547 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01041807, + "balance_loss_mlp": 1.00206232, + "epoch": 0.9057329742208542, + "flos": 916223033856.0, + "grad_norm": 0.03029944141682911, + "language_loss": 0.85198569, + "learning_rate": 2.3123312068638104e-05, + "loss": 0.86240375, + "num_input_tokens_seen": 389603568, + "router_z_loss_mlp": 0.3972168, + "step": 4708, + "time_per_iteration": 3.178318500518799 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043282, + "balance_loss_mlp": 1.00348961, + "epoch": 0.9059253559061178, + "flos": 906777426432.0, + "grad_norm": 0.03975505233074101, + "language_loss": 0.83321095, + "learning_rate": 2.3029758064026295e-05, + "loss": 0.84364378, + "num_input_tokens_seen": 389687504, + "router_z_loss_mlp": 0.39770508, + "step": 4709, + "time_per_iteration": 3.1223511695861816 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043135, + "balance_loss_mlp": 1.00327146, + "epoch": 0.9061177375913813, + "flos": 665803109376.0, + "grad_norm": 0.042986952434846105, + "language_loss": 0.78179657, + "learning_rate": 2.2936389235684918e-05, + "loss": 0.79222792, + "num_input_tokens_seen": 389764880, + "router_z_loss_mlp": 0.3984375, + "step": 4710, + "time_per_iteration": 2.904844284057617 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043591, + "balance_loss_mlp": 1.00377488, + "epoch": 0.9063101192766448, + "flos": 566779266048.0, + "grad_norm": 0.03589709877789938, + "language_loss": 0.82981563, + "learning_rate": 2.2843205619862972e-05, + "loss": 0.84025156, + "num_input_tokens_seen": 389838304, + "router_z_loss_mlp": 0.39794922, + "step": 4711, + "time_per_iteration": 2.7522215843200684 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043222, + "balance_loss_mlp": 1.00335848, + "epoch": 0.9065025009619084, + "flos": 728631911424.0, + "grad_norm": 0.04760632658729641, + "language_loss": 0.79660898, + "learning_rate": 2.2750207252737742e-05, + "loss": 0.80704117, + "num_input_tokens_seen": 389908592, + "router_z_loss_mlp": 0.3984375, + "step": 4712, + "time_per_iteration": 2.874788761138916 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043296, + "balance_loss_mlp": 1.00345671, + "epoch": 0.906694882647172, + "flos": 532548779520.0, + "grad_norm": 0.046151346888196505, + "language_loss": 0.80804384, + "learning_rate": 2.265739417041418e-05, + "loss": 0.8184768, + "num_input_tokens_seen": 389979040, + "router_z_loss_mlp": 0.39819336, + "step": 4713, + "time_per_iteration": 2.648693084716797 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043542, + "balance_loss_mlp": 1.00375044, + "epoch": 0.9068872643324356, + "flos": 430696211712.0, + "grad_norm": 0.0360775633436318, + "language_loss": 0.85277104, + "learning_rate": 2.2564766408925574e-05, + "loss": 0.86320645, + "num_input_tokens_seen": 390046080, + "router_z_loss_mlp": 0.39770508, + "step": 4714, + "time_per_iteration": 2.6000893115997314 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01041595, + "balance_loss_mlp": 1.00177884, + "epoch": 0.9070796460176991, + "flos": 589455727104.0, + "grad_norm": 0.044653943193246386, + "language_loss": 0.80620706, + "learning_rate": 2.2472324004233214e-05, + "loss": 0.81662303, + "num_input_tokens_seen": 390122176, + "router_z_loss_mlp": 0.39794922, + "step": 4715, + "time_per_iteration": 2.7455286979675293 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01041192, + "balance_loss_mlp": 1.00135255, + "epoch": 0.9072720277029627, + "flos": 572655433728.0, + "grad_norm": 0.03585300733233402, + "language_loss": 0.75799972, + "learning_rate": 2.2380066992226446e-05, + "loss": 0.76841164, + "num_input_tokens_seen": 390195216, + "router_z_loss_mlp": 0.39819336, + "step": 4716, + "time_per_iteration": 2.742478847503662 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01041496, + "balance_loss_mlp": 1.00165629, + "epoch": 0.9074644093882263, + "flos": 556860261120.0, + "grad_norm": 0.03498331217535132, + "language_loss": 0.8895576, + "learning_rate": 2.2287995408722617e-05, + "loss": 0.89997262, + "num_input_tokens_seen": 390263216, + "router_z_loss_mlp": 0.39819336, + "step": 4717, + "time_per_iteration": 2.6485562324523926 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042277, + "balance_loss_mlp": 1.00248528, + "epoch": 0.9076567910734898, + "flos": 642173050368.0, + "grad_norm": 0.04530512053024163, + "language_loss": 0.83037788, + "learning_rate": 2.2196109289467083e-05, + "loss": 0.84080064, + "num_input_tokens_seen": 390337360, + "router_z_loss_mlp": 0.39770508, + "step": 4718, + "time_per_iteration": 2.8499996662139893 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042708, + "balance_loss_mlp": 1.00291562, + "epoch": 0.9078491727587533, + "flos": 735457786368.0, + "grad_norm": 0.033395309611424465, + "language_loss": 0.82320893, + "learning_rate": 2.2104408670133193e-05, + "loss": 0.83363599, + "num_input_tokens_seen": 390427728, + "router_z_loss_mlp": 0.39770508, + "step": 4719, + "time_per_iteration": 3.0724966526031494 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043108, + "balance_loss_mlp": 1.00336421, + "epoch": 0.9080415544440169, + "flos": 656021164800.0, + "grad_norm": 0.031212172268033682, + "language_loss": 0.87156701, + "learning_rate": 2.2012893586322245e-05, + "loss": 0.88199806, + "num_input_tokens_seen": 390504736, + "router_z_loss_mlp": 0.3972168, + "step": 4720, + "time_per_iteration": 2.8246238231658936 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043305, + "balance_loss_mlp": 1.0035367, + "epoch": 0.9082339361292805, + "flos": 598603881216.0, + "grad_norm": 0.03520689760481103, + "language_loss": 0.79804933, + "learning_rate": 2.1921564073563604e-05, + "loss": 0.80848241, + "num_input_tokens_seen": 390582048, + "router_z_loss_mlp": 0.39746094, + "step": 4721, + "time_per_iteration": 2.742318630218506 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104355, + "balance_loss_mlp": 1.00371027, + "epoch": 0.9084263178145441, + "flos": 505426070016.0, + "grad_norm": 0.036483558406201176, + "language_loss": 0.84666395, + "learning_rate": 2.183042016731457e-05, + "loss": 0.85709947, + "num_input_tokens_seen": 390652976, + "router_z_loss_mlp": 0.39819336, + "step": 4722, + "time_per_iteration": 2.6103546619415283 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043538, + "balance_loss_mlp": 1.00362682, + "epoch": 0.9086186994998077, + "flos": 551107547904.0, + "grad_norm": 0.03608698449655091, + "language_loss": 0.80968702, + "learning_rate": 2.1739461902960223e-05, + "loss": 0.82012242, + "num_input_tokens_seen": 390726832, + "router_z_loss_mlp": 0.39892578, + "step": 4723, + "time_per_iteration": 2.713469982147217 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01041748, + "balance_loss_mlp": 1.00195599, + "epoch": 0.9088110811850711, + "flos": 1135909979904.0, + "grad_norm": 0.07605585117528456, + "language_loss": 0.75542474, + "learning_rate": 2.1648689315813763e-05, + "loss": 0.76584214, + "num_input_tokens_seen": 390824480, + "router_z_loss_mlp": 0.39770508, + "step": 4724, + "time_per_iteration": 3.526309013366699 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01041464, + "balance_loss_mlp": 1.00164771, + "epoch": 0.9090034628703347, + "flos": 558060768000.0, + "grad_norm": 0.03248840709412771, + "language_loss": 0.77556646, + "learning_rate": 2.155810244111628e-05, + "loss": 0.78598112, + "num_input_tokens_seen": 390897552, + "router_z_loss_mlp": 0.39794922, + "step": 4725, + "time_per_iteration": 2.6785218715667725 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01041813, + "balance_loss_mlp": 1.0020206, + "epoch": 0.9091958445555983, + "flos": 545066129664.0, + "grad_norm": 0.03903153521101798, + "language_loss": 0.84798872, + "learning_rate": 2.146770131403658e-05, + "loss": 0.85840684, + "num_input_tokens_seen": 390969008, + "router_z_loss_mlp": 0.39770508, + "step": 4726, + "time_per_iteration": 2.6782310009002686 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01041721, + "balance_loss_mlp": 1.00192916, + "epoch": 0.9093882262408619, + "flos": 527141151744.0, + "grad_norm": 0.05792704436205886, + "language_loss": 0.8138569, + "learning_rate": 2.1377485969671594e-05, + "loss": 0.82427418, + "num_input_tokens_seen": 391038880, + "router_z_loss_mlp": 0.39770508, + "step": 4727, + "time_per_iteration": 2.6294679641723633 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043028, + "balance_loss_mlp": 1.0033077, + "epoch": 0.9095806079261254, + "flos": 549572649216.0, + "grad_norm": 0.038657617554072196, + "language_loss": 0.82212007, + "learning_rate": 2.1287456443046084e-05, + "loss": 0.83255029, + "num_input_tokens_seen": 391106720, + "router_z_loss_mlp": 0.39697266, + "step": 4728, + "time_per_iteration": 2.63301944732666 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043715, + "balance_loss_mlp": 1.00389957, + "epoch": 0.909772989611389, + "flos": 573641112576.0, + "grad_norm": 0.03527373598535026, + "language_loss": 0.84725654, + "learning_rate": 2.1197612769112528e-05, + "loss": 0.85769367, + "num_input_tokens_seen": 391178128, + "router_z_loss_mlp": 0.39794922, + "step": 4729, + "time_per_iteration": 2.725679636001587 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043027, + "balance_loss_mlp": 1.00318694, + "epoch": 0.9099653712966526, + "flos": 562882237440.0, + "grad_norm": 0.04198417845099068, + "language_loss": 0.80126184, + "learning_rate": 2.1107954982751254e-05, + "loss": 0.81169212, + "num_input_tokens_seen": 391248848, + "router_z_loss_mlp": 0.39819336, + "step": 4730, + "time_per_iteration": 2.7127773761749268 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044266, + "balance_loss_mlp": 1.00447404, + "epoch": 0.9101577529819161, + "flos": 1095499069440.0, + "grad_norm": 0.03820196979176914, + "language_loss": 0.80539393, + "learning_rate": 2.101848311877069e-05, + "loss": 0.81583661, + "num_input_tokens_seen": 391328000, + "router_z_loss_mlp": 0.39770508, + "step": 4731, + "time_per_iteration": 3.3886983394622803 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042072, + "balance_loss_mlp": 1.00230372, + "epoch": 0.9103501346671797, + "flos": 446361126912.0, + "grad_norm": 0.04074901158585317, + "language_loss": 0.82276326, + "learning_rate": 2.092919721190678e-05, + "loss": 0.833184, + "num_input_tokens_seen": 391391616, + "router_z_loss_mlp": 0.39746094, + "step": 4732, + "time_per_iteration": 2.528346300125122 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042157, + "balance_loss_mlp": 1.00234151, + "epoch": 0.9105425163524432, + "flos": 501813855744.0, + "grad_norm": 0.04168871505997614, + "language_loss": 0.78099614, + "learning_rate": 2.0840097296823346e-05, + "loss": 0.79141772, + "num_input_tokens_seen": 391461312, + "router_z_loss_mlp": 0.39794922, + "step": 4733, + "time_per_iteration": 2.612539768218994 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042251, + "balance_loss_mlp": 1.00245869, + "epoch": 0.9107348980377068, + "flos": 658776012288.0, + "grad_norm": 0.03125846585912685, + "language_loss": 0.84275806, + "learning_rate": 2.0751183408112162e-05, + "loss": 0.85318053, + "num_input_tokens_seen": 391542192, + "router_z_loss_mlp": 0.39770508, + "step": 4734, + "time_per_iteration": 2.8593521118164062 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01041871, + "balance_loss_mlp": 1.0021503, + "epoch": 0.9109272797229704, + "flos": 554719762176.0, + "grad_norm": 0.03419361098029299, + "language_loss": 0.85598445, + "learning_rate": 2.066245558029256e-05, + "loss": 0.8664031, + "num_input_tokens_seen": 391609968, + "router_z_loss_mlp": 0.39697266, + "step": 4735, + "time_per_iteration": 2.6386280059814453 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01041636, + "balance_loss_mlp": 1.00186801, + "epoch": 0.911119661408234, + "flos": 520011987456.0, + "grad_norm": 0.03913303108798507, + "language_loss": 0.84620136, + "learning_rate": 2.057391384781182e-05, + "loss": 0.85661769, + "num_input_tokens_seen": 391681264, + "router_z_loss_mlp": 0.39746094, + "step": 4736, + "time_per_iteration": 2.6297035217285156 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01041906, + "balance_loss_mlp": 1.00218534, + "epoch": 0.9113120430934974, + "flos": 555436177920.0, + "grad_norm": 0.03828883818672991, + "language_loss": 0.83354127, + "learning_rate": 2.0485558245044834e-05, + "loss": 0.84396034, + "num_input_tokens_seen": 391751392, + "router_z_loss_mlp": 0.39697266, + "step": 4737, + "time_per_iteration": 2.6358323097229004 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043517, + "balance_loss_mlp": 1.00372517, + "epoch": 0.911504424778761, + "flos": 502957982208.0, + "grad_norm": 0.0363931699668563, + "language_loss": 0.81623733, + "learning_rate": 2.0397388806294216e-05, + "loss": 0.82667255, + "num_input_tokens_seen": 391823952, + "router_z_loss_mlp": 0.39770508, + "step": 4738, + "time_per_iteration": 2.6258926391601562 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043564, + "balance_loss_mlp": 1.00374854, + "epoch": 0.9116968064640246, + "flos": 612212868096.0, + "grad_norm": 0.03297484419299765, + "language_loss": 0.82335055, + "learning_rate": 2.0309405565790527e-05, + "loss": 0.83378625, + "num_input_tokens_seen": 391895264, + "router_z_loss_mlp": 0.39794922, + "step": 4739, + "time_per_iteration": 2.6935737133026123 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044368, + "balance_loss_mlp": 1.00457621, + "epoch": 0.9118891881492882, + "flos": 574095068160.0, + "grad_norm": 0.041988619549609606, + "language_loss": 0.82920527, + "learning_rate": 2.0221608557691895e-05, + "loss": 0.8396489, + "num_input_tokens_seen": 391973040, + "router_z_loss_mlp": 0.39770508, + "step": 4740, + "time_per_iteration": 2.8001527786254883 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043479, + "balance_loss_mlp": 1.00356817, + "epoch": 0.9120815698345518, + "flos": 637173691392.0, + "grad_norm": 0.03597469929718847, + "language_loss": 0.78218091, + "learning_rate": 2.0133997816083992e-05, + "loss": 0.79261565, + "num_input_tokens_seen": 392048160, + "router_z_loss_mlp": 0.39892578, + "step": 4741, + "time_per_iteration": 2.8082351684570312 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104157, + "balance_loss_mlp": 1.00177824, + "epoch": 0.9122739515198153, + "flos": 703556381952.0, + "grad_norm": 0.051345638115528766, + "language_loss": 0.86597633, + "learning_rate": 2.0046573374980447e-05, + "loss": 0.87639201, + "num_input_tokens_seen": 392128960, + "router_z_loss_mlp": 0.39770508, + "step": 4742, + "time_per_iteration": 2.846757650375366 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042847, + "balance_loss_mlp": 1.0030551, + "epoch": 0.9124663332050789, + "flos": 525717068544.0, + "grad_norm": 0.04127672147297515, + "language_loss": 0.88021904, + "learning_rate": 1.995933526832239e-05, + "loss": 0.89064753, + "num_input_tokens_seen": 392195008, + "router_z_loss_mlp": 0.39770508, + "step": 4743, + "time_per_iteration": 2.5983972549438477 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042402, + "balance_loss_mlp": 1.00258601, + "epoch": 0.9126587148903424, + "flos": 564371449344.0, + "grad_norm": 0.03669827673453058, + "language_loss": 0.83077073, + "learning_rate": 1.9872283529978662e-05, + "loss": 0.84119469, + "num_input_tokens_seen": 392265168, + "router_z_loss_mlp": 0.39794922, + "step": 4744, + "time_per_iteration": 2.638869524002075 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042499, + "balance_loss_mlp": 1.00275481, + "epoch": 0.912851096575606, + "flos": 506934723840.0, + "grad_norm": 0.040984614191802604, + "language_loss": 0.80266577, + "learning_rate": 1.978541819374574e-05, + "loss": 0.8130908, + "num_input_tokens_seen": 392329456, + "router_z_loss_mlp": 0.3972168, + "step": 4745, + "time_per_iteration": 2.6040964126586914 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042012, + "balance_loss_mlp": 1.0022434, + "epoch": 0.9130434782608695, + "flos": 551769528576.0, + "grad_norm": 0.035175280815842924, + "language_loss": 0.83013141, + "learning_rate": 1.9698739293347755e-05, + "loss": 0.8405515, + "num_input_tokens_seen": 392397792, + "router_z_loss_mlp": 0.39746094, + "step": 4746, + "time_per_iteration": 2.621307373046875 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042031, + "balance_loss_mlp": 1.00223851, + "epoch": 0.9132358599461331, + "flos": 469936750848.0, + "grad_norm": 0.035669438034904535, + "language_loss": 0.83020693, + "learning_rate": 1.9612246862436456e-05, + "loss": 0.84062719, + "num_input_tokens_seen": 392462928, + "router_z_loss_mlp": 0.39770508, + "step": 4747, + "time_per_iteration": 2.5283937454223633 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01041728, + "balance_loss_mlp": 1.00196004, + "epoch": 0.9134282416313967, + "flos": 507101919744.0, + "grad_norm": 0.03785886131144422, + "language_loss": 0.80289733, + "learning_rate": 1.9525940934591148e-05, + "loss": 0.81331468, + "num_input_tokens_seen": 392531840, + "router_z_loss_mlp": 0.39746094, + "step": 4748, + "time_per_iteration": 2.63830304145813 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043601, + "balance_loss_mlp": 1.00378549, + "epoch": 0.9136206233166603, + "flos": 605939125248.0, + "grad_norm": 0.03504382906391139, + "language_loss": 0.84374118, + "learning_rate": 1.9439821543318748e-05, + "loss": 0.85417724, + "num_input_tokens_seen": 392602464, + "router_z_loss_mlp": 0.39794922, + "step": 4749, + "time_per_iteration": 2.765106439590454 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043164, + "balance_loss_mlp": 1.00327647, + "epoch": 0.9138130050019239, + "flos": 562825857024.0, + "grad_norm": 0.037154786087076466, + "language_loss": 0.83264536, + "learning_rate": 1.9353888722053793e-05, + "loss": 0.843077, + "num_input_tokens_seen": 392669872, + "router_z_loss_mlp": 0.39868164, + "step": 4750, + "time_per_iteration": 2.6922707557678223 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043297, + "balance_loss_mlp": 1.00345743, + "epoch": 0.9140053866871873, + "flos": 691345233408.0, + "grad_norm": 0.032511709695782105, + "language_loss": 0.90623546, + "learning_rate": 1.9268142504158426e-05, + "loss": 0.91666842, + "num_input_tokens_seen": 392744256, + "router_z_loss_mlp": 0.39819336, + "step": 4751, + "time_per_iteration": 2.818824291229248 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043297, + "balance_loss_mlp": 1.00350499, + "epoch": 0.9141977683724509, + "flos": 552130165248.0, + "grad_norm": 0.03267689377344242, + "language_loss": 0.8422156, + "learning_rate": 1.9182582922922186e-05, + "loss": 0.85264862, + "num_input_tokens_seen": 392816832, + "router_z_loss_mlp": 0.39770508, + "step": 4752, + "time_per_iteration": 2.717310905456543 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104181, + "balance_loss_mlp": 1.00199425, + "epoch": 0.9143901500577145, + "flos": 541121468928.0, + "grad_norm": 0.04120763635430611, + "language_loss": 0.75722265, + "learning_rate": 1.9097210011562228e-05, + "loss": 0.76764077, + "num_input_tokens_seen": 392886304, + "router_z_loss_mlp": 0.39794922, + "step": 4753, + "time_per_iteration": 2.705686092376709 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01041376, + "balance_loss_mlp": 1.00158441, + "epoch": 0.9145825317429781, + "flos": 529793932032.0, + "grad_norm": 0.03780448267955887, + "language_loss": 0.81331134, + "learning_rate": 1.9012023803223366e-05, + "loss": 0.8237251, + "num_input_tokens_seen": 392955872, + "router_z_loss_mlp": 0.39770508, + "step": 4754, + "time_per_iteration": 2.6305735111236572 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01041884, + "balance_loss_mlp": 1.0020684, + "epoch": 0.9147749134282416, + "flos": 515813614848.0, + "grad_norm": 0.034250485968984036, + "language_loss": 0.79676282, + "learning_rate": 1.892702433097776e-05, + "loss": 0.80718166, + "num_input_tokens_seen": 393025776, + "router_z_loss_mlp": 0.39794922, + "step": 4755, + "time_per_iteration": 2.627593755722046 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042217, + "balance_loss_mlp": 1.00247312, + "epoch": 0.9149672951135052, + "flos": 515514216192.0, + "grad_norm": 0.037789408744826655, + "language_loss": 0.86363244, + "learning_rate": 1.8842211627825233e-05, + "loss": 0.87405461, + "num_input_tokens_seen": 393095936, + "router_z_loss_mlp": 0.3972168, + "step": 4756, + "time_per_iteration": 2.6330106258392334 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042561, + "balance_loss_mlp": 1.0027926, + "epoch": 0.9151596767987688, + "flos": 578228312064.0, + "grad_norm": 0.03566379548068454, + "language_loss": 0.81810522, + "learning_rate": 1.8757585726692727e-05, + "loss": 0.82853079, + "num_input_tokens_seen": 393166816, + "router_z_loss_mlp": 0.39746094, + "step": 4757, + "time_per_iteration": 2.7305617332458496 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044663, + "balance_loss_mlp": 1.00487077, + "epoch": 0.9153520584840323, + "flos": 620477410560.0, + "grad_norm": 0.033671157071198325, + "language_loss": 0.82926512, + "learning_rate": 1.8673146660435182e-05, + "loss": 0.83971179, + "num_input_tokens_seen": 393242176, + "router_z_loss_mlp": 0.39770508, + "step": 4758, + "time_per_iteration": 2.732607126235962 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043403, + "balance_loss_mlp": 1.00351596, + "epoch": 0.9155444401692959, + "flos": 469862873856.0, + "grad_norm": 0.038586865658807416, + "language_loss": 0.83053923, + "learning_rate": 1.8588894461834704e-05, + "loss": 0.84097326, + "num_input_tokens_seen": 393311792, + "router_z_loss_mlp": 0.39868164, + "step": 4759, + "time_per_iteration": 2.598182439804077 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01040733, + "balance_loss_mlp": 1.00182343, + "epoch": 0.9157368218545594, + "flos": 1413842396160.0, + "grad_norm": 0.004422108568448796, + "language_loss": 0.7481907, + "learning_rate": 1.8504829163600855e-05, + "loss": 0.75859803, + "num_input_tokens_seen": 393535648, + "router_z_loss_mlp": 0.38867188, + "step": 4760, + "time_per_iteration": 4.8417699337005615 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01040726, + "balance_loss_mlp": 1.0018158, + "epoch": 0.915929203539823, + "flos": 1525327209216.0, + "grad_norm": 0.004306437529183597, + "language_loss": 0.79576051, + "learning_rate": 1.8420950798370584e-05, + "loss": 0.80616784, + "num_input_tokens_seen": 393767040, + "router_z_loss_mlp": 0.38867188, + "step": 4761, + "time_per_iteration": 4.890992641448975 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01041962, + "balance_loss_mlp": 1.00209796, + "epoch": 0.9161215852250866, + "flos": 536847273984.0, + "grad_norm": 0.03478123106753543, + "language_loss": 0.80950373, + "learning_rate": 1.8337259398708616e-05, + "loss": 0.81992334, + "num_input_tokens_seen": 393841232, + "router_z_loss_mlp": 0.3984375, + "step": 4762, + "time_per_iteration": 2.7062151432037354 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042488, + "balance_loss_mlp": 1.00271964, + "epoch": 0.9163139669103502, + "flos": 591726483456.0, + "grad_norm": 0.03725306331667297, + "language_loss": 0.80738699, + "learning_rate": 1.8253754997106632e-05, + "loss": 0.81781185, + "num_input_tokens_seen": 393910512, + "router_z_loss_mlp": 0.39746094, + "step": 4763, + "time_per_iteration": 2.6649303436279297 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042635, + "balance_loss_mlp": 1.00289094, + "epoch": 0.9165063485956138, + "flos": 823372811520.0, + "grad_norm": 0.03263551709218471, + "language_loss": 0.85072815, + "learning_rate": 1.817043762598397e-05, + "loss": 0.86115456, + "num_input_tokens_seen": 393988624, + "router_z_loss_mlp": 0.3972168, + "step": 4764, + "time_per_iteration": 3.0888671875 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042796, + "balance_loss_mlp": 1.00302815, + "epoch": 0.9166987302808772, + "flos": 526246846464.0, + "grad_norm": 0.034405436737363966, + "language_loss": 0.82834673, + "learning_rate": 1.8087307317687264e-05, + "loss": 0.83877468, + "num_input_tokens_seen": 394059184, + "router_z_loss_mlp": 0.39746094, + "step": 4765, + "time_per_iteration": 2.612149953842163 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01041715, + "balance_loss_mlp": 1.00197029, + "epoch": 0.9168911119661408, + "flos": 656346808320.0, + "grad_norm": 0.03211426888442645, + "language_loss": 0.8478266, + "learning_rate": 1.800436410449058e-05, + "loss": 0.85824376, + "num_input_tokens_seen": 394142160, + "router_z_loss_mlp": 0.3972168, + "step": 4766, + "time_per_iteration": 2.9523656368255615 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043286, + "balance_loss_mlp": 1.00349426, + "epoch": 0.9170834936514044, + "flos": 492722082048.0, + "grad_norm": 0.03421441707502224, + "language_loss": 0.85402191, + "learning_rate": 1.7921608018595436e-05, + "loss": 0.86445475, + "num_input_tokens_seen": 394207056, + "router_z_loss_mlp": 0.39770508, + "step": 4767, + "time_per_iteration": 2.5405280590057373 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043107, + "balance_loss_mlp": 1.00329077, + "epoch": 0.917275875336668, + "flos": 629180357376.0, + "grad_norm": 0.0411136231829451, + "language_loss": 0.81023633, + "learning_rate": 1.7839039092130415e-05, + "loss": 0.82066739, + "num_input_tokens_seen": 394275456, + "router_z_loss_mlp": 0.39794922, + "step": 4768, + "time_per_iteration": 2.7763261795043945 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042786, + "balance_loss_mlp": 1.00387573, + "epoch": 0.9174682570219315, + "flos": 1521215352576.0, + "grad_norm": 0.005980393358773289, + "language_loss": 0.78180236, + "learning_rate": 1.7756657357151762e-05, + "loss": 0.79223019, + "num_input_tokens_seen": 394503808, + "router_z_loss_mlp": 0.38867188, + "step": 4769, + "time_per_iteration": 4.983697175979614 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01041776, + "balance_loss_mlp": 1.00200808, + "epoch": 0.917660638707195, + "flos": 561113068800.0, + "grad_norm": 0.03356610756304218, + "language_loss": 0.85551798, + "learning_rate": 1.7674462845642835e-05, + "loss": 0.8659358, + "num_input_tokens_seen": 394573776, + "router_z_loss_mlp": 0.39746094, + "step": 4770, + "time_per_iteration": 2.656285285949707 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104163, + "balance_loss_mlp": 1.00181425, + "epoch": 0.9178530203924586, + "flos": 448175982336.0, + "grad_norm": 0.03590542573213615, + "language_loss": 0.84399128, + "learning_rate": 1.7592455589514387e-05, + "loss": 0.85440755, + "num_input_tokens_seen": 394637600, + "router_z_loss_mlp": 0.39794922, + "step": 4771, + "time_per_iteration": 2.4805028438568115 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01041706, + "balance_loss_mlp": 1.00193799, + "epoch": 0.9180454020777222, + "flos": 466975823616.0, + "grad_norm": 0.03408749549297663, + "language_loss": 0.81331682, + "learning_rate": 1.7510635620604453e-05, + "loss": 0.82373387, + "num_input_tokens_seen": 394707344, + "router_z_loss_mlp": 0.39746094, + "step": 4772, + "time_per_iteration": 2.5695180892944336 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01041733, + "balance_loss_mlp": 1.00198889, + "epoch": 0.9182377837629858, + "flos": 597485999616.0, + "grad_norm": 0.03332640277361682, + "language_loss": 0.87420475, + "learning_rate": 1.74290029706784e-05, + "loss": 0.8846221, + "num_input_tokens_seen": 394786368, + "router_z_loss_mlp": 0.3972168, + "step": 4773, + "time_per_iteration": 2.815886974334717 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01041932, + "balance_loss_mlp": 1.00216413, + "epoch": 0.9184301654482493, + "flos": 998362013184.0, + "grad_norm": 0.03239829310404249, + "language_loss": 0.83262658, + "learning_rate": 1.734755767142876e-05, + "loss": 0.84304595, + "num_input_tokens_seen": 394876976, + "router_z_loss_mlp": 0.39746094, + "step": 4774, + "time_per_iteration": 3.356502056121826 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044271, + "balance_loss_mlp": 1.00445557, + "epoch": 0.9186225471335129, + "flos": 509902454016.0, + "grad_norm": 0.03056870137677778, + "language_loss": 0.8524617, + "learning_rate": 1.7266299754475467e-05, + "loss": 0.86290443, + "num_input_tokens_seen": 394949024, + "router_z_loss_mlp": 0.39794922, + "step": 4775, + "time_per_iteration": 2.62768292427063 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044303, + "balance_loss_mlp": 1.00448751, + "epoch": 0.9188149288187765, + "flos": 942078162432.0, + "grad_norm": 0.03846714084207721, + "language_loss": 0.79003048, + "learning_rate": 1.718522925136551e-05, + "loss": 0.80047351, + "num_input_tokens_seen": 395044352, + "router_z_loss_mlp": 0.39794922, + "step": 4776, + "time_per_iteration": 3.272127389907837 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044292, + "balance_loss_mlp": 1.0045718, + "epoch": 0.91900731050404, + "flos": 584764515072.0, + "grad_norm": 0.03467448263421975, + "language_loss": 0.84587908, + "learning_rate": 1.7104346193573484e-05, + "loss": 0.85632205, + "num_input_tokens_seen": 395113824, + "router_z_loss_mlp": 0.39697266, + "step": 4777, + "time_per_iteration": 2.6851980686187744 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01041484, + "balance_loss_mlp": 1.00169146, + "epoch": 0.9191996921893035, + "flos": 582307120896.0, + "grad_norm": 0.04359090414483508, + "language_loss": 0.80117047, + "learning_rate": 1.7023650612500828e-05, + "loss": 0.81158531, + "num_input_tokens_seen": 395184496, + "router_z_loss_mlp": 0.39770508, + "step": 4778, + "time_per_iteration": 2.7560482025146484 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01041705, + "balance_loss_mlp": 1.00188911, + "epoch": 0.9193920738745671, + "flos": 910417830912.0, + "grad_norm": 0.03787808784410227, + "language_loss": 0.8024419, + "learning_rate": 1.6943142539476374e-05, + "loss": 0.81285894, + "num_input_tokens_seen": 395263760, + "router_z_loss_mlp": 0.39794922, + "step": 4779, + "time_per_iteration": 3.092148780822754 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01041912, + "balance_loss_mlp": 1.00309753, + "epoch": 0.9195844555598307, + "flos": 1561647650304.0, + "grad_norm": 0.0046095855092722115, + "language_loss": 0.79795396, + "learning_rate": 1.686282200575606e-05, + "loss": 0.80837303, + "num_input_tokens_seen": 395482384, + "router_z_loss_mlp": 0.38769531, + "step": 4780, + "time_per_iteration": 4.6737401485443115 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01041463, + "balance_loss_mlp": 1.00159907, + "epoch": 0.9197768372450943, + "flos": 475018735104.0, + "grad_norm": 0.03757173835384317, + "language_loss": 0.79206824, + "learning_rate": 1.678268904252317e-05, + "loss": 0.80248284, + "num_input_tokens_seen": 395550384, + "router_z_loss_mlp": 0.3984375, + "step": 4781, + "time_per_iteration": 2.5297844409942627 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01041634, + "balance_loss_mlp": 1.0018177, + "epoch": 0.9199692189303579, + "flos": 858597725184.0, + "grad_norm": 0.0351171984147798, + "language_loss": 0.84401453, + "learning_rate": 1.6702743680888088e-05, + "loss": 0.85443085, + "num_input_tokens_seen": 395632320, + "router_z_loss_mlp": 0.39794922, + "step": 4782, + "time_per_iteration": 3.2000656127929688 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010422, + "balance_loss_mlp": 1.00243175, + "epoch": 0.9201616006156214, + "flos": 505380383232.0, + "grad_norm": 0.035994154198359725, + "language_loss": 0.77748179, + "learning_rate": 1.6622985951888327e-05, + "loss": 0.78790379, + "num_input_tokens_seen": 395703856, + "router_z_loss_mlp": 0.39746094, + "step": 4783, + "time_per_iteration": 2.633715867996216 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042278, + "balance_loss_mlp": 1.00243866, + "epoch": 0.9203539823008849, + "flos": 549896347392.0, + "grad_norm": 0.03695579451272927, + "language_loss": 0.85761321, + "learning_rate": 1.6543415886488554e-05, + "loss": 0.86803603, + "num_input_tokens_seen": 395779456, + "router_z_loss_mlp": 0.39819336, + "step": 4784, + "time_per_iteration": 2.7589988708496094 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043999, + "balance_loss_mlp": 1.00415969, + "epoch": 0.9205463639861485, + "flos": 541073836800.0, + "grad_norm": 0.034257304816951106, + "language_loss": 0.82876825, + "learning_rate": 1.6464033515580624e-05, + "loss": 0.83920825, + "num_input_tokens_seen": 395849584, + "router_z_loss_mlp": 0.39819336, + "step": 4785, + "time_per_iteration": 2.6440327167510986 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044489, + "balance_loss_mlp": 1.00467277, + "epoch": 0.9207387456714121, + "flos": 801162945024.0, + "grad_norm": 0.037754818387900006, + "language_loss": 0.78699261, + "learning_rate": 1.6384838869983488e-05, + "loss": 0.79743749, + "num_input_tokens_seen": 395943712, + "router_z_loss_mlp": 0.39794922, + "step": 4786, + "time_per_iteration": 3.0379316806793213 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044318, + "balance_loss_mlp": 1.00454926, + "epoch": 0.9209311273566756, + "flos": 503817294336.0, + "grad_norm": 0.034755260351108076, + "language_loss": 0.79367381, + "learning_rate": 1.630583198044333e-05, + "loss": 0.80411696, + "num_input_tokens_seen": 396013168, + "router_z_loss_mlp": 0.39746094, + "step": 4787, + "time_per_iteration": 2.6695258617401123 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043489, + "balance_loss_mlp": 1.0037446, + "epoch": 0.9211235090419392, + "flos": 570384677376.0, + "grad_norm": 0.04450902774793768, + "language_loss": 0.8309685, + "learning_rate": 1.6227012877633173e-05, + "loss": 0.84140337, + "num_input_tokens_seen": 396082032, + "router_z_loss_mlp": 0.3972168, + "step": 4788, + "time_per_iteration": 2.716212034225464 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01041703, + "balance_loss_mlp": 1.00195909, + "epoch": 0.9213158907272028, + "flos": 807931472640.0, + "grad_norm": 0.03942301040285612, + "language_loss": 0.82974708, + "learning_rate": 1.6148381592153538e-05, + "loss": 0.84016412, + "num_input_tokens_seen": 396157984, + "router_z_loss_mlp": 0.3972168, + "step": 4789, + "time_per_iteration": 2.975245237350464 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01041759, + "balance_loss_mlp": 1.00194263, + "epoch": 0.9215082724124664, + "flos": 491651832576.0, + "grad_norm": 0.035709044039591936, + "language_loss": 0.76630366, + "learning_rate": 1.6069938154531618e-05, + "loss": 0.77672124, + "num_input_tokens_seen": 396223840, + "router_z_loss_mlp": 0.39794922, + "step": 4790, + "time_per_iteration": 2.5468502044677734 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01041103, + "balance_loss_mlp": 1.00228882, + "epoch": 0.9217006540977299, + "flos": 1517896700928.0, + "grad_norm": 0.004454200941114214, + "language_loss": 0.77070266, + "learning_rate": 1.599168259522188e-05, + "loss": 0.78111368, + "num_input_tokens_seen": 396458288, + "router_z_loss_mlp": 0.38769531, + "step": 4791, + "time_per_iteration": 4.974085092544556 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01041589, + "balance_loss_mlp": 1.00179708, + "epoch": 0.9218930357829934, + "flos": 745087119360.0, + "grad_norm": 0.03326717695831736, + "language_loss": 0.76704144, + "learning_rate": 1.5913614944605804e-05, + "loss": 0.77745736, + "num_input_tokens_seen": 396536208, + "router_z_loss_mlp": 0.39770508, + "step": 4792, + "time_per_iteration": 2.9275741577148438 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042243, + "balance_loss_mlp": 1.0024513, + "epoch": 0.922085417468257, + "flos": 453974382336.0, + "grad_norm": 0.038882563606044994, + "language_loss": 0.81037605, + "learning_rate": 1.5835735232992032e-05, + "loss": 0.82079852, + "num_input_tokens_seen": 396599984, + "router_z_loss_mlp": 0.39770508, + "step": 4793, + "time_per_iteration": 2.5030882358551025 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042554, + "balance_loss_mlp": 1.0027858, + "epoch": 0.9222777991535206, + "flos": 501238391040.0, + "grad_norm": 0.03875743218845831, + "language_loss": 0.853522, + "learning_rate": 1.575804349061616e-05, + "loss": 0.86394763, + "num_input_tokens_seen": 396664592, + "router_z_loss_mlp": 0.39746094, + "step": 4794, + "time_per_iteration": 2.606114625930786 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044749, + "balance_loss_mlp": 1.00490916, + "epoch": 0.9224701808387842, + "flos": 528984197376.0, + "grad_norm": 0.03858977789908891, + "language_loss": 0.79467082, + "learning_rate": 1.5680539747640722e-05, + "loss": 0.80511832, + "num_input_tokens_seen": 396729472, + "router_z_loss_mlp": 0.39819336, + "step": 4795, + "time_per_iteration": 2.5835952758789062 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044181, + "balance_loss_mlp": 1.00443649, + "epoch": 0.9226625625240477, + "flos": 876118324992.0, + "grad_norm": 0.033822189441903114, + "language_loss": 0.75610065, + "learning_rate": 1.5603224034155315e-05, + "loss": 0.76654249, + "num_input_tokens_seen": 396810384, + "router_z_loss_mlp": 0.3972168, + "step": 4796, + "time_per_iteration": 3.1282436847686768 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044248, + "balance_loss_mlp": 1.00440812, + "epoch": 0.9228549442093112, + "flos": 503760913920.0, + "grad_norm": 0.038137259870802354, + "language_loss": 0.88317525, + "learning_rate": 1.5526096380176657e-05, + "loss": 0.89361775, + "num_input_tokens_seen": 396875472, + "router_z_loss_mlp": 0.39819336, + "step": 4797, + "time_per_iteration": 2.562058210372925 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042279, + "balance_loss_mlp": 1.00248694, + "epoch": 0.9230473258945748, + "flos": 601126404096.0, + "grad_norm": 0.032937276215074016, + "language_loss": 0.85114181, + "learning_rate": 1.544915681564829e-05, + "loss": 0.86156458, + "num_input_tokens_seen": 396949888, + "router_z_loss_mlp": 0.39770508, + "step": 4798, + "time_per_iteration": 2.779236078262329 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104159, + "balance_loss_mlp": 1.00175047, + "epoch": 0.9232397075798384, + "flos": 823876344576.0, + "grad_norm": 0.0419903361052834, + "language_loss": 0.7935499, + "learning_rate": 1.5372405370440822e-05, + "loss": 0.80396575, + "num_input_tokens_seen": 397027504, + "router_z_loss_mlp": 0.39819336, + "step": 4799, + "time_per_iteration": 3.08843994140625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042074, + "balance_loss_mlp": 1.00230587, + "epoch": 0.923432089265102, + "flos": 708275784192.0, + "grad_norm": 0.036067131408147955, + "language_loss": 0.84829307, + "learning_rate": 1.5295842074351805e-05, + "loss": 0.85871375, + "num_input_tokens_seen": 397101600, + "router_z_loss_mlp": 0.39746094, + "step": 4800, + "time_per_iteration": 2.8723208904266357 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01041866, + "balance_loss_mlp": 1.00212193, + "epoch": 0.9236244709503655, + "flos": 703091732736.0, + "grad_norm": 0.03994271275686309, + "language_loss": 0.77297044, + "learning_rate": 1.5219466957105798e-05, + "loss": 0.78338909, + "num_input_tokens_seen": 397170880, + "router_z_loss_mlp": 0.3972168, + "step": 4801, + "time_per_iteration": 2.8293721675872803 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01041551, + "balance_loss_mlp": 1.00178313, + "epoch": 0.9238168526356291, + "flos": 516082877952.0, + "grad_norm": 0.03341169039005822, + "language_loss": 0.84229976, + "learning_rate": 1.5143280048354136e-05, + "loss": 0.85271525, + "num_input_tokens_seen": 397242272, + "router_z_loss_mlp": 0.39746094, + "step": 4802, + "time_per_iteration": 2.647810935974121 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01041765, + "balance_loss_mlp": 1.00194943, + "epoch": 0.9240092343208927, + "flos": 492965100288.0, + "grad_norm": 0.04174387467477932, + "language_loss": 0.817182, + "learning_rate": 1.5067281377675213e-05, + "loss": 0.82759964, + "num_input_tokens_seen": 397308032, + "router_z_loss_mlp": 0.39794922, + "step": 4803, + "time_per_iteration": 2.5767948627471924 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01041765, + "balance_loss_mlp": 1.00197279, + "epoch": 0.9242016160061562, + "flos": 648436099584.0, + "grad_norm": 0.03908108029217818, + "language_loss": 0.74151349, + "learning_rate": 1.4991470974574484e-05, + "loss": 0.75193107, + "num_input_tokens_seen": 397390944, + "router_z_loss_mlp": 0.39770508, + "step": 4804, + "time_per_iteration": 2.885768413543701 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043548, + "balance_loss_mlp": 1.00375605, + "epoch": 0.9243939976914197, + "flos": 730779213312.0, + "grad_norm": 0.03876536348903563, + "language_loss": 0.79842925, + "learning_rate": 1.4915848868484016e-05, + "loss": 0.80886477, + "num_input_tokens_seen": 397468128, + "router_z_loss_mlp": 0.39770508, + "step": 4805, + "time_per_iteration": 2.995547294616699 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044249, + "balance_loss_mlp": 1.00448072, + "epoch": 0.9245863793766833, + "flos": 453210334464.0, + "grad_norm": 0.03391430386616441, + "language_loss": 0.90989828, + "learning_rate": 1.4840415088763048e-05, + "loss": 0.92034078, + "num_input_tokens_seen": 397538976, + "router_z_loss_mlp": 0.39746094, + "step": 4806, + "time_per_iteration": 2.590815305709839 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044582, + "balance_loss_mlp": 1.00471807, + "epoch": 0.9247787610619469, + "flos": 756367024128.0, + "grad_norm": 0.0377014259439888, + "language_loss": 0.77555621, + "learning_rate": 1.476516966469732e-05, + "loss": 0.78600192, + "num_input_tokens_seen": 397612944, + "router_z_loss_mlp": 0.3984375, + "step": 4807, + "time_per_iteration": 2.9227209091186523 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042204, + "balance_loss_mlp": 1.002388, + "epoch": 0.9249711427472105, + "flos": 563084426496.0, + "grad_norm": 0.032247453393700005, + "language_loss": 0.85188043, + "learning_rate": 1.4690112625499908e-05, + "loss": 0.86230248, + "num_input_tokens_seen": 397690848, + "router_z_loss_mlp": 0.39794922, + "step": 4808, + "time_per_iteration": 2.7663607597351074 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01041805, + "balance_loss_mlp": 1.00201321, + "epoch": 0.9251635244324741, + "flos": 527781745152.0, + "grad_norm": 0.03881132010618585, + "language_loss": 0.85695136, + "learning_rate": 1.4615244000310501e-05, + "loss": 0.86736941, + "num_input_tokens_seen": 397761008, + "router_z_loss_mlp": 0.39770508, + "step": 4809, + "time_per_iteration": 2.697505235671997 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01041771, + "balance_loss_mlp": 1.00197923, + "epoch": 0.9253559061177375, + "flos": 612480185856.0, + "grad_norm": 0.04670072109786849, + "language_loss": 0.79465836, + "learning_rate": 1.4540563818195685e-05, + "loss": 0.80507606, + "num_input_tokens_seen": 397840640, + "router_z_loss_mlp": 0.39770508, + "step": 4810, + "time_per_iteration": 2.8427956104278564 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104182, + "balance_loss_mlp": 1.00300598, + "epoch": 0.9255482878030011, + "flos": 1554464050944.0, + "grad_norm": 0.004376211492526392, + "language_loss": 0.76925391, + "learning_rate": 1.446607210814882e-05, + "loss": 0.77967215, + "num_input_tokens_seen": 398060096, + "router_z_loss_mlp": 0.38769531, + "step": 4811, + "time_per_iteration": 4.711735486984253 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104182, + "balance_loss_mlp": 1.00202775, + "epoch": 0.9257406694882647, + "flos": 767803431168.0, + "grad_norm": 0.03861878751273161, + "language_loss": 0.81642079, + "learning_rate": 1.4391768899090219e-05, + "loss": 0.82683897, + "num_input_tokens_seen": 398143680, + "router_z_loss_mlp": 0.39770508, + "step": 4812, + "time_per_iteration": 3.057129383087158 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042174, + "balance_loss_mlp": 1.00242972, + "epoch": 0.9259330511735283, + "flos": 498967634688.0, + "grad_norm": 0.038176642495154074, + "language_loss": 0.83979654, + "learning_rate": 1.431765421986686e-05, + "loss": 0.85021836, + "num_input_tokens_seen": 398207056, + "router_z_loss_mlp": 0.3972168, + "step": 4813, + "time_per_iteration": 2.546762704849243 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01041888, + "balance_loss_mlp": 1.00211966, + "epoch": 0.9261254328587919, + "flos": 628016788992.0, + "grad_norm": 0.12158463805701603, + "language_loss": 0.79614502, + "learning_rate": 1.424372809925273e-05, + "loss": 0.80656391, + "num_input_tokens_seen": 398277472, + "router_z_loss_mlp": 0.39746094, + "step": 4814, + "time_per_iteration": 2.792081594467163 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043933, + "balance_loss_mlp": 1.00411725, + "epoch": 0.9263178145440554, + "flos": 598493065728.0, + "grad_norm": 0.0375165100308534, + "language_loss": 0.86120522, + "learning_rate": 1.416999056594831e-05, + "loss": 0.8716445, + "num_input_tokens_seen": 398346544, + "router_z_loss_mlp": 0.39794922, + "step": 4815, + "time_per_iteration": 2.6949462890625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043148, + "balance_loss_mlp": 1.00328422, + "epoch": 0.926510196229319, + "flos": 389417240832.0, + "grad_norm": 0.035459065578210734, + "language_loss": 0.84041262, + "learning_rate": 1.4096441648581259e-05, + "loss": 0.85084414, + "num_input_tokens_seen": 398409344, + "router_z_loss_mlp": 0.3984375, + "step": 4816, + "time_per_iteration": 2.4716691970825195 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042309, + "balance_loss_mlp": 1.00254118, + "epoch": 0.9267025779145825, + "flos": 546863488512.0, + "grad_norm": 0.038061097062299015, + "language_loss": 0.84638411, + "learning_rate": 1.4023081375705737e-05, + "loss": 0.85680723, + "num_input_tokens_seen": 398478816, + "router_z_loss_mlp": 0.39746094, + "step": 4817, + "time_per_iteration": 2.611921787261963 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01041865, + "balance_loss_mlp": 1.00207257, + "epoch": 0.9268949595998461, + "flos": 500791238400.0, + "grad_norm": 0.035400343706182905, + "language_loss": 0.82393169, + "learning_rate": 1.3949909775802682e-05, + "loss": 0.83435035, + "num_input_tokens_seen": 398550384, + "router_z_loss_mlp": 0.39770508, + "step": 4818, + "time_per_iteration": 2.6435391902923584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01041883, + "balance_loss_mlp": 1.00209129, + "epoch": 0.9270873412851096, + "flos": 433739764224.0, + "grad_norm": 0.03314348914664278, + "language_loss": 0.82907271, + "learning_rate": 1.3876926877279817e-05, + "loss": 0.83949155, + "num_input_tokens_seen": 398620832, + "router_z_loss_mlp": 0.39770508, + "step": 4819, + "time_per_iteration": 2.6455531120300293 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043568, + "balance_loss_mlp": 1.00380015, + "epoch": 0.9272797229703732, + "flos": 467803054848.0, + "grad_norm": 0.03754391760651958, + "language_loss": 0.86858791, + "learning_rate": 1.380413270847164e-05, + "loss": 0.87902355, + "num_input_tokens_seen": 398689776, + "router_z_loss_mlp": 0.39746094, + "step": 4820, + "time_per_iteration": 2.6332101821899414 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043167, + "balance_loss_mlp": 1.00337493, + "epoch": 0.9274721046556368, + "flos": 706250958336.0, + "grad_norm": 0.03606074012081537, + "language_loss": 0.79443467, + "learning_rate": 1.373152729763938e-05, + "loss": 0.80486631, + "num_input_tokens_seen": 398775072, + "router_z_loss_mlp": 0.39770508, + "step": 4821, + "time_per_iteration": 3.026251792907715 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043518, + "balance_loss_mlp": 1.00460815, + "epoch": 0.9276644863409004, + "flos": 1405345529088.0, + "grad_norm": 0.00577391953495314, + "language_loss": 0.82380462, + "learning_rate": 1.3659110672970931e-05, + "loss": 0.83423984, + "num_input_tokens_seen": 399002016, + "router_z_loss_mlp": 0.38867188, + "step": 4822, + "time_per_iteration": 4.8578009605407715 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01041785, + "balance_loss_mlp": 1.00189805, + "epoch": 0.927856868026164, + "flos": 743138115840.0, + "grad_norm": 0.034922429944732755, + "language_loss": 0.80315673, + "learning_rate": 1.3586882862580917e-05, + "loss": 0.81357461, + "num_input_tokens_seen": 399085808, + "router_z_loss_mlp": 0.39868164, + "step": 4823, + "time_per_iteration": 3.027656316757202 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01041798, + "balance_loss_mlp": 1.00193429, + "epoch": 0.9280492497114274, + "flos": 413123122176.0, + "grad_norm": 0.038779336552669824, + "language_loss": 0.743617, + "learning_rate": 1.3514843894510686e-05, + "loss": 0.754035, + "num_input_tokens_seen": 399146768, + "router_z_loss_mlp": 0.3984375, + "step": 4824, + "time_per_iteration": 2.49405837059021 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01041752, + "balance_loss_mlp": 1.00195956, + "epoch": 0.928241631396691, + "flos": 647665248768.0, + "grad_norm": 0.03926006290923404, + "language_loss": 0.84507549, + "learning_rate": 1.3442993796728254e-05, + "loss": 0.85549301, + "num_input_tokens_seen": 399220192, + "router_z_loss_mlp": 0.39770508, + "step": 4825, + "time_per_iteration": 2.8209800720214844 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01041811, + "balance_loss_mlp": 1.00201905, + "epoch": 0.9284340130819546, + "flos": 698129312256.0, + "grad_norm": 0.035194877667594326, + "language_loss": 0.81097031, + "learning_rate": 1.3371332597128249e-05, + "loss": 0.82138836, + "num_input_tokens_seen": 399300064, + "router_z_loss_mlp": 0.39770508, + "step": 4826, + "time_per_iteration": 2.9241714477539062 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01041901, + "balance_loss_mlp": 1.00213277, + "epoch": 0.9286263947672182, + "flos": 760544009472.0, + "grad_norm": 0.03422053226269929, + "language_loss": 0.83855939, + "learning_rate": 1.3299860323532032e-05, + "loss": 0.8489784, + "num_input_tokens_seen": 399383200, + "router_z_loss_mlp": 0.39746094, + "step": 4827, + "time_per_iteration": 3.002506732940674 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044201, + "balance_loss_mlp": 1.00445616, + "epoch": 0.9288187764524817, + "flos": 674141528832.0, + "grad_norm": 0.034644572167621394, + "language_loss": 0.80734801, + "learning_rate": 1.3228577003687681e-05, + "loss": 0.81779003, + "num_input_tokens_seen": 399466400, + "router_z_loss_mlp": 0.3972168, + "step": 4828, + "time_per_iteration": 2.9348583221435547 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044093, + "balance_loss_mlp": 1.00432444, + "epoch": 0.9290111581377453, + "flos": 501470715648.0, + "grad_norm": 0.034302029930516, + "language_loss": 0.84273684, + "learning_rate": 1.3157482665269727e-05, + "loss": 0.85317779, + "num_input_tokens_seen": 399533504, + "router_z_loss_mlp": 0.39746094, + "step": 4829, + "time_per_iteration": 2.578808069229126 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01041584, + "balance_loss_mlp": 1.00276947, + "epoch": 0.9292035398230089, + "flos": 1567060135680.0, + "grad_norm": 0.004229410701512915, + "language_loss": 0.72122061, + "learning_rate": 1.3086577335879424e-05, + "loss": 0.73163652, + "num_input_tokens_seen": 399769872, + "router_z_loss_mlp": 0.38769531, + "step": 4830, + "time_per_iteration": 4.9977943897247314 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01041542, + "balance_loss_mlp": 1.00263214, + "epoch": 0.9293959215082724, + "flos": 1522066883328.0, + "grad_norm": 0.0031584024576732017, + "language_loss": 0.79511833, + "learning_rate": 1.3015861043044753e-05, + "loss": 0.80553377, + "num_input_tokens_seen": 399997760, + "router_z_loss_mlp": 0.38867188, + "step": 4831, + "time_per_iteration": 4.880908966064453 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01041782, + "balance_loss_mlp": 1.0020138, + "epoch": 0.929588303193536, + "flos": 558898692864.0, + "grad_norm": 0.04711683463428304, + "language_loss": 0.84607166, + "learning_rate": 1.2945333814220195e-05, + "loss": 0.85648948, + "num_input_tokens_seen": 400063872, + "router_z_loss_mlp": 0.39746094, + "step": 4832, + "time_per_iteration": 2.6638543605804443 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104215, + "balance_loss_mlp": 1.00223875, + "epoch": 0.9297806848787995, + "flos": 479551499520.0, + "grad_norm": 0.04844473198255446, + "language_loss": 0.80835903, + "learning_rate": 1.2874995676786905e-05, + "loss": 0.81878054, + "num_input_tokens_seen": 400126064, + "router_z_loss_mlp": 0.39892578, + "step": 4833, + "time_per_iteration": 2.5647528171539307 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042684, + "balance_loss_mlp": 1.00291598, + "epoch": 0.9299730665640631, + "flos": 565654581504.0, + "grad_norm": 0.04014985005184455, + "language_loss": 0.79991281, + "learning_rate": 1.2804846658052372e-05, + "loss": 0.81033969, + "num_input_tokens_seen": 400201776, + "router_z_loss_mlp": 0.39746094, + "step": 4834, + "time_per_iteration": 2.8138298988342285 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042801, + "balance_loss_mlp": 1.0030086, + "epoch": 0.9301654482493267, + "flos": 561343448064.0, + "grad_norm": 0.0350160959737894, + "language_loss": 0.82770115, + "learning_rate": 1.2734886785251032e-05, + "loss": 0.83812916, + "num_input_tokens_seen": 400279504, + "router_z_loss_mlp": 0.39770508, + "step": 4835, + "time_per_iteration": 2.7824862003326416 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104327, + "balance_loss_mlp": 1.00445557, + "epoch": 0.9303578299345903, + "flos": 1523490966528.0, + "grad_norm": 0.0038776105750953234, + "language_loss": 0.76852441, + "learning_rate": 1.2665116085543715e-05, + "loss": 0.77895713, + "num_input_tokens_seen": 400514800, + "router_z_loss_mlp": 0.38769531, + "step": 4836, + "time_per_iteration": 4.964673757553101 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042068, + "balance_loss_mlp": 1.00239527, + "epoch": 0.9305502116198537, + "flos": 531860553984.0, + "grad_norm": 0.04009771865734981, + "language_loss": 0.83090073, + "learning_rate": 1.2595534586017698e-05, + "loss": 0.84132147, + "num_input_tokens_seen": 400582640, + "router_z_loss_mlp": 0.39648438, + "step": 4837, + "time_per_iteration": 2.6505465507507324 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104178, + "balance_loss_mlp": 1.00201166, + "epoch": 0.9307425933051173, + "flos": 475856659968.0, + "grad_norm": 0.04414875410178277, + "language_loss": 0.82107651, + "learning_rate": 1.2526142313686983e-05, + "loss": 0.83149433, + "num_input_tokens_seen": 400646912, + "router_z_loss_mlp": 0.39746094, + "step": 4838, + "time_per_iteration": 2.5214364528656006 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01041893, + "balance_loss_mlp": 1.00214899, + "epoch": 0.9309349749903809, + "flos": 586065143808.0, + "grad_norm": 0.03586401591983268, + "language_loss": 0.87220377, + "learning_rate": 1.245693929549213e-05, + "loss": 0.88262272, + "num_input_tokens_seen": 400722128, + "router_z_loss_mlp": 0.3972168, + "step": 4839, + "time_per_iteration": 2.7310404777526855 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01041852, + "balance_loss_mlp": 1.0021075, + "epoch": 0.9311273566756445, + "flos": 863143128576.0, + "grad_norm": 0.04744165409603807, + "language_loss": 0.77044845, + "learning_rate": 1.2387925558299984e-05, + "loss": 0.78086698, + "num_input_tokens_seen": 400801440, + "router_z_loss_mlp": 0.3972168, + "step": 4840, + "time_per_iteration": 3.0941414833068848 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042216, + "balance_loss_mlp": 1.00242364, + "epoch": 0.9313197383609081, + "flos": 549162435072.0, + "grad_norm": 0.035993751818789214, + "language_loss": 0.82954288, + "learning_rate": 1.231910112890411e-05, + "loss": 0.83996511, + "num_input_tokens_seen": 400873008, + "router_z_loss_mlp": 0.39770508, + "step": 4841, + "time_per_iteration": 2.6443285942077637 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01041987, + "balance_loss_mlp": 1.00221896, + "epoch": 0.9315121200461716, + "flos": 469704426240.0, + "grad_norm": 0.05808949887370269, + "language_loss": 0.81514281, + "learning_rate": 1.2250466034024522e-05, + "loss": 0.82556272, + "num_input_tokens_seen": 400935328, + "router_z_loss_mlp": 0.39746094, + "step": 4842, + "time_per_iteration": 2.527540922164917 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01041988, + "balance_loss_mlp": 1.00222027, + "epoch": 0.9317045017314352, + "flos": 418558940160.0, + "grad_norm": 0.037955527047526506, + "language_loss": 0.7833854, + "learning_rate": 1.2182020300307684e-05, + "loss": 0.79380524, + "num_input_tokens_seen": 401000720, + "router_z_loss_mlp": 0.39746094, + "step": 4843, + "time_per_iteration": 2.509636640548706 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042112, + "balance_loss_mlp": 1.0023675, + "epoch": 0.9318968834166987, + "flos": 541621111296.0, + "grad_norm": 0.03533166990442565, + "language_loss": 0.77478361, + "learning_rate": 1.2113763954326729e-05, + "loss": 0.78520471, + "num_input_tokens_seen": 401079664, + "router_z_loss_mlp": 0.3972168, + "step": 4844, + "time_per_iteration": 2.7669665813446045 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042379, + "balance_loss_mlp": 1.00256312, + "epoch": 0.9320892651019623, + "flos": 522347872512.0, + "grad_norm": 0.04071468473445459, + "language_loss": 0.81091493, + "learning_rate": 1.2045697022581015e-05, + "loss": 0.82133877, + "num_input_tokens_seen": 401146160, + "router_z_loss_mlp": 0.39794922, + "step": 4845, + "time_per_iteration": 2.5971779823303223 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01041765, + "balance_loss_mlp": 1.0020926, + "epoch": 0.9322816467872258, + "flos": 583253915904.0, + "grad_norm": 0.03228832372989255, + "language_loss": 0.80980742, + "learning_rate": 1.1977819531496348e-05, + "loss": 0.82022512, + "num_input_tokens_seen": 401223264, + "router_z_loss_mlp": 0.39648438, + "step": 4846, + "time_per_iteration": 2.7529783248901367 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042078, + "balance_loss_mlp": 1.00233305, + "epoch": 0.9324740284724894, + "flos": 485803855104.0, + "grad_norm": 0.045024241247814824, + "language_loss": 0.82244754, + "learning_rate": 1.191013150742537e-05, + "loss": 0.83286834, + "num_input_tokens_seen": 401296368, + "router_z_loss_mlp": 0.3972168, + "step": 4847, + "time_per_iteration": 2.714202880859375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010423, + "balance_loss_mlp": 1.00253224, + "epoch": 0.932666410157753, + "flos": 734024954880.0, + "grad_norm": 0.0379301042606838, + "language_loss": 0.83348429, + "learning_rate": 1.1842632976646672e-05, + "loss": 0.8439073, + "num_input_tokens_seen": 401383936, + "router_z_loss_mlp": 0.39746094, + "step": 4848, + "time_per_iteration": 3.028153419494629 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042085, + "balance_loss_mlp": 1.00234032, + "epoch": 0.9328587918430166, + "flos": 967181882112.0, + "grad_norm": 0.038825479032873554, + "language_loss": 0.79343307, + "learning_rate": 1.1775323965365681e-05, + "loss": 0.80385393, + "num_input_tokens_seen": 401468784, + "router_z_loss_mlp": 0.3972168, + "step": 4849, + "time_per_iteration": 3.2317538261413574 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01041875, + "balance_loss_mlp": 1.00210679, + "epoch": 0.9330511735282802, + "flos": 615684131328.0, + "grad_norm": 0.04256976645826645, + "language_loss": 0.80641079, + "learning_rate": 1.1708204499713936e-05, + "loss": 0.81682956, + "num_input_tokens_seen": 401539712, + "router_z_loss_mlp": 0.39746094, + "step": 4850, + "time_per_iteration": 2.6996281147003174 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01041781, + "balance_loss_mlp": 1.00201249, + "epoch": 0.9332435552135436, + "flos": 560218763520.0, + "grad_norm": 0.03532928511603242, + "language_loss": 0.86379266, + "learning_rate": 1.1641274605749653e-05, + "loss": 0.87421048, + "num_input_tokens_seen": 401610432, + "router_z_loss_mlp": 0.39746094, + "step": 4851, + "time_per_iteration": 2.7163102626800537 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044057, + "balance_loss_mlp": 1.00426483, + "epoch": 0.9334359368988072, + "flos": 516558220800.0, + "grad_norm": 0.04157190776063736, + "language_loss": 0.82472408, + "learning_rate": 1.1574534309457208e-05, + "loss": 0.83516461, + "num_input_tokens_seen": 401677344, + "router_z_loss_mlp": 0.39770508, + "step": 4852, + "time_per_iteration": 2.5901753902435303 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104426, + "balance_loss_mlp": 1.0044682, + "epoch": 0.9336283185840708, + "flos": 540941634048.0, + "grad_norm": 0.030952473235186646, + "language_loss": 0.83399379, + "learning_rate": 1.1507983636747488e-05, + "loss": 0.84443641, + "num_input_tokens_seen": 401756864, + "router_z_loss_mlp": 0.39770508, + "step": 4853, + "time_per_iteration": 2.754560947418213 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045902, + "balance_loss_mlp": 1.00708771, + "epoch": 0.9338207002693344, + "flos": 1566124034304.0, + "grad_norm": 0.007144453314513685, + "language_loss": 0.78455019, + "learning_rate": 1.1441622613457824e-05, + "loss": 0.7950092, + "num_input_tokens_seen": 401983664, + "router_z_loss_mlp": 0.38769531, + "step": 4854, + "time_per_iteration": 4.874703407287598 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01041622, + "balance_loss_mlp": 1.0018059, + "epoch": 0.9340130819545979, + "flos": 646508483328.0, + "grad_norm": 0.052007470771972146, + "language_loss": 0.81776142, + "learning_rate": 1.1375451265351833e-05, + "loss": 0.82817757, + "num_input_tokens_seen": 402065744, + "router_z_loss_mlp": 0.39794922, + "step": 4855, + "time_per_iteration": 2.941030263900757 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104171, + "balance_loss_mlp": 1.00182235, + "epoch": 0.9342054636398615, + "flos": 504512322816.0, + "grad_norm": 0.03777189969793054, + "language_loss": 0.77505141, + "learning_rate": 1.1309469618119516e-05, + "loss": 0.78546846, + "num_input_tokens_seen": 402137728, + "router_z_loss_mlp": 0.39868164, + "step": 4856, + "time_per_iteration": 2.7442283630371094 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01041841, + "balance_loss_mlp": 1.00202465, + "epoch": 0.934397845325125, + "flos": 594236367360.0, + "grad_norm": 0.030117925829334365, + "language_loss": 0.84709656, + "learning_rate": 1.1243677697377109e-05, + "loss": 0.85751504, + "num_input_tokens_seen": 402220160, + "router_z_loss_mlp": 0.39794922, + "step": 4857, + "time_per_iteration": 2.87604022026062 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01041577, + "balance_loss_mlp": 1.00176144, + "epoch": 0.9345902270103886, + "flos": 500884557312.0, + "grad_norm": 0.03501485016862442, + "language_loss": 0.80603421, + "learning_rate": 1.1178075528667453e-05, + "loss": 0.81644994, + "num_input_tokens_seen": 402285168, + "router_z_loss_mlp": 0.39794922, + "step": 4858, + "time_per_iteration": 2.5634093284606934 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01040668, + "balance_loss_mlp": 1.00185394, + "epoch": 0.9347826086956522, + "flos": 1523407374336.0, + "grad_norm": 0.004505904699897048, + "language_loss": 0.7598772, + "learning_rate": 1.1112663137459566e-05, + "loss": 0.77028388, + "num_input_tokens_seen": 402504912, + "router_z_loss_mlp": 0.38769531, + "step": 4859, + "time_per_iteration": 4.670670509338379 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01041982, + "balance_loss_mlp": 1.0022136, + "epoch": 0.9349749903809157, + "flos": 505665197568.0, + "grad_norm": 0.03170584508353696, + "language_loss": 0.81696236, + "learning_rate": 1.1047440549148636e-05, + "loss": 0.82738221, + "num_input_tokens_seen": 402582032, + "router_z_loss_mlp": 0.39746094, + "step": 4860, + "time_per_iteration": 2.8271596431732178 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01041903, + "balance_loss_mlp": 1.00215864, + "epoch": 0.9351673720661793, + "flos": 569965714944.0, + "grad_norm": 0.04706495104714966, + "language_loss": 0.79096204, + "learning_rate": 1.0982407789056514e-05, + "loss": 0.80138111, + "num_input_tokens_seen": 402650144, + "router_z_loss_mlp": 0.3972168, + "step": 4861, + "time_per_iteration": 2.639396905899048 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01041923, + "balance_loss_mlp": 1.00215459, + "epoch": 0.9353597537514429, + "flos": 545662981632.0, + "grad_norm": 0.03639072385040861, + "language_loss": 0.86677241, + "learning_rate": 1.0917564882430952e-05, + "loss": 0.87719166, + "num_input_tokens_seen": 402720368, + "router_z_loss_mlp": 0.39746094, + "step": 4862, + "time_per_iteration": 2.611056089401245 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01041985, + "balance_loss_mlp": 1.0022645, + "epoch": 0.9355521354367065, + "flos": 520020735744.0, + "grad_norm": 0.033869901697529216, + "language_loss": 0.85065103, + "learning_rate": 1.0852911854446368e-05, + "loss": 0.86107087, + "num_input_tokens_seen": 402795568, + "router_z_loss_mlp": 0.39697266, + "step": 4863, + "time_per_iteration": 2.727292537689209 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044723, + "balance_loss_mlp": 1.00495505, + "epoch": 0.93574451712197, + "flos": 447235990272.0, + "grad_norm": 0.04078440918447503, + "language_loss": 0.79057157, + "learning_rate": 1.0788448730203237e-05, + "loss": 0.80101883, + "num_input_tokens_seen": 402858784, + "router_z_loss_mlp": 0.39746094, + "step": 4864, + "time_per_iteration": 2.485029458999634 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044092, + "balance_loss_mlp": 1.0043242, + "epoch": 0.9359368988072335, + "flos": 481496612352.0, + "grad_norm": 0.04601803612471293, + "language_loss": 0.77498388, + "learning_rate": 1.072417553472832e-05, + "loss": 0.78542477, + "num_input_tokens_seen": 402924144, + "router_z_loss_mlp": 0.39746094, + "step": 4865, + "time_per_iteration": 2.5250749588012695 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042073, + "balance_loss_mlp": 1.00230527, + "epoch": 0.9361292804924971, + "flos": 498092771328.0, + "grad_norm": 0.03842244559328987, + "language_loss": 0.85445625, + "learning_rate": 1.0660092292974766e-05, + "loss": 0.86487693, + "num_input_tokens_seen": 402987488, + "router_z_loss_mlp": 0.39746094, + "step": 4866, + "time_per_iteration": 2.5852344036102295 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104214, + "balance_loss_mlp": 1.0023483, + "epoch": 0.9363216621777607, + "flos": 619294400256.0, + "grad_norm": 0.036496484155024655, + "language_loss": 0.84628427, + "learning_rate": 1.059619902982184e-05, + "loss": 0.85670567, + "num_input_tokens_seen": 403058224, + "router_z_loss_mlp": 0.39770508, + "step": 4867, + "time_per_iteration": 2.7684035301208496 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042965, + "balance_loss_mlp": 1.00415039, + "epoch": 0.9365140438630243, + "flos": 1418982706176.0, + "grad_norm": 0.003430568488681332, + "language_loss": 0.79203337, + "learning_rate": 1.053249577007509e-05, + "loss": 0.80246305, + "num_input_tokens_seen": 403289072, + "router_z_loss_mlp": 0.38769531, + "step": 4868, + "time_per_iteration": 4.8999762535095215 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01041811, + "balance_loss_mlp": 1.00201857, + "epoch": 0.9367064255482878, + "flos": 591650661120.0, + "grad_norm": 0.03595906257463508, + "language_loss": 0.8194294, + "learning_rate": 1.0468982538466287e-05, + "loss": 0.82984746, + "num_input_tokens_seen": 403361728, + "router_z_loss_mlp": 0.39770508, + "step": 4869, + "time_per_iteration": 2.717907667160034 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042024, + "balance_loss_mlp": 1.00230348, + "epoch": 0.9368988072335513, + "flos": 527653433088.0, + "grad_norm": 0.03657382574202116, + "language_loss": 0.82250357, + "learning_rate": 1.0405659359653597e-05, + "loss": 0.83292383, + "num_input_tokens_seen": 403431536, + "router_z_loss_mlp": 0.39697266, + "step": 4870, + "time_per_iteration": 2.7170798778533936 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104236, + "balance_loss_mlp": 1.00259185, + "epoch": 0.9370911889188149, + "flos": 744509709312.0, + "grad_norm": 0.03512967184309728, + "language_loss": 0.79642439, + "learning_rate": 1.034252625822113e-05, + "loss": 0.80684793, + "num_input_tokens_seen": 403504768, + "router_z_loss_mlp": 0.39746094, + "step": 4871, + "time_per_iteration": 2.9354794025421143 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044725, + "balance_loss_mlp": 1.00498092, + "epoch": 0.9372835706040785, + "flos": 547078316544.0, + "grad_norm": 0.03351822012492755, + "language_loss": 0.79116702, + "learning_rate": 1.0279583258679448e-05, + "loss": 0.80161428, + "num_input_tokens_seen": 403575584, + "router_z_loss_mlp": 0.3972168, + "step": 4872, + "time_per_iteration": 2.702842950820923 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042221, + "balance_loss_mlp": 1.00245237, + "epoch": 0.9374759522893421, + "flos": 492700694784.0, + "grad_norm": 0.04331682149592437, + "language_loss": 0.8190614, + "learning_rate": 1.0216830385465003e-05, + "loss": 0.82948351, + "num_input_tokens_seen": 403648720, + "router_z_loss_mlp": 0.39746094, + "step": 4873, + "time_per_iteration": 2.685476541519165 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104224, + "balance_loss_mlp": 1.0024718, + "epoch": 0.9376683339746056, + "flos": 579532831488.0, + "grad_norm": 0.04000808735517868, + "language_loss": 0.82723129, + "learning_rate": 1.0154267662940809e-05, + "loss": 0.83765376, + "num_input_tokens_seen": 403721392, + "router_z_loss_mlp": 0.39746094, + "step": 4874, + "time_per_iteration": 2.670814275741577 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01041882, + "balance_loss_mlp": 1.0021373, + "epoch": 0.9378607156598692, + "flos": 507297305856.0, + "grad_norm": 0.04201396587358986, + "language_loss": 0.80951202, + "learning_rate": 1.0091895115395766e-05, + "loss": 0.81993079, + "num_input_tokens_seen": 403792112, + "router_z_loss_mlp": 0.3972168, + "step": 4875, + "time_per_iteration": 2.5980563163757324 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042, + "balance_loss_mlp": 1.0022316, + "epoch": 0.9380530973451328, + "flos": 521071543296.0, + "grad_norm": 0.03892483876210915, + "language_loss": 0.78012693, + "learning_rate": 1.0029712767045062e-05, + "loss": 0.79054689, + "num_input_tokens_seen": 403860928, + "router_z_loss_mlp": 0.39746094, + "step": 4876, + "time_per_iteration": 2.693706750869751 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01041893, + "balance_loss_mlp": 1.00212526, + "epoch": 0.9382454790303963, + "flos": 558870502656.0, + "grad_norm": 0.03481280964401, + "language_loss": 0.85469687, + "learning_rate": 9.967720642029999e-06, + "loss": 0.86511576, + "num_input_tokens_seen": 403928240, + "router_z_loss_mlp": 0.39746094, + "step": 4877, + "time_per_iteration": 2.716329336166382 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01041871, + "balance_loss_mlp": 1.00217426, + "epoch": 0.9384378607156598, + "flos": 696787854336.0, + "grad_norm": 0.03644452111006662, + "language_loss": 0.82310647, + "learning_rate": 9.905918764418153e-06, + "loss": 0.83352518, + "num_input_tokens_seen": 404004320, + "router_z_loss_mlp": 0.39672852, + "step": 4878, + "time_per_iteration": 2.9522945880889893 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01041828, + "balance_loss_mlp": 1.0020355, + "epoch": 0.9386302424009234, + "flos": 555835698432.0, + "grad_norm": 0.038143529458428554, + "language_loss": 0.81298959, + "learning_rate": 9.844307158203058e-06, + "loss": 0.82340783, + "num_input_tokens_seen": 404077040, + "router_z_loss_mlp": 0.39770508, + "step": 4879, + "time_per_iteration": 2.6760354042053223 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01041733, + "balance_loss_mlp": 1.00196457, + "epoch": 0.938822624086187, + "flos": 568066288896.0, + "grad_norm": 0.042500450188158845, + "language_loss": 0.80407965, + "learning_rate": 9.782885847304469e-06, + "loss": 0.81449699, + "num_input_tokens_seen": 404145248, + "router_z_loss_mlp": 0.39746094, + "step": 4880, + "time_per_iteration": 2.660008430480957 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01041615, + "balance_loss_mlp": 1.00182331, + "epoch": 0.9390150057714506, + "flos": 418548246528.0, + "grad_norm": 0.042624887174036764, + "language_loss": 0.80474532, + "learning_rate": 9.721654855568196e-06, + "loss": 0.81516147, + "num_input_tokens_seen": 404212000, + "router_z_loss_mlp": 0.39770508, + "step": 4881, + "time_per_iteration": 2.586818218231201 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01041709, + "balance_loss_mlp": 1.00186956, + "epoch": 0.9392073874567142, + "flos": 1556084510208.0, + "grad_norm": 0.037573723473785056, + "language_loss": 0.76786673, + "learning_rate": 9.660614206766394e-06, + "loss": 0.77828383, + "num_input_tokens_seen": 404305408, + "router_z_loss_mlp": 0.39819336, + "step": 4882, + "time_per_iteration": 3.7182722091674805 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043531, + "balance_loss_mlp": 1.00369167, + "epoch": 0.9393997691419776, + "flos": 653732911872.0, + "grad_norm": 0.03712425994747192, + "language_loss": 0.78558093, + "learning_rate": 9.59976392459705e-06, + "loss": 0.79601628, + "num_input_tokens_seen": 404383248, + "router_z_loss_mlp": 0.39819336, + "step": 4883, + "time_per_iteration": 2.7739627361297607 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044121, + "balance_loss_mlp": 1.00521088, + "epoch": 0.9395921508272412, + "flos": 1556565666048.0, + "grad_norm": 0.005772326140838005, + "language_loss": 0.78170681, + "learning_rate": 9.539104032684209e-06, + "loss": 0.79214799, + "num_input_tokens_seen": 404615264, + "router_z_loss_mlp": 0.38867188, + "step": 4884, + "time_per_iteration": 4.832986116409302 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104324, + "balance_loss_mlp": 1.00337636, + "epoch": 0.9397845325125048, + "flos": 499198013952.0, + "grad_norm": 0.03489865319805655, + "language_loss": 0.79100162, + "learning_rate": 9.478634554578314e-06, + "loss": 0.80143404, + "num_input_tokens_seen": 404684656, + "router_z_loss_mlp": 0.3984375, + "step": 4885, + "time_per_iteration": 2.6249029636383057 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043291, + "balance_loss_mlp": 1.00345111, + "epoch": 0.9399769141977684, + "flos": 499590731520.0, + "grad_norm": 0.038272147047931145, + "language_loss": 0.84108281, + "learning_rate": 9.418355513755638e-06, + "loss": 0.85151565, + "num_input_tokens_seen": 404752096, + "router_z_loss_mlp": 0.39819336, + "step": 4886, + "time_per_iteration": 2.602886199951172 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01041187, + "balance_loss_mlp": 1.00218201, + "epoch": 0.9401692958830319, + "flos": 1405677975552.0, + "grad_norm": 0.003077659566733789, + "language_loss": 0.79332191, + "learning_rate": 9.358266933618575e-06, + "loss": 0.80373377, + "num_input_tokens_seen": 404980944, + "router_z_loss_mlp": 0.38964844, + "step": 4887, + "time_per_iteration": 4.847235202789307 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042464, + "balance_loss_mlp": 1.00264823, + "epoch": 0.9403616775682955, + "flos": 541212842496.0, + "grad_norm": 0.02984190168493181, + "language_loss": 0.85387403, + "learning_rate": 9.298368837495575e-06, + "loss": 0.8642987, + "num_input_tokens_seen": 405056688, + "router_z_loss_mlp": 0.39794922, + "step": 4888, + "time_per_iteration": 2.7261717319488525 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042332, + "balance_loss_mlp": 1.00351715, + "epoch": 0.9405540592535591, + "flos": 1324940725248.0, + "grad_norm": 0.0037373643896522297, + "language_loss": 0.75169432, + "learning_rate": 9.238661248641089e-06, + "loss": 0.76211762, + "num_input_tokens_seen": 405284656, + "router_z_loss_mlp": 0.38769531, + "step": 4889, + "time_per_iteration": 4.894392490386963 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042718, + "balance_loss_mlp": 1.0029496, + "epoch": 0.9407464409388226, + "flos": 573428229888.0, + "grad_norm": 0.03814373852093679, + "language_loss": 0.8372674, + "learning_rate": 9.179144190235799e-06, + "loss": 0.84769458, + "num_input_tokens_seen": 405351584, + "router_z_loss_mlp": 0.39746094, + "step": 4890, + "time_per_iteration": 2.6488852500915527 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042352, + "balance_loss_mlp": 1.0025363, + "epoch": 0.9409388226240862, + "flos": 512349154560.0, + "grad_norm": 0.03178395215267038, + "language_loss": 0.77375114, + "learning_rate": 9.119817685386112e-06, + "loss": 0.78417468, + "num_input_tokens_seen": 405425712, + "router_z_loss_mlp": 0.39794922, + "step": 4891, + "time_per_iteration": 2.7707180976867676 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043049, + "balance_loss_mlp": 1.00423431, + "epoch": 0.9411312043093497, + "flos": 1573279443456.0, + "grad_norm": 0.004890729003859763, + "language_loss": 0.80241883, + "learning_rate": 9.06068175712471e-06, + "loss": 0.81284934, + "num_input_tokens_seen": 405655760, + "router_z_loss_mlp": 0.38769531, + "step": 4892, + "time_per_iteration": 4.890718936920166 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01041736, + "balance_loss_mlp": 1.0018965, + "epoch": 0.9413235859946133, + "flos": 570560621568.0, + "grad_norm": 0.03944178143363227, + "language_loss": 0.78793794, + "learning_rate": 9.001736428410234e-06, + "loss": 0.79835528, + "num_input_tokens_seen": 405731664, + "router_z_loss_mlp": 0.39819336, + "step": 4893, + "time_per_iteration": 2.7482073307037354 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01041524, + "balance_loss_mlp": 1.00170851, + "epoch": 0.9415159676798769, + "flos": 783266157312.0, + "grad_norm": 0.036973698375778005, + "language_loss": 0.80884314, + "learning_rate": 8.942981722127263e-06, + "loss": 0.81925839, + "num_input_tokens_seen": 405808128, + "router_z_loss_mlp": 0.39794922, + "step": 4894, + "time_per_iteration": 3.00976300239563 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01041503, + "balance_loss_mlp": 1.00163937, + "epoch": 0.9417083493651405, + "flos": 850873654272.0, + "grad_norm": 0.031255435914645295, + "language_loss": 0.8029865, + "learning_rate": 8.884417661086331e-06, + "loss": 0.81340152, + "num_input_tokens_seen": 405892448, + "router_z_loss_mlp": 0.3984375, + "step": 4895, + "time_per_iteration": 3.159879684448242 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01041632, + "balance_loss_mlp": 1.00181615, + "epoch": 0.941900731050404, + "flos": 530452022016.0, + "grad_norm": 0.0360566247234317, + "language_loss": 0.86200356, + "learning_rate": 8.826044268024025e-06, + "loss": 0.87241995, + "num_input_tokens_seen": 405966736, + "router_z_loss_mlp": 0.39794922, + "step": 4896, + "time_per_iteration": 2.744678497314453 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01041183, + "balance_loss_mlp": 1.00124776, + "epoch": 0.9420931127356675, + "flos": 558171583488.0, + "grad_norm": 0.0335549347969353, + "language_loss": 0.80863327, + "learning_rate": 8.767861565602997e-06, + "loss": 0.81904507, + "num_input_tokens_seen": 406043264, + "router_z_loss_mlp": 0.39916992, + "step": 4897, + "time_per_iteration": 2.7431249618530273 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01041244, + "balance_loss_mlp": 1.00145209, + "epoch": 0.9422854944209311, + "flos": 653787346944.0, + "grad_norm": 0.03717820296963101, + "language_loss": 0.86876309, + "learning_rate": 8.709869576411733e-06, + "loss": 0.87917554, + "num_input_tokens_seen": 406119552, + "router_z_loss_mlp": 0.39770508, + "step": 4898, + "time_per_iteration": 2.840428590774536 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010414, + "balance_loss_mlp": 1.00148833, + "epoch": 0.9424778761061947, + "flos": 554765448960.0, + "grad_norm": 0.034123182933214626, + "language_loss": 0.84605157, + "learning_rate": 8.65206832296478e-06, + "loss": 0.85646552, + "num_input_tokens_seen": 406192464, + "router_z_loss_mlp": 0.39892578, + "step": 4899, + "time_per_iteration": 2.699169874191284 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042181, + "balance_loss_mlp": 1.00226939, + "epoch": 0.9426702577914583, + "flos": 589651113216.0, + "grad_norm": 0.03867302654620552, + "language_loss": 0.80447572, + "learning_rate": 8.594457827702406e-06, + "loss": 0.81489754, + "num_input_tokens_seen": 406262640, + "router_z_loss_mlp": 0.39892578, + "step": 4900, + "time_per_iteration": 2.6918928623199463 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042373, + "balance_loss_mlp": 1.00243771, + "epoch": 0.9428626394767218, + "flos": 617813936640.0, + "grad_norm": 0.04034073488325009, + "language_loss": 0.79256618, + "learning_rate": 8.537038112991114e-06, + "loss": 0.80298996, + "num_input_tokens_seen": 406341328, + "router_z_loss_mlp": 0.39916992, + "step": 4901, + "time_per_iteration": 2.7796003818511963 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042102, + "balance_loss_mlp": 1.00216651, + "epoch": 0.9430550211619854, + "flos": 611542139136.0, + "grad_norm": 0.03752806991208156, + "language_loss": 0.82370108, + "learning_rate": 8.479809201123178e-06, + "loss": 0.83412206, + "num_input_tokens_seen": 406418864, + "router_z_loss_mlp": 0.39916992, + "step": 4902, + "time_per_iteration": 2.7410826683044434 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104199, + "balance_loss_mlp": 1.00207889, + "epoch": 0.943247402847249, + "flos": 567052419840.0, + "grad_norm": 0.040940659305077086, + "language_loss": 0.78541058, + "learning_rate": 8.422771114316885e-06, + "loss": 0.79583043, + "num_input_tokens_seen": 406492320, + "router_z_loss_mlp": 0.39892578, + "step": 4903, + "time_per_iteration": 2.7239608764648438 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042419, + "balance_loss_mlp": 1.00243592, + "epoch": 0.9434397845325125, + "flos": 528089892096.0, + "grad_norm": 0.04019064701674444, + "language_loss": 0.82004517, + "learning_rate": 8.365923874716297e-06, + "loss": 0.83046937, + "num_input_tokens_seen": 406560448, + "router_z_loss_mlp": 0.3996582, + "step": 4904, + "time_per_iteration": 2.6455512046813965 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042194, + "balance_loss_mlp": 1.00237763, + "epoch": 0.943632166217776, + "flos": 594592146432.0, + "grad_norm": 0.03750564487525279, + "language_loss": 0.83164895, + "learning_rate": 8.309267504391593e-06, + "loss": 0.84207094, + "num_input_tokens_seen": 406631376, + "router_z_loss_mlp": 0.39794922, + "step": 4905, + "time_per_iteration": 2.753347873687744 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104299, + "balance_loss_mlp": 1.00305521, + "epoch": 0.9438245479030396, + "flos": 573982307328.0, + "grad_norm": 0.028646757212572906, + "language_loss": 0.85765415, + "learning_rate": 8.252802025338623e-06, + "loss": 0.86808407, + "num_input_tokens_seen": 406713728, + "router_z_loss_mlp": 0.39916992, + "step": 4906, + "time_per_iteration": 2.8059747219085693 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042995, + "balance_loss_mlp": 1.00305963, + "epoch": 0.9440169295883032, + "flos": 489222628608.0, + "grad_norm": 0.03890561239710249, + "language_loss": 0.82264918, + "learning_rate": 8.196527459479242e-06, + "loss": 0.8330791, + "num_input_tokens_seen": 406779168, + "router_z_loss_mlp": 0.39916992, + "step": 4907, + "time_per_iteration": 2.5627827644348145 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01041352, + "balance_loss_mlp": 1.00141716, + "epoch": 0.9442093112735668, + "flos": 733123846656.0, + "grad_norm": 0.03542647022861663, + "language_loss": 0.74123418, + "learning_rate": 8.140443828661137e-06, + "loss": 0.75164777, + "num_input_tokens_seen": 406860816, + "router_z_loss_mlp": 0.39916992, + "step": 4908, + "time_per_iteration": 2.999734401702881 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01041412, + "balance_loss_mlp": 1.00152421, + "epoch": 0.9444016929588304, + "flos": 572106213888.0, + "grad_norm": 0.04040136580220783, + "language_loss": 0.82575059, + "learning_rate": 8.084551154658004e-06, + "loss": 0.83616471, + "num_input_tokens_seen": 406929888, + "router_z_loss_mlp": 0.39868164, + "step": 4909, + "time_per_iteration": 2.6992504596710205 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01041075, + "balance_loss_mlp": 1.00123525, + "epoch": 0.9445940746440938, + "flos": 510312668160.0, + "grad_norm": 0.03923748527569452, + "language_loss": 0.86716592, + "learning_rate": 8.028849459169318e-06, + "loss": 0.87757671, + "num_input_tokens_seen": 406998224, + "router_z_loss_mlp": 0.39819336, + "step": 4910, + "time_per_iteration": 2.5987043380737305 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01041384, + "balance_loss_mlp": 1.00156808, + "epoch": 0.9447864563293574, + "flos": 625798522368.0, + "grad_norm": 0.03481160306135877, + "language_loss": 0.81500655, + "learning_rate": 7.97333876382028e-06, + "loss": 0.82542038, + "num_input_tokens_seen": 407075088, + "router_z_loss_mlp": 0.39794922, + "step": 4911, + "time_per_iteration": 2.808239459991455 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042566, + "balance_loss_mlp": 1.00277388, + "epoch": 0.944978838014621, + "flos": 506309681664.0, + "grad_norm": 0.037707161835398115, + "language_loss": 0.81599504, + "learning_rate": 7.918019090162098e-06, + "loss": 0.82642066, + "num_input_tokens_seen": 407147792, + "router_z_loss_mlp": 0.39770508, + "step": 4912, + "time_per_iteration": 2.7564315795898438 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042313, + "balance_loss_mlp": 1.00340271, + "epoch": 0.9451712196998846, + "flos": 1487554494720.0, + "grad_norm": 0.004835932125538659, + "language_loss": 0.78287339, + "learning_rate": 7.862890459671812e-06, + "loss": 0.79329652, + "num_input_tokens_seen": 407387216, + "router_z_loss_mlp": 0.38867188, + "step": 4913, + "time_per_iteration": 5.008509397506714 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042822, + "balance_loss_mlp": 1.00293481, + "epoch": 0.9453636013851482, + "flos": 522152486400.0, + "grad_norm": 0.04169579849524581, + "language_loss": 0.90761364, + "learning_rate": 7.80795289375219e-06, + "loss": 0.91804183, + "num_input_tokens_seen": 407457664, + "router_z_loss_mlp": 0.39868164, + "step": 4914, + "time_per_iteration": 2.7010884284973145 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01040192, + "balance_loss_mlp": 1.00128174, + "epoch": 0.9455559830704117, + "flos": 1500286672896.0, + "grad_norm": 0.004154634284500281, + "language_loss": 0.8356235, + "learning_rate": 7.75320641373195e-06, + "loss": 0.84602541, + "num_input_tokens_seen": 407700256, + "router_z_loss_mlp": 0.38867188, + "step": 4915, + "time_per_iteration": 4.9480881690979 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01041334, + "balance_loss_mlp": 1.00144696, + "epoch": 0.9457483647556753, + "flos": 499152327168.0, + "grad_norm": 0.033787532501163176, + "language_loss": 0.82344007, + "learning_rate": 7.698651040865534e-06, + "loss": 0.83385336, + "num_input_tokens_seen": 407770080, + "router_z_loss_mlp": 0.39868164, + "step": 4916, + "time_per_iteration": 2.621060609817505 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01041385, + "balance_loss_mlp": 1.00159323, + "epoch": 0.9459407464409388, + "flos": 1021119154176.0, + "grad_norm": 0.03122939977346768, + "language_loss": 0.82703984, + "learning_rate": 7.644286796333222e-06, + "loss": 0.8374536, + "num_input_tokens_seen": 407854640, + "router_z_loss_mlp": 0.39770508, + "step": 4917, + "time_per_iteration": 3.369508981704712 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042915, + "balance_loss_mlp": 1.0030514, + "epoch": 0.9461331281262024, + "flos": 514621856256.0, + "grad_norm": 0.03918316931271036, + "language_loss": 0.81608689, + "learning_rate": 7.590113701241075e-06, + "loss": 0.82651609, + "num_input_tokens_seen": 407922704, + "router_z_loss_mlp": 0.3984375, + "step": 4918, + "time_per_iteration": 2.605464458465576 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043651, + "balance_loss_mlp": 1.00376344, + "epoch": 0.9463255098114659, + "flos": 529049326080.0, + "grad_norm": 0.043499008433838054, + "language_loss": 0.78687984, + "learning_rate": 7.536131776620936e-06, + "loss": 0.79731631, + "num_input_tokens_seen": 407991136, + "router_z_loss_mlp": 0.39868164, + "step": 4919, + "time_per_iteration": 2.587440013885498 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043689, + "balance_loss_mlp": 1.00377798, + "epoch": 0.9465178914967295, + "flos": 507028042752.0, + "grad_norm": 0.04214834797927713, + "language_loss": 0.84009337, + "learning_rate": 7.482341043430485e-06, + "loss": 0.85053033, + "num_input_tokens_seen": 408056576, + "router_z_loss_mlp": 0.39892578, + "step": 4920, + "time_per_iteration": 2.6052064895629883 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042211, + "balance_loss_mlp": 1.00237119, + "epoch": 0.9467102731819931, + "flos": 661539608064.0, + "grad_norm": 0.03799419957281172, + "language_loss": 0.86398727, + "learning_rate": 7.428741522553184e-06, + "loss": 0.87440938, + "num_input_tokens_seen": 408136960, + "router_z_loss_mlp": 0.39819336, + "step": 4921, + "time_per_iteration": 2.878465175628662 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01041634, + "balance_loss_mlp": 1.00174677, + "epoch": 0.9469026548672567, + "flos": 676505604096.0, + "grad_norm": 0.033503677703785116, + "language_loss": 0.89720869, + "learning_rate": 7.375333234798054e-06, + "loss": 0.90762508, + "num_input_tokens_seen": 408218304, + "router_z_loss_mlp": 0.39868164, + "step": 4922, + "time_per_iteration": 2.9082603454589844 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01041594, + "balance_loss_mlp": 1.00173008, + "epoch": 0.9470950365525203, + "flos": 515021376768.0, + "grad_norm": 0.04023426252004341, + "language_loss": 0.80039066, + "learning_rate": 7.32211620090012e-06, + "loss": 0.81080657, + "num_input_tokens_seen": 408287936, + "router_z_loss_mlp": 0.3984375, + "step": 4923, + "time_per_iteration": 2.59505033493042 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042749, + "balance_loss_mlp": 1.00290966, + "epoch": 0.9472874182377837, + "flos": 551227111680.0, + "grad_norm": 0.033951434514690154, + "language_loss": 0.81468022, + "learning_rate": 7.269090441520132e-06, + "loss": 0.82510769, + "num_input_tokens_seen": 408365568, + "router_z_loss_mlp": 0.39819336, + "step": 4924, + "time_per_iteration": 2.75545334815979 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104291, + "balance_loss_mlp": 1.00309443, + "epoch": 0.9474797999230473, + "flos": 543811187712.0, + "grad_norm": 0.034750930372707566, + "language_loss": 0.80470061, + "learning_rate": 7.216255977244457e-06, + "loss": 0.81512976, + "num_input_tokens_seen": 408431248, + "router_z_loss_mlp": 0.39794922, + "step": 4925, + "time_per_iteration": 2.624394655227661 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104295, + "balance_loss_mlp": 1.00308681, + "epoch": 0.9476721816083109, + "flos": 846064823808.0, + "grad_norm": 0.033713578773609525, + "language_loss": 0.86285806, + "learning_rate": 7.163612828585242e-06, + "loss": 0.87328756, + "num_input_tokens_seen": 408514112, + "router_z_loss_mlp": 0.3984375, + "step": 4926, + "time_per_iteration": 3.0954294204711914 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043013, + "balance_loss_mlp": 1.00312579, + "epoch": 0.9478645632935745, + "flos": 639148939776.0, + "grad_norm": 0.03739131726209916, + "language_loss": 0.79913974, + "learning_rate": 7.1111610159803605e-06, + "loss": 0.80956984, + "num_input_tokens_seen": 408585968, + "router_z_loss_mlp": 0.39868164, + "step": 4927, + "time_per_iteration": 2.753369092941284 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042404, + "balance_loss_mlp": 1.00256443, + "epoch": 0.948056944978838, + "flos": 658042099968.0, + "grad_norm": 0.03682204953186861, + "language_loss": 0.76448333, + "learning_rate": 7.058900559793469e-06, + "loss": 0.77490735, + "num_input_tokens_seen": 408665456, + "router_z_loss_mlp": 0.39819336, + "step": 4928, + "time_per_iteration": 2.833594560623169 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104122, + "balance_loss_mlp": 1.00130916, + "epoch": 0.9482493266641016, + "flos": 441837110784.0, + "grad_norm": 0.036391648532747914, + "language_loss": 0.83968282, + "learning_rate": 7.00683148031378e-06, + "loss": 0.85009503, + "num_input_tokens_seen": 408730192, + "router_z_loss_mlp": 0.39892578, + "step": 4929, + "time_per_iteration": 2.5252318382263184 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01041328, + "balance_loss_mlp": 1.00144029, + "epoch": 0.9484417083493651, + "flos": 547122057984.0, + "grad_norm": 0.0372848187794391, + "language_loss": 0.7867955, + "learning_rate": 6.9549537977564024e-06, + "loss": 0.79720879, + "num_input_tokens_seen": 408807616, + "router_z_loss_mlp": 0.39868164, + "step": 4930, + "time_per_iteration": 2.7968673706054688 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01041385, + "balance_loss_mlp": 1.00149775, + "epoch": 0.9486340900346287, + "flos": 539695440384.0, + "grad_norm": 0.03536088760450684, + "language_loss": 0.80219245, + "learning_rate": 6.903267532262003e-06, + "loss": 0.81260628, + "num_input_tokens_seen": 408883552, + "router_z_loss_mlp": 0.39868164, + "step": 4931, + "time_per_iteration": 2.677079439163208 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01041401, + "balance_loss_mlp": 1.00156105, + "epoch": 0.9488264717198923, + "flos": 682902801408.0, + "grad_norm": 0.03566286985886406, + "language_loss": 0.86654496, + "learning_rate": 6.851772703896975e-06, + "loss": 0.87695897, + "num_input_tokens_seen": 408956400, + "router_z_loss_mlp": 0.39819336, + "step": 4932, + "time_per_iteration": 2.814249038696289 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042864, + "balance_loss_mlp": 1.00309622, + "epoch": 0.9490188534051558, + "flos": 463560940800.0, + "grad_norm": 0.0386601386085668, + "language_loss": 0.88342351, + "learning_rate": 6.8004693326533805e-06, + "loss": 0.89385211, + "num_input_tokens_seen": 409019904, + "router_z_loss_mlp": 0.39746094, + "step": 4933, + "time_per_iteration": 2.5076887607574463 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044538, + "balance_loss_mlp": 1.004722, + "epoch": 0.9492112350904194, + "flos": 544219456512.0, + "grad_norm": 0.036078560908697196, + "language_loss": 0.83274114, + "learning_rate": 6.7493574384489e-06, + "loss": 0.8431865, + "num_input_tokens_seen": 409094288, + "router_z_loss_mlp": 0.39794922, + "step": 4934, + "time_per_iteration": 2.66317081451416 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044702, + "balance_loss_mlp": 1.00500488, + "epoch": 0.949403616775683, + "flos": 551459436288.0, + "grad_norm": 0.03441156495312572, + "language_loss": 0.84393692, + "learning_rate": 6.698437041126992e-06, + "loss": 0.85438401, + "num_input_tokens_seen": 409169120, + "router_z_loss_mlp": 0.39672852, + "step": 4935, + "time_per_iteration": 2.702319860458374 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044845, + "balance_loss_mlp": 1.00510049, + "epoch": 0.9495959984609466, + "flos": 599498186496.0, + "grad_norm": 0.03383803796869729, + "language_loss": 0.83237123, + "learning_rate": 6.647708160456678e-06, + "loss": 0.84281969, + "num_input_tokens_seen": 409243200, + "router_z_loss_mlp": 0.3972168, + "step": 4936, + "time_per_iteration": 2.692322254180908 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044565, + "balance_loss_mlp": 1.00479722, + "epoch": 0.94978838014621, + "flos": 609531897600.0, + "grad_norm": 0.03709057763326247, + "language_loss": 0.82544994, + "learning_rate": 6.597170816132702e-06, + "loss": 0.83589554, + "num_input_tokens_seen": 409319264, + "router_z_loss_mlp": 0.39746094, + "step": 4937, + "time_per_iteration": 2.805114984512329 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042473, + "balance_loss_mlp": 1.00272846, + "epoch": 0.9499807618314736, + "flos": 541866074880.0, + "grad_norm": 0.03343237583673612, + "language_loss": 0.87039685, + "learning_rate": 6.546825027775427e-06, + "loss": 0.88082153, + "num_input_tokens_seen": 409389840, + "router_z_loss_mlp": 0.3972168, + "step": 4938, + "time_per_iteration": 2.681006908416748 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043038, + "balance_loss_mlp": 1.00326955, + "epoch": 0.9501731435167372, + "flos": 595710028032.0, + "grad_norm": 0.03316869101198482, + "language_loss": 0.83294916, + "learning_rate": 6.496670814930717e-06, + "loss": 0.8433795, + "num_input_tokens_seen": 409458752, + "router_z_loss_mlp": 0.39746094, + "step": 4939, + "time_per_iteration": 2.7134695053100586 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042981, + "balance_loss_mlp": 1.00321257, + "epoch": 0.9503655252020008, + "flos": 455072822016.0, + "grad_norm": 0.03736501711507977, + "language_loss": 0.80317879, + "learning_rate": 6.446708197070161e-06, + "loss": 0.81360853, + "num_input_tokens_seen": 409525008, + "router_z_loss_mlp": 0.39746094, + "step": 4940, + "time_per_iteration": 2.5654499530792236 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043295, + "balance_loss_mlp": 1.00352716, + "epoch": 0.9505579068872644, + "flos": 669128563968.0, + "grad_norm": 0.035510729926933764, + "language_loss": 0.84777826, + "learning_rate": 6.396937193591079e-06, + "loss": 0.85821128, + "num_input_tokens_seen": 409603376, + "router_z_loss_mlp": 0.39746094, + "step": 4941, + "time_per_iteration": 2.814131736755371 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044763, + "balance_loss_mlp": 1.00497139, + "epoch": 0.9507502885725279, + "flos": 403080662784.0, + "grad_norm": 0.037080639816230825, + "language_loss": 0.81930745, + "learning_rate": 6.347357823816235e-06, + "loss": 0.82975513, + "num_input_tokens_seen": 409667168, + "router_z_loss_mlp": 0.39770508, + "step": 4942, + "time_per_iteration": 2.4868767261505127 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044222, + "balance_loss_mlp": 1.00450122, + "epoch": 0.9509426702577914, + "flos": 701737635840.0, + "grad_norm": 0.03513225958551105, + "language_loss": 0.7980032, + "learning_rate": 6.297970106994011e-06, + "loss": 0.80844545, + "num_input_tokens_seen": 409746832, + "router_z_loss_mlp": 0.39697266, + "step": 4943, + "time_per_iteration": 2.9845831394195557 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044358, + "balance_loss_mlp": 1.00459003, + "epoch": 0.951135051943055, + "flos": 502401959424.0, + "grad_norm": 0.03803809561051826, + "language_loss": 0.8287642, + "learning_rate": 6.2487740622985126e-06, + "loss": 0.83920777, + "num_input_tokens_seen": 409813792, + "router_z_loss_mlp": 0.39746094, + "step": 4944, + "time_per_iteration": 2.585265874862671 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042732, + "balance_loss_mlp": 1.00296342, + "epoch": 0.9513274336283186, + "flos": 615866878464.0, + "grad_norm": 0.03642431100768812, + "language_loss": 0.81723523, + "learning_rate": 6.1997697088292395e-06, + "loss": 0.82766253, + "num_input_tokens_seen": 409898848, + "router_z_loss_mlp": 0.39746094, + "step": 4945, + "time_per_iteration": 2.9102370738983154 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042477, + "balance_loss_mlp": 1.00266135, + "epoch": 0.9515198153135821, + "flos": 520598145792.0, + "grad_norm": 0.04093977912249511, + "language_loss": 0.82534963, + "learning_rate": 6.150957065611363e-06, + "loss": 0.83577436, + "num_input_tokens_seen": 409966368, + "router_z_loss_mlp": 0.39794922, + "step": 4946, + "time_per_iteration": 2.569200038909912 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042697, + "balance_loss_mlp": 1.00292897, + "epoch": 0.9517121969988457, + "flos": 666285255168.0, + "grad_norm": 0.03445383496459535, + "language_loss": 0.77101904, + "learning_rate": 6.102336151595667e-06, + "loss": 0.78144598, + "num_input_tokens_seen": 410048496, + "router_z_loss_mlp": 0.39746094, + "step": 4947, + "time_per_iteration": 2.9556493759155273 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043089, + "balance_loss_mlp": 1.00329661, + "epoch": 0.9519045786841093, + "flos": 677616682752.0, + "grad_norm": 0.03959688141622468, + "language_loss": 0.76750845, + "learning_rate": 6.053906985658553e-06, + "loss": 0.77793932, + "num_input_tokens_seen": 410121840, + "router_z_loss_mlp": 0.39770508, + "step": 4948, + "time_per_iteration": 2.846489429473877 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043021, + "balance_loss_mlp": 1.00308585, + "epoch": 0.9520969603693729, + "flos": 654141180672.0, + "grad_norm": 0.03311313171494322, + "language_loss": 0.80785477, + "learning_rate": 6.005669586601814e-06, + "loss": 0.81828499, + "num_input_tokens_seen": 410199152, + "router_z_loss_mlp": 0.39916992, + "step": 4949, + "time_per_iteration": 2.872042655944824 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01041327, + "balance_loss_mlp": 1.00148737, + "epoch": 0.9522893420546364, + "flos": 744683708160.0, + "grad_norm": 0.029795197161734573, + "language_loss": 0.83586603, + "learning_rate": 5.957623973152748e-06, + "loss": 0.84627938, + "num_input_tokens_seen": 410285392, + "router_z_loss_mlp": 0.39819336, + "step": 4950, + "time_per_iteration": 3.068004846572876 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042972, + "balance_loss_mlp": 1.00318003, + "epoch": 0.9524817237398999, + "flos": 763031539200.0, + "grad_norm": 0.039165915603714734, + "language_loss": 0.81529355, + "learning_rate": 5.909770163964545e-06, + "loss": 0.82572323, + "num_input_tokens_seen": 410359872, + "router_z_loss_mlp": 0.39770508, + "step": 4951, + "time_per_iteration": 2.958136796951294 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01041546, + "balance_loss_mlp": 1.00177801, + "epoch": 0.9526741054251635, + "flos": 530147765760.0, + "grad_norm": 0.03797586697634241, + "language_loss": 0.82436419, + "learning_rate": 5.8621081776155105e-06, + "loss": 0.83477962, + "num_input_tokens_seen": 410425728, + "router_z_loss_mlp": 0.39746094, + "step": 4952, + "time_per_iteration": 2.5887794494628906 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01041734, + "balance_loss_mlp": 1.00191772, + "epoch": 0.9528664871104271, + "flos": 489426763008.0, + "grad_norm": 0.03867209497771876, + "language_loss": 0.81692654, + "learning_rate": 5.814638032609787e-06, + "loss": 0.82734388, + "num_input_tokens_seen": 410496080, + "router_z_loss_mlp": 0.39794922, + "step": 4953, + "time_per_iteration": 2.60361909866333 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043073, + "balance_loss_mlp": 1.00328088, + "epoch": 0.9530588687956907, + "flos": 518872718592.0, + "grad_norm": 0.035355195593526345, + "language_loss": 0.85669702, + "learning_rate": 5.76735974737691e-06, + "loss": 0.86712778, + "num_input_tokens_seen": 410576448, + "router_z_loss_mlp": 0.39770508, + "step": 4954, + "time_per_iteration": 2.7473466396331787 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104281, + "balance_loss_mlp": 1.0029465, + "epoch": 0.9532512504809542, + "flos": 676414230528.0, + "grad_norm": 0.03834436064105519, + "language_loss": 0.81176341, + "learning_rate": 5.720273340271864e-06, + "loss": 0.82219148, + "num_input_tokens_seen": 410655792, + "router_z_loss_mlp": 0.3984375, + "step": 4955, + "time_per_iteration": 2.8323822021484375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042747, + "balance_loss_mlp": 1.00295496, + "epoch": 0.9534436321662177, + "flos": 490542699264.0, + "grad_norm": 0.03487032730513436, + "language_loss": 0.84557939, + "learning_rate": 5.673378829575249e-06, + "loss": 0.8560068, + "num_input_tokens_seen": 410725440, + "router_z_loss_mlp": 0.39770508, + "step": 4956, + "time_per_iteration": 2.5702896118164062 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104252, + "balance_loss_mlp": 1.00275135, + "epoch": 0.9536360138514813, + "flos": 497589238272.0, + "grad_norm": 0.036979832963634794, + "language_loss": 0.82516146, + "learning_rate": 5.626676233493167e-06, + "loss": 0.83558667, + "num_input_tokens_seen": 410797552, + "router_z_loss_mlp": 0.39746094, + "step": 4957, + "time_per_iteration": 2.656902313232422 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044501, + "balance_loss_mlp": 1.00475693, + "epoch": 0.9538283955367449, + "flos": 802858236672.0, + "grad_norm": 0.03444373723979776, + "language_loss": 0.84639931, + "learning_rate": 5.580165570157114e-06, + "loss": 0.85684431, + "num_input_tokens_seen": 410876736, + "router_z_loss_mlp": 0.3972168, + "step": 4958, + "time_per_iteration": 3.0350341796875 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044191, + "balance_loss_mlp": 1.00439882, + "epoch": 0.9540207772220085, + "flos": 557798307840.0, + "grad_norm": 0.031107879429296895, + "language_loss": 0.80315, + "learning_rate": 5.533846857624203e-06, + "loss": 0.81359196, + "num_input_tokens_seen": 410955632, + "router_z_loss_mlp": 0.39770508, + "step": 4959, + "time_per_iteration": 2.779682159423828 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044594, + "balance_loss_mlp": 1.00489748, + "epoch": 0.954213158907272, + "flos": 685759716096.0, + "grad_norm": 0.03430720369577018, + "language_loss": 0.81983697, + "learning_rate": 5.487720113876882e-06, + "loss": 0.83028287, + "num_input_tokens_seen": 411038480, + "router_z_loss_mlp": 0.39672852, + "step": 4960, + "time_per_iteration": 2.9145681858062744 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042604, + "balance_loss_mlp": 1.00288343, + "epoch": 0.9544055405925356, + "flos": 536847273984.0, + "grad_norm": 0.0384807458715525, + "language_loss": 0.83184588, + "learning_rate": 5.441785356823214e-06, + "loss": 0.84227192, + "num_input_tokens_seen": 411109744, + "router_z_loss_mlp": 0.39697266, + "step": 4961, + "time_per_iteration": 2.727431535720825 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01041086, + "balance_loss_mlp": 1.0013653, + "epoch": 0.9545979222777992, + "flos": 826924754688.0, + "grad_norm": 0.03962394126421547, + "language_loss": 0.80869973, + "learning_rate": 5.3960426042965476e-06, + "loss": 0.81911057, + "num_input_tokens_seen": 411202192, + "router_z_loss_mlp": 0.39697266, + "step": 4962, + "time_per_iteration": 3.1185572147369385 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01041174, + "balance_loss_mlp": 1.00138175, + "epoch": 0.9547903039630627, + "flos": 763157905920.0, + "grad_norm": 0.04021618792762701, + "language_loss": 0.77994674, + "learning_rate": 5.3504918740558405e-06, + "loss": 0.79035848, + "num_input_tokens_seen": 411289248, + "router_z_loss_mlp": 0.39770508, + "step": 4963, + "time_per_iteration": 3.1309633255004883 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01041709, + "balance_loss_mlp": 1.00191748, + "epoch": 0.9549826856483262, + "flos": 516334644480.0, + "grad_norm": 0.03883845801630645, + "language_loss": 0.83332193, + "learning_rate": 5.3051331837855045e-06, + "loss": 0.84373903, + "num_input_tokens_seen": 411355232, + "router_z_loss_mlp": 0.39770508, + "step": 4964, + "time_per_iteration": 2.6028783321380615 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043315, + "balance_loss_mlp": 1.00349903, + "epoch": 0.9551750673335898, + "flos": 644267862528.0, + "grad_norm": 0.03455471990169676, + "language_loss": 0.82968116, + "learning_rate": 5.259966551095341e-06, + "loss": 0.84011436, + "num_input_tokens_seen": 411432288, + "router_z_loss_mlp": 0.39794922, + "step": 4965, + "time_per_iteration": 2.80012583732605 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042897, + "balance_loss_mlp": 1.00312924, + "epoch": 0.9553674490188534, + "flos": 473175689472.0, + "grad_norm": 0.03525799023609817, + "language_loss": 0.83457267, + "learning_rate": 5.214991993520546e-06, + "loss": 0.84500164, + "num_input_tokens_seen": 411499376, + "router_z_loss_mlp": 0.39746094, + "step": 4966, + "time_per_iteration": 2.626706838607788 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043096, + "balance_loss_mlp": 1.00325596, + "epoch": 0.955559830704117, + "flos": 529338031104.0, + "grad_norm": 0.04404774248069698, + "language_loss": 0.82227528, + "learning_rate": 5.170209528521763e-06, + "loss": 0.83270633, + "num_input_tokens_seen": 411564976, + "router_z_loss_mlp": 0.39819336, + "step": 4967, + "time_per_iteration": 2.599682569503784 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043061, + "balance_loss_mlp": 1.00329304, + "epoch": 0.9557522123893806, + "flos": 549218815488.0, + "grad_norm": 0.0375296365771858, + "language_loss": 0.846102, + "learning_rate": 5.125619173485196e-06, + "loss": 0.85653257, + "num_input_tokens_seen": 411636464, + "router_z_loss_mlp": 0.39746094, + "step": 4968, + "time_per_iteration": 2.690603017807007 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043595, + "balance_loss_mlp": 1.00377893, + "epoch": 0.955944594074644, + "flos": 510525550848.0, + "grad_norm": 0.03274819771004998, + "language_loss": 0.82464266, + "learning_rate": 5.08122094572222e-06, + "loss": 0.8350786, + "num_input_tokens_seen": 411710672, + "router_z_loss_mlp": 0.39794922, + "step": 4969, + "time_per_iteration": 2.687436103820801 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01041349, + "balance_loss_mlp": 1.00153351, + "epoch": 0.9561369757599076, + "flos": 528711043584.0, + "grad_norm": 0.03580128099593846, + "language_loss": 0.80410147, + "learning_rate": 5.037014862469824e-06, + "loss": 0.81451499, + "num_input_tokens_seen": 411785616, + "router_z_loss_mlp": 0.39794922, + "step": 4970, + "time_per_iteration": 2.7537877559661865 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01041204, + "balance_loss_mlp": 1.00138855, + "epoch": 0.9563293574451712, + "flos": 499208707584.0, + "grad_norm": 0.04916035322851724, + "language_loss": 0.80648708, + "learning_rate": 4.993000940890391e-06, + "loss": 0.81689912, + "num_input_tokens_seen": 411854832, + "router_z_loss_mlp": 0.39794922, + "step": 4971, + "time_per_iteration": 2.6075868606567383 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042965, + "balance_loss_mlp": 1.00415039, + "epoch": 0.9565217391304348, + "flos": 1411746617088.0, + "grad_norm": 0.004977408547260208, + "language_loss": 0.81773561, + "learning_rate": 4.949179198071585e-06, + "loss": 0.82816529, + "num_input_tokens_seen": 412081856, + "router_z_loss_mlp": 0.38769531, + "step": 4972, + "time_per_iteration": 4.884822845458984 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104272, + "balance_loss_mlp": 1.00297523, + "epoch": 0.9567141208156984, + "flos": 504885598464.0, + "grad_norm": 0.03188993073908652, + "language_loss": 0.78494072, + "learning_rate": 4.905549651026464e-06, + "loss": 0.79536796, + "num_input_tokens_seen": 412155600, + "router_z_loss_mlp": 0.3972168, + "step": 4973, + "time_per_iteration": 2.730933666229248 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042769, + "balance_loss_mlp": 1.0030967, + "epoch": 0.9569065025009619, + "flos": 434130536448.0, + "grad_norm": 0.04150904011170753, + "language_loss": 0.80358505, + "learning_rate": 4.86211231669359e-06, + "loss": 0.81401271, + "num_input_tokens_seen": 412219584, + "router_z_loss_mlp": 0.39648438, + "step": 4974, + "time_per_iteration": 2.4550540447235107 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042805, + "balance_loss_mlp": 1.0030849, + "epoch": 0.9570988841862255, + "flos": 591155876352.0, + "grad_norm": 0.03814012594949819, + "language_loss": 0.78727484, + "learning_rate": 4.818867211936806e-06, + "loss": 0.79770291, + "num_input_tokens_seen": 412295088, + "router_z_loss_mlp": 0.39697266, + "step": 4975, + "time_per_iteration": 2.7876100540161133 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042629, + "balance_loss_mlp": 1.00290823, + "epoch": 0.957291265871489, + "flos": 768643301376.0, + "grad_norm": 0.0533544119882121, + "language_loss": 0.78933519, + "learning_rate": 4.7758143535454045e-06, + "loss": 0.79976147, + "num_input_tokens_seen": 412376992, + "router_z_loss_mlp": 0.39697266, + "step": 4976, + "time_per_iteration": 2.9786951541900635 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045576, + "balance_loss_mlp": 1.00595045, + "epoch": 0.9574836475567526, + "flos": 640247379456.0, + "grad_norm": 0.044191723378042724, + "language_loss": 0.8518002, + "learning_rate": 4.732953758233849e-06, + "loss": 0.86225587, + "num_input_tokens_seen": 412450064, + "router_z_loss_mlp": 0.39599609, + "step": 4977, + "time_per_iteration": 2.795616388320923 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046776, + "balance_loss_mlp": 1.00805664, + "epoch": 0.9576760292420161, + "flos": 1579401541632.0, + "grad_norm": 0.008795466714776974, + "language_loss": 0.78607261, + "learning_rate": 4.690285442642272e-06, + "loss": 0.79654026, + "num_input_tokens_seen": 412676896, + "router_z_loss_mlp": 0.38671875, + "step": 4978, + "time_per_iteration": 4.909578084945679 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043149, + "balance_loss_mlp": 1.00342846, + "epoch": 0.9578684109272797, + "flos": 497374410240.0, + "grad_norm": 0.03268780519478309, + "language_loss": 0.87571311, + "learning_rate": 4.6478094233358695e-06, + "loss": 0.88614452, + "num_input_tokens_seen": 412746848, + "router_z_loss_mlp": 0.39697266, + "step": 4979, + "time_per_iteration": 2.5885064601898193 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104325, + "balance_loss_mlp": 1.00352943, + "epoch": 0.9580607926125433, + "flos": 430854659328.0, + "grad_norm": 0.03948216870458314, + "language_loss": 0.85868418, + "learning_rate": 4.605525716805337e-06, + "loss": 0.86911666, + "num_input_tokens_seen": 412810144, + "router_z_loss_mlp": 0.39697266, + "step": 4980, + "time_per_iteration": 2.479827404022217 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042844, + "balance_loss_mlp": 1.00324261, + "epoch": 0.9582531742978069, + "flos": 1129132704000.0, + "grad_norm": 0.032880007377631804, + "language_loss": 0.80475271, + "learning_rate": 4.563434339466599e-06, + "loss": 0.81518114, + "num_input_tokens_seen": 412904768, + "router_z_loss_mlp": 0.39575195, + "step": 4981, + "time_per_iteration": 3.5456929206848145 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043005, + "balance_loss_mlp": 1.00335562, + "epoch": 0.9584455559830705, + "flos": 525556675584.0, + "grad_norm": 0.034452441041768166, + "language_loss": 0.79316235, + "learning_rate": 4.521535307661085e-06, + "loss": 0.80359232, + "num_input_tokens_seen": 412974592, + "router_z_loss_mlp": 0.39624023, + "step": 4982, + "time_per_iteration": 2.6716814041137695 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104178, + "balance_loss_mlp": 1.00203562, + "epoch": 0.9586379376683339, + "flos": 635450209536.0, + "grad_norm": 0.03376520307989189, + "language_loss": 0.81836033, + "learning_rate": 4.479828637655392e-06, + "loss": 0.82877809, + "num_input_tokens_seen": 413052848, + "router_z_loss_mlp": 0.3972168, + "step": 4983, + "time_per_iteration": 2.842564582824707 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01041445, + "balance_loss_mlp": 1.00170052, + "epoch": 0.9588303193535975, + "flos": 416985157632.0, + "grad_norm": 0.036976444182102955, + "language_loss": 0.84129387, + "learning_rate": 4.438314345641459e-06, + "loss": 0.85170835, + "num_input_tokens_seen": 413118000, + "router_z_loss_mlp": 0.3972168, + "step": 4984, + "time_per_iteration": 2.50268816947937 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01041695, + "balance_loss_mlp": 1.00199842, + "epoch": 0.9590227010388611, + "flos": 482660180736.0, + "grad_norm": 0.049425598193085174, + "language_loss": 0.78694046, + "learning_rate": 4.3969924477365585e-06, + "loss": 0.79735744, + "num_input_tokens_seen": 413185616, + "router_z_loss_mlp": 0.39672852, + "step": 4985, + "time_per_iteration": 4.018237113952637 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104298, + "balance_loss_mlp": 1.00330746, + "epoch": 0.9592150827241247, + "flos": 685851089664.0, + "grad_norm": 0.054295758120399606, + "language_loss": 0.80803186, + "learning_rate": 4.355862959983359e-06, + "loss": 0.81846166, + "num_input_tokens_seen": 413265616, + "router_z_loss_mlp": 0.39648438, + "step": 4986, + "time_per_iteration": 2.946019411087036 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042716, + "balance_loss_mlp": 1.00309062, + "epoch": 0.9594074644093882, + "flos": 575631912192.0, + "grad_norm": 0.0368102094197737, + "language_loss": 0.7151469, + "learning_rate": 4.314925898349642e-06, + "loss": 0.72557408, + "num_input_tokens_seen": 413341248, + "router_z_loss_mlp": 0.39599609, + "step": 4987, + "time_per_iteration": 2.718128204345703 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043003, + "balance_loss_mlp": 1.00328279, + "epoch": 0.9595998460946518, + "flos": 547988173056.0, + "grad_norm": 0.038441266597079346, + "language_loss": 0.78553158, + "learning_rate": 4.2741812787286395e-06, + "loss": 0.79596162, + "num_input_tokens_seen": 413416080, + "router_z_loss_mlp": 0.39697266, + "step": 4988, + "time_per_iteration": 2.810852289199829 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043081, + "balance_loss_mlp": 1.00336051, + "epoch": 0.9597922277799154, + "flos": 475027483392.0, + "grad_norm": 0.038398139954487105, + "language_loss": 0.78839409, + "learning_rate": 4.233629116938809e-06, + "loss": 0.79882497, + "num_input_tokens_seen": 413482336, + "router_z_loss_mlp": 0.39697266, + "step": 4989, + "time_per_iteration": 2.525266408920288 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042711, + "balance_loss_mlp": 1.0030148, + "epoch": 0.9599846094651789, + "flos": 515720295936.0, + "grad_norm": 0.040458239212947125, + "language_loss": 0.86110353, + "learning_rate": 4.193269428723889e-06, + "loss": 0.87153065, + "num_input_tokens_seen": 413553248, + "router_z_loss_mlp": 0.39672852, + "step": 4990, + "time_per_iteration": 2.6019325256347656 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042466, + "balance_loss_mlp": 1.00281739, + "epoch": 0.9601769911504425, + "flos": 596163983616.0, + "grad_norm": 0.04715634947564762, + "language_loss": 0.78668261, + "learning_rate": 4.1531022297529035e-06, + "loss": 0.79710728, + "num_input_tokens_seen": 413625776, + "router_z_loss_mlp": 0.39624023, + "step": 4991, + "time_per_iteration": 2.7938621044158936 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042765, + "balance_loss_mlp": 1.00311601, + "epoch": 0.960369372835706, + "flos": 494042152704.0, + "grad_norm": 0.03203772527177207, + "language_loss": 0.79823196, + "learning_rate": 4.1131275356201536e-06, + "loss": 0.80865961, + "num_input_tokens_seen": 413693056, + "router_z_loss_mlp": 0.39624023, + "step": 4992, + "time_per_iteration": 2.5746684074401855 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042977, + "balance_loss_mlp": 1.00328052, + "epoch": 0.9605617545209696, + "flos": 580407694848.0, + "grad_norm": 0.03324631717517119, + "language_loss": 0.83355463, + "learning_rate": 4.073345361845171e-06, + "loss": 0.84398437, + "num_input_tokens_seen": 413765616, + "router_z_loss_mlp": 0.39672852, + "step": 4993, + "time_per_iteration": 2.7186391353607178 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043099, + "balance_loss_mlp": 1.00347352, + "epoch": 0.9607541362062332, + "flos": 929300297472.0, + "grad_norm": 0.028939559097249826, + "language_loss": 0.86792874, + "learning_rate": 4.033755723872767e-06, + "loss": 0.87835968, + "num_input_tokens_seen": 413850976, + "router_z_loss_mlp": 0.39599609, + "step": 4994, + "time_per_iteration": 3.3311779499053955 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042994, + "balance_loss_mlp": 1.00327384, + "epoch": 0.9609465178914968, + "flos": 574281705984.0, + "grad_norm": 0.03357798587309998, + "language_loss": 0.76040745, + "learning_rate": 3.994358637073036e-06, + "loss": 0.77083737, + "num_input_tokens_seen": 413931648, + "router_z_loss_mlp": 0.39697266, + "step": 4995, + "time_per_iteration": 2.8095812797546387 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043229, + "balance_loss_mlp": 1.00360358, + "epoch": 0.9611388995767602, + "flos": 531914989056.0, + "grad_norm": 0.03322291680520947, + "language_loss": 0.857813, + "learning_rate": 3.955154116741244e-06, + "loss": 0.86824536, + "num_input_tokens_seen": 414003216, + "router_z_loss_mlp": 0.39599609, + "step": 4996, + "time_per_iteration": 2.630094528198242 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043041, + "balance_loss_mlp": 1.00336814, + "epoch": 0.9613312812620238, + "flos": 647404733952.0, + "grad_norm": 0.036018524695575205, + "language_loss": 0.82486397, + "learning_rate": 3.916142178097881e-06, + "loss": 0.83529437, + "num_input_tokens_seen": 414077072, + "router_z_loss_mlp": 0.39648438, + "step": 4997, + "time_per_iteration": 2.761890411376953 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043069, + "balance_loss_mlp": 1.00339627, + "epoch": 0.9615236629472874, + "flos": 497179024128.0, + "grad_norm": 0.034560793449925374, + "language_loss": 0.78197134, + "learning_rate": 3.877322836288888e-06, + "loss": 0.79240203, + "num_input_tokens_seen": 414157600, + "router_z_loss_mlp": 0.39648438, + "step": 4998, + "time_per_iteration": 2.913933038711548 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042737, + "balance_loss_mlp": 1.00315988, + "epoch": 0.961716044632551, + "flos": 514007507712.0, + "grad_norm": 0.05213638036394556, + "language_loss": 0.76171172, + "learning_rate": 3.838696106385153e-06, + "loss": 0.77213907, + "num_input_tokens_seen": 414224880, + "router_z_loss_mlp": 0.39550781, + "step": 4999, + "time_per_iteration": 2.6721343994140625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043033, + "balance_loss_mlp": 1.00338387, + "epoch": 0.9619084263178146, + "flos": 502085064192.0, + "grad_norm": 0.04055775067790823, + "language_loss": 0.81044245, + "learning_rate": 3.800262003382904e-06, + "loss": 0.82087278, + "num_input_tokens_seen": 414291728, + "router_z_loss_mlp": 0.39624023, + "step": 5000, + "time_per_iteration": 2.5917038917541504 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042733, + "balance_loss_mlp": 1.00306058, + "epoch": 0.9621008080030781, + "flos": 596806522368.0, + "grad_norm": 0.04160425197136875, + "language_loss": 0.75552607, + "learning_rate": 3.7620205422035923e-06, + "loss": 0.76595342, + "num_input_tokens_seen": 414369568, + "router_z_loss_mlp": 0.39648438, + "step": 5001, + "time_per_iteration": 2.768864631652832 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043058, + "balance_loss_mlp": 1.00340927, + "epoch": 0.9622931896883417, + "flos": 503248632576.0, + "grad_norm": 0.040216779291667094, + "language_loss": 0.82677174, + "learning_rate": 3.723971737693899e-06, + "loss": 0.83720231, + "num_input_tokens_seen": 414441424, + "router_z_loss_mlp": 0.39624023, + "step": 5002, + "time_per_iteration": 2.6624748706817627 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042864, + "balance_loss_mlp": 1.00319076, + "epoch": 0.9624855713736052, + "flos": 608450954496.0, + "grad_norm": 0.036912255623922086, + "language_loss": 0.80918676, + "learning_rate": 3.6861156046256728e-06, + "loss": 0.81961536, + "num_input_tokens_seen": 414512960, + "router_z_loss_mlp": 0.39648438, + "step": 5003, + "time_per_iteration": 2.767632484436035 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042834, + "balance_loss_mlp": 1.00313723, + "epoch": 0.9626779530588688, + "flos": 511736751360.0, + "grad_norm": 0.04265801947296761, + "language_loss": 0.85481077, + "learning_rate": 3.648452157695936e-06, + "loss": 0.86523914, + "num_input_tokens_seen": 414577392, + "router_z_loss_mlp": 0.39672852, + "step": 5004, + "time_per_iteration": 2.6027941703796387 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01041632, + "balance_loss_mlp": 1.00191116, + "epoch": 0.9628703347441323, + "flos": 628498934784.0, + "grad_norm": 0.03505167867823401, + "language_loss": 0.82969689, + "learning_rate": 3.610981411526937e-06, + "loss": 0.84011322, + "num_input_tokens_seen": 414655152, + "router_z_loss_mlp": 0.39697266, + "step": 5005, + "time_per_iteration": 2.840470790863037 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104162, + "balance_loss_mlp": 1.00182772, + "epoch": 0.9630627164293959, + "flos": 631898266368.0, + "grad_norm": 0.03609620968879647, + "language_loss": 0.7782557, + "learning_rate": 3.573703380666149e-06, + "loss": 0.78867197, + "num_input_tokens_seen": 414730432, + "router_z_loss_mlp": 0.39770508, + "step": 5006, + "time_per_iteration": 2.7770707607269287 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01041132, + "balance_loss_mlp": 1.00138748, + "epoch": 0.9632550981146595, + "flos": 571730992896.0, + "grad_norm": 0.03395739214114779, + "language_loss": 0.78756148, + "learning_rate": 3.5366180795861622e-06, + "loss": 0.7979728, + "num_input_tokens_seen": 414810688, + "router_z_loss_mlp": 0.3972168, + "step": 5007, + "time_per_iteration": 2.8784968852996826 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104121, + "balance_loss_mlp": 1.0014658, + "epoch": 0.9634474797999231, + "flos": 467160516096.0, + "grad_norm": 0.03779584270298865, + "language_loss": 0.81459928, + "learning_rate": 3.4997255226847937e-06, + "loss": 0.82501137, + "num_input_tokens_seen": 414880544, + "router_z_loss_mlp": 0.3972168, + "step": 5008, + "time_per_iteration": 2.6344192028045654 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042324, + "balance_loss_mlp": 1.00269866, + "epoch": 0.9636398614851867, + "flos": 527625242880.0, + "grad_norm": 0.03925966363653875, + "language_loss": 0.85927451, + "learning_rate": 3.463025724284974e-06, + "loss": 0.86969769, + "num_input_tokens_seen": 414949920, + "router_z_loss_mlp": 0.39599609, + "step": 5009, + "time_per_iteration": 2.644716501235962 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042559, + "balance_loss_mlp": 1.00288606, + "epoch": 0.9638322431704501, + "flos": 565943286528.0, + "grad_norm": 0.03833079566797296, + "language_loss": 0.75550568, + "learning_rate": 3.4265186986348618e-06, + "loss": 0.76593125, + "num_input_tokens_seen": 415024288, + "router_z_loss_mlp": 0.39648438, + "step": 5010, + "time_per_iteration": 2.770062208175659 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104587, + "balance_loss_mlp": 1.0061729, + "epoch": 0.9640246248557137, + "flos": 478741764864.0, + "grad_norm": 0.03662615642017614, + "language_loss": 0.85119247, + "learning_rate": 3.3902044599076754e-06, + "loss": 0.86165118, + "num_input_tokens_seen": 415092032, + "router_z_loss_mlp": 0.39672852, + "step": 5011, + "time_per_iteration": 2.604835271835327 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045238, + "balance_loss_mlp": 1.0055176, + "epoch": 0.9642170065409773, + "flos": 540339924480.0, + "grad_norm": 0.03893487686049037, + "language_loss": 0.8917706, + "learning_rate": 3.354083022201859e-06, + "loss": 0.90222299, + "num_input_tokens_seen": 415158544, + "router_z_loss_mlp": 0.39697266, + "step": 5012, + "time_per_iteration": 2.6692276000976562 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045169, + "balance_loss_mlp": 1.00542474, + "epoch": 0.9644093882262409, + "flos": 524777076480.0, + "grad_norm": 0.038874654820981284, + "language_loss": 0.84158158, + "learning_rate": 3.3181543995410843e-06, + "loss": 0.85203332, + "num_input_tokens_seen": 415225088, + "router_z_loss_mlp": 0.3972168, + "step": 5013, + "time_per_iteration": 2.580549955368042 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045431, + "balance_loss_mlp": 1.00571084, + "epoch": 0.9646017699115044, + "flos": 575382091008.0, + "grad_norm": 0.03437411704258225, + "language_loss": 0.78946483, + "learning_rate": 3.2824186058740268e-06, + "loss": 0.79991913, + "num_input_tokens_seen": 415300224, + "router_z_loss_mlp": 0.39697266, + "step": 5014, + "time_per_iteration": 2.7400918006896973 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045204, + "balance_loss_mlp": 1.0054599, + "epoch": 0.964794151596768, + "flos": 637957181184.0, + "grad_norm": 0.03987907749747842, + "language_loss": 0.84913594, + "learning_rate": 3.246875655074588e-06, + "loss": 0.85958803, + "num_input_tokens_seen": 415368784, + "router_z_loss_mlp": 0.3972168, + "step": 5015, + "time_per_iteration": 2.76391339302063 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042625, + "balance_loss_mlp": 1.00285721, + "epoch": 0.9649865332820315, + "flos": 618560487936.0, + "grad_norm": 0.036457765801838675, + "language_loss": 0.86692178, + "learning_rate": 3.211525560941675e-06, + "loss": 0.87734801, + "num_input_tokens_seen": 415440752, + "router_z_loss_mlp": 0.39746094, + "step": 5016, + "time_per_iteration": 2.762315511703491 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042596, + "balance_loss_mlp": 1.00289977, + "epoch": 0.9651789149672951, + "flos": 517327126272.0, + "grad_norm": 0.03263683085754147, + "language_loss": 0.81227726, + "learning_rate": 3.1763683371994754e-06, + "loss": 0.82270324, + "num_input_tokens_seen": 415516128, + "router_z_loss_mlp": 0.39672852, + "step": 5017, + "time_per_iteration": 2.766784191131592 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042397, + "balance_loss_mlp": 1.00274801, + "epoch": 0.9653712966525587, + "flos": 493922588928.0, + "grad_norm": 0.037218470225731236, + "language_loss": 0.80549169, + "learning_rate": 3.1414039974972385e-06, + "loss": 0.8159157, + "num_input_tokens_seen": 415583744, + "router_z_loss_mlp": 0.39624023, + "step": 5018, + "time_per_iteration": 2.5550506114959717 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043054, + "balance_loss_mlp": 1.00335717, + "epoch": 0.9655636783378222, + "flos": 537657008640.0, + "grad_norm": 0.02984947509813429, + "language_loss": 0.82836092, + "learning_rate": 3.106632555409328e-06, + "loss": 0.83879143, + "num_input_tokens_seen": 415659856, + "router_z_loss_mlp": 0.39672852, + "step": 5019, + "time_per_iteration": 2.7989487648010254 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043137, + "balance_loss_mlp": 1.00334537, + "epoch": 0.9657560600230858, + "flos": 459959420160.0, + "grad_norm": 0.03464590555865733, + "language_loss": 0.82756871, + "learning_rate": 3.072054024435167e-06, + "loss": 0.83800006, + "num_input_tokens_seen": 415731792, + "router_z_loss_mlp": 0.39770508, + "step": 5020, + "time_per_iteration": 2.6502068042755127 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042924, + "balance_loss_mlp": 1.00320303, + "epoch": 0.9659484417083494, + "flos": 687389879040.0, + "grad_norm": 0.04341700481017599, + "language_loss": 0.83695757, + "learning_rate": 3.0376684179994064e-06, + "loss": 0.84738678, + "num_input_tokens_seen": 415809536, + "router_z_loss_mlp": 0.39697266, + "step": 5021, + "time_per_iteration": 2.9042539596557617 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104174, + "balance_loss_mlp": 1.00302124, + "epoch": 0.966140823393613, + "flos": 1505459063808.0, + "grad_norm": 0.003981124048801385, + "language_loss": 0.80694246, + "learning_rate": 3.0034757494516453e-06, + "loss": 0.81735986, + "num_input_tokens_seen": 416027600, + "router_z_loss_mlp": 0.38671875, + "step": 5022, + "time_per_iteration": 4.69134259223938 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043105, + "balance_loss_mlp": 1.00345588, + "epoch": 0.9663332050788765, + "flos": 465859887360.0, + "grad_norm": 0.041583580268376795, + "language_loss": 0.8178041, + "learning_rate": 2.9694760320667093e-06, + "loss": 0.82823515, + "num_input_tokens_seen": 416096128, + "router_z_loss_mlp": 0.39624023, + "step": 5023, + "time_per_iteration": 2.612053394317627 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042898, + "balance_loss_mlp": 1.00322497, + "epoch": 0.96652558676414, + "flos": 501878984448.0, + "grad_norm": 0.037833370314126924, + "language_loss": 0.85891867, + "learning_rate": 2.9356692790444283e-06, + "loss": 0.86934769, + "num_input_tokens_seen": 416164256, + "router_z_loss_mlp": 0.39648438, + "step": 5024, + "time_per_iteration": 2.6159794330596924 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042513, + "balance_loss_mlp": 1.0028168, + "epoch": 0.9667179684494036, + "flos": 425744484864.0, + "grad_norm": 0.0421263527174388, + "language_loss": 0.83179516, + "learning_rate": 2.9020555035097484e-06, + "loss": 0.84222031, + "num_input_tokens_seen": 416227296, + "router_z_loss_mlp": 0.39672852, + "step": 5025, + "time_per_iteration": 2.525249481201172 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045691, + "balance_loss_mlp": 1.00604248, + "epoch": 0.9669103501346672, + "flos": 518010494208.0, + "grad_norm": 0.03417691570925141, + "language_loss": 0.86086416, + "learning_rate": 2.8686347185127305e-06, + "loss": 0.87132108, + "num_input_tokens_seen": 416297184, + "router_z_loss_mlp": 0.39624023, + "step": 5026, + "time_per_iteration": 2.687157392501831 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045777, + "balance_loss_mlp": 1.00610387, + "epoch": 0.9671027318199308, + "flos": 457176382464.0, + "grad_norm": 0.04850651262651699, + "language_loss": 0.76400167, + "learning_rate": 2.8354069370284396e-06, + "loss": 0.77445948, + "num_input_tokens_seen": 416363056, + "router_z_loss_mlp": 0.39648438, + "step": 5027, + "time_per_iteration": 2.6646482944488525 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01041674, + "balance_loss_mlp": 1.0019294, + "epoch": 0.9672951135051943, + "flos": 526062153984.0, + "grad_norm": 0.03508971060006407, + "language_loss": 0.80259544, + "learning_rate": 2.802372171957057e-06, + "loss": 0.81301212, + "num_input_tokens_seen": 416430688, + "router_z_loss_mlp": 0.3972168, + "step": 5028, + "time_per_iteration": 2.6135594844818115 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042172, + "balance_loss_mlp": 1.00237989, + "epoch": 0.9674874951904578, + "flos": 575102134272.0, + "grad_norm": 0.03464370697231736, + "language_loss": 0.80052054, + "learning_rate": 2.7695304361237682e-06, + "loss": 0.81094229, + "num_input_tokens_seen": 416505248, + "router_z_loss_mlp": 0.39770508, + "step": 5029, + "time_per_iteration": 2.7764198780059814 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01041843, + "balance_loss_mlp": 1.00205135, + "epoch": 0.9676798768757214, + "flos": 630424605696.0, + "grad_norm": 0.03130352473880452, + "language_loss": 0.80490315, + "learning_rate": 2.7368817422789848e-06, + "loss": 0.81532162, + "num_input_tokens_seen": 416592640, + "router_z_loss_mlp": 0.39770508, + "step": 5030, + "time_per_iteration": 2.929133653640747 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01041645, + "balance_loss_mlp": 1.00283051, + "epoch": 0.967872258560985, + "flos": 1467117687552.0, + "grad_norm": 0.00274412494684576, + "language_loss": 0.75563359, + "learning_rate": 2.7044261030979566e-06, + "loss": 0.76605004, + "num_input_tokens_seen": 416808560, + "router_z_loss_mlp": 0.38769531, + "step": 5031, + "time_per_iteration": 4.69275164604187 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043093, + "balance_loss_mlp": 1.00342023, + "epoch": 0.9680646402462486, + "flos": 566568328704.0, + "grad_norm": 0.040289351431982104, + "language_loss": 0.79295713, + "learning_rate": 2.672163531181049e-06, + "loss": 0.80338806, + "num_input_tokens_seen": 416878208, + "router_z_loss_mlp": 0.39648438, + "step": 5032, + "time_per_iteration": 2.678314685821533 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043705, + "balance_loss_mlp": 1.00508118, + "epoch": 0.9682570219315121, + "flos": 1437650344704.0, + "grad_norm": 0.004353892250927695, + "language_loss": 0.78074801, + "learning_rate": 2.6400940390537976e-06, + "loss": 0.79118514, + "num_input_tokens_seen": 417105968, + "router_z_loss_mlp": 0.38574219, + "step": 5033, + "time_per_iteration": 4.812420606613159 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01041585, + "balance_loss_mlp": 1.00186443, + "epoch": 0.9684494036167757, + "flos": 585704507136.0, + "grad_norm": 0.03929500214727362, + "language_loss": 0.82216263, + "learning_rate": 2.608217639166688e-06, + "loss": 0.83257854, + "num_input_tokens_seen": 417175168, + "router_z_loss_mlp": 0.39697266, + "step": 5034, + "time_per_iteration": 2.7576022148132324 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01041805, + "balance_loss_mlp": 1.00206041, + "epoch": 0.9686417853020393, + "flos": 560190573312.0, + "grad_norm": 0.03269322965722924, + "language_loss": 0.84307742, + "learning_rate": 2.5765343438950982e-06, + "loss": 0.85349548, + "num_input_tokens_seen": 417247760, + "router_z_loss_mlp": 0.3972168, + "step": 5035, + "time_per_iteration": 2.6869993209838867 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042898, + "balance_loss_mlp": 1.00327253, + "epoch": 0.9688341669873028, + "flos": 786264023040.0, + "grad_norm": 0.03680667246769885, + "language_loss": 0.83578247, + "learning_rate": 2.545044165539745e-06, + "loss": 0.84621143, + "num_input_tokens_seen": 417324080, + "router_z_loss_mlp": 0.39599609, + "step": 5036, + "time_per_iteration": 3.0497498512268066 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042825, + "balance_loss_mlp": 1.00312877, + "epoch": 0.9690265486725663, + "flos": 396769981440.0, + "grad_norm": 0.0383741827440454, + "language_loss": 0.80366373, + "learning_rate": 2.513747116326126e-06, + "loss": 0.81409198, + "num_input_tokens_seen": 417386416, + "router_z_loss_mlp": 0.39672852, + "step": 5037, + "time_per_iteration": 2.4929020404815674 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042444, + "balance_loss_mlp": 1.00284219, + "epoch": 0.9692189303578299, + "flos": 477417803520.0, + "grad_norm": 0.04163535065667303, + "language_loss": 0.77594548, + "learning_rate": 2.4826432084048002e-06, + "loss": 0.78636992, + "num_input_tokens_seen": 417459648, + "router_z_loss_mlp": 0.39575195, + "step": 5038, + "time_per_iteration": 2.7530930042266846 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042484, + "balance_loss_mlp": 1.00285935, + "epoch": 0.9694113120430935, + "flos": 598688451840.0, + "grad_norm": 0.04027422764598806, + "language_loss": 0.79350811, + "learning_rate": 2.451732453851385e-06, + "loss": 0.80393296, + "num_input_tokens_seen": 417530512, + "router_z_loss_mlp": 0.39599609, + "step": 5039, + "time_per_iteration": 2.7393152713775635 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042777, + "balance_loss_mlp": 1.00305641, + "epoch": 0.9696036937283571, + "flos": 501898426368.0, + "grad_norm": 0.03255232311351302, + "language_loss": 0.83089191, + "learning_rate": 2.4210148646665598e-06, + "loss": 0.84131968, + "num_input_tokens_seen": 417597600, + "router_z_loss_mlp": 0.39697266, + "step": 5040, + "time_per_iteration": 2.6828835010528564 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043086, + "balance_loss_mlp": 1.003389, + "epoch": 0.9697960754136207, + "flos": 433190544384.0, + "grad_norm": 0.04018403496708726, + "language_loss": 0.8756386, + "learning_rate": 2.3904904527758952e-06, + "loss": 0.88606954, + "num_input_tokens_seen": 417659616, + "router_z_loss_mlp": 0.39672852, + "step": 5041, + "time_per_iteration": 2.4938342571258545 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043026, + "balance_loss_mlp": 1.00332952, + "epoch": 0.9699884570988841, + "flos": 569675064576.0, + "grad_norm": 0.0340373321768144, + "language_loss": 0.85916209, + "learning_rate": 2.3601592300300235e-06, + "loss": 0.86959231, + "num_input_tokens_seen": 417730896, + "router_z_loss_mlp": 0.39672852, + "step": 5042, + "time_per_iteration": 2.7059319019317627 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01041961, + "balance_loss_mlp": 1.00219274, + "epoch": 0.9701808387841477, + "flos": 517237698048.0, + "grad_norm": 0.03863133055938838, + "language_loss": 0.81873441, + "learning_rate": 2.33002120820458e-06, + "loss": 0.82915401, + "num_input_tokens_seen": 417803296, + "router_z_loss_mlp": 0.39746094, + "step": 5043, + "time_per_iteration": 4.20774507522583 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042879, + "balance_loss_mlp": 1.00325406, + "epoch": 0.9703732204694113, + "flos": 492498505728.0, + "grad_norm": 0.04293843443964225, + "language_loss": 0.76847333, + "learning_rate": 2.300076399000206e-06, + "loss": 0.77890217, + "num_input_tokens_seen": 417870208, + "router_z_loss_mlp": 0.39599609, + "step": 5044, + "time_per_iteration": 2.586209297180176 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043006, + "balance_loss_mlp": 1.00338101, + "epoch": 0.9705656021546749, + "flos": 627280931328.0, + "grad_norm": 0.03978912283300949, + "language_loss": 0.80898952, + "learning_rate": 2.2703248140424348e-06, + "loss": 0.81941956, + "num_input_tokens_seen": 417944464, + "router_z_loss_mlp": 0.39599609, + "step": 5045, + "time_per_iteration": 2.7928311824798584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042537, + "balance_loss_mlp": 1.00291181, + "epoch": 0.9707579838399384, + "flos": 472394145024.0, + "grad_norm": 0.03662115624180283, + "language_loss": 0.82899636, + "learning_rate": 2.2407664648819715e-06, + "loss": 0.83942175, + "num_input_tokens_seen": 418010480, + "router_z_loss_mlp": 0.39599609, + "step": 5046, + "time_per_iteration": 2.601562261581421 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042586, + "balance_loss_mlp": 1.00296044, + "epoch": 0.970950365525202, + "flos": 493139099136.0, + "grad_norm": 0.043534433084671205, + "language_loss": 0.80746675, + "learning_rate": 2.2114013629942475e-06, + "loss": 0.81789255, + "num_input_tokens_seen": 418083952, + "router_z_loss_mlp": 0.39599609, + "step": 5047, + "time_per_iteration": 2.7135002613067627 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042707, + "balance_loss_mlp": 1.00305772, + "epoch": 0.9711427472104656, + "flos": 558377663232.0, + "grad_norm": 0.038890765592642804, + "language_loss": 0.80750018, + "learning_rate": 2.1822295197799213e-06, + "loss": 0.81792724, + "num_input_tokens_seen": 418156672, + "router_z_loss_mlp": 0.39624023, + "step": 5048, + "time_per_iteration": 2.7204275131225586 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042925, + "balance_loss_mlp": 1.00332332, + "epoch": 0.9713351288957291, + "flos": 627101096448.0, + "grad_norm": 0.03085379211030374, + "language_loss": 0.8402819, + "learning_rate": 2.153250946564489e-06, + "loss": 0.85071111, + "num_input_tokens_seen": 418242160, + "router_z_loss_mlp": 0.39575195, + "step": 5049, + "time_per_iteration": 2.926865816116333 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01041862, + "balance_loss_mlp": 1.00214159, + "epoch": 0.9715275105809927, + "flos": 500083570944.0, + "grad_norm": 0.03585393995796312, + "language_loss": 0.81363153, + "learning_rate": 2.1244656545983397e-06, + "loss": 0.82405019, + "num_input_tokens_seen": 418316960, + "router_z_loss_mlp": 0.39697266, + "step": 5050, + "time_per_iteration": 2.7249972820281982 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01041706, + "balance_loss_mlp": 1.00198603, + "epoch": 0.9717198922662562, + "flos": 478481250048.0, + "grad_norm": 0.040701458255183315, + "language_loss": 0.77633119, + "learning_rate": 2.0958736550570345e-06, + "loss": 0.78674829, + "num_input_tokens_seen": 418383888, + "router_z_loss_mlp": 0.39697266, + "step": 5051, + "time_per_iteration": 2.5472941398620605 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01041661, + "balance_loss_mlp": 1.00196385, + "epoch": 0.9719122739515198, + "flos": 554550620928.0, + "grad_norm": 0.035991264794850895, + "language_loss": 0.78920466, + "learning_rate": 2.067474959040916e-06, + "loss": 0.79962128, + "num_input_tokens_seen": 418453776, + "router_z_loss_mlp": 0.39672852, + "step": 5052, + "time_per_iteration": 2.683638572692871 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042764, + "balance_loss_mlp": 1.00313902, + "epoch": 0.9721046556367834, + "flos": 566930910720.0, + "grad_norm": 0.03574138021311205, + "language_loss": 0.80556166, + "learning_rate": 2.0392695775753312e-06, + "loss": 0.81598926, + "num_input_tokens_seen": 418521984, + "router_z_loss_mlp": 0.39599609, + "step": 5053, + "time_per_iteration": 2.6971213817596436 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042967, + "balance_loss_mlp": 1.00329447, + "epoch": 0.972297037322047, + "flos": 561401773824.0, + "grad_norm": 0.04211125372208404, + "language_loss": 0.78557342, + "learning_rate": 2.0112575216105766e-06, + "loss": 0.7960031, + "num_input_tokens_seen": 418598768, + "router_z_loss_mlp": 0.39648438, + "step": 5054, + "time_per_iteration": 2.739396810531616 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042582, + "balance_loss_mlp": 1.00290966, + "epoch": 0.9724894190073105, + "flos": 513503974656.0, + "grad_norm": 0.03993140177401839, + "language_loss": 0.79701972, + "learning_rate": 1.9834388020218974e-06, + "loss": 0.80744553, + "num_input_tokens_seen": 418670064, + "router_z_loss_mlp": 0.39648438, + "step": 5055, + "time_per_iteration": 2.6858069896698 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042637, + "balance_loss_mlp": 1.00298762, + "epoch": 0.972681800692574, + "flos": 615039647232.0, + "grad_norm": 0.04326090214420337, + "language_loss": 0.80950338, + "learning_rate": 1.9558134296094875e-06, + "loss": 0.81992978, + "num_input_tokens_seen": 418745216, + "router_z_loss_mlp": 0.39624023, + "step": 5056, + "time_per_iteration": 2.7890920639038086 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042752, + "balance_loss_mlp": 1.00307882, + "epoch": 0.9728741823778376, + "flos": 835314696960.0, + "grad_norm": 0.03650842968056953, + "language_loss": 0.84358609, + "learning_rate": 1.92838141509849e-06, + "loss": 0.85401356, + "num_input_tokens_seen": 418824224, + "router_z_loss_mlp": 0.39648438, + "step": 5057, + "time_per_iteration": 3.0628442764282227 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042465, + "balance_loss_mlp": 1.00272107, + "epoch": 0.9730665640631012, + "flos": 572588359680.0, + "grad_norm": 0.03702571878539965, + "language_loss": 0.84975529, + "learning_rate": 1.9011427691389415e-06, + "loss": 0.86018002, + "num_input_tokens_seen": 418899712, + "router_z_loss_mlp": 0.3972168, + "step": 5058, + "time_per_iteration": 2.737433671951294 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042986, + "balance_loss_mlp": 1.00340819, + "epoch": 0.9732589457483648, + "flos": 507520882176.0, + "grad_norm": 0.04068245546589233, + "language_loss": 0.77703661, + "learning_rate": 1.8740975023057715e-06, + "loss": 0.78746647, + "num_input_tokens_seen": 418964912, + "router_z_loss_mlp": 0.39550781, + "step": 5059, + "time_per_iteration": 2.584334373474121 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043043, + "balance_loss_mlp": 1.00346506, + "epoch": 0.9734513274336283, + "flos": 928483759872.0, + "grad_norm": 0.03701632354193653, + "language_loss": 0.80842561, + "learning_rate": 1.84724562509897e-06, + "loss": 0.818856, + "num_input_tokens_seen": 419040032, + "router_z_loss_mlp": 0.39550781, + "step": 5060, + "time_per_iteration": 3.1218395233154297 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042819, + "balance_loss_mlp": 1.00314617, + "epoch": 0.9736437091188919, + "flos": 492926216448.0, + "grad_norm": 0.03106926650886392, + "language_loss": 0.78421533, + "learning_rate": 1.8205871479433089e-06, + "loss": 0.79464358, + "num_input_tokens_seen": 419112672, + "router_z_loss_mlp": 0.39648438, + "step": 5061, + "time_per_iteration": 2.724682092666626 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042942, + "balance_loss_mlp": 1.003245, + "epoch": 0.9738360908041555, + "flos": 614455434240.0, + "grad_norm": 0.03875354346160162, + "language_loss": 0.8378309, + "learning_rate": 1.7941220811885096e-06, + "loss": 0.84826028, + "num_input_tokens_seen": 419183408, + "router_z_loss_mlp": 0.39672852, + "step": 5062, + "time_per_iteration": 2.7275002002716064 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044144, + "balance_loss_mlp": 1.0054245, + "epoch": 0.974028472489419, + "flos": 1552733766144.0, + "grad_norm": 0.004368152650976507, + "language_loss": 0.75992095, + "learning_rate": 1.7678504351092972e-06, + "loss": 0.77036238, + "num_input_tokens_seen": 419415472, + "router_z_loss_mlp": 0.38671875, + "step": 5063, + "time_per_iteration": 4.944604873657227 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045021, + "balance_loss_mlp": 1.00630188, + "epoch": 0.9742208541746825, + "flos": 1414180678656.0, + "grad_norm": 0.007059570521795609, + "language_loss": 0.79677713, + "learning_rate": 1.7417722199051245e-06, + "loss": 0.80722737, + "num_input_tokens_seen": 419651840, + "router_z_loss_mlp": 0.38671875, + "step": 5064, + "time_per_iteration": 4.950110197067261 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01041149, + "balance_loss_mlp": 1.00147641, + "epoch": 0.9744132358599461, + "flos": 676099280640.0, + "grad_norm": 0.03034439726817451, + "language_loss": 0.77150154, + "learning_rate": 1.7158874457005592e-06, + "loss": 0.78191304, + "num_input_tokens_seen": 419729424, + "router_z_loss_mlp": 0.39648438, + "step": 5065, + "time_per_iteration": 2.856956958770752 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01041453, + "balance_loss_mlp": 1.00168526, + "epoch": 0.9746056175452097, + "flos": 599598308352.0, + "grad_norm": 0.034912593112501174, + "language_loss": 0.78129935, + "learning_rate": 1.690196122544896e-06, + "loss": 0.79171389, + "num_input_tokens_seen": 419803616, + "router_z_loss_mlp": 0.39746094, + "step": 5066, + "time_per_iteration": 2.803445816040039 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01041293, + "balance_loss_mlp": 1.00152469, + "epoch": 0.9747979992304733, + "flos": 733534060800.0, + "grad_norm": 0.03597891818923648, + "language_loss": 0.82944268, + "learning_rate": 1.6646982604123784e-06, + "loss": 0.83985561, + "num_input_tokens_seen": 419883536, + "router_z_loss_mlp": 0.39746094, + "step": 5067, + "time_per_iteration": 2.9895918369293213 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01041402, + "balance_loss_mlp": 1.00168097, + "epoch": 0.9749903809157369, + "flos": 617620495872.0, + "grad_norm": 0.04145388586739779, + "language_loss": 0.76952046, + "learning_rate": 1.6393938692022548e-06, + "loss": 0.77993447, + "num_input_tokens_seen": 419956816, + "router_z_loss_mlp": 0.39697266, + "step": 5068, + "time_per_iteration": 2.743577480316162 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01041465, + "balance_loss_mlp": 1.00169683, + "epoch": 0.9751827626010003, + "flos": 469350592512.0, + "grad_norm": 0.035842707054568955, + "language_loss": 0.84308249, + "learning_rate": 1.6142829587384443e-06, + "loss": 0.85349715, + "num_input_tokens_seen": 420022096, + "router_z_loss_mlp": 0.39746094, + "step": 5069, + "time_per_iteration": 2.551919460296631 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042838, + "balance_loss_mlp": 1.00311708, + "epoch": 0.9753751442862639, + "flos": 600408043008.0, + "grad_norm": 0.03945480801973415, + "language_loss": 0.85697186, + "learning_rate": 1.5893655387698713e-06, + "loss": 0.86740017, + "num_input_tokens_seen": 420097008, + "router_z_loss_mlp": 0.39697266, + "step": 5070, + "time_per_iteration": 2.8267908096313477 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104272, + "balance_loss_mlp": 1.00299966, + "epoch": 0.9755675259715275, + "flos": 652092055296.0, + "grad_norm": 0.03287905354200328, + "language_loss": 0.82673955, + "learning_rate": 1.5646416189704637e-06, + "loss": 0.83716673, + "num_input_tokens_seen": 420174960, + "router_z_loss_mlp": 0.39697266, + "step": 5071, + "time_per_iteration": 2.898956537246704 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042984, + "balance_loss_mlp": 1.00323999, + "epoch": 0.9757599076567911, + "flos": 564725283072.0, + "grad_norm": 0.039048917165716984, + "language_loss": 0.79786921, + "learning_rate": 1.5401112089387659e-06, + "loss": 0.80829906, + "num_input_tokens_seen": 420245248, + "router_z_loss_mlp": 0.3972168, + "step": 5072, + "time_per_iteration": 2.687793016433716 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042691, + "balance_loss_mlp": 1.00301838, + "epoch": 0.9759522893420547, + "flos": 505649646336.0, + "grad_norm": 0.036489369587936984, + "language_loss": 0.80441165, + "learning_rate": 1.5157743181983819e-06, + "loss": 0.81483853, + "num_input_tokens_seen": 420310688, + "router_z_loss_mlp": 0.39648438, + "step": 5073, + "time_per_iteration": 2.646900177001953 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045537, + "balance_loss_mlp": 1.00591159, + "epoch": 0.9761446710273182, + "flos": 584838392064.0, + "grad_norm": 0.039814826111222036, + "language_loss": 0.82315922, + "learning_rate": 1.4916309561976982e-06, + "loss": 0.83361453, + "num_input_tokens_seen": 420379008, + "router_z_loss_mlp": 0.39599609, + "step": 5074, + "time_per_iteration": 2.69627046585083 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045692, + "balance_loss_mlp": 1.00604343, + "epoch": 0.9763370527125818, + "flos": 483172462080.0, + "grad_norm": 0.04765850391618214, + "language_loss": 0.82408017, + "learning_rate": 1.4676811323099947e-06, + "loss": 0.83453709, + "num_input_tokens_seen": 420445504, + "router_z_loss_mlp": 0.39624023, + "step": 5075, + "time_per_iteration": 2.619927406311035 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045897, + "balance_loss_mlp": 1.00617659, + "epoch": 0.9765294343978453, + "flos": 620114828544.0, + "grad_norm": 0.04096277036826416, + "language_loss": 0.7898621, + "learning_rate": 1.4439248558335561e-06, + "loss": 0.80032104, + "num_input_tokens_seen": 420520528, + "router_z_loss_mlp": 0.39697266, + "step": 5076, + "time_per_iteration": 2.72031569480896 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042664, + "balance_loss_mlp": 1.002967, + "epoch": 0.9767218160831089, + "flos": 527588304384.0, + "grad_norm": 0.038563950916879496, + "language_loss": 0.85749316, + "learning_rate": 1.4203621359911712e-06, + "loss": 0.86791986, + "num_input_tokens_seen": 420586224, + "router_z_loss_mlp": 0.39672852, + "step": 5077, + "time_per_iteration": 2.5702054500579834 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042493, + "balance_loss_mlp": 1.0028677, + "epoch": 0.9769141977683724, + "flos": 526246846464.0, + "grad_norm": 0.0452921679578959, + "language_loss": 0.84605044, + "learning_rate": 1.3969929819308557e-06, + "loss": 0.85647535, + "num_input_tokens_seen": 420655456, + "router_z_loss_mlp": 0.39599609, + "step": 5078, + "time_per_iteration": 2.7002179622650146 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042603, + "balance_loss_mlp": 1.00295401, + "epoch": 0.977106579453636, + "flos": 458644207104.0, + "grad_norm": 0.036677586674253186, + "language_loss": 0.81371784, + "learning_rate": 1.3738174027252416e-06, + "loss": 0.82414383, + "num_input_tokens_seen": 420733216, + "router_z_loss_mlp": 0.39624023, + "step": 5079, + "time_per_iteration": 2.796189308166504 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104218, + "balance_loss_mlp": 1.00250697, + "epoch": 0.9772989611388996, + "flos": 533134937856.0, + "grad_norm": 0.03711615810210664, + "language_loss": 0.81923473, + "learning_rate": 1.3508354073719642e-06, + "loss": 0.82965648, + "num_input_tokens_seen": 420803376, + "router_z_loss_mlp": 0.39648438, + "step": 5080, + "time_per_iteration": 2.614269495010376 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043161, + "balance_loss_mlp": 1.0034405, + "epoch": 0.9774913428241632, + "flos": 756755851008.0, + "grad_norm": 0.03628191500870131, + "language_loss": 0.86773902, + "learning_rate": 1.3280470047933313e-06, + "loss": 0.87817061, + "num_input_tokens_seen": 420886256, + "router_z_loss_mlp": 0.39697266, + "step": 5081, + "time_per_iteration": 3.000998020172119 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043983, + "balance_loss_mlp": 1.00526428, + "epoch": 0.9776837245094268, + "flos": 1557670908672.0, + "grad_norm": 0.004261390789563241, + "language_loss": 0.78895497, + "learning_rate": 1.3054522038366544e-06, + "loss": 0.79939473, + "num_input_tokens_seen": 421123728, + "router_z_loss_mlp": 0.38671875, + "step": 5082, + "time_per_iteration": 4.966464281082153 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042762, + "balance_loss_mlp": 1.00311267, + "epoch": 0.9778761061946902, + "flos": 593634657792.0, + "grad_norm": 0.06033698096536924, + "language_loss": 0.84689307, + "learning_rate": 1.2830510132739725e-06, + "loss": 0.85732073, + "num_input_tokens_seen": 421192576, + "router_z_loss_mlp": 0.39624023, + "step": 5083, + "time_per_iteration": 2.6818041801452637 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01041783, + "balance_loss_mlp": 1.00203848, + "epoch": 0.9780684878799538, + "flos": 415832282880.0, + "grad_norm": 0.03521694229943953, + "language_loss": 0.82329738, + "learning_rate": 1.2608434418022175e-06, + "loss": 0.8337152, + "num_input_tokens_seen": 421256272, + "router_z_loss_mlp": 0.3972168, + "step": 5084, + "time_per_iteration": 2.487070322036743 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01041655, + "balance_loss_mlp": 1.00188649, + "epoch": 0.9782608695652174, + "flos": 569544807168.0, + "grad_norm": 0.0361861830744165, + "language_loss": 0.85351056, + "learning_rate": 1.2388294980431036e-06, + "loss": 0.86392707, + "num_input_tokens_seen": 421332880, + "router_z_loss_mlp": 0.39746094, + "step": 5085, + "time_per_iteration": 2.7171080112457275 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01041786, + "balance_loss_mlp": 1.00204134, + "epoch": 0.978453251250481, + "flos": 691762250496.0, + "grad_norm": 0.03997358667222654, + "language_loss": 0.8329246, + "learning_rate": 1.217009190543239e-06, + "loss": 0.84334242, + "num_input_tokens_seen": 421406160, + "router_z_loss_mlp": 0.3972168, + "step": 5086, + "time_per_iteration": 2.908737897872925 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01041417, + "balance_loss_mlp": 1.00167274, + "epoch": 0.9786456329357445, + "flos": 503572330752.0, + "grad_norm": 0.03505329178306916, + "language_loss": 0.77599311, + "learning_rate": 1.1953825277740694e-06, + "loss": 0.78640735, + "num_input_tokens_seen": 421476208, + "router_z_loss_mlp": 0.3972168, + "step": 5087, + "time_per_iteration": 2.654874324798584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042745, + "balance_loss_mlp": 1.00302482, + "epoch": 0.9788380146210081, + "flos": 864606095616.0, + "grad_norm": 0.04385354036304431, + "language_loss": 0.8105306, + "learning_rate": 1.1739495181317117e-06, + "loss": 0.82095802, + "num_input_tokens_seen": 421549232, + "router_z_loss_mlp": 0.39697266, + "step": 5088, + "time_per_iteration": 3.037149667739868 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042665, + "balance_loss_mlp": 1.0030396, + "epoch": 0.9790303963062716, + "flos": 513746992896.0, + "grad_norm": 0.03583893943421543, + "language_loss": 0.84371638, + "learning_rate": 1.1527101699371767e-06, + "loss": 0.85414302, + "num_input_tokens_seen": 421617056, + "router_z_loss_mlp": 0.39599609, + "step": 5089, + "time_per_iteration": 2.5842764377593994 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043218, + "balance_loss_mlp": 1.00349736, + "epoch": 0.9792227779915352, + "flos": 495411800832.0, + "grad_norm": 0.04233077980787355, + "language_loss": 0.86789143, + "learning_rate": 1.1316644914364237e-06, + "loss": 0.87832361, + "num_input_tokens_seen": 421683424, + "router_z_loss_mlp": 0.39697266, + "step": 5090, + "time_per_iteration": 2.5548324584960938 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010429, + "balance_loss_mlp": 1.00322676, + "epoch": 0.9794151596767988, + "flos": 609484265472.0, + "grad_norm": 0.03604199213932351, + "language_loss": 0.81851107, + "learning_rate": 1.1108124908000838e-06, + "loss": 0.82894003, + "num_input_tokens_seen": 421761200, + "router_z_loss_mlp": 0.39648438, + "step": 5091, + "time_per_iteration": 2.7624361515045166 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042753, + "balance_loss_mlp": 1.00305605, + "epoch": 0.9796075413620623, + "flos": 479197665792.0, + "grad_norm": 0.03750770403802879, + "language_loss": 0.8721531, + "learning_rate": 1.09015417612357e-06, + "loss": 0.88258064, + "num_input_tokens_seen": 421829600, + "router_z_loss_mlp": 0.39672852, + "step": 5092, + "time_per_iteration": 2.596569299697876 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042386, + "balance_loss_mlp": 1.00278485, + "epoch": 0.9797999230473259, + "flos": 593363449344.0, + "grad_norm": 0.03750271931428907, + "language_loss": 0.84715217, + "learning_rate": 1.0696895554271335e-06, + "loss": 0.85757607, + "num_input_tokens_seen": 421904928, + "router_z_loss_mlp": 0.39575195, + "step": 5093, + "time_per_iteration": 2.798842191696167 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042858, + "balance_loss_mlp": 1.00313711, + "epoch": 0.9799923047325895, + "flos": 557564037888.0, + "grad_norm": 0.03732665936574346, + "language_loss": 0.82240361, + "learning_rate": 1.049418636655919e-06, + "loss": 0.83283222, + "num_input_tokens_seen": 421989616, + "router_z_loss_mlp": 0.39697266, + "step": 5094, + "time_per_iteration": 2.9326531887054443 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01041809, + "balance_loss_mlp": 1.00196946, + "epoch": 0.9801846864178531, + "flos": 580629325824.0, + "grad_norm": 0.03472184065520672, + "language_loss": 0.84838355, + "learning_rate": 1.0293414276797974e-06, + "loss": 0.85880166, + "num_input_tokens_seen": 422067088, + "router_z_loss_mlp": 0.39819336, + "step": 5095, + "time_per_iteration": 2.7500767707824707 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042666, + "balance_loss_mlp": 1.00296915, + "epoch": 0.9803770681031165, + "flos": 516211190016.0, + "grad_norm": 0.03395616690027392, + "language_loss": 0.80524683, + "learning_rate": 1.0094579362933677e-06, + "loss": 0.81567359, + "num_input_tokens_seen": 422141136, + "router_z_loss_mlp": 0.39672852, + "step": 5096, + "time_per_iteration": 2.670788288116455 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104258, + "balance_loss_mlp": 1.00285935, + "epoch": 0.9805694497883801, + "flos": 568120723968.0, + "grad_norm": 0.035035104097038514, + "language_loss": 0.78688991, + "learning_rate": 9.897681702160654e-07, + "loss": 0.79731572, + "num_input_tokens_seen": 422216400, + "router_z_loss_mlp": 0.39697266, + "step": 5097, + "time_per_iteration": 2.7245545387268066 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042876, + "balance_loss_mlp": 1.00313163, + "epoch": 0.9807618314736437, + "flos": 480333043968.0, + "grad_norm": 0.14407464636347864, + "language_loss": 0.74170625, + "learning_rate": 9.702721370922208e-07, + "loss": 0.75213504, + "num_input_tokens_seen": 422287664, + "router_z_loss_mlp": 0.3972168, + "step": 5098, + "time_per_iteration": 2.662116527557373 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043148, + "balance_loss_mlp": 1.00342762, + "epoch": 0.9809542131589073, + "flos": 546342458880.0, + "grad_norm": 0.04001499781359293, + "language_loss": 0.80586725, + "learning_rate": 9.509698444908344e-07, + "loss": 0.81629872, + "num_input_tokens_seen": 422357552, + "router_z_loss_mlp": 0.39697266, + "step": 5099, + "time_per_iteration": 2.621173143386841 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042571, + "balance_loss_mlp": 1.00299406, + "epoch": 0.9811465948441709, + "flos": 521863781376.0, + "grad_norm": 0.03872092892168167, + "language_loss": 0.80255032, + "learning_rate": 9.318612999057452e-07, + "loss": 0.81297612, + "num_input_tokens_seen": 422425872, + "router_z_loss_mlp": 0.39550781, + "step": 5100, + "time_per_iteration": 2.626246452331543 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042715, + "balance_loss_mlp": 1.00308967, + "epoch": 0.9813389765294344, + "flos": 542321975808.0, + "grad_norm": 0.03966927263008885, + "language_loss": 0.80838525, + "learning_rate": 9.129465107554635e-07, + "loss": 0.81881237, + "num_input_tokens_seen": 422495760, + "router_z_loss_mlp": 0.39599609, + "step": 5101, + "time_per_iteration": 2.6338257789611816 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042695, + "balance_loss_mlp": 1.00304604, + "epoch": 0.981531358214698, + "flos": 568465809408.0, + "grad_norm": 0.036339927688089545, + "language_loss": 0.84584022, + "learning_rate": 8.942254843834485e-07, + "loss": 0.85626721, + "num_input_tokens_seen": 422568112, + "router_z_loss_mlp": 0.39624023, + "step": 5102, + "time_per_iteration": 2.7215864658355713 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042808, + "balance_loss_mlp": 1.00325489, + "epoch": 0.9817237398999615, + "flos": 578414949888.0, + "grad_norm": 0.03608910956574523, + "language_loss": 0.81383669, + "learning_rate": 8.756982280578307e-07, + "loss": 0.82426471, + "num_input_tokens_seen": 422641280, + "router_z_loss_mlp": 0.39526367, + "step": 5103, + "time_per_iteration": 2.6962757110595703 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043131, + "balance_loss_mlp": 1.00336289, + "epoch": 0.9819161215852251, + "flos": 702855517440.0, + "grad_norm": 0.03526602315736388, + "language_loss": 0.8230114, + "learning_rate": 8.573647489714676e-07, + "loss": 0.83344281, + "num_input_tokens_seen": 422720416, + "router_z_loss_mlp": 0.39746094, + "step": 5104, + "time_per_iteration": 2.952726125717163 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045292, + "balance_loss_mlp": 1.00552344, + "epoch": 0.9821085032704886, + "flos": 625453436928.0, + "grad_norm": 0.03571049923585047, + "language_loss": 0.84285426, + "learning_rate": 8.392250542421653e-07, + "loss": 0.85330725, + "num_input_tokens_seen": 422800384, + "router_z_loss_mlp": 0.39746094, + "step": 5105, + "time_per_iteration": 2.919321060180664 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045434, + "balance_loss_mlp": 1.00573695, + "epoch": 0.9823008849557522, + "flos": 500493785088.0, + "grad_norm": 0.042150017218887774, + "language_loss": 0.81642014, + "learning_rate": 8.212791509122353e-07, + "loss": 0.82687449, + "num_input_tokens_seen": 422870768, + "router_z_loss_mlp": 0.39672852, + "step": 5106, + "time_per_iteration": 2.6857118606567383 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01041236, + "balance_loss_mlp": 1.00149202, + "epoch": 0.9824932666410158, + "flos": 524905388544.0, + "grad_norm": 0.04484543637804275, + "language_loss": 0.73315752, + "learning_rate": 8.035270459489929e-07, + "loss": 0.74356979, + "num_input_tokens_seen": 422942864, + "router_z_loss_mlp": 0.3972168, + "step": 5107, + "time_per_iteration": 2.685387372970581 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01041466, + "balance_loss_mlp": 1.00172162, + "epoch": 0.9826856483262794, + "flos": 503676343296.0, + "grad_norm": 0.035210892499995095, + "language_loss": 0.82785177, + "learning_rate": 7.859687462443698e-07, + "loss": 0.83826649, + "num_input_tokens_seen": 423013600, + "router_z_loss_mlp": 0.3972168, + "step": 5108, + "time_per_iteration": 2.648824453353882 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01041998, + "balance_loss_mlp": 1.00225329, + "epoch": 0.982878030011543, + "flos": 563214683904.0, + "grad_norm": 0.032324614510234884, + "language_loss": 0.84624445, + "learning_rate": 7.686042586151354e-07, + "loss": 0.85666442, + "num_input_tokens_seen": 423093680, + "router_z_loss_mlp": 0.3972168, + "step": 5109, + "time_per_iteration": 2.8252689838409424 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01041628, + "balance_loss_mlp": 1.00188339, + "epoch": 0.9830704116968064, + "flos": 538214976768.0, + "grad_norm": 0.032173031001019446, + "language_loss": 0.83402407, + "learning_rate": 7.514335898027857e-07, + "loss": 0.84444034, + "num_input_tokens_seen": 423168608, + "router_z_loss_mlp": 0.3972168, + "step": 5110, + "time_per_iteration": 2.7532317638397217 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042923, + "balance_loss_mlp": 1.00322616, + "epoch": 0.98326279338207, + "flos": 459903039744.0, + "grad_norm": 0.03572969803594758, + "language_loss": 0.84500074, + "learning_rate": 7.344567464735441e-07, + "loss": 0.85542995, + "num_input_tokens_seen": 423233552, + "router_z_loss_mlp": 0.39672852, + "step": 5111, + "time_per_iteration": 2.524393081665039 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042896, + "balance_loss_mlp": 1.00327051, + "epoch": 0.9834551750673336, + "flos": 642190546944.0, + "grad_norm": 0.032775358320962594, + "language_loss": 0.80079061, + "learning_rate": 7.17673735218416e-07, + "loss": 0.81121951, + "num_input_tokens_seen": 423307440, + "router_z_loss_mlp": 0.39599609, + "step": 5112, + "time_per_iteration": 2.821751594543457 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042849, + "balance_loss_mlp": 1.00320005, + "epoch": 0.9836475567525972, + "flos": 1073549717760.0, + "grad_norm": 0.03499581392563108, + "language_loss": 0.79561722, + "learning_rate": 7.010845625530782e-07, + "loss": 0.80604577, + "num_input_tokens_seen": 423394880, + "router_z_loss_mlp": 0.39624023, + "step": 5113, + "time_per_iteration": 3.4073426723480225 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042885, + "balance_loss_mlp": 1.00318861, + "epoch": 0.9838399384378607, + "flos": 566279623680.0, + "grad_norm": 0.04234860016230719, + "language_loss": 0.7609002, + "learning_rate": 6.846892349181566e-07, + "loss": 0.77132899, + "num_input_tokens_seen": 423461792, + "router_z_loss_mlp": 0.39672852, + "step": 5114, + "time_per_iteration": 2.6536595821380615 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104306, + "balance_loss_mlp": 1.00336301, + "epoch": 0.9840323201231242, + "flos": 774181186560.0, + "grad_norm": 0.05548408473987726, + "language_loss": 0.80118275, + "learning_rate": 6.684877586787819e-07, + "loss": 0.81161332, + "num_input_tokens_seen": 423539952, + "router_z_loss_mlp": 0.39672852, + "step": 5115, + "time_per_iteration": 2.966550350189209 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042594, + "balance_loss_mlp": 1.00294471, + "epoch": 0.9842247018083878, + "flos": 473249566464.0, + "grad_norm": 0.03527003893098496, + "language_loss": 0.85987931, + "learning_rate": 6.524801401249225e-07, + "loss": 0.87030524, + "num_input_tokens_seen": 423607184, + "router_z_loss_mlp": 0.39624023, + "step": 5116, + "time_per_iteration": 2.5599422454833984 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042514, + "balance_loss_mlp": 1.0028646, + "epoch": 0.9844170834936514, + "flos": 526311975168.0, + "grad_norm": 0.03473210805484262, + "language_loss": 0.85255396, + "learning_rate": 6.366663854713295e-07, + "loss": 0.86297911, + "num_input_tokens_seen": 423676528, + "router_z_loss_mlp": 0.39624023, + "step": 5117, + "time_per_iteration": 2.6238479614257812 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043762, + "balance_loss_mlp": 1.00504303, + "epoch": 0.984609465178915, + "flos": 1570626663168.0, + "grad_norm": 0.005544622435587091, + "language_loss": 0.77162516, + "learning_rate": 6.210465008574251e-07, + "loss": 0.78206277, + "num_input_tokens_seen": 423905856, + "router_z_loss_mlp": 0.38671875, + "step": 5118, + "time_per_iteration": 4.924822568893433 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104242, + "balance_loss_mlp": 1.0027473, + "epoch": 0.9848018468641785, + "flos": 520569955584.0, + "grad_norm": 0.04372972135981774, + "language_loss": 0.82049799, + "learning_rate": 6.056204923473584e-07, + "loss": 0.83092213, + "num_input_tokens_seen": 423972496, + "router_z_loss_mlp": 0.39648438, + "step": 5119, + "time_per_iteration": 2.580916404724121 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042523, + "balance_loss_mlp": 1.00287449, + "epoch": 0.9849942285494421, + "flos": 493987717632.0, + "grad_norm": 0.03481962027898756, + "language_loss": 0.83367181, + "learning_rate": 5.903883659301167e-07, + "loss": 0.84409702, + "num_input_tokens_seen": 424039968, + "router_z_loss_mlp": 0.39624023, + "step": 5120, + "time_per_iteration": 2.5750389099121094 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042951, + "balance_loss_mlp": 1.00320685, + "epoch": 0.9851866102347057, + "flos": 547050126336.0, + "grad_norm": 0.03798440780178793, + "language_loss": 0.8116461, + "learning_rate": 5.753501275193029e-07, + "loss": 0.82207561, + "num_input_tokens_seen": 424108096, + "router_z_loss_mlp": 0.3972168, + "step": 5121, + "time_per_iteration": 2.6916379928588867 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042599, + "balance_loss_mlp": 1.00290275, + "epoch": 0.9853789919199692, + "flos": 477215614464.0, + "grad_norm": 0.04012512347412675, + "language_loss": 0.80473411, + "learning_rate": 5.605057829531912e-07, + "loss": 0.81516004, + "num_input_tokens_seen": 424172256, + "router_z_loss_mlp": 0.39672852, + "step": 5122, + "time_per_iteration": 2.5235373973846436 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01041717, + "balance_loss_mlp": 1.00197303, + "epoch": 0.9855713736052328, + "flos": 1034309178624.0, + "grad_norm": 0.035879462257235414, + "language_loss": 0.7627933, + "learning_rate": 5.458553379950049e-07, + "loss": 0.77321047, + "num_input_tokens_seen": 424261088, + "router_z_loss_mlp": 0.3972168, + "step": 5123, + "time_per_iteration": 3.4110076427459717 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01041801, + "balance_loss_mlp": 1.00210428, + "epoch": 0.9857637552904963, + "flos": 496080584448.0, + "grad_norm": 0.03414998206245118, + "language_loss": 0.82715416, + "learning_rate": 5.31398798332472e-07, + "loss": 0.83757216, + "num_input_tokens_seen": 424329168, + "router_z_loss_mlp": 0.39672852, + "step": 5124, + "time_per_iteration": 2.608877658843994 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042334, + "balance_loss_mlp": 1.0025183, + "epoch": 0.9859561369757599, + "flos": 593382891264.0, + "grad_norm": 0.040829141908968115, + "language_loss": 0.84081221, + "learning_rate": 5.17136169578103e-07, + "loss": 0.85123551, + "num_input_tokens_seen": 424399392, + "router_z_loss_mlp": 0.39794922, + "step": 5125, + "time_per_iteration": 2.68752121925354 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042843, + "balance_loss_mlp": 1.00319362, + "epoch": 0.9861485186610235, + "flos": 487983237888.0, + "grad_norm": 0.03435380291834103, + "language_loss": 0.7900275, + "learning_rate": 5.030674572691907e-07, + "loss": 0.80045593, + "num_input_tokens_seen": 424470080, + "router_z_loss_mlp": 0.39624023, + "step": 5126, + "time_per_iteration": 2.6282498836517334 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042977, + "balance_loss_mlp": 1.00335145, + "epoch": 0.9863409003462871, + "flos": 519834097920.0, + "grad_norm": 0.03004792454778602, + "language_loss": 0.83327055, + "learning_rate": 4.891926668676994e-07, + "loss": 0.84370041, + "num_input_tokens_seen": 424541824, + "router_z_loss_mlp": 0.39599609, + "step": 5127, + "time_per_iteration": 2.6406099796295166 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044044, + "balance_loss_mlp": 1.00532532, + "epoch": 0.9865332820315506, + "flos": 1489297418496.0, + "grad_norm": 0.004350637395800546, + "language_loss": 0.79182732, + "learning_rate": 4.755118037602646e-07, + "loss": 0.80226779, + "num_input_tokens_seen": 424773408, + "router_z_loss_mlp": 0.38671875, + "step": 5128, + "time_per_iteration": 4.868277549743652 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104271, + "balance_loss_mlp": 1.00301361, + "epoch": 0.9867256637168141, + "flos": 583218922752.0, + "grad_norm": 0.03562084735893716, + "language_loss": 0.79497892, + "learning_rate": 4.620248732582488e-07, + "loss": 0.80540597, + "num_input_tokens_seen": 424840608, + "router_z_loss_mlp": 0.39672852, + "step": 5129, + "time_per_iteration": 2.710846185684204 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042514, + "balance_loss_mlp": 1.00284064, + "epoch": 0.9869180454020777, + "flos": 960927581184.0, + "grad_norm": 0.03494818497849394, + "language_loss": 0.86453211, + "learning_rate": 4.487318805977969e-07, + "loss": 0.87495726, + "num_input_tokens_seen": 424926128, + "router_z_loss_mlp": 0.39648438, + "step": 5130, + "time_per_iteration": 3.2606029510498047 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01041493, + "balance_loss_mlp": 1.00177276, + "epoch": 0.9871104270873413, + "flos": 772114564608.0, + "grad_norm": 0.03632052954178154, + "language_loss": 0.82985795, + "learning_rate": 4.3563283093966954e-07, + "loss": 0.8402729, + "num_input_tokens_seen": 425005744, + "router_z_loss_mlp": 0.39697266, + "step": 5131, + "time_per_iteration": 2.999943494796753 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104149, + "balance_loss_mlp": 1.00172222, + "epoch": 0.9873028087726049, + "flos": 447366247680.0, + "grad_norm": 0.04349858393955328, + "language_loss": 0.78963101, + "learning_rate": 4.2272772936940986e-07, + "loss": 0.80004597, + "num_input_tokens_seen": 425068112, + "router_z_loss_mlp": 0.39746094, + "step": 5132, + "time_per_iteration": 2.523855209350586 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01041372, + "balance_loss_mlp": 1.00167501, + "epoch": 0.9874951904578684, + "flos": 508628070144.0, + "grad_norm": 0.03437271489823481, + "language_loss": 0.86646765, + "learning_rate": 4.1001658089717676e-07, + "loss": 0.87688142, + "num_input_tokens_seen": 425137408, + "router_z_loss_mlp": 0.39672852, + "step": 5133, + "time_per_iteration": 2.6064534187316895 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01041756, + "balance_loss_mlp": 1.00203538, + "epoch": 0.987687572143132, + "flos": 718038286848.0, + "grad_norm": 0.0348410351371582, + "language_loss": 0.82776976, + "learning_rate": 3.9749939045791164e-07, + "loss": 0.83818728, + "num_input_tokens_seen": 425213504, + "router_z_loss_mlp": 0.39697266, + "step": 5134, + "time_per_iteration": 2.9822537899017334 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044029, + "balance_loss_mlp": 1.00531006, + "epoch": 0.9878799538283956, + "flos": 1541960306688.0, + "grad_norm": 0.004341111973416102, + "language_loss": 0.79817951, + "learning_rate": 3.851761629111716e-07, + "loss": 0.8086198, + "num_input_tokens_seen": 425451296, + "router_z_loss_mlp": 0.38671875, + "step": 5135, + "time_per_iteration": 4.959632396697998 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104268, + "balance_loss_mlp": 1.00305438, + "epoch": 0.9880723355136591, + "flos": 722738247168.0, + "grad_norm": 0.034114831969587756, + "language_loss": 0.81975973, + "learning_rate": 3.730469030412964e-07, + "loss": 0.83018649, + "num_input_tokens_seen": 425527536, + "router_z_loss_mlp": 0.39599609, + "step": 5136, + "time_per_iteration": 2.9661388397216797 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104288, + "balance_loss_mlp": 1.00325525, + "epoch": 0.9882647171989226, + "flos": 558414601728.0, + "grad_norm": 0.03182489959005223, + "language_loss": 0.84676516, + "learning_rate": 3.611116155572969e-07, + "loss": 0.85719395, + "num_input_tokens_seen": 425596608, + "router_z_loss_mlp": 0.39599609, + "step": 5137, + "time_per_iteration": 2.6703665256500244 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01041844, + "balance_loss_mlp": 1.00207615, + "epoch": 0.9884570988841862, + "flos": 563941793280.0, + "grad_norm": 0.037188554451168025, + "language_loss": 0.80630195, + "learning_rate": 3.493703050927999e-07, + "loss": 0.81672037, + "num_input_tokens_seen": 425667280, + "router_z_loss_mlp": 0.39746094, + "step": 5138, + "time_per_iteration": 2.707731008529663 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01041706, + "balance_loss_mlp": 1.00196183, + "epoch": 0.9886494805694498, + "flos": 432669514752.0, + "grad_norm": 0.038119930885520625, + "language_loss": 0.86471844, + "learning_rate": 3.378229762062146e-07, + "loss": 0.87513542, + "num_input_tokens_seen": 425730736, + "router_z_loss_mlp": 0.3972168, + "step": 5139, + "time_per_iteration": 2.5015242099761963 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042765, + "balance_loss_mlp": 1.00309241, + "epoch": 0.9888418622547134, + "flos": 593241940224.0, + "grad_norm": 0.031962548376230554, + "language_loss": 0.90608424, + "learning_rate": 3.264696333806771e-07, + "loss": 0.91651189, + "num_input_tokens_seen": 425807616, + "router_z_loss_mlp": 0.39648438, + "step": 5140, + "time_per_iteration": 2.7830655574798584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042363, + "balance_loss_mlp": 1.0027616, + "epoch": 0.989034243939977, + "flos": 1136867468544.0, + "grad_norm": 0.03159325853883531, + "language_loss": 0.80423748, + "learning_rate": 3.1531028102388394e-07, + "loss": 0.81466115, + "num_input_tokens_seen": 425900880, + "router_z_loss_mlp": 0.39575195, + "step": 5141, + "time_per_iteration": 3.5227456092834473 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042941, + "balance_loss_mlp": 1.00324464, + "epoch": 0.9892266256252404, + "flos": 567731897088.0, + "grad_norm": 0.03916610087942592, + "language_loss": 0.82117474, + "learning_rate": 3.0434492346825824e-07, + "loss": 0.83160412, + "num_input_tokens_seen": 425973632, + "router_z_loss_mlp": 0.39672852, + "step": 5142, + "time_per_iteration": 2.6800377368927 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01041551, + "balance_loss_mlp": 1.00185382, + "epoch": 0.989419007310504, + "flos": 641871706368.0, + "grad_norm": 0.03635913094447108, + "language_loss": 0.84309208, + "learning_rate": 2.9357356497095033e-07, + "loss": 0.85350764, + "num_input_tokens_seen": 426057088, + "router_z_loss_mlp": 0.39672852, + "step": 5143, + "time_per_iteration": 2.9835867881774902 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042368, + "balance_loss_mlp": 1.00276697, + "epoch": 0.9896113889957676, + "flos": 456449273088.0, + "grad_norm": 0.04157361104521907, + "language_loss": 0.81831914, + "learning_rate": 2.829962097138372e-07, + "loss": 0.82874286, + "num_input_tokens_seen": 426124336, + "router_z_loss_mlp": 0.39575195, + "step": 5144, + "time_per_iteration": 2.60195255279541 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042366, + "balance_loss_mlp": 1.00274062, + "epoch": 0.9898037706810312, + "flos": 568420122624.0, + "grad_norm": 0.03633944048268901, + "language_loss": 0.80669045, + "learning_rate": 2.726128618033008e-07, + "loss": 0.81711411, + "num_input_tokens_seen": 426191888, + "router_z_loss_mlp": 0.39599609, + "step": 5145, + "time_per_iteration": 2.6788082122802734 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043671, + "balance_loss_mlp": 1.00495148, + "epoch": 0.9899961523662947, + "flos": 1553450181888.0, + "grad_norm": 0.005562100769056325, + "language_loss": 0.78146422, + "learning_rate": 2.624235252706164e-07, + "loss": 0.79190093, + "num_input_tokens_seen": 426425840, + "router_z_loss_mlp": 0.38671875, + "step": 5146, + "time_per_iteration": 4.948495149612427 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042707, + "balance_loss_mlp": 1.00301051, + "epoch": 0.9901885340515583, + "flos": 611948462592.0, + "grad_norm": 0.034622699594423934, + "language_loss": 0.85446566, + "learning_rate": 2.524282040715642e-07, + "loss": 0.86489272, + "num_input_tokens_seen": 426506080, + "router_z_loss_mlp": 0.39672852, + "step": 5147, + "time_per_iteration": 2.8891594409942627 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104262, + "balance_loss_mlp": 1.00294733, + "epoch": 0.9903809157368219, + "flos": 518494585344.0, + "grad_norm": 0.033958592600161895, + "language_loss": 0.83530235, + "learning_rate": 2.426269020866512e-07, + "loss": 0.84572852, + "num_input_tokens_seen": 426573936, + "router_z_loss_mlp": 0.39648438, + "step": 5148, + "time_per_iteration": 2.590367317199707 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042805, + "balance_loss_mlp": 1.00310874, + "epoch": 0.9905732974220854, + "flos": 1102198577664.0, + "grad_norm": 0.034821676713935955, + "language_loss": 0.80957055, + "learning_rate": 2.3301962312122226e-07, + "loss": 0.81999862, + "num_input_tokens_seen": 426657472, + "router_z_loss_mlp": 0.39672852, + "step": 5149, + "time_per_iteration": 3.433225631713867 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042394, + "balance_loss_mlp": 1.00269747, + "epoch": 0.990765679107349, + "flos": 859493975808.0, + "grad_norm": 0.0403314088319205, + "language_loss": 0.84558642, + "learning_rate": 2.2360637090496073e-07, + "loss": 0.85601032, + "num_input_tokens_seen": 426740560, + "router_z_loss_mlp": 0.39672852, + "step": 5150, + "time_per_iteration": 3.1001992225646973 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104298, + "balance_loss_mlp": 1.00335491, + "epoch": 0.9909580607926125, + "flos": 492274929408.0, + "grad_norm": 0.038309376830719104, + "language_loss": 0.80395019, + "learning_rate": 2.143871490925542e-07, + "loss": 0.81438005, + "num_input_tokens_seen": 426809296, + "router_z_loss_mlp": 0.39599609, + "step": 5151, + "time_per_iteration": 2.5904712677001953 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104282, + "balance_loss_mlp": 1.00319481, + "epoch": 0.9911504424778761, + "flos": 586160408064.0, + "grad_norm": 0.03518242264067306, + "language_loss": 0.79798514, + "learning_rate": 2.0536196126319519e-07, + "loss": 0.80841333, + "num_input_tokens_seen": 426881056, + "router_z_loss_mlp": 0.39599609, + "step": 5152, + "time_per_iteration": 2.7454349994659424 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042561, + "balance_loss_mlp": 1.0028646, + "epoch": 0.9913428241631397, + "flos": 571101093120.0, + "grad_norm": 0.034987965927020324, + "language_loss": 0.81628764, + "learning_rate": 1.9653081092074753e-07, + "loss": 0.82671332, + "num_input_tokens_seen": 426949664, + "router_z_loss_mlp": 0.39672852, + "step": 5153, + "time_per_iteration": 2.6824519634246826 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045599, + "balance_loss_mlp": 1.00590229, + "epoch": 0.9915352058484033, + "flos": 490711840512.0, + "grad_norm": 0.034700171025390344, + "language_loss": 0.87125576, + "learning_rate": 1.8789370149374652e-07, + "loss": 0.88171172, + "num_input_tokens_seen": 427018816, + "router_z_loss_mlp": 0.39672852, + "step": 5154, + "time_per_iteration": 2.6056690216064453 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044254, + "balance_loss_mlp": 1.00443792, + "epoch": 0.9917275875336667, + "flos": 745410817536.0, + "grad_norm": 0.033269276709246544, + "language_loss": 0.83338165, + "learning_rate": 1.7945063633545423e-07, + "loss": 0.84382415, + "num_input_tokens_seen": 427097984, + "router_z_loss_mlp": 0.39794922, + "step": 5155, + "time_per_iteration": 2.9471802711486816 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01041928, + "balance_loss_mlp": 1.00215936, + "epoch": 0.9919199692189303, + "flos": 509325043968.0, + "grad_norm": 0.03681649434928698, + "language_loss": 0.80316067, + "learning_rate": 1.7120161872380412e-07, + "loss": 0.81357992, + "num_input_tokens_seen": 427169280, + "router_z_loss_mlp": 0.39746094, + "step": 5156, + "time_per_iteration": 2.658385753631592 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01041368, + "balance_loss_mlp": 1.00167131, + "epoch": 0.9921123509041939, + "flos": 545011694592.0, + "grad_norm": 0.03652948817105682, + "language_loss": 0.84750915, + "learning_rate": 1.6314665186123457e-07, + "loss": 0.85792279, + "num_input_tokens_seen": 427237312, + "router_z_loss_mlp": 0.39672852, + "step": 5157, + "time_per_iteration": 2.651167631149292 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01041364, + "balance_loss_mlp": 1.00166762, + "epoch": 0.9923047325894575, + "flos": 672758274816.0, + "grad_norm": 0.03668848147519562, + "language_loss": 0.77897781, + "learning_rate": 1.5528573887507724e-07, + "loss": 0.78939146, + "num_input_tokens_seen": 427305008, + "router_z_loss_mlp": 0.39672852, + "step": 5158, + "time_per_iteration": 2.772944927215576 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01041378, + "balance_loss_mlp": 1.00161004, + "epoch": 0.9924971142747211, + "flos": 467625165312.0, + "grad_norm": 0.03636401763672204, + "language_loss": 0.80984938, + "learning_rate": 1.4761888281711322e-07, + "loss": 0.82026315, + "num_input_tokens_seen": 427377008, + "router_z_loss_mlp": 0.39746094, + "step": 5159, + "time_per_iteration": 2.720477342605591 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104148, + "balance_loss_mlp": 1.0017122, + "epoch": 0.9926894959599846, + "flos": 492563634432.0, + "grad_norm": 0.033750262156114055, + "language_loss": 0.83312631, + "learning_rate": 1.4014608666390594e-07, + "loss": 0.84354109, + "num_input_tokens_seen": 427444528, + "router_z_loss_mlp": 0.39746094, + "step": 5160, + "time_per_iteration": 2.632023811340332 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01041373, + "balance_loss_mlp": 1.00162876, + "epoch": 0.9928818776452482, + "flos": 493373369088.0, + "grad_norm": 0.03981516905713452, + "language_loss": 0.82561183, + "learning_rate": 1.328673533166902e-07, + "loss": 0.83602554, + "num_input_tokens_seen": 427509808, + "router_z_loss_mlp": 0.3972168, + "step": 5161, + "time_per_iteration": 2.6184706687927246 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042668, + "balance_loss_mlp": 1.00297153, + "epoch": 0.9930742593305117, + "flos": 547467143424.0, + "grad_norm": 0.03499010450358828, + "language_loss": 0.8439256, + "learning_rate": 1.2578268560131666e-07, + "loss": 0.85435224, + "num_input_tokens_seen": 427587936, + "router_z_loss_mlp": 0.39672852, + "step": 5162, + "time_per_iteration": 2.8474743366241455 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042527, + "balance_loss_mlp": 1.00280654, + "epoch": 0.9932666410157753, + "flos": 586616308992.0, + "grad_norm": 0.03668605125261659, + "language_loss": 0.86434215, + "learning_rate": 1.1889208626825188e-07, + "loss": 0.87476742, + "num_input_tokens_seen": 427662224, + "router_z_loss_mlp": 0.39697266, + "step": 5163, + "time_per_iteration": 2.7860381603240967 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043001, + "balance_loss_mlp": 1.00330424, + "epoch": 0.9934590227010388, + "flos": 538106106624.0, + "grad_norm": 0.03728677713501654, + "language_loss": 0.84115875, + "learning_rate": 1.1219555799268921e-07, + "loss": 0.85158879, + "num_input_tokens_seen": 427730544, + "router_z_loss_mlp": 0.39672852, + "step": 5164, + "time_per_iteration": 2.6766583919525146 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042718, + "balance_loss_mlp": 1.00302172, + "epoch": 0.9936514043863024, + "flos": 519061301760.0, + "grad_norm": 0.033375171347095695, + "language_loss": 0.87032169, + "learning_rate": 1.0569310337443794e-07, + "loss": 0.88074887, + "num_input_tokens_seen": 427799760, + "router_z_loss_mlp": 0.39672852, + "step": 5165, + "time_per_iteration": 2.60634708404541 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042827, + "balance_loss_mlp": 1.00305867, + "epoch": 0.993843786071566, + "flos": 745996975872.0, + "grad_norm": 0.03526910603932911, + "language_loss": 0.80838788, + "learning_rate": 9.938472493803419e-08, + "loss": 0.81881613, + "num_input_tokens_seen": 427881936, + "router_z_loss_mlp": 0.39746094, + "step": 5166, + "time_per_iteration": 3.050039529800415 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045055, + "balance_loss_mlp": 1.00533426, + "epoch": 0.9940361677568296, + "flos": 527008948992.0, + "grad_norm": 0.04774689231823901, + "language_loss": 0.82173401, + "learning_rate": 9.327042513251893e-08, + "loss": 0.83218455, + "num_input_tokens_seen": 427951648, + "router_z_loss_mlp": 0.39697266, + "step": 5167, + "time_per_iteration": 2.7380599975585938 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045639, + "balance_loss_mlp": 1.00587058, + "epoch": 0.9942285494420932, + "flos": 556747500288.0, + "grad_norm": 0.03707878471645631, + "language_loss": 0.80540991, + "learning_rate": 8.735020633177104e-08, + "loss": 0.81586635, + "num_input_tokens_seen": 428031184, + "router_z_loss_mlp": 0.39746094, + "step": 5168, + "time_per_iteration": 2.748521327972412 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045556, + "balance_loss_mlp": 1.00590706, + "epoch": 0.9944209311273566, + "flos": 587100400128.0, + "grad_norm": 0.03550078016567863, + "language_loss": 0.82207251, + "learning_rate": 8.162407083411872e-08, + "loss": 0.832528, + "num_input_tokens_seen": 428107296, + "router_z_loss_mlp": 0.39624023, + "step": 5169, + "time_per_iteration": 2.7222771644592285 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045413, + "balance_loss_mlp": 1.00576413, + "epoch": 0.9946133128126202, + "flos": 736857570048.0, + "grad_norm": 0.03421051451845836, + "language_loss": 0.82663047, + "learning_rate": 7.609202086272804e-08, + "loss": 0.83708465, + "num_input_tokens_seen": 428187904, + "router_z_loss_mlp": 0.39624023, + "step": 5170, + "time_per_iteration": 2.9847142696380615 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042863, + "balance_loss_mlp": 1.00316656, + "epoch": 0.9948056944978838, + "flos": 647181157632.0, + "grad_norm": 0.037725989212747564, + "language_loss": 0.82797301, + "learning_rate": 7.075405856526995e-08, + "loss": 0.83840156, + "num_input_tokens_seen": 428255856, + "router_z_loss_mlp": 0.39672852, + "step": 5171, + "time_per_iteration": 2.7907352447509766 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042753, + "balance_loss_mlp": 1.00308061, + "epoch": 0.9949980761831474, + "flos": 446797585920.0, + "grad_norm": 0.035457906754411656, + "language_loss": 0.86522031, + "learning_rate": 6.561018601414226e-08, + "loss": 0.87564778, + "num_input_tokens_seen": 428321872, + "router_z_loss_mlp": 0.39648438, + "step": 5172, + "time_per_iteration": 2.5566751956939697 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104277, + "balance_loss_mlp": 1.00309753, + "epoch": 0.995190457868411, + "flos": 436559740416.0, + "grad_norm": 0.03425126759604689, + "language_loss": 0.86168361, + "learning_rate": 6.066040520641414e-08, + "loss": 0.87211132, + "num_input_tokens_seen": 428389232, + "router_z_loss_mlp": 0.39648438, + "step": 5173, + "time_per_iteration": 2.54295015335083 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042429, + "balance_loss_mlp": 1.00275624, + "epoch": 0.9953828395536745, + "flos": 515190518016.0, + "grad_norm": 0.035385628041685095, + "language_loss": 0.82161599, + "learning_rate": 5.590471806377062e-08, + "loss": 0.83204019, + "num_input_tokens_seen": 428456128, + "router_z_loss_mlp": 0.39648438, + "step": 5174, + "time_per_iteration": 2.5888471603393555 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042429, + "balance_loss_mlp": 1.00278032, + "epoch": 0.995575221238938, + "flos": 480808386816.0, + "grad_norm": 0.036646252224127956, + "language_loss": 0.82425147, + "learning_rate": 5.134312643245709e-08, + "loss": 0.83467579, + "num_input_tokens_seen": 428523504, + "router_z_loss_mlp": 0.39624023, + "step": 5175, + "time_per_iteration": 2.5513620376586914 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042527, + "balance_loss_mlp": 1.00285447, + "epoch": 0.9957676029242016, + "flos": 588932752128.0, + "grad_norm": 0.04319438463230705, + "language_loss": 0.77101338, + "learning_rate": 4.6975632083445793e-08, + "loss": 0.78143859, + "num_input_tokens_seen": 428596880, + "router_z_loss_mlp": 0.39648438, + "step": 5176, + "time_per_iteration": 2.717487335205078 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042501, + "balance_loss_mlp": 1.00285208, + "epoch": 0.9959599846094652, + "flos": 427355205888.0, + "grad_norm": 0.03951052655432241, + "language_loss": 0.80484152, + "learning_rate": 4.280223671243588e-08, + "loss": 0.81526649, + "num_input_tokens_seen": 428659472, + "router_z_loss_mlp": 0.39624023, + "step": 5177, + "time_per_iteration": 2.4777727127075195 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042781, + "balance_loss_mlp": 1.00313175, + "epoch": 0.9961523662947287, + "flos": 612851516160.0, + "grad_norm": 0.035244745134973283, + "language_loss": 0.81199628, + "learning_rate": 3.8822941939575804e-08, + "loss": 0.82242405, + "num_input_tokens_seen": 428736704, + "router_z_loss_mlp": 0.39624023, + "step": 5178, + "time_per_iteration": 2.823143482208252 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104305, + "balance_loss_mlp": 1.00330544, + "epoch": 0.9963447479799923, + "flos": 551843405568.0, + "grad_norm": 0.0468254833507274, + "language_loss": 0.74475646, + "learning_rate": 3.5037749309851927e-08, + "loss": 0.75518698, + "num_input_tokens_seen": 428808560, + "router_z_loss_mlp": 0.3972168, + "step": 5179, + "time_per_iteration": 2.6599185466766357 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042788, + "balance_loss_mlp": 1.00304377, + "epoch": 0.9965371296652559, + "flos": 627011668224.0, + "grad_norm": 0.037984770973077585, + "language_loss": 0.89231646, + "learning_rate": 3.1446660292755446e-08, + "loss": 0.90274435, + "num_input_tokens_seen": 428880688, + "router_z_loss_mlp": 0.3972168, + "step": 5180, + "time_per_iteration": 2.7122786045074463 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042634, + "balance_loss_mlp": 1.00296164, + "epoch": 0.9967295113505195, + "flos": 640792708608.0, + "grad_norm": 0.03477053437828652, + "language_loss": 0.82540727, + "learning_rate": 2.8049676282504433e-08, + "loss": 0.83583367, + "num_input_tokens_seen": 428960096, + "router_z_loss_mlp": 0.39648438, + "step": 5181, + "time_per_iteration": 2.8635036945343018 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042827, + "balance_loss_mlp": 1.00305843, + "epoch": 0.996921893035783, + "flos": 608544273408.0, + "grad_norm": 0.03998145351976275, + "language_loss": 0.7724216, + "learning_rate": 2.484679859793282e-08, + "loss": 0.78284985, + "num_input_tokens_seen": 429031296, + "router_z_loss_mlp": 0.39746094, + "step": 5182, + "time_per_iteration": 2.7515056133270264 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042557, + "balance_loss_mlp": 1.00281239, + "epoch": 0.9971142747210465, + "flos": 645346860288.0, + "grad_norm": 0.04022267608220078, + "language_loss": 0.82593203, + "learning_rate": 2.183802848243488e-08, + "loss": 0.83635759, + "num_input_tokens_seen": 429103312, + "router_z_loss_mlp": 0.3972168, + "step": 5183, + "time_per_iteration": 2.8260281085968018 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042938, + "balance_loss_mlp": 1.00321734, + "epoch": 0.9973066564063101, + "flos": 1042462905600.0, + "grad_norm": 0.03259879634989159, + "language_loss": 0.81111103, + "learning_rate": 1.9023367104187285e-08, + "loss": 0.82154036, + "num_input_tokens_seen": 429194896, + "router_z_loss_mlp": 0.39697266, + "step": 5184, + "time_per_iteration": 3.3434250354766846 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042639, + "balance_loss_mlp": 1.00291812, + "epoch": 0.9974990380915737, + "flos": 666343580928.0, + "grad_norm": 0.03634944798873604, + "language_loss": 0.83530021, + "learning_rate": 1.640281555587153e-08, + "loss": 0.84572655, + "num_input_tokens_seen": 429267664, + "router_z_loss_mlp": 0.39697266, + "step": 5185, + "time_per_iteration": 2.8487842082977295 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042827, + "balance_loss_mlp": 1.00315452, + "epoch": 0.9976914197768373, + "flos": 719379744768.0, + "grad_norm": 0.04487021918188512, + "language_loss": 0.77715981, + "learning_rate": 1.3976374855007024e-08, + "loss": 0.78758812, + "num_input_tokens_seen": 429343472, + "router_z_loss_mlp": 0.39648438, + "step": 5186, + "time_per_iteration": 2.857905149459839 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042699, + "balance_loss_mlp": 1.00297856, + "epoch": 0.9978838014621008, + "flos": 519332510208.0, + "grad_norm": 0.037846988845411315, + "language_loss": 0.79708529, + "learning_rate": 1.1744045943451464e-08, + "loss": 0.80751228, + "num_input_tokens_seen": 429411472, + "router_z_loss_mlp": 0.39697266, + "step": 5187, + "time_per_iteration": 2.624683380126953 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042646, + "balance_loss_mlp": 1.0029496, + "epoch": 0.9980761831473643, + "flos": 604606415616.0, + "grad_norm": 0.03166904331475621, + "language_loss": 0.84750795, + "learning_rate": 9.70582968801148e-09, + "loss": 0.85793436, + "num_input_tokens_seen": 429486704, + "router_z_loss_mlp": 0.39672852, + "step": 5188, + "time_per_iteration": 2.844538688659668 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042838, + "balance_loss_mlp": 1.00309408, + "epoch": 0.9982685648326279, + "flos": 454458473472.0, + "grad_norm": 0.03824806082394345, + "language_loss": 0.89841872, + "learning_rate": 7.861726879943021e-09, + "loss": 0.90884709, + "num_input_tokens_seen": 429554736, + "router_z_loss_mlp": 0.3972168, + "step": 5189, + "time_per_iteration": 2.5720129013061523 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042712, + "balance_loss_mlp": 1.00303888, + "epoch": 0.9984609465178915, + "flos": 482462849280.0, + "grad_norm": 0.04089473715635275, + "language_loss": 0.79091811, + "learning_rate": 6.211738235173403e-09, + "loss": 0.80134523, + "num_input_tokens_seen": 429623216, + "router_z_loss_mlp": 0.39648438, + "step": 5190, + "time_per_iteration": 2.641157865524292 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042929, + "balance_loss_mlp": 1.0031848, + "epoch": 0.9986533282031551, + "flos": 478012710144.0, + "grad_norm": 0.034483206840534596, + "language_loss": 0.845792, + "learning_rate": 4.755864394301312e-09, + "loss": 0.85622132, + "num_input_tokens_seen": 429695808, + "router_z_loss_mlp": 0.3972168, + "step": 5191, + "time_per_iteration": 2.6454806327819824 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042762, + "balance_loss_mlp": 1.00308967, + "epoch": 0.9988457098884186, + "flos": 643158729216.0, + "grad_norm": 0.03683984679007205, + "language_loss": 0.87209117, + "learning_rate": 3.494105922541291e-09, + "loss": 0.88251877, + "num_input_tokens_seen": 429774464, + "router_z_loss_mlp": 0.39648438, + "step": 5192, + "time_per_iteration": 2.785687208175659 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042681, + "balance_loss_mlp": 1.00307941, + "epoch": 0.9990380915736822, + "flos": 397188943872.0, + "grad_norm": 0.04095019958337408, + "language_loss": 0.88459158, + "learning_rate": 2.4264633097237365e-09, + "loss": 0.89501834, + "num_input_tokens_seen": 429835872, + "router_z_loss_mlp": 0.39575195, + "step": 5193, + "time_per_iteration": 2.438514232635498 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042769, + "balance_loss_mlp": 1.00314367, + "epoch": 0.9992304732589458, + "flos": 577297068288.0, + "grad_norm": 0.032947603634797866, + "language_loss": 0.85047054, + "learning_rate": 1.552936970405927e-09, + "loss": 0.86089826, + "num_input_tokens_seen": 429911440, + "router_z_loss_mlp": 0.39599609, + "step": 5194, + "time_per_iteration": 2.741009473800659 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042727, + "balance_loss_mlp": 1.00305462, + "epoch": 0.9994228549442093, + "flos": 545391773184.0, + "grad_norm": 0.038581373193003414, + "language_loss": 0.75944704, + "learning_rate": 8.735272437054853e-10, + "loss": 0.76987433, + "num_input_tokens_seen": 429982512, + "router_z_loss_mlp": 0.39648438, + "step": 5195, + "time_per_iteration": 2.6541080474853516 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042882, + "balance_loss_mlp": 1.00323331, + "epoch": 0.9996152366294728, + "flos": 1473470188032.0, + "grad_norm": 0.03760249549203128, + "language_loss": 0.80863667, + "learning_rate": 3.882343933003796e-10, + "loss": 0.81906557, + "num_input_tokens_seen": 430070944, + "router_z_loss_mlp": 0.39624023, + "step": 5196, + "time_per_iteration": 3.698018789291382 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042585, + "balance_loss_mlp": 1.00312674, + "epoch": 0.9998076183147364, + "flos": 620086638336.0, + "grad_norm": 0.0677194003040628, + "language_loss": 0.70429897, + "learning_rate": 9.70586077619906e-11, + "loss": 0.7147249, + "num_input_tokens_seen": 430164864, + "router_z_loss_mlp": 0.39428711, + "step": 5197, + "time_per_iteration": 4.017072439193726 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044197, + "balance_loss_mlp": 1.0051676, + "epoch": 1.0, + "flos": 1293863628288.0, + "grad_norm": 0.017503250680704283, + "language_loss": 0.84201628, + "learning_rate": 0.0, + "loss": 0.85245824, + "num_input_tokens_seen": 430340944, + "router_z_loss_mlp": 0.38989258, + "step": 5198, + "time_per_iteration": 5.782602071762085 + } + ], + "logging_steps": 1.0, + "max_steps": 5198, + "num_input_tokens_seen": 430340944, + "num_train_epochs": 1, + "save_steps": 1040, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 1.1743165989388288e+16, + "train_batch_size": 16, + "trial_name": null, + "trial_params": null +} diff --git a/sft_pretrain/Full_xmoe/checkpoint-5198/training_args.bin b/sft_pretrain/Full_xmoe/checkpoint-5198/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..e62437ed6fbf4cf3ea22fcfae3749bb9df2d0109 --- /dev/null +++ b/sft_pretrain/Full_xmoe/checkpoint-5198/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4144fbe7f1cf435dbbf0ef9621414cb3e97a5ff4a560571b878000caf2931b07 +size 7992 diff --git a/sft_pretrain/Full_xmoe/checkpoint-5198/zero_to_fp32.py b/sft_pretrain/Full_xmoe/checkpoint-5198/zero_to_fp32.py new file mode 100644 index 0000000000000000000000000000000000000000..24cc342e78d1a006c782b3a4cd68d9ce786d8fd8 --- /dev/null +++ b/sft_pretrain/Full_xmoe/checkpoint-5198/zero_to_fp32.py @@ -0,0 +1,604 @@ +#!/usr/bin/env python + +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +# This script extracts fp32 consolidated weights from a zero 1, 2 and 3 DeepSpeed checkpoints. It gets +# copied into the top level checkpoint dir, so the user can easily do the conversion at any point in +# the future. Once extracted, the weights don't require DeepSpeed and can be used in any +# application. +# +# example: python zero_to_fp32.py . pytorch_model.bin + +import argparse +import torch +import glob +import math +import os +import re +from collections import OrderedDict +from dataclasses import dataclass + +# while this script doesn't use deepspeed to recover data, since the checkpoints are pickled with +# DeepSpeed data structures it has to be available in the current python environment. +from deepspeed.utils import logger +from deepspeed.checkpoint.constants import (DS_VERSION, OPTIMIZER_STATE_DICT, SINGLE_PARTITION_OF_FP32_GROUPS, + FP32_FLAT_GROUPS, ZERO_STAGE, PARTITION_COUNT, PARAM_SHAPES, BUFFER_NAMES, + FROZEN_PARAM_SHAPES, FROZEN_PARAM_FRAGMENTS) + + +@dataclass +class zero_model_state: + buffers: dict() + param_shapes: dict() + shared_params: list + ds_version: int + frozen_param_shapes: dict() + frozen_param_fragments: dict() + + +debug = 0 + +# load to cpu +device = torch.device('cpu') + + +def atoi(text): + return int(text) if text.isdigit() else text + + +def natural_keys(text): + ''' + alist.sort(key=natural_keys) sorts in human order + http://nedbatchelder.com/blog/200712/human_sorting.html + (See Toothy's implementation in the comments) + ''' + return [atoi(c) for c in re.split(r'(\d+)', text)] + + +def get_model_state_file(checkpoint_dir, zero_stage): + if not os.path.isdir(checkpoint_dir): + raise FileNotFoundError(f"Directory '{checkpoint_dir}' doesn't exist") + + # there should be only one file + if zero_stage <= 2: + file = os.path.join(checkpoint_dir, "mp_rank_00_model_states.pt") + elif zero_stage == 3: + file = os.path.join(checkpoint_dir, "zero_pp_rank_0_mp_rank_00_model_states.pt") + + if not os.path.exists(file): + raise FileNotFoundError(f"can't find model states file at '{file}'") + + return file + + +def get_checkpoint_files(checkpoint_dir, glob_pattern): + # XXX: need to test that this simple glob rule works for multi-node setup too + ckpt_files = sorted(glob.glob(os.path.join(checkpoint_dir, glob_pattern)), key=natural_keys) + + if len(ckpt_files) == 0: + raise FileNotFoundError(f"can't find {glob_pattern} files in directory '{checkpoint_dir}'") + + return ckpt_files + + +def get_optim_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_optim_states.pt") + + +def get_model_state_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_model_states.pt") + + +def parse_model_states(files): + zero_model_states = [] + for file in files: + state_dict = torch.load(file, map_location=device) + + if BUFFER_NAMES not in state_dict: + raise ValueError(f"{file} is not a model state checkpoint") + buffer_names = state_dict[BUFFER_NAMES] + if debug: + print("Found buffers:", buffer_names) + + # recover just the buffers while restoring them to fp32 if they were saved in fp16 + buffers = {k: v.float() for k, v in state_dict["module"].items() if k in buffer_names} + param_shapes = state_dict[PARAM_SHAPES] + + # collect parameters that are included in param_shapes + param_names = [] + for s in param_shapes: + for name in s.keys(): + param_names.append(name) + + # update with frozen parameters + frozen_param_shapes = state_dict.get(FROZEN_PARAM_SHAPES, None) + if frozen_param_shapes is not None: + if debug: + print(f"Found frozen_param_shapes: {frozen_param_shapes}") + param_names += list(frozen_param_shapes.keys()) + + # handle shared params + shared_params = [[k, v] for k, v in state_dict["shared_params"].items()] + + ds_version = state_dict.get(DS_VERSION, None) + + frozen_param_fragments = state_dict.get(FROZEN_PARAM_FRAGMENTS, None) + + z_model_state = zero_model_state(buffers=buffers, + param_shapes=param_shapes, + shared_params=shared_params, + ds_version=ds_version, + frozen_param_shapes=frozen_param_shapes, + frozen_param_fragments=frozen_param_fragments) + zero_model_states.append(z_model_state) + + return zero_model_states + + +def parse_optim_states(files, ds_checkpoint_dir): + + total_files = len(files) + state_dicts = [] + for f in files: + state_dict = torch.load(f, map_location=device) + # immediately discard the potentially huge 2 optimizer states as we only care for fp32 master weights + # and also handle the case where it was already removed by another helper script + state_dict["optimizer_state_dict"].pop("optimizer_state_dict", None) + state_dicts.append(state_dict) + + if not ZERO_STAGE in state_dicts[0][OPTIMIZER_STATE_DICT]: + raise ValueError(f"{files[0]} is not a zero checkpoint") + zero_stage = state_dicts[0][OPTIMIZER_STATE_DICT][ZERO_STAGE] + world_size = state_dicts[0][OPTIMIZER_STATE_DICT][PARTITION_COUNT] + + # For ZeRO-2 each param group can have different partition_count as data parallelism for expert + # parameters can be different from data parallelism for non-expert parameters. So we can just + # use the max of the partition_count to get the dp world_size. + + if type(world_size) is list: + world_size = max(world_size) + + if world_size != total_files: + raise ValueError( + f"Expected {world_size} of '*_optim_states.pt' under '{ds_checkpoint_dir}' but found {total_files} files. " + "Possibly due to an overwrite of an old checkpoint, or a checkpoint didn't get saved by one or more processes." + ) + + # the groups are named differently in each stage + if zero_stage <= 2: + fp32_groups_key = SINGLE_PARTITION_OF_FP32_GROUPS + elif zero_stage == 3: + fp32_groups_key = FP32_FLAT_GROUPS + else: + raise ValueError(f"unknown zero stage {zero_stage}") + + if zero_stage <= 2: + fp32_flat_groups = [state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key] for i in range(len(state_dicts))] + elif zero_stage == 3: + # if there is more than one param group, there will be multiple flattened tensors - one + # flattened tensor per group - for simplicity merge them into a single tensor + # + # XXX: could make the script more memory efficient for when there are multiple groups - it + # will require matching the sub-lists of param_shapes for each param group flattened tensor + + fp32_flat_groups = [ + torch.cat(state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key], 0) for i in range(len(state_dicts)) + ] + + return zero_stage, world_size, fp32_flat_groups + + +def _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters): + """ + Returns fp32 state_dict reconstructed from ds checkpoint + + Args: + - ``ds_checkpoint_dir``: path to the deepspeed checkpoint folder (where the optimizer files are) + + """ + print(f"Processing zero checkpoint '{ds_checkpoint_dir}'") + + optim_files = get_optim_files(ds_checkpoint_dir) + zero_stage, world_size, fp32_flat_groups = parse_optim_states(optim_files, ds_checkpoint_dir) + print(f"Detected checkpoint of type zero stage {zero_stage}, world_size: {world_size}") + + model_files = get_model_state_files(ds_checkpoint_dir) + + zero_model_states = parse_model_states(model_files) + print(f'Parsing checkpoint created by deepspeed=={zero_model_states[0].ds_version}') + + if zero_stage <= 2: + return _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + elif zero_stage == 3: + return _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + + +def _zero2_merge_frozen_params(state_dict, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + frozen_param_fragments = zero_model_states[0].frozen_param_fragments + + if debug: + num_elem = sum(s.numel() for s in frozen_param_shapes.values()) + print(f'rank 0: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + wanted_params = len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in frozen_param_fragments.values()]) + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + state_dict[name] = frozen_param_fragments[name] + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +def _has_callable(obj, fn): + attr = getattr(obj, fn, None) + return callable(attr) + + +def _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + + # Reconstruction protocol: + # + # XXX: document this + + if debug: + for i in range(world_size): + for j in range(len(fp32_flat_groups[0])): + print(f"{FP32_FLAT_GROUPS}[{i}][{j}].shape={fp32_flat_groups[i][j].shape}") + + # XXX: memory usage doubles here (zero2) + num_param_groups = len(fp32_flat_groups[0]) + merged_single_partition_of_fp32_groups = [] + for i in range(num_param_groups): + merged_partitions = [sd[i] for sd in fp32_flat_groups] + full_single_fp32_vector = torch.cat(merged_partitions, 0) + merged_single_partition_of_fp32_groups.append(full_single_fp32_vector) + avail_numel = sum( + [full_single_fp32_vector.numel() for full_single_fp32_vector in merged_single_partition_of_fp32_groups]) + + if debug: + wanted_params = sum([len(shapes) for shapes in param_shapes]) + wanted_numel = sum([sum(shape.numel() for shape in shapes.values()) for shapes in param_shapes]) + # not asserting if there is a mismatch due to possible padding + print(f"Have {avail_numel} numels to process.") + print(f"Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + total_numel = 0 + total_params = 0 + for shapes, full_single_fp32_vector in zip(param_shapes, merged_single_partition_of_fp32_groups): + offset = 0 + avail_numel = full_single_fp32_vector.numel() + for name, shape in shapes.items(): + + unpartitioned_numel = shape.numel() if _has_callable(shape, 'numel') else math.prod(shape) + total_numel += unpartitioned_numel + total_params += 1 + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + state_dict[name] = full_single_fp32_vector.narrow(0, offset, unpartitioned_numel).view(shape) + offset += unpartitioned_numel + + # Z2 started to align to 2*world_size to improve nccl performance. Therefore both offset and + # avail_numel can differ by anywhere between 0..2*world_size. Due to two unrelated complex + # paddings performed in the code it's almost impossible to predict the exact numbers w/o the + # live optimizer object, so we are checking that the numbers are within the right range + align_to = 2 * world_size + + def zero2_align(x): + return align_to * math.ceil(x / align_to) + + if debug: + print(f"original offset={offset}, avail_numel={avail_numel}") + + offset = zero2_align(offset) + avail_numel = zero2_align(avail_numel) + + if debug: + print(f"aligned offset={offset}, avail_numel={avail_numel}") + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero2_merge_frozen_params(state_dict, zero_model_states) + + _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def zero3_partitioned_param_info(unpartitioned_numel, world_size): + remainder = unpartitioned_numel % world_size + padding_numel = (world_size - remainder) if remainder else 0 + partitioned_numel = math.ceil(unpartitioned_numel / world_size) + return partitioned_numel, padding_numel + + +def _zero3_merge_frozen_params(state_dict, world_size, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + if debug: + for i in range(world_size): + num_elem = sum(s.numel() for s in zero_model_states[i].frozen_param_fragments.values()) + print(f'rank {i}: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + wanted_params = len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in zero_model_states[0].frozen_param_fragments.values()]) * world_size + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in zero_model_states[0].frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + param_frags = tuple(model_state.frozen_param_fragments[name] for model_state in zero_model_states) + state_dict[name] = torch.cat(param_frags, 0).narrow(0, 0, unpartitioned_numel).view(shape) + + partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Frozen params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +def _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + avail_numel = fp32_flat_groups[0].numel() * world_size + # Reconstruction protocol: For zero3 we need to zip the partitions together at boundary of each + # param, re-consolidating each param, while dealing with padding if any + + # merge list of dicts, preserving order + param_shapes = {k: v for d in param_shapes for k, v in d.items()} + + if debug: + for i in range(world_size): + print(f"{FP32_FLAT_GROUPS}[{i}].shape={fp32_flat_groups[i].shape}") + + wanted_params = len(param_shapes) + wanted_numel = sum(shape.numel() for shape in param_shapes.values()) + # not asserting if there is a mismatch due to possible padding + avail_numel = fp32_flat_groups[0].numel() * world_size + print(f"Trainable params: Have {avail_numel} numels to process.") + print(f"Trainable params: Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + offset = 0 + total_numel = 0 + total_params = 0 + for name, shape in param_shapes.items(): + + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + total_params += 1 + + partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Trainable params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + # XXX: memory usage doubles here + state_dict[name] = torch.cat( + tuple(fp32_flat_groups[i].narrow(0, offset, partitioned_numel) for i in range(world_size)), + 0).narrow(0, 0, unpartitioned_numel).view(shape) + offset += partitioned_numel + + offset *= world_size + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed Trainable fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero3_merge_frozen_params(state_dict, world_size, zero_model_states) + + _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag=None, exclude_frozen_parameters=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated state_dict that can be loaded with + ``load_state_dict()`` and used for training without DeepSpeed or shared with others, for example + via a model hub. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in 'latest' file. e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + + Returns: + - pytorch ``state_dict`` + + Note: this approach may not work if your application doesn't have sufficient free CPU memory and + you may need to use the offline approach using the ``zero_to_fp32.py`` script that is saved with + the checkpoint. + + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint + # do the training and checkpoint saving + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir) # already on cpu + model = model.cpu() # move to cpu + model.load_state_dict(state_dict) + # submit to model hub or save the model to share with others + + In this example the ``model`` will no longer be usable in the deepspeed context of the same + application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + If you want it all done for you, use ``load_state_dict_from_zero_checkpoint`` instead. + + """ + if tag is None: + latest_path = os.path.join(checkpoint_dir, 'latest') + if os.path.isfile(latest_path): + with open(latest_path, 'r') as fd: + tag = fd.read().strip() + else: + raise ValueError(f"Unable to find 'latest' file at {latest_path}") + + ds_checkpoint_dir = os.path.join(checkpoint_dir, tag) + + if not os.path.isdir(ds_checkpoint_dir): + raise FileNotFoundError(f"Directory '{ds_checkpoint_dir}' doesn't exist") + + return _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters) + + +def convert_zero_checkpoint_to_fp32_state_dict(checkpoint_dir, output_file, tag=None, exclude_frozen_parameters=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` file that can be + loaded with ``torch.load(file)`` + ``load_state_dict()`` and used for training without DeepSpeed. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``output_file``: path to the pytorch fp32 state_dict output file (e.g. path/pytorch_model.bin) + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + """ + + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag, exclude_frozen_parameters) + print(f"Saving fp32 state dict to {output_file}") + torch.save(state_dict, output_file) + + +def load_state_dict_from_zero_checkpoint(model, checkpoint_dir, tag=None): + """ + 1. Put the provided model to cpu + 2. Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` + 3. Load it into the provided model + + Args: + - ``model``: the model object to update + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + + Returns: + - ``model`: modified model + + Make sure you have plenty of CPU memory available before you call this function. If you don't + have enough use the ``zero_to_fp32.py`` utility to do the conversion. You will find it + conveniently placed for you in the checkpoint folder. + + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import load_state_dict_from_zero_checkpoint + model = load_state_dict_from_zero_checkpoint(trainer.model, checkpoint_dir) + # submit to model hub or save the model to share with others + + Note, that once this was run, the ``model`` will no longer be usable in the deepspeed context + of the same application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + """ + logger.info(f"Extracting fp32 weights") + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag) + + logger.info(f"Overwriting model with fp32 weights") + model = model.cpu() + model.load_state_dict(state_dict, strict=False) + + return model + + +if __name__ == "__main__": + + parser = argparse.ArgumentParser() + parser.add_argument("checkpoint_dir", + type=str, + help="path to the desired checkpoint folder, e.g., path/checkpoint-12") + parser.add_argument( + "output_file", + type=str, + help="path to the pytorch fp32 state_dict output file (e.g. path/checkpoint-12/pytorch_model.bin)") + parser.add_argument("-t", + "--tag", + type=str, + default=None, + help="checkpoint tag used as a unique identifier for checkpoint. e.g., global_step1") + parser.add_argument("--exclude_frozen_parameters", action='store_true', help="exclude frozen parameters") + parser.add_argument("-d", "--debug", action='store_true', help="enable debug") + args = parser.parse_args() + + debug = args.debug + + convert_zero_checkpoint_to_fp32_state_dict(args.checkpoint_dir, + args.output_file, + tag=args.tag, + exclude_frozen_parameters=args.exclude_frozen_parameters) diff --git a/sft_pretrain/Full_xmoe/config.json b/sft_pretrain/Full_xmoe/config.json new file mode 100644 index 0000000000000000000000000000000000000000..c6a9fb76a3178f42b8776e6532d281848e7c1b67 --- /dev/null +++ b/sft_pretrain/Full_xmoe/config.json @@ -0,0 +1,200 @@ +{ + "_name_or_path": "/cm/archive/namnv78/checkpoints/phi35-siglip224/pft", + "architectures": [ + "LlavaPhiForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "auto_map": { + "AutoConfig": "configuration_phi3.Phi3Config", + "AutoModelForCausalLM": "modeling_phi3.Phi3ForCausalLM" + }, + "bal_comp_loss_coef": 0.01, + "balance_loss_coef": 0.01, + "bos_token_id": 1, + "clip_smoe": false, + "diversity_loss_coef": 0.01, + "dropout": false, + "e_loss_coef": 0.001, + "embd_pdrop": 0.0, + "entropy_advance_loss": false, + "eos_token_id": 32000, + "freeze_backbone": false, + "freeze_mm_mlp_adapter": false, + "hidden_act": "silu", + "hidden_size": 3072, + "hybrid": false, + "image_aspect_ratio": "pad", + "init_weight": true, + "initializer_range": 0.02, + "intermediate_size": 8192, + "is_cosine": false, + "is_norm_weight": false, + "local_rank": 0, + "loss1": "balanceloss", + "loss2": "zloss", + "luna": false, + "max_compete_in_iter": 3, + "max_position_embeddings": 131072, + "mlp_smoe": true, + "mm_hidden_size": 1152, + "mm_patch_merge_type": "flat", + "mm_projector_lr": null, + "mm_projector_type": "moe", + "mm_use_im_patch_token": false, + "mm_use_im_start_end": false, + "mm_vision_select_feature": "patch", + "mm_vision_select_layer": -2, + "mm_vision_tower": "google/siglip-so400m-patch14-224", + "model_name_or_path": "/cm/archive/namnv78/checkpoints/phi35-siglip224/pft", + "model_type": "llava_phi", + "moe_name": "xmoe", + "norm_softmax": false, + "normalization": false, + "num_attention_heads": 32, + "num_experts": 8, + "num_hidden_layers": 32, + "num_key_value_heads": 32, + "num_layers": 3, + "num_selected": 4, + "number_of_previous_tokens": 2, + "original_max_position_embeddings": 4096, + "pad_token_id": 32000, + "pretrain_mm_mlp_adapter": null, + "rate_compete": 0.2, + "rate_flip": 0.05, + "resid_pdrop": 0.0, + "rms_norm_eps": 1e-05, + "rope_scaling": { + "long_factor": [ + 1.0800000429153442, + 1.1100000143051147, + 1.1399999856948853, + 1.340000033378601, + 1.5899999141693115, + 1.600000023841858, + 1.6200000047683716, + 2.620000123977661, + 3.2300000190734863, + 3.2300000190734863, + 4.789999961853027, + 7.400000095367432, + 7.700000286102295, + 9.09000015258789, + 12.199999809265137, + 17.670000076293945, + 24.46000099182129, + 28.57000160217285, + 30.420001983642578, + 30.840002059936523, + 32.590003967285156, + 32.93000411987305, + 42.320003509521484, + 44.96000289916992, + 50.340003967285156, + 50.45000457763672, + 57.55000305175781, + 57.93000411987305, + 58.21000289916992, + 60.1400032043457, + 62.61000442504883, + 62.62000274658203, + 62.71000289916992, + 63.1400032043457, + 63.1400032043457, + 63.77000427246094, + 63.93000411987305, + 63.96000289916992, + 63.970001220703125, + 64.02999877929688, + 64.06999969482422, + 64.08000183105469, + 64.12000274658203, + 64.41000366210938, + 64.4800033569336, + 64.51000213623047, + 64.52999877929688, + 64.83999633789062 + ], + "short_factor": [ + 1.0, + 1.0199999809265137, + 1.0299999713897705, + 1.0299999713897705, + 1.0499999523162842, + 1.0499999523162842, + 1.0499999523162842, + 1.0499999523162842, + 1.0499999523162842, + 1.0699999332427979, + 1.0999999046325684, + 1.1099998950958252, + 1.1599998474121094, + 1.1599998474121094, + 1.1699998378753662, + 1.2899998426437378, + 1.339999794960022, + 1.679999828338623, + 1.7899998426437378, + 1.8199998140335083, + 1.8499997854232788, + 1.8799997568130493, + 1.9099997282028198, + 1.9399996995925903, + 1.9899996519088745, + 2.0199997425079346, + 2.0199997425079346, + 2.0199997425079346, + 2.0199997425079346, + 2.0199997425079346, + 2.0199997425079346, + 2.0299997329711914, + 2.0299997329711914, + 2.0299997329711914, + 2.0299997329711914, + 2.0299997329711914, + 2.0299997329711914, + 2.0299997329711914, + 2.0299997329711914, + 2.0299997329711914, + 2.0799996852874756, + 2.0899996757507324, + 2.189999580383301, + 2.2199995517730713, + 2.5899994373321533, + 2.729999542236328, + 2.749999523162842, + 2.8399994373321533 + ], + "type": "longrope" + }, + "rope_theta": 10000.0, + "router_loss_coef": 0.01, + "router_theta": 0.1, + "router_z_loss_coef": 0.001, + "scales": [ + 1, + 3 + ], + "sliding_window": 262144, + "sparse_upcycling": false, + "strategy_train": "base", + "tie_word_embeddings": false, + "tokenizer_model_max_length": 2048, + "tokenizer_padding_side": "right", + "topk_max": 2, + "topk_min": 1, + "torch_dtype": "bfloat16", + "training": true, + "transformers_version": "4.43.0", + "tune_mm_mlp_adapter": false, + "unit_test": true, + "use_cache": true, + "use_mm_proj": true, + "use_old": false, + "version": "phi35", + "vision_tower": "google/siglip-so400m-patch14-224", + "vision_tower_dir": "/cm/archive/namnv78/checkpoints/phi35-siglip224/pft/clip.bin", + "vocab_size": 32064, + "warm_up": 0.05 +} diff --git a/sft_pretrain/Full_xmoe/generation_config.json b/sft_pretrain/Full_xmoe/generation_config.json new file mode 100644 index 0000000000000000000000000000000000000000..dad5c4578f0dc5969b38755d095fc30c368bb54a --- /dev/null +++ b/sft_pretrain/Full_xmoe/generation_config.json @@ -0,0 +1,12 @@ +{ + "_from_model_config": true, + "bos_token_id": 1, + "do_sample": true, + "eos_token_id": [ + 32007, + 32001, + 32000 + ], + "pad_token_id": 32000, + "transformers_version": "4.43.0" +} diff --git a/sft_pretrain/Full_xmoe/model-00001-of-00002.safetensors b/sft_pretrain/Full_xmoe/model-00001-of-00002.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..29d76f5d80605301aab2bba59b53a5e2582094c4 --- /dev/null +++ b/sft_pretrain/Full_xmoe/model-00001-of-00002.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fe6c4f6ef38e8993629091331e0bbf23484cc88bdfd038f0dd17b6ec2800d855 +size 4972489328 diff --git a/sft_pretrain/Full_xmoe/model-00002-of-00002.safetensors b/sft_pretrain/Full_xmoe/model-00002-of-00002.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..06069d52819dbbf22163c9da5588880b2cc1c3d8 --- /dev/null +++ b/sft_pretrain/Full_xmoe/model-00002-of-00002.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b44129307c0d6bc8186640d841927892e858873599257b9253937fa4c18940df +size 3759044016 diff --git a/sft_pretrain/Full_xmoe/model.safetensors.index.json b/sft_pretrain/Full_xmoe/model.safetensors.index.json new file mode 100644 index 0000000000000000000000000000000000000000..507806fb086ee2ffdb4c1df263574fc5a7cfa513 --- /dev/null +++ b/sft_pretrain/Full_xmoe/model.safetensors.index.json @@ -0,0 +1,675 @@ +{ + "metadata": { + "total_size": 8731443248 + }, + "weight_map": { + "lm_head.weight": "model-00002-of-00002.safetensors", + "model.embed_tokens.weight": "model-00001-of-00002.safetensors", + "model.layers.0.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.0.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.0.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.0.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.0.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.0.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.1.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.1.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.1.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.1.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.1.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.1.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.10.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.10.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.10.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.10.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.10.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.10.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.11.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.11.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.11.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.11.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.11.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.11.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.12.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.12.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.12.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.12.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.12.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.12.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.13.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.13.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.13.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.13.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.13.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.13.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.14.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.14.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.14.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.14.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.14.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.14.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.15.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.15.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.15.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.15.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.15.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.15.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.16.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.16.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.16.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.16.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.16.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.16.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.17.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.17.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.17.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.17.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.17.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.17.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.18.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.18.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.18.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.18.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.18.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.18.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.19.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.19.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.19.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.19.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.19.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.19.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.2.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.2.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.2.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.2.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.2.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.2.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.20.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.20.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.20.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.20.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.20.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.20.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.21.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.21.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.21.mlp.gate_up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.21.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.21.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.21.self_attn.qkv_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.22.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.22.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.22.mlp.gate_up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.22.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.22.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.22.self_attn.qkv_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.23.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.23.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.23.mlp.gate_up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.23.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.23.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.23.self_attn.qkv_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.24.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.24.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.24.mlp.gate_up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.24.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.24.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.24.self_attn.qkv_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.25.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.25.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.25.mlp.gate_up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.25.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.25.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.25.self_attn.qkv_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.26.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.26.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.26.mlp.gate_up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.26.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.26.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.26.self_attn.qkv_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.27.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.27.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.27.mlp.gate_up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.27.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.27.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.27.self_attn.qkv_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.28.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.28.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.28.mlp.gate_up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.28.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.28.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.28.self_attn.qkv_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.29.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.29.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.29.mlp.gate_up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.29.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.29.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.29.self_attn.qkv_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.3.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.3.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.3.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.3.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.3.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.3.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.30.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.30.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.30.mlp.gate_up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.30.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.30.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.30.self_attn.qkv_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.31.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.31.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.31.mlp.gate_up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.31.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.31.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.31.self_attn.qkv_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.4.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.4.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.4.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.4.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.4.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.4.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.5.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.5.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.5.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.5.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.5.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.5.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.6.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.6.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.6.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.6.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.6.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.6.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.7.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.7.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.7.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.7.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.7.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.7.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.8.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.8.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.8.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.8.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.8.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.8.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.9.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.9.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.9.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.9.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.9.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.9.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.mm_projector.layer_norm.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.layer_norm.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.expert_embeddings": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.0.0.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.0.0.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.0.2.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.0.2.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.1.0.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.1.0.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.1.2.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.1.2.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.2.0.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.2.0.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.2.2.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.2.2.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.3.0.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.3.0.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.3.2.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.3.2.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.4.0.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.4.0.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.4.2.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.4.2.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.5.0.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.5.0.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.5.2.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.5.2.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.6.0.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.6.0.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.6.2.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.6.2.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.7.0.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.7.0.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.7.2.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.7.2.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.gate.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.gate.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.inp_reduction.weight": "model-00002-of-00002.safetensors", + "model.norm.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.embeddings.patch_embedding.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.embeddings.patch_embedding.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.embeddings.position_embedding.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.self_attn.v_proj.weight": "model-00002-of-00002.safetensors" + } +} diff --git a/sft_pretrain/Full_xmoe/special_tokens_map.json b/sft_pretrain/Full_xmoe/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..3e4d5a5bc1cb51753cc9ae0305ece0da60052b10 --- /dev/null +++ b/sft_pretrain/Full_xmoe/special_tokens_map.json @@ -0,0 +1,24 @@ +{ + "bos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "<|endoftext|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": "", + "unk_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } +} diff --git a/sft_pretrain/Full_xmoe/tokenizer.model b/sft_pretrain/Full_xmoe/tokenizer.model new file mode 100644 index 0000000000000000000000000000000000000000..6c00c742ce03c627d6cd5b795984876fa49fa899 --- /dev/null +++ b/sft_pretrain/Full_xmoe/tokenizer.model @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9e556afd44213b6bd1be2b850ebbbd98f5481437a8021afaf58ee7fb1818d347 +size 499723 diff --git a/sft_pretrain/Full_xmoe/tokenizer_config.json b/sft_pretrain/Full_xmoe/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..d579bb0b91b24b214ea3c2e487e27a65017cdc4a --- /dev/null +++ b/sft_pretrain/Full_xmoe/tokenizer_config.json @@ -0,0 +1,132 @@ +{ + "add_bos_token": false, + "add_eos_token": false, + "add_prefix_space": true, + "added_tokens_decoder": { + "0": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "1": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "2": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": false + }, + "32000": { + "content": "<|endoftext|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "32001": { + "content": "<|assistant|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "32002": { + "content": "<|placeholder1|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "32003": { + "content": "<|placeholder2|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "32004": { + "content": "<|placeholder3|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "32005": { + "content": "<|placeholder4|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "32006": { + "content": "<|system|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "32007": { + "content": "<|end|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "32008": { + "content": "<|placeholder5|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "32009": { + "content": "<|placeholder6|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "32010": { + "content": "<|user|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + } + }, + "bos_token": "", + "chat_template": "{% for message in messages %}{% if message['role'] == 'system' and message['content'] %}{{'<|system|>\n' + message['content'] + '<|end|>\n'}}{% elif message['role'] == 'user' %}{{'<|user|>\n' + message['content'] + '<|end|>\n'}}{% elif message['role'] == 'assistant' %}{{'<|assistant|>\n' + message['content'] + '<|end|>\n'}}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ '<|assistant|>\n' }}{% else %}{{ eos_token }}{% endif %}", + "clean_up_tokenization_spaces": false, + "eos_token": "<|endoftext|>", + "legacy": false, + "model_max_length": 2048, + "pad_token": "", + "padding_side": "right", + "sp_model_kwargs": {}, + "spaces_between_special_tokens": false, + "tokenizer_class": "LlamaTokenizer", + "unk_token": "", + "use_default_system_prompt": false +} diff --git a/sft_pretrain/Full_xmoe/trainer_state.json b/sft_pretrain/Full_xmoe/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..46220b08829b0a2753ff861be80df6149627b1ba --- /dev/null +++ b/sft_pretrain/Full_xmoe/trainer_state.json @@ -0,0 +1,78013 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.0, + "eval_steps": 500, + "global_step": 5198, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0334678, + "balance_loss_mlp": 2.48847342, + "epoch": 0.00019238168526356292, + "flos": 471022563072.0, + "grad_norm": 15.010934477254423, + "language_loss": 2.91277003, + "learning_rate": 0.0, + "loss": 1.95375419, + "num_input_tokens_seen": 67104, + "router_z_loss_mlp": 8.6015625, + "step": 1, + "time_per_iteration": 23.313215732574463 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.03608113, + "balance_loss_mlp": 3.00043201, + "epoch": 0.00038476337052712584, + "flos": 505538830848.0, + "grad_norm": 25.821694542927546, + "language_loss": 10.7459116, + "learning_rate": 0.00013726078121135892, + "loss": 10.78199196, + "num_input_tokens_seen": 134080, + "router_z_loss_mlp": 6.06640625, + "step": 2, + "time_per_iteration": 2.6342098712921143 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.03648002, + "balance_loss_mlp": 3.03803182, + "epoch": 0.0005771450557906887, + "flos": 600334166016.0, + "grad_norm": 27.537763142134942, + "language_loss": 10.88985825, + "learning_rate": 0.00021755319103969496, + "loss": 10.9263401, + "num_input_tokens_seen": 205152, + "router_z_loss_mlp": 6.08984375, + "step": 3, + "time_per_iteration": 2.9129159450531006 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.03639085, + "balance_loss_mlp": 3.03521824, + "epoch": 0.0007695267410542517, + "flos": 581497386240.0, + "grad_norm": 10.719163482624658, + "language_loss": 8.79598808, + "learning_rate": 0.00027452156242271784, + "loss": 8.83237934, + "num_input_tokens_seen": 269664, + "router_z_loss_mlp": 6.02734375, + "step": 4, + "time_per_iteration": 2.72357439994812 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.03604871, + "balance_loss_mlp": 3.01435566, + "epoch": 0.0009619084263178145, + "flos": 487154061312.0, + "grad_norm": 22.68157363884245, + "language_loss": 9.41989708, + "learning_rate": 0.0003187096642208417, + "loss": 9.45594501, + "num_input_tokens_seen": 338560, + "router_z_loss_mlp": 5.8984375, + "step": 5, + "time_per_iteration": 2.6791844367980957 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.03472164, + "balance_loss_mlp": 2.9011035, + "epoch": 0.0011542901115813775, + "flos": 561167503872.0, + "grad_norm": 7.113488232519407, + "language_loss": 9.41725159, + "learning_rate": 0.0003548139722510539, + "loss": 9.45197296, + "num_input_tokens_seen": 410112, + "router_z_loss_mlp": 5.72265625, + "step": 6, + "time_per_iteration": 2.7308623790740967 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.03266853, + "balance_loss_mlp": 2.70799947, + "epoch": 0.0013466717968449403, + "flos": 534951738624.0, + "grad_norm": 3.189932925125429, + "language_loss": 8.01036549, + "learning_rate": 0.00038533972973918044, + "loss": 8.0430336, + "num_input_tokens_seen": 477552, + "router_z_loss_mlp": 5.59765625, + "step": 7, + "time_per_iteration": 2.6907436847686768 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02962571, + "balance_loss_mlp": 2.41211033, + "epoch": 0.0015390534821085034, + "flos": 493334485248.0, + "grad_norm": 5.13822781788523, + "language_loss": 7.84486008, + "learning_rate": 0.0004117823436340768, + "loss": 7.87448597, + "num_input_tokens_seen": 549184, + "router_z_loss_mlp": 5.51171875, + "step": 8, + "time_per_iteration": 2.6274044513702393 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02550478, + "balance_loss_mlp": 2.0114615, + "epoch": 0.0017314351673720662, + "flos": 565776090624.0, + "grad_norm": 3.8232757327488405, + "language_loss": 7.62468719, + "learning_rate": 0.00043510638207938993, + "loss": 7.65019178, + "num_input_tokens_seen": 622880, + "router_z_loss_mlp": 5.39453125, + "step": 9, + "time_per_iteration": 2.7688682079315186 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02337757, + "balance_loss_mlp": 1.81705093, + "epoch": 0.001923816852635629, + "flos": 594509521152.0, + "grad_norm": 3.0012265425900817, + "language_loss": 6.96830463, + "learning_rate": 0.00045597044543220066, + "loss": 6.99168253, + "num_input_tokens_seen": 693584, + "router_z_loss_mlp": 5.20703125, + "step": 10, + "time_per_iteration": 2.736985921859741 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02262083, + "balance_loss_mlp": 1.74290299, + "epoch": 0.002116198537899192, + "flos": 610895709696.0, + "grad_norm": 2.2728267884834983, + "language_loss": 6.92078686, + "learning_rate": 0.00047484428652143135, + "loss": 6.94340801, + "num_input_tokens_seen": 774432, + "router_z_loss_mlp": 5.19140625, + "step": 11, + "time_per_iteration": 2.8857340812683105 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02308547, + "balance_loss_mlp": 1.78135598, + "epoch": 0.002308580223162755, + "flos": 546175262976.0, + "grad_norm": 4.334726148282724, + "language_loss": 6.71077013, + "learning_rate": 0.0004920747534624128, + "loss": 6.73385572, + "num_input_tokens_seen": 844304, + "router_z_loss_mlp": 5.2734375, + "step": 12, + "time_per_iteration": 2.635601282119751 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02317905, + "balance_loss_mlp": 1.79147708, + "epoch": 0.002500961908426318, + "flos": 645924270336.0, + "grad_norm": 3.1568536142119923, + "language_loss": 6.53248501, + "learning_rate": 0.0005079252465375872, + "loss": 6.55566406, + "num_input_tokens_seen": 915104, + "router_z_loss_mlp": 5.265625, + "step": 13, + "time_per_iteration": 2.8112540245056152 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02242807, + "balance_loss_mlp": 1.72019386, + "epoch": 0.0026933435936898806, + "flos": 488849352960.0, + "grad_norm": 7.572425831928954, + "language_loss": 6.47189951, + "learning_rate": 0.0005226005109505393, + "loss": 6.49432755, + "num_input_tokens_seen": 982720, + "router_z_loss_mlp": 5.2265625, + "step": 14, + "time_per_iteration": 2.590078353881836 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02247915, + "balance_loss_mlp": 1.72415757, + "epoch": 0.0028857252789534437, + "flos": 435526429440.0, + "grad_norm": 2.3229781853457747, + "language_loss": 6.01724243, + "learning_rate": 0.0005362628552605367, + "loss": 6.03972149, + "num_input_tokens_seen": 1050528, + "router_z_loss_mlp": 5.23828125, + "step": 15, + "time_per_iteration": 2.636983871459961 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02135688, + "balance_loss_mlp": 1.62108541, + "epoch": 0.0030781069642170067, + "flos": 597841778688.0, + "grad_norm": 4.36506198708269, + "language_loss": 5.46747923, + "learning_rate": 0.0005490431248454357, + "loss": 5.48883629, + "num_input_tokens_seen": 1116512, + "router_z_loss_mlp": 5.14453125, + "step": 16, + "time_per_iteration": 2.6904103755950928 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02173305, + "balance_loss_mlp": 1.67586899, + "epoch": 0.0032704886494805694, + "flos": 1541513154048.0, + "grad_norm": 0.3693165783384919, + "language_loss": 0.75705111, + "learning_rate": 0.0005610483427624225, + "loss": 0.77878416, + "num_input_tokens_seen": 1351216, + "router_z_loss_mlp": 4.96875, + "step": 17, + "time_per_iteration": 6.815098285675049 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01958957, + "balance_loss_mlp": 1.45846832, + "epoch": 0.0034628703347441324, + "flos": 474971102976.0, + "grad_norm": 7.376330921510473, + "language_loss": 3.16160107, + "learning_rate": 0.0005723671632907488, + "loss": 3.18119049, + "num_input_tokens_seen": 1420512, + "router_z_loss_mlp": 5.0, + "step": 18, + "time_per_iteration": 2.7730185985565186 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01974299, + "balance_loss_mlp": 1.48144007, + "epoch": 0.0036552520200076955, + "flos": 449478556416.0, + "grad_norm": 2.0435067055151803, + "language_loss": 1.8205657, + "learning_rate": 0.0005830738490244919, + "loss": 1.84030867, + "num_input_tokens_seen": 1484976, + "router_z_loss_mlp": 4.921875, + "step": 19, + "time_per_iteration": 2.5196421146392822 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02215561, + "balance_loss_mlp": 1.73147547, + "epoch": 0.003847633705271258, + "flos": 637351580928.0, + "grad_norm": 2.199322832792736, + "language_loss": 1.81859815, + "learning_rate": 0.0005932312266435596, + "loss": 1.84075379, + "num_input_tokens_seen": 1557392, + "router_z_loss_mlp": 4.83203125, + "step": 20, + "time_per_iteration": 2.7772061824798584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02397049, + "balance_loss_mlp": 1.91639686, + "epoch": 0.004040015390534821, + "flos": 590591105280.0, + "grad_norm": 2.068137361611091, + "language_loss": 1.81285238, + "learning_rate": 0.0006028929207788754, + "loss": 1.83682299, + "num_input_tokens_seen": 1626064, + "router_z_loss_mlp": 4.796875, + "step": 21, + "time_per_iteration": 2.7197327613830566 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02949394, + "balance_loss_mlp": 2.47560835, + "epoch": 0.004232397075798384, + "flos": 757866929664.0, + "grad_norm": 0.9893066861855494, + "language_loss": 1.43565178, + "learning_rate": 0.0006121050677327902, + "loss": 1.46514571, + "num_input_tokens_seen": 1696528, + "router_z_loss_mlp": 4.7265625, + "step": 22, + "time_per_iteration": 2.8821635246276855 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.04240368, + "balance_loss_mlp": 3.77421188, + "epoch": 0.004424778761061947, + "flos": 527727310080.0, + "grad_norm": 1.6702760591351544, + "language_loss": 1.36044598, + "learning_rate": 0.0006209076479463684, + "loss": 1.40284979, + "num_input_tokens_seen": 1765936, + "router_z_loss_mlp": 4.6484375, + "step": 23, + "time_per_iteration": 2.6194069385528564 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.04405254, + "balance_loss_mlp": 3.93871665, + "epoch": 0.00461716044632551, + "flos": 549218815488.0, + "grad_norm": 1.6356367296774819, + "language_loss": 1.46302319, + "learning_rate": 0.0006293355346737718, + "loss": 1.50707567, + "num_input_tokens_seen": 1841632, + "router_z_loss_mlp": 4.65234375, + "step": 24, + "time_per_iteration": 2.741433620452881 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.03977472, + "balance_loss_mlp": 3.50483179, + "epoch": 0.004809542131589073, + "flos": 568752569088.0, + "grad_norm": 1.079559317914091, + "language_loss": 1.33177948, + "learning_rate": 0.0006374193284416834, + "loss": 1.37155437, + "num_input_tokens_seen": 1920256, + "router_z_loss_mlp": 4.71484375, + "step": 25, + "time_per_iteration": 2.902089834213257 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.03127712, + "balance_loss_mlp": 2.642483, + "epoch": 0.005001923816852636, + "flos": 471584410368.0, + "grad_norm": 0.4847890845471295, + "language_loss": 1.26058078, + "learning_rate": 0.0006451860277489461, + "loss": 1.29185796, + "num_input_tokens_seen": 1986528, + "router_z_loss_mlp": 4.84375, + "step": 26, + "time_per_iteration": 2.6045680046081543 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02733563, + "balance_loss_mlp": 2.23154879, + "epoch": 0.005194305502116198, + "flos": 416381502720.0, + "grad_norm": 0.2845036760864029, + "language_loss": 1.33193052, + "learning_rate": 0.0006526595731190848, + "loss": 1.35926616, + "num_input_tokens_seen": 2048016, + "router_z_loss_mlp": 5.015625, + "step": 27, + "time_per_iteration": 2.4412264823913574 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02759137, + "balance_loss_mlp": 2.2411015, + "epoch": 0.005386687187379761, + "flos": 629996894976.0, + "grad_norm": 0.34713687972437796, + "language_loss": 1.22031224, + "learning_rate": 0.0006598612921618983, + "loss": 1.24790359, + "num_input_tokens_seen": 2127664, + "router_z_loss_mlp": 5.1796875, + "step": 28, + "time_per_iteration": 2.80483078956604 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02575766, + "balance_loss_mlp": 2.05010033, + "epoch": 0.005579068872643324, + "flos": 888021326592.0, + "grad_norm": 0.3062478898066755, + "language_loss": 1.16221631, + "learning_rate": 0.0006668102665011454, + "loss": 1.18797398, + "num_input_tokens_seen": 2213952, + "router_z_loss_mlp": 5.2578125, + "step": 29, + "time_per_iteration": 3.243164300918579 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02507804, + "balance_loss_mlp": 1.97527242, + "epoch": 0.005771450557906887, + "flos": 548658902016.0, + "grad_norm": 0.22276861521731073, + "language_loss": 1.24634933, + "learning_rate": 0.0006735236364718957, + "loss": 1.27142727, + "num_input_tokens_seen": 2284736, + "router_z_loss_mlp": 5.328125, + "step": 30, + "time_per_iteration": 2.7701382637023926 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02465182, + "balance_loss_mlp": 1.93226886, + "epoch": 0.00596383224317045, + "flos": 533069809152.0, + "grad_norm": 0.21102664747409663, + "language_loss": 1.23222375, + "learning_rate": 0.0006800168558381346, + "loss": 1.25687563, + "num_input_tokens_seen": 2354384, + "router_z_loss_mlp": 5.33203125, + "step": 31, + "time_per_iteration": 2.635246515274048 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02445382, + "balance_loss_mlp": 1.91552007, + "epoch": 0.0061562139284340135, + "flos": 590163394560.0, + "grad_norm": 0.21886797396213825, + "language_loss": 1.26610851, + "learning_rate": 0.0006863039060567947, + "loss": 1.29056239, + "num_input_tokens_seen": 2419440, + "router_z_loss_mlp": 5.30078125, + "step": 32, + "time_per_iteration": 2.7791683673858643 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02338603, + "balance_loss_mlp": 1.80950415, + "epoch": 0.006348595613697576, + "flos": 619442154240.0, + "grad_norm": 0.18971916612404452, + "language_loss": 1.17543316, + "learning_rate": 0.0006923974775611263, + "loss": 1.19881916, + "num_input_tokens_seen": 2496368, + "router_z_loss_mlp": 5.29296875, + "step": 33, + "time_per_iteration": 2.836601495742798 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02160521, + "balance_loss_mlp": 1.64134097, + "epoch": 0.006540977298961139, + "flos": 779300109312.0, + "grad_norm": 0.13369632510289112, + "language_loss": 1.13907146, + "learning_rate": 0.0006983091239737814, + "loss": 1.16067672, + "num_input_tokens_seen": 2573280, + "router_z_loss_mlp": 5.19140625, + "step": 34, + "time_per_iteration": 3.021479606628418 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0221033, + "balance_loss_mlp": 1.69649041, + "epoch": 0.006733358984224702, + "flos": 668373264384.0, + "grad_norm": 0.11522706717853448, + "language_loss": 1.11973858, + "learning_rate": 0.0007040493939600222, + "loss": 1.14184177, + "num_input_tokens_seen": 2647248, + "router_z_loss_mlp": 5.13671875, + "step": 35, + "time_per_iteration": 2.9400346279144287 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0227657, + "balance_loss_mlp": 1.76997864, + "epoch": 0.006925740669488265, + "flos": 565496133888.0, + "grad_norm": 0.11143421895921844, + "language_loss": 1.12295914, + "learning_rate": 0.0007096279445021078, + "loss": 1.14572477, + "num_input_tokens_seen": 2720736, + "router_z_loss_mlp": 5.0625, + "step": 36, + "time_per_iteration": 2.698153495788574 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02284885, + "balance_loss_mlp": 1.78668559, + "epoch": 0.007118122354751828, + "flos": 551112405504.0, + "grad_norm": 0.11733654674395574, + "language_loss": 1.1734066, + "learning_rate": 0.0007150536386503726, + "loss": 1.19625545, + "num_input_tokens_seen": 2800336, + "router_z_loss_mlp": 4.9765625, + "step": 37, + "time_per_iteration": 2.8579084873199463 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02268399, + "balance_loss_mlp": 1.77782845, + "epoch": 0.007310504040015391, + "flos": 703814951424.0, + "grad_norm": 0.14208952684155102, + "language_loss": 1.10088778, + "learning_rate": 0.0007203346302358509, + "loss": 1.12357187, + "num_input_tokens_seen": 2883184, + "router_z_loss_mlp": 4.8984375, + "step": 38, + "time_per_iteration": 2.928835391998291 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02220017, + "balance_loss_mlp": 1.73555112, + "epoch": 0.007502885725278953, + "flos": 600501361920.0, + "grad_norm": 0.142042154575746, + "language_loss": 1.15486813, + "learning_rate": 0.000725478437577282, + "loss": 1.17706823, + "num_input_tokens_seen": 2960736, + "router_z_loss_mlp": 4.8359375, + "step": 39, + "time_per_iteration": 2.8706436157226562 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0209897, + "balance_loss_mlp": 1.62251425, + "epoch": 0.007695267410542516, + "flos": 561428018688.0, + "grad_norm": 0.13255726845543458, + "language_loss": 1.10233212, + "learning_rate": 0.0007304920078549186, + "loss": 1.12332189, + "num_input_tokens_seen": 3033472, + "router_z_loss_mlp": 4.75390625, + "step": 40, + "time_per_iteration": 2.6895179748535156 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01939831, + "balance_loss_mlp": 1.46986008, + "epoch": 0.007887649095806078, + "flos": 509231725056.0, + "grad_norm": 0.11166218824526469, + "language_loss": 1.12161303, + "learning_rate": 0.0007353817735343603, + "loss": 1.14101124, + "num_input_tokens_seen": 3107824, + "router_z_loss_mlp": 4.6875, + "step": 41, + "time_per_iteration": 2.709167957305908 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0184399, + "balance_loss_mlp": 1.3778342, + "epoch": 0.008080030781069641, + "flos": 504905040384.0, + "grad_norm": 0.06254207778511488, + "language_loss": 1.07663667, + "learning_rate": 0.0007401537019902344, + "loss": 1.09507656, + "num_input_tokens_seen": 3176528, + "router_z_loss_mlp": 4.6484375, + "step": 42, + "time_per_iteration": 2.5947837829589844 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01789021, + "balance_loss_mlp": 1.32896876, + "epoch": 0.008272412466333205, + "flos": 519106988544.0, + "grad_norm": 0.07012531219711775, + "language_loss": 1.09992051, + "learning_rate": 0.0007448133392900729, + "loss": 1.11781073, + "num_input_tokens_seen": 3254256, + "router_z_loss_mlp": 4.5859375, + "step": 43, + "time_per_iteration": 2.6997878551483154 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01787217, + "balance_loss_mlp": 1.32983518, + "epoch": 0.008464794151596768, + "flos": 609184866816.0, + "grad_norm": 0.09276066699658307, + "language_loss": 1.05755496, + "learning_rate": 0.0007493658489441491, + "loss": 1.07542706, + "num_input_tokens_seen": 3340224, + "router_z_loss_mlp": 4.56640625, + "step": 44, + "time_per_iteration": 2.8852477073669434 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0177156, + "balance_loss_mlp": 1.31913674, + "epoch": 0.00865717583686033, + "flos": 539007214848.0, + "grad_norm": 0.11478380715178954, + "language_loss": 1.09959674, + "learning_rate": 0.0007538160463002316, + "loss": 1.11731243, + "num_input_tokens_seen": 3409216, + "router_z_loss_mlp": 4.53125, + "step": 45, + "time_per_iteration": 2.685568332672119 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01802016, + "balance_loss_mlp": 1.35378933, + "epoch": 0.008849557522123894, + "flos": 509010094080.0, + "grad_norm": 0.14537339285711792, + "language_loss": 1.13533509, + "learning_rate": 0.0007581684291577274, + "loss": 1.15335524, + "num_input_tokens_seen": 3478352, + "router_z_loss_mlp": 4.49609375, + "step": 46, + "time_per_iteration": 2.5798568725585938 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01764716, + "balance_loss_mlp": 1.31915987, + "epoch": 0.009041939207387457, + "flos": 626508135168.0, + "grad_norm": 0.13285081251714825, + "language_loss": 1.15270185, + "learning_rate": 0.0007624272050891776, + "loss": 1.17034888, + "num_input_tokens_seen": 3555616, + "router_z_loss_mlp": 4.46875, + "step": 47, + "time_per_iteration": 2.822632312774658 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0175788, + "balance_loss_mlp": 1.31461263, + "epoch": 0.00923432089265102, + "flos": 550610817792.0, + "grad_norm": 0.11934546954286276, + "language_loss": 1.04916859, + "learning_rate": 0.0007665963158851307, + "loss": 1.06674731, + "num_input_tokens_seen": 3634512, + "router_z_loss_mlp": 4.4453125, + "step": 48, + "time_per_iteration": 2.7924864292144775 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01741735, + "balance_loss_mlp": 1.29846764, + "epoch": 0.009426702577914583, + "flos": 563679333120.0, + "grad_norm": 0.08548395668661983, + "language_loss": 1.13647461, + "learning_rate": 0.0007706794594783609, + "loss": 1.15389204, + "num_input_tokens_seen": 3708480, + "router_z_loss_mlp": 4.4453125, + "step": 49, + "time_per_iteration": 2.734813928604126 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01727457, + "balance_loss_mlp": 1.28838515, + "epoch": 0.009619084263178146, + "flos": 617926697472.0, + "grad_norm": 0.06892583067190382, + "language_loss": 1.12110853, + "learning_rate": 0.0007746801096530423, + "loss": 1.13838315, + "num_input_tokens_seen": 3783472, + "router_z_loss_mlp": 4.40234375, + "step": 50, + "time_per_iteration": 2.7447421550750732 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01719129, + "balance_loss_mlp": 1.28043914, + "epoch": 0.009811465948441709, + "flos": 542489171712.0, + "grad_norm": 0.04778558244894799, + "language_loss": 1.16797209, + "learning_rate": 0.0007786015338021173, + "loss": 1.1851635, + "num_input_tokens_seen": 3851360, + "router_z_loss_mlp": 4.3984375, + "step": 51, + "time_per_iteration": 2.65645694732666 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01722789, + "balance_loss_mlp": 1.28562462, + "epoch": 0.010003847633705272, + "flos": 536977531392.0, + "grad_norm": 0.06217135289779639, + "language_loss": 1.09074998, + "learning_rate": 0.0007824468089603051, + "loss": 1.10797799, + "num_input_tokens_seen": 3923056, + "router_z_loss_mlp": 4.3828125, + "step": 52, + "time_per_iteration": 2.7218713760375977 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01697539, + "balance_loss_mlp": 1.26380801, + "epoch": 0.010196229318968833, + "flos": 910806657792.0, + "grad_norm": 0.04206474108062499, + "language_loss": 1.08130515, + "learning_rate": 0.0007862188363098669, + "loss": 1.09828055, + "num_input_tokens_seen": 4004528, + "router_z_loss_mlp": 4.34765625, + "step": 53, + "time_per_iteration": 3.149973154067993 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01668333, + "balance_loss_mlp": 1.23765349, + "epoch": 0.010388611004232396, + "flos": 586970142720.0, + "grad_norm": 0.050634309517598654, + "language_loss": 1.08688021, + "learning_rate": 0.0007899203543304438, + "loss": 1.10356343, + "num_input_tokens_seen": 4078704, + "router_z_loss_mlp": 4.31640625, + "step": 54, + "time_per_iteration": 2.7033088207244873 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01691162, + "balance_loss_mlp": 1.26315343, + "epoch": 0.01058099268949596, + "flos": 503472208896.0, + "grad_norm": 0.06464656169002964, + "language_loss": 1.22991037, + "learning_rate": 0.0007935539507422731, + "loss": 1.246822, + "num_input_tokens_seen": 4143600, + "router_z_loss_mlp": 4.2890625, + "step": 55, + "time_per_iteration": 2.601745843887329 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.017059, + "balance_loss_mlp": 1.28017938, + "epoch": 0.010773374374759523, + "flos": 545558969088.0, + "grad_norm": 0.06403483907250343, + "language_loss": 1.12561536, + "learning_rate": 0.0007971220733732573, + "loss": 1.14267421, + "num_input_tokens_seen": 4217904, + "router_z_loss_mlp": 4.265625, + "step": 56, + "time_per_iteration": 2.677314281463623 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0169453, + "balance_loss_mlp": 1.27262425, + "epoch": 0.010965756060023086, + "flos": 527286960384.0, + "grad_norm": 0.061369678053330295, + "language_loss": 1.07931721, + "learning_rate": 0.0008006270400641869, + "loss": 1.09626245, + "num_input_tokens_seen": 4293920, + "router_z_loss_mlp": 4.2265625, + "step": 57, + "time_per_iteration": 2.7162468433380127 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01699229, + "balance_loss_mlp": 1.27846837, + "epoch": 0.011158137745286649, + "flos": 578098054656.0, + "grad_norm": 0.06126094216688289, + "language_loss": 1.08923888, + "learning_rate": 0.0008040710477125043, + "loss": 1.10623109, + "num_input_tokens_seen": 4370080, + "router_z_loss_mlp": 4.21484375, + "step": 58, + "time_per_iteration": 2.724116563796997 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01648065, + "balance_loss_mlp": 1.23150039, + "epoch": 0.011350519430550212, + "flos": 530314961664.0, + "grad_norm": 0.059594432794803906, + "language_loss": 1.09501219, + "learning_rate": 0.0008074561805429771, + "loss": 1.11149275, + "num_input_tokens_seen": 4439792, + "router_z_loss_mlp": 4.171875, + "step": 59, + "time_per_iteration": 2.613821268081665 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01628698, + "balance_loss_mlp": 1.21594822, + "epoch": 0.011542901115813775, + "flos": 556971076608.0, + "grad_norm": 0.046387810099464834, + "language_loss": 1.0703913, + "learning_rate": 0.0008107844176832545, + "loss": 1.08667827, + "num_input_tokens_seen": 4510800, + "router_z_loss_mlp": 4.1328125, + "step": 60, + "time_per_iteration": 2.6809566020965576 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01602811, + "balance_loss_mlp": 1.19349384, + "epoch": 0.011735282801077338, + "flos": 573176463360.0, + "grad_norm": 0.036957475185327084, + "language_loss": 1.08104563, + "learning_rate": 0.0008140576401132568, + "loss": 1.09707379, + "num_input_tokens_seen": 4581136, + "router_z_loss_mlp": 4.09765625, + "step": 61, + "time_per_iteration": 2.644085645675659 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01596506, + "balance_loss_mlp": 1.19024038, + "epoch": 0.0119276644863409, + "flos": 616717442304.0, + "grad_norm": 0.034032461682055544, + "language_loss": 1.09685671, + "learning_rate": 0.0008172776370494935, + "loss": 1.11282182, + "num_input_tokens_seen": 4650352, + "router_z_loss_mlp": 4.06640625, + "step": 62, + "time_per_iteration": 2.7589328289031982 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01605764, + "balance_loss_mlp": 1.20255029, + "epoch": 0.012120046171604464, + "flos": 502085064192.0, + "grad_norm": 0.035968497482949544, + "language_loss": 1.17104983, + "learning_rate": 0.0008204461118185703, + "loss": 1.18710756, + "num_input_tokens_seen": 4716336, + "router_z_loss_mlp": 4.03515625, + "step": 63, + "time_per_iteration": 2.594369411468506 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01603311, + "balance_loss_mlp": 1.20353031, + "epoch": 0.012312427856868027, + "flos": 474302319360.0, + "grad_norm": 0.04911792883083492, + "language_loss": 1.06295228, + "learning_rate": 0.0008235646872681536, + "loss": 1.07898545, + "num_input_tokens_seen": 4781648, + "router_z_loss_mlp": 3.99609375, + "step": 64, + "time_per_iteration": 2.5651702880859375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01599528, + "balance_loss_mlp": 1.20279896, + "epoch": 0.012504809542131588, + "flos": 539471864064.0, + "grad_norm": 0.049725750424410776, + "language_loss": 1.06296277, + "learning_rate": 0.0008266349107584288, + "loss": 1.07895803, + "num_input_tokens_seen": 4852320, + "router_z_loss_mlp": 3.95898438, + "step": 65, + "time_per_iteration": 2.6876485347747803 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01596697, + "balance_loss_mlp": 1.20492756, + "epoch": 0.012697191227395151, + "flos": 609857541120.0, + "grad_norm": 0.056540756097456804, + "language_loss": 1.08585978, + "learning_rate": 0.0008296582587724851, + "loss": 1.10182667, + "num_input_tokens_seen": 4922016, + "router_z_loss_mlp": 3.91210938, + "step": 66, + "time_per_iteration": 2.71223783493042 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01587883, + "balance_loss_mlp": 1.19821179, + "epoch": 0.012889572912658714, + "flos": 769398600960.0, + "grad_norm": 0.04465917834699911, + "language_loss": 1.0627861, + "learning_rate": 0.0008326361411800136, + "loss": 1.07866502, + "num_input_tokens_seen": 5000128, + "router_z_loss_mlp": 3.89648438, + "step": 67, + "time_per_iteration": 2.9413115978240967 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01577237, + "balance_loss_mlp": 1.19099891, + "epoch": 0.013081954597922277, + "flos": 535021724928.0, + "grad_norm": 0.05343660826588632, + "language_loss": 1.06744349, + "learning_rate": 0.0008355699051851403, + "loss": 1.08321595, + "num_input_tokens_seen": 5074512, + "router_z_loss_mlp": 3.86132812, + "step": 68, + "time_per_iteration": 2.726212501525879 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0157129, + "balance_loss_mlp": 1.18829489, + "epoch": 0.01327433628318584, + "flos": 574181584128.0, + "grad_norm": 0.041490887209285586, + "language_loss": 1.14052749, + "learning_rate": 0.0008384608389860635, + "loss": 1.15624034, + "num_input_tokens_seen": 5141856, + "router_z_loss_mlp": 3.828125, + "step": 69, + "time_per_iteration": 2.6679208278656006 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0156381, + "balance_loss_mlp": 1.18386579, + "epoch": 0.013466717968449404, + "flos": 498259967232.0, + "grad_norm": 0.03618836919088814, + "language_loss": 1.04182374, + "learning_rate": 0.000841310175171381, + "loss": 1.05746174, + "num_input_tokens_seen": 5209280, + "router_z_loss_mlp": 3.796875, + "step": 70, + "time_per_iteration": 2.6277127265930176 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01563963, + "balance_loss_mlp": 1.18592632, + "epoch": 0.013659099653712967, + "flos": 566622763776.0, + "grad_norm": 0.04320101591589407, + "language_loss": 1.02295327, + "learning_rate": 0.000844119093875517, + "loss": 1.03859293, + "num_input_tokens_seen": 5285424, + "router_z_loss_mlp": 3.77734375, + "step": 71, + "time_per_iteration": 2.7236883640289307 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01558639, + "balance_loss_mlp": 1.18403625, + "epoch": 0.01385148133897653, + "flos": 574943686656.0, + "grad_norm": 0.03416580025853519, + "language_loss": 1.06855714, + "learning_rate": 0.0008468887257134666, + "loss": 1.08414352, + "num_input_tokens_seen": 5358624, + "router_z_loss_mlp": 3.7421875, + "step": 72, + "time_per_iteration": 2.6696412563323975 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01558456, + "balance_loss_mlp": 1.18499684, + "epoch": 0.014043863024240093, + "flos": 577959048960.0, + "grad_norm": 0.037886537215891476, + "language_loss": 1.09368944, + "learning_rate": 0.0008496201545131264, + "loss": 1.10927403, + "num_input_tokens_seen": 5429792, + "router_z_loss_mlp": 3.73046875, + "step": 73, + "time_per_iteration": 2.701594591140747 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01545785, + "balance_loss_mlp": 1.17575896, + "epoch": 0.014236244709503656, + "flos": 940265252352.0, + "grad_norm": 0.04766211184506119, + "language_loss": 1.07240248, + "learning_rate": 0.0008523144198617317, + "loss": 1.08786011, + "num_input_tokens_seen": 5518608, + "router_z_loss_mlp": 3.6953125, + "step": 74, + "time_per_iteration": 3.1882145404815674 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01551426, + "balance_loss_mlp": 1.18387985, + "epoch": 0.014428626394767219, + "flos": 529496478720.0, + "grad_norm": 0.031986864242930464, + "language_loss": 1.06216824, + "learning_rate": 0.0008549725194813783, + "loss": 1.0776825, + "num_input_tokens_seen": 5590576, + "router_z_loss_mlp": 3.66992188, + "step": 75, + "time_per_iteration": 2.666274309158325 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01546662, + "balance_loss_mlp": 1.18102288, + "epoch": 0.014621008080030782, + "flos": 805283549952.0, + "grad_norm": 0.03321604497436844, + "language_loss": 1.05779314, + "learning_rate": 0.0008575954114472099, + "loss": 1.07325983, + "num_input_tokens_seen": 5674224, + "router_z_loss_mlp": 3.65039062, + "step": 76, + "time_per_iteration": 3.1192731857299805 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01547179, + "balance_loss_mlp": 1.18478322, + "epoch": 0.014813389765294343, + "flos": 698357746176.0, + "grad_norm": 0.03477979781895141, + "language_loss": 1.02737951, + "learning_rate": 0.0008601840162606118, + "loss": 1.04285145, + "num_input_tokens_seen": 5757648, + "router_z_loss_mlp": 3.6171875, + "step": 77, + "time_per_iteration": 3.0015783309936523 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01547226, + "balance_loss_mlp": 1.18788171, + "epoch": 0.015005771450557906, + "flos": 598165476864.0, + "grad_norm": 0.032631512960834254, + "language_loss": 1.09477437, + "learning_rate": 0.000862739218788641, + "loss": 1.11024666, + "num_input_tokens_seen": 5837600, + "router_z_loss_mlp": 3.58984375, + "step": 78, + "time_per_iteration": 2.790245771408081 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01536731, + "balance_loss_mlp": 1.18177319, + "epoch": 0.01519815313582147, + "flos": 550493199360.0, + "grad_norm": 0.0308447873241268, + "language_loss": 1.07131243, + "learning_rate": 0.0008652618700799138, + "loss": 1.0866797, + "num_input_tokens_seen": 5907248, + "router_z_loss_mlp": 3.55664062, + "step": 79, + "time_per_iteration": 2.6302430629730225 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01532812, + "balance_loss_mlp": 1.18033433, + "epoch": 0.015390534821085032, + "flos": 431440817664.0, + "grad_norm": 0.04595099678969376, + "language_loss": 1.06556606, + "learning_rate": 0.0008677527890662774, + "loss": 1.08089423, + "num_input_tokens_seen": 5970864, + "router_z_loss_mlp": 3.53125, + "step": 80, + "time_per_iteration": 2.4970459938049316 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01520539, + "balance_loss_mlp": 1.17130363, + "epoch": 0.015582916506348595, + "flos": 525185345280.0, + "grad_norm": 0.030530536654869142, + "language_loss": 1.07461143, + "learning_rate": 0.0008702127641587799, + "loss": 1.08981681, + "num_input_tokens_seen": 6040800, + "router_z_loss_mlp": 3.49804688, + "step": 81, + "time_per_iteration": 2.6258630752563477 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01512144, + "balance_loss_mlp": 1.16500628, + "epoch": 0.015775298191612157, + "flos": 576617591040.0, + "grad_norm": 0.026948447424875538, + "language_loss": 1.02672768, + "learning_rate": 0.0008726425547457192, + "loss": 1.04184914, + "num_input_tokens_seen": 6111840, + "router_z_loss_mlp": 3.4765625, + "step": 82, + "time_per_iteration": 2.7344956398010254 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01517079, + "balance_loss_mlp": 1.17375636, + "epoch": 0.01596767987687572, + "flos": 611440071936.0, + "grad_norm": 0.03479426421062965, + "language_loss": 1.02940345, + "learning_rate": 0.0008750428925998964, + "loss": 1.04457426, + "num_input_tokens_seen": 6183872, + "router_z_loss_mlp": 3.4375, + "step": 83, + "time_per_iteration": 2.738685369491577 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01509349, + "balance_loss_mlp": 1.16850555, + "epoch": 0.016160061562139283, + "flos": 568233484800.0, + "grad_norm": 0.05178756375238081, + "language_loss": 1.08039558, + "learning_rate": 0.0008774144832015932, + "loss": 1.09548914, + "num_input_tokens_seen": 6255760, + "router_z_loss_mlp": 3.41210938, + "step": 84, + "time_per_iteration": 2.6948299407958984 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02575775, + "balance_loss_mlp": 2.26144409, + "epoch": 0.016352443247402846, + "flos": 1414502431488.0, + "grad_norm": 0.37456313977874084, + "language_loss": 0.74774313, + "learning_rate": 0.0008797580069832641, + "loss": 0.7735008, + "num_input_tokens_seen": 6472960, + "router_z_loss_mlp": 3.140625, + "step": 85, + "time_per_iteration": 4.596364974975586 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01517697, + "balance_loss_mlp": 1.17895198, + "epoch": 0.01654482493266641, + "flos": 731786279424.0, + "grad_norm": 0.04138572693056026, + "language_loss": 1.03059626, + "learning_rate": 0.0008820741205014318, + "loss": 1.04577315, + "num_input_tokens_seen": 6548912, + "router_z_loss_mlp": 3.390625, + "step": 86, + "time_per_iteration": 2.901047706604004 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01566516, + "balance_loss_mlp": 1.22757995, + "epoch": 0.016737206617929972, + "flos": 537405242112.0, + "grad_norm": 0.0588613682629828, + "language_loss": 1.04849172, + "learning_rate": 0.0008843634575408404, + "loss": 1.06415701, + "num_input_tokens_seen": 6621520, + "router_z_loss_mlp": 3.39257812, + "step": 87, + "time_per_iteration": 2.6739823818206787 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01583525, + "balance_loss_mlp": 1.24497032, + "epoch": 0.016929588303193535, + "flos": 538130406144.0, + "grad_norm": 0.09131872689500015, + "language_loss": 1.06101418, + "learning_rate": 0.0008866266301555082, + "loss": 1.07684946, + "num_input_tokens_seen": 6698432, + "router_z_loss_mlp": 3.38867188, + "step": 88, + "time_per_iteration": 2.741093635559082 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0156783, + "balance_loss_mlp": 1.23118281, + "epoch": 0.017121969988457098, + "flos": 527792438784.0, + "grad_norm": 0.07103005743700296, + "language_loss": 1.07027078, + "learning_rate": 0.0008888642296509615, + "loss": 1.08594918, + "num_input_tokens_seen": 6764336, + "router_z_loss_mlp": 3.36914062, + "step": 89, + "time_per_iteration": 2.622267007827759 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01554346, + "balance_loss_mlp": 1.2196058, + "epoch": 0.01731435167372066, + "flos": 626768649984.0, + "grad_norm": 0.057543283798364535, + "language_loss": 1.11941445, + "learning_rate": 0.0008910768275115906, + "loss": 1.13495779, + "num_input_tokens_seen": 6839392, + "router_z_loss_mlp": 3.34960938, + "step": 90, + "time_per_iteration": 2.778939962387085 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01536545, + "balance_loss_mlp": 1.20409441, + "epoch": 0.017506733358984224, + "flos": 497385103872.0, + "grad_norm": 0.06951140803051024, + "language_loss": 1.07318401, + "learning_rate": 0.0008932649762767675, + "loss": 1.08854938, + "num_input_tokens_seen": 6907344, + "router_z_loss_mlp": 3.32617188, + "step": 91, + "time_per_iteration": 2.5841660499572754 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01529864, + "balance_loss_mlp": 1.20122755, + "epoch": 0.017699115044247787, + "flos": 747218870016.0, + "grad_norm": 0.037985069994816135, + "language_loss": 1.10022223, + "learning_rate": 0.0008954292103690864, + "loss": 1.11552095, + "num_input_tokens_seen": 6982464, + "router_z_loss_mlp": 3.28710938, + "step": 92, + "time_per_iteration": 2.976200580596924 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01525091, + "balance_loss_mlp": 1.19893408, + "epoch": 0.01789149672951135, + "flos": 516521282304.0, + "grad_norm": 0.05507041657686672, + "language_loss": 1.1172272, + "learning_rate": 0.0008975700468778296, + "loss": 1.13247812, + "num_input_tokens_seen": 7049712, + "router_z_loss_mlp": 3.26171875, + "step": 93, + "time_per_iteration": 2.5778274536132812 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01518338, + "balance_loss_mlp": 1.19427943, + "epoch": 0.018083878414774913, + "flos": 587230657536.0, + "grad_norm": 0.047907590915393955, + "language_loss": 1.05762661, + "learning_rate": 0.0008996879863005366, + "loss": 1.07280993, + "num_input_tokens_seen": 7120288, + "router_z_loss_mlp": 3.24023438, + "step": 94, + "time_per_iteration": 2.6827101707458496 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01506508, + "balance_loss_mlp": 1.18664575, + "epoch": 0.018276260100038477, + "flos": 498370782720.0, + "grad_norm": 0.03950158468897577, + "language_loss": 1.05640411, + "learning_rate": 0.0009017835132453337, + "loss": 1.07146931, + "num_input_tokens_seen": 7188896, + "router_z_loss_mlp": 3.19726562, + "step": 95, + "time_per_iteration": 2.5879104137420654 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01488471, + "balance_loss_mlp": 1.17223215, + "epoch": 0.01846864178530204, + "flos": 641233058304.0, + "grad_norm": 0.042611409633865054, + "language_loss": 1.05607677, + "learning_rate": 0.0009038570970964896, + "loss": 1.07096148, + "num_input_tokens_seen": 7259536, + "router_z_loss_mlp": 3.16015625, + "step": 96, + "time_per_iteration": 2.761634349822998 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01487316, + "balance_loss_mlp": 1.17374837, + "epoch": 0.018661023470565603, + "flos": 512667995136.0, + "grad_norm": 0.026597294022958493, + "language_loss": 1.02809072, + "learning_rate": 0.0009059091926454854, + "loss": 1.04296374, + "num_input_tokens_seen": 7326752, + "router_z_loss_mlp": 3.1328125, + "step": 97, + "time_per_iteration": 2.602036952972412 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01487556, + "balance_loss_mlp": 1.17742097, + "epoch": 0.018853405155829166, + "flos": 932697683712.0, + "grad_norm": 0.04097414840704221, + "language_loss": 1.01764143, + "learning_rate": 0.0009079402406897198, + "loss": 1.03251696, + "num_input_tokens_seen": 7417488, + "router_z_loss_mlp": 3.09765625, + "step": 98, + "time_per_iteration": 3.2514705657958984 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01483888, + "balance_loss_mlp": 1.17642295, + "epoch": 0.01904578684109273, + "flos": 577587718656.0, + "grad_norm": 0.027217181555243938, + "language_loss": 1.03385735, + "learning_rate": 0.0009099506686008212, + "loss": 1.04869628, + "num_input_tokens_seen": 7493136, + "router_z_loss_mlp": 3.0703125, + "step": 99, + "time_per_iteration": 2.7867672443389893 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01473245, + "balance_loss_mlp": 1.16883183, + "epoch": 0.019238168526356292, + "flos": 559521789696.0, + "grad_norm": 0.02943095981266107, + "language_loss": 1.06245995, + "learning_rate": 0.0009119408908644013, + "loss": 1.07719231, + "num_input_tokens_seen": 7560896, + "router_z_loss_mlp": 3.0390625, + "step": 100, + "time_per_iteration": 2.718982219696045 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01466855, + "balance_loss_mlp": 1.164922, + "epoch": 0.019430550211619855, + "flos": 725104267776.0, + "grad_norm": 0.035830377247789626, + "language_loss": 1.12020779, + "learning_rate": 0.0009139113095929519, + "loss": 1.13487625, + "num_input_tokens_seen": 7629040, + "router_z_loss_mlp": 3.01367188, + "step": 101, + "time_per_iteration": 2.9023444652557373 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0146708, + "balance_loss_mlp": 1.16781712, + "epoch": 0.019622931896883418, + "flos": 500456846592.0, + "grad_norm": 0.031534744220975436, + "language_loss": 1.0658195, + "learning_rate": 0.0009158623150134762, + "loss": 1.08049035, + "num_input_tokens_seen": 7694256, + "router_z_loss_mlp": 2.98632812, + "step": 102, + "time_per_iteration": 2.5731325149536133 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01479653, + "balance_loss_mlp": 1.1828692, + "epoch": 0.01981531358214698, + "flos": 510282532608.0, + "grad_norm": 0.0334583858191085, + "language_loss": 1.05968487, + "learning_rate": 0.000917794285931332, + "loss": 1.07448149, + "num_input_tokens_seen": 7762256, + "router_z_loss_mlp": 2.9609375, + "step": 103, + "time_per_iteration": 2.656132221221924 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01477107, + "balance_loss_mlp": 1.18184972, + "epoch": 0.020007695267410544, + "flos": 522393559296.0, + "grad_norm": 0.033386157220771755, + "language_loss": 0.97816026, + "learning_rate": 0.0009197075901716639, + "loss": 0.99293131, + "num_input_tokens_seen": 7834400, + "router_z_loss_mlp": 2.9453125, + "step": 104, + "time_per_iteration": 2.7207133769989014 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01472947, + "balance_loss_mlp": 1.1811223, + "epoch": 0.020200076952674107, + "flos": 534444314880.0, + "grad_norm": 0.03432724584635873, + "language_loss": 1.08410704, + "learning_rate": 0.0009216025849997171, + "loss": 1.09883642, + "num_input_tokens_seen": 7911184, + "router_z_loss_mlp": 2.92382812, + "step": 105, + "time_per_iteration": 2.783440113067627 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01461838, + "balance_loss_mlp": 1.17115784, + "epoch": 0.020392458637937667, + "flos": 686083414272.0, + "grad_norm": 0.04360543496830388, + "language_loss": 1.02907205, + "learning_rate": 0.0009234796175212258, + "loss": 1.04369044, + "num_input_tokens_seen": 7985280, + "router_z_loss_mlp": 2.9140625, + "step": 106, + "time_per_iteration": 2.914760112762451 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01450941, + "balance_loss_mlp": 1.1615957, + "epoch": 0.02058484032320123, + "flos": 703415430912.0, + "grad_norm": 0.03266429542390293, + "language_loss": 1.06572628, + "learning_rate": 0.000925339025064007, + "loss": 1.08023572, + "num_input_tokens_seen": 8068320, + "router_z_loss_mlp": 2.90039062, + "step": 107, + "time_per_iteration": 2.951838254928589 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01453976, + "balance_loss_mlp": 1.16558492, + "epoch": 0.020777222008464793, + "flos": 640328059392.0, + "grad_norm": 0.03192051704400644, + "language_loss": 0.99516582, + "learning_rate": 0.0009271811355418027, + "loss": 1.00970554, + "num_input_tokens_seen": 8148144, + "router_z_loss_mlp": 2.890625, + "step": 108, + "time_per_iteration": 2.897881507873535 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01449031, + "balance_loss_mlp": 1.16159379, + "epoch": 0.020969603693728356, + "flos": 683321763840.0, + "grad_norm": 0.04466737388011785, + "language_loss": 1.06219566, + "learning_rate": 0.0009290062678013548, + "loss": 1.07668602, + "num_input_tokens_seen": 8222256, + "router_z_loss_mlp": 2.88085938, + "step": 109, + "time_per_iteration": 2.8423218727111816 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01430047, + "balance_loss_mlp": 1.14413536, + "epoch": 0.02116198537899192, + "flos": 534420015360.0, + "grad_norm": 0.034258615277409615, + "language_loss": 1.04797208, + "learning_rate": 0.0009308147319536321, + "loss": 1.06227255, + "num_input_tokens_seen": 8292432, + "router_z_loss_mlp": 2.86523438, + "step": 110, + "time_per_iteration": 2.6316323280334473 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01425495, + "balance_loss_mlp": 1.14053667, + "epoch": 0.021354367064255482, + "flos": 718728457728.0, + "grad_norm": 0.048864006828935096, + "language_loss": 1.11352324, + "learning_rate": 0.0009326068296900676, + "loss": 1.12777817, + "num_input_tokens_seen": 8365024, + "router_z_loss_mlp": 2.85546875, + "step": 111, + "time_per_iteration": 2.8313205242156982 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01416698, + "balance_loss_mlp": 1.13269377, + "epoch": 0.021546748749519045, + "flos": 520624390656.0, + "grad_norm": 0.040751650479700946, + "language_loss": 1.01643181, + "learning_rate": 0.0009343828545846161, + "loss": 1.03059864, + "num_input_tokens_seen": 8442448, + "router_z_loss_mlp": 2.84570312, + "step": 112, + "time_per_iteration": 2.7729175090789795 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01401674, + "balance_loss_mlp": 1.11805177, + "epoch": 0.021739130434782608, + "flos": 506161927680.0, + "grad_norm": 0.042106341000359294, + "language_loss": 1.06266427, + "learning_rate": 0.0009361430923823841, + "loss": 1.07668102, + "num_input_tokens_seen": 8508992, + "router_z_loss_mlp": 2.84179688, + "step": 113, + "time_per_iteration": 2.5920841693878174 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01394311, + "balance_loss_mlp": 1.11126053, + "epoch": 0.02193151212004617, + "flos": 464427055872.0, + "grad_norm": 0.07156510336232694, + "language_loss": 1.09574234, + "learning_rate": 0.0009378878212755459, + "loss": 1.10968542, + "num_input_tokens_seen": 8574048, + "router_z_loss_mlp": 2.8359375, + "step": 114, + "time_per_iteration": 2.5213706493377686 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01376382, + "balance_loss_mlp": 1.09371293, + "epoch": 0.022123893805309734, + "flos": 553332617472.0, + "grad_norm": 0.03568103744776456, + "language_loss": 0.9948864, + "learning_rate": 0.0009396173121672103, + "loss": 1.0086503, + "num_input_tokens_seen": 8647808, + "router_z_loss_mlp": 2.83203125, + "step": 115, + "time_per_iteration": 2.654648780822754 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01351182, + "balance_loss_mlp": 1.0677501, + "epoch": 0.022316275490573297, + "flos": 637379771136.0, + "grad_norm": 0.04471438423319615, + "language_loss": 1.05214882, + "learning_rate": 0.0009413318289238633, + "loss": 1.06566072, + "num_input_tokens_seen": 8719760, + "router_z_loss_mlp": 2.83984375, + "step": 116, + "time_per_iteration": 2.7842695713043213 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01311882, + "balance_loss_mlp": 1.0282588, + "epoch": 0.02250865717583686, + "flos": 800316271872.0, + "grad_norm": 0.046340717018109684, + "language_loss": 0.97282118, + "learning_rate": 0.0009430316286169771, + "loss": 0.98593992, + "num_input_tokens_seen": 8798752, + "router_z_loss_mlp": 2.84179688, + "step": 117, + "time_per_iteration": 3.015839099884033 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01377985, + "balance_loss_mlp": 1.09283674, + "epoch": 0.022701038861100423, + "flos": 457063621632.0, + "grad_norm": 0.07808854544893538, + "language_loss": 1.02862036, + "learning_rate": 0.0009447169617543361, + "loss": 1.04240024, + "num_input_tokens_seen": 8866848, + "router_z_loss_mlp": 2.85742188, + "step": 118, + "time_per_iteration": 2.582919120788574 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01371579, + "balance_loss_mlp": 1.08871901, + "epoch": 0.022893420546363986, + "flos": 584187105024.0, + "grad_norm": 0.08661397198668377, + "language_loss": 1.09685123, + "learning_rate": 0.0009463880725016029, + "loss": 1.11056697, + "num_input_tokens_seen": 8935488, + "router_z_loss_mlp": 2.83398438, + "step": 119, + "time_per_iteration": 2.6932969093322754 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01312523, + "balance_loss_mlp": 1.03252411, + "epoch": 0.02308580223162755, + "flos": 562478826240.0, + "grad_norm": 0.04303328442288268, + "language_loss": 1.04977584, + "learning_rate": 0.0009480451988946134, + "loss": 1.06290102, + "num_input_tokens_seen": 9015344, + "router_z_loss_mlp": 2.8046875, + "step": 120, + "time_per_iteration": 2.8070547580718994 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01299284, + "balance_loss_mlp": 1.02252805, + "epoch": 0.023278183916891113, + "flos": 772646287872.0, + "grad_norm": 0.03799067846502037, + "language_loss": 1.05637264, + "learning_rate": 0.0009496885730428627, + "loss": 1.0693655, + "num_input_tokens_seen": 9094672, + "router_z_loss_mlp": 2.77148438, + "step": 121, + "time_per_iteration": 3.014753580093384 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0130842, + "balance_loss_mlp": 1.03376198, + "epoch": 0.023470565602154676, + "flos": 554431057152.0, + "grad_norm": 0.04194740398285866, + "language_loss": 1.04016769, + "learning_rate": 0.0009513184213246156, + "loss": 1.05325174, + "num_input_tokens_seen": 9160608, + "router_z_loss_mlp": 2.75, + "step": 122, + "time_per_iteration": 2.633074998855591 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01316034, + "balance_loss_mlp": 1.04442739, + "epoch": 0.02366294728741824, + "flos": 561167503872.0, + "grad_norm": 0.038872106950025416, + "language_loss": 1.07101583, + "learning_rate": 0.0009529349645740552, + "loss": 1.08417618, + "num_input_tokens_seen": 9228704, + "router_z_loss_mlp": 2.71875, + "step": 123, + "time_per_iteration": 2.6846470832824707 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01320226, + "balance_loss_mlp": 1.05014575, + "epoch": 0.0238553289726818, + "flos": 469517788416.0, + "grad_norm": 0.03403697644067516, + "language_loss": 1.05937934, + "learning_rate": 0.0009545384182608524, + "loss": 1.07258177, + "num_input_tokens_seen": 9294288, + "router_z_loss_mlp": 2.703125, + "step": 124, + "time_per_iteration": 2.5332376956939697 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01326404, + "balance_loss_mlp": 1.05880272, + "epoch": 0.024047710657945365, + "flos": 561104320512.0, + "grad_norm": 0.042208642163400256, + "language_loss": 1.03444421, + "learning_rate": 0.0009561289926625252, + "loss": 1.04770815, + "num_input_tokens_seen": 9368048, + "router_z_loss_mlp": 2.67773438, + "step": 125, + "time_per_iteration": 2.68180251121521 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01324487, + "balance_loss_mlp": 1.05841172, + "epoch": 0.024240092343208928, + "flos": 505771155456.0, + "grad_norm": 0.03944680997458598, + "language_loss": 1.08491933, + "learning_rate": 0.0009577068930299292, + "loss": 1.0981642, + "num_input_tokens_seen": 9434848, + "router_z_loss_mlp": 2.66210938, + "step": 126, + "time_per_iteration": 2.602088689804077 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01323529, + "balance_loss_mlp": 1.05936122, + "epoch": 0.02443247402847249, + "flos": 436753181184.0, + "grad_norm": 0.04017271590188075, + "language_loss": 1.04077768, + "learning_rate": 0.0009592723197462087, + "loss": 1.05401289, + "num_input_tokens_seen": 9504112, + "router_z_loss_mlp": 2.64257812, + "step": 127, + "time_per_iteration": 2.643617630004883 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01318089, + "balance_loss_mlp": 1.05563784, + "epoch": 0.024624855713736054, + "flos": 685069545216.0, + "grad_norm": 0.03549644551725154, + "language_loss": 1.0056293, + "learning_rate": 0.0009608254684795125, + "loss": 1.01881027, + "num_input_tokens_seen": 9590032, + "router_z_loss_mlp": 2.625, + "step": 128, + "time_per_iteration": 2.949061632156372 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01309242, + "balance_loss_mlp": 1.04831672, + "epoch": 0.024817237398999614, + "flos": 526114643712.0, + "grad_norm": 0.03183934804306691, + "language_loss": 1.03377914, + "learning_rate": 0.0009623665303297678, + "loss": 1.04687166, + "num_input_tokens_seen": 9663040, + "router_z_loss_mlp": 2.609375, + "step": 129, + "time_per_iteration": 2.7315783500671387 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0130104, + "balance_loss_mlp": 1.04106867, + "epoch": 0.025009619084263177, + "flos": 656887279872.0, + "grad_norm": 0.038944166016075116, + "language_loss": 1.07603359, + "learning_rate": 0.0009638956919697878, + "loss": 1.08904397, + "num_input_tokens_seen": 9736544, + "router_z_loss_mlp": 2.59960938, + "step": 130, + "time_per_iteration": 2.9588887691497803 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01293161, + "balance_loss_mlp": 1.03395224, + "epoch": 0.02520200076952674, + "flos": 455370275328.0, + "grad_norm": 0.03345888261117193, + "language_loss": 0.99743778, + "learning_rate": 0.0009654131357809714, + "loss": 1.0103693, + "num_input_tokens_seen": 9804656, + "router_z_loss_mlp": 2.59179688, + "step": 131, + "time_per_iteration": 2.5802786350250244 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01296775, + "balance_loss_mlp": 1.03966463, + "epoch": 0.025394382454790303, + "flos": 841269599232.0, + "grad_norm": 0.04496153180844387, + "language_loss": 1.08517051, + "learning_rate": 0.0009669190399838441, + "loss": 1.09813821, + "num_input_tokens_seen": 9888864, + "router_z_loss_mlp": 2.5703125, + "step": 132, + "time_per_iteration": 3.1034374237060547 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01297684, + "balance_loss_mlp": 1.04190826, + "epoch": 0.025586764140053866, + "flos": 582229353216.0, + "grad_norm": 0.044253016077327914, + "language_loss": 1.0183959, + "learning_rate": 0.0009684135787636724, + "loss": 1.03137255, + "num_input_tokens_seen": 9968208, + "router_z_loss_mlp": 2.55664062, + "step": 133, + "time_per_iteration": 2.8056888580322266 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01284726, + "balance_loss_mlp": 1.03066742, + "epoch": 0.02577914582531743, + "flos": 791678453760.0, + "grad_norm": 0.04023348500073193, + "language_loss": 1.06134284, + "learning_rate": 0.0009698969223913726, + "loss": 1.07419014, + "num_input_tokens_seen": 10049664, + "router_z_loss_mlp": 2.5390625, + "step": 134, + "time_per_iteration": 3.0520598888397217 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01279327, + "balance_loss_mlp": 1.02717578, + "epoch": 0.025971527510580992, + "flos": 596063861760.0, + "grad_norm": 0.02965492003563146, + "language_loss": 1.08660483, + "learning_rate": 0.0009713692373399265, + "loss": 1.09939814, + "num_input_tokens_seen": 10120096, + "router_z_loss_mlp": 2.51953125, + "step": 135, + "time_per_iteration": 2.679379463195801 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01931427, + "balance_loss_mlp": 1.66744995, + "epoch": 0.026163909195844555, + "flos": 1581077391360.0, + "grad_norm": 0.18396358569787127, + "language_loss": 0.79456228, + "learning_rate": 0.0009728306863964993, + "loss": 0.81387651, + "num_input_tokens_seen": 10348976, + "router_z_loss_mlp": 2.640625, + "step": 136, + "time_per_iteration": 5.69318151473999 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01580238, + "balance_loss_mlp": 1.32083893, + "epoch": 0.026356290881108118, + "flos": 1505163555840.0, + "grad_norm": 0.11058621392355464, + "language_loss": 0.77811038, + "learning_rate": 0.0009742814287704512, + "loss": 0.79391277, + "num_input_tokens_seen": 10576512, + "router_z_loss_mlp": 2.59375, + "step": 137, + "time_per_iteration": 4.930646896362305 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01336039, + "balance_loss_mlp": 1.08846498, + "epoch": 0.02654867256637168, + "flos": 598341421056.0, + "grad_norm": 0.05793494017899448, + "language_loss": 1.01254559, + "learning_rate": 0.0009757216201974225, + "loss": 1.02590609, + "num_input_tokens_seen": 10659168, + "router_z_loss_mlp": 2.47265625, + "step": 138, + "time_per_iteration": 2.8532111644744873 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01376264, + "balance_loss_mlp": 1.13059723, + "epoch": 0.026741054251635244, + "flos": 546136379136.0, + "grad_norm": 0.07027637242601113, + "language_loss": 1.06507492, + "learning_rate": 0.0009771514130396581, + "loss": 1.07883763, + "num_input_tokens_seen": 10731584, + "router_z_loss_mlp": 2.453125, + "step": 139, + "time_per_iteration": 2.742065668106079 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01373402, + "balance_loss_mlp": 1.12792611, + "epoch": 0.026933435936898807, + "flos": 507846525696.0, + "grad_norm": 0.06681977417406691, + "language_loss": 1.06790614, + "learning_rate": 0.00097857095638274, + "loss": 1.08164012, + "num_input_tokens_seen": 10799456, + "router_z_loss_mlp": 2.45117188, + "step": 140, + "time_per_iteration": 2.689812660217285 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01350241, + "balance_loss_mlp": 1.10533786, + "epoch": 0.02712581762216237, + "flos": 742254504192.0, + "grad_norm": 0.04346752833457442, + "language_loss": 0.97943556, + "learning_rate": 0.0009799803961288726, + "loss": 0.99293798, + "num_input_tokens_seen": 10886416, + "router_z_loss_mlp": 2.4453125, + "step": 141, + "time_per_iteration": 3.064852714538574 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01340988, + "balance_loss_mlp": 1.09684777, + "epoch": 0.027318199307425933, + "flos": 849779105280.0, + "grad_norm": 0.04419232462487818, + "language_loss": 1.04253626, + "learning_rate": 0.000981379875086876, + "loss": 1.05594611, + "num_input_tokens_seen": 10966064, + "router_z_loss_mlp": 2.4375, + "step": 142, + "time_per_iteration": 3.049978494644165 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01342845, + "balance_loss_mlp": 1.09870481, + "epoch": 0.027510580992689496, + "flos": 576638978304.0, + "grad_norm": 0.03936283820829166, + "language_loss": 0.99339008, + "learning_rate": 0.0009827695330590185, + "loss": 1.00681853, + "num_input_tokens_seen": 11039712, + "router_z_loss_mlp": 2.4375, + "step": 143, + "time_per_iteration": 2.677050828933716 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01360296, + "balance_loss_mlp": 1.11729932, + "epoch": 0.02770296267795306, + "flos": 773790414336.0, + "grad_norm": 0.036415015399305896, + "language_loss": 0.98794824, + "learning_rate": 0.0009841495069248256, + "loss": 1.00155115, + "num_input_tokens_seen": 11123984, + "router_z_loss_mlp": 2.42578125, + "step": 144, + "time_per_iteration": 2.9983932971954346 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01369915, + "balance_loss_mlp": 1.12768197, + "epoch": 0.027895344363216622, + "flos": 570449806080.0, + "grad_norm": 0.04357781303470995, + "language_loss": 0.98341697, + "learning_rate": 0.0009855199307219871, + "loss": 0.99711609, + "num_input_tokens_seen": 11192864, + "router_z_loss_mlp": 2.41796875, + "step": 145, + "time_per_iteration": 2.6622605323791504 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0136275, + "balance_loss_mlp": 1.12261522, + "epoch": 0.028087726048480186, + "flos": 548409080832.0, + "grad_norm": 0.032618269384273584, + "language_loss": 1.00131154, + "learning_rate": 0.0009868809357244854, + "loss": 1.01493907, + "num_input_tokens_seen": 11261760, + "router_z_loss_mlp": 2.39648438, + "step": 146, + "time_per_iteration": 2.7002813816070557 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01347166, + "balance_loss_mlp": 1.10836601, + "epoch": 0.02828010773374375, + "flos": 525873570816.0, + "grad_norm": 0.032542426789695725, + "language_loss": 1.04416764, + "learning_rate": 0.0009882326505180556, + "loss": 1.05763924, + "num_input_tokens_seen": 11334736, + "router_z_loss_mlp": 2.3828125, + "step": 147, + "time_per_iteration": 2.710149049758911 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01334853, + "balance_loss_mlp": 1.09815085, + "epoch": 0.02847248941900731, + "flos": 773772917760.0, + "grad_norm": 0.045451062042893155, + "language_loss": 1.02790403, + "learning_rate": 0.0009895752010730906, + "loss": 1.04125249, + "num_input_tokens_seen": 11409872, + "router_z_loss_mlp": 2.36132812, + "step": 148, + "time_per_iteration": 2.965888261795044 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01328294, + "balance_loss_mlp": 1.0936898, + "epoch": 0.028664871104270875, + "flos": 535470822912.0, + "grad_norm": 0.03549847888949514, + "language_loss": 1.08720016, + "learning_rate": 0.0009909087108150867, + "loss": 1.10048318, + "num_input_tokens_seen": 11481024, + "router_z_loss_mlp": 2.33984375, + "step": 149, + "time_per_iteration": 2.759585380554199 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01328431, + "balance_loss_mlp": 1.09649718, + "epoch": 0.028857252789534438, + "flos": 368605212672.0, + "grad_norm": 0.04584721914032896, + "language_loss": 1.09262538, + "learning_rate": 0.0009922333006927371, + "loss": 1.10590982, + "num_input_tokens_seen": 11544240, + "router_z_loss_mlp": 2.3125, + "step": 150, + "time_per_iteration": 2.5677716732025146 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0132956, + "balance_loss_mlp": 1.09896171, + "epoch": 0.029049634474798, + "flos": 516484343808.0, + "grad_norm": 0.054837011337671125, + "language_loss": 1.02855873, + "learning_rate": 0.0009935490892437632, + "loss": 1.04185438, + "num_input_tokens_seen": 11610416, + "router_z_loss_mlp": 2.29882812, + "step": 151, + "time_per_iteration": 2.5842795372009277 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01323589, + "balance_loss_mlp": 1.09623301, + "epoch": 0.029242016160061564, + "flos": 589349769216.0, + "grad_norm": 0.041624099188269474, + "language_loss": 1.01284385, + "learning_rate": 0.0009948561926585687, + "loss": 1.02607965, + "num_input_tokens_seen": 11687488, + "router_z_loss_mlp": 2.2734375, + "step": 152, + "time_per_iteration": 2.7717602252960205 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01309484, + "balance_loss_mlp": 1.08422625, + "epoch": 0.029434397845325123, + "flos": 553137231360.0, + "grad_norm": 0.04242067063834005, + "language_loss": 1.0541966, + "learning_rate": 0.0009961547248418122, + "loss": 1.0672915, + "num_input_tokens_seen": 11754576, + "router_z_loss_mlp": 2.25976562, + "step": 153, + "time_per_iteration": 2.6492583751678467 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01303287, + "balance_loss_mlp": 1.07898307, + "epoch": 0.029626779530588686, + "flos": 604608360960.0, + "grad_norm": 0.03242941124289258, + "language_loss": 1.02145946, + "learning_rate": 0.0009974447974719707, + "loss": 1.03449237, + "num_input_tokens_seen": 11831360, + "router_z_loss_mlp": 2.25, + "step": 154, + "time_per_iteration": 2.7111871242523193 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01303637, + "balance_loss_mlp": 1.08181214, + "epoch": 0.02981916121585225, + "flos": 622218388992.0, + "grad_norm": 0.03743420896054, + "language_loss": 1.03581393, + "learning_rate": 0.0009987265200589763, + "loss": 1.0488503, + "num_input_tokens_seen": 11902192, + "router_z_loss_mlp": 2.22460938, + "step": 155, + "time_per_iteration": 2.7590832710266113 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01281243, + "balance_loss_mlp": 1.06151628, + "epoch": 0.030011542901115813, + "flos": 662881065984.0, + "grad_norm": 0.03665146617631418, + "language_loss": 1.03448439, + "learning_rate": 0.001, + "loss": 1.04729688, + "num_input_tokens_seen": 11979088, + "router_z_loss_mlp": 2.203125, + "step": 156, + "time_per_iteration": 2.868732452392578 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01262046, + "balance_loss_mlp": 1.04441714, + "epoch": 0.030203924586379376, + "flos": 652819164672.0, + "grad_norm": 0.048414208125286275, + "language_loss": 1.0101347, + "learning_rate": 0.0009999999029413921, + "loss": 1.02275515, + "num_input_tokens_seen": 12059200, + "router_z_loss_mlp": 2.18164062, + "step": 157, + "time_per_iteration": 2.8458704948425293 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01249467, + "balance_loss_mlp": 1.03393674, + "epoch": 0.03039630627164294, + "flos": 532444766976.0, + "grad_norm": 0.038165698108555156, + "language_loss": 1.02398324, + "learning_rate": 0.0009999996117656068, + "loss": 1.03647804, + "num_input_tokens_seen": 12134944, + "router_z_loss_mlp": 2.16015625, + "step": 158, + "time_per_iteration": 2.7255747318267822 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01250196, + "balance_loss_mlp": 1.03657281, + "epoch": 0.030588687956906502, + "flos": 587295786240.0, + "grad_norm": 0.04636715302465643, + "language_loss": 0.95869231, + "learning_rate": 0.0009999991264727564, + "loss": 0.97119427, + "num_input_tokens_seen": 12207936, + "router_z_loss_mlp": 2.140625, + "step": 159, + "time_per_iteration": 2.7805936336517334 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0126418, + "balance_loss_mlp": 1.05284619, + "epoch": 0.030781069642170065, + "flos": 514287464448.0, + "grad_norm": 0.055354258548617474, + "language_loss": 1.07316554, + "learning_rate": 0.0009999984470630296, + "loss": 1.08580732, + "num_input_tokens_seen": 12273200, + "router_z_loss_mlp": 2.1171875, + "step": 160, + "time_per_iteration": 2.6011087894439697 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01284107, + "balance_loss_mlp": 1.07372677, + "epoch": 0.030973451327433628, + "flos": 719560546560.0, + "grad_norm": 0.03499871632601644, + "language_loss": 0.95530587, + "learning_rate": 0.0009999975735366902, + "loss": 0.96814692, + "num_input_tokens_seen": 12359600, + "router_z_loss_mlp": 2.10742188, + "step": 161, + "time_per_iteration": 3.083415985107422 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01283442, + "balance_loss_mlp": 1.07439709, + "epoch": 0.03116583301269719, + "flos": 1111615994880.0, + "grad_norm": 0.03722431710536786, + "language_loss": 0.96960843, + "learning_rate": 0.0009999965058940775, + "loss": 0.9824428, + "num_input_tokens_seen": 12443936, + "router_z_loss_mlp": 2.09375, + "step": 162, + "time_per_iteration": 3.5389657020568848 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01264072, + "balance_loss_mlp": 1.05655301, + "epoch": 0.031358214697960754, + "flos": 451833883392.0, + "grad_norm": 0.04231417263227255, + "language_loss": 1.04135799, + "learning_rate": 0.0009999952441356057, + "loss": 1.05399871, + "num_input_tokens_seen": 12507488, + "router_z_loss_mlp": 2.078125, + "step": 163, + "time_per_iteration": 2.5445146560668945 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01239952, + "balance_loss_mlp": 1.03357697, + "epoch": 0.031550596383224314, + "flos": 1257087309312.0, + "grad_norm": 0.03293922474511325, + "language_loss": 1.04807603, + "learning_rate": 0.000999993788261765, + "loss": 1.06047547, + "num_input_tokens_seen": 12594096, + "router_z_loss_mlp": 2.06640625, + "step": 164, + "time_per_iteration": 3.603273391723633 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01233685, + "balance_loss_mlp": 1.02769136, + "epoch": 0.03174297806848788, + "flos": 669323950080.0, + "grad_norm": 0.03785089383184646, + "language_loss": 1.05591631, + "learning_rate": 0.00099999213827312, + "loss": 1.06825328, + "num_input_tokens_seen": 12669424, + "router_z_loss_mlp": 2.0625, + "step": 165, + "time_per_iteration": 2.822242498397827 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01237294, + "balance_loss_mlp": 1.03206336, + "epoch": 0.03193535975375144, + "flos": 552364435200.0, + "grad_norm": 0.03413051380570177, + "language_loss": 1.00392842, + "learning_rate": 0.000999990294170312, + "loss": 1.01630139, + "num_input_tokens_seen": 12740080, + "router_z_loss_mlp": 2.0546875, + "step": 166, + "time_per_iteration": 2.6473989486694336 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0124218, + "balance_loss_mlp": 1.03790259, + "epoch": 0.032127741439015006, + "flos": 544740486144.0, + "grad_norm": 0.02951320831702663, + "language_loss": 1.04371905, + "learning_rate": 0.0009999882559540566, + "loss": 1.0561409, + "num_input_tokens_seen": 12810576, + "router_z_loss_mlp": 2.04492188, + "step": 167, + "time_per_iteration": 2.654994487762451 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01249753, + "balance_loss_mlp": 1.04661989, + "epoch": 0.032320123124278566, + "flos": 549514323456.0, + "grad_norm": 0.03217165834370848, + "language_loss": 1.01348543, + "learning_rate": 0.000999986023625145, + "loss": 1.02598298, + "num_input_tokens_seen": 12887904, + "router_z_loss_mlp": 2.03320312, + "step": 168, + "time_per_iteration": 2.759324550628662 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01736656, + "balance_loss_mlp": 1.53829193, + "epoch": 0.03251250480954213, + "flos": 1308817963776.0, + "grad_norm": 0.15145695156494207, + "language_loss": 0.78924417, + "learning_rate": 0.0009999835971844441, + "loss": 0.8066107, + "num_input_tokens_seen": 13107344, + "router_z_loss_mlp": 1.9765625, + "step": 169, + "time_per_iteration": 4.9954283237457275 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0125768, + "balance_loss_mlp": 1.05588245, + "epoch": 0.03270488649480569, + "flos": 562202760192.0, + "grad_norm": 0.04037677915440104, + "language_loss": 1.01481748, + "learning_rate": 0.0009999809766328958, + "loss": 1.02739429, + "num_input_tokens_seen": 13175552, + "router_z_loss_mlp": 2.01953125, + "step": 170, + "time_per_iteration": 2.6656970977783203 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01250876, + "balance_loss_mlp": 1.0494597, + "epoch": 0.03289726818006926, + "flos": 483339657984.0, + "grad_norm": 0.04232720535630845, + "language_loss": 1.03883123, + "learning_rate": 0.0009999781619715177, + "loss": 1.0513401, + "num_input_tokens_seen": 13242384, + "router_z_loss_mlp": 2.015625, + "step": 171, + "time_per_iteration": 2.5408902168273926 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01238141, + "balance_loss_mlp": 1.03786898, + "epoch": 0.03308964986533282, + "flos": 675821269248.0, + "grad_norm": 0.04278552863969592, + "language_loss": 1.04043615, + "learning_rate": 0.000999975153201402, + "loss": 1.05281758, + "num_input_tokens_seen": 13316160, + "router_z_loss_mlp": 2.00390625, + "step": 172, + "time_per_iteration": 2.85229754447937 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01233367, + "balance_loss_mlp": 1.03385854, + "epoch": 0.033282031550596385, + "flos": 610341632256.0, + "grad_norm": 0.04144744195910536, + "language_loss": 1.01965618, + "learning_rate": 0.0009999719503237174, + "loss": 1.03198993, + "num_input_tokens_seen": 13387664, + "router_z_loss_mlp": 1.9921875, + "step": 173, + "time_per_iteration": 2.7612979412078857 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01234993, + "balance_loss_mlp": 1.03739214, + "epoch": 0.033474413235859944, + "flos": 468996758784.0, + "grad_norm": 0.06741318195929925, + "language_loss": 1.10547054, + "learning_rate": 0.0009999685533397073, + "loss": 1.1178205, + "num_input_tokens_seen": 13454528, + "router_z_loss_mlp": 1.97265625, + "step": 174, + "time_per_iteration": 2.5750949382781982 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01246896, + "balance_loss_mlp": 1.05101097, + "epoch": 0.03366679492112351, + "flos": 580715841792.0, + "grad_norm": 0.0354258140398677, + "language_loss": 1.02665091, + "learning_rate": 0.00099996496225069, + "loss": 1.03911996, + "num_input_tokens_seen": 13522528, + "router_z_loss_mlp": 1.95605469, + "step": 175, + "time_per_iteration": 2.6886191368103027 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0124614, + "balance_loss_mlp": 1.05168545, + "epoch": 0.03385917660638707, + "flos": 638886479616.0, + "grad_norm": 0.036851717024697625, + "language_loss": 1.04551578, + "learning_rate": 0.0009999611770580604, + "loss": 1.0579772, + "num_input_tokens_seen": 13601120, + "router_z_loss_mlp": 1.94433594, + "step": 176, + "time_per_iteration": 2.8528547286987305 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01227252, + "balance_loss_mlp": 1.03422809, + "epoch": 0.03405155829165064, + "flos": 442740164352.0, + "grad_norm": 0.05003520598604069, + "language_loss": 1.03819132, + "learning_rate": 0.0009999571977632876, + "loss": 1.0504638, + "num_input_tokens_seen": 13666384, + "router_z_loss_mlp": 1.9296875, + "step": 177, + "time_per_iteration": 2.6220269203186035 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01224145, + "balance_loss_mlp": 1.03188384, + "epoch": 0.034243939976914196, + "flos": 467275222272.0, + "grad_norm": 0.0554689754659714, + "language_loss": 1.0658946, + "learning_rate": 0.0009999530243679166, + "loss": 1.07813609, + "num_input_tokens_seen": 13733968, + "router_z_loss_mlp": 1.921875, + "step": 178, + "time_per_iteration": 2.5593671798706055 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01235914, + "balance_loss_mlp": 1.04479802, + "epoch": 0.03443632166217776, + "flos": 780713498880.0, + "grad_norm": 0.03675993055709111, + "language_loss": 1.01102996, + "learning_rate": 0.0009999486568735675, + "loss": 1.02338898, + "num_input_tokens_seen": 13818960, + "router_z_loss_mlp": 1.91015625, + "step": 179, + "time_per_iteration": 3.083312749862671 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01235549, + "balance_loss_mlp": 1.04548192, + "epoch": 0.03462870334744132, + "flos": 1265760120576.0, + "grad_norm": 0.04656515886260978, + "language_loss": 1.01660061, + "learning_rate": 0.0009999440952819362, + "loss": 1.02895617, + "num_input_tokens_seen": 13912448, + "router_z_loss_mlp": 1.89941406, + "step": 180, + "time_per_iteration": 3.691354513168335 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01231777, + "balance_loss_mlp": 1.04390287, + "epoch": 0.03482108503270489, + "flos": 608303200512.0, + "grad_norm": 0.04339398829325753, + "language_loss": 1.02140999, + "learning_rate": 0.0009999393395947935, + "loss": 1.03372765, + "num_input_tokens_seen": 13990752, + "router_z_loss_mlp": 1.87695312, + "step": 181, + "time_per_iteration": 2.8826780319213867 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01222143, + "balance_loss_mlp": 1.03617644, + "epoch": 0.03501346671796845, + "flos": 539315361792.0, + "grad_norm": 0.033650569268787865, + "language_loss": 1.05363226, + "learning_rate": 0.0009999343898139858, + "loss": 1.06585371, + "num_input_tokens_seen": 14058608, + "router_z_loss_mlp": 1.85742188, + "step": 182, + "time_per_iteration": 2.6785037517547607 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01217643, + "balance_loss_mlp": 1.03329813, + "epoch": 0.035205848403232015, + "flos": 519499706112.0, + "grad_norm": 0.04889617812287003, + "language_loss": 1.03914642, + "learning_rate": 0.0009999292459414348, + "loss": 1.05132294, + "num_input_tokens_seen": 14126656, + "router_z_loss_mlp": 1.84082031, + "step": 183, + "time_per_iteration": 2.648263931274414 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01223183, + "balance_loss_mlp": 1.04103076, + "epoch": 0.035398230088495575, + "flos": 473334137088.0, + "grad_norm": 0.03546540132303448, + "language_loss": 1.08284354, + "learning_rate": 0.0009999239079791374, + "loss": 1.09507537, + "num_input_tokens_seen": 14195840, + "router_z_loss_mlp": 1.81835938, + "step": 184, + "time_per_iteration": 2.6003947257995605 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01229231, + "balance_loss_mlp": 1.04908144, + "epoch": 0.03559061177375914, + "flos": 513095705856.0, + "grad_norm": 0.03580873522044792, + "language_loss": 1.00877666, + "learning_rate": 0.0009999183759291659, + "loss": 1.02106905, + "num_input_tokens_seen": 14269936, + "router_z_loss_mlp": 1.79785156, + "step": 185, + "time_per_iteration": 2.7518959045410156 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01229953, + "balance_loss_mlp": 1.05161583, + "epoch": 0.0357829934590227, + "flos": 478350992640.0, + "grad_norm": 0.05401643684385997, + "language_loss": 1.03586912, + "learning_rate": 0.0009999126497936682, + "loss": 1.04816866, + "num_input_tokens_seen": 14334848, + "router_z_loss_mlp": 1.78710938, + "step": 186, + "time_per_iteration": 2.565373659133911 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01218003, + "balance_loss_mlp": 1.04052448, + "epoch": 0.03597537514428627, + "flos": 645885386496.0, + "grad_norm": 0.027605248849540943, + "language_loss": 1.06344712, + "learning_rate": 0.0009999067295748676, + "loss": 1.07562721, + "num_input_tokens_seen": 14407888, + "router_z_loss_mlp": 1.77832031, + "step": 187, + "time_per_iteration": 2.862023115158081 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01208675, + "balance_loss_mlp": 1.03167319, + "epoch": 0.03616775682954983, + "flos": 582270182400.0, + "grad_norm": 0.041753828035088196, + "language_loss": 1.04174721, + "learning_rate": 0.000999900615275062, + "loss": 1.05383396, + "num_input_tokens_seen": 14479072, + "router_z_loss_mlp": 1.7734375, + "step": 188, + "time_per_iteration": 2.7248780727386475 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01206757, + "balance_loss_mlp": 1.02994609, + "epoch": 0.03636013851481339, + "flos": 383265007104.0, + "grad_norm": 0.05119808239604003, + "language_loss": 1.10189009, + "learning_rate": 0.0009998943068966256, + "loss": 1.11395764, + "num_input_tokens_seen": 14540944, + "router_z_loss_mlp": 1.77148438, + "step": 189, + "time_per_iteration": 2.487445592880249 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01216253, + "balance_loss_mlp": 1.04010975, + "epoch": 0.03655252020007695, + "flos": 584308614144.0, + "grad_norm": 0.029643950017142998, + "language_loss": 1.04644084, + "learning_rate": 0.0009998878044420072, + "loss": 1.05860329, + "num_input_tokens_seen": 14611392, + "router_z_loss_mlp": 1.76464844, + "step": 190, + "time_per_iteration": 2.736809015274048 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.012186, + "balance_loss_mlp": 1.04321897, + "epoch": 0.03674490188534051, + "flos": 472598279424.0, + "grad_norm": 0.03987592529636011, + "language_loss": 1.00565469, + "learning_rate": 0.0009998811079137318, + "loss": 1.01784062, + "num_input_tokens_seen": 14679776, + "router_z_loss_mlp": 1.75683594, + "step": 191, + "time_per_iteration": 2.6006946563720703 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01214791, + "balance_loss_mlp": 1.04017353, + "epoch": 0.03693728357060408, + "flos": 529411908096.0, + "grad_norm": 0.03601320862003297, + "language_loss": 1.01597381, + "learning_rate": 0.0009998742173143987, + "loss": 1.02812171, + "num_input_tokens_seen": 14749712, + "router_z_loss_mlp": 1.74902344, + "step": 192, + "time_per_iteration": 2.6246893405914307 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01200861, + "balance_loss_mlp": 1.02719736, + "epoch": 0.03712966525586764, + "flos": 800346407424.0, + "grad_norm": 0.02962706666311765, + "language_loss": 1.0204885, + "learning_rate": 0.0009998671326466833, + "loss": 1.03249693, + "num_input_tokens_seen": 14827136, + "router_z_loss_mlp": 1.73925781, + "step": 193, + "time_per_iteration": 2.9852418899536133 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01194118, + "balance_loss_mlp": 1.02121651, + "epoch": 0.037322046941131205, + "flos": 831359342592.0, + "grad_norm": 0.049736474928026, + "language_loss": 1.0340569, + "learning_rate": 0.0009998598539133362, + "loss": 1.04599798, + "num_input_tokens_seen": 14902880, + "router_z_loss_mlp": 1.73144531, + "step": 194, + "time_per_iteration": 3.0510568618774414 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01194861, + "balance_loss_mlp": 1.02339077, + "epoch": 0.037514428626394765, + "flos": 438589423872.0, + "grad_norm": 0.030819097200883293, + "language_loss": 1.03682184, + "learning_rate": 0.0009998523811171828, + "loss": 1.04877055, + "num_input_tokens_seen": 14967264, + "router_z_loss_mlp": 1.71679688, + "step": 195, + "time_per_iteration": 2.5203936100006104 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01197718, + "balance_loss_mlp": 1.0269146, + "epoch": 0.03770681031165833, + "flos": 512639804928.0, + "grad_norm": 0.031890398221933944, + "language_loss": 1.04342675, + "learning_rate": 0.0009998447142611248, + "loss": 1.05540395, + "num_input_tokens_seen": 15039104, + "router_z_loss_mlp": 1.70996094, + "step": 196, + "time_per_iteration": 2.659193754196167 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01193737, + "balance_loss_mlp": 1.02341044, + "epoch": 0.03789919199692189, + "flos": 808843274496.0, + "grad_norm": 0.030368823498634023, + "language_loss": 0.97672093, + "learning_rate": 0.0009998368533481387, + "loss": 0.98865831, + "num_input_tokens_seen": 15124864, + "router_z_loss_mlp": 1.70507812, + "step": 197, + "time_per_iteration": 3.031437397003174 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01185957, + "balance_loss_mlp": 1.01677489, + "epoch": 0.03809157368218546, + "flos": 691792386048.0, + "grad_norm": 0.027429804092446938, + "language_loss": 1.00742936, + "learning_rate": 0.0009998287983812762, + "loss": 1.01928902, + "num_input_tokens_seen": 15199680, + "router_z_loss_mlp": 1.69335938, + "step": 198, + "time_per_iteration": 2.8533172607421875 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01186443, + "balance_loss_mlp": 1.01764262, + "epoch": 0.03828395536744902, + "flos": 519004921344.0, + "grad_norm": 0.029672573654994608, + "language_loss": 1.06761527, + "learning_rate": 0.0009998205493636646, + "loss": 1.07947969, + "num_input_tokens_seen": 15270176, + "router_z_loss_mlp": 1.68945312, + "step": 199, + "time_per_iteration": 2.6512415409088135 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01190294, + "balance_loss_mlp": 1.02197027, + "epoch": 0.038476337052712584, + "flos": 582763021824.0, + "grad_norm": 0.03300049351517658, + "language_loss": 0.99112457, + "learning_rate": 0.0009998121062985063, + "loss": 1.00302756, + "num_input_tokens_seen": 15343168, + "router_z_loss_mlp": 1.68457031, + "step": 200, + "time_per_iteration": 2.6979846954345703 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01187054, + "balance_loss_mlp": 1.01996994, + "epoch": 0.03866871873797614, + "flos": 578273998848.0, + "grad_norm": 0.03164459486115397, + "language_loss": 1.0110172, + "learning_rate": 0.0009998034691890794, + "loss": 1.02288771, + "num_input_tokens_seen": 15417328, + "router_z_loss_mlp": 1.671875, + "step": 201, + "time_per_iteration": 2.80670166015625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01183327, + "balance_loss_mlp": 1.01672018, + "epoch": 0.03886110042323971, + "flos": 541772755968.0, + "grad_norm": 0.032663388617215364, + "language_loss": 1.05587053, + "learning_rate": 0.0009997946380387369, + "loss": 1.06770372, + "num_input_tokens_seen": 15489488, + "router_z_loss_mlp": 1.66699219, + "step": 202, + "time_per_iteration": 2.6591310501098633 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01179406, + "balance_loss_mlp": 1.01394379, + "epoch": 0.03905348210850327, + "flos": 719240739072.0, + "grad_norm": 0.030305493428663434, + "language_loss": 1.08528447, + "learning_rate": 0.0009997856128509076, + "loss": 1.09707844, + "num_input_tokens_seen": 15558944, + "router_z_loss_mlp": 1.65527344, + "step": 203, + "time_per_iteration": 2.9006340503692627 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01181527, + "balance_loss_mlp": 1.01720893, + "epoch": 0.039245863793766836, + "flos": 428397265152.0, + "grad_norm": 0.03189317300504765, + "language_loss": 1.03375864, + "learning_rate": 0.0009997763936290952, + "loss": 1.04557395, + "num_input_tokens_seen": 15625024, + "router_z_loss_mlp": 1.64355469, + "step": 204, + "time_per_iteration": 2.5836358070373535 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01178747, + "balance_loss_mlp": 1.01538289, + "epoch": 0.039438245479030395, + "flos": 664270156032.0, + "grad_norm": 0.033629424624266296, + "language_loss": 1.0866276, + "learning_rate": 0.0009997669803768789, + "loss": 1.09841514, + "num_input_tokens_seen": 15697120, + "router_z_loss_mlp": 1.63378906, + "step": 205, + "time_per_iteration": 2.7809464931488037 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01180514, + "balance_loss_mlp": 1.01791251, + "epoch": 0.03963062716429396, + "flos": 636496159488.0, + "grad_norm": 0.025840840316256445, + "language_loss": 1.03755617, + "learning_rate": 0.0009997573730979134, + "loss": 1.04936123, + "num_input_tokens_seen": 15768752, + "router_z_loss_mlp": 1.62597656, + "step": 206, + "time_per_iteration": 2.7759904861450195 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01207138, + "balance_loss_mlp": 1.04272461, + "epoch": 0.03982300884955752, + "flos": 1421589799680.0, + "grad_norm": 0.03078548913711826, + "language_loss": 0.79193199, + "learning_rate": 0.0009997475717959284, + "loss": 0.80400336, + "num_input_tokens_seen": 15980624, + "router_z_loss_mlp": 1.64453125, + "step": 207, + "time_per_iteration": 4.6622114181518555 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01177297, + "balance_loss_mlp": 1.0162214, + "epoch": 0.04001539053482109, + "flos": 690520914432.0, + "grad_norm": 0.03233621027438014, + "language_loss": 1.02104092, + "learning_rate": 0.0009997375764747294, + "loss": 1.03281379, + "num_input_tokens_seen": 16067232, + "router_z_loss_mlp": 1.61035156, + "step": 208, + "time_per_iteration": 2.9808952808380127 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01181785, + "balance_loss_mlp": 1.02156758, + "epoch": 0.04020777222008465, + "flos": 534752461824.0, + "grad_norm": 0.037334696417832054, + "language_loss": 0.99876916, + "learning_rate": 0.0009997273871381967, + "loss": 1.01058698, + "num_input_tokens_seen": 16139808, + "router_z_loss_mlp": 1.6015625, + "step": 209, + "time_per_iteration": 2.6938650608062744 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01183132, + "balance_loss_mlp": 1.02396429, + "epoch": 0.040400153905348214, + "flos": 568997532672.0, + "grad_norm": 0.03228633343407045, + "language_loss": 1.04497194, + "learning_rate": 0.0009997170037902862, + "loss": 1.05680323, + "num_input_tokens_seen": 16210848, + "router_z_loss_mlp": 1.59082031, + "step": 210, + "time_per_iteration": 2.722900629043579 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01189763, + "balance_loss_mlp": 1.03145349, + "epoch": 0.040592535590611774, + "flos": 714679784448.0, + "grad_norm": 0.026587079094436805, + "language_loss": 1.0723207, + "learning_rate": 0.0009997064264350292, + "loss": 1.08421838, + "num_input_tokens_seen": 16283984, + "router_z_loss_mlp": 1.58203125, + "step": 211, + "time_per_iteration": 2.8636813163757324 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01186902, + "balance_loss_mlp": 1.02954614, + "epoch": 0.04078491727587533, + "flos": 579207187968.0, + "grad_norm": 0.028855359605628288, + "language_loss": 1.01311755, + "learning_rate": 0.0009996956550765317, + "loss": 1.02498662, + "num_input_tokens_seen": 16353904, + "router_z_loss_mlp": 1.57226562, + "step": 212, + "time_per_iteration": 2.6752002239227295 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01183355, + "balance_loss_mlp": 1.0270474, + "epoch": 0.0409772989611389, + "flos": 553369555968.0, + "grad_norm": 0.03615073574048419, + "language_loss": 0.96463609, + "learning_rate": 0.0009996846897189762, + "loss": 0.97646964, + "num_input_tokens_seen": 16425488, + "router_z_loss_mlp": 1.56152344, + "step": 213, + "time_per_iteration": 2.618417501449585 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01180441, + "balance_loss_mlp": 1.02470577, + "epoch": 0.04116968064640246, + "flos": 556764996864.0, + "grad_norm": 0.04473264124517712, + "language_loss": 1.02233624, + "learning_rate": 0.0009996735303666193, + "loss": 1.03414059, + "num_input_tokens_seen": 16498016, + "router_z_loss_mlp": 1.55566406, + "step": 214, + "time_per_iteration": 2.7398550510406494 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0118203, + "balance_loss_mlp": 1.026963, + "epoch": 0.041362062331666026, + "flos": 579652395264.0, + "grad_norm": 0.027182691243245845, + "language_loss": 1.04435229, + "learning_rate": 0.0009996621770237937, + "loss": 1.05617261, + "num_input_tokens_seen": 16573744, + "router_z_loss_mlp": 1.54882812, + "step": 215, + "time_per_iteration": 2.7773804664611816 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01182535, + "balance_loss_mlp": 1.02775347, + "epoch": 0.041554444016929586, + "flos": 612701816832.0, + "grad_norm": 0.028683660550217302, + "language_loss": 1.00582075, + "learning_rate": 0.0009996506296949073, + "loss": 1.01764607, + "num_input_tokens_seen": 16655344, + "router_z_loss_mlp": 1.54589844, + "step": 216, + "time_per_iteration": 2.877587080001831 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01180569, + "balance_loss_mlp": 1.02607429, + "epoch": 0.04174682570219315, + "flos": 529151393280.0, + "grad_norm": 0.031901868987761664, + "language_loss": 1.00452459, + "learning_rate": 0.0009996388883844428, + "loss": 1.01633024, + "num_input_tokens_seen": 16726480, + "router_z_loss_mlp": 1.54296875, + "step": 217, + "time_per_iteration": 2.6346311569213867 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01173664, + "balance_loss_mlp": 1.02002692, + "epoch": 0.04193920738745671, + "flos": 512500799232.0, + "grad_norm": 0.02715845750356807, + "language_loss": 1.03465486, + "learning_rate": 0.0009996269530969588, + "loss": 1.04639161, + "num_input_tokens_seen": 16792112, + "router_z_loss_mlp": 1.53417969, + "step": 218, + "time_per_iteration": 2.6205921173095703 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01170474, + "balance_loss_mlp": 1.0176959, + "epoch": 0.04213158907272028, + "flos": 572553366528.0, + "grad_norm": 0.03606301207395498, + "language_loss": 1.04169452, + "learning_rate": 0.0009996148238370888, + "loss": 1.05339921, + "num_input_tokens_seen": 16862960, + "router_z_loss_mlp": 1.52539062, + "step": 219, + "time_per_iteration": 2.8047173023223877 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01169557, + "balance_loss_mlp": 1.01725543, + "epoch": 0.04232397075798384, + "flos": 965905552896.0, + "grad_norm": 0.026524392964530758, + "language_loss": 0.99111861, + "learning_rate": 0.0009996025006095421, + "loss": 1.00281417, + "num_input_tokens_seen": 16950416, + "router_z_loss_mlp": 1.52050781, + "step": 220, + "time_per_iteration": 3.315859317779541 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147995, + "balance_loss_mlp": 0.99693298, + "epoch": 0.042516352443247404, + "flos": 1472733340416.0, + "grad_norm": 0.01509407607306266, + "language_loss": 0.77783144, + "learning_rate": 0.0009995899834191028, + "loss": 0.78931135, + "num_input_tokens_seen": 17180944, + "router_z_loss_mlp": 1.5078125, + "step": 221, + "time_per_iteration": 5.540910243988037 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01166927, + "balance_loss_mlp": 1.0164367, + "epoch": 0.042708734128510964, + "flos": 655892852736.0, + "grad_norm": 0.029367950869880366, + "language_loss": 0.99126619, + "learning_rate": 0.0009995772722706307, + "loss": 1.00293541, + "num_input_tokens_seen": 17257792, + "router_z_loss_mlp": 1.50195312, + "step": 222, + "time_per_iteration": 2.901489019393921 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01167445, + "balance_loss_mlp": 1.01705015, + "epoch": 0.04290111581377453, + "flos": 432734643456.0, + "grad_norm": 0.04040999725558835, + "language_loss": 1.13508129, + "learning_rate": 0.0009995643671690604, + "loss": 1.1467557, + "num_input_tokens_seen": 17320288, + "router_z_loss_mlp": 1.50097656, + "step": 223, + "time_per_iteration": 2.5576720237731934 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01168153, + "balance_loss_mlp": 1.01823533, + "epoch": 0.04309349749903809, + "flos": 645867889920.0, + "grad_norm": 0.02824445481068148, + "language_loss": 1.00763512, + "learning_rate": 0.0009995512681194023, + "loss": 1.01931667, + "num_input_tokens_seen": 17396672, + "router_z_loss_mlp": 1.49609375, + "step": 224, + "time_per_iteration": 2.9571568965911865 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01167559, + "balance_loss_mlp": 1.01840472, + "epoch": 0.04328587918430166, + "flos": 832897153536.0, + "grad_norm": 0.025764365733734692, + "language_loss": 0.98235118, + "learning_rate": 0.0009995379751267417, + "loss": 0.99402678, + "num_input_tokens_seen": 17488096, + "router_z_loss_mlp": 1.48828125, + "step": 225, + "time_per_iteration": 3.2627484798431396 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01166832, + "balance_loss_mlp": 1.01824963, + "epoch": 0.043478260869565216, + "flos": 526116589056.0, + "grad_norm": 0.03531387708455554, + "language_loss": 1.00006318, + "learning_rate": 0.0009995244881962398, + "loss": 1.01173151, + "num_input_tokens_seen": 17557632, + "router_z_loss_mlp": 1.48242188, + "step": 226, + "time_per_iteration": 2.624209403991699 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01170136, + "balance_loss_mlp": 1.02212548, + "epoch": 0.04367064255482878, + "flos": 440413027584.0, + "grad_norm": 0.039279482080902435, + "language_loss": 1.01293874, + "learning_rate": 0.0009995108073331323, + "loss": 1.02464008, + "num_input_tokens_seen": 17626672, + "router_z_loss_mlp": 1.4765625, + "step": 227, + "time_per_iteration": 2.6042520999908447 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01164096, + "balance_loss_mlp": 1.01742136, + "epoch": 0.04386302424009234, + "flos": 508467677184.0, + "grad_norm": 0.03801127181345805, + "language_loss": 1.03535032, + "learning_rate": 0.0009994969325427309, + "loss": 1.04699123, + "num_input_tokens_seen": 17698624, + "router_z_loss_mlp": 1.46582031, + "step": 228, + "time_per_iteration": 2.6691603660583496 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01163607, + "balance_loss_mlp": 1.01769507, + "epoch": 0.04405540592535591, + "flos": 541744565760.0, + "grad_norm": 0.03512041362752814, + "language_loss": 1.00143218, + "learning_rate": 0.0009994828638304218, + "loss": 1.0130682, + "num_input_tokens_seen": 17767760, + "router_z_loss_mlp": 1.46191406, + "step": 229, + "time_per_iteration": 2.627833366394043 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01164617, + "balance_loss_mlp": 1.01927722, + "epoch": 0.04424778761061947, + "flos": 447309867264.0, + "grad_norm": 0.03576658395893793, + "language_loss": 1.06260157, + "learning_rate": 0.0009994686012016675, + "loss": 1.07424784, + "num_input_tokens_seen": 17833664, + "router_z_loss_mlp": 1.45703125, + "step": 230, + "time_per_iteration": 2.515491247177124 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01159156, + "balance_loss_mlp": 1.01448417, + "epoch": 0.044440169295883035, + "flos": 701982599424.0, + "grad_norm": 0.03592315304636455, + "language_loss": 1.05298328, + "learning_rate": 0.000999454144662005, + "loss": 1.06457496, + "num_input_tokens_seen": 17908880, + "router_z_loss_mlp": 1.45019531, + "step": 231, + "time_per_iteration": 2.918896436691284 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01156937, + "balance_loss_mlp": 1.01274192, + "epoch": 0.044632550981146595, + "flos": 589427536896.0, + "grad_norm": 0.032106980286660924, + "language_loss": 0.996499, + "learning_rate": 0.0009994394942170468, + "loss": 1.00806844, + "num_input_tokens_seen": 17978208, + "router_z_loss_mlp": 1.4453125, + "step": 232, + "time_per_iteration": 2.700378179550171 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01169343, + "balance_loss_mlp": 1.02524316, + "epoch": 0.04482493266641016, + "flos": 555855140352.0, + "grad_norm": 0.03061962333593277, + "language_loss": 0.97402102, + "learning_rate": 0.0009994246498724808, + "loss": 0.9857145, + "num_input_tokens_seen": 18049296, + "router_z_loss_mlp": 1.44433594, + "step": 233, + "time_per_iteration": 2.692657232284546 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01171534, + "balance_loss_mlp": 1.02848291, + "epoch": 0.04501731435167372, + "flos": 724070956800.0, + "grad_norm": 0.03598428268947968, + "language_loss": 1.00358808, + "learning_rate": 0.00099940961163407, + "loss": 1.01530337, + "num_input_tokens_seen": 18123296, + "router_z_loss_mlp": 1.43359375, + "step": 234, + "time_per_iteration": 2.8496198654174805 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01167121, + "balance_loss_mlp": 1.02473748, + "epoch": 0.04520969603693728, + "flos": 512798252544.0, + "grad_norm": 0.03236637347420306, + "language_loss": 1.0231185, + "learning_rate": 0.0009993943795076528, + "loss": 1.03478956, + "num_input_tokens_seen": 18192784, + "router_z_loss_mlp": 1.42675781, + "step": 235, + "time_per_iteration": 2.6304001808166504 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01157951, + "balance_loss_mlp": 1.01623452, + "epoch": 0.04540207772220085, + "flos": 365878555392.0, + "grad_norm": 0.04557463461025321, + "language_loss": 1.04854226, + "learning_rate": 0.0009993789534991427, + "loss": 1.06012177, + "num_input_tokens_seen": 18254064, + "router_z_loss_mlp": 1.41992188, + "step": 236, + "time_per_iteration": 2.500347852706909 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01156422, + "balance_loss_mlp": 1.01613641, + "epoch": 0.045594459407464406, + "flos": 523724323584.0, + "grad_norm": 0.028810086143122388, + "language_loss": 0.99360317, + "learning_rate": 0.0009993633336145287, + "loss": 1.00516737, + "num_input_tokens_seen": 18325728, + "router_z_loss_mlp": 1.40527344, + "step": 237, + "time_per_iteration": 2.6991968154907227 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01156358, + "balance_loss_mlp": 1.01664495, + "epoch": 0.04578684109272797, + "flos": 673116966144.0, + "grad_norm": 0.036851747197037266, + "language_loss": 1.03695393, + "learning_rate": 0.0009993475198598752, + "loss": 1.04851758, + "num_input_tokens_seen": 18408608, + "router_z_loss_mlp": 1.39941406, + "step": 238, + "time_per_iteration": 3.0150160789489746 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01160083, + "balance_loss_mlp": 1.02084696, + "epoch": 0.04597922277799153, + "flos": 542621374464.0, + "grad_norm": 0.03967898438127139, + "language_loss": 1.00323462, + "learning_rate": 0.0009993315122413212, + "loss": 1.01483548, + "num_input_tokens_seen": 18471920, + "router_z_loss_mlp": 1.39453125, + "step": 239, + "time_per_iteration": 2.6226179599761963 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0115528, + "balance_loss_mlp": 1.01690221, + "epoch": 0.0461716044632551, + "flos": 459994413312.0, + "grad_norm": 0.029756199222484733, + "language_loss": 1.00536144, + "learning_rate": 0.0009993153107650818, + "loss": 1.01691425, + "num_input_tokens_seen": 18540496, + "router_z_loss_mlp": 1.38574219, + "step": 240, + "time_per_iteration": 2.635673999786377 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01154071, + "balance_loss_mlp": 1.01607406, + "epoch": 0.04636398614851866, + "flos": 456171261696.0, + "grad_norm": 0.03103837756937707, + "language_loss": 0.99882519, + "learning_rate": 0.0009992989154374468, + "loss": 1.01036584, + "num_input_tokens_seen": 18606944, + "router_z_loss_mlp": 1.38183594, + "step": 241, + "time_per_iteration": 2.5449135303497314 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0115588, + "balance_loss_mlp": 1.01836014, + "epoch": 0.046556367833782225, + "flos": 557902320384.0, + "grad_norm": 0.06487144756994469, + "language_loss": 1.0686537, + "learning_rate": 0.0009992823262647817, + "loss": 1.08021247, + "num_input_tokens_seen": 18679520, + "router_z_loss_mlp": 1.37695312, + "step": 242, + "time_per_iteration": 2.705120325088501 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011561, + "balance_loss_mlp": 1.01905739, + "epoch": 0.046748749519045785, + "flos": 594088613376.0, + "grad_norm": 0.03633512017688626, + "language_loss": 1.00915635, + "learning_rate": 0.0009992655432535264, + "loss": 1.02071738, + "num_input_tokens_seen": 18756656, + "router_z_loss_mlp": 1.37207031, + "step": 243, + "time_per_iteration": 2.8158721923828125 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01160044, + "balance_loss_mlp": 1.02347767, + "epoch": 0.04694113120430935, + "flos": 570942645504.0, + "grad_norm": 0.036353271768507285, + "language_loss": 1.01172018, + "learning_rate": 0.0009992485664101973, + "loss": 1.02332067, + "num_input_tokens_seen": 18829792, + "router_z_loss_mlp": 1.3671875, + "step": 244, + "time_per_iteration": 2.723409414291382 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01156505, + "balance_loss_mlp": 1.0207969, + "epoch": 0.04713351288957291, + "flos": 865246689024.0, + "grad_norm": 0.05316255083066814, + "language_loss": 1.03417325, + "learning_rate": 0.000999231395741385, + "loss": 1.04573822, + "num_input_tokens_seen": 18906864, + "router_z_loss_mlp": 1.35839844, + "step": 245, + "time_per_iteration": 3.1441562175750732 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01155254, + "balance_loss_mlp": 1.02011812, + "epoch": 0.04732589457483648, + "flos": 538236364032.0, + "grad_norm": 0.039550829703112036, + "language_loss": 1.01375949, + "learning_rate": 0.0009992140312537557, + "loss": 1.02531195, + "num_input_tokens_seen": 18973632, + "router_z_loss_mlp": 1.35253906, + "step": 246, + "time_per_iteration": 2.6407320499420166 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01158298, + "balance_loss_mlp": 1.02402055, + "epoch": 0.04751827626010004, + "flos": 763272612096.0, + "grad_norm": 0.029332271702031103, + "language_loss": 0.96132767, + "learning_rate": 0.000999196472954051, + "loss": 0.97291064, + "num_input_tokens_seen": 19052944, + "router_z_loss_mlp": 1.34375, + "step": 247, + "time_per_iteration": 2.9791386127471924 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0115741, + "balance_loss_mlp": 1.02313232, + "epoch": 0.0477106579453636, + "flos": 1583128462080.0, + "grad_norm": 0.019406803026512872, + "language_loss": 0.79424852, + "learning_rate": 0.0009991787208490878, + "loss": 0.80582267, + "num_input_tokens_seen": 19286288, + "router_z_loss_mlp": 1.34375, + "step": 248, + "time_per_iteration": 5.547277927398682 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0115733, + "balance_loss_mlp": 1.02457833, + "epoch": 0.04790303963062716, + "flos": 458693784576.0, + "grad_norm": 0.04949407998464004, + "language_loss": 1.04053593, + "learning_rate": 0.0009991607749457578, + "loss": 1.05210924, + "num_input_tokens_seen": 19349296, + "router_z_loss_mlp": 1.328125, + "step": 249, + "time_per_iteration": 2.610372304916382 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01158188, + "balance_loss_mlp": 1.02629459, + "epoch": 0.04809542131589073, + "flos": 783787186944.0, + "grad_norm": 0.03428496832179458, + "language_loss": 1.01565814, + "learning_rate": 0.0009991426352510286, + "loss": 1.02723992, + "num_input_tokens_seen": 19428416, + "router_z_loss_mlp": 1.31933594, + "step": 250, + "time_per_iteration": 2.9723451137542725 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01158477, + "balance_loss_mlp": 1.0272516, + "epoch": 0.04828780300115429, + "flos": 560322776064.0, + "grad_norm": 0.03370153589925739, + "language_loss": 1.02967048, + "learning_rate": 0.0009991243017719422, + "loss": 1.04125512, + "num_input_tokens_seen": 19498688, + "router_z_loss_mlp": 1.3125, + "step": 251, + "time_per_iteration": 2.691317319869995 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0115263, + "balance_loss_mlp": 1.02149975, + "epoch": 0.048480184686417856, + "flos": 502922989056.0, + "grad_norm": 0.033537523086657674, + "language_loss": 0.98110956, + "learning_rate": 0.0009991057745156165, + "loss": 0.99263585, + "num_input_tokens_seen": 19567568, + "router_z_loss_mlp": 1.31152344, + "step": 252, + "time_per_iteration": 2.615726947784424 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01126877, + "balance_loss_mlp": 0.99641418, + "epoch": 0.048672566371681415, + "flos": 1539471810048.0, + "grad_norm": 0.00943295316075806, + "language_loss": 0.81910986, + "learning_rate": 0.0009990870534892446, + "loss": 0.83037865, + "num_input_tokens_seen": 19796368, + "router_z_loss_mlp": 1.3046875, + "step": 253, + "time_per_iteration": 5.119662523269653 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01155145, + "balance_loss_mlp": 1.02439594, + "epoch": 0.04886494805694498, + "flos": 538952779776.0, + "grad_norm": 0.04101934284448647, + "language_loss": 1.06555986, + "learning_rate": 0.0009990681387000943, + "loss": 1.07711136, + "num_input_tokens_seen": 19870480, + "router_z_loss_mlp": 1.30761719, + "step": 254, + "time_per_iteration": 2.7494144439697266 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153346, + "balance_loss_mlp": 1.02316916, + "epoch": 0.04905732974220854, + "flos": 681485521152.0, + "grad_norm": 0.029284228955777224, + "language_loss": 1.01195645, + "learning_rate": 0.0009990490301555093, + "loss": 1.02348995, + "num_input_tokens_seen": 19956288, + "router_z_loss_mlp": 1.30175781, + "step": 255, + "time_per_iteration": 2.9595844745635986 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113356, + "balance_loss_mlp": 1.00462341, + "epoch": 0.04924971142747211, + "flos": 1424277573120.0, + "grad_norm": 0.011666997955433429, + "language_loss": 0.79215157, + "learning_rate": 0.0009990297278629078, + "loss": 0.80348712, + "num_input_tokens_seen": 20180080, + "router_z_loss_mlp": 1.2890625, + "step": 256, + "time_per_iteration": 4.918023347854614 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01126785, + "balance_loss_mlp": 0.99822998, + "epoch": 0.04944209311273567, + "flos": 1561239381504.0, + "grad_norm": 0.006197531934497474, + "language_loss": 0.79242742, + "learning_rate": 0.000999010231829784, + "loss": 0.80369532, + "num_input_tokens_seen": 20413456, + "router_z_loss_mlp": 1.28515625, + "step": 257, + "time_per_iteration": 4.996341228485107 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01127556, + "balance_loss_mlp": 0.99976349, + "epoch": 0.04963447479799923, + "flos": 1574173748736.0, + "grad_norm": 0.01126324229515774, + "language_loss": 0.69975883, + "learning_rate": 0.0009989905420637066, + "loss": 0.71103442, + "num_input_tokens_seen": 20644736, + "router_z_loss_mlp": 1.27734375, + "step": 258, + "time_per_iteration": 4.951507329940796 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01167552, + "balance_loss_mlp": 1.03966403, + "epoch": 0.049826856483262794, + "flos": 626499386880.0, + "grad_norm": 0.07394024090910019, + "language_loss": 0.96613419, + "learning_rate": 0.0009989706585723202, + "loss": 0.97780967, + "num_input_tokens_seen": 20719040, + "router_z_loss_mlp": 1.27832031, + "step": 259, + "time_per_iteration": 2.819796085357666 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01158437, + "balance_loss_mlp": 1.03073978, + "epoch": 0.05001923816852635, + "flos": 505156806912.0, + "grad_norm": 0.042054435700702504, + "language_loss": 1.02184892, + "learning_rate": 0.0009989505813633442, + "loss": 1.0334332, + "num_input_tokens_seen": 20789376, + "router_z_loss_mlp": 1.27636719, + "step": 260, + "time_per_iteration": 2.671597719192505 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149384, + "balance_loss_mlp": 1.02206886, + "epoch": 0.05021161985378992, + "flos": 588468102912.0, + "grad_norm": 0.05343186989039486, + "language_loss": 1.02308297, + "learning_rate": 0.000998930310444573, + "loss": 1.03457689, + "num_input_tokens_seen": 20857856, + "router_z_loss_mlp": 1.27246094, + "step": 261, + "time_per_iteration": 2.7573728561401367 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145576, + "balance_loss_mlp": 1.01883233, + "epoch": 0.05040400153905348, + "flos": 634403292672.0, + "grad_norm": 0.052960623500171895, + "language_loss": 1.00806391, + "learning_rate": 0.0009989098458238765, + "loss": 1.01951981, + "num_input_tokens_seen": 20931232, + "router_z_loss_mlp": 1.26660156, + "step": 262, + "time_per_iteration": 2.7937912940979004 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146699, + "balance_loss_mlp": 1.02033675, + "epoch": 0.050596383224317046, + "flos": 554809190400.0, + "grad_norm": 0.04531187332347281, + "language_loss": 0.99888676, + "learning_rate": 0.0009988891875091998, + "loss": 1.0103538, + "num_input_tokens_seen": 21012672, + "router_z_loss_mlp": 1.26269531, + "step": 263, + "time_per_iteration": 2.811218500137329 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145189, + "balance_loss_mlp": 1.01939976, + "epoch": 0.050788764909580605, + "flos": 550762462464.0, + "grad_norm": 0.03965392167411722, + "language_loss": 0.94696999, + "learning_rate": 0.0009988683355085636, + "loss": 0.95842183, + "num_input_tokens_seen": 21088592, + "router_z_loss_mlp": 1.25683594, + "step": 264, + "time_per_iteration": 2.7378242015838623 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141586, + "balance_loss_mlp": 1.01617777, + "epoch": 0.05098114659484417, + "flos": 606345448704.0, + "grad_norm": 0.024717188615823983, + "language_loss": 1.02827787, + "learning_rate": 0.000998847289830063, + "loss": 1.03969371, + "num_input_tokens_seen": 21169840, + "router_z_loss_mlp": 1.25292969, + "step": 265, + "time_per_iteration": 2.8625917434692383 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142152, + "balance_loss_mlp": 1.01693416, + "epoch": 0.05117352828010773, + "flos": 439473035520.0, + "grad_norm": 0.036783183293041616, + "language_loss": 0.96527213, + "learning_rate": 0.0009988260504818682, + "loss": 0.97669363, + "num_input_tokens_seen": 21236144, + "router_z_loss_mlp": 1.25097656, + "step": 266, + "time_per_iteration": 2.5658230781555176 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138539, + "balance_loss_mlp": 1.0135119, + "epoch": 0.0513659099653713, + "flos": 506031670272.0, + "grad_norm": 0.04116504124695153, + "language_loss": 1.03285778, + "learning_rate": 0.000998804617472226, + "loss": 1.0442431, + "num_input_tokens_seen": 21304864, + "router_z_loss_mlp": 1.24902344, + "step": 267, + "time_per_iteration": 2.63395094871521 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138256, + "balance_loss_mlp": 1.01418352, + "epoch": 0.05155829165063486, + "flos": 696715922688.0, + "grad_norm": 0.034853618125567455, + "language_loss": 0.98327756, + "learning_rate": 0.0009987829908094568, + "loss": 0.9946602, + "num_input_tokens_seen": 21377504, + "router_z_loss_mlp": 1.23925781, + "step": 268, + "time_per_iteration": 2.8239262104034424 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136912, + "balance_loss_mlp": 1.01331627, + "epoch": 0.051750673335898424, + "flos": 1350302059008.0, + "grad_norm": 0.042488112993129025, + "language_loss": 1.04893267, + "learning_rate": 0.0009987611705019569, + "loss": 1.0603019, + "num_input_tokens_seen": 21463840, + "router_z_loss_mlp": 1.234375, + "step": 269, + "time_per_iteration": 4.33854079246521 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137181, + "balance_loss_mlp": 1.01387095, + "epoch": 0.051943055021161984, + "flos": 490590331392.0, + "grad_norm": 0.037116049987967636, + "language_loss": 1.03026497, + "learning_rate": 0.0009987391565581978, + "loss": 1.04163671, + "num_input_tokens_seen": 21531184, + "router_z_loss_mlp": 1.23144531, + "step": 270, + "time_per_iteration": 2.609722852706909 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136969, + "balance_loss_mlp": 1.01365864, + "epoch": 0.05213543670642555, + "flos": 546880985088.0, + "grad_norm": 0.03927026934880779, + "language_loss": 0.95517516, + "learning_rate": 0.000998716948986726, + "loss": 0.96654487, + "num_input_tokens_seen": 21612224, + "router_z_loss_mlp": 1.23144531, + "step": 271, + "time_per_iteration": 2.797673225402832 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137765, + "balance_loss_mlp": 1.01512277, + "epoch": 0.05232781839168911, + "flos": 604673489664.0, + "grad_norm": 0.04118655717732696, + "language_loss": 0.97937191, + "learning_rate": 0.0009986945477961633, + "loss": 0.9907496, + "num_input_tokens_seen": 21681024, + "router_z_loss_mlp": 1.22460938, + "step": 272, + "time_per_iteration": 2.6988775730133057 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135438, + "balance_loss_mlp": 1.01336777, + "epoch": 0.052520200076952676, + "flos": 539656556544.0, + "grad_norm": 0.027940819886650203, + "language_loss": 1.02222085, + "learning_rate": 0.0009986719529952066, + "loss": 1.0335753, + "num_input_tokens_seen": 21761616, + "router_z_loss_mlp": 1.21875, + "step": 273, + "time_per_iteration": 2.9503016471862793 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01133251, + "balance_loss_mlp": 1.01175284, + "epoch": 0.052712581762216236, + "flos": 464333736960.0, + "grad_norm": 0.036678205813438995, + "language_loss": 1.02377117, + "learning_rate": 0.000998649164592628, + "loss": 1.0351038, + "num_input_tokens_seen": 21828416, + "router_z_loss_mlp": 1.21289062, + "step": 274, + "time_per_iteration": 2.575183868408203 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134193, + "balance_loss_mlp": 1.01279056, + "epoch": 0.0529049634474798, + "flos": 549106054656.0, + "grad_norm": 0.029580362230619023, + "language_loss": 1.00386071, + "learning_rate": 0.0009986261825972748, + "loss": 1.01520276, + "num_input_tokens_seen": 21901600, + "router_z_loss_mlp": 1.21191406, + "step": 275, + "time_per_iteration": 2.781388521194458 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136428, + "balance_loss_mlp": 1.01578796, + "epoch": 0.05309734513274336, + "flos": 619201081344.0, + "grad_norm": 0.028327187192750843, + "language_loss": 1.01742268, + "learning_rate": 0.000998603007018069, + "loss": 1.0287869, + "num_input_tokens_seen": 21979312, + "router_z_loss_mlp": 1.20410156, + "step": 276, + "time_per_iteration": 2.8231008052825928 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137197, + "balance_loss_mlp": 1.01665294, + "epoch": 0.05328972681800693, + "flos": 606618602496.0, + "grad_norm": 0.02408735734832513, + "language_loss": 1.00149679, + "learning_rate": 0.0009985796378640089, + "loss": 1.01286888, + "num_input_tokens_seen": 22053776, + "router_z_loss_mlp": 1.203125, + "step": 277, + "time_per_iteration": 2.721719264984131 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136169, + "balance_loss_mlp": 1.01610124, + "epoch": 0.05348210850327049, + "flos": 605731100160.0, + "grad_norm": 0.0319931943489141, + "language_loss": 0.99697894, + "learning_rate": 0.0009985560751441665, + "loss": 1.0083406, + "num_input_tokens_seen": 22134304, + "router_z_loss_mlp": 1.19824219, + "step": 278, + "time_per_iteration": 2.835160255432129 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01133809, + "balance_loss_mlp": 1.01412332, + "epoch": 0.053674490188534055, + "flos": 631998388224.0, + "grad_norm": 0.030840524384760076, + "language_loss": 1.0228467, + "learning_rate": 0.00099853231886769, + "loss": 1.03418469, + "num_input_tokens_seen": 22212896, + "router_z_loss_mlp": 1.19433594, + "step": 279, + "time_per_iteration": 2.8541102409362793 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01131641, + "balance_loss_mlp": 1.01243138, + "epoch": 0.053866871873797614, + "flos": 480174596352.0, + "grad_norm": 0.030057370429500904, + "language_loss": 1.01521945, + "learning_rate": 0.0009985083690438024, + "loss": 1.02653599, + "num_input_tokens_seen": 22287216, + "router_z_loss_mlp": 1.18945312, + "step": 280, + "time_per_iteration": 2.778996706008911 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01133594, + "balance_loss_mlp": 1.01514757, + "epoch": 0.054059253559061174, + "flos": 789490322688.0, + "grad_norm": 0.030570218765999514, + "language_loss": 0.92515564, + "learning_rate": 0.0009984842256818016, + "loss": 0.93649161, + "num_input_tokens_seen": 22370864, + "router_z_loss_mlp": 1.18164062, + "step": 281, + "time_per_iteration": 3.113694429397583 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137735, + "balance_loss_mlp": 1.01928854, + "epoch": 0.05425163524432474, + "flos": 629506000896.0, + "grad_norm": 0.043548376252248826, + "language_loss": 1.03102541, + "learning_rate": 0.0009984598887910613, + "loss": 1.04240274, + "num_input_tokens_seen": 22440080, + "router_z_loss_mlp": 1.18164062, + "step": 282, + "time_per_iteration": 2.8303444385528564 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01132534, + "balance_loss_mlp": 1.01504183, + "epoch": 0.0544440169295883, + "flos": 616993508352.0, + "grad_norm": 0.05077708884656826, + "language_loss": 0.98823464, + "learning_rate": 0.0009984353583810297, + "loss": 0.99956, + "num_input_tokens_seen": 22517936, + "router_z_loss_mlp": 1.171875, + "step": 283, + "time_per_iteration": 2.835850954055786 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01129981, + "balance_loss_mlp": 1.01315546, + "epoch": 0.05463639861485187, + "flos": 648930884352.0, + "grad_norm": 0.03524270200319673, + "language_loss": 1.0117259, + "learning_rate": 0.0009984106344612302, + "loss": 1.02302563, + "num_input_tokens_seen": 22590480, + "router_z_loss_mlp": 1.16503906, + "step": 284, + "time_per_iteration": 2.760528564453125 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01129453, + "balance_loss_mlp": 1.01319993, + "epoch": 0.054828780300115426, + "flos": 798585987072.0, + "grad_norm": 0.03078454247465455, + "language_loss": 0.96210134, + "learning_rate": 0.0009983857170412615, + "loss": 0.97339588, + "num_input_tokens_seen": 22668144, + "router_z_loss_mlp": 1.15917969, + "step": 285, + "time_per_iteration": 2.9911587238311768 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01131741, + "balance_loss_mlp": 1.01567924, + "epoch": 0.05502116198537899, + "flos": 550799400960.0, + "grad_norm": 0.028192528419898312, + "language_loss": 0.95645988, + "learning_rate": 0.000998360606130798, + "loss": 0.96777725, + "num_input_tokens_seen": 22749648, + "router_z_loss_mlp": 1.15722656, + "step": 286, + "time_per_iteration": 2.8603405952453613 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01119957, + "balance_loss_mlp": 1.00475311, + "epoch": 0.05521354367064255, + "flos": 1410909659136.0, + "grad_norm": 0.016802553847575376, + "language_loss": 0.69073117, + "learning_rate": 0.0009983353017395877, + "loss": 0.70193076, + "num_input_tokens_seen": 22982752, + "router_z_loss_mlp": 1.1484375, + "step": 287, + "time_per_iteration": 4.872994899749756 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139216, + "balance_loss_mlp": 1.02372622, + "epoch": 0.05540592535590612, + "flos": 646612495872.0, + "grad_norm": 0.03160477576624613, + "language_loss": 1.01500821, + "learning_rate": 0.0009983098038774552, + "loss": 1.02640033, + "num_input_tokens_seen": 23053584, + "router_z_loss_mlp": 1.15136719, + "step": 288, + "time_per_iteration": 2.7645044326782227 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01119652, + "balance_loss_mlp": 1.00521088, + "epoch": 0.05559830704116968, + "flos": 1514318512896.0, + "grad_norm": 0.011772143096286682, + "language_loss": 0.78170228, + "learning_rate": 0.0009982841125542993, + "loss": 0.79289877, + "num_input_tokens_seen": 23280256, + "router_z_loss_mlp": 1.140625, + "step": 289, + "time_per_iteration": 4.783201456069946 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150059, + "balance_loss_mlp": 1.03542745, + "epoch": 0.055790688726433245, + "flos": 509335737600.0, + "grad_norm": 0.037615798403722346, + "language_loss": 1.00063777, + "learning_rate": 0.0009982582277800948, + "loss": 1.01213825, + "num_input_tokens_seen": 23345760, + "router_z_loss_mlp": 1.14257812, + "step": 290, + "time_per_iteration": 2.5825588703155518 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142418, + "balance_loss_mlp": 1.02873969, + "epoch": 0.055983070411696804, + "flos": 659075410944.0, + "grad_norm": 0.03490310528255379, + "language_loss": 1.06654799, + "learning_rate": 0.0009982321495648908, + "loss": 1.07797217, + "num_input_tokens_seen": 23420720, + "router_z_loss_mlp": 1.13671875, + "step": 291, + "time_per_iteration": 2.8099231719970703 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137522, + "balance_loss_mlp": 1.02470279, + "epoch": 0.05617545209696037, + "flos": 588476851200.0, + "grad_norm": 0.035465642673631545, + "language_loss": 0.97683877, + "learning_rate": 0.0009982058779188115, + "loss": 0.98821402, + "num_input_tokens_seen": 23492576, + "router_z_loss_mlp": 1.13183594, + "step": 292, + "time_per_iteration": 2.7125580310821533 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136096, + "balance_loss_mlp": 1.02384841, + "epoch": 0.05636783378222393, + "flos": 612788332800.0, + "grad_norm": 0.032210362870472055, + "language_loss": 1.05647731, + "learning_rate": 0.0009981794128520567, + "loss": 1.06783831, + "num_input_tokens_seen": 23569824, + "router_z_loss_mlp": 1.12597656, + "step": 293, + "time_per_iteration": 2.7916390895843506 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135514, + "balance_loss_mlp": 1.0241251, + "epoch": 0.0565602154674875, + "flos": 669424071936.0, + "grad_norm": 0.03595229916115603, + "language_loss": 1.02550793, + "learning_rate": 0.000998152754374901, + "loss": 1.03686309, + "num_input_tokens_seen": 23649984, + "router_z_loss_mlp": 1.1171875, + "step": 294, + "time_per_iteration": 2.8770558834075928 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134115, + "balance_loss_mlp": 1.0227263, + "epoch": 0.05675259715275106, + "flos": 618365101824.0, + "grad_norm": 0.028486588423889302, + "language_loss": 0.98274708, + "learning_rate": 0.0009981259024976943, + "loss": 0.99408829, + "num_input_tokens_seen": 23722032, + "router_z_loss_mlp": 1.1171875, + "step": 295, + "time_per_iteration": 2.729853630065918 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01133246, + "balance_loss_mlp": 1.02204788, + "epoch": 0.05694497883801462, + "flos": 753154330368.0, + "grad_norm": 0.04188437456637708, + "language_loss": 0.968624, + "learning_rate": 0.0009980988572308612, + "loss": 0.97995651, + "num_input_tokens_seen": 23797376, + "router_z_loss_mlp": 1.11523438, + "step": 296, + "time_per_iteration": 3.0135345458984375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01132496, + "balance_loss_mlp": 1.02187026, + "epoch": 0.05713736052327818, + "flos": 713382067968.0, + "grad_norm": 0.0305883196599643, + "language_loss": 0.9903996, + "learning_rate": 0.0009980716185849015, + "loss": 1.0017246, + "num_input_tokens_seen": 23880496, + "router_z_loss_mlp": 1.109375, + "step": 297, + "time_per_iteration": 2.9962668418884277 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01129278, + "balance_loss_mlp": 1.01865172, + "epoch": 0.05732974220854175, + "flos": 469936750848.0, + "grad_norm": 0.029025981508343963, + "language_loss": 0.95620793, + "learning_rate": 0.0009980441865703904, + "loss": 0.96750069, + "num_input_tokens_seen": 23950016, + "router_z_loss_mlp": 1.109375, + "step": 298, + "time_per_iteration": 2.67486572265625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01126421, + "balance_loss_mlp": 1.0163666, + "epoch": 0.05752212389380531, + "flos": 602541739008.0, + "grad_norm": 0.028406065642448373, + "language_loss": 1.04190016, + "learning_rate": 0.000998016561197978, + "loss": 1.05316436, + "num_input_tokens_seen": 24020064, + "router_z_loss_mlp": 1.10351562, + "step": 299, + "time_per_iteration": 2.7435965538024902 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01127499, + "balance_loss_mlp": 1.01773107, + "epoch": 0.057714505579068875, + "flos": 679950622464.0, + "grad_norm": 0.02999406165417261, + "language_loss": 0.957955, + "learning_rate": 0.0009979887424783895, + "loss": 0.96922994, + "num_input_tokens_seen": 24095360, + "router_z_loss_mlp": 1.10058594, + "step": 300, + "time_per_iteration": 2.868412494659424 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01127678, + "balance_loss_mlp": 1.01800561, + "epoch": 0.057906887264332435, + "flos": 597012602112.0, + "grad_norm": 0.033381964405594114, + "language_loss": 0.95279002, + "learning_rate": 0.0009979607304224248, + "loss": 0.96406674, + "num_input_tokens_seen": 24164608, + "router_z_loss_mlp": 1.09960938, + "step": 301, + "time_per_iteration": 2.7196099758148193 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01127179, + "balance_loss_mlp": 1.01760185, + "epoch": 0.058099268949596, + "flos": 553165421568.0, + "grad_norm": 0.029428698202492602, + "language_loss": 1.02305853, + "learning_rate": 0.000997932525040959, + "loss": 1.03433037, + "num_input_tokens_seen": 24233840, + "router_z_loss_mlp": 1.09863281, + "step": 302, + "time_per_iteration": 2.645131826400757 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01126073, + "balance_loss_mlp": 1.0166868, + "epoch": 0.05829165063485956, + "flos": 509231725056.0, + "grad_norm": 0.033454482596205204, + "language_loss": 1.04832363, + "learning_rate": 0.000997904126344943, + "loss": 1.05958426, + "num_input_tokens_seen": 24302928, + "router_z_loss_mlp": 1.09667969, + "step": 303, + "time_per_iteration": 2.60955810546875 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125584, + "balance_loss_mlp": 1.0157212, + "epoch": 0.05848403232012313, + "flos": 616363608576.0, + "grad_norm": 0.0319979050325151, + "language_loss": 1.00779867, + "learning_rate": 0.0009978755343454018, + "loss": 1.01905453, + "num_input_tokens_seen": 24377024, + "router_z_loss_mlp": 1.1015625, + "step": 304, + "time_per_iteration": 2.733825206756592 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124254, + "balance_loss_mlp": 1.01467645, + "epoch": 0.05867641400538669, + "flos": 501079943424.0, + "grad_norm": 0.03385536533959698, + "language_loss": 1.01509869, + "learning_rate": 0.0009978467490534355, + "loss": 1.0263412, + "num_input_tokens_seen": 24442736, + "router_z_loss_mlp": 1.09863281, + "step": 305, + "time_per_iteration": 2.6263206005096436 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121932, + "balance_loss_mlp": 1.01292717, + "epoch": 0.05886879569065025, + "flos": 532379638272.0, + "grad_norm": 0.03088897761094542, + "language_loss": 0.98605353, + "learning_rate": 0.00099781777048022, + "loss": 0.99727285, + "num_input_tokens_seen": 24514800, + "router_z_loss_mlp": 1.09277344, + "step": 306, + "time_per_iteration": 2.7351841926574707 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122107, + "balance_loss_mlp": 1.01329267, + "epoch": 0.05906117737591381, + "flos": 490041111552.0, + "grad_norm": 0.034758856969872284, + "language_loss": 0.99957371, + "learning_rate": 0.0009977885986370057, + "loss": 1.01079476, + "num_input_tokens_seen": 24581648, + "router_z_loss_mlp": 1.09082031, + "step": 307, + "time_per_iteration": 2.566316843032837 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01120423, + "balance_loss_mlp": 1.01199007, + "epoch": 0.05925355906117737, + "flos": 592710216960.0, + "grad_norm": 0.0408216139096099, + "language_loss": 0.95604599, + "learning_rate": 0.000997759233535118, + "loss": 0.96725023, + "num_input_tokens_seen": 24658864, + "router_z_loss_mlp": 1.08691406, + "step": 308, + "time_per_iteration": 2.781667470932007 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01119623, + "balance_loss_mlp": 1.01147592, + "epoch": 0.05944594074644094, + "flos": 564788466432.0, + "grad_norm": 0.03543125546238922, + "language_loss": 1.01945186, + "learning_rate": 0.0009977296751859576, + "loss": 1.03064811, + "num_input_tokens_seen": 24735808, + "router_z_loss_mlp": 1.08398438, + "step": 309, + "time_per_iteration": 2.778700828552246 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121487, + "balance_loss_mlp": 1.0137223, + "epoch": 0.0596383224317045, + "flos": 539808201216.0, + "grad_norm": 0.03208598270087784, + "language_loss": 1.03591859, + "learning_rate": 0.0009976999236009998, + "loss": 1.04713345, + "num_input_tokens_seen": 24807744, + "router_z_loss_mlp": 1.08007812, + "step": 310, + "time_per_iteration": 2.790116786956787 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121449, + "balance_loss_mlp": 1.01387453, + "epoch": 0.059830704116968066, + "flos": 562053060864.0, + "grad_norm": 0.03260901983169028, + "language_loss": 1.05564129, + "learning_rate": 0.0009976699787917955, + "loss": 1.06685579, + "num_input_tokens_seen": 24876640, + "router_z_loss_mlp": 1.078125, + "step": 311, + "time_per_iteration": 2.6586148738861084 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108932, + "balance_loss_mlp": 1.00326538, + "epoch": 0.060023085802231625, + "flos": 1574050294272.0, + "grad_norm": 0.018314702584398344, + "language_loss": 0.73442996, + "learning_rate": 0.00099763984076997, + "loss": 0.74551928, + "num_input_tokens_seen": 25110864, + "router_z_loss_mlp": 1.05859375, + "step": 312, + "time_per_iteration": 4.943182945251465 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01128012, + "balance_loss_mlp": 1.02101004, + "epoch": 0.06021546748749519, + "flos": 483628363008.0, + "grad_norm": 0.04396023920554742, + "language_loss": 0.97026515, + "learning_rate": 0.0009976095095472243, + "loss": 0.98154521, + "num_input_tokens_seen": 25179328, + "router_z_loss_mlp": 1.07226562, + "step": 313, + "time_per_iteration": 2.619016408920288 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01131165, + "balance_loss_mlp": 1.02425838, + "epoch": 0.06040784917275875, + "flos": 621424205568.0, + "grad_norm": 0.03687701456451143, + "language_loss": 0.97965562, + "learning_rate": 0.0009975789851353334, + "loss": 0.99096727, + "num_input_tokens_seen": 25254128, + "router_z_loss_mlp": 1.07128906, + "step": 314, + "time_per_iteration": 2.8331894874572754 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125758, + "balance_loss_mlp": 1.01980519, + "epoch": 0.06060023085802232, + "flos": 484603348224.0, + "grad_norm": 0.029408756794299912, + "language_loss": 1.00726843, + "learning_rate": 0.0009975482675461487, + "loss": 1.01852608, + "num_input_tokens_seen": 25324624, + "router_z_loss_mlp": 1.06152344, + "step": 315, + "time_per_iteration": 2.659079074859619 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125971, + "balance_loss_mlp": 1.02001762, + "epoch": 0.06079261254328588, + "flos": 582986598144.0, + "grad_norm": 0.027344501346145803, + "language_loss": 0.98408186, + "learning_rate": 0.0009975173567915952, + "loss": 0.99534154, + "num_input_tokens_seen": 25393648, + "router_z_loss_mlp": 1.06152344, + "step": 316, + "time_per_iteration": 2.6947872638702393 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01123736, + "balance_loss_mlp": 1.01873684, + "epoch": 0.060984994228549444, + "flos": 689009348352.0, + "grad_norm": 0.03553374767777348, + "language_loss": 0.92618632, + "learning_rate": 0.000997486252883674, + "loss": 0.93742371, + "num_input_tokens_seen": 25469152, + "router_z_loss_mlp": 1.05175781, + "step": 317, + "time_per_iteration": 2.8523428440093994 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01123139, + "balance_loss_mlp": 1.01861632, + "epoch": 0.061177375913813004, + "flos": 1316749104384.0, + "grad_norm": 0.03506621320439297, + "language_loss": 0.97693729, + "learning_rate": 0.0009974549558344602, + "loss": 0.98816866, + "num_input_tokens_seen": 25560944, + "router_z_loss_mlp": 1.046875, + "step": 318, + "time_per_iteration": 3.705524206161499 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121913, + "balance_loss_mlp": 1.01805806, + "epoch": 0.06136975759907657, + "flos": 575401532928.0, + "grad_norm": 0.03493031867187039, + "language_loss": 1.07333064, + "learning_rate": 0.000997423465656105, + "loss": 1.08454978, + "num_input_tokens_seen": 25631424, + "router_z_loss_mlp": 1.04003906, + "step": 319, + "time_per_iteration": 2.75838565826416 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01119126, + "balance_loss_mlp": 1.01546133, + "epoch": 0.06156213928434013, + "flos": 528565234944.0, + "grad_norm": 0.037170039701900144, + "language_loss": 1.04350638, + "learning_rate": 0.0009973917823608335, + "loss": 1.05469775, + "num_input_tokens_seen": 25698176, + "router_z_loss_mlp": 1.03808594, + "step": 320, + "time_per_iteration": 2.6494460105895996 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117676, + "balance_loss_mlp": 1.01458335, + "epoch": 0.061754520969603696, + "flos": 496590920448.0, + "grad_norm": 0.030464742512101767, + "language_loss": 0.98981547, + "learning_rate": 0.0009973599059609462, + "loss": 1.00099218, + "num_input_tokens_seen": 25773472, + "router_z_loss_mlp": 1.03222656, + "step": 321, + "time_per_iteration": 2.7119081020355225 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116635, + "balance_loss_mlp": 1.01344728, + "epoch": 0.061946902654867256, + "flos": 441044872704.0, + "grad_norm": 0.031106795532346753, + "language_loss": 0.97035432, + "learning_rate": 0.000997327836468819, + "loss": 0.98152065, + "num_input_tokens_seen": 25841088, + "router_z_loss_mlp": 1.03320312, + "step": 322, + "time_per_iteration": 2.641977071762085 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121262, + "balance_loss_mlp": 1.01836073, + "epoch": 0.06213928434013082, + "flos": 600043515648.0, + "grad_norm": 0.031546338171402045, + "language_loss": 1.00120687, + "learning_rate": 0.000997295573896902, + "loss": 1.01241946, + "num_input_tokens_seen": 25919424, + "router_z_loss_mlp": 1.03027344, + "step": 323, + "time_per_iteration": 2.825425624847412 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113502, + "balance_loss_mlp": 1.01126862, + "epoch": 0.06233166602539438, + "flos": 1453116961536.0, + "grad_norm": 0.009515746361157745, + "language_loss": 0.8119604, + "learning_rate": 0.000997263118257721, + "loss": 0.82309544, + "num_input_tokens_seen": 26135504, + "router_z_loss_mlp": 1.0234375, + "step": 324, + "time_per_iteration": 4.7325074672698975 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108894, + "balance_loss_mlp": 1.0074234, + "epoch": 0.06252404771065795, + "flos": 1466631651072.0, + "grad_norm": 0.010337204897298672, + "language_loss": 0.78571939, + "learning_rate": 0.0009972304695638763, + "loss": 0.79680836, + "num_input_tokens_seen": 26358880, + "router_z_loss_mlp": 1.015625, + "step": 325, + "time_per_iteration": 4.845058917999268 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01131262, + "balance_loss_mlp": 1.02950513, + "epoch": 0.06271642939592151, + "flos": 465236790528.0, + "grad_norm": 0.04479189972062717, + "language_loss": 0.94122899, + "learning_rate": 0.000997197627828043, + "loss": 0.95254159, + "num_input_tokens_seen": 26425888, + "router_z_loss_mlp": 1.01855469, + "step": 326, + "time_per_iteration": 2.531477689743042 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136139, + "balance_loss_mlp": 1.03466833, + "epoch": 0.06290881108118507, + "flos": 533432391168.0, + "grad_norm": 0.03210871152906133, + "language_loss": 0.89633012, + "learning_rate": 0.0009971645930629716, + "loss": 0.9076916, + "num_input_tokens_seen": 26500656, + "router_z_loss_mlp": 1.015625, + "step": 327, + "time_per_iteration": 2.766155481338501 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01131438, + "balance_loss_mlp": 1.0305388, + "epoch": 0.06310119276644863, + "flos": 674768516352.0, + "grad_norm": 0.03217671154768682, + "language_loss": 1.03418863, + "learning_rate": 0.0009971313652814872, + "loss": 1.0455029, + "num_input_tokens_seen": 26577408, + "router_z_loss_mlp": 1.00976562, + "step": 328, + "time_per_iteration": 2.818718433380127 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125209, + "balance_loss_mlp": 1.02440596, + "epoch": 0.0632935744517122, + "flos": 772051381248.0, + "grad_norm": 0.03902843256426295, + "language_loss": 1.00692391, + "learning_rate": 0.0009970979444964903, + "loss": 1.01817608, + "num_input_tokens_seen": 26652048, + "router_z_loss_mlp": 1.00878906, + "step": 329, + "time_per_iteration": 2.9847218990325928 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01119216, + "balance_loss_mlp": 1.01869905, + "epoch": 0.06348595613697576, + "flos": 562975556352.0, + "grad_norm": 0.040034835413812295, + "language_loss": 1.01797342, + "learning_rate": 0.0009970643307209556, + "loss": 1.02916563, + "num_input_tokens_seen": 26728192, + "router_z_loss_mlp": 1.00585938, + "step": 330, + "time_per_iteration": 2.817711353302002 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112644, + "balance_loss_mlp": 1.01250839, + "epoch": 0.06367833782223932, + "flos": 677384358144.0, + "grad_norm": 0.031424074947949916, + "language_loss": 0.98358697, + "learning_rate": 0.0009970305239679334, + "loss": 0.99471337, + "num_input_tokens_seen": 26798016, + "router_z_loss_mlp": 1.00195312, + "step": 331, + "time_per_iteration": 2.8216280937194824 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011128, + "balance_loss_mlp": 1.01247358, + "epoch": 0.06387071950750288, + "flos": 496349847552.0, + "grad_norm": 0.04016029313197435, + "language_loss": 1.03082633, + "learning_rate": 0.0009969965242505483, + "loss": 1.04195428, + "num_input_tokens_seen": 26867536, + "router_z_loss_mlp": 1.00390625, + "step": 332, + "time_per_iteration": 2.631326675415039 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113411, + "balance_loss_mlp": 1.01317954, + "epoch": 0.06406310119276645, + "flos": 534557075712.0, + "grad_norm": 0.03761595064373852, + "language_loss": 0.99054992, + "learning_rate": 0.0009969623315820007, + "loss": 1.00168395, + "num_input_tokens_seen": 26941216, + "router_z_loss_mlp": 1.00292969, + "step": 333, + "time_per_iteration": 2.6700048446655273 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113642, + "balance_loss_mlp": 1.01369655, + "epoch": 0.06425548287803001, + "flos": 457165688832.0, + "grad_norm": 0.0356255093132357, + "language_loss": 0.99075055, + "learning_rate": 0.000996927945975565, + "loss": 1.00188696, + "num_input_tokens_seen": 27006560, + "router_z_loss_mlp": 0.99951172, + "step": 334, + "time_per_iteration": 2.567225933074951 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112774, + "balance_loss_mlp": 1.01282871, + "epoch": 0.06444786456329357, + "flos": 561123762432.0, + "grad_norm": 0.034265188200332725, + "language_loss": 0.96451521, + "learning_rate": 0.0009968933674445906, + "loss": 0.97564298, + "num_input_tokens_seen": 27076400, + "router_z_loss_mlp": 0.99951172, + "step": 335, + "time_per_iteration": 2.6834452152252197 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110954, + "balance_loss_mlp": 1.01100898, + "epoch": 0.06464024624855713, + "flos": 667357449984.0, + "grad_norm": 0.026754476738251005, + "language_loss": 0.980811, + "learning_rate": 0.0009968585960025028, + "loss": 0.99192053, + "num_input_tokens_seen": 27158672, + "router_z_loss_mlp": 0.99853516, + "step": 336, + "time_per_iteration": 2.9675402641296387 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112488, + "balance_loss_mlp": 1.01368713, + "epoch": 0.0648326279338207, + "flos": 1524558303744.0, + "grad_norm": 0.027483244216433014, + "language_loss": 0.77653188, + "learning_rate": 0.0009968236316628006, + "loss": 0.78765678, + "num_input_tokens_seen": 27380592, + "router_z_loss_mlp": 0.98632812, + "step": 337, + "time_per_iteration": 4.80242133140564 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115066, + "balance_loss_mlp": 1.01540756, + "epoch": 0.06502500961908426, + "flos": 1145216581632.0, + "grad_norm": 0.03509421691107687, + "language_loss": 0.96500707, + "learning_rate": 0.0009967884744390583, + "loss": 0.97615772, + "num_input_tokens_seen": 27469984, + "router_z_loss_mlp": 0.99414062, + "step": 338, + "time_per_iteration": 3.517488479614258 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118827, + "balance_loss_mlp": 1.01945412, + "epoch": 0.06521739130434782, + "flos": 583694265600.0, + "grad_norm": 0.03507378265000135, + "language_loss": 0.97375119, + "learning_rate": 0.0009967531243449256, + "loss": 0.98493946, + "num_input_tokens_seen": 27543904, + "router_z_loss_mlp": 0.9921875, + "step": 339, + "time_per_iteration": 2.713430404663086 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01119012, + "balance_loss_mlp": 1.02002037, + "epoch": 0.06540977298961138, + "flos": 498659487744.0, + "grad_norm": 0.03215705196534619, + "language_loss": 1.04762673, + "learning_rate": 0.000996717581394126, + "loss": 1.05881691, + "num_input_tokens_seen": 27609888, + "router_z_loss_mlp": 0.98876953, + "step": 340, + "time_per_iteration": 2.5391135215759277 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116775, + "balance_loss_mlp": 1.01787901, + "epoch": 0.06560215467487496, + "flos": 543904506624.0, + "grad_norm": 0.030763143460584817, + "language_loss": 1.05044627, + "learning_rate": 0.000996681845600459, + "loss": 1.06161404, + "num_input_tokens_seen": 27683936, + "router_z_loss_mlp": 0.98632812, + "step": 341, + "time_per_iteration": 2.670804262161255 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118249, + "balance_loss_mlp": 1.01963949, + "epoch": 0.06579453636013852, + "flos": 414351819264.0, + "grad_norm": 0.040583240554979534, + "language_loss": 0.9744029, + "learning_rate": 0.0009966459169777982, + "loss": 0.98558539, + "num_input_tokens_seen": 27747840, + "router_z_loss_mlp": 0.98388672, + "step": 342, + "time_per_iteration": 2.5040364265441895 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115105, + "balance_loss_mlp": 1.01706719, + "epoch": 0.06598691804540208, + "flos": 561681730560.0, + "grad_norm": 0.04164342519277061, + "language_loss": 1.05655766, + "learning_rate": 0.0009966097955400924, + "loss": 1.06770873, + "num_input_tokens_seen": 27819728, + "router_z_loss_mlp": 0.97949219, + "step": 343, + "time_per_iteration": 2.666548728942871 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112413, + "balance_loss_mlp": 1.01532912, + "epoch": 0.06617929973066564, + "flos": 573302830080.0, + "grad_norm": 0.03386977599556249, + "language_loss": 0.99970496, + "learning_rate": 0.0009965734813013652, + "loss": 1.01082909, + "num_input_tokens_seen": 27893536, + "router_z_loss_mlp": 0.97070312, + "step": 344, + "time_per_iteration": 2.8448328971862793 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109743, + "balance_loss_mlp": 1.01261127, + "epoch": 0.06637168141592921, + "flos": 491465194752.0, + "grad_norm": 0.03376822413453626, + "language_loss": 1.02026749, + "learning_rate": 0.0009965369742757151, + "loss": 1.03136492, + "num_input_tokens_seen": 27960976, + "router_z_loss_mlp": 0.97119141, + "step": 345, + "time_per_iteration": 2.568521738052368 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108276, + "balance_loss_mlp": 1.01176453, + "epoch": 0.06656406310119277, + "flos": 1081039518720.0, + "grad_norm": 0.03449730062562062, + "language_loss": 0.98245382, + "learning_rate": 0.0009965002744773152, + "loss": 0.99353665, + "num_input_tokens_seen": 28050864, + "router_z_loss_mlp": 0.96484375, + "step": 346, + "time_per_iteration": 3.501471519470215 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109602, + "balance_loss_mlp": 1.01347148, + "epoch": 0.06675644478645633, + "flos": 514723923456.0, + "grad_norm": 0.029121068034632647, + "language_loss": 0.95998263, + "learning_rate": 0.0009964633819204139, + "loss": 0.97107863, + "num_input_tokens_seen": 28122448, + "router_z_loss_mlp": 0.9609375, + "step": 347, + "time_per_iteration": 2.6675100326538086 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093636, + "balance_loss_mlp": 0.9986496, + "epoch": 0.06694882647171989, + "flos": 1450537079808.0, + "grad_norm": 0.008592618933675954, + "language_loss": 0.81801116, + "learning_rate": 0.0009964262966193338, + "loss": 0.82894754, + "num_input_tokens_seen": 28350352, + "router_z_loss_mlp": 0.94921875, + "step": 348, + "time_per_iteration": 4.92915415763855 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093124, + "balance_loss_mlp": 0.99832916, + "epoch": 0.06714120815698346, + "flos": 1555400152320.0, + "grad_norm": 0.006174818833869298, + "language_loss": 0.75153887, + "learning_rate": 0.000996389018588473, + "loss": 0.76247013, + "num_input_tokens_seen": 28585584, + "router_z_loss_mlp": 0.94726562, + "step": 349, + "time_per_iteration": 4.8783159255981445 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112042, + "balance_loss_mlp": 1.01719952, + "epoch": 0.06733358984224702, + "flos": 881617326336.0, + "grad_norm": 0.039044792628629706, + "language_loss": 0.95966816, + "learning_rate": 0.000996351547842304, + "loss": 0.97078854, + "num_input_tokens_seen": 28672512, + "router_z_loss_mlp": 0.94775391, + "step": 350, + "time_per_iteration": 3.151158094406128 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106972, + "balance_loss_mlp": 1.01222503, + "epoch": 0.06752597152751058, + "flos": 519918668544.0, + "grad_norm": 0.04011951728876299, + "language_loss": 0.94198334, + "learning_rate": 0.0009963138843953744, + "loss": 0.953053, + "num_input_tokens_seen": 28741520, + "router_z_loss_mlp": 0.94677734, + "step": 351, + "time_per_iteration": 2.6077194213867188 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111271, + "balance_loss_mlp": 1.01661849, + "epoch": 0.06771835321277414, + "flos": 540883308288.0, + "grad_norm": 0.02897454745239974, + "language_loss": 0.98297268, + "learning_rate": 0.000996276028262306, + "loss": 0.99408543, + "num_input_tokens_seen": 28814912, + "router_z_loss_mlp": 0.94580078, + "step": 352, + "time_per_iteration": 2.8440346717834473 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115128, + "balance_loss_mlp": 1.02052331, + "epoch": 0.0679107348980377, + "flos": 461615827968.0, + "grad_norm": 0.03358261828070724, + "language_loss": 1.05270672, + "learning_rate": 0.0009962379794577964, + "loss": 1.06385791, + "num_input_tokens_seen": 28882192, + "router_z_loss_mlp": 0.9453125, + "step": 353, + "time_per_iteration": 2.6153147220611572 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115897, + "balance_loss_mlp": 1.02129257, + "epoch": 0.06810311658330127, + "flos": 637208684544.0, + "grad_norm": 0.03193767698980152, + "language_loss": 0.94629884, + "learning_rate": 0.000996199737996617, + "loss": 0.95745778, + "num_input_tokens_seen": 28968576, + "router_z_loss_mlp": 0.9453125, + "step": 354, + "time_per_iteration": 2.9557363986968994 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01114833, + "balance_loss_mlp": 1.0208956, + "epoch": 0.06829549826856483, + "flos": 465627562752.0, + "grad_norm": 0.034421374529713736, + "language_loss": 1.03816652, + "learning_rate": 0.0009961613038936149, + "loss": 1.04931474, + "num_input_tokens_seen": 29036160, + "router_z_loss_mlp": 0.93847656, + "step": 355, + "time_per_iteration": 2.583648204803467 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112591, + "balance_loss_mlp": 1.01879704, + "epoch": 0.06848787995382839, + "flos": 635897362176.0, + "grad_norm": 0.027271592740405557, + "language_loss": 0.95725697, + "learning_rate": 0.000996122677163711, + "loss": 0.96838284, + "num_input_tokens_seen": 29112048, + "router_z_loss_mlp": 0.93701172, + "step": 356, + "time_per_iteration": 2.7997536659240723 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113642, + "balance_loss_mlp": 1.02022934, + "epoch": 0.06868026163909195, + "flos": 807781773312.0, + "grad_norm": 0.036098266403844226, + "language_loss": 1.02058005, + "learning_rate": 0.000996083857821902, + "loss": 1.03171647, + "num_input_tokens_seen": 29190960, + "router_z_loss_mlp": 0.93310547, + "step": 357, + "time_per_iteration": 3.0117554664611816 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113245, + "balance_loss_mlp": 1.01978505, + "epoch": 0.06887264332435553, + "flos": 440152512768.0, + "grad_norm": 0.03587140172627376, + "language_loss": 1.00045025, + "learning_rate": 0.0009960448458832588, + "loss": 1.01158273, + "num_input_tokens_seen": 29262832, + "router_z_loss_mlp": 0.93359375, + "step": 358, + "time_per_iteration": 2.6948373317718506 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110172, + "balance_loss_mlp": 1.01714087, + "epoch": 0.06906502500961909, + "flos": 485786358528.0, + "grad_norm": 0.028895953236024122, + "language_loss": 0.99980301, + "learning_rate": 0.000996005641362927, + "loss": 1.01090467, + "num_input_tokens_seen": 29329552, + "router_z_loss_mlp": 0.92919922, + "step": 359, + "time_per_iteration": 2.600889205932617 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110333, + "balance_loss_mlp": 1.01715922, + "epoch": 0.06925740669488265, + "flos": 734886212352.0, + "grad_norm": 0.03093408458560108, + "language_loss": 1.02453041, + "learning_rate": 0.0009959662442761274, + "loss": 1.0356338, + "num_input_tokens_seen": 29410784, + "router_z_loss_mlp": 0.93066406, + "step": 360, + "time_per_iteration": 2.9324746131896973 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107676, + "balance_loss_mlp": 1.01445436, + "epoch": 0.0694497883801462, + "flos": 553571745024.0, + "grad_norm": 0.03028505188811882, + "language_loss": 0.95860314, + "learning_rate": 0.000995926654638155, + "loss": 0.96967983, + "num_input_tokens_seen": 29486992, + "router_z_loss_mlp": 0.93115234, + "step": 361, + "time_per_iteration": 2.8280868530273438 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104746, + "balance_loss_mlp": 1.01157248, + "epoch": 0.06964217006540978, + "flos": 679244900352.0, + "grad_norm": 0.03450824772288923, + "language_loss": 0.98644811, + "learning_rate": 0.00099588687246438, + "loss": 0.99749553, + "num_input_tokens_seen": 29557232, + "router_z_loss_mlp": 0.93066406, + "step": 362, + "time_per_iteration": 2.8108932971954346 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108438, + "balance_loss_mlp": 1.01535928, + "epoch": 0.06983455175067334, + "flos": 525261167616.0, + "grad_norm": 0.03621302361184023, + "language_loss": 1.06105995, + "learning_rate": 0.0009958468977702471, + "loss": 1.07214439, + "num_input_tokens_seen": 29625344, + "router_z_loss_mlp": 0.9296875, + "step": 363, + "time_per_iteration": 2.6087372303009033 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135422, + "balance_loss_mlp": 1.04272461, + "epoch": 0.0700269334359369, + "flos": 1580176283136.0, + "grad_norm": 0.03651647631774479, + "language_loss": 0.79734707, + "learning_rate": 0.0009958067305712761, + "loss": 0.80870128, + "num_input_tokens_seen": 29843664, + "router_z_loss_mlp": 0.92578125, + "step": 364, + "time_per_iteration": 4.806072235107422 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104861, + "balance_loss_mlp": 1.01254511, + "epoch": 0.07021931512120046, + "flos": 1014858050304.0, + "grad_norm": 0.04058448706036458, + "language_loss": 0.94071019, + "learning_rate": 0.0009957663708830612, + "loss": 0.9517588, + "num_input_tokens_seen": 29927152, + "router_z_loss_mlp": 0.921875, + "step": 365, + "time_per_iteration": 3.30859637260437 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110656, + "balance_loss_mlp": 1.01862633, + "epoch": 0.07041169680646403, + "flos": 824432367360.0, + "grad_norm": 0.04186203278400794, + "language_loss": 0.98041129, + "learning_rate": 0.0009957258187212714, + "loss": 0.9915179, + "num_input_tokens_seen": 30004928, + "router_z_loss_mlp": 0.91894531, + "step": 366, + "time_per_iteration": 3.00058913230896 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097015, + "balance_loss_mlp": 1.00565338, + "epoch": 0.07060407849172759, + "flos": 1417293250560.0, + "grad_norm": 0.011820269564466843, + "language_loss": 0.79194862, + "learning_rate": 0.0009956850741016502, + "loss": 0.80291873, + "num_input_tokens_seen": 30230256, + "router_z_loss_mlp": 0.91210938, + "step": 367, + "time_per_iteration": 4.794500827789307 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113703, + "balance_loss_mlp": 1.02186394, + "epoch": 0.07079646017699115, + "flos": 513942379008.0, + "grad_norm": 0.041641563183133855, + "language_loss": 0.94691038, + "learning_rate": 0.0009956441370400167, + "loss": 0.95804739, + "num_input_tokens_seen": 30301200, + "router_z_loss_mlp": 0.91699219, + "step": 368, + "time_per_iteration": 2.63948917388916 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0111577, + "balance_loss_mlp": 1.02436066, + "epoch": 0.07098884186225471, + "flos": 541549179648.0, + "grad_norm": 0.03426405251061256, + "language_loss": 1.00885093, + "learning_rate": 0.0009956030075522636, + "loss": 1.02000868, + "num_input_tokens_seen": 30377024, + "router_z_loss_mlp": 0.91259766, + "step": 369, + "time_per_iteration": 2.74157452583313 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107449, + "balance_loss_mlp": 1.01613438, + "epoch": 0.07118122354751828, + "flos": 549739845120.0, + "grad_norm": 0.030296400642036637, + "language_loss": 1.0031743, + "learning_rate": 0.0009955616856543587, + "loss": 1.01424885, + "num_input_tokens_seen": 30448896, + "router_z_loss_mlp": 0.91162109, + "step": 370, + "time_per_iteration": 2.6210479736328125 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105786, + "balance_loss_mlp": 1.01475775, + "epoch": 0.07137360523278184, + "flos": 622077437952.0, + "grad_norm": 0.029509682347833893, + "language_loss": 0.92550498, + "learning_rate": 0.0009955201713623448, + "loss": 0.93656284, + "num_input_tokens_seen": 30523584, + "router_z_loss_mlp": 0.90869141, + "step": 371, + "time_per_iteration": 2.757277011871338 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092491, + "balance_loss_mlp": 1.00284576, + "epoch": 0.0715659869180454, + "flos": 1505976202752.0, + "grad_norm": 0.005566886599578838, + "language_loss": 0.76672721, + "learning_rate": 0.000995478464692339, + "loss": 0.77765214, + "num_input_tokens_seen": 30757920, + "router_z_loss_mlp": 0.89648438, + "step": 372, + "time_per_iteration": 4.947838306427002 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01126764, + "balance_loss_mlp": 1.0361172, + "epoch": 0.07175836860330896, + "flos": 496482050304.0, + "grad_norm": 0.040308561934975694, + "language_loss": 1.05629396, + "learning_rate": 0.0009954365656605333, + "loss": 1.06756163, + "num_input_tokens_seen": 30824960, + "router_z_loss_mlp": 0.90478516, + "step": 373, + "time_per_iteration": 2.5537302494049072 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124141, + "balance_loss_mlp": 1.03416181, + "epoch": 0.07195075028857253, + "flos": 787082505984.0, + "grad_norm": 0.034789914575730614, + "language_loss": 0.98912442, + "learning_rate": 0.0009953944742831947, + "loss": 1.00036585, + "num_input_tokens_seen": 30902224, + "router_z_loss_mlp": 0.89892578, + "step": 374, + "time_per_iteration": 2.976074695587158 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106044, + "balance_loss_mlp": 1.01678061, + "epoch": 0.0721431319738361, + "flos": 594347182848.0, + "grad_norm": 0.029628456658550576, + "language_loss": 1.02558136, + "learning_rate": 0.0009953521905766642, + "loss": 1.03664172, + "num_input_tokens_seen": 30984784, + "router_z_loss_mlp": 0.89404297, + "step": 375, + "time_per_iteration": 2.9556005001068115 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101275, + "balance_loss_mlp": 1.01234496, + "epoch": 0.07233551365909965, + "flos": 549329630976.0, + "grad_norm": 0.034208323574026145, + "language_loss": 1.01073325, + "learning_rate": 0.0009953097145573577, + "loss": 1.02174592, + "num_input_tokens_seen": 31055376, + "router_z_loss_mlp": 0.89111328, + "step": 376, + "time_per_iteration": 2.6449482440948486 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106433, + "balance_loss_mlp": 1.01759815, + "epoch": 0.07252789534436321, + "flos": 959169106176.0, + "grad_norm": 0.031040198427254525, + "language_loss": 0.98588479, + "learning_rate": 0.000995267046241766, + "loss": 0.99694908, + "num_input_tokens_seen": 31144944, + "router_z_loss_mlp": 0.89013672, + "step": 377, + "time_per_iteration": 3.2564361095428467 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106989, + "balance_loss_mlp": 1.01877415, + "epoch": 0.07272027702962677, + "flos": 508656260352.0, + "grad_norm": 0.029229214223645432, + "language_loss": 0.98238575, + "learning_rate": 0.0009952241856464547, + "loss": 0.99345565, + "num_input_tokens_seen": 31213392, + "router_z_loss_mlp": 0.88378906, + "step": 378, + "time_per_iteration": 2.5843191146850586 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108111, + "balance_loss_mlp": 1.02013505, + "epoch": 0.07291265871489035, + "flos": 613552380672.0, + "grad_norm": 0.03194005050639913, + "language_loss": 1.05557346, + "learning_rate": 0.0009951811327880632, + "loss": 1.06665444, + "num_input_tokens_seen": 31289840, + "router_z_loss_mlp": 0.88134766, + "step": 379, + "time_per_iteration": 2.727449655532837 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107323, + "balance_loss_mlp": 1.01934636, + "epoch": 0.0731050404001539, + "flos": 496742565120.0, + "grad_norm": 0.03092115392183015, + "language_loss": 0.98400533, + "learning_rate": 0.0009951378876833063, + "loss": 0.99507862, + "num_input_tokens_seen": 31357600, + "router_z_loss_mlp": 0.88134766, + "step": 380, + "time_per_iteration": 2.5320205688476562 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101258, + "balance_loss_mlp": 1.01332915, + "epoch": 0.07329742208541747, + "flos": 641130991104.0, + "grad_norm": 0.032065094183830696, + "language_loss": 1.04703462, + "learning_rate": 0.0009950944503489736, + "loss": 1.05804706, + "num_input_tokens_seen": 31428896, + "router_z_loss_mlp": 0.88085938, + "step": 381, + "time_per_iteration": 2.7422876358032227 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102134, + "balance_loss_mlp": 1.01453876, + "epoch": 0.07348980377068103, + "flos": 817741607424.0, + "grad_norm": 0.030510114485064205, + "language_loss": 0.99112171, + "learning_rate": 0.0009950508208019285, + "loss": 1.00214303, + "num_input_tokens_seen": 31507424, + "router_z_loss_mlp": 0.87744141, + "step": 382, + "time_per_iteration": 3.046475410461426 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101212, + "balance_loss_mlp": 1.01323569, + "epoch": 0.0736821854559446, + "flos": 509670129408.0, + "grad_norm": 0.035756321159612754, + "language_loss": 1.03789318, + "learning_rate": 0.0009950069990591096, + "loss": 1.04890537, + "num_input_tokens_seen": 31576768, + "router_z_loss_mlp": 0.88134766, + "step": 383, + "time_per_iteration": 2.620088577270508 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113144, + "balance_loss_mlp": 1.02674103, + "epoch": 0.07387456714120816, + "flos": 1558050987264.0, + "grad_norm": 0.043940663043905655, + "language_loss": 0.76401371, + "learning_rate": 0.0009949629851375302, + "loss": 0.77514511, + "num_input_tokens_seen": 31797312, + "router_z_loss_mlp": 0.86523438, + "step": 384, + "time_per_iteration": 4.87653374671936 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121594, + "balance_loss_mlp": 1.03299809, + "epoch": 0.07406694882647172, + "flos": 526644421632.0, + "grad_norm": 0.039102279996233, + "language_loss": 0.96614265, + "learning_rate": 0.0009949187790542777, + "loss": 0.97735858, + "num_input_tokens_seen": 31869568, + "router_z_loss_mlp": 0.88769531, + "step": 385, + "time_per_iteration": 2.734100580215454 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112471, + "balance_loss_mlp": 1.03625691, + "epoch": 0.07425933051173528, + "flos": 498824738304.0, + "grad_norm": 0.03701278047407747, + "language_loss": 0.92462552, + "learning_rate": 0.0009948743808265148, + "loss": 0.93587261, + "num_input_tokens_seen": 31941712, + "router_z_loss_mlp": 0.88623047, + "step": 386, + "time_per_iteration": 2.7154581546783447 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125455, + "balance_loss_mlp": 1.03704965, + "epoch": 0.07445171219699885, + "flos": 506057915136.0, + "grad_norm": 0.06663512882119103, + "language_loss": 1.02268195, + "learning_rate": 0.0009948297904714782, + "loss": 1.0339365, + "num_input_tokens_seen": 32015232, + "router_z_loss_mlp": 0.88574219, + "step": 387, + "time_per_iteration": 2.68532133102417 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112575, + "balance_loss_mlp": 1.03777313, + "epoch": 0.07464409388226241, + "flos": 555117337344.0, + "grad_norm": 0.036483324457394946, + "language_loss": 0.94151849, + "learning_rate": 0.0009947850080064796, + "loss": 0.95277596, + "num_input_tokens_seen": 32094640, + "router_z_loss_mlp": 0.88134766, + "step": 388, + "time_per_iteration": 2.789128303527832 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121204, + "balance_loss_mlp": 1.03370392, + "epoch": 0.07483647556752597, + "flos": 778275546624.0, + "grad_norm": 0.0421926900222792, + "language_loss": 0.99476451, + "learning_rate": 0.0009947400334489047, + "loss": 1.00597644, + "num_input_tokens_seen": 32176640, + "router_z_loss_mlp": 0.87646484, + "step": 389, + "time_per_iteration": 2.9937496185302734 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011085, + "balance_loss_mlp": 1.02133441, + "epoch": 0.07502885725278953, + "flos": 613682638080.0, + "grad_norm": 0.0417493031738284, + "language_loss": 0.90741575, + "learning_rate": 0.0009946948668162145, + "loss": 0.91850078, + "num_input_tokens_seen": 32246704, + "router_z_loss_mlp": 0.87304688, + "step": 390, + "time_per_iteration": 2.7264010906219482 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101473, + "balance_loss_mlp": 1.01502275, + "epoch": 0.0752212389380531, + "flos": 689856021504.0, + "grad_norm": 0.03330838563423677, + "language_loss": 0.95001, + "learning_rate": 0.0009946495081259441, + "loss": 0.9610247, + "num_input_tokens_seen": 32320032, + "router_z_loss_mlp": 0.86572266, + "step": 391, + "time_per_iteration": 2.832472085952759 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097898, + "balance_loss_mlp": 1.01182938, + "epoch": 0.07541362062331666, + "flos": 767052022272.0, + "grad_norm": 0.03859494705227578, + "language_loss": 0.99014449, + "learning_rate": 0.0009946039573957035, + "loss": 1.00112355, + "num_input_tokens_seen": 32398144, + "router_z_loss_mlp": 0.86181641, + "step": 392, + "time_per_iteration": 2.925933361053467 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101692, + "balance_loss_mlp": 1.01576602, + "epoch": 0.07560600230858022, + "flos": 589909682688.0, + "grad_norm": 0.039112379024015986, + "language_loss": 0.95485294, + "learning_rate": 0.000994558214643177, + "loss": 0.9658699, + "num_input_tokens_seen": 32471984, + "router_z_loss_mlp": 0.86035156, + "step": 393, + "time_per_iteration": 2.763448476791382 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095538, + "balance_loss_mlp": 1.00961244, + "epoch": 0.07579838399384378, + "flos": 751146034176.0, + "grad_norm": 0.03818992224284351, + "language_loss": 0.96862066, + "learning_rate": 0.000994512279886123, + "loss": 0.97957599, + "num_input_tokens_seen": 32550176, + "router_z_loss_mlp": 0.86035156, + "step": 394, + "time_per_iteration": 3.143615245819092 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101397, + "balance_loss_mlp": 1.01561391, + "epoch": 0.07599076567910736, + "flos": 524551554816.0, + "grad_norm": 0.030240351127206026, + "language_loss": 0.96659988, + "learning_rate": 0.0009944661531423758, + "loss": 0.97761387, + "num_input_tokens_seen": 32620768, + "router_z_loss_mlp": 0.85888672, + "step": 395, + "time_per_iteration": 2.6748764514923096 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107513, + "balance_loss_mlp": 1.02206361, + "epoch": 0.07618314736437092, + "flos": 552186545664.0, + "grad_norm": 0.03358451790414236, + "language_loss": 0.95614338, + "learning_rate": 0.000994419834429843, + "loss": 0.96721858, + "num_input_tokens_seen": 32693472, + "router_z_loss_mlp": 0.85546875, + "step": 396, + "time_per_iteration": 2.6525089740753174 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105372, + "balance_loss_mlp": 1.01987493, + "epoch": 0.07637552904963447, + "flos": 699433831680.0, + "grad_norm": 0.04315212632526892, + "language_loss": 1.00552011, + "learning_rate": 0.0009943733237665069, + "loss": 1.01657379, + "num_input_tokens_seen": 32764976, + "router_z_loss_mlp": 0.85595703, + "step": 397, + "time_per_iteration": 2.8678157329559326 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097353, + "balance_loss_mlp": 1.01218963, + "epoch": 0.07656791073489803, + "flos": 580636128768.0, + "grad_norm": 0.029538416941692198, + "language_loss": 0.99224108, + "learning_rate": 0.0009943266211704248, + "loss": 1.0032146, + "num_input_tokens_seen": 32853104, + "router_z_loss_mlp": 0.85253906, + "step": 398, + "time_per_iteration": 3.0023248195648193 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099387, + "balance_loss_mlp": 1.01460528, + "epoch": 0.0767602924201616, + "flos": 418037910528.0, + "grad_norm": 0.03167845871290285, + "language_loss": 1.01143491, + "learning_rate": 0.000994279726659728, + "loss": 1.02242875, + "num_input_tokens_seen": 32919376, + "router_z_loss_mlp": 0.84863281, + "step": 399, + "time_per_iteration": 2.527693271636963 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107007, + "balance_loss_mlp": 1.02246368, + "epoch": 0.07695267410542517, + "flos": 483888877824.0, + "grad_norm": 0.03414294034973106, + "language_loss": 0.9968133, + "learning_rate": 0.0009942326402526231, + "loss": 1.00788331, + "num_input_tokens_seen": 32988064, + "router_z_loss_mlp": 0.84619141, + "step": 400, + "time_per_iteration": 2.5610573291778564 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112837, + "balance_loss_mlp": 1.02848434, + "epoch": 0.07714505579068873, + "flos": 532027749888.0, + "grad_norm": 0.030264499227930883, + "language_loss": 0.97403878, + "learning_rate": 0.0009941853619673902, + "loss": 0.98516715, + "num_input_tokens_seen": 33059024, + "router_z_loss_mlp": 0.84423828, + "step": 401, + "time_per_iteration": 2.680175542831421 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107236, + "balance_loss_mlp": 1.02302694, + "epoch": 0.07733743747595229, + "flos": 806440315392.0, + "grad_norm": 0.03979329481069023, + "language_loss": 1.01160502, + "learning_rate": 0.0009941378918223844, + "loss": 1.02267742, + "num_input_tokens_seen": 33137712, + "router_z_loss_mlp": 0.84277344, + "step": 402, + "time_per_iteration": 3.0908427238464355 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098686, + "balance_loss_mlp": 1.01447606, + "epoch": 0.07752981916121585, + "flos": 623614281984.0, + "grad_norm": 0.03310929598543939, + "language_loss": 0.93567806, + "learning_rate": 0.0009940902298360354, + "loss": 0.94666493, + "num_input_tokens_seen": 33211296, + "router_z_loss_mlp": 0.84277344, + "step": 403, + "time_per_iteration": 2.7569308280944824 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094976, + "balance_loss_mlp": 1.01048076, + "epoch": 0.07772220084647942, + "flos": 729543713280.0, + "grad_norm": 0.03955766616265138, + "language_loss": 1.03173304, + "learning_rate": 0.0009940423760268473, + "loss": 1.04268289, + "num_input_tokens_seen": 33283632, + "router_z_loss_mlp": 0.84570312, + "step": 404, + "time_per_iteration": 2.8456103801727295 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098252, + "balance_loss_mlp": 1.01375628, + "epoch": 0.07791458253174298, + "flos": 556469488896.0, + "grad_norm": 0.042207617679060144, + "language_loss": 0.96929657, + "learning_rate": 0.0009939943304133982, + "loss": 0.98027909, + "num_input_tokens_seen": 33350704, + "router_z_loss_mlp": 0.84570312, + "step": 405, + "time_per_iteration": 2.615145444869995 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104796, + "balance_loss_mlp": 1.02044404, + "epoch": 0.07810696421700654, + "flos": 554235671040.0, + "grad_norm": 0.04104566792755741, + "language_loss": 1.03659868, + "learning_rate": 0.0009939460930143416, + "loss": 1.04764676, + "num_input_tokens_seen": 33416272, + "router_z_loss_mlp": 0.84423828, + "step": 406, + "time_per_iteration": 2.6304614543914795 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110157, + "balance_loss_mlp": 1.01745594, + "epoch": 0.0782993459022701, + "flos": 651879172608.0, + "grad_norm": 0.0317151282671847, + "language_loss": 0.97752666, + "learning_rate": 0.0009938976638484043, + "loss": 0.98854232, + "num_input_tokens_seen": 33501824, + "router_z_loss_mlp": 0.84179688, + "step": 407, + "time_per_iteration": 2.9032115936279297 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109564, + "balance_loss_mlp": 1.01205039, + "epoch": 0.07849172758753367, + "flos": 497161527552.0, + "grad_norm": 0.04013855375776475, + "language_loss": 0.97246277, + "learning_rate": 0.0009938490429343887, + "loss": 0.98341918, + "num_input_tokens_seen": 33571456, + "router_z_loss_mlp": 0.83642578, + "step": 408, + "time_per_iteration": 2.5688796043395996 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095618, + "balance_loss_mlp": 1.01236188, + "epoch": 0.07868410927279723, + "flos": 579076930560.0, + "grad_norm": 0.0397915036848884, + "language_loss": 0.97571141, + "learning_rate": 0.0009938002302911709, + "loss": 0.98666751, + "num_input_tokens_seen": 33646320, + "router_z_loss_mlp": 0.83300781, + "step": 409, + "time_per_iteration": 2.75036883354187 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096533, + "balance_loss_mlp": 1.01365864, + "epoch": 0.07887649095806079, + "flos": 524067463680.0, + "grad_norm": 0.03678821175613874, + "language_loss": 1.00230122, + "learning_rate": 0.0009937512259377015, + "loss": 1.01326644, + "num_input_tokens_seen": 33717664, + "router_z_loss_mlp": 0.82910156, + "step": 410, + "time_per_iteration": 2.6584975719451904 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110197, + "balance_loss_mlp": 1.01938236, + "epoch": 0.07906887264332435, + "flos": 558438901248.0, + "grad_norm": 0.04956969404692801, + "language_loss": 0.989124, + "learning_rate": 0.000993702029893006, + "loss": 1.00014377, + "num_input_tokens_seen": 33794720, + "router_z_loss_mlp": 0.82617188, + "step": 411, + "time_per_iteration": 2.7666263580322266 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102247, + "balance_loss_mlp": 1.0196116, + "epoch": 0.07926125432858792, + "flos": 823364063232.0, + "grad_norm": 0.03322797228086769, + "language_loss": 0.99091381, + "learning_rate": 0.0009936526421761838, + "loss": 1.00193632, + "num_input_tokens_seen": 33868304, + "router_z_loss_mlp": 0.82666016, + "step": 412, + "time_per_iteration": 3.0222113132476807 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102099, + "balance_loss_mlp": 1.01955855, + "epoch": 0.07945363601385148, + "flos": 563394518784.0, + "grad_norm": 0.04210923401756456, + "language_loss": 1.01423764, + "learning_rate": 0.000993603062806409, + "loss": 1.02525866, + "num_input_tokens_seen": 33937424, + "router_z_loss_mlp": 0.82568359, + "step": 413, + "time_per_iteration": 2.713226079940796 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100317, + "balance_loss_mlp": 1.0176332, + "epoch": 0.07964601769911504, + "flos": 518885357568.0, + "grad_norm": 0.041362228888401006, + "language_loss": 1.04903626, + "learning_rate": 0.0009935532918029298, + "loss": 1.06003952, + "num_input_tokens_seen": 34003984, + "router_z_loss_mlp": 0.82714844, + "step": 414, + "time_per_iteration": 2.59602689743042 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095709, + "balance_loss_mlp": 1.01326394, + "epoch": 0.0798383993843786, + "flos": 540301040640.0, + "grad_norm": 0.030384950019726516, + "language_loss": 0.97377884, + "learning_rate": 0.0009935033291850694, + "loss": 0.98473597, + "num_input_tokens_seen": 34072400, + "router_z_loss_mlp": 0.82470703, + "step": 415, + "time_per_iteration": 2.6417808532714844 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094851, + "balance_loss_mlp": 1.013026, + "epoch": 0.08003078106964218, + "flos": 486122695680.0, + "grad_norm": 0.03579523867672845, + "language_loss": 1.00004411, + "learning_rate": 0.0009934531749722247, + "loss": 1.01099253, + "num_input_tokens_seen": 34142448, + "router_z_loss_mlp": 0.81835938, + "step": 416, + "time_per_iteration": 2.593029737472534 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095566, + "balance_loss_mlp": 1.01383638, + "epoch": 0.08022316275490574, + "flos": 519276129792.0, + "grad_norm": 0.0354518245662521, + "language_loss": 0.98370755, + "learning_rate": 0.0009934028291838672, + "loss": 0.99466318, + "num_input_tokens_seen": 34214080, + "router_z_loss_mlp": 0.81738281, + "step": 417, + "time_per_iteration": 2.7351250648498535 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096643, + "balance_loss_mlp": 1.01496112, + "epoch": 0.0804155444401693, + "flos": 495047273472.0, + "grad_norm": 0.032920982329526526, + "language_loss": 0.93668723, + "learning_rate": 0.0009933522918395433, + "loss": 0.94765365, + "num_input_tokens_seen": 34288448, + "router_z_loss_mlp": 0.81689453, + "step": 418, + "time_per_iteration": 2.6427221298217773 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01114799, + "balance_loss_mlp": 1.03316498, + "epoch": 0.08060792612543285, + "flos": 1584856801536.0, + "grad_norm": 0.029973653623271358, + "language_loss": 0.782511, + "learning_rate": 0.0009933015629588731, + "loss": 0.79365897, + "num_input_tokens_seen": 34521632, + "router_z_loss_mlp": 0.81640625, + "step": 419, + "time_per_iteration": 4.8632917404174805 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096521, + "balance_loss_mlp": 1.01569724, + "epoch": 0.08080030781069643, + "flos": 526359607296.0, + "grad_norm": 0.04163447523548115, + "language_loss": 1.12134457, + "learning_rate": 0.000993250642561551, + "loss": 1.13230991, + "num_input_tokens_seen": 34590080, + "router_z_loss_mlp": 0.80810547, + "step": 420, + "time_per_iteration": 2.608396053314209 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109578, + "balance_loss_mlp": 1.01505113, + "epoch": 0.08099268949595999, + "flos": 547757793792.0, + "grad_norm": 0.04746808509414602, + "language_loss": 0.97398257, + "learning_rate": 0.0009931995306673466, + "loss": 0.98494035, + "num_input_tokens_seen": 34660512, + "router_z_loss_mlp": 0.80712891, + "step": 421, + "time_per_iteration": 2.7215850353240967 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097341, + "balance_loss_mlp": 1.01670778, + "epoch": 0.08118507118122355, + "flos": 511374169344.0, + "grad_norm": 0.04020038552675014, + "language_loss": 1.02514148, + "learning_rate": 0.000993148227296103, + "loss": 1.03611493, + "num_input_tokens_seen": 34732016, + "router_z_loss_mlp": 0.80615234, + "step": 422, + "time_per_iteration": 2.625366449356079 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010968, + "balance_loss_mlp": 1.01607168, + "epoch": 0.08137745286648711, + "flos": 722002389504.0, + "grad_norm": 0.03556088777041087, + "language_loss": 0.90137196, + "learning_rate": 0.000993096732467738, + "loss": 0.91233999, + "num_input_tokens_seen": 34810416, + "router_z_loss_mlp": 0.80712891, + "step": 423, + "time_per_iteration": 2.9795689582824707 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092343, + "balance_loss_mlp": 1.0118531, + "epoch": 0.08156983455175067, + "flos": 680818682880.0, + "grad_norm": 0.04422604915428747, + "language_loss": 0.99073571, + "learning_rate": 0.0009930450462022435, + "loss": 1.00165915, + "num_input_tokens_seen": 34879504, + "router_z_loss_mlp": 0.8046875, + "step": 424, + "time_per_iteration": 2.879889726638794 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087181, + "balance_loss_mlp": 1.00783539, + "epoch": 0.08176221623701424, + "flos": 1456591137024.0, + "grad_norm": 0.006453860192715822, + "language_loss": 0.79189807, + "learning_rate": 0.0009929931685196862, + "loss": 0.8027699, + "num_input_tokens_seen": 35111584, + "router_z_loss_mlp": 0.79296875, + "step": 425, + "time_per_iteration": 4.908784627914429 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095957, + "balance_loss_mlp": 1.01541877, + "epoch": 0.0819545979222778, + "flos": 1558885044480.0, + "grad_norm": 0.04271462185638088, + "language_loss": 0.96659774, + "learning_rate": 0.0009929410994402065, + "loss": 0.9775573, + "num_input_tokens_seen": 35205664, + "router_z_loss_mlp": 0.80517578, + "step": 426, + "time_per_iteration": 3.7266876697540283 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100573, + "balance_loss_mlp": 1.02013052, + "epoch": 0.08214697960754136, + "flos": 513801427968.0, + "grad_norm": 0.040597463537132866, + "language_loss": 1.00489211, + "learning_rate": 0.0009928888389840196, + "loss": 1.01589799, + "num_input_tokens_seen": 35280144, + "router_z_loss_mlp": 0.80419922, + "step": 427, + "time_per_iteration": 2.695010185241699 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098577, + "balance_loss_mlp": 1.01822996, + "epoch": 0.08233936129280492, + "flos": 596222309376.0, + "grad_norm": 0.03622779747664415, + "language_loss": 1.02622843, + "learning_rate": 0.0009928363871714147, + "loss": 1.03721428, + "num_input_tokens_seen": 35344768, + "router_z_loss_mlp": 0.80322266, + "step": 428, + "time_per_iteration": 2.66733455657959 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097324, + "balance_loss_mlp": 1.01721525, + "epoch": 0.08253174297806849, + "flos": 573165769728.0, + "grad_norm": 0.028981657602537042, + "language_loss": 0.97141832, + "learning_rate": 0.0009927837440227556, + "loss": 0.98239154, + "num_input_tokens_seen": 35425536, + "router_z_loss_mlp": 0.80078125, + "step": 429, + "time_per_iteration": 2.8499114513397217 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093938, + "balance_loss_mlp": 1.01392436, + "epoch": 0.08272412466333205, + "flos": 624643702272.0, + "grad_norm": 0.031878488957356683, + "language_loss": 0.91184896, + "learning_rate": 0.0009927309095584798, + "loss": 0.92278832, + "num_input_tokens_seen": 35515440, + "router_z_loss_mlp": 0.79980469, + "step": 430, + "time_per_iteration": 3.020768165588379 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097624, + "balance_loss_mlp": 1.01756275, + "epoch": 0.08291650634859561, + "flos": 514995131904.0, + "grad_norm": 0.040558959270141796, + "language_loss": 1.03523278, + "learning_rate": 0.0009926778837991, + "loss": 1.0462091, + "num_input_tokens_seen": 35580192, + "router_z_loss_mlp": 0.80029297, + "step": 431, + "time_per_iteration": 2.609189033508301 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101176, + "balance_loss_mlp": 1.02125835, + "epoch": 0.08310888803385917, + "flos": 668542405632.0, + "grad_norm": 0.035092839201242565, + "language_loss": 1.01323938, + "learning_rate": 0.000992624666765202, + "loss": 1.0242511, + "num_input_tokens_seen": 35649472, + "router_z_loss_mlp": 0.79882812, + "step": 432, + "time_per_iteration": 2.817399501800537 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101699, + "balance_loss_mlp": 1.02154219, + "epoch": 0.08330126971912274, + "flos": 584491361280.0, + "grad_norm": 0.0354530922421884, + "language_loss": 0.98992586, + "learning_rate": 0.000992571258477447, + "loss": 1.00094295, + "num_input_tokens_seen": 35722848, + "router_z_loss_mlp": 0.80126953, + "step": 433, + "time_per_iteration": 2.777506113052368 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010961, + "balance_loss_mlp": 1.0161345, + "epoch": 0.0834936514043863, + "flos": 562498268160.0, + "grad_norm": 0.03167346665720251, + "language_loss": 0.92772877, + "learning_rate": 0.0009925176589565695, + "loss": 0.93868983, + "num_input_tokens_seen": 35800944, + "router_z_loss_mlp": 0.79931641, + "step": 434, + "time_per_iteration": 2.801501512527466 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093857, + "balance_loss_mlp": 1.01398647, + "epoch": 0.08368603308964986, + "flos": 495513868032.0, + "grad_norm": 0.03411426988917409, + "language_loss": 1.03318536, + "learning_rate": 0.0009924638682233791, + "loss": 1.04412401, + "num_input_tokens_seen": 35866288, + "router_z_loss_mlp": 0.79833984, + "step": 435, + "time_per_iteration": 2.573282241821289 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092567, + "balance_loss_mlp": 1.01512909, + "epoch": 0.08387841477491342, + "flos": 1391811397632.0, + "grad_norm": 0.030642245427906535, + "language_loss": 0.79564589, + "learning_rate": 0.0009924098862987589, + "loss": 0.8065716, + "num_input_tokens_seen": 36083040, + "router_z_loss_mlp": 0.7734375, + "step": 436, + "time_per_iteration": 4.596274375915527 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099407, + "balance_loss_mlp": 1.02006125, + "epoch": 0.084070796460177, + "flos": 800355155712.0, + "grad_norm": 0.040681894877429646, + "language_loss": 0.92768085, + "learning_rate": 0.0009923557132036668, + "loss": 0.93867493, + "num_input_tokens_seen": 36158816, + "router_z_loss_mlp": 0.79296875, + "step": 437, + "time_per_iteration": 3.0366878509521484 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110232, + "balance_loss_mlp": 1.02364242, + "epoch": 0.08426317814544056, + "flos": 560097254400.0, + "grad_norm": 0.034275916488964116, + "language_loss": 0.96774155, + "learning_rate": 0.0009923013489591345, + "loss": 0.97876477, + "num_input_tokens_seen": 36236432, + "router_z_loss_mlp": 0.78613281, + "step": 438, + "time_per_iteration": 2.8060851097106934 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100668, + "balance_loss_mlp": 1.0219903, + "epoch": 0.08445555983070412, + "flos": 811884881664.0, + "grad_norm": 0.035250716051411925, + "language_loss": 0.95655745, + "learning_rate": 0.0009922467935862681, + "loss": 0.96756417, + "num_input_tokens_seen": 36327952, + "router_z_loss_mlp": 0.78613281, + "step": 439, + "time_per_iteration": 3.116757869720459 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098598, + "balance_loss_mlp": 1.0204916, + "epoch": 0.08464794151596768, + "flos": 511170034944.0, + "grad_norm": 0.03561138790794706, + "language_loss": 0.98418635, + "learning_rate": 0.0009921920471062478, + "loss": 0.99517238, + "num_input_tokens_seen": 36394896, + "router_z_loss_mlp": 0.78027344, + "step": 440, + "time_per_iteration": 2.6008944511413574 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093389, + "balance_loss_mlp": 1.01561701, + "epoch": 0.08484032320123125, + "flos": 557474609664.0, + "grad_norm": 0.02914226137027636, + "language_loss": 0.96590662, + "learning_rate": 0.0009921371095403281, + "loss": 0.97684056, + "num_input_tokens_seen": 36464656, + "router_z_loss_mlp": 0.77685547, + "step": 441, + "time_per_iteration": 2.638679265975952 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094838, + "balance_loss_mlp": 1.01697087, + "epoch": 0.08503270488649481, + "flos": 528361100544.0, + "grad_norm": 0.02987504029564206, + "language_loss": 0.99685514, + "learning_rate": 0.0009920819809098379, + "loss": 1.00780344, + "num_input_tokens_seen": 36532208, + "router_z_loss_mlp": 0.77783203, + "step": 442, + "time_per_iteration": 2.5915398597717285 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089542, + "balance_loss_mlp": 1.01172209, + "epoch": 0.08522508657175837, + "flos": 615386678016.0, + "grad_norm": 0.03983619354546574, + "language_loss": 0.95535469, + "learning_rate": 0.0009920266612361798, + "loss": 0.96625006, + "num_input_tokens_seen": 36607360, + "router_z_loss_mlp": 0.77734375, + "step": 443, + "time_per_iteration": 2.724025249481201 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091681, + "balance_loss_mlp": 1.01371801, + "epoch": 0.08541746825702193, + "flos": 620987746560.0, + "grad_norm": 0.032808156584867194, + "language_loss": 0.9504559, + "learning_rate": 0.0009919711505408308, + "loss": 0.96137273, + "num_input_tokens_seen": 36680688, + "router_z_loss_mlp": 0.77880859, + "step": 444, + "time_per_iteration": 2.780973434448242 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087177, + "balance_loss_mlp": 1.00926137, + "epoch": 0.08560984994228549, + "flos": 483888877824.0, + "grad_norm": 0.03232110076143325, + "language_loss": 0.92813373, + "learning_rate": 0.000991915448845342, + "loss": 0.93900549, + "num_input_tokens_seen": 36746288, + "router_z_loss_mlp": 0.77832031, + "step": 445, + "time_per_iteration": 2.6011459827423096 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090069, + "balance_loss_mlp": 1.01243973, + "epoch": 0.08580223162754906, + "flos": 518177690112.0, + "grad_norm": 0.03377956208163177, + "language_loss": 1.02285504, + "learning_rate": 0.000991859556171339, + "loss": 1.03375578, + "num_input_tokens_seen": 36812528, + "router_z_loss_mlp": 0.77539062, + "step": 446, + "time_per_iteration": 2.606220006942749 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088539, + "balance_loss_mlp": 1.01086187, + "epoch": 0.08599461331281262, + "flos": 532520589312.0, + "grad_norm": 0.037753212584348855, + "language_loss": 1.04541254, + "learning_rate": 0.000991803472540521, + "loss": 1.0562979, + "num_input_tokens_seen": 36879248, + "router_z_loss_mlp": 0.77587891, + "step": 447, + "time_per_iteration": 2.625401735305786 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088712, + "balance_loss_mlp": 1.01113105, + "epoch": 0.08618699499807618, + "flos": 791634712320.0, + "grad_norm": 0.030920782852134367, + "language_loss": 0.98781657, + "learning_rate": 0.0009917471979746615, + "loss": 0.99870372, + "num_input_tokens_seen": 36951376, + "router_z_loss_mlp": 0.77490234, + "step": 448, + "time_per_iteration": 3.0066978931427 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089961, + "balance_loss_mlp": 1.01195049, + "epoch": 0.08637937668333974, + "flos": 567115603200.0, + "grad_norm": 0.03238149886931097, + "language_loss": 0.98317528, + "learning_rate": 0.0009916907324956086, + "loss": 0.99407488, + "num_input_tokens_seen": 37025936, + "router_z_loss_mlp": 0.77929688, + "step": 449, + "time_per_iteration": 2.7561135292053223 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091057, + "balance_loss_mlp": 1.01333201, + "epoch": 0.08657175836860331, + "flos": 446118108672.0, + "grad_norm": 0.029046506526173844, + "language_loss": 0.94927382, + "learning_rate": 0.0009916340761252837, + "loss": 0.96018445, + "num_input_tokens_seen": 37095872, + "router_z_loss_mlp": 0.77636719, + "step": 450, + "time_per_iteration": 2.6452889442443848 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089525, + "balance_loss_mlp": 1.01222932, + "epoch": 0.08676414005386687, + "flos": 845589480960.0, + "grad_norm": 0.032144406787761336, + "language_loss": 0.91630232, + "learning_rate": 0.0009915772288856832, + "loss": 0.92719758, + "num_input_tokens_seen": 37179072, + "router_z_loss_mlp": 0.77197266, + "step": 451, + "time_per_iteration": 3.0991322994232178 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108797, + "balance_loss_mlp": 1.01086605, + "epoch": 0.08695652173913043, + "flos": 604484906496.0, + "grad_norm": 0.025568476728402203, + "language_loss": 0.93134868, + "learning_rate": 0.000991520190798877, + "loss": 0.94222844, + "num_input_tokens_seen": 37260288, + "router_z_loss_mlp": 0.77001953, + "step": 452, + "time_per_iteration": 2.833534002304077 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093662, + "balance_loss_mlp": 1.01660514, + "epoch": 0.08714890342439399, + "flos": 732001107456.0, + "grad_norm": 0.03795734255344977, + "language_loss": 1.02428043, + "learning_rate": 0.0009914629618870089, + "loss": 1.03521705, + "num_input_tokens_seen": 37331136, + "router_z_loss_mlp": 0.76953125, + "step": 453, + "time_per_iteration": 2.9043643474578857 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098724, + "balance_loss_mlp": 1.02319336, + "epoch": 0.08734128510965757, + "flos": 1485456770304.0, + "grad_norm": 0.019964198948139205, + "language_loss": 0.78675872, + "learning_rate": 0.0009914055421722976, + "loss": 0.79774594, + "num_input_tokens_seen": 37559040, + "router_z_loss_mlp": 0.75390625, + "step": 454, + "time_per_iteration": 2.093019723892212 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087936, + "balance_loss_mlp": 1.01278687, + "epoch": 0.08753366679492113, + "flos": 1526269146624.0, + "grad_norm": 0.012226751630218, + "language_loss": 0.81427962, + "learning_rate": 0.0009913479316770353, + "loss": 0.82515901, + "num_input_tokens_seen": 37785136, + "router_z_loss_mlp": 0.75, + "step": 455, + "time_per_iteration": 4.905871391296387 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091043, + "balance_loss_mlp": 1.01379561, + "epoch": 0.08772604848018468, + "flos": 722525364480.0, + "grad_norm": 0.044152825797527884, + "language_loss": 0.95217329, + "learning_rate": 0.0009912901304235883, + "loss": 0.96308374, + "num_input_tokens_seen": 37858832, + "router_z_loss_mlp": 0.77148438, + "step": 456, + "time_per_iteration": 2.850330352783203 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090876, + "balance_loss_mlp": 1.01396191, + "epoch": 0.08791843016544824, + "flos": 709467542784.0, + "grad_norm": 0.038854584599924205, + "language_loss": 0.92178857, + "learning_rate": 0.000991232138434397, + "loss": 0.9326973, + "num_input_tokens_seen": 37931856, + "router_z_loss_mlp": 0.76806641, + "step": 457, + "time_per_iteration": 2.868957757949829 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091399, + "balance_loss_mlp": 1.01477098, + "epoch": 0.08811081185071182, + "flos": 474022362624.0, + "grad_norm": 0.04035146689108268, + "language_loss": 0.99321103, + "learning_rate": 0.000991173955731976, + "loss": 1.00412512, + "num_input_tokens_seen": 38002432, + "router_z_loss_mlp": 0.76513672, + "step": 458, + "time_per_iteration": 2.6747970581054688 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089272, + "balance_loss_mlp": 1.01288271, + "epoch": 0.08830319353597538, + "flos": 686315738880.0, + "grad_norm": 0.033089720334054364, + "language_loss": 1.03213239, + "learning_rate": 0.0009911155823389137, + "loss": 1.04302514, + "num_input_tokens_seen": 38081648, + "router_z_loss_mlp": 0.76269531, + "step": 459, + "time_per_iteration": 2.9462268352508545 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085442, + "balance_loss_mlp": 1.00881398, + "epoch": 0.08849557522123894, + "flos": 574609294848.0, + "grad_norm": 0.035557366742091014, + "language_loss": 0.99025905, + "learning_rate": 0.000991057018277873, + "loss": 1.00111353, + "num_input_tokens_seen": 38153424, + "router_z_loss_mlp": 0.76513672, + "step": 460, + "time_per_iteration": 2.6903369426727295 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086551, + "balance_loss_mlp": 1.00968456, + "epoch": 0.0886879569065025, + "flos": 565628336640.0, + "grad_norm": 0.039664118418905284, + "language_loss": 1.00002789, + "learning_rate": 0.0009909982635715898, + "loss": 1.01089334, + "num_input_tokens_seen": 38223008, + "router_z_loss_mlp": 0.76757812, + "step": 461, + "time_per_iteration": 2.620046615600586 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010895, + "balance_loss_mlp": 1.0128243, + "epoch": 0.08888033859176607, + "flos": 564957607680.0, + "grad_norm": 0.03231802322071402, + "language_loss": 0.98670942, + "learning_rate": 0.0009909393182428751, + "loss": 0.99760437, + "num_input_tokens_seen": 38294592, + "router_z_loss_mlp": 0.765625, + "step": 462, + "time_per_iteration": 2.6466307640075684 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090991, + "balance_loss_mlp": 1.01412475, + "epoch": 0.08907272027702963, + "flos": 466743499008.0, + "grad_norm": 0.03344290639259395, + "language_loss": 0.93214953, + "learning_rate": 0.000990880182314614, + "loss": 0.94305944, + "num_input_tokens_seen": 38365792, + "router_z_loss_mlp": 0.76757812, + "step": 463, + "time_per_iteration": 2.6666839122772217 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086555, + "balance_loss_mlp": 1.0100224, + "epoch": 0.08926510196229319, + "flos": 682844475648.0, + "grad_norm": 0.03261982194681884, + "language_loss": 0.93093467, + "learning_rate": 0.0009908208558097643, + "loss": 0.94180012, + "num_input_tokens_seen": 38447776, + "router_z_loss_mlp": 0.76416016, + "step": 464, + "time_per_iteration": 2.9068925380706787 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089482, + "balance_loss_mlp": 1.01323605, + "epoch": 0.08945748364755675, + "flos": 597822336768.0, + "grad_norm": 0.03309433671244878, + "language_loss": 0.95414662, + "learning_rate": 0.000990761338751359, + "loss": 0.9650414, + "num_input_tokens_seen": 38521632, + "router_z_loss_mlp": 0.76123047, + "step": 465, + "time_per_iteration": 2.774606227874756 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079613, + "balance_loss_mlp": 1.00732422, + "epoch": 0.08964986533282032, + "flos": 1589343879168.0, + "grad_norm": 0.03434681355524106, + "language_loss": 0.73659623, + "learning_rate": 0.0009907016311625045, + "loss": 0.74739242, + "num_input_tokens_seen": 38760528, + "router_z_loss_mlp": 0.72460938, + "step": 466, + "time_per_iteration": 4.996358394622803 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092523, + "balance_loss_mlp": 1.01646745, + "epoch": 0.08984224701808388, + "flos": 534550272768.0, + "grad_norm": 0.03379784984504044, + "language_loss": 0.98391378, + "learning_rate": 0.0009906417330663815, + "loss": 0.99483901, + "num_input_tokens_seen": 38827200, + "router_z_loss_mlp": 0.75927734, + "step": 467, + "time_per_iteration": 2.6774964332580566 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092653, + "balance_loss_mlp": 1.01678836, + "epoch": 0.09003462870334744, + "flos": 479850898176.0, + "grad_norm": 0.04271038491910547, + "language_loss": 0.94838965, + "learning_rate": 0.0009905816444862442, + "loss": 0.95931625, + "num_input_tokens_seen": 38891984, + "router_z_loss_mlp": 0.75732422, + "step": 468, + "time_per_iteration": 2.6558451652526855 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092919, + "balance_loss_mlp": 1.01691103, + "epoch": 0.090227010388611, + "flos": 654903283200.0, + "grad_norm": 0.031716132767048565, + "language_loss": 0.92225289, + "learning_rate": 0.0009905213654454216, + "loss": 0.933182, + "num_input_tokens_seen": 38977136, + "router_z_loss_mlp": 0.75878906, + "step": 469, + "time_per_iteration": 2.9322757720947266 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093686, + "balance_loss_mlp": 1.01796389, + "epoch": 0.09041939207387456, + "flos": 619359528960.0, + "grad_norm": 0.03474651138537023, + "language_loss": 1.00819349, + "learning_rate": 0.0009904608959673158, + "loss": 1.01913023, + "num_input_tokens_seen": 39052224, + "router_z_loss_mlp": 0.75585938, + "step": 470, + "time_per_iteration": 2.7938003540039062 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091787, + "balance_loss_mlp": 1.01620793, + "epoch": 0.09061177375913813, + "flos": 455296398336.0, + "grad_norm": 0.04023106246537731, + "language_loss": 1.00852847, + "learning_rate": 0.000990400236075403, + "loss": 1.01944637, + "num_input_tokens_seen": 39116832, + "router_z_loss_mlp": 0.75439453, + "step": 471, + "time_per_iteration": 2.5231049060821533 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085542, + "balance_loss_mlp": 1.01024961, + "epoch": 0.0908041554444017, + "flos": 545309147904.0, + "grad_norm": 0.036372029021066864, + "language_loss": 0.97571105, + "learning_rate": 0.0009903393857932338, + "loss": 0.98656648, + "num_input_tokens_seen": 39190528, + "router_z_loss_mlp": 0.75146484, + "step": 472, + "time_per_iteration": 2.700449228286743 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082684, + "balance_loss_mlp": 1.00786841, + "epoch": 0.09099653712966525, + "flos": 565467943680.0, + "grad_norm": 0.03263919317425628, + "language_loss": 0.95124531, + "learning_rate": 0.0009902783451444317, + "loss": 0.96207213, + "num_input_tokens_seen": 39263168, + "router_z_loss_mlp": 0.74658203, + "step": 473, + "time_per_iteration": 2.7006537914276123 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081251, + "balance_loss_mlp": 1.00667381, + "epoch": 0.09118891881492881, + "flos": 475502826240.0, + "grad_norm": 0.036465550100162274, + "language_loss": 0.98778975, + "learning_rate": 0.0009902171141526956, + "loss": 0.99860233, + "num_input_tokens_seen": 39330784, + "router_z_loss_mlp": 0.74414062, + "step": 474, + "time_per_iteration": 2.565852403640747 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081522, + "balance_loss_mlp": 1.00732613, + "epoch": 0.09138130050019239, + "flos": 546991800576.0, + "grad_norm": 0.03189281102051162, + "language_loss": 0.86324012, + "learning_rate": 0.000990155692841797, + "loss": 0.87405533, + "num_input_tokens_seen": 39417472, + "router_z_loss_mlp": 0.74023438, + "step": 475, + "time_per_iteration": 2.9694621562957764 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081909, + "balance_loss_mlp": 1.0079515, + "epoch": 0.09157368218545595, + "flos": 733974410496.0, + "grad_norm": 0.03574286330183218, + "language_loss": 0.98287529, + "learning_rate": 0.0009900940812355818, + "loss": 0.99369442, + "num_input_tokens_seen": 39488656, + "router_z_loss_mlp": 0.73779297, + "step": 476, + "time_per_iteration": 2.8549702167510986 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082166, + "balance_loss_mlp": 1.00835192, + "epoch": 0.0917660638707195, + "flos": 612073862400.0, + "grad_norm": 0.03800316101532587, + "language_loss": 0.95275486, + "learning_rate": 0.00099003227935797, + "loss": 0.96357656, + "num_input_tokens_seen": 39558224, + "router_z_loss_mlp": 0.73632812, + "step": 477, + "time_per_iteration": 2.709808349609375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084092, + "balance_loss_mlp": 1.01051593, + "epoch": 0.09195844555598306, + "flos": 657019482624.0, + "grad_norm": 0.03875864993538346, + "language_loss": 0.99037415, + "learning_rate": 0.000989970287232955, + "loss": 1.0012151, + "num_input_tokens_seen": 39629856, + "router_z_loss_mlp": 0.73486328, + "step": 478, + "time_per_iteration": 2.7670538425445557 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085912, + "balance_loss_mlp": 1.01252699, + "epoch": 0.09215082724124664, + "flos": 477541257984.0, + "grad_norm": 0.03367109557456403, + "language_loss": 0.95731258, + "learning_rate": 0.0009899081048846043, + "loss": 0.96817166, + "num_input_tokens_seen": 39695984, + "router_z_loss_mlp": 0.73339844, + "step": 479, + "time_per_iteration": 2.588352918624878 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085141, + "balance_loss_mlp": 1.01208997, + "epoch": 0.0923432089265102, + "flos": 525326296320.0, + "grad_norm": 0.0462740033589213, + "language_loss": 1.00606585, + "learning_rate": 0.0009898457323370593, + "loss": 1.01691723, + "num_input_tokens_seen": 39760256, + "router_z_loss_mlp": 0.73046875, + "step": 480, + "time_per_iteration": 2.5808160305023193 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082132, + "balance_loss_mlp": 1.00936687, + "epoch": 0.09253559061177376, + "flos": 546639912192.0, + "grad_norm": 0.03676160983227949, + "language_loss": 0.9798522, + "learning_rate": 0.000989783169614535, + "loss": 0.99067354, + "num_input_tokens_seen": 39827984, + "router_z_loss_mlp": 0.72900391, + "step": 481, + "time_per_iteration": 2.624483108520508 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097145, + "balance_loss_mlp": 1.02485657, + "epoch": 0.09272797229703732, + "flos": 1541337209856.0, + "grad_norm": 0.023489610904585654, + "language_loss": 0.78752756, + "learning_rate": 0.0009897204167413206, + "loss": 0.79849905, + "num_input_tokens_seen": 40056688, + "router_z_loss_mlp": 0.72460938, + "step": 482, + "time_per_iteration": 4.897305965423584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085543, + "balance_loss_mlp": 1.01330173, + "epoch": 0.09292035398230089, + "flos": 691065276672.0, + "grad_norm": 0.04252493421314706, + "language_loss": 0.95552129, + "learning_rate": 0.000989657473741779, + "loss": 0.96637678, + "num_input_tokens_seen": 40133120, + "router_z_loss_mlp": 0.72412109, + "step": 483, + "time_per_iteration": 2.8165738582611084 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084133, + "balance_loss_mlp": 1.01184416, + "epoch": 0.09311273566756445, + "flos": 510823004160.0, + "grad_norm": 0.03895509426778844, + "language_loss": 0.97422099, + "learning_rate": 0.0009895943406403465, + "loss": 0.98506236, + "num_input_tokens_seen": 40206464, + "router_z_loss_mlp": 0.72460938, + "step": 484, + "time_per_iteration": 2.7523326873779297 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086134, + "balance_loss_mlp": 1.01384509, + "epoch": 0.09330511735282801, + "flos": 660584064768.0, + "grad_norm": 0.04754513437429821, + "language_loss": 0.90526009, + "learning_rate": 0.0009895310174615338, + "loss": 0.91612148, + "num_input_tokens_seen": 40277744, + "router_z_loss_mlp": 0.72460938, + "step": 485, + "time_per_iteration": 2.843790292739868 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070969, + "balance_loss_mlp": 0.99982452, + "epoch": 0.09349749903809157, + "flos": 1456024420608.0, + "grad_norm": 0.007982392205281765, + "language_loss": 0.75718516, + "learning_rate": 0.0009894675042299251, + "loss": 0.76789486, + "num_input_tokens_seen": 40503664, + "router_z_loss_mlp": 0.71289062, + "step": 486, + "time_per_iteration": 4.649716138839722 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080781, + "balance_loss_mlp": 1.00877845, + "epoch": 0.09368988072335514, + "flos": 521900719872.0, + "grad_norm": 0.0379904908867083, + "language_loss": 0.94096279, + "learning_rate": 0.0009894038009701782, + "loss": 0.95177054, + "num_input_tokens_seen": 40571376, + "router_z_loss_mlp": 0.72167969, + "step": 487, + "time_per_iteration": 2.615767002105713 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085039, + "balance_loss_mlp": 1.012941, + "epoch": 0.0938822624086187, + "flos": 498752806656.0, + "grad_norm": 0.041516659048387576, + "language_loss": 0.97017074, + "learning_rate": 0.0009893399077070253, + "loss": 0.98102111, + "num_input_tokens_seen": 40638096, + "router_z_loss_mlp": 0.72265625, + "step": 488, + "time_per_iteration": 2.592867612838745 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090191, + "balance_loss_mlp": 1.01828361, + "epoch": 0.09407464409388226, + "flos": 534224629248.0, + "grad_norm": 0.031087819309936707, + "language_loss": 0.91152203, + "learning_rate": 0.0009892758244652718, + "loss": 0.92242396, + "num_input_tokens_seen": 40710992, + "router_z_loss_mlp": 0.72070312, + "step": 489, + "time_per_iteration": 2.702681541442871 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080571, + "balance_loss_mlp": 1.00852132, + "epoch": 0.09426702577914582, + "flos": 587091651840.0, + "grad_norm": 0.037758062155454256, + "language_loss": 0.98290044, + "learning_rate": 0.0009892115512697968, + "loss": 0.99370617, + "num_input_tokens_seen": 40778896, + "router_z_loss_mlp": 0.72216797, + "step": 490, + "time_per_iteration": 2.7222015857696533 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088543, + "balance_loss_mlp": 1.01649261, + "epoch": 0.0944594074644094, + "flos": 504464690688.0, + "grad_norm": 0.03400132145466818, + "language_loss": 0.98617911, + "learning_rate": 0.0009891470881455537, + "loss": 0.99706453, + "num_input_tokens_seen": 40853376, + "router_z_loss_mlp": 0.72216797, + "step": 491, + "time_per_iteration": 2.6978650093078613 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087839, + "balance_loss_mlp": 1.01626599, + "epoch": 0.09465178914967295, + "flos": 572114962176.0, + "grad_norm": 0.03537229102294209, + "language_loss": 0.97051454, + "learning_rate": 0.0009890824351175692, + "loss": 0.98139298, + "num_input_tokens_seen": 40923776, + "router_z_loss_mlp": 0.71728516, + "step": 492, + "time_per_iteration": 2.7183802127838135 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087578, + "balance_loss_mlp": 1.01590919, + "epoch": 0.09484417083493651, + "flos": 550419322368.0, + "grad_norm": 0.028677449722299516, + "language_loss": 1.00688422, + "learning_rate": 0.0009890175922109435, + "loss": 1.01776004, + "num_input_tokens_seen": 40996848, + "router_z_loss_mlp": 0.71826172, + "step": 493, + "time_per_iteration": 2.680469512939453 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082456, + "balance_loss_mlp": 1.01088285, + "epoch": 0.09503655252020007, + "flos": 825272237568.0, + "grad_norm": 0.03488638846892438, + "language_loss": 0.98808897, + "learning_rate": 0.0009889525594508513, + "loss": 0.99891359, + "num_input_tokens_seen": 41071280, + "router_z_loss_mlp": 0.71728516, + "step": 494, + "time_per_iteration": 2.983400344848633 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083066, + "balance_loss_mlp": 1.01154041, + "epoch": 0.09522893420546363, + "flos": 405518615040.0, + "grad_norm": 0.028649644857800794, + "language_loss": 0.9245472, + "learning_rate": 0.0009888873368625404, + "loss": 0.93537784, + "num_input_tokens_seen": 41136304, + "router_z_loss_mlp": 0.71679688, + "step": 495, + "time_per_iteration": 2.497526168823242 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108369, + "balance_loss_mlp": 1.01206875, + "epoch": 0.0954213158907272, + "flos": 692257035264.0, + "grad_norm": 0.03396045626839725, + "language_loss": 0.96602595, + "learning_rate": 0.0009888219244713326, + "loss": 0.97686291, + "num_input_tokens_seen": 41212384, + "router_z_loss_mlp": 0.71777344, + "step": 496, + "time_per_iteration": 2.8588504791259766 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108171, + "balance_loss_mlp": 1.01018417, + "epoch": 0.09561369757599077, + "flos": 520075170816.0, + "grad_norm": 0.039869543083186736, + "language_loss": 0.97707164, + "learning_rate": 0.0009887563223026229, + "loss": 0.98788875, + "num_input_tokens_seen": 41282528, + "router_z_loss_mlp": 0.71679688, + "step": 497, + "time_per_iteration": 2.6856894493103027 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075874, + "balance_loss_mlp": 1.00644684, + "epoch": 0.09580607926125433, + "flos": 1388784363264.0, + "grad_norm": 0.01625235818526382, + "language_loss": 0.7906816, + "learning_rate": 0.0009886905303818805, + "loss": 0.80144036, + "num_input_tokens_seen": 41512256, + "router_z_loss_mlp": 0.6953125, + "step": 498, + "time_per_iteration": 4.882593393325806 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086748, + "balance_loss_mlp": 1.0150795, + "epoch": 0.09599846094651789, + "flos": 718826634240.0, + "grad_norm": 0.03326061844711544, + "language_loss": 0.95632416, + "learning_rate": 0.0009886245487346482, + "loss": 0.9671917, + "num_input_tokens_seen": 41596816, + "router_z_loss_mlp": 0.71826172, + "step": 499, + "time_per_iteration": 3.0426785945892334 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087163, + "balance_loss_mlp": 1.01568544, + "epoch": 0.09619084263178146, + "flos": 386894717952.0, + "grad_norm": 0.04298067648683731, + "language_loss": 0.98954022, + "learning_rate": 0.0009885583773865422, + "loss": 1.00041187, + "num_input_tokens_seen": 41658544, + "router_z_loss_mlp": 0.71630859, + "step": 500, + "time_per_iteration": 2.452941417694092 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086205, + "balance_loss_mlp": 1.01467967, + "epoch": 0.09638322431704502, + "flos": 535173369600.0, + "grad_norm": 0.04172266818012015, + "language_loss": 0.95971203, + "learning_rate": 0.0009884920163632524, + "loss": 0.97057414, + "num_input_tokens_seen": 41730736, + "router_z_loss_mlp": 0.71679688, + "step": 501, + "time_per_iteration": 2.657940626144409 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080406, + "balance_loss_mlp": 1.00911927, + "epoch": 0.09657560600230858, + "flos": 501657353472.0, + "grad_norm": 0.041437287127294276, + "language_loss": 0.9960922, + "learning_rate": 0.000988425465690543, + "loss": 1.00689626, + "num_input_tokens_seen": 41797824, + "router_z_loss_mlp": 0.71435547, + "step": 502, + "time_per_iteration": 2.5540428161621094 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077905, + "balance_loss_mlp": 1.00642741, + "epoch": 0.09676798768757214, + "flos": 530332458240.0, + "grad_norm": 0.03187665411612151, + "language_loss": 0.96807587, + "learning_rate": 0.0009883587253942505, + "loss": 0.97885495, + "num_input_tokens_seen": 41875520, + "router_z_loss_mlp": 0.71630859, + "step": 503, + "time_per_iteration": 2.7744338512420654 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086901, + "balance_loss_mlp": 1.01542282, + "epoch": 0.09696036937283571, + "flos": 464557313280.0, + "grad_norm": 0.038653015311582224, + "language_loss": 1.0234406, + "learning_rate": 0.0009882917955002862, + "loss": 1.03430974, + "num_input_tokens_seen": 41942224, + "router_z_loss_mlp": 0.71630859, + "step": 504, + "time_per_iteration": 2.500669479370117 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081799, + "balance_loss_mlp": 1.01074982, + "epoch": 0.09715275105809927, + "flos": 536011294464.0, + "grad_norm": 0.035792041916504785, + "language_loss": 0.94188601, + "learning_rate": 0.0009882246760346343, + "loss": 0.95270395, + "num_input_tokens_seen": 42007552, + "router_z_loss_mlp": 0.71191406, + "step": 505, + "time_per_iteration": 2.6442148685455322 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077575, + "balance_loss_mlp": 1.00652647, + "epoch": 0.09734513274336283, + "flos": 455882556672.0, + "grad_norm": 0.04461237962136338, + "language_loss": 1.00418711, + "learning_rate": 0.0009881573670233533, + "loss": 1.01496279, + "num_input_tokens_seen": 42071760, + "router_z_loss_mlp": 0.71191406, + "step": 506, + "time_per_iteration": 2.5102410316467285 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075502, + "balance_loss_mlp": 1.00450063, + "epoch": 0.09753751442862639, + "flos": 509828577024.0, + "grad_norm": 0.03506590591484262, + "language_loss": 0.93374205, + "learning_rate": 0.0009880898684925747, + "loss": 0.94449711, + "num_input_tokens_seen": 42140688, + "router_z_loss_mlp": 0.71142578, + "step": 507, + "time_per_iteration": 2.652381658554077 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077009, + "balance_loss_mlp": 1.00624609, + "epoch": 0.09772989611388996, + "flos": 485247832320.0, + "grad_norm": 0.03501422949918711, + "language_loss": 0.92606336, + "learning_rate": 0.0009880221804685037, + "loss": 0.9368335, + "num_input_tokens_seen": 42208544, + "router_z_loss_mlp": 0.70898438, + "step": 508, + "time_per_iteration": 2.5481274127960205 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073608, + "balance_loss_mlp": 1.00456238, + "epoch": 0.09792227779915352, + "flos": 1569319231488.0, + "grad_norm": 0.011873284077886747, + "language_loss": 0.79344422, + "learning_rate": 0.000987954302977419, + "loss": 0.80418032, + "num_input_tokens_seen": 42426624, + "router_z_loss_mlp": 0.69140625, + "step": 509, + "time_per_iteration": 4.725191354751587 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076044, + "balance_loss_mlp": 1.00590122, + "epoch": 0.09811465948441708, + "flos": 588915255552.0, + "grad_norm": 0.04172960474096109, + "language_loss": 0.98818666, + "learning_rate": 0.0009878862360456733, + "loss": 0.99894708, + "num_input_tokens_seen": 42494592, + "router_z_loss_mlp": 0.70263672, + "step": 510, + "time_per_iteration": 2.7094569206237793 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078332, + "balance_loss_mlp": 1.00828481, + "epoch": 0.09830704116968064, + "flos": 614129790720.0, + "grad_norm": 0.037035801977756785, + "language_loss": 0.90851068, + "learning_rate": 0.0009878179796996922, + "loss": 0.919294, + "num_input_tokens_seen": 42564944, + "router_z_loss_mlp": 0.70166016, + "step": 511, + "time_per_iteration": 2.6973366737365723 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079637, + "balance_loss_mlp": 1.00973296, + "epoch": 0.09849942285494422, + "flos": 539936513280.0, + "grad_norm": 0.0318668020933778, + "language_loss": 0.94484478, + "learning_rate": 0.0009877495339659754, + "loss": 0.95564115, + "num_input_tokens_seen": 42645616, + "router_z_loss_mlp": 0.70019531, + "step": 512, + "time_per_iteration": 2.7476089000701904 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083598, + "balance_loss_mlp": 1.0137887, + "epoch": 0.09869180454020778, + "flos": 621604040448.0, + "grad_norm": 0.03763698097825182, + "language_loss": 0.89467418, + "learning_rate": 0.000987680898871096, + "loss": 0.90551007, + "num_input_tokens_seen": 42713632, + "router_z_loss_mlp": 0.69921875, + "step": 513, + "time_per_iteration": 2.7254321575164795 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083583, + "balance_loss_mlp": 1.01382184, + "epoch": 0.09888418622547133, + "flos": 813061089024.0, + "grad_norm": 0.049179676158016515, + "language_loss": 0.91816097, + "learning_rate": 0.0009876120744417, + "loss": 0.9289968, + "num_input_tokens_seen": 42789088, + "router_z_loss_mlp": 0.69873047, + "step": 514, + "time_per_iteration": 2.9596974849700928 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083293, + "balance_loss_mlp": 1.01357901, + "epoch": 0.0990765679107349, + "flos": 536857967616.0, + "grad_norm": 0.03966041946019195, + "language_loss": 0.99294269, + "learning_rate": 0.0009875430607045078, + "loss": 1.0037756, + "num_input_tokens_seen": 42861168, + "router_z_loss_mlp": 0.69824219, + "step": 515, + "time_per_iteration": 2.7065181732177734 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083779, + "balance_loss_mlp": 1.01439941, + "epoch": 0.09926894959599845, + "flos": 588971635968.0, + "grad_norm": 0.037836000479060286, + "language_loss": 0.94664383, + "learning_rate": 0.000987473857686313, + "loss": 0.95748156, + "num_input_tokens_seen": 42934112, + "router_z_loss_mlp": 0.69482422, + "step": 516, + "time_per_iteration": 2.712947130203247 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085113, + "balance_loss_mlp": 1.01582849, + "epoch": 0.09946133128126203, + "flos": 642387878400.0, + "grad_norm": 0.04191957443387863, + "language_loss": 0.98466003, + "learning_rate": 0.0009874044654139824, + "loss": 0.99551111, + "num_input_tokens_seen": 43005248, + "router_z_loss_mlp": 0.69384766, + "step": 517, + "time_per_iteration": 2.7391469478607178 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081227, + "balance_loss_mlp": 1.01194227, + "epoch": 0.09965371296652559, + "flos": 466726002432.0, + "grad_norm": 0.049265237591549625, + "language_loss": 0.97911566, + "learning_rate": 0.0009873348839144563, + "loss": 0.98992795, + "num_input_tokens_seen": 43070576, + "router_z_loss_mlp": 0.69384766, + "step": 518, + "time_per_iteration": 2.5496554374694824 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081078, + "balance_loss_mlp": 1.01198411, + "epoch": 0.09984609465178915, + "flos": 484559606784.0, + "grad_norm": 0.04039588305244337, + "language_loss": 0.99084902, + "learning_rate": 0.000987265113214749, + "loss": 1.00165975, + "num_input_tokens_seen": 43138048, + "router_z_loss_mlp": 0.69189453, + "step": 519, + "time_per_iteration": 2.592350721359253 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081099, + "balance_loss_mlp": 1.01200545, + "epoch": 0.1000384763370527, + "flos": 570095972352.0, + "grad_norm": 0.04690738730083641, + "language_loss": 1.01784182, + "learning_rate": 0.0009871951533419476, + "loss": 1.02865279, + "num_input_tokens_seen": 43207600, + "router_z_loss_mlp": 0.69189453, + "step": 520, + "time_per_iteration": 2.699725866317749 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077691, + "balance_loss_mlp": 1.00854921, + "epoch": 0.10023085802231628, + "flos": 546926671872.0, + "grad_norm": 0.03422053119670882, + "language_loss": 0.91227025, + "learning_rate": 0.0009871250043232132, + "loss": 0.92304718, + "num_input_tokens_seen": 43285104, + "router_z_loss_mlp": 0.69238281, + "step": 521, + "time_per_iteration": 2.74124813079834 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078273, + "balance_loss_mlp": 1.00913203, + "epoch": 0.10042323970757984, + "flos": 504440391168.0, + "grad_norm": 0.0407416967929008, + "language_loss": 0.91114902, + "learning_rate": 0.0009870546661857797, + "loss": 0.92193174, + "num_input_tokens_seen": 43353312, + "router_z_loss_mlp": 0.69238281, + "step": 522, + "time_per_iteration": 2.6524126529693604 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080712, + "balance_loss_mlp": 1.01199949, + "epoch": 0.1006156213928434, + "flos": 771725737728.0, + "grad_norm": 0.04764395650012834, + "language_loss": 1.0071038, + "learning_rate": 0.0009869841389569553, + "loss": 1.01791096, + "num_input_tokens_seen": 43427680, + "router_z_loss_mlp": 0.68798828, + "step": 523, + "time_per_iteration": 2.9797816276550293 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081234, + "balance_loss_mlp": 1.01237857, + "epoch": 0.10080800307810696, + "flos": 491009293824.0, + "grad_norm": 0.04526617857315469, + "language_loss": 0.93126583, + "learning_rate": 0.0009869134226641206, + "loss": 0.94207817, + "num_input_tokens_seen": 43495200, + "router_z_loss_mlp": 0.68945312, + "step": 524, + "time_per_iteration": 2.624396562576294 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079355, + "balance_loss_mlp": 1.01064241, + "epoch": 0.10100038476337053, + "flos": 455713415424.0, + "grad_norm": 0.04976961118682096, + "language_loss": 0.93662071, + "learning_rate": 0.0009868425173347303, + "loss": 0.94741422, + "num_input_tokens_seen": 43566256, + "router_z_loss_mlp": 0.68798828, + "step": 525, + "time_per_iteration": 2.659106731414795 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077138, + "balance_loss_mlp": 1.00809169, + "epoch": 0.10119276644863409, + "flos": 557574731520.0, + "grad_norm": 0.04197638521891018, + "language_loss": 0.9924143, + "learning_rate": 0.0009867714229963125, + "loss": 1.00318575, + "num_input_tokens_seen": 43639696, + "router_z_loss_mlp": 0.69140625, + "step": 526, + "time_per_iteration": 2.7414495944976807 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080772, + "balance_loss_mlp": 1.01201165, + "epoch": 0.10138514813389765, + "flos": 517220201472.0, + "grad_norm": 0.044929109849797505, + "language_loss": 0.96641302, + "learning_rate": 0.000986700139676468, + "loss": 0.97722065, + "num_input_tokens_seen": 43703872, + "router_z_loss_mlp": 0.68847656, + "step": 527, + "time_per_iteration": 2.620313882827759 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083057, + "balance_loss_mlp": 1.01405847, + "epoch": 0.10157752981916121, + "flos": 501564034560.0, + "grad_norm": 0.03558874762709202, + "language_loss": 0.9424324, + "learning_rate": 0.0009866286674028717, + "loss": 0.95326293, + "num_input_tokens_seen": 43774416, + "router_z_loss_mlp": 0.69091797, + "step": 528, + "time_per_iteration": 2.632835865020752 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082511, + "balance_loss_mlp": 1.01379848, + "epoch": 0.10176991150442478, + "flos": 658094589696.0, + "grad_norm": 0.042026744727430246, + "language_loss": 0.91470444, + "learning_rate": 0.0009865570062032717, + "loss": 0.9255296, + "num_input_tokens_seen": 43853376, + "router_z_loss_mlp": 0.68798828, + "step": 529, + "time_per_iteration": 2.9185874462127686 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084193, + "balance_loss_mlp": 1.01519477, + "epoch": 0.10196229318968834, + "flos": 574403215104.0, + "grad_norm": 0.031693910674612406, + "language_loss": 0.95307148, + "learning_rate": 0.0009864851561054893, + "loss": 0.96391344, + "num_input_tokens_seen": 43929632, + "router_z_loss_mlp": 0.69091797, + "step": 530, + "time_per_iteration": 2.7826597690582275 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086656, + "balance_loss_mlp": 1.01765728, + "epoch": 0.1021546748749519, + "flos": 519256687872.0, + "grad_norm": 0.0418084670656813, + "language_loss": 0.94574928, + "learning_rate": 0.0009864131171374191, + "loss": 0.95661592, + "num_input_tokens_seen": 44002144, + "router_z_loss_mlp": 0.69091797, + "step": 531, + "time_per_iteration": 2.67000150680542 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088238, + "balance_loss_mlp": 1.01919198, + "epoch": 0.10234705656021546, + "flos": 610954035456.0, + "grad_norm": 0.03906444640078033, + "language_loss": 0.94287467, + "learning_rate": 0.0009863408893270292, + "loss": 0.95375705, + "num_input_tokens_seen": 44078272, + "router_z_loss_mlp": 0.69140625, + "step": 532, + "time_per_iteration": 2.7893166542053223 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089057, + "balance_loss_mlp": 1.02029741, + "epoch": 0.10253943824547904, + "flos": 602913069312.0, + "grad_norm": 0.046708965243717, + "language_loss": 0.90346718, + "learning_rate": 0.0009862684727023605, + "loss": 0.91435778, + "num_input_tokens_seen": 44152304, + "router_z_loss_mlp": 0.68847656, + "step": 533, + "time_per_iteration": 2.7212483882904053 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079068, + "balance_loss_mlp": 1.0105468, + "epoch": 0.1027318199307426, + "flos": 664157395200.0, + "grad_norm": 0.04923575085492922, + "language_loss": 0.9286049, + "learning_rate": 0.0009861958672915283, + "loss": 0.93939555, + "num_input_tokens_seen": 44226720, + "router_z_loss_mlp": 0.68603516, + "step": 534, + "time_per_iteration": 2.8216443061828613 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080998, + "balance_loss_mlp": 1.01271474, + "epoch": 0.10292420161600616, + "flos": 684531019008.0, + "grad_norm": 0.03566434899904423, + "language_loss": 0.91122925, + "learning_rate": 0.0009861230731227201, + "loss": 0.92203927, + "num_input_tokens_seen": 44303600, + "router_z_loss_mlp": 0.68359375, + "step": 535, + "time_per_iteration": 2.8432843685150146 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082908, + "balance_loss_mlp": 1.01514912, + "epoch": 0.10311658330126972, + "flos": 491269808640.0, + "grad_norm": 0.04656876258351904, + "language_loss": 0.9494285, + "learning_rate": 0.0009860500902241973, + "loss": 0.96025753, + "num_input_tokens_seen": 44370960, + "router_z_loss_mlp": 0.67822266, + "step": 536, + "time_per_iteration": 2.601234197616577 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085787, + "balance_loss_mlp": 1.01831496, + "epoch": 0.10330896498653329, + "flos": 432687011328.0, + "grad_norm": 0.046264109011482965, + "language_loss": 0.99409795, + "learning_rate": 0.0009859769186242942, + "loss": 1.00495577, + "num_input_tokens_seen": 44435584, + "router_z_loss_mlp": 0.67529297, + "step": 537, + "time_per_iteration": 2.527156114578247 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079891, + "balance_loss_mlp": 1.01265681, + "epoch": 0.10350134667179685, + "flos": 550642898688.0, + "grad_norm": 0.04274411195548745, + "language_loss": 0.92667055, + "learning_rate": 0.0009859035583514187, + "loss": 0.93746948, + "num_input_tokens_seen": 44505456, + "router_z_loss_mlp": 0.67285156, + "step": 538, + "time_per_iteration": 2.6489107608795166 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082236, + "balance_loss_mlp": 1.01505005, + "epoch": 0.10369372835706041, + "flos": 641827964928.0, + "grad_norm": 0.04978782417937993, + "language_loss": 0.95941103, + "learning_rate": 0.0009858300094340517, + "loss": 0.97023344, + "num_input_tokens_seen": 44580208, + "router_z_loss_mlp": 0.67236328, + "step": 539, + "time_per_iteration": 2.8078534603118896 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107826, + "balance_loss_mlp": 1.01102614, + "epoch": 0.10388611004232397, + "flos": 522766834944.0, + "grad_norm": 0.04233995967203171, + "language_loss": 0.8846426, + "learning_rate": 0.0009857562719007473, + "loss": 0.8954252, + "num_input_tokens_seen": 44646576, + "router_z_loss_mlp": 0.67285156, + "step": 540, + "time_per_iteration": 2.605253219604492 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108211, + "balance_loss_mlp": 1.01487637, + "epoch": 0.10407849172758753, + "flos": 703741074432.0, + "grad_norm": 0.04489314852578161, + "language_loss": 0.9024663, + "learning_rate": 0.0009856823457801331, + "loss": 0.91328734, + "num_input_tokens_seen": 44726752, + "router_z_loss_mlp": 0.67285156, + "step": 541, + "time_per_iteration": 2.8836264610290527 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074756, + "balance_loss_mlp": 1.00737894, + "epoch": 0.1042708734128511, + "flos": 503945606400.0, + "grad_norm": 0.04545070943505171, + "language_loss": 0.97841358, + "learning_rate": 0.00098560823110091, + "loss": 0.98916113, + "num_input_tokens_seen": 44795824, + "router_z_loss_mlp": 0.67431641, + "step": 542, + "time_per_iteration": 2.629241466522217 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078174, + "balance_loss_mlp": 1.01084471, + "epoch": 0.10446325509811466, + "flos": 486641779968.0, + "grad_norm": 0.04151430298304091, + "language_loss": 0.974545, + "learning_rate": 0.000985533927891851, + "loss": 0.98532677, + "num_input_tokens_seen": 44868496, + "router_z_loss_mlp": 0.67382812, + "step": 543, + "time_per_iteration": 2.712714195251465 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078049, + "balance_loss_mlp": 1.01110125, + "epoch": 0.10465563678337822, + "flos": 569713948416.0, + "grad_norm": 0.043537531534841835, + "language_loss": 0.9559319, + "learning_rate": 0.0009854594361818044, + "loss": 0.96671236, + "num_input_tokens_seen": 44939888, + "router_z_loss_mlp": 0.66992188, + "step": 544, + "time_per_iteration": 2.66324520111084 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075343, + "balance_loss_mlp": 1.00806153, + "epoch": 0.10484801846864178, + "flos": 627243992832.0, + "grad_norm": 0.042858245855360314, + "language_loss": 0.94459403, + "learning_rate": 0.0009853847559996897, + "loss": 0.95534742, + "num_input_tokens_seen": 45012720, + "router_z_loss_mlp": 0.67333984, + "step": 545, + "time_per_iteration": 2.749379873275757 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074542, + "balance_loss_mlp": 1.00697374, + "epoch": 0.10504040015390535, + "flos": 744813965568.0, + "grad_norm": 0.04113973833070077, + "language_loss": 0.93940508, + "learning_rate": 0.0009853098873745, + "loss": 0.95015049, + "num_input_tokens_seen": 45093744, + "router_z_loss_mlp": 0.67626953, + "step": 546, + "time_per_iteration": 3.0356035232543945 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082094, + "balance_loss_mlp": 1.01457405, + "epoch": 0.10523278183916891, + "flos": 587843060736.0, + "grad_norm": 0.04039468180414331, + "language_loss": 0.92498314, + "learning_rate": 0.0009852348303353027, + "loss": 0.93580401, + "num_input_tokens_seen": 45172784, + "router_z_loss_mlp": 0.67578125, + "step": 547, + "time_per_iteration": 2.787853479385376 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080495, + "balance_loss_mlp": 1.01283157, + "epoch": 0.10542516352443247, + "flos": 871147156224.0, + "grad_norm": 0.04319215205461418, + "language_loss": 0.86143011, + "learning_rate": 0.000985159584911237, + "loss": 0.872235, + "num_input_tokens_seen": 45255600, + "router_z_loss_mlp": 0.67724609, + "step": 548, + "time_per_iteration": 3.103173017501831 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077016, + "balance_loss_mlp": 1.00949633, + "epoch": 0.10561754520969603, + "flos": 506413694208.0, + "grad_norm": 0.04405333210851084, + "language_loss": 0.94064271, + "learning_rate": 0.0009850841511315162, + "loss": 0.95141286, + "num_input_tokens_seen": 45325072, + "router_z_loss_mlp": 0.67578125, + "step": 549, + "time_per_iteration": 2.647629737854004 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107948, + "balance_loss_mlp": 1.01176953, + "epoch": 0.1058099268949596, + "flos": 561148061952.0, + "grad_norm": 0.03728506713954383, + "language_loss": 0.9326818, + "learning_rate": 0.0009850085290254256, + "loss": 0.94347662, + "num_input_tokens_seen": 45401440, + "router_z_loss_mlp": 0.67773438, + "step": 550, + "time_per_iteration": 2.7680838108062744 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081585, + "balance_loss_mlp": 1.01411295, + "epoch": 0.10600230858022316, + "flos": 563160248832.0, + "grad_norm": 0.031635589688873186, + "language_loss": 0.90350562, + "learning_rate": 0.0009849327186223246, + "loss": 0.91432148, + "num_input_tokens_seen": 45479264, + "router_z_loss_mlp": 0.67529297, + "step": 551, + "time_per_iteration": 2.7540531158447266 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077249, + "balance_loss_mlp": 1.01001453, + "epoch": 0.10619469026548672, + "flos": 495318481920.0, + "grad_norm": 0.03875875468173829, + "language_loss": 0.97612774, + "learning_rate": 0.000984856719951646, + "loss": 0.98690015, + "num_input_tokens_seen": 45547328, + "router_z_loss_mlp": 0.67285156, + "step": 552, + "time_per_iteration": 2.5471906661987305 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080098, + "balance_loss_mlp": 1.01300704, + "epoch": 0.10638707195075028, + "flos": 677465038080.0, + "grad_norm": 0.04041077275123314, + "language_loss": 0.94560456, + "learning_rate": 0.0009847805330428943, + "loss": 0.95640558, + "num_input_tokens_seen": 45631152, + "router_z_loss_mlp": 0.67138672, + "step": 553, + "time_per_iteration": 2.879901647567749 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081173, + "balance_loss_mlp": 1.01398706, + "epoch": 0.10657945363601386, + "flos": 489035990784.0, + "grad_norm": 0.051524237529684984, + "language_loss": 0.97161597, + "learning_rate": 0.0009847041579256481, + "loss": 0.98242772, + "num_input_tokens_seen": 45698208, + "router_z_loss_mlp": 0.67236328, + "step": 554, + "time_per_iteration": 2.5838425159454346 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076637, + "balance_loss_mlp": 1.00997543, + "epoch": 0.10677183532127742, + "flos": 483971503104.0, + "grad_norm": 0.03890900728724459, + "language_loss": 0.96058643, + "learning_rate": 0.0009846275946295592, + "loss": 0.97135282, + "num_input_tokens_seen": 45766640, + "router_z_loss_mlp": 0.66699219, + "step": 555, + "time_per_iteration": 2.619490623474121 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074794, + "balance_loss_mlp": 1.00813222, + "epoch": 0.10696421700654098, + "flos": 657582308352.0, + "grad_norm": 0.03350037319549477, + "language_loss": 0.89189553, + "learning_rate": 0.0009845508431843518, + "loss": 0.9026435, + "num_input_tokens_seen": 45851408, + "router_z_loss_mlp": 0.66699219, + "step": 556, + "time_per_iteration": 3.0074055194854736 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075663, + "balance_loss_mlp": 1.00895333, + "epoch": 0.10715659869180454, + "flos": 568793398272.0, + "grad_norm": 0.03867425342149035, + "language_loss": 0.90383601, + "learning_rate": 0.0009844739036198233, + "loss": 0.91459262, + "num_input_tokens_seen": 45919824, + "router_z_loss_mlp": 0.66748047, + "step": 557, + "time_per_iteration": 2.719309091567993 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073849, + "balance_loss_mlp": 1.00756896, + "epoch": 0.10734898037706811, + "flos": 541744565760.0, + "grad_norm": 0.03845092177051005, + "language_loss": 0.97656357, + "learning_rate": 0.0009843967759658448, + "loss": 0.98730206, + "num_input_tokens_seen": 45991024, + "router_z_loss_mlp": 0.66308594, + "step": 558, + "time_per_iteration": 2.679964065551758 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077583, + "balance_loss_mlp": 1.01311493, + "epoch": 0.10754136206233167, + "flos": 1479734192640.0, + "grad_norm": 0.013283033162601723, + "language_loss": 0.72767758, + "learning_rate": 0.0009843194602523592, + "loss": 0.73845339, + "num_input_tokens_seen": 46212736, + "router_z_loss_mlp": 0.64453125, + "step": 559, + "time_per_iteration": 4.837440729141235 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107582, + "balance_loss_mlp": 1.00977802, + "epoch": 0.10773374374759523, + "flos": 513412601088.0, + "grad_norm": 0.03702065367467253, + "language_loss": 0.97501957, + "learning_rate": 0.000984241956509384, + "loss": 0.98577774, + "num_input_tokens_seen": 46283920, + "router_z_loss_mlp": 0.66064453, + "step": 560, + "time_per_iteration": 2.6579978466033936 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079218, + "balance_loss_mlp": 1.01312864, + "epoch": 0.10792612543285879, + "flos": 497478422784.0, + "grad_norm": 0.05173888564395698, + "language_loss": 0.9404971, + "learning_rate": 0.0009841642647670078, + "loss": 0.9512893, + "num_input_tokens_seen": 46349664, + "router_z_loss_mlp": 0.66113281, + "step": 561, + "time_per_iteration": 2.557605743408203 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080143, + "balance_loss_mlp": 1.01429176, + "epoch": 0.10811850711812235, + "flos": 736838128128.0, + "grad_norm": 0.0493873548723288, + "language_loss": 0.88547891, + "learning_rate": 0.0009840863850553944, + "loss": 0.89628035, + "num_input_tokens_seen": 46432688, + "router_z_loss_mlp": 0.65869141, + "step": 562, + "time_per_iteration": 2.949580669403076 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077668, + "balance_loss_mlp": 1.0115304, + "epoch": 0.10831088880338592, + "flos": 612677517312.0, + "grad_norm": 0.04173462884607535, + "language_loss": 0.94150907, + "learning_rate": 0.0009840083174047782, + "loss": 0.95228577, + "num_input_tokens_seen": 46507216, + "router_z_loss_mlp": 0.66162109, + "step": 563, + "time_per_iteration": 2.733344078063965 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081559, + "balance_loss_mlp": 1.01561248, + "epoch": 0.10850327048864948, + "flos": 557498909184.0, + "grad_norm": 0.034100755270258146, + "language_loss": 0.88515103, + "learning_rate": 0.0009839300618454685, + "loss": 0.89596659, + "num_input_tokens_seen": 46590464, + "router_z_loss_mlp": 0.65966797, + "step": 564, + "time_per_iteration": 2.8846256732940674 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080528, + "balance_loss_mlp": 1.0148201, + "epoch": 0.10869565217391304, + "flos": 604437274368.0, + "grad_norm": 0.036735298053950545, + "language_loss": 0.93941957, + "learning_rate": 0.0009838516184078466, + "loss": 0.95022488, + "num_input_tokens_seen": 46666240, + "router_z_loss_mlp": 0.65722656, + "step": 565, + "time_per_iteration": 2.813284158706665 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078727, + "balance_loss_mlp": 1.01297164, + "epoch": 0.1088880338591766, + "flos": 527206280448.0, + "grad_norm": 0.040314305725270186, + "language_loss": 0.91096556, + "learning_rate": 0.0009837729871223669, + "loss": 0.92175281, + "num_input_tokens_seen": 46734288, + "router_z_loss_mlp": 0.65771484, + "step": 566, + "time_per_iteration": 2.651611089706421 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078801, + "balance_loss_mlp": 1.01318836, + "epoch": 0.10908041554444017, + "flos": 621417402624.0, + "grad_norm": 0.042325065837349046, + "language_loss": 0.91458869, + "learning_rate": 0.0009836941680195568, + "loss": 0.92537665, + "num_input_tokens_seen": 46809920, + "router_z_loss_mlp": 0.65625, + "step": 567, + "time_per_iteration": 2.8296427726745605 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081032, + "balance_loss_mlp": 1.01508534, + "epoch": 0.10927279722970373, + "flos": 899674507008.0, + "grad_norm": 0.04990856516123606, + "language_loss": 0.87414277, + "learning_rate": 0.0009836151611300166, + "loss": 0.88495302, + "num_input_tokens_seen": 46889984, + "router_z_loss_mlp": 0.65966797, + "step": 568, + "time_per_iteration": 3.2401816844940186 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107825, + "balance_loss_mlp": 1.01206517, + "epoch": 0.10946517891496729, + "flos": 529700613120.0, + "grad_norm": 0.0427731854110213, + "language_loss": 0.96863574, + "learning_rate": 0.0009835359664844194, + "loss": 0.97941828, + "num_input_tokens_seen": 46959536, + "router_z_loss_mlp": 0.66210938, + "step": 569, + "time_per_iteration": 2.6190173625946045 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064438, + "balance_loss_mlp": 1.00092316, + "epoch": 0.10965756060023085, + "flos": 1563994228992.0, + "grad_norm": 0.005811935039235345, + "language_loss": 0.81036806, + "learning_rate": 0.0009834565841135114, + "loss": 0.8210125, + "num_input_tokens_seen": 47196960, + "router_z_loss_mlp": 0.63476562, + "step": 570, + "time_per_iteration": 4.957117795944214 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080699, + "balance_loss_mlp": 1.0151341, + "epoch": 0.10984994228549443, + "flos": 514100826624.0, + "grad_norm": 0.04369440603786518, + "language_loss": 0.94858396, + "learning_rate": 0.0009833770140481118, + "loss": 0.95939088, + "num_input_tokens_seen": 47266560, + "router_z_loss_mlp": 0.65576172, + "step": 571, + "time_per_iteration": 2.6529860496520996 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086256, + "balance_loss_mlp": 1.02059519, + "epoch": 0.11004232397075799, + "flos": 956275252992.0, + "grad_norm": 0.04378732511153692, + "language_loss": 0.85010409, + "learning_rate": 0.000983297256319112, + "loss": 0.86096668, + "num_input_tokens_seen": 47348512, + "router_z_loss_mlp": 0.65673828, + "step": 572, + "time_per_iteration": 3.2036497592926025 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080603, + "balance_loss_mlp": 1.01499045, + "epoch": 0.11023470565602154, + "flos": 489229431552.0, + "grad_norm": 0.043497603291787354, + "language_loss": 0.89141667, + "learning_rate": 0.000983217310957477, + "loss": 0.90222269, + "num_input_tokens_seen": 47425392, + "router_z_loss_mlp": 0.65625, + "step": 573, + "time_per_iteration": 2.7763278484344482 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078757, + "balance_loss_mlp": 1.01333535, + "epoch": 0.1104270873412851, + "flos": 656991292416.0, + "grad_norm": 0.04901418812727031, + "language_loss": 0.9269613, + "learning_rate": 0.000983137177994244, + "loss": 0.93774891, + "num_input_tokens_seen": 47502336, + "router_z_loss_mlp": 0.65429688, + "step": 574, + "time_per_iteration": 2.8529646396636963 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080019, + "balance_loss_mlp": 1.01474011, + "epoch": 0.11061946902654868, + "flos": 724748488704.0, + "grad_norm": 0.03457948694206611, + "language_loss": 0.87449324, + "learning_rate": 0.0009830568574605235, + "loss": 0.88529336, + "num_input_tokens_seen": 47583552, + "router_z_loss_mlp": 0.65283203, + "step": 575, + "time_per_iteration": 2.94710373878479 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010791, + "balance_loss_mlp": 1.01367807, + "epoch": 0.11081185071181224, + "flos": 836869037568.0, + "grad_norm": 0.04085001299476677, + "language_loss": 0.90086508, + "learning_rate": 0.0009829763493874992, + "loss": 0.91165602, + "num_input_tokens_seen": 47663440, + "router_z_loss_mlp": 0.65429688, + "step": 576, + "time_per_iteration": 3.0296730995178223 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107807, + "balance_loss_mlp": 1.01283884, + "epoch": 0.1110042323970758, + "flos": 610283306496.0, + "grad_norm": 0.03775485835018356, + "language_loss": 0.95256275, + "learning_rate": 0.0009828956538064264, + "loss": 0.9633435, + "num_input_tokens_seen": 47741920, + "router_z_loss_mlp": 0.65234375, + "step": 577, + "time_per_iteration": 2.7944416999816895 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073929, + "balance_loss_mlp": 1.00893569, + "epoch": 0.11119661408233936, + "flos": 597040792320.0, + "grad_norm": 0.04378674390965236, + "language_loss": 0.93033826, + "learning_rate": 0.0009828147707486344, + "loss": 0.94107759, + "num_input_tokens_seen": 47815136, + "router_z_loss_mlp": 0.64990234, + "step": 578, + "time_per_iteration": 2.7034592628479004 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075842, + "balance_loss_mlp": 1.01099229, + "epoch": 0.11138899576760293, + "flos": 556888451328.0, + "grad_norm": 0.05042820660432219, + "language_loss": 0.89312434, + "learning_rate": 0.0009827337002455245, + "loss": 0.90388274, + "num_input_tokens_seen": 47881360, + "router_z_loss_mlp": 0.6484375, + "step": 579, + "time_per_iteration": 2.6187195777893066 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074948, + "balance_loss_mlp": 1.01057482, + "epoch": 0.11158137745286649, + "flos": 691063331328.0, + "grad_norm": 0.03501309245374513, + "language_loss": 0.89977694, + "learning_rate": 0.0009826524423285712, + "loss": 0.91052639, + "num_input_tokens_seen": 47962720, + "router_z_loss_mlp": 0.64355469, + "step": 580, + "time_per_iteration": 2.9009909629821777 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079601, + "balance_loss_mlp": 1.0148946, + "epoch": 0.11177375913813005, + "flos": 764307868416.0, + "grad_norm": 0.04023884017549449, + "language_loss": 0.91280103, + "learning_rate": 0.0009825709970293218, + "loss": 0.92359698, + "num_input_tokens_seen": 48035472, + "router_z_loss_mlp": 0.64697266, + "step": 581, + "time_per_iteration": 2.9111618995666504 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074116, + "balance_loss_mlp": 1.0095998, + "epoch": 0.11196614082339361, + "flos": 808031594496.0, + "grad_norm": 0.038028140255108665, + "language_loss": 0.97163212, + "learning_rate": 0.0009824893643793956, + "loss": 0.98237336, + "num_input_tokens_seen": 48116944, + "router_z_loss_mlp": 0.64501953, + "step": 582, + "time_per_iteration": 3.0907368659973145 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072862, + "balance_loss_mlp": 1.00796497, + "epoch": 0.11215852250865718, + "flos": 559725924096.0, + "grad_norm": 0.04580369165919148, + "language_loss": 0.90464842, + "learning_rate": 0.0009824075444104857, + "loss": 0.91537702, + "num_input_tokens_seen": 48187808, + "router_z_loss_mlp": 0.64892578, + "step": 583, + "time_per_iteration": 2.7276525497436523 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107742, + "balance_loss_mlp": 1.01285601, + "epoch": 0.11235090419392074, + "flos": 514576169472.0, + "grad_norm": 0.03926612419770205, + "language_loss": 0.95381963, + "learning_rate": 0.000982325537154357, + "loss": 0.96459383, + "num_input_tokens_seen": 48254464, + "router_z_loss_mlp": 0.64550781, + "step": 584, + "time_per_iteration": 2.6261777877807617 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074375, + "balance_loss_mlp": 1.0100019, + "epoch": 0.1125432858791843, + "flos": 492433377024.0, + "grad_norm": 0.043221505898455144, + "language_loss": 0.96143711, + "learning_rate": 0.0009822433426428484, + "loss": 0.97218084, + "num_input_tokens_seen": 48318784, + "router_z_loss_mlp": 0.64355469, + "step": 585, + "time_per_iteration": 2.5630125999450684 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075075, + "balance_loss_mlp": 1.01079714, + "epoch": 0.11273566756444786, + "flos": 511728003072.0, + "grad_norm": 0.04466131563000304, + "language_loss": 0.88984096, + "learning_rate": 0.0009821609609078697, + "loss": 0.90059173, + "num_input_tokens_seen": 48389248, + "router_z_loss_mlp": 0.64257812, + "step": 586, + "time_per_iteration": 2.649122953414917 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075103, + "balance_loss_mlp": 1.01077783, + "epoch": 0.11292804924971142, + "flos": 623640526848.0, + "grad_norm": 0.03579172726266892, + "language_loss": 0.91595018, + "learning_rate": 0.0009820783919814045, + "loss": 0.92670119, + "num_input_tokens_seen": 48463312, + "router_z_loss_mlp": 0.64306641, + "step": 587, + "time_per_iteration": 2.7977845668792725 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072627, + "balance_loss_mlp": 1.00830126, + "epoch": 0.113120430934975, + "flos": 479039218176.0, + "grad_norm": 0.04738669495581529, + "language_loss": 0.85574889, + "learning_rate": 0.0009819956358955095, + "loss": 0.86647511, + "num_input_tokens_seen": 48531856, + "router_z_loss_mlp": 0.64306641, + "step": 588, + "time_per_iteration": 2.59133243560791 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076686, + "balance_loss_mlp": 1.01245642, + "epoch": 0.11331281262023855, + "flos": 467991638016.0, + "grad_norm": 0.048752038127388646, + "language_loss": 0.86982751, + "learning_rate": 0.0009819126926823127, + "loss": 0.88059437, + "num_input_tokens_seen": 48596640, + "router_z_loss_mlp": 0.64208984, + "step": 589, + "time_per_iteration": 2.511939764022827 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075971, + "balance_loss_mlp": 1.01174104, + "epoch": 0.11350519430550211, + "flos": 651611854848.0, + "grad_norm": 0.04204370934342767, + "language_loss": 0.89311969, + "learning_rate": 0.000981829562374016, + "loss": 0.9038794, + "num_input_tokens_seen": 48669648, + "router_z_loss_mlp": 0.64208984, + "step": 590, + "time_per_iteration": 2.798734426498413 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107506, + "balance_loss_mlp": 1.01111591, + "epoch": 0.11369757599076567, + "flos": 558861754368.0, + "grad_norm": 0.04723710161718091, + "language_loss": 0.99783856, + "learning_rate": 0.0009817462450028933, + "loss": 1.00858927, + "num_input_tokens_seen": 48737392, + "router_z_loss_mlp": 0.63916016, + "step": 591, + "time_per_iteration": 2.717622756958008 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076867, + "balance_loss_mlp": 1.01316178, + "epoch": 0.11388995767602925, + "flos": 572306457600.0, + "grad_norm": 0.041300229846526024, + "language_loss": 0.87103492, + "learning_rate": 0.0009816627406012916, + "loss": 0.88180363, + "num_input_tokens_seen": 48817136, + "router_z_loss_mlp": 0.63671875, + "step": 592, + "time_per_iteration": 2.783677339553833 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077614, + "balance_loss_mlp": 1.01376593, + "epoch": 0.1140823393612928, + "flos": 741744168192.0, + "grad_norm": 0.04574882804976793, + "language_loss": 0.87044728, + "learning_rate": 0.0009815790492016295, + "loss": 0.88122344, + "num_input_tokens_seen": 48895808, + "router_z_loss_mlp": 0.63818359, + "step": 593, + "time_per_iteration": 2.920262336730957 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079358, + "balance_loss_mlp": 1.01560438, + "epoch": 0.11427472104655637, + "flos": 700252314624.0, + "grad_norm": 0.042792726491020304, + "language_loss": 0.89086539, + "learning_rate": 0.0009814951708363993, + "loss": 0.90165901, + "num_input_tokens_seen": 48967456, + "router_z_loss_mlp": 0.63720703, + "step": 594, + "time_per_iteration": 2.8244025707244873 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069794, + "balance_loss_mlp": 1.00799561, + "epoch": 0.11446710273181993, + "flos": 1480355344128.0, + "grad_norm": 0.0135056408383676, + "language_loss": 0.77990985, + "learning_rate": 0.0009814111055381654, + "loss": 0.79060781, + "num_input_tokens_seen": 49193152, + "router_z_loss_mlp": 0.6171875, + "step": 595, + "time_per_iteration": 4.779642105102539 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075224, + "balance_loss_mlp": 1.01189995, + "epoch": 0.1146594844170835, + "flos": 495913388544.0, + "grad_norm": 0.038757735955663945, + "language_loss": 0.90035105, + "learning_rate": 0.0009813268533395648, + "loss": 0.91110331, + "num_input_tokens_seen": 49260960, + "router_z_loss_mlp": 0.6328125, + "step": 596, + "time_per_iteration": 2.5933825969696045 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082133, + "balance_loss_mlp": 1.01895213, + "epoch": 0.11485186610234706, + "flos": 475791531264.0, + "grad_norm": 0.0538004660752225, + "language_loss": 0.90474582, + "learning_rate": 0.0009812424142733073, + "loss": 0.9155671, + "num_input_tokens_seen": 49327616, + "router_z_loss_mlp": 0.63134766, + "step": 597, + "time_per_iteration": 2.528027296066284 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073976, + "balance_loss_mlp": 1.01089013, + "epoch": 0.11504424778761062, + "flos": 732620313600.0, + "grad_norm": 0.03283482462688361, + "language_loss": 0.87953097, + "learning_rate": 0.000981157788372175, + "loss": 0.89027071, + "num_input_tokens_seen": 49412864, + "router_z_loss_mlp": 0.63037109, + "step": 598, + "time_per_iteration": 3.008469343185425 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074784, + "balance_loss_mlp": 1.01160276, + "epoch": 0.11523662947287418, + "flos": 546963610368.0, + "grad_norm": 0.037424804687157906, + "language_loss": 0.91041148, + "learning_rate": 0.0009810729756690223, + "loss": 0.92115927, + "num_input_tokens_seen": 49483584, + "router_z_loss_mlp": 0.63134766, + "step": 599, + "time_per_iteration": 2.75840163230896 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077312, + "balance_loss_mlp": 1.01408339, + "epoch": 0.11542901115813775, + "flos": 776388759552.0, + "grad_norm": 0.04126969924944996, + "language_loss": 0.9391377, + "learning_rate": 0.0009809879761967766, + "loss": 0.94991082, + "num_input_tokens_seen": 49563568, + "router_z_loss_mlp": 0.63183594, + "step": 600, + "time_per_iteration": 2.9511778354644775 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081843, + "balance_loss_mlp": 1.01828074, + "epoch": 0.11562139284340131, + "flos": 732213990144.0, + "grad_norm": 0.05544181306164312, + "language_loss": 0.88981479, + "learning_rate": 0.0009809027899884378, + "loss": 0.90063322, + "num_input_tokens_seen": 49640800, + "router_z_loss_mlp": 0.63525391, + "step": 601, + "time_per_iteration": 2.888591766357422 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076176, + "balance_loss_mlp": 1.01256609, + "epoch": 0.11581377452866487, + "flos": 537040714752.0, + "grad_norm": 0.03483284203155477, + "language_loss": 0.90335476, + "learning_rate": 0.0009808174170770779, + "loss": 0.9141165, + "num_input_tokens_seen": 49721872, + "router_z_loss_mlp": 0.63574219, + "step": 602, + "time_per_iteration": 2.7933802604675293 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073479, + "balance_loss_mlp": 1.01263428, + "epoch": 0.11600615621392843, + "flos": 1559214555648.0, + "grad_norm": 0.012041981792172347, + "language_loss": 0.84898245, + "learning_rate": 0.0009807318574958418, + "loss": 0.85971725, + "num_input_tokens_seen": 49951472, + "router_z_loss_mlp": 0.60742188, + "step": 603, + "time_per_iteration": 4.875667572021484 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079811, + "balance_loss_mlp": 1.01658237, + "epoch": 0.116198537899192, + "flos": 538468688640.0, + "grad_norm": 0.046063141341509364, + "language_loss": 0.95944118, + "learning_rate": 0.0009806461112779462, + "loss": 0.97023928, + "num_input_tokens_seen": 50021136, + "router_z_loss_mlp": 0.63183594, + "step": 604, + "time_per_iteration": 2.708552360534668 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077772, + "balance_loss_mlp": 1.01444781, + "epoch": 0.11639091958445556, + "flos": 455137950720.0, + "grad_norm": 0.05737724930332189, + "language_loss": 0.90764457, + "learning_rate": 0.0009805601784566814, + "loss": 0.91842222, + "num_input_tokens_seen": 50083888, + "router_z_loss_mlp": 0.6328125, + "step": 605, + "time_per_iteration": 2.545696496963501 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076475, + "balance_loss_mlp": 1.01329422, + "epoch": 0.11658330126971912, + "flos": 556152593664.0, + "grad_norm": 0.04016687987230144, + "language_loss": 0.97276044, + "learning_rate": 0.0009804740590654089, + "loss": 0.98352522, + "num_input_tokens_seen": 50151744, + "router_z_loss_mlp": 0.63134766, + "step": 606, + "time_per_iteration": 2.6464574337005615 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077132, + "balance_loss_mlp": 1.01399851, + "epoch": 0.11677568295498268, + "flos": 717601827840.0, + "grad_norm": 0.0453344941203476, + "language_loss": 0.91881627, + "learning_rate": 0.0009803877531375635, + "loss": 0.9295876, + "num_input_tokens_seen": 50221248, + "router_z_loss_mlp": 0.63085938, + "step": 607, + "time_per_iteration": 2.8467392921447754 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074228, + "balance_loss_mlp": 1.0111903, + "epoch": 0.11696806464024626, + "flos": 610899600384.0, + "grad_norm": 0.04469679718872237, + "language_loss": 0.92976171, + "learning_rate": 0.0009803012607066523, + "loss": 0.94050401, + "num_input_tokens_seen": 50293792, + "router_z_loss_mlp": 0.62988281, + "step": 608, + "time_per_iteration": 2.7587811946868896 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073203, + "balance_loss_mlp": 1.01030838, + "epoch": 0.11716044632550981, + "flos": 521416628736.0, + "grad_norm": 0.04044307397502579, + "language_loss": 0.91207683, + "learning_rate": 0.0009802145818062543, + "loss": 0.92280889, + "num_input_tokens_seen": 50367760, + "router_z_loss_mlp": 0.62841797, + "step": 609, + "time_per_iteration": 2.7623538970947266 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107199, + "balance_loss_mlp": 1.00919068, + "epoch": 0.11735282801077337, + "flos": 508489064448.0, + "grad_norm": 0.04251091083777229, + "language_loss": 0.93763256, + "learning_rate": 0.0009801277164700212, + "loss": 0.9483524, + "num_input_tokens_seen": 50435664, + "router_z_loss_mlp": 0.62744141, + "step": 610, + "time_per_iteration": 2.6250369548797607 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079805, + "balance_loss_mlp": 1.0171963, + "epoch": 0.11754520969603693, + "flos": 687837031680.0, + "grad_norm": 0.044835447829723894, + "language_loss": 0.91796255, + "learning_rate": 0.0009800406647316776, + "loss": 0.92876053, + "num_input_tokens_seen": 50514144, + "router_z_loss_mlp": 0.62548828, + "step": 611, + "time_per_iteration": 2.81438946723938 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01058731, + "balance_loss_mlp": 0.99807739, + "epoch": 0.1177375913813005, + "flos": 1545759158784.0, + "grad_norm": 0.00493114536612535, + "language_loss": 0.76914459, + "learning_rate": 0.0009799534266250196, + "loss": 0.77973187, + "num_input_tokens_seen": 50738448, + "router_z_loss_mlp": 0.60546875, + "step": 612, + "time_per_iteration": 4.795796871185303 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073024, + "balance_loss_mlp": 1.01008153, + "epoch": 0.11792997306656407, + "flos": 521538137856.0, + "grad_norm": 0.049162221556570344, + "language_loss": 0.91035461, + "learning_rate": 0.000979866002183916, + "loss": 0.92108488, + "num_input_tokens_seen": 50809328, + "router_z_loss_mlp": 0.62890625, + "step": 613, + "time_per_iteration": 2.6470768451690674 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071385, + "balance_loss_mlp": 1.00820458, + "epoch": 0.11812235475182763, + "flos": 667489652736.0, + "grad_norm": 0.0453482214384289, + "language_loss": 0.92239928, + "learning_rate": 0.0009797783914423082, + "loss": 0.93311322, + "num_input_tokens_seen": 50887728, + "router_z_loss_mlp": 0.63134766, + "step": 614, + "time_per_iteration": 2.8020856380462646 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107539, + "balance_loss_mlp": 1.01220894, + "epoch": 0.11831473643709119, + "flos": 622505148672.0, + "grad_norm": 0.04034391423157231, + "language_loss": 0.86097217, + "learning_rate": 0.0009796905944342094, + "loss": 0.87172604, + "num_input_tokens_seen": 50966160, + "router_z_loss_mlp": 0.63134766, + "step": 615, + "time_per_iteration": 2.839617967605591 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079046, + "balance_loss_mlp": 1.0160079, + "epoch": 0.11850711812235475, + "flos": 457695466752.0, + "grad_norm": 0.03330066749319758, + "language_loss": 0.89949274, + "learning_rate": 0.0009796026111937057, + "loss": 0.91028321, + "num_input_tokens_seen": 51035712, + "router_z_loss_mlp": 0.62988281, + "step": 616, + "time_per_iteration": 2.6211540699005127 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077601, + "balance_loss_mlp": 1.0150882, + "epoch": 0.11869949980761832, + "flos": 514928057856.0, + "grad_norm": 0.034464018290856886, + "language_loss": 0.90251315, + "learning_rate": 0.0009795144417549552, + "loss": 0.91328913, + "num_input_tokens_seen": 51108656, + "router_z_loss_mlp": 0.62451172, + "step": 617, + "time_per_iteration": 2.6946897506713867 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080332, + "balance_loss_mlp": 1.01815259, + "epoch": 0.11889188149288188, + "flos": 536157103104.0, + "grad_norm": 0.035314864293198016, + "language_loss": 0.91583192, + "learning_rate": 0.0009794260861521883, + "loss": 0.92663527, + "num_input_tokens_seen": 51185552, + "router_z_loss_mlp": 0.62109375, + "step": 618, + "time_per_iteration": 2.77822208404541 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081689, + "balance_loss_mlp": 1.01979554, + "epoch": 0.11908426317814544, + "flos": 499645166592.0, + "grad_norm": 0.042334404758790994, + "language_loss": 0.88659471, + "learning_rate": 0.0009793375444197075, + "loss": 0.89741158, + "num_input_tokens_seen": 51255808, + "router_z_loss_mlp": 0.61816406, + "step": 619, + "time_per_iteration": 2.6199400424957275 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086152, + "balance_loss_mlp": 1.02416277, + "epoch": 0.119276644863409, + "flos": 661068155904.0, + "grad_norm": 0.043937618111938345, + "language_loss": 0.86906028, + "learning_rate": 0.000979248816591888, + "loss": 0.87992179, + "num_input_tokens_seen": 51329408, + "router_z_loss_mlp": 0.61914062, + "step": 620, + "time_per_iteration": 2.789858341217041 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081653, + "balance_loss_mlp": 1.01947308, + "epoch": 0.11946902654867257, + "flos": 760153237248.0, + "grad_norm": 0.04701199265522289, + "language_loss": 0.87992656, + "learning_rate": 0.0009791599027031766, + "loss": 0.89074314, + "num_input_tokens_seen": 51408784, + "router_z_loss_mlp": 0.62109375, + "step": 621, + "time_per_iteration": 3.026487350463867 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074317, + "balance_loss_mlp": 1.01223314, + "epoch": 0.11966140823393613, + "flos": 682214575872.0, + "grad_norm": 0.0506686420393155, + "language_loss": 0.88143325, + "learning_rate": 0.0009790708027880932, + "loss": 0.89217639, + "num_input_tokens_seen": 51482592, + "router_z_loss_mlp": 0.62011719, + "step": 622, + "time_per_iteration": 2.8321774005889893 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081512, + "balance_loss_mlp": 1.02143097, + "epoch": 0.11985378991919969, + "flos": 1454300938752.0, + "grad_norm": 0.023212611497014573, + "language_loss": 0.77427292, + "learning_rate": 0.0009789815168812293, + "loss": 0.78508806, + "num_input_tokens_seen": 51712240, + "router_z_loss_mlp": 0.59960938, + "step": 623, + "time_per_iteration": 4.862462759017944 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071261, + "balance_loss_mlp": 1.00936747, + "epoch": 0.12004617160446325, + "flos": 528899626752.0, + "grad_norm": 0.04437858339694968, + "language_loss": 0.95209736, + "learning_rate": 0.0009788920450172487, + "loss": 0.96280998, + "num_input_tokens_seen": 51781440, + "router_z_loss_mlp": 0.61816406, + "step": 624, + "time_per_iteration": 2.630764961242676 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078232, + "balance_loss_mlp": 1.01619518, + "epoch": 0.12023855328972682, + "flos": 475177182720.0, + "grad_norm": 0.048047229360432486, + "language_loss": 0.92430472, + "learning_rate": 0.0009788023872308875, + "loss": 0.93508708, + "num_input_tokens_seen": 51845424, + "router_z_loss_mlp": 0.61962891, + "step": 625, + "time_per_iteration": 2.5534780025482178 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076523, + "balance_loss_mlp": 1.01682281, + "epoch": 0.12043093497499038, + "flos": 1535054718720.0, + "grad_norm": 0.022021305117703366, + "language_loss": 0.75428998, + "learning_rate": 0.0009787125435569539, + "loss": 0.7650553, + "num_input_tokens_seen": 52076496, + "router_z_loss_mlp": 0.59570312, + "step": 626, + "time_per_iteration": 4.738527536392212 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108475, + "balance_loss_mlp": 1.023, + "epoch": 0.12062331666025394, + "flos": 540915389184.0, + "grad_norm": 0.04663901515177362, + "language_loss": 0.9603011, + "learning_rate": 0.0009786225140303285, + "loss": 0.97114861, + "num_input_tokens_seen": 52143072, + "router_z_loss_mlp": 0.61669922, + "step": 627, + "time_per_iteration": 2.634160280227661 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085928, + "balance_loss_mlp": 1.02427304, + "epoch": 0.1208156983455175, + "flos": 513000441600.0, + "grad_norm": 0.042540459475059536, + "language_loss": 0.94019556, + "learning_rate": 0.0009785322986859634, + "loss": 0.95105481, + "num_input_tokens_seen": 52211888, + "router_z_loss_mlp": 0.61572266, + "step": 628, + "time_per_iteration": 2.681070327758789 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078772, + "balance_loss_mlp": 1.01725972, + "epoch": 0.12100808003078108, + "flos": 597590012160.0, + "grad_norm": 0.03866803919075334, + "language_loss": 0.94614279, + "learning_rate": 0.0009784418975588838, + "loss": 0.95693052, + "num_input_tokens_seen": 52283696, + "router_z_loss_mlp": 0.61425781, + "step": 629, + "time_per_iteration": 2.7337839603424072 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073027, + "balance_loss_mlp": 1.01132393, + "epoch": 0.12120046171604464, + "flos": 524067463680.0, + "grad_norm": 0.03279843121618067, + "language_loss": 0.94581258, + "learning_rate": 0.0009783513106841862, + "loss": 0.95654285, + "num_input_tokens_seen": 52358624, + "router_z_loss_mlp": 0.61621094, + "step": 630, + "time_per_iteration": 2.702615737915039 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080086, + "balance_loss_mlp": 1.01981354, + "epoch": 0.1213928434013082, + "flos": 1557910036224.0, + "grad_norm": 0.01502333088768157, + "language_loss": 0.76732707, + "learning_rate": 0.00097826053809704, + "loss": 0.77812791, + "num_input_tokens_seen": 52591248, + "router_z_loss_mlp": 0.6015625, + "step": 631, + "time_per_iteration": 4.998409032821655 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080339, + "balance_loss_mlp": 1.01835024, + "epoch": 0.12158522508657175, + "flos": 496388731392.0, + "grad_norm": 0.04174070683076465, + "language_loss": 0.89320499, + "learning_rate": 0.0009781695798326854, + "loss": 0.90400839, + "num_input_tokens_seen": 52659920, + "router_z_loss_mlp": 0.61914062, + "step": 632, + "time_per_iteration": 2.5908379554748535 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079195, + "balance_loss_mlp": 1.01744485, + "epoch": 0.12177760677183531, + "flos": 476590572288.0, + "grad_norm": 0.04165368210868703, + "language_loss": 0.89744723, + "learning_rate": 0.0009780784359264365, + "loss": 0.90823919, + "num_input_tokens_seen": 52728832, + "router_z_loss_mlp": 0.61669922, + "step": 633, + "time_per_iteration": 2.689202070236206 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073334, + "balance_loss_mlp": 1.01382446, + "epoch": 0.12196998845709889, + "flos": 1471787512320.0, + "grad_norm": 0.011333314510513573, + "language_loss": 0.74188697, + "learning_rate": 0.0009779871064136778, + "loss": 0.75262028, + "num_input_tokens_seen": 52949776, + "router_z_loss_mlp": 0.59375, + "step": 634, + "time_per_iteration": 4.762145757675171 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073337, + "balance_loss_mlp": 1.01187229, + "epoch": 0.12216237014236245, + "flos": 587749741824.0, + "grad_norm": 0.03178889939160208, + "language_loss": 0.88649213, + "learning_rate": 0.000977895591329867, + "loss": 0.8972255, + "num_input_tokens_seen": 53027184, + "router_z_loss_mlp": 0.61376953, + "step": 635, + "time_per_iteration": 2.7996504306793213 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075051, + "balance_loss_mlp": 1.01372933, + "epoch": 0.12235475182762601, + "flos": 599107414272.0, + "grad_norm": 0.038321985001081305, + "language_loss": 0.88459468, + "learning_rate": 0.000977803890710533, + "loss": 0.89534515, + "num_input_tokens_seen": 53101072, + "router_z_loss_mlp": 0.61230469, + "step": 636, + "time_per_iteration": 2.7200405597686768 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072884, + "balance_loss_mlp": 1.0117538, + "epoch": 0.12254713351288957, + "flos": 498761554944.0, + "grad_norm": 0.03313527469264444, + "language_loss": 0.94808865, + "learning_rate": 0.0009777120045912774, + "loss": 0.95881748, + "num_input_tokens_seen": 53172992, + "router_z_loss_mlp": 0.61035156, + "step": 637, + "time_per_iteration": 2.6253507137298584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072019, + "balance_loss_mlp": 1.01084125, + "epoch": 0.12273951519815314, + "flos": 606981184512.0, + "grad_norm": 0.04065251745031248, + "language_loss": 0.91558111, + "learning_rate": 0.0009776199330077736, + "loss": 0.92630136, + "num_input_tokens_seen": 53248256, + "router_z_loss_mlp": 0.61083984, + "step": 638, + "time_per_iteration": 2.724416732788086 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069779, + "balance_loss_mlp": 1.0086484, + "epoch": 0.1229318968834167, + "flos": 598985905152.0, + "grad_norm": 0.04427923240085457, + "language_loss": 0.94062102, + "learning_rate": 0.0009775276759957667, + "loss": 0.9513188, + "num_input_tokens_seen": 53318960, + "router_z_loss_mlp": 0.61035156, + "step": 639, + "time_per_iteration": 2.756307601928711 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070032, + "balance_loss_mlp": 1.00851989, + "epoch": 0.12312427856868026, + "flos": 679589985792.0, + "grad_norm": 0.04435656949952303, + "language_loss": 0.91938198, + "learning_rate": 0.0009774352335910745, + "loss": 0.93008226, + "num_input_tokens_seen": 53389120, + "router_z_loss_mlp": 0.61425781, + "step": 640, + "time_per_iteration": 2.8135974407196045 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072128, + "balance_loss_mlp": 1.01095021, + "epoch": 0.12331666025394382, + "flos": 610044178944.0, + "grad_norm": 0.03352322480141845, + "language_loss": 0.95842457, + "learning_rate": 0.000977342605829586, + "loss": 0.96914589, + "num_input_tokens_seen": 53459056, + "router_z_loss_mlp": 0.61083984, + "step": 641, + "time_per_iteration": 2.734373092651367 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107341, + "balance_loss_mlp": 1.01208854, + "epoch": 0.12350904193920739, + "flos": 763841273856.0, + "grad_norm": 0.04166007448412618, + "language_loss": 0.87458932, + "learning_rate": 0.0009772497927472623, + "loss": 0.88532341, + "num_input_tokens_seen": 53541552, + "router_z_loss_mlp": 0.61230469, + "step": 642, + "time_per_iteration": 3.069495677947998 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107346, + "balance_loss_mlp": 1.01199543, + "epoch": 0.12370142362447095, + "flos": 542050767360.0, + "grad_norm": 0.04189965725350253, + "language_loss": 0.86664522, + "learning_rate": 0.0009771567943801368, + "loss": 0.87737978, + "num_input_tokens_seen": 53611520, + "router_z_loss_mlp": 0.61376953, + "step": 643, + "time_per_iteration": 2.6783955097198486 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071185, + "balance_loss_mlp": 1.01000655, + "epoch": 0.12389380530973451, + "flos": 549253808640.0, + "grad_norm": 0.03907898995026106, + "language_loss": 0.90534973, + "learning_rate": 0.0009770636107643152, + "loss": 0.91606158, + "num_input_tokens_seen": 53683888, + "router_z_loss_mlp": 0.61083984, + "step": 644, + "time_per_iteration": 2.7792532444000244 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107442, + "balance_loss_mlp": 1.01343274, + "epoch": 0.12408618699499807, + "flos": 541353793536.0, + "grad_norm": 0.03775088580197231, + "language_loss": 0.89077818, + "learning_rate": 0.0009769702419359738, + "loss": 0.9015224, + "num_input_tokens_seen": 53751888, + "router_z_loss_mlp": 0.60888672, + "step": 645, + "time_per_iteration": 2.6660075187683105 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071725, + "balance_loss_mlp": 1.01083338, + "epoch": 0.12427856868026164, + "flos": 747160544256.0, + "grad_norm": 0.03491310842571494, + "language_loss": 0.90435565, + "learning_rate": 0.000976876687931362, + "loss": 0.91507292, + "num_input_tokens_seen": 53827648, + "router_z_loss_mlp": 0.60791016, + "step": 646, + "time_per_iteration": 3.028578758239746 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074215, + "balance_loss_mlp": 1.01332271, + "epoch": 0.1244709503655252, + "flos": 534745658880.0, + "grad_norm": 0.04739554944994068, + "language_loss": 0.86433625, + "learning_rate": 0.0009767829487868005, + "loss": 0.87507832, + "num_input_tokens_seen": 53896400, + "router_z_loss_mlp": 0.60791016, + "step": 647, + "time_per_iteration": 2.6323471069335938 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075713, + "balance_loss_mlp": 1.01472592, + "epoch": 0.12466333205078876, + "flos": 509112161280.0, + "grad_norm": 0.0390766896094967, + "language_loss": 0.89632404, + "learning_rate": 0.000976689024538682, + "loss": 0.90708113, + "num_input_tokens_seen": 53965904, + "router_z_loss_mlp": 0.60888672, + "step": 648, + "time_per_iteration": 2.6233997344970703 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069043, + "balance_loss_mlp": 1.00819838, + "epoch": 0.12485571373605232, + "flos": 682640341248.0, + "grad_norm": 0.04106035596266842, + "language_loss": 0.87981439, + "learning_rate": 0.0009765949152234716, + "loss": 0.89050484, + "num_input_tokens_seen": 54049792, + "router_z_loss_mlp": 0.60742188, + "step": 649, + "time_per_iteration": 2.9135711193084717 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064964, + "balance_loss_mlp": 1.00659943, + "epoch": 0.1250480954213159, + "flos": 1333201377024.0, + "grad_norm": 0.013063081234142807, + "language_loss": 0.78686082, + "learning_rate": 0.0009765006208777055, + "loss": 0.79751045, + "num_input_tokens_seen": 54262432, + "router_z_loss_mlp": 0.58203125, + "step": 650, + "time_per_iteration": 4.696362495422363 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069932, + "balance_loss_mlp": 1.0093261, + "epoch": 0.12524047710657946, + "flos": 940198178304.0, + "grad_norm": 0.03723688894295025, + "language_loss": 0.82869852, + "learning_rate": 0.0009764061415379919, + "loss": 0.83939779, + "num_input_tokens_seen": 54351568, + "router_z_loss_mlp": 0.60498047, + "step": 651, + "time_per_iteration": 3.287029504776001 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071729, + "balance_loss_mlp": 1.01078951, + "epoch": 0.12543285879184302, + "flos": 514901812992.0, + "grad_norm": 0.03842788822410913, + "language_loss": 0.90123397, + "learning_rate": 0.0009763114772410109, + "loss": 0.91195124, + "num_input_tokens_seen": 54418944, + "router_z_loss_mlp": 0.60839844, + "step": 652, + "time_per_iteration": 2.5726470947265625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071215, + "balance_loss_mlp": 1.01075244, + "epoch": 0.12562524047710658, + "flos": 719684001024.0, + "grad_norm": 0.03790395950388449, + "language_loss": 0.88320071, + "learning_rate": 0.0009762166280235146, + "loss": 0.89391285, + "num_input_tokens_seen": 54495312, + "router_z_loss_mlp": 0.60351562, + "step": 653, + "time_per_iteration": 2.9728682041168213 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073992, + "balance_loss_mlp": 1.01372027, + "epoch": 0.12581762216237014, + "flos": 564799160064.0, + "grad_norm": 0.039966468352906216, + "language_loss": 0.88308495, + "learning_rate": 0.0009761215939223267, + "loss": 0.89382488, + "num_input_tokens_seen": 54566832, + "router_z_loss_mlp": 0.6015625, + "step": 654, + "time_per_iteration": 2.7552366256713867 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071616, + "balance_loss_mlp": 1.01100981, + "epoch": 0.1260100038476337, + "flos": 482901253632.0, + "grad_norm": 0.045851790315233704, + "language_loss": 0.87049586, + "learning_rate": 0.0009760263749743428, + "loss": 0.88121206, + "num_input_tokens_seen": 54632128, + "router_z_loss_mlp": 0.60498047, + "step": 655, + "time_per_iteration": 2.5859339237213135 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073482, + "balance_loss_mlp": 1.01301908, + "epoch": 0.12620238553289725, + "flos": 576702161664.0, + "grad_norm": 0.03680601760412016, + "language_loss": 0.91127861, + "learning_rate": 0.0009759309712165299, + "loss": 0.9220134, + "num_input_tokens_seen": 54707600, + "router_z_loss_mlp": 0.60351562, + "step": 656, + "time_per_iteration": 2.7411043643951416 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069023, + "balance_loss_mlp": 1.00841653, + "epoch": 0.12639476721816084, + "flos": 532186197504.0, + "grad_norm": 0.050748048847022796, + "language_loss": 0.94208288, + "learning_rate": 0.0009758353826859272, + "loss": 0.95277309, + "num_input_tokens_seen": 54776704, + "router_z_loss_mlp": 0.60498047, + "step": 657, + "time_per_iteration": 2.5851681232452393 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071924, + "balance_loss_mlp": 1.01117456, + "epoch": 0.1265871489034244, + "flos": 691232472576.0, + "grad_norm": 0.04052834214006204, + "language_loss": 0.90056133, + "learning_rate": 0.0009757396094196456, + "loss": 0.91128063, + "num_input_tokens_seen": 54851744, + "router_z_loss_mlp": 0.60644531, + "step": 658, + "time_per_iteration": 2.9119739532470703 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071395, + "balance_loss_mlp": 1.01083672, + "epoch": 0.12677953058868796, + "flos": 538243166976.0, + "grad_norm": 0.03305987481805703, + "language_loss": 0.85138786, + "learning_rate": 0.0009756436514548673, + "loss": 0.86210179, + "num_input_tokens_seen": 54932576, + "router_z_loss_mlp": 0.60449219, + "step": 659, + "time_per_iteration": 2.8146860599517822 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070631, + "balance_loss_mlp": 1.01021552, + "epoch": 0.12697191227395152, + "flos": 520120857600.0, + "grad_norm": 0.03322369158928612, + "language_loss": 0.89052176, + "learning_rate": 0.0009755475088288466, + "loss": 0.90122807, + "num_input_tokens_seen": 55007296, + "router_z_loss_mlp": 0.60302734, + "step": 660, + "time_per_iteration": 2.7092652320861816 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070351, + "balance_loss_mlp": 1.01007843, + "epoch": 0.12716429395921508, + "flos": 567666768384.0, + "grad_norm": 0.0427017471912124, + "language_loss": 0.91535795, + "learning_rate": 0.0009754511815789095, + "loss": 0.92606151, + "num_input_tokens_seen": 55079312, + "router_z_loss_mlp": 0.6015625, + "step": 661, + "time_per_iteration": 2.790198564529419 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068315, + "balance_loss_mlp": 1.00809085, + "epoch": 0.12735667564447864, + "flos": 515142885888.0, + "grad_norm": 0.0409493229321676, + "language_loss": 0.8685838, + "learning_rate": 0.0009753546697424533, + "loss": 0.87926698, + "num_input_tokens_seen": 55151824, + "router_z_loss_mlp": 0.60107422, + "step": 662, + "time_per_iteration": 2.6784565448760986 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070378, + "balance_loss_mlp": 1.01020074, + "epoch": 0.1275490573297422, + "flos": 542321975808.0, + "grad_norm": 0.039351291895580044, + "language_loss": 0.91270494, + "learning_rate": 0.0009752579733569475, + "loss": 0.92340875, + "num_input_tokens_seen": 55224368, + "router_z_loss_mlp": 0.60058594, + "step": 663, + "time_per_iteration": 2.679379940032959 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071762, + "balance_loss_mlp": 1.01358795, + "epoch": 0.12774143901500576, + "flos": 1562027728896.0, + "grad_norm": 0.016936801864205438, + "language_loss": 0.74881387, + "learning_rate": 0.0009751610924599328, + "loss": 0.7595315, + "num_input_tokens_seen": 55453584, + "router_z_loss_mlp": 0.58007812, + "step": 664, + "time_per_iteration": 4.936127424240112 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070961, + "balance_loss_mlp": 1.01092672, + "epoch": 0.12793382070026935, + "flos": 614874396672.0, + "grad_norm": 0.047422479810277696, + "language_loss": 0.90634137, + "learning_rate": 0.0009750640270890217, + "loss": 0.91705096, + "num_input_tokens_seen": 55528000, + "router_z_loss_mlp": 0.59912109, + "step": 665, + "time_per_iteration": 2.712202548980713 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073503, + "balance_loss_mlp": 1.01361179, + "epoch": 0.1281262023855329, + "flos": 709118566656.0, + "grad_norm": 0.04721256261198653, + "language_loss": 0.97348696, + "learning_rate": 0.0009749667772818983, + "loss": 0.98422199, + "num_input_tokens_seen": 55612416, + "router_z_loss_mlp": 0.59765625, + "step": 666, + "time_per_iteration": 2.959563732147217 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065521, + "balance_loss_mlp": 1.00791931, + "epoch": 0.12831858407079647, + "flos": 1428185295360.0, + "grad_norm": 0.00958948420866419, + "language_loss": 0.76935941, + "learning_rate": 0.0009748693430763185, + "loss": 0.78001463, + "num_input_tokens_seen": 55843664, + "router_z_loss_mlp": 0.57421875, + "step": 667, + "time_per_iteration": 4.823887825012207 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071916, + "balance_loss_mlp": 1.01259768, + "epoch": 0.12851096575606002, + "flos": 450019027968.0, + "grad_norm": 0.04331482152431362, + "language_loss": 0.96237415, + "learning_rate": 0.0009747717245101093, + "loss": 0.97309327, + "num_input_tokens_seen": 55909072, + "router_z_loss_mlp": 0.59179688, + "step": 668, + "time_per_iteration": 2.5234646797180176 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071255, + "balance_loss_mlp": 1.01193655, + "epoch": 0.12870334744132358, + "flos": 480910454016.0, + "grad_norm": 0.040015395826151615, + "language_loss": 0.86231172, + "learning_rate": 0.00097467392162117, + "loss": 0.87302423, + "num_input_tokens_seen": 55978544, + "router_z_loss_mlp": 0.59179688, + "step": 669, + "time_per_iteration": 2.620121717453003 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073025, + "balance_loss_mlp": 1.01342034, + "epoch": 0.12889572912658714, + "flos": 640152115200.0, + "grad_norm": 0.03307407171369126, + "language_loss": 0.91950834, + "learning_rate": 0.0009745759344474708, + "loss": 0.9302386, + "num_input_tokens_seen": 56054144, + "router_z_loss_mlp": 0.59472656, + "step": 670, + "time_per_iteration": 2.834406852722168 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070894, + "balance_loss_mlp": 1.01114607, + "epoch": 0.1290881108118507, + "flos": 510955206912.0, + "grad_norm": 0.03904079329345599, + "language_loss": 0.90752548, + "learning_rate": 0.0009744777630270536, + "loss": 0.91823441, + "num_input_tokens_seen": 56120960, + "router_z_loss_mlp": 0.59619141, + "step": 671, + "time_per_iteration": 2.5841259956359863 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069609, + "balance_loss_mlp": 1.00967062, + "epoch": 0.12928049249711426, + "flos": 672291680256.0, + "grad_norm": 0.0427916369984872, + "language_loss": 0.94394779, + "learning_rate": 0.000974379407398032, + "loss": 0.95464385, + "num_input_tokens_seen": 56202560, + "router_z_loss_mlp": 0.59814453, + "step": 672, + "time_per_iteration": 2.8698208332061768 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072071, + "balance_loss_mlp": 1.0120368, + "epoch": 0.12947287418237785, + "flos": 795000017664.0, + "grad_norm": 0.03399258645873994, + "language_loss": 0.83039552, + "learning_rate": 0.0009742808675985913, + "loss": 0.84111625, + "num_input_tokens_seen": 56289456, + "router_z_loss_mlp": 0.59912109, + "step": 673, + "time_per_iteration": 3.1018688678741455 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067279, + "balance_loss_mlp": 1.00729334, + "epoch": 0.1296652558676414, + "flos": 486448339200.0, + "grad_norm": 0.039807509100232605, + "language_loss": 0.91899526, + "learning_rate": 0.0009741821436669876, + "loss": 0.92966807, + "num_input_tokens_seen": 56354480, + "router_z_loss_mlp": 0.59863281, + "step": 674, + "time_per_iteration": 2.6348536014556885 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068278, + "balance_loss_mlp": 1.00853038, + "epoch": 0.12985763755290497, + "flos": 454393344768.0, + "grad_norm": 0.044170807310258554, + "language_loss": 0.93403888, + "learning_rate": 0.0009740832356415492, + "loss": 0.9447217, + "num_input_tokens_seen": 56418944, + "router_z_loss_mlp": 0.59619141, + "step": 675, + "time_per_iteration": 2.483262538909912 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072639, + "balance_loss_mlp": 1.01265311, + "epoch": 0.13005001923816853, + "flos": 826435805952.0, + "grad_norm": 0.043859966784303914, + "language_loss": 0.89693773, + "learning_rate": 0.0009739841435606756, + "loss": 0.90766412, + "num_input_tokens_seen": 56492368, + "router_z_loss_mlp": 0.59863281, + "step": 676, + "time_per_iteration": 2.992385149002075 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066961, + "balance_loss_mlp": 1.00726056, + "epoch": 0.1302424009234321, + "flos": 532481705472.0, + "grad_norm": 0.03559705023164985, + "language_loss": 0.91210669, + "learning_rate": 0.0009738848674628377, + "loss": 0.92277622, + "num_input_tokens_seen": 56568128, + "router_z_loss_mlp": 0.59570312, + "step": 677, + "time_per_iteration": 2.766364574432373 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106721, + "balance_loss_mlp": 1.00765288, + "epoch": 0.13043478260869565, + "flos": 526917575424.0, + "grad_norm": 0.03838556287658105, + "language_loss": 0.90382779, + "learning_rate": 0.000973785407386578, + "loss": 0.91449988, + "num_input_tokens_seen": 56646448, + "router_z_loss_mlp": 0.59423828, + "step": 678, + "time_per_iteration": 2.772854804992676 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070658, + "balance_loss_mlp": 1.01076782, + "epoch": 0.1306271642939592, + "flos": 627417991680.0, + "grad_norm": 0.03509098765963207, + "language_loss": 0.88142246, + "learning_rate": 0.0009736857633705103, + "loss": 0.89212906, + "num_input_tokens_seen": 56732080, + "router_z_loss_mlp": 0.59765625, + "step": 679, + "time_per_iteration": 2.851567268371582 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075866, + "balance_loss_mlp": 1.01602292, + "epoch": 0.13081954597922277, + "flos": 551841460224.0, + "grad_norm": 0.03859467755451503, + "language_loss": 0.94306064, + "learning_rate": 0.0009735859354533196, + "loss": 0.95381933, + "num_input_tokens_seen": 56804432, + "router_z_loss_mlp": 0.59716797, + "step": 680, + "time_per_iteration": 2.6908183097839355 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070228, + "balance_loss_mlp": 1.01038456, + "epoch": 0.13101192766448633, + "flos": 537956407296.0, + "grad_norm": 0.04695623305024525, + "language_loss": 0.92768431, + "learning_rate": 0.0009734859236737628, + "loss": 0.93838656, + "num_input_tokens_seen": 56872512, + "router_z_loss_mlp": 0.59716797, + "step": 681, + "time_per_iteration": 2.618556261062622 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065875, + "balance_loss_mlp": 1.00631785, + "epoch": 0.13120430934974991, + "flos": 504514268160.0, + "grad_norm": 0.03771498494962771, + "language_loss": 0.94425803, + "learning_rate": 0.0009733857280706678, + "loss": 0.95491678, + "num_input_tokens_seen": 56940928, + "router_z_loss_mlp": 0.59423828, + "step": 682, + "time_per_iteration": 2.607445240020752 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068204, + "balance_loss_mlp": 1.00883758, + "epoch": 0.13139669103501347, + "flos": 615423616512.0, + "grad_norm": 0.040497909024236244, + "language_loss": 0.85748106, + "learning_rate": 0.000973285348682934, + "loss": 0.86816311, + "num_input_tokens_seen": 57012736, + "router_z_loss_mlp": 0.59228516, + "step": 683, + "time_per_iteration": 2.749258518218994 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064545, + "balance_loss_mlp": 1.00846863, + "epoch": 0.13158907272027703, + "flos": 1488218420736.0, + "grad_norm": 0.017735586482065788, + "language_loss": 0.77898371, + "learning_rate": 0.0009731847855495323, + "loss": 0.78962922, + "num_input_tokens_seen": 57243136, + "router_z_loss_mlp": 0.5625, + "step": 684, + "time_per_iteration": 4.792337894439697 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069614, + "balance_loss_mlp": 1.01053405, + "epoch": 0.1317814544055406, + "flos": 987119046912.0, + "grad_norm": 0.04121230716493085, + "language_loss": 0.86815995, + "learning_rate": 0.0009730840387095046, + "loss": 0.87885606, + "num_input_tokens_seen": 57336160, + "router_z_loss_mlp": 0.58935547, + "step": 685, + "time_per_iteration": 3.324737071990967 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068902, + "balance_loss_mlp": 1.00972676, + "epoch": 0.13197383609080415, + "flos": 612629885184.0, + "grad_norm": 0.03769323902360627, + "language_loss": 0.91733027, + "learning_rate": 0.0009729831082019642, + "loss": 0.92801929, + "num_input_tokens_seen": 57418976, + "router_z_loss_mlp": 0.59033203, + "step": 686, + "time_per_iteration": 2.883368968963623 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069054, + "balance_loss_mlp": 1.0096879, + "epoch": 0.1321662177760677, + "flos": 495555664128.0, + "grad_norm": 0.03344682577786829, + "language_loss": 0.90060174, + "learning_rate": 0.0009728819940660958, + "loss": 0.91129231, + "num_input_tokens_seen": 57490288, + "router_z_loss_mlp": 0.59228516, + "step": 687, + "time_per_iteration": 2.7771294116973877 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069131, + "balance_loss_mlp": 1.00971675, + "epoch": 0.13235859946133127, + "flos": 496844632320.0, + "grad_norm": 0.041743180753116546, + "language_loss": 0.8673048, + "learning_rate": 0.0009727806963411557, + "loss": 0.87799615, + "num_input_tokens_seen": 57556064, + "router_z_loss_mlp": 0.59277344, + "step": 688, + "time_per_iteration": 2.5879924297332764 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069414, + "balance_loss_mlp": 1.00971425, + "epoch": 0.13255098114659483, + "flos": 512768116992.0, + "grad_norm": 0.035278095584539565, + "language_loss": 0.88457793, + "learning_rate": 0.000972679215066471, + "loss": 0.89527214, + "num_input_tokens_seen": 57627248, + "router_z_loss_mlp": 0.59570312, + "step": 689, + "time_per_iteration": 2.6660075187683105 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067632, + "balance_loss_mlp": 1.00826621, + "epoch": 0.13274336283185842, + "flos": 548400332544.0, + "grad_norm": 0.043703661342582356, + "language_loss": 1.0036962, + "learning_rate": 0.0009725775502814401, + "loss": 1.01437247, + "num_input_tokens_seen": 57694832, + "router_z_loss_mlp": 0.59228516, + "step": 690, + "time_per_iteration": 2.580975294113159 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072547, + "balance_loss_mlp": 1.01313293, + "epoch": 0.13293574451712198, + "flos": 642003909120.0, + "grad_norm": 0.041755939912029, + "language_loss": 0.86554468, + "learning_rate": 0.0009724757020255327, + "loss": 0.87627012, + "num_input_tokens_seen": 57771776, + "router_z_loss_mlp": 0.59277344, + "step": 691, + "time_per_iteration": 2.895805835723877 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074323, + "balance_loss_mlp": 1.01533794, + "epoch": 0.13312812620238554, + "flos": 492470315520.0, + "grad_norm": 0.04584738151589033, + "language_loss": 0.8907311, + "learning_rate": 0.0009723736703382902, + "loss": 0.90147436, + "num_input_tokens_seen": 57836272, + "router_z_loss_mlp": 0.58837891, + "step": 692, + "time_per_iteration": 2.593621253967285 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073259, + "balance_loss_mlp": 1.01427472, + "epoch": 0.1333205078876491, + "flos": 509950086144.0, + "grad_norm": 0.042207641511909956, + "language_loss": 0.84734881, + "learning_rate": 0.0009722714552593244, + "loss": 0.85808134, + "num_input_tokens_seen": 57907232, + "router_z_loss_mlp": 0.58837891, + "step": 693, + "time_per_iteration": 2.6628286838531494 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069094, + "balance_loss_mlp": 1.01010931, + "epoch": 0.13351288957291266, + "flos": 419592251136.0, + "grad_norm": 0.04342856140262568, + "language_loss": 0.95545483, + "learning_rate": 0.000972169056828319, + "loss": 0.96614575, + "num_input_tokens_seen": 57969808, + "router_z_loss_mlp": 0.58837891, + "step": 694, + "time_per_iteration": 2.491511821746826 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068144, + "balance_loss_mlp": 1.00896847, + "epoch": 0.13370527125817622, + "flos": 617051834112.0, + "grad_norm": 0.03328111889388194, + "language_loss": 0.87929142, + "learning_rate": 0.0009720664750850283, + "loss": 0.88997287, + "num_input_tokens_seen": 58042944, + "router_z_loss_mlp": 0.59033203, + "step": 695, + "time_per_iteration": 2.802238941192627 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066112, + "balance_loss_mlp": 1.00693631, + "epoch": 0.13389765294343978, + "flos": 627170115840.0, + "grad_norm": 0.04111883948503256, + "language_loss": 0.94899035, + "learning_rate": 0.0009719637100692784, + "loss": 0.95965147, + "num_input_tokens_seen": 58116080, + "router_z_loss_mlp": 0.59033203, + "step": 696, + "time_per_iteration": 2.752716541290283 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066724, + "balance_loss_mlp": 1.00764382, + "epoch": 0.13409003462870334, + "flos": 610897655040.0, + "grad_norm": 0.03903466400724949, + "language_loss": 0.84625083, + "learning_rate": 0.0009718607618209661, + "loss": 0.85691804, + "num_input_tokens_seen": 58197616, + "router_z_loss_mlp": 0.58935547, + "step": 697, + "time_per_iteration": 2.8612687587738037 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067901, + "balance_loss_mlp": 1.00915492, + "epoch": 0.13428241631396692, + "flos": 685088987136.0, + "grad_norm": 0.03548160791415639, + "language_loss": 0.8885181, + "learning_rate": 0.0009717576303800595, + "loss": 0.89919716, + "num_input_tokens_seen": 58280480, + "router_z_loss_mlp": 0.5859375, + "step": 698, + "time_per_iteration": 3.046081304550171 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067451, + "balance_loss_mlp": 1.00870502, + "epoch": 0.13447479799923048, + "flos": 509819828736.0, + "grad_norm": 0.04099621387271608, + "language_loss": 0.8689754, + "learning_rate": 0.0009716543157865975, + "loss": 0.87964994, + "num_input_tokens_seen": 58352464, + "router_z_loss_mlp": 0.5859375, + "step": 699, + "time_per_iteration": 2.7116739749908447 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067445, + "balance_loss_mlp": 1.00893724, + "epoch": 0.13466717968449404, + "flos": 899060158464.0, + "grad_norm": 0.03800712734159662, + "language_loss": 0.8517018, + "learning_rate": 0.0009715508180806907, + "loss": 0.86237621, + "num_input_tokens_seen": 58437216, + "router_z_loss_mlp": 0.58349609, + "step": 700, + "time_per_iteration": 3.184324026107788 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066328, + "balance_loss_mlp": 1.00777256, + "epoch": 0.1348595613697576, + "flos": 991695552768.0, + "grad_norm": 0.036541360765650906, + "language_loss": 0.91219282, + "learning_rate": 0.0009714471373025202, + "loss": 0.92285609, + "num_input_tokens_seen": 58533152, + "router_z_loss_mlp": 0.58398438, + "step": 701, + "time_per_iteration": 3.4654104709625244 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064657, + "balance_loss_mlp": 1.0059582, + "epoch": 0.13505194305502116, + "flos": 488812414464.0, + "grad_norm": 0.038284394577449095, + "language_loss": 0.90020943, + "learning_rate": 0.0009713432734923386, + "loss": 0.91085601, + "num_input_tokens_seen": 58601376, + "router_z_loss_mlp": 0.58544922, + "step": 702, + "time_per_iteration": 2.6416144371032715 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067313, + "balance_loss_mlp": 1.00842357, + "epoch": 0.13524432474028472, + "flos": 614520562944.0, + "grad_norm": 0.03635122731697363, + "language_loss": 0.87970936, + "learning_rate": 0.0009712392266904696, + "loss": 0.89038247, + "num_input_tokens_seen": 58676608, + "router_z_loss_mlp": 0.58740234, + "step": 703, + "time_per_iteration": 2.73490309715271 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066615, + "balance_loss_mlp": 1.00782144, + "epoch": 0.13543670642554828, + "flos": 906275838720.0, + "grad_norm": 0.040994558071305906, + "language_loss": 0.86788869, + "learning_rate": 0.0009711349969373076, + "loss": 0.87855482, + "num_input_tokens_seen": 58759264, + "router_z_loss_mlp": 0.58642578, + "step": 704, + "time_per_iteration": 3.1667368412017822 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066356, + "balance_loss_mlp": 1.00765777, + "epoch": 0.13562908811081184, + "flos": 551748141312.0, + "grad_norm": 0.040707128775991024, + "language_loss": 0.81448901, + "learning_rate": 0.0009710305842733178, + "loss": 0.82515258, + "num_input_tokens_seen": 58834800, + "router_z_loss_mlp": 0.58544922, + "step": 705, + "time_per_iteration": 2.7456798553466797 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064755, + "balance_loss_mlp": 1.00648558, + "epoch": 0.1358214697960754, + "flos": 509038284288.0, + "grad_norm": 0.04235852839756889, + "language_loss": 0.91048527, + "learning_rate": 0.0009709259887390373, + "loss": 0.9211328, + "num_input_tokens_seen": 58901712, + "router_z_loss_mlp": 0.58105469, + "step": 706, + "time_per_iteration": 2.614645481109619 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067613, + "balance_loss_mlp": 1.0098201, + "epoch": 0.136013851481339, + "flos": 529924189440.0, + "grad_norm": 0.045207837368539144, + "language_loss": 0.92539275, + "learning_rate": 0.0009708212103750737, + "loss": 0.93606889, + "num_input_tokens_seen": 58967824, + "router_z_loss_mlp": 0.57617188, + "step": 707, + "time_per_iteration": 2.5839250087738037 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073243, + "balance_loss_mlp": 1.01525927, + "epoch": 0.13620623316660255, + "flos": 660321604608.0, + "grad_norm": 0.04139663244511697, + "language_loss": 0.88690269, + "learning_rate": 0.0009707162492221051, + "loss": 0.8976351, + "num_input_tokens_seen": 59045040, + "router_z_loss_mlp": 0.578125, + "step": 708, + "time_per_iteration": 2.8753738403320312 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106855, + "balance_loss_mlp": 1.01051939, + "epoch": 0.1363986148518661, + "flos": 673083918336.0, + "grad_norm": 0.04870142688483653, + "language_loss": 0.89226341, + "learning_rate": 0.0009706111053208815, + "loss": 0.90294898, + "num_input_tokens_seen": 59117216, + "router_z_loss_mlp": 0.57861328, + "step": 709, + "time_per_iteration": 2.792555570602417 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065256, + "balance_loss_mlp": 1.0069865, + "epoch": 0.13659099653712967, + "flos": 474004866048.0, + "grad_norm": 0.041589756065930725, + "language_loss": 0.87875092, + "learning_rate": 0.0009705057787122232, + "loss": 0.88940346, + "num_input_tokens_seen": 59183056, + "router_z_loss_mlp": 0.58105469, + "step": 710, + "time_per_iteration": 2.5474488735198975 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106841, + "balance_loss_mlp": 1.00980711, + "epoch": 0.13678337822239323, + "flos": 453648738816.0, + "grad_norm": 0.03947638411835938, + "language_loss": 0.92397159, + "learning_rate": 0.0009704002694370216, + "loss": 0.93465567, + "num_input_tokens_seen": 59247312, + "router_z_loss_mlp": 0.58447266, + "step": 711, + "time_per_iteration": 2.5812153816223145 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107162, + "balance_loss_mlp": 1.01306474, + "epoch": 0.13697575990765679, + "flos": 520626336000.0, + "grad_norm": 0.04103000756090051, + "language_loss": 0.88202429, + "learning_rate": 0.0009702945775362388, + "loss": 0.89274049, + "num_input_tokens_seen": 59317968, + "router_z_loss_mlp": 0.58398438, + "step": 712, + "time_per_iteration": 2.6084940433502197 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067699, + "balance_loss_mlp": 1.00914371, + "epoch": 0.13716814159292035, + "flos": 481366354944.0, + "grad_norm": 0.04017855754763819, + "language_loss": 0.88458985, + "learning_rate": 0.0009701887030509086, + "loss": 0.89526689, + "num_input_tokens_seen": 59387936, + "router_z_loss_mlp": 0.58398438, + "step": 713, + "time_per_iteration": 2.6361663341522217 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072477, + "balance_loss_mlp": 1.01425505, + "epoch": 0.1373605232781839, + "flos": 546750727680.0, + "grad_norm": 0.04169009137316196, + "language_loss": 0.92536753, + "learning_rate": 0.0009700826460221346, + "loss": 0.93609238, + "num_input_tokens_seen": 59460624, + "router_z_loss_mlp": 0.58056641, + "step": 714, + "time_per_iteration": 2.6997907161712646 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068882, + "balance_loss_mlp": 1.01080275, + "epoch": 0.1375529049634475, + "flos": 710071197696.0, + "grad_norm": 0.042053375460334, + "language_loss": 0.94210052, + "learning_rate": 0.0009699764064910921, + "loss": 0.95278937, + "num_input_tokens_seen": 59536752, + "router_z_loss_mlp": 0.57910156, + "step": 715, + "time_per_iteration": 2.870835542678833 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069166, + "balance_loss_mlp": 1.01099169, + "epoch": 0.13774528664871105, + "flos": 487677036288.0, + "grad_norm": 0.04018028408764831, + "language_loss": 0.88572168, + "learning_rate": 0.0009698699844990268, + "loss": 0.89641333, + "num_input_tokens_seen": 59608128, + "router_z_loss_mlp": 0.58007812, + "step": 716, + "time_per_iteration": 2.6557233333587646 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106644, + "balance_loss_mlp": 1.00817037, + "epoch": 0.1379376683339746, + "flos": 681459276288.0, + "grad_norm": 0.03631196674856893, + "language_loss": 0.89737439, + "learning_rate": 0.0009697633800872555, + "loss": 0.90803885, + "num_input_tokens_seen": 59685120, + "router_z_loss_mlp": 0.58105469, + "step": 717, + "time_per_iteration": 2.9236202239990234 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068394, + "balance_loss_mlp": 1.00998127, + "epoch": 0.13813005001923817, + "flos": 612226473984.0, + "grad_norm": 0.040527486313319094, + "language_loss": 0.9214747, + "learning_rate": 0.0009696565932971655, + "loss": 0.93215865, + "num_input_tokens_seen": 59763376, + "router_z_loss_mlp": 0.58251953, + "step": 718, + "time_per_iteration": 2.8931636810302734 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072264, + "balance_loss_mlp": 1.01394677, + "epoch": 0.13832243170450173, + "flos": 589927179264.0, + "grad_norm": 0.042228364331249636, + "language_loss": 0.91184157, + "learning_rate": 0.0009695496241702153, + "loss": 0.92256421, + "num_input_tokens_seen": 59836800, + "router_z_loss_mlp": 0.58154297, + "step": 719, + "time_per_iteration": 2.8006720542907715 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010673, + "balance_loss_mlp": 1.00917327, + "epoch": 0.1385148133897653, + "flos": 701320618752.0, + "grad_norm": 0.04012183054192491, + "language_loss": 0.87174737, + "learning_rate": 0.0009694424727479339, + "loss": 0.88242036, + "num_input_tokens_seen": 59914720, + "router_z_loss_mlp": 0.57958984, + "step": 720, + "time_per_iteration": 2.9363977909088135 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066459, + "balance_loss_mlp": 1.0081414, + "epoch": 0.13870719507502885, + "flos": 599367929088.0, + "grad_norm": 0.04032336097495746, + "language_loss": 0.90803999, + "learning_rate": 0.0009693351390719213, + "loss": 0.91870457, + "num_input_tokens_seen": 59984544, + "router_z_loss_mlp": 0.58154297, + "step": 721, + "time_per_iteration": 2.7786271572113037 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070816, + "balance_loss_mlp": 1.01273775, + "epoch": 0.1388995767602924, + "flos": 587749741824.0, + "grad_norm": 0.04179929290372652, + "language_loss": 0.92465305, + "learning_rate": 0.000969227623183848, + "loss": 0.93536115, + "num_input_tokens_seen": 60057056, + "router_z_loss_mlp": 0.57910156, + "step": 722, + "time_per_iteration": 2.777453660964966 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066594, + "balance_loss_mlp": 1.00870621, + "epoch": 0.139091958445556, + "flos": 652363263744.0, + "grad_norm": 0.041578114374578125, + "language_loss": 0.92603219, + "learning_rate": 0.0009691199251254554, + "loss": 0.9366982, + "num_input_tokens_seen": 60133232, + "router_z_loss_mlp": 0.57714844, + "step": 723, + "time_per_iteration": 2.813610553741455 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063653, + "balance_loss_mlp": 1.00586045, + "epoch": 0.13928434013081956, + "flos": 576906296064.0, + "grad_norm": 0.03663552971403626, + "language_loss": 0.88541949, + "learning_rate": 0.0009690120449385555, + "loss": 0.89605606, + "num_input_tokens_seen": 60207104, + "router_z_loss_mlp": 0.57617188, + "step": 724, + "time_per_iteration": 2.7604424953460693 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063903, + "balance_loss_mlp": 1.00582433, + "epoch": 0.13947672181608312, + "flos": 564315068928.0, + "grad_norm": 0.034271197388489986, + "language_loss": 0.93926299, + "learning_rate": 0.0009689039826650312, + "loss": 0.94990206, + "num_input_tokens_seen": 60277920, + "router_z_loss_mlp": 0.57910156, + "step": 725, + "time_per_iteration": 2.7856695652008057 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095871, + "balance_loss_mlp": 1.03941345, + "epoch": 0.13966910350134668, + "flos": 1524951988224.0, + "grad_norm": 0.03128450212810151, + "language_loss": 0.76523066, + "learning_rate": 0.000968795738346836, + "loss": 0.77618933, + "num_input_tokens_seen": 60494224, + "router_z_loss_mlp": 0.56640625, + "step": 726, + "time_per_iteration": 4.903306245803833 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067061, + "balance_loss_mlp": 1.00902975, + "epoch": 0.13986148518661023, + "flos": 500856367104.0, + "grad_norm": 0.052764167671210026, + "language_loss": 0.89172196, + "learning_rate": 0.0009686873120259941, + "loss": 0.90239263, + "num_input_tokens_seen": 60562176, + "router_z_loss_mlp": 0.57861328, + "step": 727, + "time_per_iteration": 2.6450552940368652 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072692, + "balance_loss_mlp": 1.01518559, + "epoch": 0.1400538668718738, + "flos": 599850074880.0, + "grad_norm": 0.036488800736072635, + "language_loss": 0.88047451, + "learning_rate": 0.0009685787037446004, + "loss": 0.89120144, + "num_input_tokens_seen": 60631472, + "router_z_loss_mlp": 0.57324219, + "step": 728, + "time_per_iteration": 2.763434648513794 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072512, + "balance_loss_mlp": 1.01481462, + "epoch": 0.14024624855713735, + "flos": 595169556480.0, + "grad_norm": 0.047561697925478, + "language_loss": 0.88858587, + "learning_rate": 0.0009684699135448201, + "loss": 0.89931101, + "num_input_tokens_seen": 60703488, + "router_z_loss_mlp": 0.57519531, + "step": 729, + "time_per_iteration": 2.745037078857422 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067277, + "balance_loss_mlp": 1.00962722, + "epoch": 0.1404386302424009, + "flos": 507586010880.0, + "grad_norm": 0.03094406590189725, + "language_loss": 0.9291476, + "learning_rate": 0.0009683609414688895, + "loss": 0.93982029, + "num_input_tokens_seen": 60773936, + "router_z_loss_mlp": 0.57470703, + "step": 730, + "time_per_iteration": 2.7384650707244873 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068471, + "balance_loss_mlp": 1.01039195, + "epoch": 0.14063101192766447, + "flos": 574515975936.0, + "grad_norm": 0.037780385553924656, + "language_loss": 0.87345785, + "learning_rate": 0.0009682517875591154, + "loss": 0.88414258, + "num_input_tokens_seen": 60851120, + "router_z_loss_mlp": 0.57910156, + "step": 731, + "time_per_iteration": 2.752572536468506 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071345, + "balance_loss_mlp": 1.0129801, + "epoch": 0.14082339361292806, + "flos": 565765396992.0, + "grad_norm": 0.03832964150159033, + "language_loss": 0.87666118, + "learning_rate": 0.0009681424518578749, + "loss": 0.88737464, + "num_input_tokens_seen": 60924896, + "router_z_loss_mlp": 0.58203125, + "step": 732, + "time_per_iteration": 2.7323830127716064 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068462, + "balance_loss_mlp": 1.01028764, + "epoch": 0.14101577529819162, + "flos": 464583558144.0, + "grad_norm": 0.035957988569031644, + "language_loss": 0.88670099, + "learning_rate": 0.000968032934407616, + "loss": 0.8973856, + "num_input_tokens_seen": 60996016, + "router_z_loss_mlp": 0.58007812, + "step": 733, + "time_per_iteration": 2.6479005813598633 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064525, + "balance_loss_mlp": 1.00644577, + "epoch": 0.14120815698345518, + "flos": 597262423296.0, + "grad_norm": 0.039547782577588224, + "language_loss": 0.82413781, + "learning_rate": 0.0009679232352508571, + "loss": 0.83478296, + "num_input_tokens_seen": 61072016, + "router_z_loss_mlp": 0.57910156, + "step": 734, + "time_per_iteration": 2.7924795150756836 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063696, + "balance_loss_mlp": 1.00599897, + "epoch": 0.14140053866871874, + "flos": 536232925440.0, + "grad_norm": 0.03854566850595878, + "language_loss": 0.82520735, + "learning_rate": 0.0009678133544301871, + "loss": 0.83584428, + "num_input_tokens_seen": 61144528, + "router_z_loss_mlp": 0.57519531, + "step": 735, + "time_per_iteration": 2.658731698989868 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062396, + "balance_loss_mlp": 1.00498438, + "epoch": 0.1415929203539823, + "flos": 521277623040.0, + "grad_norm": 0.0297517777524564, + "language_loss": 0.92917788, + "learning_rate": 0.0009677032919882658, + "loss": 0.93980187, + "num_input_tokens_seen": 61216960, + "router_z_loss_mlp": 0.57226562, + "step": 736, + "time_per_iteration": 2.661276340484619 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068415, + "balance_loss_mlp": 1.0113374, + "epoch": 0.14178530203924586, + "flos": 483302719488.0, + "grad_norm": 0.041037110936195734, + "language_loss": 0.92867804, + "learning_rate": 0.000967593047967823, + "loss": 0.93936217, + "num_input_tokens_seen": 61281312, + "router_z_loss_mlp": 0.56982422, + "step": 737, + "time_per_iteration": 2.52840256690979 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068115, + "balance_loss_mlp": 1.01056099, + "epoch": 0.14197768372450942, + "flos": 677840259072.0, + "grad_norm": 0.04254557939420697, + "language_loss": 0.88126308, + "learning_rate": 0.0009674826224116593, + "loss": 0.89194429, + "num_input_tokens_seen": 61355888, + "router_z_loss_mlp": 0.57373047, + "step": 738, + "time_per_iteration": 2.858147144317627 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074304, + "balance_loss_mlp": 1.0167979, + "epoch": 0.14217006540977298, + "flos": 446992972032.0, + "grad_norm": 0.045930563119643074, + "language_loss": 0.87994051, + "learning_rate": 0.0009673720153626455, + "loss": 0.89068353, + "num_input_tokens_seen": 61424288, + "router_z_loss_mlp": 0.57324219, + "step": 739, + "time_per_iteration": 2.664236545562744 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069407, + "balance_loss_mlp": 1.01199603, + "epoch": 0.14236244709503657, + "flos": 497478422784.0, + "grad_norm": 0.040566684483093814, + "language_loss": 0.88105047, + "learning_rate": 0.0009672612268637235, + "loss": 0.89174449, + "num_input_tokens_seen": 61493344, + "router_z_loss_mlp": 0.57226562, + "step": 740, + "time_per_iteration": 2.634126901626587 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069933, + "balance_loss_mlp": 1.01304626, + "epoch": 0.14255482878030012, + "flos": 649480104192.0, + "grad_norm": 0.05086050125917657, + "language_loss": 0.85906518, + "learning_rate": 0.0009671502569579048, + "loss": 0.86976457, + "num_input_tokens_seen": 61565216, + "router_z_loss_mlp": 0.56884766, + "step": 741, + "time_per_iteration": 2.7642107009887695 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071964, + "balance_loss_mlp": 1.01564944, + "epoch": 0.14274721046556368, + "flos": 537274984704.0, + "grad_norm": 0.037356444744632025, + "language_loss": 0.90824854, + "learning_rate": 0.0009670391056882719, + "loss": 0.91896814, + "num_input_tokens_seen": 61640928, + "router_z_loss_mlp": 0.56445312, + "step": 742, + "time_per_iteration": 2.7307372093200684 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069292, + "balance_loss_mlp": 1.01288199, + "epoch": 0.14293959215082724, + "flos": 958584893184.0, + "grad_norm": 0.03744948002603285, + "language_loss": 0.89976203, + "learning_rate": 0.0009669277730979776, + "loss": 0.91045499, + "num_input_tokens_seen": 61717552, + "router_z_loss_mlp": 0.56494141, + "step": 743, + "time_per_iteration": 3.2251601219177246 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068321, + "balance_loss_mlp": 1.01162553, + "epoch": 0.1431319738360908, + "flos": 694386840576.0, + "grad_norm": 0.037398516399228816, + "language_loss": 0.86562485, + "learning_rate": 0.0009668162592302449, + "loss": 0.87630802, + "num_input_tokens_seen": 61800016, + "router_z_loss_mlp": 0.56738281, + "step": 744, + "time_per_iteration": 2.924435615539551 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067141, + "balance_loss_mlp": 1.01015854, + "epoch": 0.14332435552135436, + "flos": 566503200000.0, + "grad_norm": 0.037819132294000864, + "language_loss": 0.86981773, + "learning_rate": 0.0009667045641283676, + "loss": 0.88048917, + "num_input_tokens_seen": 61865904, + "router_z_loss_mlp": 0.56933594, + "step": 745, + "time_per_iteration": 2.6744887828826904 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071811, + "balance_loss_mlp": 1.01540148, + "epoch": 0.14351673720661792, + "flos": 739696988160.0, + "grad_norm": 0.042480690817339954, + "language_loss": 0.96115947, + "learning_rate": 0.0009665926878357092, + "loss": 0.97187757, + "num_input_tokens_seen": 61945728, + "router_z_loss_mlp": 0.56591797, + "step": 746, + "time_per_iteration": 2.9137520790100098 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069908, + "balance_loss_mlp": 1.0134027, + "epoch": 0.14370911889188148, + "flos": 550352248320.0, + "grad_norm": 0.037361960218361134, + "language_loss": 0.92219329, + "learning_rate": 0.0009664806303957043, + "loss": 0.93289238, + "num_input_tokens_seen": 62016288, + "router_z_loss_mlp": 0.56542969, + "step": 747, + "time_per_iteration": 2.7734382152557373 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010709, + "balance_loss_mlp": 1.01453757, + "epoch": 0.14390150057714507, + "flos": 591590390016.0, + "grad_norm": 0.040803275102161134, + "language_loss": 0.88578373, + "learning_rate": 0.0009663683918518571, + "loss": 0.89649272, + "num_input_tokens_seen": 62097904, + "router_z_loss_mlp": 0.56542969, + "step": 748, + "time_per_iteration": 2.93782114982605 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106688, + "balance_loss_mlp": 1.0103749, + "epoch": 0.14409388226240863, + "flos": 592145445888.0, + "grad_norm": 0.040391516566669984, + "language_loss": 0.87085271, + "learning_rate": 0.0009662559722477428, + "loss": 0.88152146, + "num_input_tokens_seen": 62166736, + "router_z_loss_mlp": 0.56640625, + "step": 749, + "time_per_iteration": 2.696570873260498 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140549, + "balance_loss_mlp": 1.08542633, + "epoch": 0.1442862639476722, + "flos": 1514657762304.0, + "grad_norm": 0.043557664449290004, + "language_loss": 0.7616297, + "learning_rate": 0.0009661433716270062, + "loss": 0.77303517, + "num_input_tokens_seen": 62402512, + "router_z_loss_mlp": 0.55273438, + "step": 750, + "time_per_iteration": 5.024984836578369 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106928, + "balance_loss_mlp": 1.01263177, + "epoch": 0.14447864563293575, + "flos": 497856556032.0, + "grad_norm": 0.03544029116038115, + "language_loss": 0.90697813, + "learning_rate": 0.0009660305900333632, + "loss": 0.91767091, + "num_input_tokens_seen": 62473408, + "router_z_loss_mlp": 0.56738281, + "step": 751, + "time_per_iteration": 2.678037166595459 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078207, + "balance_loss_mlp": 1.02165437, + "epoch": 0.1446710273181993, + "flos": 590795239680.0, + "grad_norm": 0.04141635113788076, + "language_loss": 0.83649188, + "learning_rate": 0.0009659176275105992, + "loss": 0.84727395, + "num_input_tokens_seen": 62547440, + "router_z_loss_mlp": 0.56640625, + "step": 752, + "time_per_iteration": 2.714871883392334 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076963, + "balance_loss_mlp": 1.02074409, + "epoch": 0.14486340900346287, + "flos": 587013884160.0, + "grad_norm": 0.03637909883196532, + "language_loss": 0.87195009, + "learning_rate": 0.0009658044841025701, + "loss": 0.88271976, + "num_input_tokens_seen": 62620224, + "router_z_loss_mlp": 0.56396484, + "step": 753, + "time_per_iteration": 2.7753467559814453 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075522, + "balance_loss_mlp": 1.01901722, + "epoch": 0.14505579068872643, + "flos": 505741019904.0, + "grad_norm": 0.041255413340114844, + "language_loss": 0.82866222, + "learning_rate": 0.0009656911598532021, + "loss": 0.83941746, + "num_input_tokens_seen": 62690464, + "router_z_loss_mlp": 0.56591797, + "step": 754, + "time_per_iteration": 2.657831907272339 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077419, + "balance_loss_mlp": 1.02119958, + "epoch": 0.14524817237399, + "flos": 487816041984.0, + "grad_norm": 0.03637506550278126, + "language_loss": 0.9138847, + "learning_rate": 0.0009655776548064917, + "loss": 0.92465889, + "num_input_tokens_seen": 62762240, + "router_z_loss_mlp": 0.56347656, + "step": 755, + "time_per_iteration": 2.6499805450439453 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070806, + "balance_loss_mlp": 1.01477778, + "epoch": 0.14544055405925355, + "flos": 729450394368.0, + "grad_norm": 0.037726189244012505, + "language_loss": 0.89799821, + "learning_rate": 0.0009654639690065054, + "loss": 0.90870631, + "num_input_tokens_seen": 62839760, + "router_z_loss_mlp": 0.56201172, + "step": 756, + "time_per_iteration": 2.913638114929199 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070686, + "balance_loss_mlp": 1.01461017, + "epoch": 0.14563293574451713, + "flos": 594787532544.0, + "grad_norm": 0.03772784195488967, + "language_loss": 0.8914414, + "learning_rate": 0.00096535010249738, + "loss": 0.90214825, + "num_input_tokens_seen": 62910336, + "router_z_loss_mlp": 0.5625, + "step": 757, + "time_per_iteration": 2.721640110015869 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067675, + "balance_loss_mlp": 1.01121712, + "epoch": 0.1458253174297807, + "flos": 561623404800.0, + "grad_norm": 0.04410713855467511, + "language_loss": 0.84106696, + "learning_rate": 0.0009652360553233224, + "loss": 0.8517437, + "num_input_tokens_seen": 62988160, + "router_z_loss_mlp": 0.56591797, + "step": 758, + "time_per_iteration": 2.771986484527588 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080421, + "balance_loss_mlp": 1.02625275, + "epoch": 0.14601769911504425, + "flos": 1561189804032.0, + "grad_norm": 0.021986445825835567, + "language_loss": 0.73773748, + "learning_rate": 0.0009651218275286093, + "loss": 0.74854165, + "num_input_tokens_seen": 63224704, + "router_z_loss_mlp": 0.54296875, + "step": 759, + "time_per_iteration": 4.951657056808472 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064246, + "balance_loss_mlp": 1.00712132, + "epoch": 0.1462100808003078, + "flos": 867823646976.0, + "grad_norm": 0.03532102179266325, + "language_loss": 0.82350075, + "learning_rate": 0.0009650074191575883, + "loss": 0.83414322, + "num_input_tokens_seen": 63312400, + "router_z_loss_mlp": 0.56982422, + "step": 760, + "time_per_iteration": 3.2275402545928955 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078155, + "balance_loss_mlp": 1.02083874, + "epoch": 0.14640246248557137, + "flos": 524030525184.0, + "grad_norm": 0.0394901057776484, + "language_loss": 0.87295806, + "learning_rate": 0.0009648928302546766, + "loss": 0.88373965, + "num_input_tokens_seen": 63387792, + "router_z_loss_mlp": 0.57177734, + "step": 761, + "time_per_iteration": 2.6739044189453125 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108061, + "balance_loss_mlp": 1.02319896, + "epoch": 0.14659484417083493, + "flos": 1032242556672.0, + "grad_norm": 0.0381114836464334, + "language_loss": 0.86423808, + "learning_rate": 0.0009647780608643613, + "loss": 0.87504417, + "num_input_tokens_seen": 63475632, + "router_z_loss_mlp": 0.57226562, + "step": 762, + "time_per_iteration": 3.355055332183838 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084881, + "balance_loss_mlp": 1.02742219, + "epoch": 0.1467872258560985, + "flos": 501657353472.0, + "grad_norm": 0.04884269069306727, + "language_loss": 0.89483184, + "learning_rate": 0.0009646631110312001, + "loss": 0.90568066, + "num_input_tokens_seen": 63546080, + "router_z_loss_mlp": 0.57275391, + "step": 763, + "time_per_iteration": 2.638404607772827 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074108, + "balance_loss_mlp": 1.01683939, + "epoch": 0.14697960754136205, + "flos": 548936913408.0, + "grad_norm": 0.030517371118051684, + "language_loss": 0.89587164, + "learning_rate": 0.0009645479807998203, + "loss": 0.90661263, + "num_input_tokens_seen": 63622464, + "router_z_loss_mlp": 0.57128906, + "step": 764, + "time_per_iteration": 2.7784340381622314 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066881, + "balance_loss_mlp": 1.0099467, + "epoch": 0.14717198922662564, + "flos": 518902854144.0, + "grad_norm": 0.03321738346858149, + "language_loss": 0.93693149, + "learning_rate": 0.0009644326702149196, + "loss": 0.94760031, + "num_input_tokens_seen": 63694736, + "router_z_loss_mlp": 0.56884766, + "step": 765, + "time_per_iteration": 2.712148904800415 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066407, + "balance_loss_mlp": 1.009902, + "epoch": 0.1473643709118892, + "flos": 733484483328.0, + "grad_norm": 0.042813367444357694, + "language_loss": 0.86227441, + "learning_rate": 0.0009643171793212653, + "loss": 0.87293845, + "num_input_tokens_seen": 63779072, + "router_z_loss_mlp": 0.56591797, + "step": 766, + "time_per_iteration": 3.0350003242492676 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069694, + "balance_loss_mlp": 1.01357007, + "epoch": 0.14755675259715276, + "flos": 621669169152.0, + "grad_norm": 0.04397904632105779, + "language_loss": 0.90884185, + "learning_rate": 0.0009642015081636952, + "loss": 0.91953874, + "num_input_tokens_seen": 63847472, + "router_z_loss_mlp": 0.56298828, + "step": 767, + "time_per_iteration": 2.6967811584472656 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067551, + "balance_loss_mlp": 1.01185656, + "epoch": 0.14774913428241632, + "flos": 453173395968.0, + "grad_norm": 0.040409537343205924, + "language_loss": 0.89756525, + "learning_rate": 0.0009640856567871166, + "loss": 0.90824074, + "num_input_tokens_seen": 63912496, + "router_z_loss_mlp": 0.55859375, + "step": 768, + "time_per_iteration": 2.5016207695007324 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063728, + "balance_loss_mlp": 1.00803363, + "epoch": 0.14794151596767988, + "flos": 838655702784.0, + "grad_norm": 0.03518214363191685, + "language_loss": 0.90024096, + "learning_rate": 0.0009639696252365072, + "loss": 0.91087824, + "num_input_tokens_seen": 63990832, + "router_z_loss_mlp": 0.55859375, + "step": 769, + "time_per_iteration": 3.0535316467285156 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064247, + "balance_loss_mlp": 1.00874364, + "epoch": 0.14813389765294344, + "flos": 687405430272.0, + "grad_norm": 0.03578436651039587, + "language_loss": 0.83073497, + "learning_rate": 0.0009638534135569144, + "loss": 0.8413775, + "num_input_tokens_seen": 64067552, + "router_z_loss_mlp": 0.55664062, + "step": 770, + "time_per_iteration": 2.8983683586120605 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065854, + "balance_loss_mlp": 1.01039767, + "epoch": 0.148326279338207, + "flos": 510944513280.0, + "grad_norm": 0.03931230706380594, + "language_loss": 0.91550887, + "learning_rate": 0.0009637370217934554, + "loss": 0.92616743, + "num_input_tokens_seen": 64140336, + "router_z_loss_mlp": 0.55615234, + "step": 771, + "time_per_iteration": 2.6311967372894287 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061556, + "balance_loss_mlp": 1.00590932, + "epoch": 0.14851866102347056, + "flos": 589332272640.0, + "grad_norm": 0.03214719611667013, + "language_loss": 0.8436957, + "learning_rate": 0.0009636204499913175, + "loss": 0.85431123, + "num_input_tokens_seen": 64223472, + "router_z_loss_mlp": 0.55810547, + "step": 772, + "time_per_iteration": 2.8748695850372314 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066056, + "balance_loss_mlp": 1.01069546, + "epoch": 0.14871104270873411, + "flos": 692248286976.0, + "grad_norm": 0.034034874980260935, + "language_loss": 0.89455193, + "learning_rate": 0.0009635036981957581, + "loss": 0.9052124, + "num_input_tokens_seen": 64299872, + "router_z_loss_mlp": 0.55517578, + "step": 773, + "time_per_iteration": 2.8526012897491455 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063541, + "balance_loss_mlp": 1.00789392, + "epoch": 0.1489034243939977, + "flos": 656283624960.0, + "grad_norm": 0.03841304714783139, + "language_loss": 0.91971016, + "learning_rate": 0.0009633867664521043, + "loss": 0.93034559, + "num_input_tokens_seen": 64377152, + "router_z_loss_mlp": 0.55810547, + "step": 774, + "time_per_iteration": 2.823320150375366 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063492, + "balance_loss_mlp": 1.00736797, + "epoch": 0.14909580607926126, + "flos": 476796652032.0, + "grad_norm": 0.0404919947218097, + "language_loss": 0.88328946, + "learning_rate": 0.0009632696548057527, + "loss": 0.89392436, + "num_input_tokens_seen": 64443008, + "router_z_loss_mlp": 0.56298828, + "step": 775, + "time_per_iteration": 2.5567190647125244 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072265, + "balance_loss_mlp": 1.01609385, + "epoch": 0.14928818776452482, + "flos": 612284799744.0, + "grad_norm": 0.03821441574416946, + "language_loss": 0.86270714, + "learning_rate": 0.0009631523633021704, + "loss": 0.87342978, + "num_input_tokens_seen": 64519776, + "router_z_loss_mlp": 0.56347656, + "step": 776, + "time_per_iteration": 2.783348321914673 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068042, + "balance_loss_mlp": 1.01187015, + "epoch": 0.14948056944978838, + "flos": 562917230592.0, + "grad_norm": 0.039790220133906304, + "language_loss": 0.90072912, + "learning_rate": 0.0009630348919868936, + "loss": 0.9114095, + "num_input_tokens_seen": 64593712, + "router_z_loss_mlp": 0.56347656, + "step": 777, + "time_per_iteration": 2.7115018367767334 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073438, + "balance_loss_mlp": 1.01736236, + "epoch": 0.14967295113505194, + "flos": 450112346880.0, + "grad_norm": 0.044777999480791836, + "language_loss": 0.82363755, + "learning_rate": 0.0009629172409055293, + "loss": 0.83437192, + "num_input_tokens_seen": 64658448, + "router_z_loss_mlp": 0.5625, + "step": 778, + "time_per_iteration": 2.578178882598877 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079079, + "balance_loss_mlp": 1.02319324, + "epoch": 0.1498653328203155, + "flos": 572429912064.0, + "grad_norm": 0.03699200582710457, + "language_loss": 0.8876617, + "learning_rate": 0.0009627994101037531, + "loss": 0.89845246, + "num_input_tokens_seen": 64734144, + "router_z_loss_mlp": 0.56054688, + "step": 779, + "time_per_iteration": 2.7733986377716064 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107155, + "balance_loss_mlp": 1.01556909, + "epoch": 0.15005771450557906, + "flos": 632408602368.0, + "grad_norm": 0.04036301028093645, + "language_loss": 0.90477651, + "learning_rate": 0.0009626813996273114, + "loss": 0.91549194, + "num_input_tokens_seen": 64813456, + "router_z_loss_mlp": 0.56152344, + "step": 780, + "time_per_iteration": 2.8476834297180176 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064638, + "balance_loss_mlp": 1.00884771, + "epoch": 0.15025009619084262, + "flos": 579166358784.0, + "grad_norm": 0.036574622666600026, + "language_loss": 0.89819682, + "learning_rate": 0.0009625632095220198, + "loss": 0.90884316, + "num_input_tokens_seen": 64896816, + "router_z_loss_mlp": 0.55957031, + "step": 781, + "time_per_iteration": 2.8279531002044678 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065406, + "balance_loss_mlp": 1.00961614, + "epoch": 0.1504424778761062, + "flos": 484857060096.0, + "grad_norm": 0.04416373966784989, + "language_loss": 0.8858574, + "learning_rate": 0.0009624448398337637, + "loss": 0.89651144, + "num_input_tokens_seen": 64964176, + "router_z_loss_mlp": 0.55957031, + "step": 782, + "time_per_iteration": 2.512742280960083 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062842, + "balance_loss_mlp": 1.0075767, + "epoch": 0.15063485956136977, + "flos": 763895708928.0, + "grad_norm": 0.03630111779859241, + "language_loss": 0.90811443, + "learning_rate": 0.0009623262906084984, + "loss": 0.9187429, + "num_input_tokens_seen": 65042592, + "router_z_loss_mlp": 0.55419922, + "step": 783, + "time_per_iteration": 3.0409936904907227 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066453, + "balance_loss_mlp": 1.01156867, + "epoch": 0.15082724124663333, + "flos": 498676984320.0, + "grad_norm": 0.03758683048429116, + "language_loss": 0.91324949, + "learning_rate": 0.0009622075618922486, + "loss": 0.92391407, + "num_input_tokens_seen": 65114576, + "router_z_loss_mlp": 0.55029297, + "step": 784, + "time_per_iteration": 2.716580629348755 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066623, + "balance_loss_mlp": 1.01188219, + "epoch": 0.15101962293189689, + "flos": 510722882304.0, + "grad_norm": 0.0361748672236624, + "language_loss": 0.88713133, + "learning_rate": 0.0009620886537311091, + "loss": 0.89779752, + "num_input_tokens_seen": 65186640, + "router_z_loss_mlp": 0.54882812, + "step": 785, + "time_per_iteration": 2.7197515964508057 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065435, + "balance_loss_mlp": 1.01069367, + "epoch": 0.15121200461716044, + "flos": 458702532864.0, + "grad_norm": 0.0476660620131034, + "language_loss": 0.86751854, + "learning_rate": 0.000961969566171244, + "loss": 0.87817287, + "num_input_tokens_seen": 65252112, + "router_z_loss_mlp": 0.54882812, + "step": 786, + "time_per_iteration": 2.519826650619507 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063397, + "balance_loss_mlp": 1.00865602, + "epoch": 0.151404386302424, + "flos": 539017908480.0, + "grad_norm": 0.0401982478312821, + "language_loss": 0.91594857, + "learning_rate": 0.0009618502992588873, + "loss": 0.92658257, + "num_input_tokens_seen": 65318912, + "router_z_loss_mlp": 0.54882812, + "step": 787, + "time_per_iteration": 2.6427645683288574 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076455, + "balance_loss_mlp": 1.02133262, + "epoch": 0.15159676798768756, + "flos": 689617860864.0, + "grad_norm": 0.04258050045209434, + "language_loss": 0.8916502, + "learning_rate": 0.0009617308530403424, + "loss": 0.9024148, + "num_input_tokens_seen": 65395424, + "router_z_loss_mlp": 0.55273438, + "step": 788, + "time_per_iteration": 3.0662577152252197 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106461, + "balance_loss_mlp": 1.00958323, + "epoch": 0.15178914967295112, + "flos": 546433832448.0, + "grad_norm": 0.03354297731817266, + "language_loss": 0.88695067, + "learning_rate": 0.0009616112275619825, + "loss": 0.89759684, + "num_input_tokens_seen": 65470480, + "router_z_loss_mlp": 0.55175781, + "step": 789, + "time_per_iteration": 2.7230606079101562 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065845, + "balance_loss_mlp": 1.01081765, + "epoch": 0.1519815313582147, + "flos": 512815749120.0, + "grad_norm": 0.03087624340708216, + "language_loss": 0.85391772, + "learning_rate": 0.0009614914228702503, + "loss": 0.86457616, + "num_input_tokens_seen": 65544720, + "router_z_loss_mlp": 0.55175781, + "step": 790, + "time_per_iteration": 2.6690316200256348 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075082, + "balance_loss_mlp": 1.02024603, + "epoch": 0.15217391304347827, + "flos": 685458372096.0, + "grad_norm": 0.03877155611381102, + "language_loss": 0.90952718, + "learning_rate": 0.0009613714390116581, + "loss": 0.92027801, + "num_input_tokens_seen": 65627872, + "router_z_loss_mlp": 0.54980469, + "step": 791, + "time_per_iteration": 3.006898880004883 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069036, + "balance_loss_mlp": 1.01396108, + "epoch": 0.15236629472874183, + "flos": 645446982144.0, + "grad_norm": 0.03750254169389994, + "language_loss": 0.87660968, + "learning_rate": 0.0009612512760327879, + "loss": 0.88730001, + "num_input_tokens_seen": 65705264, + "router_z_loss_mlp": 0.55224609, + "step": 792, + "time_per_iteration": 2.858262062072754 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068429, + "balance_loss_mlp": 1.01297235, + "epoch": 0.1525586764140054, + "flos": 413765660928.0, + "grad_norm": 0.044925092089749936, + "language_loss": 0.86468709, + "learning_rate": 0.0009611309339802909, + "loss": 0.87537134, + "num_input_tokens_seen": 65768592, + "router_z_loss_mlp": 0.55615234, + "step": 793, + "time_per_iteration": 2.498229742050171 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070738, + "balance_loss_mlp": 1.01485312, + "epoch": 0.15275105809926895, + "flos": 804234687744.0, + "grad_norm": 0.03634630877191588, + "language_loss": 0.85518378, + "learning_rate": 0.0009610104129008881, + "loss": 0.8658911, + "num_input_tokens_seen": 65852432, + "router_z_loss_mlp": 0.56054688, + "step": 794, + "time_per_iteration": 3.119896173477173 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064568, + "balance_loss_mlp": 1.0088737, + "epoch": 0.1529434397845325, + "flos": 613543632384.0, + "grad_norm": 0.039196324818253456, + "language_loss": 0.89691782, + "learning_rate": 0.0009608897128413701, + "loss": 0.90756351, + "num_input_tokens_seen": 65927904, + "router_z_loss_mlp": 0.55859375, + "step": 795, + "time_per_iteration": 2.7244484424591064 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065096, + "balance_loss_mlp": 1.00949657, + "epoch": 0.15313582146979607, + "flos": 616472478720.0, + "grad_norm": 0.031652256183926086, + "language_loss": 0.86697376, + "learning_rate": 0.0009607688338485965, + "loss": 0.87762469, + "num_input_tokens_seen": 66006800, + "router_z_loss_mlp": 0.55761719, + "step": 796, + "time_per_iteration": 2.859959363937378 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106534, + "balance_loss_mlp": 1.00997913, + "epoch": 0.15332820315505963, + "flos": 794993214720.0, + "grad_norm": 0.036135713167076366, + "language_loss": 0.91464871, + "learning_rate": 0.0009606477759694969, + "loss": 0.92530215, + "num_input_tokens_seen": 66088608, + "router_z_loss_mlp": 0.55517578, + "step": 797, + "time_per_iteration": 3.0383169651031494 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063804, + "balance_loss_mlp": 1.00806129, + "epoch": 0.1535205848403232, + "flos": 551257247232.0, + "grad_norm": 0.04267360012583918, + "language_loss": 0.89290035, + "learning_rate": 0.0009605265392510703, + "loss": 0.90353841, + "num_input_tokens_seen": 66153616, + "router_z_loss_mlp": 0.55908203, + "step": 798, + "time_per_iteration": 2.642423152923584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063071, + "balance_loss_mlp": 1.00732899, + "epoch": 0.15371296652558677, + "flos": 536979476736.0, + "grad_norm": 0.03662373873498648, + "language_loss": 0.93232477, + "learning_rate": 0.0009604051237403846, + "loss": 0.94295549, + "num_input_tokens_seen": 66219472, + "router_z_loss_mlp": 0.55908203, + "step": 799, + "time_per_iteration": 2.6661648750305176 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062966, + "balance_loss_mlp": 1.00693774, + "epoch": 0.15390534821085033, + "flos": 396090504192.0, + "grad_norm": 0.042222005302764924, + "language_loss": 0.87381375, + "learning_rate": 0.0009602835294845776, + "loss": 0.8844434, + "num_input_tokens_seen": 66281456, + "router_z_loss_mlp": 0.56201172, + "step": 800, + "time_per_iteration": 2.4529898166656494 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060851, + "balance_loss_mlp": 1.00520432, + "epoch": 0.1540977298961139, + "flos": 536886157824.0, + "grad_norm": 0.03888031973735598, + "language_loss": 0.91938102, + "learning_rate": 0.0009601617565308565, + "loss": 0.92998952, + "num_input_tokens_seen": 66348160, + "router_z_loss_mlp": 0.55810547, + "step": 801, + "time_per_iteration": 2.6380698680877686 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064031, + "balance_loss_mlp": 1.0085746, + "epoch": 0.15429011158137745, + "flos": 725091628800.0, + "grad_norm": 0.03523983772327724, + "language_loss": 0.87975162, + "learning_rate": 0.0009600398049264977, + "loss": 0.89039195, + "num_input_tokens_seen": 66430576, + "router_z_loss_mlp": 0.55615234, + "step": 802, + "time_per_iteration": 2.9610986709594727 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064973, + "balance_loss_mlp": 1.00970769, + "epoch": 0.154482493266641, + "flos": 621749849088.0, + "grad_norm": 0.04424510077845192, + "language_loss": 0.93353879, + "learning_rate": 0.0009599176747188469, + "loss": 0.94418848, + "num_input_tokens_seen": 66506480, + "router_z_loss_mlp": 0.55419922, + "step": 803, + "time_per_iteration": 2.883296251296997 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065036, + "balance_loss_mlp": 1.00981843, + "epoch": 0.15467487495190457, + "flos": 526720243968.0, + "grad_norm": 0.03833070581853241, + "language_loss": 0.84471631, + "learning_rate": 0.0009597953659553196, + "loss": 0.85536671, + "num_input_tokens_seen": 66577680, + "router_z_loss_mlp": 0.55371094, + "step": 804, + "time_per_iteration": 2.7128705978393555 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062393, + "balance_loss_mlp": 1.00712788, + "epoch": 0.15486725663716813, + "flos": 528760621056.0, + "grad_norm": 0.03896986919959599, + "language_loss": 0.90159577, + "learning_rate": 0.0009596728786833997, + "loss": 0.9122197, + "num_input_tokens_seen": 66648496, + "router_z_loss_mlp": 0.55419922, + "step": 805, + "time_per_iteration": 2.605398178100586 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062158, + "balance_loss_mlp": 1.00684452, + "epoch": 0.1550596383224317, + "flos": 1050280295424.0, + "grad_norm": 0.039312204875199507, + "language_loss": 0.90827858, + "learning_rate": 0.0009595502129506415, + "loss": 0.91890013, + "num_input_tokens_seen": 66735216, + "router_z_loss_mlp": 0.5546875, + "step": 806, + "time_per_iteration": 3.355556011199951 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062439, + "balance_loss_mlp": 1.00736439, + "epoch": 0.15525202000769528, + "flos": 614837458176.0, + "grad_norm": 0.03934214137038287, + "language_loss": 0.83726299, + "learning_rate": 0.0009594273688046678, + "loss": 0.8478874, + "num_input_tokens_seen": 66810672, + "router_z_loss_mlp": 0.55224609, + "step": 807, + "time_per_iteration": 2.765700101852417 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062118, + "balance_loss_mlp": 1.00728118, + "epoch": 0.15544440169295884, + "flos": 534103120128.0, + "grad_norm": 0.042258492962953934, + "language_loss": 0.86714661, + "learning_rate": 0.000959304346293171, + "loss": 0.8777678, + "num_input_tokens_seen": 66879824, + "router_z_loss_mlp": 0.54980469, + "step": 808, + "time_per_iteration": 2.6490986347198486 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064119, + "balance_loss_mlp": 1.00928247, + "epoch": 0.1556367833782224, + "flos": 645887331840.0, + "grad_norm": 0.047675746935091516, + "language_loss": 0.89139616, + "learning_rate": 0.0009591811454639125, + "loss": 0.90203738, + "num_input_tokens_seen": 66949424, + "router_z_loss_mlp": 0.54980469, + "step": 809, + "time_per_iteration": 2.7880568504333496 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059631, + "balance_loss_mlp": 1.00469911, + "epoch": 0.15582916506348596, + "flos": 544953368832.0, + "grad_norm": 0.05205155355433054, + "language_loss": 0.89500809, + "learning_rate": 0.0009590577663647234, + "loss": 0.90560436, + "num_input_tokens_seen": 67024000, + "router_z_loss_mlp": 0.55078125, + "step": 810, + "time_per_iteration": 2.743067741394043 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061907, + "balance_loss_mlp": 1.0068804, + "epoch": 0.15602154674874952, + "flos": 581215484160.0, + "grad_norm": 0.039153260843753375, + "language_loss": 0.87186325, + "learning_rate": 0.0009589342090435036, + "loss": 0.88248235, + "num_input_tokens_seen": 67100672, + "router_z_loss_mlp": 0.55175781, + "step": 811, + "time_per_iteration": 2.806425094604492 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106086, + "balance_loss_mlp": 1.00607169, + "epoch": 0.15621392843401308, + "flos": 536317496064.0, + "grad_norm": 0.04937652455074429, + "language_loss": 0.88453877, + "learning_rate": 0.0009588104735482223, + "loss": 0.89514732, + "num_input_tokens_seen": 67171584, + "router_z_loss_mlp": 0.54931641, + "step": 812, + "time_per_iteration": 2.647728204727173 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060077, + "balance_loss_mlp": 1.00538397, + "epoch": 0.15640631011927664, + "flos": 551982411264.0, + "grad_norm": 0.04402679292728805, + "language_loss": 0.85281312, + "learning_rate": 0.0009586865599269177, + "loss": 0.86341381, + "num_input_tokens_seen": 67240640, + "router_z_loss_mlp": 0.54833984, + "step": 813, + "time_per_iteration": 2.642218828201294 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061354, + "balance_loss_mlp": 1.0069474, + "epoch": 0.1565986918045402, + "flos": 638636658432.0, + "grad_norm": 0.0415768255708782, + "language_loss": 0.89702487, + "learning_rate": 0.0009585624682276977, + "loss": 0.90763843, + "num_input_tokens_seen": 67312976, + "router_z_loss_mlp": 0.54541016, + "step": 814, + "time_per_iteration": 2.7770931720733643 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01058453, + "balance_loss_mlp": 1.00366414, + "epoch": 0.15679107348980378, + "flos": 491782089984.0, + "grad_norm": 0.039213144049943555, + "language_loss": 0.88436091, + "learning_rate": 0.0009584381984987386, + "loss": 0.89494538, + "num_input_tokens_seen": 67378528, + "router_z_loss_mlp": 0.54931641, + "step": 815, + "time_per_iteration": 2.617560386657715 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061581, + "balance_loss_mlp": 1.00655353, + "epoch": 0.15698345517506734, + "flos": 531003187200.0, + "grad_norm": 0.030486806446719653, + "language_loss": 0.91117728, + "learning_rate": 0.0009583137507882864, + "loss": 0.92179304, + "num_input_tokens_seen": 67449728, + "router_z_loss_mlp": 0.55175781, + "step": 816, + "time_per_iteration": 2.6757051944732666 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060617, + "balance_loss_mlp": 1.00568497, + "epoch": 0.1571758368603309, + "flos": 547078316544.0, + "grad_norm": 0.03910336486934304, + "language_loss": 0.82217371, + "learning_rate": 0.000958189125144656, + "loss": 0.83277988, + "num_input_tokens_seen": 67520512, + "router_z_loss_mlp": 0.55078125, + "step": 817, + "time_per_iteration": 2.7065701484680176 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061392, + "balance_loss_mlp": 1.00655591, + "epoch": 0.15736821854559446, + "flos": 566744272896.0, + "grad_norm": 0.03730967846547413, + "language_loss": 0.89150202, + "learning_rate": 0.0009580643216162313, + "loss": 0.90211594, + "num_input_tokens_seen": 67592464, + "router_z_loss_mlp": 0.54980469, + "step": 818, + "time_per_iteration": 2.6849937438964844 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106252, + "balance_loss_mlp": 1.00792253, + "epoch": 0.15756060023085802, + "flos": 501954806784.0, + "grad_norm": 0.041127076818974775, + "language_loss": 0.80838168, + "learning_rate": 0.0009579393402514652, + "loss": 0.81900686, + "num_input_tokens_seen": 67658928, + "router_z_loss_mlp": 0.54736328, + "step": 819, + "time_per_iteration": 2.615342378616333 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060056, + "balance_loss_mlp": 1.00560164, + "epoch": 0.15775298191612158, + "flos": 520272502272.0, + "grad_norm": 0.037825026421493144, + "language_loss": 0.91941106, + "learning_rate": 0.0009578141810988801, + "loss": 0.93001157, + "num_input_tokens_seen": 67727936, + "router_z_loss_mlp": 0.54589844, + "step": 820, + "time_per_iteration": 2.6530544757843018 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061362, + "balance_loss_mlp": 1.00666904, + "epoch": 0.15794536360138514, + "flos": 467088584448.0, + "grad_norm": 0.039348813654249644, + "language_loss": 0.92238629, + "learning_rate": 0.0009576888442070668, + "loss": 0.93299985, + "num_input_tokens_seen": 67795488, + "router_z_loss_mlp": 0.54833984, + "step": 821, + "time_per_iteration": 2.5978658199310303 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062641, + "balance_loss_mlp": 1.00809062, + "epoch": 0.1581377452866487, + "flos": 518168941824.0, + "grad_norm": 0.03790806580601569, + "language_loss": 0.93657464, + "learning_rate": 0.0009575633296246854, + "loss": 0.94720107, + "num_input_tokens_seen": 67858896, + "router_z_loss_mlp": 0.546875, + "step": 822, + "time_per_iteration": 2.582139492034912 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061663, + "balance_loss_mlp": 1.00711334, + "epoch": 0.15833012697191226, + "flos": 550838284800.0, + "grad_norm": 0.03604802690546967, + "language_loss": 0.84146446, + "learning_rate": 0.0009574376374004652, + "loss": 0.85208106, + "num_input_tokens_seen": 67924864, + "router_z_loss_mlp": 0.546875, + "step": 823, + "time_per_iteration": 2.6182329654693604 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061584, + "balance_loss_mlp": 1.00703347, + "epoch": 0.15852250865717585, + "flos": 488467329024.0, + "grad_norm": 0.0382059884648543, + "language_loss": 0.82121176, + "learning_rate": 0.000957311767583204, + "loss": 0.83182758, + "num_input_tokens_seen": 67992912, + "router_z_loss_mlp": 0.546875, + "step": 824, + "time_per_iteration": 2.584266185760498 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01057571, + "balance_loss_mlp": 1.00531006, + "epoch": 0.1587148903424394, + "flos": 1312699441152.0, + "grad_norm": 0.00659207066158758, + "language_loss": 0.8207159, + "learning_rate": 0.0009571857202217691, + "loss": 0.83129162, + "num_input_tokens_seen": 68207408, + "router_z_loss_mlp": 0.5234375, + "step": 825, + "time_per_iteration": 4.734830856323242 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064159, + "balance_loss_mlp": 1.00965643, + "epoch": 0.15890727202770297, + "flos": 467833190400.0, + "grad_norm": 0.04624650490850591, + "language_loss": 0.92764026, + "learning_rate": 0.0009570594953650961, + "loss": 0.93828189, + "num_input_tokens_seen": 68270864, + "router_z_loss_mlp": 0.54638672, + "step": 826, + "time_per_iteration": 2.5117454528808594 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106388, + "balance_loss_mlp": 1.00937772, + "epoch": 0.15909965371296653, + "flos": 778607993088.0, + "grad_norm": 0.03976637787958364, + "language_loss": 0.81327987, + "learning_rate": 0.00095693309306219, + "loss": 0.8239187, + "num_input_tokens_seen": 68355408, + "router_z_loss_mlp": 0.54638672, + "step": 827, + "time_per_iteration": 3.1954681873321533 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060739, + "balance_loss_mlp": 1.00599849, + "epoch": 0.1592920353982301, + "flos": 1079964411648.0, + "grad_norm": 0.038150784713437476, + "language_loss": 0.89750922, + "learning_rate": 0.0009568065133621244, + "loss": 0.90811658, + "num_input_tokens_seen": 68437072, + "router_z_loss_mlp": 0.54882812, + "step": 828, + "time_per_iteration": 3.3355016708374023 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060884, + "balance_loss_mlp": 1.00642896, + "epoch": 0.15948441708349365, + "flos": 726890932992.0, + "grad_norm": 0.03986186218144037, + "language_loss": 0.85834098, + "learning_rate": 0.0009566797563140422, + "loss": 0.86894989, + "num_input_tokens_seen": 68511696, + "router_z_loss_mlp": 0.54589844, + "step": 829, + "time_per_iteration": 2.873845100402832 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059272, + "balance_loss_mlp": 1.00519884, + "epoch": 0.1596767987687572, + "flos": 580076215296.0, + "grad_norm": 0.03433333328837374, + "language_loss": 0.89395094, + "learning_rate": 0.0009565528219671547, + "loss": 0.90454364, + "num_input_tokens_seen": 68587488, + "router_z_loss_mlp": 0.54199219, + "step": 830, + "time_per_iteration": 2.9566032886505127 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063846, + "balance_loss_mlp": 1.00991619, + "epoch": 0.15986918045402077, + "flos": 530026256640.0, + "grad_norm": 0.037800776955081314, + "language_loss": 0.86586118, + "learning_rate": 0.0009564257103707418, + "loss": 0.87649965, + "num_input_tokens_seen": 68655760, + "router_z_loss_mlp": 0.54052734, + "step": 831, + "time_per_iteration": 2.6305205821990967 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062726, + "balance_loss_mlp": 1.00870061, + "epoch": 0.16006156213928435, + "flos": 575670796032.0, + "grad_norm": 0.04196239075383403, + "language_loss": 0.92502224, + "learning_rate": 0.0009562984215741533, + "loss": 0.93564951, + "num_input_tokens_seen": 68724560, + "router_z_loss_mlp": 0.54150391, + "step": 832, + "time_per_iteration": 2.6781210899353027 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061607, + "balance_loss_mlp": 1.00743783, + "epoch": 0.1602539438245479, + "flos": 516675839232.0, + "grad_norm": 0.039654673227061156, + "language_loss": 0.83729708, + "learning_rate": 0.0009561709556268065, + "loss": 0.84791321, + "num_input_tokens_seen": 68795440, + "router_z_loss_mlp": 0.54296875, + "step": 833, + "time_per_iteration": 2.732191801071167 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064816, + "balance_loss_mlp": 1.01021826, + "epoch": 0.16044632550981147, + "flos": 622162008576.0, + "grad_norm": 0.03600956841171521, + "language_loss": 0.95349514, + "learning_rate": 0.0009560433125781884, + "loss": 0.96414334, + "num_input_tokens_seen": 68868176, + "router_z_loss_mlp": 0.54736328, + "step": 834, + "time_per_iteration": 4.227160215377808 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063475, + "balance_loss_mlp": 1.008973, + "epoch": 0.16063870719507503, + "flos": 562128883200.0, + "grad_norm": 0.03652136008848007, + "language_loss": 0.94107795, + "learning_rate": 0.0009559154924778544, + "loss": 0.95171273, + "num_input_tokens_seen": 68939616, + "router_z_loss_mlp": 0.54638672, + "step": 835, + "time_per_iteration": 2.7238283157348633 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066589, + "balance_loss_mlp": 1.01251614, + "epoch": 0.1608310888803386, + "flos": 806561824512.0, + "grad_norm": 0.044196177378580975, + "language_loss": 0.86185992, + "learning_rate": 0.0009557874953754284, + "loss": 0.87252581, + "num_input_tokens_seen": 69016192, + "router_z_loss_mlp": 0.54199219, + "step": 836, + "time_per_iteration": 3.03965425491333 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063553, + "balance_loss_mlp": 1.00943184, + "epoch": 0.16102347056560215, + "flos": 601695065856.0, + "grad_norm": 0.04086380423696876, + "language_loss": 0.84961462, + "learning_rate": 0.0009556593213206038, + "loss": 0.86025023, + "num_input_tokens_seen": 69089360, + "router_z_loss_mlp": 0.54248047, + "step": 837, + "time_per_iteration": 2.714165687561035 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063546, + "balance_loss_mlp": 1.0095681, + "epoch": 0.1612158522508657, + "flos": 554615749632.0, + "grad_norm": 0.03942211179170501, + "language_loss": 0.88284755, + "learning_rate": 0.0009555309703631414, + "loss": 0.89348304, + "num_input_tokens_seen": 69161952, + "router_z_loss_mlp": 0.54101562, + "step": 838, + "time_per_iteration": 2.6616575717926025 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061318, + "balance_loss_mlp": 1.00729215, + "epoch": 0.16140823393612927, + "flos": 557018708736.0, + "grad_norm": 0.03970121061853926, + "language_loss": 0.88476837, + "learning_rate": 0.0009554024425528722, + "loss": 0.89538157, + "num_input_tokens_seen": 69232432, + "router_z_loss_mlp": 0.54150391, + "step": 839, + "time_per_iteration": 2.6778693199157715 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061761, + "balance_loss_mlp": 1.00792611, + "epoch": 0.16160061562139286, + "flos": 544909627392.0, + "grad_norm": 0.03616953348933095, + "language_loss": 0.90216744, + "learning_rate": 0.0009552737379396948, + "loss": 0.91278505, + "num_input_tokens_seen": 69297696, + "router_z_loss_mlp": 0.53955078, + "step": 840, + "time_per_iteration": 2.6190080642700195 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060215, + "balance_loss_mlp": 1.00638056, + "epoch": 0.16179299730665642, + "flos": 605007881472.0, + "grad_norm": 0.03485432207779616, + "language_loss": 0.88917094, + "learning_rate": 0.0009551448565735767, + "loss": 0.89977312, + "num_input_tokens_seen": 69373888, + "router_z_loss_mlp": 0.53955078, + "step": 841, + "time_per_iteration": 2.771730422973633 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059839, + "balance_loss_mlp": 1.00624251, + "epoch": 0.16198537899191998, + "flos": 788552275968.0, + "grad_norm": 0.040424272174261144, + "language_loss": 0.855564, + "learning_rate": 0.0009550157985045543, + "loss": 0.86616236, + "num_input_tokens_seen": 69449984, + "router_z_loss_mlp": 0.53710938, + "step": 842, + "time_per_iteration": 3.014448642730713 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063416, + "balance_loss_mlp": 1.00986671, + "epoch": 0.16217776067718354, + "flos": 520830470400.0, + "grad_norm": 0.03210449059239548, + "language_loss": 0.9010545, + "learning_rate": 0.0009548865637827321, + "loss": 0.91168869, + "num_input_tokens_seen": 69522736, + "router_z_loss_mlp": 0.53662109, + "step": 843, + "time_per_iteration": 2.663733959197998 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060664, + "balance_loss_mlp": 1.00725794, + "epoch": 0.1623701423624471, + "flos": 506255246592.0, + "grad_norm": 0.04236042945807781, + "language_loss": 0.91279781, + "learning_rate": 0.0009547571524582838, + "loss": 0.92340446, + "num_input_tokens_seen": 69587184, + "router_z_loss_mlp": 0.53515625, + "step": 844, + "time_per_iteration": 2.5841143131256104 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061607, + "balance_loss_mlp": 1.00848722, + "epoch": 0.16256252404771065, + "flos": 498157900032.0, + "grad_norm": 0.043042899099755685, + "language_loss": 0.93573415, + "learning_rate": 0.0009546275645814512, + "loss": 0.94635028, + "num_input_tokens_seen": 69656560, + "router_z_loss_mlp": 0.53222656, + "step": 845, + "time_per_iteration": 2.601743221282959 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064884, + "balance_loss_mlp": 1.01152599, + "epoch": 0.16275490573297421, + "flos": 503287516416.0, + "grad_norm": 0.046422900850994125, + "language_loss": 0.90658545, + "learning_rate": 0.0009544978002025446, + "loss": 0.9172343, + "num_input_tokens_seen": 69723872, + "router_z_loss_mlp": 0.53466797, + "step": 846, + "time_per_iteration": 2.582463502883911 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062884, + "balance_loss_mlp": 1.00957346, + "epoch": 0.16294728741823777, + "flos": 508354916352.0, + "grad_norm": 0.03474620131823351, + "language_loss": 0.88017273, + "learning_rate": 0.0009543678593719434, + "loss": 0.89080155, + "num_input_tokens_seen": 69795504, + "router_z_loss_mlp": 0.53417969, + "step": 847, + "time_per_iteration": 2.7039546966552734 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067088, + "balance_loss_mlp": 1.01334834, + "epoch": 0.16313966910350133, + "flos": 510757875456.0, + "grad_norm": 0.031134263506057067, + "language_loss": 0.88570058, + "learning_rate": 0.0009542377421400945, + "loss": 0.89637142, + "num_input_tokens_seen": 69873408, + "router_z_loss_mlp": 0.53857422, + "step": 848, + "time_per_iteration": 2.79311203956604 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061983, + "balance_loss_mlp": 1.00810015, + "epoch": 0.16333205078876492, + "flos": 545057381376.0, + "grad_norm": 0.03805815068737175, + "language_loss": 0.84448338, + "learning_rate": 0.0009541074485575145, + "loss": 0.85510319, + "num_input_tokens_seen": 69944112, + "router_z_loss_mlp": 0.54003906, + "step": 849, + "time_per_iteration": 2.714644193649292 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106829, + "balance_loss_mlp": 1.01450312, + "epoch": 0.16352443247402848, + "flos": 508712640768.0, + "grad_norm": 0.03447226436126556, + "language_loss": 0.93184924, + "learning_rate": 0.0009539769786747874, + "loss": 0.94253218, + "num_input_tokens_seen": 70012288, + "router_z_loss_mlp": 0.5390625, + "step": 850, + "time_per_iteration": 2.5857110023498535 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070929, + "balance_loss_mlp": 1.01709449, + "epoch": 0.16371681415929204, + "flos": 543223084032.0, + "grad_norm": 0.036141614394747515, + "language_loss": 0.82550752, + "learning_rate": 0.0009538463325425665, + "loss": 0.83621687, + "num_input_tokens_seen": 70086560, + "router_z_loss_mlp": 0.53955078, + "step": 851, + "time_per_iteration": 2.7186405658721924 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066598, + "balance_loss_mlp": 1.01242912, + "epoch": 0.1639091958445556, + "flos": 521761714176.0, + "grad_norm": 0.03784697093976771, + "language_loss": 0.87203169, + "learning_rate": 0.0009537155102115728, + "loss": 0.8826977, + "num_input_tokens_seen": 70153968, + "router_z_loss_mlp": 0.54296875, + "step": 852, + "time_per_iteration": 2.5761775970458984 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061784, + "balance_loss_mlp": 1.00771022, + "epoch": 0.16410157752981916, + "flos": 548482957824.0, + "grad_norm": 0.03731294741121226, + "language_loss": 0.85278255, + "learning_rate": 0.0009535845117325961, + "loss": 0.8634004, + "num_input_tokens_seen": 70222496, + "router_z_loss_mlp": 0.54199219, + "step": 853, + "time_per_iteration": 2.6968846321105957 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065026, + "balance_loss_mlp": 1.01085758, + "epoch": 0.16429395921508272, + "flos": 584026712064.0, + "grad_norm": 0.031860977478103375, + "language_loss": 0.9423098, + "learning_rate": 0.0009534533371564946, + "loss": 0.95296007, + "num_input_tokens_seen": 70301680, + "router_z_loss_mlp": 0.54296875, + "step": 854, + "time_per_iteration": 2.7640349864959717 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106098, + "balance_loss_mlp": 1.00709713, + "epoch": 0.16448634090034628, + "flos": 531962621184.0, + "grad_norm": 0.03950290113288642, + "language_loss": 0.89868152, + "learning_rate": 0.0009533219865341949, + "loss": 0.90929133, + "num_input_tokens_seen": 70371152, + "router_z_loss_mlp": 0.54003906, + "step": 855, + "time_per_iteration": 2.6025009155273438 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060489, + "balance_loss_mlp": 1.00693989, + "epoch": 0.16467872258560984, + "flos": 492961209600.0, + "grad_norm": 0.03645156199748424, + "language_loss": 0.87602645, + "learning_rate": 0.0009531904599166916, + "loss": 0.88663131, + "num_input_tokens_seen": 70440832, + "router_z_loss_mlp": 0.53662109, + "step": 856, + "time_per_iteration": 2.656604290008545 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060758, + "balance_loss_mlp": 1.00730467, + "epoch": 0.16487110427087343, + "flos": 507260367360.0, + "grad_norm": 0.04426557796634758, + "language_loss": 0.86560714, + "learning_rate": 0.0009530587573550478, + "loss": 0.87621474, + "num_input_tokens_seen": 70507424, + "router_z_loss_mlp": 0.53564453, + "step": 857, + "time_per_iteration": 2.610445261001587 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01056503, + "balance_loss_mlp": 1.00538635, + "epoch": 0.16506348595613698, + "flos": 1436111555328.0, + "grad_norm": 0.010874217326465607, + "language_loss": 0.74319386, + "learning_rate": 0.0009529268789003953, + "loss": 0.75375891, + "num_input_tokens_seen": 70742320, + "router_z_loss_mlp": 0.51171875, + "step": 858, + "time_per_iteration": 4.991516590118408 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060779, + "balance_loss_mlp": 1.00718212, + "epoch": 0.16525586764140054, + "flos": 478090477824.0, + "grad_norm": 0.04454190836652637, + "language_loss": 0.91544032, + "learning_rate": 0.0009527948246039337, + "loss": 0.9260481, + "num_input_tokens_seen": 70808400, + "router_z_loss_mlp": 0.53710938, + "step": 859, + "time_per_iteration": 2.538290500640869 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01058142, + "balance_loss_mlp": 1.00425971, + "epoch": 0.1654482493266641, + "flos": 882541767168.0, + "grad_norm": 0.03991834039284953, + "language_loss": 0.88867122, + "learning_rate": 0.000952662594516931, + "loss": 0.89925265, + "num_input_tokens_seen": 70886192, + "router_z_loss_mlp": 0.54003906, + "step": 860, + "time_per_iteration": 3.083786964416504 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065202, + "balance_loss_mlp": 1.01122451, + "epoch": 0.16564063101192766, + "flos": 628106217216.0, + "grad_norm": 0.03630731527649873, + "language_loss": 0.87934124, + "learning_rate": 0.0009525301886907234, + "loss": 0.88999331, + "num_input_tokens_seen": 70964816, + "router_z_loss_mlp": 0.54101562, + "step": 861, + "time_per_iteration": 2.8606412410736084 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062775, + "balance_loss_mlp": 1.00884438, + "epoch": 0.16583301269719122, + "flos": 562593532416.0, + "grad_norm": 0.03632506699489255, + "language_loss": 0.8885988, + "learning_rate": 0.0009523976071767155, + "loss": 0.89922649, + "num_input_tokens_seen": 71037456, + "router_z_loss_mlp": 0.54052734, + "step": 862, + "time_per_iteration": 2.651202440261841 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062989, + "balance_loss_mlp": 1.0094403, + "epoch": 0.16602539438245478, + "flos": 568984893696.0, + "grad_norm": 0.03883194498572106, + "language_loss": 0.88789731, + "learning_rate": 0.00095226485002638, + "loss": 0.8985272, + "num_input_tokens_seen": 71111872, + "router_z_loss_mlp": 0.53662109, + "step": 863, + "time_per_iteration": 2.798125982284546 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063787, + "balance_loss_mlp": 1.01019073, + "epoch": 0.16621777606771834, + "flos": 576022684416.0, + "grad_norm": 0.03638934937563812, + "language_loss": 0.89892161, + "learning_rate": 0.0009521319172912576, + "loss": 0.90955949, + "num_input_tokens_seen": 71187808, + "router_z_loss_mlp": 0.53710938, + "step": 864, + "time_per_iteration": 4.098716974258423 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105997, + "balance_loss_mlp": 1.00632548, + "epoch": 0.16641015775298193, + "flos": 515598786816.0, + "grad_norm": 0.037169751839881825, + "language_loss": 0.96108532, + "learning_rate": 0.0009519988090229579, + "loss": 0.97168505, + "num_input_tokens_seen": 71261728, + "router_z_loss_mlp": 0.53759766, + "step": 865, + "time_per_iteration": 2.659381628036499 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068447, + "balance_loss_mlp": 1.01489806, + "epoch": 0.1666025394382455, + "flos": 622850234112.0, + "grad_norm": 0.04388029559541895, + "language_loss": 0.88811028, + "learning_rate": 0.0009518655252731576, + "loss": 0.89879477, + "num_input_tokens_seen": 71338352, + "router_z_loss_mlp": 0.53662109, + "step": 866, + "time_per_iteration": 2.738511323928833 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061925, + "balance_loss_mlp": 1.00880551, + "epoch": 0.16679492112350905, + "flos": 549933285888.0, + "grad_norm": 0.03352631932153436, + "language_loss": 0.91113746, + "learning_rate": 0.0009517320660936022, + "loss": 0.92175674, + "num_input_tokens_seen": 71416544, + "router_z_loss_mlp": 0.53222656, + "step": 867, + "time_per_iteration": 2.7755699157714844 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066316, + "balance_loss_mlp": 1.01343453, + "epoch": 0.1669873028087726, + "flos": 666866555904.0, + "grad_norm": 0.04051359913494383, + "language_loss": 0.84396493, + "learning_rate": 0.0009515984315361051, + "loss": 0.85462809, + "num_input_tokens_seen": 71494080, + "router_z_loss_mlp": 0.52978516, + "step": 868, + "time_per_iteration": 2.8502533435821533 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062608, + "balance_loss_mlp": 1.00944042, + "epoch": 0.16717968449403617, + "flos": 539604066816.0, + "grad_norm": 0.03969494402961726, + "language_loss": 0.88029611, + "learning_rate": 0.000951464621652548, + "loss": 0.89092225, + "num_input_tokens_seen": 71562672, + "router_z_loss_mlp": 0.53271484, + "step": 869, + "time_per_iteration": 2.6079800128936768 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065881, + "balance_loss_mlp": 1.01233244, + "epoch": 0.16737206617929973, + "flos": 531279253248.0, + "grad_norm": 0.03349656106003216, + "language_loss": 0.7990135, + "learning_rate": 0.0009513306364948804, + "loss": 0.80967236, + "num_input_tokens_seen": 71641904, + "router_z_loss_mlp": 0.53662109, + "step": 870, + "time_per_iteration": 2.824232578277588 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106371, + "balance_loss_mlp": 1.00987494, + "epoch": 0.1675644478645633, + "flos": 481757127168.0, + "grad_norm": 0.04264569815750397, + "language_loss": 0.90229708, + "learning_rate": 0.0009511964761151197, + "loss": 0.91293418, + "num_input_tokens_seen": 71709616, + "router_z_loss_mlp": 0.53955078, + "step": 871, + "time_per_iteration": 2.6326816082000732 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106642, + "balance_loss_mlp": 1.01344323, + "epoch": 0.16775682954982685, + "flos": 495542058240.0, + "grad_norm": 0.04000245460937008, + "language_loss": 0.91825569, + "learning_rate": 0.0009510621405653521, + "loss": 0.92891991, + "num_input_tokens_seen": 71776592, + "router_z_loss_mlp": 0.53076172, + "step": 872, + "time_per_iteration": 2.5802783966064453 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074346, + "balance_loss_mlp": 1.02151191, + "epoch": 0.1679492112350904, + "flos": 753406096896.0, + "grad_norm": 0.04130745072346603, + "language_loss": 0.85908926, + "learning_rate": 0.0009509276298977309, + "loss": 0.86983275, + "num_input_tokens_seen": 71856352, + "router_z_loss_mlp": 0.52929688, + "step": 873, + "time_per_iteration": 2.9676413536071777 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069963, + "balance_loss_mlp": 1.01689136, + "epoch": 0.168141592920354, + "flos": 1137733583616.0, + "grad_norm": 0.036676349776393134, + "language_loss": 0.82925022, + "learning_rate": 0.0009507929441644778, + "loss": 0.83994985, + "num_input_tokens_seen": 71948480, + "router_z_loss_mlp": 0.53173828, + "step": 874, + "time_per_iteration": 3.5441927909851074 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062646, + "balance_loss_mlp": 1.00924039, + "epoch": 0.16833397460561755, + "flos": 633554674176.0, + "grad_norm": 0.03715311549034911, + "language_loss": 0.86810201, + "learning_rate": 0.0009506580834178826, + "loss": 0.87872851, + "num_input_tokens_seen": 72019200, + "router_z_loss_mlp": 0.53515625, + "step": 875, + "time_per_iteration": 2.767840623855591 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106879, + "balance_loss_mlp": 1.01524162, + "epoch": 0.1685263562908811, + "flos": 542543606784.0, + "grad_norm": 0.041322978640758234, + "language_loss": 0.92533737, + "learning_rate": 0.0009505230477103028, + "loss": 0.93602526, + "num_input_tokens_seen": 72088672, + "router_z_loss_mlp": 0.53662109, + "step": 876, + "time_per_iteration": 2.68626070022583 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064685, + "balance_loss_mlp": 1.01151776, + "epoch": 0.16871873797614467, + "flos": 620486158848.0, + "grad_norm": 0.04979097271806245, + "language_loss": 0.82312369, + "learning_rate": 0.0009503878370941641, + "loss": 0.83377057, + "num_input_tokens_seen": 72159952, + "router_z_loss_mlp": 0.53271484, + "step": 877, + "time_per_iteration": 2.738828182220459 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067432, + "balance_loss_mlp": 1.01455081, + "epoch": 0.16891111966140823, + "flos": 607456527360.0, + "grad_norm": 0.048240798926105125, + "language_loss": 0.90597415, + "learning_rate": 0.0009502524516219595, + "loss": 0.91664839, + "num_input_tokens_seen": 72231648, + "router_z_loss_mlp": 0.52978516, + "step": 878, + "time_per_iteration": 2.7533464431762695 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065414, + "balance_loss_mlp": 1.01234174, + "epoch": 0.1691035013466718, + "flos": 553406494464.0, + "grad_norm": 0.04285435284136928, + "language_loss": 0.91275579, + "learning_rate": 0.0009501168913462506, + "loss": 0.92340994, + "num_input_tokens_seen": 72298608, + "router_z_loss_mlp": 0.53173828, + "step": 879, + "time_per_iteration": 2.6498849391937256 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106115, + "balance_loss_mlp": 1.00946045, + "epoch": 0.16929588303193535, + "flos": 1479308427264.0, + "grad_norm": 0.010969186313753012, + "language_loss": 0.79121923, + "learning_rate": 0.0009499811563196665, + "loss": 0.80183077, + "num_input_tokens_seen": 72525312, + "router_z_loss_mlp": 0.51757812, + "step": 880, + "time_per_iteration": 4.8127734661102295 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065784, + "balance_loss_mlp": 1.01228285, + "epoch": 0.1694882647171989, + "flos": 927848024064.0, + "grad_norm": 0.04254449001590413, + "language_loss": 0.86211771, + "learning_rate": 0.0009498452465949042, + "loss": 0.87277561, + "num_input_tokens_seen": 72612976, + "router_z_loss_mlp": 0.53613281, + "step": 881, + "time_per_iteration": 3.242352247238159 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059657, + "balance_loss_mlp": 1.00668061, + "epoch": 0.1696806464024625, + "flos": 547152193536.0, + "grad_norm": 0.03842920637304405, + "language_loss": 0.92758489, + "learning_rate": 0.0009497091622247285, + "loss": 0.93818152, + "num_input_tokens_seen": 72686800, + "router_z_loss_mlp": 0.53076172, + "step": 882, + "time_per_iteration": 2.7538321018218994 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066328, + "balance_loss_mlp": 1.01363766, + "epoch": 0.16987302808772606, + "flos": 530295519744.0, + "grad_norm": 0.04346709327253658, + "language_loss": 0.94739175, + "learning_rate": 0.0009495729032619723, + "loss": 0.95805502, + "num_input_tokens_seen": 72759360, + "router_z_loss_mlp": 0.52783203, + "step": 883, + "time_per_iteration": 2.681851863861084 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061141, + "balance_loss_mlp": 1.00830746, + "epoch": 0.17006540977298962, + "flos": 756479784960.0, + "grad_norm": 0.03707996109728333, + "language_loss": 0.85065424, + "learning_rate": 0.0009494364697595354, + "loss": 0.86126566, + "num_input_tokens_seen": 72831424, + "router_z_loss_mlp": 0.52929688, + "step": 884, + "time_per_iteration": 2.886613607406616 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01058078, + "balance_loss_mlp": 1.00495851, + "epoch": 0.17025779145825318, + "flos": 559875623424.0, + "grad_norm": 0.04262534374301406, + "language_loss": 0.90753883, + "learning_rate": 0.0009492998617703867, + "loss": 0.91811961, + "num_input_tokens_seen": 72901536, + "router_z_loss_mlp": 0.53222656, + "step": 885, + "time_per_iteration": 2.7197954654693604 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069962, + "balance_loss_mlp": 1.01684284, + "epoch": 0.17045017314351674, + "flos": 513217214976.0, + "grad_norm": 0.04472607646913617, + "language_loss": 0.89151132, + "learning_rate": 0.0009491630793475619, + "loss": 0.90221095, + "num_input_tokens_seen": 72970480, + "router_z_loss_mlp": 0.53222656, + "step": 886, + "time_per_iteration": 2.6023643016815186 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059759, + "balance_loss_mlp": 1.00706899, + "epoch": 0.1706425548287803, + "flos": 510013269504.0, + "grad_norm": 0.03690999998020265, + "language_loss": 0.86250949, + "learning_rate": 0.0009490261225441643, + "loss": 0.87310708, + "num_input_tokens_seen": 73053376, + "router_z_loss_mlp": 0.52783203, + "step": 887, + "time_per_iteration": 2.8811516761779785 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070816, + "balance_loss_mlp": 1.01845872, + "epoch": 0.17083493651404386, + "flos": 718715818752.0, + "grad_norm": 0.037520519160069404, + "language_loss": 0.91723603, + "learning_rate": 0.0009488889914133656, + "loss": 0.92794418, + "num_input_tokens_seen": 73136032, + "router_z_loss_mlp": 0.52441406, + "step": 888, + "time_per_iteration": 2.983920097351074 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067559, + "balance_loss_mlp": 1.01515496, + "epoch": 0.17102731819930742, + "flos": 560201266944.0, + "grad_norm": 0.034570155262309, + "language_loss": 0.90050644, + "learning_rate": 0.0009487516860084047, + "loss": 0.91118205, + "num_input_tokens_seen": 73208544, + "router_z_loss_mlp": 0.52490234, + "step": 889, + "time_per_iteration": 2.739945888519287 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061028, + "balance_loss_mlp": 1.0078603, + "epoch": 0.17121969988457098, + "flos": 495765634560.0, + "grad_norm": 0.04354558177795279, + "language_loss": 0.9033885, + "learning_rate": 0.0009486142063825884, + "loss": 0.91399872, + "num_input_tokens_seen": 73274336, + "router_z_loss_mlp": 0.53271484, + "step": 890, + "time_per_iteration": 2.541325569152832 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107373, + "balance_loss_mlp": 1.02223206, + "epoch": 0.17141208156983456, + "flos": 1552108723968.0, + "grad_norm": 0.01766408052426257, + "language_loss": 0.72426212, + "learning_rate": 0.0009484765525892909, + "loss": 0.73499948, + "num_input_tokens_seen": 73506320, + "router_z_loss_mlp": 0.515625, + "step": 891, + "time_per_iteration": 4.968579053878784 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01058616, + "balance_loss_mlp": 1.00568736, + "epoch": 0.17160446325509812, + "flos": 620700986880.0, + "grad_norm": 0.037544702591063864, + "language_loss": 0.91210532, + "learning_rate": 0.0009483387246819542, + "loss": 0.92269152, + "num_input_tokens_seen": 73578048, + "router_z_loss_mlp": 0.53027344, + "step": 892, + "time_per_iteration": 2.7970938682556152 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071655, + "balance_loss_mlp": 1.0209198, + "epoch": 0.17179684494036168, + "flos": 1384695839232.0, + "grad_norm": 0.01601076320839161, + "language_loss": 0.82285583, + "learning_rate": 0.0009482007227140877, + "loss": 0.83357239, + "num_input_tokens_seen": 73798640, + "router_z_loss_mlp": 0.5078125, + "step": 893, + "time_per_iteration": 4.629605054855347 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066274, + "balance_loss_mlp": 1.01386988, + "epoch": 0.17198922662562524, + "flos": 493642632192.0, + "grad_norm": 0.03763004911158334, + "language_loss": 0.90241146, + "learning_rate": 0.0009480625467392688, + "loss": 0.91307414, + "num_input_tokens_seen": 73867328, + "router_z_loss_mlp": 0.52490234, + "step": 894, + "time_per_iteration": 2.6142358779907227 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068771, + "balance_loss_mlp": 1.01822662, + "epoch": 0.1721816083108888, + "flos": 1461488428800.0, + "grad_norm": 0.016749035753296605, + "language_loss": 0.77994668, + "learning_rate": 0.0009479241968111421, + "loss": 0.79063439, + "num_input_tokens_seen": 74093376, + "router_z_loss_mlp": 0.50585938, + "step": 895, + "time_per_iteration": 4.811494827270508 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112065, + "balance_loss_mlp": 1.06719661, + "epoch": 0.17237398999615236, + "flos": 529205828352.0, + "grad_norm": 0.05241044192650153, + "language_loss": 0.88738441, + "learning_rate": 0.0009477856729834196, + "loss": 0.89859092, + "num_input_tokens_seen": 74169136, + "router_z_loss_mlp": 0.53564453, + "step": 896, + "time_per_iteration": 2.7389612197875977 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066584, + "balance_loss_mlp": 1.01446557, + "epoch": 0.17256637168141592, + "flos": 605027323392.0, + "grad_norm": 0.03860455021635393, + "language_loss": 0.90989411, + "learning_rate": 0.0009476469753098809, + "loss": 0.92055988, + "num_input_tokens_seen": 74236912, + "router_z_loss_mlp": 0.52197266, + "step": 897, + "time_per_iteration": 2.7175238132476807 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077196, + "balance_loss_mlp": 1.02507758, + "epoch": 0.17275875336667948, + "flos": 510694692096.0, + "grad_norm": 0.040412661310783936, + "language_loss": 0.88453948, + "learning_rate": 0.0009475081038443738, + "loss": 0.89531147, + "num_input_tokens_seen": 74305968, + "router_z_loss_mlp": 0.52197266, + "step": 898, + "time_per_iteration": 2.6398110389709473 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079459, + "balance_loss_mlp": 1.02753115, + "epoch": 0.17295113505194307, + "flos": 666502028544.0, + "grad_norm": 0.045107808798334564, + "language_loss": 0.87902451, + "learning_rate": 0.0009473690586408124, + "loss": 0.88981915, + "num_input_tokens_seen": 74384144, + "router_z_loss_mlp": 0.52001953, + "step": 899, + "time_per_iteration": 2.817730665206909 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071678, + "balance_loss_mlp": 1.01965487, + "epoch": 0.17314351673720663, + "flos": 556432550400.0, + "grad_norm": 0.03870851432877784, + "language_loss": 0.87576568, + "learning_rate": 0.0009472298397531792, + "loss": 0.88648236, + "num_input_tokens_seen": 74455040, + "router_z_loss_mlp": 0.52099609, + "step": 900, + "time_per_iteration": 2.6932764053344727 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061802, + "balance_loss_mlp": 1.00892079, + "epoch": 0.17333589842247019, + "flos": 504607587072.0, + "grad_norm": 0.03631909976073519, + "language_loss": 0.87174571, + "learning_rate": 0.0009470904472355235, + "loss": 0.88236374, + "num_input_tokens_seen": 74525248, + "router_z_loss_mlp": 0.52978516, + "step": 901, + "time_per_iteration": 2.669405460357666 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099242, + "balance_loss_mlp": 1.04593205, + "epoch": 0.17352828010773375, + "flos": 557351155200.0, + "grad_norm": 0.04839261993488341, + "language_loss": 0.80976391, + "learning_rate": 0.0009469508811419626, + "loss": 0.82075632, + "num_input_tokens_seen": 74597328, + "router_z_loss_mlp": 0.53417969, + "step": 902, + "time_per_iteration": 2.7412211894989014 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083992, + "balance_loss_mlp": 1.033638, + "epoch": 0.1737206617929973, + "flos": 1557794363136.0, + "grad_norm": 0.02136399149953286, + "language_loss": 0.7161383, + "learning_rate": 0.0009468111415266806, + "loss": 0.72697818, + "num_input_tokens_seen": 74819664, + "router_z_loss_mlp": 0.50390625, + "step": 903, + "time_per_iteration": 4.800720930099487 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075494, + "balance_loss_mlp": 1.02318478, + "epoch": 0.17391304347826086, + "flos": 517756782336.0, + "grad_norm": 0.04178806719411302, + "language_loss": 0.85797513, + "learning_rate": 0.0009466712284439292, + "loss": 0.86873007, + "num_input_tokens_seen": 74896224, + "router_z_loss_mlp": 0.52392578, + "step": 904, + "time_per_iteration": 2.7409780025482178 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076244, + "balance_loss_mlp": 1.02360141, + "epoch": 0.17410542516352442, + "flos": 542161582848.0, + "grad_norm": 0.043268311729831165, + "language_loss": 0.90273786, + "learning_rate": 0.0009465311419480276, + "loss": 0.91350031, + "num_input_tokens_seen": 74966560, + "router_z_loss_mlp": 0.52734375, + "step": 905, + "time_per_iteration": 2.7310986518859863 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068245, + "balance_loss_mlp": 1.01526833, + "epoch": 0.17429780684878798, + "flos": 625082106624.0, + "grad_norm": 0.0375699532684124, + "language_loss": 0.89484948, + "learning_rate": 0.0009463908820933622, + "loss": 0.905532, + "num_input_tokens_seen": 75045248, + "router_z_loss_mlp": 0.53076172, + "step": 906, + "time_per_iteration": 2.8575551509857178 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086696, + "balance_loss_mlp": 1.03281319, + "epoch": 0.17449018853405157, + "flos": 576849915648.0, + "grad_norm": 0.04286783530345041, + "language_loss": 0.83513701, + "learning_rate": 0.0009462504489343868, + "loss": 0.84600401, + "num_input_tokens_seen": 75123952, + "router_z_loss_mlp": 0.54003906, + "step": 907, + "time_per_iteration": 2.83085036277771 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066078, + "balance_loss_mlp": 1.0128628, + "epoch": 0.17468257021931513, + "flos": 534773849088.0, + "grad_norm": 0.0408315501053547, + "language_loss": 0.90177906, + "learning_rate": 0.0009461098425256222, + "loss": 0.91243982, + "num_input_tokens_seen": 75191728, + "router_z_loss_mlp": 0.53320312, + "step": 908, + "time_per_iteration": 2.6000654697418213 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075411, + "balance_loss_mlp": 1.02257717, + "epoch": 0.1748749519045787, + "flos": 541809694464.0, + "grad_norm": 0.0381088809784924, + "language_loss": 0.87053907, + "learning_rate": 0.0009459690629216567, + "loss": 0.88129318, + "num_input_tokens_seen": 75262224, + "router_z_loss_mlp": 0.52929688, + "step": 909, + "time_per_iteration": 2.622178316116333 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080252, + "balance_loss_mlp": 1.02770495, + "epoch": 0.17506733358984225, + "flos": 499627670016.0, + "grad_norm": 0.039096197570908604, + "language_loss": 0.88898331, + "learning_rate": 0.0009458281101771457, + "loss": 0.89978582, + "num_input_tokens_seen": 75329760, + "router_z_loss_mlp": 0.52636719, + "step": 910, + "time_per_iteration": 2.5964770317077637 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064818, + "balance_loss_mlp": 1.01217556, + "epoch": 0.1752597152751058, + "flos": 624133366272.0, + "grad_norm": 0.035444142957055544, + "language_loss": 0.83730716, + "learning_rate": 0.0009456869843468122, + "loss": 0.84795535, + "num_input_tokens_seen": 75407920, + "router_z_loss_mlp": 0.52734375, + "step": 911, + "time_per_iteration": 2.834584951400757 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059336, + "balance_loss_mlp": 1.00650251, + "epoch": 0.17545209696036937, + "flos": 521994038784.0, + "grad_norm": 0.04587594362499167, + "language_loss": 0.79429859, + "learning_rate": 0.0009455456854854459, + "loss": 0.80489194, + "num_input_tokens_seen": 75476752, + "router_z_loss_mlp": 0.52929688, + "step": 912, + "time_per_iteration": 2.627058744430542 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107498, + "balance_loss_mlp": 1.0219084, + "epoch": 0.17564447864563293, + "flos": 462946592256.0, + "grad_norm": 0.044462507375804226, + "language_loss": 0.85522115, + "learning_rate": 0.0009454042136479039, + "loss": 0.86597091, + "num_input_tokens_seen": 75542944, + "router_z_loss_mlp": 0.53173828, + "step": 913, + "time_per_iteration": 2.562453031539917 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106477, + "balance_loss_mlp": 1.01250815, + "epoch": 0.1758368603308965, + "flos": 481618121472.0, + "grad_norm": 0.03599423435064716, + "language_loss": 0.84144086, + "learning_rate": 0.0009452625688891103, + "loss": 0.85208857, + "num_input_tokens_seen": 75609840, + "router_z_loss_mlp": 0.5234375, + "step": 914, + "time_per_iteration": 2.6025402545928955 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063728, + "balance_loss_mlp": 1.0137558, + "epoch": 0.17602924201616005, + "flos": 1482087574272.0, + "grad_norm": 0.013260252544834742, + "language_loss": 0.78734738, + "learning_rate": 0.0009451207512640567, + "loss": 0.79798466, + "num_input_tokens_seen": 75819312, + "router_z_loss_mlp": 0.49902344, + "step": 915, + "time_per_iteration": 4.572151184082031 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107428, + "balance_loss_mlp": 1.0219233, + "epoch": 0.17622162370142364, + "flos": 603471037440.0, + "grad_norm": 0.044830704586910027, + "language_loss": 0.94022703, + "learning_rate": 0.0009449787608278015, + "loss": 0.95096982, + "num_input_tokens_seen": 75893984, + "router_z_loss_mlp": 0.52441406, + "step": 916, + "time_per_iteration": 2.731264114379883 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062837, + "balance_loss_mlp": 1.0104804, + "epoch": 0.1764140053866872, + "flos": 443606279424.0, + "grad_norm": 0.0370205772569368, + "language_loss": 0.92972034, + "learning_rate": 0.0009448365976354704, + "loss": 0.94034874, + "num_input_tokens_seen": 75958944, + "router_z_loss_mlp": 0.52441406, + "step": 917, + "time_per_iteration": 2.478041648864746 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073342, + "balance_loss_mlp": 1.0204134, + "epoch": 0.17660638707195075, + "flos": 501592224768.0, + "grad_norm": 0.047363321454448416, + "language_loss": 0.907022, + "learning_rate": 0.0009446942617422558, + "loss": 0.91775542, + "num_input_tokens_seen": 76024240, + "router_z_loss_mlp": 0.53027344, + "step": 918, + "time_per_iteration": 2.5698564052581787 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060825, + "balance_loss_mlp": 1.00789583, + "epoch": 0.17679876875721431, + "flos": 539984145408.0, + "grad_norm": 0.03732253291641402, + "language_loss": 0.86447889, + "learning_rate": 0.0009445517532034176, + "loss": 0.87508708, + "num_input_tokens_seen": 76095264, + "router_z_loss_mlp": 0.53027344, + "step": 919, + "time_per_iteration": 2.6916563510894775 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062144, + "balance_loss_mlp": 1.00926292, + "epoch": 0.17699115044247787, + "flos": 498715868160.0, + "grad_norm": 0.04444616550081301, + "language_loss": 0.8994987, + "learning_rate": 0.0009444090720742824, + "loss": 0.91012013, + "num_input_tokens_seen": 76163520, + "router_z_loss_mlp": 0.52978516, + "step": 920, + "time_per_iteration": 2.5798380374908447 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069798, + "balance_loss_mlp": 1.01706016, + "epoch": 0.17718353212774143, + "flos": 663916322304.0, + "grad_norm": 0.04662040468857239, + "language_loss": 0.89399016, + "learning_rate": 0.0009442662184102439, + "loss": 0.90468818, + "num_input_tokens_seen": 76233760, + "router_z_loss_mlp": 0.52832031, + "step": 921, + "time_per_iteration": 2.755929708480835 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064095, + "balance_loss_mlp": 1.01164341, + "epoch": 0.177375913813005, + "flos": 583848822528.0, + "grad_norm": 0.03479566109485236, + "language_loss": 0.88455689, + "learning_rate": 0.000944123192266763, + "loss": 0.89519787, + "num_input_tokens_seen": 76310704, + "router_z_loss_mlp": 0.52539062, + "step": 922, + "time_per_iteration": 2.8776824474334717 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062973, + "balance_loss_mlp": 1.00980616, + "epoch": 0.17756829549826855, + "flos": 553684505856.0, + "grad_norm": 0.036018663808135676, + "language_loss": 0.84559548, + "learning_rate": 0.0009439799936993671, + "loss": 0.85622525, + "num_input_tokens_seen": 76386992, + "router_z_loss_mlp": 0.53271484, + "step": 923, + "time_per_iteration": 2.708897113800049 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063296, + "balance_loss_mlp": 1.01041508, + "epoch": 0.17776067718353214, + "flos": 557372542464.0, + "grad_norm": 0.06706828820902193, + "language_loss": 0.89721078, + "learning_rate": 0.0009438366227636511, + "loss": 0.90784371, + "num_input_tokens_seen": 76453328, + "router_z_loss_mlp": 0.52978516, + "step": 924, + "time_per_iteration": 2.6524295806884766 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062711, + "balance_loss_mlp": 1.01035416, + "epoch": 0.1779530588687957, + "flos": 659652820992.0, + "grad_norm": 0.03503923634288643, + "language_loss": 0.87549317, + "learning_rate": 0.0009436930795152763, + "loss": 0.8861202, + "num_input_tokens_seen": 76529040, + "router_z_loss_mlp": 0.52441406, + "step": 925, + "time_per_iteration": 2.8627374172210693 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070161, + "balance_loss_mlp": 1.01823378, + "epoch": 0.17814544055405926, + "flos": 645672503808.0, + "grad_norm": 0.03989967380061369, + "language_loss": 0.87815237, + "learning_rate": 0.0009435493640099713, + "loss": 0.88885403, + "num_input_tokens_seen": 76604080, + "router_z_loss_mlp": 0.52001953, + "step": 926, + "time_per_iteration": 2.7886180877685547 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065048, + "balance_loss_mlp": 1.01283479, + "epoch": 0.17833782223932282, + "flos": 461885091072.0, + "grad_norm": 0.040977111340993126, + "language_loss": 0.85709256, + "learning_rate": 0.0009434054763035314, + "loss": 0.86774307, + "num_input_tokens_seen": 76674096, + "router_z_loss_mlp": 0.52294922, + "step": 927, + "time_per_iteration": 2.635576009750366 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010578, + "balance_loss_mlp": 1.00520515, + "epoch": 0.17853020392458638, + "flos": 760854101760.0, + "grad_norm": 0.029435711646972902, + "language_loss": 0.86359227, + "learning_rate": 0.0009432614164518185, + "loss": 0.8741703, + "num_input_tokens_seen": 76752144, + "router_z_loss_mlp": 0.52685547, + "step": 928, + "time_per_iteration": 2.945253849029541 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074963, + "balance_loss_mlp": 1.02203369, + "epoch": 0.17872258560984994, + "flos": 784056450048.0, + "grad_norm": 0.039066121455708196, + "language_loss": 0.84876156, + "learning_rate": 0.000943117184510762, + "loss": 0.85951114, + "num_input_tokens_seen": 76830240, + "router_z_loss_mlp": 0.53027344, + "step": 929, + "time_per_iteration": 3.0016870498657227 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092369, + "balance_loss_mlp": 1.04201508, + "epoch": 0.1789149672951135, + "flos": 1463034021120.0, + "grad_norm": 0.03241390760866092, + "language_loss": 0.78789961, + "learning_rate": 0.0009429727805363575, + "loss": 0.79882336, + "num_input_tokens_seen": 77062464, + "router_z_loss_mlp": 0.50390625, + "step": 930, + "time_per_iteration": 5.0408923625946045 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091695, + "balance_loss_mlp": 1.04005396, + "epoch": 0.17910734898037706, + "flos": 504931285248.0, + "grad_norm": 0.037670754636037675, + "language_loss": 0.90276599, + "learning_rate": 0.0009428282045846674, + "loss": 0.91368294, + "num_input_tokens_seen": 77136672, + "router_z_loss_mlp": 0.51708984, + "step": 931, + "time_per_iteration": 2.699357509613037 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093914, + "balance_loss_mlp": 1.04260671, + "epoch": 0.17929973066564064, + "flos": 747670880256.0, + "grad_norm": 0.03557447538434831, + "language_loss": 0.91468316, + "learning_rate": 0.0009426834567118214, + "loss": 0.92562228, + "num_input_tokens_seen": 77227040, + "router_z_loss_mlp": 0.51367188, + "step": 932, + "time_per_iteration": 3.0888116359710693 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095199, + "balance_loss_mlp": 1.04370034, + "epoch": 0.1794921123509042, + "flos": 714573826560.0, + "grad_norm": 0.03713873812168088, + "language_loss": 0.82311261, + "learning_rate": 0.0009425385369740155, + "loss": 0.8340646, + "num_input_tokens_seen": 77319392, + "router_z_loss_mlp": 0.515625, + "step": 933, + "time_per_iteration": 3.0156304836273193 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109606, + "balance_loss_mlp": 1.04465711, + "epoch": 0.17968449403616776, + "flos": 634362463488.0, + "grad_norm": 0.04581160448205157, + "language_loss": 0.89044029, + "learning_rate": 0.0009423934454275125, + "loss": 0.90140092, + "num_input_tokens_seen": 77394688, + "router_z_loss_mlp": 0.51464844, + "step": 934, + "time_per_iteration": 2.8524558544158936 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095874, + "balance_loss_mlp": 1.04428041, + "epoch": 0.17987687572143132, + "flos": 537378997248.0, + "grad_norm": 0.045982575553228676, + "language_loss": 0.93734717, + "learning_rate": 0.0009422481821286418, + "loss": 0.94830596, + "num_input_tokens_seen": 77468288, + "router_z_loss_mlp": 0.51660156, + "step": 935, + "time_per_iteration": 2.7354249954223633 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096416, + "balance_loss_mlp": 1.0448221, + "epoch": 0.18006925740669488, + "flos": 539119975680.0, + "grad_norm": 0.04748543050697339, + "language_loss": 0.89948702, + "learning_rate": 0.0009421027471337998, + "loss": 0.91045117, + "num_input_tokens_seen": 77535840, + "router_z_loss_mlp": 0.51660156, + "step": 936, + "time_per_iteration": 2.660287380218506 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095184, + "balance_loss_mlp": 1.04363835, + "epoch": 0.18026163909195844, + "flos": 540535310592.0, + "grad_norm": 0.04911488628490749, + "language_loss": 0.84066534, + "learning_rate": 0.0009419571404994493, + "loss": 0.8516171, + "num_input_tokens_seen": 77604000, + "router_z_loss_mlp": 0.51611328, + "step": 937, + "time_per_iteration": 2.624769687652588 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090343, + "balance_loss_mlp": 1.03865409, + "epoch": 0.180454020777222, + "flos": 501683598336.0, + "grad_norm": 0.0468107226861285, + "language_loss": 0.92304778, + "learning_rate": 0.00094181136228212, + "loss": 0.9339512, + "num_input_tokens_seen": 77671488, + "router_z_loss_mlp": 0.51757812, + "step": 938, + "time_per_iteration": 2.6784133911132812 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092284, + "balance_loss_mlp": 1.04069054, + "epoch": 0.18064640246248556, + "flos": 500007748608.0, + "grad_norm": 0.039466745711782485, + "language_loss": 0.87082231, + "learning_rate": 0.0009416654125384077, + "loss": 0.8817451, + "num_input_tokens_seen": 77746240, + "router_z_loss_mlp": 0.51660156, + "step": 939, + "time_per_iteration": 2.7231576442718506 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081085, + "balance_loss_mlp": 1.03034973, + "epoch": 0.18083878414774912, + "flos": 1522293383424.0, + "grad_norm": 0.016406546431804496, + "language_loss": 0.79772377, + "learning_rate": 0.0009415192913249752, + "loss": 0.80853462, + "num_input_tokens_seen": 77966080, + "router_z_loss_mlp": 0.5078125, + "step": 940, + "time_per_iteration": 4.919930934906006 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01329403, + "balance_loss_mlp": 1.27490067, + "epoch": 0.1810311658330127, + "flos": 728666904576.0, + "grad_norm": 0.12503564718566265, + "language_loss": 0.85519916, + "learning_rate": 0.000941372998698552, + "loss": 0.8684932, + "num_input_tokens_seen": 78049200, + "router_z_loss_mlp": 0.54638672, + "step": 941, + "time_per_iteration": 2.9731380939483643 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093385, + "balance_loss_mlp": 1.04121876, + "epoch": 0.18122354751827627, + "flos": 566045353728.0, + "grad_norm": 0.05253753965114479, + "language_loss": 0.83319217, + "learning_rate": 0.0009412265347159336, + "loss": 0.84412599, + "num_input_tokens_seen": 78122752, + "router_z_loss_mlp": 0.52246094, + "step": 942, + "time_per_iteration": 2.7150988578796387 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103842, + "balance_loss_mlp": 1.05162799, + "epoch": 0.18141592920353983, + "flos": 520318189056.0, + "grad_norm": 0.046885904923641086, + "language_loss": 0.86687338, + "learning_rate": 0.0009410798994339829, + "loss": 0.87791175, + "num_input_tokens_seen": 78194064, + "router_z_loss_mlp": 0.52294922, + "step": 943, + "time_per_iteration": 2.598576545715332 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111341, + "balance_loss_mlp": 1.05831623, + "epoch": 0.1816083108888034, + "flos": 513477729792.0, + "grad_norm": 0.04639702407841738, + "language_loss": 0.8991158, + "learning_rate": 0.000940933092909628, + "loss": 0.91022921, + "num_input_tokens_seen": 78262048, + "router_z_loss_mlp": 0.53125, + "step": 944, + "time_per_iteration": 2.611694574356079 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104212, + "balance_loss_mlp": 1.05109203, + "epoch": 0.18180069257406695, + "flos": 493373369088.0, + "grad_norm": 0.04493061679832577, + "language_loss": 0.85416293, + "learning_rate": 0.0009407861151998649, + "loss": 0.86520505, + "num_input_tokens_seen": 78330624, + "router_z_loss_mlp": 0.53222656, + "step": 945, + "time_per_iteration": 2.5710983276367188 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110023, + "balance_loss_mlp": 1.04692006, + "epoch": 0.1819930742593305, + "flos": 571231350528.0, + "grad_norm": 0.04259629183686275, + "language_loss": 0.87787771, + "learning_rate": 0.0009406389663617552, + "loss": 0.88888001, + "num_input_tokens_seen": 78400672, + "router_z_loss_mlp": 0.53417969, + "step": 946, + "time_per_iteration": 2.6741456985473633 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100116, + "balance_loss_mlp": 1.04661465, + "epoch": 0.18218545594459407, + "flos": 607111441920.0, + "grad_norm": 0.04866460503106345, + "language_loss": 0.87927794, + "learning_rate": 0.000940491646452427, + "loss": 0.89027911, + "num_input_tokens_seen": 78467952, + "router_z_loss_mlp": 0.53613281, + "step": 947, + "time_per_iteration": 2.718358278274536 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101327, + "balance_loss_mlp": 1.04753995, + "epoch": 0.18237783762985763, + "flos": 549739845120.0, + "grad_norm": 0.042994543525894185, + "language_loss": 0.92601323, + "learning_rate": 0.000940344155529075, + "loss": 0.93702656, + "num_input_tokens_seen": 78538928, + "router_z_loss_mlp": 0.5390625, + "step": 948, + "time_per_iteration": 2.624303102493286 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097087, + "balance_loss_mlp": 1.04325247, + "epoch": 0.1825702193151212, + "flos": 451675435776.0, + "grad_norm": 0.046415524987670945, + "language_loss": 0.89178842, + "learning_rate": 0.0009401964936489605, + "loss": 0.90275931, + "num_input_tokens_seen": 78602144, + "router_z_loss_mlp": 0.53955078, + "step": 949, + "time_per_iteration": 2.5104119777679443 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088983, + "balance_loss_mlp": 1.03524303, + "epoch": 0.18276260100038477, + "flos": 590385025536.0, + "grad_norm": 0.0430347708706334, + "language_loss": 0.86972219, + "learning_rate": 0.0009400486608694108, + "loss": 0.88061202, + "num_input_tokens_seen": 78673152, + "router_z_loss_mlp": 0.53857422, + "step": 950, + "time_per_iteration": 2.744044065475464 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085663, + "balance_loss_mlp": 1.03154159, + "epoch": 0.18295498268564833, + "flos": 788710723584.0, + "grad_norm": 0.040810758702646055, + "language_loss": 0.88588369, + "learning_rate": 0.0009399006572478195, + "loss": 0.89674032, + "num_input_tokens_seen": 78753872, + "router_z_loss_mlp": 0.54248047, + "step": 951, + "time_per_iteration": 3.0828475952148438 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079439, + "balance_loss_mlp": 1.02493632, + "epoch": 0.1831473643709119, + "flos": 579226629888.0, + "grad_norm": 0.03747434947067488, + "language_loss": 0.92113942, + "learning_rate": 0.0009397524828416468, + "loss": 0.93193376, + "num_input_tokens_seen": 78822640, + "router_z_loss_mlp": 0.54638672, + "step": 952, + "time_per_iteration": 2.6881086826324463 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089843, + "balance_loss_mlp": 1.03405273, + "epoch": 0.18333974605617545, + "flos": 567964221696.0, + "grad_norm": 0.0419825959367211, + "language_loss": 0.97306633, + "learning_rate": 0.0009396041377084192, + "loss": 0.9839648, + "num_input_tokens_seen": 78893792, + "router_z_loss_mlp": 0.55957031, + "step": 953, + "time_per_iteration": 2.673654556274414 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097804, + "balance_loss_mlp": 1.04191864, + "epoch": 0.183532127741439, + "flos": 528070450176.0, + "grad_norm": 0.04203850234568462, + "language_loss": 0.89016271, + "learning_rate": 0.0009394556219057295, + "loss": 0.90114069, + "num_input_tokens_seen": 78964752, + "router_z_loss_mlp": 0.56054688, + "step": 954, + "time_per_iteration": 2.7255043983459473 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107099, + "balance_loss_mlp": 1.01610565, + "epoch": 0.18372450942670257, + "flos": 595644899328.0, + "grad_norm": 0.03789415730727427, + "language_loss": 0.84751296, + "learning_rate": 0.0009393069354912362, + "loss": 0.85822284, + "num_input_tokens_seen": 79034400, + "router_z_loss_mlp": 0.55029297, + "step": 955, + "time_per_iteration": 2.7474210262298584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084139, + "balance_loss_mlp": 1.02963698, + "epoch": 0.18391689111196613, + "flos": 646284907008.0, + "grad_norm": 0.04389714766773939, + "language_loss": 0.83882308, + "learning_rate": 0.0009391580785226649, + "loss": 0.84966445, + "num_input_tokens_seen": 79109488, + "router_z_loss_mlp": 0.54638672, + "step": 956, + "time_per_iteration": 2.844409465789795 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081024, + "balance_loss_mlp": 1.02990723, + "epoch": 0.18410927279722972, + "flos": 1460394846720.0, + "grad_norm": 0.013082177800516761, + "language_loss": 0.79340446, + "learning_rate": 0.0009390090510578067, + "loss": 0.80421472, + "num_input_tokens_seen": 79327712, + "router_z_loss_mlp": 0.51171875, + "step": 957, + "time_per_iteration": 4.792405843734741 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084483, + "balance_loss_mlp": 1.030267, + "epoch": 0.18430165448249328, + "flos": 660004709376.0, + "grad_norm": 0.04089111102732722, + "language_loss": 0.88231802, + "learning_rate": 0.0009388598531545196, + "loss": 0.89316285, + "num_input_tokens_seen": 79401504, + "router_z_loss_mlp": 0.54345703, + "step": 958, + "time_per_iteration": 2.900062084197998 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084489, + "balance_loss_mlp": 1.03017747, + "epoch": 0.18449403616775684, + "flos": 518950486272.0, + "grad_norm": 0.045948437313162956, + "language_loss": 0.87467843, + "learning_rate": 0.000938710484870727, + "loss": 0.88552332, + "num_input_tokens_seen": 79466688, + "router_z_loss_mlp": 0.54443359, + "step": 959, + "time_per_iteration": 2.5785140991210938 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085738, + "balance_loss_mlp": 1.031569, + "epoch": 0.1846864178530204, + "flos": 553825456896.0, + "grad_norm": 0.04362127254920589, + "language_loss": 0.87369549, + "learning_rate": 0.0009385609462644189, + "loss": 0.88455284, + "num_input_tokens_seen": 79540288, + "router_z_loss_mlp": 0.54296875, + "step": 960, + "time_per_iteration": 2.686221122741699 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082294, + "balance_loss_mlp": 1.02774417, + "epoch": 0.18487879953828396, + "flos": 467116774656.0, + "grad_norm": 0.04468558895083242, + "language_loss": 0.86931455, + "learning_rate": 0.0009384112373936514, + "loss": 0.88013744, + "num_input_tokens_seen": 79611872, + "router_z_loss_mlp": 0.546875, + "step": 961, + "time_per_iteration": 2.633582830429077 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064427, + "balance_loss_mlp": 1.00935197, + "epoch": 0.18507118122354752, + "flos": 649684238592.0, + "grad_norm": 0.03687654302408078, + "language_loss": 0.9259429, + "learning_rate": 0.0009382613583165467, + "loss": 0.93658715, + "num_input_tokens_seen": 79689504, + "router_z_loss_mlp": 0.55224609, + "step": 962, + "time_per_iteration": 2.7910635471343994 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01458915, + "balance_loss_mlp": 1.40078855, + "epoch": 0.18526356290881107, + "flos": 627923470080.0, + "grad_norm": 0.09306974449566385, + "language_loss": 0.90611041, + "learning_rate": 0.0009381113090912928, + "loss": 0.92069954, + "num_input_tokens_seen": 79759264, + "router_z_loss_mlp": 0.57958984, + "step": 963, + "time_per_iteration": 2.7445125579833984 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078714, + "balance_loss_mlp": 1.02464056, + "epoch": 0.18545594459407463, + "flos": 433646445312.0, + "grad_norm": 0.04076594680163087, + "language_loss": 0.91471934, + "learning_rate": 0.000937961089776144, + "loss": 0.92550647, + "num_input_tokens_seen": 79824464, + "router_z_loss_mlp": 0.54199219, + "step": 964, + "time_per_iteration": 2.5835955142974854 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089429, + "balance_loss_mlp": 1.03607059, + "epoch": 0.1856483262793382, + "flos": 750427673088.0, + "grad_norm": 0.041116434601540804, + "language_loss": 0.8449949, + "learning_rate": 0.0009378107004294208, + "loss": 0.8558892, + "num_input_tokens_seen": 79907152, + "router_z_loss_mlp": 0.53466797, + "step": 965, + "time_per_iteration": 2.9773664474487305 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090833, + "balance_loss_mlp": 1.03790379, + "epoch": 0.18584070796460178, + "flos": 531402707712.0, + "grad_norm": 0.04029010126422192, + "language_loss": 0.93043375, + "learning_rate": 0.0009376601411095096, + "loss": 0.94134206, + "num_input_tokens_seen": 79976944, + "router_z_loss_mlp": 0.53027344, + "step": 966, + "time_per_iteration": 2.6703643798828125 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088702, + "balance_loss_mlp": 1.03639269, + "epoch": 0.18603308964986534, + "flos": 484084263936.0, + "grad_norm": 0.03934020689435504, + "language_loss": 0.87718618, + "learning_rate": 0.0009375094118748622, + "loss": 0.88807321, + "num_input_tokens_seen": 80042112, + "router_z_loss_mlp": 0.52392578, + "step": 967, + "time_per_iteration": 2.5719969272613525 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091813, + "balance_loss_mlp": 1.03974187, + "epoch": 0.1862254713351289, + "flos": 802682292480.0, + "grad_norm": 0.042176858736630414, + "language_loss": 0.92643285, + "learning_rate": 0.0009373585127839976, + "loss": 0.93735105, + "num_input_tokens_seen": 80118896, + "router_z_loss_mlp": 0.52148438, + "step": 968, + "time_per_iteration": 2.956153392791748 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096332, + "balance_loss_mlp": 1.04483318, + "epoch": 0.18641785302039246, + "flos": 479290984704.0, + "grad_norm": 0.04307464179422831, + "language_loss": 0.92206955, + "learning_rate": 0.0009372074438954994, + "loss": 0.93303293, + "num_input_tokens_seen": 80183360, + "router_z_loss_mlp": 0.515625, + "step": 969, + "time_per_iteration": 2.512662410736084 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092255, + "balance_loss_mlp": 1.04085171, + "epoch": 0.18661023470565602, + "flos": 389779822848.0, + "grad_norm": 0.044792080488554424, + "language_loss": 0.93312657, + "learning_rate": 0.0009370562052680181, + "loss": 0.94404912, + "num_input_tokens_seen": 80247024, + "router_z_loss_mlp": 0.51464844, + "step": 970, + "time_per_iteration": 2.4642274379730225 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109029, + "balance_loss_mlp": 1.03926873, + "epoch": 0.18680261639091958, + "flos": 565776090624.0, + "grad_norm": 0.03666794569701081, + "language_loss": 0.90593827, + "learning_rate": 0.0009369047969602695, + "loss": 0.91684115, + "num_input_tokens_seen": 80318256, + "router_z_loss_mlp": 0.51074219, + "step": 971, + "time_per_iteration": 2.6925313472747803 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090519, + "balance_loss_mlp": 1.03968859, + "epoch": 0.18699499807618314, + "flos": 480230976768.0, + "grad_norm": 0.04959033368050126, + "language_loss": 0.88274431, + "learning_rate": 0.0009367532190310357, + "loss": 0.89364946, + "num_input_tokens_seen": 80384848, + "router_z_loss_mlp": 0.50878906, + "step": 972, + "time_per_iteration": 2.5632824897766113 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095286, + "balance_loss_mlp": 1.04464579, + "epoch": 0.1871873797614467, + "flos": 554328989952.0, + "grad_norm": 0.047101191533600484, + "language_loss": 0.90956879, + "learning_rate": 0.0009366014715391644, + "loss": 0.92052168, + "num_input_tokens_seen": 80453088, + "router_z_loss_mlp": 0.50683594, + "step": 973, + "time_per_iteration": 2.6131792068481445 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087331, + "balance_loss_mlp": 1.03669059, + "epoch": 0.18737976144671029, + "flos": 553953768960.0, + "grad_norm": 0.03277863870695053, + "language_loss": 0.85193431, + "learning_rate": 0.0009364495545435693, + "loss": 0.86280763, + "num_input_tokens_seen": 80528608, + "router_z_loss_mlp": 0.50683594, + "step": 974, + "time_per_iteration": 2.768160820007324 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077828, + "balance_loss_mlp": 1.02647221, + "epoch": 0.18757214313197385, + "flos": 503248632576.0, + "grad_norm": 0.03709252074476072, + "language_loss": 0.90046728, + "learning_rate": 0.0009362974681032297, + "loss": 0.91124547, + "num_input_tokens_seen": 80599600, + "router_z_loss_mlp": 0.51416016, + "step": 975, + "time_per_iteration": 2.596752405166626 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01358762, + "balance_loss_mlp": 1.30464137, + "epoch": 0.1877645248172374, + "flos": 676292721408.0, + "grad_norm": 0.11355211768831018, + "language_loss": 0.89691889, + "learning_rate": 0.0009361452122771907, + "loss": 0.91050649, + "num_input_tokens_seen": 80677264, + "router_z_loss_mlp": 0.54248047, + "step": 976, + "time_per_iteration": 2.841670036315918 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087707, + "balance_loss_mlp": 1.03649426, + "epoch": 0.18795690650250096, + "flos": 405863700480.0, + "grad_norm": 0.05182073733860081, + "language_loss": 0.85757113, + "learning_rate": 0.0009359927871245635, + "loss": 0.86844826, + "num_input_tokens_seen": 80739776, + "router_z_loss_mlp": 0.51269531, + "step": 977, + "time_per_iteration": 2.4593758583068848 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110124, + "balance_loss_mlp": 1.04988456, + "epoch": 0.18814928818776452, + "flos": 639064369152.0, + "grad_norm": 0.04599902588150218, + "language_loss": 0.8843354, + "learning_rate": 0.0009358401927045246, + "loss": 0.89534783, + "num_input_tokens_seen": 80815200, + "router_z_loss_mlp": 0.51416016, + "step": 978, + "time_per_iteration": 2.8043553829193115 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103516, + "balance_loss_mlp": 1.05197036, + "epoch": 0.18834166987302808, + "flos": 1140117100800.0, + "grad_norm": 0.05109113713971293, + "language_loss": 0.89583617, + "learning_rate": 0.0009356874290763166, + "loss": 0.90687132, + "num_input_tokens_seen": 80905024, + "router_z_loss_mlp": 0.51611328, + "step": 979, + "time_per_iteration": 3.4783685207366943 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105346, + "balance_loss_mlp": 1.0536567, + "epoch": 0.18853405155829164, + "flos": 505816842240.0, + "grad_norm": 0.03906189308485337, + "language_loss": 0.90395761, + "learning_rate": 0.0009355344962992474, + "loss": 0.91501105, + "num_input_tokens_seen": 80976704, + "router_z_loss_mlp": 0.51757812, + "step": 980, + "time_per_iteration": 2.6457359790802 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103049, + "balance_loss_mlp": 1.05116904, + "epoch": 0.1887264332435552, + "flos": 609371504640.0, + "grad_norm": 0.038270487176229884, + "language_loss": 0.89782834, + "learning_rate": 0.0009353813944326908, + "loss": 0.9088589, + "num_input_tokens_seen": 81057152, + "router_z_loss_mlp": 0.51953125, + "step": 981, + "time_per_iteration": 2.923243761062622 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102538, + "balance_loss_mlp": 1.05070543, + "epoch": 0.1889188149288188, + "flos": 553593132288.0, + "grad_norm": 0.04212053297292714, + "language_loss": 0.84181225, + "learning_rate": 0.0009352281235360863, + "loss": 0.85283768, + "num_input_tokens_seen": 81131520, + "router_z_loss_mlp": 0.51904297, + "step": 982, + "time_per_iteration": 2.674790620803833 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103043, + "balance_loss_mlp": 1.05135345, + "epoch": 0.18911119661408235, + "flos": 419470742016.0, + "grad_norm": 0.03892833341753514, + "language_loss": 0.86323905, + "learning_rate": 0.0009350746836689389, + "loss": 0.87426949, + "num_input_tokens_seen": 81195952, + "router_z_loss_mlp": 0.51757812, + "step": 983, + "time_per_iteration": 2.5294649600982666 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103523, + "balance_loss_mlp": 1.05335999, + "epoch": 0.1893035782993459, + "flos": 1485320676864.0, + "grad_norm": 0.016207020064155576, + "language_loss": 0.81439221, + "learning_rate": 0.0009349210748908193, + "loss": 0.82542741, + "num_input_tokens_seen": 81427312, + "router_z_loss_mlp": 0.50195312, + "step": 984, + "time_per_iteration": 5.031845569610596 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094201, + "balance_loss_mlp": 1.04227316, + "epoch": 0.18949595998460947, + "flos": 509457246720.0, + "grad_norm": 0.045438139941342374, + "language_loss": 0.84563899, + "learning_rate": 0.0009347672972613634, + "loss": 0.85658097, + "num_input_tokens_seen": 81494256, + "router_z_loss_mlp": 0.52001953, + "step": 985, + "time_per_iteration": 2.6333274841308594 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090244, + "balance_loss_mlp": 1.0384593, + "epoch": 0.18968834166987303, + "flos": 532193000448.0, + "grad_norm": 0.03993027053802703, + "language_loss": 0.8704083, + "learning_rate": 0.0009346133508402735, + "loss": 0.8813107, + "num_input_tokens_seen": 81569312, + "router_z_loss_mlp": 0.51855469, + "step": 986, + "time_per_iteration": 2.751340389251709 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089761, + "balance_loss_mlp": 1.03797686, + "epoch": 0.1898807233551366, + "flos": 500754299904.0, + "grad_norm": 0.04595906606263721, + "language_loss": 0.85852754, + "learning_rate": 0.0009344592356873166, + "loss": 0.86942512, + "num_input_tokens_seen": 81637024, + "router_z_loss_mlp": 0.51855469, + "step": 987, + "time_per_iteration": 2.6785645484924316 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084349, + "balance_loss_mlp": 1.03223073, + "epoch": 0.19007310504040015, + "flos": 603360221952.0, + "grad_norm": 0.042275439246703725, + "language_loss": 0.79788595, + "learning_rate": 0.0009343049518623255, + "loss": 0.80872947, + "num_input_tokens_seen": 81709488, + "router_z_loss_mlp": 0.52197266, + "step": 988, + "time_per_iteration": 2.709439516067505 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01365061, + "balance_loss_mlp": 1.30979574, + "epoch": 0.1902654867256637, + "flos": 602765315328.0, + "grad_norm": 0.1049262798815586, + "language_loss": 0.8386007, + "learning_rate": 0.0009341504994251985, + "loss": 0.85225129, + "num_input_tokens_seen": 81787152, + "router_z_loss_mlp": 0.55419922, + "step": 989, + "time_per_iteration": 2.925954818725586 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089684, + "balance_loss_mlp": 1.03952026, + "epoch": 0.19045786841092727, + "flos": 1579234345728.0, + "grad_norm": 0.01847097645999908, + "language_loss": 0.73520499, + "learning_rate": 0.0009339958784358994, + "loss": 0.74610186, + "num_input_tokens_seen": 82030608, + "router_z_loss_mlp": 0.50195312, + "step": 990, + "time_per_iteration": 5.025054216384888 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101957, + "balance_loss_mlp": 1.04845631, + "epoch": 0.19065025009619085, + "flos": 683055412992.0, + "grad_norm": 0.039739471389523856, + "language_loss": 0.8281374, + "learning_rate": 0.0009338410889544574, + "loss": 0.83915699, + "num_input_tokens_seen": 82119872, + "router_z_loss_mlp": 0.53613281, + "step": 991, + "time_per_iteration": 3.0653748512268066 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112616, + "balance_loss_mlp": 1.05868626, + "epoch": 0.1908426317814544, + "flos": 603442847232.0, + "grad_norm": 0.04383499470371995, + "language_loss": 0.89543211, + "learning_rate": 0.000933686131040967, + "loss": 0.90655828, + "num_input_tokens_seen": 82195552, + "router_z_loss_mlp": 0.54052734, + "step": 992, + "time_per_iteration": 2.7901530265808105 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106006, + "balance_loss_mlp": 1.0517416, + "epoch": 0.19103501346671797, + "flos": 587434791936.0, + "grad_norm": 0.04122735235002176, + "language_loss": 0.92173266, + "learning_rate": 0.0009335310047555883, + "loss": 0.93279278, + "num_input_tokens_seen": 82267040, + "router_z_loss_mlp": 0.54394531, + "step": 993, + "time_per_iteration": 2.7153608798980713 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097163, + "balance_loss_mlp": 1.04285157, + "epoch": 0.19122739515198153, + "flos": 546835298304.0, + "grad_norm": 0.04052898350535971, + "language_loss": 0.89637405, + "learning_rate": 0.0009333757101585467, + "loss": 0.90734565, + "num_input_tokens_seen": 82337680, + "router_z_loss_mlp": 0.54443359, + "step": 994, + "time_per_iteration": 2.6286795139312744 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091826, + "balance_loss_mlp": 1.03732359, + "epoch": 0.1914197768372451, + "flos": 522550061568.0, + "grad_norm": 0.03850908176124289, + "language_loss": 0.94694555, + "learning_rate": 0.0009332202473101329, + "loss": 0.95786381, + "num_input_tokens_seen": 82409600, + "router_z_loss_mlp": 0.54638672, + "step": 995, + "time_per_iteration": 2.649850368499756 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072176, + "balance_loss_mlp": 1.01714945, + "epoch": 0.19161215852250865, + "flos": 612388812288.0, + "grad_norm": 0.03654296504823072, + "language_loss": 0.83743644, + "learning_rate": 0.0009330646162707028, + "loss": 0.84815824, + "num_input_tokens_seen": 82480288, + "router_z_loss_mlp": 0.55175781, + "step": 996, + "time_per_iteration": 2.7329981327056885 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059087, + "balance_loss_mlp": 1.0033443, + "epoch": 0.1918045402077722, + "flos": 848183935488.0, + "grad_norm": 0.03315860340701524, + "language_loss": 0.85236025, + "learning_rate": 0.0009329088171006779, + "loss": 0.8629511, + "num_input_tokens_seen": 82568960, + "router_z_loss_mlp": 0.55908203, + "step": 997, + "time_per_iteration": 3.135049343109131 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01290698, + "balance_loss_mlp": 1.2330482, + "epoch": 0.19199692189303577, + "flos": 466893198336.0, + "grad_norm": 0.06463762674453556, + "language_loss": 0.86239529, + "learning_rate": 0.0009327528498605446, + "loss": 0.87530231, + "num_input_tokens_seen": 82634128, + "router_z_loss_mlp": 0.57470703, + "step": 998, + "time_per_iteration": 2.5807580947875977 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072398, + "balance_loss_mlp": 1.01727533, + "epoch": 0.19218930357829936, + "flos": 532613908224.0, + "grad_norm": 0.04280698068802137, + "language_loss": 0.90856296, + "learning_rate": 0.0009325967146108548, + "loss": 0.91928697, + "num_input_tokens_seen": 82707472, + "router_z_loss_mlp": 0.55273438, + "step": 999, + "time_per_iteration": 2.637840986251831 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086346, + "balance_loss_mlp": 1.03217781, + "epoch": 0.19238168526356292, + "flos": 602728376832.0, + "grad_norm": 0.04847652630230049, + "language_loss": 0.88902158, + "learning_rate": 0.0009324404114122258, + "loss": 0.89988506, + "num_input_tokens_seen": 82775232, + "router_z_loss_mlp": 0.54296875, + "step": 1000, + "time_per_iteration": 4.1391942501068115 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090902, + "balance_loss_mlp": 1.03701913, + "epoch": 0.19257406694882648, + "flos": 573155076096.0, + "grad_norm": 0.04193719314851312, + "language_loss": 0.88362414, + "learning_rate": 0.0009322839403253397, + "loss": 0.89453316, + "num_input_tokens_seen": 82850032, + "router_z_loss_mlp": 0.54003906, + "step": 1001, + "time_per_iteration": 2.8266265392303467 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087834, + "balance_loss_mlp": 1.03395164, + "epoch": 0.19276644863409004, + "flos": 803157635328.0, + "grad_norm": 0.04353601683576214, + "language_loss": 0.85235333, + "learning_rate": 0.0009321273014109439, + "loss": 0.86323166, + "num_input_tokens_seen": 82926080, + "router_z_loss_mlp": 0.54003906, + "step": 1002, + "time_per_iteration": 2.9539175033569336 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094525, + "balance_loss_mlp": 1.04068995, + "epoch": 0.1929588303193536, + "flos": 564480319488.0, + "grad_norm": 0.03718563884895513, + "language_loss": 0.86078906, + "learning_rate": 0.0009319704947298513, + "loss": 0.87173432, + "num_input_tokens_seen": 83005200, + "router_z_loss_mlp": 0.53955078, + "step": 1003, + "time_per_iteration": 2.8760387897491455 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091693, + "balance_loss_mlp": 1.0380007, + "epoch": 0.19315121200461716, + "flos": 627988598784.0, + "grad_norm": 0.03744955738150477, + "language_loss": 0.89579475, + "learning_rate": 0.0009318135203429393, + "loss": 0.9067117, + "num_input_tokens_seen": 83077280, + "router_z_loss_mlp": 0.53808594, + "step": 1004, + "time_per_iteration": 2.7069175243377686 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094654, + "balance_loss_mlp": 1.04058087, + "epoch": 0.19334359368988072, + "flos": 518584013568.0, + "grad_norm": 0.03742742378220975, + "language_loss": 0.89228511, + "learning_rate": 0.0009316563783111511, + "loss": 0.90323162, + "num_input_tokens_seen": 83145456, + "router_z_loss_mlp": 0.54199219, + "step": 1005, + "time_per_iteration": 2.7024500370025635 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090205, + "balance_loss_mlp": 1.03598833, + "epoch": 0.19353597537514428, + "flos": 695400709632.0, + "grad_norm": 0.036019255491177425, + "language_loss": 0.83731771, + "learning_rate": 0.0009314990686954943, + "loss": 0.84821975, + "num_input_tokens_seen": 83225392, + "router_z_loss_mlp": 0.54345703, + "step": 1006, + "time_per_iteration": 2.901319980621338 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092974, + "balance_loss_mlp": 1.03866184, + "epoch": 0.19372835706040784, + "flos": 1212200981760.0, + "grad_norm": 0.03507497873235563, + "language_loss": 0.82359284, + "learning_rate": 0.000931341591557042, + "loss": 0.8345226, + "num_input_tokens_seen": 83331296, + "router_z_loss_mlp": 0.54443359, + "step": 1007, + "time_per_iteration": 3.70509672164917 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088892, + "balance_loss_mlp": 1.03467596, + "epoch": 0.19392073874567142, + "flos": 521685891840.0, + "grad_norm": 0.04354230775215961, + "language_loss": 0.88703787, + "learning_rate": 0.0009311839469569325, + "loss": 0.89792681, + "num_input_tokens_seen": 83399952, + "router_z_loss_mlp": 0.54345703, + "step": 1008, + "time_per_iteration": 2.632070302963257 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088222, + "balance_loss_mlp": 1.03386211, + "epoch": 0.19411312043093498, + "flos": 589911628032.0, + "grad_norm": 0.044503426382111445, + "language_loss": 0.88821465, + "learning_rate": 0.0009310261349563687, + "loss": 0.89909685, + "num_input_tokens_seen": 83468384, + "router_z_loss_mlp": 0.54492188, + "step": 1009, + "time_per_iteration": 2.7138211727142334 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061835, + "balance_loss_mlp": 1.0067606, + "epoch": 0.19430550211619854, + "flos": 580572945408.0, + "grad_norm": 0.029375689409949213, + "language_loss": 0.86173785, + "learning_rate": 0.0009308681556166186, + "loss": 0.87235624, + "num_input_tokens_seen": 83547952, + "router_z_loss_mlp": 0.55224609, + "step": 1010, + "time_per_iteration": 2.834946870803833 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.05162705, + "balance_loss_mlp": 5.08607721, + "epoch": 0.1944978838014621, + "flos": 622246579200.0, + "grad_norm": 0.2884784307389343, + "language_loss": 0.88793403, + "learning_rate": 0.0009307100089990152, + "loss": 0.93956107, + "num_input_tokens_seen": 83615712, + "router_z_loss_mlp": 0.76513672, + "step": 1011, + "time_per_iteration": 2.705335855484009 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094303, + "balance_loss_mlp": 1.04189909, + "epoch": 0.19469026548672566, + "flos": 599815081728.0, + "grad_norm": 0.04633555371791679, + "language_loss": 0.85740912, + "learning_rate": 0.0009305516951649568, + "loss": 0.86835217, + "num_input_tokens_seen": 83687296, + "router_z_loss_mlp": 0.52490234, + "step": 1012, + "time_per_iteration": 2.7048773765563965 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01164213, + "balance_loss_mlp": 1.11281013, + "epoch": 0.19488264717198922, + "flos": 553248046848.0, + "grad_norm": 0.04991787894778298, + "language_loss": 0.87912452, + "learning_rate": 0.0009303932141759057, + "loss": 0.89076668, + "num_input_tokens_seen": 83763168, + "router_z_loss_mlp": 0.51464844, + "step": 1013, + "time_per_iteration": 2.8072102069854736 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01211245, + "balance_loss_mlp": 1.15984225, + "epoch": 0.19507502885725278, + "flos": 667313708544.0, + "grad_norm": 0.06529111316537192, + "language_loss": 0.85445917, + "learning_rate": 0.0009302345660933902, + "loss": 0.86657166, + "num_input_tokens_seen": 83837312, + "router_z_loss_mlp": 0.51464844, + "step": 1014, + "time_per_iteration": 2.7895615100860596 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01244014, + "balance_loss_mlp": 1.19265878, + "epoch": 0.19526741054251634, + "flos": 672328618752.0, + "grad_norm": 0.06071591874537116, + "language_loss": 0.86587232, + "learning_rate": 0.0009300757509790026, + "loss": 0.87831247, + "num_input_tokens_seen": 83917120, + "router_z_loss_mlp": 0.51416016, + "step": 1015, + "time_per_iteration": 2.8867006301879883 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.012313, + "balance_loss_mlp": 1.18008745, + "epoch": 0.19545979222777993, + "flos": 448147792128.0, + "grad_norm": 0.057262662434688416, + "language_loss": 0.91914976, + "learning_rate": 0.0009299167688944005, + "loss": 0.93146276, + "num_input_tokens_seen": 83982992, + "router_z_loss_mlp": 0.51269531, + "step": 1016, + "time_per_iteration": 2.526421546936035 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01226901, + "balance_loss_mlp": 1.17568827, + "epoch": 0.1956521739130435, + "flos": 570169849344.0, + "grad_norm": 0.05343522997619492, + "language_loss": 0.87454194, + "learning_rate": 0.0009297576199013063, + "loss": 0.8868109, + "num_input_tokens_seen": 84057296, + "router_z_loss_mlp": 0.51269531, + "step": 1017, + "time_per_iteration": 2.7184784412384033 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.012071, + "balance_loss_mlp": 1.15884399, + "epoch": 0.19584455559830705, + "flos": 1458883280640.0, + "grad_norm": 0.03399393552013433, + "language_loss": 0.73002136, + "learning_rate": 0.0009295983040615071, + "loss": 0.74209231, + "num_input_tokens_seen": 84292640, + "router_z_loss_mlp": 0.48242188, + "step": 1018, + "time_per_iteration": 4.916393756866455 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01159874, + "balance_loss_mlp": 1.11199951, + "epoch": 0.1960369372835706, + "flos": 1594484189184.0, + "grad_norm": 0.02523442502037962, + "language_loss": 0.79426301, + "learning_rate": 0.0009294388214368547, + "loss": 0.80586171, + "num_input_tokens_seen": 84524448, + "router_z_loss_mlp": 0.47851562, + "step": 1019, + "time_per_iteration": 5.5991902351379395 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01163202, + "balance_loss_mlp": 1.11241901, + "epoch": 0.19622931896883417, + "flos": 617254023168.0, + "grad_norm": 0.06792637193668423, + "language_loss": 0.88615566, + "learning_rate": 0.0009292791720892659, + "loss": 0.89778763, + "num_input_tokens_seen": 84600208, + "router_z_loss_mlp": 0.50830078, + "step": 1020, + "time_per_iteration": 2.8419806957244873 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01132702, + "balance_loss_mlp": 1.08191884, + "epoch": 0.19642170065409773, + "flos": 467208148224.0, + "grad_norm": 0.044541966790476714, + "language_loss": 0.90245676, + "learning_rate": 0.0009291193560807218, + "loss": 0.91378373, + "num_input_tokens_seen": 84668032, + "router_z_loss_mlp": 0.50830078, + "step": 1021, + "time_per_iteration": 2.60357403755188 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0111942, + "balance_loss_mlp": 1.06858945, + "epoch": 0.19661408233936128, + "flos": 516288957696.0, + "grad_norm": 0.03957164107654416, + "language_loss": 0.88134921, + "learning_rate": 0.0009289593734732688, + "loss": 0.89254344, + "num_input_tokens_seen": 84738176, + "router_z_loss_mlp": 0.50878906, + "step": 1022, + "time_per_iteration": 2.6077988147735596 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115639, + "balance_loss_mlp": 1.06461763, + "epoch": 0.19680646402462484, + "flos": 393494104320.0, + "grad_norm": 0.03618938319364158, + "language_loss": 0.94921708, + "learning_rate": 0.0009287992243290175, + "loss": 0.96037352, + "num_input_tokens_seen": 84799936, + "router_z_loss_mlp": 0.51074219, + "step": 1023, + "time_per_iteration": 2.486910820007324 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104277, + "balance_loss_mlp": 1.05263603, + "epoch": 0.19699884570988843, + "flos": 627624071424.0, + "grad_norm": 0.04088238638674664, + "language_loss": 0.91379654, + "learning_rate": 0.0009286389087101435, + "loss": 0.92483938, + "num_input_tokens_seen": 84877216, + "router_z_loss_mlp": 0.51708984, + "step": 1024, + "time_per_iteration": 2.7762300968170166 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083626, + "balance_loss_mlp": 1.03126919, + "epoch": 0.197191227395152, + "flos": 559074637056.0, + "grad_norm": 0.038177798611856564, + "language_loss": 0.89866579, + "learning_rate": 0.0009284784266788864, + "loss": 0.90950203, + "num_input_tokens_seen": 84952464, + "router_z_loss_mlp": 0.52441406, + "step": 1025, + "time_per_iteration": 2.7595441341400146 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105402, + "balance_loss_mlp": 1.05275905, + "epoch": 0.19738360908041555, + "flos": 666250262016.0, + "grad_norm": 0.08120700653890094, + "language_loss": 0.93505025, + "learning_rate": 0.0009283177782975512, + "loss": 0.94610423, + "num_input_tokens_seen": 85031488, + "router_z_loss_mlp": 0.52734375, + "step": 1026, + "time_per_iteration": 2.9439735412597656 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01158523, + "balance_loss_mlp": 1.10511732, + "epoch": 0.1975759907656791, + "flos": 523511440896.0, + "grad_norm": 0.05175943009769999, + "language_loss": 0.89213437, + "learning_rate": 0.000928156963628507, + "loss": 0.9037196, + "num_input_tokens_seen": 85098384, + "router_z_loss_mlp": 0.53515625, + "step": 1027, + "time_per_iteration": 2.5648727416992188 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124606, + "balance_loss_mlp": 1.0717721, + "epoch": 0.19776837245094267, + "flos": 463485118464.0, + "grad_norm": 0.0380471847687272, + "language_loss": 0.89530945, + "learning_rate": 0.0009279959827341877, + "loss": 0.90655547, + "num_input_tokens_seen": 85172944, + "router_z_loss_mlp": 0.52929688, + "step": 1028, + "time_per_iteration": 2.7482099533081055 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01114747, + "balance_loss_mlp": 1.0622474, + "epoch": 0.19796075413620623, + "flos": 504058367232.0, + "grad_norm": 0.038077776452832945, + "language_loss": 0.88821751, + "learning_rate": 0.0009278348356770915, + "loss": 0.89936495, + "num_input_tokens_seen": 85241632, + "router_z_loss_mlp": 0.52587891, + "step": 1029, + "time_per_iteration": 2.5559866428375244 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125843, + "balance_loss_mlp": 1.07362974, + "epoch": 0.1981531358214698, + "flos": 508571689728.0, + "grad_norm": 0.03906482091144459, + "language_loss": 0.87010926, + "learning_rate": 0.0009276735225197814, + "loss": 0.88136768, + "num_input_tokens_seen": 85308992, + "router_z_loss_mlp": 0.52294922, + "step": 1030, + "time_per_iteration": 2.598353862762451 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116206, + "balance_loss_mlp": 1.06418335, + "epoch": 0.19834551750673335, + "flos": 532640153088.0, + "grad_norm": 0.039761606091750314, + "language_loss": 0.8715511, + "learning_rate": 0.0009275120433248847, + "loss": 0.88271314, + "num_input_tokens_seen": 85381936, + "router_z_loss_mlp": 0.52099609, + "step": 1031, + "time_per_iteration": 2.691051483154297 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105688, + "balance_loss_mlp": 1.05414224, + "epoch": 0.1985378991919969, + "flos": 776971027200.0, + "grad_norm": 0.03650424605094363, + "language_loss": 0.87217546, + "learning_rate": 0.0009273503981550931, + "loss": 0.88323236, + "num_input_tokens_seen": 85474352, + "router_z_loss_mlp": 0.51611328, + "step": 1032, + "time_per_iteration": 3.05829119682312 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094626, + "balance_loss_mlp": 1.04336572, + "epoch": 0.1987302808772605, + "flos": 435192037632.0, + "grad_norm": 0.04492232470085823, + "language_loss": 0.88675368, + "learning_rate": 0.0009271885870731626, + "loss": 0.89769995, + "num_input_tokens_seen": 85538416, + "router_z_loss_mlp": 0.51318359, + "step": 1033, + "time_per_iteration": 2.5097644329071045 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091384, + "balance_loss_mlp": 1.04036272, + "epoch": 0.19892266256252406, + "flos": 554654633472.0, + "grad_norm": 0.041410721104386976, + "language_loss": 0.89478087, + "learning_rate": 0.0009270266101419143, + "loss": 0.90569472, + "num_input_tokens_seen": 85604416, + "router_z_loss_mlp": 0.51074219, + "step": 1034, + "time_per_iteration": 2.6359710693359375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091336, + "balance_loss_mlp": 1.04026711, + "epoch": 0.19911504424778761, + "flos": 550949100288.0, + "grad_norm": 0.034987230226667505, + "language_loss": 0.86329561, + "learning_rate": 0.0009268644674242328, + "loss": 0.87420899, + "num_input_tokens_seen": 85677008, + "router_z_loss_mlp": 0.51123047, + "step": 1035, + "time_per_iteration": 2.679041624069214 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091574, + "balance_loss_mlp": 1.04045713, + "epoch": 0.19930742593305117, + "flos": 519313068288.0, + "grad_norm": 0.035495194235479824, + "language_loss": 0.81977046, + "learning_rate": 0.0009267021589830678, + "loss": 0.83068615, + "num_input_tokens_seen": 85745200, + "router_z_loss_mlp": 0.51171875, + "step": 1036, + "time_per_iteration": 2.6109251976013184 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01330025, + "balance_loss_mlp": 1.27871704, + "epoch": 0.19949980761831473, + "flos": 1512640717824.0, + "grad_norm": 0.0530000786951376, + "language_loss": 0.77627081, + "learning_rate": 0.0009265396848814328, + "loss": 0.78957105, + "num_input_tokens_seen": 85980608, + "router_z_loss_mlp": 0.51367188, + "step": 1037, + "time_per_iteration": 5.041083097457886 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097572, + "balance_loss_mlp": 1.04635978, + "epoch": 0.1996921893035783, + "flos": 699440634624.0, + "grad_norm": 0.03827221066614039, + "language_loss": 0.93735194, + "learning_rate": 0.000926377045182406, + "loss": 0.94832766, + "num_input_tokens_seen": 86055952, + "router_z_loss_mlp": 0.51269531, + "step": 1038, + "time_per_iteration": 2.921194314956665 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106072, + "balance_loss_mlp": 1.05443072, + "epoch": 0.19988457098884185, + "flos": 728395696128.0, + "grad_norm": 0.0388450926907903, + "language_loss": 0.89164472, + "learning_rate": 0.0009262142399491296, + "loss": 0.90270543, + "num_input_tokens_seen": 86145536, + "router_z_loss_mlp": 0.51708984, + "step": 1039, + "time_per_iteration": 3.0543293952941895 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102477, + "balance_loss_mlp": 1.05093122, + "epoch": 0.2000769526741054, + "flos": 561625350144.0, + "grad_norm": 0.04341407711707897, + "language_loss": 0.8911137, + "learning_rate": 0.0009260512692448105, + "loss": 0.90213847, + "num_input_tokens_seen": 86214480, + "router_z_loss_mlp": 0.51611328, + "step": 1040, + "time_per_iteration": 2.6906111240386963 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097091, + "balance_loss_mlp": 1.04549766, + "epoch": 0.200269334359369, + "flos": 573165769728.0, + "grad_norm": 0.03433464693573298, + "language_loss": 0.85109496, + "learning_rate": 0.000925888133132719, + "loss": 0.86206591, + "num_input_tokens_seen": 86289824, + "router_z_loss_mlp": 0.51660156, + "step": 1041, + "time_per_iteration": 2.77327561378479 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112812, + "balance_loss_mlp": 1.06465149, + "epoch": 0.20046171604463256, + "flos": 1489155500544.0, + "grad_norm": 0.023433110981570023, + "language_loss": 0.79610431, + "learning_rate": 0.0009257248316761906, + "loss": 0.8072325, + "num_input_tokens_seen": 86516384, + "router_z_loss_mlp": 0.48144531, + "step": 1042, + "time_per_iteration": 4.926042318344116 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116989, + "balance_loss_mlp": 1.06525254, + "epoch": 0.20065409772989612, + "flos": 497578544640.0, + "grad_norm": 0.04254485219096875, + "language_loss": 0.82304472, + "learning_rate": 0.0009255613649386244, + "loss": 0.83421457, + "num_input_tokens_seen": 86587296, + "router_z_loss_mlp": 0.51806641, + "step": 1043, + "time_per_iteration": 2.6593456268310547 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0111366, + "balance_loss_mlp": 1.06144655, + "epoch": 0.20084647941515968, + "flos": 580464075264.0, + "grad_norm": 0.040062947145422745, + "language_loss": 0.79980814, + "learning_rate": 0.0009253977329834838, + "loss": 0.81094474, + "num_input_tokens_seen": 86662656, + "router_z_loss_mlp": 0.52294922, + "step": 1044, + "time_per_iteration": 2.765777111053467 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110762, + "balance_loss_mlp": 1.0584054, + "epoch": 0.20103886110042324, + "flos": 643288986624.0, + "grad_norm": 0.040441822708095716, + "language_loss": 0.87291706, + "learning_rate": 0.0009252339358742965, + "loss": 0.88402474, + "num_input_tokens_seen": 86734704, + "router_z_loss_mlp": 0.52441406, + "step": 1045, + "time_per_iteration": 2.825388193130493 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105894, + "balance_loss_mlp": 1.05353701, + "epoch": 0.2012312427856868, + "flos": 442970543616.0, + "grad_norm": 0.03567593499019723, + "language_loss": 0.84250462, + "learning_rate": 0.000925069973674654, + "loss": 0.85356355, + "num_input_tokens_seen": 86806512, + "router_z_loss_mlp": 0.52441406, + "step": 1046, + "time_per_iteration": 2.609393358230591 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103182, + "balance_loss_mlp": 1.05082524, + "epoch": 0.20142362447095036, + "flos": 555473116416.0, + "grad_norm": 0.03147198417726023, + "language_loss": 0.89562172, + "learning_rate": 0.000924905846448212, + "loss": 0.90665352, + "num_input_tokens_seen": 86883440, + "router_z_loss_mlp": 0.52441406, + "step": 1047, + "time_per_iteration": 2.7771337032318115 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108941, + "balance_loss_mlp": 1.0364331, + "epoch": 0.20161600615621392, + "flos": 671555822592.0, + "grad_norm": 0.0352448826174341, + "language_loss": 0.86282432, + "learning_rate": 0.0009247415542586906, + "loss": 0.87371844, + "num_input_tokens_seen": 86960208, + "router_z_loss_mlp": 0.53076172, + "step": 1048, + "time_per_iteration": 2.8992083072662354 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089273, + "balance_loss_mlp": 1.03624833, + "epoch": 0.2018083878414775, + "flos": 574307950848.0, + "grad_norm": 0.02930747529675645, + "language_loss": 0.83574796, + "learning_rate": 0.0009245770971698735, + "loss": 0.84664071, + "num_input_tokens_seen": 87044144, + "router_z_loss_mlp": 0.53125, + "step": 1049, + "time_per_iteration": 2.890824317932129 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092759, + "balance_loss_mlp": 1.03992498, + "epoch": 0.20200076952674106, + "flos": 426795292416.0, + "grad_norm": 0.03785140598382088, + "language_loss": 0.89288604, + "learning_rate": 0.0009244124752456087, + "loss": 0.9038136, + "num_input_tokens_seen": 87109136, + "router_z_loss_mlp": 0.52929688, + "step": 1050, + "time_per_iteration": 2.5022785663604736 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078262, + "balance_loss_mlp": 1.02566695, + "epoch": 0.20219315121200462, + "flos": 537685198848.0, + "grad_norm": 0.03140637951028952, + "language_loss": 0.86254251, + "learning_rate": 0.0009242476885498081, + "loss": 0.87332511, + "num_input_tokens_seen": 87184320, + "router_z_loss_mlp": 0.52685547, + "step": 1051, + "time_per_iteration": 2.732915163040161 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080259, + "balance_loss_mlp": 1.02771127, + "epoch": 0.20238553289726818, + "flos": 478835083776.0, + "grad_norm": 0.042472274730814934, + "language_loss": 0.82148528, + "learning_rate": 0.0009240827371464474, + "loss": 0.83228779, + "num_input_tokens_seen": 87248224, + "router_z_loss_mlp": 0.52636719, + "step": 1052, + "time_per_iteration": 2.577660322189331 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076456, + "balance_loss_mlp": 1.02448094, + "epoch": 0.20257791458253174, + "flos": 1153847596800.0, + "grad_norm": 0.038862673250338535, + "language_loss": 0.85609984, + "learning_rate": 0.0009239176210995666, + "loss": 0.86686444, + "num_input_tokens_seen": 87333088, + "router_z_loss_mlp": 0.52050781, + "step": 1053, + "time_per_iteration": 3.517408609390259 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076589, + "balance_loss_mlp": 1.02485228, + "epoch": 0.2027702962677953, + "flos": 668149688064.0, + "grad_norm": 0.03591644261584591, + "language_loss": 0.94691521, + "learning_rate": 0.0009237523404732695, + "loss": 0.95768112, + "num_input_tokens_seen": 87413840, + "router_z_loss_mlp": 0.51806641, + "step": 1054, + "time_per_iteration": 2.9073944091796875 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010707, + "balance_loss_mlp": 1.01934481, + "epoch": 0.20296267795305886, + "flos": 642453007104.0, + "grad_norm": 0.03829830750428097, + "language_loss": 0.85043323, + "learning_rate": 0.0009235868953317235, + "loss": 0.86114025, + "num_input_tokens_seen": 87487168, + "router_z_loss_mlp": 0.51416016, + "step": 1055, + "time_per_iteration": 2.8769731521606445 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063995, + "balance_loss_mlp": 1.01249659, + "epoch": 0.20315505963832242, + "flos": 932130967296.0, + "grad_norm": 0.03371739794492534, + "language_loss": 0.86243355, + "learning_rate": 0.0009234212857391602, + "loss": 0.87307346, + "num_input_tokens_seen": 87573184, + "router_z_loss_mlp": 0.515625, + "step": 1056, + "time_per_iteration": 3.1701345443725586 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062013, + "balance_loss_mlp": 1.01075327, + "epoch": 0.20334744132358598, + "flos": 563288560896.0, + "grad_norm": 0.028023058598955305, + "language_loss": 0.9034453, + "learning_rate": 0.000923255511759875, + "loss": 0.91406548, + "num_input_tokens_seen": 87651968, + "router_z_loss_mlp": 0.51318359, + "step": 1057, + "time_per_iteration": 2.8186585903167725 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105938, + "balance_loss_mlp": 1.00840592, + "epoch": 0.20353982300884957, + "flos": 645429485568.0, + "grad_norm": 0.03599363132321351, + "language_loss": 0.85699975, + "learning_rate": 0.000923089573458227, + "loss": 0.86759359, + "num_input_tokens_seen": 87727792, + "router_z_loss_mlp": 0.51025391, + "step": 1058, + "time_per_iteration": 2.829428195953369 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063312, + "balance_loss_mlp": 1.01248097, + "epoch": 0.20373220469411313, + "flos": 652706403840.0, + "grad_norm": 0.03721325608628497, + "language_loss": 0.84890962, + "learning_rate": 0.0009229234708986392, + "loss": 0.85954273, + "num_input_tokens_seen": 87806048, + "router_z_loss_mlp": 0.50878906, + "step": 1059, + "time_per_iteration": 2.9125583171844482 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01119614, + "balance_loss_mlp": 1.06964111, + "epoch": 0.2039245863793767, + "flos": 1440399367680.0, + "grad_norm": 0.026200157549973457, + "language_loss": 0.81666899, + "learning_rate": 0.0009227572041455982, + "loss": 0.82786512, + "num_input_tokens_seen": 88018160, + "router_z_loss_mlp": 0.49902344, + "step": 1060, + "time_per_iteration": 4.70502233505249 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105542, + "balance_loss_mlp": 1.00468493, + "epoch": 0.20411696806464025, + "flos": 598128538368.0, + "grad_norm": 0.03644056871626998, + "language_loss": 0.85909504, + "learning_rate": 0.0009225907732636548, + "loss": 0.86964923, + "num_input_tokens_seen": 88090864, + "router_z_loss_mlp": 0.5078125, + "step": 1061, + "time_per_iteration": 2.7681198120117188 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01057413, + "balance_loss_mlp": 1.00672543, + "epoch": 0.2043093497499038, + "flos": 574897999872.0, + "grad_norm": 0.03243635340085092, + "language_loss": 0.87862682, + "learning_rate": 0.0009224241783174227, + "loss": 0.88920105, + "num_input_tokens_seen": 88161360, + "router_z_loss_mlp": 0.50732422, + "step": 1062, + "time_per_iteration": 2.682659864425659 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01058744, + "balance_loss_mlp": 1.00819898, + "epoch": 0.20450173143516737, + "flos": 631524990720.0, + "grad_norm": 0.033151959510572516, + "language_loss": 0.86810422, + "learning_rate": 0.0009222574193715802, + "loss": 0.87869167, + "num_input_tokens_seen": 88234960, + "router_z_loss_mlp": 0.50585938, + "step": 1063, + "time_per_iteration": 2.7470076084136963 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01057209, + "balance_loss_mlp": 1.00656855, + "epoch": 0.20469411312043093, + "flos": 575147821056.0, + "grad_norm": 0.03442752078644266, + "language_loss": 0.86910367, + "learning_rate": 0.000922090496490869, + "loss": 0.87967575, + "num_input_tokens_seen": 88308176, + "router_z_loss_mlp": 0.50683594, + "step": 1064, + "time_per_iteration": 2.789161443710327 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055147, + "balance_loss_mlp": 1.00465047, + "epoch": 0.20488649480569449, + "flos": 638280879360.0, + "grad_norm": 0.029149473365885022, + "language_loss": 0.90671569, + "learning_rate": 0.0009219234097400937, + "loss": 0.91726714, + "num_input_tokens_seen": 88386768, + "router_z_loss_mlp": 0.50537109, + "step": 1065, + "time_per_iteration": 2.8469130992889404 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055191, + "balance_loss_mlp": 1.00483656, + "epoch": 0.20507887649095807, + "flos": 977439169536.0, + "grad_norm": 0.03225683406068631, + "language_loss": 0.83590472, + "learning_rate": 0.0009217561591841237, + "loss": 0.84645659, + "num_input_tokens_seen": 88476576, + "router_z_loss_mlp": 0.50390625, + "step": 1066, + "time_per_iteration": 3.331498622894287 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105396, + "balance_loss_mlp": 1.00332034, + "epoch": 0.20527125817622163, + "flos": 487156006656.0, + "grad_norm": 0.037421781664849635, + "language_loss": 0.81758374, + "learning_rate": 0.0009215887448878913, + "loss": 0.82812333, + "num_input_tokens_seen": 88541968, + "router_z_loss_mlp": 0.50683594, + "step": 1067, + "time_per_iteration": 2.5782346725463867 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054496, + "balance_loss_mlp": 1.00414193, + "epoch": 0.2054636398614852, + "flos": 528211401216.0, + "grad_norm": 0.031680985043262715, + "language_loss": 0.86063826, + "learning_rate": 0.0009214211669163922, + "loss": 0.87118322, + "num_input_tokens_seen": 88615296, + "router_z_loss_mlp": 0.50390625, + "step": 1068, + "time_per_iteration": 2.689772129058838 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054583, + "balance_loss_mlp": 1.00403798, + "epoch": 0.20565602154674875, + "flos": 559324458240.0, + "grad_norm": 0.03119808154519671, + "language_loss": 0.94868428, + "learning_rate": 0.0009212534253346862, + "loss": 0.95923012, + "num_input_tokens_seen": 88691584, + "router_z_loss_mlp": 0.50585938, + "step": 1069, + "time_per_iteration": 2.760840654373169 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060819, + "balance_loss_mlp": 1.01027393, + "epoch": 0.2058484032320123, + "flos": 505221935616.0, + "grad_norm": 0.042999288209875815, + "language_loss": 0.85068119, + "learning_rate": 0.0009210855202078964, + "loss": 0.86128938, + "num_input_tokens_seen": 88756592, + "router_z_loss_mlp": 0.50585938, + "step": 1070, + "time_per_iteration": 2.6273016929626465 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01057609, + "balance_loss_mlp": 1.00687337, + "epoch": 0.20604078491727587, + "flos": 434047911168.0, + "grad_norm": 0.03672139626538296, + "language_loss": 0.88035965, + "learning_rate": 0.0009209174516012091, + "loss": 0.89093566, + "num_input_tokens_seen": 88820928, + "router_z_loss_mlp": 0.5078125, + "step": 1071, + "time_per_iteration": 2.5263099670410156 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055827, + "balance_loss_mlp": 1.0049957, + "epoch": 0.20623316660253943, + "flos": 609875037696.0, + "grad_norm": 0.03118890610347894, + "language_loss": 0.89938867, + "learning_rate": 0.0009207492195798747, + "loss": 0.90994692, + "num_input_tokens_seen": 88895440, + "router_z_loss_mlp": 0.50878906, + "step": 1072, + "time_per_iteration": 2.773094654083252 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059091, + "balance_loss_mlp": 1.00816524, + "epoch": 0.206425548287803, + "flos": 481394545152.0, + "grad_norm": 0.034846135669383375, + "language_loss": 0.85408926, + "learning_rate": 0.0009205808242092061, + "loss": 0.86468017, + "num_input_tokens_seen": 88964400, + "router_z_loss_mlp": 0.50976562, + "step": 1073, + "time_per_iteration": 2.6704161167144775 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061083, + "balance_loss_mlp": 1.01025188, + "epoch": 0.20661792997306658, + "flos": 951124249344.0, + "grad_norm": 0.036438983488896924, + "language_loss": 0.83303434, + "learning_rate": 0.0009204122655545808, + "loss": 0.84364516, + "num_input_tokens_seen": 89049600, + "router_z_loss_mlp": 0.50878906, + "step": 1074, + "time_per_iteration": 3.3605480194091797 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059315, + "balance_loss_mlp": 1.00857949, + "epoch": 0.20681031165833014, + "flos": 604617109248.0, + "grad_norm": 0.03238632395719984, + "language_loss": 0.81744164, + "learning_rate": 0.0009202435436814388, + "loss": 0.82803476, + "num_input_tokens_seen": 89119024, + "router_z_loss_mlp": 0.5078125, + "step": 1075, + "time_per_iteration": 2.6966288089752197 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106261, + "balance_loss_mlp": 1.01163661, + "epoch": 0.2070026933435937, + "flos": 710266583808.0, + "grad_norm": 0.03297439165012413, + "language_loss": 0.90137285, + "learning_rate": 0.0009200746586552836, + "loss": 0.91199899, + "num_input_tokens_seen": 89197344, + "router_z_loss_mlp": 0.51025391, + "step": 1076, + "time_per_iteration": 2.919851779937744 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01057537, + "balance_loss_mlp": 1.00675428, + "epoch": 0.20719507502885726, + "flos": 831255330048.0, + "grad_norm": 0.031928056401627374, + "language_loss": 0.84964621, + "learning_rate": 0.0009199056105416825, + "loss": 0.86022151, + "num_input_tokens_seen": 89280464, + "router_z_loss_mlp": 0.50830078, + "step": 1077, + "time_per_iteration": 3.0944886207580566 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059646, + "balance_loss_mlp": 1.00881469, + "epoch": 0.20738745671412082, + "flos": 639500828160.0, + "grad_norm": 0.033227407694906064, + "language_loss": 0.87196565, + "learning_rate": 0.0009197363994062654, + "loss": 0.88256204, + "num_input_tokens_seen": 89353344, + "router_z_loss_mlp": 0.50878906, + "step": 1078, + "time_per_iteration": 2.8505265712738037 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059879, + "balance_loss_mlp": 1.00933433, + "epoch": 0.20757983839938438, + "flos": 686984522496.0, + "grad_norm": 0.03258152966614613, + "language_loss": 0.84972161, + "learning_rate": 0.0009195670253147262, + "loss": 0.86032039, + "num_input_tokens_seen": 89439328, + "router_z_loss_mlp": 0.50585938, + "step": 1079, + "time_per_iteration": 3.0077526569366455 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064209, + "balance_loss_mlp": 1.01375961, + "epoch": 0.20777222008464794, + "flos": 520318189056.0, + "grad_norm": 0.03575722766779635, + "language_loss": 0.83075011, + "learning_rate": 0.0009193974883328216, + "loss": 0.84139216, + "num_input_tokens_seen": 89510160, + "router_z_loss_mlp": 0.50488281, + "step": 1080, + "time_per_iteration": 2.6277496814727783 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062434, + "balance_loss_mlp": 1.01212776, + "epoch": 0.2079646017699115, + "flos": 512470663680.0, + "grad_norm": 0.03316952161345372, + "language_loss": 0.87936002, + "learning_rate": 0.0009192277885263718, + "loss": 0.88998437, + "num_input_tokens_seen": 89582960, + "router_z_loss_mlp": 0.50341797, + "step": 1081, + "time_per_iteration": 2.6486003398895264 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01056126, + "balance_loss_mlp": 1.00596321, + "epoch": 0.20815698345517505, + "flos": 933468534528.0, + "grad_norm": 0.031694408237267754, + "language_loss": 0.87043977, + "learning_rate": 0.0009190579259612602, + "loss": 0.881001, + "num_input_tokens_seen": 89675488, + "router_z_loss_mlp": 0.50195312, + "step": 1082, + "time_per_iteration": 3.280133008956909 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062428, + "balance_loss_mlp": 1.01202655, + "epoch": 0.20834936514043864, + "flos": 633554674176.0, + "grad_norm": 0.03367407497844021, + "language_loss": 0.87446159, + "learning_rate": 0.000918887900703433, + "loss": 0.88508588, + "num_input_tokens_seen": 89747872, + "router_z_loss_mlp": 0.50439453, + "step": 1083, + "time_per_iteration": 2.7914657592773438 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060649, + "balance_loss_mlp": 1.01024699, + "epoch": 0.2085417468257022, + "flos": 395243831040.0, + "grad_norm": 0.03354838448754016, + "language_loss": 0.91036344, + "learning_rate": 0.0009187177128188999, + "loss": 0.92096996, + "num_input_tokens_seen": 89810176, + "router_z_loss_mlp": 0.50439453, + "step": 1084, + "time_per_iteration": 2.4803311824798584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107357, + "balance_loss_mlp": 1.02455139, + "epoch": 0.20873412851096576, + "flos": 1405197775104.0, + "grad_norm": 0.012085868941934568, + "language_loss": 0.77156538, + "learning_rate": 0.0009185473623737339, + "loss": 0.78230107, + "num_input_tokens_seen": 90038432, + "router_z_loss_mlp": 0.48925781, + "step": 1085, + "time_per_iteration": 4.883121728897095 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055927, + "balance_loss_mlp": 1.00562024, + "epoch": 0.20892651019622932, + "flos": 448762140672.0, + "grad_norm": 0.03493036575467998, + "language_loss": 0.8691588, + "learning_rate": 0.000918376849434071, + "loss": 0.87971807, + "num_input_tokens_seen": 90101568, + "router_z_loss_mlp": 0.50317383, + "step": 1086, + "time_per_iteration": 2.537820816040039 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065129, + "balance_loss_mlp": 1.01444149, + "epoch": 0.20911889188149288, + "flos": 494081036544.0, + "grad_norm": 0.040745363066357655, + "language_loss": 0.91673005, + "learning_rate": 0.0009182061740661098, + "loss": 0.9273814, + "num_input_tokens_seen": 90169344, + "router_z_loss_mlp": 0.50732422, + "step": 1087, + "time_per_iteration": 2.5920886993408203 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01056749, + "balance_loss_mlp": 1.00615633, + "epoch": 0.20931127356675644, + "flos": 842750062848.0, + "grad_norm": 0.02822254108426211, + "language_loss": 0.85810733, + "learning_rate": 0.0009180353363361127, + "loss": 0.86867487, + "num_input_tokens_seen": 90252416, + "router_z_loss_mlp": 0.50634766, + "step": 1088, + "time_per_iteration": 3.1376798152923584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060338, + "balance_loss_mlp": 1.00979316, + "epoch": 0.20950365525202, + "flos": 758525019648.0, + "grad_norm": 0.03922038165748564, + "language_loss": 0.83160806, + "learning_rate": 0.0009178643363104044, + "loss": 0.84221143, + "num_input_tokens_seen": 90337952, + "router_z_loss_mlp": 0.50585938, + "step": 1089, + "time_per_iteration": 3.124352216720581 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059844, + "balance_loss_mlp": 1.00939417, + "epoch": 0.20969603693728356, + "flos": 473492584704.0, + "grad_norm": 0.04272734591158297, + "language_loss": 0.920385, + "learning_rate": 0.0009176931740553735, + "loss": 0.93098342, + "num_input_tokens_seen": 90401488, + "router_z_loss_mlp": 0.50488281, + "step": 1090, + "time_per_iteration": 2.556528091430664 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067623, + "balance_loss_mlp": 1.01731646, + "epoch": 0.20988841862254715, + "flos": 978628982784.0, + "grad_norm": 0.03590255199570226, + "language_loss": 0.83530974, + "learning_rate": 0.0009175218496374708, + "loss": 0.84598601, + "num_input_tokens_seen": 90486144, + "router_z_loss_mlp": 0.50341797, + "step": 1091, + "time_per_iteration": 3.328984260559082 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059931, + "balance_loss_mlp": 1.00976801, + "epoch": 0.2100808003078107, + "flos": 1094819592192.0, + "grad_norm": 0.03766723451938342, + "language_loss": 0.86626744, + "learning_rate": 0.0009173503631232103, + "loss": 0.87686676, + "num_input_tokens_seen": 90571504, + "router_z_loss_mlp": 0.50170898, + "step": 1092, + "time_per_iteration": 3.4216480255126953 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01058636, + "balance_loss_mlp": 1.00832939, + "epoch": 0.21027318199307427, + "flos": 1014560596992.0, + "grad_norm": 0.047058286401960234, + "language_loss": 0.82703817, + "learning_rate": 0.0009171787145791691, + "loss": 0.83762449, + "num_input_tokens_seen": 90646016, + "router_z_loss_mlp": 0.50341797, + "step": 1093, + "time_per_iteration": 3.2454655170440674 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059608, + "balance_loss_mlp": 1.00911129, + "epoch": 0.21046556367833782, + "flos": 522413001216.0, + "grad_norm": 0.043211200123957835, + "language_loss": 0.80955076, + "learning_rate": 0.000917006904071987, + "loss": 0.8201468, + "num_input_tokens_seen": 90713440, + "router_z_loss_mlp": 0.50537109, + "step": 1094, + "time_per_iteration": 2.6560592651367188 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061033, + "balance_loss_mlp": 1.01053584, + "epoch": 0.21065794536360138, + "flos": 604840685568.0, + "grad_norm": 0.03488627405352903, + "language_loss": 0.87964189, + "learning_rate": 0.0009168349316683669, + "loss": 0.89025223, + "num_input_tokens_seen": 90788208, + "router_z_loss_mlp": 0.50537109, + "step": 1095, + "time_per_iteration": 2.794358253479004 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106293, + "balance_loss_mlp": 1.01243329, + "epoch": 0.21085032704886494, + "flos": 604558783488.0, + "grad_norm": 0.031199931973452354, + "language_loss": 0.82918072, + "learning_rate": 0.0009166627974350741, + "loss": 0.83981001, + "num_input_tokens_seen": 90873776, + "router_z_loss_mlp": 0.50537109, + "step": 1096, + "time_per_iteration": 2.89837384223938 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062823, + "balance_loss_mlp": 1.01242077, + "epoch": 0.2110427087341285, + "flos": 638832044544.0, + "grad_norm": 0.03623978918327459, + "language_loss": 0.90394479, + "learning_rate": 0.0009164905014389373, + "loss": 0.91457301, + "num_input_tokens_seen": 90945872, + "router_z_loss_mlp": 0.50439453, + "step": 1097, + "time_per_iteration": 2.79203462600708 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055619, + "balance_loss_mlp": 1.00559878, + "epoch": 0.21123509041939206, + "flos": 523930403328.0, + "grad_norm": 0.03351990521185014, + "language_loss": 0.87381279, + "learning_rate": 0.0009163180437468476, + "loss": 0.88436902, + "num_input_tokens_seen": 91016224, + "router_z_loss_mlp": 0.50024414, + "step": 1098, + "time_per_iteration": 2.6110002994537354 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01056208, + "balance_loss_mlp": 1.00647402, + "epoch": 0.21142747210465565, + "flos": 452194520064.0, + "grad_norm": 0.03619268995909484, + "language_loss": 0.86631316, + "learning_rate": 0.000916145424425759, + "loss": 0.87687522, + "num_input_tokens_seen": 91086752, + "router_z_loss_mlp": 0.49658203, + "step": 1099, + "time_per_iteration": 2.67106294631958 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060107, + "balance_loss_mlp": 1.01027727, + "epoch": 0.2116198537899192, + "flos": 877626978816.0, + "grad_norm": 0.042483916895571405, + "language_loss": 0.91832745, + "learning_rate": 0.0009159726435426885, + "loss": 0.92892849, + "num_input_tokens_seen": 91162960, + "router_z_loss_mlp": 0.49780273, + "step": 1100, + "time_per_iteration": 3.095250129699707 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052771, + "balance_loss_mlp": 1.00275087, + "epoch": 0.21181223547518277, + "flos": 524675009280.0, + "grad_norm": 0.035590136232614346, + "language_loss": 0.91126454, + "learning_rate": 0.0009157997011647154, + "loss": 0.92179227, + "num_input_tokens_seen": 91229840, + "router_z_loss_mlp": 0.49926758, + "step": 1101, + "time_per_iteration": 2.61954665184021 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01056268, + "balance_loss_mlp": 1.00615227, + "epoch": 0.21200461716044633, + "flos": 573426284544.0, + "grad_norm": 0.03167271765745466, + "language_loss": 0.86759949, + "learning_rate": 0.0009156265973589817, + "loss": 0.87816215, + "num_input_tokens_seen": 91307936, + "router_z_loss_mlp": 0.50146484, + "step": 1102, + "time_per_iteration": 2.7851946353912354 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053549, + "balance_loss_mlp": 1.00348067, + "epoch": 0.2121969988457099, + "flos": 546175262976.0, + "grad_norm": 0.033324702660241096, + "language_loss": 0.90598941, + "learning_rate": 0.0009154533321926926, + "loss": 0.91652489, + "num_input_tokens_seen": 91372848, + "router_z_loss_mlp": 0.50073242, + "step": 1103, + "time_per_iteration": 2.658358573913574 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01056655, + "balance_loss_mlp": 1.00663483, + "epoch": 0.21238938053097345, + "flos": 845355211008.0, + "grad_norm": 0.03290940631262569, + "language_loss": 0.88234645, + "learning_rate": 0.0009152799057331156, + "loss": 0.89291298, + "num_input_tokens_seen": 91452768, + "router_z_loss_mlp": 0.50024414, + "step": 1104, + "time_per_iteration": 3.1174561977386475 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01056205, + "balance_loss_mlp": 1.00623202, + "epoch": 0.212581762216237, + "flos": 447142671360.0, + "grad_norm": 0.035279899791186564, + "language_loss": 0.91767001, + "learning_rate": 0.0009151063180475805, + "loss": 0.92823207, + "num_input_tokens_seen": 91519888, + "router_z_loss_mlp": 0.5, + "step": 1105, + "time_per_iteration": 2.538922071456909 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054823, + "balance_loss_mlp": 1.00489795, + "epoch": 0.21277414390150057, + "flos": 515385904128.0, + "grad_norm": 0.03737857831356842, + "language_loss": 0.85410213, + "learning_rate": 0.0009149325692034803, + "loss": 0.86465037, + "num_input_tokens_seen": 91585744, + "router_z_loss_mlp": 0.49853516, + "step": 1106, + "time_per_iteration": 2.588087558746338 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055756, + "balance_loss_mlp": 1.00788116, + "epoch": 0.21296652558676413, + "flos": 1488514907136.0, + "grad_norm": 0.005769411809131762, + "language_loss": 0.79203427, + "learning_rate": 0.0009147586592682702, + "loss": 0.80259192, + "num_input_tokens_seen": 91805840, + "router_z_loss_mlp": 0.47851562, + "step": 1107, + "time_per_iteration": 4.901995658874512 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055609, + "balance_loss_mlp": 1.00596976, + "epoch": 0.21315890727202771, + "flos": 847451968512.0, + "grad_norm": 0.03679321288402367, + "language_loss": 0.87994891, + "learning_rate": 0.0009145845883094678, + "loss": 0.89050496, + "num_input_tokens_seen": 91885936, + "router_z_loss_mlp": 0.49584961, + "step": 1108, + "time_per_iteration": 3.034179925918579 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01057835, + "balance_loss_mlp": 1.00833917, + "epoch": 0.21335128895729127, + "flos": 630556808448.0, + "grad_norm": 0.040833312538100186, + "language_loss": 0.86006308, + "learning_rate": 0.000914410356394654, + "loss": 0.87064135, + "num_input_tokens_seen": 91959888, + "router_z_loss_mlp": 0.49438477, + "step": 1109, + "time_per_iteration": 2.793839931488037 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01058379, + "balance_loss_mlp": 1.00878823, + "epoch": 0.21354367064255483, + "flos": 712285573632.0, + "grad_norm": 0.029526159769499145, + "language_loss": 0.85111213, + "learning_rate": 0.0009142359635914709, + "loss": 0.86169595, + "num_input_tokens_seen": 92043728, + "router_z_loss_mlp": 0.49560547, + "step": 1110, + "time_per_iteration": 3.0403430461883545 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063298, + "balance_loss_mlp": 1.01375508, + "epoch": 0.2137360523278184, + "flos": 457211375616.0, + "grad_norm": 0.03547311640481051, + "language_loss": 0.85051197, + "learning_rate": 0.0009140614099676245, + "loss": 0.8611449, + "num_input_tokens_seen": 92114096, + "router_z_loss_mlp": 0.49414062, + "step": 1111, + "time_per_iteration": 2.6027371883392334 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054693, + "balance_loss_mlp": 1.00495887, + "epoch": 0.21392843401308195, + "flos": 667266076416.0, + "grad_norm": 0.03139007596896344, + "language_loss": 0.8342849, + "learning_rate": 0.0009138866955908821, + "loss": 0.84483182, + "num_input_tokens_seen": 92193552, + "router_z_loss_mlp": 0.49658203, + "step": 1112, + "time_per_iteration": 2.924180269241333 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055191, + "balance_loss_mlp": 1.00517082, + "epoch": 0.2141208156983455, + "flos": 750362544384.0, + "grad_norm": 0.03405304612319473, + "language_loss": 0.81477892, + "learning_rate": 0.0009137118205290738, + "loss": 0.82533085, + "num_input_tokens_seen": 92279248, + "router_z_loss_mlp": 0.49951172, + "step": 1113, + "time_per_iteration": 2.956289768218994 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01057131, + "balance_loss_mlp": 1.00711048, + "epoch": 0.21431319738360907, + "flos": 420011213568.0, + "grad_norm": 0.037812047895131755, + "language_loss": 0.90930229, + "learning_rate": 0.0009135367848500924, + "loss": 0.9198736, + "num_input_tokens_seen": 92344064, + "router_z_loss_mlp": 0.49975586, + "step": 1114, + "time_per_iteration": 2.5228912830352783 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106081, + "balance_loss_mlp": 1.01079023, + "epoch": 0.21450557906887263, + "flos": 610239565056.0, + "grad_norm": 0.04455846969282107, + "language_loss": 0.87261575, + "learning_rate": 0.0009133615886218927, + "loss": 0.88322389, + "num_input_tokens_seen": 92410544, + "router_z_loss_mlp": 0.5, + "step": 1115, + "time_per_iteration": 2.7146785259246826 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105379, + "balance_loss_mlp": 1.00367427, + "epoch": 0.21469796075413622, + "flos": 562975556352.0, + "grad_norm": 0.04025415931658291, + "language_loss": 0.88754129, + "learning_rate": 0.0009131862319124917, + "loss": 0.89807916, + "num_input_tokens_seen": 92480272, + "router_z_loss_mlp": 0.50097656, + "step": 1116, + "time_per_iteration": 2.702315092086792 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01058084, + "balance_loss_mlp": 1.0081588, + "epoch": 0.21489034243939978, + "flos": 595738218240.0, + "grad_norm": 0.036347556106983744, + "language_loss": 0.84819156, + "learning_rate": 0.0009130107147899691, + "loss": 0.8587724, + "num_input_tokens_seen": 92555584, + "router_z_loss_mlp": 0.49902344, + "step": 1117, + "time_per_iteration": 2.705153226852417 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055436, + "balance_loss_mlp": 1.00555849, + "epoch": 0.21508272412466334, + "flos": 442850979840.0, + "grad_norm": 0.032390780355026266, + "language_loss": 0.85796201, + "learning_rate": 0.0009128350373224665, + "loss": 0.86851633, + "num_input_tokens_seen": 92623136, + "router_z_loss_mlp": 0.49804688, + "step": 1118, + "time_per_iteration": 2.5689737796783447 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055817, + "balance_loss_mlp": 1.00775146, + "epoch": 0.2152751058099269, + "flos": 1499234898432.0, + "grad_norm": 0.005802610423144338, + "language_loss": 0.81456429, + "learning_rate": 0.0009126591995781883, + "loss": 0.82512248, + "num_input_tokens_seen": 92842608, + "router_z_loss_mlp": 0.48046875, + "step": 1119, + "time_per_iteration": 4.659603834152222 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054629, + "balance_loss_mlp": 1.00475144, + "epoch": 0.21546748749519046, + "flos": 494992838400.0, + "grad_norm": 0.03550503890551413, + "language_loss": 0.86117166, + "learning_rate": 0.0009124832016254005, + "loss": 0.87171793, + "num_input_tokens_seen": 92912960, + "router_z_loss_mlp": 0.4987793, + "step": 1120, + "time_per_iteration": 2.6080243587493896 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054572, + "balance_loss_mlp": 1.00450444, + "epoch": 0.21565986918045402, + "flos": 635695173120.0, + "grad_norm": 0.03761657282592244, + "language_loss": 0.88987935, + "learning_rate": 0.0009123070435324316, + "loss": 0.90042508, + "num_input_tokens_seen": 92982272, + "router_z_loss_mlp": 0.50097656, + "step": 1121, + "time_per_iteration": 2.8451340198516846 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062664, + "balance_loss_mlp": 1.01450348, + "epoch": 0.21585225086571758, + "flos": 1586801914368.0, + "grad_norm": 0.011675507285583616, + "language_loss": 0.77875781, + "learning_rate": 0.0009121307253676722, + "loss": 0.78938448, + "num_input_tokens_seen": 93218752, + "router_z_loss_mlp": 0.48144531, + "step": 1122, + "time_per_iteration": 5.018117666244507 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055198, + "balance_loss_mlp": 1.00541639, + "epoch": 0.21604463255098114, + "flos": 685323257088.0, + "grad_norm": 0.03443856201457266, + "language_loss": 0.87021005, + "learning_rate": 0.0009119542471995752, + "loss": 0.8807621, + "num_input_tokens_seen": 93293968, + "router_z_loss_mlp": 0.49682617, + "step": 1123, + "time_per_iteration": 2.8631908893585205 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01057142, + "balance_loss_mlp": 1.00755107, + "epoch": 0.2162370142362447, + "flos": 782308668672.0, + "grad_norm": 0.034966150945184314, + "language_loss": 0.82536203, + "learning_rate": 0.0009117776090966554, + "loss": 0.83593345, + "num_input_tokens_seen": 93367088, + "router_z_loss_mlp": 0.49511719, + "step": 1124, + "time_per_iteration": 2.9458060264587402 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01058319, + "balance_loss_mlp": 1.00877571, + "epoch": 0.21642939592150828, + "flos": 1003762838016.0, + "grad_norm": 0.03795033166932298, + "language_loss": 0.87775326, + "learning_rate": 0.0009116008111274899, + "loss": 0.88833648, + "num_input_tokens_seen": 93452944, + "router_z_loss_mlp": 0.49511719, + "step": 1125, + "time_per_iteration": 3.2748866081237793 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053055, + "balance_loss_mlp": 1.00556183, + "epoch": 0.21662177760677184, + "flos": 1485764917248.0, + "grad_norm": 0.008195913283110022, + "language_loss": 0.79106927, + "learning_rate": 0.0009114238533607176, + "loss": 0.8015998, + "num_input_tokens_seen": 93677328, + "router_z_loss_mlp": 0.47460938, + "step": 1126, + "time_per_iteration": 4.803825616836548 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105355, + "balance_loss_mlp": 1.00391161, + "epoch": 0.2168141592920354, + "flos": 888861196800.0, + "grad_norm": 0.03626284425770287, + "language_loss": 0.85553163, + "learning_rate": 0.0009112467358650396, + "loss": 0.86606717, + "num_input_tokens_seen": 93756848, + "router_z_loss_mlp": 0.49609375, + "step": 1127, + "time_per_iteration": 3.155856132507324 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01057313, + "balance_loss_mlp": 1.00753081, + "epoch": 0.21700654097729896, + "flos": 547085119488.0, + "grad_norm": 0.03272511127748384, + "language_loss": 0.87140059, + "learning_rate": 0.0009110694587092192, + "loss": 0.88197374, + "num_input_tokens_seen": 93834704, + "router_z_loss_mlp": 0.49682617, + "step": 1128, + "time_per_iteration": 2.7438507080078125 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01057646, + "balance_loss_mlp": 1.00781655, + "epoch": 0.21719892266256252, + "flos": 510536244480.0, + "grad_norm": 0.0385378102776186, + "language_loss": 0.81826651, + "learning_rate": 0.0009108920219620815, + "loss": 0.82884294, + "num_input_tokens_seen": 93904448, + "router_z_loss_mlp": 0.49829102, + "step": 1129, + "time_per_iteration": 2.6256754398345947 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105661, + "balance_loss_mlp": 1.00682795, + "epoch": 0.21739130434782608, + "flos": 544462474752.0, + "grad_norm": 0.03288593298355655, + "language_loss": 0.9021399, + "learning_rate": 0.0009107144256925133, + "loss": 0.91270602, + "num_input_tokens_seen": 93979312, + "router_z_loss_mlp": 0.49707031, + "step": 1130, + "time_per_iteration": 2.665764808654785 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055495, + "balance_loss_mlp": 1.00566518, + "epoch": 0.21758368603308964, + "flos": 617983077888.0, + "grad_norm": 0.04004849400109536, + "language_loss": 0.83221352, + "learning_rate": 0.0009105366699694638, + "loss": 0.84276843, + "num_input_tokens_seen": 94052032, + "router_z_loss_mlp": 0.49755859, + "step": 1131, + "time_per_iteration": 2.7092785835266113 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055334, + "balance_loss_mlp": 1.0055995, + "epoch": 0.2177760677183532, + "flos": 636335766528.0, + "grad_norm": 0.03327692114185805, + "language_loss": 0.82139939, + "learning_rate": 0.0009103587548619439, + "loss": 0.83195269, + "num_input_tokens_seen": 94124944, + "router_z_loss_mlp": 0.49658203, + "step": 1132, + "time_per_iteration": 2.833617925643921 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055546, + "balance_loss_mlp": 1.00585985, + "epoch": 0.2179684494036168, + "flos": 533597641728.0, + "grad_norm": 0.036557340203022134, + "language_loss": 0.8721149, + "learning_rate": 0.0009101806804390261, + "loss": 0.8826704, + "num_input_tokens_seen": 94200384, + "router_z_loss_mlp": 0.49609375, + "step": 1133, + "time_per_iteration": 2.7880306243896484 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054716, + "balance_loss_mlp": 1.0050298, + "epoch": 0.21816083108888035, + "flos": 476182303488.0, + "grad_norm": 0.03701280834454915, + "language_loss": 0.917292, + "learning_rate": 0.0009100024467698453, + "loss": 0.92783916, + "num_input_tokens_seen": 94266992, + "router_z_loss_mlp": 0.49560547, + "step": 1134, + "time_per_iteration": 2.592986822128296 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054821, + "balance_loss_mlp": 1.00513422, + "epoch": 0.2183532127741439, + "flos": 578547152640.0, + "grad_norm": 0.04183992577645213, + "language_loss": 0.83309305, + "learning_rate": 0.0009098240539235981, + "loss": 0.84364122, + "num_input_tokens_seen": 94334304, + "router_z_loss_mlp": 0.49658203, + "step": 1135, + "time_per_iteration": 2.693387269973755 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055318, + "balance_loss_mlp": 1.00558341, + "epoch": 0.21854559445940747, + "flos": 595280371968.0, + "grad_norm": 0.03379290176549673, + "language_loss": 0.88387418, + "learning_rate": 0.0009096455019695423, + "loss": 0.89442736, + "num_input_tokens_seen": 94413296, + "router_z_loss_mlp": 0.49609375, + "step": 1136, + "time_per_iteration": 2.781304359436035 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059818, + "balance_loss_mlp": 1.0098455, + "epoch": 0.21873797614467103, + "flos": 409549791744.0, + "grad_norm": 0.03874067782032871, + "language_loss": 0.90736896, + "learning_rate": 0.000909466790976998, + "loss": 0.91796714, + "num_input_tokens_seen": 94475840, + "router_z_loss_mlp": 0.49951172, + "step": 1137, + "time_per_iteration": 2.4837231636047363 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055988, + "balance_loss_mlp": 1.00620675, + "epoch": 0.21893035782993459, + "flos": 895655969280.0, + "grad_norm": 0.03281311030157744, + "language_loss": 0.83296013, + "learning_rate": 0.0009092879210153473, + "loss": 0.84352005, + "num_input_tokens_seen": 94555184, + "router_z_loss_mlp": 0.49682617, + "step": 1138, + "time_per_iteration": 3.156329870223999 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01058359, + "balance_loss_mlp": 1.00862455, + "epoch": 0.21912273951519814, + "flos": 468569048064.0, + "grad_norm": 0.03332829582894704, + "language_loss": 0.89480728, + "learning_rate": 0.0009091088921540333, + "loss": 0.90539086, + "num_input_tokens_seen": 94622656, + "router_z_loss_mlp": 0.49731445, + "step": 1139, + "time_per_iteration": 2.5444674491882324 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060322, + "balance_loss_mlp": 1.01197052, + "epoch": 0.2193151212004617, + "flos": 1535180118528.0, + "grad_norm": 0.009447727830516332, + "language_loss": 0.75508678, + "learning_rate": 0.0009089297044625615, + "loss": 0.76569003, + "num_input_tokens_seen": 94856496, + "router_z_loss_mlp": 0.48339844, + "step": 1140, + "time_per_iteration": 4.993603944778442 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105127, + "balance_loss_mlp": 1.00158358, + "epoch": 0.2195075028857253, + "flos": 592275703296.0, + "grad_norm": 0.039648398816974934, + "language_loss": 0.85201681, + "learning_rate": 0.0009087503580104985, + "loss": 0.86252946, + "num_input_tokens_seen": 94926880, + "router_z_loss_mlp": 0.49560547, + "step": 1141, + "time_per_iteration": 2.6736245155334473 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053862, + "balance_loss_mlp": 1.00436676, + "epoch": 0.21969988457098885, + "flos": 637518776832.0, + "grad_norm": 0.03678403810630545, + "language_loss": 0.8005864, + "learning_rate": 0.0009085708528674728, + "loss": 0.81112504, + "num_input_tokens_seen": 95000528, + "router_z_loss_mlp": 0.49414062, + "step": 1142, + "time_per_iteration": 2.799607038497925 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053259, + "balance_loss_mlp": 1.00362051, + "epoch": 0.2198922662562524, + "flos": 913860903936.0, + "grad_norm": 0.040969430424554455, + "language_loss": 0.86853033, + "learning_rate": 0.0009083911891031745, + "loss": 0.87906301, + "num_input_tokens_seen": 95081040, + "router_z_loss_mlp": 0.49487305, + "step": 1143, + "time_per_iteration": 3.1043601036071777 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010518, + "balance_loss_mlp": 1.00235164, + "epoch": 0.22008464794151597, + "flos": 824495550720.0, + "grad_norm": 0.03475506353694162, + "language_loss": 0.91937912, + "learning_rate": 0.0009082113667873553, + "loss": 0.92989707, + "num_input_tokens_seen": 95167328, + "router_z_loss_mlp": 0.4934082, + "step": 1144, + "time_per_iteration": 3.114678144454956 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055626, + "balance_loss_mlp": 1.00636888, + "epoch": 0.22027702962677953, + "flos": 460619455488.0, + "grad_norm": 0.047183367988671336, + "language_loss": 0.91319406, + "learning_rate": 0.0009080313859898283, + "loss": 0.92375034, + "num_input_tokens_seen": 95230304, + "router_z_loss_mlp": 0.49145508, + "step": 1145, + "time_per_iteration": 2.529627799987793 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01058128, + "balance_loss_mlp": 1.00877535, + "epoch": 0.2204694113120431, + "flos": 532288264704.0, + "grad_norm": 0.034289556826903954, + "language_loss": 0.91988164, + "learning_rate": 0.0009078512467804684, + "loss": 0.93046296, + "num_input_tokens_seen": 95299520, + "router_z_loss_mlp": 0.49243164, + "step": 1146, + "time_per_iteration": 2.692556381225586 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01056493, + "balance_loss_mlp": 1.00737858, + "epoch": 0.22066179299730665, + "flos": 523687385088.0, + "grad_norm": 0.03628724645244133, + "language_loss": 0.91349947, + "learning_rate": 0.0009076709492292119, + "loss": 0.9240644, + "num_input_tokens_seen": 95368912, + "router_z_loss_mlp": 0.49023438, + "step": 1147, + "time_per_iteration": 2.6262857913970947 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01056943, + "balance_loss_mlp": 1.00799513, + "epoch": 0.2208541746825702, + "flos": 547506027264.0, + "grad_norm": 0.0383258843164557, + "language_loss": 0.89899343, + "learning_rate": 0.0009074904934060562, + "loss": 0.90956283, + "num_input_tokens_seen": 95440800, + "router_z_loss_mlp": 0.48901367, + "step": 1148, + "time_per_iteration": 2.710716962814331 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054195, + "balance_loss_mlp": 1.00498509, + "epoch": 0.22104655636783377, + "flos": 710060504064.0, + "grad_norm": 0.034028934421108444, + "language_loss": 0.85814822, + "learning_rate": 0.0009073098793810607, + "loss": 0.86869013, + "num_input_tokens_seen": 95519904, + "router_z_loss_mlp": 0.4909668, + "step": 1149, + "time_per_iteration": 2.986891269683838 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01056627, + "balance_loss_mlp": 1.00758433, + "epoch": 0.22123893805309736, + "flos": 585965021952.0, + "grad_norm": 0.03641392016248804, + "language_loss": 0.88886124, + "learning_rate": 0.000907129107224346, + "loss": 0.89942753, + "num_input_tokens_seen": 95591568, + "router_z_loss_mlp": 0.48999023, + "step": 1150, + "time_per_iteration": 2.7348337173461914 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055224, + "balance_loss_mlp": 1.00601482, + "epoch": 0.22143131973836092, + "flos": 493251859968.0, + "grad_norm": 0.02984339906163832, + "language_loss": 0.89448893, + "learning_rate": 0.0009069481770060939, + "loss": 0.90504116, + "num_input_tokens_seen": 95664480, + "router_z_loss_mlp": 0.49121094, + "step": 1151, + "time_per_iteration": 2.688180685043335 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055578, + "balance_loss_mlp": 1.00593948, + "epoch": 0.22162370142362448, + "flos": 1081469174784.0, + "grad_norm": 0.034516826316188534, + "language_loss": 0.8487525, + "learning_rate": 0.000906767088796548, + "loss": 0.85930824, + "num_input_tokens_seen": 95754400, + "router_z_loss_mlp": 0.49584961, + "step": 1152, + "time_per_iteration": 3.4747724533081055 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01057959, + "balance_loss_mlp": 1.00841522, + "epoch": 0.22181608310888803, + "flos": 493512374784.0, + "grad_norm": 0.03114695536209251, + "language_loss": 0.87880313, + "learning_rate": 0.0009065858426660127, + "loss": 0.88938272, + "num_input_tokens_seen": 95826944, + "router_z_loss_mlp": 0.49462891, + "step": 1153, + "time_per_iteration": 2.6112635135650635 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060109, + "balance_loss_mlp": 1.0103749, + "epoch": 0.2220084647941516, + "flos": 725325898752.0, + "grad_norm": 0.04119971901255946, + "language_loss": 0.85662532, + "learning_rate": 0.0009064044386848543, + "loss": 0.86722642, + "num_input_tokens_seen": 95902688, + "router_z_loss_mlp": 0.49658203, + "step": 1154, + "time_per_iteration": 2.893120288848877 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105547, + "balance_loss_mlp": 1.00564086, + "epoch": 0.22220084647941515, + "flos": 490245245952.0, + "grad_norm": 0.04012578927121656, + "language_loss": 0.89651787, + "learning_rate": 0.0009062228769234997, + "loss": 0.9070726, + "num_input_tokens_seen": 95969952, + "router_z_loss_mlp": 0.49731445, + "step": 1155, + "time_per_iteration": 2.544904947280884 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053369, + "balance_loss_mlp": 1.00344408, + "epoch": 0.2223932281646787, + "flos": 537296371968.0, + "grad_norm": 0.03814815821860503, + "language_loss": 0.82016486, + "learning_rate": 0.0009060411574524376, + "loss": 0.83069855, + "num_input_tokens_seen": 96037344, + "router_z_loss_mlp": 0.49804688, + "step": 1156, + "time_per_iteration": 2.6412572860717773 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01056148, + "balance_loss_mlp": 1.00660419, + "epoch": 0.22258560984994227, + "flos": 932968892160.0, + "grad_norm": 0.0415511709861084, + "language_loss": 0.88770878, + "learning_rate": 0.0009058592803422178, + "loss": 0.89827025, + "num_input_tokens_seen": 96115616, + "router_z_loss_mlp": 0.49462891, + "step": 1157, + "time_per_iteration": 4.623233079910278 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055229, + "balance_loss_mlp": 1.00792694, + "epoch": 0.22277799153520586, + "flos": 1202397638400.0, + "grad_norm": 0.007067436666665483, + "language_loss": 0.78710288, + "learning_rate": 0.0009056772456634512, + "loss": 0.79765517, + "num_input_tokens_seen": 96333600, + "router_z_loss_mlp": 0.47265625, + "step": 1158, + "time_per_iteration": 4.805820465087891 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053661, + "balance_loss_mlp": 1.00397491, + "epoch": 0.22297037322046942, + "flos": 502317388800.0, + "grad_norm": 0.032485949168455416, + "language_loss": 0.91067338, + "learning_rate": 0.00090549505348681, + "loss": 0.92121005, + "num_input_tokens_seen": 96402544, + "router_z_loss_mlp": 0.49633789, + "step": 1159, + "time_per_iteration": 2.5877561569213867 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054105, + "balance_loss_mlp": 1.00427544, + "epoch": 0.22316275490573298, + "flos": 754113764352.0, + "grad_norm": 0.0354615562345569, + "language_loss": 0.84617937, + "learning_rate": 0.0009053127038830275, + "loss": 0.85672045, + "num_input_tokens_seen": 96487600, + "router_z_loss_mlp": 0.49731445, + "step": 1160, + "time_per_iteration": 3.0164098739624023 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01057554, + "balance_loss_mlp": 1.00777233, + "epoch": 0.22335513659099654, + "flos": 515804866560.0, + "grad_norm": 0.03692799991821936, + "language_loss": 0.87995219, + "learning_rate": 0.000905130196922898, + "loss": 0.89052767, + "num_input_tokens_seen": 96554912, + "router_z_loss_mlp": 0.49682617, + "step": 1161, + "time_per_iteration": 2.603769063949585 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01058076, + "balance_loss_mlp": 1.00848484, + "epoch": 0.2235475182762601, + "flos": 485508347136.0, + "grad_norm": 0.031071089964746976, + "language_loss": 0.8758713, + "learning_rate": 0.0009049475326772769, + "loss": 0.88645208, + "num_input_tokens_seen": 96624192, + "router_z_loss_mlp": 0.49511719, + "step": 1162, + "time_per_iteration": 2.6613070964813232 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052796, + "balance_loss_mlp": 1.00334835, + "epoch": 0.22373989996152366, + "flos": 471068238336.0, + "grad_norm": 0.03308636607962537, + "language_loss": 0.83887613, + "learning_rate": 0.0009047647112170811, + "loss": 0.84940416, + "num_input_tokens_seen": 96701040, + "router_z_loss_mlp": 0.49389648, + "step": 1163, + "time_per_iteration": 2.8056106567382812 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105253, + "balance_loss_mlp": 1.00322485, + "epoch": 0.22393228164678722, + "flos": 1273019542272.0, + "grad_norm": 0.035987441954907426, + "language_loss": 0.88180983, + "learning_rate": 0.0009045817326132876, + "loss": 0.89233518, + "num_input_tokens_seen": 96791200, + "router_z_loss_mlp": 0.49243164, + "step": 1164, + "time_per_iteration": 3.7020320892333984 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055012, + "balance_loss_mlp": 1.00575495, + "epoch": 0.22412466333205078, + "flos": 597468503040.0, + "grad_norm": 0.03371692057767332, + "language_loss": 0.84342653, + "learning_rate": 0.0009043985969369357, + "loss": 0.85397661, + "num_input_tokens_seen": 96869360, + "router_z_loss_mlp": 0.49145508, + "step": 1165, + "time_per_iteration": 2.8581626415252686 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052347, + "balance_loss_mlp": 1.00299454, + "epoch": 0.22431704501731436, + "flos": 609632019456.0, + "grad_norm": 0.03010954873673584, + "language_loss": 0.84869868, + "learning_rate": 0.0009042153042591245, + "loss": 0.85922217, + "num_input_tokens_seen": 96945840, + "router_z_loss_mlp": 0.49243164, + "step": 1166, + "time_per_iteration": 2.810300827026367 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054301, + "balance_loss_mlp": 1.0050199, + "epoch": 0.22450942670257792, + "flos": 908108190720.0, + "grad_norm": 0.030118647676053625, + "language_loss": 0.86120874, + "learning_rate": 0.0009040318546510146, + "loss": 0.87175173, + "num_input_tokens_seen": 97029296, + "router_z_loss_mlp": 0.49169922, + "step": 1167, + "time_per_iteration": 3.129802942276001 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01057032, + "balance_loss_mlp": 1.00791764, + "epoch": 0.22470180838784148, + "flos": 566381690880.0, + "grad_norm": 0.035718478093575166, + "language_loss": 0.85780692, + "learning_rate": 0.0009038482481838275, + "loss": 0.86837721, + "num_input_tokens_seen": 97097776, + "router_z_loss_mlp": 0.49047852, + "step": 1168, + "time_per_iteration": 2.674471855163574 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010578, + "balance_loss_mlp": 1.00880456, + "epoch": 0.22489419007310504, + "flos": 835918351872.0, + "grad_norm": 0.03078757560697398, + "language_loss": 0.88093269, + "learning_rate": 0.0009036644849288455, + "loss": 0.89151073, + "num_input_tokens_seen": 97181424, + "router_z_loss_mlp": 0.48925781, + "step": 1169, + "time_per_iteration": 3.126168727874756 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052146, + "balance_loss_mlp": 1.00331759, + "epoch": 0.2250865717583686, + "flos": 582139924992.0, + "grad_norm": 0.03503818002335677, + "language_loss": 0.86431491, + "learning_rate": 0.0009034805649574118, + "loss": 0.87483639, + "num_input_tokens_seen": 97252128, + "router_z_loss_mlp": 0.48779297, + "step": 1170, + "time_per_iteration": 2.6982839107513428 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01056949, + "balance_loss_mlp": 1.0084312, + "epoch": 0.22527895344363216, + "flos": 601671733248.0, + "grad_norm": 0.031992933731526396, + "language_loss": 0.85811341, + "learning_rate": 0.0009032964883409308, + "loss": 0.86868292, + "num_input_tokens_seen": 97326640, + "router_z_loss_mlp": 0.48510742, + "step": 1171, + "time_per_iteration": 2.9468932151794434 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055088, + "balance_loss_mlp": 1.00826263, + "epoch": 0.22547133512889572, + "flos": 1443734537472.0, + "grad_norm": 0.010800983830845337, + "language_loss": 0.73050535, + "learning_rate": 0.000903112255150867, + "loss": 0.7410562, + "num_input_tokens_seen": 97553952, + "router_z_loss_mlp": 0.46777344, + "step": 1172, + "time_per_iteration": 5.044191360473633 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105371, + "balance_loss_mlp": 1.0051204, + "epoch": 0.22566371681415928, + "flos": 491586703872.0, + "grad_norm": 0.034976527569036825, + "language_loss": 0.88142014, + "learning_rate": 0.0009029278654587462, + "loss": 0.89195722, + "num_input_tokens_seen": 97623584, + "router_z_loss_mlp": 0.48583984, + "step": 1173, + "time_per_iteration": 2.5891120433807373 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105611, + "balance_loss_mlp": 1.00749624, + "epoch": 0.22585609849942284, + "flos": 605752487424.0, + "grad_norm": 0.03629905495680353, + "language_loss": 0.82793885, + "learning_rate": 0.0009027433193361548, + "loss": 0.83850002, + "num_input_tokens_seen": 97695952, + "router_z_loss_mlp": 0.48583984, + "step": 1174, + "time_per_iteration": 2.707061290740967 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105926, + "balance_loss_mlp": 1.01064646, + "epoch": 0.22604848018468643, + "flos": 636728484096.0, + "grad_norm": 0.035409171913978986, + "language_loss": 0.87780964, + "learning_rate": 0.00090255861685474, + "loss": 0.88840234, + "num_input_tokens_seen": 97764544, + "router_z_loss_mlp": 0.48608398, + "step": 1175, + "time_per_iteration": 2.7910189628601074 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01056162, + "balance_loss_mlp": 1.00752461, + "epoch": 0.22624086186995, + "flos": 480845325312.0, + "grad_norm": 0.040136392489239156, + "language_loss": 0.91905487, + "learning_rate": 0.0009023737580862095, + "loss": 0.92961645, + "num_input_tokens_seen": 97830976, + "router_z_loss_mlp": 0.48632812, + "step": 1176, + "time_per_iteration": 2.5489909648895264 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054427, + "balance_loss_mlp": 1.00600469, + "epoch": 0.22643324355521355, + "flos": 496807693824.0, + "grad_norm": 0.032828642541270554, + "language_loss": 0.83966863, + "learning_rate": 0.0009021887431023321, + "loss": 0.85021293, + "num_input_tokens_seen": 97898800, + "router_z_loss_mlp": 0.48413086, + "step": 1177, + "time_per_iteration": 2.679046392440796 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060571, + "balance_loss_mlp": 1.01224387, + "epoch": 0.2266256252404771, + "flos": 562684905984.0, + "grad_norm": 0.03431341234676521, + "language_loss": 0.8836711, + "learning_rate": 0.0009020035719749369, + "loss": 0.89427686, + "num_input_tokens_seen": 97974112, + "router_z_loss_mlp": 0.4831543, + "step": 1178, + "time_per_iteration": 2.777273416519165 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053755, + "balance_loss_mlp": 1.00516534, + "epoch": 0.22681800692574067, + "flos": 581033703936.0, + "grad_norm": 0.0422995660898389, + "language_loss": 0.78512251, + "learning_rate": 0.0009018182447759136, + "loss": 0.79566014, + "num_input_tokens_seen": 98056640, + "router_z_loss_mlp": 0.48583984, + "step": 1179, + "time_per_iteration": 2.9779903888702393 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105363, + "balance_loss_mlp": 1.00508785, + "epoch": 0.22701038861100423, + "flos": 741466156800.0, + "grad_norm": 0.03672617722264385, + "language_loss": 0.80683887, + "learning_rate": 0.0009016327615772126, + "loss": 0.81737518, + "num_input_tokens_seen": 98135952, + "router_z_loss_mlp": 0.48535156, + "step": 1180, + "time_per_iteration": 2.953355312347412 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054935, + "balance_loss_mlp": 1.00636911, + "epoch": 0.2272027702962678, + "flos": 578306079744.0, + "grad_norm": 0.03924605706365315, + "language_loss": 0.88551408, + "learning_rate": 0.0009014471224508451, + "loss": 0.89606345, + "num_input_tokens_seen": 98204288, + "router_z_loss_mlp": 0.4855957, + "step": 1181, + "time_per_iteration": 2.7092630863189697 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01056081, + "balance_loss_mlp": 1.00744355, + "epoch": 0.22739515198153135, + "flos": 545291651328.0, + "grad_norm": 0.04038062834310644, + "language_loss": 0.83949769, + "learning_rate": 0.0009012613274688823, + "loss": 0.85005856, + "num_input_tokens_seen": 98269856, + "router_z_loss_mlp": 0.48632812, + "step": 1182, + "time_per_iteration": 2.642143964767456 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055555, + "balance_loss_mlp": 1.00689363, + "epoch": 0.22758753366679493, + "flos": 441092504832.0, + "grad_norm": 0.03566258536478163, + "language_loss": 0.88506091, + "learning_rate": 0.0009010753767034565, + "loss": 0.89561647, + "num_input_tokens_seen": 98335632, + "router_z_loss_mlp": 0.48632812, + "step": 1183, + "time_per_iteration": 2.599167585372925 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053804, + "balance_loss_mlp": 1.00526154, + "epoch": 0.2277799153520585, + "flos": 730824900096.0, + "grad_norm": 0.03354089847275564, + "language_loss": 0.79992342, + "learning_rate": 0.0009008892702267599, + "loss": 0.81046152, + "num_input_tokens_seen": 98420592, + "router_z_loss_mlp": 0.48535156, + "step": 1184, + "time_per_iteration": 2.9798924922943115 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01057068, + "balance_loss_mlp": 1.00855029, + "epoch": 0.22797229703732205, + "flos": 527913947904.0, + "grad_norm": 0.04184098346005727, + "language_loss": 0.89975739, + "learning_rate": 0.0009007030081110457, + "loss": 0.91032803, + "num_input_tokens_seen": 98488096, + "router_z_loss_mlp": 0.48510742, + "step": 1185, + "time_per_iteration": 2.6349968910217285 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01057531, + "balance_loss_mlp": 1.00910807, + "epoch": 0.2281646787225856, + "flos": 536521630464.0, + "grad_norm": 0.03583751901003141, + "language_loss": 0.85487026, + "learning_rate": 0.000900516590428627, + "loss": 0.86544555, + "num_input_tokens_seen": 98561664, + "router_z_loss_mlp": 0.48413086, + "step": 1186, + "time_per_iteration": 2.669015407562256 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054531, + "balance_loss_mlp": 1.00596476, + "epoch": 0.22835706040784917, + "flos": 542478478080.0, + "grad_norm": 0.03191556588332838, + "language_loss": 0.9033947, + "learning_rate": 0.0009003300172518778, + "loss": 0.91394001, + "num_input_tokens_seen": 98634336, + "router_z_loss_mlp": 0.4855957, + "step": 1187, + "time_per_iteration": 2.7164688110351562 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01056755, + "balance_loss_mlp": 1.00804579, + "epoch": 0.22854944209311273, + "flos": 792006042624.0, + "grad_norm": 0.0322044633529041, + "language_loss": 0.85374159, + "learning_rate": 0.0009001432886532321, + "loss": 0.86430913, + "num_input_tokens_seen": 98709600, + "router_z_loss_mlp": 0.48681641, + "step": 1188, + "time_per_iteration": 2.9621965885162354 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054799, + "balance_loss_mlp": 1.00568485, + "epoch": 0.2287418237783763, + "flos": 470216707584.0, + "grad_norm": 0.03536870053258389, + "language_loss": 0.87358034, + "learning_rate": 0.0008999564047051843, + "loss": 0.88412833, + "num_input_tokens_seen": 98775024, + "router_z_loss_mlp": 0.49047852, + "step": 1189, + "time_per_iteration": 2.5233154296875 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01058932, + "balance_loss_mlp": 1.01003218, + "epoch": 0.22893420546363985, + "flos": 469005507072.0, + "grad_norm": 0.030491923293758834, + "language_loss": 0.8554523, + "learning_rate": 0.0008997693654802894, + "loss": 0.86604154, + "num_input_tokens_seen": 98845248, + "router_z_loss_mlp": 0.48852539, + "step": 1190, + "time_per_iteration": 2.6391589641571045 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01058724, + "balance_loss_mlp": 1.00965738, + "epoch": 0.22912658714890344, + "flos": 627402440448.0, + "grad_norm": 0.0331512035559832, + "language_loss": 0.87166977, + "learning_rate": 0.0008995821710511625, + "loss": 0.88225698, + "num_input_tokens_seen": 98913584, + "router_z_loss_mlp": 0.49023438, + "step": 1191, + "time_per_iteration": 2.7549567222595215 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054803, + "balance_loss_mlp": 1.00599909, + "epoch": 0.229318968834167, + "flos": 504021428736.0, + "grad_norm": 0.030936804790582927, + "language_loss": 0.85688579, + "learning_rate": 0.0008993948214904786, + "loss": 0.86743385, + "num_input_tokens_seen": 98978608, + "router_z_loss_mlp": 0.48779297, + "step": 1192, + "time_per_iteration": 2.596224784851074 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061855, + "balance_loss_mlp": 1.01483917, + "epoch": 0.22951135051943056, + "flos": 1377716374272.0, + "grad_norm": 0.008909469382289665, + "language_loss": 0.78422213, + "learning_rate": 0.0008992073168709733, + "loss": 0.79484069, + "num_input_tokens_seen": 99207424, + "router_z_loss_mlp": 0.46972656, + "step": 1193, + "time_per_iteration": 4.853066921234131 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062271, + "balance_loss_mlp": 1.01356232, + "epoch": 0.22970373220469412, + "flos": 645550994688.0, + "grad_norm": 0.0389743097765726, + "language_loss": 0.78935194, + "learning_rate": 0.0008990196572654427, + "loss": 0.79997468, + "num_input_tokens_seen": 99290592, + "router_z_loss_mlp": 0.48681641, + "step": 1194, + "time_per_iteration": 2.8869853019714355 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01056276, + "balance_loss_mlp": 1.00771046, + "epoch": 0.22989611388995768, + "flos": 501273384192.0, + "grad_norm": 0.02988304738122761, + "language_loss": 0.88486552, + "learning_rate": 0.0008988318427467426, + "loss": 0.8954283, + "num_input_tokens_seen": 99366096, + "router_z_loss_mlp": 0.4855957, + "step": 1195, + "time_per_iteration": 2.6931521892547607 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053709, + "balance_loss_mlp": 1.00514269, + "epoch": 0.23008849557522124, + "flos": 1098334596864.0, + "grad_norm": 0.03694163801075408, + "language_loss": 0.87307864, + "learning_rate": 0.0008986438733877887, + "loss": 0.88361579, + "num_input_tokens_seen": 99456768, + "router_z_loss_mlp": 0.4855957, + "step": 1196, + "time_per_iteration": 3.4505865573883057 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053065, + "balance_loss_mlp": 1.00447488, + "epoch": 0.2302808772604848, + "flos": 684993722880.0, + "grad_norm": 0.030674764969734848, + "language_loss": 0.85086071, + "learning_rate": 0.0008984557492615576, + "loss": 0.86139137, + "num_input_tokens_seen": 99539616, + "router_z_loss_mlp": 0.48583984, + "step": 1197, + "time_per_iteration": 2.936891794204712 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01056985, + "balance_loss_mlp": 1.00837183, + "epoch": 0.23047325894574835, + "flos": 529961127936.0, + "grad_norm": 0.03469763625730159, + "language_loss": 0.90249604, + "learning_rate": 0.0008982674704410854, + "loss": 0.91306591, + "num_input_tokens_seen": 99612064, + "router_z_loss_mlp": 0.48608398, + "step": 1198, + "time_per_iteration": 2.6928677558898926 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055103, + "balance_loss_mlp": 1.00653744, + "epoch": 0.23066564063101191, + "flos": 684127607808.0, + "grad_norm": 0.03582939263118032, + "language_loss": 0.78263444, + "learning_rate": 0.0008980790369994682, + "loss": 0.79318547, + "num_input_tokens_seen": 99691040, + "router_z_loss_mlp": 0.4855957, + "step": 1199, + "time_per_iteration": 2.941063642501831 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105528, + "balance_loss_mlp": 1.00692904, + "epoch": 0.2308580223162755, + "flos": 559632605184.0, + "grad_norm": 0.03400437188822284, + "language_loss": 0.87868834, + "learning_rate": 0.000897890449009863, + "loss": 0.88924116, + "num_input_tokens_seen": 99762016, + "router_z_loss_mlp": 0.48339844, + "step": 1200, + "time_per_iteration": 2.6677346229553223 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01058779, + "balance_loss_mlp": 1.01061893, + "epoch": 0.23105040400153906, + "flos": 556730003712.0, + "grad_norm": 0.030515141355108834, + "language_loss": 0.90571141, + "learning_rate": 0.0008977017065454853, + "loss": 0.91629916, + "num_input_tokens_seen": 99835552, + "router_z_loss_mlp": 0.48144531, + "step": 1201, + "time_per_iteration": 2.7204995155334473 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053158, + "balance_loss_mlp": 1.00506902, + "epoch": 0.23124278568680262, + "flos": 706050714624.0, + "grad_norm": 0.034769733982414605, + "language_loss": 0.81452352, + "learning_rate": 0.0008975128096796121, + "loss": 0.82505512, + "num_input_tokens_seen": 99910784, + "router_z_loss_mlp": 0.48071289, + "step": 1202, + "time_per_iteration": 2.861058473587036 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105345, + "balance_loss_mlp": 1.00517035, + "epoch": 0.23143516737206618, + "flos": 613969397760.0, + "grad_norm": 0.03845725381901349, + "language_loss": 0.86815399, + "learning_rate": 0.0008973237584855794, + "loss": 0.87868845, + "num_input_tokens_seen": 99991120, + "router_z_loss_mlp": 0.48266602, + "step": 1203, + "time_per_iteration": 2.907670021057129 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055623, + "balance_loss_mlp": 1.00715244, + "epoch": 0.23162754905732974, + "flos": 390096718080.0, + "grad_norm": 0.03680581416715809, + "language_loss": 0.82972479, + "learning_rate": 0.0008971345530367832, + "loss": 0.84028101, + "num_input_tokens_seen": 100053888, + "router_z_loss_mlp": 0.48461914, + "step": 1204, + "time_per_iteration": 2.4500131607055664 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050803, + "balance_loss_mlp": 1.00190353, + "epoch": 0.2318199307425933, + "flos": 668970116352.0, + "grad_norm": 0.03636020946200237, + "language_loss": 0.86001658, + "learning_rate": 0.0008969451934066799, + "loss": 0.87052464, + "num_input_tokens_seen": 100124176, + "router_z_loss_mlp": 0.48828125, + "step": 1205, + "time_per_iteration": 2.786860704421997 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054481, + "balance_loss_mlp": 1.00558126, + "epoch": 0.23201231242785686, + "flos": 667628658432.0, + "grad_norm": 0.042825772722853955, + "language_loss": 0.80798173, + "learning_rate": 0.0008967556796687854, + "loss": 0.81852657, + "num_input_tokens_seen": 100205296, + "router_z_loss_mlp": 0.48852539, + "step": 1206, + "time_per_iteration": 2.9043900966644287 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106058, + "balance_loss_mlp": 1.01153755, + "epoch": 0.23220469411312042, + "flos": 750095226624.0, + "grad_norm": 0.036226897286377145, + "language_loss": 0.84918714, + "learning_rate": 0.0008965660118966752, + "loss": 0.85979295, + "num_input_tokens_seen": 100279440, + "router_z_loss_mlp": 0.48974609, + "step": 1207, + "time_per_iteration": 2.8989100456237793 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054609, + "balance_loss_mlp": 1.00597119, + "epoch": 0.232397075798384, + "flos": 668262448896.0, + "grad_norm": 0.03230217319227319, + "language_loss": 0.90859735, + "learning_rate": 0.0008963761901639851, + "loss": 0.91914344, + "num_input_tokens_seen": 100354512, + "router_z_loss_mlp": 0.48632812, + "step": 1208, + "time_per_iteration": 2.801715612411499 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050654, + "balance_loss_mlp": 1.00204051, + "epoch": 0.23258945748364757, + "flos": 611346753024.0, + "grad_norm": 0.038379048380249, + "language_loss": 0.83753544, + "learning_rate": 0.0008961862145444103, + "loss": 0.84804195, + "num_input_tokens_seen": 100426848, + "router_z_loss_mlp": 0.48608398, + "step": 1209, + "time_per_iteration": 2.6739237308502197 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105504, + "balance_loss_mlp": 1.00656986, + "epoch": 0.23278183916891113, + "flos": 490672956672.0, + "grad_norm": 0.04093378826068356, + "language_loss": 0.86382735, + "learning_rate": 0.0008959960851117059, + "loss": 0.87437773, + "num_input_tokens_seen": 100496176, + "router_z_loss_mlp": 0.48461914, + "step": 1210, + "time_per_iteration": 2.635650634765625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01056695, + "balance_loss_mlp": 1.00808144, + "epoch": 0.23297422085417469, + "flos": 512674798080.0, + "grad_norm": 0.0354403494585401, + "language_loss": 0.84509313, + "learning_rate": 0.0008958058019396868, + "loss": 0.85566002, + "num_input_tokens_seen": 100575072, + "router_z_loss_mlp": 0.48608398, + "step": 1211, + "time_per_iteration": 2.788318157196045 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105326, + "balance_loss_mlp": 1.00462246, + "epoch": 0.23316660253943824, + "flos": 547532272128.0, + "grad_norm": 0.03263062148431384, + "language_loss": 0.87462825, + "learning_rate": 0.0008956153651022274, + "loss": 0.8851608, + "num_input_tokens_seen": 100648304, + "router_z_loss_mlp": 0.48608398, + "step": 1212, + "time_per_iteration": 2.725313901901245 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105588, + "balance_loss_mlp": 1.00709951, + "epoch": 0.2333589842247018, + "flos": 511289598720.0, + "grad_norm": 0.03371055024816449, + "language_loss": 0.84886169, + "learning_rate": 0.0008954247746732618, + "loss": 0.85942048, + "num_input_tokens_seen": 100717616, + "router_z_loss_mlp": 0.48754883, + "step": 1213, + "time_per_iteration": 2.592165470123291 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01057864, + "balance_loss_mlp": 1.00894058, + "epoch": 0.23355136590996536, + "flos": 664407216384.0, + "grad_norm": 0.030798488974581865, + "language_loss": 0.9124192, + "learning_rate": 0.0008952340307267837, + "loss": 0.92299783, + "num_input_tokens_seen": 100797056, + "router_z_loss_mlp": 0.48876953, + "step": 1214, + "time_per_iteration": 2.887542724609375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051914, + "balance_loss_mlp": 1.00332439, + "epoch": 0.23374374759522892, + "flos": 509465995008.0, + "grad_norm": 0.038631928770240895, + "language_loss": 0.8442086, + "learning_rate": 0.0008950431333368468, + "loss": 0.85472775, + "num_input_tokens_seen": 100863632, + "router_z_loss_mlp": 0.48583984, + "step": 1215, + "time_per_iteration": 2.5713701248168945 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051571, + "balance_loss_mlp": 1.00283849, + "epoch": 0.2339361292804925, + "flos": 1296429915648.0, + "grad_norm": 0.03446682830311694, + "language_loss": 0.8584398, + "learning_rate": 0.0008948520825775634, + "loss": 0.86895549, + "num_input_tokens_seen": 100950272, + "router_z_loss_mlp": 0.48706055, + "step": 1216, + "time_per_iteration": 3.631596565246582 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054055, + "balance_loss_mlp": 1.00541723, + "epoch": 0.23412851096575607, + "flos": 707177344512.0, + "grad_norm": 0.031791306217448204, + "language_loss": 0.84468639, + "learning_rate": 0.0008946608785231067, + "loss": 0.85522687, + "num_input_tokens_seen": 101031008, + "router_z_loss_mlp": 0.48632812, + "step": 1217, + "time_per_iteration": 2.878099203109741 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053689, + "balance_loss_mlp": 1.00517046, + "epoch": 0.23432089265101963, + "flos": 439175582208.0, + "grad_norm": 0.03486793229645632, + "language_loss": 0.85493773, + "learning_rate": 0.0008944695212477084, + "loss": 0.86547458, + "num_input_tokens_seen": 101094688, + "router_z_loss_mlp": 0.48510742, + "step": 1218, + "time_per_iteration": 2.5141704082489014 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053338, + "balance_loss_mlp": 1.00498641, + "epoch": 0.2345132743362832, + "flos": 481915574784.0, + "grad_norm": 0.03047714423600347, + "language_loss": 0.87145793, + "learning_rate": 0.0008942780108256599, + "loss": 0.88199133, + "num_input_tokens_seen": 101163744, + "router_z_loss_mlp": 0.48339844, + "step": 1219, + "time_per_iteration": 2.6020901203155518 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050373, + "balance_loss_mlp": 1.00180733, + "epoch": 0.23470565602154675, + "flos": 412341577728.0, + "grad_norm": 0.03328064907126118, + "language_loss": 0.87382472, + "learning_rate": 0.0008940863473313121, + "loss": 0.88432848, + "num_input_tokens_seen": 101226480, + "router_z_loss_mlp": 0.4855957, + "step": 1220, + "time_per_iteration": 2.4561610221862793 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053637, + "balance_loss_mlp": 1.00483322, + "epoch": 0.2348980377068103, + "flos": 546500906496.0, + "grad_norm": 0.04239569524538178, + "language_loss": 0.88751769, + "learning_rate": 0.0008938945308390756, + "loss": 0.89805412, + "num_input_tokens_seen": 101291824, + "router_z_loss_mlp": 0.48779297, + "step": 1221, + "time_per_iteration": 2.657763719558716 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01057653, + "balance_loss_mlp": 1.00906336, + "epoch": 0.23509041939207387, + "flos": 576843112704.0, + "grad_norm": 0.04482007629740174, + "language_loss": 0.88039029, + "learning_rate": 0.00089370256142342, + "loss": 0.89096677, + "num_input_tokens_seen": 101367216, + "router_z_loss_mlp": 0.48583984, + "step": 1222, + "time_per_iteration": 2.7348928451538086 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054727, + "balance_loss_mlp": 1.00616074, + "epoch": 0.23528280107733743, + "flos": 589948566528.0, + "grad_norm": 0.030112791330182954, + "language_loss": 0.85687798, + "learning_rate": 0.0008935104391588746, + "loss": 0.86742526, + "num_input_tokens_seen": 101438992, + "router_z_loss_mlp": 0.4855957, + "step": 1223, + "time_per_iteration": 2.7620511054992676 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052122, + "balance_loss_mlp": 1.00350857, + "epoch": 0.235475182762601, + "flos": 824858132736.0, + "grad_norm": 0.028710207733723417, + "language_loss": 0.83630896, + "learning_rate": 0.0008933181641200276, + "loss": 0.84683019, + "num_input_tokens_seen": 101534464, + "router_z_loss_mlp": 0.48608398, + "step": 1224, + "time_per_iteration": 3.1587913036346436 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053568, + "balance_loss_mlp": 1.00531197, + "epoch": 0.23566756444786457, + "flos": 681367902720.0, + "grad_norm": 0.03430983930689064, + "language_loss": 0.86561936, + "learning_rate": 0.0008931257363815271, + "loss": 0.87615514, + "num_input_tokens_seen": 101616496, + "router_z_loss_mlp": 0.48242188, + "step": 1225, + "time_per_iteration": 2.9277396202087402 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01056611, + "balance_loss_mlp": 1.00849795, + "epoch": 0.23585994613312813, + "flos": 703135474176.0, + "grad_norm": 0.029906055234585397, + "language_loss": 0.90256047, + "learning_rate": 0.0008929331560180798, + "loss": 0.91312659, + "num_input_tokens_seen": 101694496, + "router_z_loss_mlp": 0.48095703, + "step": 1226, + "time_per_iteration": 2.911451578140259 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055064, + "balance_loss_mlp": 1.00676012, + "epoch": 0.2360523278183917, + "flos": 525196038912.0, + "grad_norm": 0.030679819106685022, + "language_loss": 0.9186613, + "learning_rate": 0.0008927404231044525, + "loss": 0.92921197, + "num_input_tokens_seen": 101766160, + "router_z_loss_mlp": 0.48291016, + "step": 1227, + "time_per_iteration": 2.6848785877227783 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055842, + "balance_loss_mlp": 1.00756276, + "epoch": 0.23624470950365525, + "flos": 525443914752.0, + "grad_norm": 0.030207709240370546, + "language_loss": 0.82286787, + "learning_rate": 0.0008925475377154703, + "loss": 0.83342624, + "num_input_tokens_seen": 101844160, + "router_z_loss_mlp": 0.48266602, + "step": 1228, + "time_per_iteration": 2.7278709411621094 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01058098, + "balance_loss_mlp": 1.00974643, + "epoch": 0.2364370911889188, + "flos": 597961342464.0, + "grad_norm": 0.04301213480645635, + "language_loss": 0.82405227, + "learning_rate": 0.0008923544999260183, + "loss": 0.83463323, + "num_input_tokens_seen": 101917968, + "router_z_loss_mlp": 0.48339844, + "step": 1229, + "time_per_iteration": 2.7282724380493164 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055873, + "balance_loss_mlp": 1.00766432, + "epoch": 0.23662947287418237, + "flos": 758173131264.0, + "grad_norm": 0.03660169780759576, + "language_loss": 0.92488217, + "learning_rate": 0.00089216130981104, + "loss": 0.93544096, + "num_input_tokens_seen": 101996880, + "router_z_loss_mlp": 0.48193359, + "step": 1230, + "time_per_iteration": 3.0333714485168457 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051461, + "balance_loss_mlp": 1.00337219, + "epoch": 0.23682185455944593, + "flos": 547208573952.0, + "grad_norm": 0.03138155314794734, + "language_loss": 0.83336782, + "learning_rate": 0.000891967967445539, + "loss": 0.8438825, + "num_input_tokens_seen": 102067936, + "router_z_loss_mlp": 0.48071289, + "step": 1231, + "time_per_iteration": 2.7093472480773926 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053937, + "balance_loss_mlp": 1.00587165, + "epoch": 0.2370142362447095, + "flos": 663523604736.0, + "grad_norm": 0.02795314572038805, + "language_loss": 0.89439881, + "learning_rate": 0.0008917744729045772, + "loss": 0.90493822, + "num_input_tokens_seen": 102147552, + "router_z_loss_mlp": 0.48046875, + "step": 1232, + "time_per_iteration": 2.8760838508605957 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01057809, + "balance_loss_mlp": 1.00974393, + "epoch": 0.23720661792997308, + "flos": 684913042944.0, + "grad_norm": 0.03460859048974857, + "language_loss": 0.8446126, + "learning_rate": 0.0008915808262632757, + "loss": 0.85519075, + "num_input_tokens_seen": 102224480, + "router_z_loss_mlp": 0.48046875, + "step": 1233, + "time_per_iteration": 2.889141321182251 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01058483, + "balance_loss_mlp": 1.01058459, + "epoch": 0.23739899961523664, + "flos": 560023377408.0, + "grad_norm": 0.03296017154749467, + "language_loss": 0.94079709, + "learning_rate": 0.0008913870275968148, + "loss": 0.95138192, + "num_input_tokens_seen": 102297392, + "router_z_loss_mlp": 0.47875977, + "step": 1234, + "time_per_iteration": 2.7432892322540283 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054598, + "balance_loss_mlp": 1.00655627, + "epoch": 0.2375913813005002, + "flos": 891165000960.0, + "grad_norm": 0.03128077017401229, + "language_loss": 0.88428569, + "learning_rate": 0.0008911930769804342, + "loss": 0.89483166, + "num_input_tokens_seen": 102386032, + "router_z_loss_mlp": 0.48022461, + "step": 1235, + "time_per_iteration": 3.261483669281006 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105528, + "balance_loss_mlp": 1.00692844, + "epoch": 0.23778376298576376, + "flos": 642366491136.0, + "grad_norm": 0.029107844015886564, + "language_loss": 0.91850013, + "learning_rate": 0.0008909989744894318, + "loss": 0.92905295, + "num_input_tokens_seen": 102463504, + "router_z_loss_mlp": 0.48339844, + "step": 1236, + "time_per_iteration": 2.8673832416534424 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061412, + "balance_loss_mlp": 1.01287031, + "epoch": 0.23797614467102732, + "flos": 617946139392.0, + "grad_norm": 0.034095811880077646, + "language_loss": 0.82566786, + "learning_rate": 0.0008908047201991649, + "loss": 0.83628196, + "num_input_tokens_seen": 102529632, + "router_z_loss_mlp": 0.48535156, + "step": 1237, + "time_per_iteration": 2.7810442447662354 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053588, + "balance_loss_mlp": 1.00511789, + "epoch": 0.23816852635629088, + "flos": 625464130560.0, + "grad_norm": 0.032663011960307756, + "language_loss": 0.87081301, + "learning_rate": 0.0008906103141850502, + "loss": 0.88134885, + "num_input_tokens_seen": 102610192, + "router_z_loss_mlp": 0.48461914, + "step": 1238, + "time_per_iteration": 2.880305528640747 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052354, + "balance_loss_mlp": 1.00416911, + "epoch": 0.23836090804155444, + "flos": 522441191424.0, + "grad_norm": 0.03474425243888252, + "language_loss": 0.88862967, + "learning_rate": 0.0008904157565225621, + "loss": 0.89915323, + "num_input_tokens_seen": 102681216, + "router_z_loss_mlp": 0.48168945, + "step": 1239, + "time_per_iteration": 2.648766040802002 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052394, + "balance_loss_mlp": 1.00423324, + "epoch": 0.238553289726818, + "flos": 1155855892992.0, + "grad_norm": 0.034399895266541865, + "language_loss": 0.82445645, + "learning_rate": 0.000890221047287235, + "loss": 0.83498037, + "num_input_tokens_seen": 102777184, + "router_z_loss_mlp": 0.48144531, + "step": 1240, + "time_per_iteration": 3.5001280307769775 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055, + "balance_loss_mlp": 1.00703037, + "epoch": 0.23874567141208156, + "flos": 500910802176.0, + "grad_norm": 0.03306053891413694, + "language_loss": 0.91726851, + "learning_rate": 0.0008900261865546615, + "loss": 0.92781848, + "num_input_tokens_seen": 102845744, + "router_z_loss_mlp": 0.47949219, + "step": 1241, + "time_per_iteration": 2.6465680599212646 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052437, + "balance_loss_mlp": 1.00418115, + "epoch": 0.23893805309734514, + "flos": 558050074368.0, + "grad_norm": 0.0354259641755878, + "language_loss": 0.85598528, + "learning_rate": 0.0008898311744004936, + "loss": 0.86650962, + "num_input_tokens_seen": 102918064, + "router_z_loss_mlp": 0.48242188, + "step": 1242, + "time_per_iteration": 2.7268829345703125 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053623, + "balance_loss_mlp": 1.0055337, + "epoch": 0.2391304347826087, + "flos": 550317255168.0, + "grad_norm": 0.0320494810853186, + "language_loss": 0.87574649, + "learning_rate": 0.0008896360109004414, + "loss": 0.88628268, + "num_input_tokens_seen": 102983920, + "router_z_loss_mlp": 0.48071289, + "step": 1243, + "time_per_iteration": 2.6199252605438232 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050337, + "balance_loss_mlp": 1.00222456, + "epoch": 0.23932281646787226, + "flos": 517079250432.0, + "grad_norm": 0.0302458656306059, + "language_loss": 0.85177696, + "learning_rate": 0.0008894406961302742, + "loss": 0.86228031, + "num_input_tokens_seen": 103053328, + "router_z_loss_mlp": 0.48095703, + "step": 1244, + "time_per_iteration": 2.604508876800537 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052152, + "balance_loss_mlp": 1.00411069, + "epoch": 0.23951519815313582, + "flos": 745002548736.0, + "grad_norm": 0.03429303167053761, + "language_loss": 0.84712255, + "learning_rate": 0.0008892452301658201, + "loss": 0.85764414, + "num_input_tokens_seen": 103128208, + "router_z_loss_mlp": 0.48022461, + "step": 1245, + "time_per_iteration": 2.924288272857666 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054436, + "balance_loss_mlp": 1.00651395, + "epoch": 0.23970757983839938, + "flos": 555175663104.0, + "grad_norm": 0.03219666617279603, + "language_loss": 0.84054452, + "learning_rate": 0.0008890496130829653, + "loss": 0.85108888, + "num_input_tokens_seen": 103197392, + "router_z_loss_mlp": 0.47900391, + "step": 1246, + "time_per_iteration": 2.6700189113616943 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052243, + "balance_loss_mlp": 1.00441635, + "epoch": 0.23989996152366294, + "flos": 481618121472.0, + "grad_norm": 0.033578246726411604, + "language_loss": 0.86002076, + "learning_rate": 0.0008888538449576555, + "loss": 0.87054318, + "num_input_tokens_seen": 103265328, + "router_z_loss_mlp": 0.47802734, + "step": 1247, + "time_per_iteration": 2.6269826889038086 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01057022, + "balance_loss_mlp": 1.00886118, + "epoch": 0.2400923432089265, + "flos": 486281143296.0, + "grad_norm": 0.03580496599340432, + "language_loss": 0.83572984, + "learning_rate": 0.0008886579258658944, + "loss": 0.84630001, + "num_input_tokens_seen": 103331632, + "router_z_loss_mlp": 0.48144531, + "step": 1248, + "time_per_iteration": 2.577885389328003 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054529, + "balance_loss_mlp": 1.0065589, + "epoch": 0.24028472489419006, + "flos": 624793401600.0, + "grad_norm": 0.03296142515540601, + "language_loss": 0.85843956, + "learning_rate": 0.0008884618558837446, + "loss": 0.86898482, + "num_input_tokens_seen": 103405408, + "router_z_loss_mlp": 0.47949219, + "step": 1249, + "time_per_iteration": 2.874666929244995 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01056867, + "balance_loss_mlp": 1.00870681, + "epoch": 0.24047710657945365, + "flos": 602809056768.0, + "grad_norm": 0.033943651692576245, + "language_loss": 0.87474859, + "learning_rate": 0.0008882656350873273, + "loss": 0.88531733, + "num_input_tokens_seen": 103487216, + "router_z_loss_mlp": 0.48144531, + "step": 1250, + "time_per_iteration": 2.8647053241729736 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055385, + "balance_loss_mlp": 1.00748658, + "epoch": 0.2406694882647172, + "flos": 843001829376.0, + "grad_norm": 0.04142560607115463, + "language_loss": 0.87984931, + "learning_rate": 0.0008880692635528219, + "loss": 0.89040315, + "num_input_tokens_seen": 103568640, + "router_z_loss_mlp": 0.47875977, + "step": 1251, + "time_per_iteration": 3.0643107891082764 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105149, + "balance_loss_mlp": 1.00352037, + "epoch": 0.24086186994998077, + "flos": 528135578880.0, + "grad_norm": 0.03337559285192523, + "language_loss": 0.90356189, + "learning_rate": 0.0008878727413564669, + "loss": 0.91407681, + "num_input_tokens_seen": 103640784, + "router_z_loss_mlp": 0.47949219, + "step": 1252, + "time_per_iteration": 2.7680115699768066 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053848, + "balance_loss_mlp": 1.00826263, + "epoch": 0.24105425163524433, + "flos": 1341462028800.0, + "grad_norm": 0.009196650126926217, + "language_loss": 0.80135596, + "learning_rate": 0.0008876760685745588, + "loss": 0.81189448, + "num_input_tokens_seen": 103865824, + "router_z_loss_mlp": 0.45507812, + "step": 1253, + "time_per_iteration": 4.858070135116577 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054767, + "balance_loss_mlp": 1.00698781, + "epoch": 0.24124663332050789, + "flos": 615228230400.0, + "grad_norm": 0.036740782431925904, + "language_loss": 0.79496801, + "learning_rate": 0.0008874792452834528, + "loss": 0.80551577, + "num_input_tokens_seen": 103939872, + "router_z_loss_mlp": 0.47753906, + "step": 1254, + "time_per_iteration": 2.756243944168091 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01057417, + "balance_loss_mlp": 1.00954247, + "epoch": 0.24143901500577145, + "flos": 576593291520.0, + "grad_norm": 0.037714132300224086, + "language_loss": 0.87880921, + "learning_rate": 0.0008872822715595626, + "loss": 0.88938332, + "num_input_tokens_seen": 104011120, + "router_z_loss_mlp": 0.47851562, + "step": 1255, + "time_per_iteration": 2.6718733310699463 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01056118, + "balance_loss_mlp": 1.00812411, + "epoch": 0.241631396691035, + "flos": 496147658496.0, + "grad_norm": 0.038695693582970765, + "language_loss": 0.87873089, + "learning_rate": 0.0008870851474793598, + "loss": 0.88929206, + "num_input_tokens_seen": 104077040, + "router_z_loss_mlp": 0.47973633, + "step": 1256, + "time_per_iteration": 2.6313350200653076 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01058027, + "balance_loss_mlp": 1.009866, + "epoch": 0.24182377837629856, + "flos": 637397267712.0, + "grad_norm": 0.03630749648984725, + "language_loss": 0.904266, + "learning_rate": 0.0008868878731193752, + "loss": 0.9148463, + "num_input_tokens_seen": 104150880, + "router_z_loss_mlp": 0.48144531, + "step": 1257, + "time_per_iteration": 2.820671558380127 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052967, + "balance_loss_mlp": 1.00509274, + "epoch": 0.24201616006156215, + "flos": 516350195712.0, + "grad_norm": 0.04098435374075245, + "language_loss": 0.90631104, + "learning_rate": 0.0008866904485561973, + "loss": 0.91684067, + "num_input_tokens_seen": 104223696, + "router_z_loss_mlp": 0.47851562, + "step": 1258, + "time_per_iteration": 2.712970495223999 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053405, + "balance_loss_mlp": 1.0053165, + "epoch": 0.2422085417468257, + "flos": 616379159808.0, + "grad_norm": 0.03199149634406808, + "language_loss": 0.83463258, + "learning_rate": 0.000886492873866473, + "loss": 0.84516662, + "num_input_tokens_seen": 104301728, + "router_z_loss_mlp": 0.48071289, + "step": 1259, + "time_per_iteration": 2.8250985145568848 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051296, + "balance_loss_mlp": 1.00330269, + "epoch": 0.24240092343208927, + "flos": 586913762304.0, + "grad_norm": 0.03973618931504764, + "language_loss": 0.85183978, + "learning_rate": 0.000886295149126908, + "loss": 0.86235273, + "num_input_tokens_seen": 104374480, + "router_z_loss_mlp": 0.47973633, + "step": 1260, + "time_per_iteration": 2.7110049724578857 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051234, + "balance_loss_mlp": 1.00338328, + "epoch": 0.24259330511735283, + "flos": 763572010752.0, + "grad_norm": 0.03275678482299809, + "language_loss": 0.86485362, + "learning_rate": 0.0008860972744142655, + "loss": 0.87536597, + "num_input_tokens_seen": 104452384, + "router_z_loss_mlp": 0.47827148, + "step": 1261, + "time_per_iteration": 2.9053289890289307 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051398, + "balance_loss_mlp": 1.00361907, + "epoch": 0.2427856868026164, + "flos": 628134407424.0, + "grad_norm": 0.03196094686024711, + "language_loss": 0.82455611, + "learning_rate": 0.0008858992498053671, + "loss": 0.83507007, + "num_input_tokens_seen": 104532576, + "router_z_loss_mlp": 0.47753906, + "step": 1262, + "time_per_iteration": 2.8111376762390137 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054321, + "balance_loss_mlp": 1.00797272, + "epoch": 0.24297806848787995, + "flos": 1514922167808.0, + "grad_norm": 0.010120346862694057, + "language_loss": 0.7658875, + "learning_rate": 0.0008857010753770934, + "loss": 0.77643073, + "num_input_tokens_seen": 104765216, + "router_z_loss_mlp": 0.46289062, + "step": 1263, + "time_per_iteration": 4.84857177734375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052202, + "balance_loss_mlp": 1.00420785, + "epoch": 0.2431704501731435, + "flos": 543073384704.0, + "grad_norm": 0.030775668427347653, + "language_loss": 0.83837479, + "learning_rate": 0.0008855027512063817, + "loss": 0.84889686, + "num_input_tokens_seen": 104836912, + "router_z_loss_mlp": 0.47973633, + "step": 1264, + "time_per_iteration": 2.69954252243042 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055608, + "balance_loss_mlp": 1.0077095, + "epoch": 0.24336283185840707, + "flos": 524879143680.0, + "grad_norm": 0.03906981412635217, + "language_loss": 0.86655742, + "learning_rate": 0.0008853042773702292, + "loss": 0.87711346, + "num_input_tokens_seen": 104909280, + "router_z_loss_mlp": 0.47875977, + "step": 1265, + "time_per_iteration": 2.703227996826172 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053016, + "balance_loss_mlp": 1.00530863, + "epoch": 0.24355521354367063, + "flos": 538206228480.0, + "grad_norm": 0.030917867079500824, + "language_loss": 0.88497615, + "learning_rate": 0.0008851056539456896, + "loss": 0.89550632, + "num_input_tokens_seen": 104982560, + "router_z_loss_mlp": 0.47680664, + "step": 1266, + "time_per_iteration": 2.6844840049743652 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054186, + "balance_loss_mlp": 1.00655031, + "epoch": 0.24374759522893422, + "flos": 932109580032.0, + "grad_norm": 0.032880300158599975, + "language_loss": 0.82697207, + "learning_rate": 0.0008849068810098755, + "loss": 0.83751392, + "num_input_tokens_seen": 105075056, + "router_z_loss_mlp": 0.47607422, + "step": 1267, + "time_per_iteration": 3.274641513824463 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055314, + "balance_loss_mlp": 1.00789249, + "epoch": 0.24393997691419778, + "flos": 428685970176.0, + "grad_norm": 0.04273651221625489, + "language_loss": 0.84108871, + "learning_rate": 0.0008847079586399575, + "loss": 0.85164183, + "num_input_tokens_seen": 105137536, + "router_z_loss_mlp": 0.47387695, + "step": 1268, + "time_per_iteration": 2.475217819213867 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01057264, + "balance_loss_mlp": 1.00993788, + "epoch": 0.24413235859946134, + "flos": 579943045632.0, + "grad_norm": 0.03463136192779687, + "language_loss": 0.86878628, + "learning_rate": 0.0008845088869131641, + "loss": 0.87935889, + "num_input_tokens_seen": 105204848, + "router_z_loss_mlp": 0.47290039, + "step": 1269, + "time_per_iteration": 2.676954746246338 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054221, + "balance_loss_mlp": 1.00689447, + "epoch": 0.2443247402847249, + "flos": 530901120000.0, + "grad_norm": 0.04739098518835349, + "language_loss": 0.8972156, + "learning_rate": 0.0008843096659067818, + "loss": 0.90775776, + "num_input_tokens_seen": 105273456, + "router_z_loss_mlp": 0.47290039, + "step": 1270, + "time_per_iteration": 2.6031625270843506 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01056384, + "balance_loss_mlp": 1.00896251, + "epoch": 0.24451712196998845, + "flos": 697625779200.0, + "grad_norm": 0.03005687387855686, + "language_loss": 0.8676796, + "learning_rate": 0.000884110295698155, + "loss": 0.87824345, + "num_input_tokens_seen": 105355488, + "router_z_loss_mlp": 0.47387695, + "step": 1271, + "time_per_iteration": 2.946385145187378 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052146, + "balance_loss_mlp": 1.00460577, + "epoch": 0.24470950365525201, + "flos": 530864181504.0, + "grad_norm": 0.03542850047119753, + "language_loss": 0.86657912, + "learning_rate": 0.0008839107763646861, + "loss": 0.87710059, + "num_input_tokens_seen": 105421568, + "router_z_loss_mlp": 0.47509766, + "step": 1272, + "time_per_iteration": 2.6175343990325928 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01057047, + "balance_loss_mlp": 1.00955379, + "epoch": 0.24490188534051557, + "flos": 492348806400.0, + "grad_norm": 0.04294337139782129, + "language_loss": 0.9099223, + "learning_rate": 0.0008837111079838353, + "loss": 0.92049271, + "num_input_tokens_seen": 105493072, + "router_z_loss_mlp": 0.47460938, + "step": 1273, + "time_per_iteration": 2.699777126312256 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051504, + "balance_loss_mlp": 1.00393975, + "epoch": 0.24509426702577913, + "flos": 475112054016.0, + "grad_norm": 0.03233839715385124, + "language_loss": 0.90686411, + "learning_rate": 0.000883511290633121, + "loss": 0.91737914, + "num_input_tokens_seen": 105559840, + "router_z_loss_mlp": 0.4753418, + "step": 1274, + "time_per_iteration": 2.5347506999969482 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053736, + "balance_loss_mlp": 1.0061239, + "epoch": 0.24528664871104272, + "flos": 551648019456.0, + "grad_norm": 0.029596958484994024, + "language_loss": 0.9283247, + "learning_rate": 0.000883311324390119, + "loss": 0.93886209, + "num_input_tokens_seen": 105634448, + "router_z_loss_mlp": 0.47583008, + "step": 1275, + "time_per_iteration": 2.7105162143707275 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105458, + "balance_loss_mlp": 1.00703931, + "epoch": 0.24547903039630628, + "flos": 827336914176.0, + "grad_norm": 0.04026092464880397, + "language_loss": 0.8227402, + "learning_rate": 0.0008831112093324629, + "loss": 0.83328599, + "num_input_tokens_seen": 105711936, + "router_z_loss_mlp": 0.47509766, + "step": 1276, + "time_per_iteration": 3.0518436431884766 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052804, + "balance_loss_mlp": 1.00523984, + "epoch": 0.24567141208156984, + "flos": 592694665728.0, + "grad_norm": 0.0350541873914122, + "language_loss": 0.89993191, + "learning_rate": 0.0008829109455378444, + "loss": 0.91045994, + "num_input_tokens_seen": 105780240, + "router_z_loss_mlp": 0.4753418, + "step": 1277, + "time_per_iteration": 2.705888032913208 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053461, + "balance_loss_mlp": 1.00606322, + "epoch": 0.2458637937668334, + "flos": 548930110464.0, + "grad_norm": 0.03225743101348484, + "language_loss": 0.87107539, + "learning_rate": 0.000882710533084013, + "loss": 0.88161004, + "num_input_tokens_seen": 105849840, + "router_z_loss_mlp": 0.47363281, + "step": 1278, + "time_per_iteration": 2.6600000858306885 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051675, + "balance_loss_mlp": 1.00418186, + "epoch": 0.24605617545209696, + "flos": 516912054528.0, + "grad_norm": 0.031446449457072034, + "language_loss": 0.89965951, + "learning_rate": 0.0008825099720487755, + "loss": 0.91017628, + "num_input_tokens_seen": 105921488, + "router_z_loss_mlp": 0.47460938, + "step": 1279, + "time_per_iteration": 2.6381545066833496 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059078, + "balance_loss_mlp": 1.01320648, + "epoch": 0.24624855713736052, + "flos": 1515061173504.0, + "grad_norm": 0.006597619453236458, + "language_loss": 0.7526114, + "learning_rate": 0.0008823092625099967, + "loss": 0.76320213, + "num_input_tokens_seen": 106146816, + "router_z_loss_mlp": 0.45800781, + "step": 1280, + "time_per_iteration": 4.836413621902466 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01056755, + "balance_loss_mlp": 1.0109787, + "epoch": 0.24644093882262408, + "flos": 1530749421312.0, + "grad_norm": 0.006438131933853504, + "language_loss": 0.77944112, + "learning_rate": 0.0008821084045455987, + "loss": 0.79000866, + "num_input_tokens_seen": 106361568, + "router_z_loss_mlp": 0.45703125, + "step": 1281, + "time_per_iteration": 4.763012409210205 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055673, + "balance_loss_mlp": 1.00817966, + "epoch": 0.24663332050788764, + "flos": 660349794816.0, + "grad_norm": 0.03366863359794558, + "language_loss": 0.89743239, + "learning_rate": 0.0008819073982335619, + "loss": 0.90798908, + "num_input_tokens_seen": 106435296, + "router_z_loss_mlp": 0.47460938, + "step": 1282, + "time_per_iteration": 2.830066204071045 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051734, + "balance_loss_mlp": 1.00426519, + "epoch": 0.24682570219315123, + "flos": 542806066944.0, + "grad_norm": 0.034270358372240205, + "language_loss": 0.85323066, + "learning_rate": 0.0008817062436519235, + "loss": 0.86374807, + "num_input_tokens_seen": 106507184, + "router_z_loss_mlp": 0.47436523, + "step": 1283, + "time_per_iteration": 2.6451101303100586 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054467, + "balance_loss_mlp": 1.00680709, + "epoch": 0.24701808387841478, + "flos": 441659221248.0, + "grad_norm": 0.03422998600893363, + "language_loss": 0.90367711, + "learning_rate": 0.0008815049408787788, + "loss": 0.91422176, + "num_input_tokens_seen": 106571472, + "router_z_loss_mlp": 0.47631836, + "step": 1284, + "time_per_iteration": 2.5568699836730957 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054482, + "balance_loss_mlp": 1.00672722, + "epoch": 0.24721046556367834, + "flos": 469033697280.0, + "grad_norm": 0.036620952447016124, + "language_loss": 0.86045629, + "learning_rate": 0.0008813034899922805, + "loss": 0.87100112, + "num_input_tokens_seen": 106638368, + "router_z_loss_mlp": 0.47729492, + "step": 1285, + "time_per_iteration": 2.5571885108947754 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052621, + "balance_loss_mlp": 1.00498545, + "epoch": 0.2474028472489419, + "flos": 505408573440.0, + "grad_norm": 0.03938899634346209, + "language_loss": 0.90811062, + "learning_rate": 0.0008811018910706387, + "loss": 0.91863692, + "num_input_tokens_seen": 106705312, + "router_z_loss_mlp": 0.47607422, + "step": 1286, + "time_per_iteration": 2.5542702674865723 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105262, + "balance_loss_mlp": 1.00496054, + "epoch": 0.24759522893420546, + "flos": 480956140800.0, + "grad_norm": 0.04329385189604929, + "language_loss": 0.82886434, + "learning_rate": 0.0008809001441921211, + "loss": 0.83939052, + "num_input_tokens_seen": 106778624, + "router_z_loss_mlp": 0.47631836, + "step": 1287, + "time_per_iteration": 2.7426302433013916 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01056359, + "balance_loss_mlp": 1.00879443, + "epoch": 0.24778761061946902, + "flos": 534754407168.0, + "grad_norm": 0.03495005483538565, + "language_loss": 0.86372733, + "learning_rate": 0.0008806982494350528, + "loss": 0.87429094, + "num_input_tokens_seen": 106847744, + "router_z_loss_mlp": 0.4753418, + "step": 1288, + "time_per_iteration": 2.6200613975524902 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054037, + "balance_loss_mlp": 1.0063771, + "epoch": 0.24797999230473258, + "flos": 560943927552.0, + "grad_norm": 0.028534619779485338, + "language_loss": 0.90820038, + "learning_rate": 0.0008804962068778161, + "loss": 0.91874075, + "num_input_tokens_seen": 106927584, + "router_z_loss_mlp": 0.47631836, + "step": 1289, + "time_per_iteration": 2.8445866107940674 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050603, + "balance_loss_mlp": 1.00287127, + "epoch": 0.24817237398999614, + "flos": 625481627136.0, + "grad_norm": 0.033144052318390974, + "language_loss": 0.81476247, + "learning_rate": 0.0008802940165988511, + "loss": 0.82526851, + "num_input_tokens_seen": 107006656, + "router_z_loss_mlp": 0.47705078, + "step": 1290, + "time_per_iteration": 2.874469518661499 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052663, + "balance_loss_mlp": 1.00500298, + "epoch": 0.2483647556752597, + "flos": 613485306624.0, + "grad_norm": 0.033485904546120666, + "language_loss": 0.88976955, + "learning_rate": 0.000880091678676655, + "loss": 0.90029621, + "num_input_tokens_seen": 107084352, + "router_z_loss_mlp": 0.47631836, + "step": 1291, + "time_per_iteration": 2.8294923305511475 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049235, + "balance_loss_mlp": 1.00159943, + "epoch": 0.2485571373605233, + "flos": 584688692736.0, + "grad_norm": 0.030875088012072577, + "language_loss": 0.89826584, + "learning_rate": 0.0008798891931897821, + "loss": 0.90875816, + "num_input_tokens_seen": 107158368, + "router_z_loss_mlp": 0.47607422, + "step": 1292, + "time_per_iteration": 2.7068471908569336 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050902, + "balance_loss_mlp": 1.00359952, + "epoch": 0.24874951904578685, + "flos": 495737444352.0, + "grad_norm": 0.03670876005724945, + "language_loss": 0.84959131, + "learning_rate": 0.0008796865602168447, + "loss": 0.86010033, + "num_input_tokens_seen": 107224256, + "router_z_loss_mlp": 0.47265625, + "step": 1293, + "time_per_iteration": 2.550218343734741 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052588, + "balance_loss_mlp": 1.00526226, + "epoch": 0.2489419007310504, + "flos": 457174437120.0, + "grad_norm": 0.03243940706171699, + "language_loss": 0.89144397, + "learning_rate": 0.0008794837798365115, + "loss": 0.90196991, + "num_input_tokens_seen": 107292720, + "router_z_loss_mlp": 0.47290039, + "step": 1294, + "time_per_iteration": 2.6271979808807373 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051557, + "balance_loss_mlp": 1.00420678, + "epoch": 0.24913428241631397, + "flos": 486565957632.0, + "grad_norm": 0.03268946967982851, + "language_loss": 0.89255542, + "learning_rate": 0.0008792808521275089, + "loss": 0.90307105, + "num_input_tokens_seen": 107368576, + "router_z_loss_mlp": 0.47314453, + "step": 1295, + "time_per_iteration": 2.733107566833496 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052983, + "balance_loss_mlp": 1.00544262, + "epoch": 0.24932666410157753, + "flos": 519918668544.0, + "grad_norm": 0.031266052737173484, + "language_loss": 0.88015056, + "learning_rate": 0.0008790777771686206, + "loss": 0.89068043, + "num_input_tokens_seen": 107433856, + "router_z_loss_mlp": 0.47509766, + "step": 1296, + "time_per_iteration": 2.5860161781311035 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053628, + "balance_loss_mlp": 1.0059917, + "epoch": 0.2495190457868411, + "flos": 473557713408.0, + "grad_norm": 0.03428757295266267, + "language_loss": 0.86048388, + "learning_rate": 0.0008788745550386872, + "loss": 0.8710202, + "num_input_tokens_seen": 107500944, + "router_z_loss_mlp": 0.47607422, + "step": 1297, + "time_per_iteration": 2.599851608276367 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055496, + "balance_loss_mlp": 1.00776434, + "epoch": 0.24971142747210465, + "flos": 747199428096.0, + "grad_norm": 0.03345883603952397, + "language_loss": 0.80858141, + "learning_rate": 0.0008786711858166063, + "loss": 0.81913638, + "num_input_tokens_seen": 107580000, + "router_z_loss_mlp": 0.47705078, + "step": 1298, + "time_per_iteration": 2.9357736110687256 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055455, + "balance_loss_mlp": 1.00770009, + "epoch": 0.2499038091573682, + "flos": 750903015936.0, + "grad_norm": 0.03503874681650984, + "language_loss": 0.84951854, + "learning_rate": 0.0008784676695813332, + "loss": 0.86007309, + "num_input_tokens_seen": 107660384, + "router_z_loss_mlp": 0.47729492, + "step": 1299, + "time_per_iteration": 2.955172538757324 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055374, + "balance_loss_mlp": 1.00776184, + "epoch": 0.2500961908426318, + "flos": 746344006656.0, + "grad_norm": 0.032686560936085865, + "language_loss": 0.85840905, + "learning_rate": 0.0008782640064118796, + "loss": 0.86896276, + "num_input_tokens_seen": 107736320, + "router_z_loss_mlp": 0.47583008, + "step": 1300, + "time_per_iteration": 2.897998571395874 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055206, + "balance_loss_mlp": 1.00904846, + "epoch": 0.2502885725278953, + "flos": 1420526353152.0, + "grad_norm": 0.0075534145797937526, + "language_loss": 0.7618475, + "learning_rate": 0.0008780601963873149, + "loss": 0.77239954, + "num_input_tokens_seen": 107972608, + "router_z_loss_mlp": 0.4609375, + "step": 1301, + "time_per_iteration": 5.023081541061401 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105668, + "balance_loss_mlp": 1.00904393, + "epoch": 0.2504809542131589, + "flos": 516232577280.0, + "grad_norm": 0.03748206036604932, + "language_loss": 0.87484509, + "learning_rate": 0.0008778562395867648, + "loss": 0.88541192, + "num_input_tokens_seen": 108043312, + "router_z_loss_mlp": 0.47607422, + "step": 1302, + "time_per_iteration": 2.593972682952881 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105251, + "balance_loss_mlp": 1.00477886, + "epoch": 0.25067333589842244, + "flos": 526852446720.0, + "grad_norm": 0.031223058919554587, + "language_loss": 0.84117836, + "learning_rate": 0.0008776521360894127, + "loss": 0.85170352, + "num_input_tokens_seen": 108114144, + "router_z_loss_mlp": 0.47705078, + "step": 1303, + "time_per_iteration": 2.6153149604797363 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069588, + "balance_loss_mlp": 1.02342987, + "epoch": 0.25086571758368603, + "flos": 1477160146944.0, + "grad_norm": 0.014969332736355754, + "language_loss": 0.78962064, + "learning_rate": 0.0008774478859744984, + "loss": 0.80031657, + "num_input_tokens_seen": 108338720, + "router_z_loss_mlp": 0.4609375, + "step": 1304, + "time_per_iteration": 4.792739629745483 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053769, + "balance_loss_mlp": 1.00649047, + "epoch": 0.2510580992689496, + "flos": 529403159808.0, + "grad_norm": 0.03453306909815573, + "language_loss": 0.91369265, + "learning_rate": 0.0008772434893213186, + "loss": 0.92423034, + "num_input_tokens_seen": 108405456, + "router_z_loss_mlp": 0.47241211, + "step": 1305, + "time_per_iteration": 2.581268072128296 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01056214, + "balance_loss_mlp": 1.00919807, + "epoch": 0.25125048095421315, + "flos": 518466395136.0, + "grad_norm": 0.035319884850533015, + "language_loss": 0.84733635, + "learning_rate": 0.0008770389462092276, + "loss": 0.85789847, + "num_input_tokens_seen": 108474368, + "router_z_loss_mlp": 0.46972656, + "step": 1306, + "time_per_iteration": 2.627317428588867 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01056702, + "balance_loss_mlp": 1.00951862, + "epoch": 0.25144286263947674, + "flos": 621675972096.0, + "grad_norm": 0.03558379494917989, + "language_loss": 0.87486076, + "learning_rate": 0.0008768342567176357, + "loss": 0.88542777, + "num_input_tokens_seen": 108548864, + "router_z_loss_mlp": 0.47143555, + "step": 1307, + "time_per_iteration": 2.787318706512451 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052534, + "balance_loss_mlp": 1.00537527, + "epoch": 0.25163524432474027, + "flos": 504866156544.0, + "grad_norm": 0.03616031366836922, + "language_loss": 0.9109531, + "learning_rate": 0.0008766294209260107, + "loss": 0.92147839, + "num_input_tokens_seen": 108623072, + "router_z_loss_mlp": 0.47119141, + "step": 1308, + "time_per_iteration": 2.6384546756744385 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105248, + "balance_loss_mlp": 1.00510657, + "epoch": 0.25182762601000386, + "flos": 510080343552.0, + "grad_norm": 0.03702737725286332, + "language_loss": 0.92033225, + "learning_rate": 0.0008764244389138767, + "loss": 0.93085706, + "num_input_tokens_seen": 108690128, + "router_z_loss_mlp": 0.47338867, + "step": 1309, + "time_per_iteration": 2.5620551109313965 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053435, + "balance_loss_mlp": 1.006037, + "epoch": 0.2520200076952674, + "flos": 635098321152.0, + "grad_norm": 0.03928250470986306, + "language_loss": 0.83104628, + "learning_rate": 0.000876219310760815, + "loss": 0.84158063, + "num_input_tokens_seen": 108770272, + "router_z_loss_mlp": 0.47363281, + "step": 1310, + "time_per_iteration": 2.886335849761963 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053423, + "balance_loss_mlp": 1.00614405, + "epoch": 0.252212389380531, + "flos": 495652873728.0, + "grad_norm": 0.03544669215118347, + "language_loss": 0.82256365, + "learning_rate": 0.0008760140365464631, + "loss": 0.83309782, + "num_input_tokens_seen": 108840592, + "router_z_loss_mlp": 0.47241211, + "step": 1311, + "time_per_iteration": 2.607191801071167 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053107, + "balance_loss_mlp": 1.00592351, + "epoch": 0.2524047710657945, + "flos": 491530323456.0, + "grad_norm": 0.037974131054051216, + "language_loss": 0.87817502, + "learning_rate": 0.0008758086163505156, + "loss": 0.88870609, + "num_input_tokens_seen": 108910064, + "router_z_loss_mlp": 0.47143555, + "step": 1312, + "time_per_iteration": 2.6121339797973633 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052408, + "balance_loss_mlp": 1.00505757, + "epoch": 0.2525971527510581, + "flos": 648613989120.0, + "grad_norm": 0.03226827566126977, + "language_loss": 0.90228277, + "learning_rate": 0.0008756030502527239, + "loss": 0.91280687, + "num_input_tokens_seen": 108986336, + "router_z_loss_mlp": 0.47314453, + "step": 1313, + "time_per_iteration": 2.8256115913391113 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049721, + "balance_loss_mlp": 1.00234711, + "epoch": 0.2527895344363217, + "flos": 570373983744.0, + "grad_norm": 0.0325160066751772, + "language_loss": 0.907884, + "learning_rate": 0.0008753973383328954, + "loss": 0.91838121, + "num_input_tokens_seen": 109059712, + "router_z_loss_mlp": 0.47338867, + "step": 1314, + "time_per_iteration": 2.722231388092041 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051583, + "balance_loss_mlp": 1.00423265, + "epoch": 0.2529819161215852, + "flos": 515069008896.0, + "grad_norm": 0.040482030139478604, + "language_loss": 0.8500945, + "learning_rate": 0.0008751914806708952, + "loss": 0.86061025, + "num_input_tokens_seen": 109127504, + "router_z_loss_mlp": 0.47314453, + "step": 1315, + "time_per_iteration": 2.593076229095459 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051184, + "balance_loss_mlp": 1.00376213, + "epoch": 0.2531742978068488, + "flos": 532351448064.0, + "grad_norm": 0.03414491036051862, + "language_loss": 0.82694548, + "learning_rate": 0.0008749854773466439, + "loss": 0.8374573, + "num_input_tokens_seen": 109198080, + "router_z_loss_mlp": 0.47387695, + "step": 1316, + "time_per_iteration": 2.660116672515869 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054789, + "balance_loss_mlp": 1.00722456, + "epoch": 0.25336667949211233, + "flos": 597748459776.0, + "grad_norm": 0.03206754273868493, + "language_loss": 0.84984171, + "learning_rate": 0.0008747793284401192, + "loss": 0.86038959, + "num_input_tokens_seen": 109268368, + "router_z_loss_mlp": 0.4753418, + "step": 1317, + "time_per_iteration": 2.692183017730713 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105185, + "balance_loss_mlp": 1.00407052, + "epoch": 0.2535590611773759, + "flos": 603256209408.0, + "grad_norm": 0.034288977750124294, + "language_loss": 0.85941386, + "learning_rate": 0.0008745730340313551, + "loss": 0.86993235, + "num_input_tokens_seen": 109344112, + "router_z_loss_mlp": 0.47753906, + "step": 1318, + "time_per_iteration": 2.7932682037353516 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105299, + "balance_loss_mlp": 1.00525868, + "epoch": 0.25375144286263945, + "flos": 496323602688.0, + "grad_norm": 0.035249055653748196, + "language_loss": 0.8522734, + "learning_rate": 0.0008743665942004422, + "loss": 0.86280334, + "num_input_tokens_seen": 109414112, + "router_z_loss_mlp": 0.47705078, + "step": 1319, + "time_per_iteration": 2.6616318225860596 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052413, + "balance_loss_mlp": 1.00465751, + "epoch": 0.25394382454790304, + "flos": 513477729792.0, + "grad_norm": 0.032623992793633046, + "language_loss": 0.93257391, + "learning_rate": 0.0008741600090275277, + "loss": 0.94309807, + "num_input_tokens_seen": 109484336, + "router_z_loss_mlp": 0.47729492, + "step": 1320, + "time_per_iteration": 2.567985773086548 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051086, + "balance_loss_mlp": 1.00333035, + "epoch": 0.25413620623316663, + "flos": 960856616448.0, + "grad_norm": 0.03465281335593922, + "language_loss": 0.8488484, + "learning_rate": 0.0008739532785928151, + "loss": 0.85935926, + "num_input_tokens_seen": 109590128, + "router_z_loss_mlp": 0.47729492, + "step": 1321, + "time_per_iteration": 3.4506430625915527 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054222, + "balance_loss_mlp": 1.00882721, + "epoch": 0.25432858791843016, + "flos": 1580651625984.0, + "grad_norm": 0.01348888133328934, + "language_loss": 0.74893582, + "learning_rate": 0.0008737464029765639, + "loss": 0.75947809, + "num_input_tokens_seen": 109816592, + "router_z_loss_mlp": 0.453125, + "step": 1322, + "time_per_iteration": 4.819811820983887 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055371, + "balance_loss_mlp": 1.00752044, + "epoch": 0.25452096960369375, + "flos": 584894772480.0, + "grad_norm": 0.03690210205672512, + "language_loss": 0.83839363, + "learning_rate": 0.0008735393822590908, + "loss": 0.84894735, + "num_input_tokens_seen": 109890464, + "router_z_loss_mlp": 0.47827148, + "step": 1323, + "time_per_iteration": 2.680769681930542 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069306, + "balance_loss_mlp": 1.02138364, + "epoch": 0.2547133512889573, + "flos": 509641939200.0, + "grad_norm": 0.03795743442729459, + "language_loss": 0.87760162, + "learning_rate": 0.0008733322165207681, + "loss": 0.8882947, + "num_input_tokens_seen": 109963408, + "router_z_loss_mlp": 0.47900391, + "step": 1324, + "time_per_iteration": 2.6391303539276123 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01056249, + "balance_loss_mlp": 1.00856507, + "epoch": 0.25490573297422087, + "flos": 784037008128.0, + "grad_norm": 0.03625483542623235, + "language_loss": 0.83670151, + "learning_rate": 0.0008731249058420247, + "loss": 0.84726399, + "num_input_tokens_seen": 110048800, + "router_z_loss_mlp": 0.4765625, + "step": 1325, + "time_per_iteration": 3.0179827213287354 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062074, + "balance_loss_mlp": 1.01479542, + "epoch": 0.2550981146594844, + "flos": 510953261568.0, + "grad_norm": 0.03728184694741104, + "language_loss": 0.91373062, + "learning_rate": 0.0008729174503033459, + "loss": 0.92435133, + "num_input_tokens_seen": 110118096, + "router_z_loss_mlp": 0.47241211, + "step": 1326, + "time_per_iteration": 2.644351005554199 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059853, + "balance_loss_mlp": 1.01262248, + "epoch": 0.255290496344748, + "flos": 677931632640.0, + "grad_norm": 0.04262364220636159, + "language_loss": 0.83700824, + "learning_rate": 0.0008727098499852728, + "loss": 0.84760678, + "num_input_tokens_seen": 110190160, + "router_z_loss_mlp": 0.47192383, + "step": 1327, + "time_per_iteration": 2.8393821716308594 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059289, + "balance_loss_mlp": 1.01212943, + "epoch": 0.2554828780300115, + "flos": 538985827584.0, + "grad_norm": 0.0346626903619469, + "language_loss": 0.90499496, + "learning_rate": 0.0008725021049684034, + "loss": 0.91558784, + "num_input_tokens_seen": 110268000, + "router_z_loss_mlp": 0.47119141, + "step": 1328, + "time_per_iteration": 2.74480938911438 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052564, + "balance_loss_mlp": 1.00554764, + "epoch": 0.2556752597152751, + "flos": 825624125952.0, + "grad_norm": 0.0321884383853499, + "language_loss": 0.83690739, + "learning_rate": 0.000872294215333391, + "loss": 0.84743297, + "num_input_tokens_seen": 110354816, + "router_z_loss_mlp": 0.46972656, + "step": 1329, + "time_per_iteration": 3.177448034286499 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066156, + "balance_loss_mlp": 1.01880646, + "epoch": 0.2558676414005387, + "flos": 571891385856.0, + "grad_norm": 0.037080167806849716, + "language_loss": 0.84060931, + "learning_rate": 0.0008720861811609457, + "loss": 0.85127091, + "num_input_tokens_seen": 110427968, + "router_z_loss_mlp": 0.47314453, + "step": 1330, + "time_per_iteration": 2.7320711612701416 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054633, + "balance_loss_mlp": 1.00745046, + "epoch": 0.2560600230858022, + "flos": 487748967936.0, + "grad_norm": 0.03498979971426328, + "language_loss": 0.84052318, + "learning_rate": 0.0008718780025318338, + "loss": 0.85106957, + "num_input_tokens_seen": 110501184, + "router_z_loss_mlp": 0.47143555, + "step": 1331, + "time_per_iteration": 2.7297112941741943 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053924, + "balance_loss_mlp": 1.00705111, + "epoch": 0.2562524047710658, + "flos": 514120268544.0, + "grad_norm": 0.03699782349212247, + "language_loss": 0.84697664, + "learning_rate": 0.0008716696795268771, + "loss": 0.85751587, + "num_input_tokens_seen": 110573008, + "router_z_loss_mlp": 0.46826172, + "step": 1332, + "time_per_iteration": 2.6615397930145264 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054318, + "balance_loss_mlp": 1.00756466, + "epoch": 0.25644478645632934, + "flos": 636110244864.0, + "grad_norm": 0.03600089626817585, + "language_loss": 0.85914254, + "learning_rate": 0.0008714612122269538, + "loss": 0.86968577, + "num_input_tokens_seen": 110646704, + "router_z_loss_mlp": 0.46704102, + "step": 1333, + "time_per_iteration": 2.849813938140869 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01056443, + "balance_loss_mlp": 1.00968957, + "epoch": 0.25663716814159293, + "flos": 437545419264.0, + "grad_norm": 0.03932780780666976, + "language_loss": 0.90516675, + "learning_rate": 0.0008712526007129982, + "loss": 0.91573119, + "num_input_tokens_seen": 110712208, + "router_z_loss_mlp": 0.46704102, + "step": 1334, + "time_per_iteration": 2.520730972290039 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053556, + "balance_loss_mlp": 1.00675464, + "epoch": 0.25682954982685646, + "flos": 499243700736.0, + "grad_norm": 0.03395243638019146, + "language_loss": 0.9133085, + "learning_rate": 0.0008710438450660003, + "loss": 0.9238441, + "num_input_tokens_seen": 110783936, + "router_z_loss_mlp": 0.4675293, + "step": 1335, + "time_per_iteration": 2.6936721801757812 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053065, + "balance_loss_mlp": 1.00590599, + "epoch": 0.25702193151212005, + "flos": 458628655872.0, + "grad_norm": 0.038911849114865095, + "language_loss": 0.8791827, + "learning_rate": 0.0008708349453670064, + "loss": 0.88971329, + "num_input_tokens_seen": 110848560, + "router_z_loss_mlp": 0.47119141, + "step": 1336, + "time_per_iteration": 2.520390510559082 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074594, + "balance_loss_mlp": 1.02733934, + "epoch": 0.2572143131973836, + "flos": 599404867584.0, + "grad_norm": 0.03723585257139378, + "language_loss": 0.92015922, + "learning_rate": 0.0008706259016971185, + "loss": 0.93090516, + "num_input_tokens_seen": 110922672, + "router_z_loss_mlp": 0.47216797, + "step": 1337, + "time_per_iteration": 2.792436361312866 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055792, + "balance_loss_mlp": 1.00872791, + "epoch": 0.25740669488264717, + "flos": 699527150592.0, + "grad_norm": 0.04259016947882448, + "language_loss": 0.8355068, + "learning_rate": 0.0008704167141374944, + "loss": 0.84606469, + "num_input_tokens_seen": 110995456, + "router_z_loss_mlp": 0.47021484, + "step": 1338, + "time_per_iteration": 2.806931972503662 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01056758, + "balance_loss_mlp": 1.01014686, + "epoch": 0.25759907656791076, + "flos": 503378889984.0, + "grad_norm": 0.03686560218677495, + "language_loss": 0.88890558, + "learning_rate": 0.0008702073827693482, + "loss": 0.89947319, + "num_input_tokens_seen": 111069568, + "router_z_loss_mlp": 0.46557617, + "step": 1339, + "time_per_iteration": 2.7613115310668945 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01057595, + "balance_loss_mlp": 1.01112759, + "epoch": 0.2577914582531743, + "flos": 775242687744.0, + "grad_norm": 0.03484469931885578, + "language_loss": 0.89865053, + "learning_rate": 0.0008699979076739494, + "loss": 0.90922654, + "num_input_tokens_seen": 111142608, + "router_z_loss_mlp": 0.46411133, + "step": 1340, + "time_per_iteration": 2.9694418907165527 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052163, + "balance_loss_mlp": 1.00552797, + "epoch": 0.2579838399384379, + "flos": 460610707200.0, + "grad_norm": 0.04216529081594553, + "language_loss": 0.89380765, + "learning_rate": 0.0008697882889326234, + "loss": 0.9043293, + "num_input_tokens_seen": 111206336, + "router_z_loss_mlp": 0.46582031, + "step": 1341, + "time_per_iteration": 2.5050456523895264 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051599, + "balance_loss_mlp": 1.00482166, + "epoch": 0.2581762216237014, + "flos": 570263168256.0, + "grad_norm": 0.03742337984590145, + "language_loss": 0.87203884, + "learning_rate": 0.0008695785266267515, + "loss": 0.88255489, + "num_input_tokens_seen": 111276736, + "router_z_loss_mlp": 0.46728516, + "step": 1342, + "time_per_iteration": 2.677072763442993 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01057516, + "balance_loss_mlp": 1.01069069, + "epoch": 0.258368603308965, + "flos": 605387960064.0, + "grad_norm": 0.035138016776099276, + "language_loss": 0.83827055, + "learning_rate": 0.0008693686208377704, + "loss": 0.84884572, + "num_input_tokens_seen": 111353856, + "router_z_loss_mlp": 0.46777344, + "step": 1343, + "time_per_iteration": 2.826026439666748 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054134, + "balance_loss_mlp": 1.0075947, + "epoch": 0.2585609849942285, + "flos": 492487812096.0, + "grad_norm": 0.03194520317053949, + "language_loss": 0.89379156, + "learning_rate": 0.0008691585716471733, + "loss": 0.90433288, + "num_input_tokens_seen": 111424960, + "router_z_loss_mlp": 0.46484375, + "step": 1344, + "time_per_iteration": 2.6379647254943848 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053033, + "balance_loss_mlp": 1.00646937, + "epoch": 0.2587533666794921, + "flos": 641958222336.0, + "grad_norm": 0.03185107281306307, + "language_loss": 0.86602217, + "learning_rate": 0.0008689483791365079, + "loss": 0.87655246, + "num_input_tokens_seen": 111505248, + "router_z_loss_mlp": 0.46508789, + "step": 1345, + "time_per_iteration": 2.8372344970703125 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105515, + "balance_loss_mlp": 1.00868249, + "epoch": 0.2589457483647557, + "flos": 577995987456.0, + "grad_norm": 0.038033594557881883, + "language_loss": 0.90178049, + "learning_rate": 0.0008687380433873786, + "loss": 0.91233194, + "num_input_tokens_seen": 111581936, + "router_z_loss_mlp": 0.46411133, + "step": 1346, + "time_per_iteration": 2.7660248279571533 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105286, + "balance_loss_mlp": 1.00636888, + "epoch": 0.25913813005001923, + "flos": 536467195392.0, + "grad_norm": 0.03823400300780179, + "language_loss": 0.83192778, + "learning_rate": 0.0008685275644814448, + "loss": 0.8424564, + "num_input_tokens_seen": 111651456, + "router_z_loss_mlp": 0.46435547, + "step": 1347, + "time_per_iteration": 2.6657776832580566 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01058039, + "balance_loss_mlp": 1.01118934, + "epoch": 0.2593305117352828, + "flos": 722347474944.0, + "grad_norm": 0.04308500968206218, + "language_loss": 0.85215819, + "learning_rate": 0.0008683169425004216, + "loss": 0.86273861, + "num_input_tokens_seen": 111731712, + "router_z_loss_mlp": 0.46801758, + "step": 1348, + "time_per_iteration": 2.8938682079315186 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067463, + "balance_loss_mlp": 1.02058995, + "epoch": 0.25952289342054635, + "flos": 711356275200.0, + "grad_norm": 0.04420512127692048, + "language_loss": 0.84604859, + "learning_rate": 0.0008681061775260799, + "loss": 0.85672331, + "num_input_tokens_seen": 111800752, + "router_z_loss_mlp": 0.46826172, + "step": 1349, + "time_per_iteration": 2.8803627490997314 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105274, + "balance_loss_mlp": 1.00634348, + "epoch": 0.25971527510580994, + "flos": 456850738944.0, + "grad_norm": 0.03368144531989068, + "language_loss": 0.92376006, + "learning_rate": 0.0008678952696402458, + "loss": 0.93428755, + "num_input_tokens_seen": 111866752, + "router_z_loss_mlp": 0.46337891, + "step": 1350, + "time_per_iteration": 2.5544798374176025 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054358, + "balance_loss_mlp": 1.00824761, + "epoch": 0.25990765679107347, + "flos": 613754569728.0, + "grad_norm": 0.03011764192417466, + "language_loss": 0.87159944, + "learning_rate": 0.000867684218924801, + "loss": 0.88214302, + "num_input_tokens_seen": 111951328, + "router_z_loss_mlp": 0.46044922, + "step": 1351, + "time_per_iteration": 2.856372833251953 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069405, + "balance_loss_mlp": 1.02496338, + "epoch": 0.26010003847633706, + "flos": 1541407196160.0, + "grad_norm": 0.012951365709411706, + "language_loss": 0.78947091, + "learning_rate": 0.0008674730254616827, + "loss": 0.80016494, + "num_input_tokens_seen": 112182272, + "router_z_loss_mlp": 0.4453125, + "step": 1352, + "time_per_iteration": 4.943616628646851 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01058433, + "balance_loss_mlp": 1.01194191, + "epoch": 0.2602924201616006, + "flos": 717545447424.0, + "grad_norm": 0.029832851456929797, + "language_loss": 0.85926312, + "learning_rate": 0.0008672616893328834, + "loss": 0.86984742, + "num_input_tokens_seen": 112261760, + "router_z_loss_mlp": 0.46435547, + "step": 1353, + "time_per_iteration": 2.913235664367676 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01056755, + "balance_loss_mlp": 1.01012051, + "epoch": 0.2604848018468642, + "flos": 644686824960.0, + "grad_norm": 0.03749633937906014, + "language_loss": 0.91143578, + "learning_rate": 0.0008670502106204512, + "loss": 0.92200339, + "num_input_tokens_seen": 112339136, + "router_z_loss_mlp": 0.46582031, + "step": 1354, + "time_per_iteration": 2.821753978729248 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091569, + "balance_loss_mlp": 1.0442189, + "epoch": 0.26067718353212777, + "flos": 518038684416.0, + "grad_norm": 0.04686611644365056, + "language_loss": 0.82400739, + "learning_rate": 0.0008668385894064892, + "loss": 0.83492303, + "num_input_tokens_seen": 112409872, + "router_z_loss_mlp": 0.47314453, + "step": 1355, + "time_per_iteration": 2.642392158508301 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01056006, + "balance_loss_mlp": 1.00925195, + "epoch": 0.2608695652173913, + "flos": 824226287616.0, + "grad_norm": 0.03313451231790272, + "language_loss": 0.89331532, + "learning_rate": 0.0008666268257731562, + "loss": 0.90387547, + "num_input_tokens_seen": 112495616, + "router_z_loss_mlp": 0.46704102, + "step": 1356, + "time_per_iteration": 3.1127805709838867 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060563, + "balance_loss_mlp": 1.01414335, + "epoch": 0.2610619469026549, + "flos": 1009450422528.0, + "grad_norm": 0.04035878870854939, + "language_loss": 0.86687934, + "learning_rate": 0.0008664149198026662, + "loss": 0.87748504, + "num_input_tokens_seen": 112575168, + "router_z_loss_mlp": 0.46362305, + "step": 1357, + "time_per_iteration": 3.2328455448150635 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106616, + "balance_loss_mlp": 1.01971614, + "epoch": 0.2612543285879184, + "flos": 537826149888.0, + "grad_norm": 0.03943672852684058, + "language_loss": 0.8952527, + "learning_rate": 0.0008662028715772883, + "loss": 0.90591431, + "num_input_tokens_seen": 112648480, + "router_z_loss_mlp": 0.46386719, + "step": 1358, + "time_per_iteration": 2.621894359588623 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01058771, + "balance_loss_mlp": 1.01213586, + "epoch": 0.261446710273182, + "flos": 520439698176.0, + "grad_norm": 0.03590038892764462, + "language_loss": 0.86476588, + "learning_rate": 0.0008659906811793467, + "loss": 0.87535357, + "num_input_tokens_seen": 112719856, + "router_z_loss_mlp": 0.46582031, + "step": 1359, + "time_per_iteration": 2.6540629863739014 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054238, + "balance_loss_mlp": 1.00741243, + "epoch": 0.26163909195844554, + "flos": 584399987712.0, + "grad_norm": 0.03384500135634075, + "language_loss": 0.90458202, + "learning_rate": 0.0008657783486912215, + "loss": 0.91512442, + "num_input_tokens_seen": 112795088, + "router_z_loss_mlp": 0.46777344, + "step": 1360, + "time_per_iteration": 2.71598744392395 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063542, + "balance_loss_mlp": 1.01626348, + "epoch": 0.2618314736437091, + "flos": 960369613056.0, + "grad_norm": 0.03695926115068694, + "language_loss": 0.90376949, + "learning_rate": 0.0008655658741953472, + "loss": 0.91440493, + "num_input_tokens_seen": 112879888, + "router_z_loss_mlp": 0.47241211, + "step": 1361, + "time_per_iteration": 3.233081102371216 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061537, + "balance_loss_mlp": 1.01413929, + "epoch": 0.26202385532897265, + "flos": 575903120640.0, + "grad_norm": 0.032102410789184695, + "language_loss": 0.892542, + "learning_rate": 0.0008653532577742136, + "loss": 0.90315735, + "num_input_tokens_seen": 112952208, + "router_z_loss_mlp": 0.47363281, + "step": 1362, + "time_per_iteration": 2.671513319015503 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053634, + "balance_loss_mlp": 1.00673676, + "epoch": 0.26221623701423624, + "flos": 446398065408.0, + "grad_norm": 0.034188430773875136, + "language_loss": 0.88125902, + "learning_rate": 0.0008651404995103659, + "loss": 0.8917954, + "num_input_tokens_seen": 113017472, + "router_z_loss_mlp": 0.46850586, + "step": 1363, + "time_per_iteration": 2.5599000453948975 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064691, + "balance_loss_mlp": 1.01803255, + "epoch": 0.26240861869949983, + "flos": 536755900416.0, + "grad_norm": 0.03309695956224158, + "language_loss": 0.87925225, + "learning_rate": 0.0008649275994864041, + "loss": 0.88989913, + "num_input_tokens_seen": 113090000, + "router_z_loss_mlp": 0.46606445, + "step": 1364, + "time_per_iteration": 2.68673038482666 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061602, + "balance_loss_mlp": 1.01472914, + "epoch": 0.26260100038476336, + "flos": 566488615680.0, + "grad_norm": 0.0327166713474878, + "language_loss": 0.84653741, + "learning_rate": 0.0008647145577849834, + "loss": 0.85715348, + "num_input_tokens_seen": 113169424, + "router_z_loss_mlp": 0.46826172, + "step": 1365, + "time_per_iteration": 2.8294174671173096 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061641, + "balance_loss_mlp": 1.01471996, + "epoch": 0.26279338207002695, + "flos": 614321286144.0, + "grad_norm": 0.027467777319160957, + "language_loss": 0.83391041, + "learning_rate": 0.0008645013744888139, + "loss": 0.84452683, + "num_input_tokens_seen": 113256752, + "router_z_loss_mlp": 0.46875, + "step": 1366, + "time_per_iteration": 2.845019578933716 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059667, + "balance_loss_mlp": 1.01238823, + "epoch": 0.2629857637552905, + "flos": 523945954560.0, + "grad_norm": 0.034051307399065846, + "language_loss": 0.88423878, + "learning_rate": 0.0008642880496806607, + "loss": 0.89483547, + "num_input_tokens_seen": 113330512, + "router_z_loss_mlp": 0.47241211, + "step": 1367, + "time_per_iteration": 2.7665200233459473 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065935, + "balance_loss_mlp": 1.01832283, + "epoch": 0.26317814544055407, + "flos": 535655515392.0, + "grad_norm": 0.03476637042829631, + "language_loss": 0.85672963, + "learning_rate": 0.0008640745834433437, + "loss": 0.86738896, + "num_input_tokens_seen": 113409088, + "router_z_loss_mlp": 0.47583008, + "step": 1368, + "time_per_iteration": 2.7824857234954834 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105712, + "balance_loss_mlp": 1.00967455, + "epoch": 0.2633705271258176, + "flos": 556780548096.0, + "grad_norm": 0.035052832704740904, + "language_loss": 0.8778615, + "learning_rate": 0.000863860975859738, + "loss": 0.88843262, + "num_input_tokens_seen": 113486624, + "router_z_loss_mlp": 0.47412109, + "step": 1369, + "time_per_iteration": 2.938157796859741 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059209, + "balance_loss_mlp": 1.01214516, + "epoch": 0.2635629088110812, + "flos": 553462874880.0, + "grad_norm": 0.04030614296387141, + "language_loss": 0.89190161, + "learning_rate": 0.0008636472270127733, + "loss": 0.90249372, + "num_input_tokens_seen": 113555776, + "router_z_loss_mlp": 0.47021484, + "step": 1370, + "time_per_iteration": 2.6449878215789795 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105746, + "balance_loss_mlp": 1.0106585, + "epoch": 0.2637552904963448, + "flos": 456915867648.0, + "grad_norm": 0.03827203709322554, + "language_loss": 0.91134202, + "learning_rate": 0.0008634333369854345, + "loss": 0.9219166, + "num_input_tokens_seen": 113624208, + "router_z_loss_mlp": 0.4675293, + "step": 1371, + "time_per_iteration": 2.6090121269226074 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053294, + "balance_loss_mlp": 1.00642049, + "epoch": 0.2639476721816083, + "flos": 614260048128.0, + "grad_norm": 0.03299961926418253, + "language_loss": 0.88250023, + "learning_rate": 0.0008632193058607608, + "loss": 0.89303321, + "num_input_tokens_seen": 113698544, + "router_z_loss_mlp": 0.46826172, + "step": 1372, + "time_per_iteration": 2.6980674266815186 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052684, + "balance_loss_mlp": 1.00562024, + "epoch": 0.2641400538668719, + "flos": 573026764032.0, + "grad_norm": 0.03659842444989107, + "language_loss": 0.81553382, + "learning_rate": 0.0008630051337218466, + "loss": 0.82606065, + "num_input_tokens_seen": 113769024, + "router_z_loss_mlp": 0.47021484, + "step": 1373, + "time_per_iteration": 2.6634395122528076 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01056457, + "balance_loss_mlp": 1.00960791, + "epoch": 0.2643324355521354, + "flos": 583340431872.0, + "grad_norm": 0.03511173854729822, + "language_loss": 0.82885635, + "learning_rate": 0.0008627908206518409, + "loss": 0.83942091, + "num_input_tokens_seen": 113836320, + "router_z_loss_mlp": 0.46801758, + "step": 1374, + "time_per_iteration": 2.6550941467285156 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055428, + "balance_loss_mlp": 1.01022339, + "epoch": 0.264524817237399, + "flos": 1548027969792.0, + "grad_norm": 0.005864236448565476, + "language_loss": 0.75151253, + "learning_rate": 0.0008625763667339472, + "loss": 0.76206684, + "num_input_tokens_seen": 114065040, + "router_z_loss_mlp": 0.45117188, + "step": 1375, + "time_per_iteration": 4.995543718338013 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01058447, + "balance_loss_mlp": 1.01197898, + "epoch": 0.26471719892266254, + "flos": 519043805184.0, + "grad_norm": 0.03321674595186757, + "language_loss": 0.92123759, + "learning_rate": 0.0008623617720514241, + "loss": 0.93182206, + "num_input_tokens_seen": 114133488, + "router_z_loss_mlp": 0.46411133, + "step": 1376, + "time_per_iteration": 2.592569351196289 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061616, + "balance_loss_mlp": 1.0151242, + "epoch": 0.26490958060792613, + "flos": 518205880320.0, + "grad_norm": 0.036665073764434085, + "language_loss": 0.85824203, + "learning_rate": 0.0008621470366875848, + "loss": 0.8688581, + "num_input_tokens_seen": 114200704, + "router_z_loss_mlp": 0.46435547, + "step": 1377, + "time_per_iteration": 2.5636963844299316 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054184, + "balance_loss_mlp": 1.00766897, + "epoch": 0.26510196229318966, + "flos": 597683331072.0, + "grad_norm": 0.03396624681403314, + "language_loss": 0.88501984, + "learning_rate": 0.0008619321607257966, + "loss": 0.8955617, + "num_input_tokens_seen": 114272160, + "router_z_loss_mlp": 0.46459961, + "step": 1378, + "time_per_iteration": 2.687581777572632 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01056779, + "balance_loss_mlp": 1.010144, + "epoch": 0.26529434397845325, + "flos": 687053541888.0, + "grad_norm": 0.031207845572821406, + "language_loss": 0.82550275, + "learning_rate": 0.000861717144249482, + "loss": 0.83607054, + "num_input_tokens_seen": 114347904, + "router_z_loss_mlp": 0.46582031, + "step": 1379, + "time_per_iteration": 2.8333678245544434 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054921, + "balance_loss_mlp": 1.00819123, + "epoch": 0.26548672566371684, + "flos": 425260393728.0, + "grad_norm": 0.03047521662480035, + "language_loss": 0.90854567, + "learning_rate": 0.0008615019873421175, + "loss": 0.91909492, + "num_input_tokens_seen": 114409952, + "router_z_loss_mlp": 0.46679688, + "step": 1380, + "time_per_iteration": 2.47892689704895 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051055, + "balance_loss_mlp": 1.00437295, + "epoch": 0.26567910734898037, + "flos": 490850846208.0, + "grad_norm": 0.03515354974137605, + "language_loss": 0.8636173, + "learning_rate": 0.0008612866900872349, + "loss": 0.87412781, + "num_input_tokens_seen": 114474832, + "router_z_loss_mlp": 0.46630859, + "step": 1381, + "time_per_iteration": 2.558497428894043 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055093, + "balance_loss_mlp": 1.00833893, + "epoch": 0.26587148903424396, + "flos": 535229750016.0, + "grad_norm": 0.033124361732310995, + "language_loss": 0.88441265, + "learning_rate": 0.0008610712525684197, + "loss": 0.89496362, + "num_input_tokens_seen": 114545152, + "router_z_loss_mlp": 0.46704102, + "step": 1382, + "time_per_iteration": 2.6567015647888184 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01056513, + "balance_loss_mlp": 1.00997365, + "epoch": 0.2660638707195075, + "flos": 1019056422912.0, + "grad_norm": 0.038309225150243896, + "language_loss": 0.84641987, + "learning_rate": 0.0008608556748693121, + "loss": 0.85698497, + "num_input_tokens_seen": 114626512, + "router_z_loss_mlp": 0.46484375, + "step": 1383, + "time_per_iteration": 3.266127347946167 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054133, + "balance_loss_mlp": 1.00754607, + "epoch": 0.2662562524047711, + "flos": 525063836160.0, + "grad_norm": 0.03266135396779854, + "language_loss": 0.86478686, + "learning_rate": 0.000860639957073607, + "loss": 0.87532818, + "num_input_tokens_seen": 114701008, + "router_z_loss_mlp": 0.46533203, + "step": 1384, + "time_per_iteration": 2.701979398727417 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052082, + "balance_loss_mlp": 1.00542331, + "epoch": 0.2664486340900346, + "flos": 553480371456.0, + "grad_norm": 0.03507018041250785, + "language_loss": 0.88455647, + "learning_rate": 0.0008604240992650534, + "loss": 0.89507735, + "num_input_tokens_seen": 114771984, + "router_z_loss_mlp": 0.46606445, + "step": 1385, + "time_per_iteration": 2.6528589725494385 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051347, + "balance_loss_mlp": 1.00476038, + "epoch": 0.2666410157752982, + "flos": 471209189376.0, + "grad_norm": 0.03349459525563368, + "language_loss": 0.89804894, + "learning_rate": 0.0008602081015274545, + "loss": 0.90856242, + "num_input_tokens_seen": 114844800, + "router_z_loss_mlp": 0.46533203, + "step": 1386, + "time_per_iteration": 2.7359464168548584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053022, + "balance_loss_mlp": 1.00626826, + "epoch": 0.2668333974605617, + "flos": 571016522496.0, + "grad_norm": 0.027882929979452454, + "language_loss": 0.8367793, + "learning_rate": 0.0008599919639446684, + "loss": 0.84730947, + "num_input_tokens_seen": 114918544, + "router_z_loss_mlp": 0.46704102, + "step": 1387, + "time_per_iteration": 2.72188401222229 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052498, + "balance_loss_mlp": 1.00572038, + "epoch": 0.2670257791458253, + "flos": 399896159232.0, + "grad_norm": 0.038277743086958374, + "language_loss": 0.80995691, + "learning_rate": 0.000859775686600607, + "loss": 0.82048184, + "num_input_tokens_seen": 114984272, + "router_z_loss_mlp": 0.46728516, + "step": 1388, + "time_per_iteration": 2.5220229625701904 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051137, + "balance_loss_mlp": 1.00443089, + "epoch": 0.2672181608310889, + "flos": 516892612608.0, + "grad_norm": 0.03738976993969629, + "language_loss": 0.85769641, + "learning_rate": 0.0008595592695792367, + "loss": 0.86820781, + "num_input_tokens_seen": 115054800, + "router_z_loss_mlp": 0.46655273, + "step": 1389, + "time_per_iteration": 2.7041423320770264 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050805, + "balance_loss_mlp": 1.0042417, + "epoch": 0.26741054251635243, + "flos": 508526002944.0, + "grad_norm": 0.03398026188762752, + "language_loss": 0.91414082, + "learning_rate": 0.0008593427129645778, + "loss": 0.92464888, + "num_input_tokens_seen": 115120928, + "router_z_loss_mlp": 0.46508789, + "step": 1390, + "time_per_iteration": 2.563215732574463 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105357, + "balance_loss_mlp": 1.0067687, + "epoch": 0.267602924201616, + "flos": 577809349632.0, + "grad_norm": 0.03481446530036303, + "language_loss": 0.86254311, + "learning_rate": 0.0008591260168407052, + "loss": 0.87307882, + "num_input_tokens_seen": 115196688, + "router_z_loss_mlp": 0.4675293, + "step": 1391, + "time_per_iteration": 2.788869619369507 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051436, + "balance_loss_mlp": 1.00475395, + "epoch": 0.26779530588687955, + "flos": 525000652800.0, + "grad_norm": 0.029176301882166727, + "language_loss": 0.83413607, + "learning_rate": 0.0008589091812917479, + "loss": 0.84465045, + "num_input_tokens_seen": 115264912, + "router_z_loss_mlp": 0.46630859, + "step": 1392, + "time_per_iteration": 2.6471304893493652 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01057177, + "balance_loss_mlp": 1.0103997, + "epoch": 0.26798768757214314, + "flos": 557828443392.0, + "grad_norm": 0.034011915135398356, + "language_loss": 0.85611916, + "learning_rate": 0.0008586922064018887, + "loss": 0.86669087, + "num_input_tokens_seen": 115334672, + "router_z_loss_mlp": 0.46728516, + "step": 1393, + "time_per_iteration": 2.665710926055908 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051541, + "balance_loss_mlp": 1.00488269, + "epoch": 0.2681800692574067, + "flos": 932095974144.0, + "grad_norm": 0.035119979561623306, + "language_loss": 0.89861763, + "learning_rate": 0.0008584750922553651, + "loss": 0.90913308, + "num_input_tokens_seen": 115420032, + "router_z_loss_mlp": 0.46606445, + "step": 1394, + "time_per_iteration": 3.1556007862091064 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054605, + "balance_loss_mlp": 1.00813687, + "epoch": 0.26837245094267026, + "flos": 702318936576.0, + "grad_norm": 0.034220503648090136, + "language_loss": 0.84388494, + "learning_rate": 0.0008582578389364677, + "loss": 0.85443103, + "num_input_tokens_seen": 115492576, + "router_z_loss_mlp": 0.46411133, + "step": 1395, + "time_per_iteration": 2.8831770420074463 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054667, + "balance_loss_mlp": 1.00824666, + "epoch": 0.26856483262793385, + "flos": 594394814976.0, + "grad_norm": 0.030437239966241224, + "language_loss": 0.92446673, + "learning_rate": 0.0008580404465295422, + "loss": 0.93501341, + "num_input_tokens_seen": 115568368, + "router_z_loss_mlp": 0.46362305, + "step": 1396, + "time_per_iteration": 2.823685884475708 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052372, + "balance_loss_mlp": 1.00578523, + "epoch": 0.2687572143131974, + "flos": 715589640960.0, + "grad_norm": 0.035135728363153845, + "language_loss": 0.88714433, + "learning_rate": 0.0008578229151189876, + "loss": 0.89766812, + "num_input_tokens_seen": 115651536, + "router_z_loss_mlp": 0.46533203, + "step": 1397, + "time_per_iteration": 2.9427757263183594 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105508, + "balance_loss_mlp": 1.00858808, + "epoch": 0.26894959599846097, + "flos": 468671115264.0, + "grad_norm": 0.03944499035247069, + "language_loss": 0.82205743, + "learning_rate": 0.0008576052447892573, + "loss": 0.83260822, + "num_input_tokens_seen": 115715696, + "router_z_loss_mlp": 0.46435547, + "step": 1398, + "time_per_iteration": 2.570364475250244 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053764, + "balance_loss_mlp": 1.00712895, + "epoch": 0.2691419776837245, + "flos": 469630549248.0, + "grad_norm": 0.035560759826370754, + "language_loss": 0.87260717, + "learning_rate": 0.000857387435624858, + "loss": 0.88314486, + "num_input_tokens_seen": 115780928, + "router_z_loss_mlp": 0.46582031, + "step": 1399, + "time_per_iteration": 2.5241427421569824 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053425, + "balance_loss_mlp": 1.00698149, + "epoch": 0.2693343593689881, + "flos": 939286376448.0, + "grad_norm": 0.026228750880396605, + "language_loss": 0.88826966, + "learning_rate": 0.0008571694877103513, + "loss": 0.89880389, + "num_input_tokens_seen": 115874432, + "router_z_loss_mlp": 0.46386719, + "step": 1400, + "time_per_iteration": 3.2871432304382324 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049973, + "balance_loss_mlp": 1.00355244, + "epoch": 0.2695267410542516, + "flos": 578795028480.0, + "grad_norm": 0.031687518811048296, + "language_loss": 0.88370931, + "learning_rate": 0.0008569514011303515, + "loss": 0.89420903, + "num_input_tokens_seen": 115956608, + "router_z_loss_mlp": 0.46362305, + "step": 1401, + "time_per_iteration": 2.8385562896728516 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054148, + "balance_loss_mlp": 1.00763249, + "epoch": 0.2697191227395152, + "flos": 557965503744.0, + "grad_norm": 0.03646210542720766, + "language_loss": 0.89149171, + "learning_rate": 0.0008567331759695277, + "loss": 0.90203321, + "num_input_tokens_seen": 116031728, + "router_z_loss_mlp": 0.46459961, + "step": 1402, + "time_per_iteration": 2.73796010017395 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053059, + "balance_loss_mlp": 1.00663948, + "epoch": 0.26991150442477874, + "flos": 530314961664.0, + "grad_norm": 0.03368837159460442, + "language_loss": 0.86897242, + "learning_rate": 0.0008565148123126023, + "loss": 0.87950301, + "num_input_tokens_seen": 116104288, + "router_z_loss_mlp": 0.46362305, + "step": 1403, + "time_per_iteration": 2.654782772064209 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055763, + "balance_loss_mlp": 1.00970042, + "epoch": 0.2701038861100423, + "flos": 533087305728.0, + "grad_norm": 0.02742415368344255, + "language_loss": 0.86797845, + "learning_rate": 0.0008562963102443516, + "loss": 0.87853605, + "num_input_tokens_seen": 116177920, + "router_z_loss_mlp": 0.45996094, + "step": 1404, + "time_per_iteration": 2.6844303607940674 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01057243, + "balance_loss_mlp": 1.01122797, + "epoch": 0.2702962677953059, + "flos": 736505681664.0, + "grad_norm": 0.03794782730472634, + "language_loss": 0.85607296, + "learning_rate": 0.0008560776698496056, + "loss": 0.86664534, + "num_input_tokens_seen": 116251680, + "router_z_loss_mlp": 0.45947266, + "step": 1405, + "time_per_iteration": 2.9016945362091064 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054152, + "balance_loss_mlp": 1.00806534, + "epoch": 0.27048864948056944, + "flos": 576001297152.0, + "grad_norm": 0.03333453941991407, + "language_loss": 0.8661586, + "learning_rate": 0.0008558588912132481, + "loss": 0.8767001, + "num_input_tokens_seen": 116327664, + "router_z_loss_mlp": 0.46020508, + "step": 1406, + "time_per_iteration": 2.8187410831451416 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074676, + "balance_loss_mlp": 1.03042603, + "epoch": 0.27068103116583303, + "flos": 1426912856832.0, + "grad_norm": 0.025019447230712623, + "language_loss": 0.76458991, + "learning_rate": 0.0008556399744202163, + "loss": 0.77533662, + "num_input_tokens_seen": 116555152, + "router_z_loss_mlp": 0.44335938, + "step": 1407, + "time_per_iteration": 4.91855001449585 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059735, + "balance_loss_mlp": 1.01386356, + "epoch": 0.27087341285109656, + "flos": 533032870656.0, + "grad_norm": 0.03180107690871134, + "language_loss": 0.83613265, + "learning_rate": 0.0008554209195555016, + "loss": 0.84672999, + "num_input_tokens_seen": 116626016, + "router_z_loss_mlp": 0.45800781, + "step": 1408, + "time_per_iteration": 2.7004964351654053 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106761, + "balance_loss_mlp": 1.02188134, + "epoch": 0.27106579453636015, + "flos": 582465568512.0, + "grad_norm": 0.03644580883658202, + "language_loss": 0.89378774, + "learning_rate": 0.0008552017267041483, + "loss": 0.90446383, + "num_input_tokens_seen": 116699152, + "router_z_loss_mlp": 0.45654297, + "step": 1409, + "time_per_iteration": 2.7288694381713867 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067551, + "balance_loss_mlp": 1.0219177, + "epoch": 0.2712581762216237, + "flos": 507881518848.0, + "grad_norm": 0.03188220116364099, + "language_loss": 0.84328783, + "learning_rate": 0.0008549823959512549, + "loss": 0.85396332, + "num_input_tokens_seen": 116770912, + "router_z_loss_mlp": 0.45556641, + "step": 1410, + "time_per_iteration": 2.67370343208313 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060246, + "balance_loss_mlp": 1.01435077, + "epoch": 0.27145055790688727, + "flos": 999143557632.0, + "grad_norm": 0.03419744556224296, + "language_loss": 0.87478781, + "learning_rate": 0.0008547629273819728, + "loss": 0.88539028, + "num_input_tokens_seen": 116863088, + "router_z_loss_mlp": 0.45825195, + "step": 1411, + "time_per_iteration": 3.3728370666503906 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01057063, + "balance_loss_mlp": 1.0104996, + "epoch": 0.2716429395921508, + "flos": 547729603584.0, + "grad_norm": 0.037303619224495106, + "language_loss": 0.84070724, + "learning_rate": 0.0008545433210815074, + "loss": 0.85127789, + "num_input_tokens_seen": 116929504, + "router_z_loss_mlp": 0.46508789, + "step": 1412, + "time_per_iteration": 2.6812539100646973 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062187, + "balance_loss_mlp": 1.01536179, + "epoch": 0.2718353212774144, + "flos": 574311841536.0, + "grad_norm": 0.033089137280770606, + "language_loss": 0.8805269, + "learning_rate": 0.0008543235771351176, + "loss": 0.89114881, + "num_input_tokens_seen": 117004064, + "router_z_loss_mlp": 0.46777344, + "step": 1413, + "time_per_iteration": 2.713487148284912 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01056081, + "balance_loss_mlp": 1.00961292, + "epoch": 0.272027702962678, + "flos": 645585987840.0, + "grad_norm": 0.026077025600286987, + "language_loss": 0.85152733, + "learning_rate": 0.0008541036956281154, + "loss": 0.86208814, + "num_input_tokens_seen": 117081328, + "router_z_loss_mlp": 0.46411133, + "step": 1414, + "time_per_iteration": 2.9018056392669678 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062827, + "balance_loss_mlp": 1.01631117, + "epoch": 0.2722200846479415, + "flos": 654996602112.0, + "grad_norm": 0.04047455719590206, + "language_loss": 0.83293629, + "learning_rate": 0.0008538836766458665, + "loss": 0.84356457, + "num_input_tokens_seen": 117156544, + "router_z_loss_mlp": 0.46459961, + "step": 1415, + "time_per_iteration": 2.84184193611145 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106005, + "balance_loss_mlp": 1.01365411, + "epoch": 0.2724124663332051, + "flos": 580779025152.0, + "grad_norm": 0.0390255284508479, + "language_loss": 0.85920322, + "learning_rate": 0.0008536635202737897, + "loss": 0.86980367, + "num_input_tokens_seen": 117230208, + "router_z_loss_mlp": 0.46337891, + "step": 1416, + "time_per_iteration": 2.814687728881836 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059317, + "balance_loss_mlp": 1.01272988, + "epoch": 0.2726048480184686, + "flos": 538468688640.0, + "grad_norm": 0.03678906161491062, + "language_loss": 0.82951486, + "learning_rate": 0.0008534432265973573, + "loss": 0.8401081, + "num_input_tokens_seen": 117298080, + "router_z_loss_mlp": 0.46533203, + "step": 1417, + "time_per_iteration": 2.641660451889038 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01056276, + "balance_loss_mlp": 1.00930703, + "epoch": 0.2727972297037322, + "flos": 997550333184.0, + "grad_norm": 0.4222293446211692, + "language_loss": 0.88806397, + "learning_rate": 0.000853222795702095, + "loss": 0.89862669, + "num_input_tokens_seen": 117396256, + "router_z_loss_mlp": 0.46923828, + "step": 1418, + "time_per_iteration": 3.3743135929107666 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01181433, + "balance_loss_mlp": 1.1334635, + "epoch": 0.27298961138899575, + "flos": 607335018240.0, + "grad_norm": 0.06715989722341878, + "language_loss": 0.84640503, + "learning_rate": 0.0008530022276735813, + "loss": 0.85821939, + "num_input_tokens_seen": 117467936, + "router_z_loss_mlp": 0.47949219, + "step": 1419, + "time_per_iteration": 2.752645254135132 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069458, + "balance_loss_mlp": 1.02225161, + "epoch": 0.27318199307425933, + "flos": 530397586944.0, + "grad_norm": 0.040820608700474346, + "language_loss": 0.87344372, + "learning_rate": 0.0008527815225974489, + "loss": 0.88413835, + "num_input_tokens_seen": 117538256, + "router_z_loss_mlp": 0.47167969, + "step": 1420, + "time_per_iteration": 2.65108585357666 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085616, + "balance_loss_mlp": 1.03852844, + "epoch": 0.2733743747595229, + "flos": 409912373760.0, + "grad_norm": 0.06690132065136703, + "language_loss": 0.92052042, + "learning_rate": 0.0008525606805593829, + "loss": 0.93137658, + "num_input_tokens_seen": 117599488, + "router_z_loss_mlp": 0.47045898, + "step": 1421, + "time_per_iteration": 2.4201173782348633 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081246, + "balance_loss_mlp": 1.03422987, + "epoch": 0.27356675644478645, + "flos": 517228949760.0, + "grad_norm": 0.05290317096475839, + "language_loss": 0.85793996, + "learning_rate": 0.0008523397016451213, + "loss": 0.86875236, + "num_input_tokens_seen": 117664240, + "router_z_loss_mlp": 0.46972656, + "step": 1422, + "time_per_iteration": 2.632446765899658 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080251, + "balance_loss_mlp": 1.03328276, + "epoch": 0.27375913813005004, + "flos": 1054059705600.0, + "grad_norm": 0.039766191828199446, + "language_loss": 0.90321743, + "learning_rate": 0.0008521185859404564, + "loss": 0.91401994, + "num_input_tokens_seen": 117754768, + "router_z_loss_mlp": 0.46923828, + "step": 1423, + "time_per_iteration": 3.381535291671753 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107676, + "balance_loss_mlp": 1.02998257, + "epoch": 0.27395151981531357, + "flos": 626004602112.0, + "grad_norm": 0.042654551092476074, + "language_loss": 0.92207062, + "learning_rate": 0.0008518973335312326, + "loss": 0.9328382, + "num_input_tokens_seen": 117832816, + "router_z_loss_mlp": 0.46728516, + "step": 1424, + "time_per_iteration": 2.787799596786499 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070757, + "balance_loss_mlp": 1.0240984, + "epoch": 0.27414390150057716, + "flos": 551415694848.0, + "grad_norm": 0.04883209929837253, + "language_loss": 0.85839558, + "learning_rate": 0.0008516759445033477, + "loss": 0.86910313, + "num_input_tokens_seen": 117899168, + "router_z_loss_mlp": 0.46606445, + "step": 1425, + "time_per_iteration": 2.6206350326538086 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065621, + "balance_loss_mlp": 1.01881957, + "epoch": 0.2743362831858407, + "flos": 540952327680.0, + "grad_norm": 0.043467714857121094, + "language_loss": 0.87962419, + "learning_rate": 0.0008514544189427526, + "loss": 0.89028037, + "num_input_tokens_seen": 117972384, + "router_z_loss_mlp": 0.4675293, + "step": 1426, + "time_per_iteration": 2.679623603820801 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01058603, + "balance_loss_mlp": 1.0118494, + "epoch": 0.2745286648711043, + "flos": 469545978624.0, + "grad_norm": 0.04158543868721512, + "language_loss": 0.89037859, + "learning_rate": 0.0008512327569354511, + "loss": 0.90096468, + "num_input_tokens_seen": 118039584, + "router_z_loss_mlp": 0.46704102, + "step": 1427, + "time_per_iteration": 2.5345683097839355 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01057646, + "balance_loss_mlp": 1.01036775, + "epoch": 0.2747210465563678, + "flos": 473872663296.0, + "grad_norm": 0.05094281183667316, + "language_loss": 0.85685182, + "learning_rate": 0.0008510109585675001, + "loss": 0.8674283, + "num_input_tokens_seen": 118108352, + "router_z_loss_mlp": 0.47241211, + "step": 1428, + "time_per_iteration": 2.5991017818450928 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076946, + "balance_loss_mlp": 1.03031158, + "epoch": 0.2749134282416314, + "flos": 1318059436800.0, + "grad_norm": 0.019364160619571847, + "language_loss": 0.81153345, + "learning_rate": 0.0008507890239250093, + "loss": 0.82230288, + "num_input_tokens_seen": 118331120, + "router_z_loss_mlp": 0.46582031, + "step": 1429, + "time_per_iteration": 4.724486351013184 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081662, + "balance_loss_mlp": 1.03459787, + "epoch": 0.275105809926895, + "flos": 972533129472.0, + "grad_norm": 0.05143903496013185, + "language_loss": 0.82696635, + "learning_rate": 0.0008505669530941415, + "loss": 0.83778298, + "num_input_tokens_seen": 118415872, + "router_z_loss_mlp": 0.47021484, + "step": 1430, + "time_per_iteration": 3.3173024654388428 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01058611, + "balance_loss_mlp": 1.01231062, + "epoch": 0.2752981916121585, + "flos": 528369848832.0, + "grad_norm": 0.04649662222604448, + "language_loss": 0.87158883, + "learning_rate": 0.000850344746161112, + "loss": 0.88217485, + "num_input_tokens_seen": 118483008, + "router_z_loss_mlp": 0.46240234, + "step": 1431, + "time_per_iteration": 2.635831356048584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065544, + "balance_loss_mlp": 1.01943398, + "epoch": 0.2754905732974221, + "flos": 454599424512.0, + "grad_norm": 0.04970989937431765, + "language_loss": 0.90776384, + "learning_rate": 0.0008501224032121894, + "loss": 0.91841936, + "num_input_tokens_seen": 118545840, + "router_z_loss_mlp": 0.46044922, + "step": 1432, + "time_per_iteration": 2.531921148300171 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069408, + "balance_loss_mlp": 1.02339363, + "epoch": 0.27568295498268564, + "flos": 498509788416.0, + "grad_norm": 0.04336527805629792, + "language_loss": 0.84821916, + "learning_rate": 0.0008498999243336946, + "loss": 0.85891324, + "num_input_tokens_seen": 118615168, + "router_z_loss_mlp": 0.45947266, + "step": 1433, + "time_per_iteration": 2.6142802238464355 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068976, + "balance_loss_mlp": 1.02298498, + "epoch": 0.2758753366679492, + "flos": 609417191424.0, + "grad_norm": 0.03822636329404569, + "language_loss": 0.8997575, + "learning_rate": 0.0008496773096120021, + "loss": 0.91044724, + "num_input_tokens_seen": 118690384, + "router_z_loss_mlp": 0.45922852, + "step": 1434, + "time_per_iteration": 2.788863182067871 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066122, + "balance_loss_mlp": 1.01977372, + "epoch": 0.27606771835321275, + "flos": 741437966592.0, + "grad_norm": 0.04844453313229188, + "language_loss": 0.86675751, + "learning_rate": 0.0008494545591335381, + "loss": 0.87741876, + "num_input_tokens_seen": 118763024, + "router_z_loss_mlp": 0.46289062, + "step": 1435, + "time_per_iteration": 2.8883180618286133 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061614, + "balance_loss_mlp": 1.01516986, + "epoch": 0.27626010003847634, + "flos": 555749182464.0, + "grad_norm": 0.03304758436240527, + "language_loss": 0.88791698, + "learning_rate": 0.0008492316729847823, + "loss": 0.89853311, + "num_input_tokens_seen": 118845536, + "router_z_loss_mlp": 0.46386719, + "step": 1436, + "time_per_iteration": 2.794938087463379 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054312, + "balance_loss_mlp": 1.0072248, + "epoch": 0.2764524817237399, + "flos": 543696481536.0, + "grad_norm": 0.13725655625344893, + "language_loss": 0.82129836, + "learning_rate": 0.0008490086512522664, + "loss": 0.83184153, + "num_input_tokens_seen": 118919008, + "router_z_loss_mlp": 0.47045898, + "step": 1437, + "time_per_iteration": 2.6979260444641113 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062257, + "balance_loss_mlp": 1.01495445, + "epoch": 0.27664486340900346, + "flos": 407129336064.0, + "grad_norm": 0.04115092615815086, + "language_loss": 0.92702913, + "learning_rate": 0.0008487854940225755, + "loss": 0.93765163, + "num_input_tokens_seen": 118981376, + "router_z_loss_mlp": 0.47265625, + "step": 1438, + "time_per_iteration": 2.4361565113067627 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055116, + "balance_loss_mlp": 1.0080049, + "epoch": 0.27683724509426705, + "flos": 523157607168.0, + "grad_norm": 0.06281356926864295, + "language_loss": 0.92480713, + "learning_rate": 0.0008485622013823466, + "loss": 0.93535829, + "num_input_tokens_seen": 119050560, + "router_z_loss_mlp": 0.47070312, + "step": 1439, + "time_per_iteration": 2.588972568511963 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060631, + "balance_loss_mlp": 1.01332879, + "epoch": 0.2770296267795306, + "flos": 536410814976.0, + "grad_norm": 0.048827385499573994, + "language_loss": 0.8582921, + "learning_rate": 0.00084833877341827, + "loss": 0.86889839, + "num_input_tokens_seen": 119121104, + "router_z_loss_mlp": 0.47265625, + "step": 1440, + "time_per_iteration": 2.6215152740478516 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063403, + "balance_loss_mlp": 1.01648188, + "epoch": 0.27722200846479417, + "flos": 488970862080.0, + "grad_norm": 0.04074125375838667, + "language_loss": 0.82920921, + "learning_rate": 0.000848115210217088, + "loss": 0.83984327, + "num_input_tokens_seen": 119187712, + "router_z_loss_mlp": 0.46875, + "step": 1441, + "time_per_iteration": 2.578479290008545 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059768, + "balance_loss_mlp": 1.01244187, + "epoch": 0.2774143901500577, + "flos": 619444099584.0, + "grad_norm": 0.03981713509883016, + "language_loss": 0.84628934, + "learning_rate": 0.0008478915118655952, + "loss": 0.85688698, + "num_input_tokens_seen": 119259264, + "router_z_loss_mlp": 0.47290039, + "step": 1442, + "time_per_iteration": 2.697610855102539 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055568, + "balance_loss_mlp": 1.0080508, + "epoch": 0.2776067718353213, + "flos": 514845432576.0, + "grad_norm": 0.032345577367045, + "language_loss": 0.88479745, + "learning_rate": 0.0008476676784506393, + "loss": 0.89535314, + "num_input_tokens_seen": 119328304, + "router_z_loss_mlp": 0.47485352, + "step": 1443, + "time_per_iteration": 2.6315112113952637 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01056662, + "balance_loss_mlp": 1.00897789, + "epoch": 0.2777991535205848, + "flos": 1006042342656.0, + "grad_norm": 0.04008629757661371, + "language_loss": 0.8412413, + "learning_rate": 0.0008474437100591201, + "loss": 0.85180795, + "num_input_tokens_seen": 119412352, + "router_z_loss_mlp": 0.4765625, + "step": 1444, + "time_per_iteration": 3.3463656902313232 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051562, + "balance_loss_mlp": 1.00371146, + "epoch": 0.2779915352058484, + "flos": 551376811008.0, + "grad_norm": 0.033834103416723965, + "language_loss": 0.87362587, + "learning_rate": 0.0008472196067779898, + "loss": 0.88414145, + "num_input_tokens_seen": 119484464, + "router_z_loss_mlp": 0.47827148, + "step": 1445, + "time_per_iteration": 2.6647677421569824 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054302, + "balance_loss_mlp": 1.00649953, + "epoch": 0.278183916891112, + "flos": 875217216768.0, + "grad_norm": 0.0457526450580795, + "language_loss": 0.87776953, + "learning_rate": 0.0008469953686942531, + "loss": 0.88831258, + "num_input_tokens_seen": 119557280, + "router_z_loss_mlp": 0.4777832, + "step": 1446, + "time_per_iteration": 3.076035261154175 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01056504, + "balance_loss_mlp": 1.00882006, + "epoch": 0.2783762985763755, + "flos": 625196812800.0, + "grad_norm": 0.042452946668595545, + "language_loss": 0.85090148, + "learning_rate": 0.0008467709958949668, + "loss": 0.86146653, + "num_input_tokens_seen": 119631232, + "router_z_loss_mlp": 0.4765625, + "step": 1447, + "time_per_iteration": 2.744459629058838 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01056121, + "balance_loss_mlp": 1.00850928, + "epoch": 0.2785686802616391, + "flos": 582912721152.0, + "grad_norm": 0.04136143865758397, + "language_loss": 0.87796736, + "learning_rate": 0.0008465464884672403, + "loss": 0.88852853, + "num_input_tokens_seen": 119700224, + "router_z_loss_mlp": 0.47583008, + "step": 1448, + "time_per_iteration": 2.6887707710266113 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049992, + "balance_loss_mlp": 1.00235641, + "epoch": 0.27876106194690264, + "flos": 588540034560.0, + "grad_norm": 0.031263057988026755, + "language_loss": 0.87220562, + "learning_rate": 0.0008463218464982348, + "loss": 0.88270551, + "num_input_tokens_seen": 119781376, + "router_z_loss_mlp": 0.47607422, + "step": 1449, + "time_per_iteration": 2.8354454040527344 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050828, + "balance_loss_mlp": 1.00326335, + "epoch": 0.27895344363216623, + "flos": 877431592704.0, + "grad_norm": 0.03730856956989286, + "language_loss": 0.89626968, + "learning_rate": 0.0008460970700751645, + "loss": 0.90677798, + "num_input_tokens_seen": 119856672, + "router_z_loss_mlp": 0.4753418, + "step": 1450, + "time_per_iteration": 3.12705135345459 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062156, + "balance_loss_mlp": 1.01442492, + "epoch": 0.27914582531742976, + "flos": 605036071680.0, + "grad_norm": 0.0379360607610882, + "language_loss": 0.8910991, + "learning_rate": 0.000845872159285295, + "loss": 0.90172064, + "num_input_tokens_seen": 119929008, + "router_z_loss_mlp": 0.47705078, + "step": 1451, + "time_per_iteration": 2.792448043823242 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065174, + "balance_loss_mlp": 1.02025604, + "epoch": 0.27933820700269335, + "flos": 1501133346048.0, + "grad_norm": 0.01376981107013524, + "language_loss": 0.77766848, + "learning_rate": 0.0008456471142159447, + "loss": 0.7883203, + "num_input_tokens_seen": 120164032, + "router_z_loss_mlp": 0.44921875, + "step": 1452, + "time_per_iteration": 4.966037034988403 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01056615, + "balance_loss_mlp": 1.00921774, + "epoch": 0.2795305886879569, + "flos": 1033518885888.0, + "grad_norm": 0.037040263742322534, + "language_loss": 0.87809932, + "learning_rate": 0.0008454219349544836, + "loss": 0.88866544, + "num_input_tokens_seen": 120246784, + "router_z_loss_mlp": 0.47363281, + "step": 1453, + "time_per_iteration": 3.428589344024658 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055627, + "balance_loss_mlp": 1.00851548, + "epoch": 0.27972297037322047, + "flos": 608227378176.0, + "grad_norm": 0.03307542484781365, + "language_loss": 0.83086669, + "learning_rate": 0.000845196621588334, + "loss": 0.84142298, + "num_input_tokens_seen": 120318208, + "router_z_loss_mlp": 0.47070312, + "step": 1454, + "time_per_iteration": 2.7620909214019775 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053868, + "balance_loss_mlp": 1.00661373, + "epoch": 0.27991535205848406, + "flos": 631561929216.0, + "grad_norm": 0.034345141589198824, + "language_loss": 0.77104861, + "learning_rate": 0.0008449711742049706, + "loss": 0.78158724, + "num_input_tokens_seen": 120393248, + "router_z_loss_mlp": 0.47216797, + "step": 1455, + "time_per_iteration": 2.7629852294921875 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01057076, + "balance_loss_mlp": 1.009655, + "epoch": 0.2801077337437476, + "flos": 550354193664.0, + "grad_norm": 0.03843537360044117, + "language_loss": 0.85426688, + "learning_rate": 0.0008447455928919196, + "loss": 0.86483765, + "num_input_tokens_seen": 120461040, + "router_z_loss_mlp": 0.47387695, + "step": 1456, + "time_per_iteration": 2.672311782836914 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054361, + "balance_loss_mlp": 1.00670111, + "epoch": 0.2803001154290112, + "flos": 487742164992.0, + "grad_norm": 0.03308646323695097, + "language_loss": 0.8834334, + "learning_rate": 0.0008445198777367595, + "loss": 0.89397705, + "num_input_tokens_seen": 120530400, + "router_z_loss_mlp": 0.47631836, + "step": 1457, + "time_per_iteration": 2.5908620357513428 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054094, + "balance_loss_mlp": 1.00633848, + "epoch": 0.2804924971142747, + "flos": 523092478464.0, + "grad_norm": 0.036759152060528134, + "language_loss": 0.82140505, + "learning_rate": 0.0008442940288271208, + "loss": 0.8319459, + "num_input_tokens_seen": 120598304, + "router_z_loss_mlp": 0.47729492, + "step": 1458, + "time_per_iteration": 2.6980724334716797 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01057218, + "balance_loss_mlp": 1.00953484, + "epoch": 0.2806848787995383, + "flos": 528850049280.0, + "grad_norm": 0.03179596299998768, + "language_loss": 0.88266242, + "learning_rate": 0.0008440680462506856, + "loss": 0.89323461, + "num_input_tokens_seen": 120675712, + "router_z_loss_mlp": 0.4765625, + "step": 1459, + "time_per_iteration": 2.818169593811035 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01058111, + "balance_loss_mlp": 1.01047492, + "epoch": 0.2808772604848018, + "flos": 486485277696.0, + "grad_norm": 0.030255628698855237, + "language_loss": 0.87626624, + "learning_rate": 0.0008438419300951883, + "loss": 0.88684738, + "num_input_tokens_seen": 120746544, + "router_z_loss_mlp": 0.47607422, + "step": 1460, + "time_per_iteration": 2.644911527633667 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01056062, + "balance_loss_mlp": 1.00825953, + "epoch": 0.2810696421700654, + "flos": 619340087040.0, + "grad_norm": 0.03597967684758823, + "language_loss": 0.87670606, + "learning_rate": 0.0008436156804484148, + "loss": 0.88726676, + "num_input_tokens_seen": 120823520, + "router_z_loss_mlp": 0.4777832, + "step": 1461, + "time_per_iteration": 2.7725627422332764 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054255, + "balance_loss_mlp": 1.00657165, + "epoch": 0.28126202385532895, + "flos": 455687170560.0, + "grad_norm": 0.0394598317615188, + "language_loss": 0.89263237, + "learning_rate": 0.0008433892973982031, + "loss": 0.90317494, + "num_input_tokens_seen": 120889568, + "router_z_loss_mlp": 0.4765625, + "step": 1462, + "time_per_iteration": 2.5091495513916016 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063928, + "balance_loss_mlp": 1.0156002, + "epoch": 0.28145440554059253, + "flos": 531739044864.0, + "grad_norm": 0.041651284680957995, + "language_loss": 0.866346, + "learning_rate": 0.0008431627810324431, + "loss": 0.87698531, + "num_input_tokens_seen": 120958480, + "router_z_loss_mlp": 0.4831543, + "step": 1463, + "time_per_iteration": 2.6705899238586426 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01056697, + "balance_loss_mlp": 1.00872695, + "epoch": 0.2816467872258561, + "flos": 453164647680.0, + "grad_norm": 0.03544245246238935, + "language_loss": 0.81977493, + "learning_rate": 0.000842936131439076, + "loss": 0.83034194, + "num_input_tokens_seen": 121028032, + "router_z_loss_mlp": 0.47949219, + "step": 1464, + "time_per_iteration": 2.610419511795044 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055364, + "balance_loss_mlp": 1.00763226, + "epoch": 0.28183916891111965, + "flos": 473705467392.0, + "grad_norm": 0.034609246408770326, + "language_loss": 0.89094436, + "learning_rate": 0.0008427093487060951, + "loss": 0.90149802, + "num_input_tokens_seen": 121099280, + "router_z_loss_mlp": 0.47705078, + "step": 1465, + "time_per_iteration": 2.72540283203125 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054608, + "balance_loss_mlp": 1.00656629, + "epoch": 0.28203155059638324, + "flos": 558189080064.0, + "grad_norm": 0.02738603689522664, + "language_loss": 0.8552286, + "learning_rate": 0.000842482432921545, + "loss": 0.86577463, + "num_input_tokens_seen": 121180240, + "router_z_loss_mlp": 0.48022461, + "step": 1466, + "time_per_iteration": 2.8388257026672363 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105456, + "balance_loss_mlp": 1.00654304, + "epoch": 0.28222393228164677, + "flos": 417879462912.0, + "grad_norm": 0.03402242241185157, + "language_loss": 0.88381398, + "learning_rate": 0.0008422553841735225, + "loss": 0.89435959, + "num_input_tokens_seen": 121242736, + "router_z_loss_mlp": 0.47998047, + "step": 1467, + "time_per_iteration": 2.495126485824585 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01057213, + "balance_loss_mlp": 1.00917137, + "epoch": 0.28241631396691036, + "flos": 606041192448.0, + "grad_norm": 0.032675143321136885, + "language_loss": 0.86003613, + "learning_rate": 0.0008420282025501757, + "loss": 0.87060827, + "num_input_tokens_seen": 121319248, + "router_z_loss_mlp": 0.48022461, + "step": 1468, + "time_per_iteration": 2.7908880710601807 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052419, + "balance_loss_mlp": 1.00473487, + "epoch": 0.2826086956521739, + "flos": 574051326720.0, + "grad_norm": 0.03300906221563125, + "language_loss": 0.86686498, + "learning_rate": 0.0008418008881397043, + "loss": 0.87738919, + "num_input_tokens_seen": 121392064, + "router_z_loss_mlp": 0.4765625, + "step": 1469, + "time_per_iteration": 2.7646520137786865 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054478, + "balance_loss_mlp": 1.00693762, + "epoch": 0.2828010773374375, + "flos": 844319954688.0, + "grad_norm": 0.03195966631281891, + "language_loss": 0.84124947, + "learning_rate": 0.0008415734410303595, + "loss": 0.85179424, + "num_input_tokens_seen": 121475984, + "router_z_loss_mlp": 0.47509766, + "step": 1470, + "time_per_iteration": 3.1784656047821045 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059132, + "balance_loss_mlp": 1.01151943, + "epoch": 0.28299345902270107, + "flos": 543772303872.0, + "grad_norm": 0.0307788797974712, + "language_loss": 0.91781342, + "learning_rate": 0.0008413458613104444, + "loss": 0.92840481, + "num_input_tokens_seen": 121551024, + "router_z_loss_mlp": 0.47583008, + "step": 1471, + "time_per_iteration": 2.7000675201416016 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01057543, + "balance_loss_mlp": 1.00995505, + "epoch": 0.2831858407079646, + "flos": 572755555584.0, + "grad_norm": 0.03187726406761503, + "language_loss": 0.84024346, + "learning_rate": 0.0008411181490683129, + "loss": 0.85081899, + "num_input_tokens_seen": 121624528, + "router_z_loss_mlp": 0.47558594, + "step": 1472, + "time_per_iteration": 2.7358603477478027 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105744, + "balance_loss_mlp": 1.00958943, + "epoch": 0.2833782223932282, + "flos": 765172038144.0, + "grad_norm": 0.03258814259190176, + "language_loss": 0.83765668, + "learning_rate": 0.0008408903043923707, + "loss": 0.84823108, + "num_input_tokens_seen": 121706736, + "router_z_loss_mlp": 0.47827148, + "step": 1473, + "time_per_iteration": 3.016690492630005 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060961, + "balance_loss_mlp": 1.01291955, + "epoch": 0.2835706040784917, + "flos": 540088157952.0, + "grad_norm": 0.03783140599229066, + "language_loss": 0.82463539, + "learning_rate": 0.0008406623273710754, + "loss": 0.83524501, + "num_input_tokens_seen": 121773008, + "router_z_loss_mlp": 0.48022461, + "step": 1474, + "time_per_iteration": 2.651932954788208 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055359, + "balance_loss_mlp": 1.00736535, + "epoch": 0.2837629857637553, + "flos": 531654474240.0, + "grad_norm": 0.03425671969493541, + "language_loss": 0.84354198, + "learning_rate": 0.0008404342180929351, + "loss": 0.85409558, + "num_input_tokens_seen": 121840016, + "router_z_loss_mlp": 0.47973633, + "step": 1475, + "time_per_iteration": 2.6064491271972656 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105922, + "balance_loss_mlp": 1.01120257, + "epoch": 0.28395536744901884, + "flos": 541110775296.0, + "grad_norm": 0.03564784056716401, + "language_loss": 0.8245163, + "learning_rate": 0.00084020597664651, + "loss": 0.83510846, + "num_input_tokens_seen": 121915008, + "router_z_loss_mlp": 0.47998047, + "step": 1476, + "time_per_iteration": 2.7597527503967285 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01056946, + "balance_loss_mlp": 1.00890458, + "epoch": 0.2841477491342824, + "flos": 574802735616.0, + "grad_norm": 0.037292940254278956, + "language_loss": 0.8496412, + "learning_rate": 0.0008399776031204111, + "loss": 0.86021066, + "num_input_tokens_seen": 121987456, + "router_z_loss_mlp": 0.48022461, + "step": 1477, + "time_per_iteration": 2.759089231491089 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051956, + "balance_loss_mlp": 1.00412941, + "epoch": 0.28434013081954596, + "flos": 573139524864.0, + "grad_norm": 0.03522410712402375, + "language_loss": 0.80955458, + "learning_rate": 0.0008397490976033009, + "loss": 0.8200742, + "num_input_tokens_seen": 122058720, + "router_z_loss_mlp": 0.47802734, + "step": 1478, + "time_per_iteration": 2.6423845291137695 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01056133, + "balance_loss_mlp": 1.0100708, + "epoch": 0.28453251250480954, + "flos": 1556676481536.0, + "grad_norm": 0.010218347035897045, + "language_loss": 0.77879643, + "learning_rate": 0.000839520460183893, + "loss": 0.78935778, + "num_input_tokens_seen": 122285792, + "router_z_loss_mlp": 0.45996094, + "step": 1479, + "time_per_iteration": 4.732174396514893 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053334, + "balance_loss_mlp": 1.0056026, + "epoch": 0.28472489419007313, + "flos": 750427673088.0, + "grad_norm": 0.028762601306014927, + "language_loss": 0.86263019, + "learning_rate": 0.0008392916909509525, + "loss": 0.87316358, + "num_input_tokens_seen": 122366608, + "router_z_loss_mlp": 0.47705078, + "step": 1480, + "time_per_iteration": 3.0842366218566895 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105593, + "balance_loss_mlp": 1.00817478, + "epoch": 0.28491727587533666, + "flos": 491139551232.0, + "grad_norm": 0.03654292068957682, + "language_loss": 0.86134857, + "learning_rate": 0.0008390627899932954, + "loss": 0.87190789, + "num_input_tokens_seen": 122435536, + "router_z_loss_mlp": 0.47729492, + "step": 1481, + "time_per_iteration": 2.615267753601074 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053309, + "balance_loss_mlp": 1.0055064, + "epoch": 0.28510965756060025, + "flos": 730360250880.0, + "grad_norm": 0.03257927187729683, + "language_loss": 0.89633858, + "learning_rate": 0.000838833757399789, + "loss": 0.90687168, + "num_input_tokens_seen": 122515584, + "router_z_loss_mlp": 0.4777832, + "step": 1482, + "time_per_iteration": 2.9428212642669678 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053301, + "balance_loss_mlp": 1.00528359, + "epoch": 0.2853020392458638, + "flos": 552670636800.0, + "grad_norm": 0.036455185890550544, + "language_loss": 0.82055122, + "learning_rate": 0.0008386045932593515, + "loss": 0.83108419, + "num_input_tokens_seen": 122585552, + "router_z_loss_mlp": 0.47998047, + "step": 1483, + "time_per_iteration": 2.724045991897583 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052204, + "balance_loss_mlp": 1.00416255, + "epoch": 0.28549442093112737, + "flos": 756097761024.0, + "grad_norm": 0.02777472605390161, + "language_loss": 0.8718375, + "learning_rate": 0.0008383752976609525, + "loss": 0.8823595, + "num_input_tokens_seen": 122658928, + "router_z_loss_mlp": 0.48022461, + "step": 1484, + "time_per_iteration": 2.929905891418457 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054156, + "balance_loss_mlp": 1.00618601, + "epoch": 0.2856868026163909, + "flos": 539704188672.0, + "grad_norm": 0.028392575187028035, + "language_loss": 0.8111921, + "learning_rate": 0.0008381458706936123, + "loss": 0.82173365, + "num_input_tokens_seen": 122729056, + "router_z_loss_mlp": 0.47949219, + "step": 1485, + "time_per_iteration": 2.717545986175537 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053651, + "balance_loss_mlp": 1.00563323, + "epoch": 0.2858791843016545, + "flos": 584921017344.0, + "grad_norm": 0.03333139148622456, + "language_loss": 0.88664746, + "learning_rate": 0.0008379163124464025, + "loss": 0.8971839, + "num_input_tokens_seen": 122802832, + "router_z_loss_mlp": 0.47998047, + "step": 1486, + "time_per_iteration": 2.7234747409820557 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054605, + "balance_loss_mlp": 1.00685012, + "epoch": 0.286071565986918, + "flos": 646052582400.0, + "grad_norm": 0.03454926432429506, + "language_loss": 0.77946562, + "learning_rate": 0.0008376866230084452, + "loss": 0.79001164, + "num_input_tokens_seen": 122881328, + "router_z_loss_mlp": 0.47729492, + "step": 1487, + "time_per_iteration": 2.856128692626953 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105205, + "balance_loss_mlp": 1.00408018, + "epoch": 0.2862639476721816, + "flos": 492331309824.0, + "grad_norm": 0.034661288064865674, + "language_loss": 0.87705112, + "learning_rate": 0.000837456802468914, + "loss": 0.88757157, + "num_input_tokens_seen": 122949680, + "router_z_loss_mlp": 0.47949219, + "step": 1488, + "time_per_iteration": 2.57454514503479 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054997, + "balance_loss_mlp": 1.00700366, + "epoch": 0.2864563293574452, + "flos": 522745447680.0, + "grad_norm": 0.035472984165373166, + "language_loss": 0.86247557, + "learning_rate": 0.0008372268509170331, + "loss": 0.87302554, + "num_input_tokens_seen": 123024736, + "router_z_loss_mlp": 0.47973633, + "step": 1489, + "time_per_iteration": 2.661430597305298 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105259, + "balance_loss_mlp": 1.00452483, + "epoch": 0.2866487110427087, + "flos": 548257436160.0, + "grad_norm": 0.03357077125927176, + "language_loss": 0.85950172, + "learning_rate": 0.0008369967684420779, + "loss": 0.8700276, + "num_input_tokens_seen": 123097344, + "router_z_loss_mlp": 0.48046875, + "step": 1490, + "time_per_iteration": 2.703200101852417 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052654, + "balance_loss_mlp": 1.0047555, + "epoch": 0.2868410927279723, + "flos": 483218148864.0, + "grad_norm": 0.03511930922286833, + "language_loss": 0.8567192, + "learning_rate": 0.0008367665551333736, + "loss": 0.86724567, + "num_input_tokens_seen": 123166240, + "router_z_loss_mlp": 0.47875977, + "step": 1491, + "time_per_iteration": 2.6027045249938965 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051173, + "balance_loss_mlp": 1.00334597, + "epoch": 0.28703347441323585, + "flos": 726137578752.0, + "grad_norm": 0.03668604763704844, + "language_loss": 0.86648476, + "learning_rate": 0.0008365362110802977, + "loss": 0.87699652, + "num_input_tokens_seen": 123238160, + "router_z_loss_mlp": 0.47802734, + "step": 1492, + "time_per_iteration": 2.872743606567383 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054184, + "balance_loss_mlp": 1.00630987, + "epoch": 0.28722585609849943, + "flos": 636214257408.0, + "grad_norm": 0.0346446819062503, + "language_loss": 0.83264536, + "learning_rate": 0.0008363057363722773, + "loss": 0.84318721, + "num_input_tokens_seen": 123319504, + "router_z_loss_mlp": 0.47851562, + "step": 1493, + "time_per_iteration": 2.830925941467285 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055811, + "balance_loss_mlp": 1.00827014, + "epoch": 0.28741823778376296, + "flos": 511252660224.0, + "grad_norm": 0.03541460771255837, + "language_loss": 0.8481909, + "learning_rate": 0.0008360751310987906, + "loss": 0.85874903, + "num_input_tokens_seen": 123387008, + "router_z_loss_mlp": 0.47509766, + "step": 1494, + "time_per_iteration": 2.6102633476257324 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055778, + "balance_loss_mlp": 1.00840437, + "epoch": 0.28761061946902655, + "flos": 604932059136.0, + "grad_norm": 0.030521465086419404, + "language_loss": 0.86298919, + "learning_rate": 0.0008358443953493666, + "loss": 0.87354696, + "num_input_tokens_seen": 123471056, + "router_z_loss_mlp": 0.47338867, + "step": 1495, + "time_per_iteration": 2.8808648586273193 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053186, + "balance_loss_mlp": 1.00590765, + "epoch": 0.28780300115429014, + "flos": 408060579840.0, + "grad_norm": 0.03760103829607362, + "language_loss": 0.89352167, + "learning_rate": 0.0008356135292135851, + "loss": 0.90405357, + "num_input_tokens_seen": 123535024, + "router_z_loss_mlp": 0.47241211, + "step": 1496, + "time_per_iteration": 2.5025811195373535 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055101, + "balance_loss_mlp": 1.00794196, + "epoch": 0.28799538283955367, + "flos": 375745070592.0, + "grad_norm": 0.04396673202836768, + "language_loss": 0.93575335, + "learning_rate": 0.0008353825327810758, + "loss": 0.94630432, + "num_input_tokens_seen": 123596224, + "router_z_loss_mlp": 0.47119141, + "step": 1497, + "time_per_iteration": 2.4455389976501465 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053362, + "balance_loss_mlp": 1.00601161, + "epoch": 0.28818776452481726, + "flos": 593020309248.0, + "grad_norm": 0.03575929377279749, + "language_loss": 0.82620615, + "learning_rate": 0.00083515140614152, + "loss": 0.83673978, + "num_input_tokens_seen": 123668640, + "router_z_loss_mlp": 0.47314453, + "step": 1498, + "time_per_iteration": 2.7318496704101562 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059657, + "balance_loss_mlp": 1.01204443, + "epoch": 0.2883801462100808, + "flos": 536104613376.0, + "grad_norm": 0.03408677708994041, + "language_loss": 0.8771323, + "learning_rate": 0.0008349201493846485, + "loss": 0.88772887, + "num_input_tokens_seen": 123740816, + "router_z_loss_mlp": 0.47583008, + "step": 1499, + "time_per_iteration": 2.671473503112793 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105332, + "balance_loss_mlp": 1.00606573, + "epoch": 0.2885725278953444, + "flos": 481077649920.0, + "grad_norm": 0.037679681148910335, + "language_loss": 0.90198493, + "learning_rate": 0.0008346887626002432, + "loss": 0.91251814, + "num_input_tokens_seen": 123805968, + "router_z_loss_mlp": 0.47216797, + "step": 1500, + "time_per_iteration": 2.565556287765503 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050278, + "balance_loss_mlp": 1.00290418, + "epoch": 0.2887649095806079, + "flos": 465030710784.0, + "grad_norm": 0.03453406345592784, + "language_loss": 0.87256986, + "learning_rate": 0.000834457245878137, + "loss": 0.88307267, + "num_input_tokens_seen": 123876576, + "router_z_loss_mlp": 0.47338867, + "step": 1501, + "time_per_iteration": 2.6684980392456055 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051416, + "balance_loss_mlp": 1.00411427, + "epoch": 0.2889572912658715, + "flos": 932641303296.0, + "grad_norm": 0.034149555340210275, + "language_loss": 0.82079703, + "learning_rate": 0.000834225599308212, + "loss": 0.83131123, + "num_input_tokens_seen": 123967664, + "router_z_loss_mlp": 0.47265625, + "step": 1502, + "time_per_iteration": 3.2747607231140137 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052615, + "balance_loss_mlp": 1.00526536, + "epoch": 0.28914967295113503, + "flos": 571257595392.0, + "grad_norm": 0.03426641952710734, + "language_loss": 0.85934782, + "learning_rate": 0.0008339938229804016, + "loss": 0.869874, + "num_input_tokens_seen": 124039680, + "router_z_loss_mlp": 0.47314453, + "step": 1503, + "time_per_iteration": 2.7027056217193604 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062946, + "balance_loss_mlp": 1.01783752, + "epoch": 0.2893420546363986, + "flos": 1489874828544.0, + "grad_norm": 0.016861580481692767, + "language_loss": 0.75434822, + "learning_rate": 0.0008337619169846895, + "loss": 0.76497769, + "num_input_tokens_seen": 124278848, + "router_z_loss_mlp": 0.45019531, + "step": 1504, + "time_per_iteration": 4.9503560066223145 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010625, + "balance_loss_mlp": 1.01536465, + "epoch": 0.2895344363216622, + "flos": 471182944512.0, + "grad_norm": 0.04276572481675365, + "language_loss": 0.8589167, + "learning_rate": 0.0008335298814111094, + "loss": 0.86954165, + "num_input_tokens_seen": 124346736, + "router_z_loss_mlp": 0.47094727, + "step": 1505, + "time_per_iteration": 2.548398017883301 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063653, + "balance_loss_mlp": 1.01654112, + "epoch": 0.28972681800692573, + "flos": 649341098496.0, + "grad_norm": 0.03572405467889404, + "language_loss": 0.89211309, + "learning_rate": 0.0008332977163497455, + "loss": 0.90274966, + "num_input_tokens_seen": 124420816, + "router_z_loss_mlp": 0.47070312, + "step": 1506, + "time_per_iteration": 2.786355972290039 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059474, + "balance_loss_mlp": 1.01241064, + "epoch": 0.2899191996921893, + "flos": 573306720768.0, + "grad_norm": 0.03560254091063293, + "language_loss": 0.84471554, + "learning_rate": 0.0008330654218907325, + "loss": 0.85531026, + "num_input_tokens_seen": 124490480, + "router_z_loss_mlp": 0.47021484, + "step": 1507, + "time_per_iteration": 2.706066131591797 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054224, + "balance_loss_mlp": 1.00701702, + "epoch": 0.29011158137745285, + "flos": 662638047744.0, + "grad_norm": 0.03364876986368613, + "language_loss": 0.82771999, + "learning_rate": 0.0008328329981242548, + "loss": 0.8382622, + "num_input_tokens_seen": 124564960, + "router_z_loss_mlp": 0.47167969, + "step": 1508, + "time_per_iteration": 2.9025378227233887 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053376, + "balance_loss_mlp": 1.00607395, + "epoch": 0.29030396306271644, + "flos": 537403296768.0, + "grad_norm": 0.0314370875382877, + "language_loss": 0.88638061, + "learning_rate": 0.0008326004451405475, + "loss": 0.89691436, + "num_input_tokens_seen": 124637424, + "router_z_loss_mlp": 0.47265625, + "step": 1509, + "time_per_iteration": 2.740288496017456 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091124, + "balance_loss_mlp": 1.04370284, + "epoch": 0.29049634474798, + "flos": 512956700160.0, + "grad_norm": 0.04021928954994292, + "language_loss": 0.83711147, + "learning_rate": 0.0008323677630298957, + "loss": 0.84802264, + "num_input_tokens_seen": 124704832, + "router_z_loss_mlp": 0.47387695, + "step": 1510, + "time_per_iteration": 2.5700840950012207 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01056321, + "balance_loss_mlp": 1.00935256, + "epoch": 0.29068872643324356, + "flos": 614983266816.0, + "grad_norm": 0.03498537298994642, + "language_loss": 0.86212677, + "learning_rate": 0.0008321349518826345, + "loss": 0.87268996, + "num_input_tokens_seen": 124779600, + "router_z_loss_mlp": 0.46923828, + "step": 1511, + "time_per_iteration": 2.7968146800994873 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060609, + "balance_loss_mlp": 1.01364064, + "epoch": 0.2908811081185071, + "flos": 547469088768.0, + "grad_norm": 0.03734404843374857, + "language_loss": 0.95525789, + "learning_rate": 0.0008319020117891491, + "loss": 0.96586394, + "num_input_tokens_seen": 124844128, + "router_z_loss_mlp": 0.46923828, + "step": 1512, + "time_per_iteration": 2.646127939224243 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01058015, + "balance_loss_mlp": 1.01107061, + "epoch": 0.2910734898037707, + "flos": 605902186752.0, + "grad_norm": 0.03463533015087841, + "language_loss": 0.88378417, + "learning_rate": 0.0008316689428398751, + "loss": 0.89436436, + "num_input_tokens_seen": 124915376, + "router_z_loss_mlp": 0.46899414, + "step": 1513, + "time_per_iteration": 2.7310631275177 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01056228, + "balance_loss_mlp": 1.00935447, + "epoch": 0.29126587148903427, + "flos": 575836046592.0, + "grad_norm": 0.028150288904366032, + "language_loss": 0.89498413, + "learning_rate": 0.0008314357451252979, + "loss": 0.90554643, + "num_input_tokens_seen": 124995504, + "router_z_loss_mlp": 0.46826172, + "step": 1514, + "time_per_iteration": 2.8262994289398193 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054124, + "balance_loss_mlp": 1.00727487, + "epoch": 0.2914582531742978, + "flos": 572134404096.0, + "grad_norm": 0.05354948204009119, + "language_loss": 0.89001274, + "learning_rate": 0.0008312024187359527, + "loss": 0.90055394, + "num_input_tokens_seen": 125064192, + "router_z_loss_mlp": 0.46801758, + "step": 1515, + "time_per_iteration": 2.717780590057373 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105823, + "balance_loss_mlp": 1.01109469, + "epoch": 0.2916506348595614, + "flos": 732303418368.0, + "grad_norm": 0.032865630858266236, + "language_loss": 0.8831327, + "learning_rate": 0.000830968963762425, + "loss": 0.89371502, + "num_input_tokens_seen": 125150560, + "router_z_loss_mlp": 0.47094727, + "step": 1516, + "time_per_iteration": 3.080526828765869 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051181, + "balance_loss_mlp": 1.00383127, + "epoch": 0.2918430165448249, + "flos": 511467488256.0, + "grad_norm": 0.032871242995291323, + "language_loss": 0.84882748, + "learning_rate": 0.0008307353802953497, + "loss": 0.85933936, + "num_input_tokens_seen": 125219264, + "router_z_loss_mlp": 0.47314453, + "step": 1517, + "time_per_iteration": 2.744476318359375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084976, + "balance_loss_mlp": 1.03726828, + "epoch": 0.2920353982300885, + "flos": 631607616000.0, + "grad_norm": 0.03594729450056152, + "language_loss": 0.86997348, + "learning_rate": 0.0008305016684254125, + "loss": 0.88082325, + "num_input_tokens_seen": 125301904, + "router_z_loss_mlp": 0.47680664, + "step": 1518, + "time_per_iteration": 2.8340506553649902 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047384, + "balance_loss_mlp": 1.00001049, + "epoch": 0.29222777991535204, + "flos": 502671222528.0, + "grad_norm": 0.03192476620539529, + "language_loss": 0.87901479, + "learning_rate": 0.0008302678282433479, + "loss": 0.88948864, + "num_input_tokens_seen": 125367712, + "router_z_loss_mlp": 0.47338867, + "step": 1519, + "time_per_iteration": 2.5783281326293945 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048912, + "balance_loss_mlp": 1.00177681, + "epoch": 0.2924201616006156, + "flos": 487842286848.0, + "grad_norm": 0.03491462978028735, + "language_loss": 0.85667795, + "learning_rate": 0.0008300338598399411, + "loss": 0.86716712, + "num_input_tokens_seen": 125437648, + "router_z_loss_mlp": 0.47094727, + "step": 1520, + "time_per_iteration": 2.6763737201690674 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105218, + "balance_loss_mlp": 1.0049969, + "epoch": 0.2926125432858792, + "flos": 477411000576.0, + "grad_norm": 0.036990289889529016, + "language_loss": 0.957196, + "learning_rate": 0.0008297997633060263, + "loss": 0.96771777, + "num_input_tokens_seen": 125502432, + "router_z_loss_mlp": 0.47143555, + "step": 1521, + "time_per_iteration": 2.5368785858154297 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055222, + "balance_loss_mlp": 1.00799167, + "epoch": 0.29280492497114274, + "flos": 677868449280.0, + "grad_norm": 0.0362418142607002, + "language_loss": 0.86058486, + "learning_rate": 0.0008295655387324883, + "loss": 0.87113714, + "num_input_tokens_seen": 125575424, + "router_z_loss_mlp": 0.47192383, + "step": 1522, + "time_per_iteration": 2.8447062969207764 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055265, + "balance_loss_mlp": 1.0079869, + "epoch": 0.29299730665640633, + "flos": 459345071616.0, + "grad_norm": 0.03782463739456531, + "language_loss": 0.86245579, + "learning_rate": 0.0008293311862102609, + "loss": 0.87300849, + "num_input_tokens_seen": 125639040, + "router_z_loss_mlp": 0.47241211, + "step": 1523, + "time_per_iteration": 2.5397908687591553 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050568, + "balance_loss_mlp": 1.00328994, + "epoch": 0.29318968834166986, + "flos": 447496505088.0, + "grad_norm": 0.03500221637525105, + "language_loss": 0.90103561, + "learning_rate": 0.0008290967058303275, + "loss": 0.91154128, + "num_input_tokens_seen": 125701712, + "router_z_loss_mlp": 0.47241211, + "step": 1524, + "time_per_iteration": 2.4784419536590576 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081064, + "balance_loss_mlp": 1.03349924, + "epoch": 0.29338207002693345, + "flos": 451256473344.0, + "grad_norm": 0.038529021386844775, + "language_loss": 0.87365985, + "learning_rate": 0.0008288620976837219, + "loss": 0.88447046, + "num_input_tokens_seen": 125765088, + "router_z_loss_mlp": 0.4753418, + "step": 1525, + "time_per_iteration": 2.540762424468994 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054383, + "balance_loss_mlp": 1.00684249, + "epoch": 0.293574451712197, + "flos": 503285571072.0, + "grad_norm": 0.03477645959362119, + "language_loss": 0.8372373, + "learning_rate": 0.000828627361861527, + "loss": 0.84778112, + "num_input_tokens_seen": 125831328, + "router_z_loss_mlp": 0.47509766, + "step": 1526, + "time_per_iteration": 2.583862066268921 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01058639, + "balance_loss_mlp": 1.01124167, + "epoch": 0.29376683339746057, + "flos": 697684104960.0, + "grad_norm": 0.03858140978476568, + "language_loss": 0.85503912, + "learning_rate": 0.0008283924984548752, + "loss": 0.8656255, + "num_input_tokens_seen": 125903664, + "router_z_loss_mlp": 0.47363281, + "step": 1527, + "time_per_iteration": 2.848947525024414 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054231, + "balance_loss_mlp": 1.00680923, + "epoch": 0.2939592150827241, + "flos": 479542751232.0, + "grad_norm": 0.03208252397749005, + "language_loss": 0.8577444, + "learning_rate": 0.0008281575075549485, + "loss": 0.86828673, + "num_input_tokens_seen": 125971856, + "router_z_loss_mlp": 0.47387695, + "step": 1528, + "time_per_iteration": 2.6076998710632324 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063099, + "balance_loss_mlp": 1.01703644, + "epoch": 0.2941515967679877, + "flos": 1488389507328.0, + "grad_norm": 0.010941905571601225, + "language_loss": 0.77352691, + "learning_rate": 0.000827922389252979, + "loss": 0.78415793, + "num_input_tokens_seen": 126183968, + "router_z_loss_mlp": 0.45996094, + "step": 1529, + "time_per_iteration": 4.672811508178711 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01175133, + "balance_loss_mlp": 1.12690103, + "epoch": 0.2943439784532513, + "flos": 675400361472.0, + "grad_norm": 0.05299717257038309, + "language_loss": 0.90924174, + "learning_rate": 0.0008276871436402469, + "loss": 0.92099309, + "num_input_tokens_seen": 126254448, + "router_z_loss_mlp": 0.48217773, + "step": 1530, + "time_per_iteration": 2.8220977783203125 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010581, + "balance_loss_mlp": 1.01096439, + "epoch": 0.2945363601385148, + "flos": 577383584256.0, + "grad_norm": 0.03620573442946411, + "language_loss": 0.88955015, + "learning_rate": 0.000827451770808083, + "loss": 0.90013111, + "num_input_tokens_seen": 126328208, + "router_z_loss_mlp": 0.47094727, + "step": 1531, + "time_per_iteration": 2.6981046199798584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01057368, + "balance_loss_mlp": 1.01013768, + "epoch": 0.2947287418237784, + "flos": 481618121472.0, + "grad_norm": 0.03382548660060083, + "language_loss": 0.84345412, + "learning_rate": 0.0008272162708478674, + "loss": 0.85402787, + "num_input_tokens_seen": 126396464, + "router_z_loss_mlp": 0.47192383, + "step": 1532, + "time_per_iteration": 2.5975306034088135 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01058676, + "balance_loss_mlp": 1.01151645, + "epoch": 0.2949211235090419, + "flos": 559261274880.0, + "grad_norm": 0.03154442800865326, + "language_loss": 0.87544608, + "learning_rate": 0.000826980643851029, + "loss": 0.88603282, + "num_input_tokens_seen": 126468960, + "router_z_loss_mlp": 0.47119141, + "step": 1533, + "time_per_iteration": 2.6889007091522217 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063865, + "balance_loss_mlp": 1.01675379, + "epoch": 0.2951135051943055, + "flos": 484857060096.0, + "grad_norm": 0.03876668067992812, + "language_loss": 0.85914761, + "learning_rate": 0.0008267448899090464, + "loss": 0.86978626, + "num_input_tokens_seen": 126536496, + "router_z_loss_mlp": 0.47070312, + "step": 1534, + "time_per_iteration": 2.5630924701690674 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062291, + "balance_loss_mlp": 1.01498842, + "epoch": 0.29530588687956905, + "flos": 551422497792.0, + "grad_norm": 0.034923849251574525, + "language_loss": 0.81812191, + "learning_rate": 0.0008265090091134473, + "loss": 0.82874477, + "num_input_tokens_seen": 126614048, + "router_z_loss_mlp": 0.47265625, + "step": 1535, + "time_per_iteration": 2.8399465084075928 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105968, + "balance_loss_mlp": 1.01235437, + "epoch": 0.29549826856483263, + "flos": 674310670080.0, + "grad_norm": 0.028029616611284485, + "language_loss": 0.80873084, + "learning_rate": 0.0008262730015558088, + "loss": 0.81932771, + "num_input_tokens_seen": 126697248, + "router_z_loss_mlp": 0.47290039, + "step": 1536, + "time_per_iteration": 2.874537944793701 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059174, + "balance_loss_mlp": 1.01151371, + "epoch": 0.29569065025009617, + "flos": 766136329728.0, + "grad_norm": 0.03177117147053012, + "language_loss": 0.82803708, + "learning_rate": 0.0008260368673277574, + "loss": 0.83862883, + "num_input_tokens_seen": 126782496, + "router_z_loss_mlp": 0.47631836, + "step": 1537, + "time_per_iteration": 3.0976641178131104 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053728, + "balance_loss_mlp": 1.00573432, + "epoch": 0.29588303193535975, + "flos": 544831859712.0, + "grad_norm": 0.031452220479770684, + "language_loss": 0.84814745, + "learning_rate": 0.0008258006065209682, + "loss": 0.85868478, + "num_input_tokens_seen": 126857328, + "router_z_loss_mlp": 0.47973633, + "step": 1538, + "time_per_iteration": 2.7704694271087646 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115804, + "balance_loss_mlp": 1.06735778, + "epoch": 0.29607541362062334, + "flos": 598146034944.0, + "grad_norm": 0.04896094729194987, + "language_loss": 0.81966412, + "learning_rate": 0.0008255642192271657, + "loss": 0.83082211, + "num_input_tokens_seen": 126932608, + "router_z_loss_mlp": 0.484375, + "step": 1539, + "time_per_iteration": 2.774122714996338 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059901, + "balance_loss_mlp": 1.01219356, + "epoch": 0.29626779530588687, + "flos": 611038606080.0, + "grad_norm": 0.02837345788652225, + "language_loss": 0.84628069, + "learning_rate": 0.0008253277055381241, + "loss": 0.85687971, + "num_input_tokens_seen": 127008928, + "router_z_loss_mlp": 0.47680664, + "step": 1540, + "time_per_iteration": 2.837587833404541 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061212, + "balance_loss_mlp": 1.01340961, + "epoch": 0.29646017699115046, + "flos": 868959025152.0, + "grad_norm": 0.03662488769273821, + "language_loss": 0.86757702, + "learning_rate": 0.0008250910655456658, + "loss": 0.87818909, + "num_input_tokens_seen": 127097104, + "router_z_loss_mlp": 0.4777832, + "step": 1541, + "time_per_iteration": 3.123687982559204 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010574, + "balance_loss_mlp": 1.00954938, + "epoch": 0.296652558676414, + "flos": 496881570816.0, + "grad_norm": 0.03318095479066229, + "language_loss": 0.84889704, + "learning_rate": 0.0008248542993416625, + "loss": 0.85947102, + "num_input_tokens_seen": 127165264, + "router_z_loss_mlp": 0.47827148, + "step": 1542, + "time_per_iteration": 2.637747049331665 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068583, + "balance_loss_mlp": 1.02082753, + "epoch": 0.2968449403616776, + "flos": 572627243520.0, + "grad_norm": 0.03443634648546435, + "language_loss": 0.84426934, + "learning_rate": 0.0008246174070180352, + "loss": 0.8549552, + "num_input_tokens_seen": 127238992, + "router_z_loss_mlp": 0.47729492, + "step": 1543, + "time_per_iteration": 2.6872684955596924 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062899, + "balance_loss_mlp": 1.01511967, + "epoch": 0.2970373220469411, + "flos": 795651304704.0, + "grad_norm": 0.035080805136432934, + "language_loss": 0.85198414, + "learning_rate": 0.0008243803886667537, + "loss": 0.86261314, + "num_input_tokens_seen": 127328160, + "router_z_loss_mlp": 0.47753906, + "step": 1544, + "time_per_iteration": 3.13710618019104 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069403, + "balance_loss_mlp": 1.02145684, + "epoch": 0.2972297037322047, + "flos": 662249220864.0, + "grad_norm": 0.04094703338464919, + "language_loss": 0.80137819, + "learning_rate": 0.0008241432443798364, + "loss": 0.81207222, + "num_input_tokens_seen": 127407328, + "router_z_loss_mlp": 0.47924805, + "step": 1545, + "time_per_iteration": 2.841092109680176 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061565, + "balance_loss_mlp": 1.0138818, + "epoch": 0.29742208541746823, + "flos": 598232550912.0, + "grad_norm": 0.028624248431763765, + "language_loss": 0.86072361, + "learning_rate": 0.0008239059742493512, + "loss": 0.87133932, + "num_input_tokens_seen": 127477136, + "router_z_loss_mlp": 0.4765625, + "step": 1546, + "time_per_iteration": 2.7034194469451904 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01349258, + "balance_loss_mlp": 1.29957151, + "epoch": 0.2976144671027318, + "flos": 771339823104.0, + "grad_norm": 0.07377893489124947, + "language_loss": 0.88059306, + "learning_rate": 0.0008236685783674142, + "loss": 0.89408565, + "num_input_tokens_seen": 127565680, + "router_z_loss_mlp": 0.49584961, + "step": 1547, + "time_per_iteration": 3.063077688217163 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071266, + "balance_loss_mlp": 1.02510834, + "epoch": 0.2978068487879954, + "flos": 1487914164480.0, + "grad_norm": 0.01225569795264997, + "language_loss": 0.76221192, + "learning_rate": 0.0008234310568261911, + "loss": 0.7729246, + "num_input_tokens_seen": 127791584, + "router_z_loss_mlp": 0.4609375, + "step": 1548, + "time_per_iteration": 4.894561767578125 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073136, + "balance_loss_mlp": 1.02564275, + "epoch": 0.29799923047325894, + "flos": 476330057472.0, + "grad_norm": 0.041178192237982324, + "language_loss": 0.84313369, + "learning_rate": 0.0008231934097178955, + "loss": 0.85386503, + "num_input_tokens_seen": 127860112, + "router_z_loss_mlp": 0.47460938, + "step": 1549, + "time_per_iteration": 2.630146026611328 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081209, + "balance_loss_mlp": 1.03362012, + "epoch": 0.2981916121585225, + "flos": 761169051648.0, + "grad_norm": 0.037198017460407115, + "language_loss": 0.86745787, + "learning_rate": 0.0008229556371347903, + "loss": 0.87826997, + "num_input_tokens_seen": 127938752, + "router_z_loss_mlp": 0.47558594, + "step": 1550, + "time_per_iteration": 2.9614980220794678 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081757, + "balance_loss_mlp": 1.03416848, + "epoch": 0.29838399384378606, + "flos": 876517845504.0, + "grad_norm": 0.043512769843104544, + "language_loss": 0.80808616, + "learning_rate": 0.0008227177391691874, + "loss": 0.81890368, + "num_input_tokens_seen": 128022192, + "router_z_loss_mlp": 0.47558594, + "step": 1551, + "time_per_iteration": 3.11059832572937 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081803, + "balance_loss_mlp": 1.03445339, + "epoch": 0.29857637552904964, + "flos": 580752780288.0, + "grad_norm": 0.039547132323558824, + "language_loss": 0.90871334, + "learning_rate": 0.0008224797159134463, + "loss": 0.91953135, + "num_input_tokens_seen": 128097776, + "router_z_loss_mlp": 0.47314453, + "step": 1552, + "time_per_iteration": 2.7177717685699463 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077357, + "balance_loss_mlp": 1.03026903, + "epoch": 0.2987687572143132, + "flos": 837809029632.0, + "grad_norm": 0.03288289742732326, + "language_loss": 0.84735203, + "learning_rate": 0.0008222415674599765, + "loss": 0.85812569, + "num_input_tokens_seen": 128179888, + "router_z_loss_mlp": 0.47045898, + "step": 1553, + "time_per_iteration": 3.090768814086914 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072084, + "balance_loss_mlp": 1.02513897, + "epoch": 0.29896113889957676, + "flos": 568168356096.0, + "grad_norm": 0.03857517262144223, + "language_loss": 0.8489393, + "learning_rate": 0.0008220032939012349, + "loss": 0.85966009, + "num_input_tokens_seen": 128251152, + "router_z_loss_mlp": 0.46899414, + "step": 1554, + "time_per_iteration": 2.7050375938415527 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072322, + "balance_loss_mlp": 1.02554476, + "epoch": 0.29915352058484035, + "flos": 499836662016.0, + "grad_norm": 0.03341170745827686, + "language_loss": 0.89154899, + "learning_rate": 0.0008217648953297277, + "loss": 0.90227222, + "num_input_tokens_seen": 128327600, + "router_z_loss_mlp": 0.46728516, + "step": 1555, + "time_per_iteration": 2.8296022415161133 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106052, + "balance_loss_mlp": 1.01376653, + "epoch": 0.2993459022701039, + "flos": 593215695360.0, + "grad_norm": 0.042418434687241845, + "language_loss": 0.79395097, + "learning_rate": 0.0008215263718380095, + "loss": 0.80455619, + "num_input_tokens_seen": 128398432, + "router_z_loss_mlp": 0.46704102, + "step": 1556, + "time_per_iteration": 2.683760643005371 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02541041, + "balance_loss_mlp": 2.4871583, + "epoch": 0.29953828395536747, + "flos": 573473916672.0, + "grad_norm": 0.19828678552993478, + "language_loss": 0.85491472, + "learning_rate": 0.0008212877235186833, + "loss": 0.88032514, + "num_input_tokens_seen": 128469696, + "router_z_loss_mlp": 0.54003906, + "step": 1557, + "time_per_iteration": 2.6963422298431396 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086143, + "balance_loss_mlp": 1.0413208, + "epoch": 0.299730665640631, + "flos": 1508086566144.0, + "grad_norm": 0.015049722833054002, + "language_loss": 0.77737558, + "learning_rate": 0.0008210489504644005, + "loss": 0.78823709, + "num_input_tokens_seen": 128698560, + "router_z_loss_mlp": 0.44824219, + "step": 1558, + "time_per_iteration": 4.971554279327393 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098273, + "balance_loss_mlp": 1.05063736, + "epoch": 0.2999230473258946, + "flos": 514808494080.0, + "grad_norm": 0.04814176942398931, + "language_loss": 0.82249933, + "learning_rate": 0.0008208100527678611, + "loss": 0.83348203, + "num_input_tokens_seen": 128765952, + "router_z_loss_mlp": 0.47607422, + "step": 1559, + "time_per_iteration": 2.6210360527038574 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01130027, + "balance_loss_mlp": 1.08127058, + "epoch": 0.3001154290111581, + "flos": 835855168512.0, + "grad_norm": 0.05333171316141313, + "language_loss": 0.80031002, + "learning_rate": 0.0008205710305218135, + "loss": 0.81161028, + "num_input_tokens_seen": 128840048, + "router_z_loss_mlp": 0.48730469, + "step": 1560, + "time_per_iteration": 3.0021140575408936 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01168816, + "balance_loss_mlp": 1.11898673, + "epoch": 0.3003078106964217, + "flos": 557946061824.0, + "grad_norm": 0.05314988858528354, + "language_loss": 0.91578549, + "learning_rate": 0.0008203318838190541, + "loss": 0.92747366, + "num_input_tokens_seen": 128912496, + "router_z_loss_mlp": 0.49707031, + "step": 1561, + "time_per_iteration": 2.7369065284729004 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153064, + "balance_loss_mlp": 1.10247147, + "epoch": 0.30050019238168524, + "flos": 527169341952.0, + "grad_norm": 0.047834322975263, + "language_loss": 0.86778915, + "learning_rate": 0.0008200926127524281, + "loss": 0.87931979, + "num_input_tokens_seen": 128980624, + "router_z_loss_mlp": 0.50634766, + "step": 1562, + "time_per_iteration": 2.6357791423797607 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01157108, + "balance_loss_mlp": 1.10565686, + "epoch": 0.3006925740669488, + "flos": 578937924864.0, + "grad_norm": 0.04357261617021945, + "language_loss": 0.84502149, + "learning_rate": 0.0008198532174148289, + "loss": 0.85659254, + "num_input_tokens_seen": 129050576, + "router_z_loss_mlp": 0.51513672, + "step": 1563, + "time_per_iteration": 2.7241976261138916 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097195, + "balance_loss_mlp": 1.04941559, + "epoch": 0.3008849557522124, + "flos": 1493613409536.0, + "grad_norm": 0.019627167679756308, + "language_loss": 0.8068617, + "learning_rate": 0.0008196136978991977, + "loss": 0.8178336, + "num_input_tokens_seen": 129278880, + "router_z_loss_mlp": 0.47753906, + "step": 1564, + "time_per_iteration": 4.851420879364014 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122708, + "balance_loss_mlp": 1.07035148, + "epoch": 0.30107733743747594, + "flos": 510824949504.0, + "grad_norm": 0.045341503179798265, + "language_loss": 0.90611446, + "learning_rate": 0.0008193740542985244, + "loss": 0.91734147, + "num_input_tokens_seen": 129346560, + "router_z_loss_mlp": 0.52441406, + "step": 1565, + "time_per_iteration": 2.62724232673645 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113673, + "balance_loss_mlp": 1.06098223, + "epoch": 0.30126971912273953, + "flos": 588821936640.0, + "grad_norm": 0.04014967632238747, + "language_loss": 0.87587321, + "learning_rate": 0.0008191342867058467, + "loss": 0.88700998, + "num_input_tokens_seen": 129420448, + "router_z_loss_mlp": 0.52783203, + "step": 1566, + "time_per_iteration": 2.766045570373535 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01133038, + "balance_loss_mlp": 1.07991791, + "epoch": 0.30146210080800306, + "flos": 603221216256.0, + "grad_norm": 0.039455426947262194, + "language_loss": 0.84397018, + "learning_rate": 0.0008188943952142509, + "loss": 0.85530061, + "num_input_tokens_seen": 129494032, + "router_z_loss_mlp": 0.53222656, + "step": 1567, + "time_per_iteration": 2.798323154449463 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113428, + "balance_loss_mlp": 1.06030834, + "epoch": 0.30165448249326665, + "flos": 919287973632.0, + "grad_norm": 0.03836627098538091, + "language_loss": 0.83653766, + "learning_rate": 0.0008186543799168711, + "loss": 0.84767193, + "num_input_tokens_seen": 129569088, + "router_z_loss_mlp": 0.53222656, + "step": 1568, + "time_per_iteration": 3.1216585636138916 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112139, + "balance_loss_mlp": 1.0594008, + "epoch": 0.3018468641785302, + "flos": 778631325696.0, + "grad_norm": 0.037681015369085746, + "language_loss": 0.89441907, + "learning_rate": 0.0008184142409068892, + "loss": 0.90554047, + "num_input_tokens_seen": 129647968, + "router_z_loss_mlp": 0.52832031, + "step": 1569, + "time_per_iteration": 2.9987363815307617 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087962, + "balance_loss_mlp": 1.03584409, + "epoch": 0.30203924586379377, + "flos": 523389931776.0, + "grad_norm": 0.031063886155947292, + "language_loss": 0.87584674, + "learning_rate": 0.000818173978277536, + "loss": 0.88672638, + "num_input_tokens_seen": 129718928, + "router_z_loss_mlp": 0.52197266, + "step": 1570, + "time_per_iteration": 2.657801389694214 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092945, + "balance_loss_mlp": 1.04125619, + "epoch": 0.3022316275490573, + "flos": 525649994496.0, + "grad_norm": 0.03542742618693904, + "language_loss": 0.8460654, + "learning_rate": 0.000817933592122089, + "loss": 0.85699487, + "num_input_tokens_seen": 129790128, + "router_z_loss_mlp": 0.51757812, + "step": 1571, + "time_per_iteration": 2.699676752090454 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094536, + "balance_loss_mlp": 1.04289424, + "epoch": 0.3024240092343209, + "flos": 480873515520.0, + "grad_norm": 0.03710559119511486, + "language_loss": 0.84148443, + "learning_rate": 0.0008176930825338749, + "loss": 0.85242975, + "num_input_tokens_seen": 129857536, + "router_z_loss_mlp": 0.51708984, + "step": 1572, + "time_per_iteration": 2.560293197631836 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085585, + "balance_loss_mlp": 1.03446782, + "epoch": 0.3026163909195845, + "flos": 688431938304.0, + "grad_norm": 0.03769478699711506, + "language_loss": 0.89810324, + "learning_rate": 0.0008174524496062679, + "loss": 0.90895915, + "num_input_tokens_seen": 129931440, + "router_z_loss_mlp": 0.51171875, + "step": 1573, + "time_per_iteration": 2.9185256958007812 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083791, + "balance_loss_mlp": 1.03334129, + "epoch": 0.302808772604848, + "flos": 544087253760.0, + "grad_norm": 0.033203995249134796, + "language_loss": 0.86450267, + "learning_rate": 0.0008172116934326894, + "loss": 0.87534058, + "num_input_tokens_seen": 130005200, + "router_z_loss_mlp": 0.50488281, + "step": 1574, + "time_per_iteration": 2.77254056930542 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107482, + "balance_loss_mlp": 1.02456117, + "epoch": 0.3030011542901116, + "flos": 476052046080.0, + "grad_norm": 0.03232260410081742, + "language_loss": 0.88820696, + "learning_rate": 0.0008169708141066097, + "loss": 0.89895517, + "num_input_tokens_seen": 130069136, + "router_z_loss_mlp": 0.50268555, + "step": 1575, + "time_per_iteration": 2.5428524017333984 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083713, + "balance_loss_mlp": 1.03402615, + "epoch": 0.30319353597537513, + "flos": 482473542912.0, + "grad_norm": 0.035261838486320786, + "language_loss": 0.91478366, + "learning_rate": 0.0008167298117215465, + "loss": 0.92562079, + "num_input_tokens_seen": 130135456, + "router_z_loss_mlp": 0.49536133, + "step": 1576, + "time_per_iteration": 2.5388023853302 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064287, + "balance_loss_mlp": 1.0151732, + "epoch": 0.3033859176606387, + "flos": 706113897984.0, + "grad_norm": 0.033895137386355495, + "language_loss": 0.89157575, + "learning_rate": 0.0008164886863710649, + "loss": 0.90221858, + "num_input_tokens_seen": 130213712, + "router_z_loss_mlp": 0.49047852, + "step": 1577, + "time_per_iteration": 2.9326250553131104 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072249, + "balance_loss_mlp": 1.02363503, + "epoch": 0.30357829934590225, + "flos": 766110084864.0, + "grad_norm": 0.03320904121402137, + "language_loss": 0.87079322, + "learning_rate": 0.0008162474381487783, + "loss": 0.88151574, + "num_input_tokens_seen": 130290928, + "router_z_loss_mlp": 0.48608398, + "step": 1578, + "time_per_iteration": 3.0217320919036865 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069823, + "balance_loss_mlp": 1.02135277, + "epoch": 0.30377068103116583, + "flos": 533449887744.0, + "grad_norm": 0.035817825196195696, + "language_loss": 0.854909, + "learning_rate": 0.0008160060671483475, + "loss": 0.86560726, + "num_input_tokens_seen": 130362672, + "router_z_loss_mlp": 0.48461914, + "step": 1579, + "time_per_iteration": 2.6730797290802 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074874, + "balance_loss_mlp": 1.02647483, + "epoch": 0.3039630627164294, + "flos": 511224470016.0, + "grad_norm": 0.04566645575365512, + "language_loss": 0.84833682, + "learning_rate": 0.0008157645734634809, + "loss": 0.85908556, + "num_input_tokens_seen": 130428848, + "router_z_loss_mlp": 0.48388672, + "step": 1580, + "time_per_iteration": 2.5822741985321045 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01186287, + "balance_loss_mlp": 1.14089203, + "epoch": 0.30415544440169295, + "flos": 1509190841856.0, + "grad_norm": 0.045615209750242004, + "language_loss": 0.76896489, + "learning_rate": 0.000815522957187935, + "loss": 0.78082776, + "num_input_tokens_seen": 130665440, + "router_z_loss_mlp": 0.453125, + "step": 1581, + "time_per_iteration": 4.900806665420532 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01157879, + "balance_loss_mlp": 1.11257935, + "epoch": 0.30434782608695654, + "flos": 1461789772800.0, + "grad_norm": 0.04177274485031814, + "language_loss": 0.73214495, + "learning_rate": 0.0008152812184155132, + "loss": 0.74372375, + "num_input_tokens_seen": 130895248, + "router_z_loss_mlp": 0.45214844, + "step": 1582, + "time_per_iteration": 4.890560150146484 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071245, + "balance_loss_mlp": 1.02329922, + "epoch": 0.3045402077722201, + "flos": 483535044096.0, + "grad_norm": 0.03665669352532136, + "language_loss": 0.84926951, + "learning_rate": 0.000815039357240067, + "loss": 0.85998201, + "num_input_tokens_seen": 130964544, + "router_z_loss_mlp": 0.47924805, + "step": 1583, + "time_per_iteration": 2.655641555786133 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075238, + "balance_loss_mlp": 1.02695799, + "epoch": 0.30473258945748366, + "flos": 544627725312.0, + "grad_norm": 0.03699880598765725, + "language_loss": 0.86035675, + "learning_rate": 0.0008147973737554952, + "loss": 0.87110913, + "num_input_tokens_seen": 131041744, + "router_z_loss_mlp": 0.48266602, + "step": 1584, + "time_per_iteration": 2.8118185997009277 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066041, + "balance_loss_mlp": 1.01754665, + "epoch": 0.3049249711427472, + "flos": 568122669312.0, + "grad_norm": 0.039919187148179, + "language_loss": 0.86646891, + "learning_rate": 0.000814555268055744, + "loss": 0.87712932, + "num_input_tokens_seen": 131108864, + "router_z_loss_mlp": 0.48486328, + "step": 1585, + "time_per_iteration": 2.618649482727051 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067734, + "balance_loss_mlp": 1.01926374, + "epoch": 0.3051173528280108, + "flos": 529290398976.0, + "grad_norm": 0.034961032963054674, + "language_loss": 0.88066852, + "learning_rate": 0.0008143130402348073, + "loss": 0.89134592, + "num_input_tokens_seen": 131181104, + "router_z_loss_mlp": 0.48461914, + "step": 1586, + "time_per_iteration": 2.6645073890686035 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064545, + "balance_loss_mlp": 1.01593137, + "epoch": 0.3053097345132743, + "flos": 587600042496.0, + "grad_norm": 0.03198607314396223, + "language_loss": 0.79707628, + "learning_rate": 0.0008140706903867265, + "loss": 0.80772173, + "num_input_tokens_seen": 131258704, + "router_z_loss_mlp": 0.48608398, + "step": 1587, + "time_per_iteration": 2.772688150405884 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065913, + "balance_loss_mlp": 1.01706147, + "epoch": 0.3055021161985379, + "flos": 608201133312.0, + "grad_norm": 0.03820330265300666, + "language_loss": 0.90882033, + "learning_rate": 0.0008138282186055897, + "loss": 0.91947937, + "num_input_tokens_seen": 131325712, + "router_z_loss_mlp": 0.48803711, + "step": 1588, + "time_per_iteration": 2.6824429035186768 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106751, + "balance_loss_mlp": 1.01851535, + "epoch": 0.3056944978838015, + "flos": 574963128576.0, + "grad_norm": 0.03364087196891663, + "language_loss": 0.83419842, + "learning_rate": 0.0008135856249855331, + "loss": 0.84487349, + "num_input_tokens_seen": 131397568, + "router_z_loss_mlp": 0.48950195, + "step": 1589, + "time_per_iteration": 2.6829729080200195 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065757, + "balance_loss_mlp": 1.0164994, + "epoch": 0.305886879569065, + "flos": 635072076288.0, + "grad_norm": 0.036524553871552005, + "language_loss": 0.90591866, + "learning_rate": 0.0008133429096207398, + "loss": 0.91657621, + "num_input_tokens_seen": 131467632, + "router_z_loss_mlp": 0.4909668, + "step": 1590, + "time_per_iteration": 2.7734742164611816 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135399, + "balance_loss_mlp": 1.08351898, + "epoch": 0.3060792612543286, + "flos": 1372133769216.0, + "grad_norm": 0.023040785082221134, + "language_loss": 0.75312257, + "learning_rate": 0.0008131000726054403, + "loss": 0.76447666, + "num_input_tokens_seen": 131702224, + "router_z_loss_mlp": 0.51953125, + "step": 1591, + "time_per_iteration": 4.964044094085693 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106727, + "balance_loss_mlp": 1.01806068, + "epoch": 0.30627164293959214, + "flos": 519619269888.0, + "grad_norm": 0.029618090290997726, + "language_loss": 0.87174189, + "learning_rate": 0.0008128571140339123, + "loss": 0.88241458, + "num_input_tokens_seen": 131774608, + "router_z_loss_mlp": 0.49121094, + "step": 1592, + "time_per_iteration": 2.6813180446624756 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068394, + "balance_loss_mlp": 1.01942289, + "epoch": 0.3064640246248557, + "flos": 456533843712.0, + "grad_norm": 0.02963099688993501, + "language_loss": 0.87551641, + "learning_rate": 0.0008126140340004805, + "loss": 0.88620031, + "num_input_tokens_seen": 131841216, + "router_z_loss_mlp": 0.48876953, + "step": 1593, + "time_per_iteration": 2.5293447971343994 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064923, + "balance_loss_mlp": 1.01580834, + "epoch": 0.30665640631011926, + "flos": 851609511936.0, + "grad_norm": 0.028917997945976257, + "language_loss": 0.82855684, + "learning_rate": 0.0008123708325995172, + "loss": 0.8392061, + "num_input_tokens_seen": 131937584, + "router_z_loss_mlp": 0.49023438, + "step": 1594, + "time_per_iteration": 3.1976583003997803 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068322, + "balance_loss_mlp": 1.01937473, + "epoch": 0.30684878799538284, + "flos": 759616656384.0, + "grad_norm": 0.02786640270256765, + "language_loss": 0.80270225, + "learning_rate": 0.0008121275099254414, + "loss": 0.81338549, + "num_input_tokens_seen": 132012656, + "router_z_loss_mlp": 0.48901367, + "step": 1595, + "time_per_iteration": 2.9073448181152344 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105895, + "balance_loss_mlp": 1.01069379, + "epoch": 0.3070411696806464, + "flos": 518596652544.0, + "grad_norm": 0.02828411740511225, + "language_loss": 0.89261508, + "learning_rate": 0.0008118840660727194, + "loss": 0.90320462, + "num_input_tokens_seen": 132083728, + "router_z_loss_mlp": 0.48242188, + "step": 1596, + "time_per_iteration": 2.6137096881866455 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105679, + "balance_loss_mlp": 1.00855815, + "epoch": 0.30723355136590996, + "flos": 845791670016.0, + "grad_norm": 0.02807637717187332, + "language_loss": 0.8853125, + "learning_rate": 0.0008116405011358644, + "loss": 0.89588046, + "num_input_tokens_seen": 132170896, + "router_z_loss_mlp": 0.48217773, + "step": 1597, + "time_per_iteration": 3.1528680324554443 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059967, + "balance_loss_mlp": 1.01163971, + "epoch": 0.30742593305117355, + "flos": 467079836160.0, + "grad_norm": 0.032917462624290315, + "language_loss": 0.80716425, + "learning_rate": 0.0008113968152094369, + "loss": 0.81776392, + "num_input_tokens_seen": 132234592, + "router_z_loss_mlp": 0.4831543, + "step": 1598, + "time_per_iteration": 2.5390987396240234 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059252, + "balance_loss_mlp": 1.011235, + "epoch": 0.3076183147364371, + "flos": 687817589760.0, + "grad_norm": 0.03298344899906339, + "language_loss": 0.830042, + "learning_rate": 0.0008111530083880438, + "loss": 0.84063458, + "num_input_tokens_seen": 132314720, + "router_z_loss_mlp": 0.47998047, + "step": 1599, + "time_per_iteration": 2.904327154159546 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059695, + "balance_loss_mlp": 1.01170099, + "epoch": 0.30781069642170067, + "flos": 615180598272.0, + "grad_norm": 0.03364515132561045, + "language_loss": 0.86925042, + "learning_rate": 0.0008109090807663399, + "loss": 0.87984729, + "num_input_tokens_seen": 132388768, + "router_z_loss_mlp": 0.47973633, + "step": 1600, + "time_per_iteration": 2.794553756713867 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059845, + "balance_loss_mlp": 1.01206601, + "epoch": 0.3080030781069642, + "flos": 591509710080.0, + "grad_norm": 0.029450986393402313, + "language_loss": 0.89288217, + "learning_rate": 0.0008106650324390257, + "loss": 0.90348059, + "num_input_tokens_seen": 132472544, + "router_z_loss_mlp": 0.47753906, + "step": 1601, + "time_per_iteration": 2.825118064880371 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055341, + "balance_loss_mlp": 1.00744271, + "epoch": 0.3081954597922278, + "flos": 563691972096.0, + "grad_norm": 0.03217567830931305, + "language_loss": 0.82333392, + "learning_rate": 0.0008104208635008493, + "loss": 0.83388734, + "num_input_tokens_seen": 132541968, + "router_z_loss_mlp": 0.47875977, + "step": 1602, + "time_per_iteration": 2.7727856636047363 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01057631, + "balance_loss_mlp": 1.0099231, + "epoch": 0.3083878414774913, + "flos": 448762140672.0, + "grad_norm": 0.03928010080840531, + "language_loss": 0.82422024, + "learning_rate": 0.0008101765740466058, + "loss": 0.83479655, + "num_input_tokens_seen": 132606976, + "router_z_loss_mlp": 0.47680664, + "step": 1603, + "time_per_iteration": 2.5764591693878174 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106031, + "balance_loss_mlp": 1.01272202, + "epoch": 0.3085802231627549, + "flos": 494545685760.0, + "grad_norm": 0.03880240670965016, + "language_loss": 0.84925759, + "learning_rate": 0.0008099321641711364, + "loss": 0.85986066, + "num_input_tokens_seen": 132677984, + "router_z_loss_mlp": 0.47558594, + "step": 1604, + "time_per_iteration": 2.6562154293060303 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059833, + "balance_loss_mlp": 1.01262641, + "epoch": 0.3087726048480185, + "flos": 488690905344.0, + "grad_norm": 0.030963234073246262, + "language_loss": 0.84138477, + "learning_rate": 0.0008096876339693295, + "loss": 0.85198307, + "num_input_tokens_seen": 132749136, + "router_z_loss_mlp": 0.47167969, + "step": 1605, + "time_per_iteration": 2.6818747520446777 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01057926, + "balance_loss_mlp": 1.01083875, + "epoch": 0.308964986533282, + "flos": 731888346624.0, + "grad_norm": 0.03606871420254603, + "language_loss": 0.82584137, + "learning_rate": 0.0008094429835361206, + "loss": 0.83642066, + "num_input_tokens_seen": 132823824, + "router_z_loss_mlp": 0.47045898, + "step": 1606, + "time_per_iteration": 2.940202236175537 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059317, + "balance_loss_mlp": 1.01211011, + "epoch": 0.3091573682185456, + "flos": 606516535296.0, + "grad_norm": 0.033324674351776856, + "language_loss": 0.86802429, + "learning_rate": 0.0008091982129664908, + "loss": 0.87861747, + "num_input_tokens_seen": 132895936, + "router_z_loss_mlp": 0.47167969, + "step": 1607, + "time_per_iteration": 2.7152366638183594 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055819, + "balance_loss_mlp": 1.00858819, + "epoch": 0.30934974990380915, + "flos": 461307681024.0, + "grad_norm": 0.0316485976101594, + "language_loss": 0.83554763, + "learning_rate": 0.0008089533223554687, + "loss": 0.84610581, + "num_input_tokens_seen": 132968960, + "router_z_loss_mlp": 0.47192383, + "step": 1608, + "time_per_iteration": 2.73236083984375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054105, + "balance_loss_mlp": 1.00692201, + "epoch": 0.30954213158907273, + "flos": 554568117504.0, + "grad_norm": 0.03240022060424308, + "language_loss": 0.85798776, + "learning_rate": 0.0008087083117981294, + "loss": 0.86852884, + "num_input_tokens_seen": 133048448, + "router_z_loss_mlp": 0.47143555, + "step": 1609, + "time_per_iteration": 2.8992979526519775 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052885, + "balance_loss_mlp": 1.00543988, + "epoch": 0.30973451327433627, + "flos": 554114161920.0, + "grad_norm": 0.03509024741452312, + "language_loss": 0.88937026, + "learning_rate": 0.0008084631813895943, + "loss": 0.89989913, + "num_input_tokens_seen": 133121680, + "router_z_loss_mlp": 0.47412109, + "step": 1610, + "time_per_iteration": 2.8113343715667725 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104915, + "balance_loss_mlp": 1.00168037, + "epoch": 0.30992689495959985, + "flos": 566763714816.0, + "grad_norm": 0.03310460584308608, + "language_loss": 0.8446725, + "learning_rate": 0.0008082179312250315, + "loss": 0.85516399, + "num_input_tokens_seen": 133190176, + "router_z_loss_mlp": 0.47436523, + "step": 1611, + "time_per_iteration": 2.6286494731903076 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146973, + "balance_loss_mlp": 1.09509277, + "epoch": 0.3101192766448634, + "flos": 1445562998784.0, + "grad_norm": 0.022501740699277736, + "language_loss": 0.79855847, + "learning_rate": 0.0008079725613996555, + "loss": 0.8100282, + "num_input_tokens_seen": 133420512, + "router_z_loss_mlp": 0.51953125, + "step": 1612, + "time_per_iteration": 4.877255439758301 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01132568, + "balance_loss_mlp": 1.08087921, + "epoch": 0.31031165833012697, + "flos": 1535130541056.0, + "grad_norm": 0.020576462480935535, + "language_loss": 0.76629329, + "learning_rate": 0.0008077270720087273, + "loss": 0.777619, + "num_input_tokens_seen": 133651984, + "router_z_loss_mlp": 0.51757812, + "step": 1613, + "time_per_iteration": 5.064774751663208 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050941, + "balance_loss_mlp": 1.00363839, + "epoch": 0.31050404001539056, + "flos": 993633862656.0, + "grad_norm": 0.03245007970491877, + "language_loss": 0.83116508, + "learning_rate": 0.0008074814631475545, + "loss": 0.84167451, + "num_input_tokens_seen": 133741648, + "router_z_loss_mlp": 0.47265625, + "step": 1614, + "time_per_iteration": 3.322155714035034 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054745, + "balance_loss_mlp": 1.00741875, + "epoch": 0.3106964217006541, + "flos": 446973530112.0, + "grad_norm": 0.03235075185089818, + "language_loss": 0.80034411, + "learning_rate": 0.0008072357349114907, + "loss": 0.81089151, + "num_input_tokens_seen": 133813344, + "router_z_loss_mlp": 0.47290039, + "step": 1615, + "time_per_iteration": 2.699772596359253 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01056016, + "balance_loss_mlp": 1.00880885, + "epoch": 0.3108888033859177, + "flos": 511495678464.0, + "grad_norm": 0.0340106704308988, + "language_loss": 0.89603639, + "learning_rate": 0.0008069898873959363, + "loss": 0.90659654, + "num_input_tokens_seen": 133884192, + "router_z_loss_mlp": 0.47167969, + "step": 1616, + "time_per_iteration": 2.680640459060669 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051359, + "balance_loss_mlp": 1.0043664, + "epoch": 0.3110811850711812, + "flos": 521779210752.0, + "grad_norm": 0.029395602971080924, + "language_loss": 0.86344647, + "learning_rate": 0.0008067439206963375, + "loss": 0.87396008, + "num_input_tokens_seen": 133954848, + "router_z_loss_mlp": 0.46948242, + "step": 1617, + "time_per_iteration": 2.6484971046447754 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055386, + "balance_loss_mlp": 1.00844121, + "epoch": 0.3112735667564448, + "flos": 687731073792.0, + "grad_norm": 0.03406090033110643, + "language_loss": 0.87673247, + "learning_rate": 0.0008064978349081873, + "loss": 0.88728631, + "num_input_tokens_seen": 134031824, + "router_z_loss_mlp": 0.46899414, + "step": 1618, + "time_per_iteration": 2.92702579498291 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01056578, + "balance_loss_mlp": 1.00965679, + "epoch": 0.31146594844170833, + "flos": 534166303488.0, + "grad_norm": 0.030256910717709223, + "language_loss": 0.87292403, + "learning_rate": 0.0008062516301270245, + "loss": 0.88348979, + "num_input_tokens_seen": 134104480, + "router_z_loss_mlp": 0.46875, + "step": 1619, + "time_per_iteration": 2.7301478385925293 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055492, + "balance_loss_mlp": 1.00859511, + "epoch": 0.3116583301269719, + "flos": 680842982400.0, + "grad_norm": 0.027867683897015817, + "language_loss": 0.88937479, + "learning_rate": 0.0008060053064484343, + "loss": 0.89992964, + "num_input_tokens_seen": 134185632, + "router_z_loss_mlp": 0.46850586, + "step": 1620, + "time_per_iteration": 2.947906017303467 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048804, + "balance_loss_mlp": 1.00202632, + "epoch": 0.31185071181223545, + "flos": 587330779392.0, + "grad_norm": 0.03167203134142694, + "language_loss": 0.86095911, + "learning_rate": 0.0008057588639680482, + "loss": 0.87144709, + "num_input_tokens_seen": 134261600, + "router_z_loss_mlp": 0.46728516, + "step": 1621, + "time_per_iteration": 2.7836551666259766 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104944, + "balance_loss_mlp": 1.00282919, + "epoch": 0.31204309349749904, + "flos": 726658608384.0, + "grad_norm": 0.037979301866738396, + "language_loss": 0.83855367, + "learning_rate": 0.0008055123027815434, + "loss": 0.84904802, + "num_input_tokens_seen": 134334368, + "router_z_loss_mlp": 0.46557617, + "step": 1622, + "time_per_iteration": 2.9263358116149902 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051144, + "balance_loss_mlp": 1.00455689, + "epoch": 0.3122354751827626, + "flos": 577895865600.0, + "grad_norm": 0.032507776226150094, + "language_loss": 0.85607505, + "learning_rate": 0.0008052656229846436, + "loss": 0.86658645, + "num_input_tokens_seen": 134403824, + "router_z_loss_mlp": 0.46533203, + "step": 1623, + "time_per_iteration": 2.662386894226074 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051831, + "balance_loss_mlp": 1.00514877, + "epoch": 0.31242785686802615, + "flos": 577029750528.0, + "grad_norm": 0.03513403942618559, + "language_loss": 0.91195071, + "learning_rate": 0.0008050188246731182, + "loss": 0.92246902, + "num_input_tokens_seen": 134471296, + "router_z_loss_mlp": 0.46630859, + "step": 1624, + "time_per_iteration": 2.710176467895508 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052884, + "balance_loss_mlp": 1.00624907, + "epoch": 0.31262023855328974, + "flos": 738197082624.0, + "grad_norm": 0.0324646036152644, + "language_loss": 0.82931978, + "learning_rate": 0.0008047719079427834, + "loss": 0.83984858, + "num_input_tokens_seen": 134551360, + "router_z_loss_mlp": 0.46582031, + "step": 1625, + "time_per_iteration": 2.970287561416626 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082729, + "balance_loss_mlp": 1.03533173, + "epoch": 0.3128126202385533, + "flos": 1562594445312.0, + "grad_norm": 0.01743050972952843, + "language_loss": 0.74351704, + "learning_rate": 0.0008045248728895, + "loss": 0.75434434, + "num_input_tokens_seen": 134761328, + "router_z_loss_mlp": 0.47363281, + "step": 1626, + "time_per_iteration": 4.816533088684082 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053999, + "balance_loss_mlp": 1.0071733, + "epoch": 0.31300500192381686, + "flos": 515943872256.0, + "grad_norm": 0.030770809254638827, + "language_loss": 0.86711371, + "learning_rate": 0.0008042777196091757, + "loss": 0.87765372, + "num_input_tokens_seen": 134833136, + "router_z_loss_mlp": 0.46777344, + "step": 1627, + "time_per_iteration": 2.7191882133483887 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01057531, + "balance_loss_mlp": 1.01072919, + "epoch": 0.3131973836090804, + "flos": 527662181376.0, + "grad_norm": 0.031150181208545357, + "language_loss": 0.82488692, + "learning_rate": 0.0008040304481977643, + "loss": 0.83546221, + "num_input_tokens_seen": 134904352, + "router_z_loss_mlp": 0.4675293, + "step": 1628, + "time_per_iteration": 2.706782579421997 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01057336, + "balance_loss_mlp": 1.01065385, + "epoch": 0.313389765294344, + "flos": 824210736384.0, + "grad_norm": 0.032636383561425994, + "language_loss": 0.87568998, + "learning_rate": 0.0008037830587512649, + "loss": 0.88626337, + "num_input_tokens_seen": 134984880, + "router_z_loss_mlp": 0.46630859, + "step": 1629, + "time_per_iteration": 3.0928542613983154 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054937, + "balance_loss_mlp": 1.00820696, + "epoch": 0.31358214697960757, + "flos": 394703359488.0, + "grad_norm": 0.03241768310332359, + "language_loss": 0.79631239, + "learning_rate": 0.0008035355513657224, + "loss": 0.80686176, + "num_input_tokens_seen": 135047456, + "router_z_loss_mlp": 0.46679688, + "step": 1630, + "time_per_iteration": 2.449666738510132 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054798, + "balance_loss_mlp": 1.00806797, + "epoch": 0.3137745286648711, + "flos": 573098695680.0, + "grad_norm": 0.0293939817515363, + "language_loss": 0.93494189, + "learning_rate": 0.0008032879261372279, + "loss": 0.94548988, + "num_input_tokens_seen": 135124256, + "router_z_loss_mlp": 0.46679688, + "step": 1631, + "time_per_iteration": 2.766951084136963 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068432, + "balance_loss_mlp": 1.02256012, + "epoch": 0.3139669103501347, + "flos": 1501632021504.0, + "grad_norm": 0.011791019456215185, + "language_loss": 0.79635841, + "learning_rate": 0.0008030401831619178, + "loss": 0.80704272, + "num_input_tokens_seen": 135353024, + "router_z_loss_mlp": 0.45800781, + "step": 1632, + "time_per_iteration": 5.585620403289795 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050843, + "balance_loss_mlp": 1.00425589, + "epoch": 0.3141592920353982, + "flos": 526359607296.0, + "grad_norm": 0.030163528949794682, + "language_loss": 0.87607086, + "learning_rate": 0.0008027923225359748, + "loss": 0.88657928, + "num_input_tokens_seen": 135422464, + "router_z_loss_mlp": 0.46533203, + "step": 1633, + "time_per_iteration": 2.607407808303833 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105517, + "balance_loss_mlp": 1.0084641, + "epoch": 0.3143516737206618, + "flos": 594388012032.0, + "grad_norm": 0.030785944321789945, + "language_loss": 0.88644683, + "learning_rate": 0.0008025443443556267, + "loss": 0.89699847, + "num_input_tokens_seen": 135490928, + "router_z_loss_mlp": 0.46655273, + "step": 1634, + "time_per_iteration": 2.704568862915039 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053981, + "balance_loss_mlp": 1.00756085, + "epoch": 0.31454405540592534, + "flos": 649680347904.0, + "grad_norm": 0.028625636333363444, + "language_loss": 0.88813668, + "learning_rate": 0.000802296248717147, + "loss": 0.89867646, + "num_input_tokens_seen": 135576288, + "router_z_loss_mlp": 0.46362305, + "step": 1635, + "time_per_iteration": 2.914228916168213 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051205, + "balance_loss_mlp": 1.00461841, + "epoch": 0.3147364370911889, + "flos": 644070531072.0, + "grad_norm": 0.032412817231273386, + "language_loss": 0.79727387, + "learning_rate": 0.0008020480357168554, + "loss": 0.80778593, + "num_input_tokens_seen": 135652320, + "router_z_loss_mlp": 0.46533203, + "step": 1636, + "time_per_iteration": 2.8196966648101807 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051741, + "balance_loss_mlp": 1.00505865, + "epoch": 0.31492881877645246, + "flos": 472821855744.0, + "grad_norm": 0.028828485286514015, + "language_loss": 0.88662213, + "learning_rate": 0.0008017997054511165, + "loss": 0.89713949, + "num_input_tokens_seen": 135719632, + "router_z_loss_mlp": 0.46630859, + "step": 1637, + "time_per_iteration": 2.6545960903167725 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051567, + "balance_loss_mlp": 1.00486124, + "epoch": 0.31512120046171604, + "flos": 630630685440.0, + "grad_norm": 0.03463883423234526, + "language_loss": 0.86238796, + "learning_rate": 0.0008015512580163407, + "loss": 0.87290359, + "num_input_tokens_seen": 135796544, + "router_z_loss_mlp": 0.46655273, + "step": 1638, + "time_per_iteration": 2.775726795196533 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050883, + "balance_loss_mlp": 1.00429583, + "epoch": 0.31531358214697963, + "flos": 705054342144.0, + "grad_norm": 0.0328972983749375, + "language_loss": 0.81582069, + "learning_rate": 0.0008013026935089838, + "loss": 0.82632947, + "num_input_tokens_seen": 135871344, + "router_z_loss_mlp": 0.46533203, + "step": 1639, + "time_per_iteration": 2.859405040740967 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048793, + "balance_loss_mlp": 1.00182474, + "epoch": 0.31550596383224316, + "flos": 573632364288.0, + "grad_norm": 0.03266078051512415, + "language_loss": 0.84787768, + "learning_rate": 0.0008010540120255472, + "loss": 0.85836554, + "num_input_tokens_seen": 135944320, + "router_z_loss_mlp": 0.46923828, + "step": 1640, + "time_per_iteration": 2.654087781906128 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051523, + "balance_loss_mlp": 1.00457835, + "epoch": 0.31569834551750675, + "flos": 659513815296.0, + "grad_norm": 0.0373471738494659, + "language_loss": 0.87093472, + "learning_rate": 0.0008008052136625774, + "loss": 0.88144994, + "num_input_tokens_seen": 136019456, + "router_z_loss_mlp": 0.46899414, + "step": 1641, + "time_per_iteration": 2.7806570529937744 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054228, + "balance_loss_mlp": 1.00730693, + "epoch": 0.3158907272027703, + "flos": 567404308224.0, + "grad_norm": 0.028103315573088077, + "language_loss": 0.87394774, + "learning_rate": 0.0008005562985166666, + "loss": 0.88449007, + "num_input_tokens_seen": 136091232, + "router_z_loss_mlp": 0.46875, + "step": 1642, + "time_per_iteration": 2.6866798400878906 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053754, + "balance_loss_mlp": 1.00699973, + "epoch": 0.31608310888803387, + "flos": 537973903872.0, + "grad_norm": 0.024374019828786602, + "language_loss": 0.85555339, + "learning_rate": 0.0008003072666844524, + "loss": 0.86609089, + "num_input_tokens_seen": 136165088, + "router_z_loss_mlp": 0.46704102, + "step": 1643, + "time_per_iteration": 2.684518337249756 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055077, + "balance_loss_mlp": 1.00856149, + "epoch": 0.3162754905732974, + "flos": 487640097792.0, + "grad_norm": 0.037314537224785074, + "language_loss": 0.8350842, + "learning_rate": 0.0008000581182626173, + "loss": 0.84563494, + "num_input_tokens_seen": 136230368, + "router_z_loss_mlp": 0.46459961, + "step": 1644, + "time_per_iteration": 2.5574259757995605 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051572, + "balance_loss_mlp": 1.00481844, + "epoch": 0.316467872258561, + "flos": 531096506112.0, + "grad_norm": 0.03327277300757214, + "language_loss": 0.87005818, + "learning_rate": 0.0007998088533478894, + "loss": 0.88057387, + "num_input_tokens_seen": 136302512, + "router_z_loss_mlp": 0.46704102, + "step": 1645, + "time_per_iteration": 2.6987338066101074 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055532, + "balance_loss_mlp": 1.00894499, + "epoch": 0.3166602539438245, + "flos": 444414068736.0, + "grad_norm": 0.040202418156990175, + "language_loss": 0.85042381, + "learning_rate": 0.000799559472037042, + "loss": 0.8609792, + "num_input_tokens_seen": 136368064, + "router_z_loss_mlp": 0.46533203, + "step": 1646, + "time_per_iteration": 2.6219563484191895 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01056055, + "balance_loss_mlp": 1.00958765, + "epoch": 0.3168526356290881, + "flos": 647103389952.0, + "grad_norm": 0.026601574185044653, + "language_loss": 0.8823331, + "learning_rate": 0.0007993099744268932, + "loss": 0.89289367, + "num_input_tokens_seen": 136451520, + "router_z_loss_mlp": 0.46411133, + "step": 1647, + "time_per_iteration": 2.8902037143707275 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054808, + "balance_loss_mlp": 1.00817358, + "epoch": 0.3170450173143517, + "flos": 587258847744.0, + "grad_norm": 0.03281471441230887, + "language_loss": 0.8855083, + "learning_rate": 0.000799060360614307, + "loss": 0.89605635, + "num_input_tokens_seen": 136521184, + "router_z_loss_mlp": 0.46582031, + "step": 1648, + "time_per_iteration": 2.694293975830078 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055827, + "balance_loss_mlp": 1.00945473, + "epoch": 0.3172373989996152, + "flos": 828574359552.0, + "grad_norm": 0.03046931045185914, + "language_loss": 0.84284711, + "learning_rate": 0.0007988106306961917, + "loss": 0.85340536, + "num_input_tokens_seen": 136612592, + "router_z_loss_mlp": 0.46313477, + "step": 1649, + "time_per_iteration": 3.121788501739502 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01058474, + "balance_loss_mlp": 1.01195896, + "epoch": 0.3174297806848788, + "flos": 528434977536.0, + "grad_norm": 0.03563880571664149, + "language_loss": 0.85299373, + "learning_rate": 0.0007985607847695014, + "loss": 0.8635785, + "num_input_tokens_seen": 136684336, + "router_z_loss_mlp": 0.46459961, + "step": 1650, + "time_per_iteration": 2.625356912612915 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047323, + "balance_loss_mlp": 1.00107014, + "epoch": 0.31762216237014235, + "flos": 714482452992.0, + "grad_norm": 0.030498079123472206, + "language_loss": 0.83133662, + "learning_rate": 0.0007983108229312345, + "loss": 0.84180987, + "num_input_tokens_seen": 136766400, + "router_z_loss_mlp": 0.46191406, + "step": 1651, + "time_per_iteration": 2.894109010696411 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049755, + "balance_loss_mlp": 1.00362098, + "epoch": 0.31781454405540593, + "flos": 484800679680.0, + "grad_norm": 0.03387492306443982, + "language_loss": 0.86931884, + "learning_rate": 0.0007980607452784351, + "loss": 0.87981641, + "num_input_tokens_seen": 136834016, + "router_z_loss_mlp": 0.46069336, + "step": 1652, + "time_per_iteration": 2.5593390464782715 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048407, + "balance_loss_mlp": 1.00236845, + "epoch": 0.31800692574066947, + "flos": 549804973824.0, + "grad_norm": 0.04030851184116312, + "language_loss": 0.90997875, + "learning_rate": 0.0007978105519081919, + "loss": 0.92046285, + "num_input_tokens_seen": 136906288, + "router_z_loss_mlp": 0.4597168, + "step": 1653, + "time_per_iteration": 2.683809995651245 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045957, + "balance_loss_mlp": 0.99982309, + "epoch": 0.31819930742593305, + "flos": 517917175296.0, + "grad_norm": 0.033294821801319624, + "language_loss": 0.88831019, + "learning_rate": 0.0007975602429176385, + "loss": 0.89876974, + "num_input_tokens_seen": 136972416, + "router_z_loss_mlp": 0.46069336, + "step": 1654, + "time_per_iteration": 2.5786075592041016 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104835, + "balance_loss_mlp": 1.00238276, + "epoch": 0.31839168911119664, + "flos": 456970302720.0, + "grad_norm": 0.028947480678153642, + "language_loss": 0.82318926, + "learning_rate": 0.0007973098184039536, + "loss": 0.83367276, + "num_input_tokens_seen": 137044576, + "router_z_loss_mlp": 0.45898438, + "step": 1655, + "time_per_iteration": 2.651188611984253 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010453, + "balance_loss_mlp": 0.99921381, + "epoch": 0.3185840707964602, + "flos": 627296482560.0, + "grad_norm": 0.03276090001573999, + "language_loss": 0.8731916, + "learning_rate": 0.0007970592784643602, + "loss": 0.88364458, + "num_input_tokens_seen": 137125120, + "router_z_loss_mlp": 0.46020508, + "step": 1656, + "time_per_iteration": 2.8683595657348633 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045873, + "balance_loss_mlp": 0.99976265, + "epoch": 0.31877645248172376, + "flos": 568541631744.0, + "grad_norm": 0.035945607337745746, + "language_loss": 0.85986471, + "learning_rate": 0.0007968086231961272, + "loss": 0.87032342, + "num_input_tokens_seen": 137195344, + "router_z_loss_mlp": 0.46044922, + "step": 1657, + "time_per_iteration": 2.642733335494995 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047205, + "balance_loss_mlp": 1.00119007, + "epoch": 0.3189688341669873, + "flos": 490553392896.0, + "grad_norm": 0.04377426906704287, + "language_loss": 0.84065533, + "learning_rate": 0.0007965578526965671, + "loss": 0.85112733, + "num_input_tokens_seen": 137261040, + "router_z_loss_mlp": 0.45947266, + "step": 1658, + "time_per_iteration": 2.5638930797576904 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049099, + "balance_loss_mlp": 1.00291717, + "epoch": 0.3191612158522509, + "flos": 577381638912.0, + "grad_norm": 0.02931224295785387, + "language_loss": 0.86766565, + "learning_rate": 0.0007963069670630377, + "loss": 0.87815666, + "num_input_tokens_seen": 137334400, + "router_z_loss_mlp": 0.46118164, + "step": 1659, + "time_per_iteration": 2.7154479026794434 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051177, + "balance_loss_mlp": 1.00506639, + "epoch": 0.3193535975375144, + "flos": 539193852672.0, + "grad_norm": 0.03496177903686506, + "language_loss": 0.88776976, + "learning_rate": 0.0007960559663929416, + "loss": 0.89828151, + "num_input_tokens_seen": 137405344, + "router_z_loss_mlp": 0.46044922, + "step": 1660, + "time_per_iteration": 2.6322021484375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054675, + "balance_loss_mlp": 1.00868368, + "epoch": 0.319545979222778, + "flos": 735628872960.0, + "grad_norm": 0.030221795014758104, + "language_loss": 0.88154632, + "learning_rate": 0.0007958048507837259, + "loss": 0.89209306, + "num_input_tokens_seen": 137486016, + "router_z_loss_mlp": 0.45922852, + "step": 1661, + "time_per_iteration": 2.9221389293670654 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105245, + "balance_loss_mlp": 1.00648332, + "epoch": 0.31973836090804153, + "flos": 765768890112.0, + "grad_norm": 0.037416739988226255, + "language_loss": 0.87668484, + "learning_rate": 0.0007955536203328822, + "loss": 0.88720942, + "num_input_tokens_seen": 137562304, + "router_z_loss_mlp": 0.45898438, + "step": 1662, + "time_per_iteration": 2.9018445014953613 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048244, + "balance_loss_mlp": 1.00184774, + "epoch": 0.3199307425933051, + "flos": 561742968576.0, + "grad_norm": 0.03025687936293395, + "language_loss": 0.84124553, + "learning_rate": 0.0007953022751379469, + "loss": 0.85172796, + "num_input_tokens_seen": 137639248, + "router_z_loss_mlp": 0.46337891, + "step": 1663, + "time_per_iteration": 2.781562566757202 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085701, + "balance_loss_mlp": 1.03906643, + "epoch": 0.3201231242785687, + "flos": 752672184576.0, + "grad_norm": 0.03881407073457837, + "language_loss": 0.82717097, + "learning_rate": 0.000795050815296501, + "loss": 0.83802795, + "num_input_tokens_seen": 137718256, + "router_z_loss_mlp": 0.46582031, + "step": 1664, + "time_per_iteration": 2.9950287342071533 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050837, + "balance_loss_mlp": 1.00446498, + "epoch": 0.32031550596383224, + "flos": 497385103872.0, + "grad_norm": 0.02713287522590179, + "language_loss": 0.93810016, + "learning_rate": 0.0007947992409061695, + "loss": 0.94860852, + "num_input_tokens_seen": 137785216, + "router_z_loss_mlp": 0.46313477, + "step": 1665, + "time_per_iteration": 2.583118438720703 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01056564, + "balance_loss_mlp": 1.01045382, + "epoch": 0.3205078876490958, + "flos": 732875970816.0, + "grad_norm": 0.03263285268561658, + "language_loss": 0.86165506, + "learning_rate": 0.0007945475520646226, + "loss": 0.8722207, + "num_input_tokens_seen": 137863424, + "router_z_loss_mlp": 0.46044922, + "step": 1666, + "time_per_iteration": 2.903190851211548 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059471, + "balance_loss_mlp": 1.01324141, + "epoch": 0.32070026933435936, + "flos": 550475702784.0, + "grad_norm": 0.03801033406135743, + "language_loss": 0.85650241, + "learning_rate": 0.0007942957488695743, + "loss": 0.86709714, + "num_input_tokens_seen": 137930384, + "router_z_loss_mlp": 0.46166992, + "step": 1667, + "time_per_iteration": 2.661292791366577 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059024, + "balance_loss_mlp": 1.01277089, + "epoch": 0.32089265101962294, + "flos": 746685201408.0, + "grad_norm": 0.031638418068872444, + "language_loss": 0.81749988, + "learning_rate": 0.0007940438314187833, + "loss": 0.82809013, + "num_input_tokens_seen": 138017200, + "router_z_loss_mlp": 0.46191406, + "step": 1668, + "time_per_iteration": 3.0293474197387695 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01057511, + "balance_loss_mlp": 1.01144862, + "epoch": 0.3210850327048865, + "flos": 495196972800.0, + "grad_norm": 0.034120041175176606, + "language_loss": 0.81371748, + "learning_rate": 0.0007937917998100529, + "loss": 0.82429266, + "num_input_tokens_seen": 138084048, + "router_z_loss_mlp": 0.45996094, + "step": 1669, + "time_per_iteration": 2.5822434425354004 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.08258255, + "balance_loss_mlp": 8.0, + "epoch": 0.32127741439015006, + "flos": 531673916160.0, + "grad_norm": 0.043058724234977634, + "language_loss": 0.81425405, + "learning_rate": 0.0007935396541412302, + "loss": 0.89683664, + "num_input_tokens_seen": 138153280, + "router_z_loss_mlp": 2.58203125, + "step": 1670, + "time_per_iteration": 2.5968360900878906 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0830899, + "balance_loss_mlp": 8.0, + "epoch": 0.3214697960754136, + "flos": 502224069888.0, + "grad_norm": 0.0363513778225316, + "language_loss": 0.87401152, + "learning_rate": 0.0007932873945102068, + "loss": 0.9571014, + "num_input_tokens_seen": 138222320, + "router_z_loss_mlp": 3.0859375, + "step": 1671, + "time_per_iteration": 2.582617998123169 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.08312805, + "balance_loss_mlp": 8.0, + "epoch": 0.3216621777606772, + "flos": 1386404736768.0, + "grad_norm": 0.003686648730821959, + "language_loss": 0.75761777, + "learning_rate": 0.0007930350210149188, + "loss": 0.84074581, + "num_input_tokens_seen": 138449488, + "router_z_loss_mlp": 3.125, + "step": 1672, + "time_per_iteration": 4.829998970031738 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.08311279, + "balance_loss_mlp": 8.0, + "epoch": 0.32185455944594077, + "flos": 572635991808.0, + "grad_norm": 0.030782594356869853, + "language_loss": 0.88089788, + "learning_rate": 0.0007927825337533461, + "loss": 0.96401072, + "num_input_tokens_seen": 138522496, + "router_z_loss_mlp": 3.109375, + "step": 1673, + "time_per_iteration": 2.6633598804473877 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.08310516, + "balance_loss_mlp": 8.0, + "epoch": 0.3220469411312043, + "flos": 544937817600.0, + "grad_norm": 0.040711103761993876, + "language_loss": 0.86732781, + "learning_rate": 0.0007925299328235131, + "loss": 0.95043296, + "num_input_tokens_seen": 138590096, + "router_z_loss_mlp": 3.1015625, + "step": 1674, + "time_per_iteration": 2.634169578552246 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.08307083, + "balance_loss_mlp": 8.0, + "epoch": 0.3222393228164679, + "flos": 492162168576.0, + "grad_norm": 0.03938689136463286, + "language_loss": 0.86802006, + "learning_rate": 0.000792277218323488, + "loss": 0.95109081, + "num_input_tokens_seen": 138658224, + "router_z_loss_mlp": 3.06640625, + "step": 1675, + "time_per_iteration": 2.5893990993499756 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.08270843, + "balance_loss_mlp": 8.0, + "epoch": 0.3224317045017314, + "flos": 491363127552.0, + "grad_norm": 0.03386575094399551, + "language_loss": 0.86165106, + "learning_rate": 0.0007920243903513833, + "loss": 0.94435954, + "num_input_tokens_seen": 138722864, + "router_z_loss_mlp": 2.7109375, + "step": 1676, + "time_per_iteration": 2.5602426528930664 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02321873, + "balance_loss_mlp": 2.26942062, + "epoch": 0.322624086186995, + "flos": 576871302912.0, + "grad_norm": 0.12910494226103245, + "language_loss": 0.85448408, + "learning_rate": 0.0007917714490053556, + "loss": 0.87770277, + "num_input_tokens_seen": 138791472, + "router_z_loss_mlp": 0.52539062, + "step": 1677, + "time_per_iteration": 2.6558380126953125 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071536, + "balance_loss_mlp": 1.02492559, + "epoch": 0.32281646787225854, + "flos": 630572359680.0, + "grad_norm": 0.04049679721352166, + "language_loss": 0.87627459, + "learning_rate": 0.0007915183943836055, + "loss": 0.88698995, + "num_input_tokens_seen": 138873424, + "router_z_loss_mlp": 0.46557617, + "step": 1678, + "time_per_iteration": 2.898658037185669 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072761, + "balance_loss_mlp": 1.02631712, + "epoch": 0.3230088495575221, + "flos": 782808311040.0, + "grad_norm": 0.04272749105284559, + "language_loss": 0.85738349, + "learning_rate": 0.0007912652265843773, + "loss": 0.86811107, + "num_input_tokens_seen": 138956880, + "router_z_loss_mlp": 0.46386719, + "step": 1679, + "time_per_iteration": 3.049938917160034 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082142, + "balance_loss_mlp": 1.03557873, + "epoch": 0.3232012312427857, + "flos": 537201107712.0, + "grad_norm": 0.04201967602882564, + "language_loss": 0.83624417, + "learning_rate": 0.0007910119457059597, + "loss": 0.84706557, + "num_input_tokens_seen": 139031296, + "router_z_loss_mlp": 0.46508789, + "step": 1680, + "time_per_iteration": 2.7126853466033936 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108585, + "balance_loss_mlp": 1.03895342, + "epoch": 0.32339361292804925, + "flos": 706233461760.0, + "grad_norm": 0.044345030126194285, + "language_loss": 0.81981564, + "learning_rate": 0.0007907585518466849, + "loss": 0.83067411, + "num_input_tokens_seen": 139109776, + "router_z_loss_mlp": 0.46850586, + "step": 1681, + "time_per_iteration": 2.9758992195129395 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088847, + "balance_loss_mlp": 1.0419023, + "epoch": 0.32358599461331283, + "flos": 453257966592.0, + "grad_norm": 0.04210474159896445, + "language_loss": 0.91257876, + "learning_rate": 0.000790505045104929, + "loss": 0.92346722, + "num_input_tokens_seen": 139174736, + "router_z_loss_mlp": 0.46899414, + "step": 1682, + "time_per_iteration": 2.5105395317077637 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090368, + "balance_loss_mlp": 1.04337561, + "epoch": 0.32377837629857636, + "flos": 602092641024.0, + "grad_norm": 0.04465728550727914, + "language_loss": 0.88834655, + "learning_rate": 0.0007902514255791125, + "loss": 0.89925027, + "num_input_tokens_seen": 139252064, + "router_z_loss_mlp": 0.46948242, + "step": 1683, + "time_per_iteration": 2.7610387802124023 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089116, + "balance_loss_mlp": 1.04190934, + "epoch": 0.32397075798383995, + "flos": 808899654912.0, + "grad_norm": 0.04108658803287063, + "language_loss": 0.89801908, + "learning_rate": 0.0007899976933676986, + "loss": 0.90891027, + "num_input_tokens_seen": 139333328, + "router_z_loss_mlp": 0.47167969, + "step": 1684, + "time_per_iteration": 2.963387966156006 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089307, + "balance_loss_mlp": 1.04205263, + "epoch": 0.3241631396691035, + "flos": 602793505536.0, + "grad_norm": 0.046655842402160155, + "language_loss": 0.89137548, + "learning_rate": 0.0007897438485691955, + "loss": 0.90226853, + "num_input_tokens_seen": 139400976, + "router_z_loss_mlp": 0.47216797, + "step": 1685, + "time_per_iteration": 2.675910711288452 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079467, + "balance_loss_mlp": 1.03195012, + "epoch": 0.32435552135436707, + "flos": 475177182720.0, + "grad_norm": 0.045429866607221585, + "language_loss": 0.84063458, + "learning_rate": 0.0007894898912821542, + "loss": 0.85142922, + "num_input_tokens_seen": 139465664, + "router_z_loss_mlp": 0.47485352, + "step": 1686, + "time_per_iteration": 2.530951976776123 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077585, + "balance_loss_mlp": 1.02980566, + "epoch": 0.3245479030396306, + "flos": 539220097536.0, + "grad_norm": 0.03833008440392265, + "language_loss": 0.88029444, + "learning_rate": 0.0007892358216051695, + "loss": 0.89107037, + "num_input_tokens_seen": 139541984, + "router_z_loss_mlp": 0.47753906, + "step": 1687, + "time_per_iteration": 2.7729742527008057 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067846, + "balance_loss_mlp": 1.01963735, + "epoch": 0.3247402847248942, + "flos": 548697785856.0, + "grad_norm": 0.039082280310976325, + "language_loss": 0.93519121, + "learning_rate": 0.0007889816396368803, + "loss": 0.94586968, + "num_input_tokens_seen": 139607408, + "router_z_loss_mlp": 0.48193359, + "step": 1688, + "time_per_iteration": 2.625795602798462 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062474, + "balance_loss_mlp": 1.01371753, + "epoch": 0.3249326664101578, + "flos": 378992757504.0, + "grad_norm": 0.03548852277095179, + "language_loss": 0.86296374, + "learning_rate": 0.0007887273454759687, + "loss": 0.87358844, + "num_input_tokens_seen": 139670000, + "router_z_loss_mlp": 0.48754883, + "step": 1689, + "time_per_iteration": 2.4798507690429688 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070366, + "balance_loss_mlp": 1.02106154, + "epoch": 0.3251250480954213, + "flos": 529123203072.0, + "grad_norm": 0.03304707654173593, + "language_loss": 0.83602285, + "learning_rate": 0.0007884729392211603, + "loss": 0.84672654, + "num_input_tokens_seen": 139739872, + "router_z_loss_mlp": 0.49194336, + "step": 1690, + "time_per_iteration": 2.6475188732147217 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066964, + "balance_loss_mlp": 1.01732576, + "epoch": 0.3253174297806849, + "flos": 450559499520.0, + "grad_norm": 0.03986808198030794, + "language_loss": 0.86860085, + "learning_rate": 0.0007882184209712245, + "loss": 0.87927043, + "num_input_tokens_seen": 139802032, + "router_z_loss_mlp": 0.49609375, + "step": 1691, + "time_per_iteration": 2.5213029384613037 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089592, + "balance_loss_mlp": 1.03961909, + "epoch": 0.32550981146594843, + "flos": 705490801152.0, + "grad_norm": 0.03183986603149819, + "language_loss": 0.86227143, + "learning_rate": 0.000787963790824974, + "loss": 0.8731674, + "num_input_tokens_seen": 139885648, + "router_z_loss_mlp": 0.49975586, + "step": 1692, + "time_per_iteration": 2.9866673946380615 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086614, + "balance_loss_mlp": 1.03654587, + "epoch": 0.325702193151212, + "flos": 393559233024.0, + "grad_norm": 0.035135222587328305, + "language_loss": 0.90092403, + "learning_rate": 0.0007877090488812651, + "loss": 0.91179013, + "num_input_tokens_seen": 139947920, + "router_z_loss_mlp": 0.50073242, + "step": 1693, + "time_per_iteration": 2.443784475326538 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067298, + "balance_loss_mlp": 1.01708698, + "epoch": 0.32589457483647555, + "flos": 578584091136.0, + "grad_norm": 0.03604448220117138, + "language_loss": 0.84406531, + "learning_rate": 0.0007874541952389973, + "loss": 0.85473824, + "num_input_tokens_seen": 140020048, + "router_z_loss_mlp": 0.50219727, + "step": 1694, + "time_per_iteration": 2.6662275791168213 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069205, + "balance_loss_mlp": 1.01918459, + "epoch": 0.32608695652173914, + "flos": 499330216704.0, + "grad_norm": 0.03462929627838828, + "language_loss": 0.87473089, + "learning_rate": 0.0007871992299971136, + "loss": 0.88542295, + "num_input_tokens_seen": 140085600, + "router_z_loss_mlp": 0.50024414, + "step": 1695, + "time_per_iteration": 2.5501420497894287 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068106, + "balance_loss_mlp": 1.01803839, + "epoch": 0.32627933820700267, + "flos": 592301948160.0, + "grad_norm": 0.0349674772808078, + "language_loss": 0.85830671, + "learning_rate": 0.0007869441532546001, + "loss": 0.86898774, + "num_input_tokens_seen": 140155152, + "router_z_loss_mlp": 0.5, + "step": 1696, + "time_per_iteration": 2.7640528678894043 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065186, + "balance_loss_mlp": 1.01550007, + "epoch": 0.32647171989226625, + "flos": 610274558208.0, + "grad_norm": 0.03448959411295718, + "language_loss": 0.80548751, + "learning_rate": 0.0007866889651104867, + "loss": 0.81613934, + "num_input_tokens_seen": 140228560, + "router_z_loss_mlp": 0.49658203, + "step": 1697, + "time_per_iteration": 2.8403704166412354 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106687, + "balance_loss_mlp": 1.01723123, + "epoch": 0.32666410157752984, + "flos": 478190599680.0, + "grad_norm": 0.0393752309547029, + "language_loss": 0.84585583, + "learning_rate": 0.000786433665663846, + "loss": 0.85652447, + "num_input_tokens_seen": 140297952, + "router_z_loss_mlp": 0.49536133, + "step": 1698, + "time_per_iteration": 2.7460434436798096 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065434, + "balance_loss_mlp": 1.01603401, + "epoch": 0.3268564832627934, + "flos": 719694694656.0, + "grad_norm": 0.03598572558720647, + "language_loss": 0.87469888, + "learning_rate": 0.0007861782550137942, + "loss": 0.88535315, + "num_input_tokens_seen": 140373408, + "router_z_loss_mlp": 0.49291992, + "step": 1699, + "time_per_iteration": 2.922189474105835 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062059, + "balance_loss_mlp": 1.01299262, + "epoch": 0.32704886494805696, + "flos": 770106268416.0, + "grad_norm": 0.033319227910548664, + "language_loss": 0.86952895, + "learning_rate": 0.0007859227332594901, + "loss": 0.88014954, + "num_input_tokens_seen": 140451840, + "router_z_loss_mlp": 0.48999023, + "step": 1700, + "time_per_iteration": 2.8891940116882324 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01056582, + "balance_loss_mlp": 1.00782549, + "epoch": 0.3272412466333205, + "flos": 851405377536.0, + "grad_norm": 0.0384838580126543, + "language_loss": 0.85734528, + "learning_rate": 0.0007856671005001365, + "loss": 0.8679111, + "num_input_tokens_seen": 140537696, + "router_z_loss_mlp": 0.48730469, + "step": 1701, + "time_per_iteration": 3.169032573699951 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105609, + "balance_loss_mlp": 1.00728559, + "epoch": 0.3274336283185841, + "flos": 833041995264.0, + "grad_norm": 0.03605284930108709, + "language_loss": 0.82799482, + "learning_rate": 0.0007854113568349787, + "loss": 0.83855575, + "num_input_tokens_seen": 140623536, + "router_z_loss_mlp": 0.48779297, + "step": 1702, + "time_per_iteration": 3.123967170715332 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060179, + "balance_loss_mlp": 1.0117799, + "epoch": 0.3276260100038476, + "flos": 693253407744.0, + "grad_norm": 0.03564674283827795, + "language_loss": 0.81364781, + "learning_rate": 0.0007851555023633052, + "loss": 0.82424963, + "num_input_tokens_seen": 140700688, + "router_z_loss_mlp": 0.48388672, + "step": 1703, + "time_per_iteration": 2.8430581092834473 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059511, + "balance_loss_mlp": 1.01120698, + "epoch": 0.3278183916891112, + "flos": 436978702848.0, + "grad_norm": 0.03514994366577059, + "language_loss": 0.83518881, + "learning_rate": 0.0007848995371844474, + "loss": 0.84578383, + "num_input_tokens_seen": 140765808, + "router_z_loss_mlp": 0.48291016, + "step": 1704, + "time_per_iteration": 2.552917003631592 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01056561, + "balance_loss_mlp": 1.00861514, + "epoch": 0.3280107733743748, + "flos": 462017293824.0, + "grad_norm": 0.03278124420090015, + "language_loss": 0.81157213, + "learning_rate": 0.0007846434613977801, + "loss": 0.82213771, + "num_input_tokens_seen": 140830512, + "router_z_loss_mlp": 0.47924805, + "step": 1705, + "time_per_iteration": 2.496506929397583 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062567, + "balance_loss_mlp": 1.01483595, + "epoch": 0.3282031550596383, + "flos": 680529977856.0, + "grad_norm": 0.03615486988598079, + "language_loss": 0.79136091, + "learning_rate": 0.0007843872751027203, + "loss": 0.80198663, + "num_input_tokens_seen": 140902816, + "router_z_loss_mlp": 0.47705078, + "step": 1706, + "time_per_iteration": 2.8048393726348877 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048523, + "balance_loss_mlp": 1.00088739, + "epoch": 0.3283955367449019, + "flos": 546255942912.0, + "grad_norm": 0.030185021157442368, + "language_loss": 0.879673, + "learning_rate": 0.0007841309783987287, + "loss": 0.89015824, + "num_input_tokens_seen": 140975488, + "router_z_loss_mlp": 0.47607422, + "step": 1707, + "time_per_iteration": 2.7402358055114746 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053102, + "balance_loss_mlp": 1.00553715, + "epoch": 0.32858791843016544, + "flos": 482241218304.0, + "grad_norm": 0.035416956868504886, + "language_loss": 0.89878803, + "learning_rate": 0.0007838745713853084, + "loss": 0.90931904, + "num_input_tokens_seen": 141043248, + "router_z_loss_mlp": 0.4753418, + "step": 1708, + "time_per_iteration": 2.603816270828247 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054825, + "balance_loss_mlp": 1.00752318, + "epoch": 0.328780300115429, + "flos": 567916589568.0, + "grad_norm": 0.03507338685235107, + "language_loss": 0.84775996, + "learning_rate": 0.0007836180541620053, + "loss": 0.8583082, + "num_input_tokens_seen": 141119408, + "router_z_loss_mlp": 0.47265625, + "step": 1709, + "time_per_iteration": 2.7194666862487793 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054445, + "balance_loss_mlp": 1.00730944, + "epoch": 0.32897268180069256, + "flos": 476992038144.0, + "grad_norm": 0.03621825417570051, + "language_loss": 0.86992389, + "learning_rate": 0.0007833614268284082, + "loss": 0.88046837, + "num_input_tokens_seen": 141184112, + "router_z_loss_mlp": 0.47094727, + "step": 1710, + "time_per_iteration": 2.510921001434326 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01057716, + "balance_loss_mlp": 1.01346588, + "epoch": 0.32916506348595614, + "flos": 1580453327616.0, + "grad_norm": 0.014405511351568959, + "language_loss": 0.74109769, + "learning_rate": 0.0007831046894841489, + "loss": 0.75167489, + "num_input_tokens_seen": 141414960, + "router_z_loss_mlp": 0.44335938, + "step": 1711, + "time_per_iteration": 4.875708341598511 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051331, + "balance_loss_mlp": 1.00429153, + "epoch": 0.3293574451712197, + "flos": 483851939328.0, + "grad_norm": 0.03545808379065215, + "language_loss": 0.7916249, + "learning_rate": 0.0007828478422289016, + "loss": 0.80213821, + "num_input_tokens_seen": 141485744, + "router_z_loss_mlp": 0.4699707, + "step": 1712, + "time_per_iteration": 2.583045721054077 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052818, + "balance_loss_mlp": 1.00582564, + "epoch": 0.32954982685648326, + "flos": 623725097472.0, + "grad_norm": 0.0327870747371716, + "language_loss": 0.89787406, + "learning_rate": 0.0007825908851623833, + "loss": 0.9084022, + "num_input_tokens_seen": 141560592, + "router_z_loss_mlp": 0.46948242, + "step": 1713, + "time_per_iteration": 2.824685573577881 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050866, + "balance_loss_mlp": 1.00396931, + "epoch": 0.32974220854174685, + "flos": 546071250432.0, + "grad_norm": 0.03386258255996434, + "language_loss": 0.85659784, + "learning_rate": 0.0007823338183843533, + "loss": 0.8671065, + "num_input_tokens_seen": 141630400, + "router_z_loss_mlp": 0.46850586, + "step": 1714, + "time_per_iteration": 2.672525644302368 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051399, + "balance_loss_mlp": 1.00459802, + "epoch": 0.3299345902270104, + "flos": 983823727872.0, + "grad_norm": 0.03566876288837857, + "language_loss": 0.82096756, + "learning_rate": 0.0007820766419946141, + "loss": 0.83148158, + "num_input_tokens_seen": 141721552, + "router_z_loss_mlp": 0.4675293, + "step": 1715, + "time_per_iteration": 3.2718288898468018 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051636, + "balance_loss_mlp": 1.00662231, + "epoch": 0.33012697191227397, + "flos": 1406904727296.0, + "grad_norm": 0.0085720970679931, + "language_loss": 0.7967248, + "learning_rate": 0.0007818193560930102, + "loss": 0.80724114, + "num_input_tokens_seen": 141956464, + "router_z_loss_mlp": 0.44921875, + "step": 1716, + "time_per_iteration": 4.983957290649414 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065575, + "balance_loss_mlp": 1.01836789, + "epoch": 0.3303193535975375, + "flos": 506170675968.0, + "grad_norm": 0.038525927315114124, + "language_loss": 0.76583785, + "learning_rate": 0.0007815619607794288, + "loss": 0.77649361, + "num_input_tokens_seen": 142029552, + "router_z_loss_mlp": 0.47167969, + "step": 1717, + "time_per_iteration": 2.6315019130706787 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054249, + "balance_loss_mlp": 1.00713778, + "epoch": 0.3305117352828011, + "flos": 939485653248.0, + "grad_norm": 0.041342276741222116, + "language_loss": 0.83710063, + "learning_rate": 0.0007813044561538001, + "loss": 0.84764308, + "num_input_tokens_seen": 142117344, + "router_z_loss_mlp": 0.47070312, + "step": 1718, + "time_per_iteration": 3.127446174621582 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055308, + "balance_loss_mlp": 1.00814831, + "epoch": 0.3307041169680646, + "flos": 722794627584.0, + "grad_norm": 0.03526572402512133, + "language_loss": 0.88796169, + "learning_rate": 0.0007810468423160958, + "loss": 0.89851475, + "num_input_tokens_seen": 142190096, + "router_z_loss_mlp": 0.47119141, + "step": 1719, + "time_per_iteration": 2.8622305393218994 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054311, + "balance_loss_mlp": 1.00741386, + "epoch": 0.3308964986533282, + "flos": 584817004800.0, + "grad_norm": 0.029883098234782163, + "language_loss": 0.82424414, + "learning_rate": 0.0007807891193663306, + "loss": 0.83478725, + "num_input_tokens_seen": 142265584, + "router_z_loss_mlp": 0.46850586, + "step": 1720, + "time_per_iteration": 2.7917239665985107 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064031, + "balance_loss_mlp": 1.01715815, + "epoch": 0.33108888033859174, + "flos": 474525895680.0, + "grad_norm": 0.040993977150413745, + "language_loss": 0.82757467, + "learning_rate": 0.0007805312874045614, + "loss": 0.83821499, + "num_input_tokens_seen": 142330352, + "router_z_loss_mlp": 0.46826172, + "step": 1721, + "time_per_iteration": 2.516045331954956 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049599, + "balance_loss_mlp": 1.00279772, + "epoch": 0.3312812620238553, + "flos": 386996785152.0, + "grad_norm": 0.03885390252626127, + "language_loss": 0.87709427, + "learning_rate": 0.0007802733465308874, + "loss": 0.88759029, + "num_input_tokens_seen": 142392208, + "router_z_loss_mlp": 0.4675293, + "step": 1722, + "time_per_iteration": 2.4662280082702637 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047652, + "balance_loss_mlp": 1.00108933, + "epoch": 0.3314736437091189, + "flos": 495605241600.0, + "grad_norm": 0.03316625802825005, + "language_loss": 0.85110468, + "learning_rate": 0.0007800152968454501, + "loss": 0.86158121, + "num_input_tokens_seen": 142462112, + "router_z_loss_mlp": 0.46508789, + "step": 1723, + "time_per_iteration": 2.6313533782958984 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105186, + "balance_loss_mlp": 1.00515401, + "epoch": 0.33166602539438245, + "flos": 654931473408.0, + "grad_norm": 0.02722776998075876, + "language_loss": 0.90998107, + "learning_rate": 0.0007797571384484334, + "loss": 0.92049968, + "num_input_tokens_seen": 142539120, + "router_z_loss_mlp": 0.46655273, + "step": 1724, + "time_per_iteration": 2.8411970138549805 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049402, + "balance_loss_mlp": 1.00257659, + "epoch": 0.33185840707964603, + "flos": 521835591168.0, + "grad_norm": 0.03419077024576391, + "language_loss": 0.92796665, + "learning_rate": 0.0007794988714400633, + "loss": 0.93846071, + "num_input_tokens_seen": 142611520, + "router_z_loss_mlp": 0.46777344, + "step": 1725, + "time_per_iteration": 2.5964980125427246 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050523, + "balance_loss_mlp": 1.00367355, + "epoch": 0.33205078876490957, + "flos": 437899252992.0, + "grad_norm": 0.033932075991051254, + "language_loss": 0.86014992, + "learning_rate": 0.0007792404959206079, + "loss": 0.87065518, + "num_input_tokens_seen": 142676064, + "router_z_loss_mlp": 0.46801758, + "step": 1726, + "time_per_iteration": 2.491852283477783 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051801, + "balance_loss_mlp": 1.00497568, + "epoch": 0.33224317045017315, + "flos": 770095574784.0, + "grad_norm": 0.034529473302537826, + "language_loss": 0.82129228, + "learning_rate": 0.0007789820119903774, + "loss": 0.83181036, + "num_input_tokens_seen": 142750944, + "router_z_loss_mlp": 0.46777344, + "step": 1727, + "time_per_iteration": 2.9898605346679688 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01058285, + "balance_loss_mlp": 1.01260376, + "epoch": 0.3324355521354367, + "flos": 1469296103424.0, + "grad_norm": 0.013638873720884416, + "language_loss": 0.78492665, + "learning_rate": 0.0007787234197497242, + "loss": 0.79550946, + "num_input_tokens_seen": 142974032, + "router_z_loss_mlp": 0.45605469, + "step": 1728, + "time_per_iteration": 4.859704971313477 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050307, + "balance_loss_mlp": 1.00343382, + "epoch": 0.3326279338207003, + "flos": 497800175616.0, + "grad_norm": 0.033386991625918766, + "language_loss": 0.84234303, + "learning_rate": 0.0007784647192990428, + "loss": 0.85284609, + "num_input_tokens_seen": 143047280, + "router_z_loss_mlp": 0.46826172, + "step": 1729, + "time_per_iteration": 2.7268624305725098 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050854, + "balance_loss_mlp": 1.00419581, + "epoch": 0.33282031550596386, + "flos": 637054127616.0, + "grad_norm": 0.031138270474946127, + "language_loss": 0.81414318, + "learning_rate": 0.0007782059107387696, + "loss": 0.82465172, + "num_input_tokens_seen": 143124224, + "router_z_loss_mlp": 0.46606445, + "step": 1730, + "time_per_iteration": 2.85831618309021 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054467, + "balance_loss_mlp": 1.00752223, + "epoch": 0.3330126971912274, + "flos": 690722136576.0, + "grad_norm": 0.03556521205278414, + "language_loss": 0.89100444, + "learning_rate": 0.0007779469941693826, + "loss": 0.9015491, + "num_input_tokens_seen": 143194048, + "router_z_loss_mlp": 0.46899414, + "step": 1731, + "time_per_iteration": 2.8736839294433594 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01058267, + "balance_loss_mlp": 1.01168013, + "epoch": 0.333205078876491, + "flos": 567554007552.0, + "grad_norm": 0.03898705252222011, + "language_loss": 0.77083337, + "learning_rate": 0.0007776879696914029, + "loss": 0.78141606, + "num_input_tokens_seen": 143272976, + "router_z_loss_mlp": 0.46533203, + "step": 1732, + "time_per_iteration": 2.84578275680542 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055134, + "balance_loss_mlp": 1.00868976, + "epoch": 0.3333974605617545, + "flos": 642171105024.0, + "grad_norm": 0.028730663384365272, + "language_loss": 0.89631069, + "learning_rate": 0.000777428837405392, + "loss": 0.90686202, + "num_input_tokens_seen": 143346496, + "router_z_loss_mlp": 0.46386719, + "step": 1733, + "time_per_iteration": 2.8595433235168457 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049221, + "balance_loss_mlp": 1.00275302, + "epoch": 0.3335898422470181, + "flos": 462779396352.0, + "grad_norm": 0.03984590801707433, + "language_loss": 0.87746447, + "learning_rate": 0.0007771695974119544, + "loss": 0.88795674, + "num_input_tokens_seen": 143410448, + "router_z_loss_mlp": 0.46411133, + "step": 1734, + "time_per_iteration": 2.5200014114379883 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051768, + "balance_loss_mlp": 1.00537193, + "epoch": 0.33378222393228163, + "flos": 854338114560.0, + "grad_norm": 0.03554719013753984, + "language_loss": 0.76235908, + "learning_rate": 0.0007769102498117359, + "loss": 0.77287674, + "num_input_tokens_seen": 143492416, + "router_z_loss_mlp": 0.46337891, + "step": 1735, + "time_per_iteration": 3.1014633178710938 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052548, + "balance_loss_mlp": 1.00624716, + "epoch": 0.3339746056175452, + "flos": 956310246144.0, + "grad_norm": 0.03187783426815399, + "language_loss": 0.80701965, + "learning_rate": 0.000776650794705424, + "loss": 0.81754518, + "num_input_tokens_seen": 143590096, + "router_z_loss_mlp": 0.46240234, + "step": 1736, + "time_per_iteration": 3.253756046295166 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050739, + "balance_loss_mlp": 1.00434327, + "epoch": 0.33416698730280875, + "flos": 545895306240.0, + "grad_norm": 0.03238990381642275, + "language_loss": 0.83209848, + "learning_rate": 0.0007763912321937483, + "loss": 0.84260583, + "num_input_tokens_seen": 143663344, + "router_z_loss_mlp": 0.46337891, + "step": 1737, + "time_per_iteration": 2.712942361831665 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051632, + "balance_loss_mlp": 1.00525999, + "epoch": 0.33435936898807234, + "flos": 1015876776960.0, + "grad_norm": 0.036470780413058734, + "language_loss": 0.8337301, + "learning_rate": 0.0007761315623774799, + "loss": 0.84424639, + "num_input_tokens_seen": 143753072, + "router_z_loss_mlp": 0.46313477, + "step": 1738, + "time_per_iteration": 3.38946795463562 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053137, + "balance_loss_mlp": 1.00671661, + "epoch": 0.3345517506733359, + "flos": 616372356864.0, + "grad_norm": 0.034452353492031275, + "language_loss": 0.88688117, + "learning_rate": 0.0007758717853574313, + "loss": 0.89741254, + "num_input_tokens_seen": 143827280, + "router_z_loss_mlp": 0.46362305, + "step": 1739, + "time_per_iteration": 2.7438387870788574 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105524, + "balance_loss_mlp": 1.00896263, + "epoch": 0.33474413235859946, + "flos": 495570248448.0, + "grad_norm": 0.03665446817767542, + "language_loss": 0.90973008, + "learning_rate": 0.0007756119012344571, + "loss": 0.92028248, + "num_input_tokens_seen": 143895072, + "router_z_loss_mlp": 0.4621582, + "step": 1740, + "time_per_iteration": 2.5443572998046875 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105165, + "balance_loss_mlp": 1.0052774, + "epoch": 0.33493651404386304, + "flos": 629488504320.0, + "grad_norm": 0.0365358867260097, + "language_loss": 0.85516071, + "learning_rate": 0.0007753519101094535, + "loss": 0.86567724, + "num_input_tokens_seen": 143965728, + "router_z_loss_mlp": 0.46313477, + "step": 1741, + "time_per_iteration": 2.785595417022705 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050508, + "balance_loss_mlp": 1.00396836, + "epoch": 0.3351288957291266, + "flos": 514743365376.0, + "grad_norm": 0.038608286094447275, + "language_loss": 0.87042749, + "learning_rate": 0.0007750918120833575, + "loss": 0.88093251, + "num_input_tokens_seen": 144030272, + "router_z_loss_mlp": 0.46484375, + "step": 1742, + "time_per_iteration": 2.5612564086914062 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054744, + "balance_loss_mlp": 1.00825262, + "epoch": 0.33532127741439016, + "flos": 648483731712.0, + "grad_norm": 0.038902913238311417, + "language_loss": 0.88245445, + "learning_rate": 0.0007748316072571485, + "loss": 0.89300191, + "num_input_tokens_seen": 144104048, + "router_z_loss_mlp": 0.46435547, + "step": 1743, + "time_per_iteration": 2.8040030002593994 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01056064, + "balance_loss_mlp": 1.00969172, + "epoch": 0.3355136590996537, + "flos": 769789373184.0, + "grad_norm": 0.032744002461956113, + "language_loss": 0.80090916, + "learning_rate": 0.0007745712957318467, + "loss": 0.81146979, + "num_input_tokens_seen": 144180432, + "router_z_loss_mlp": 0.46313477, + "step": 1744, + "time_per_iteration": 2.955864429473877 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053057, + "balance_loss_mlp": 1.00656557, + "epoch": 0.3357060407849173, + "flos": 596650020096.0, + "grad_norm": 0.027209343707751667, + "language_loss": 0.86834347, + "learning_rate": 0.0007743108776085141, + "loss": 0.87887406, + "num_input_tokens_seen": 144258704, + "router_z_loss_mlp": 0.46435547, + "step": 1745, + "time_per_iteration": 2.8065922260284424 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059361, + "balance_loss_mlp": 1.01277399, + "epoch": 0.3358984224701808, + "flos": 599802442752.0, + "grad_norm": 0.030632877870575562, + "language_loss": 0.83193165, + "learning_rate": 0.0007740503529882543, + "loss": 0.84252524, + "num_input_tokens_seen": 144335104, + "router_z_loss_mlp": 0.46533203, + "step": 1746, + "time_per_iteration": 2.783057451248169 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01058625, + "balance_loss_mlp": 1.01218116, + "epoch": 0.3360908041554444, + "flos": 579430764288.0, + "grad_norm": 0.03209356344176002, + "language_loss": 0.91440552, + "learning_rate": 0.0007737897219722114, + "loss": 0.92499179, + "num_input_tokens_seen": 144402912, + "router_z_loss_mlp": 0.46386719, + "step": 1747, + "time_per_iteration": 2.6678693294525146 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053588, + "balance_loss_mlp": 1.00723922, + "epoch": 0.336283185840708, + "flos": 514621856256.0, + "grad_norm": 0.02947569275247992, + "language_loss": 0.81706387, + "learning_rate": 0.0007735289846615716, + "loss": 0.82759976, + "num_input_tokens_seen": 144475328, + "router_z_loss_mlp": 0.46289062, + "step": 1748, + "time_per_iteration": 2.664217948913574 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049184, + "balance_loss_mlp": 1.00312185, + "epoch": 0.3364755675259715, + "flos": 526014521856.0, + "grad_norm": 0.03437288512368296, + "language_loss": 0.83148289, + "learning_rate": 0.0007732681411575621, + "loss": 0.84197474, + "num_input_tokens_seen": 144548288, + "router_z_loss_mlp": 0.45996094, + "step": 1749, + "time_per_iteration": 2.679304361343384 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051983, + "balance_loss_mlp": 1.00613475, + "epoch": 0.3366679492112351, + "flos": 555974704128.0, + "grad_norm": 0.040002531784274646, + "language_loss": 0.88002014, + "learning_rate": 0.0007730071915614514, + "loss": 0.89053994, + "num_input_tokens_seen": 144619488, + "router_z_loss_mlp": 0.45776367, + "step": 1750, + "time_per_iteration": 2.6813647747039795 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053165, + "balance_loss_mlp": 1.00734055, + "epoch": 0.33686033089649864, + "flos": 428164940544.0, + "grad_norm": 0.03793638318473741, + "language_loss": 0.88937026, + "learning_rate": 0.0007727461359745489, + "loss": 0.89990187, + "num_input_tokens_seen": 144682560, + "router_z_loss_mlp": 0.45751953, + "step": 1751, + "time_per_iteration": 2.459137439727783 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050224, + "balance_loss_mlp": 1.00425673, + "epoch": 0.3370527125817622, + "flos": 542841060096.0, + "grad_norm": 0.030686532457312277, + "language_loss": 0.86821485, + "learning_rate": 0.0007724849744982056, + "loss": 0.87871712, + "num_input_tokens_seen": 144753328, + "router_z_loss_mlp": 0.45898438, + "step": 1752, + "time_per_iteration": 2.682023525238037 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050119, + "balance_loss_mlp": 1.00412822, + "epoch": 0.33724509426702576, + "flos": 543231832320.0, + "grad_norm": 0.03146587739195435, + "language_loss": 0.82788759, + "learning_rate": 0.0007722237072338131, + "loss": 0.8383888, + "num_input_tokens_seen": 144827312, + "router_z_loss_mlp": 0.45922852, + "step": 1753, + "time_per_iteration": 2.7289977073669434 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053084, + "balance_loss_mlp": 1.00735557, + "epoch": 0.33743747595228935, + "flos": 473753099520.0, + "grad_norm": 0.036309304678759154, + "language_loss": 0.86263937, + "learning_rate": 0.0007719623342828046, + "loss": 0.8731702, + "num_input_tokens_seen": 144893488, + "router_z_loss_mlp": 0.45654297, + "step": 1754, + "time_per_iteration": 2.5323400497436523 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046739, + "balance_loss_mlp": 1.00127256, + "epoch": 0.33762985763755293, + "flos": 470837859072.0, + "grad_norm": 0.037209700878319825, + "language_loss": 0.84580374, + "learning_rate": 0.000771700855746654, + "loss": 0.85627109, + "num_input_tokens_seen": 144961152, + "router_z_loss_mlp": 0.45385742, + "step": 1755, + "time_per_iteration": 2.585667848587036 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049151, + "balance_loss_mlp": 1.00366056, + "epoch": 0.33782223932281646, + "flos": 493251859968.0, + "grad_norm": 0.03059786996599164, + "language_loss": 0.89290714, + "learning_rate": 0.0007714392717268763, + "loss": 0.90339863, + "num_input_tokens_seen": 145030576, + "router_z_loss_mlp": 0.45410156, + "step": 1756, + "time_per_iteration": 2.5836589336395264 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048898, + "balance_loss_mlp": 1.00321686, + "epoch": 0.33801462100808005, + "flos": 466018334976.0, + "grad_norm": 0.035533831964213135, + "language_loss": 0.87473714, + "learning_rate": 0.0007711775823250273, + "loss": 0.88522607, + "num_input_tokens_seen": 145095648, + "router_z_loss_mlp": 0.45605469, + "step": 1757, + "time_per_iteration": 2.5619492530822754 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049668, + "balance_loss_mlp": 1.00417781, + "epoch": 0.3382070026933436, + "flos": 797068584960.0, + "grad_norm": 0.03198873828119691, + "language_loss": 0.84101963, + "learning_rate": 0.0007709157876427039, + "loss": 0.85151625, + "num_input_tokens_seen": 145181248, + "router_z_loss_mlp": 0.45410156, + "step": 1758, + "time_per_iteration": 3.084735870361328 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049654, + "balance_loss_mlp": 1.00414026, + "epoch": 0.33839938437860717, + "flos": 509429056512.0, + "grad_norm": 0.031347294296384644, + "language_loss": 0.86196065, + "learning_rate": 0.0007706538877815439, + "loss": 0.87245721, + "num_input_tokens_seen": 145252944, + "router_z_loss_mlp": 0.4543457, + "step": 1759, + "time_per_iteration": 2.6354048252105713 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049225, + "balance_loss_mlp": 1.00371122, + "epoch": 0.3385917660638707, + "flos": 485274077184.0, + "grad_norm": 0.03028112214235413, + "language_loss": 0.83875918, + "learning_rate": 0.0007703918828432259, + "loss": 0.84925139, + "num_input_tokens_seen": 145323168, + "router_z_loss_mlp": 0.4543457, + "step": 1760, + "time_per_iteration": 2.6017844676971436 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049146, + "balance_loss_mlp": 1.00358403, + "epoch": 0.3387841477491343, + "flos": 546416335872.0, + "grad_norm": 0.033680258429279644, + "language_loss": 0.89293355, + "learning_rate": 0.000770129772929469, + "loss": 0.90342498, + "num_input_tokens_seen": 145395776, + "router_z_loss_mlp": 0.45483398, + "step": 1761, + "time_per_iteration": 2.671287775039673 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048026, + "balance_loss_mlp": 1.00217831, + "epoch": 0.3389765294343978, + "flos": 721064342784.0, + "grad_norm": 0.03497277274463044, + "language_loss": 0.89180952, + "learning_rate": 0.0007698675581420334, + "loss": 0.90228981, + "num_input_tokens_seen": 145470576, + "router_z_loss_mlp": 0.45776367, + "step": 1762, + "time_per_iteration": 2.9236271381378174 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105243, + "balance_loss_mlp": 1.00677264, + "epoch": 0.3391689111196614, + "flos": 701264238336.0, + "grad_norm": 0.034268369898116914, + "language_loss": 0.79778481, + "learning_rate": 0.0007696052385827199, + "loss": 0.80830908, + "num_input_tokens_seen": 145548896, + "router_z_loss_mlp": 0.45581055, + "step": 1763, + "time_per_iteration": 2.9605488777160645 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055868, + "balance_loss_mlp": 1.01018691, + "epoch": 0.339361292804925, + "flos": 628249113600.0, + "grad_norm": 0.03454670185411084, + "language_loss": 0.78905737, + "learning_rate": 0.00076934281435337, + "loss": 0.79961604, + "num_input_tokens_seen": 145617136, + "router_z_loss_mlp": 0.45605469, + "step": 1764, + "time_per_iteration": 2.7454025745391846 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052159, + "balance_loss_mlp": 1.00647831, + "epoch": 0.33955367449018853, + "flos": 610795587840.0, + "grad_norm": 0.03693575970108084, + "language_loss": 0.86892688, + "learning_rate": 0.0007690802855558658, + "loss": 0.87944847, + "num_input_tokens_seen": 145696416, + "router_z_loss_mlp": 0.45605469, + "step": 1765, + "time_per_iteration": 2.8936946392059326 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054825, + "balance_loss_mlp": 1.01057434, + "epoch": 0.3397460561754521, + "flos": 1456589191680.0, + "grad_norm": 0.006269192400269108, + "language_loss": 0.76374954, + "learning_rate": 0.0007688176522921302, + "loss": 0.77429777, + "num_input_tokens_seen": 145919680, + "router_z_loss_mlp": 0.44335938, + "step": 1766, + "time_per_iteration": 4.913206100463867 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054352, + "balance_loss_mlp": 1.00855207, + "epoch": 0.33993843786071565, + "flos": 488291384832.0, + "grad_norm": 0.039386286306125895, + "language_loss": 0.89967024, + "learning_rate": 0.0007685549146641262, + "loss": 0.91021377, + "num_input_tokens_seen": 145984272, + "router_z_loss_mlp": 0.45727539, + "step": 1767, + "time_per_iteration": 2.593353271484375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050967, + "balance_loss_mlp": 1.00554788, + "epoch": 0.34013081954597923, + "flos": 418233296640.0, + "grad_norm": 0.032458575290873634, + "language_loss": 0.89062989, + "learning_rate": 0.0007682920727738579, + "loss": 0.90113962, + "num_input_tokens_seen": 146047248, + "router_z_loss_mlp": 0.45336914, + "step": 1768, + "time_per_iteration": 2.510331392288208 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054131, + "balance_loss_mlp": 1.00835514, + "epoch": 0.34032320123124277, + "flos": 438430976256.0, + "grad_norm": 0.037803385345055784, + "language_loss": 0.85379529, + "learning_rate": 0.000768029126723369, + "loss": 0.86433661, + "num_input_tokens_seen": 146111872, + "router_z_loss_mlp": 0.45703125, + "step": 1769, + "time_per_iteration": 2.5152533054351807 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054345, + "balance_loss_mlp": 1.00852144, + "epoch": 0.34051558291650635, + "flos": 458544085248.0, + "grad_norm": 0.04157155741286578, + "language_loss": 0.82432753, + "learning_rate": 0.0007677660766147447, + "loss": 0.83487099, + "num_input_tokens_seen": 146172608, + "router_z_loss_mlp": 0.45751953, + "step": 1770, + "time_per_iteration": 2.5669522285461426 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052738, + "balance_loss_mlp": 1.00858307, + "epoch": 0.3407079646017699, + "flos": 1562140489728.0, + "grad_norm": 0.006526141838203855, + "language_loss": 0.72470945, + "learning_rate": 0.0007675029225501102, + "loss": 0.73523682, + "num_input_tokens_seen": 146413584, + "router_z_loss_mlp": 0.44238281, + "step": 1771, + "time_per_iteration": 4.953578233718872 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051633, + "balance_loss_mlp": 1.00602317, + "epoch": 0.3409003462870335, + "flos": 493531816704.0, + "grad_norm": 0.043561887450476046, + "language_loss": 0.80659652, + "learning_rate": 0.0007672396646316306, + "loss": 0.81711292, + "num_input_tokens_seen": 146476992, + "router_z_loss_mlp": 0.45532227, + "step": 1772, + "time_per_iteration": 2.5720248222351074 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048985, + "balance_loss_mlp": 1.00356674, + "epoch": 0.34109272797229706, + "flos": 809822150400.0, + "grad_norm": 0.03735237922314452, + "language_loss": 0.80629146, + "learning_rate": 0.000766976302961512, + "loss": 0.81678128, + "num_input_tokens_seen": 146552848, + "router_z_loss_mlp": 0.45336914, + "step": 1773, + "time_per_iteration": 3.0438191890716553 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050599, + "balance_loss_mlp": 1.00513268, + "epoch": 0.3412851096575606, + "flos": 471100319232.0, + "grad_norm": 0.03730121261656314, + "language_loss": 0.82086515, + "learning_rate": 0.0007667128376420003, + "loss": 0.83137119, + "num_input_tokens_seen": 146617504, + "router_z_loss_mlp": 0.45385742, + "step": 1774, + "time_per_iteration": 2.5461959838867188 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052091, + "balance_loss_mlp": 1.00681531, + "epoch": 0.3414774913428242, + "flos": 596771529216.0, + "grad_norm": 0.03978671612524881, + "language_loss": 0.85611963, + "learning_rate": 0.0007664492687753817, + "loss": 0.86664057, + "num_input_tokens_seen": 146691568, + "router_z_loss_mlp": 0.4519043, + "step": 1775, + "time_per_iteration": 2.7454183101654053 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049019, + "balance_loss_mlp": 1.00362372, + "epoch": 0.3416698730280877, + "flos": 528508854528.0, + "grad_norm": 0.03225195621375244, + "language_loss": 0.82109249, + "learning_rate": 0.000766185596463983, + "loss": 0.83158267, + "num_input_tokens_seen": 146764208, + "router_z_loss_mlp": 0.453125, + "step": 1776, + "time_per_iteration": 2.636876106262207 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050456, + "balance_loss_mlp": 1.00513279, + "epoch": 0.3418622547133513, + "flos": 876118324992.0, + "grad_norm": 0.033083928099711564, + "language_loss": 0.77454132, + "learning_rate": 0.0007659218208101706, + "loss": 0.78504586, + "num_input_tokens_seen": 146847744, + "router_z_loss_mlp": 0.45239258, + "step": 1777, + "time_per_iteration": 3.097163677215576 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055706, + "balance_loss_mlp": 1.01031137, + "epoch": 0.34205463639861483, + "flos": 604877624064.0, + "grad_norm": 0.03453483859247358, + "language_loss": 0.86064076, + "learning_rate": 0.0007656579419163515, + "loss": 0.87119782, + "num_input_tokens_seen": 146918336, + "router_z_loss_mlp": 0.453125, + "step": 1778, + "time_per_iteration": 2.7452263832092285 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055225, + "balance_loss_mlp": 1.0096159, + "epoch": 0.3422470180838784, + "flos": 464715760896.0, + "grad_norm": 0.037184345749469765, + "language_loss": 0.77793133, + "learning_rate": 0.0007653939598849724, + "loss": 0.78848356, + "num_input_tokens_seen": 146982496, + "router_z_loss_mlp": 0.45532227, + "step": 1779, + "time_per_iteration": 2.5020663738250732 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01057175, + "balance_loss_mlp": 1.0134964, + "epoch": 0.34243939976914195, + "flos": 1589819222016.0, + "grad_norm": 0.009860928497574006, + "language_loss": 0.82880205, + "learning_rate": 0.0007651298748185204, + "loss": 0.83937383, + "num_input_tokens_seen": 147213600, + "router_z_loss_mlp": 0.4375, + "step": 1780, + "time_per_iteration": 4.958939552307129 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054148, + "balance_loss_mlp": 1.00849116, + "epoch": 0.34263178145440554, + "flos": 874444420608.0, + "grad_norm": 0.034671274665512654, + "language_loss": 0.80890739, + "learning_rate": 0.000764865686819522, + "loss": 0.81944883, + "num_input_tokens_seen": 147287664, + "router_z_loss_mlp": 0.45581055, + "step": 1781, + "time_per_iteration": 3.0468943119049072 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01057142, + "balance_loss_mlp": 1.01148522, + "epoch": 0.3428241631396691, + "flos": 507874715904.0, + "grad_norm": 0.02984044691012994, + "language_loss": 0.86276633, + "learning_rate": 0.0007646013959905449, + "loss": 0.87333775, + "num_input_tokens_seen": 147356800, + "router_z_loss_mlp": 0.45581055, + "step": 1782, + "time_per_iteration": 2.59788179397583 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01056783, + "balance_loss_mlp": 1.01114941, + "epoch": 0.34301654482493266, + "flos": 881525952768.0, + "grad_norm": 0.034646354408830966, + "language_loss": 0.81384498, + "learning_rate": 0.0007643370024341949, + "loss": 0.82441282, + "num_input_tokens_seen": 147432496, + "router_z_loss_mlp": 0.45556641, + "step": 1783, + "time_per_iteration": 3.0783512592315674 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048113, + "balance_loss_mlp": 1.00288546, + "epoch": 0.34320892651019624, + "flos": 432669514752.0, + "grad_norm": 0.031189947688426686, + "language_loss": 0.84145617, + "learning_rate": 0.0007640725062531195, + "loss": 0.85193729, + "num_input_tokens_seen": 147495856, + "router_z_loss_mlp": 0.45141602, + "step": 1784, + "time_per_iteration": 2.5152812004089355 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050969, + "balance_loss_mlp": 1.00559807, + "epoch": 0.3434013081954598, + "flos": 464594251776.0, + "grad_norm": 0.03760163078295718, + "language_loss": 0.86810297, + "learning_rate": 0.0007638079075500047, + "loss": 0.87861264, + "num_input_tokens_seen": 147559632, + "router_z_loss_mlp": 0.45288086, + "step": 1785, + "time_per_iteration": 2.5846633911132812 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045052, + "balance_loss_mlp": 1.0019455, + "epoch": 0.34359368988072336, + "flos": 1560677522688.0, + "grad_norm": 0.003111664808940008, + "language_loss": 0.75180668, + "learning_rate": 0.0007635432064275772, + "loss": 0.76225722, + "num_input_tokens_seen": 147794576, + "router_z_loss_mlp": 0.43164062, + "step": 1786, + "time_per_iteration": 4.94433856010437 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010488, + "balance_loss_mlp": 1.003739, + "epoch": 0.3437860715659869, + "flos": 496573423872.0, + "grad_norm": 0.03208809815455149, + "language_loss": 0.83580017, + "learning_rate": 0.0007632784029886026, + "loss": 0.8462882, + "num_input_tokens_seen": 147866960, + "router_z_loss_mlp": 0.45019531, + "step": 1787, + "time_per_iteration": 2.6222987174987793 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050031, + "balance_loss_mlp": 1.00523186, + "epoch": 0.3439784532512505, + "flos": 719610124032.0, + "grad_norm": 0.03771035877194531, + "language_loss": 0.86448389, + "learning_rate": 0.0007630134973358873, + "loss": 0.87498415, + "num_input_tokens_seen": 147947808, + "router_z_loss_mlp": 0.44799805, + "step": 1788, + "time_per_iteration": 2.9359545707702637 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047985, + "balance_loss_mlp": 1.00320995, + "epoch": 0.34417083493651407, + "flos": 566922162432.0, + "grad_norm": 0.0315223877917514, + "language_loss": 0.8730194, + "learning_rate": 0.0007627484895722763, + "loss": 0.88349926, + "num_input_tokens_seen": 148015936, + "router_z_loss_mlp": 0.44775391, + "step": 1789, + "time_per_iteration": 2.710433006286621 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048608, + "balance_loss_mlp": 1.00397587, + "epoch": 0.3443632166217776, + "flos": 797702375424.0, + "grad_norm": 0.034658336241014505, + "language_loss": 0.80973929, + "learning_rate": 0.0007624833798006552, + "loss": 0.82022536, + "num_input_tokens_seen": 148099776, + "router_z_loss_mlp": 0.4465332, + "step": 1790, + "time_per_iteration": 3.061995506286621 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049093, + "balance_loss_mlp": 1.00419891, + "epoch": 0.3445555983070412, + "flos": 570393425664.0, + "grad_norm": 0.0359941873064626, + "language_loss": 0.84664464, + "learning_rate": 0.0007622181681239483, + "loss": 0.85713559, + "num_input_tokens_seen": 148169616, + "router_z_loss_mlp": 0.44873047, + "step": 1791, + "time_per_iteration": 2.708204984664917 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046554, + "balance_loss_mlp": 1.00192165, + "epoch": 0.3447479799923047, + "flos": 569981266176.0, + "grad_norm": 0.030307911746310208, + "language_loss": 0.85264516, + "learning_rate": 0.0007619528546451202, + "loss": 0.86311066, + "num_input_tokens_seen": 148247824, + "router_z_loss_mlp": 0.4465332, + "step": 1792, + "time_per_iteration": 2.8142476081848145 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047842, + "balance_loss_mlp": 1.00323367, + "epoch": 0.3449403616775683, + "flos": 969333074688.0, + "grad_norm": 0.03266645448260783, + "language_loss": 0.84415537, + "learning_rate": 0.0007616874394671745, + "loss": 0.85463381, + "num_input_tokens_seen": 148333040, + "router_z_loss_mlp": 0.4465332, + "step": 1793, + "time_per_iteration": 3.340257406234741 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048671, + "balance_loss_mlp": 1.00411057, + "epoch": 0.34513274336283184, + "flos": 569677009920.0, + "grad_norm": 0.042713127170940564, + "language_loss": 0.85883492, + "learning_rate": 0.0007614219226931547, + "loss": 0.86932158, + "num_input_tokens_seen": 148401840, + "router_z_loss_mlp": 0.44604492, + "step": 1794, + "time_per_iteration": 2.666299343109131 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047529, + "balance_loss_mlp": 1.00301611, + "epoch": 0.3453251250480954, + "flos": 461858846208.0, + "grad_norm": 0.03409376285864792, + "language_loss": 0.85191298, + "learning_rate": 0.0007611563044261435, + "loss": 0.86238825, + "num_input_tokens_seen": 148466576, + "router_z_loss_mlp": 0.44580078, + "step": 1795, + "time_per_iteration": 2.509730577468872 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047985, + "balance_loss_mlp": 1.00340092, + "epoch": 0.34551750673335896, + "flos": 416520508416.0, + "grad_norm": 0.03871598691360063, + "language_loss": 0.87655377, + "learning_rate": 0.0007608905847692631, + "loss": 0.88703358, + "num_input_tokens_seen": 148530016, + "router_z_loss_mlp": 0.4465332, + "step": 1796, + "time_per_iteration": 2.468144416809082 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045751, + "balance_loss_mlp": 1.0012145, + "epoch": 0.34570988841862255, + "flos": 589115499264.0, + "grad_norm": 0.03133980127061019, + "language_loss": 0.87422049, + "learning_rate": 0.0007606247638256749, + "loss": 0.88467801, + "num_input_tokens_seen": 148610064, + "router_z_loss_mlp": 0.44580078, + "step": 1797, + "time_per_iteration": 2.8401029109954834 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050308, + "balance_loss_mlp": 1.00758362, + "epoch": 0.34590227010388613, + "flos": 1571145747456.0, + "grad_norm": 0.007450888717391324, + "language_loss": 0.78170294, + "learning_rate": 0.0007603588416985798, + "loss": 0.79220599, + "num_input_tokens_seen": 148835872, + "router_z_loss_mlp": 0.42773438, + "step": 1798, + "time_per_iteration": 4.913544178009033 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043892, + "balance_loss_mlp": 1.00097656, + "epoch": 0.34609465178914967, + "flos": 1540930886400.0, + "grad_norm": 0.004797214297707501, + "language_loss": 0.79327202, + "learning_rate": 0.0007600928184912179, + "loss": 0.80371094, + "num_input_tokens_seen": 149066864, + "router_z_loss_mlp": 0.4296875, + "step": 1799, + "time_per_iteration": 4.771878719329834 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049395, + "balance_loss_mlp": 1.00469148, + "epoch": 0.34628703347441325, + "flos": 610517576448.0, + "grad_norm": 0.037119753663607306, + "language_loss": 0.86850703, + "learning_rate": 0.0007598266943068686, + "loss": 0.8790009, + "num_input_tokens_seen": 149141600, + "router_z_loss_mlp": 0.44750977, + "step": 1800, + "time_per_iteration": 2.746819496154785 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050462, + "balance_loss_mlp": 1.00535274, + "epoch": 0.3464794151596768, + "flos": 474265380864.0, + "grad_norm": 0.03436691989893219, + "language_loss": 0.84791839, + "learning_rate": 0.0007595604692488507, + "loss": 0.85842299, + "num_input_tokens_seen": 149205888, + "router_z_loss_mlp": 0.45019531, + "step": 1801, + "time_per_iteration": 2.564328908920288 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050744, + "balance_loss_mlp": 1.00587356, + "epoch": 0.34667179684494037, + "flos": 606822736896.0, + "grad_norm": 0.03808690892272381, + "language_loss": 0.83437663, + "learning_rate": 0.0007592941434205215, + "loss": 0.8448841, + "num_input_tokens_seen": 149281280, + "router_z_loss_mlp": 0.44848633, + "step": 1802, + "time_per_iteration": 2.826420545578003 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059933, + "balance_loss_mlp": 1.016922, + "epoch": 0.3468641785302039, + "flos": 1568362709760.0, + "grad_norm": 0.013636299413791342, + "language_loss": 0.73571062, + "learning_rate": 0.0007590277169252782, + "loss": 0.74630988, + "num_input_tokens_seen": 149525008, + "router_z_loss_mlp": 0.43066406, + "step": 1803, + "time_per_iteration": 5.063625812530518 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050175, + "balance_loss_mlp": 1.00523341, + "epoch": 0.3470565602154675, + "flos": 908724484608.0, + "grad_norm": 0.03942668215130471, + "language_loss": 0.80763334, + "learning_rate": 0.0007587611898665566, + "loss": 0.81813502, + "num_input_tokens_seen": 149600624, + "router_z_loss_mlp": 0.44921875, + "step": 1804, + "time_per_iteration": 3.0834579467773438 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050119, + "balance_loss_mlp": 1.0052247, + "epoch": 0.347248941900731, + "flos": 640060741632.0, + "grad_norm": 0.031209613313051415, + "language_loss": 0.82727098, + "learning_rate": 0.0007584945623478315, + "loss": 0.83777213, + "num_input_tokens_seen": 149674224, + "router_z_loss_mlp": 0.44873047, + "step": 1805, + "time_per_iteration": 2.861560106277466 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051541, + "balance_loss_mlp": 1.00688517, + "epoch": 0.3474413235859946, + "flos": 848782732800.0, + "grad_norm": 0.03633023546687314, + "language_loss": 0.81859386, + "learning_rate": 0.000758227834472617, + "loss": 0.82910925, + "num_input_tokens_seen": 149758688, + "router_z_loss_mlp": 0.44702148, + "step": 1806, + "time_per_iteration": 3.0337021350860596 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052212, + "balance_loss_mlp": 1.00767589, + "epoch": 0.3476337052712582, + "flos": 516697226496.0, + "grad_norm": 0.035243207865769656, + "language_loss": 0.77929807, + "learning_rate": 0.0007579610063444664, + "loss": 0.78982013, + "num_input_tokens_seen": 149831648, + "router_z_loss_mlp": 0.44580078, + "step": 1807, + "time_per_iteration": 2.7339653968811035 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01056132, + "balance_loss_mlp": 1.01154768, + "epoch": 0.34782608695652173, + "flos": 915115845888.0, + "grad_norm": 0.03414685220945043, + "language_loss": 0.88006967, + "learning_rate": 0.0007576940780669712, + "loss": 0.89063108, + "num_input_tokens_seen": 149919440, + "router_z_loss_mlp": 0.4465332, + "step": 1808, + "time_per_iteration": 3.211806058883667 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051889, + "balance_loss_mlp": 1.00756717, + "epoch": 0.3480184686417853, + "flos": 775084240128.0, + "grad_norm": 0.07111913657628408, + "language_loss": 0.84903318, + "learning_rate": 0.0007574270497437624, + "loss": 0.85955209, + "num_input_tokens_seen": 150001632, + "router_z_loss_mlp": 0.4440918, + "step": 1809, + "time_per_iteration": 2.984511375427246 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049455, + "balance_loss_mlp": 1.00518048, + "epoch": 0.34821085032704885, + "flos": 578004735744.0, + "grad_norm": 0.031195535995176178, + "language_loss": 0.88877916, + "learning_rate": 0.000757159921478509, + "loss": 0.89927369, + "num_input_tokens_seen": 150077552, + "router_z_loss_mlp": 0.44360352, + "step": 1810, + "time_per_iteration": 2.778917074203491 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051422, + "balance_loss_mlp": 1.00888824, + "epoch": 0.34840323201231244, + "flos": 1528042205952.0, + "grad_norm": 0.009192534613281171, + "language_loss": 0.74450636, + "learning_rate": 0.0007568926933749201, + "loss": 0.75502062, + "num_input_tokens_seen": 150295328, + "router_z_loss_mlp": 0.42578125, + "step": 1811, + "time_per_iteration": 4.791734218597412 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048241, + "balance_loss_mlp": 1.0040617, + "epoch": 0.34859561369757597, + "flos": 510182410752.0, + "grad_norm": 0.038842956055274956, + "language_loss": 0.88272417, + "learning_rate": 0.0007566253655367423, + "loss": 0.89320654, + "num_input_tokens_seen": 150360496, + "router_z_loss_mlp": 0.44262695, + "step": 1812, + "time_per_iteration": 2.6542506217956543 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050997, + "balance_loss_mlp": 1.00689006, + "epoch": 0.34878799538283956, + "flos": 549757341696.0, + "grad_norm": 0.030689577509801048, + "language_loss": 0.90222162, + "learning_rate": 0.000756357938067762, + "loss": 0.91273159, + "num_input_tokens_seen": 150432064, + "router_z_loss_mlp": 0.44189453, + "step": 1813, + "time_per_iteration": 2.6897120475769043 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047642, + "balance_loss_mlp": 1.00346339, + "epoch": 0.34898037706810314, + "flos": 985195321344.0, + "grad_norm": 0.03422241032564105, + "language_loss": 0.83499646, + "learning_rate": 0.0007560904110718033, + "loss": 0.84547287, + "num_input_tokens_seen": 150512176, + "router_z_loss_mlp": 0.44262695, + "step": 1814, + "time_per_iteration": 3.3129422664642334 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045129, + "balance_loss_mlp": 1.00102115, + "epoch": 0.3491727587533667, + "flos": 682837672704.0, + "grad_norm": 0.03439092984945392, + "language_loss": 0.84187126, + "learning_rate": 0.0007558227846527297, + "loss": 0.85232258, + "num_input_tokens_seen": 150586416, + "router_z_loss_mlp": 0.44189453, + "step": 1815, + "time_per_iteration": 2.8228747844696045 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052765, + "balance_loss_mlp": 1.00880051, + "epoch": 0.34936514043863026, + "flos": 394889997312.0, + "grad_norm": 0.04066201843968592, + "language_loss": 0.84257603, + "learning_rate": 0.0007555550589144429, + "loss": 0.8531037, + "num_input_tokens_seen": 150648944, + "router_z_loss_mlp": 0.44042969, + "step": 1816, + "time_per_iteration": 2.4170055389404297 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053968, + "balance_loss_mlp": 1.01000416, + "epoch": 0.3495575221238938, + "flos": 462340992000.0, + "grad_norm": 0.036355924698056825, + "language_loss": 0.84744954, + "learning_rate": 0.000755287233960883, + "loss": 0.85798925, + "num_input_tokens_seen": 150717200, + "router_z_loss_mlp": 0.44042969, + "step": 1817, + "time_per_iteration": 2.577195405960083 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055263, + "balance_loss_mlp": 1.01115596, + "epoch": 0.3497499038091574, + "flos": 725429911296.0, + "grad_norm": 0.037028935917378006, + "language_loss": 0.78975379, + "learning_rate": 0.0007550193098960292, + "loss": 0.80030644, + "num_input_tokens_seen": 150790368, + "router_z_loss_mlp": 0.44189453, + "step": 1818, + "time_per_iteration": 2.9124276638031006 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050391, + "balance_loss_mlp": 1.00609303, + "epoch": 0.3499422854944209, + "flos": 829197456384.0, + "grad_norm": 0.03031702063556045, + "language_loss": 0.8721534, + "learning_rate": 0.0007547512868238988, + "loss": 0.88265729, + "num_input_tokens_seen": 150879872, + "router_z_loss_mlp": 0.44384766, + "step": 1819, + "time_per_iteration": 3.1275570392608643 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046383, + "balance_loss_mlp": 1.00203693, + "epoch": 0.3501346671796845, + "flos": 494543740416.0, + "grad_norm": 0.03689243892136314, + "language_loss": 0.8434422, + "learning_rate": 0.0007544831648485473, + "loss": 0.85390604, + "num_input_tokens_seen": 150953712, + "router_z_loss_mlp": 0.44433594, + "step": 1820, + "time_per_iteration": 2.6672415733337402 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053354, + "balance_loss_mlp": 1.00917482, + "epoch": 0.35032704886494803, + "flos": 579849726720.0, + "grad_norm": 0.04031883928972686, + "language_loss": 0.8166672, + "learning_rate": 0.0007542149440740694, + "loss": 0.82720077, + "num_input_tokens_seen": 151026192, + "router_z_loss_mlp": 0.44262695, + "step": 1821, + "time_per_iteration": 2.659205436706543 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051466, + "balance_loss_mlp": 1.0069536, + "epoch": 0.3505194305502116, + "flos": 585832819200.0, + "grad_norm": 0.035872862949689145, + "language_loss": 0.86380953, + "learning_rate": 0.000753946624604597, + "loss": 0.8743242, + "num_input_tokens_seen": 151100720, + "router_z_loss_mlp": 0.44604492, + "step": 1822, + "time_per_iteration": 2.748387575149536 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049848, + "balance_loss_mlp": 1.00528705, + "epoch": 0.3507118122354752, + "flos": 527979076608.0, + "grad_norm": 0.036265727976650085, + "language_loss": 0.88431466, + "learning_rate": 0.0007536782065443015, + "loss": 0.89481318, + "num_input_tokens_seen": 151166032, + "router_z_loss_mlp": 0.44628906, + "step": 1823, + "time_per_iteration": 2.608429193496704 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054753, + "balance_loss_mlp": 1.00997818, + "epoch": 0.35090419392073874, + "flos": 512546486016.0, + "grad_norm": 0.039277226542114754, + "language_loss": 0.75647306, + "learning_rate": 0.0007534096899973919, + "loss": 0.76702058, + "num_input_tokens_seen": 151232208, + "router_z_loss_mlp": 0.44799805, + "step": 1824, + "time_per_iteration": 2.702721118927002 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049221, + "balance_loss_mlp": 1.0046134, + "epoch": 0.3510965756060023, + "flos": 565196735232.0, + "grad_norm": 0.031185756782702443, + "language_loss": 0.83427215, + "learning_rate": 0.0007531410750681154, + "loss": 0.84476435, + "num_input_tokens_seen": 151308128, + "router_z_loss_mlp": 0.44677734, + "step": 1825, + "time_per_iteration": 2.7568912506103516 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053165, + "balance_loss_mlp": 1.00831807, + "epoch": 0.35128895729126586, + "flos": 1022254532352.0, + "grad_norm": 0.030666943866844928, + "language_loss": 0.87304175, + "learning_rate": 0.0007528723618607575, + "loss": 0.88357341, + "num_input_tokens_seen": 151402560, + "router_z_loss_mlp": 0.44848633, + "step": 1826, + "time_per_iteration": 3.4575371742248535 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049762, + "balance_loss_mlp": 1.00510669, + "epoch": 0.35148133897652944, + "flos": 589425591552.0, + "grad_norm": 0.04947505148138052, + "language_loss": 0.83428013, + "learning_rate": 0.0007526035504796422, + "loss": 0.84477776, + "num_input_tokens_seen": 151478816, + "router_z_loss_mlp": 0.44702148, + "step": 1827, + "time_per_iteration": 2.7913553714752197 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053021, + "balance_loss_mlp": 1.00838912, + "epoch": 0.351673720661793, + "flos": 496286664192.0, + "grad_norm": 0.03604129919469899, + "language_loss": 0.87358594, + "learning_rate": 0.0007523346410291312, + "loss": 0.88411617, + "num_input_tokens_seen": 151554528, + "router_z_loss_mlp": 0.44702148, + "step": 1828, + "time_per_iteration": 2.769817590713501 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049932, + "balance_loss_mlp": 1.00499058, + "epoch": 0.35186610234705656, + "flos": 763999721472.0, + "grad_norm": 0.036507155273352104, + "language_loss": 0.85486639, + "learning_rate": 0.0007520656336136245, + "loss": 0.86536574, + "num_input_tokens_seen": 151629440, + "router_z_loss_mlp": 0.44921875, + "step": 1829, + "time_per_iteration": 2.960890293121338 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048327, + "balance_loss_mlp": 1.00364745, + "epoch": 0.3520584840323201, + "flos": 627389801472.0, + "grad_norm": 0.0323509050656096, + "language_loss": 0.88885164, + "learning_rate": 0.0007517965283375599, + "loss": 0.89933491, + "num_input_tokens_seen": 151708544, + "router_z_loss_mlp": 0.44702148, + "step": 1830, + "time_per_iteration": 2.868405818939209 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047282, + "balance_loss_mlp": 1.00260293, + "epoch": 0.3522508657175837, + "flos": 538449246720.0, + "grad_norm": 0.03139560131485747, + "language_loss": 0.89993465, + "learning_rate": 0.0007515273253054132, + "loss": 0.91040754, + "num_input_tokens_seen": 151779152, + "router_z_loss_mlp": 0.44726562, + "step": 1831, + "time_per_iteration": 2.6341445446014404 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104766, + "balance_loss_mlp": 1.00298083, + "epoch": 0.35244324740284727, + "flos": 568502747904.0, + "grad_norm": 0.03545868131612223, + "language_loss": 0.83198845, + "learning_rate": 0.0007512580246216988, + "loss": 0.8424651, + "num_input_tokens_seen": 151853216, + "router_z_loss_mlp": 0.44726562, + "step": 1832, + "time_per_iteration": 2.691678524017334 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053022, + "balance_loss_mlp": 1.00860476, + "epoch": 0.3526356290881108, + "flos": 514055139840.0, + "grad_norm": 0.03517539350184397, + "language_loss": 0.85415643, + "learning_rate": 0.000750988626390968, + "loss": 0.86468661, + "num_input_tokens_seen": 151920416, + "router_z_loss_mlp": 0.44506836, + "step": 1833, + "time_per_iteration": 2.6027944087982178 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050095, + "balance_loss_mlp": 1.00577271, + "epoch": 0.3528280107733744, + "flos": 596973718272.0, + "grad_norm": 0.033457257877764275, + "language_loss": 0.85569251, + "learning_rate": 0.0007507191307178108, + "loss": 0.86619347, + "num_input_tokens_seen": 151990848, + "router_z_loss_mlp": 0.4440918, + "step": 1834, + "time_per_iteration": 2.8065004348754883 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054314, + "balance_loss_mlp": 1.00999165, + "epoch": 0.3530203924586379, + "flos": 552299306496.0, + "grad_norm": 0.040042804692427734, + "language_loss": 0.75668854, + "learning_rate": 0.0007504495377068543, + "loss": 0.76723164, + "num_input_tokens_seen": 152064864, + "router_z_loss_mlp": 0.4440918, + "step": 1835, + "time_per_iteration": 2.736536741256714 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052598, + "balance_loss_mlp": 1.00832355, + "epoch": 0.3532127741439015, + "flos": 654306431232.0, + "grad_norm": 0.0387965270782292, + "language_loss": 0.82353514, + "learning_rate": 0.0007501798474627642, + "loss": 0.83406115, + "num_input_tokens_seen": 152150096, + "router_z_loss_mlp": 0.44360352, + "step": 1836, + "time_per_iteration": 2.9019014835357666 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052616, + "balance_loss_mlp": 1.00824583, + "epoch": 0.35340515582916504, + "flos": 724151636736.0, + "grad_norm": 0.03634896017563763, + "language_loss": 0.84383756, + "learning_rate": 0.0007499100600902433, + "loss": 0.85436368, + "num_input_tokens_seen": 152232528, + "router_z_loss_mlp": 0.44458008, + "step": 1837, + "time_per_iteration": 3.0071663856506348 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105242, + "balance_loss_mlp": 1.00812232, + "epoch": 0.35359753751442863, + "flos": 595998733056.0, + "grad_norm": 0.039287132740407786, + "language_loss": 0.853827, + "learning_rate": 0.0007496401756940324, + "loss": 0.86435115, + "num_input_tokens_seen": 152299584, + "router_z_loss_mlp": 0.44384766, + "step": 1838, + "time_per_iteration": 2.6924545764923096 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052318, + "balance_loss_mlp": 1.00780547, + "epoch": 0.3537899191996922, + "flos": 633806440704.0, + "grad_norm": 0.041905435038062475, + "language_loss": 0.83424079, + "learning_rate": 0.0007493701943789098, + "loss": 0.84476393, + "num_input_tokens_seen": 152370368, + "router_z_loss_mlp": 0.44580078, + "step": 1839, + "time_per_iteration": 2.744781970977783 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051713, + "balance_loss_mlp": 1.00727141, + "epoch": 0.35398230088495575, + "flos": 507353686272.0, + "grad_norm": 0.0353986915713622, + "language_loss": 0.8339026, + "learning_rate": 0.000749100116249692, + "loss": 0.84441972, + "num_input_tokens_seen": 152436928, + "router_z_loss_mlp": 0.44506836, + "step": 1840, + "time_per_iteration": 2.5823822021484375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049367, + "balance_loss_mlp": 1.00490189, + "epoch": 0.35417468257021933, + "flos": 509047032576.0, + "grad_norm": 0.03988576427868324, + "language_loss": 0.86907303, + "learning_rate": 0.0007488299414112321, + "loss": 0.87956673, + "num_input_tokens_seen": 152505952, + "router_z_loss_mlp": 0.4453125, + "step": 1841, + "time_per_iteration": 2.6171295642852783 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055735, + "balance_loss_mlp": 1.01126969, + "epoch": 0.35436706425548287, + "flos": 657660076032.0, + "grad_norm": 0.035376771477334756, + "language_loss": 0.78015333, + "learning_rate": 0.0007485596699684215, + "loss": 0.79071069, + "num_input_tokens_seen": 152577408, + "router_z_loss_mlp": 0.44555664, + "step": 1842, + "time_per_iteration": 2.8393046855926514 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070571, + "balance_loss_mlp": 1.02572489, + "epoch": 0.35455944594074645, + "flos": 653889414144.0, + "grad_norm": 0.03498191670442302, + "language_loss": 0.86517459, + "learning_rate": 0.000748289302026189, + "loss": 0.87588024, + "num_input_tokens_seen": 152654480, + "router_z_loss_mlp": 0.44848633, + "step": 1843, + "time_per_iteration": 2.8524656295776367 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060154, + "balance_loss_mlp": 1.01566541, + "epoch": 0.35475182762601, + "flos": 850011429888.0, + "grad_norm": 0.03510464987001869, + "language_loss": 0.86422503, + "learning_rate": 0.0007480188376895004, + "loss": 0.87482655, + "num_input_tokens_seen": 152732304, + "router_z_loss_mlp": 0.4453125, + "step": 1844, + "time_per_iteration": 3.1228320598602295 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048378, + "balance_loss_mlp": 1.00584412, + "epoch": 0.3549442093112736, + "flos": 1524777989376.0, + "grad_norm": 0.00626506088035535, + "language_loss": 0.7381134, + "learning_rate": 0.0007477482770633596, + "loss": 0.74859715, + "num_input_tokens_seen": 152965952, + "router_z_loss_mlp": 0.42578125, + "step": 1845, + "time_per_iteration": 4.8881309032440186 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053267, + "balance_loss_mlp": 1.00906432, + "epoch": 0.3551365909965371, + "flos": 652715152128.0, + "grad_norm": 0.03760423595997357, + "language_loss": 0.78996736, + "learning_rate": 0.0007474776202528074, + "loss": 0.80050004, + "num_input_tokens_seen": 153053088, + "router_z_loss_mlp": 0.44287109, + "step": 1846, + "time_per_iteration": 2.9740474224090576 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055296, + "balance_loss_mlp": 1.01118839, + "epoch": 0.3553289726818007, + "flos": 898923098112.0, + "grad_norm": 0.04404679517400465, + "language_loss": 0.81547415, + "learning_rate": 0.000747206867362922, + "loss": 0.82602704, + "num_input_tokens_seen": 153129216, + "router_z_loss_mlp": 0.44189453, + "step": 1847, + "time_per_iteration": 3.0834994316101074 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052455, + "balance_loss_mlp": 1.00822854, + "epoch": 0.3555213543670643, + "flos": 689734512384.0, + "grad_norm": 0.03965516085145463, + "language_loss": 0.8451193, + "learning_rate": 0.0007469360184988194, + "loss": 0.85564387, + "num_input_tokens_seen": 153199360, + "router_z_loss_mlp": 0.44311523, + "step": 1848, + "time_per_iteration": 2.8074848651885986 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050526, + "balance_loss_mlp": 1.00632286, + "epoch": 0.3557137360523278, + "flos": 539604066816.0, + "grad_norm": 0.033414642983477745, + "language_loss": 0.87585986, + "learning_rate": 0.0007466650737656518, + "loss": 0.88636506, + "num_input_tokens_seen": 153269168, + "router_z_loss_mlp": 0.44287109, + "step": 1849, + "time_per_iteration": 2.604926347732544 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049824, + "balance_loss_mlp": 1.00562072, + "epoch": 0.3559061177375914, + "flos": 403154539776.0, + "grad_norm": 0.03235738057519393, + "language_loss": 0.9068622, + "learning_rate": 0.0007463940332686098, + "loss": 0.91736042, + "num_input_tokens_seen": 153333120, + "router_z_loss_mlp": 0.44287109, + "step": 1850, + "time_per_iteration": 2.4913558959960938 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01056333, + "balance_loss_mlp": 1.01196373, + "epoch": 0.35609849942285493, + "flos": 697895042304.0, + "grad_norm": 0.0320980052654178, + "language_loss": 0.85078359, + "learning_rate": 0.0007461228971129205, + "loss": 0.86134696, + "num_input_tokens_seen": 153407600, + "router_z_loss_mlp": 0.44458008, + "step": 1851, + "time_per_iteration": 2.898726463317871 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059942, + "balance_loss_mlp": 1.01557255, + "epoch": 0.3562908811081185, + "flos": 570002653440.0, + "grad_norm": 0.036011031747473804, + "language_loss": 0.86088216, + "learning_rate": 0.0007458516654038483, + "loss": 0.87148154, + "num_input_tokens_seen": 153477408, + "router_z_loss_mlp": 0.44458008, + "step": 1852, + "time_per_iteration": 2.6340625286102295 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050769, + "balance_loss_mlp": 1.00651896, + "epoch": 0.35648326279338205, + "flos": 683610468864.0, + "grad_norm": 0.03085087761867809, + "language_loss": 0.87196577, + "learning_rate": 0.0007455803382466946, + "loss": 0.88247347, + "num_input_tokens_seen": 153551888, + "router_z_loss_mlp": 0.44335938, + "step": 1853, + "time_per_iteration": 2.7936782836914062 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048744, + "balance_loss_mlp": 1.00468445, + "epoch": 0.35667564447864564, + "flos": 630341980416.0, + "grad_norm": 0.02905562967314866, + "language_loss": 0.8756358, + "learning_rate": 0.0007453089157467979, + "loss": 0.88612318, + "num_input_tokens_seen": 153626912, + "router_z_loss_mlp": 0.44140625, + "step": 1854, + "time_per_iteration": 2.8003768920898438 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053337, + "balance_loss_mlp": 1.00920558, + "epoch": 0.35686802616390917, + "flos": 815505844224.0, + "grad_norm": 0.03187136352260198, + "language_loss": 0.82840991, + "learning_rate": 0.0007450373980095341, + "loss": 0.83894324, + "num_input_tokens_seen": 153711312, + "router_z_loss_mlp": 0.44213867, + "step": 1855, + "time_per_iteration": 3.072218179702759 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052734, + "balance_loss_mlp": 1.00869787, + "epoch": 0.35706040784917276, + "flos": 527206280448.0, + "grad_norm": 0.03314729603592228, + "language_loss": 0.87318838, + "learning_rate": 0.0007447657851403155, + "loss": 0.88371575, + "num_input_tokens_seen": 153780208, + "router_z_loss_mlp": 0.44116211, + "step": 1856, + "time_per_iteration": 2.5849640369415283 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047421, + "balance_loss_mlp": 1.00338531, + "epoch": 0.35725278953443634, + "flos": 513065570304.0, + "grad_norm": 0.033114806318055315, + "language_loss": 0.79136717, + "learning_rate": 0.0007444940772445915, + "loss": 0.80184138, + "num_input_tokens_seen": 153853152, + "router_z_loss_mlp": 0.44116211, + "step": 1857, + "time_per_iteration": 2.729100227355957 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048076, + "balance_loss_mlp": 1.00404048, + "epoch": 0.3574451712196999, + "flos": 488493573888.0, + "grad_norm": 0.030889137628629628, + "language_loss": 0.80389744, + "learning_rate": 0.0007442222744278484, + "loss": 0.81437826, + "num_input_tokens_seen": 153924160, + "router_z_loss_mlp": 0.44116211, + "step": 1858, + "time_per_iteration": 2.673224687576294 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048438, + "balance_loss_mlp": 1.00433075, + "epoch": 0.35763755290496346, + "flos": 551822018304.0, + "grad_norm": 0.029026961526961815, + "language_loss": 0.8481214, + "learning_rate": 0.0007439503767956099, + "loss": 0.8586058, + "num_input_tokens_seen": 153998688, + "router_z_loss_mlp": 0.44189453, + "step": 1859, + "time_per_iteration": 2.7095680236816406 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104821, + "balance_loss_mlp": 1.00567627, + "epoch": 0.357829934590227, + "flos": 1507228232448.0, + "grad_norm": 0.007157576597672099, + "language_loss": 0.79671603, + "learning_rate": 0.0007436783844534352, + "loss": 0.80719817, + "num_input_tokens_seen": 154230960, + "router_z_loss_mlp": 0.42578125, + "step": 1860, + "time_per_iteration": 4.909587383270264 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049455, + "balance_loss_mlp": 1.00549006, + "epoch": 0.3580223162754906, + "flos": 569842260480.0, + "grad_norm": 0.027013738684289513, + "language_loss": 0.86190987, + "learning_rate": 0.000743406297506922, + "loss": 0.87240434, + "num_input_tokens_seen": 154309104, + "router_z_loss_mlp": 0.44042969, + "step": 1861, + "time_per_iteration": 2.7355735301971436 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104917, + "balance_loss_mlp": 1.00518215, + "epoch": 0.3582146979607541, + "flos": 627761131776.0, + "grad_norm": 0.0339710504259095, + "language_loss": 0.84903038, + "learning_rate": 0.0007431341160617031, + "loss": 0.8595221, + "num_input_tokens_seen": 154387424, + "router_z_loss_mlp": 0.44067383, + "step": 1862, + "time_per_iteration": 2.8932178020477295 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054928, + "balance_loss_mlp": 1.01082051, + "epoch": 0.3584070796460177, + "flos": 508319923200.0, + "grad_norm": 0.030700215862736833, + "language_loss": 0.88826722, + "learning_rate": 0.0007428618402234491, + "loss": 0.89881647, + "num_input_tokens_seen": 154459952, + "router_z_loss_mlp": 0.44189453, + "step": 1863, + "time_per_iteration": 2.6574699878692627 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105281, + "balance_loss_mlp": 1.00882196, + "epoch": 0.3585994613312813, + "flos": 607641219840.0, + "grad_norm": 0.030466419719222444, + "language_loss": 0.80836076, + "learning_rate": 0.0007425894700978668, + "loss": 0.8188889, + "num_input_tokens_seen": 154535456, + "router_z_loss_mlp": 0.44067383, + "step": 1864, + "time_per_iteration": 2.7388875484466553 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048686, + "balance_loss_mlp": 1.00467396, + "epoch": 0.3587918430165448, + "flos": 1415089579776.0, + "grad_norm": 0.030441642762586523, + "language_loss": 0.8033703, + "learning_rate": 0.0007423170057906996, + "loss": 0.8138572, + "num_input_tokens_seen": 154627568, + "router_z_loss_mlp": 0.44091797, + "step": 1865, + "time_per_iteration": 3.8431384563446045 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044851, + "balance_loss_mlp": 1.00086236, + "epoch": 0.3589842247018084, + "flos": 479514561024.0, + "grad_norm": 0.03198832631900347, + "language_loss": 0.8674798, + "learning_rate": 0.0007420444474077275, + "loss": 0.87792838, + "num_input_tokens_seen": 154694640, + "router_z_loss_mlp": 0.44067383, + "step": 1866, + "time_per_iteration": 2.5487258434295654 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046851, + "balance_loss_mlp": 1.0028863, + "epoch": 0.35917660638707194, + "flos": 505706026752.0, + "grad_norm": 0.036738697797889144, + "language_loss": 0.90374953, + "learning_rate": 0.0007417717950547671, + "loss": 0.91421801, + "num_input_tokens_seen": 154762048, + "router_z_loss_mlp": 0.44042969, + "step": 1867, + "time_per_iteration": 2.6784894466400146 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052044, + "balance_loss_mlp": 1.00960541, + "epoch": 0.3593689880723355, + "flos": 1495484645376.0, + "grad_norm": 0.0080630279180651, + "language_loss": 0.75996608, + "learning_rate": 0.0007414990488376713, + "loss": 0.77048653, + "num_input_tokens_seen": 154989952, + "router_z_loss_mlp": 0.42480469, + "step": 1868, + "time_per_iteration": 4.930212497711182 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104515, + "balance_loss_mlp": 1.00118589, + "epoch": 0.35956136975759906, + "flos": 529672422912.0, + "grad_norm": 0.03031015371847706, + "language_loss": 0.85577166, + "learning_rate": 0.0007412262088623299, + "loss": 0.86622322, + "num_input_tokens_seen": 155066992, + "router_z_loss_mlp": 0.44042969, + "step": 1869, + "time_per_iteration": 2.73066782951355 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047751, + "balance_loss_mlp": 1.00385797, + "epoch": 0.35975375144286265, + "flos": 536000600832.0, + "grad_norm": 0.03552204952813077, + "language_loss": 0.80084878, + "learning_rate": 0.0007409532752346684, + "loss": 0.81132627, + "num_input_tokens_seen": 155137616, + "router_z_loss_mlp": 0.43969727, + "step": 1870, + "time_per_iteration": 2.6379218101501465 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050376, + "balance_loss_mlp": 1.00638759, + "epoch": 0.3599461331281262, + "flos": 505929603072.0, + "grad_norm": 0.028943079800369927, + "language_loss": 0.8876543, + "learning_rate": 0.0007406802480606491, + "loss": 0.89815807, + "num_input_tokens_seen": 155209248, + "router_z_loss_mlp": 0.44067383, + "step": 1871, + "time_per_iteration": 2.6258225440979004 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049318, + "balance_loss_mlp": 1.00547302, + "epoch": 0.36013851481338977, + "flos": 512537737728.0, + "grad_norm": 0.03609789661305553, + "language_loss": 0.91903639, + "learning_rate": 0.0007404071274462707, + "loss": 0.92952955, + "num_input_tokens_seen": 155274176, + "router_z_loss_mlp": 0.43920898, + "step": 1872, + "time_per_iteration": 2.6111674308776855 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049057, + "balance_loss_mlp": 1.00494921, + "epoch": 0.36033089649865335, + "flos": 548632657152.0, + "grad_norm": 0.03255043761438457, + "language_loss": 0.84506214, + "learning_rate": 0.0007401339134975682, + "loss": 0.85555267, + "num_input_tokens_seen": 155343232, + "router_z_loss_mlp": 0.44189453, + "step": 1873, + "time_per_iteration": 2.6355786323547363 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049816, + "balance_loss_mlp": 1.00575614, + "epoch": 0.3605232781839169, + "flos": 459614334720.0, + "grad_norm": 0.03456024010205507, + "language_loss": 0.84983587, + "learning_rate": 0.0007398606063206122, + "loss": 0.86033404, + "num_input_tokens_seen": 155410080, + "router_z_loss_mlp": 0.44140625, + "step": 1874, + "time_per_iteration": 2.5788064002990723 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049812, + "balance_loss_mlp": 1.00577569, + "epoch": 0.36071565986918047, + "flos": 510564434688.0, + "grad_norm": 0.03262157431229983, + "language_loss": 0.79280519, + "learning_rate": 0.0007395872060215101, + "loss": 0.80330336, + "num_input_tokens_seen": 155476240, + "router_z_loss_mlp": 0.44116211, + "step": 1875, + "time_per_iteration": 2.59242582321167 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051751, + "balance_loss_mlp": 1.00785792, + "epoch": 0.360908041554444, + "flos": 560257647360.0, + "grad_norm": 0.03426029536230158, + "language_loss": 0.89306337, + "learning_rate": 0.0007393137127064056, + "loss": 0.9035809, + "num_input_tokens_seen": 155543392, + "router_z_loss_mlp": 0.43969727, + "step": 1876, + "time_per_iteration": 2.6217613220214844 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049616, + "balance_loss_mlp": 1.00577068, + "epoch": 0.3611004232397076, + "flos": 524879143680.0, + "grad_norm": 0.03313366432597027, + "language_loss": 0.84778088, + "learning_rate": 0.0007390401264814779, + "loss": 0.85827708, + "num_input_tokens_seen": 155613264, + "router_z_loss_mlp": 0.43920898, + "step": 1877, + "time_per_iteration": 2.621366262435913 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051372, + "balance_loss_mlp": 1.00752687, + "epoch": 0.3612928049249711, + "flos": 542033270784.0, + "grad_norm": 0.036139064810301956, + "language_loss": 0.85492337, + "learning_rate": 0.0007387664474529427, + "loss": 0.86543715, + "num_input_tokens_seen": 155683712, + "router_z_loss_mlp": 0.43920898, + "step": 1878, + "time_per_iteration": 2.6200942993164062 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051486, + "balance_loss_mlp": 1.00776029, + "epoch": 0.3614851866102347, + "flos": 553630070784.0, + "grad_norm": 0.03346030230294773, + "language_loss": 0.91826439, + "learning_rate": 0.0007384926757270518, + "loss": 0.92877924, + "num_input_tokens_seen": 155751760, + "router_z_loss_mlp": 0.43798828, + "step": 1879, + "time_per_iteration": 2.6367645263671875 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048472, + "balance_loss_mlp": 1.00481761, + "epoch": 0.36167756829549824, + "flos": 773427832320.0, + "grad_norm": 0.030641441804162946, + "language_loss": 0.80120707, + "learning_rate": 0.0007382188114100924, + "loss": 0.81169182, + "num_input_tokens_seen": 155830464, + "router_z_loss_mlp": 0.43725586, + "step": 1880, + "time_per_iteration": 2.9662272930145264 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048714, + "balance_loss_mlp": 1.0051316, + "epoch": 0.36186994998076183, + "flos": 713188627200.0, + "grad_norm": 0.030233131555612264, + "language_loss": 0.82161707, + "learning_rate": 0.0007379448546083884, + "loss": 0.83210421, + "num_input_tokens_seen": 155906208, + "router_z_loss_mlp": 0.43652344, + "step": 1881, + "time_per_iteration": 2.9433577060699463 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104791, + "balance_loss_mlp": 1.00420797, + "epoch": 0.3620623316660254, + "flos": 748901522688.0, + "grad_norm": 0.028477152913266954, + "language_loss": 0.88624489, + "learning_rate": 0.0007376708054282992, + "loss": 0.89672405, + "num_input_tokens_seen": 155983584, + "router_z_loss_mlp": 0.43774414, + "step": 1882, + "time_per_iteration": 2.9565789699554443 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047818, + "balance_loss_mlp": 1.00425851, + "epoch": 0.36225471335128895, + "flos": 483535044096.0, + "grad_norm": 0.03088815199044137, + "language_loss": 0.84632647, + "learning_rate": 0.0007373966639762201, + "loss": 0.85680467, + "num_input_tokens_seen": 156052464, + "router_z_loss_mlp": 0.4362793, + "step": 1883, + "time_per_iteration": 2.6308107376098633 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051227, + "balance_loss_mlp": 1.00762069, + "epoch": 0.36244709503655254, + "flos": 507911654400.0, + "grad_norm": 0.045291722940018896, + "language_loss": 0.89109468, + "learning_rate": 0.0007371224303585822, + "loss": 0.90160698, + "num_input_tokens_seen": 156121424, + "router_z_loss_mlp": 0.43676758, + "step": 1884, + "time_per_iteration": 2.5738682746887207 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053806, + "balance_loss_mlp": 1.01194, + "epoch": 0.36263947672181607, + "flos": 1397054741760.0, + "grad_norm": 0.007615502937667497, + "language_loss": 0.80357069, + "learning_rate": 0.0007368481046818524, + "loss": 0.81410873, + "num_input_tokens_seen": 156346144, + "router_z_loss_mlp": 0.41894531, + "step": 1885, + "time_per_iteration": 4.7547221183776855 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105859, + "balance_loss_mlp": 1.01500738, + "epoch": 0.36283185840707965, + "flos": 654523204608.0, + "grad_norm": 0.03432185210428161, + "language_loss": 0.83272493, + "learning_rate": 0.0007365736870525335, + "loss": 0.84331077, + "num_input_tokens_seen": 156420880, + "router_z_loss_mlp": 0.43652344, + "step": 1886, + "time_per_iteration": 2.8305654525756836 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049425, + "balance_loss_mlp": 1.00591362, + "epoch": 0.3630242400923432, + "flos": 489845725440.0, + "grad_norm": 0.036050619102321185, + "language_loss": 0.8310129, + "learning_rate": 0.000736299177577164, + "loss": 0.84150714, + "num_input_tokens_seen": 156485616, + "router_z_loss_mlp": 0.43579102, + "step": 1887, + "time_per_iteration": 2.632485866546631 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105207, + "balance_loss_mlp": 1.00853443, + "epoch": 0.3632166217776068, + "flos": 518232125184.0, + "grad_norm": 0.034844830144856315, + "language_loss": 0.84275633, + "learning_rate": 0.0007360245763623174, + "loss": 0.85327709, + "num_input_tokens_seen": 156557840, + "router_z_loss_mlp": 0.43603516, + "step": 1888, + "time_per_iteration": 2.6480350494384766 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049354, + "balance_loss_mlp": 1.00596213, + "epoch": 0.36340900346287036, + "flos": 647348353536.0, + "grad_norm": 0.03423797247490227, + "language_loss": 0.90607542, + "learning_rate": 0.0007357498835146039, + "loss": 0.91656893, + "num_input_tokens_seen": 156632496, + "router_z_loss_mlp": 0.43457031, + "step": 1889, + "time_per_iteration": 2.8152430057525635 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055179, + "balance_loss_mlp": 1.01154852, + "epoch": 0.3636013851481339, + "flos": 554411615232.0, + "grad_norm": 0.0362068794335816, + "language_loss": 0.87730169, + "learning_rate": 0.0007354750991406684, + "loss": 0.8878535, + "num_input_tokens_seen": 156705296, + "router_z_loss_mlp": 0.43701172, + "step": 1890, + "time_per_iteration": 2.71056866645813 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047867, + "balance_loss_mlp": 1.0042125, + "epoch": 0.3637937668333975, + "flos": 547692665088.0, + "grad_norm": 0.03762567530645649, + "language_loss": 0.81321651, + "learning_rate": 0.0007352002233471919, + "loss": 0.82369518, + "num_input_tokens_seen": 156773376, + "router_z_loss_mlp": 0.43725586, + "step": 1891, + "time_per_iteration": 2.6590068340301514 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054921, + "balance_loss_mlp": 1.01098096, + "epoch": 0.363986148518661, + "flos": 539211349248.0, + "grad_norm": 0.036762310622647384, + "language_loss": 0.79772675, + "learning_rate": 0.0007349252562408906, + "loss": 0.808276, + "num_input_tokens_seen": 156844336, + "router_z_loss_mlp": 0.44018555, + "step": 1892, + "time_per_iteration": 2.715721368789673 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0111044, + "balance_loss_mlp": 1.0663805, + "epoch": 0.3641785302039246, + "flos": 661511417856.0, + "grad_norm": 0.04360229312277944, + "language_loss": 0.82000142, + "learning_rate": 0.0007346501979285158, + "loss": 0.83110583, + "num_input_tokens_seen": 156918848, + "router_z_loss_mlp": 0.44140625, + "step": 1893, + "time_per_iteration": 2.927184820175171 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061302, + "balance_loss_mlp": 1.01934052, + "epoch": 0.36437091188918813, + "flos": 1472084965632.0, + "grad_norm": 0.015393341944361743, + "language_loss": 0.80539101, + "learning_rate": 0.0007343750485168551, + "loss": 0.81600404, + "num_input_tokens_seen": 157134736, + "router_z_loss_mlp": 0.41992188, + "step": 1894, + "time_per_iteration": 4.786630868911743 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050162, + "balance_loss_mlp": 1.00648379, + "epoch": 0.3645632935744517, + "flos": 598445433600.0, + "grad_norm": 0.030741456608760154, + "language_loss": 0.86771834, + "learning_rate": 0.0007340998081127308, + "loss": 0.87822002, + "num_input_tokens_seen": 157211920, + "router_z_loss_mlp": 0.4375, + "step": 1895, + "time_per_iteration": 2.7590408325195312 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046448, + "balance_loss_mlp": 1.00284135, + "epoch": 0.36475567525971525, + "flos": 600696748032.0, + "grad_norm": 0.032247737775586885, + "language_loss": 0.91682166, + "learning_rate": 0.0007338244768230007, + "loss": 0.92728615, + "num_input_tokens_seen": 157284224, + "router_z_loss_mlp": 0.43676758, + "step": 1896, + "time_per_iteration": 2.806001663208008 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048582, + "balance_loss_mlp": 1.00502336, + "epoch": 0.36494805694497884, + "flos": 799832180736.0, + "grad_norm": 0.03166243516623692, + "language_loss": 0.89817142, + "learning_rate": 0.0007335490547545578, + "loss": 0.90865725, + "num_input_tokens_seen": 157367920, + "router_z_loss_mlp": 0.4362793, + "step": 1897, + "time_per_iteration": 3.0448927879333496 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049034, + "balance_loss_mlp": 1.00535595, + "epoch": 0.3651404386302424, + "flos": 638478210816.0, + "grad_norm": 0.03536594015703217, + "language_loss": 0.82896376, + "learning_rate": 0.0007332735420143308, + "loss": 0.83945411, + "num_input_tokens_seen": 157438672, + "router_z_loss_mlp": 0.4375, + "step": 1898, + "time_per_iteration": 2.739990234375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047853, + "balance_loss_mlp": 1.00419891, + "epoch": 0.36533282031550596, + "flos": 492563634432.0, + "grad_norm": 0.03491103953335563, + "language_loss": 0.87321162, + "learning_rate": 0.0007329979387092826, + "loss": 0.88369012, + "num_input_tokens_seen": 157505888, + "router_z_loss_mlp": 0.43725586, + "step": 1899, + "time_per_iteration": 2.5661838054656982 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044023, + "balance_loss_mlp": 1.00020182, + "epoch": 0.36552520200076954, + "flos": 857509979136.0, + "grad_norm": 0.025671163998745472, + "language_loss": 0.84557235, + "learning_rate": 0.0007327222449464124, + "loss": 0.85601258, + "num_input_tokens_seen": 157601568, + "router_z_loss_mlp": 0.43896484, + "step": 1900, + "time_per_iteration": 3.2916476726531982 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049138, + "balance_loss_mlp": 1.00545931, + "epoch": 0.3657175836860331, + "flos": 484716109056.0, + "grad_norm": 0.033162883177173925, + "language_loss": 0.89287698, + "learning_rate": 0.0007324464608327538, + "loss": 0.90336835, + "num_input_tokens_seen": 157670992, + "router_z_loss_mlp": 0.4375, + "step": 1901, + "time_per_iteration": 2.6514644622802734 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050151, + "balance_loss_mlp": 1.00647259, + "epoch": 0.36590996537129666, + "flos": 435721815552.0, + "grad_norm": 0.0385016057803441, + "language_loss": 0.88887352, + "learning_rate": 0.0007321705864753758, + "loss": 0.89937502, + "num_input_tokens_seen": 157743616, + "router_z_loss_mlp": 0.4375, + "step": 1902, + "time_per_iteration": 2.6785683631896973 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045565, + "balance_loss_mlp": 1.00171947, + "epoch": 0.3661023470565602, + "flos": 713514270720.0, + "grad_norm": 0.027132815564249787, + "language_loss": 0.85073566, + "learning_rate": 0.0007318946219813823, + "loss": 0.86119133, + "num_input_tokens_seen": 157823520, + "router_z_loss_mlp": 0.43920898, + "step": 1903, + "time_per_iteration": 2.9874324798583984 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104835, + "balance_loss_mlp": 1.00431406, + "epoch": 0.3662947287418238, + "flos": 565823722752.0, + "grad_norm": 0.03452387251033087, + "language_loss": 0.90632051, + "learning_rate": 0.000731618567457912, + "loss": 0.91680402, + "num_input_tokens_seen": 157893248, + "router_z_loss_mlp": 0.44116211, + "step": 1904, + "time_per_iteration": 2.684290885925293 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049154, + "balance_loss_mlp": 1.00516582, + "epoch": 0.3664871104270873, + "flos": 791203110912.0, + "grad_norm": 0.032826620308443535, + "language_loss": 0.87174082, + "learning_rate": 0.000731342423012139, + "loss": 0.88223237, + "num_input_tokens_seen": 157973216, + "router_z_loss_mlp": 0.44067383, + "step": 1905, + "time_per_iteration": 3.0617177486419678 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051542, + "balance_loss_mlp": 1.00750625, + "epoch": 0.3666794921123509, + "flos": 753981561600.0, + "grad_norm": 0.03506961035904521, + "language_loss": 0.83108962, + "learning_rate": 0.0007310661887512722, + "loss": 0.84160507, + "num_input_tokens_seen": 158051088, + "router_z_loss_mlp": 0.44116211, + "step": 1906, + "time_per_iteration": 3.046901226043701 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045363, + "balance_loss_mlp": 1.0011121, + "epoch": 0.3668718737976145, + "flos": 524607935232.0, + "grad_norm": 0.03388484398579531, + "language_loss": 0.82964659, + "learning_rate": 0.0007307898647825549, + "loss": 0.84010023, + "num_input_tokens_seen": 158124368, + "router_z_loss_mlp": 0.44335938, + "step": 1907, + "time_per_iteration": 2.6592161655426025 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051828, + "balance_loss_mlp": 1.00767255, + "epoch": 0.367064255482878, + "flos": 573046205952.0, + "grad_norm": 0.03554957537225944, + "language_loss": 0.8992576, + "learning_rate": 0.0007305134512132659, + "loss": 0.90977585, + "num_input_tokens_seen": 158191472, + "router_z_loss_mlp": 0.44238281, + "step": 1908, + "time_per_iteration": 2.6961183547973633 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055033, + "balance_loss_mlp": 1.01078284, + "epoch": 0.3672566371681416, + "flos": 448054473216.0, + "grad_norm": 0.04018581054394134, + "language_loss": 0.843858, + "learning_rate": 0.0007302369481507183, + "loss": 0.85440832, + "num_input_tokens_seen": 158254384, + "router_z_loss_mlp": 0.44335938, + "step": 1909, + "time_per_iteration": 2.488203763961792 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01056026, + "balance_loss_mlp": 1.01358795, + "epoch": 0.36744901885340514, + "flos": 1543366893312.0, + "grad_norm": 0.00771809390988723, + "language_loss": 0.79961759, + "learning_rate": 0.00072996035570226, + "loss": 0.81017786, + "num_input_tokens_seen": 158486160, + "router_z_loss_mlp": 0.42480469, + "step": 1910, + "time_per_iteration": 4.828088045120239 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059395, + "balance_loss_mlp": 1.01457202, + "epoch": 0.36764140053866873, + "flos": 564762221568.0, + "grad_norm": 0.032014471163266715, + "language_loss": 0.86287534, + "learning_rate": 0.000729683673975274, + "loss": 0.87346923, + "num_input_tokens_seen": 158555616, + "router_z_loss_mlp": 0.44824219, + "step": 1911, + "time_per_iteration": 2.6982359886169434 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01058317, + "balance_loss_mlp": 1.01366162, + "epoch": 0.36783378222393226, + "flos": 1218652614144.0, + "grad_norm": 0.03007186425733569, + "language_loss": 0.8357197, + "learning_rate": 0.0007294069030771774, + "loss": 0.84630299, + "num_input_tokens_seen": 158653984, + "router_z_loss_mlp": 0.44702148, + "step": 1912, + "time_per_iteration": 3.6612210273742676 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049037, + "balance_loss_mlp": 1.0043577, + "epoch": 0.36802616390919585, + "flos": 499720988928.0, + "grad_norm": 0.03131225250708543, + "language_loss": 0.91280997, + "learning_rate": 0.0007291300431154224, + "loss": 0.92330033, + "num_input_tokens_seen": 158719728, + "router_z_loss_mlp": 0.44726562, + "step": 1913, + "time_per_iteration": 2.574129581451416 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053715, + "balance_loss_mlp": 1.01108551, + "epoch": 0.36821854559445943, + "flos": 1585618904064.0, + "grad_norm": 0.006266309435424964, + "language_loss": 0.70389736, + "learning_rate": 0.0007288530941974955, + "loss": 0.7144345, + "num_input_tokens_seen": 158952544, + "router_z_loss_mlp": 0.42675781, + "step": 1914, + "time_per_iteration": 4.960723876953125 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052577, + "balance_loss_mlp": 1.0082792, + "epoch": 0.36841092727972297, + "flos": 837090668544.0, + "grad_norm": 0.03136779226227803, + "language_loss": 0.80375087, + "learning_rate": 0.0007285760564309179, + "loss": 0.81427664, + "num_input_tokens_seen": 159039680, + "router_z_loss_mlp": 0.44384766, + "step": 1915, + "time_per_iteration": 3.0985960960388184 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010541, + "balance_loss_mlp": 1.00965917, + "epoch": 0.36860330896498655, + "flos": 691211085312.0, + "grad_norm": 0.031502418433557444, + "language_loss": 0.85988045, + "learning_rate": 0.0007282989299232448, + "loss": 0.87042141, + "num_input_tokens_seen": 159128128, + "router_z_loss_mlp": 0.4453125, + "step": 1916, + "time_per_iteration": 3.034715175628662 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055093, + "balance_loss_mlp": 1.01065195, + "epoch": 0.3687956906502501, + "flos": 555240791808.0, + "grad_norm": 0.03953946470073971, + "language_loss": 0.84794021, + "learning_rate": 0.0007280217147820668, + "loss": 0.85849106, + "num_input_tokens_seen": 159193248, + "router_z_loss_mlp": 0.4453125, + "step": 1917, + "time_per_iteration": 2.61297869682312 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053797, + "balance_loss_mlp": 1.0093317, + "epoch": 0.3689880723355137, + "flos": 577820043264.0, + "grad_norm": 0.030128455165502346, + "language_loss": 0.7994225, + "learning_rate": 0.0007277444111150079, + "loss": 0.80996048, + "num_input_tokens_seen": 159265824, + "router_z_loss_mlp": 0.44555664, + "step": 1918, + "time_per_iteration": 2.7244873046875 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052755, + "balance_loss_mlp": 1.00845671, + "epoch": 0.3691804540207772, + "flos": 529887250944.0, + "grad_norm": 0.035938670194894204, + "language_loss": 0.84948546, + "learning_rate": 0.0007274670190297272, + "loss": 0.86001301, + "num_input_tokens_seen": 159332992, + "router_z_loss_mlp": 0.44384766, + "step": 1919, + "time_per_iteration": 2.6209609508514404 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048627, + "balance_loss_mlp": 1.0041858, + "epoch": 0.3693728357060408, + "flos": 562181372928.0, + "grad_norm": 0.026922320390231402, + "language_loss": 0.82273662, + "learning_rate": 0.0007271895386339179, + "loss": 0.83322287, + "num_input_tokens_seen": 159409808, + "router_z_loss_mlp": 0.4453125, + "step": 1920, + "time_per_iteration": 2.7952609062194824 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047195, + "balance_loss_mlp": 1.00292087, + "epoch": 0.3695652173913043, + "flos": 580900534272.0, + "grad_norm": 0.03055527362799568, + "language_loss": 0.83712995, + "learning_rate": 0.0007269119700353073, + "loss": 0.84760189, + "num_input_tokens_seen": 159486128, + "router_z_loss_mlp": 0.44360352, + "step": 1921, + "time_per_iteration": 2.808595895767212 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049468, + "balance_loss_mlp": 1.00519335, + "epoch": 0.3697575990765679, + "flos": 514059997440.0, + "grad_norm": 0.029192022992987326, + "language_loss": 0.85655916, + "learning_rate": 0.0007266343133416571, + "loss": 0.86705387, + "num_input_tokens_seen": 159562224, + "router_z_loss_mlp": 0.44360352, + "step": 1922, + "time_per_iteration": 2.7229409217834473 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045757, + "balance_loss_mlp": 1.00255585, + "epoch": 0.3699499807618315, + "flos": 1573906430976.0, + "grad_norm": 0.004633598174219594, + "language_loss": 0.77116919, + "learning_rate": 0.0007263565686607632, + "loss": 0.7816267, + "num_input_tokens_seen": 159784768, + "router_z_loss_mlp": 0.43261719, + "step": 1923, + "time_per_iteration": 4.855220556259155 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049661, + "balance_loss_mlp": 1.00526702, + "epoch": 0.37014236244709503, + "flos": 498325095936.0, + "grad_norm": 0.04063724538866958, + "language_loss": 0.84789312, + "learning_rate": 0.0007260787361004556, + "loss": 0.85838968, + "num_input_tokens_seen": 159848608, + "router_z_loss_mlp": 0.44482422, + "step": 1924, + "time_per_iteration": 2.5634405612945557 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063278, + "balance_loss_mlp": 1.01998138, + "epoch": 0.3703347441323586, + "flos": 1447608233472.0, + "grad_norm": 0.011285785538321925, + "language_loss": 0.73761505, + "learning_rate": 0.0007258008157685987, + "loss": 0.7482478, + "num_input_tokens_seen": 160080928, + "router_z_loss_mlp": 0.43359375, + "step": 1925, + "time_per_iteration": 4.881471157073975 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050604, + "balance_loss_mlp": 1.00601971, + "epoch": 0.37052712581762215, + "flos": 564714589440.0, + "grad_norm": 0.030700116077417884, + "language_loss": 0.87676865, + "learning_rate": 0.0007255228077730903, + "loss": 0.88727468, + "num_input_tokens_seen": 160148976, + "router_z_loss_mlp": 0.44628906, + "step": 1926, + "time_per_iteration": 2.6604056358337402 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048583, + "balance_loss_mlp": 1.00426066, + "epoch": 0.37071950750288574, + "flos": 927571958016.0, + "grad_norm": 0.030848240929213684, + "language_loss": 0.82266426, + "learning_rate": 0.0007252447122218632, + "loss": 0.83315009, + "num_input_tokens_seen": 160233504, + "router_z_loss_mlp": 0.4440918, + "step": 1927, + "time_per_iteration": 3.189232110977173 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048396, + "balance_loss_mlp": 1.00424135, + "epoch": 0.37091188918814927, + "flos": 419201478912.0, + "grad_norm": 0.038028798643346066, + "language_loss": 0.88517463, + "learning_rate": 0.0007249665292228834, + "loss": 0.89565861, + "num_input_tokens_seen": 160299696, + "router_z_loss_mlp": 0.44238281, + "step": 1928, + "time_per_iteration": 2.6051783561706543 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048517, + "balance_loss_mlp": 1.00443351, + "epoch": 0.37110427087341286, + "flos": 464147099136.0, + "grad_norm": 0.03246756835091633, + "language_loss": 0.8426615, + "learning_rate": 0.000724688258884151, + "loss": 0.85314661, + "num_input_tokens_seen": 160367904, + "router_z_loss_mlp": 0.44165039, + "step": 1929, + "time_per_iteration": 2.5537402629852295 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105112, + "balance_loss_mlp": 1.00703681, + "epoch": 0.3712966525586764, + "flos": 851081679360.0, + "grad_norm": 0.026814038228573516, + "language_loss": 0.86998665, + "learning_rate": 0.0007244099013137002, + "loss": 0.88049793, + "num_input_tokens_seen": 160453600, + "router_z_loss_mlp": 0.44165039, + "step": 1930, + "time_per_iteration": 3.091195821762085 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052812, + "balance_loss_mlp": 1.00901484, + "epoch": 0.37148903424394, + "flos": 927559319040.0, + "grad_norm": 0.03484228463474462, + "language_loss": 0.89224607, + "learning_rate": 0.0007241314566195993, + "loss": 0.90277416, + "num_input_tokens_seen": 160543472, + "router_z_loss_mlp": 0.4387207, + "step": 1931, + "time_per_iteration": 3.2276151180267334 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050081, + "balance_loss_mlp": 1.00616395, + "epoch": 0.37168141592920356, + "flos": 520821722112.0, + "grad_norm": 0.033577876196724185, + "language_loss": 0.86394525, + "learning_rate": 0.0007238529249099496, + "loss": 0.87444603, + "num_input_tokens_seen": 160614016, + "router_z_loss_mlp": 0.43994141, + "step": 1932, + "time_per_iteration": 2.6099538803100586 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043194, + "balance_loss_mlp": 1.00075531, + "epoch": 0.3718737976144671, + "flos": 1449062452224.0, + "grad_norm": 0.005805601038449312, + "language_loss": 0.77856874, + "learning_rate": 0.0007235743062928872, + "loss": 0.78900075, + "num_input_tokens_seen": 160828640, + "router_z_loss_mlp": 0.42480469, + "step": 1933, + "time_per_iteration": 4.864013910293579 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051383, + "balance_loss_mlp": 1.00741839, + "epoch": 0.3720661792997307, + "flos": 760954223616.0, + "grad_norm": 0.031651541573232696, + "language_loss": 0.81381935, + "learning_rate": 0.000723295600876581, + "loss": 0.82433319, + "num_input_tokens_seen": 160913088, + "router_z_loss_mlp": 0.44042969, + "step": 1934, + "time_per_iteration": 3.003988742828369 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047402, + "balance_loss_mlp": 1.00353265, + "epoch": 0.3722585609849942, + "flos": 518045487360.0, + "grad_norm": 0.031160015664157277, + "language_loss": 0.88386387, + "learning_rate": 0.0007230168087692344, + "loss": 0.89433783, + "num_input_tokens_seen": 160982960, + "router_z_loss_mlp": 0.43945312, + "step": 1935, + "time_per_iteration": 2.6490824222564697 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045452, + "balance_loss_mlp": 1.00165451, + "epoch": 0.3724509426702578, + "flos": 783869812224.0, + "grad_norm": 0.03743087194604022, + "language_loss": 0.82867873, + "learning_rate": 0.0007227379300790839, + "loss": 0.83913326, + "num_input_tokens_seen": 161066000, + "router_z_loss_mlp": 0.4387207, + "step": 1936, + "time_per_iteration": 3.010700225830078 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044084, + "balance_loss_mlp": 1.00011992, + "epoch": 0.37264332435552133, + "flos": 392599799040.0, + "grad_norm": 0.032423549870759565, + "language_loss": 0.86443603, + "learning_rate": 0.0007224589649143997, + "loss": 0.87487686, + "num_input_tokens_seen": 161131040, + "router_z_loss_mlp": 0.44042969, + "step": 1937, + "time_per_iteration": 2.54010272026062 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044521, + "balance_loss_mlp": 1.00072384, + "epoch": 0.3728357060407849, + "flos": 543913254912.0, + "grad_norm": 0.03387233199209411, + "language_loss": 0.81436574, + "learning_rate": 0.0007221799133834861, + "loss": 0.82481098, + "num_input_tokens_seen": 161201248, + "router_z_loss_mlp": 0.4387207, + "step": 1938, + "time_per_iteration": 2.6355655193328857 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045317, + "balance_loss_mlp": 1.00154293, + "epoch": 0.3730280877260485, + "flos": 434484370176.0, + "grad_norm": 0.03416430777388856, + "language_loss": 0.82122993, + "learning_rate": 0.00072190077559468, + "loss": 0.83168304, + "num_input_tokens_seen": 161266288, + "router_z_loss_mlp": 0.43847656, + "step": 1939, + "time_per_iteration": 2.5033867359161377 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049643, + "balance_loss_mlp": 1.00579786, + "epoch": 0.37322046941131204, + "flos": 532511841024.0, + "grad_norm": 0.031902006564455146, + "language_loss": 0.89473069, + "learning_rate": 0.0007216215516563527, + "loss": 0.90522707, + "num_input_tokens_seen": 161335648, + "router_z_loss_mlp": 0.43920898, + "step": 1940, + "time_per_iteration": 2.685201406478882 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049207, + "balance_loss_mlp": 1.00538588, + "epoch": 0.3734128510965756, + "flos": 532576969728.0, + "grad_norm": 0.03682978505173481, + "language_loss": 0.83770883, + "learning_rate": 0.0007213422416769083, + "loss": 0.84820092, + "num_input_tokens_seen": 161403440, + "router_z_loss_mlp": 0.43896484, + "step": 1941, + "time_per_iteration": 2.5981826782226562 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104851, + "balance_loss_mlp": 1.00454593, + "epoch": 0.37360523278183916, + "flos": 501433777152.0, + "grad_norm": 0.029644951468961563, + "language_loss": 0.75750655, + "learning_rate": 0.0007210628457647849, + "loss": 0.76799166, + "num_input_tokens_seen": 161472864, + "router_z_loss_mlp": 0.44042969, + "step": 1942, + "time_per_iteration": 2.5780391693115234 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047618, + "balance_loss_mlp": 1.00365365, + "epoch": 0.37379761446710275, + "flos": 549112857600.0, + "grad_norm": 0.03283775645447924, + "language_loss": 0.79155779, + "learning_rate": 0.000720783364028453, + "loss": 0.80203396, + "num_input_tokens_seen": 161548096, + "router_z_loss_mlp": 0.44042969, + "step": 1943, + "time_per_iteration": 2.7498555183410645 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052612, + "balance_loss_mlp": 1.0085758, + "epoch": 0.3739899961523663, + "flos": 476740271616.0, + "grad_norm": 0.03229344723146533, + "language_loss": 0.88345349, + "learning_rate": 0.0007205037965764177, + "loss": 0.89397967, + "num_input_tokens_seen": 161615600, + "router_z_loss_mlp": 0.44116211, + "step": 1944, + "time_per_iteration": 2.559565305709839 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049461, + "balance_loss_mlp": 1.00533009, + "epoch": 0.37418237783762986, + "flos": 613077037824.0, + "grad_norm": 0.033726561022773015, + "language_loss": 0.85856438, + "learning_rate": 0.0007202241435172161, + "loss": 0.86905897, + "num_input_tokens_seen": 161687408, + "router_z_loss_mlp": 0.44213867, + "step": 1945, + "time_per_iteration": 2.7495012283325195 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105036, + "balance_loss_mlp": 1.00618136, + "epoch": 0.3743747595228934, + "flos": 767629432320.0, + "grad_norm": 0.030482282234963888, + "language_loss": 0.88839138, + "learning_rate": 0.0007199444049594198, + "loss": 0.89889503, + "num_input_tokens_seen": 161764224, + "router_z_loss_mlp": 0.44262695, + "step": 1946, + "time_per_iteration": 2.927438259124756 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105097, + "balance_loss_mlp": 1.00679135, + "epoch": 0.374567141208157, + "flos": 525491546880.0, + "grad_norm": 0.03274984488565387, + "language_loss": 0.84098482, + "learning_rate": 0.0007196645810116322, + "loss": 0.85149455, + "num_input_tokens_seen": 161835520, + "router_z_loss_mlp": 0.44262695, + "step": 1947, + "time_per_iteration": 2.669954538345337 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051405, + "balance_loss_mlp": 1.00717854, + "epoch": 0.37475952289342057, + "flos": 682614096384.0, + "grad_norm": 0.03500222096290466, + "language_loss": 0.84308642, + "learning_rate": 0.0007193846717824912, + "loss": 0.85360044, + "num_input_tokens_seen": 161912000, + "router_z_loss_mlp": 0.44311523, + "step": 1948, + "time_per_iteration": 2.873595714569092 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054386, + "balance_loss_mlp": 1.01018322, + "epoch": 0.3749519045786841, + "flos": 461216307456.0, + "grad_norm": 0.03758393676626501, + "language_loss": 0.89286113, + "learning_rate": 0.0007191046773806669, + "loss": 0.90340507, + "num_input_tokens_seen": 161977296, + "router_z_loss_mlp": 0.44287109, + "step": 1949, + "time_per_iteration": 2.5632805824279785 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052491, + "balance_loss_mlp": 1.00816894, + "epoch": 0.3751442862639477, + "flos": 956388013824.0, + "grad_norm": 0.04355990755149793, + "language_loss": 0.83803475, + "learning_rate": 0.0007188245979148631, + "loss": 0.84855968, + "num_input_tokens_seen": 162051888, + "router_z_loss_mlp": 0.4440918, + "step": 1950, + "time_per_iteration": 3.153048515319824 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050564, + "balance_loss_mlp": 1.00619411, + "epoch": 0.3753366679492112, + "flos": 528806307840.0, + "grad_norm": 0.034134677221205334, + "language_loss": 0.88437903, + "learning_rate": 0.0007185444334938157, + "loss": 0.89488459, + "num_input_tokens_seen": 162124384, + "router_z_loss_mlp": 0.44458008, + "step": 1951, + "time_per_iteration": 2.77795147895813 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052982, + "balance_loss_mlp": 1.0084213, + "epoch": 0.3755290496344748, + "flos": 522849460224.0, + "grad_norm": 0.03641649118573359, + "language_loss": 0.85489821, + "learning_rate": 0.0007182641842262947, + "loss": 0.86542803, + "num_input_tokens_seen": 162191440, + "router_z_loss_mlp": 0.44628906, + "step": 1952, + "time_per_iteration": 2.6038033962249756 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063178, + "balance_loss_mlp": 1.01852179, + "epoch": 0.37572143131973834, + "flos": 622372945920.0, + "grad_norm": 0.036303705105214745, + "language_loss": 0.78406018, + "learning_rate": 0.0007179838502211022, + "loss": 0.79469192, + "num_input_tokens_seen": 162268480, + "router_z_loss_mlp": 0.44702148, + "step": 1953, + "time_per_iteration": 2.8537991046905518 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050235, + "balance_loss_mlp": 1.00565112, + "epoch": 0.37591381300500193, + "flos": 772274957568.0, + "grad_norm": 0.033405608161133214, + "language_loss": 0.87193865, + "learning_rate": 0.0007177034315870738, + "loss": 0.88244104, + "num_input_tokens_seen": 162346752, + "router_z_loss_mlp": 0.44677734, + "step": 1954, + "time_per_iteration": 2.9944725036621094 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049324, + "balance_loss_mlp": 1.00469208, + "epoch": 0.37610619469026546, + "flos": 521481757440.0, + "grad_norm": 0.05036646851246907, + "language_loss": 0.91552407, + "learning_rate": 0.0007174229284330773, + "loss": 0.92601728, + "num_input_tokens_seen": 162415120, + "router_z_loss_mlp": 0.44702148, + "step": 1955, + "time_per_iteration": 2.607128143310547 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046853, + "balance_loss_mlp": 1.0023644, + "epoch": 0.37629857637552905, + "flos": 599971584000.0, + "grad_norm": 0.029911324472659546, + "language_loss": 0.87468076, + "learning_rate": 0.0007171423408680141, + "loss": 0.88514924, + "num_input_tokens_seen": 162493280, + "router_z_loss_mlp": 0.44555664, + "step": 1956, + "time_per_iteration": 2.8234241008758545 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047093, + "balance_loss_mlp": 1.00272334, + "epoch": 0.37649095806079264, + "flos": 566019108864.0, + "grad_norm": 0.03303955535560464, + "language_loss": 0.90624022, + "learning_rate": 0.0007168616690008176, + "loss": 0.91671115, + "num_input_tokens_seen": 162560736, + "router_z_loss_mlp": 0.44458008, + "step": 1957, + "time_per_iteration": 2.645219326019287 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047066, + "balance_loss_mlp": 1.00271976, + "epoch": 0.37668333974605617, + "flos": 593569529088.0, + "grad_norm": 0.03512927569377508, + "language_loss": 0.86650079, + "learning_rate": 0.0007165809129404545, + "loss": 0.87697142, + "num_input_tokens_seen": 162630688, + "router_z_loss_mlp": 0.44433594, + "step": 1958, + "time_per_iteration": 2.762319564819336 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105165, + "balance_loss_mlp": 1.00742376, + "epoch": 0.37687572143131975, + "flos": 420365047296.0, + "grad_norm": 0.03381206580119959, + "language_loss": 0.8673501, + "learning_rate": 0.0007163000727959239, + "loss": 0.87786663, + "num_input_tokens_seen": 162694304, + "router_z_loss_mlp": 0.44311523, + "step": 1959, + "time_per_iteration": 2.4887454509735107 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047108, + "balance_loss_mlp": 1.00466919, + "epoch": 0.3770681031165833, + "flos": 1360387269888.0, + "grad_norm": 0.007286715675134549, + "language_loss": 0.77959073, + "learning_rate": 0.0007160191486762575, + "loss": 0.79006183, + "num_input_tokens_seen": 162920336, + "router_z_loss_mlp": 0.42480469, + "step": 1960, + "time_per_iteration": 4.844388961791992 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053466, + "balance_loss_mlp": 1.00938201, + "epoch": 0.3772604848018469, + "flos": 646154649600.0, + "grad_norm": 0.030030705089392724, + "language_loss": 0.85244703, + "learning_rate": 0.00071573814069052, + "loss": 0.86298174, + "num_input_tokens_seen": 163000720, + "router_z_loss_mlp": 0.44165039, + "step": 1961, + "time_per_iteration": 2.93870210647583 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043853, + "balance_loss_mlp": 0.99976981, + "epoch": 0.3774528664871104, + "flos": 903202150656.0, + "grad_norm": 0.029467737659617427, + "language_loss": 0.88618672, + "learning_rate": 0.0007154570489478081, + "loss": 0.89662528, + "num_input_tokens_seen": 163085680, + "router_z_loss_mlp": 0.44165039, + "step": 1962, + "time_per_iteration": 3.2101829051971436 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046572, + "balance_loss_mlp": 1.00241697, + "epoch": 0.377645248172374, + "flos": 789464077824.0, + "grad_norm": 0.02894999631439154, + "language_loss": 0.87102842, + "learning_rate": 0.0007151758735572514, + "loss": 0.88149416, + "num_input_tokens_seen": 163162224, + "router_z_loss_mlp": 0.44238281, + "step": 1963, + "time_per_iteration": 3.0217864513397217 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046842, + "balance_loss_mlp": 1.00282979, + "epoch": 0.3778376298576376, + "flos": 587925686016.0, + "grad_norm": 0.035422959183698866, + "language_loss": 0.81287247, + "learning_rate": 0.0007148946146280119, + "loss": 0.82334089, + "num_input_tokens_seen": 163237920, + "router_z_loss_mlp": 0.44091797, + "step": 1964, + "time_per_iteration": 2.9066553115844727 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01056515, + "balance_loss_mlp": 1.01407623, + "epoch": 0.3780300115429011, + "flos": 1399672528896.0, + "grad_norm": 0.012885740561533653, + "language_loss": 0.72192144, + "learning_rate": 0.000714613272269284, + "loss": 0.73248661, + "num_input_tokens_seen": 163455760, + "router_z_loss_mlp": 0.42480469, + "step": 1965, + "time_per_iteration": 4.874085426330566 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055702, + "balance_loss_mlp": 1.01316833, + "epoch": 0.3782223932281647, + "flos": 1360634178816.0, + "grad_norm": 0.008484298942656315, + "language_loss": 0.75341946, + "learning_rate": 0.0007143318465902943, + "loss": 0.76397645, + "num_input_tokens_seen": 163678064, + "router_z_loss_mlp": 0.42578125, + "step": 1966, + "time_per_iteration": 4.964066743850708 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048666, + "balance_loss_mlp": 1.00467777, + "epoch": 0.37841477491342823, + "flos": 705517046016.0, + "grad_norm": 0.02737284959483133, + "language_loss": 0.8436377, + "learning_rate": 0.0007140503377003022, + "loss": 0.85412437, + "num_input_tokens_seen": 163764320, + "router_z_loss_mlp": 0.44067383, + "step": 1967, + "time_per_iteration": 3.014033555984497 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105175, + "balance_loss_mlp": 1.00764298, + "epoch": 0.3786071565986918, + "flos": 530156514048.0, + "grad_norm": 0.03014770490429956, + "language_loss": 0.85294402, + "learning_rate": 0.000713768745708599, + "loss": 0.86346149, + "num_input_tokens_seen": 163831808, + "router_z_loss_mlp": 0.44189453, + "step": 1968, + "time_per_iteration": 2.6359875202178955 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052555, + "balance_loss_mlp": 1.0084002, + "epoch": 0.37879953828395535, + "flos": 994901443584.0, + "grad_norm": 0.03323886334735767, + "language_loss": 0.78270096, + "learning_rate": 0.0007134870707245085, + "loss": 0.79322648, + "num_input_tokens_seen": 163918128, + "router_z_loss_mlp": 0.44238281, + "step": 1969, + "time_per_iteration": 3.276670455932617 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054281, + "balance_loss_mlp": 1.01010239, + "epoch": 0.37899191996921894, + "flos": 627793212672.0, + "grad_norm": 0.033324026165203316, + "language_loss": 0.84867144, + "learning_rate": 0.0007132053128573864, + "loss": 0.85921425, + "num_input_tokens_seen": 163987552, + "router_z_loss_mlp": 0.44262695, + "step": 1970, + "time_per_iteration": 2.747647523880005 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051473, + "balance_loss_mlp": 1.00727034, + "epoch": 0.37918430165448247, + "flos": 687520136448.0, + "grad_norm": 0.034311044198206936, + "language_loss": 0.84702653, + "learning_rate": 0.0007129234722166211, + "loss": 0.85754126, + "num_input_tokens_seen": 164063248, + "router_z_loss_mlp": 0.44287109, + "step": 1971, + "time_per_iteration": 2.8502755165100098 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104953, + "balance_loss_mlp": 1.00535131, + "epoch": 0.37937668333974606, + "flos": 476618762496.0, + "grad_norm": 0.028798969169212138, + "language_loss": 0.91637433, + "learning_rate": 0.0007126415489116328, + "loss": 0.92686969, + "num_input_tokens_seen": 164133776, + "router_z_loss_mlp": 0.44262695, + "step": 1972, + "time_per_iteration": 2.703598737716675 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049775, + "balance_loss_mlp": 1.00559556, + "epoch": 0.37956906502500964, + "flos": 708825004032.0, + "grad_norm": 0.033945121596029554, + "language_loss": 0.81780016, + "learning_rate": 0.0007123595430518736, + "loss": 0.82829797, + "num_input_tokens_seen": 164206672, + "router_z_loss_mlp": 0.44262695, + "step": 1973, + "time_per_iteration": 2.859210252761841 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047681, + "balance_loss_mlp": 1.00345445, + "epoch": 0.3797614467102732, + "flos": 427559340288.0, + "grad_norm": 0.03504063937858188, + "language_loss": 0.86830699, + "learning_rate": 0.0007120774547468282, + "loss": 0.87878382, + "num_input_tokens_seen": 164271968, + "router_z_loss_mlp": 0.44311523, + "step": 1974, + "time_per_iteration": 2.5465054512023926 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105377, + "balance_loss_mlp": 1.00944817, + "epoch": 0.37995382839553676, + "flos": 482881811712.0, + "grad_norm": 0.031503790568027705, + "language_loss": 0.82317638, + "learning_rate": 0.0007117952841060128, + "loss": 0.83371413, + "num_input_tokens_seen": 164342800, + "router_z_loss_mlp": 0.4440918, + "step": 1975, + "time_per_iteration": 2.789965867996216 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053612, + "balance_loss_mlp": 1.00924242, + "epoch": 0.3801462100808003, + "flos": 561671036928.0, + "grad_norm": 0.03572346778222672, + "language_loss": 0.84539783, + "learning_rate": 0.0007115130312389756, + "loss": 0.85593396, + "num_input_tokens_seen": 164414928, + "router_z_loss_mlp": 0.44433594, + "step": 1976, + "time_per_iteration": 2.7104804515838623 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046782, + "balance_loss_mlp": 1.00236499, + "epoch": 0.3803385917660639, + "flos": 465888077568.0, + "grad_norm": 0.03508123942848817, + "language_loss": 0.80071044, + "learning_rate": 0.0007112306962552973, + "loss": 0.81117821, + "num_input_tokens_seen": 164483312, + "router_z_loss_mlp": 0.44506836, + "step": 1977, + "time_per_iteration": 2.644700527191162 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053171, + "balance_loss_mlp": 1.00863445, + "epoch": 0.3805309734513274, + "flos": 522905840640.0, + "grad_norm": 0.0297417361696937, + "language_loss": 0.8625899, + "learning_rate": 0.0007109482792645896, + "loss": 0.87312162, + "num_input_tokens_seen": 164555760, + "router_z_loss_mlp": 0.44580078, + "step": 1978, + "time_per_iteration": 2.736924171447754 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052388, + "balance_loss_mlp": 1.00780404, + "epoch": 0.380723355136591, + "flos": 592553714688.0, + "grad_norm": 0.03207088172149068, + "language_loss": 0.84620887, + "learning_rate": 0.0007106657803764969, + "loss": 0.85673285, + "num_input_tokens_seen": 164626768, + "router_z_loss_mlp": 0.44628906, + "step": 1979, + "time_per_iteration": 2.797027111053467 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053174, + "balance_loss_mlp": 1.00851822, + "epoch": 0.38091573682185453, + "flos": 623855354880.0, + "grad_norm": 0.034228405400289826, + "language_loss": 0.82734859, + "learning_rate": 0.0007103831997006948, + "loss": 0.83788031, + "num_input_tokens_seen": 164698016, + "router_z_loss_mlp": 0.4465332, + "step": 1980, + "time_per_iteration": 2.774831771850586 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050596, + "balance_loss_mlp": 1.00601208, + "epoch": 0.3811081185071181, + "flos": 570176652288.0, + "grad_norm": 0.02916230611543443, + "language_loss": 0.85986841, + "learning_rate": 0.0007101005373468908, + "loss": 0.87037432, + "num_input_tokens_seen": 164780320, + "router_z_loss_mlp": 0.4465332, + "step": 1981, + "time_per_iteration": 2.889430284500122 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051202, + "balance_loss_mlp": 1.00647449, + "epoch": 0.3813005001923817, + "flos": 585991266816.0, + "grad_norm": 0.029260882769569122, + "language_loss": 0.87282979, + "learning_rate": 0.0007098177934248242, + "loss": 0.88334191, + "num_input_tokens_seen": 164854400, + "router_z_loss_mlp": 0.44726562, + "step": 1982, + "time_per_iteration": 2.734011173248291 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049702, + "balance_loss_mlp": 1.00509369, + "epoch": 0.38149288187764524, + "flos": 622811350272.0, + "grad_norm": 0.03279838714755621, + "language_loss": 0.86164075, + "learning_rate": 0.0007095349680442661, + "loss": 0.87213778, + "num_input_tokens_seen": 164932896, + "router_z_loss_mlp": 0.44677734, + "step": 1983, + "time_per_iteration": 2.8532214164733887 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049432, + "balance_loss_mlp": 1.00496709, + "epoch": 0.3816852635629088, + "flos": 571798066944.0, + "grad_norm": 0.03407469020321441, + "language_loss": 0.79342288, + "learning_rate": 0.0007092520613150188, + "loss": 0.80391723, + "num_input_tokens_seen": 165002896, + "router_z_loss_mlp": 0.4453125, + "step": 1984, + "time_per_iteration": 2.6656527519226074 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055001, + "balance_loss_mlp": 1.01058352, + "epoch": 0.38187764524817236, + "flos": 566679144192.0, + "grad_norm": 0.03287674379309895, + "language_loss": 0.81891948, + "learning_rate": 0.0007089690733469165, + "loss": 0.82946956, + "num_input_tokens_seen": 165074704, + "router_z_loss_mlp": 0.44506836, + "step": 1985, + "time_per_iteration": 2.6921868324279785 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104986, + "balance_loss_mlp": 1.00544298, + "epoch": 0.38207002693343595, + "flos": 632399854080.0, + "grad_norm": 0.03591516825864857, + "language_loss": 0.8265506, + "learning_rate": 0.000708686004249825, + "loss": 0.83704919, + "num_input_tokens_seen": 165149136, + "router_z_loss_mlp": 0.44506836, + "step": 1986, + "time_per_iteration": 2.771472454071045 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046713, + "balance_loss_mlp": 1.0026772, + "epoch": 0.3822624086186995, + "flos": 549841912320.0, + "grad_norm": 0.027805852633017242, + "language_loss": 0.91746366, + "learning_rate": 0.0007084028541336413, + "loss": 0.92793083, + "num_input_tokens_seen": 165220864, + "router_z_loss_mlp": 0.44116211, + "step": 1987, + "time_per_iteration": 2.7168381214141846 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049881, + "balance_loss_mlp": 1.00572634, + "epoch": 0.38245479030396307, + "flos": 615067837440.0, + "grad_norm": 0.03052630202850825, + "language_loss": 0.86906445, + "learning_rate": 0.0007081196231082942, + "loss": 0.87956333, + "num_input_tokens_seen": 165301568, + "router_z_loss_mlp": 0.44238281, + "step": 1988, + "time_per_iteration": 2.8021280765533447 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104727, + "balance_loss_mlp": 1.00325835, + "epoch": 0.38264717198922665, + "flos": 669304508160.0, + "grad_norm": 0.03253134732635267, + "language_loss": 0.8090933, + "learning_rate": 0.0007078363112837436, + "loss": 0.81956601, + "num_input_tokens_seen": 165373152, + "router_z_loss_mlp": 0.44091797, + "step": 1989, + "time_per_iteration": 2.812901020050049 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046216, + "balance_loss_mlp": 1.00232375, + "epoch": 0.3828395536744902, + "flos": 455687170560.0, + "grad_norm": 0.03353740504071411, + "language_loss": 0.8610149, + "learning_rate": 0.000707552918769981, + "loss": 0.87147707, + "num_input_tokens_seen": 165439136, + "router_z_loss_mlp": 0.43969727, + "step": 1990, + "time_per_iteration": 2.503817081451416 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047528, + "balance_loss_mlp": 1.0038017, + "epoch": 0.3830319353597538, + "flos": 500483091456.0, + "grad_norm": 0.030831133245435974, + "language_loss": 0.84298265, + "learning_rate": 0.000707269445677029, + "loss": 0.85345787, + "num_input_tokens_seen": 165514624, + "router_z_loss_mlp": 0.43798828, + "step": 1991, + "time_per_iteration": 2.77250599861145 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047533, + "balance_loss_mlp": 1.00373507, + "epoch": 0.3832243170450173, + "flos": 745467197952.0, + "grad_norm": 0.03142895241328533, + "language_loss": 0.85860848, + "learning_rate": 0.0007069858921149416, + "loss": 0.86908376, + "num_input_tokens_seen": 165594512, + "router_z_loss_mlp": 0.4387207, + "step": 1992, + "time_per_iteration": 3.001058578491211 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047239, + "balance_loss_mlp": 1.00363255, + "epoch": 0.3834166987302809, + "flos": 579346193664.0, + "grad_norm": 0.027707623231004064, + "language_loss": 0.86360574, + "learning_rate": 0.0007067022581938043, + "loss": 0.87407815, + "num_input_tokens_seen": 165673968, + "router_z_loss_mlp": 0.43676758, + "step": 1993, + "time_per_iteration": 2.896017551422119 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049302, + "balance_loss_mlp": 1.00579047, + "epoch": 0.3836090804155444, + "flos": 537609376512.0, + "grad_norm": 0.038344647976828676, + "language_loss": 0.83944476, + "learning_rate": 0.0007064185440237334, + "loss": 0.8499378, + "num_input_tokens_seen": 165747664, + "router_z_loss_mlp": 0.43579102, + "step": 1994, + "time_per_iteration": 2.8133461475372314 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051736, + "balance_loss_mlp": 1.00820076, + "epoch": 0.383801462100808, + "flos": 603052075008.0, + "grad_norm": 0.0304270283066245, + "language_loss": 0.85033917, + "learning_rate": 0.0007061347497148764, + "loss": 0.86085653, + "num_input_tokens_seen": 165824624, + "router_z_loss_mlp": 0.43603516, + "step": 1995, + "time_per_iteration": 2.829977035522461 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050646, + "balance_loss_mlp": 1.00694358, + "epoch": 0.38399384378607154, + "flos": 573799560192.0, + "grad_norm": 0.034646706108572276, + "language_loss": 0.86866224, + "learning_rate": 0.0007058508753774122, + "loss": 0.87916863, + "num_input_tokens_seen": 165896304, + "router_z_loss_mlp": 0.43774414, + "step": 1996, + "time_per_iteration": 2.684966564178467 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049901, + "balance_loss_mlp": 1.00629473, + "epoch": 0.38418622547133513, + "flos": 537780463104.0, + "grad_norm": 0.03333459391135046, + "language_loss": 0.87270373, + "learning_rate": 0.0007055669211215505, + "loss": 0.88320273, + "num_input_tokens_seen": 165961312, + "router_z_loss_mlp": 0.43676758, + "step": 1997, + "time_per_iteration": 2.623508930206299 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054497, + "balance_loss_mlp": 1.01079535, + "epoch": 0.3843786071565987, + "flos": 574014388224.0, + "grad_norm": 0.04127067736406929, + "language_loss": 0.78599155, + "learning_rate": 0.0007052828870575322, + "loss": 0.79653656, + "num_input_tokens_seen": 166028064, + "router_z_loss_mlp": 0.43774414, + "step": 1998, + "time_per_iteration": 2.644423723220825 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051343, + "balance_loss_mlp": 1.00761676, + "epoch": 0.38457098884186225, + "flos": 730080294144.0, + "grad_norm": 0.03146347648703673, + "language_loss": 0.87266672, + "learning_rate": 0.0007049987732956291, + "loss": 0.88318008, + "num_input_tokens_seen": 166110272, + "router_z_loss_mlp": 0.43798828, + "step": 1999, + "time_per_iteration": 2.963409185409546 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048157, + "balance_loss_mlp": 1.00447905, + "epoch": 0.38476337052712584, + "flos": 584621618688.0, + "grad_norm": 0.024706606255084192, + "language_loss": 0.83278054, + "learning_rate": 0.0007047145799461439, + "loss": 0.84326208, + "num_input_tokens_seen": 166193088, + "router_z_loss_mlp": 0.4375, + "step": 2000, + "time_per_iteration": 2.86661434173584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048325, + "balance_loss_mlp": 1.00459874, + "epoch": 0.38495575221238937, + "flos": 554159848704.0, + "grad_norm": 0.03147773281119346, + "language_loss": 0.83074015, + "learning_rate": 0.00070443030711941, + "loss": 0.84122348, + "num_input_tokens_seen": 166271776, + "router_z_loss_mlp": 0.43798828, + "step": 2001, + "time_per_iteration": 2.778719425201416 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045384, + "balance_loss_mlp": 1.00175321, + "epoch": 0.38514813389765296, + "flos": 655678024704.0, + "grad_norm": 0.03168685191580143, + "language_loss": 0.82975376, + "learning_rate": 0.0007041459549257924, + "loss": 0.84020758, + "num_input_tokens_seen": 166350000, + "router_z_loss_mlp": 0.43701172, + "step": 2002, + "time_per_iteration": 2.8597054481506348 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046243, + "balance_loss_mlp": 1.00261223, + "epoch": 0.3853405155829165, + "flos": 869647250688.0, + "grad_norm": 0.03552713767777679, + "language_loss": 0.78954732, + "learning_rate": 0.0007038615234756859, + "loss": 0.80000973, + "num_input_tokens_seen": 166434336, + "router_z_loss_mlp": 0.43701172, + "step": 2003, + "time_per_iteration": 3.167647123336792 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050486, + "balance_loss_mlp": 1.00697505, + "epoch": 0.3855328972681801, + "flos": 547469088768.0, + "grad_norm": 0.03596547507231522, + "language_loss": 0.84374714, + "learning_rate": 0.000703577012879517, + "loss": 0.85425198, + "num_input_tokens_seen": 166503952, + "router_z_loss_mlp": 0.43579102, + "step": 2004, + "time_per_iteration": 2.644718885421753 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047488, + "balance_loss_mlp": 1.00397706, + "epoch": 0.3857252789534436, + "flos": 535099492608.0, + "grad_norm": 0.03525407945169758, + "language_loss": 0.89214581, + "learning_rate": 0.0007032924232477423, + "loss": 0.90262067, + "num_input_tokens_seen": 166575168, + "router_z_loss_mlp": 0.43579102, + "step": 2005, + "time_per_iteration": 2.6340301036834717 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053847, + "balance_loss_mlp": 1.01023984, + "epoch": 0.3859176606387072, + "flos": 492767768832.0, + "grad_norm": 0.0325086763316175, + "language_loss": 0.80829036, + "learning_rate": 0.0007030077546908493, + "loss": 0.81882888, + "num_input_tokens_seen": 166647552, + "router_z_loss_mlp": 0.43676758, + "step": 2006, + "time_per_iteration": 2.6427574157714844 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051659, + "balance_loss_mlp": 1.00969696, + "epoch": 0.3861100423239708, + "flos": 1490158675968.0, + "grad_norm": 0.006099468603868092, + "language_loss": 0.83064663, + "learning_rate": 0.0007027230073193561, + "loss": 0.84116316, + "num_input_tokens_seen": 166875088, + "router_z_loss_mlp": 0.41992188, + "step": 2007, + "time_per_iteration": 4.792185068130493 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047511, + "balance_loss_mlp": 1.00383234, + "epoch": 0.3863024240092343, + "flos": 474693091584.0, + "grad_norm": 0.0379943815396184, + "language_loss": 0.79703128, + "learning_rate": 0.0007024381812438117, + "loss": 0.80750644, + "num_input_tokens_seen": 166939344, + "router_z_loss_mlp": 0.4375, + "step": 2008, + "time_per_iteration": 2.6320388317108154 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01058901, + "balance_loss_mlp": 1.0153178, + "epoch": 0.3864948056944979, + "flos": 717979961088.0, + "grad_norm": 0.04179543058298576, + "language_loss": 0.84345418, + "learning_rate": 0.0007021532765747951, + "loss": 0.85404319, + "num_input_tokens_seen": 167014992, + "router_z_loss_mlp": 0.43652344, + "step": 2009, + "time_per_iteration": 3.0408942699432373 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01057193, + "balance_loss_mlp": 1.01370513, + "epoch": 0.38668718737976143, + "flos": 728955609600.0, + "grad_norm": 0.033678441310908816, + "language_loss": 0.80296206, + "learning_rate": 0.0007018682934229162, + "loss": 0.81353402, + "num_input_tokens_seen": 167092096, + "router_z_loss_mlp": 0.43554688, + "step": 2010, + "time_per_iteration": 2.9119958877563477 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053718, + "balance_loss_mlp": 1.01025474, + "epoch": 0.386879569065025, + "flos": 526489864704.0, + "grad_norm": 0.031759350944825356, + "language_loss": 0.83489478, + "learning_rate": 0.0007015832318988152, + "loss": 0.84543192, + "num_input_tokens_seen": 167162144, + "router_z_loss_mlp": 0.43530273, + "step": 2011, + "time_per_iteration": 2.625828981399536 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048202, + "balance_loss_mlp": 1.00643158, + "epoch": 0.38707195075028855, + "flos": 1530727067136.0, + "grad_norm": 0.008010138125144308, + "language_loss": 0.73890078, + "learning_rate": 0.000701298092113163, + "loss": 0.74938273, + "num_input_tokens_seen": 167391536, + "router_z_loss_mlp": 0.41796875, + "step": 2012, + "time_per_iteration": 4.969848155975342 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049038, + "balance_loss_mlp": 1.00555038, + "epoch": 0.38726433243555214, + "flos": 558386411520.0, + "grad_norm": 0.029387859415775444, + "language_loss": 0.84841448, + "learning_rate": 0.0007010128741766604, + "loss": 0.85890484, + "num_input_tokens_seen": 167466000, + "router_z_loss_mlp": 0.43554688, + "step": 2013, + "time_per_iteration": 2.808583974838257 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045643, + "balance_loss_mlp": 1.00205982, + "epoch": 0.38745671412081567, + "flos": 554756700672.0, + "grad_norm": 0.037665143906504196, + "language_loss": 0.84820414, + "learning_rate": 0.0007007275782000391, + "loss": 0.85866058, + "num_input_tokens_seen": 167536144, + "router_z_loss_mlp": 0.43652344, + "step": 2014, + "time_per_iteration": 2.6201975345611572 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051317, + "balance_loss_mlp": 1.00775766, + "epoch": 0.38764909580607926, + "flos": 459345071616.0, + "grad_norm": 0.03590133597746071, + "language_loss": 0.85486585, + "learning_rate": 0.0007004422042940605, + "loss": 0.86537898, + "num_input_tokens_seen": 167600064, + "router_z_loss_mlp": 0.4362793, + "step": 2015, + "time_per_iteration": 2.5167059898376465 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051464, + "balance_loss_mlp": 1.00792837, + "epoch": 0.38784147749134285, + "flos": 523259674368.0, + "grad_norm": 0.036833384765870066, + "language_loss": 0.90223992, + "learning_rate": 0.0007001567525695169, + "loss": 0.9127546, + "num_input_tokens_seen": 167666576, + "router_z_loss_mlp": 0.43603516, + "step": 2016, + "time_per_iteration": 2.663416624069214 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042557, + "balance_loss_mlp": 0.99923599, + "epoch": 0.3880338591766064, + "flos": 667401191424.0, + "grad_norm": 0.027528515382714943, + "language_loss": 0.84397906, + "learning_rate": 0.0006998712231372303, + "loss": 0.85440457, + "num_input_tokens_seen": 167753296, + "router_z_loss_mlp": 0.43383789, + "step": 2017, + "time_per_iteration": 2.982222080230713 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047147, + "balance_loss_mlp": 1.00389743, + "epoch": 0.38822624086186996, + "flos": 595176359424.0, + "grad_norm": 0.028816590459513517, + "language_loss": 0.86776507, + "learning_rate": 0.0006995856161080532, + "loss": 0.87823659, + "num_input_tokens_seen": 167834080, + "router_z_loss_mlp": 0.43310547, + "step": 2018, + "time_per_iteration": 2.8449933528900146 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046566, + "balance_loss_mlp": 1.00300694, + "epoch": 0.3884186225471335, + "flos": 613682638080.0, + "grad_norm": 0.032032500930829794, + "language_loss": 0.82425624, + "learning_rate": 0.0006992999315928679, + "loss": 0.83472192, + "num_input_tokens_seen": 167912368, + "router_z_loss_mlp": 0.4362793, + "step": 2019, + "time_per_iteration": 2.803743362426758 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104734, + "balance_loss_mlp": 1.00401926, + "epoch": 0.3886110042323971, + "flos": 608244874752.0, + "grad_norm": 0.027721707471257077, + "language_loss": 0.86241317, + "learning_rate": 0.0006990141697025871, + "loss": 0.87288654, + "num_input_tokens_seen": 167991968, + "router_z_loss_mlp": 0.43383789, + "step": 2020, + "time_per_iteration": 2.7804739475250244 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046585, + "balance_loss_mlp": 1.00481415, + "epoch": 0.3888033859176606, + "flos": 1531196573952.0, + "grad_norm": 0.004554603876592686, + "language_loss": 0.76359642, + "learning_rate": 0.0006987283305481533, + "loss": 0.77406228, + "num_input_tokens_seen": 168212128, + "router_z_loss_mlp": 0.41796875, + "step": 2021, + "time_per_iteration": 4.76949667930603 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104808, + "balance_loss_mlp": 1.00478315, + "epoch": 0.3889957676029242, + "flos": 693672370176.0, + "grad_norm": 0.038162906437672096, + "language_loss": 0.8292582, + "learning_rate": 0.0006984424142405392, + "loss": 0.83973902, + "num_input_tokens_seen": 168287440, + "router_z_loss_mlp": 0.43359375, + "step": 2022, + "time_per_iteration": 2.7983930110931396 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049484, + "balance_loss_mlp": 1.00599611, + "epoch": 0.3891881492881878, + "flos": 516195638784.0, + "grad_norm": 0.03974199995652067, + "language_loss": 0.82402384, + "learning_rate": 0.0006981564208907474, + "loss": 0.83451867, + "num_input_tokens_seen": 168354704, + "router_z_loss_mlp": 0.43554688, + "step": 2023, + "time_per_iteration": 2.613600730895996 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050586, + "balance_loss_mlp": 1.00707471, + "epoch": 0.3893805309734513, + "flos": 630176729856.0, + "grad_norm": 0.03303002735023947, + "language_loss": 0.90586042, + "learning_rate": 0.0006978703506098102, + "loss": 0.91636622, + "num_input_tokens_seen": 168424272, + "router_z_loss_mlp": 0.43579102, + "step": 2024, + "time_per_iteration": 2.7258403301239014 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050901, + "balance_loss_mlp": 1.00748503, + "epoch": 0.3895729126587149, + "flos": 545207080704.0, + "grad_norm": 0.0334033578711094, + "language_loss": 0.88520938, + "learning_rate": 0.00069758420350879, + "loss": 0.89571834, + "num_input_tokens_seen": 168488912, + "router_z_loss_mlp": 0.43481445, + "step": 2025, + "time_per_iteration": 2.6406970024108887 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047711, + "balance_loss_mlp": 1.00427127, + "epoch": 0.38976529434397844, + "flos": 619407161088.0, + "grad_norm": 0.03600656764113765, + "language_loss": 0.86979783, + "learning_rate": 0.000697297979698779, + "loss": 0.88027489, + "num_input_tokens_seen": 168563248, + "router_z_loss_mlp": 0.43505859, + "step": 2026, + "time_per_iteration": 2.729025363922119 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046507, + "balance_loss_mlp": 1.00297225, + "epoch": 0.38995767602924203, + "flos": 836346062592.0, + "grad_norm": 0.030634369701250594, + "language_loss": 0.84155977, + "learning_rate": 0.0006970116792908992, + "loss": 0.85202479, + "num_input_tokens_seen": 168648272, + "router_z_loss_mlp": 0.43603516, + "step": 2027, + "time_per_iteration": 3.0780837535858154 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054265, + "balance_loss_mlp": 1.01070547, + "epoch": 0.39015005771450556, + "flos": 542647619328.0, + "grad_norm": 0.03376343400122794, + "language_loss": 0.81809974, + "learning_rate": 0.000696725302396302, + "loss": 0.82864237, + "num_input_tokens_seen": 168721760, + "router_z_loss_mlp": 0.4362793, + "step": 2028, + "time_per_iteration": 2.6632442474365234 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046425, + "balance_loss_mlp": 1.00277102, + "epoch": 0.39034243939976915, + "flos": 1009142275584.0, + "grad_norm": 0.030316104633677343, + "language_loss": 0.86213875, + "learning_rate": 0.0006964388491261692, + "loss": 0.872603, + "num_input_tokens_seen": 168803664, + "router_z_loss_mlp": 0.43725586, + "step": 2029, + "time_per_iteration": 3.2410776615142822 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052185, + "balance_loss_mlp": 1.00848317, + "epoch": 0.3905348210850327, + "flos": 680241272832.0, + "grad_norm": 0.03528753395725821, + "language_loss": 0.88294208, + "learning_rate": 0.0006961523195917114, + "loss": 0.89346391, + "num_input_tokens_seen": 168879184, + "router_z_loss_mlp": 0.43774414, + "step": 2030, + "time_per_iteration": 2.8754475116729736 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104739, + "balance_loss_mlp": 1.00375915, + "epoch": 0.39072720277029627, + "flos": 549989666304.0, + "grad_norm": 0.032806843563698423, + "language_loss": 0.78588331, + "learning_rate": 0.0006958657139041696, + "loss": 0.79635721, + "num_input_tokens_seen": 168957808, + "router_z_loss_mlp": 0.43701172, + "step": 2031, + "time_per_iteration": 2.7329561710357666 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047604, + "balance_loss_mlp": 1.00554657, + "epoch": 0.39091958445555985, + "flos": 1551054025728.0, + "grad_norm": 0.008088132411436895, + "language_loss": 0.76712966, + "learning_rate": 0.0006955790321748136, + "loss": 0.77760577, + "num_input_tokens_seen": 169194416, + "router_z_loss_mlp": 0.42089844, + "step": 2032, + "time_per_iteration": 4.958296298980713 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048975, + "balance_loss_mlp": 1.00529635, + "epoch": 0.3911119661408234, + "flos": 505052794368.0, + "grad_norm": 0.03533188094946227, + "language_loss": 0.78901434, + "learning_rate": 0.0006952922745149434, + "loss": 0.7995041, + "num_input_tokens_seen": 169263552, + "router_z_loss_mlp": 0.4375, + "step": 2033, + "time_per_iteration": 2.6192519664764404 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050645, + "balance_loss_mlp": 1.00684798, + "epoch": 0.391304347826087, + "flos": 558330031104.0, + "grad_norm": 0.032114717040763616, + "language_loss": 0.88009661, + "learning_rate": 0.000695005441035888, + "loss": 0.89060307, + "num_input_tokens_seen": 169333696, + "router_z_loss_mlp": 0.4387207, + "step": 2034, + "time_per_iteration": 2.6519060134887695 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045589, + "balance_loss_mlp": 1.00334167, + "epoch": 0.3914967295113505, + "flos": 1502944322304.0, + "grad_norm": 0.004600085335304226, + "language_loss": 0.73723435, + "learning_rate": 0.0006947185318490064, + "loss": 0.7476902, + "num_input_tokens_seen": 169556416, + "router_z_loss_mlp": 0.42285156, + "step": 2035, + "time_per_iteration": 4.875830888748169 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049757, + "balance_loss_mlp": 1.00581694, + "epoch": 0.3916891111966141, + "flos": 708330219264.0, + "grad_norm": 0.02756997110289995, + "language_loss": 0.81809461, + "learning_rate": 0.0006944315470656863, + "loss": 0.82859218, + "num_input_tokens_seen": 169643312, + "router_z_loss_mlp": 0.44018555, + "step": 2036, + "time_per_iteration": 2.9486818313598633 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104941, + "balance_loss_mlp": 1.00537384, + "epoch": 0.3918814928818776, + "flos": 557409480960.0, + "grad_norm": 0.03430912315299504, + "language_loss": 0.91194409, + "learning_rate": 0.000694144486797345, + "loss": 0.92243814, + "num_input_tokens_seen": 169712560, + "router_z_loss_mlp": 0.44116211, + "step": 2037, + "time_per_iteration": 2.661637783050537 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053711, + "balance_loss_mlp": 1.01155853, + "epoch": 0.3920738745671412, + "flos": 1541688131328.0, + "grad_norm": 0.009695617032389551, + "language_loss": 0.79520434, + "learning_rate": 0.0006938573511554296, + "loss": 0.80574143, + "num_input_tokens_seen": 169914912, + "router_z_loss_mlp": 0.421875, + "step": 2038, + "time_per_iteration": 4.676162004470825 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050597, + "balance_loss_mlp": 1.00672829, + "epoch": 0.39226625625240474, + "flos": 499805559552.0, + "grad_norm": 0.03059706599431713, + "language_loss": 0.9011066, + "learning_rate": 0.0006935701402514156, + "loss": 0.91161263, + "num_input_tokens_seen": 169978848, + "router_z_loss_mlp": 0.43945312, + "step": 2039, + "time_per_iteration": 2.5921828746795654 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01040813, + "balance_loss_mlp": 0.99837494, + "epoch": 0.39245863793766833, + "flos": 1350453680640.0, + "grad_norm": 0.0024785612799689367, + "language_loss": 0.73034894, + "learning_rate": 0.0006932828541968083, + "loss": 0.74075705, + "num_input_tokens_seen": 170211488, + "router_z_loss_mlp": 0.42480469, + "step": 2040, + "time_per_iteration": 4.920953273773193 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045626, + "balance_loss_mlp": 1.00180471, + "epoch": 0.3926510196229319, + "flos": 1348115873280.0, + "grad_norm": 0.032003611488688986, + "language_loss": 0.84899294, + "learning_rate": 0.0006929954931031422, + "loss": 0.85944915, + "num_input_tokens_seen": 170298528, + "router_z_loss_mlp": 0.43896484, + "step": 2041, + "time_per_iteration": 3.7454288005828857 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045245, + "balance_loss_mlp": 1.00144792, + "epoch": 0.39284340130819545, + "flos": 500604600576.0, + "grad_norm": 0.027328608847006428, + "language_loss": 0.89267606, + "learning_rate": 0.0006927080570819805, + "loss": 0.9031285, + "num_input_tokens_seen": 170365680, + "router_z_loss_mlp": 0.4387207, + "step": 2042, + "time_per_iteration": 2.6191000938415527 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049281, + "balance_loss_mlp": 1.00565004, + "epoch": 0.39303578299345904, + "flos": 521342751744.0, + "grad_norm": 0.03887631720492337, + "language_loss": 0.81479704, + "learning_rate": 0.0006924205462449161, + "loss": 0.82528985, + "num_input_tokens_seen": 170432224, + "router_z_loss_mlp": 0.43701172, + "step": 2043, + "time_per_iteration": 2.6156415939331055 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048281, + "balance_loss_mlp": 1.00467432, + "epoch": 0.39322816467872257, + "flos": 909539076864.0, + "grad_norm": 0.03230930456366714, + "language_loss": 0.82451463, + "learning_rate": 0.0006921329607035702, + "loss": 0.83499742, + "num_input_tokens_seen": 170517920, + "router_z_loss_mlp": 0.43676758, + "step": 2044, + "time_per_iteration": 3.248239040374756 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050512, + "balance_loss_mlp": 1.0066911, + "epoch": 0.39342054636398616, + "flos": 518642339328.0, + "grad_norm": 0.028076885263619615, + "language_loss": 0.88591248, + "learning_rate": 0.0006918453005695938, + "loss": 0.89641762, + "num_input_tokens_seen": 170589072, + "router_z_loss_mlp": 0.43896484, + "step": 2045, + "time_per_iteration": 2.6417062282562256 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048241, + "balance_loss_mlp": 1.00430059, + "epoch": 0.3936129280492497, + "flos": 549012735744.0, + "grad_norm": 0.027900695924135757, + "language_loss": 0.84910023, + "learning_rate": 0.0006915575659546662, + "loss": 0.85958266, + "num_input_tokens_seen": 170657856, + "router_z_loss_mlp": 0.44018555, + "step": 2046, + "time_per_iteration": 2.6784913539886475 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053966, + "balance_loss_mlp": 1.0100733, + "epoch": 0.3938053097345133, + "flos": 527141151744.0, + "grad_norm": 0.03448231278490725, + "language_loss": 0.81310439, + "learning_rate": 0.0006912697569704959, + "loss": 0.82364404, + "num_input_tokens_seen": 170723696, + "router_z_loss_mlp": 0.43969727, + "step": 2047, + "time_per_iteration": 2.6214752197265625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050494, + "balance_loss_mlp": 1.00679207, + "epoch": 0.39399769141977686, + "flos": 472589531136.0, + "grad_norm": 0.03168334850546869, + "language_loss": 0.87124646, + "learning_rate": 0.0006909818737288205, + "loss": 0.88175148, + "num_input_tokens_seen": 170789536, + "router_z_loss_mlp": 0.43774414, + "step": 2048, + "time_per_iteration": 2.6057982444763184 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051405, + "balance_loss_mlp": 1.00775015, + "epoch": 0.3941900731050404, + "flos": 502727602944.0, + "grad_norm": 0.03501112209435681, + "language_loss": 0.81578481, + "learning_rate": 0.000690693916341406, + "loss": 0.82629883, + "num_input_tokens_seen": 170859232, + "router_z_loss_mlp": 0.43725586, + "step": 2049, + "time_per_iteration": 2.6459243297576904 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052667, + "balance_loss_mlp": 1.00910771, + "epoch": 0.394382454790304, + "flos": 582007722240.0, + "grad_norm": 0.03071224069667877, + "language_loss": 0.83009964, + "learning_rate": 0.0006904058849200475, + "loss": 0.8406263, + "num_input_tokens_seen": 170931568, + "router_z_loss_mlp": 0.4362793, + "step": 2050, + "time_per_iteration": 2.766828775405884 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046395, + "balance_loss_mlp": 1.00243104, + "epoch": 0.3945748364755675, + "flos": 514845432576.0, + "grad_norm": 0.030877215482718844, + "language_loss": 0.85563171, + "learning_rate": 0.0006901177795765683, + "loss": 0.86609566, + "num_input_tokens_seen": 170999856, + "router_z_loss_mlp": 0.44042969, + "step": 2051, + "time_per_iteration": 2.659912109375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051919, + "balance_loss_mlp": 1.00807357, + "epoch": 0.3947672181608311, + "flos": 595058740992.0, + "grad_norm": 0.03343854917241654, + "language_loss": 0.821091, + "learning_rate": 0.0006898296004228213, + "loss": 0.8316102, + "num_input_tokens_seen": 171072320, + "router_z_loss_mlp": 0.43920898, + "step": 2052, + "time_per_iteration": 2.7115862369537354 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046516, + "balance_loss_mlp": 1.00455475, + "epoch": 0.39495959984609463, + "flos": 1551052080384.0, + "grad_norm": 0.003971648916451202, + "language_loss": 0.7812674, + "learning_rate": 0.0006895413475706873, + "loss": 0.79173255, + "num_input_tokens_seen": 171304128, + "router_z_loss_mlp": 0.41992188, + "step": 2053, + "time_per_iteration": 4.894740343093872 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051907, + "balance_loss_mlp": 1.00818145, + "epoch": 0.3951519815313582, + "flos": 497524109568.0, + "grad_norm": 0.03573797234588687, + "language_loss": 0.80267316, + "learning_rate": 0.0006892530211320763, + "loss": 0.81319225, + "num_input_tokens_seen": 171377392, + "router_z_loss_mlp": 0.43798828, + "step": 2054, + "time_per_iteration": 2.767686605453491 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104549, + "balance_loss_mlp": 1.00193131, + "epoch": 0.39534436321662175, + "flos": 532223136000.0, + "grad_norm": 0.03591265467553322, + "language_loss": 0.84680569, + "learning_rate": 0.000688964621218926, + "loss": 0.85726058, + "num_input_tokens_seen": 171447424, + "router_z_loss_mlp": 0.4362793, + "step": 2055, + "time_per_iteration": 2.6054694652557373 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048043, + "balance_loss_mlp": 1.004722, + "epoch": 0.39553674490188534, + "flos": 703725523200.0, + "grad_norm": 0.03424008758122415, + "language_loss": 0.8074584, + "learning_rate": 0.0006886761479432037, + "loss": 0.8179388, + "num_input_tokens_seen": 171519920, + "router_z_loss_mlp": 0.43383789, + "step": 2056, + "time_per_iteration": 2.8390727043151855 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047733, + "balance_loss_mlp": 1.0042696, + "epoch": 0.3957291265871489, + "flos": 410656979712.0, + "grad_norm": 0.03388460034269331, + "language_loss": 0.85256028, + "learning_rate": 0.0006883876014169045, + "loss": 0.86303759, + "num_input_tokens_seen": 171583856, + "router_z_loss_mlp": 0.43530273, + "step": 2057, + "time_per_iteration": 2.554170846939087 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051678, + "balance_loss_mlp": 1.00814319, + "epoch": 0.39592150827241246, + "flos": 619639485696.0, + "grad_norm": 0.03722447028160607, + "language_loss": 0.90694773, + "learning_rate": 0.000688098981752052, + "loss": 0.91746461, + "num_input_tokens_seen": 171656064, + "router_z_loss_mlp": 0.43603516, + "step": 2058, + "time_per_iteration": 2.733053684234619 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049177, + "balance_loss_mlp": 1.00568974, + "epoch": 0.39611388995767605, + "flos": 822721524480.0, + "grad_norm": 0.04279286873756595, + "language_loss": 0.80609208, + "learning_rate": 0.0006878102890606982, + "loss": 0.81658387, + "num_input_tokens_seen": 171738800, + "router_z_loss_mlp": 0.43554688, + "step": 2059, + "time_per_iteration": 3.084789752960205 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047553, + "balance_loss_mlp": 1.00416124, + "epoch": 0.3963062716429396, + "flos": 493214921472.0, + "grad_norm": 0.03961147378322192, + "language_loss": 0.81771576, + "learning_rate": 0.0006875215234549239, + "loss": 0.82819128, + "num_input_tokens_seen": 171803664, + "router_z_loss_mlp": 0.43457031, + "step": 2060, + "time_per_iteration": 2.5823421478271484 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046932, + "balance_loss_mlp": 1.00351596, + "epoch": 0.39649865332820317, + "flos": 585834764544.0, + "grad_norm": 0.03854635921535854, + "language_loss": 0.8654902, + "learning_rate": 0.0006872326850468376, + "loss": 0.87595946, + "num_input_tokens_seen": 171871968, + "router_z_loss_mlp": 0.43481445, + "step": 2061, + "time_per_iteration": 2.705690860748291 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048788, + "balance_loss_mlp": 1.0052762, + "epoch": 0.3966910350134667, + "flos": 459512267520.0, + "grad_norm": 0.037411346592439484, + "language_loss": 0.79843795, + "learning_rate": 0.0006869437739485762, + "loss": 0.80892581, + "num_input_tokens_seen": 171942368, + "router_z_loss_mlp": 0.43579102, + "step": 2062, + "time_per_iteration": 2.5978832244873047 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050469, + "balance_loss_mlp": 1.00710082, + "epoch": 0.3968834166987303, + "flos": 509615694336.0, + "grad_norm": 0.03224635872548594, + "language_loss": 0.93265009, + "learning_rate": 0.0006866547902723053, + "loss": 0.94315481, + "num_input_tokens_seen": 172012336, + "router_z_loss_mlp": 0.43432617, + "step": 2063, + "time_per_iteration": 2.7325148582458496 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048279, + "balance_loss_mlp": 1.00502992, + "epoch": 0.3970757983839938, + "flos": 573743179776.0, + "grad_norm": 0.0353853142482034, + "language_loss": 0.80804694, + "learning_rate": 0.000686365734130218, + "loss": 0.81852973, + "num_input_tokens_seen": 172084640, + "router_z_loss_mlp": 0.43310547, + "step": 2064, + "time_per_iteration": 2.719521999359131 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046826, + "balance_loss_mlp": 1.00350547, + "epoch": 0.3972681800692574, + "flos": 482586303744.0, + "grad_norm": 0.03284702600830507, + "language_loss": 0.8411094, + "learning_rate": 0.000686076605634536, + "loss": 0.8515777, + "num_input_tokens_seen": 172152992, + "router_z_loss_mlp": 0.43383789, + "step": 2065, + "time_per_iteration": 2.6333730220794678 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051782, + "balance_loss_mlp": 1.00822306, + "epoch": 0.397460561754521, + "flos": 488905733376.0, + "grad_norm": 0.0324228687482344, + "language_loss": 0.84781277, + "learning_rate": 0.0006857874048975088, + "loss": 0.85833061, + "num_input_tokens_seen": 172219312, + "router_z_loss_mlp": 0.4362793, + "step": 2066, + "time_per_iteration": 2.5906848907470703 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049991, + "balance_loss_mlp": 1.00659895, + "epoch": 0.3976529434397845, + "flos": 422896318464.0, + "grad_norm": 0.03171433053589848, + "language_loss": 0.8744958, + "learning_rate": 0.0006854981320314142, + "loss": 0.8849957, + "num_input_tokens_seen": 172282112, + "router_z_loss_mlp": 0.43457031, + "step": 2067, + "time_per_iteration": 2.4699788093566895 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045752, + "balance_loss_mlp": 1.00240779, + "epoch": 0.3978453251250481, + "flos": 546622415616.0, + "grad_norm": 0.03563960500295594, + "language_loss": 0.8728829, + "learning_rate": 0.0006852087871485579, + "loss": 0.88334048, + "num_input_tokens_seen": 172347872, + "router_z_loss_mlp": 0.43408203, + "step": 2068, + "time_per_iteration": 2.6414859294891357 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044908, + "balance_loss_mlp": 1.00163472, + "epoch": 0.39803770681031164, + "flos": 652002627072.0, + "grad_norm": 0.03732729296318665, + "language_loss": 0.82978511, + "learning_rate": 0.0006849193703612735, + "loss": 0.84023428, + "num_input_tokens_seen": 172418560, + "router_z_loss_mlp": 0.43334961, + "step": 2069, + "time_per_iteration": 2.791269063949585 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104332, + "balance_loss_mlp": 0.999928, + "epoch": 0.39823008849557523, + "flos": 741427272960.0, + "grad_norm": 0.030595728613543666, + "language_loss": 0.78243995, + "learning_rate": 0.0006846298817819225, + "loss": 0.79287314, + "num_input_tokens_seen": 172497984, + "router_z_loss_mlp": 0.43457031, + "step": 2070, + "time_per_iteration": 2.9561986923217773 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045511, + "balance_loss_mlp": 1.00235701, + "epoch": 0.39842247018083876, + "flos": 385889597184.0, + "grad_norm": 0.036398106493658954, + "language_loss": 0.81909132, + "learning_rate": 0.0006843403215228945, + "loss": 0.82954645, + "num_input_tokens_seen": 172560112, + "router_z_loss_mlp": 0.43212891, + "step": 2071, + "time_per_iteration": 2.4993679523468018 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045222, + "balance_loss_mlp": 1.00218797, + "epoch": 0.39861485186610235, + "flos": 534763155456.0, + "grad_norm": 0.028807086351499752, + "language_loss": 0.8150484, + "learning_rate": 0.0006840506896966065, + "loss": 0.82550067, + "num_input_tokens_seen": 172636192, + "router_z_loss_mlp": 0.4309082, + "step": 2072, + "time_per_iteration": 2.7684881687164307 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049113, + "balance_loss_mlp": 1.00595963, + "epoch": 0.39880723355136594, + "flos": 644413671168.0, + "grad_norm": 0.03625588542647267, + "language_loss": 0.83127856, + "learning_rate": 0.0006837609864155038, + "loss": 0.8417697, + "num_input_tokens_seen": 172715264, + "router_z_loss_mlp": 0.43212891, + "step": 2073, + "time_per_iteration": 2.8514270782470703 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051094, + "balance_loss_mlp": 1.00782108, + "epoch": 0.39899961523662947, + "flos": 516892612608.0, + "grad_norm": 0.031931162968107815, + "language_loss": 0.83936673, + "learning_rate": 0.0006834712117920592, + "loss": 0.84987766, + "num_input_tokens_seen": 172783456, + "router_z_loss_mlp": 0.43334961, + "step": 2074, + "time_per_iteration": 2.6099319458007812 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048921, + "balance_loss_mlp": 1.00583923, + "epoch": 0.39919199692189306, + "flos": 465338857728.0, + "grad_norm": 0.040350277752625376, + "language_loss": 0.86345923, + "learning_rate": 0.0006831813659387729, + "loss": 0.87394845, + "num_input_tokens_seen": 172848928, + "router_z_loss_mlp": 0.43139648, + "step": 2075, + "time_per_iteration": 2.5189003944396973 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047413, + "balance_loss_mlp": 1.00421119, + "epoch": 0.3993843786071566, + "flos": 532679036928.0, + "grad_norm": 0.031639049857806745, + "language_loss": 0.84865057, + "learning_rate": 0.0006828914489681733, + "loss": 0.85912478, + "num_input_tokens_seen": 172921152, + "router_z_loss_mlp": 0.43261719, + "step": 2076, + "time_per_iteration": 2.7052366733551025 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045652, + "balance_loss_mlp": 1.00252223, + "epoch": 0.3995767602924202, + "flos": 505024604160.0, + "grad_norm": 0.02906284980485529, + "language_loss": 0.85967886, + "learning_rate": 0.0006826014609928162, + "loss": 0.87013543, + "num_input_tokens_seen": 172998864, + "router_z_loss_mlp": 0.43188477, + "step": 2077, + "time_per_iteration": 2.7127158641815186 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046635, + "balance_loss_mlp": 1.00514984, + "epoch": 0.3997691419776837, + "flos": 1457473781760.0, + "grad_norm": 0.010869866041652092, + "language_loss": 0.83199388, + "learning_rate": 0.0006823114021252846, + "loss": 0.84246022, + "num_input_tokens_seen": 173219216, + "router_z_loss_mlp": 0.41503906, + "step": 2078, + "time_per_iteration": 4.8602213859558105 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048968, + "balance_loss_mlp": 1.00586236, + "epoch": 0.3999615236629473, + "flos": 531756541440.0, + "grad_norm": 0.03484656463436615, + "language_loss": 0.80513203, + "learning_rate": 0.0006820212724781896, + "loss": 0.81562173, + "num_input_tokens_seen": 173292000, + "router_z_loss_mlp": 0.43164062, + "step": 2079, + "time_per_iteration": 2.6769065856933594 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050357, + "balance_loss_mlp": 1.00732243, + "epoch": 0.4001539053482108, + "flos": 696362088960.0, + "grad_norm": 0.03370335981625205, + "language_loss": 0.84624374, + "learning_rate": 0.0006817310721641694, + "loss": 0.85674727, + "num_input_tokens_seen": 173365568, + "router_z_loss_mlp": 0.4309082, + "step": 2080, + "time_per_iteration": 2.8362321853637695 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049278, + "balance_loss_mlp": 1.00619566, + "epoch": 0.4003462870334744, + "flos": 521379690240.0, + "grad_norm": 0.0372462453928972, + "language_loss": 0.84107649, + "learning_rate": 0.00068144080129589, + "loss": 0.85156924, + "num_input_tokens_seen": 173430144, + "router_z_loss_mlp": 0.43139648, + "step": 2081, + "time_per_iteration": 2.673391342163086 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047044, + "balance_loss_mlp": 1.00400949, + "epoch": 0.400538668718738, + "flos": 493503626496.0, + "grad_norm": 0.03624950820375382, + "language_loss": 0.83452618, + "learning_rate": 0.0006811504599860441, + "loss": 0.84499657, + "num_input_tokens_seen": 173494464, + "router_z_loss_mlp": 0.4309082, + "step": 2082, + "time_per_iteration": 2.5872161388397217 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048557, + "balance_loss_mlp": 1.0056175, + "epoch": 0.40073105040400153, + "flos": 491452555776.0, + "grad_norm": 0.03058886918361784, + "language_loss": 0.86615109, + "learning_rate": 0.0006808600483473526, + "loss": 0.87663668, + "num_input_tokens_seen": 173577168, + "router_z_loss_mlp": 0.42993164, + "step": 2083, + "time_per_iteration": 2.9167916774749756 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044713, + "balance_loss_mlp": 1.00165451, + "epoch": 0.4009234320892651, + "flos": 563540327424.0, + "grad_norm": 0.029579631805043773, + "language_loss": 0.86442864, + "learning_rate": 0.0006805695664925629, + "loss": 0.87487578, + "num_input_tokens_seen": 173655632, + "router_z_loss_mlp": 0.43115234, + "step": 2084, + "time_per_iteration": 2.8129522800445557 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046102, + "balance_loss_mlp": 1.00328159, + "epoch": 0.40111581377452865, + "flos": 426853618176.0, + "grad_norm": 0.03869673141168483, + "language_loss": 0.84653956, + "learning_rate": 0.0006802790145344506, + "loss": 0.85700059, + "num_input_tokens_seen": 173719040, + "router_z_loss_mlp": 0.42871094, + "step": 2085, + "time_per_iteration": 2.4816439151763916 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047549, + "balance_loss_mlp": 1.00480056, + "epoch": 0.40130819545979224, + "flos": 613643754240.0, + "grad_norm": 0.033294901740297575, + "language_loss": 0.87748265, + "learning_rate": 0.0006799883925858176, + "loss": 0.88795811, + "num_input_tokens_seen": 173796704, + "router_z_loss_mlp": 0.42797852, + "step": 2086, + "time_per_iteration": 2.883460760116577 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010467, + "balance_loss_mlp": 1.00397515, + "epoch": 0.40150057714505577, + "flos": 524451432960.0, + "grad_norm": 0.03567087941007639, + "language_loss": 0.85852945, + "learning_rate": 0.0006796977007594933, + "loss": 0.86899644, + "num_input_tokens_seen": 173862352, + "router_z_loss_mlp": 0.42773438, + "step": 2087, + "time_per_iteration": 2.6274635791778564 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049167, + "balance_loss_mlp": 1.00641906, + "epoch": 0.40169295883031936, + "flos": 562554648576.0, + "grad_norm": 0.03237434691106299, + "language_loss": 0.86948609, + "learning_rate": 0.0006794069391683345, + "loss": 0.87997776, + "num_input_tokens_seen": 173935408, + "router_z_loss_mlp": 0.42797852, + "step": 2088, + "time_per_iteration": 2.7452995777130127 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044795, + "balance_loss_mlp": 1.00204611, + "epoch": 0.4018853405155829, + "flos": 520020735744.0, + "grad_norm": 0.03787206100605993, + "language_loss": 0.81785774, + "learning_rate": 0.0006791161079252248, + "loss": 0.82830572, + "num_input_tokens_seen": 174007152, + "router_z_loss_mlp": 0.42797852, + "step": 2089, + "time_per_iteration": 2.7205429077148438 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104968, + "balance_loss_mlp": 1.00683641, + "epoch": 0.4020777222008465, + "flos": 527288905728.0, + "grad_norm": 0.03117280194599123, + "language_loss": 0.83103907, + "learning_rate": 0.0006788252071430747, + "loss": 0.84153581, + "num_input_tokens_seen": 174074976, + "router_z_loss_mlp": 0.42895508, + "step": 2090, + "time_per_iteration": 2.659057378768921 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105285, + "balance_loss_mlp": 1.01000619, + "epoch": 0.40227010388611006, + "flos": 526841753088.0, + "grad_norm": 0.038447003118097976, + "language_loss": 0.86962426, + "learning_rate": 0.0006785342369348222, + "loss": 0.88015276, + "num_input_tokens_seen": 174149392, + "router_z_loss_mlp": 0.42895508, + "step": 2091, + "time_per_iteration": 2.7038679122924805 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046593, + "balance_loss_mlp": 1.00374973, + "epoch": 0.4024624855713736, + "flos": 433227482880.0, + "grad_norm": 0.04129881296644863, + "language_loss": 0.80178273, + "learning_rate": 0.0006782431974134316, + "loss": 0.81224871, + "num_input_tokens_seen": 174214656, + "router_z_loss_mlp": 0.42895508, + "step": 2092, + "time_per_iteration": 2.522822618484497 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044742, + "balance_loss_mlp": 1.00185025, + "epoch": 0.4026548672566372, + "flos": 768092136192.0, + "grad_norm": 0.028161411572745265, + "language_loss": 0.89556634, + "learning_rate": 0.0006779520886918949, + "loss": 0.90601373, + "num_input_tokens_seen": 174296064, + "router_z_loss_mlp": 0.42944336, + "step": 2093, + "time_per_iteration": 3.059269905090332 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051245, + "balance_loss_mlp": 1.00847256, + "epoch": 0.4028472489419007, + "flos": 644118163200.0, + "grad_norm": 0.031871945568835235, + "language_loss": 0.81586826, + "learning_rate": 0.0006776609108832301, + "loss": 0.82638067, + "num_input_tokens_seen": 174370896, + "router_z_loss_mlp": 0.42822266, + "step": 2094, + "time_per_iteration": 2.824986457824707 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050062, + "balance_loss_mlp": 1.00707567, + "epoch": 0.4030396306271643, + "flos": 492824149248.0, + "grad_norm": 0.03027887325873737, + "language_loss": 0.85679066, + "learning_rate": 0.0006773696641004828, + "loss": 0.86729133, + "num_input_tokens_seen": 174438448, + "router_z_loss_mlp": 0.43041992, + "step": 2095, + "time_per_iteration": 2.575521230697632 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050315, + "balance_loss_mlp": 1.00742352, + "epoch": 0.40323201231242783, + "flos": 903195347712.0, + "grad_norm": 0.03549236004367387, + "language_loss": 0.78398442, + "learning_rate": 0.0006770783484567247, + "loss": 0.7944876, + "num_input_tokens_seen": 174525952, + "router_z_loss_mlp": 0.42944336, + "step": 2096, + "time_per_iteration": 3.1476502418518066 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047183, + "balance_loss_mlp": 1.00417244, + "epoch": 0.4034243939976914, + "flos": 571730992896.0, + "grad_norm": 0.04456027219971551, + "language_loss": 0.86790794, + "learning_rate": 0.000676786964065055, + "loss": 0.87837982, + "num_input_tokens_seen": 174607200, + "router_z_loss_mlp": 0.43066406, + "step": 2097, + "time_per_iteration": 2.826936960220337 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049393, + "balance_loss_mlp": 1.00635874, + "epoch": 0.403616775682955, + "flos": 508460874240.0, + "grad_norm": 0.03200015951198879, + "language_loss": 0.79479361, + "learning_rate": 0.0006764955110385986, + "loss": 0.80528748, + "num_input_tokens_seen": 174680976, + "router_z_loss_mlp": 0.4309082, + "step": 2098, + "time_per_iteration": 2.732429027557373 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105173, + "balance_loss_mlp": 1.0086236, + "epoch": 0.40380915736821854, + "flos": 520411507968.0, + "grad_norm": 0.033549102084289066, + "language_loss": 0.81161886, + "learning_rate": 0.0006762039894905083, + "loss": 0.82213616, + "num_input_tokens_seen": 174753152, + "router_z_loss_mlp": 0.43164062, + "step": 2099, + "time_per_iteration": 2.638117790222168 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104845, + "balance_loss_mlp": 1.00524902, + "epoch": 0.40400153905348213, + "flos": 442887918336.0, + "grad_norm": 0.03592642868139018, + "language_loss": 0.80970824, + "learning_rate": 0.000675912399533962, + "loss": 0.82019281, + "num_input_tokens_seen": 174817184, + "router_z_loss_mlp": 0.43261719, + "step": 2100, + "time_per_iteration": 2.58172345161438 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049035, + "balance_loss_mlp": 1.00585735, + "epoch": 0.40419392073874566, + "flos": 773705843712.0, + "grad_norm": 0.032245854328407444, + "language_loss": 0.85358262, + "learning_rate": 0.0006756207412821656, + "loss": 0.86407304, + "num_input_tokens_seen": 174898128, + "router_z_loss_mlp": 0.43237305, + "step": 2101, + "time_per_iteration": 3.0158467292785645 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053218, + "balance_loss_mlp": 1.01006424, + "epoch": 0.40438630242400925, + "flos": 767990068992.0, + "grad_norm": 0.03424537155124627, + "language_loss": 0.81043333, + "learning_rate": 0.0006753290148483505, + "loss": 0.82096547, + "num_input_tokens_seen": 174981872, + "router_z_loss_mlp": 0.43212891, + "step": 2102, + "time_per_iteration": 3.0169148445129395 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050406, + "balance_loss_mlp": 1.0073241, + "epoch": 0.4045786841092728, + "flos": 416129736192.0, + "grad_norm": 0.032341452227877814, + "language_loss": 0.79544723, + "learning_rate": 0.0006750372203457752, + "loss": 0.80595136, + "num_input_tokens_seen": 175044976, + "router_z_loss_mlp": 0.43139648, + "step": 2103, + "time_per_iteration": 2.459439277648926 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104576, + "balance_loss_mlp": 1.00274944, + "epoch": 0.40477106579453637, + "flos": 540309788928.0, + "grad_norm": 0.028365330829485943, + "language_loss": 0.87031502, + "learning_rate": 0.0006747453578877242, + "loss": 0.88077265, + "num_input_tokens_seen": 175121104, + "router_z_loss_mlp": 0.43066406, + "step": 2104, + "time_per_iteration": 2.704583168029785 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047141, + "balance_loss_mlp": 1.00413048, + "epoch": 0.4049634474797999, + "flos": 828092213760.0, + "grad_norm": 0.03564801319951872, + "language_loss": 0.83885705, + "learning_rate": 0.0006744534275875085, + "loss": 0.84932852, + "num_input_tokens_seen": 175194512, + "router_z_loss_mlp": 0.43066406, + "step": 2105, + "time_per_iteration": 3.070952892303467 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049443, + "balance_loss_mlp": 1.00631273, + "epoch": 0.4051558291650635, + "flos": 573753873408.0, + "grad_norm": 0.03321600555114549, + "language_loss": 0.86069483, + "learning_rate": 0.0006741614295584657, + "loss": 0.87118924, + "num_input_tokens_seen": 175264176, + "router_z_loss_mlp": 0.43188477, + "step": 2106, + "time_per_iteration": 2.677860736846924 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051059, + "balance_loss_mlp": 1.00802493, + "epoch": 0.4053482108503271, + "flos": 733245355776.0, + "grad_norm": 0.034313991245887424, + "language_loss": 0.78860825, + "learning_rate": 0.0006738693639139595, + "loss": 0.79911888, + "num_input_tokens_seen": 175347488, + "router_z_loss_mlp": 0.4309082, + "step": 2107, + "time_per_iteration": 3.021329402923584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104746, + "balance_loss_mlp": 1.0043304, + "epoch": 0.4055405925355906, + "flos": 1214950971648.0, + "grad_norm": 0.03202932182515954, + "language_loss": 0.77947468, + "learning_rate": 0.0006735772307673796, + "loss": 0.7899493, + "num_input_tokens_seen": 175438336, + "router_z_loss_mlp": 0.43188477, + "step": 2108, + "time_per_iteration": 3.524618148803711 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104775, + "balance_loss_mlp": 1.00476336, + "epoch": 0.4057329742208542, + "flos": 717108988416.0, + "grad_norm": 0.03284224075250963, + "language_loss": 0.84037805, + "learning_rate": 0.0006732850302321421, + "loss": 0.85085559, + "num_input_tokens_seen": 175510912, + "router_z_loss_mlp": 0.43041992, + "step": 2109, + "time_per_iteration": 2.9528980255126953 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047125, + "balance_loss_mlp": 1.00423336, + "epoch": 0.4059253559061177, + "flos": 565953980160.0, + "grad_norm": 0.033245578967332844, + "language_loss": 0.85031784, + "learning_rate": 0.00067299276242169, + "loss": 0.86078906, + "num_input_tokens_seen": 175583040, + "router_z_loss_mlp": 0.42944336, + "step": 2110, + "time_per_iteration": 2.715207815170288 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046326, + "balance_loss_mlp": 1.00493622, + "epoch": 0.4061177375913813, + "flos": 1597189459200.0, + "grad_norm": 0.00881896921345328, + "language_loss": 0.74382168, + "learning_rate": 0.0006727004274494908, + "loss": 0.75428492, + "num_input_tokens_seen": 175817952, + "router_z_loss_mlp": 0.4140625, + "step": 2111, + "time_per_iteration": 4.921623468399048 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045683, + "balance_loss_mlp": 1.00276768, + "epoch": 0.40631011927664484, + "flos": 616622178048.0, + "grad_norm": 0.03872377126422628, + "language_loss": 0.78301811, + "learning_rate": 0.0006724080254290395, + "loss": 0.79347491, + "num_input_tokens_seen": 175896352, + "router_z_loss_mlp": 0.4296875, + "step": 2112, + "time_per_iteration": 2.7997756004333496 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104896, + "balance_loss_mlp": 1.00606835, + "epoch": 0.40650250096190843, + "flos": 558748993536.0, + "grad_norm": 0.03550284292845091, + "language_loss": 0.90693575, + "learning_rate": 0.0006721155564738566, + "loss": 0.91742539, + "num_input_tokens_seen": 175967152, + "router_z_loss_mlp": 0.42944336, + "step": 2113, + "time_per_iteration": 2.6585686206817627 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01041939, + "balance_loss_mlp": 1.00054932, + "epoch": 0.40669488264717196, + "flos": 1583545479168.0, + "grad_norm": 0.009767435928617773, + "language_loss": 0.78622639, + "learning_rate": 0.0006718230206974884, + "loss": 0.79664576, + "num_input_tokens_seen": 176205248, + "router_z_loss_mlp": 0.4140625, + "step": 2114, + "time_per_iteration": 4.948775053024292 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047453, + "balance_loss_mlp": 1.00460887, + "epoch": 0.40688726433243555, + "flos": 508656260352.0, + "grad_norm": 0.031160170727070474, + "language_loss": 0.86169994, + "learning_rate": 0.0006715304182135078, + "loss": 0.8721745, + "num_input_tokens_seen": 176276208, + "router_z_loss_mlp": 0.42895508, + "step": 2115, + "time_per_iteration": 2.6279850006103516 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047221, + "balance_loss_mlp": 1.00449646, + "epoch": 0.40707964601769914, + "flos": 590352944640.0, + "grad_norm": 0.04782787246513916, + "language_loss": 0.89337373, + "learning_rate": 0.0006712377491355127, + "loss": 0.90384591, + "num_input_tokens_seen": 176355072, + "router_z_loss_mlp": 0.42773438, + "step": 2116, + "time_per_iteration": 2.863960027694702 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047149, + "balance_loss_mlp": 1.00449598, + "epoch": 0.40727202770296267, + "flos": 581651943168.0, + "grad_norm": 0.026696862883813798, + "language_loss": 0.81451207, + "learning_rate": 0.0006709450135771274, + "loss": 0.8249836, + "num_input_tokens_seen": 176444592, + "router_z_loss_mlp": 0.42700195, + "step": 2117, + "time_per_iteration": 2.94854998588562 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104874, + "balance_loss_mlp": 1.00589585, + "epoch": 0.40746440938822626, + "flos": 505109174784.0, + "grad_norm": 0.029498043522937258, + "language_loss": 0.87031925, + "learning_rate": 0.0006706522116520023, + "loss": 0.88080668, + "num_input_tokens_seen": 176516144, + "router_z_loss_mlp": 0.42895508, + "step": 2118, + "time_per_iteration": 2.6655611991882324 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051527, + "balance_loss_mlp": 1.00880289, + "epoch": 0.4076567910734898, + "flos": 606711921408.0, + "grad_norm": 0.03542644850365937, + "language_loss": 0.83226359, + "learning_rate": 0.0006703593434738127, + "loss": 0.84277886, + "num_input_tokens_seen": 176585712, + "router_z_loss_mlp": 0.42773438, + "step": 2119, + "time_per_iteration": 2.7478883266448975 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049424, + "balance_loss_mlp": 1.00662768, + "epoch": 0.4078491727587534, + "flos": 480519681792.0, + "grad_norm": 0.032767120193604775, + "language_loss": 0.788118, + "learning_rate": 0.0006700664091562604, + "loss": 0.79861224, + "num_input_tokens_seen": 176654736, + "router_z_loss_mlp": 0.4284668, + "step": 2120, + "time_per_iteration": 2.532407760620117 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054491, + "balance_loss_mlp": 1.01167095, + "epoch": 0.4080415544440169, + "flos": 511419856128.0, + "grad_norm": 0.031947051498113735, + "language_loss": 0.85428649, + "learning_rate": 0.0006697734088130725, + "loss": 0.86483139, + "num_input_tokens_seen": 176722800, + "router_z_loss_mlp": 0.42871094, + "step": 2121, + "time_per_iteration": 2.6053290367126465 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051862, + "balance_loss_mlp": 1.00899482, + "epoch": 0.4082339361292805, + "flos": 735928271616.0, + "grad_norm": 0.0331707162631359, + "language_loss": 0.86154819, + "learning_rate": 0.0006694803425580018, + "loss": 0.87206686, + "num_input_tokens_seen": 176800320, + "router_z_loss_mlp": 0.42919922, + "step": 2122, + "time_per_iteration": 2.995340585708618 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051457, + "balance_loss_mlp": 1.00863671, + "epoch": 0.4084263178145441, + "flos": 458405079552.0, + "grad_norm": 0.03582566166827548, + "language_loss": 0.85069245, + "learning_rate": 0.0006691872105048268, + "loss": 0.86120701, + "num_input_tokens_seen": 176867440, + "router_z_loss_mlp": 0.42871094, + "step": 2123, + "time_per_iteration": 2.6434147357940674 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049594, + "balance_loss_mlp": 1.00655949, + "epoch": 0.4086186994998076, + "flos": 564026363904.0, + "grad_norm": 0.030981369506813725, + "language_loss": 0.84940457, + "learning_rate": 0.0006688940127673513, + "loss": 0.85990047, + "num_input_tokens_seen": 176942048, + "router_z_loss_mlp": 0.4309082, + "step": 2124, + "time_per_iteration": 2.677267074584961 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051992, + "balance_loss_mlp": 1.00914872, + "epoch": 0.4088110811850712, + "flos": 574894109184.0, + "grad_norm": 0.03166953679677798, + "language_loss": 0.86061293, + "learning_rate": 0.0006686007494594049, + "loss": 0.87113285, + "num_input_tokens_seen": 177025104, + "router_z_loss_mlp": 0.42895508, + "step": 2125, + "time_per_iteration": 2.806321620941162 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051345, + "balance_loss_mlp": 1.00845325, + "epoch": 0.40900346287033473, + "flos": 457847111424.0, + "grad_norm": 0.04138148105998068, + "language_loss": 0.81154513, + "learning_rate": 0.0006683074206948425, + "loss": 0.82205856, + "num_input_tokens_seen": 177089296, + "router_z_loss_mlp": 0.42944336, + "step": 2126, + "time_per_iteration": 2.5422966480255127 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051657, + "balance_loss_mlp": 1.00878966, + "epoch": 0.4091958445555983, + "flos": 618595481088.0, + "grad_norm": 0.03139043933990307, + "language_loss": 0.81871778, + "learning_rate": 0.0006680140265875443, + "loss": 0.82923436, + "num_input_tokens_seen": 177163648, + "router_z_loss_mlp": 0.42919922, + "step": 2127, + "time_per_iteration": 2.8402438163757324 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047483, + "balance_loss_mlp": 1.0048064, + "epoch": 0.40938822624086185, + "flos": 473371075584.0, + "grad_norm": 0.031125843736347292, + "language_loss": 0.96506268, + "learning_rate": 0.0006677205672514162, + "loss": 0.97553754, + "num_input_tokens_seen": 177233856, + "router_z_loss_mlp": 0.42724609, + "step": 2128, + "time_per_iteration": 2.6291539669036865 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047334, + "balance_loss_mlp": 1.00460982, + "epoch": 0.40958060792612544, + "flos": 571118589696.0, + "grad_norm": 0.02838685720934929, + "language_loss": 0.89474666, + "learning_rate": 0.000667427042800389, + "loss": 0.90522003, + "num_input_tokens_seen": 177309824, + "router_z_loss_mlp": 0.42773438, + "step": 2129, + "time_per_iteration": 2.749999761581421 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047174, + "balance_loss_mlp": 1.00435364, + "epoch": 0.40977298961138897, + "flos": 610471889664.0, + "grad_norm": 0.033304274322438925, + "language_loss": 0.8343153, + "learning_rate": 0.0006671334533484192, + "loss": 0.84478706, + "num_input_tokens_seen": 177380592, + "router_z_loss_mlp": 0.42871094, + "step": 2130, + "time_per_iteration": 2.778238296508789 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049285, + "balance_loss_mlp": 1.00636995, + "epoch": 0.40996537129665256, + "flos": 582873837312.0, + "grad_norm": 0.027360354791446346, + "language_loss": 0.83860981, + "learning_rate": 0.0006668397990094881, + "loss": 0.84910274, + "num_input_tokens_seen": 177454720, + "router_z_loss_mlp": 0.4296875, + "step": 2131, + "time_per_iteration": 2.711257219314575 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104432, + "balance_loss_mlp": 1.00145221, + "epoch": 0.41015775298191615, + "flos": 517554593280.0, + "grad_norm": 0.031461982022778785, + "language_loss": 0.85118818, + "learning_rate": 0.0006665460798976027, + "loss": 0.86163139, + "num_input_tokens_seen": 177528224, + "router_z_loss_mlp": 0.42919922, + "step": 2132, + "time_per_iteration": 2.7143847942352295 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046552, + "balance_loss_mlp": 1.00370777, + "epoch": 0.4103501346671797, + "flos": 511446100992.0, + "grad_norm": 0.02874706903740214, + "language_loss": 0.82064044, + "learning_rate": 0.0006662522961267947, + "loss": 0.83110595, + "num_input_tokens_seen": 177598176, + "router_z_loss_mlp": 0.42895508, + "step": 2133, + "time_per_iteration": 2.683544635772705 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104504, + "balance_loss_mlp": 1.00212467, + "epoch": 0.41054251635244327, + "flos": 550927713024.0, + "grad_norm": 0.027003210560574007, + "language_loss": 0.87900901, + "learning_rate": 0.0006659584478111211, + "loss": 0.88945937, + "num_input_tokens_seen": 177675840, + "router_z_loss_mlp": 0.4296875, + "step": 2134, + "time_per_iteration": 2.781217336654663 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104529, + "balance_loss_mlp": 1.00254142, + "epoch": 0.4107348980377068, + "flos": 841299734784.0, + "grad_norm": 0.03651700728131785, + "language_loss": 0.83066756, + "learning_rate": 0.000665664535064664, + "loss": 0.84112048, + "num_input_tokens_seen": 177751376, + "router_z_loss_mlp": 0.42797852, + "step": 2135, + "time_per_iteration": 3.067751169204712 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104661, + "balance_loss_mlp": 1.00390935, + "epoch": 0.4109272797229704, + "flos": 504764089344.0, + "grad_norm": 0.03160666135819327, + "language_loss": 0.83225, + "learning_rate": 0.0006653705580015303, + "loss": 0.84271616, + "num_input_tokens_seen": 177825264, + "router_z_loss_mlp": 0.42749023, + "step": 2136, + "time_per_iteration": 2.6899030208587646 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048088, + "balance_loss_mlp": 1.00521994, + "epoch": 0.4111196614082339, + "flos": 612024284928.0, + "grad_norm": 0.02957451828286975, + "language_loss": 0.87109792, + "learning_rate": 0.0006650765167358523, + "loss": 0.8815788, + "num_input_tokens_seen": 177901680, + "router_z_loss_mlp": 0.42919922, + "step": 2137, + "time_per_iteration": 2.8179140090942383 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048701, + "balance_loss_mlp": 1.00590456, + "epoch": 0.4113120430934975, + "flos": 454104639744.0, + "grad_norm": 0.033800673848535426, + "language_loss": 0.91012341, + "learning_rate": 0.0006647824113817864, + "loss": 0.92061043, + "num_input_tokens_seen": 177965264, + "router_z_loss_mlp": 0.4284668, + "step": 2138, + "time_per_iteration": 2.5263419151306152 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049295, + "balance_loss_mlp": 1.00635624, + "epoch": 0.41150442477876104, + "flos": 542710802688.0, + "grad_norm": 0.028316546184043286, + "language_loss": 0.818874, + "learning_rate": 0.000664488242053515, + "loss": 0.82936704, + "num_input_tokens_seen": 178039712, + "router_z_loss_mlp": 0.42993164, + "step": 2139, + "time_per_iteration": 2.770169258117676 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046579, + "balance_loss_mlp": 1.0037353, + "epoch": 0.4116968064640246, + "flos": 577392332544.0, + "grad_norm": 0.027329597632332964, + "language_loss": 0.84529692, + "learning_rate": 0.0006641940088652445, + "loss": 0.8557626, + "num_input_tokens_seen": 178114080, + "router_z_loss_mlp": 0.42895508, + "step": 2140, + "time_per_iteration": 2.761660575866699 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046986, + "balance_loss_mlp": 1.00416613, + "epoch": 0.4118891881492882, + "flos": 497150833920.0, + "grad_norm": 0.03165424709394261, + "language_loss": 0.82833397, + "learning_rate": 0.0006638997119312065, + "loss": 0.83880383, + "num_input_tokens_seen": 178188032, + "router_z_loss_mlp": 0.42871094, + "step": 2141, + "time_per_iteration": 2.6978652477264404 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071571, + "balance_loss_mlp": 1.02980042, + "epoch": 0.41208156983455174, + "flos": 1541573425152.0, + "grad_norm": 0.013007961614308571, + "language_loss": 0.75063306, + "learning_rate": 0.0006636053513656568, + "loss": 0.76134878, + "num_input_tokens_seen": 178395328, + "router_z_loss_mlp": 0.41796875, + "step": 2142, + "time_per_iteration": 4.915013551712036 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048885, + "balance_loss_mlp": 1.00611305, + "epoch": 0.41227395151981533, + "flos": 586058340864.0, + "grad_norm": 0.033991757131589403, + "language_loss": 0.85150123, + "learning_rate": 0.000663310927282877, + "loss": 0.86199009, + "num_input_tokens_seen": 178471952, + "router_z_loss_mlp": 0.42822266, + "step": 2143, + "time_per_iteration": 2.7552297115325928 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049148, + "balance_loss_mlp": 1.00635242, + "epoch": 0.41246633320507886, + "flos": 443893039104.0, + "grad_norm": 0.031026250164357557, + "language_loss": 0.8627826, + "learning_rate": 0.000663016439797172, + "loss": 0.87327409, + "num_input_tokens_seen": 178542192, + "router_z_loss_mlp": 0.4284668, + "step": 2144, + "time_per_iteration": 2.627795934677124 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048727, + "balance_loss_mlp": 1.00593042, + "epoch": 0.41265871489034245, + "flos": 581095920384.0, + "grad_norm": 0.032902127624834396, + "language_loss": 0.81700695, + "learning_rate": 0.0006627218890228724, + "loss": 0.82749426, + "num_input_tokens_seen": 178622736, + "router_z_loss_mlp": 0.4284668, + "step": 2145, + "time_per_iteration": 2.7726335525512695 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051442, + "balance_loss_mlp": 1.00852692, + "epoch": 0.412851096575606, + "flos": 762529951488.0, + "grad_norm": 0.03700396426728773, + "language_loss": 0.8427214, + "learning_rate": 0.0006624272750743326, + "loss": 0.85323578, + "num_input_tokens_seen": 178705808, + "router_z_loss_mlp": 0.4296875, + "step": 2146, + "time_per_iteration": 3.047786235809326 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051555, + "balance_loss_mlp": 1.00854468, + "epoch": 0.41304347826086957, + "flos": 556521978624.0, + "grad_norm": 0.0279029176228374, + "language_loss": 0.83148611, + "learning_rate": 0.0006621325980659322, + "loss": 0.84200168, + "num_input_tokens_seen": 178781200, + "router_z_loss_mlp": 0.43066406, + "step": 2147, + "time_per_iteration": 2.7805261611938477 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105086, + "balance_loss_mlp": 1.00796807, + "epoch": 0.41323585994613315, + "flos": 666894746112.0, + "grad_norm": 0.03289726182172815, + "language_loss": 0.82395911, + "learning_rate": 0.000661837858112075, + "loss": 0.83446777, + "num_input_tokens_seen": 178855072, + "router_z_loss_mlp": 0.42944336, + "step": 2148, + "time_per_iteration": 2.8236329555511475 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044478, + "balance_loss_mlp": 1.00153887, + "epoch": 0.4134282416313967, + "flos": 549785531904.0, + "grad_norm": 0.03194652549549522, + "language_loss": 0.89158356, + "learning_rate": 0.0006615430553271888, + "loss": 0.90202832, + "num_input_tokens_seen": 178927936, + "router_z_loss_mlp": 0.42993164, + "step": 2149, + "time_per_iteration": 2.7931926250457764 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043808, + "balance_loss_mlp": 1.00101149, + "epoch": 0.4136206233166603, + "flos": 647513604096.0, + "grad_norm": 0.02946183128139913, + "language_loss": 0.8604427, + "learning_rate": 0.0006612481898257264, + "loss": 0.87088078, + "num_input_tokens_seen": 179007792, + "router_z_loss_mlp": 0.4284668, + "step": 2150, + "time_per_iteration": 2.853116512298584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045658, + "balance_loss_mlp": 1.00279057, + "epoch": 0.4138130050019238, + "flos": 518364327936.0, + "grad_norm": 0.034556300996824205, + "language_loss": 0.85756087, + "learning_rate": 0.000660953261722165, + "loss": 0.86801755, + "num_input_tokens_seen": 179075200, + "router_z_loss_mlp": 0.42919922, + "step": 2151, + "time_per_iteration": 2.5899548530578613 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048523, + "balance_loss_mlp": 1.00575054, + "epoch": 0.4140053866871874, + "flos": 610369822464.0, + "grad_norm": 0.032804683798420206, + "language_loss": 0.83155799, + "learning_rate": 0.0006606582711310055, + "loss": 0.84204322, + "num_input_tokens_seen": 179144448, + "router_z_loss_mlp": 0.42822266, + "step": 2152, + "time_per_iteration": 2.7591912746429443 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045382, + "balance_loss_mlp": 1.00258613, + "epoch": 0.4141977683724509, + "flos": 580846099200.0, + "grad_norm": 0.031179869336458114, + "language_loss": 0.84146237, + "learning_rate": 0.0006603632181667736, + "loss": 0.85191619, + "num_input_tokens_seen": 179215776, + "router_z_loss_mlp": 0.4284668, + "step": 2153, + "time_per_iteration": 2.661051034927368 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045906, + "balance_loss_mlp": 1.00470734, + "epoch": 0.4143901500577145, + "flos": 1310178863616.0, + "grad_norm": 0.005957353398288201, + "language_loss": 0.78943324, + "learning_rate": 0.0006600681029440187, + "loss": 0.79989231, + "num_input_tokens_seen": 179436688, + "router_z_loss_mlp": 0.41210938, + "step": 2154, + "time_per_iteration": 4.908870458602905 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046845, + "balance_loss_mlp": 1.00416827, + "epoch": 0.41458253174297804, + "flos": 461122988544.0, + "grad_norm": 0.03503771604154275, + "language_loss": 0.82412434, + "learning_rate": 0.0006597729255773153, + "loss": 0.83459282, + "num_input_tokens_seen": 179503264, + "router_z_loss_mlp": 0.42724609, + "step": 2155, + "time_per_iteration": 2.51566481590271 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048208, + "balance_loss_mlp": 1.00531614, + "epoch": 0.41477491342824163, + "flos": 554439805440.0, + "grad_norm": 0.033219020360443, + "language_loss": 0.82733047, + "learning_rate": 0.0006594776861812608, + "loss": 0.83781254, + "num_input_tokens_seen": 179574864, + "router_z_loss_mlp": 0.42944336, + "step": 2156, + "time_per_iteration": 2.7139203548431396 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047832, + "balance_loss_mlp": 1.00501156, + "epoch": 0.4149672951135052, + "flos": 699086800896.0, + "grad_norm": 0.029687792529517126, + "language_loss": 0.87240821, + "learning_rate": 0.0006591823848704776, + "loss": 0.88288647, + "num_input_tokens_seen": 179658208, + "router_z_loss_mlp": 0.42871094, + "step": 2157, + "time_per_iteration": 2.950136661529541 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104915, + "balance_loss_mlp": 1.00647271, + "epoch": 0.41515967679876875, + "flos": 566837591808.0, + "grad_norm": 0.02753963183350331, + "language_loss": 0.82045114, + "learning_rate": 0.0006588870217596117, + "loss": 0.83094263, + "num_input_tokens_seen": 179732320, + "router_z_loss_mlp": 0.42724609, + "step": 2158, + "time_per_iteration": 2.742954730987549 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047178, + "balance_loss_mlp": 1.00440574, + "epoch": 0.41535205848403234, + "flos": 502178383104.0, + "grad_norm": 0.03782519840746282, + "language_loss": 0.86309534, + "learning_rate": 0.0006585915969633334, + "loss": 0.8735671, + "num_input_tokens_seen": 179801616, + "router_z_loss_mlp": 0.42822266, + "step": 2159, + "time_per_iteration": 2.6314492225646973 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048815, + "balance_loss_mlp": 1.00599504, + "epoch": 0.41554444016929587, + "flos": 608702721024.0, + "grad_norm": 0.03160589415450587, + "language_loss": 0.8965854, + "learning_rate": 0.0006582961105963366, + "loss": 0.90707356, + "num_input_tokens_seen": 179876112, + "router_z_loss_mlp": 0.42871094, + "step": 2160, + "time_per_iteration": 2.779524564743042 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052466, + "balance_loss_mlp": 1.0094316, + "epoch": 0.41573682185455946, + "flos": 530156514048.0, + "grad_norm": 0.0316987683946157, + "language_loss": 0.78011453, + "learning_rate": 0.0006580005627733395, + "loss": 0.79063922, + "num_input_tokens_seen": 179949936, + "router_z_loss_mlp": 0.4309082, + "step": 2161, + "time_per_iteration": 2.655961275100708 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053795, + "balance_loss_mlp": 1.01095116, + "epoch": 0.415929203539823, + "flos": 506038473216.0, + "grad_norm": 0.030200496407476712, + "language_loss": 0.82344484, + "learning_rate": 0.0006577049536090838, + "loss": 0.83398283, + "num_input_tokens_seen": 180023184, + "router_z_loss_mlp": 0.42895508, + "step": 2162, + "time_per_iteration": 2.734727144241333 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048283, + "balance_loss_mlp": 1.00536776, + "epoch": 0.4161215852250866, + "flos": 583824523008.0, + "grad_norm": 0.03528478058898885, + "language_loss": 0.86106777, + "learning_rate": 0.000657409283218335, + "loss": 0.87155068, + "num_input_tokens_seen": 180091728, + "router_z_loss_mlp": 0.4296875, + "step": 2163, + "time_per_iteration": 2.659733533859253 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051194, + "balance_loss_mlp": 1.00844538, + "epoch": 0.4163139669103501, + "flos": 491760702720.0, + "grad_norm": 0.03176725688202085, + "language_loss": 0.81183624, + "learning_rate": 0.0006571135517158829, + "loss": 0.82234824, + "num_input_tokens_seen": 180162096, + "router_z_loss_mlp": 0.42797852, + "step": 2164, + "time_per_iteration": 2.639364004135132 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104361, + "balance_loss_mlp": 1.00241089, + "epoch": 0.4165063485956137, + "flos": 1291023243264.0, + "grad_norm": 0.009317160244550511, + "language_loss": 0.76764059, + "learning_rate": 0.0006568177592165404, + "loss": 0.77807671, + "num_input_tokens_seen": 180380912, + "router_z_loss_mlp": 0.41210938, + "step": 2165, + "time_per_iteration": 4.755609750747681 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048778, + "balance_loss_mlp": 1.00600576, + "epoch": 0.4166987302808773, + "flos": 496258473984.0, + "grad_norm": 0.03907979296448248, + "language_loss": 0.83556676, + "learning_rate": 0.0006565219058351444, + "loss": 0.84605455, + "num_input_tokens_seen": 180447424, + "router_z_loss_mlp": 0.42822266, + "step": 2166, + "time_per_iteration": 2.549835443496704 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043838, + "balance_loss_mlp": 1.00087476, + "epoch": 0.4168911119661408, + "flos": 465067649280.0, + "grad_norm": 0.0316582334519174, + "language_loss": 0.83126116, + "learning_rate": 0.0006562259916865553, + "loss": 0.8416996, + "num_input_tokens_seen": 180516336, + "router_z_loss_mlp": 0.43017578, + "step": 2167, + "time_per_iteration": 2.577807664871216 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045446, + "balance_loss_mlp": 1.00253069, + "epoch": 0.4170834936514044, + "flos": 537943768320.0, + "grad_norm": 0.03263228805326442, + "language_loss": 0.79910517, + "learning_rate": 0.0006559300168856573, + "loss": 0.8095597, + "num_input_tokens_seen": 180589824, + "router_z_loss_mlp": 0.4296875, + "step": 2168, + "time_per_iteration": 2.716322898864746 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051109, + "balance_loss_mlp": 1.00819373, + "epoch": 0.41727587533666793, + "flos": 551750086656.0, + "grad_norm": 0.029704951266317694, + "language_loss": 0.86753178, + "learning_rate": 0.0006556339815473577, + "loss": 0.87804294, + "num_input_tokens_seen": 180661296, + "router_z_loss_mlp": 0.4296875, + "step": 2169, + "time_per_iteration": 2.627387762069702 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044935, + "balance_loss_mlp": 1.00204313, + "epoch": 0.4174682570219315, + "flos": 632378466816.0, + "grad_norm": 0.03018462927838879, + "language_loss": 0.86615288, + "learning_rate": 0.000655337885786588, + "loss": 0.87660229, + "num_input_tokens_seen": 180744896, + "router_z_loss_mlp": 0.42944336, + "step": 2170, + "time_per_iteration": 2.8836913108825684 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045073, + "balance_loss_mlp": 1.00211012, + "epoch": 0.41766063870719505, + "flos": 520756593408.0, + "grad_norm": 0.03274558076895909, + "language_loss": 0.85911119, + "learning_rate": 0.0006550417297183025, + "loss": 0.86956197, + "num_input_tokens_seen": 180813008, + "router_z_loss_mlp": 0.43017578, + "step": 2171, + "time_per_iteration": 2.6085855960845947 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054021, + "balance_loss_mlp": 1.0111295, + "epoch": 0.41785302039245864, + "flos": 559055195136.0, + "grad_norm": 0.03215226267597247, + "language_loss": 0.82142568, + "learning_rate": 0.0006547455134574793, + "loss": 0.83196592, + "num_input_tokens_seen": 180886480, + "router_z_loss_mlp": 0.42944336, + "step": 2172, + "time_per_iteration": 2.7207438945770264 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048636, + "balance_loss_mlp": 1.0057919, + "epoch": 0.41804540207772223, + "flos": 790028848896.0, + "grad_norm": 0.03152263917705172, + "language_loss": 0.84573895, + "learning_rate": 0.0006544492371191198, + "loss": 0.85622525, + "num_input_tokens_seen": 180973776, + "router_z_loss_mlp": 0.42895508, + "step": 2173, + "time_per_iteration": 3.1091549396514893 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050974, + "balance_loss_mlp": 1.00791526, + "epoch": 0.41823778376298576, + "flos": 905891869440.0, + "grad_norm": 0.03158772894298815, + "language_loss": 0.83616948, + "learning_rate": 0.0006541529008182485, + "loss": 0.84667921, + "num_input_tokens_seen": 181062768, + "router_z_loss_mlp": 0.43115234, + "step": 2174, + "time_per_iteration": 3.1934547424316406 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050617, + "balance_loss_mlp": 1.0074867, + "epoch": 0.41843016544824935, + "flos": 512574676224.0, + "grad_norm": 0.036197783568866736, + "language_loss": 0.87799633, + "learning_rate": 0.0006538565046699136, + "loss": 0.88850248, + "num_input_tokens_seen": 181129872, + "router_z_loss_mlp": 0.43188477, + "step": 2175, + "time_per_iteration": 2.6156668663024902 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043414, + "balance_loss_mlp": 1.00047445, + "epoch": 0.4186225471335129, + "flos": 654290880000.0, + "grad_norm": 0.03486733903162065, + "language_loss": 0.81864989, + "learning_rate": 0.0006535600487891862, + "loss": 0.82908404, + "num_input_tokens_seen": 181208112, + "router_z_loss_mlp": 0.42993164, + "step": 2176, + "time_per_iteration": 2.7715044021606445 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050534, + "balance_loss_mlp": 1.00778568, + "epoch": 0.41881492881877647, + "flos": 570226229760.0, + "grad_norm": 0.03182850960977162, + "language_loss": 0.89874047, + "learning_rate": 0.0006532635332911603, + "loss": 0.90924585, + "num_input_tokens_seen": 181278736, + "router_z_loss_mlp": 0.42797852, + "step": 2177, + "time_per_iteration": 2.714635133743286 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046295, + "balance_loss_mlp": 1.00352275, + "epoch": 0.41900731050404, + "flos": 913485682944.0, + "grad_norm": 0.031061931256926825, + "language_loss": 0.81313407, + "learning_rate": 0.0006529669582909541, + "loss": 0.82359695, + "num_input_tokens_seen": 181362512, + "router_z_loss_mlp": 0.42822266, + "step": 2178, + "time_per_iteration": 3.2592601776123047 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052105, + "balance_loss_mlp": 1.00923753, + "epoch": 0.4191996921893036, + "flos": 536784090624.0, + "grad_norm": 0.03590517964257674, + "language_loss": 0.86468148, + "learning_rate": 0.0006526703239037077, + "loss": 0.87520254, + "num_input_tokens_seen": 181432080, + "router_z_loss_mlp": 0.42919922, + "step": 2179, + "time_per_iteration": 2.6636452674865723 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045539, + "balance_loss_mlp": 1.00259995, + "epoch": 0.4193920738745671, + "flos": 583731204096.0, + "grad_norm": 0.030716470700417473, + "language_loss": 0.86737585, + "learning_rate": 0.0006523736302445851, + "loss": 0.87783122, + "num_input_tokens_seen": 181507296, + "router_z_loss_mlp": 0.42993164, + "step": 2180, + "time_per_iteration": 2.801374673843384 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048274, + "balance_loss_mlp": 1.00535846, + "epoch": 0.4195844555598307, + "flos": 1337802205440.0, + "grad_norm": 0.03692120158624074, + "language_loss": 0.77735525, + "learning_rate": 0.0006520768774287728, + "loss": 0.78783798, + "num_input_tokens_seen": 181599408, + "router_z_loss_mlp": 0.4296875, + "step": 2181, + "time_per_iteration": 3.781163454055786 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048528, + "balance_loss_mlp": 1.00568438, + "epoch": 0.4197768372450943, + "flos": 599997828864.0, + "grad_norm": 0.02986751846873145, + "language_loss": 0.85868645, + "learning_rate": 0.0006517800655714806, + "loss": 0.86917174, + "num_input_tokens_seen": 181674944, + "router_z_loss_mlp": 0.42895508, + "step": 2182, + "time_per_iteration": 2.8340775966644287 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047266, + "balance_loss_mlp": 1.00454116, + "epoch": 0.4199692189303578, + "flos": 736597055232.0, + "grad_norm": 0.031915917751050384, + "language_loss": 0.8544265, + "learning_rate": 0.0006514831947879407, + "loss": 0.86489916, + "num_input_tokens_seen": 181756704, + "router_z_loss_mlp": 0.42773438, + "step": 2183, + "time_per_iteration": 2.943141460418701 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048279, + "balance_loss_mlp": 1.005602, + "epoch": 0.4201616006156214, + "flos": 751663173120.0, + "grad_norm": 0.03318909585917556, + "language_loss": 0.78676963, + "learning_rate": 0.0006511862651934091, + "loss": 0.79725242, + "num_input_tokens_seen": 181837952, + "router_z_loss_mlp": 0.42724609, + "step": 2184, + "time_per_iteration": 3.0779521465301514 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049999, + "balance_loss_mlp": 1.00713122, + "epoch": 0.42035398230088494, + "flos": 548092185600.0, + "grad_norm": 0.030200903128349884, + "language_loss": 0.82675183, + "learning_rate": 0.0006508892769031638, + "loss": 0.83725178, + "num_input_tokens_seen": 181906896, + "router_z_loss_mlp": 0.42919922, + "step": 2185, + "time_per_iteration": 2.6862621307373047 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052696, + "balance_loss_mlp": 1.0098995, + "epoch": 0.42054636398614853, + "flos": 618048206592.0, + "grad_norm": 0.035053166321698394, + "language_loss": 0.87309551, + "learning_rate": 0.000650592230032506, + "loss": 0.88362241, + "num_input_tokens_seen": 181974976, + "router_z_loss_mlp": 0.4284668, + "step": 2186, + "time_per_iteration": 2.7250919342041016 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051072, + "balance_loss_mlp": 1.00813246, + "epoch": 0.42073874567141206, + "flos": 641667571968.0, + "grad_norm": 0.033545410607481084, + "language_loss": 0.85750729, + "learning_rate": 0.0006502951246967595, + "loss": 0.86801797, + "num_input_tokens_seen": 182054704, + "router_z_loss_mlp": 0.42993164, + "step": 2187, + "time_per_iteration": 2.8897902965545654 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051911, + "balance_loss_mlp": 1.00911534, + "epoch": 0.42093112735667565, + "flos": 494823697152.0, + "grad_norm": 0.02963421973388752, + "language_loss": 0.87416923, + "learning_rate": 0.0006499979610112706, + "loss": 0.88468838, + "num_input_tokens_seen": 182129696, + "router_z_loss_mlp": 0.4284668, + "step": 2188, + "time_per_iteration": 2.690762519836426 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044871, + "balance_loss_mlp": 1.00219369, + "epoch": 0.4211235090419392, + "flos": 543437912064.0, + "grad_norm": 0.03405892185917734, + "language_loss": 0.84498167, + "learning_rate": 0.000649700739091409, + "loss": 0.85543036, + "num_input_tokens_seen": 182203792, + "router_z_loss_mlp": 0.42724609, + "step": 2189, + "time_per_iteration": 2.7150561809539795 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050289, + "balance_loss_mlp": 1.00918579, + "epoch": 0.42131589072720277, + "flos": 1535391055872.0, + "grad_norm": 0.006162303642849888, + "language_loss": 0.73836273, + "learning_rate": 0.0006494034590525657, + "loss": 0.7488656, + "num_input_tokens_seen": 182432080, + "router_z_loss_mlp": 0.41113281, + "step": 2190, + "time_per_iteration": 4.829074382781982 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047388, + "balance_loss_mlp": 1.00466371, + "epoch": 0.42150827241246636, + "flos": 567936031488.0, + "grad_norm": 0.029782751851152003, + "language_loss": 0.85824835, + "learning_rate": 0.0006491061210101557, + "loss": 0.8687222, + "num_input_tokens_seen": 182500256, + "router_z_loss_mlp": 0.42773438, + "step": 2191, + "time_per_iteration": 2.7018613815307617 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044672, + "balance_loss_mlp": 1.00197124, + "epoch": 0.4217006540977299, + "flos": 708842500608.0, + "grad_norm": 0.03166528206992478, + "language_loss": 0.84430063, + "learning_rate": 0.0006488087250796157, + "loss": 0.85474735, + "num_input_tokens_seen": 182582912, + "router_z_loss_mlp": 0.42749023, + "step": 2192, + "time_per_iteration": 2.907424211502075 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045117, + "balance_loss_mlp": 1.00236881, + "epoch": 0.4218930357829935, + "flos": 628562118144.0, + "grad_norm": 0.02920565844268777, + "language_loss": 0.82024074, + "learning_rate": 0.0006485112713764049, + "loss": 0.83069193, + "num_input_tokens_seen": 182670304, + "router_z_loss_mlp": 0.42797852, + "step": 2193, + "time_per_iteration": 2.9393887519836426 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047298, + "balance_loss_mlp": 1.00435925, + "epoch": 0.422085417468257, + "flos": 461290184448.0, + "grad_norm": 0.02925244938415649, + "language_loss": 0.84264457, + "learning_rate": 0.0006482137600160051, + "loss": 0.85311759, + "num_input_tokens_seen": 182735024, + "router_z_loss_mlp": 0.42993164, + "step": 2194, + "time_per_iteration": 2.549301862716675 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050742, + "balance_loss_mlp": 1.00780332, + "epoch": 0.4222777991535206, + "flos": 474981796608.0, + "grad_norm": 0.030629871462955913, + "language_loss": 0.85158336, + "learning_rate": 0.0006479161911139206, + "loss": 0.86209077, + "num_input_tokens_seen": 182805024, + "router_z_loss_mlp": 0.42993164, + "step": 2195, + "time_per_iteration": 2.6384336948394775 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105418, + "balance_loss_mlp": 1.01116967, + "epoch": 0.4224701808387841, + "flos": 471844925184.0, + "grad_norm": 0.03651823295441523, + "language_loss": 0.8580153, + "learning_rate": 0.0006476185647856778, + "loss": 0.8685571, + "num_input_tokens_seen": 182871360, + "router_z_loss_mlp": 0.43066406, + "step": 2196, + "time_per_iteration": 2.61171817779541 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050335, + "balance_loss_mlp": 1.00737166, + "epoch": 0.4226625625240477, + "flos": 678823992576.0, + "grad_norm": 0.03269819945270571, + "language_loss": 0.81914455, + "learning_rate": 0.0006473208811468255, + "loss": 0.8296479, + "num_input_tokens_seen": 182952912, + "router_z_loss_mlp": 0.43017578, + "step": 2197, + "time_per_iteration": 2.892245292663574 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049097, + "balance_loss_mlp": 1.00611031, + "epoch": 0.4228549442093113, + "flos": 504559954944.0, + "grad_norm": 0.030930986611316814, + "language_loss": 0.84766257, + "learning_rate": 0.0006470231403129347, + "loss": 0.85815352, + "num_input_tokens_seen": 183022016, + "router_z_loss_mlp": 0.43041992, + "step": 2198, + "time_per_iteration": 2.64943265914917 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104433, + "balance_loss_mlp": 1.00119996, + "epoch": 0.42304732589457483, + "flos": 613075092480.0, + "grad_norm": 0.027263393707605364, + "language_loss": 0.81978631, + "learning_rate": 0.0006467253423995988, + "loss": 0.83022958, + "num_input_tokens_seen": 183101776, + "router_z_loss_mlp": 0.43188477, + "step": 2199, + "time_per_iteration": 2.8850364685058594 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048401, + "balance_loss_mlp": 1.00527155, + "epoch": 0.4232397075798384, + "flos": 516649594368.0, + "grad_norm": 0.03785502815659436, + "language_loss": 0.79452145, + "learning_rate": 0.000646427487522433, + "loss": 0.80500549, + "num_input_tokens_seen": 183171392, + "router_z_loss_mlp": 0.43188477, + "step": 2200, + "time_per_iteration": 2.694916009902954 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050341, + "balance_loss_mlp": 1.00713968, + "epoch": 0.42343208926510195, + "flos": 590934245376.0, + "grad_norm": 0.030735047123199966, + "language_loss": 0.83900952, + "learning_rate": 0.0006461295757970749, + "loss": 0.84951293, + "num_input_tokens_seen": 183253936, + "router_z_loss_mlp": 0.43261719, + "step": 2201, + "time_per_iteration": 2.835726737976074 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046215, + "balance_loss_mlp": 1.00320446, + "epoch": 0.42362447095036554, + "flos": 641819216640.0, + "grad_norm": 0.03465447846020762, + "language_loss": 0.82287079, + "learning_rate": 0.0006458316073391839, + "loss": 0.83333296, + "num_input_tokens_seen": 183333744, + "router_z_loss_mlp": 0.43066406, + "step": 2202, + "time_per_iteration": 2.8503153324127197 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045543, + "balance_loss_mlp": 1.00241327, + "epoch": 0.42381685263562907, + "flos": 513718802688.0, + "grad_norm": 0.030503622319833546, + "language_loss": 0.88278598, + "learning_rate": 0.0006455335822644422, + "loss": 0.89324141, + "num_input_tokens_seen": 183401904, + "router_z_loss_mlp": 0.43188477, + "step": 2203, + "time_per_iteration": 2.6294915676116943 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050025, + "balance_loss_mlp": 1.00689554, + "epoch": 0.42400923432089266, + "flos": 547822922496.0, + "grad_norm": 0.03601428124518316, + "language_loss": 0.78504658, + "learning_rate": 0.0006452355006885527, + "loss": 0.79554689, + "num_input_tokens_seen": 183471312, + "router_z_loss_mlp": 0.43188477, + "step": 2204, + "time_per_iteration": 2.7194669246673584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050575, + "balance_loss_mlp": 1.00756454, + "epoch": 0.4242016160061562, + "flos": 623288638464.0, + "grad_norm": 0.038292152226624715, + "language_loss": 0.88211453, + "learning_rate": 0.0006449373627272412, + "loss": 0.89262021, + "num_input_tokens_seen": 183539184, + "router_z_loss_mlp": 0.43066406, + "step": 2205, + "time_per_iteration": 2.760643243789673 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048111, + "balance_loss_mlp": 1.00495708, + "epoch": 0.4243939976914198, + "flos": 572972328960.0, + "grad_norm": 0.03657249930928273, + "language_loss": 0.83085704, + "learning_rate": 0.0006446391684962553, + "loss": 0.84133816, + "num_input_tokens_seen": 183607504, + "router_z_loss_mlp": 0.43212891, + "step": 2206, + "time_per_iteration": 2.656205892562866 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050624, + "balance_loss_mlp": 1.00766063, + "epoch": 0.42458637937668336, + "flos": 449665194240.0, + "grad_norm": 0.03531472123955245, + "language_loss": 0.83588743, + "learning_rate": 0.000644340918111364, + "loss": 0.84639364, + "num_input_tokens_seen": 183674720, + "router_z_loss_mlp": 0.43017578, + "step": 2207, + "time_per_iteration": 2.563599109649658 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047594, + "balance_loss_mlp": 1.00460744, + "epoch": 0.4247787610619469, + "flos": 436336164096.0, + "grad_norm": 0.035922125926704504, + "language_loss": 0.8567791, + "learning_rate": 0.0006440426116883585, + "loss": 0.86725497, + "num_input_tokens_seen": 183740448, + "router_z_loss_mlp": 0.43041992, + "step": 2208, + "time_per_iteration": 2.5554726123809814 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050276, + "balance_loss_mlp": 1.00743186, + "epoch": 0.4249711427472105, + "flos": 497122643712.0, + "grad_norm": 0.02878008588010938, + "language_loss": 0.86522639, + "learning_rate": 0.0006437442493430519, + "loss": 0.87572914, + "num_input_tokens_seen": 183812640, + "router_z_loss_mlp": 0.42895508, + "step": 2209, + "time_per_iteration": 2.698664426803589 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046212, + "balance_loss_mlp": 1.00334466, + "epoch": 0.425163524432474, + "flos": 657108910848.0, + "grad_norm": 0.03332162137783894, + "language_loss": 0.87084454, + "learning_rate": 0.000643445831191278, + "loss": 0.88130671, + "num_input_tokens_seen": 183895312, + "router_z_loss_mlp": 0.42919922, + "step": 2210, + "time_per_iteration": 2.919759750366211 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104789, + "balance_loss_mlp": 1.00526094, + "epoch": 0.4253559061177376, + "flos": 651779050752.0, + "grad_norm": 0.0360276634161647, + "language_loss": 0.82163692, + "learning_rate": 0.0006431473573488937, + "loss": 0.83211577, + "num_input_tokens_seen": 183966384, + "router_z_loss_mlp": 0.42675781, + "step": 2211, + "time_per_iteration": 2.7520995140075684 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051038, + "balance_loss_mlp": 1.00836086, + "epoch": 0.42554828780300114, + "flos": 555203853312.0, + "grad_norm": 0.03839138543396186, + "language_loss": 0.85743141, + "learning_rate": 0.0006428488279317765, + "loss": 0.86794186, + "num_input_tokens_seen": 184031728, + "router_z_loss_mlp": 0.42724609, + "step": 2212, + "time_per_iteration": 2.6509060859680176 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046098, + "balance_loss_mlp": 1.00356376, + "epoch": 0.4257406694882647, + "flos": 515422842624.0, + "grad_norm": 0.03572196481521071, + "language_loss": 0.88174772, + "learning_rate": 0.0006425502430558259, + "loss": 0.89220864, + "num_input_tokens_seen": 184096160, + "router_z_loss_mlp": 0.42578125, + "step": 2213, + "time_per_iteration": 2.6220855712890625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104908, + "balance_loss_mlp": 1.00623667, + "epoch": 0.42593305117352825, + "flos": 516705974784.0, + "grad_norm": 0.03258136107598633, + "language_loss": 0.85395515, + "learning_rate": 0.0006422516028369628, + "loss": 0.86444604, + "num_input_tokens_seen": 184169664, + "router_z_loss_mlp": 0.42895508, + "step": 2214, + "time_per_iteration": 2.6463093757629395 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043634, + "balance_loss_mlp": 1.00069499, + "epoch": 0.42612543285879184, + "flos": 589238953728.0, + "grad_norm": 0.0291937048711678, + "language_loss": 0.83896095, + "learning_rate": 0.0006419529073911296, + "loss": 0.8493973, + "num_input_tokens_seen": 184249152, + "router_z_loss_mlp": 0.42993164, + "step": 2215, + "time_per_iteration": 2.910792112350464 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048088, + "balance_loss_mlp": 1.0052923, + "epoch": 0.42631781454405543, + "flos": 636752783616.0, + "grad_norm": 0.03192715722055512, + "language_loss": 0.86142385, + "learning_rate": 0.0006416541568342901, + "loss": 0.87190473, + "num_input_tokens_seen": 184326816, + "router_z_loss_mlp": 0.4284668, + "step": 2216, + "time_per_iteration": 2.846374750137329 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046508, + "balance_loss_mlp": 1.00366437, + "epoch": 0.42651019622931896, + "flos": 542246153472.0, + "grad_norm": 0.029068811164029314, + "language_loss": 0.84547782, + "learning_rate": 0.0006413553512824297, + "loss": 0.8559429, + "num_input_tokens_seen": 184404336, + "router_z_loss_mlp": 0.42895508, + "step": 2217, + "time_per_iteration": 2.7738640308380127 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047446, + "balance_loss_mlp": 1.00467396, + "epoch": 0.42670257791458255, + "flos": 559224336384.0, + "grad_norm": 0.03125487953761627, + "language_loss": 0.85257965, + "learning_rate": 0.0006410564908515549, + "loss": 0.86305416, + "num_input_tokens_seen": 184472320, + "router_z_loss_mlp": 0.42822266, + "step": 2218, + "time_per_iteration": 2.654423713684082 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050321, + "balance_loss_mlp": 1.00757229, + "epoch": 0.4268949595998461, + "flos": 622450713600.0, + "grad_norm": 0.03350458888486861, + "language_loss": 0.85655409, + "learning_rate": 0.0006407575756576935, + "loss": 0.86705726, + "num_input_tokens_seen": 184544704, + "router_z_loss_mlp": 0.42797852, + "step": 2219, + "time_per_iteration": 2.7789905071258545 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047517, + "balance_loss_mlp": 1.00479233, + "epoch": 0.42708734128510967, + "flos": 539015963136.0, + "grad_norm": 0.029341516559542476, + "language_loss": 0.87978554, + "learning_rate": 0.0006404586058168951, + "loss": 0.8902607, + "num_input_tokens_seen": 184622544, + "router_z_loss_mlp": 0.42773438, + "step": 2220, + "time_per_iteration": 2.7526872158050537 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047218, + "balance_loss_mlp": 1.00456524, + "epoch": 0.4272797229703732, + "flos": 503862981120.0, + "grad_norm": 0.03177497968579407, + "language_loss": 0.87384629, + "learning_rate": 0.0006401595814452296, + "loss": 0.88431847, + "num_input_tokens_seen": 184692544, + "router_z_loss_mlp": 0.42700195, + "step": 2221, + "time_per_iteration": 2.620292901992798 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045405, + "balance_loss_mlp": 1.00282323, + "epoch": 0.4274721046556368, + "flos": 493438497792.0, + "grad_norm": 0.03138650703960668, + "language_loss": 0.81104958, + "learning_rate": 0.000639860502658789, + "loss": 0.82150364, + "num_input_tokens_seen": 184760480, + "router_z_loss_mlp": 0.42626953, + "step": 2222, + "time_per_iteration": 2.6335668563842773 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052846, + "balance_loss_mlp": 1.01007414, + "epoch": 0.4276644863409004, + "flos": 569462181888.0, + "grad_norm": 0.029337527326174825, + "language_loss": 0.84956491, + "learning_rate": 0.0006395613695736853, + "loss": 0.86009336, + "num_input_tokens_seen": 184834080, + "router_z_loss_mlp": 0.42822266, + "step": 2223, + "time_per_iteration": 2.69158935546875 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053205, + "balance_loss_mlp": 1.01059997, + "epoch": 0.4278568680261639, + "flos": 608563715328.0, + "grad_norm": 0.03527650476558936, + "language_loss": 0.8254534, + "learning_rate": 0.0006392621823060529, + "loss": 0.83598542, + "num_input_tokens_seen": 184905872, + "router_z_loss_mlp": 0.42651367, + "step": 2224, + "time_per_iteration": 2.7607972621917725 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042531, + "balance_loss_mlp": 0.99978256, + "epoch": 0.4280492497114275, + "flos": 561579663360.0, + "grad_norm": 0.03854840542263403, + "language_loss": 0.8576616, + "learning_rate": 0.0006389629409720465, + "loss": 0.86808693, + "num_input_tokens_seen": 184972320, + "router_z_loss_mlp": 0.42797852, + "step": 2225, + "time_per_iteration": 2.675492525100708 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046106, + "balance_loss_mlp": 1.00333333, + "epoch": 0.428241631396691, + "flos": 721902267648.0, + "grad_norm": 0.035169952304445494, + "language_loss": 0.89023572, + "learning_rate": 0.0006386636456878417, + "loss": 0.90069675, + "num_input_tokens_seen": 185051040, + "router_z_loss_mlp": 0.42822266, + "step": 2226, + "time_per_iteration": 2.8786110877990723 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046721, + "balance_loss_mlp": 1.00397301, + "epoch": 0.4284340130819546, + "flos": 430370568192.0, + "grad_norm": 0.04053005061098929, + "language_loss": 0.92206526, + "learning_rate": 0.0006383642965696353, + "loss": 0.93253243, + "num_input_tokens_seen": 185113552, + "router_z_loss_mlp": 0.42797852, + "step": 2227, + "time_per_iteration": 2.468848705291748 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048091, + "balance_loss_mlp": 1.00519955, + "epoch": 0.42862639476721814, + "flos": 526160330496.0, + "grad_norm": 0.0312355764309364, + "language_loss": 0.83643448, + "learning_rate": 0.000638064893733645, + "loss": 0.84691536, + "num_input_tokens_seen": 185185056, + "router_z_loss_mlp": 0.42944336, + "step": 2228, + "time_per_iteration": 2.7273313999176025 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048787, + "balance_loss_mlp": 1.0059433, + "epoch": 0.42881877645248173, + "flos": 466378971648.0, + "grad_norm": 0.033088247906643435, + "language_loss": 0.90412128, + "learning_rate": 0.000637765437296109, + "loss": 0.91460913, + "num_input_tokens_seen": 185257248, + "router_z_loss_mlp": 0.42895508, + "step": 2229, + "time_per_iteration": 2.6459994316101074 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104348, + "balance_loss_mlp": 1.00051713, + "epoch": 0.42901115813774526, + "flos": 561356087040.0, + "grad_norm": 0.033851055909267555, + "language_loss": 0.85812581, + "learning_rate": 0.000637465927373287, + "loss": 0.86856055, + "num_input_tokens_seen": 185324800, + "router_z_loss_mlp": 0.43017578, + "step": 2230, + "time_per_iteration": 2.6650984287261963 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051305, + "balance_loss_mlp": 1.00843728, + "epoch": 0.42920353982300885, + "flos": 562528403712.0, + "grad_norm": 0.03941473686966497, + "language_loss": 0.79439276, + "learning_rate": 0.000637166364081459, + "loss": 0.80490577, + "num_input_tokens_seen": 185393408, + "router_z_loss_mlp": 0.42919922, + "step": 2231, + "time_per_iteration": 2.6497089862823486 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045172, + "balance_loss_mlp": 1.00242341, + "epoch": 0.42939592150827244, + "flos": 557316162048.0, + "grad_norm": 0.0345529023969128, + "language_loss": 0.84757453, + "learning_rate": 0.0006368667475369256, + "loss": 0.85802627, + "num_input_tokens_seen": 185467968, + "router_z_loss_mlp": 0.42797852, + "step": 2232, + "time_per_iteration": 2.7934672832489014 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048443, + "balance_loss_mlp": 1.00753021, + "epoch": 0.42958830319353597, + "flos": 1524945185280.0, + "grad_norm": 0.006396251355867503, + "language_loss": 0.78527778, + "learning_rate": 0.0006365670778560084, + "loss": 0.79576218, + "num_input_tokens_seen": 185705232, + "router_z_loss_mlp": 0.40917969, + "step": 2233, + "time_per_iteration": 6.342620372772217 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045082, + "balance_loss_mlp": 1.0040741, + "epoch": 0.42978068487879956, + "flos": 1498872316416.0, + "grad_norm": 0.003657386104401554, + "language_loss": 0.78895426, + "learning_rate": 0.0006362673551550494, + "loss": 0.7994051, + "num_input_tokens_seen": 185932672, + "router_z_loss_mlp": 0.41015625, + "step": 2234, + "time_per_iteration": 4.862509250640869 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044747, + "balance_loss_mlp": 1.00209367, + "epoch": 0.4299730665640631, + "flos": 548063995392.0, + "grad_norm": 0.029617650166464796, + "language_loss": 0.86346197, + "learning_rate": 0.0006359675795504112, + "loss": 0.87390947, + "num_input_tokens_seen": 186006288, + "router_z_loss_mlp": 0.42700195, + "step": 2235, + "time_per_iteration": 2.747687339782715 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044967, + "balance_loss_mlp": 1.0022428, + "epoch": 0.4301654482493267, + "flos": 1131116700672.0, + "grad_norm": 0.034530900471349386, + "language_loss": 0.74852663, + "learning_rate": 0.0006356677511584775, + "loss": 0.75897634, + "num_input_tokens_seen": 186097168, + "router_z_loss_mlp": 0.42773438, + "step": 2236, + "time_per_iteration": 3.4453399181365967 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104557, + "balance_loss_mlp": 1.00291729, + "epoch": 0.4303578299345902, + "flos": 496742565120.0, + "grad_norm": 0.03572959525697719, + "language_loss": 0.8668766, + "learning_rate": 0.0006353678700956511, + "loss": 0.87733233, + "num_input_tokens_seen": 186163904, + "router_z_loss_mlp": 0.42700195, + "step": 2237, + "time_per_iteration": 2.562898874282837 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044768, + "balance_loss_mlp": 1.00228131, + "epoch": 0.4305502116198538, + "flos": 616930324992.0, + "grad_norm": 0.03185512314906856, + "language_loss": 0.84350532, + "learning_rate": 0.0006350679364783569, + "loss": 0.853953, + "num_input_tokens_seen": 186233888, + "router_z_loss_mlp": 0.42529297, + "step": 2238, + "time_per_iteration": 2.7968668937683105 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044024, + "balance_loss_mlp": 1.00139523, + "epoch": 0.4307425933051173, + "flos": 560322776064.0, + "grad_norm": 0.03209283293682184, + "language_loss": 0.85997605, + "learning_rate": 0.0006347679504230393, + "loss": 0.87041628, + "num_input_tokens_seen": 186301168, + "router_z_loss_mlp": 0.42675781, + "step": 2239, + "time_per_iteration": 2.634075880050659 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042998, + "balance_loss_mlp": 1.00039279, + "epoch": 0.4309349749903809, + "flos": 973818206976.0, + "grad_norm": 0.03253096283776471, + "language_loss": 0.77016532, + "learning_rate": 0.0006344679120461632, + "loss": 0.7805953, + "num_input_tokens_seen": 186392096, + "router_z_loss_mlp": 0.42651367, + "step": 2240, + "time_per_iteration": 3.334874153137207 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044857, + "balance_loss_mlp": 1.00222731, + "epoch": 0.4311273566756445, + "flos": 542973262848.0, + "grad_norm": 0.034862997803941254, + "language_loss": 0.8043505, + "learning_rate": 0.0006341678214642134, + "loss": 0.81479907, + "num_input_tokens_seen": 186458000, + "router_z_loss_mlp": 0.42675781, + "step": 2241, + "time_per_iteration": 2.6504814624786377 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046418, + "balance_loss_mlp": 1.00386059, + "epoch": 0.43131973836090803, + "flos": 763112219136.0, + "grad_norm": 0.032836493574204505, + "language_loss": 0.83329326, + "learning_rate": 0.0006338676787936963, + "loss": 0.84375745, + "num_input_tokens_seen": 186544992, + "router_z_loss_mlp": 0.42602539, + "step": 2242, + "time_per_iteration": 3.0819406509399414 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049013, + "balance_loss_mlp": 1.0064075, + "epoch": 0.4315121200461716, + "flos": 555603373824.0, + "grad_norm": 0.03474898353682057, + "language_loss": 0.8436116, + "learning_rate": 0.0006335674841511367, + "loss": 0.85410172, + "num_input_tokens_seen": 186614960, + "router_z_loss_mlp": 0.42651367, + "step": 2243, + "time_per_iteration": 2.688323974609375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044865, + "balance_loss_mlp": 1.00395203, + "epoch": 0.43170450173143515, + "flos": 1488689872896.0, + "grad_norm": 0.005657229041031833, + "language_loss": 0.7918117, + "learning_rate": 0.000633267237653081, + "loss": 0.80226028, + "num_input_tokens_seen": 186854288, + "router_z_loss_mlp": 0.40917969, + "step": 2244, + "time_per_iteration": 5.0437562465667725 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01041748, + "balance_loss_mlp": 1.00093079, + "epoch": 0.43189688341669874, + "flos": 1476910325760.0, + "grad_norm": 0.004174711640612148, + "language_loss": 0.77365553, + "learning_rate": 0.0006329669394160953, + "loss": 0.784073, + "num_input_tokens_seen": 187090272, + "router_z_loss_mlp": 0.40820312, + "step": 2245, + "time_per_iteration": 4.930269002914429 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105129, + "balance_loss_mlp": 1.00870872, + "epoch": 0.43208926510196227, + "flos": 493985772288.0, + "grad_norm": 0.03367129883883542, + "language_loss": 0.83325648, + "learning_rate": 0.0006326665895567652, + "loss": 0.84376937, + "num_input_tokens_seen": 187157584, + "router_z_loss_mlp": 0.42626953, + "step": 2246, + "time_per_iteration": 2.6496520042419434 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045175, + "balance_loss_mlp": 1.0025456, + "epoch": 0.43228164678722586, + "flos": 521303867904.0, + "grad_norm": 0.0373506965449987, + "language_loss": 0.88340402, + "learning_rate": 0.0006323661881916976, + "loss": 0.89385581, + "num_input_tokens_seen": 187229408, + "router_z_loss_mlp": 0.42675781, + "step": 2247, + "time_per_iteration": 2.7220535278320312 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104444, + "balance_loss_mlp": 1.00188208, + "epoch": 0.4324740284724894, + "flos": 797396173824.0, + "grad_norm": 0.03547023876634794, + "language_loss": 0.8184936, + "learning_rate": 0.0006320657354375179, + "loss": 0.82893801, + "num_input_tokens_seen": 187304384, + "router_z_loss_mlp": 0.42602539, + "step": 2248, + "time_per_iteration": 2.939730405807495 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047221, + "balance_loss_mlp": 1.00463986, + "epoch": 0.432666410157753, + "flos": 483098585088.0, + "grad_norm": 0.03653679675435745, + "language_loss": 0.87333679, + "learning_rate": 0.0006317652314108726, + "loss": 0.88380903, + "num_input_tokens_seen": 187368064, + "router_z_loss_mlp": 0.42626953, + "step": 2249, + "time_per_iteration": 2.554605007171631 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104756, + "balance_loss_mlp": 1.00512183, + "epoch": 0.43285879184301657, + "flos": 501210200832.0, + "grad_norm": 0.035110898136686476, + "language_loss": 0.91870761, + "learning_rate": 0.0006314646762284277, + "loss": 0.92918324, + "num_input_tokens_seen": 187436320, + "router_z_loss_mlp": 0.42480469, + "step": 2250, + "time_per_iteration": 2.6592071056365967 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051212, + "balance_loss_mlp": 1.01029968, + "epoch": 0.4330511735282801, + "flos": 1513793592576.0, + "grad_norm": 0.004753866691066904, + "language_loss": 0.75425828, + "learning_rate": 0.0006311640700068691, + "loss": 0.76477039, + "num_input_tokens_seen": 187670912, + "router_z_loss_mlp": 0.40917969, + "step": 2251, + "time_per_iteration": 4.880429267883301 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050666, + "balance_loss_mlp": 1.00837088, + "epoch": 0.4332435552135437, + "flos": 700838472960.0, + "grad_norm": 0.03213295924784481, + "language_loss": 0.77973437, + "learning_rate": 0.0006308634128629022, + "loss": 0.790241, + "num_input_tokens_seen": 187746432, + "router_z_loss_mlp": 0.42333984, + "step": 2252, + "time_per_iteration": 2.882138729095459 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048462, + "balance_loss_mlp": 1.00621462, + "epoch": 0.4334359368988072, + "flos": 593483013120.0, + "grad_norm": 0.03310670466815904, + "language_loss": 0.87855673, + "learning_rate": 0.0006305627049132531, + "loss": 0.8890413, + "num_input_tokens_seen": 187820032, + "router_z_loss_mlp": 0.42285156, + "step": 2253, + "time_per_iteration": 2.756601095199585 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052718, + "balance_loss_mlp": 1.01049364, + "epoch": 0.4336283185840708, + "flos": 844276213248.0, + "grad_norm": 0.028181128656308053, + "language_loss": 0.86222875, + "learning_rate": 0.0006302619462746662, + "loss": 0.87275594, + "num_input_tokens_seen": 187904400, + "router_z_loss_mlp": 0.42260742, + "step": 2254, + "time_per_iteration": 3.1384341716766357 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049827, + "balance_loss_mlp": 1.00748384, + "epoch": 0.43382070026933434, + "flos": 627402440448.0, + "grad_norm": 0.031912731462448586, + "language_loss": 0.90840006, + "learning_rate": 0.0006299611370639069, + "loss": 0.91889828, + "num_input_tokens_seen": 187973264, + "router_z_loss_mlp": 0.42382812, + "step": 2255, + "time_per_iteration": 2.712411642074585 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049827, + "balance_loss_mlp": 1.00746036, + "epoch": 0.4340130819545979, + "flos": 592210574592.0, + "grad_norm": 0.034079381595113686, + "language_loss": 0.79521996, + "learning_rate": 0.0006296602773977593, + "loss": 0.80571818, + "num_input_tokens_seen": 188039984, + "router_z_loss_mlp": 0.42407227, + "step": 2256, + "time_per_iteration": 2.714035987854004 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044192, + "balance_loss_mlp": 1.00182462, + "epoch": 0.4342054636398615, + "flos": 491956088832.0, + "grad_norm": 0.031173748742501443, + "language_loss": 0.88170785, + "learning_rate": 0.0006293593673930277, + "loss": 0.89214981, + "num_input_tokens_seen": 188113456, + "router_z_loss_mlp": 0.42407227, + "step": 2257, + "time_per_iteration": 2.6403400897979736 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050154, + "balance_loss_mlp": 1.00771534, + "epoch": 0.43439784532512504, + "flos": 700261062912.0, + "grad_norm": 0.031956889919079245, + "language_loss": 0.79138076, + "learning_rate": 0.0006290584071665358, + "loss": 0.80188227, + "num_input_tokens_seen": 188192480, + "router_z_loss_mlp": 0.42480469, + "step": 2258, + "time_per_iteration": 2.88726544380188 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051787, + "balance_loss_mlp": 1.00942004, + "epoch": 0.43459022701038863, + "flos": 486802172928.0, + "grad_norm": 0.03220669099915263, + "language_loss": 0.82764459, + "learning_rate": 0.0006287573968351266, + "loss": 0.83816242, + "num_input_tokens_seen": 188258784, + "router_z_loss_mlp": 0.42407227, + "step": 2259, + "time_per_iteration": 2.556873083114624 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045397, + "balance_loss_mlp": 1.00314939, + "epoch": 0.43478260869565216, + "flos": 644267862528.0, + "grad_norm": 0.0421666552527836, + "language_loss": 0.83019865, + "learning_rate": 0.0006284563365156626, + "loss": 0.84065259, + "num_input_tokens_seen": 188331312, + "router_z_loss_mlp": 0.42285156, + "step": 2260, + "time_per_iteration": 2.7845253944396973 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044012, + "balance_loss_mlp": 1.0014782, + "epoch": 0.43497499038091575, + "flos": 427010120448.0, + "grad_norm": 0.03632893260701325, + "language_loss": 0.87946701, + "learning_rate": 0.0006281552263250261, + "loss": 0.88990712, + "num_input_tokens_seen": 188393712, + "router_z_loss_mlp": 0.42578125, + "step": 2261, + "time_per_iteration": 2.4605414867401123 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050743, + "balance_loss_mlp": 1.00973511, + "epoch": 0.4351673720661793, + "flos": 1541527738368.0, + "grad_norm": 0.007050141628338806, + "language_loss": 0.80691534, + "learning_rate": 0.000627854066380118, + "loss": 0.81742275, + "num_input_tokens_seen": 188621152, + "router_z_loss_mlp": 0.41015625, + "step": 2262, + "time_per_iteration": 4.901712656021118 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105392, + "balance_loss_mlp": 1.01160097, + "epoch": 0.43535975375144287, + "flos": 750466556928.0, + "grad_norm": 0.036118497785784055, + "language_loss": 0.8206706, + "learning_rate": 0.0006275528567978593, + "loss": 0.83120978, + "num_input_tokens_seen": 188697120, + "router_z_loss_mlp": 0.42358398, + "step": 2263, + "time_per_iteration": 2.9023561477661133 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049817, + "balance_loss_mlp": 1.00749719, + "epoch": 0.4355521354367064, + "flos": 862752356352.0, + "grad_norm": 0.037575674234966834, + "language_loss": 0.82972687, + "learning_rate": 0.0006272515976951898, + "loss": 0.84022498, + "num_input_tokens_seen": 188778480, + "router_z_loss_mlp": 0.42358398, + "step": 2264, + "time_per_iteration": 3.062626361846924 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043084, + "balance_loss_mlp": 1.00086057, + "epoch": 0.43574451712197, + "flos": 735843700992.0, + "grad_norm": 0.027621901281680974, + "language_loss": 0.7971707, + "learning_rate": 0.0006269502891890687, + "loss": 0.80760157, + "num_input_tokens_seen": 188863616, + "router_z_loss_mlp": 0.42260742, + "step": 2265, + "time_per_iteration": 3.006544351577759 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047735, + "balance_loss_mlp": 1.00548732, + "epoch": 0.4359368988072336, + "flos": 571713496320.0, + "grad_norm": 0.03795602123750952, + "language_loss": 0.88080567, + "learning_rate": 0.0006266489313964743, + "loss": 0.89128304, + "num_input_tokens_seen": 188933984, + "router_z_loss_mlp": 0.42285156, + "step": 2266, + "time_per_iteration": 2.7217609882354736 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048608, + "balance_loss_mlp": 1.00633645, + "epoch": 0.4361292804924971, + "flos": 556671677952.0, + "grad_norm": 0.02985944883667051, + "language_loss": 0.86046827, + "learning_rate": 0.0006263475244344041, + "loss": 0.87095433, + "num_input_tokens_seen": 189012976, + "router_z_loss_mlp": 0.4230957, + "step": 2267, + "time_per_iteration": 2.844616651535034 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048894, + "balance_loss_mlp": 1.00688469, + "epoch": 0.4363216621777607, + "flos": 558349473024.0, + "grad_norm": 0.03645132335916721, + "language_loss": 0.84930134, + "learning_rate": 0.0006260460684198746, + "loss": 0.85979033, + "num_input_tokens_seen": 189079664, + "router_z_loss_mlp": 0.42041016, + "step": 2268, + "time_per_iteration": 2.6209938526153564 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046517, + "balance_loss_mlp": 1.00457883, + "epoch": 0.4365140438630242, + "flos": 479197665792.0, + "grad_norm": 0.03681259693925087, + "language_loss": 0.84888554, + "learning_rate": 0.0006257445634699213, + "loss": 0.85935068, + "num_input_tokens_seen": 189144688, + "router_z_loss_mlp": 0.41967773, + "step": 2269, + "time_per_iteration": 2.5371193885803223 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048717, + "balance_loss_mlp": 1.00675571, + "epoch": 0.4367064255482878, + "flos": 580008174336.0, + "grad_norm": 0.03379370609735099, + "language_loss": 0.83707798, + "learning_rate": 0.0006254430097015993, + "loss": 0.84756517, + "num_input_tokens_seen": 189213984, + "router_z_loss_mlp": 0.41992188, + "step": 2270, + "time_per_iteration": 2.663670539855957 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053223, + "balance_loss_mlp": 1.01278687, + "epoch": 0.43689880723355135, + "flos": 1462274830848.0, + "grad_norm": 0.005499517712732893, + "language_loss": 0.76479089, + "learning_rate": 0.0006251414072319815, + "loss": 0.77532315, + "num_input_tokens_seen": 189434416, + "router_z_loss_mlp": 0.40429688, + "step": 2271, + "time_per_iteration": 4.872848033905029 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051249, + "balance_loss_mlp": 1.00945389, + "epoch": 0.43709118891881493, + "flos": 668874852096.0, + "grad_norm": 0.028346757116800847, + "language_loss": 0.85555887, + "learning_rate": 0.0006248397561781609, + "loss": 0.86607134, + "num_input_tokens_seen": 189513248, + "router_z_loss_mlp": 0.41821289, + "step": 2272, + "time_per_iteration": 2.8525848388671875 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052054, + "balance_loss_mlp": 1.01004434, + "epoch": 0.43728357060407846, + "flos": 545914748160.0, + "grad_norm": 0.03971939435737374, + "language_loss": 0.86681366, + "learning_rate": 0.0006245380566572482, + "loss": 0.87733418, + "num_input_tokens_seen": 189585392, + "router_z_loss_mlp": 0.42041016, + "step": 2273, + "time_per_iteration": 2.65950608253479 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052441, + "balance_loss_mlp": 1.01047897, + "epoch": 0.43747595228934205, + "flos": 748185106944.0, + "grad_norm": 0.03474296828051499, + "language_loss": 0.764799, + "learning_rate": 0.0006242363087863744, + "loss": 0.77532339, + "num_input_tokens_seen": 189667552, + "router_z_loss_mlp": 0.41992188, + "step": 2274, + "time_per_iteration": 3.009678363800049 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044115, + "balance_loss_mlp": 1.00212932, + "epoch": 0.43766833397460564, + "flos": 632530111488.0, + "grad_norm": 0.043644038275203835, + "language_loss": 0.86733937, + "learning_rate": 0.0006239345126826878, + "loss": 0.87778056, + "num_input_tokens_seen": 189742048, + "router_z_loss_mlp": 0.42016602, + "step": 2275, + "time_per_iteration": 2.7913572788238525 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042942, + "balance_loss_mlp": 1.00093269, + "epoch": 0.43786071565986917, + "flos": 532099681536.0, + "grad_norm": 0.03488456741245989, + "language_loss": 0.84520668, + "learning_rate": 0.0006236326684633561, + "loss": 0.85563612, + "num_input_tokens_seen": 189817968, + "router_z_loss_mlp": 0.42041016, + "step": 2276, + "time_per_iteration": 2.868460178375244 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047564, + "balance_loss_mlp": 1.00567341, + "epoch": 0.43805309734513276, + "flos": 539558380032.0, + "grad_norm": 0.04090877877929134, + "language_loss": 0.75841373, + "learning_rate": 0.0006233307762455658, + "loss": 0.76888937, + "num_input_tokens_seen": 189882608, + "router_z_loss_mlp": 0.41918945, + "step": 2277, + "time_per_iteration": 2.675471782684326 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047689, + "balance_loss_mlp": 1.00563169, + "epoch": 0.4382454790303963, + "flos": 865965050112.0, + "grad_norm": 0.057141626101515054, + "language_loss": 0.83989596, + "learning_rate": 0.0006230288361465216, + "loss": 0.85037291, + "num_input_tokens_seen": 189960608, + "router_z_loss_mlp": 0.42089844, + "step": 2278, + "time_per_iteration": 3.0322673320770264 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047559, + "balance_loss_mlp": 1.005216, + "epoch": 0.4384378607156599, + "flos": 766802201088.0, + "grad_norm": 0.03709867443192191, + "language_loss": 0.85241038, + "learning_rate": 0.0006227268482834473, + "loss": 0.86288601, + "num_input_tokens_seen": 190035472, + "router_z_loss_mlp": 0.42382812, + "step": 2279, + "time_per_iteration": 2.900203227996826 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044865, + "balance_loss_mlp": 1.0024029, + "epoch": 0.4386302424009234, + "flos": 669797347584.0, + "grad_norm": 0.03112976006735108, + "language_loss": 0.87510288, + "learning_rate": 0.000622424812773585, + "loss": 0.88555157, + "num_input_tokens_seen": 190109312, + "router_z_loss_mlp": 0.42504883, + "step": 2280, + "time_per_iteration": 2.8384146690368652 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048428, + "balance_loss_mlp": 1.00591767, + "epoch": 0.438822624086187, + "flos": 486150885888.0, + "grad_norm": 0.037274279546085635, + "language_loss": 0.8020004, + "learning_rate": 0.000622122729734195, + "loss": 0.81248468, + "num_input_tokens_seen": 190174176, + "router_z_loss_mlp": 0.42553711, + "step": 2281, + "time_per_iteration": 2.6004860401153564 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048001, + "balance_loss_mlp": 1.00549114, + "epoch": 0.4390150057714506, + "flos": 500259515136.0, + "grad_norm": 0.032261530197162686, + "language_loss": 0.88006121, + "learning_rate": 0.0006218205992825566, + "loss": 0.8905412, + "num_input_tokens_seen": 190243888, + "router_z_loss_mlp": 0.42553711, + "step": 2282, + "time_per_iteration": 2.619781494140625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049893, + "balance_loss_mlp": 1.00745404, + "epoch": 0.4392073874567141, + "flos": 559352648448.0, + "grad_norm": 0.035010140104523226, + "language_loss": 0.8217926, + "learning_rate": 0.0006215184215359671, + "loss": 0.83229148, + "num_input_tokens_seen": 190317504, + "router_z_loss_mlp": 0.42480469, + "step": 2283, + "time_per_iteration": 2.7295265197753906 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104749, + "balance_loss_mlp": 1.00495577, + "epoch": 0.4393997691419777, + "flos": 606423216384.0, + "grad_norm": 0.031848598857185544, + "language_loss": 0.86998332, + "learning_rate": 0.0006212161966117425, + "loss": 0.88045812, + "num_input_tokens_seen": 190390160, + "router_z_loss_mlp": 0.42578125, + "step": 2284, + "time_per_iteration": 2.718440532684326 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048609, + "balance_loss_mlp": 1.00607538, + "epoch": 0.43959215082724123, + "flos": 805484772096.0, + "grad_norm": 0.035712970592664255, + "language_loss": 0.82239711, + "learning_rate": 0.0006209139246272164, + "loss": 0.83288318, + "num_input_tokens_seen": 190467600, + "router_z_loss_mlp": 0.42578125, + "step": 2285, + "time_per_iteration": 2.9688222408294678 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050536, + "balance_loss_mlp": 1.00793087, + "epoch": 0.4397845325125048, + "flos": 488608280064.0, + "grad_norm": 0.03687327973299051, + "language_loss": 0.82202113, + "learning_rate": 0.0006206116056997421, + "loss": 0.8325265, + "num_input_tokens_seen": 190534192, + "router_z_loss_mlp": 0.42651367, + "step": 2286, + "time_per_iteration": 2.5476558208465576 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048309, + "balance_loss_mlp": 1.00579894, + "epoch": 0.43997691419776835, + "flos": 481785317376.0, + "grad_norm": 0.030160303580515496, + "language_loss": 0.8299154, + "learning_rate": 0.0006203092399466892, + "loss": 0.84039849, + "num_input_tokens_seen": 190601440, + "router_z_loss_mlp": 0.42553711, + "step": 2287, + "time_per_iteration": 2.5308852195739746 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047877, + "balance_loss_mlp": 1.00539064, + "epoch": 0.44016929588303194, + "flos": 484129950720.0, + "grad_norm": 0.02729114822665251, + "language_loss": 0.85650307, + "learning_rate": 0.0006200068274854473, + "loss": 0.8669818, + "num_input_tokens_seen": 190672528, + "router_z_loss_mlp": 0.42529297, + "step": 2288, + "time_per_iteration": 2.6596133708953857 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045912, + "balance_loss_mlp": 1.00361645, + "epoch": 0.4403616775682955, + "flos": 573024818688.0, + "grad_norm": 0.028573956325372987, + "language_loss": 0.86632061, + "learning_rate": 0.0006197043684334229, + "loss": 0.87677968, + "num_input_tokens_seen": 190750704, + "router_z_loss_mlp": 0.42333984, + "step": 2289, + "time_per_iteration": 2.773327350616455 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047356, + "balance_loss_mlp": 1.00496542, + "epoch": 0.44055405925355906, + "flos": 632000333568.0, + "grad_norm": 0.03542319310998882, + "language_loss": 0.80357343, + "learning_rate": 0.0006194018629080411, + "loss": 0.81404698, + "num_input_tokens_seen": 190821664, + "router_z_loss_mlp": 0.42431641, + "step": 2290, + "time_per_iteration": 2.7465741634368896 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046814, + "balance_loss_mlp": 1.00444698, + "epoch": 0.44074644093882265, + "flos": 537826149888.0, + "grad_norm": 0.033710926441732514, + "language_loss": 0.82429153, + "learning_rate": 0.0006190993110267451, + "loss": 0.83475971, + "num_input_tokens_seen": 190893888, + "router_z_loss_mlp": 0.42407227, + "step": 2291, + "time_per_iteration": 2.734936237335205 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104638, + "balance_loss_mlp": 1.00401258, + "epoch": 0.4409388226240862, + "flos": 464166541056.0, + "grad_norm": 0.03677198311176373, + "language_loss": 0.84841394, + "learning_rate": 0.0006187967129069958, + "loss": 0.85887772, + "num_input_tokens_seen": 190956800, + "router_z_loss_mlp": 0.42407227, + "step": 2292, + "time_per_iteration": 2.491478443145752 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048193, + "balance_loss_mlp": 1.00604105, + "epoch": 0.44113120430934977, + "flos": 567161289984.0, + "grad_norm": 0.027373577802651455, + "language_loss": 0.87309539, + "learning_rate": 0.0006184940686662722, + "loss": 0.88357735, + "num_input_tokens_seen": 191032048, + "router_z_loss_mlp": 0.421875, + "step": 2293, + "time_per_iteration": 2.7358779907226562 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045965, + "balance_loss_mlp": 1.00371683, + "epoch": 0.4413235859946133, + "flos": 544675357440.0, + "grad_norm": 0.03072432375615432, + "language_loss": 0.9056381, + "learning_rate": 0.0006181913784220714, + "loss": 0.91609776, + "num_input_tokens_seen": 191099952, + "router_z_loss_mlp": 0.42285156, + "step": 2294, + "time_per_iteration": 2.6358015537261963 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045582, + "balance_loss_mlp": 1.00485992, + "epoch": 0.4415159676798769, + "flos": 1573305688320.0, + "grad_norm": 0.007789835090792861, + "language_loss": 0.80553782, + "learning_rate": 0.0006178886422919078, + "loss": 0.81599367, + "num_input_tokens_seen": 191335968, + "router_z_loss_mlp": 0.40722656, + "step": 2295, + "time_per_iteration": 4.902246713638306 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044044, + "balance_loss_mlp": 1.00181961, + "epoch": 0.4417083493651404, + "flos": 660013457664.0, + "grad_norm": 0.029698143477661094, + "language_loss": 0.80193049, + "learning_rate": 0.0006175858603933146, + "loss": 0.8123709, + "num_input_tokens_seen": 191410112, + "router_z_loss_mlp": 0.42260742, + "step": 2296, + "time_per_iteration": 2.8894712924957275 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010469, + "balance_loss_mlp": 1.00477171, + "epoch": 0.441900731050404, + "flos": 741818045184.0, + "grad_norm": 0.03343125158047759, + "language_loss": 0.81235009, + "learning_rate": 0.0006172830328438416, + "loss": 0.82281911, + "num_input_tokens_seen": 191491552, + "router_z_loss_mlp": 0.42163086, + "step": 2297, + "time_per_iteration": 3.03363299369812 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043079, + "balance_loss_mlp": 1.00080705, + "epoch": 0.44209311273566754, + "flos": 540596548608.0, + "grad_norm": 0.03516131163144532, + "language_loss": 0.87775767, + "learning_rate": 0.0006169801597610572, + "loss": 0.88818848, + "num_input_tokens_seen": 191567872, + "router_z_loss_mlp": 0.4230957, + "step": 2298, + "time_per_iteration": 2.7615511417388916 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047937, + "balance_loss_mlp": 1.00580859, + "epoch": 0.4422854944209311, + "flos": 622730670336.0, + "grad_norm": 0.03691263796350213, + "language_loss": 0.90342188, + "learning_rate": 0.0006166772412625469, + "loss": 0.91390121, + "num_input_tokens_seen": 191638032, + "router_z_loss_mlp": 0.42163086, + "step": 2299, + "time_per_iteration": 2.757885456085205 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044546, + "balance_loss_mlp": 1.00208378, + "epoch": 0.4424778761061947, + "flos": 660061089792.0, + "grad_norm": 0.03315959572172903, + "language_loss": 0.82509053, + "learning_rate": 0.0006163742774659141, + "loss": 0.835536, + "num_input_tokens_seen": 191709104, + "router_z_loss_mlp": 0.42504883, + "step": 2300, + "time_per_iteration": 2.8489365577697754 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045508, + "balance_loss_mlp": 1.00316477, + "epoch": 0.44267025779145824, + "flos": 569703254784.0, + "grad_norm": 0.02877714461404429, + "language_loss": 0.86486191, + "learning_rate": 0.0006160712684887801, + "loss": 0.87531698, + "num_input_tokens_seen": 191787072, + "router_z_loss_mlp": 0.42382812, + "step": 2301, + "time_per_iteration": 2.783581495285034 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043512, + "balance_loss_mlp": 1.00126386, + "epoch": 0.44286263947672183, + "flos": 497819617536.0, + "grad_norm": 0.032325076823307486, + "language_loss": 0.82883227, + "learning_rate": 0.0006157682144487832, + "loss": 0.83926737, + "num_input_tokens_seen": 191863040, + "router_z_loss_mlp": 0.42285156, + "step": 2302, + "time_per_iteration": 2.8138058185577393 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046294, + "balance_loss_mlp": 1.00395119, + "epoch": 0.44305502116198536, + "flos": 610608950016.0, + "grad_norm": 0.032307808069359366, + "language_loss": 0.83262819, + "learning_rate": 0.0006154651154635793, + "loss": 0.84309107, + "num_input_tokens_seen": 191940352, + "router_z_loss_mlp": 0.42382812, + "step": 2303, + "time_per_iteration": 2.9065494537353516 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045793, + "balance_loss_mlp": 1.00349796, + "epoch": 0.44324740284724895, + "flos": 471742857984.0, + "grad_norm": 0.03422426159351285, + "language_loss": 0.85742319, + "learning_rate": 0.0006151619716508421, + "loss": 0.86788118, + "num_input_tokens_seen": 192006896, + "router_z_loss_mlp": 0.42333984, + "step": 2304, + "time_per_iteration": 2.5973682403564453 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104609, + "balance_loss_mlp": 1.00377095, + "epoch": 0.4434397845325125, + "flos": 579812788224.0, + "grad_norm": 0.032225909976612614, + "language_loss": 0.87212336, + "learning_rate": 0.0006148587831282625, + "loss": 0.88258433, + "num_input_tokens_seen": 192075312, + "router_z_loss_mlp": 0.42358398, + "step": 2305, + "time_per_iteration": 2.6349332332611084 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046734, + "balance_loss_mlp": 1.00563049, + "epoch": 0.44363216621777607, + "flos": 1499997967872.0, + "grad_norm": 0.0072841640427745245, + "language_loss": 0.79176068, + "learning_rate": 0.0006145555500135483, + "loss": 0.80222803, + "num_input_tokens_seen": 192304816, + "router_z_loss_mlp": 0.41113281, + "step": 2306, + "time_per_iteration": 4.920953989028931 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047952, + "balance_loss_mlp": 1.00565624, + "epoch": 0.44382454790303966, + "flos": 478285863936.0, + "grad_norm": 0.035350800366555836, + "language_loss": 0.87850344, + "learning_rate": 0.0006142522724244255, + "loss": 0.88898295, + "num_input_tokens_seen": 192369232, + "router_z_loss_mlp": 0.42333984, + "step": 2307, + "time_per_iteration": 2.5206384658813477 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044529, + "balance_loss_mlp": 1.00361633, + "epoch": 0.4440169295883032, + "flos": 1547306696448.0, + "grad_norm": 0.0037013242818687312, + "language_loss": 0.76484716, + "learning_rate": 0.0006139489504786368, + "loss": 0.77529252, + "num_input_tokens_seen": 192600176, + "router_z_loss_mlp": 0.40917969, + "step": 2308, + "time_per_iteration": 4.906585454940796 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047856, + "balance_loss_mlp": 1.00570333, + "epoch": 0.4442093112735668, + "flos": 592291254528.0, + "grad_norm": 0.03559804859588436, + "language_loss": 0.78114909, + "learning_rate": 0.000613645584293942, + "loss": 0.79162765, + "num_input_tokens_seen": 192675424, + "router_z_loss_mlp": 0.421875, + "step": 2309, + "time_per_iteration": 2.9084970951080322 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049948, + "balance_loss_mlp": 1.00767648, + "epoch": 0.4444016929588303, + "flos": 531328830720.0, + "grad_norm": 0.036447190975963356, + "language_loss": 0.83448339, + "learning_rate": 0.0006133421739881185, + "loss": 0.84498286, + "num_input_tokens_seen": 192747552, + "router_z_loss_mlp": 0.4230957, + "step": 2310, + "time_per_iteration": 2.652672052383423 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044821, + "balance_loss_mlp": 1.0026927, + "epoch": 0.4445940746440939, + "flos": 621389212416.0, + "grad_norm": 0.035906278639006764, + "language_loss": 0.83511341, + "learning_rate": 0.0006130387196789605, + "loss": 0.84556162, + "num_input_tokens_seen": 192819984, + "router_z_loss_mlp": 0.42163086, + "step": 2311, + "time_per_iteration": 2.747197151184082 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045555, + "balance_loss_mlp": 1.00328362, + "epoch": 0.4447864563293574, + "flos": 630376973568.0, + "grad_norm": 0.027043038636915952, + "language_loss": 0.84677482, + "learning_rate": 0.0006127352214842795, + "loss": 0.85723037, + "num_input_tokens_seen": 192906080, + "router_z_loss_mlp": 0.4230957, + "step": 2312, + "time_per_iteration": 3.0515668392181396 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045637, + "balance_loss_mlp": 1.00327015, + "epoch": 0.444978838014621, + "flos": 652002627072.0, + "grad_norm": 0.034195517498726076, + "language_loss": 0.85929281, + "learning_rate": 0.0006124316795219041, + "loss": 0.86974919, + "num_input_tokens_seen": 192972336, + "router_z_loss_mlp": 0.42407227, + "step": 2313, + "time_per_iteration": 2.778184652328491 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050022, + "balance_loss_mlp": 1.00786984, + "epoch": 0.44517121969988455, + "flos": 613589319168.0, + "grad_norm": 0.029604729226228255, + "language_loss": 0.82924336, + "learning_rate": 0.0006121280939096794, + "loss": 0.83974361, + "num_input_tokens_seen": 193045744, + "router_z_loss_mlp": 0.421875, + "step": 2314, + "time_per_iteration": 2.7615392208099365 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045707, + "balance_loss_mlp": 1.00350666, + "epoch": 0.44536360138514813, + "flos": 489715468032.0, + "grad_norm": 0.036472505020621125, + "language_loss": 0.8826952, + "learning_rate": 0.000611824464765468, + "loss": 0.89315224, + "num_input_tokens_seen": 193115248, + "router_z_loss_mlp": 0.42236328, + "step": 2315, + "time_per_iteration": 2.67606782913208 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01058411, + "balance_loss_mlp": 1.01759338, + "epoch": 0.4455559830704117, + "flos": 1519056390144.0, + "grad_norm": 0.01193419136680653, + "language_loss": 0.78594941, + "learning_rate": 0.0006115207922071492, + "loss": 0.79653352, + "num_input_tokens_seen": 193330816, + "router_z_loss_mlp": 0.40820312, + "step": 2316, + "time_per_iteration": 4.725375652313232 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045978, + "balance_loss_mlp": 1.00384891, + "epoch": 0.44574836475567525, + "flos": 616817564160.0, + "grad_norm": 0.032139423648612636, + "language_loss": 0.85745513, + "learning_rate": 0.000611217076352619, + "loss": 0.86791497, + "num_input_tokens_seen": 193407616, + "router_z_loss_mlp": 0.42163086, + "step": 2317, + "time_per_iteration": 2.8277692794799805 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046443, + "balance_loss_mlp": 1.00429094, + "epoch": 0.44594074644093884, + "flos": 507434366208.0, + "grad_norm": 0.030845694350894858, + "language_loss": 0.83782113, + "learning_rate": 0.0006109133173197905, + "loss": 0.84828556, + "num_input_tokens_seen": 193482624, + "router_z_loss_mlp": 0.421875, + "step": 2318, + "time_per_iteration": 2.740814685821533 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044287, + "balance_loss_mlp": 1.0021348, + "epoch": 0.44613312812620237, + "flos": 728313070848.0, + "grad_norm": 0.03532114030566384, + "language_loss": 0.86011016, + "learning_rate": 0.0006106095152265935, + "loss": 0.87055302, + "num_input_tokens_seen": 193555952, + "router_z_loss_mlp": 0.421875, + "step": 2319, + "time_per_iteration": 2.982090473175049 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048334, + "balance_loss_mlp": 1.00615764, + "epoch": 0.44632550981146596, + "flos": 637058985216.0, + "grad_norm": 0.029959494040304766, + "language_loss": 0.85331011, + "learning_rate": 0.0006103056701909739, + "loss": 0.86379343, + "num_input_tokens_seen": 193636672, + "router_z_loss_mlp": 0.42211914, + "step": 2320, + "time_per_iteration": 2.911764621734619 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050916, + "balance_loss_mlp": 1.00878716, + "epoch": 0.4465178914967295, + "flos": 828618100992.0, + "grad_norm": 0.026414177364328564, + "language_loss": 0.83389866, + "learning_rate": 0.0006100017823308956, + "loss": 0.8444078, + "num_input_tokens_seen": 193721728, + "router_z_loss_mlp": 0.42163086, + "step": 2321, + "time_per_iteration": 3.166370153427124 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048729, + "balance_loss_mlp": 1.00672007, + "epoch": 0.4467102731819931, + "flos": 667033751808.0, + "grad_norm": 0.03675396641442824, + "language_loss": 0.80177474, + "learning_rate": 0.0006096978517643377, + "loss": 0.81226206, + "num_input_tokens_seen": 193795456, + "router_z_loss_mlp": 0.42041016, + "step": 2322, + "time_per_iteration": 2.7839677333831787 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049863, + "balance_loss_mlp": 1.00780618, + "epoch": 0.4469026548672566, + "flos": 513970569216.0, + "grad_norm": 0.036357166954029595, + "language_loss": 0.84299958, + "learning_rate": 0.0006093938786092968, + "loss": 0.85349822, + "num_input_tokens_seen": 193865520, + "router_z_loss_mlp": 0.42089844, + "step": 2323, + "time_per_iteration": 2.6366002559661865 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052625, + "balance_loss_mlp": 1.01054394, + "epoch": 0.4470950365525202, + "flos": 685286318592.0, + "grad_norm": 0.03621901423501995, + "language_loss": 0.9042533, + "learning_rate": 0.0006090898629837857, + "loss": 0.91477954, + "num_input_tokens_seen": 193935040, + "router_z_loss_mlp": 0.42114258, + "step": 2324, + "time_per_iteration": 2.8338427543640137 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047182, + "balance_loss_mlp": 1.00514829, + "epoch": 0.4472874182377838, + "flos": 628535873280.0, + "grad_norm": 0.028780974393906523, + "language_loss": 0.87792349, + "learning_rate": 0.0006087858050058337, + "loss": 0.88839531, + "num_input_tokens_seen": 194009120, + "router_z_loss_mlp": 0.4206543, + "step": 2325, + "time_per_iteration": 2.7868492603302 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047425, + "balance_loss_mlp": 1.00534439, + "epoch": 0.4474797999230473, + "flos": 548241884928.0, + "grad_norm": 0.03362424978515615, + "language_loss": 0.83227015, + "learning_rate": 0.0006084817047934866, + "loss": 0.84274435, + "num_input_tokens_seen": 194076672, + "router_z_loss_mlp": 0.42114258, + "step": 2326, + "time_per_iteration": 2.6603922843933105 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105345, + "balance_loss_mlp": 1.01144028, + "epoch": 0.4476721816083109, + "flos": 456757420032.0, + "grad_norm": 0.033869443234677665, + "language_loss": 0.90294945, + "learning_rate": 0.0006081775624648066, + "loss": 0.91348392, + "num_input_tokens_seen": 194142320, + "router_z_loss_mlp": 0.42041016, + "step": 2327, + "time_per_iteration": 2.563965082168579 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049502, + "balance_loss_mlp": 1.00730181, + "epoch": 0.44786456329357444, + "flos": 482501733120.0, + "grad_norm": 0.03973119590818811, + "language_loss": 0.83093679, + "learning_rate": 0.0006078733781378721, + "loss": 0.8414318, + "num_input_tokens_seen": 194208560, + "router_z_loss_mlp": 0.42236328, + "step": 2328, + "time_per_iteration": 2.5500621795654297 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01056217, + "balance_loss_mlp": 1.01401651, + "epoch": 0.448056944978838, + "flos": 553237353216.0, + "grad_norm": 0.0336771809947293, + "language_loss": 0.82818258, + "learning_rate": 0.0006075691519307781, + "loss": 0.83874476, + "num_input_tokens_seen": 194288080, + "router_z_loss_mlp": 0.42236328, + "step": 2329, + "time_per_iteration": 2.8369436264038086 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053357, + "balance_loss_mlp": 1.01125205, + "epoch": 0.44824932666410156, + "flos": 551917282560.0, + "grad_norm": 0.03290883990888194, + "language_loss": 0.81853932, + "learning_rate": 0.0006072648839616356, + "loss": 0.82907289, + "num_input_tokens_seen": 194358464, + "router_z_loss_mlp": 0.42138672, + "step": 2330, + "time_per_iteration": 2.707853078842163 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050692, + "balance_loss_mlp": 1.00861132, + "epoch": 0.44844170834936514, + "flos": 990273414912.0, + "grad_norm": 0.029288900679948552, + "language_loss": 0.83132529, + "learning_rate": 0.0006069605743485718, + "loss": 0.84183216, + "num_input_tokens_seen": 194456112, + "router_z_loss_mlp": 0.42114258, + "step": 2331, + "time_per_iteration": 3.347529649734497 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053351, + "balance_loss_mlp": 1.011127, + "epoch": 0.44863409003462873, + "flos": 592451647488.0, + "grad_norm": 0.033148459483392366, + "language_loss": 0.84139442, + "learning_rate": 0.0006066562232097303, + "loss": 0.85192794, + "num_input_tokens_seen": 194526880, + "router_z_loss_mlp": 0.42260742, + "step": 2332, + "time_per_iteration": 2.7059993743896484 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048256, + "balance_loss_mlp": 1.00600874, + "epoch": 0.44882647171989226, + "flos": 725985934080.0, + "grad_norm": 0.033171968523288915, + "language_loss": 0.86700636, + "learning_rate": 0.0006063518306632708, + "loss": 0.87748891, + "num_input_tokens_seen": 194606800, + "router_z_loss_mlp": 0.42285156, + "step": 2333, + "time_per_iteration": 2.9296460151672363 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048294, + "balance_loss_mlp": 1.00607038, + "epoch": 0.44901885340515585, + "flos": 535991852544.0, + "grad_norm": 0.03657763323068719, + "language_loss": 0.83056581, + "learning_rate": 0.0006060473968273688, + "loss": 0.84104872, + "num_input_tokens_seen": 194679856, + "router_z_loss_mlp": 0.42260742, + "step": 2334, + "time_per_iteration": 2.6368448734283447 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104747, + "balance_loss_mlp": 1.0070343, + "epoch": 0.4492112350904194, + "flos": 1558693526016.0, + "grad_norm": 0.008278759352477436, + "language_loss": 0.77879542, + "learning_rate": 0.000605742921820216, + "loss": 0.7892701, + "num_input_tokens_seen": 194906320, + "router_z_loss_mlp": 0.40429688, + "step": 2335, + "time_per_iteration": 4.866518497467041 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050045, + "balance_loss_mlp": 1.00951385, + "epoch": 0.44940361677568297, + "flos": 1526703660288.0, + "grad_norm": 0.009772749846677187, + "language_loss": 0.81005216, + "learning_rate": 0.0006054384057600202, + "loss": 0.82055259, + "num_input_tokens_seen": 195129152, + "router_z_loss_mlp": 0.40527344, + "step": 2336, + "time_per_iteration": 4.832434892654419 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049508, + "balance_loss_mlp": 1.00759399, + "epoch": 0.4495959984609465, + "flos": 383321387520.0, + "grad_norm": 0.039418428301582195, + "language_loss": 0.88819385, + "learning_rate": 0.0006051338487650047, + "loss": 0.89868897, + "num_input_tokens_seen": 195189792, + "router_z_loss_mlp": 0.41943359, + "step": 2337, + "time_per_iteration": 2.4261343479156494 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104738, + "balance_loss_mlp": 1.00537109, + "epoch": 0.4497883801462101, + "flos": 498883064064.0, + "grad_norm": 0.03829280299631375, + "language_loss": 0.83062887, + "learning_rate": 0.0006048292509534095, + "loss": 0.84110272, + "num_input_tokens_seen": 195258640, + "router_z_loss_mlp": 0.42041016, + "step": 2338, + "time_per_iteration": 2.5792438983917236 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046244, + "balance_loss_mlp": 1.00425851, + "epoch": 0.4499807618314736, + "flos": 615590812416.0, + "grad_norm": 0.03236488600067343, + "language_loss": 0.78186011, + "learning_rate": 0.0006045246124434895, + "loss": 0.79232258, + "num_input_tokens_seen": 195327984, + "router_z_loss_mlp": 0.42016602, + "step": 2339, + "time_per_iteration": 2.736332654953003 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049178, + "balance_loss_mlp": 1.00704992, + "epoch": 0.4501731435167372, + "flos": 1007068850688.0, + "grad_norm": 0.0336222564343559, + "language_loss": 0.8735106, + "learning_rate": 0.0006042199333535162, + "loss": 0.88400233, + "num_input_tokens_seen": 195409504, + "router_z_loss_mlp": 0.42163086, + "step": 2340, + "time_per_iteration": 3.3217411041259766 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048828, + "balance_loss_mlp": 1.0066278, + "epoch": 0.4503655252020008, + "flos": 822328806912.0, + "grad_norm": 0.031746848330129245, + "language_loss": 0.8445214, + "learning_rate": 0.0006039152138017763, + "loss": 0.85500968, + "num_input_tokens_seen": 195489424, + "router_z_loss_mlp": 0.42236328, + "step": 2341, + "time_per_iteration": 3.027831792831421 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046938, + "balance_loss_mlp": 1.00464213, + "epoch": 0.4505579068872643, + "flos": 487414576128.0, + "grad_norm": 0.03971234339866032, + "language_loss": 0.84330553, + "learning_rate": 0.0006036104539065726, + "loss": 0.85377491, + "num_input_tokens_seen": 195562128, + "router_z_loss_mlp": 0.42333984, + "step": 2342, + "time_per_iteration": 2.6650640964508057 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042622, + "balance_loss_mlp": 1.00030267, + "epoch": 0.4507502885725279, + "flos": 886336728576.0, + "grad_norm": 0.030953760348096254, + "language_loss": 0.8473978, + "learning_rate": 0.000603305653786223, + "loss": 0.85782403, + "num_input_tokens_seen": 195646800, + "router_z_loss_mlp": 0.42358398, + "step": 2343, + "time_per_iteration": 3.146728277206421 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045238, + "balance_loss_mlp": 1.00284708, + "epoch": 0.45094267025779144, + "flos": 579422016000.0, + "grad_norm": 0.032254310776320565, + "language_loss": 0.84862161, + "learning_rate": 0.0006030008135590622, + "loss": 0.859074, + "num_input_tokens_seen": 195719648, + "router_z_loss_mlp": 0.42431641, + "step": 2344, + "time_per_iteration": 2.716326951980591 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046244, + "balance_loss_mlp": 1.00387657, + "epoch": 0.45113505194305503, + "flos": 526442232576.0, + "grad_norm": 0.029625683171065443, + "language_loss": 0.81110835, + "learning_rate": 0.0006026959333434387, + "loss": 0.82157081, + "num_input_tokens_seen": 195794800, + "router_z_loss_mlp": 0.42407227, + "step": 2345, + "time_per_iteration": 2.757293939590454 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046277, + "balance_loss_mlp": 1.00379133, + "epoch": 0.45132743362831856, + "flos": 503116429824.0, + "grad_norm": 0.029442245536271623, + "language_loss": 0.77997512, + "learning_rate": 0.0006023910132577181, + "loss": 0.79043788, + "num_input_tokens_seen": 195866848, + "router_z_loss_mlp": 0.42529297, + "step": 2346, + "time_per_iteration": 2.6643226146698 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044307, + "balance_loss_mlp": 1.00201178, + "epoch": 0.45151981531358215, + "flos": 432836710656.0, + "grad_norm": 0.03508285710405181, + "language_loss": 0.85304409, + "learning_rate": 0.0006020860534202806, + "loss": 0.86348718, + "num_input_tokens_seen": 195930640, + "router_z_loss_mlp": 0.42333984, + "step": 2347, + "time_per_iteration": 2.508922815322876 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046743, + "balance_loss_mlp": 1.00444722, + "epoch": 0.4517121969988457, + "flos": 713494828800.0, + "grad_norm": 0.031320840574665956, + "language_loss": 0.81720173, + "learning_rate": 0.0006017810539495224, + "loss": 0.8276692, + "num_input_tokens_seen": 196014240, + "router_z_loss_mlp": 0.42333984, + "step": 2348, + "time_per_iteration": 2.916851282119751 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046985, + "balance_loss_mlp": 1.00459409, + "epoch": 0.45190457868410927, + "flos": 580557394176.0, + "grad_norm": 0.03199810496833265, + "language_loss": 0.82887936, + "learning_rate": 0.0006014760149638547, + "loss": 0.83934915, + "num_input_tokens_seen": 196083296, + "router_z_loss_mlp": 0.42431641, + "step": 2349, + "time_per_iteration": 2.6583147048950195 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044305, + "balance_loss_mlp": 1.00189018, + "epoch": 0.45209696036937286, + "flos": 483628363008.0, + "grad_norm": 0.034942038630734404, + "language_loss": 0.89322019, + "learning_rate": 0.000601170936581704, + "loss": 0.90366322, + "num_input_tokens_seen": 196147840, + "router_z_loss_mlp": 0.42456055, + "step": 2350, + "time_per_iteration": 2.5171234607696533 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051553, + "balance_loss_mlp": 1.00906706, + "epoch": 0.4522893420546364, + "flos": 541260474624.0, + "grad_norm": 0.03828852417675836, + "language_loss": 0.85383743, + "learning_rate": 0.0006008658189215121, + "loss": 0.86435294, + "num_input_tokens_seen": 196219008, + "router_z_loss_mlp": 0.42529297, + "step": 2351, + "time_per_iteration": 2.6463332176208496 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049763, + "balance_loss_mlp": 1.00725281, + "epoch": 0.4524817237399, + "flos": 497691305472.0, + "grad_norm": 0.039190213199739796, + "language_loss": 0.80507791, + "learning_rate": 0.0006005606621017366, + "loss": 0.81557548, + "num_input_tokens_seen": 196287792, + "router_z_loss_mlp": 0.42553711, + "step": 2352, + "time_per_iteration": 2.5637879371643066 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048153, + "balance_loss_mlp": 1.00597668, + "epoch": 0.4526741054251635, + "flos": 653841782016.0, + "grad_norm": 0.04275245206988235, + "language_loss": 0.80476063, + "learning_rate": 0.0006002554662408496, + "loss": 0.81524217, + "num_input_tokens_seen": 196371776, + "router_z_loss_mlp": 0.42211914, + "step": 2353, + "time_per_iteration": 2.8951141834259033 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047375, + "balance_loss_mlp": 1.00500786, + "epoch": 0.4528664871104271, + "flos": 572004146688.0, + "grad_norm": 0.03654890079235127, + "language_loss": 0.91683698, + "learning_rate": 0.0005999502314573388, + "loss": 0.92731076, + "num_input_tokens_seen": 196441840, + "router_z_loss_mlp": 0.42407227, + "step": 2354, + "time_per_iteration": 2.64512300491333 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051522, + "balance_loss_mlp": 1.00927448, + "epoch": 0.45305886879569063, + "flos": 459679463424.0, + "grad_norm": 0.03675635166201985, + "language_loss": 0.86984789, + "learning_rate": 0.0005996449578697066, + "loss": 0.88036311, + "num_input_tokens_seen": 196510464, + "router_z_loss_mlp": 0.42285156, + "step": 2355, + "time_per_iteration": 2.6577048301696777 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048974, + "balance_loss_mlp": 1.0069412, + "epoch": 0.4532512504809542, + "flos": 506207614464.0, + "grad_norm": 0.033984488129296754, + "language_loss": 0.81732345, + "learning_rate": 0.0005993396455964709, + "loss": 0.82781321, + "num_input_tokens_seen": 196583888, + "router_z_loss_mlp": 0.4206543, + "step": 2356, + "time_per_iteration": 2.7086563110351562 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048885, + "balance_loss_mlp": 1.0067569, + "epoch": 0.4534436321662178, + "flos": 583312241664.0, + "grad_norm": 0.03467705138292274, + "language_loss": 0.82385033, + "learning_rate": 0.0005990342947561647, + "loss": 0.8343392, + "num_input_tokens_seen": 196652816, + "router_z_loss_mlp": 0.42163086, + "step": 2357, + "time_per_iteration": 2.6705219745635986 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047018, + "balance_loss_mlp": 1.00484145, + "epoch": 0.45363601385148133, + "flos": 550773156096.0, + "grad_norm": 0.03186226313127573, + "language_loss": 0.78742826, + "learning_rate": 0.0005987289054673351, + "loss": 0.79789847, + "num_input_tokens_seen": 196720208, + "router_z_loss_mlp": 0.42211914, + "step": 2358, + "time_per_iteration": 2.6073710918426514 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105212, + "balance_loss_mlp": 1.01063538, + "epoch": 0.4538283955367449, + "flos": 1477793937408.0, + "grad_norm": 0.008894510659601113, + "language_loss": 0.76575738, + "learning_rate": 0.0005984234778485451, + "loss": 0.77627861, + "num_input_tokens_seen": 196947696, + "router_z_loss_mlp": 0.41503906, + "step": 2359, + "time_per_iteration": 4.796559810638428 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044324, + "balance_loss_mlp": 1.00245762, + "epoch": 0.45402077722200845, + "flos": 585797826048.0, + "grad_norm": 0.043889208643714143, + "language_loss": 0.91937214, + "learning_rate": 0.0005981180120183722, + "loss": 0.92981529, + "num_input_tokens_seen": 197015712, + "router_z_loss_mlp": 0.41894531, + "step": 2360, + "time_per_iteration": 2.6962461471557617 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104759, + "balance_loss_mlp": 1.00584316, + "epoch": 0.45421315890727204, + "flos": 532889974272.0, + "grad_norm": 0.05191452902852925, + "language_loss": 0.85740328, + "learning_rate": 0.0005978125080954089, + "loss": 0.86787915, + "num_input_tokens_seen": 197094880, + "router_z_loss_mlp": 0.41772461, + "step": 2361, + "time_per_iteration": 2.777160882949829 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049759, + "balance_loss_mlp": 1.00794065, + "epoch": 0.4544055405925356, + "flos": 786552728064.0, + "grad_norm": 0.0404371323010207, + "language_loss": 0.77941048, + "learning_rate": 0.000597506966198262, + "loss": 0.78990805, + "num_input_tokens_seen": 197176448, + "router_z_loss_mlp": 0.41845703, + "step": 2362, + "time_per_iteration": 2.9561667442321777 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048479, + "balance_loss_mlp": 1.00663614, + "epoch": 0.45459792227779916, + "flos": 519202252800.0, + "grad_norm": 0.0386377549927772, + "language_loss": 0.84570003, + "learning_rate": 0.0005972013864455536, + "loss": 0.85618478, + "num_input_tokens_seen": 197243520, + "router_z_loss_mlp": 0.41870117, + "step": 2363, + "time_per_iteration": 2.577075958251953 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049274, + "balance_loss_mlp": 1.00757432, + "epoch": 0.4547903039630627, + "flos": 538598946048.0, + "grad_norm": 0.03734609962487706, + "language_loss": 0.86156821, + "learning_rate": 0.0005968957689559203, + "loss": 0.87206089, + "num_input_tokens_seen": 197311536, + "router_z_loss_mlp": 0.41723633, + "step": 2364, + "time_per_iteration": 2.663912773132324 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047351, + "balance_loss_mlp": 1.00543737, + "epoch": 0.4549826856483263, + "flos": 529691864832.0, + "grad_norm": 0.03600076061776594, + "language_loss": 0.89443278, + "learning_rate": 0.0005965901138480131, + "loss": 0.90490627, + "num_input_tokens_seen": 197382752, + "router_z_loss_mlp": 0.41943359, + "step": 2365, + "time_per_iteration": 2.635735034942627 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048292, + "balance_loss_mlp": 1.00633037, + "epoch": 0.45517506733358987, + "flos": 521983345152.0, + "grad_norm": 0.04096543812015268, + "language_loss": 0.87860775, + "learning_rate": 0.0005962844212404982, + "loss": 0.88909072, + "num_input_tokens_seen": 197456592, + "router_z_loss_mlp": 0.41992188, + "step": 2366, + "time_per_iteration": 2.675039291381836 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049904, + "balance_loss_mlp": 1.00799048, + "epoch": 0.4553674490188534, + "flos": 452009827584.0, + "grad_norm": 0.02917585056549172, + "language_loss": 0.88090932, + "learning_rate": 0.0005959786912520558, + "loss": 0.89140838, + "num_input_tokens_seen": 197525408, + "router_z_loss_mlp": 0.41943359, + "step": 2367, + "time_per_iteration": 2.605693817138672 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046028, + "balance_loss_mlp": 1.00399494, + "epoch": 0.455559830704117, + "flos": 547745154816.0, + "grad_norm": 0.029185999772899627, + "language_loss": 0.84459692, + "learning_rate": 0.0005956729240013806, + "loss": 0.85505724, + "num_input_tokens_seen": 197608480, + "router_z_loss_mlp": 0.4206543, + "step": 2368, + "time_per_iteration": 2.792929172515869 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104665, + "balance_loss_mlp": 1.00447345, + "epoch": 0.4557522123893805, + "flos": 584866582272.0, + "grad_norm": 0.02991931447914949, + "language_loss": 0.92050606, + "learning_rate": 0.0005953671196071824, + "loss": 0.93097258, + "num_input_tokens_seen": 197678416, + "router_z_loss_mlp": 0.42211914, + "step": 2369, + "time_per_iteration": 2.7024593353271484 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052055, + "balance_loss_mlp": 1.00992644, + "epoch": 0.4559445940746441, + "flos": 527484291840.0, + "grad_norm": 0.03299201390628513, + "language_loss": 0.80723774, + "learning_rate": 0.0005950612781881846, + "loss": 0.81775832, + "num_input_tokens_seen": 197753424, + "router_z_loss_mlp": 0.42163086, + "step": 2370, + "time_per_iteration": 2.7288575172424316 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048274, + "balance_loss_mlp": 1.0061928, + "epoch": 0.45613697575990764, + "flos": 653368384512.0, + "grad_norm": 0.034012751150725565, + "language_loss": 0.76432264, + "learning_rate": 0.0005947553998631259, + "loss": 0.77480543, + "num_input_tokens_seen": 197832080, + "router_z_loss_mlp": 0.42114258, + "step": 2371, + "time_per_iteration": 2.865060567855835 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051777, + "balance_loss_mlp": 1.00976777, + "epoch": 0.4563293574451712, + "flos": 868624633344.0, + "grad_norm": 0.02789239974176414, + "language_loss": 0.79458821, + "learning_rate": 0.000594449484750758, + "loss": 0.80510592, + "num_input_tokens_seen": 197919536, + "router_z_loss_mlp": 0.42041016, + "step": 2372, + "time_per_iteration": 3.147550344467163 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044503, + "balance_loss_mlp": 1.00242209, + "epoch": 0.45652173913043476, + "flos": 499132885248.0, + "grad_norm": 0.03342359133343608, + "language_loss": 0.83513892, + "learning_rate": 0.0005941435329698484, + "loss": 0.84558398, + "num_input_tokens_seen": 197991872, + "router_z_loss_mlp": 0.42114258, + "step": 2373, + "time_per_iteration": 2.6924219131469727 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046399, + "balance_loss_mlp": 1.00441325, + "epoch": 0.45671412081569834, + "flos": 561959741952.0, + "grad_norm": 0.03267163379038315, + "language_loss": 0.83796972, + "learning_rate": 0.0005938375446391778, + "loss": 0.84843373, + "num_input_tokens_seen": 198063392, + "router_z_loss_mlp": 0.42016602, + "step": 2374, + "time_per_iteration": 2.731687307357788 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044896, + "balance_loss_mlp": 1.00281477, + "epoch": 0.45690650250096193, + "flos": 504123495936.0, + "grad_norm": 0.03711297965033783, + "language_loss": 0.89367199, + "learning_rate": 0.0005935315198775415, + "loss": 0.90412098, + "num_input_tokens_seen": 198131232, + "router_z_loss_mlp": 0.42114258, + "step": 2375, + "time_per_iteration": 2.679049015045166 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046184, + "balance_loss_mlp": 1.0040555, + "epoch": 0.45709888418622546, + "flos": 431599265280.0, + "grad_norm": 0.033405413713201326, + "language_loss": 0.87559128, + "learning_rate": 0.0005932254588037486, + "loss": 0.88605309, + "num_input_tokens_seen": 198194944, + "router_z_loss_mlp": 0.42163086, + "step": 2376, + "time_per_iteration": 2.5139987468719482 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045834, + "balance_loss_mlp": 1.00384891, + "epoch": 0.45729126587148905, + "flos": 526693999104.0, + "grad_norm": 0.034118342932564036, + "language_loss": 0.86638731, + "learning_rate": 0.000592919361536623, + "loss": 0.87684566, + "num_input_tokens_seen": 198265728, + "router_z_loss_mlp": 0.42016602, + "step": 2377, + "time_per_iteration": 2.652921438217163 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047367, + "balance_loss_mlp": 1.00545263, + "epoch": 0.4574836475567526, + "flos": 639148939776.0, + "grad_norm": 0.03214355149845838, + "language_loss": 0.89487022, + "learning_rate": 0.0005926132281950017, + "loss": 0.90534389, + "num_input_tokens_seen": 198336640, + "router_z_loss_mlp": 0.41943359, + "step": 2378, + "time_per_iteration": 2.7740533351898193 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050302, + "balance_loss_mlp": 1.00819683, + "epoch": 0.45767602924201617, + "flos": 650791426560.0, + "grad_norm": 0.03291422707035226, + "language_loss": 0.85368007, + "learning_rate": 0.0005923070588977367, + "loss": 0.86418307, + "num_input_tokens_seen": 198413552, + "router_z_loss_mlp": 0.42138672, + "step": 2379, + "time_per_iteration": 2.8456881046295166 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050793, + "balance_loss_mlp": 1.00873554, + "epoch": 0.4578684109272797, + "flos": 747963475968.0, + "grad_norm": 0.03509802642472786, + "language_loss": 0.86739749, + "learning_rate": 0.0005920008537636931, + "loss": 0.87790543, + "num_input_tokens_seen": 198490864, + "router_z_loss_mlp": 0.42089844, + "step": 2380, + "time_per_iteration": 2.910720109939575 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048858, + "balance_loss_mlp": 1.00692058, + "epoch": 0.4580607926125433, + "flos": 642729073152.0, + "grad_norm": 0.029242782263759974, + "language_loss": 0.87235177, + "learning_rate": 0.0005916946129117504, + "loss": 0.88284034, + "num_input_tokens_seen": 198571200, + "router_z_loss_mlp": 0.41967773, + "step": 2381, + "time_per_iteration": 2.8813161849975586 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051516, + "balance_loss_mlp": 1.00948262, + "epoch": 0.4582531742978069, + "flos": 803240260608.0, + "grad_norm": 0.03239264438363608, + "language_loss": 0.81130052, + "learning_rate": 0.0005913883364608017, + "loss": 0.82181567, + "num_input_tokens_seen": 198658624, + "router_z_loss_mlp": 0.4206543, + "step": 2382, + "time_per_iteration": 3.062751531600952 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105103, + "balance_loss_mlp": 1.00914025, + "epoch": 0.4584455559830704, + "flos": 685518643200.0, + "grad_norm": 0.031797549541833704, + "language_loss": 0.88895178, + "learning_rate": 0.0005910820245297542, + "loss": 0.8994621, + "num_input_tokens_seen": 198731312, + "router_z_loss_mlp": 0.41918945, + "step": 2383, + "time_per_iteration": 2.8653757572174072 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045812, + "balance_loss_mlp": 1.00387442, + "epoch": 0.458637937668334, + "flos": 519282932736.0, + "grad_norm": 0.03550111139800055, + "language_loss": 0.80986464, + "learning_rate": 0.000590775677237529, + "loss": 0.82032269, + "num_input_tokens_seen": 198805296, + "router_z_loss_mlp": 0.41967773, + "step": 2384, + "time_per_iteration": 2.7324440479278564 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046116, + "balance_loss_mlp": 1.0042969, + "epoch": 0.4588303193535975, + "flos": 506533257984.0, + "grad_norm": 0.03366806840699952, + "language_loss": 0.80683196, + "learning_rate": 0.0005904692947030601, + "loss": 0.81729311, + "num_input_tokens_seen": 198872112, + "router_z_loss_mlp": 0.41845703, + "step": 2385, + "time_per_iteration": 2.5837819576263428 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043605, + "balance_loss_mlp": 1.00176287, + "epoch": 0.4590227010388611, + "flos": 496909761024.0, + "grad_norm": 0.03855013464211847, + "language_loss": 0.89966094, + "learning_rate": 0.0005901628770452963, + "loss": 0.91009706, + "num_input_tokens_seen": 198938480, + "router_z_loss_mlp": 0.41870117, + "step": 2386, + "time_per_iteration": 2.60300350189209 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043124, + "balance_loss_mlp": 1.00132906, + "epoch": 0.45921508272412465, + "flos": 494602066176.0, + "grad_norm": 0.034718704885035666, + "language_loss": 0.87768519, + "learning_rate": 0.000589856424383199, + "loss": 0.88811642, + "num_input_tokens_seen": 199008608, + "router_z_loss_mlp": 0.41821289, + "step": 2387, + "time_per_iteration": 2.6108267307281494 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044169, + "balance_loss_mlp": 1.00232685, + "epoch": 0.45940746440938823, + "flos": 692593372416.0, + "grad_norm": 0.03330437261727838, + "language_loss": 0.83652228, + "learning_rate": 0.000589549936835744, + "loss": 0.846964, + "num_input_tokens_seen": 199084592, + "router_z_loss_mlp": 0.41870117, + "step": 2388, + "time_per_iteration": 2.8968546390533447 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104723, + "balance_loss_mlp": 1.00545883, + "epoch": 0.45959984609465176, + "flos": 504737844480.0, + "grad_norm": 0.03238722342606361, + "language_loss": 0.79404306, + "learning_rate": 0.0005892434145219202, + "loss": 0.80451536, + "num_input_tokens_seen": 199151504, + "router_z_loss_mlp": 0.41796875, + "step": 2389, + "time_per_iteration": 2.6019601821899414 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045231, + "balance_loss_mlp": 1.00350749, + "epoch": 0.45979222777991535, + "flos": 677840259072.0, + "grad_norm": 0.03571192687498619, + "language_loss": 0.83136904, + "learning_rate": 0.0005889368575607303, + "loss": 0.84182131, + "num_input_tokens_seen": 199224528, + "router_z_loss_mlp": 0.41748047, + "step": 2390, + "time_per_iteration": 2.8418307304382324 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042843, + "balance_loss_mlp": 1.00107241, + "epoch": 0.45998460946517894, + "flos": 779039594496.0, + "grad_norm": 0.031212653964934608, + "language_loss": 0.79287618, + "learning_rate": 0.00058863026607119, + "loss": 0.80330467, + "num_input_tokens_seen": 199312512, + "router_z_loss_mlp": 0.41796875, + "step": 2391, + "time_per_iteration": 3.0931389331817627 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045182, + "balance_loss_mlp": 1.00333977, + "epoch": 0.46017699115044247, + "flos": 853022901504.0, + "grad_norm": 0.035796836390277, + "language_loss": 0.80142331, + "learning_rate": 0.0005883236401723287, + "loss": 0.8118751, + "num_input_tokens_seen": 199397216, + "router_z_loss_mlp": 0.41870117, + "step": 2392, + "time_per_iteration": 3.170374631881714 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044116, + "balance_loss_mlp": 1.00222623, + "epoch": 0.46036937283570606, + "flos": 576964621824.0, + "grad_norm": 0.03330985308732758, + "language_loss": 0.84980971, + "learning_rate": 0.0005880169799831893, + "loss": 0.86025083, + "num_input_tokens_seen": 199464288, + "router_z_loss_mlp": 0.41918945, + "step": 2393, + "time_per_iteration": 2.693976879119873 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048284, + "balance_loss_mlp": 1.00641727, + "epoch": 0.4605617545209696, + "flos": 613120779264.0, + "grad_norm": 0.03386951364717573, + "language_loss": 0.82288468, + "learning_rate": 0.0005877102856228278, + "loss": 0.83336759, + "num_input_tokens_seen": 199538096, + "router_z_loss_mlp": 0.41894531, + "step": 2394, + "time_per_iteration": 2.8137876987457275 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104836, + "balance_loss_mlp": 1.0063504, + "epoch": 0.4607541362062332, + "flos": 534159500544.0, + "grad_norm": 0.06543347642857557, + "language_loss": 0.85095239, + "learning_rate": 0.0005874035572103133, + "loss": 0.86143595, + "num_input_tokens_seen": 199609504, + "router_z_loss_mlp": 0.42041016, + "step": 2395, + "time_per_iteration": 2.6604816913604736 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046268, + "balance_loss_mlp": 1.0043298, + "epoch": 0.4609465178914967, + "flos": 648474983424.0, + "grad_norm": 0.04503809754512356, + "language_loss": 0.83026469, + "learning_rate": 0.0005870967948647288, + "loss": 0.84072733, + "num_input_tokens_seen": 199678960, + "router_z_loss_mlp": 0.41967773, + "step": 2396, + "time_per_iteration": 2.8022336959838867 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047306, + "balance_loss_mlp": 1.00658417, + "epoch": 0.4611388995767603, + "flos": 1469501204736.0, + "grad_norm": 0.004136605290049959, + "language_loss": 0.743083, + "learning_rate": 0.0005867899987051693, + "loss": 0.75355613, + "num_input_tokens_seen": 199903568, + "router_z_loss_mlp": 0.40722656, + "step": 2397, + "time_per_iteration": 5.5826334953308105 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045565, + "balance_loss_mlp": 1.00350773, + "epoch": 0.46133128126202383, + "flos": 724477280256.0, + "grad_norm": 0.03194619056097999, + "language_loss": 0.86316049, + "learning_rate": 0.0005864831688507443, + "loss": 0.8736161, + "num_input_tokens_seen": 199988672, + "router_z_loss_mlp": 0.42089844, + "step": 2398, + "time_per_iteration": 3.0160725116729736 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051143, + "balance_loss_mlp": 1.00903809, + "epoch": 0.4615236629472874, + "flos": 549114802944.0, + "grad_norm": 0.0336665595141197, + "language_loss": 0.75746781, + "learning_rate": 0.0005861763054205754, + "loss": 0.76797926, + "num_input_tokens_seen": 200062304, + "router_z_loss_mlp": 0.42138672, + "step": 2399, + "time_per_iteration": 2.7720346450805664 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052341, + "balance_loss_mlp": 1.01011705, + "epoch": 0.461716044632551, + "flos": 603460343808.0, + "grad_norm": 0.030278987672658065, + "language_loss": 0.80694187, + "learning_rate": 0.0005858694085337976, + "loss": 0.81746531, + "num_input_tokens_seen": 200138464, + "router_z_loss_mlp": 0.42260742, + "step": 2400, + "time_per_iteration": 2.790825366973877 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049426, + "balance_loss_mlp": 1.00722611, + "epoch": 0.46190842631781454, + "flos": 475437697536.0, + "grad_norm": 0.03561782978750914, + "language_loss": 0.83960855, + "learning_rate": 0.0005855624783095589, + "loss": 0.85010278, + "num_input_tokens_seen": 200205728, + "router_z_loss_mlp": 0.42236328, + "step": 2401, + "time_per_iteration": 2.5512595176696777 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051216, + "balance_loss_mlp": 1.00930238, + "epoch": 0.4621008080030781, + "flos": 438402786048.0, + "grad_norm": 0.034731386600305836, + "language_loss": 0.85895813, + "learning_rate": 0.00058525551486702, + "loss": 0.86947024, + "num_input_tokens_seen": 200269824, + "router_z_loss_mlp": 0.41943359, + "step": 2402, + "time_per_iteration": 2.5168349742889404 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049789, + "balance_loss_mlp": 1.0077796, + "epoch": 0.46229318968834165, + "flos": 526498612992.0, + "grad_norm": 0.03903258697063272, + "language_loss": 0.81848848, + "learning_rate": 0.0005849485183253548, + "loss": 0.82898641, + "num_input_tokens_seen": 200341264, + "router_z_loss_mlp": 0.42041016, + "step": 2403, + "time_per_iteration": 2.640596389770508 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043904, + "balance_loss_mlp": 1.00213277, + "epoch": 0.46248557137360524, + "flos": 440534536704.0, + "grad_norm": 0.0318215105397156, + "language_loss": 0.87703103, + "learning_rate": 0.0005846414888037501, + "loss": 0.88747007, + "num_input_tokens_seen": 200405632, + "router_z_loss_mlp": 0.41796875, + "step": 2404, + "time_per_iteration": 2.4814634323120117 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046869, + "balance_loss_mlp": 1.00516927, + "epoch": 0.4626779530588688, + "flos": 618773370624.0, + "grad_norm": 0.036713203920182555, + "language_loss": 0.8266353, + "learning_rate": 0.0005843344264214049, + "loss": 0.83710396, + "num_input_tokens_seen": 200479312, + "router_z_loss_mlp": 0.41723633, + "step": 2405, + "time_per_iteration": 2.7493507862091064 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046289, + "balance_loss_mlp": 1.00461316, + "epoch": 0.46287033474413236, + "flos": 671360436480.0, + "grad_norm": 0.031131832431387497, + "language_loss": 0.85281026, + "learning_rate": 0.0005840273312975317, + "loss": 0.86327314, + "num_input_tokens_seen": 200552976, + "router_z_loss_mlp": 0.41699219, + "step": 2406, + "time_per_iteration": 2.8235156536102295 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045049, + "balance_loss_mlp": 1.00332618, + "epoch": 0.46306271642939595, + "flos": 481199159040.0, + "grad_norm": 0.037353418102982906, + "language_loss": 0.90573472, + "learning_rate": 0.0005837202035513555, + "loss": 0.91618526, + "num_input_tokens_seen": 200621088, + "router_z_loss_mlp": 0.41748047, + "step": 2407, + "time_per_iteration": 2.5672457218170166 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043171, + "balance_loss_mlp": 1.001472, + "epoch": 0.4632550981146595, + "flos": 581858022912.0, + "grad_norm": 0.03272683029516706, + "language_loss": 0.81903768, + "learning_rate": 0.0005834130433021136, + "loss": 0.82946944, + "num_input_tokens_seen": 200698400, + "router_z_loss_mlp": 0.41723633, + "step": 2408, + "time_per_iteration": 4.229294538497925 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042173, + "balance_loss_mlp": 1.00044954, + "epoch": 0.46344747979992307, + "flos": 525018149376.0, + "grad_norm": 0.030754893265702864, + "language_loss": 0.73835284, + "learning_rate": 0.0005831058506690563, + "loss": 0.74877453, + "num_input_tokens_seen": 200767264, + "router_z_loss_mlp": 0.41748047, + "step": 2409, + "time_per_iteration": 2.614616632461548 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043301, + "balance_loss_mlp": 1.00183976, + "epoch": 0.4636398614851866, + "flos": 747813776640.0, + "grad_norm": 0.03608107183813509, + "language_loss": 0.86105043, + "learning_rate": 0.0005827986257714464, + "loss": 0.87148345, + "num_input_tokens_seen": 200841440, + "router_z_loss_mlp": 0.41479492, + "step": 2410, + "time_per_iteration": 2.953162670135498 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051055, + "balance_loss_mlp": 1.00935507, + "epoch": 0.4638322431704502, + "flos": 597646392576.0, + "grad_norm": 0.032192415237476964, + "language_loss": 0.89042687, + "learning_rate": 0.0005824913687285591, + "loss": 0.90093744, + "num_input_tokens_seen": 200911296, + "router_z_loss_mlp": 0.41723633, + "step": 2411, + "time_per_iteration": 2.685081958770752 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045225, + "balance_loss_mlp": 1.00357294, + "epoch": 0.4640246248557137, + "flos": 540533365248.0, + "grad_norm": 0.03324810257023632, + "language_loss": 0.82180583, + "learning_rate": 0.0005821840796596821, + "loss": 0.83225811, + "num_input_tokens_seen": 200981920, + "router_z_loss_mlp": 0.41674805, + "step": 2412, + "time_per_iteration": 2.7183375358581543 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045715, + "balance_loss_mlp": 1.00403953, + "epoch": 0.4642170065409773, + "flos": 563809590528.0, + "grad_norm": 0.030050486484180242, + "language_loss": 0.80926406, + "learning_rate": 0.0005818767586841158, + "loss": 0.81972128, + "num_input_tokens_seen": 201059392, + "router_z_loss_mlp": 0.41699219, + "step": 2413, + "time_per_iteration": 2.7701165676116943 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050594, + "balance_loss_mlp": 1.00884688, + "epoch": 0.46440938822624084, + "flos": 532062743040.0, + "grad_norm": 0.027541485530404662, + "language_loss": 0.86138541, + "learning_rate": 0.0005815694059211726, + "loss": 0.87189138, + "num_input_tokens_seen": 201130192, + "router_z_loss_mlp": 0.41772461, + "step": 2414, + "time_per_iteration": 2.668760061264038 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104752, + "balance_loss_mlp": 1.00717926, + "epoch": 0.4646017699115044, + "flos": 1529627649024.0, + "grad_norm": 0.008676045744997887, + "language_loss": 0.80873632, + "learning_rate": 0.0005812620214901778, + "loss": 0.81921148, + "num_input_tokens_seen": 201354720, + "router_z_loss_mlp": 0.40332031, + "step": 2415, + "time_per_iteration": 4.801916599273682 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054371, + "balance_loss_mlp": 1.01403046, + "epoch": 0.464794151596768, + "flos": 1544174682624.0, + "grad_norm": 0.009441918844152984, + "language_loss": 0.7694506, + "learning_rate": 0.000580954605510468, + "loss": 0.77999437, + "num_input_tokens_seen": 201592096, + "router_z_loss_mlp": 0.40332031, + "step": 2416, + "time_per_iteration": 4.990759372711182 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045947, + "balance_loss_mlp": 1.0040803, + "epoch": 0.46498653328203154, + "flos": 502539019776.0, + "grad_norm": 0.03083676606802021, + "language_loss": 0.86654723, + "learning_rate": 0.0005806471581013931, + "loss": 0.87700671, + "num_input_tokens_seen": 201666160, + "router_z_loss_mlp": 0.41894531, + "step": 2417, + "time_per_iteration": 2.6697516441345215 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046587, + "balance_loss_mlp": 1.00452995, + "epoch": 0.46517891496729513, + "flos": 677301732864.0, + "grad_norm": 0.03671323650301262, + "language_loss": 0.79226685, + "learning_rate": 0.0005803396793823146, + "loss": 0.80273271, + "num_input_tokens_seen": 201733552, + "router_z_loss_mlp": 0.42089844, + "step": 2418, + "time_per_iteration": 2.8375697135925293 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054201, + "balance_loss_mlp": 1.01212037, + "epoch": 0.46537129665255866, + "flos": 586512296448.0, + "grad_norm": 0.037063881541601694, + "language_loss": 0.86435425, + "learning_rate": 0.0005800321694726065, + "loss": 0.87489623, + "num_input_tokens_seen": 201806128, + "router_z_loss_mlp": 0.42114258, + "step": 2419, + "time_per_iteration": 2.7743778228759766 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053515, + "balance_loss_mlp": 1.01136279, + "epoch": 0.46556367833782225, + "flos": 588821936640.0, + "grad_norm": 0.0340005426894483, + "language_loss": 0.87128568, + "learning_rate": 0.0005797246284916545, + "loss": 0.8818208, + "num_input_tokens_seen": 201874224, + "router_z_loss_mlp": 0.421875, + "step": 2420, + "time_per_iteration": 2.6835851669311523 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049313, + "balance_loss_mlp": 1.00878143, + "epoch": 0.4657560600230858, + "flos": 1488584893440.0, + "grad_norm": 0.006163961209168608, + "language_loss": 0.77505189, + "learning_rate": 0.0005794170565588569, + "loss": 0.78554499, + "num_input_tokens_seen": 202111648, + "router_z_loss_mlp": 0.40527344, + "step": 2421, + "time_per_iteration": 4.943193197250366 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047739, + "balance_loss_mlp": 1.00570607, + "epoch": 0.46594844170834937, + "flos": 581393373696.0, + "grad_norm": 0.03388172676180004, + "language_loss": 0.8850925, + "learning_rate": 0.0005791094537936233, + "loss": 0.89556992, + "num_input_tokens_seen": 202183344, + "router_z_loss_mlp": 0.4206543, + "step": 2422, + "time_per_iteration": 2.694913148880005 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047655, + "balance_loss_mlp": 1.00559843, + "epoch": 0.4661408233936129, + "flos": 513571048704.0, + "grad_norm": 0.036220885297141736, + "language_loss": 0.82194817, + "learning_rate": 0.0005788018203153762, + "loss": 0.83242476, + "num_input_tokens_seen": 202252512, + "router_z_loss_mlp": 0.42089844, + "step": 2423, + "time_per_iteration": 2.582130193710327 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104833, + "balance_loss_mlp": 1.006392, + "epoch": 0.4663332050788765, + "flos": 492033856512.0, + "grad_norm": 0.03516767090589214, + "language_loss": 0.86157548, + "learning_rate": 0.000578494156243549, + "loss": 0.87205875, + "num_input_tokens_seen": 202320096, + "router_z_loss_mlp": 0.41967773, + "step": 2424, + "time_per_iteration": 2.569465160369873 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047846, + "balance_loss_mlp": 1.0060271, + "epoch": 0.4665255867641401, + "flos": 513708109056.0, + "grad_norm": 0.03097112252036683, + "language_loss": 0.89247042, + "learning_rate": 0.0005781864616975878, + "loss": 0.90294886, + "num_input_tokens_seen": 202391552, + "router_z_loss_mlp": 0.41845703, + "step": 2425, + "time_per_iteration": 2.6580159664154053 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043502, + "balance_loss_mlp": 1.00175464, + "epoch": 0.4667179684494036, + "flos": 425707546368.0, + "grad_norm": 0.0331787429652153, + "language_loss": 0.84786129, + "learning_rate": 0.0005778787367969502, + "loss": 0.85829628, + "num_input_tokens_seen": 202457328, + "router_z_loss_mlp": 0.41772461, + "step": 2426, + "time_per_iteration": 2.577146291732788 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046987, + "balance_loss_mlp": 1.00526416, + "epoch": 0.4669103501346672, + "flos": 709224524544.0, + "grad_norm": 0.030186535385466236, + "language_loss": 0.81415391, + "learning_rate": 0.0005775709816611053, + "loss": 0.82462376, + "num_input_tokens_seen": 202535888, + "router_z_loss_mlp": 0.41748047, + "step": 2427, + "time_per_iteration": 2.946763515472412 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044811, + "balance_loss_mlp": 1.00294447, + "epoch": 0.4671027318199307, + "flos": 555946513920.0, + "grad_norm": 0.029160974795623382, + "language_loss": 0.83887118, + "learning_rate": 0.0005772631964095346, + "loss": 0.84931928, + "num_input_tokens_seen": 202608400, + "router_z_loss_mlp": 0.41894531, + "step": 2428, + "time_per_iteration": 2.7246575355529785 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047997, + "balance_loss_mlp": 1.0062499, + "epoch": 0.4672951135051943, + "flos": 568196546304.0, + "grad_norm": 0.03470882192857659, + "language_loss": 0.86100912, + "learning_rate": 0.000576955381161731, + "loss": 0.87148911, + "num_input_tokens_seen": 202677712, + "router_z_loss_mlp": 0.41772461, + "step": 2429, + "time_per_iteration": 2.6618916988372803 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051296, + "balance_loss_mlp": 1.00959647, + "epoch": 0.46748749519045785, + "flos": 425418841344.0, + "grad_norm": 0.034295751127670006, + "language_loss": 0.86858582, + "learning_rate": 0.0005766475360371985, + "loss": 0.87909877, + "num_input_tokens_seen": 202743824, + "router_z_loss_mlp": 0.41723633, + "step": 2430, + "time_per_iteration": 2.6010043621063232 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048678, + "balance_loss_mlp": 1.00697899, + "epoch": 0.46767987687572143, + "flos": 539371742208.0, + "grad_norm": 0.034969896754344705, + "language_loss": 0.85521102, + "learning_rate": 0.0005763396611554536, + "loss": 0.86569786, + "num_input_tokens_seen": 202813072, + "router_z_loss_mlp": 0.41723633, + "step": 2431, + "time_per_iteration": 2.6345412731170654 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045821, + "balance_loss_mlp": 1.00409806, + "epoch": 0.467872258560985, + "flos": 825076851456.0, + "grad_norm": 0.03589185796451142, + "language_loss": 0.80950278, + "learning_rate": 0.0005760317566360237, + "loss": 0.81996095, + "num_input_tokens_seen": 202886576, + "router_z_loss_mlp": 0.41748047, + "step": 2432, + "time_per_iteration": 3.0410006046295166 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050036, + "balance_loss_mlp": 1.0083127, + "epoch": 0.46806464024624855, + "flos": 662854821120.0, + "grad_norm": 0.03375923289076794, + "language_loss": 0.86271471, + "learning_rate": 0.000575723822598448, + "loss": 0.87321508, + "num_input_tokens_seen": 202956736, + "router_z_loss_mlp": 0.41748047, + "step": 2433, + "time_per_iteration": 2.7712388038635254 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044282, + "balance_loss_mlp": 1.00251079, + "epoch": 0.46825702193151214, + "flos": 757055249664.0, + "grad_norm": 0.029730946872360612, + "language_loss": 0.82302332, + "learning_rate": 0.0005754158591622773, + "loss": 0.83346617, + "num_input_tokens_seen": 203036432, + "router_z_loss_mlp": 0.41796875, + "step": 2434, + "time_per_iteration": 2.9708468914031982 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049984, + "balance_loss_mlp": 1.00818896, + "epoch": 0.4684494036167757, + "flos": 440310960384.0, + "grad_norm": 0.03563934149764459, + "language_loss": 0.83011699, + "learning_rate": 0.0005751078664470732, + "loss": 0.84061682, + "num_input_tokens_seen": 203101904, + "router_z_loss_mlp": 0.41821289, + "step": 2435, + "time_per_iteration": 2.5696167945861816 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046457, + "balance_loss_mlp": 1.00468564, + "epoch": 0.46864178530203926, + "flos": 533749286400.0, + "grad_norm": 0.031914354194682755, + "language_loss": 0.86557531, + "learning_rate": 0.0005747998445724094, + "loss": 0.87603986, + "num_input_tokens_seen": 203170272, + "router_z_loss_mlp": 0.41796875, + "step": 2436, + "time_per_iteration": 2.6336376667022705 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047101, + "balance_loss_mlp": 1.00535429, + "epoch": 0.4688341669873028, + "flos": 577826846208.0, + "grad_norm": 0.03221336233810001, + "language_loss": 0.89470494, + "learning_rate": 0.0005744917936578707, + "loss": 0.90517592, + "num_input_tokens_seen": 203243920, + "router_z_loss_mlp": 0.41772461, + "step": 2437, + "time_per_iteration": 2.7748000621795654 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054309, + "balance_loss_mlp": 1.0126332, + "epoch": 0.4690265486725664, + "flos": 540718057728.0, + "grad_norm": 0.029623138174113085, + "language_loss": 0.84520715, + "learning_rate": 0.0005741837138230526, + "loss": 0.85575026, + "num_input_tokens_seen": 203321760, + "router_z_loss_mlp": 0.41699219, + "step": 2438, + "time_per_iteration": 2.717194080352783 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047928, + "balance_loss_mlp": 1.0061574, + "epoch": 0.4692189303578299, + "flos": 771882240000.0, + "grad_norm": 0.03250588789777806, + "language_loss": 0.86937356, + "learning_rate": 0.0005738756051875627, + "loss": 0.87985283, + "num_input_tokens_seen": 203409088, + "router_z_loss_mlp": 0.41796875, + "step": 2439, + "time_per_iteration": 3.0656278133392334 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050138, + "balance_loss_mlp": 1.00846255, + "epoch": 0.4694113120430935, + "flos": 572514482688.0, + "grad_norm": 0.03167805631394848, + "language_loss": 0.84031767, + "learning_rate": 0.0005735674678710192, + "loss": 0.85081905, + "num_input_tokens_seen": 203481680, + "router_z_loss_mlp": 0.41699219, + "step": 2440, + "time_per_iteration": 2.6962802410125732 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010485, + "balance_loss_mlp": 1.00675285, + "epoch": 0.4696036937283571, + "flos": 750095226624.0, + "grad_norm": 0.037443971636707395, + "language_loss": 0.82144701, + "learning_rate": 0.0005732593019930517, + "loss": 0.83193195, + "num_input_tokens_seen": 203554848, + "router_z_loss_mlp": 0.41772461, + "step": 2441, + "time_per_iteration": 2.9041428565979004 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050874, + "balance_loss_mlp": 1.00915074, + "epoch": 0.4697960754136206, + "flos": 494443618560.0, + "grad_norm": 0.033679899008564836, + "language_loss": 0.87957233, + "learning_rate": 0.0005729511076733008, + "loss": 0.89008105, + "num_input_tokens_seen": 203624816, + "router_z_loss_mlp": 0.41748047, + "step": 2442, + "time_per_iteration": 2.6734514236450195 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01056181, + "balance_loss_mlp": 1.01433861, + "epoch": 0.4699884570988842, + "flos": 726361155072.0, + "grad_norm": 0.036289078656904894, + "language_loss": 0.85521489, + "learning_rate": 0.000572642885031418, + "loss": 0.86577672, + "num_input_tokens_seen": 203698256, + "router_z_loss_mlp": 0.41870117, + "step": 2443, + "time_per_iteration": 2.9099576473236084 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052417, + "balance_loss_mlp": 1.01062274, + "epoch": 0.47018083878414774, + "flos": 556578359040.0, + "grad_norm": 0.03125880297204364, + "language_loss": 0.81027329, + "learning_rate": 0.0005723346341870662, + "loss": 0.82079738, + "num_input_tokens_seen": 203772672, + "router_z_loss_mlp": 0.41821289, + "step": 2444, + "time_per_iteration": 2.7017409801483154 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046603, + "balance_loss_mlp": 1.00480783, + "epoch": 0.4703732204694113, + "flos": 424962940416.0, + "grad_norm": 0.03329454905005034, + "language_loss": 0.86812586, + "learning_rate": 0.0005720263552599188, + "loss": 0.8785919, + "num_input_tokens_seen": 203835904, + "router_z_loss_mlp": 0.41821289, + "step": 2445, + "time_per_iteration": 2.462155818939209 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044259, + "balance_loss_mlp": 1.00239313, + "epoch": 0.47056560215467486, + "flos": 704756888832.0, + "grad_norm": 0.03166905827629482, + "language_loss": 0.80339378, + "learning_rate": 0.0005717180483696604, + "loss": 0.81383634, + "num_input_tokens_seen": 203914704, + "router_z_loss_mlp": 0.41894531, + "step": 2446, + "time_per_iteration": 2.8927905559539795 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043265, + "balance_loss_mlp": 1.00115991, + "epoch": 0.47075798383993844, + "flos": 556013587968.0, + "grad_norm": 0.03197533000624638, + "language_loss": 0.8331126, + "learning_rate": 0.0005714097136359862, + "loss": 0.8435452, + "num_input_tokens_seen": 203985072, + "router_z_loss_mlp": 0.42138672, + "step": 2447, + "time_per_iteration": 2.632544994354248 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043755, + "balance_loss_mlp": 1.00169826, + "epoch": 0.470950365525202, + "flos": 565494188544.0, + "grad_norm": 0.028044805803111937, + "language_loss": 0.87163484, + "learning_rate": 0.0005711013511786027, + "loss": 0.88207239, + "num_input_tokens_seen": 204061904, + "router_z_loss_mlp": 0.42089844, + "step": 2448, + "time_per_iteration": 2.781325578689575 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049475, + "balance_loss_mlp": 1.00768065, + "epoch": 0.47114274721046556, + "flos": 535499013120.0, + "grad_norm": 0.029728682222295192, + "language_loss": 0.84444499, + "learning_rate": 0.0005707929611172263, + "loss": 0.8549397, + "num_input_tokens_seen": 204137392, + "router_z_loss_mlp": 0.41821289, + "step": 2449, + "time_per_iteration": 2.704754114151001 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104782, + "balance_loss_mlp": 1.00576317, + "epoch": 0.47133512889572915, + "flos": 474078743040.0, + "grad_norm": 0.03341999970225476, + "language_loss": 0.84505057, + "learning_rate": 0.000570484543571585, + "loss": 0.85552877, + "num_input_tokens_seen": 204202752, + "router_z_loss_mlp": 0.42089844, + "step": 2450, + "time_per_iteration": 2.56648850440979 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043207, + "balance_loss_mlp": 1.00129259, + "epoch": 0.4715275105809927, + "flos": 459968168448.0, + "grad_norm": 0.03640704052870178, + "language_loss": 0.83504367, + "learning_rate": 0.0005701760986614171, + "loss": 0.84547579, + "num_input_tokens_seen": 204266960, + "router_z_loss_mlp": 0.41943359, + "step": 2451, + "time_per_iteration": 2.5392374992370605 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047118, + "balance_loss_mlp": 1.00522745, + "epoch": 0.47171989226625627, + "flos": 422887570176.0, + "grad_norm": 0.0300201122524448, + "language_loss": 0.87997985, + "learning_rate": 0.0005698676265064714, + "loss": 0.89045107, + "num_input_tokens_seen": 204331216, + "router_z_loss_mlp": 0.41918945, + "step": 2452, + "time_per_iteration": 2.501518487930298 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045823, + "balance_loss_mlp": 1.00378954, + "epoch": 0.4719122739515198, + "flos": 458376889344.0, + "grad_norm": 0.036567202146268483, + "language_loss": 0.89326543, + "learning_rate": 0.0005695591272265074, + "loss": 0.90372366, + "num_input_tokens_seen": 204397216, + "router_z_loss_mlp": 0.4206543, + "step": 2453, + "time_per_iteration": 2.5203113555908203 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049066, + "balance_loss_mlp": 1.00703239, + "epoch": 0.4721046556367834, + "flos": 516017749248.0, + "grad_norm": 0.03590555599096038, + "language_loss": 0.82296801, + "learning_rate": 0.0005692506009412954, + "loss": 0.83345866, + "num_input_tokens_seen": 204469952, + "router_z_loss_mlp": 0.4206543, + "step": 2454, + "time_per_iteration": 2.703277826309204 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050072, + "balance_loss_mlp": 1.00982666, + "epoch": 0.4722970373220469, + "flos": 1575706702080.0, + "grad_norm": 0.007700978657663942, + "language_loss": 0.7755127, + "learning_rate": 0.0005689420477706156, + "loss": 0.78601336, + "num_input_tokens_seen": 204701152, + "router_z_loss_mlp": 0.40234375, + "step": 2455, + "time_per_iteration": 4.935078859329224 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045884, + "balance_loss_mlp": 1.00380278, + "epoch": 0.4724894190073105, + "flos": 587395908096.0, + "grad_norm": 0.032995428661028114, + "language_loss": 0.90020776, + "learning_rate": 0.0005686334678342593, + "loss": 0.91066664, + "num_input_tokens_seen": 204778144, + "router_z_loss_mlp": 0.42114258, + "step": 2456, + "time_per_iteration": 2.913954019546509 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104492, + "balance_loss_mlp": 1.00291097, + "epoch": 0.4726818006925741, + "flos": 869073731328.0, + "grad_norm": 0.0323844824027511, + "language_loss": 0.82033843, + "learning_rate": 0.0005683248612520274, + "loss": 0.83078766, + "num_input_tokens_seen": 204853376, + "router_z_loss_mlp": 0.42041016, + "step": 2457, + "time_per_iteration": 4.4027345180511475 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104796, + "balance_loss_mlp": 1.0055697, + "epoch": 0.4728741823778376, + "flos": 754228470528.0, + "grad_norm": 0.03548497467281451, + "language_loss": 0.84315181, + "learning_rate": 0.0005680162281437321, + "loss": 0.85363138, + "num_input_tokens_seen": 204925280, + "router_z_loss_mlp": 0.42431641, + "step": 2458, + "time_per_iteration": 2.8824384212493896 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048551, + "balance_loss_mlp": 1.00649393, + "epoch": 0.4730665640631012, + "flos": 539658501888.0, + "grad_norm": 0.029540383226657484, + "language_loss": 0.85216498, + "learning_rate": 0.000567707568629195, + "loss": 0.86265045, + "num_input_tokens_seen": 205000592, + "router_z_loss_mlp": 0.42089844, + "step": 2459, + "time_per_iteration": 2.7024879455566406 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105311, + "balance_loss_mlp": 1.01088595, + "epoch": 0.47325894574836475, + "flos": 492683198208.0, + "grad_norm": 0.02914158825310119, + "language_loss": 0.8318013, + "learning_rate": 0.0005673988828282486, + "loss": 0.84233236, + "num_input_tokens_seen": 205073968, + "router_z_loss_mlp": 0.42260742, + "step": 2460, + "time_per_iteration": 2.680508852005005 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045398, + "balance_loss_mlp": 1.00341213, + "epoch": 0.47345132743362833, + "flos": 765832073472.0, + "grad_norm": 0.11223827549321637, + "language_loss": 0.8158704, + "learning_rate": 0.0005670901708607352, + "loss": 0.82632446, + "num_input_tokens_seen": 205153536, + "router_z_loss_mlp": 0.42016602, + "step": 2461, + "time_per_iteration": 2.963573455810547 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105079, + "balance_loss_mlp": 1.00873268, + "epoch": 0.47364370911889186, + "flos": 541169101056.0, + "grad_norm": 0.03621241484942453, + "language_loss": 0.84821182, + "learning_rate": 0.0005667814328465076, + "loss": 0.85871977, + "num_input_tokens_seen": 205220944, + "router_z_loss_mlp": 0.42089844, + "step": 2462, + "time_per_iteration": 2.623180389404297 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052459, + "balance_loss_mlp": 1.01042545, + "epoch": 0.47383609080415545, + "flos": 407092397568.0, + "grad_norm": 0.0408736366196423, + "language_loss": 0.82667732, + "learning_rate": 0.0005664726689054285, + "loss": 0.83720195, + "num_input_tokens_seen": 205282688, + "router_z_loss_mlp": 0.4206543, + "step": 2463, + "time_per_iteration": 2.463602304458618 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054545, + "balance_loss_mlp": 1.01253569, + "epoch": 0.474028472489419, + "flos": 454439031552.0, + "grad_norm": 0.030418063351129263, + "language_loss": 0.81695265, + "learning_rate": 0.0005661638791573704, + "loss": 0.82749808, + "num_input_tokens_seen": 205357360, + "router_z_loss_mlp": 0.42041016, + "step": 2464, + "time_per_iteration": 2.736748695373535 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048425, + "balance_loss_mlp": 1.00651097, + "epoch": 0.47422085417468257, + "flos": 493195479552.0, + "grad_norm": 0.029840540723241396, + "language_loss": 0.87200695, + "learning_rate": 0.0005658550637222164, + "loss": 0.88249123, + "num_input_tokens_seen": 205424352, + "router_z_loss_mlp": 0.41943359, + "step": 2465, + "time_per_iteration": 2.618978261947632 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047266, + "balance_loss_mlp": 1.00532842, + "epoch": 0.47441323585994616, + "flos": 740126644224.0, + "grad_norm": 0.027711669007488924, + "language_loss": 0.82591414, + "learning_rate": 0.0005655462227198592, + "loss": 0.8363868, + "num_input_tokens_seen": 205502912, + "router_z_loss_mlp": 0.41967773, + "step": 2466, + "time_per_iteration": 2.9003212451934814 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045592, + "balance_loss_mlp": 1.00363016, + "epoch": 0.4746056175452097, + "flos": 485675543040.0, + "grad_norm": 0.03086334809399425, + "language_loss": 0.84889436, + "learning_rate": 0.0005652373562702016, + "loss": 0.85935026, + "num_input_tokens_seen": 205571168, + "router_z_loss_mlp": 0.41992188, + "step": 2467, + "time_per_iteration": 2.635524272918701 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050978, + "balance_loss_mlp": 1.00913572, + "epoch": 0.4747979992304733, + "flos": 462006600192.0, + "grad_norm": 0.030700027016666232, + "language_loss": 0.89103687, + "learning_rate": 0.000564928464493156, + "loss": 0.9015466, + "num_input_tokens_seen": 205639648, + "router_z_loss_mlp": 0.41870117, + "step": 2468, + "time_per_iteration": 2.5902397632598877 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050963, + "balance_loss_mlp": 1.00900185, + "epoch": 0.4749903809157368, + "flos": 865880479488.0, + "grad_norm": 0.04027391649848807, + "language_loss": 0.82258296, + "learning_rate": 0.000564619547508645, + "loss": 0.83309263, + "num_input_tokens_seen": 205721536, + "router_z_loss_mlp": 0.41992188, + "step": 2469, + "time_per_iteration": 3.071483850479126 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050762, + "balance_loss_mlp": 1.00877666, + "epoch": 0.4751827626010004, + "flos": 506552699904.0, + "grad_norm": 0.03439249398490307, + "language_loss": 0.83728659, + "learning_rate": 0.0005643106054366008, + "loss": 0.84779418, + "num_input_tokens_seen": 205788512, + "router_z_loss_mlp": 0.42016602, + "step": 2470, + "time_per_iteration": 2.5717906951904297 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054243, + "balance_loss_mlp": 1.01240063, + "epoch": 0.47537514428626393, + "flos": 560453033472.0, + "grad_norm": 0.030831302101538484, + "language_loss": 0.80302799, + "learning_rate": 0.000564001638396965, + "loss": 0.81357038, + "num_input_tokens_seen": 205863104, + "router_z_loss_mlp": 0.41870117, + "step": 2471, + "time_per_iteration": 2.807666540145874 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010519, + "balance_loss_mlp": 1.01008177, + "epoch": 0.4755675259715275, + "flos": 835677278976.0, + "grad_norm": 0.03000607606640632, + "language_loss": 0.82444054, + "learning_rate": 0.0005636926465096897, + "loss": 0.83495951, + "num_input_tokens_seen": 205940688, + "router_z_loss_mlp": 0.41845703, + "step": 2472, + "time_per_iteration": 3.0930862426757812 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052858, + "balance_loss_mlp": 1.01106381, + "epoch": 0.47575990765679105, + "flos": 509233670400.0, + "grad_norm": 0.03423576863830587, + "language_loss": 0.88083971, + "learning_rate": 0.0005633836298947363, + "loss": 0.89136827, + "num_input_tokens_seen": 206008352, + "router_z_loss_mlp": 0.41821289, + "step": 2473, + "time_per_iteration": 2.5820775032043457 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050107, + "balance_loss_mlp": 1.00819325, + "epoch": 0.47595228934205464, + "flos": 592963928832.0, + "grad_norm": 0.03298724569498326, + "language_loss": 0.71285135, + "learning_rate": 0.000563074588672075, + "loss": 0.72335243, + "num_input_tokens_seen": 206078240, + "router_z_loss_mlp": 0.41943359, + "step": 2474, + "time_per_iteration": 2.693268299102783 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054207, + "balance_loss_mlp": 1.01231647, + "epoch": 0.4761446710273182, + "flos": 581684024064.0, + "grad_norm": 0.03213378714772974, + "language_loss": 0.85775197, + "learning_rate": 0.0005627655229616868, + "loss": 0.86829406, + "num_input_tokens_seen": 206148896, + "router_z_loss_mlp": 0.41918945, + "step": 2475, + "time_per_iteration": 2.719207286834717 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051223, + "balance_loss_mlp": 1.00933242, + "epoch": 0.47633705271258175, + "flos": 674080290816.0, + "grad_norm": 0.026991444464169446, + "language_loss": 0.9029963, + "learning_rate": 0.0005624564328835616, + "loss": 0.91350853, + "num_input_tokens_seen": 206223792, + "router_z_loss_mlp": 0.41918945, + "step": 2476, + "time_per_iteration": 2.793189764022827 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054365, + "balance_loss_mlp": 1.0125705, + "epoch": 0.47652943439784534, + "flos": 542971317504.0, + "grad_norm": 0.02962321585608733, + "language_loss": 0.84663439, + "learning_rate": 0.0005621473185576986, + "loss": 0.85717803, + "num_input_tokens_seen": 206299376, + "router_z_loss_mlp": 0.41821289, + "step": 2477, + "time_per_iteration": 2.7773327827453613 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050086, + "balance_loss_mlp": 1.00822008, + "epoch": 0.4767218160831089, + "flos": 525847325952.0, + "grad_norm": 0.03556533386707064, + "language_loss": 0.87709439, + "learning_rate": 0.0005618381801041068, + "loss": 0.8875953, + "num_input_tokens_seen": 206367936, + "router_z_loss_mlp": 0.41894531, + "step": 2478, + "time_per_iteration": 2.6155920028686523 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053209, + "balance_loss_mlp": 1.0111047, + "epoch": 0.47691419776837246, + "flos": 569127790080.0, + "grad_norm": 0.035286823129286084, + "language_loss": 0.83750623, + "learning_rate": 0.0005615290176428044, + "loss": 0.84803832, + "num_input_tokens_seen": 206438864, + "router_z_loss_mlp": 0.42138672, + "step": 2479, + "time_per_iteration": 2.6538074016571045 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049988, + "balance_loss_mlp": 1.00802612, + "epoch": 0.477106579453636, + "flos": 532025804544.0, + "grad_norm": 0.0314839310376407, + "language_loss": 0.85928833, + "learning_rate": 0.0005612198312938187, + "loss": 0.86978817, + "num_input_tokens_seen": 206516656, + "router_z_loss_mlp": 0.41992188, + "step": 2480, + "time_per_iteration": 2.781107187271118 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051481, + "balance_loss_mlp": 1.00937629, + "epoch": 0.4772989611388996, + "flos": 595502002944.0, + "grad_norm": 0.03185012593036433, + "language_loss": 0.79825139, + "learning_rate": 0.0005609106211771868, + "loss": 0.80876625, + "num_input_tokens_seen": 206595040, + "router_z_loss_mlp": 0.42138672, + "step": 2481, + "time_per_iteration": 2.854200839996338 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049319, + "balance_loss_mlp": 1.00702322, + "epoch": 0.4774913428241631, + "flos": 545708668416.0, + "grad_norm": 0.032298555104441296, + "language_loss": 0.89798552, + "learning_rate": 0.0005606013874129543, + "loss": 0.90847874, + "num_input_tokens_seen": 206670192, + "router_z_loss_mlp": 0.42333984, + "step": 2482, + "time_per_iteration": 2.8364884853363037 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044489, + "balance_loss_mlp": 1.00214577, + "epoch": 0.4776837245094267, + "flos": 541130217216.0, + "grad_norm": 0.031860038244933726, + "language_loss": 0.8004725, + "learning_rate": 0.0005602921301211768, + "loss": 0.81091738, + "num_input_tokens_seen": 206746992, + "router_z_loss_mlp": 0.42382812, + "step": 2483, + "time_per_iteration": 2.719606399536133 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044245, + "balance_loss_mlp": 1.00185454, + "epoch": 0.4778761061946903, + "flos": 472756727040.0, + "grad_norm": 0.037639636071959574, + "language_loss": 0.82567894, + "learning_rate": 0.0005599828494219185, + "loss": 0.83612138, + "num_input_tokens_seen": 206813584, + "router_z_loss_mlp": 0.42431641, + "step": 2484, + "time_per_iteration": 2.5541560649871826 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047753, + "balance_loss_mlp": 1.00548136, + "epoch": 0.4780684878799538, + "flos": 727338085632.0, + "grad_norm": 0.033674716450053835, + "language_loss": 0.89748895, + "learning_rate": 0.0005596735454352527, + "loss": 0.90796649, + "num_input_tokens_seen": 206885840, + "router_z_loss_mlp": 0.4230957, + "step": 2485, + "time_per_iteration": 2.9516124725341797 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051411, + "balance_loss_mlp": 1.00921071, + "epoch": 0.4782608695652174, + "flos": 549954673152.0, + "grad_norm": 0.03622289239904689, + "language_loss": 0.86092174, + "learning_rate": 0.0005593642182812619, + "loss": 0.87143582, + "num_input_tokens_seen": 206955104, + "router_z_loss_mlp": 0.42236328, + "step": 2486, + "time_per_iteration": 2.643221139907837 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054193, + "balance_loss_mlp": 1.01192153, + "epoch": 0.47845325125048094, + "flos": 831403084032.0, + "grad_norm": 0.035916445699024475, + "language_loss": 0.84163451, + "learning_rate": 0.0005590548680800378, + "loss": 0.85217643, + "num_input_tokens_seen": 207039792, + "router_z_loss_mlp": 0.4230957, + "step": 2487, + "time_per_iteration": 3.1013588905334473 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105581, + "balance_loss_mlp": 1.01356208, + "epoch": 0.4786456329357445, + "flos": 515271197952.0, + "grad_norm": 0.032399463516541584, + "language_loss": 0.76797146, + "learning_rate": 0.0005587454949516804, + "loss": 0.77852952, + "num_input_tokens_seen": 207115632, + "router_z_loss_mlp": 0.42285156, + "step": 2488, + "time_per_iteration": 2.7681314945220947 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105205, + "balance_loss_mlp": 1.00992179, + "epoch": 0.47883801462100806, + "flos": 565730403840.0, + "grad_norm": 0.034669501918414815, + "language_loss": 0.88538134, + "learning_rate": 0.0005584360990162993, + "loss": 0.89590186, + "num_input_tokens_seen": 207184336, + "router_z_loss_mlp": 0.42163086, + "step": 2489, + "time_per_iteration": 2.6323490142822266 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105419, + "balance_loss_mlp": 1.01196563, + "epoch": 0.47903039630627164, + "flos": 580705148160.0, + "grad_norm": 0.028676455513171533, + "language_loss": 0.85944891, + "learning_rate": 0.0005581266803940124, + "loss": 0.86999071, + "num_input_tokens_seen": 207258720, + "router_z_loss_mlp": 0.42260742, + "step": 2490, + "time_per_iteration": 2.758180856704712 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051095, + "balance_loss_mlp": 1.00891864, + "epoch": 0.47922277799153523, + "flos": 620086638336.0, + "grad_norm": 0.029629924190795385, + "language_loss": 0.8824507, + "learning_rate": 0.0005578172392049471, + "loss": 0.89296162, + "num_input_tokens_seen": 207329216, + "router_z_loss_mlp": 0.42211914, + "step": 2491, + "time_per_iteration": 2.733055353164673 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049931, + "balance_loss_mlp": 1.00787377, + "epoch": 0.47941515967679876, + "flos": 640859782656.0, + "grad_norm": 0.03401187912624355, + "language_loss": 0.84927547, + "learning_rate": 0.0005575077755692386, + "loss": 0.85977477, + "num_input_tokens_seen": 207403712, + "router_z_loss_mlp": 0.42089844, + "step": 2492, + "time_per_iteration": 2.7897393703460693 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051779, + "balance_loss_mlp": 1.00988865, + "epoch": 0.47960754136206235, + "flos": 520876157184.0, + "grad_norm": 0.02611914925979928, + "language_loss": 0.8632732, + "learning_rate": 0.0005571982896070316, + "loss": 0.87379098, + "num_input_tokens_seen": 207477120, + "router_z_loss_mlp": 0.41918945, + "step": 2493, + "time_per_iteration": 2.667999744415283 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051996, + "balance_loss_mlp": 1.01010633, + "epoch": 0.4797999230473259, + "flos": 476032604160.0, + "grad_norm": 0.03441931276085345, + "language_loss": 0.90227294, + "learning_rate": 0.0005568887814384792, + "loss": 0.9127928, + "num_input_tokens_seen": 207544592, + "router_z_loss_mlp": 0.41918945, + "step": 2494, + "time_per_iteration": 2.5400681495666504 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105198, + "balance_loss_mlp": 1.01023245, + "epoch": 0.47999230473258947, + "flos": 533069809152.0, + "grad_norm": 0.031194267436751296, + "language_loss": 0.87632048, + "learning_rate": 0.000556579251183743, + "loss": 0.88684028, + "num_input_tokens_seen": 207613808, + "router_z_loss_mlp": 0.41772461, + "step": 2495, + "time_per_iteration": 2.662360906600952 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047828, + "balance_loss_mlp": 1.00615287, + "epoch": 0.480184686417853, + "flos": 602606867712.0, + "grad_norm": 0.03455941378420467, + "language_loss": 0.8073976, + "learning_rate": 0.0005562696989629936, + "loss": 0.81787586, + "num_input_tokens_seen": 207684464, + "router_z_loss_mlp": 0.41699219, + "step": 2496, + "time_per_iteration": 2.677384614944458 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049959, + "balance_loss_mlp": 1.00837922, + "epoch": 0.4803770681031166, + "flos": 529262208768.0, + "grad_norm": 0.02987635047659329, + "language_loss": 0.83264202, + "learning_rate": 0.0005559601248964095, + "loss": 0.84314156, + "num_input_tokens_seen": 207754016, + "router_z_loss_mlp": 0.41601562, + "step": 2497, + "time_per_iteration": 2.629697322845459 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052806, + "balance_loss_mlp": 1.01132119, + "epoch": 0.4805694497883801, + "flos": 512229590784.0, + "grad_norm": 0.031958617017597245, + "language_loss": 0.86286914, + "learning_rate": 0.0005556505291041783, + "loss": 0.87339711, + "num_input_tokens_seen": 207827104, + "router_z_loss_mlp": 0.41503906, + "step": 2498, + "time_per_iteration": 2.6821835041046143 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105189, + "balance_loss_mlp": 1.0103811, + "epoch": 0.4807618314736437, + "flos": 601606604544.0, + "grad_norm": 0.02993690761083535, + "language_loss": 0.84804475, + "learning_rate": 0.0005553409117064954, + "loss": 0.85856366, + "num_input_tokens_seen": 207907824, + "router_z_loss_mlp": 0.4152832, + "step": 2499, + "time_per_iteration": 2.868149518966675 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047722, + "balance_loss_mlp": 1.00626087, + "epoch": 0.4809542131589073, + "flos": 570030843648.0, + "grad_norm": 0.03218775088546566, + "language_loss": 0.85501659, + "learning_rate": 0.0005550312728235654, + "loss": 0.86549377, + "num_input_tokens_seen": 207975632, + "router_z_loss_mlp": 0.41479492, + "step": 2500, + "time_per_iteration": 2.6775684356689453 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049066, + "balance_loss_mlp": 1.00767624, + "epoch": 0.4811465948441708, + "flos": 577166810880.0, + "grad_norm": 0.03560315442462447, + "language_loss": 0.84339613, + "learning_rate": 0.0005547216125756003, + "loss": 0.85388672, + "num_input_tokens_seen": 208048000, + "router_z_loss_mlp": 0.4140625, + "step": 2501, + "time_per_iteration": 2.730938196182251 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051501, + "balance_loss_mlp": 1.01011145, + "epoch": 0.4813389765294344, + "flos": 825298482432.0, + "grad_norm": 0.030150461655227775, + "language_loss": 0.82324314, + "learning_rate": 0.0005544119310828211, + "loss": 0.83375812, + "num_input_tokens_seen": 208132592, + "router_z_loss_mlp": 0.4140625, + "step": 2502, + "time_per_iteration": 3.113402843475342 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053515, + "balance_loss_mlp": 1.01203024, + "epoch": 0.48153135821469795, + "flos": 636700293888.0, + "grad_norm": 0.03404405348604493, + "language_loss": 0.85394537, + "learning_rate": 0.0005541022284654568, + "loss": 0.8644805, + "num_input_tokens_seen": 208215824, + "router_z_loss_mlp": 0.41503906, + "step": 2503, + "time_per_iteration": 2.946800708770752 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055542, + "balance_loss_mlp": 1.01393807, + "epoch": 0.48172373989996153, + "flos": 504709654272.0, + "grad_norm": 0.029988445312160498, + "language_loss": 0.84392428, + "learning_rate": 0.0005537925048437446, + "loss": 0.85447979, + "num_input_tokens_seen": 208284304, + "router_z_loss_mlp": 0.41625977, + "step": 2504, + "time_per_iteration": 2.5928125381469727 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053543, + "balance_loss_mlp": 1.0131073, + "epoch": 0.48191612158522507, + "flos": 1535568945408.0, + "grad_norm": 0.009640282548559968, + "language_loss": 0.75751472, + "learning_rate": 0.00055348276033793, + "loss": 0.76805007, + "num_input_tokens_seen": 208510224, + "router_z_loss_mlp": 0.40429688, + "step": 2505, + "time_per_iteration": 4.956170320510864 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105109, + "balance_loss_mlp": 1.00936711, + "epoch": 0.48210850327048865, + "flos": 703813006080.0, + "grad_norm": 0.02927379087328487, + "language_loss": 0.88880217, + "learning_rate": 0.0005531729950682664, + "loss": 0.89931303, + "num_input_tokens_seen": 208596816, + "router_z_loss_mlp": 0.41748047, + "step": 2506, + "time_per_iteration": 2.9935836791992188 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052672, + "balance_loss_mlp": 1.01106763, + "epoch": 0.4823008849557522, + "flos": 440701732608.0, + "grad_norm": 0.04047033106809228, + "language_loss": 0.85417378, + "learning_rate": 0.000552863209155015, + "loss": 0.86470056, + "num_input_tokens_seen": 208659616, + "router_z_loss_mlp": 0.41625977, + "step": 2507, + "time_per_iteration": 2.4729647636413574 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053204, + "balance_loss_mlp": 1.01157653, + "epoch": 0.48249326664101577, + "flos": 472813107456.0, + "grad_norm": 0.04603508602748786, + "language_loss": 0.82726657, + "learning_rate": 0.0005525534027184461, + "loss": 0.8377986, + "num_input_tokens_seen": 208728080, + "router_z_loss_mlp": 0.41650391, + "step": 2508, + "time_per_iteration": 2.5513370037078857 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055774, + "balance_loss_mlp": 1.01421785, + "epoch": 0.48268564832627936, + "flos": 564315068928.0, + "grad_norm": 0.02879273586569962, + "language_loss": 0.83137357, + "learning_rate": 0.0005522435758788365, + "loss": 0.84193128, + "num_input_tokens_seen": 208803376, + "router_z_loss_mlp": 0.41577148, + "step": 2509, + "time_per_iteration": 2.753450393676758 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055715, + "balance_loss_mlp": 1.01415896, + "epoch": 0.4828780300115429, + "flos": 630843568128.0, + "grad_norm": 0.03460020680283242, + "language_loss": 0.80409563, + "learning_rate": 0.0005519337287564721, + "loss": 0.8146528, + "num_input_tokens_seen": 208876656, + "router_z_loss_mlp": 0.41577148, + "step": 2510, + "time_per_iteration": 2.790820360183716 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051713, + "balance_loss_mlp": 1.01020396, + "epoch": 0.4830704116968065, + "flos": 633005454336.0, + "grad_norm": 0.032398618840687954, + "language_loss": 0.83713245, + "learning_rate": 0.000551623861471646, + "loss": 0.84764957, + "num_input_tokens_seen": 208950224, + "router_z_loss_mlp": 0.4152832, + "step": 2511, + "time_per_iteration": 2.750471353530884 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01056118, + "balance_loss_mlp": 1.01596832, + "epoch": 0.48326279338207, + "flos": 1572619408128.0, + "grad_norm": 0.008656675131842123, + "language_loss": 0.78818834, + "learning_rate": 0.0005513139741446594, + "loss": 0.79874945, + "num_input_tokens_seen": 209173984, + "router_z_loss_mlp": 0.40136719, + "step": 2512, + "time_per_iteration": 4.832056999206543 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048019, + "balance_loss_mlp": 1.00636733, + "epoch": 0.4834551750673336, + "flos": 510238791168.0, + "grad_norm": 0.030652937711335218, + "language_loss": 0.87039137, + "learning_rate": 0.0005510040668958211, + "loss": 0.88087165, + "num_input_tokens_seen": 209242832, + "router_z_loss_mlp": 0.41674805, + "step": 2513, + "time_per_iteration": 2.593559741973877 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053741, + "balance_loss_mlp": 1.0134964, + "epoch": 0.48364755675259713, + "flos": 1531828419072.0, + "grad_norm": 0.007806244380112886, + "language_loss": 0.77760583, + "learning_rate": 0.0005506941398454483, + "loss": 0.78814328, + "num_input_tokens_seen": 209473520, + "router_z_loss_mlp": 0.40234375, + "step": 2514, + "time_per_iteration": 4.834583282470703 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049855, + "balance_loss_mlp": 1.00810826, + "epoch": 0.4838399384378607, + "flos": 566047299072.0, + "grad_norm": 0.0392841259920432, + "language_loss": 0.83837014, + "learning_rate": 0.0005503841931138645, + "loss": 0.84886873, + "num_input_tokens_seen": 209544208, + "router_z_loss_mlp": 0.41772461, + "step": 2515, + "time_per_iteration": 2.704660177230835 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049181, + "balance_loss_mlp": 1.00741005, + "epoch": 0.4840323201231243, + "flos": 388542377472.0, + "grad_norm": 0.03590543250931975, + "language_loss": 0.82853907, + "learning_rate": 0.0005500742268214025, + "loss": 0.83903086, + "num_input_tokens_seen": 209607408, + "router_z_loss_mlp": 0.41796875, + "step": 2516, + "time_per_iteration": 2.4684557914733887 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048399, + "balance_loss_mlp": 1.00662851, + "epoch": 0.48422470180838784, + "flos": 632176277760.0, + "grad_norm": 0.031370714323768, + "language_loss": 0.8605336, + "learning_rate": 0.0005497642410884014, + "loss": 0.87101769, + "num_input_tokens_seen": 209683392, + "router_z_loss_mlp": 0.41796875, + "step": 2517, + "time_per_iteration": 2.7523274421691895 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049853, + "balance_loss_mlp": 1.00808239, + "epoch": 0.4844170834936514, + "flos": 500313950208.0, + "grad_norm": 0.02829147010426611, + "language_loss": 0.85602349, + "learning_rate": 0.0005494542360352085, + "loss": 0.86652207, + "num_input_tokens_seen": 209753184, + "router_z_loss_mlp": 0.41796875, + "step": 2518, + "time_per_iteration": 2.635472059249878 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050712, + "balance_loss_mlp": 1.00882208, + "epoch": 0.48460946517891496, + "flos": 552195293952.0, + "grad_norm": 0.029973626664194793, + "language_loss": 0.86134493, + "learning_rate": 0.0005491442117821783, + "loss": 0.87185204, + "num_input_tokens_seen": 209829568, + "router_z_loss_mlp": 0.41918945, + "step": 2519, + "time_per_iteration": 2.686150550842285 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050979, + "balance_loss_mlp": 1.00916088, + "epoch": 0.48480184686417854, + "flos": 530462715648.0, + "grad_norm": 0.03547836116600895, + "language_loss": 0.87863553, + "learning_rate": 0.0005488341684496732, + "loss": 0.88914526, + "num_input_tokens_seen": 209902176, + "router_z_loss_mlp": 0.41845703, + "step": 2520, + "time_per_iteration": 2.6380345821380615 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053374, + "balance_loss_mlp": 1.01155508, + "epoch": 0.4849942285494421, + "flos": 533048421888.0, + "grad_norm": 0.030317982530802673, + "language_loss": 0.92374247, + "learning_rate": 0.0005485241061580624, + "loss": 0.93427622, + "num_input_tokens_seen": 209969168, + "router_z_loss_mlp": 0.41845703, + "step": 2521, + "time_per_iteration": 2.7106375694274902 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048164, + "balance_loss_mlp": 1.00639331, + "epoch": 0.48518661023470566, + "flos": 723973747200.0, + "grad_norm": 0.029300799536016952, + "language_loss": 0.85061228, + "learning_rate": 0.0005482140250277228, + "loss": 0.86109388, + "num_input_tokens_seen": 210049616, + "router_z_loss_mlp": 0.41796875, + "step": 2522, + "time_per_iteration": 2.998014450073242 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050292, + "balance_loss_mlp": 1.00859261, + "epoch": 0.4853789919199692, + "flos": 507156354816.0, + "grad_norm": 0.033835684591452045, + "language_loss": 0.87858051, + "learning_rate": 0.0005479039251790387, + "loss": 0.88908345, + "num_input_tokens_seen": 210118512, + "router_z_loss_mlp": 0.41723633, + "step": 2523, + "time_per_iteration": 2.6554031372070312 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046448, + "balance_loss_mlp": 1.00470078, + "epoch": 0.4855713736052328, + "flos": 661700001024.0, + "grad_norm": 0.033801552668461764, + "language_loss": 0.85375023, + "learning_rate": 0.0005475938067324014, + "loss": 0.86421466, + "num_input_tokens_seen": 210193728, + "router_z_loss_mlp": 0.41772461, + "step": 2524, + "time_per_iteration": 2.8294761180877686 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105012, + "balance_loss_mlp": 1.00839663, + "epoch": 0.48576375529049637, + "flos": 437890504704.0, + "grad_norm": 0.03215141471545655, + "language_loss": 0.84198898, + "learning_rate": 0.0005472836698082098, + "loss": 0.85249019, + "num_input_tokens_seen": 210258832, + "router_z_loss_mlp": 0.41748047, + "step": 2525, + "time_per_iteration": 2.553400754928589 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050288, + "balance_loss_mlp": 1.00858843, + "epoch": 0.4859561369757599, + "flos": 582845647104.0, + "grad_norm": 0.029048493067812663, + "language_loss": 0.84421259, + "learning_rate": 0.0005469735145268694, + "loss": 0.85471547, + "num_input_tokens_seen": 210335280, + "router_z_loss_mlp": 0.41723633, + "step": 2526, + "time_per_iteration": 2.741071939468384 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052939, + "balance_loss_mlp": 1.01121581, + "epoch": 0.4861485186610235, + "flos": 488933923584.0, + "grad_norm": 0.035658567470948505, + "language_loss": 0.81546867, + "learning_rate": 0.0005466633410087933, + "loss": 0.82599807, + "num_input_tokens_seen": 210407072, + "router_z_loss_mlp": 0.41748047, + "step": 2527, + "time_per_iteration": 2.7008073329925537 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01057697, + "balance_loss_mlp": 1.01735687, + "epoch": 0.486340900346287, + "flos": 1561113981696.0, + "grad_norm": 0.006481424575109751, + "language_loss": 0.77260822, + "learning_rate": 0.0005463531493744017, + "loss": 0.78318518, + "num_input_tokens_seen": 210644544, + "router_z_loss_mlp": 0.40332031, + "step": 2528, + "time_per_iteration": 4.889545679092407 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048632, + "balance_loss_mlp": 1.00719464, + "epoch": 0.4865332820315506, + "flos": 483990945024.0, + "grad_norm": 0.029120047594960542, + "language_loss": 0.88662624, + "learning_rate": 0.0005460429397441214, + "loss": 0.89711249, + "num_input_tokens_seen": 210711760, + "router_z_loss_mlp": 0.41455078, + "step": 2529, + "time_per_iteration": 4.04598331451416 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048668, + "balance_loss_mlp": 1.00706387, + "epoch": 0.48672566371681414, + "flos": 536857967616.0, + "grad_norm": 0.030816613356667605, + "language_loss": 0.87420261, + "learning_rate": 0.0005457327122383866, + "loss": 0.88468921, + "num_input_tokens_seen": 210783040, + "router_z_loss_mlp": 0.41625977, + "step": 2530, + "time_per_iteration": 2.613560676574707 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055492, + "balance_loss_mlp": 1.01515198, + "epoch": 0.4869180454020777, + "flos": 1415833195776.0, + "grad_norm": 0.0094125035005948, + "language_loss": 0.74636483, + "learning_rate": 0.0005454224669776385, + "loss": 0.75691986, + "num_input_tokens_seen": 211002128, + "router_z_loss_mlp": 0.40332031, + "step": 2531, + "time_per_iteration": 4.826287269592285 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104985, + "balance_loss_mlp": 1.00831711, + "epoch": 0.48711042708734126, + "flos": 574227270912.0, + "grad_norm": 0.03266780624208146, + "language_loss": 0.76332569, + "learning_rate": 0.0005451122040823244, + "loss": 0.77382421, + "num_input_tokens_seen": 211080080, + "router_z_loss_mlp": 0.41552734, + "step": 2532, + "time_per_iteration": 2.805912494659424 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046061, + "balance_loss_mlp": 1.00438511, + "epoch": 0.48730280877260485, + "flos": 627817512192.0, + "grad_norm": 0.03502227574741412, + "language_loss": 0.77874511, + "learning_rate": 0.0005448019236728997, + "loss": 0.78920573, + "num_input_tokens_seen": 211162944, + "router_z_loss_mlp": 0.41699219, + "step": 2533, + "time_per_iteration": 2.865936040878296 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048209, + "balance_loss_mlp": 1.00670052, + "epoch": 0.48749519045786843, + "flos": 513468981504.0, + "grad_norm": 0.035197852276093636, + "language_loss": 0.85303891, + "learning_rate": 0.0005444916258698255, + "loss": 0.86352104, + "num_input_tokens_seen": 211230448, + "router_z_loss_mlp": 0.4152832, + "step": 2534, + "time_per_iteration": 2.6375105381011963 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045312, + "balance_loss_mlp": 1.00399435, + "epoch": 0.48768757214313196, + "flos": 526479171072.0, + "grad_norm": 0.030578272272676787, + "language_loss": 0.86534977, + "learning_rate": 0.0005441813107935704, + "loss": 0.87580293, + "num_input_tokens_seen": 211301248, + "router_z_loss_mlp": 0.41333008, + "step": 2535, + "time_per_iteration": 2.6708908081054688 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044819, + "balance_loss_mlp": 1.0033108, + "epoch": 0.48787995382839555, + "flos": 506031670272.0, + "grad_norm": 0.03128667529665633, + "language_loss": 0.86385322, + "learning_rate": 0.0005438709785646091, + "loss": 0.87430143, + "num_input_tokens_seen": 211369888, + "router_z_loss_mlp": 0.4152832, + "step": 2536, + "time_per_iteration": 2.587376117706299 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047364, + "balance_loss_mlp": 1.00599802, + "epoch": 0.4880723355136591, + "flos": 576248206080.0, + "grad_norm": 0.031424284702784445, + "language_loss": 0.87241846, + "learning_rate": 0.0005435606293034234, + "loss": 0.88289213, + "num_input_tokens_seen": 211441808, + "router_z_loss_mlp": 0.41381836, + "step": 2537, + "time_per_iteration": 2.6678061485290527 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045759, + "balance_loss_mlp": 1.00425005, + "epoch": 0.48826471719892267, + "flos": 562537152000.0, + "grad_norm": 0.03574143188627203, + "language_loss": 0.85282528, + "learning_rate": 0.0005432502631305016, + "loss": 0.8632828, + "num_input_tokens_seen": 211511216, + "router_z_loss_mlp": 0.4152832, + "step": 2538, + "time_per_iteration": 2.7138583660125732 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104917, + "balance_loss_mlp": 1.00763726, + "epoch": 0.4884570988841862, + "flos": 727549022976.0, + "grad_norm": 0.02708673321136359, + "language_loss": 0.84024864, + "learning_rate": 0.0005429398801663386, + "loss": 0.85074031, + "num_input_tokens_seen": 211589264, + "router_z_loss_mlp": 0.41552734, + "step": 2539, + "time_per_iteration": 2.964188814163208 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049407, + "balance_loss_mlp": 1.00797033, + "epoch": 0.4886494805694498, + "flos": 431924908800.0, + "grad_norm": 0.037537890597472735, + "language_loss": 0.83715379, + "learning_rate": 0.0005426294805314355, + "loss": 0.84764791, + "num_input_tokens_seen": 211652928, + "router_z_loss_mlp": 0.41455078, + "step": 2540, + "time_per_iteration": 2.5386080741882324 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044162, + "balance_loss_mlp": 1.00251019, + "epoch": 0.4888418622547134, + "flos": 674345663232.0, + "grad_norm": 0.02795943805212824, + "language_loss": 0.80757105, + "learning_rate": 0.0005423190643463003, + "loss": 0.81801265, + "num_input_tokens_seen": 211741664, + "router_z_loss_mlp": 0.41674805, + "step": 2541, + "time_per_iteration": 3.0026512145996094 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043307, + "balance_loss_mlp": 1.00182211, + "epoch": 0.4890342439399769, + "flos": 542936324352.0, + "grad_norm": 0.03490297591946719, + "language_loss": 0.83297753, + "learning_rate": 0.0005420086317314473, + "loss": 0.84341061, + "num_input_tokens_seen": 211809136, + "router_z_loss_mlp": 0.41503906, + "step": 2542, + "time_per_iteration": 2.713738441467285 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104604, + "balance_loss_mlp": 1.00457919, + "epoch": 0.4892266256252405, + "flos": 591863543808.0, + "grad_norm": 0.03220316860335889, + "language_loss": 0.81509852, + "learning_rate": 0.0005416981828073971, + "loss": 0.8255589, + "num_input_tokens_seen": 211883136, + "router_z_loss_mlp": 0.41479492, + "step": 2543, + "time_per_iteration": 2.833582639694214 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049698, + "balance_loss_mlp": 1.00983429, + "epoch": 0.48941900731050403, + "flos": 1519657121280.0, + "grad_norm": 0.011925691275285389, + "language_loss": 0.77115011, + "learning_rate": 0.0005413877176946765, + "loss": 0.78164709, + "num_input_tokens_seen": 212117488, + "router_z_loss_mlp": 0.3984375, + "step": 2544, + "time_per_iteration": 4.825795412063599 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044942, + "balance_loss_mlp": 1.00319445, + "epoch": 0.4896113889957676, + "flos": 471519281664.0, + "grad_norm": 0.035595787649594084, + "language_loss": 0.85265428, + "learning_rate": 0.000541077236513819, + "loss": 0.86310375, + "num_input_tokens_seen": 212181952, + "router_z_loss_mlp": 0.41772461, + "step": 2545, + "time_per_iteration": 2.5318596363067627 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046977, + "balance_loss_mlp": 1.00515878, + "epoch": 0.48980377068103115, + "flos": 497552299776.0, + "grad_norm": 0.029954814135253697, + "language_loss": 0.8290776, + "learning_rate": 0.0005407667393853638, + "loss": 0.8395474, + "num_input_tokens_seen": 212252608, + "router_z_loss_mlp": 0.41845703, + "step": 2546, + "time_per_iteration": 2.6808276176452637 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049449, + "balance_loss_mlp": 1.00765431, + "epoch": 0.48999615236629473, + "flos": 694108829184.0, + "grad_norm": 0.033072726692276254, + "language_loss": 0.83875388, + "learning_rate": 0.0005404562264298569, + "loss": 0.84924835, + "num_input_tokens_seen": 212328560, + "router_z_loss_mlp": 0.41821289, + "step": 2547, + "time_per_iteration": 2.8665168285369873 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105088, + "balance_loss_mlp": 1.00894189, + "epoch": 0.49018853405155827, + "flos": 542749686528.0, + "grad_norm": 0.0323259245637504, + "language_loss": 0.84166187, + "learning_rate": 0.0005401456977678498, + "loss": 0.85217071, + "num_input_tokens_seen": 212399616, + "router_z_loss_mlp": 0.41967773, + "step": 2548, + "time_per_iteration": 2.646385431289673 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054184, + "balance_loss_mlp": 1.01248467, + "epoch": 0.49038091573682185, + "flos": 697109607168.0, + "grad_norm": 0.03434023749691101, + "language_loss": 0.7811271, + "learning_rate": 0.0005398351535199008, + "loss": 0.79166895, + "num_input_tokens_seen": 212482352, + "router_z_loss_mlp": 0.41723633, + "step": 2549, + "time_per_iteration": 3.0581490993499756 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01056036, + "balance_loss_mlp": 1.01443195, + "epoch": 0.49057329742208544, + "flos": 598063409664.0, + "grad_norm": 0.032237778563639685, + "language_loss": 0.84733725, + "learning_rate": 0.0005395245938065735, + "loss": 0.85789764, + "num_input_tokens_seen": 212559504, + "router_z_loss_mlp": 0.41625977, + "step": 2550, + "time_per_iteration": 2.7877790927886963 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052801, + "balance_loss_mlp": 1.01105404, + "epoch": 0.490765679107349, + "flos": 514417721856.0, + "grad_norm": 0.03812364840268788, + "language_loss": 0.82968283, + "learning_rate": 0.0005392140187484379, + "loss": 0.84021086, + "num_input_tokens_seen": 212625664, + "router_z_loss_mlp": 0.41772461, + "step": 2551, + "time_per_iteration": 2.59513521194458 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052602, + "balance_loss_mlp": 1.01097441, + "epoch": 0.49095806079261256, + "flos": 630843568128.0, + "grad_norm": 0.028435741934699065, + "language_loss": 0.8977747, + "learning_rate": 0.0005389034284660701, + "loss": 0.90830076, + "num_input_tokens_seen": 212702000, + "router_z_loss_mlp": 0.41650391, + "step": 2552, + "time_per_iteration": 2.8811471462249756 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051565, + "balance_loss_mlp": 1.00979364, + "epoch": 0.4911504424778761, + "flos": 916793640960.0, + "grad_norm": 0.038088038632412044, + "language_loss": 0.82567823, + "learning_rate": 0.000538592823080052, + "loss": 0.83619392, + "num_input_tokens_seen": 212785376, + "router_z_loss_mlp": 0.41796875, + "step": 2553, + "time_per_iteration": 3.147981882095337 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104899, + "balance_loss_mlp": 1.00736189, + "epoch": 0.4913428241631397, + "flos": 439855059456.0, + "grad_norm": 0.03635352086596181, + "language_loss": 0.85271204, + "learning_rate": 0.000538282202710971, + "loss": 0.86320198, + "num_input_tokens_seen": 212848176, + "router_z_loss_mlp": 0.41650391, + "step": 2554, + "time_per_iteration": 2.5295345783233643 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050306, + "balance_loss_mlp": 1.00865471, + "epoch": 0.4915352058484032, + "flos": 637240765440.0, + "grad_norm": 0.03576310950851386, + "language_loss": 0.82746387, + "learning_rate": 0.000537971567479421, + "loss": 0.83796692, + "num_input_tokens_seen": 212917888, + "router_z_loss_mlp": 0.41674805, + "step": 2555, + "time_per_iteration": 2.7715530395507812 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047308, + "balance_loss_mlp": 1.00567997, + "epoch": 0.4917275875336668, + "flos": 505510640640.0, + "grad_norm": 0.03586911519664752, + "language_loss": 0.88338435, + "learning_rate": 0.0005376609175060011, + "loss": 0.89385736, + "num_input_tokens_seen": 212986288, + "router_z_loss_mlp": 0.41650391, + "step": 2556, + "time_per_iteration": 2.6225156784057617 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044015, + "balance_loss_mlp": 1.00252998, + "epoch": 0.49191996921893033, + "flos": 655734405120.0, + "grad_norm": 0.03188042342455107, + "language_loss": 0.80798948, + "learning_rate": 0.0005373502529113162, + "loss": 0.81842965, + "num_input_tokens_seen": 213059504, + "router_z_loss_mlp": 0.41503906, + "step": 2557, + "time_per_iteration": 2.809008836746216 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046392, + "balance_loss_mlp": 1.00485921, + "epoch": 0.4921123509041939, + "flos": 493399613952.0, + "grad_norm": 0.03491285747037794, + "language_loss": 0.8216666, + "learning_rate": 0.0005370395738159773, + "loss": 0.83213049, + "num_input_tokens_seen": 213129984, + "router_z_loss_mlp": 0.41552734, + "step": 2558, + "time_per_iteration": 2.6442172527313232 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047723, + "balance_loss_mlp": 1.00619018, + "epoch": 0.4923047325894575, + "flos": 547208573952.0, + "grad_norm": 0.0376599347248576, + "language_loss": 0.83764005, + "learning_rate": 0.0005367288803406003, + "loss": 0.84811723, + "num_input_tokens_seen": 213199184, + "router_z_loss_mlp": 0.41552734, + "step": 2559, + "time_per_iteration": 2.6496431827545166 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044601, + "balance_loss_mlp": 1.00299704, + "epoch": 0.49249711427472104, + "flos": 597590012160.0, + "grad_norm": 0.034513710641845094, + "language_loss": 0.81748044, + "learning_rate": 0.0005364181726058073, + "loss": 0.8279264, + "num_input_tokens_seen": 213272480, + "router_z_loss_mlp": 0.41625977, + "step": 2560, + "time_per_iteration": 2.677976608276367 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046275, + "balance_loss_mlp": 1.0049566, + "epoch": 0.4926894959599846, + "flos": 498809187072.0, + "grad_norm": 0.0360523922041074, + "language_loss": 0.83156157, + "learning_rate": 0.0005361074507322261, + "loss": 0.84202433, + "num_input_tokens_seen": 213338704, + "router_z_loss_mlp": 0.41333008, + "step": 2561, + "time_per_iteration": 2.5902929306030273 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047121, + "balance_loss_mlp": 1.00575542, + "epoch": 0.49288187764524816, + "flos": 537183611136.0, + "grad_norm": 0.03594243708601782, + "language_loss": 0.81942439, + "learning_rate": 0.000535796714840489, + "loss": 0.82989568, + "num_input_tokens_seen": 213406016, + "router_z_loss_mlp": 0.41381836, + "step": 2562, + "time_per_iteration": 2.6181418895721436 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047955, + "balance_loss_mlp": 1.00658977, + "epoch": 0.49307425933051174, + "flos": 642713521920.0, + "grad_norm": 0.03700989683335547, + "language_loss": 0.84345794, + "learning_rate": 0.0005354859650512348, + "loss": 0.85393751, + "num_input_tokens_seen": 213474016, + "router_z_loss_mlp": 0.41381836, + "step": 2563, + "time_per_iteration": 2.7921204566955566 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048021, + "balance_loss_mlp": 1.00670326, + "epoch": 0.4932666410157753, + "flos": 517265888256.0, + "grad_norm": 0.0348037560143354, + "language_loss": 0.8771596, + "learning_rate": 0.0005351752014851074, + "loss": 0.88763982, + "num_input_tokens_seen": 213539696, + "router_z_loss_mlp": 0.41333008, + "step": 2564, + "time_per_iteration": 2.602555990219116 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048335, + "balance_loss_mlp": 1.00694537, + "epoch": 0.49345902270103886, + "flos": 602652554496.0, + "grad_norm": 0.04115766537624956, + "language_loss": 0.83900678, + "learning_rate": 0.0005348644242627553, + "loss": 0.84949011, + "num_input_tokens_seen": 213609504, + "router_z_loss_mlp": 0.4140625, + "step": 2565, + "time_per_iteration": 2.7332029342651367 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010522, + "balance_loss_mlp": 1.01195526, + "epoch": 0.49365140438630245, + "flos": 1496984550912.0, + "grad_norm": 0.005471138804527184, + "language_loss": 0.75286627, + "learning_rate": 0.0005345536335048336, + "loss": 0.76338828, + "num_input_tokens_seen": 213846064, + "router_z_loss_mlp": 0.40234375, + "step": 2566, + "time_per_iteration": 4.974903583526611 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051209, + "balance_loss_mlp": 1.00991523, + "epoch": 0.493843786071566, + "flos": 630789133056.0, + "grad_norm": 0.031108020693620165, + "language_loss": 0.8259182, + "learning_rate": 0.0005342428293320013, + "loss": 0.83643031, + "num_input_tokens_seen": 213923216, + "router_z_loss_mlp": 0.41308594, + "step": 2567, + "time_per_iteration": 2.774355173110962 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054719, + "balance_loss_mlp": 1.01332963, + "epoch": 0.49403616775682957, + "flos": 618690745344.0, + "grad_norm": 0.04042101882964004, + "language_loss": 0.84698522, + "learning_rate": 0.0005339320118649238, + "loss": 0.85753244, + "num_input_tokens_seen": 213994096, + "router_z_loss_mlp": 0.4140625, + "step": 2568, + "time_per_iteration": 2.7593345642089844 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050518, + "balance_loss_mlp": 1.0091759, + "epoch": 0.4942285494420931, + "flos": 578814470400.0, + "grad_norm": 0.03306097920847627, + "language_loss": 0.87056893, + "learning_rate": 0.000533621181224271, + "loss": 0.88107407, + "num_input_tokens_seen": 214069104, + "router_z_loss_mlp": 0.41357422, + "step": 2569, + "time_per_iteration": 2.815171957015991 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045042, + "balance_loss_mlp": 1.00358069, + "epoch": 0.4944209311273567, + "flos": 631466664960.0, + "grad_norm": 0.04400973771206172, + "language_loss": 0.82116252, + "learning_rate": 0.0005333103375307182, + "loss": 0.83161294, + "num_input_tokens_seen": 214150368, + "router_z_loss_mlp": 0.41479492, + "step": 2570, + "time_per_iteration": 2.86649227142334 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048831, + "balance_loss_mlp": 1.00751352, + "epoch": 0.4946133128126202, + "flos": 588719869440.0, + "grad_norm": 0.030724614795269025, + "language_loss": 0.86645854, + "learning_rate": 0.0005329994809049451, + "loss": 0.87694681, + "num_input_tokens_seen": 214220112, + "router_z_loss_mlp": 0.41333008, + "step": 2571, + "time_per_iteration": 2.717759847640991 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044319, + "balance_loss_mlp": 1.00297725, + "epoch": 0.4948056944978838, + "flos": 584847140352.0, + "grad_norm": 0.02937251460087377, + "language_loss": 0.88108343, + "learning_rate": 0.0005326886114676375, + "loss": 0.89152658, + "num_input_tokens_seen": 214294480, + "router_z_loss_mlp": 0.41357422, + "step": 2572, + "time_per_iteration": 2.767547369003296 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043439, + "balance_loss_mlp": 1.00207376, + "epoch": 0.49499807618314734, + "flos": 482781689856.0, + "grad_norm": 0.032763972727654474, + "language_loss": 0.88217831, + "learning_rate": 0.0005323777293394854, + "loss": 0.8926127, + "num_input_tokens_seen": 214359568, + "router_z_loss_mlp": 0.41381836, + "step": 2573, + "time_per_iteration": 2.557117223739624 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044617, + "balance_loss_mlp": 1.00318027, + "epoch": 0.4951904578684109, + "flos": 520038232320.0, + "grad_norm": 0.044201740478413694, + "language_loss": 0.82535017, + "learning_rate": 0.000532066834641184, + "loss": 0.83579636, + "num_input_tokens_seen": 214432032, + "router_z_loss_mlp": 0.41455078, + "step": 2574, + "time_per_iteration": 2.6565427780151367 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043705, + "balance_loss_mlp": 1.00202954, + "epoch": 0.4953828395536745, + "flos": 536578010880.0, + "grad_norm": 0.03171877270725238, + "language_loss": 0.85277009, + "learning_rate": 0.0005317559274934334, + "loss": 0.8632071, + "num_input_tokens_seen": 214504096, + "router_z_loss_mlp": 0.41699219, + "step": 2575, + "time_per_iteration": 2.720740795135498 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048187, + "balance_loss_mlp": 1.00653565, + "epoch": 0.49557522123893805, + "flos": 529607294208.0, + "grad_norm": 0.03640176927698583, + "language_loss": 0.81348443, + "learning_rate": 0.0005314450080169382, + "loss": 0.82396632, + "num_input_tokens_seen": 214575920, + "router_z_loss_mlp": 0.41674805, + "step": 2576, + "time_per_iteration": 2.6694118976593018 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048974, + "balance_loss_mlp": 1.00729847, + "epoch": 0.49576760292420163, + "flos": 428918294784.0, + "grad_norm": 0.03343170538339807, + "language_loss": 0.81225574, + "learning_rate": 0.0005311340763324083, + "loss": 0.82274544, + "num_input_tokens_seen": 214641664, + "router_z_loss_mlp": 0.41699219, + "step": 2577, + "time_per_iteration": 2.5676074028015137 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050384, + "balance_loss_mlp": 1.00866091, + "epoch": 0.49595998460946517, + "flos": 566316562176.0, + "grad_norm": 0.031028578783915843, + "language_loss": 0.83262658, + "learning_rate": 0.0005308231325605578, + "loss": 0.84313035, + "num_input_tokens_seen": 214711744, + "router_z_loss_mlp": 0.41748047, + "step": 2578, + "time_per_iteration": 2.6750431060791016 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050534, + "balance_loss_mlp": 1.00893033, + "epoch": 0.49615236629472875, + "flos": 703814951424.0, + "grad_norm": 0.16493684193156796, + "language_loss": 0.7742933, + "learning_rate": 0.0005305121768221061, + "loss": 0.78479862, + "num_input_tokens_seen": 214802256, + "router_z_loss_mlp": 0.41625977, + "step": 2579, + "time_per_iteration": 3.083477020263672 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047222, + "balance_loss_mlp": 1.00688171, + "epoch": 0.4963447479799923, + "flos": 1444755209472.0, + "grad_norm": 0.004557610476670616, + "language_loss": 0.75038326, + "learning_rate": 0.000530201209237777, + "loss": 0.76085544, + "num_input_tokens_seen": 215023648, + "router_z_loss_mlp": 0.40332031, + "step": 2580, + "time_per_iteration": 4.820146083831787 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047965, + "balance_loss_mlp": 1.00602686, + "epoch": 0.49653712966525587, + "flos": 538664074752.0, + "grad_norm": 0.031551785699882776, + "language_loss": 0.92325974, + "learning_rate": 0.0005298902299282984, + "loss": 0.93373942, + "num_input_tokens_seen": 215094080, + "router_z_loss_mlp": 0.41967773, + "step": 2581, + "time_per_iteration": 2.619842529296875 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050513, + "balance_loss_mlp": 1.00840831, + "epoch": 0.4967295113505194, + "flos": 608396519424.0, + "grad_norm": 0.03377113658216861, + "language_loss": 0.8488903, + "learning_rate": 0.0005295792390144033, + "loss": 0.8593955, + "num_input_tokens_seen": 215165456, + "router_z_loss_mlp": 0.42138672, + "step": 2582, + "time_per_iteration": 2.722321033477783 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050969, + "balance_loss_mlp": 1.00872111, + "epoch": 0.496921893035783, + "flos": 475531016448.0, + "grad_norm": 0.04081472802053015, + "language_loss": 0.84166956, + "learning_rate": 0.0005292682366168294, + "loss": 0.85217929, + "num_input_tokens_seen": 215229344, + "router_z_loss_mlp": 0.42285156, + "step": 2583, + "time_per_iteration": 2.5314435958862305 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104609, + "balance_loss_mlp": 1.00393724, + "epoch": 0.4971142747210466, + "flos": 598603881216.0, + "grad_norm": 0.03300753756436905, + "language_loss": 0.80573511, + "learning_rate": 0.0005289572228563181, + "loss": 0.81619596, + "num_input_tokens_seen": 215305616, + "router_z_loss_mlp": 0.421875, + "step": 2584, + "time_per_iteration": 2.7332074642181396 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050666, + "balance_loss_mlp": 1.00846612, + "epoch": 0.4973066564063101, + "flos": 600735631872.0, + "grad_norm": 0.03199938195942058, + "language_loss": 0.83498567, + "learning_rate": 0.000528646197853616, + "loss": 0.8454923, + "num_input_tokens_seen": 215378128, + "router_z_loss_mlp": 0.42236328, + "step": 2585, + "time_per_iteration": 2.748955249786377 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051581, + "balance_loss_mlp": 1.00938058, + "epoch": 0.4974990380915737, + "flos": 650770039296.0, + "grad_norm": 0.03327645798274956, + "language_loss": 0.86559486, + "learning_rate": 0.0005283351617294735, + "loss": 0.87611067, + "num_input_tokens_seen": 215453536, + "router_z_loss_mlp": 0.42236328, + "step": 2586, + "time_per_iteration": 2.9175055027008057 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051655, + "balance_loss_mlp": 1.01093292, + "epoch": 0.49769141977683723, + "flos": 1532442767616.0, + "grad_norm": 0.005920405298637117, + "language_loss": 0.7663666, + "learning_rate": 0.0005280241146046456, + "loss": 0.77688324, + "num_input_tokens_seen": 215689440, + "router_z_loss_mlp": 0.40722656, + "step": 2587, + "time_per_iteration": 4.992246627807617 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051641, + "balance_loss_mlp": 1.00936949, + "epoch": 0.4978838014621008, + "flos": 537398439168.0, + "grad_norm": 0.03485872476270145, + "language_loss": 0.87171799, + "learning_rate": 0.0005277130565998916, + "loss": 0.88223433, + "num_input_tokens_seen": 215759600, + "router_z_loss_mlp": 0.4230957, + "step": 2588, + "time_per_iteration": 2.7742838859558105 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048839, + "balance_loss_mlp": 1.00666261, + "epoch": 0.49807618314736435, + "flos": 540746247936.0, + "grad_norm": 0.02719767735149213, + "language_loss": 0.82424593, + "learning_rate": 0.0005274019878359748, + "loss": 0.83473426, + "num_input_tokens_seen": 215833920, + "router_z_loss_mlp": 0.42211914, + "step": 2589, + "time_per_iteration": 2.7111029624938965 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049239, + "balance_loss_mlp": 1.00699103, + "epoch": 0.49826856483262794, + "flos": 543522482688.0, + "grad_norm": 0.03488772819740132, + "language_loss": 0.87582624, + "learning_rate": 0.0005270909084336628, + "loss": 0.88631868, + "num_input_tokens_seen": 215903616, + "router_z_loss_mlp": 0.42285156, + "step": 2590, + "time_per_iteration": 2.6801702976226807 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051383, + "balance_loss_mlp": 1.00911105, + "epoch": 0.4984609465178915, + "flos": 523361741568.0, + "grad_norm": 0.03538182267925601, + "language_loss": 0.89689445, + "learning_rate": 0.0005267798185137276, + "loss": 0.90740824, + "num_input_tokens_seen": 215974832, + "router_z_loss_mlp": 0.4230957, + "step": 2591, + "time_per_iteration": 2.673933506011963 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048091, + "balance_loss_mlp": 1.00577164, + "epoch": 0.49865332820315506, + "flos": 575705789184.0, + "grad_norm": 0.03191547825845594, + "language_loss": 0.90023857, + "learning_rate": 0.0005264687181969444, + "loss": 0.91071951, + "num_input_tokens_seen": 216045024, + "router_z_loss_mlp": 0.42358398, + "step": 2592, + "time_per_iteration": 2.729825735092163 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047144, + "balance_loss_mlp": 1.00484908, + "epoch": 0.49884570988841864, + "flos": 1015211884032.0, + "grad_norm": 0.03571151562514848, + "language_loss": 0.75975507, + "learning_rate": 0.0005261576076040937, + "loss": 0.77022654, + "num_input_tokens_seen": 216129024, + "router_z_loss_mlp": 0.42333984, + "step": 2593, + "time_per_iteration": 3.284675359725952 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047205, + "balance_loss_mlp": 1.00502849, + "epoch": 0.4990380915736822, + "flos": 560648419584.0, + "grad_norm": 0.032935336602121515, + "language_loss": 0.84734505, + "learning_rate": 0.0005258464868559591, + "loss": 0.85781705, + "num_input_tokens_seen": 216197648, + "router_z_loss_mlp": 0.42211914, + "step": 2594, + "time_per_iteration": 2.638974905014038 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049735, + "balance_loss_mlp": 1.00772595, + "epoch": 0.49923047325894576, + "flos": 499944565248.0, + "grad_norm": 0.031535831762229155, + "language_loss": 0.89198703, + "learning_rate": 0.0005255353560733284, + "loss": 0.90248442, + "num_input_tokens_seen": 216263904, + "router_z_loss_mlp": 0.42041016, + "step": 2595, + "time_per_iteration": 2.5665078163146973 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044674, + "balance_loss_mlp": 1.00414276, + "epoch": 0.4994228549442093, + "flos": 1499790921216.0, + "grad_norm": 0.005502914482473529, + "language_loss": 0.75578642, + "learning_rate": 0.0005252242153769931, + "loss": 0.76623321, + "num_input_tokens_seen": 216493152, + "router_z_loss_mlp": 0.40527344, + "step": 2596, + "time_per_iteration": 4.774062395095825 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050249, + "balance_loss_mlp": 1.0082401, + "epoch": 0.4996152366294729, + "flos": 558514723584.0, + "grad_norm": 0.032060383149289634, + "language_loss": 0.83672047, + "learning_rate": 0.0005249130648877492, + "loss": 0.84722298, + "num_input_tokens_seen": 216567216, + "router_z_loss_mlp": 0.42041016, + "step": 2597, + "time_per_iteration": 2.7558000087738037 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051517, + "balance_loss_mlp": 1.00950754, + "epoch": 0.4998076183147364, + "flos": 416483569920.0, + "grad_norm": 0.036130927396763525, + "language_loss": 0.85007888, + "learning_rate": 0.0005246019047263953, + "loss": 0.86059409, + "num_input_tokens_seen": 216630624, + "router_z_loss_mlp": 0.42041016, + "step": 2598, + "time_per_iteration": 2.4761478900909424 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045154, + "balance_loss_mlp": 1.00300181, + "epoch": 0.5, + "flos": 468326029824.0, + "grad_norm": 0.035928472301153966, + "language_loss": 0.83319026, + "learning_rate": 0.0005242907350137353, + "loss": 0.84364176, + "num_input_tokens_seen": 216696576, + "router_z_loss_mlp": 0.421875, + "step": 2599, + "time_per_iteration": 2.551312208175659 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046402, + "balance_loss_mlp": 1.00439322, + "epoch": 0.5001923816852636, + "flos": 483756675072.0, + "grad_norm": 0.03511658446114867, + "language_loss": 0.79463625, + "learning_rate": 0.0005239795558705754, + "loss": 0.80510032, + "num_input_tokens_seen": 216767584, + "router_z_loss_mlp": 0.42041016, + "step": 2600, + "time_per_iteration": 2.6441214084625244 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044749, + "balance_loss_mlp": 1.00278771, + "epoch": 0.5003847633705272, + "flos": 534856474368.0, + "grad_norm": 0.03015144944524051, + "language_loss": 0.89835393, + "learning_rate": 0.0005236683674177264, + "loss": 0.90880144, + "num_input_tokens_seen": 216834320, + "router_z_loss_mlp": 0.41992188, + "step": 2601, + "time_per_iteration": 2.669487953186035 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049517, + "balance_loss_mlp": 1.00746012, + "epoch": 0.5005771450557907, + "flos": 739056394752.0, + "grad_norm": 0.03236196452732128, + "language_loss": 0.82869333, + "learning_rate": 0.0005233571697760021, + "loss": 0.83918852, + "num_input_tokens_seen": 216907312, + "router_z_loss_mlp": 0.42089844, + "step": 2602, + "time_per_iteration": 2.85748028755188 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044698, + "balance_loss_mlp": 1.00264096, + "epoch": 0.5007695267410542, + "flos": 780307175424.0, + "grad_norm": 0.03720253600362933, + "language_loss": 0.83658135, + "learning_rate": 0.0005230459630662203, + "loss": 0.84702832, + "num_input_tokens_seen": 216979872, + "router_z_loss_mlp": 0.42089844, + "step": 2603, + "time_per_iteration": 2.9300596714019775 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045575, + "balance_loss_mlp": 1.00358939, + "epoch": 0.5009619084263178, + "flos": 624619402752.0, + "grad_norm": 0.038089595528021734, + "language_loss": 0.82175541, + "learning_rate": 0.0005227347474092022, + "loss": 0.83221114, + "num_input_tokens_seen": 217054000, + "router_z_loss_mlp": 0.42016602, + "step": 2604, + "time_per_iteration": 2.7056775093078613 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048274, + "balance_loss_mlp": 1.00621724, + "epoch": 0.5011542901115814, + "flos": 532193000448.0, + "grad_norm": 0.026542730624890497, + "language_loss": 0.84019673, + "learning_rate": 0.0005224235229257724, + "loss": 0.85067946, + "num_input_tokens_seen": 217126784, + "router_z_loss_mlp": 0.42089844, + "step": 2605, + "time_per_iteration": 2.6953065395355225 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048861, + "balance_loss_mlp": 1.00680435, + "epoch": 0.5013466717968449, + "flos": 528628418304.0, + "grad_norm": 0.028335807962849974, + "language_loss": 0.87261045, + "learning_rate": 0.0005221122897367589, + "loss": 0.88309902, + "num_input_tokens_seen": 217203056, + "router_z_loss_mlp": 0.42089844, + "step": 2606, + "time_per_iteration": 2.7901618480682373 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051336, + "balance_loss_mlp": 1.00939834, + "epoch": 0.5015390534821085, + "flos": 567089358336.0, + "grad_norm": 0.03672669743645021, + "language_loss": 0.81618142, + "learning_rate": 0.0005218010479629932, + "loss": 0.82669473, + "num_input_tokens_seen": 217273280, + "router_z_loss_mlp": 0.41967773, + "step": 2607, + "time_per_iteration": 2.6298229694366455 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047474, + "balance_loss_mlp": 1.00551248, + "epoch": 0.5017314351673721, + "flos": 567768835584.0, + "grad_norm": 0.038374388481505664, + "language_loss": 0.82467473, + "learning_rate": 0.0005214897977253102, + "loss": 0.83514941, + "num_input_tokens_seen": 217345568, + "router_z_loss_mlp": 0.41992188, + "step": 2608, + "time_per_iteration": 2.6571240425109863 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044229, + "balance_loss_mlp": 1.00231516, + "epoch": 0.5019238168526357, + "flos": 523387986432.0, + "grad_norm": 0.030375370520194293, + "language_loss": 0.84678638, + "learning_rate": 0.0005211785391445473, + "loss": 0.85722864, + "num_input_tokens_seen": 217422848, + "router_z_loss_mlp": 0.41943359, + "step": 2609, + "time_per_iteration": 2.7354485988616943 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045007, + "balance_loss_mlp": 1.00309336, + "epoch": 0.5021161985378992, + "flos": 642637699584.0, + "grad_norm": 0.0345609683707489, + "language_loss": 0.80034763, + "learning_rate": 0.0005208672723415467, + "loss": 0.81079769, + "num_input_tokens_seen": 217502896, + "router_z_loss_mlp": 0.41943359, + "step": 2610, + "time_per_iteration": 2.8003506660461426 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104625, + "balance_loss_mlp": 1.00431252, + "epoch": 0.5023085802231627, + "flos": 592423457280.0, + "grad_norm": 0.034384432252957974, + "language_loss": 0.79919124, + "learning_rate": 0.0005205559974371525, + "loss": 0.8096537, + "num_input_tokens_seen": 217575072, + "router_z_loss_mlp": 0.41967773, + "step": 2611, + "time_per_iteration": 2.801931142807007 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044798, + "balance_loss_mlp": 1.00283635, + "epoch": 0.5025009619084263, + "flos": 473334137088.0, + "grad_norm": 0.0314075616675113, + "language_loss": 0.83085155, + "learning_rate": 0.0005202447145522123, + "loss": 0.84129953, + "num_input_tokens_seen": 217644976, + "router_z_loss_mlp": 0.41992188, + "step": 2612, + "time_per_iteration": 2.7084405422210693 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104663, + "balance_loss_mlp": 1.00476372, + "epoch": 0.5026933435936899, + "flos": 456077942784.0, + "grad_norm": 0.03248187925620893, + "language_loss": 0.79969329, + "learning_rate": 0.0005199334238075769, + "loss": 0.81015956, + "num_input_tokens_seen": 217712816, + "router_z_loss_mlp": 0.41894531, + "step": 2613, + "time_per_iteration": 2.5416245460510254 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045641, + "balance_loss_mlp": 1.00367975, + "epoch": 0.5028857252789535, + "flos": 492722082048.0, + "grad_norm": 0.030734349084793038, + "language_loss": 0.92369366, + "learning_rate": 0.0005196221253241, + "loss": 0.93415004, + "num_input_tokens_seen": 217780256, + "router_z_loss_mlp": 0.41992188, + "step": 2614, + "time_per_iteration": 2.5504183769226074 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045477, + "balance_loss_mlp": 1.00344431, + "epoch": 0.503078106964217, + "flos": 626731711488.0, + "grad_norm": 0.0333228394962432, + "language_loss": 0.83482671, + "learning_rate": 0.0005193108192226383, + "loss": 0.84528148, + "num_input_tokens_seen": 217848496, + "router_z_loss_mlp": 0.4206543, + "step": 2615, + "time_per_iteration": 2.7415342330932617 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046371, + "balance_loss_mlp": 1.00445676, + "epoch": 0.5032704886494805, + "flos": 580138431744.0, + "grad_norm": 0.028161477664975402, + "language_loss": 0.87796414, + "learning_rate": 0.000518999505624052, + "loss": 0.88842779, + "num_input_tokens_seen": 217919216, + "router_z_loss_mlp": 0.41943359, + "step": 2616, + "time_per_iteration": 2.703958749771118 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044834, + "balance_loss_mlp": 1.00289583, + "epoch": 0.5034628703347441, + "flos": 472846155264.0, + "grad_norm": 0.026579731156649716, + "language_loss": 0.83874726, + "learning_rate": 0.000518688184649203, + "loss": 0.84919554, + "num_input_tokens_seen": 217996096, + "router_z_loss_mlp": 0.41967773, + "step": 2617, + "time_per_iteration": 2.7804102897644043 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046877, + "balance_loss_mlp": 1.00501108, + "epoch": 0.5036552520200077, + "flos": 490813907712.0, + "grad_norm": 0.028739225931260208, + "language_loss": 0.84081781, + "learning_rate": 0.0005183768564189577, + "loss": 0.85128659, + "num_input_tokens_seen": 218063072, + "router_z_loss_mlp": 0.41894531, + "step": 2618, + "time_per_iteration": 2.559967517852783 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049251, + "balance_loss_mlp": 1.00724185, + "epoch": 0.5038476337052713, + "flos": 495216414720.0, + "grad_norm": 0.040417435174145346, + "language_loss": 0.82122672, + "learning_rate": 0.0005180655210541838, + "loss": 0.83171928, + "num_input_tokens_seen": 218131056, + "router_z_loss_mlp": 0.42041016, + "step": 2619, + "time_per_iteration": 2.569495677947998 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046465, + "balance_loss_mlp": 1.00471759, + "epoch": 0.5040400153905348, + "flos": 601740752640.0, + "grad_norm": 0.03616333015321602, + "language_loss": 0.83923668, + "learning_rate": 0.0005177541786757527, + "loss": 0.84970129, + "num_input_tokens_seen": 218203536, + "router_z_loss_mlp": 0.41772461, + "step": 2620, + "time_per_iteration": 2.7744040489196777 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048715, + "balance_loss_mlp": 1.0068723, + "epoch": 0.5042323970757984, + "flos": 812920137984.0, + "grad_norm": 0.03309299686066053, + "language_loss": 0.83304209, + "learning_rate": 0.000517442829404538, + "loss": 0.84352922, + "num_input_tokens_seen": 218283008, + "router_z_loss_mlp": 0.41870117, + "step": 2621, + "time_per_iteration": 2.97257137298584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048042, + "balance_loss_mlp": 1.00610471, + "epoch": 0.504424778761062, + "flos": 628607804928.0, + "grad_norm": 0.035914844760130495, + "language_loss": 0.87778026, + "learning_rate": 0.0005171314733614166, + "loss": 0.88826072, + "num_input_tokens_seen": 218362096, + "router_z_loss_mlp": 0.41967773, + "step": 2622, + "time_per_iteration": 2.8732259273529053 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051056, + "balance_loss_mlp": 1.0091418, + "epoch": 0.5046171604463255, + "flos": 516957741312.0, + "grad_norm": 0.03505567711141955, + "language_loss": 0.79205, + "learning_rate": 0.0005168201106672671, + "loss": 0.80256051, + "num_input_tokens_seen": 218439440, + "router_z_loss_mlp": 0.41943359, + "step": 2623, + "time_per_iteration": 2.773688316345215 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047941, + "balance_loss_mlp": 1.00590754, + "epoch": 0.504809542131589, + "flos": 528853939968.0, + "grad_norm": 0.0377301000829576, + "language_loss": 0.8564831, + "learning_rate": 0.0005165087414429717, + "loss": 0.86696255, + "num_input_tokens_seen": 218505936, + "router_z_loss_mlp": 0.4206543, + "step": 2624, + "time_per_iteration": 2.6755454540252686 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051892, + "balance_loss_mlp": 1.0100261, + "epoch": 0.5050019238168526, + "flos": 555175663104.0, + "grad_norm": 0.03350143092818485, + "language_loss": 0.83751678, + "learning_rate": 0.0005161973658094144, + "loss": 0.84803575, + "num_input_tokens_seen": 218573824, + "router_z_loss_mlp": 0.41894531, + "step": 2625, + "time_per_iteration": 2.6260385513305664 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105232, + "balance_loss_mlp": 1.01057339, + "epoch": 0.5051943055021162, + "flos": 575929365504.0, + "grad_norm": 0.030667351452066165, + "language_loss": 0.83093894, + "learning_rate": 0.000515885983887482, + "loss": 0.84146214, + "num_input_tokens_seen": 218648016, + "router_z_loss_mlp": 0.41772461, + "step": 2626, + "time_per_iteration": 2.7437500953674316 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104859, + "balance_loss_mlp": 1.00686646, + "epoch": 0.5053866871873798, + "flos": 497682557184.0, + "grad_norm": 0.033924054159163435, + "language_loss": 0.84715843, + "learning_rate": 0.0005155745957980636, + "loss": 0.85764432, + "num_input_tokens_seen": 218714128, + "router_z_loss_mlp": 0.41748047, + "step": 2627, + "time_per_iteration": 2.625260353088379 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048009, + "balance_loss_mlp": 1.00638068, + "epoch": 0.5055790688726434, + "flos": 503220442368.0, + "grad_norm": 0.03037314022037546, + "language_loss": 0.89067703, + "learning_rate": 0.000515263201662051, + "loss": 0.90115714, + "num_input_tokens_seen": 218784800, + "router_z_loss_mlp": 0.41650391, + "step": 2628, + "time_per_iteration": 2.68068265914917 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047259, + "balance_loss_mlp": 1.00565541, + "epoch": 0.5057714505579068, + "flos": 846768600576.0, + "grad_norm": 0.031311962044338205, + "language_loss": 0.83074951, + "learning_rate": 0.0005149518016003378, + "loss": 0.84122205, + "num_input_tokens_seen": 218868256, + "router_z_loss_mlp": 0.41625977, + "step": 2629, + "time_per_iteration": 3.208085060119629 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048667, + "balance_loss_mlp": 1.00720644, + "epoch": 0.5059638322431704, + "flos": 498809187072.0, + "grad_norm": 0.03517894489413756, + "language_loss": 0.82677329, + "learning_rate": 0.0005146403957338206, + "loss": 0.83725995, + "num_input_tokens_seen": 218932496, + "router_z_loss_mlp": 0.41479492, + "step": 2630, + "time_per_iteration": 2.5591788291931152 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045923, + "balance_loss_mlp": 1.0044378, + "epoch": 0.506156213928434, + "flos": 619114565376.0, + "grad_norm": 0.029747387185900163, + "language_loss": 0.82375658, + "learning_rate": 0.0005143289841833975, + "loss": 0.83421576, + "num_input_tokens_seen": 219010672, + "router_z_loss_mlp": 0.41503906, + "step": 2631, + "time_per_iteration": 2.8919997215270996 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045386, + "balance_loss_mlp": 1.00394928, + "epoch": 0.5063485956136976, + "flos": 425790171648.0, + "grad_norm": 0.040524041139339724, + "language_loss": 0.82811654, + "learning_rate": 0.0005140175670699696, + "loss": 0.83857036, + "num_input_tokens_seen": 219077104, + "router_z_loss_mlp": 0.41455078, + "step": 2632, + "time_per_iteration": 2.6062378883361816 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045222, + "balance_loss_mlp": 1.0038327, + "epoch": 0.5065409772989612, + "flos": 571070957568.0, + "grad_norm": 0.026263595366118216, + "language_loss": 0.83201623, + "learning_rate": 0.0005137061445144395, + "loss": 0.84246838, + "num_input_tokens_seen": 219164880, + "router_z_loss_mlp": 0.4140625, + "step": 2633, + "time_per_iteration": 2.9138190746307373 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044282, + "balance_loss_mlp": 1.00282133, + "epoch": 0.5067333589842247, + "flos": 629970650112.0, + "grad_norm": 0.032671607566671305, + "language_loss": 0.87714005, + "learning_rate": 0.000513394716637712, + "loss": 0.8875829, + "num_input_tokens_seen": 219237376, + "router_z_loss_mlp": 0.41479492, + "step": 2634, + "time_per_iteration": 2.7618257999420166 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044567, + "balance_loss_mlp": 1.00422668, + "epoch": 0.5069257406694883, + "flos": 1451098938624.0, + "grad_norm": 0.004578936312393245, + "language_loss": 0.79191709, + "learning_rate": 0.0005130832835606946, + "loss": 0.8023628, + "num_input_tokens_seen": 219467632, + "router_z_loss_mlp": 0.40332031, + "step": 2635, + "time_per_iteration": 4.85358738899231 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050199, + "balance_loss_mlp": 1.00869, + "epoch": 0.5071181223547518, + "flos": 640058796288.0, + "grad_norm": 0.03342633817994969, + "language_loss": 0.81428993, + "learning_rate": 0.0005127718454042958, + "loss": 0.82479185, + "num_input_tokens_seen": 219545392, + "router_z_loss_mlp": 0.4152832, + "step": 2636, + "time_per_iteration": 2.8021318912506104 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044553, + "balance_loss_mlp": 1.00304461, + "epoch": 0.5073105040400154, + "flos": 714873225216.0, + "grad_norm": 0.031182962990379204, + "language_loss": 0.85094464, + "learning_rate": 0.0005124604022894269, + "loss": 0.86139023, + "num_input_tokens_seen": 219623104, + "router_z_loss_mlp": 0.4152832, + "step": 2637, + "time_per_iteration": 2.934414863586426 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047203, + "balance_loss_mlp": 1.00676727, + "epoch": 0.5075028857252789, + "flos": 1439614899456.0, + "grad_norm": 0.007557162842452459, + "language_loss": 0.77188224, + "learning_rate": 0.000512148954337001, + "loss": 0.7823543, + "num_input_tokens_seen": 219853328, + "router_z_loss_mlp": 0.40429688, + "step": 2638, + "time_per_iteration": 4.820345878601074 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104601, + "balance_loss_mlp": 1.00435817, + "epoch": 0.5076952674105425, + "flos": 572308402944.0, + "grad_norm": 0.03427455588588844, + "language_loss": 0.83839953, + "learning_rate": 0.0005118375016679325, + "loss": 0.84885961, + "num_input_tokens_seen": 219925024, + "router_z_loss_mlp": 0.41674805, + "step": 2639, + "time_per_iteration": 2.753891706466675 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104483, + "balance_loss_mlp": 1.00327373, + "epoch": 0.5078876490958061, + "flos": 517713040896.0, + "grad_norm": 0.0397313189962262, + "language_loss": 0.81205344, + "learning_rate": 0.0005115260444031382, + "loss": 0.82250178, + "num_input_tokens_seen": 219992752, + "router_z_loss_mlp": 0.41577148, + "step": 2640, + "time_per_iteration": 2.5884034633636475 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104464, + "balance_loss_mlp": 1.0042038, + "epoch": 0.5080800307810697, + "flos": 1587622342656.0, + "grad_norm": 0.00452780467183982, + "language_loss": 0.78731823, + "learning_rate": 0.000511214582663537, + "loss": 0.79776466, + "num_input_tokens_seen": 220224160, + "router_z_loss_mlp": 0.40429688, + "step": 2641, + "time_per_iteration": 5.021141290664673 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048765, + "balance_loss_mlp": 1.0071131, + "epoch": 0.5082724124663333, + "flos": 486187824384.0, + "grad_norm": 0.03665123216497768, + "language_loss": 0.87927556, + "learning_rate": 0.0005109031165700483, + "loss": 0.88976324, + "num_input_tokens_seen": 220289504, + "router_z_loss_mlp": 0.41674805, + "step": 2642, + "time_per_iteration": 2.564768075942993 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044813, + "balance_loss_mlp": 1.00313723, + "epoch": 0.5084647941515967, + "flos": 683443272960.0, + "grad_norm": 0.03222315683418769, + "language_loss": 0.84105259, + "learning_rate": 0.0005105916462435945, + "loss": 0.85150075, + "num_input_tokens_seen": 220361376, + "router_z_loss_mlp": 0.41699219, + "step": 2643, + "time_per_iteration": 2.8432576656341553 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046727, + "balance_loss_mlp": 1.0049082, + "epoch": 0.5086571758368603, + "flos": 549813722112.0, + "grad_norm": 0.031341979306324576, + "language_loss": 0.85911554, + "learning_rate": 0.0005102801718050989, + "loss": 0.86958289, + "num_input_tokens_seen": 220434720, + "router_z_loss_mlp": 0.41845703, + "step": 2644, + "time_per_iteration": 2.7012667655944824 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104828, + "balance_loss_mlp": 1.00658011, + "epoch": 0.5088495575221239, + "flos": 565079116800.0, + "grad_norm": 0.03553781912080262, + "language_loss": 0.89604807, + "learning_rate": 0.0005099686933754867, + "loss": 0.90653086, + "num_input_tokens_seen": 220506208, + "router_z_loss_mlp": 0.41723633, + "step": 2645, + "time_per_iteration": 2.774092197418213 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047355, + "balance_loss_mlp": 1.00551212, + "epoch": 0.5090419392073875, + "flos": 552512189184.0, + "grad_norm": 0.03374447512064937, + "language_loss": 0.84807706, + "learning_rate": 0.0005096572110756845, + "loss": 0.85855055, + "num_input_tokens_seen": 220577456, + "router_z_loss_mlp": 0.41870117, + "step": 2646, + "time_per_iteration": 2.691534996032715 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050517, + "balance_loss_mlp": 1.00857961, + "epoch": 0.509234320892651, + "flos": 568884771840.0, + "grad_norm": 0.0280586539552875, + "language_loss": 0.86222303, + "learning_rate": 0.0005093457250266205, + "loss": 0.87272823, + "num_input_tokens_seen": 220649648, + "router_z_loss_mlp": 0.41967773, + "step": 2647, + "time_per_iteration": 2.669032573699951 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049346, + "balance_loss_mlp": 1.00750375, + "epoch": 0.5094267025779146, + "flos": 583694265600.0, + "grad_norm": 0.03456739808544309, + "language_loss": 0.83707237, + "learning_rate": 0.000509034235349224, + "loss": 0.84756589, + "num_input_tokens_seen": 220721168, + "router_z_loss_mlp": 0.41870117, + "step": 2648, + "time_per_iteration": 2.7174429893493652 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048368, + "balance_loss_mlp": 1.00657344, + "epoch": 0.5096190842631781, + "flos": 593139873024.0, + "grad_norm": 0.03190176036185227, + "language_loss": 0.81830442, + "learning_rate": 0.0005087227421644266, + "loss": 0.82878816, + "num_input_tokens_seen": 220796464, + "router_z_loss_mlp": 0.41821289, + "step": 2649, + "time_per_iteration": 2.730527877807617 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104451, + "balance_loss_mlp": 1.00278723, + "epoch": 0.5098114659484417, + "flos": 514584917760.0, + "grad_norm": 0.03166339002539628, + "language_loss": 0.86503744, + "learning_rate": 0.0005084112455931602, + "loss": 0.87548256, + "num_input_tokens_seen": 220862976, + "router_z_loss_mlp": 0.41748047, + "step": 2650, + "time_per_iteration": 2.588543176651001 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048046, + "balance_loss_mlp": 1.00627494, + "epoch": 0.5100038476337053, + "flos": 485601666048.0, + "grad_norm": 0.03514605484852806, + "language_loss": 0.85810292, + "learning_rate": 0.0005080997457563586, + "loss": 0.86858344, + "num_input_tokens_seen": 220926432, + "router_z_loss_mlp": 0.41796875, + "step": 2651, + "time_per_iteration": 2.547510862350464 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053822, + "balance_loss_mlp": 1.01214612, + "epoch": 0.5101962293189688, + "flos": 462555820032.0, + "grad_norm": 0.03981395249249623, + "language_loss": 0.79794431, + "learning_rate": 0.0005077882427749569, + "loss": 0.80848241, + "num_input_tokens_seen": 220993008, + "router_z_loss_mlp": 0.41699219, + "step": 2652, + "time_per_iteration": 2.5867154598236084 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052855, + "balance_loss_mlp": 1.0111798, + "epoch": 0.5103886110042324, + "flos": 588133711104.0, + "grad_norm": 0.03576387090025985, + "language_loss": 0.8527801, + "learning_rate": 0.0005074767367698913, + "loss": 0.86330867, + "num_input_tokens_seen": 221059248, + "router_z_loss_mlp": 0.41699219, + "step": 2653, + "time_per_iteration": 2.668619155883789 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052633, + "balance_loss_mlp": 1.01083803, + "epoch": 0.510580992689496, + "flos": 846679172352.0, + "grad_norm": 0.03324234024932545, + "language_loss": 0.84336531, + "learning_rate": 0.0005071652278620988, + "loss": 0.85389161, + "num_input_tokens_seen": 221133712, + "router_z_loss_mlp": 0.41821289, + "step": 2654, + "time_per_iteration": 3.0502736568450928 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052226, + "balance_loss_mlp": 1.01043141, + "epoch": 0.5107733743747596, + "flos": 659811268608.0, + "grad_norm": 0.033221976859431776, + "language_loss": 0.83371234, + "learning_rate": 0.0005068537161725186, + "loss": 0.84423465, + "num_input_tokens_seen": 221202192, + "router_z_loss_mlp": 0.41821289, + "step": 2655, + "time_per_iteration": 2.7732832431793213 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049493, + "balance_loss_mlp": 1.00784123, + "epoch": 0.510965756060023, + "flos": 702961475328.0, + "grad_norm": 0.03652104464060243, + "language_loss": 0.84970605, + "learning_rate": 0.0005065422018220893, + "loss": 0.860201, + "num_input_tokens_seen": 221277104, + "router_z_loss_mlp": 0.41674805, + "step": 2656, + "time_per_iteration": 2.8670201301574707 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045888, + "balance_loss_mlp": 1.00430822, + "epoch": 0.5111581377452866, + "flos": 560941982208.0, + "grad_norm": 0.03459233510222537, + "language_loss": 0.80690587, + "learning_rate": 0.0005062306849317521, + "loss": 0.81736469, + "num_input_tokens_seen": 221352320, + "router_z_loss_mlp": 0.41601562, + "step": 2657, + "time_per_iteration": 2.8002302646636963 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043485, + "balance_loss_mlp": 1.00202358, + "epoch": 0.5113505194305502, + "flos": 610146246144.0, + "grad_norm": 0.03554743150534212, + "language_loss": 0.83936596, + "learning_rate": 0.0005059191656224487, + "loss": 0.84980083, + "num_input_tokens_seen": 221421056, + "router_z_loss_mlp": 0.41479492, + "step": 2658, + "time_per_iteration": 2.716935157775879 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045233, + "balance_loss_mlp": 1.0037955, + "epoch": 0.5115429011158138, + "flos": 535535951616.0, + "grad_norm": 0.03199868953010379, + "language_loss": 0.89635181, + "learning_rate": 0.0005056076440151212, + "loss": 0.90680414, + "num_input_tokens_seen": 221492064, + "router_z_loss_mlp": 0.41455078, + "step": 2659, + "time_per_iteration": 2.6661012172698975 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042381, + "balance_loss_mlp": 1.0019455, + "epoch": 0.5117352828010774, + "flos": 1365275813376.0, + "grad_norm": 0.005851878799964376, + "language_loss": 0.76288116, + "learning_rate": 0.0005052961202307133, + "loss": 0.773305, + "num_input_tokens_seen": 221724672, + "router_z_loss_mlp": 0.40429688, + "step": 2660, + "time_per_iteration": 4.8821775913238525 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047021, + "balance_loss_mlp": 1.00551248, + "epoch": 0.5119276644863409, + "flos": 634931125248.0, + "grad_norm": 0.030472593638878876, + "language_loss": 0.87624103, + "learning_rate": 0.0005049845943901691, + "loss": 0.88671124, + "num_input_tokens_seen": 221800144, + "router_z_loss_mlp": 0.4152832, + "step": 2661, + "time_per_iteration": 2.868314743041992 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045807, + "balance_loss_mlp": 1.00434649, + "epoch": 0.5121200461716044, + "flos": 586781559552.0, + "grad_norm": 0.035240788892260635, + "language_loss": 0.87104362, + "learning_rate": 0.0005046730666144338, + "loss": 0.88150167, + "num_input_tokens_seen": 221877168, + "router_z_loss_mlp": 0.41479492, + "step": 2662, + "time_per_iteration": 2.7716057300567627 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044648, + "balance_loss_mlp": 1.00323498, + "epoch": 0.512312427856868, + "flos": 1034224608000.0, + "grad_norm": 0.027938837780362106, + "language_loss": 0.8826527, + "learning_rate": 0.0005043615370244532, + "loss": 0.89309919, + "num_input_tokens_seen": 221964208, + "router_z_loss_mlp": 0.41430664, + "step": 2663, + "time_per_iteration": 3.4280622005462646 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046261, + "balance_loss_mlp": 1.00611115, + "epoch": 0.5125048095421316, + "flos": 1540901729280.0, + "grad_norm": 0.006786755652655265, + "language_loss": 0.78244388, + "learning_rate": 0.0005040500057411736, + "loss": 0.7929064, + "num_input_tokens_seen": 222179264, + "router_z_loss_mlp": 0.40136719, + "step": 2664, + "time_per_iteration": 4.68994140625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045195, + "balance_loss_mlp": 1.003901, + "epoch": 0.5126971912273951, + "flos": 592328193024.0, + "grad_norm": 0.02608573212926663, + "language_loss": 0.86075294, + "learning_rate": 0.0005037384728855425, + "loss": 0.87120485, + "num_input_tokens_seen": 222259504, + "router_z_loss_mlp": 0.41308594, + "step": 2665, + "time_per_iteration": 2.7917027473449707 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046893, + "balance_loss_mlp": 1.00552762, + "epoch": 0.5128895729126587, + "flos": 552718268928.0, + "grad_norm": 0.03821611985083245, + "language_loss": 0.85252321, + "learning_rate": 0.0005034269385785075, + "loss": 0.86299217, + "num_input_tokens_seen": 222330512, + "router_z_loss_mlp": 0.41381836, + "step": 2666, + "time_per_iteration": 2.63472318649292 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047467, + "balance_loss_mlp": 1.00605392, + "epoch": 0.5130819545979223, + "flos": 482232470016.0, + "grad_norm": 0.03834683208397515, + "language_loss": 0.85133517, + "learning_rate": 0.0005031154029410168, + "loss": 0.86180985, + "num_input_tokens_seen": 222394000, + "router_z_loss_mlp": 0.41430664, + "step": 2667, + "time_per_iteration": 2.517110824584961 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049696, + "balance_loss_mlp": 1.00837803, + "epoch": 0.5132743362831859, + "flos": 476768461824.0, + "grad_norm": 0.033096203996997774, + "language_loss": 0.87656248, + "learning_rate": 0.0005028038660940197, + "loss": 0.88705945, + "num_input_tokens_seen": 222459344, + "router_z_loss_mlp": 0.41333008, + "step": 2668, + "time_per_iteration": 2.5096347332000732 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105007, + "balance_loss_mlp": 1.00870478, + "epoch": 0.5134667179684494, + "flos": 504903095040.0, + "grad_norm": 0.028882778070319505, + "language_loss": 0.84998578, + "learning_rate": 0.0005024923281584648, + "loss": 0.86048645, + "num_input_tokens_seen": 222528912, + "router_z_loss_mlp": 0.41381836, + "step": 2669, + "time_per_iteration": 2.6474804878234863 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048807, + "balance_loss_mlp": 1.0076561, + "epoch": 0.5136590996537129, + "flos": 505005162240.0, + "grad_norm": 0.03165719334287126, + "language_loss": 0.8319236, + "learning_rate": 0.0005021807892553026, + "loss": 0.84241164, + "num_input_tokens_seen": 222604704, + "router_z_loss_mlp": 0.41162109, + "step": 2670, + "time_per_iteration": 2.7183725833892822 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044063, + "balance_loss_mlp": 1.00269723, + "epoch": 0.5138514813389765, + "flos": 625800467712.0, + "grad_norm": 0.030310171756311025, + "language_loss": 0.85420138, + "learning_rate": 0.0005018692495054828, + "loss": 0.86464202, + "num_input_tokens_seen": 222677888, + "router_z_loss_mlp": 0.41381836, + "step": 2671, + "time_per_iteration": 2.772813081741333 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043444, + "balance_loss_mlp": 1.00224543, + "epoch": 0.5140438630242401, + "flos": 584634257664.0, + "grad_norm": 0.030896406933945995, + "language_loss": 0.80988181, + "learning_rate": 0.0005015577090299561, + "loss": 0.82031626, + "num_input_tokens_seen": 222751936, + "router_z_loss_mlp": 0.41210938, + "step": 2672, + "time_per_iteration": 2.6667463779449463 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049805, + "balance_loss_mlp": 1.00858212, + "epoch": 0.5142362447095037, + "flos": 488905733376.0, + "grad_norm": 0.032429697018958814, + "language_loss": 0.87124586, + "learning_rate": 0.0005012461679496729, + "loss": 0.88174391, + "num_input_tokens_seen": 222819616, + "router_z_loss_mlp": 0.41235352, + "step": 2673, + "time_per_iteration": 2.6442089080810547 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104556, + "balance_loss_mlp": 1.00431406, + "epoch": 0.5144286263947672, + "flos": 527885757696.0, + "grad_norm": 0.03122591363863073, + "language_loss": 0.88052714, + "learning_rate": 0.0005009346263855848, + "loss": 0.89098281, + "num_input_tokens_seen": 222888448, + "router_z_loss_mlp": 0.41259766, + "step": 2674, + "time_per_iteration": 2.602527379989624 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048679, + "balance_loss_mlp": 1.00736094, + "epoch": 0.5146210080800308, + "flos": 487590520320.0, + "grad_norm": 0.029060606816111258, + "language_loss": 0.84209937, + "learning_rate": 0.0005006230844586422, + "loss": 0.85258621, + "num_input_tokens_seen": 222964736, + "router_z_loss_mlp": 0.41333008, + "step": 2675, + "time_per_iteration": 2.8685102462768555 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043387, + "balance_loss_mlp": 1.00216484, + "epoch": 0.5148133897652943, + "flos": 516975237888.0, + "grad_norm": 0.028587045609365692, + "language_loss": 0.79492688, + "learning_rate": 0.0005003115422897968, + "loss": 0.80536079, + "num_input_tokens_seen": 223040944, + "router_z_loss_mlp": 0.41235352, + "step": 2676, + "time_per_iteration": 2.765714168548584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01041586, + "balance_loss_mlp": 1.00024414, + "epoch": 0.5150057714505579, + "flos": 512212094208.0, + "grad_norm": 0.033131913333961045, + "language_loss": 0.87827182, + "learning_rate": 0.0005, + "loss": 0.88868773, + "num_input_tokens_seen": 223109632, + "router_z_loss_mlp": 0.41357422, + "step": 2677, + "time_per_iteration": 2.705502986907959 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047696, + "balance_loss_mlp": 1.00623488, + "epoch": 0.5151981531358215, + "flos": 912391133952.0, + "grad_norm": 0.03328612222334398, + "language_loss": 0.79844034, + "learning_rate": 0.0004996884577102033, + "loss": 0.80891728, + "num_input_tokens_seen": 223191648, + "router_z_loss_mlp": 0.41479492, + "step": 2678, + "time_per_iteration": 3.112602949142456 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049572, + "balance_loss_mlp": 1.00801528, + "epoch": 0.515390534821085, + "flos": 472930725888.0, + "grad_norm": 0.03414850275815592, + "language_loss": 0.85192269, + "learning_rate": 0.000499376915541358, + "loss": 0.86241841, + "num_input_tokens_seen": 223265920, + "router_z_loss_mlp": 0.41577148, + "step": 2679, + "time_per_iteration": 2.732088565826416 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046331, + "balance_loss_mlp": 1.00475073, + "epoch": 0.5155829165063486, + "flos": 651358142976.0, + "grad_norm": 0.0316115868451719, + "language_loss": 0.81490767, + "learning_rate": 0.0004990653736144155, + "loss": 0.82537097, + "num_input_tokens_seen": 223340688, + "router_z_loss_mlp": 0.41601562, + "step": 2680, + "time_per_iteration": 2.9006052017211914 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104584, + "balance_loss_mlp": 1.00425994, + "epoch": 0.5157752981916122, + "flos": 415161553920.0, + "grad_norm": 0.034873868180568895, + "language_loss": 0.86566359, + "learning_rate": 0.0004987538320503271, + "loss": 0.876122, + "num_input_tokens_seen": 223404064, + "router_z_loss_mlp": 0.41601562, + "step": 2681, + "time_per_iteration": 2.5385584831237793 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049108, + "balance_loss_mlp": 1.00750434, + "epoch": 0.5159676798768758, + "flos": 554932644864.0, + "grad_norm": 0.03448939758068617, + "language_loss": 0.83127022, + "learning_rate": 0.0004984422909700442, + "loss": 0.84176129, + "num_input_tokens_seen": 223476784, + "router_z_loss_mlp": 0.41625977, + "step": 2682, + "time_per_iteration": 2.7167794704437256 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105107, + "balance_loss_mlp": 1.00944197, + "epoch": 0.5161600615621393, + "flos": 587621429760.0, + "grad_norm": 0.033752660754493145, + "language_loss": 0.84206975, + "learning_rate": 0.0004981307504945173, + "loss": 0.85258043, + "num_input_tokens_seen": 223542832, + "router_z_loss_mlp": 0.41650391, + "step": 2683, + "time_per_iteration": 2.6896650791168213 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050267, + "balance_loss_mlp": 1.00856805, + "epoch": 0.5163524432474028, + "flos": 589948566528.0, + "grad_norm": 0.03498305011402451, + "language_loss": 0.90086776, + "learning_rate": 0.0004978192107446976, + "loss": 0.9113704, + "num_input_tokens_seen": 223617968, + "router_z_loss_mlp": 0.41723633, + "step": 2684, + "time_per_iteration": 2.7550315856933594 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046286, + "balance_loss_mlp": 1.00456297, + "epoch": 0.5165448249326664, + "flos": 504905040384.0, + "grad_norm": 0.03233825392148911, + "language_loss": 0.87956327, + "learning_rate": 0.0004975076718415353, + "loss": 0.89002615, + "num_input_tokens_seen": 223689504, + "router_z_loss_mlp": 0.41748047, + "step": 2685, + "time_per_iteration": 2.5969831943511963 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046859, + "balance_loss_mlp": 1.00515938, + "epoch": 0.51673720661793, + "flos": 417647138304.0, + "grad_norm": 0.0327603501643271, + "language_loss": 0.91275072, + "learning_rate": 0.0004971961339059806, + "loss": 0.9232192, + "num_input_tokens_seen": 223752288, + "router_z_loss_mlp": 0.41723633, + "step": 2686, + "time_per_iteration": 2.488780975341797 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048798, + "balance_loss_mlp": 1.00714636, + "epoch": 0.5169295883031936, + "flos": 600075596544.0, + "grad_norm": 0.03249247039046824, + "language_loss": 0.84663117, + "learning_rate": 0.0004968845970589832, + "loss": 0.8571192, + "num_input_tokens_seen": 223822304, + "router_z_loss_mlp": 0.41674805, + "step": 2687, + "time_per_iteration": 2.7266340255737305 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047672, + "balance_loss_mlp": 1.00597274, + "epoch": 0.517121969988457, + "flos": 557911068672.0, + "grad_norm": 0.03510688251477249, + "language_loss": 0.85442108, + "learning_rate": 0.0004965730614214926, + "loss": 0.86489779, + "num_input_tokens_seen": 223888592, + "router_z_loss_mlp": 0.41723633, + "step": 2688, + "time_per_iteration": 2.669203758239746 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048775, + "balance_loss_mlp": 1.00721848, + "epoch": 0.5173143516737206, + "flos": 470375155200.0, + "grad_norm": 0.031768698442390816, + "language_loss": 0.85484231, + "learning_rate": 0.0004962615271144576, + "loss": 0.86533004, + "num_input_tokens_seen": 223952880, + "router_z_loss_mlp": 0.41577148, + "step": 2689, + "time_per_iteration": 2.508864164352417 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047195, + "balance_loss_mlp": 1.00578225, + "epoch": 0.5175067333589842, + "flos": 721379292672.0, + "grad_norm": 0.036604011276375, + "language_loss": 0.83442801, + "learning_rate": 0.0004959499942588264, + "loss": 0.84490001, + "num_input_tokens_seen": 224030000, + "router_z_loss_mlp": 0.41430664, + "step": 2690, + "time_per_iteration": 2.937147617340088 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054985, + "balance_loss_mlp": 1.01473999, + "epoch": 0.5176991150442478, + "flos": 1469344702464.0, + "grad_norm": 0.008104040921495323, + "language_loss": 0.78200024, + "learning_rate": 0.0004956384629755469, + "loss": 0.79255009, + "num_input_tokens_seen": 224252384, + "router_z_loss_mlp": 0.40234375, + "step": 2691, + "time_per_iteration": 4.793481111526489 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047518, + "balance_loss_mlp": 1.00593746, + "epoch": 0.5178914967295114, + "flos": 613784705280.0, + "grad_norm": 0.029651978346564224, + "language_loss": 0.85819978, + "learning_rate": 0.0004953269333855661, + "loss": 0.86867493, + "num_input_tokens_seen": 224324640, + "router_z_loss_mlp": 0.41601562, + "step": 2692, + "time_per_iteration": 2.7456183433532715 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054253, + "balance_loss_mlp": 1.01293516, + "epoch": 0.5180838784147749, + "flos": 501981051648.0, + "grad_norm": 0.03275547277888071, + "language_loss": 0.85017627, + "learning_rate": 0.0004950154056098309, + "loss": 0.86071873, + "num_input_tokens_seen": 224398368, + "router_z_loss_mlp": 0.41333008, + "step": 2693, + "time_per_iteration": 2.710204839706421 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052407, + "balance_loss_mlp": 1.01108897, + "epoch": 0.5182762601000385, + "flos": 690042659328.0, + "grad_norm": 0.03430000909694698, + "language_loss": 0.84476924, + "learning_rate": 0.0004947038797692867, + "loss": 0.85529327, + "num_input_tokens_seen": 224465456, + "router_z_loss_mlp": 0.41333008, + "step": 2694, + "time_per_iteration": 2.846104860305786 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053132, + "balance_loss_mlp": 1.01169479, + "epoch": 0.518468641785302, + "flos": 666801427200.0, + "grad_norm": 0.031372779584062496, + "language_loss": 0.77936417, + "learning_rate": 0.0004943923559848789, + "loss": 0.78989553, + "num_input_tokens_seen": 224540960, + "router_z_loss_mlp": 0.41455078, + "step": 2695, + "time_per_iteration": 2.780346155166626 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054261, + "balance_loss_mlp": 1.01303816, + "epoch": 0.5186610234705656, + "flos": 567814522368.0, + "grad_norm": 0.025403978054072948, + "language_loss": 0.9097802, + "learning_rate": 0.0004940808343775515, + "loss": 0.92032284, + "num_input_tokens_seen": 224613200, + "router_z_loss_mlp": 0.41235352, + "step": 2696, + "time_per_iteration": 2.6940221786499023 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052298, + "balance_loss_mlp": 1.01093256, + "epoch": 0.5188534051558291, + "flos": 429793158144.0, + "grad_norm": 0.033988353521974116, + "language_loss": 0.8254481, + "learning_rate": 0.0004937693150682479, + "loss": 0.83597112, + "num_input_tokens_seen": 224677456, + "router_z_loss_mlp": 0.41381836, + "step": 2697, + "time_per_iteration": 2.5146913528442383 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048575, + "balance_loss_mlp": 1.00725734, + "epoch": 0.5190457868410927, + "flos": 547412708352.0, + "grad_norm": 0.031596370266791504, + "language_loss": 0.77111042, + "learning_rate": 0.0004934577981779107, + "loss": 0.78159618, + "num_input_tokens_seen": 224745600, + "router_z_loss_mlp": 0.41333008, + "step": 2698, + "time_per_iteration": 2.6567137241363525 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044661, + "balance_loss_mlp": 1.00327134, + "epoch": 0.5192381685263563, + "flos": 549746648064.0, + "grad_norm": 0.029705122804042017, + "language_loss": 0.81764138, + "learning_rate": 0.0004931462838274817, + "loss": 0.82808805, + "num_input_tokens_seen": 224826944, + "router_z_loss_mlp": 0.4140625, + "step": 2699, + "time_per_iteration": 2.817087173461914 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050227, + "balance_loss_mlp": 1.00895715, + "epoch": 0.5194305502116199, + "flos": 576350273280.0, + "grad_norm": 0.03619468074242637, + "language_loss": 0.84569639, + "learning_rate": 0.0004928347721379011, + "loss": 0.85619867, + "num_input_tokens_seen": 224895280, + "router_z_loss_mlp": 0.4128418, + "step": 2700, + "time_per_iteration": 2.6439361572265625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049587, + "balance_loss_mlp": 1.00831699, + "epoch": 0.5196229318968835, + "flos": 435218282496.0, + "grad_norm": 0.03299749227833017, + "language_loss": 0.82266027, + "learning_rate": 0.0004925232632301089, + "loss": 0.83315617, + "num_input_tokens_seen": 224961632, + "router_z_loss_mlp": 0.4128418, + "step": 2701, + "time_per_iteration": 2.5564098358154297 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045313, + "balance_loss_mlp": 1.00409007, + "epoch": 0.5198153135821469, + "flos": 559986438912.0, + "grad_norm": 0.03181007655018395, + "language_loss": 0.79940033, + "learning_rate": 0.0004922117572250431, + "loss": 0.80985349, + "num_input_tokens_seen": 225032816, + "router_z_loss_mlp": 0.41235352, + "step": 2702, + "time_per_iteration": 2.651662826538086 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048783, + "balance_loss_mlp": 1.00763237, + "epoch": 0.5200076952674105, + "flos": 566835646464.0, + "grad_norm": 0.030877309828348475, + "language_loss": 0.81538028, + "learning_rate": 0.0004919002542436414, + "loss": 0.82586813, + "num_input_tokens_seen": 225112736, + "router_z_loss_mlp": 0.41162109, + "step": 2703, + "time_per_iteration": 2.829218864440918 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051433, + "balance_loss_mlp": 1.01028192, + "epoch": 0.5202000769526741, + "flos": 572273409792.0, + "grad_norm": 0.031996161034096735, + "language_loss": 0.81638157, + "learning_rate": 0.0004915887544068399, + "loss": 0.82689589, + "num_input_tokens_seen": 225182672, + "router_z_loss_mlp": 0.41162109, + "step": 2704, + "time_per_iteration": 2.6583306789398193 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052276, + "balance_loss_mlp": 1.01110101, + "epoch": 0.5203924586379377, + "flos": 695467783680.0, + "grad_norm": 0.03456723160752419, + "language_loss": 0.7851603, + "learning_rate": 0.0004912772578355736, + "loss": 0.79568309, + "num_input_tokens_seen": 225260272, + "router_z_loss_mlp": 0.41186523, + "step": 2705, + "time_per_iteration": 2.9061107635498047 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051355, + "balance_loss_mlp": 1.01010871, + "epoch": 0.5205848403232012, + "flos": 567691067904.0, + "grad_norm": 0.03253184462937942, + "language_loss": 0.83445644, + "learning_rate": 0.000490965764650776, + "loss": 0.84497005, + "num_input_tokens_seen": 225337120, + "router_z_loss_mlp": 0.41259766, + "step": 2706, + "time_per_iteration": 2.8724799156188965 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051791, + "balance_loss_mlp": 1.01042521, + "epoch": 0.5207772220084648, + "flos": 1216205913600.0, + "grad_norm": 0.03130848752928153, + "language_loss": 0.83192623, + "learning_rate": 0.0004906542749733798, + "loss": 0.84244412, + "num_input_tokens_seen": 225433984, + "router_z_loss_mlp": 0.41381836, + "step": 2707, + "time_per_iteration": 3.6585958003997803 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049141, + "balance_loss_mlp": 1.00770402, + "epoch": 0.5209696036937284, + "flos": 594032232960.0, + "grad_norm": 0.02732760694007456, + "language_loss": 0.85709697, + "learning_rate": 0.0004903427889243156, + "loss": 0.86758834, + "num_input_tokens_seen": 225512112, + "router_z_loss_mlp": 0.41455078, + "step": 2708, + "time_per_iteration": 2.871150016784668 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044527, + "balance_loss_mlp": 1.00294721, + "epoch": 0.5211619853789919, + "flos": 523956648192.0, + "grad_norm": 0.03352920522422817, + "language_loss": 0.85979593, + "learning_rate": 0.0004900313066245134, + "loss": 0.87024117, + "num_input_tokens_seen": 225586944, + "router_z_loss_mlp": 0.41601562, + "step": 2709, + "time_per_iteration": 2.6438417434692383 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104781, + "balance_loss_mlp": 1.00632536, + "epoch": 0.5213543670642555, + "flos": 503861035776.0, + "grad_norm": 0.03205745002268137, + "language_loss": 0.81327069, + "learning_rate": 0.0004897198281949012, + "loss": 0.82374883, + "num_input_tokens_seen": 225657184, + "router_z_loss_mlp": 0.41503906, + "step": 2710, + "time_per_iteration": 2.693906307220459 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049613, + "balance_loss_mlp": 1.00800931, + "epoch": 0.521546748749519, + "flos": 587072209920.0, + "grad_norm": 0.036857631666753196, + "language_loss": 0.78204525, + "learning_rate": 0.0004894083537564057, + "loss": 0.79254138, + "num_input_tokens_seen": 225729968, + "router_z_loss_mlp": 0.41625977, + "step": 2711, + "time_per_iteration": 2.7300491333007812 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045008, + "balance_loss_mlp": 1.00333273, + "epoch": 0.5217391304347826, + "flos": 571266343680.0, + "grad_norm": 0.030696577254243577, + "language_loss": 0.81681752, + "learning_rate": 0.0004890968834299519, + "loss": 0.82726759, + "num_input_tokens_seen": 225801808, + "router_z_loss_mlp": 0.41699219, + "step": 2712, + "time_per_iteration": 2.746556043624878 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049825, + "balance_loss_mlp": 1.00831652, + "epoch": 0.5219315121200462, + "flos": 543920057856.0, + "grad_norm": 0.028956363679279982, + "language_loss": 0.79082847, + "learning_rate": 0.0004887854173364633, + "loss": 0.80132675, + "num_input_tokens_seen": 225878576, + "router_z_loss_mlp": 0.4152832, + "step": 2713, + "time_per_iteration": 2.733306884765625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051945, + "balance_loss_mlp": 1.01045978, + "epoch": 0.5221238938053098, + "flos": 551531367936.0, + "grad_norm": 0.030815907554272836, + "language_loss": 0.82228422, + "learning_rate": 0.0004884739555968617, + "loss": 0.83280361, + "num_input_tokens_seen": 225960096, + "router_z_loss_mlp": 0.41503906, + "step": 2714, + "time_per_iteration": 2.815034866333008 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054211, + "balance_loss_mlp": 1.01425171, + "epoch": 0.5223162754905732, + "flos": 1358392579584.0, + "grad_norm": 0.009025254493072253, + "language_loss": 0.78977054, + "learning_rate": 0.0004881624983320676, + "loss": 0.80031264, + "num_input_tokens_seen": 226184960, + "router_z_loss_mlp": 0.39941406, + "step": 2715, + "time_per_iteration": 5.005860090255737 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047232, + "balance_loss_mlp": 1.00550854, + "epoch": 0.5225086571758368, + "flos": 568974200064.0, + "grad_norm": 0.030755982791586634, + "language_loss": 0.87142956, + "learning_rate": 0.0004878510456629992, + "loss": 0.88190192, + "num_input_tokens_seen": 226271328, + "router_z_loss_mlp": 0.41748047, + "step": 2716, + "time_per_iteration": 2.9582624435424805 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048687, + "balance_loss_mlp": 1.00713038, + "epoch": 0.5227010388611004, + "flos": 501136323840.0, + "grad_norm": 0.03155972783921746, + "language_loss": 0.85419679, + "learning_rate": 0.00048753959771057314, + "loss": 0.86468375, + "num_input_tokens_seen": 226340080, + "router_z_loss_mlp": 0.41577148, + "step": 2717, + "time_per_iteration": 2.623081684112549 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104979, + "balance_loss_mlp": 1.00832856, + "epoch": 0.522893420546364, + "flos": 598799267328.0, + "grad_norm": 0.035176839616525644, + "language_loss": 0.83230948, + "learning_rate": 0.0004872281545957044, + "loss": 0.84280741, + "num_input_tokens_seen": 226415120, + "router_z_loss_mlp": 0.41479492, + "step": 2718, + "time_per_iteration": 2.7231285572052 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059587, + "balance_loss_mlp": 1.01800716, + "epoch": 0.5230858022316276, + "flos": 665922673152.0, + "grad_norm": 0.03224340083556492, + "language_loss": 0.86415994, + "learning_rate": 0.0004869167164393055, + "loss": 0.8747558, + "num_input_tokens_seen": 226501200, + "router_z_loss_mlp": 0.41601562, + "step": 2719, + "time_per_iteration": 2.9305646419525146 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054772, + "balance_loss_mlp": 1.0132159, + "epoch": 0.5232781839168911, + "flos": 605034126336.0, + "grad_norm": 0.0287825993415993, + "language_loss": 0.89917624, + "learning_rate": 0.00048660528336228793, + "loss": 0.909724, + "num_input_tokens_seen": 226582064, + "router_z_loss_mlp": 0.41577148, + "step": 2720, + "time_per_iteration": 2.788072347640991 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049748, + "balance_loss_mlp": 1.0080725, + "epoch": 0.5234705656021547, + "flos": 551841460224.0, + "grad_norm": 0.02763684671666484, + "language_loss": 0.90116215, + "learning_rate": 0.0004862938554855606, + "loss": 0.91165972, + "num_input_tokens_seen": 226656448, + "router_z_loss_mlp": 0.41699219, + "step": 2721, + "time_per_iteration": 2.775818109512329 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051445, + "balance_loss_mlp": 1.00965011, + "epoch": 0.5236629472874182, + "flos": 505295812608.0, + "grad_norm": 0.03601660428487822, + "language_loss": 0.86817378, + "learning_rate": 0.0004859824329300304, + "loss": 0.87868822, + "num_input_tokens_seen": 226725568, + "router_z_loss_mlp": 0.41821289, + "step": 2722, + "time_per_iteration": 2.587228536605835 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053208, + "balance_loss_mlp": 1.01138973, + "epoch": 0.5238553289726818, + "flos": 548697785856.0, + "grad_norm": 0.03170706554102953, + "language_loss": 0.83958352, + "learning_rate": 0.00048567101581660244, + "loss": 0.85011566, + "num_input_tokens_seen": 226795728, + "router_z_loss_mlp": 0.41845703, + "step": 2723, + "time_per_iteration": 2.6208062171936035 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050325, + "balance_loss_mlp": 1.00843501, + "epoch": 0.5240477106579453, + "flos": 533004680448.0, + "grad_norm": 0.03335820140898581, + "language_loss": 0.87488234, + "learning_rate": 0.00048535960426617956, + "loss": 0.88538557, + "num_input_tokens_seen": 226865344, + "router_z_loss_mlp": 0.41918945, + "step": 2724, + "time_per_iteration": 2.5951199531555176 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050726, + "balance_loss_mlp": 1.00883543, + "epoch": 0.5242400923432089, + "flos": 619090265856.0, + "grad_norm": 0.03212273913620546, + "language_loss": 0.8244487, + "learning_rate": 0.0004850481983996621, + "loss": 0.83495593, + "num_input_tokens_seen": 226936800, + "router_z_loss_mlp": 0.41918945, + "step": 2725, + "time_per_iteration": 2.747008800506592 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049694, + "balance_loss_mlp": 1.00785124, + "epoch": 0.5244324740284725, + "flos": 417590757888.0, + "grad_norm": 0.03280670580990367, + "language_loss": 0.88229245, + "learning_rate": 0.0004847367983379492, + "loss": 0.89278936, + "num_input_tokens_seen": 226998448, + "router_z_loss_mlp": 0.41870117, + "step": 2726, + "time_per_iteration": 2.437721014022827 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049333, + "balance_loss_mlp": 1.00770533, + "epoch": 0.5246248557137361, + "flos": 627732941568.0, + "grad_norm": 0.03120006141405487, + "language_loss": 0.79435945, + "learning_rate": 0.00048442540420193643, + "loss": 0.80485278, + "num_input_tokens_seen": 227081872, + "router_z_loss_mlp": 0.41650391, + "step": 2727, + "time_per_iteration": 2.927518844604492 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105055, + "balance_loss_mlp": 1.00911331, + "epoch": 0.5248172373989997, + "flos": 1250403352320.0, + "grad_norm": 0.03663625191481743, + "language_loss": 0.7991612, + "learning_rate": 0.0004841140161125182, + "loss": 0.80966663, + "num_input_tokens_seen": 227167744, + "router_z_loss_mlp": 0.41455078, + "step": 2728, + "time_per_iteration": 3.574690818786621 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053819, + "balance_loss_mlp": 1.01250064, + "epoch": 0.5250096190842631, + "flos": 507883464192.0, + "grad_norm": 0.03360211420143325, + "language_loss": 0.85387456, + "learning_rate": 0.0004838026341905857, + "loss": 0.86441278, + "num_input_tokens_seen": 227239136, + "router_z_loss_mlp": 0.41333008, + "step": 2729, + "time_per_iteration": 2.7263481616973877 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046844, + "balance_loss_mlp": 1.00547838, + "epoch": 0.5252020007695267, + "flos": 612508376064.0, + "grad_norm": 0.029211194306351093, + "language_loss": 0.85320604, + "learning_rate": 0.00048349125855702844, + "loss": 0.86367452, + "num_input_tokens_seen": 227311968, + "router_z_loss_mlp": 0.41381836, + "step": 2730, + "time_per_iteration": 2.775851011276245 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047856, + "balance_loss_mlp": 1.00649071, + "epoch": 0.5253943824547903, + "flos": 540292292352.0, + "grad_norm": 0.02938539212610817, + "language_loss": 0.81675971, + "learning_rate": 0.00048317988933273287, + "loss": 0.82723826, + "num_input_tokens_seen": 227385248, + "router_z_loss_mlp": 0.41381836, + "step": 2731, + "time_per_iteration": 2.7763831615448 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047356, + "balance_loss_mlp": 1.00613368, + "epoch": 0.5255867641400539, + "flos": 699338567424.0, + "grad_norm": 0.033934632058623626, + "language_loss": 0.82549971, + "learning_rate": 0.00048286852663858367, + "loss": 0.83597326, + "num_input_tokens_seen": 227464640, + "router_z_loss_mlp": 0.41235352, + "step": 2732, + "time_per_iteration": 2.96213698387146 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052513, + "balance_loss_mlp": 1.01131439, + "epoch": 0.5257791458253175, + "flos": 668549208576.0, + "grad_norm": 0.03297641476237434, + "language_loss": 0.84432375, + "learning_rate": 0.000482557170595462, + "loss": 0.85484892, + "num_input_tokens_seen": 227542192, + "router_z_loss_mlp": 0.41210938, + "step": 2733, + "time_per_iteration": 2.840514659881592 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050533, + "balance_loss_mlp": 1.00943005, + "epoch": 0.525971527510581, + "flos": 484605293568.0, + "grad_norm": 0.032410991276381265, + "language_loss": 0.88272679, + "learning_rate": 0.0004822458213242475, + "loss": 0.89323211, + "num_input_tokens_seen": 227606096, + "router_z_loss_mlp": 0.41113281, + "step": 2734, + "time_per_iteration": 2.560474157333374 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047172, + "balance_loss_mlp": 1.00613987, + "epoch": 0.5261639091958445, + "flos": 831348648960.0, + "grad_norm": 0.03341440797603734, + "language_loss": 0.86630881, + "learning_rate": 0.00048193447894581627, + "loss": 0.87678051, + "num_input_tokens_seen": 227689552, + "router_z_loss_mlp": 0.41040039, + "step": 2735, + "time_per_iteration": 3.1240243911743164 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105002, + "balance_loss_mlp": 1.00886869, + "epoch": 0.5263562908811081, + "flos": 521733523968.0, + "grad_norm": 0.03226346413051534, + "language_loss": 0.88327318, + "learning_rate": 0.00048162314358104243, + "loss": 0.89377338, + "num_input_tokens_seen": 227760784, + "router_z_loss_mlp": 0.41162109, + "step": 2736, + "time_per_iteration": 2.599510669708252 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047062, + "balance_loss_mlp": 1.00581563, + "epoch": 0.5265486725663717, + "flos": 576098506752.0, + "grad_norm": 0.03477073688653673, + "language_loss": 0.84006953, + "learning_rate": 0.0004813118153507969, + "loss": 0.85054016, + "num_input_tokens_seen": 227834304, + "router_z_loss_mlp": 0.41259766, + "step": 2737, + "time_per_iteration": 2.7309916019439697 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01057175, + "balance_loss_mlp": 1.0173111, + "epoch": 0.5267410542516352, + "flos": 1550561186304.0, + "grad_norm": 0.008968329145720436, + "language_loss": 0.82447124, + "learning_rate": 0.0004810004943759482, + "loss": 0.83504307, + "num_input_tokens_seen": 228057232, + "router_z_loss_mlp": 0.3984375, + "step": 2738, + "time_per_iteration": 4.815824747085571 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054104, + "balance_loss_mlp": 1.01311994, + "epoch": 0.5269334359368988, + "flos": 931462183680.0, + "grad_norm": 0.03276977156640091, + "language_loss": 0.84196591, + "learning_rate": 0.00048068918077736163, + "loss": 0.85250694, + "num_input_tokens_seen": 228140816, + "router_z_loss_mlp": 0.40991211, + "step": 2739, + "time_per_iteration": 3.2470173835754395 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051244, + "balance_loss_mlp": 1.01004505, + "epoch": 0.5271258176221624, + "flos": 656635513344.0, + "grad_norm": 0.03436954846361053, + "language_loss": 0.82138938, + "learning_rate": 0.0004803778746759001, + "loss": 0.83190179, + "num_input_tokens_seen": 228216208, + "router_z_loss_mlp": 0.41210938, + "step": 2740, + "time_per_iteration": 2.920330286026001 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051176, + "balance_loss_mlp": 1.01007247, + "epoch": 0.527318199307426, + "flos": 544062954240.0, + "grad_norm": 0.045913237701965745, + "language_loss": 0.82631075, + "learning_rate": 0.00048006657619242317, + "loss": 0.83682251, + "num_input_tokens_seen": 228283184, + "router_z_loss_mlp": 0.41113281, + "step": 2741, + "time_per_iteration": 2.612001419067383 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045962, + "balance_loss_mlp": 1.00462067, + "epoch": 0.5275105809926895, + "flos": 448899201024.0, + "grad_norm": 0.036563153452021165, + "language_loss": 0.78434455, + "learning_rate": 0.00047975528544778775, + "loss": 0.7948041, + "num_input_tokens_seen": 228351328, + "router_z_loss_mlp": 0.41357422, + "step": 2742, + "time_per_iteration": 2.590146064758301 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042619, + "balance_loss_mlp": 1.00130069, + "epoch": 0.527702962677953, + "flos": 580053861120.0, + "grad_norm": 0.038221984800347206, + "language_loss": 0.89132345, + "learning_rate": 0.00047944400256284754, + "loss": 0.90174961, + "num_input_tokens_seen": 228423632, + "router_z_loss_mlp": 0.41333008, + "step": 2743, + "time_per_iteration": 2.691096305847168 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046686, + "balance_loss_mlp": 1.00548708, + "epoch": 0.5278953443632166, + "flos": 654010923264.0, + "grad_norm": 0.03476413811576821, + "language_loss": 0.80653423, + "learning_rate": 0.0004791327276584532, + "loss": 0.8170011, + "num_input_tokens_seen": 228498736, + "router_z_loss_mlp": 0.41210938, + "step": 2744, + "time_per_iteration": 2.8089282512664795 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048783, + "balance_loss_mlp": 1.00753677, + "epoch": 0.5280877260484802, + "flos": 515049566976.0, + "grad_norm": 0.03187296499214836, + "language_loss": 0.81036532, + "learning_rate": 0.00047882146085545264, + "loss": 0.82085317, + "num_input_tokens_seen": 228569056, + "router_z_loss_mlp": 0.41259766, + "step": 2745, + "time_per_iteration": 2.646883010864258 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055283, + "balance_loss_mlp": 1.01541901, + "epoch": 0.5282801077337438, + "flos": 1448715421440.0, + "grad_norm": 0.006687794222264933, + "language_loss": 0.75402379, + "learning_rate": 0.00047851020227469, + "loss": 0.76457667, + "num_input_tokens_seen": 228800560, + "router_z_loss_mlp": 0.3984375, + "step": 2746, + "time_per_iteration": 4.967897653579712 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048206, + "balance_loss_mlp": 1.00703144, + "epoch": 0.5284724894190073, + "flos": 605967315456.0, + "grad_norm": 0.03667028691338261, + "language_loss": 0.80105197, + "learning_rate": 0.00047819895203700684, + "loss": 0.81153399, + "num_input_tokens_seen": 228869216, + "router_z_loss_mlp": 0.41186523, + "step": 2747, + "time_per_iteration": 2.7146098613739014 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105164, + "balance_loss_mlp": 1.01187134, + "epoch": 0.5286648711042709, + "flos": 1498106323200.0, + "grad_norm": 0.006729060992495368, + "language_loss": 0.75512433, + "learning_rate": 0.0004778877102632412, + "loss": 0.76564074, + "num_input_tokens_seen": 229085520, + "router_z_loss_mlp": 0.39746094, + "step": 2748, + "time_per_iteration": 4.6327197551727295 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045337, + "balance_loss_mlp": 1.00416195, + "epoch": 0.5288572527895344, + "flos": 598834260480.0, + "grad_norm": 0.03692084834433464, + "language_loss": 0.89385319, + "learning_rate": 0.0004775764770742277, + "loss": 0.90430653, + "num_input_tokens_seen": 229160912, + "router_z_loss_mlp": 0.41186523, + "step": 2749, + "time_per_iteration": 2.807567834854126 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045141, + "balance_loss_mlp": 1.00394237, + "epoch": 0.529049634474798, + "flos": 558440846592.0, + "grad_norm": 0.03911259999059639, + "language_loss": 0.87067056, + "learning_rate": 0.00047726525259079777, + "loss": 0.88112199, + "num_input_tokens_seen": 229235792, + "router_z_loss_mlp": 0.41210938, + "step": 2750, + "time_per_iteration": 2.7838735580444336 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044638, + "balance_loss_mlp": 1.00348663, + "epoch": 0.5292420161600616, + "flos": 582435432960.0, + "grad_norm": 0.03406590895995427, + "language_loss": 0.89342177, + "learning_rate": 0.0004769540369337798, + "loss": 0.9038682, + "num_input_tokens_seen": 229309984, + "router_z_loss_mlp": 0.41162109, + "step": 2751, + "time_per_iteration": 2.716430902481079 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010453, + "balance_loss_mlp": 1.00405395, + "epoch": 0.5294343978453251, + "flos": 609564945408.0, + "grad_norm": 0.0303004693379624, + "language_loss": 0.8646909, + "learning_rate": 0.00047664283022399794, + "loss": 0.87514395, + "num_input_tokens_seen": 229394000, + "router_z_loss_mlp": 0.41259766, + "step": 2752, + "time_per_iteration": 2.8746426105499268 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048518, + "balance_loss_mlp": 1.00736678, + "epoch": 0.5296267795305887, + "flos": 647710935552.0, + "grad_norm": 0.032209809873809676, + "language_loss": 0.81781971, + "learning_rate": 0.00047633163258227376, + "loss": 0.82830489, + "num_input_tokens_seen": 229474320, + "router_z_loss_mlp": 0.41162109, + "step": 2753, + "time_per_iteration": 2.859628677368164 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048376, + "balance_loss_mlp": 1.0070343, + "epoch": 0.5298191612158523, + "flos": 560806867200.0, + "grad_norm": 0.034095977821307535, + "language_loss": 0.85918152, + "learning_rate": 0.0004760204441294247, + "loss": 0.86966527, + "num_input_tokens_seen": 229543072, + "router_z_loss_mlp": 0.41357422, + "step": 2754, + "time_per_iteration": 2.642761707305908 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049622, + "balance_loss_mlp": 1.00842357, + "epoch": 0.5300115429011159, + "flos": 515132192256.0, + "grad_norm": 0.03324074908377848, + "language_loss": 0.86806327, + "learning_rate": 0.00047570926498626486, + "loss": 0.87855953, + "num_input_tokens_seen": 229615296, + "router_z_loss_mlp": 0.41210938, + "step": 2755, + "time_per_iteration": 2.688204765319824 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048264, + "balance_loss_mlp": 1.00699341, + "epoch": 0.5302039245863793, + "flos": 674050155264.0, + "grad_norm": 0.032282959747224574, + "language_loss": 0.82332271, + "learning_rate": 0.00047539809527360474, + "loss": 0.83380532, + "num_input_tokens_seen": 229693728, + "router_z_loss_mlp": 0.4128418, + "step": 2756, + "time_per_iteration": 2.891369104385376 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051726, + "balance_loss_mlp": 1.01052761, + "epoch": 0.5303963062716429, + "flos": 732157609728.0, + "grad_norm": 0.027910460797545535, + "language_loss": 0.82830453, + "learning_rate": 0.0004750869351122511, + "loss": 0.83882177, + "num_input_tokens_seen": 229772144, + "router_z_loss_mlp": 0.41210938, + "step": 2757, + "time_per_iteration": 2.9782614707946777 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051325, + "balance_loss_mlp": 1.01015055, + "epoch": 0.5305886879569065, + "flos": 574552914432.0, + "grad_norm": 0.03118318769242836, + "language_loss": 0.82440865, + "learning_rate": 0.00047477578462300685, + "loss": 0.83492196, + "num_input_tokens_seen": 229847024, + "router_z_loss_mlp": 0.41186523, + "step": 2758, + "time_per_iteration": 2.7210254669189453 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104635, + "balance_loss_mlp": 1.00498474, + "epoch": 0.5307810696421701, + "flos": 696729528576.0, + "grad_norm": 0.03181982217221047, + "language_loss": 0.79867083, + "learning_rate": 0.0004744646439266718, + "loss": 0.8091343, + "num_input_tokens_seen": 229932416, + "router_z_loss_mlp": 0.41381836, + "step": 2759, + "time_per_iteration": 2.997299909591675 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046488, + "balance_loss_mlp": 1.005265, + "epoch": 0.5309734513274337, + "flos": 650203322880.0, + "grad_norm": 0.04897119780065821, + "language_loss": 0.92728293, + "learning_rate": 0.000474153513144041, + "loss": 0.93774784, + "num_input_tokens_seen": 230010976, + "router_z_loss_mlp": 0.41235352, + "step": 2760, + "time_per_iteration": 2.9030909538269043 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047525, + "balance_loss_mlp": 1.00618315, + "epoch": 0.5311658330126972, + "flos": 606056743680.0, + "grad_norm": 0.03383323202633534, + "language_loss": 0.87311566, + "learning_rate": 0.00047384239239590633, + "loss": 0.88359094, + "num_input_tokens_seen": 230093344, + "router_z_loss_mlp": 0.41357422, + "step": 2761, + "time_per_iteration": 2.8522770404815674 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049914, + "balance_loss_mlp": 1.00859571, + "epoch": 0.5313582146979607, + "flos": 559317655296.0, + "grad_norm": 0.03320129260812799, + "language_loss": 0.89026552, + "learning_rate": 0.0004735312818030556, + "loss": 0.90076458, + "num_input_tokens_seen": 230165520, + "router_z_loss_mlp": 0.41333008, + "step": 2762, + "time_per_iteration": 2.6917500495910645 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045201, + "balance_loss_mlp": 1.00390708, + "epoch": 0.5315505963832243, + "flos": 509446553088.0, + "grad_norm": 0.032512052220750494, + "language_loss": 0.8324827, + "learning_rate": 0.0004732201814862727, + "loss": 0.84293473, + "num_input_tokens_seen": 230237808, + "router_z_loss_mlp": 0.41308594, + "step": 2763, + "time_per_iteration": 2.7620086669921875 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045981, + "balance_loss_mlp": 1.00461555, + "epoch": 0.5317429780684879, + "flos": 627669758208.0, + "grad_norm": 0.03302669202039023, + "language_loss": 0.81508183, + "learning_rate": 0.0004729090915663373, + "loss": 0.82554156, + "num_input_tokens_seen": 230321568, + "router_z_loss_mlp": 0.41381836, + "step": 2764, + "time_per_iteration": 2.827430248260498 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044748, + "balance_loss_mlp": 1.00333464, + "epoch": 0.5319353597537514, + "flos": 477699705600.0, + "grad_norm": 0.039772813062738895, + "language_loss": 0.85676539, + "learning_rate": 0.00047259801216402534, + "loss": 0.86721289, + "num_input_tokens_seen": 230385376, + "router_z_loss_mlp": 0.41430664, + "step": 2765, + "time_per_iteration": 2.5082104206085205 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104674, + "balance_loss_mlp": 1.00535059, + "epoch": 0.532127741439015, + "flos": 502634284032.0, + "grad_norm": 0.03926492526470634, + "language_loss": 0.86841261, + "learning_rate": 0.00047228694340010845, + "loss": 0.87888008, + "num_input_tokens_seen": 230449760, + "router_z_loss_mlp": 0.4140625, + "step": 2766, + "time_per_iteration": 2.549739360809326 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047042, + "balance_loss_mlp": 1.00555718, + "epoch": 0.5323201231242786, + "flos": 1166484510720.0, + "grad_norm": 0.033303639033777616, + "language_loss": 0.86118937, + "learning_rate": 0.0004719758853953544, + "loss": 0.87165976, + "num_input_tokens_seen": 230536592, + "router_z_loss_mlp": 0.41503906, + "step": 2767, + "time_per_iteration": 3.5872445106506348 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050417, + "balance_loss_mlp": 1.00888503, + "epoch": 0.5325125048095422, + "flos": 379541977344.0, + "grad_norm": 0.045646551162954616, + "language_loss": 0.84812796, + "learning_rate": 0.00047166483827052645, + "loss": 0.85863209, + "num_input_tokens_seen": 230596688, + "router_z_loss_mlp": 0.41552734, + "step": 2768, + "time_per_iteration": 2.4177846908569336 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01057545, + "balance_loss_mlp": 1.01796722, + "epoch": 0.5327048864948057, + "flos": 1544750147328.0, + "grad_norm": 0.015563445131555704, + "language_loss": 0.77078491, + "learning_rate": 0.00047135380214638413, + "loss": 0.78136033, + "num_input_tokens_seen": 230829408, + "router_z_loss_mlp": 0.39550781, + "step": 2769, + "time_per_iteration": 4.974437236785889 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045958, + "balance_loss_mlp": 1.00447309, + "epoch": 0.5328972681800692, + "flos": 912862586112.0, + "grad_norm": 0.03252924413682995, + "language_loss": 0.84066141, + "learning_rate": 0.000471042777143682, + "loss": 0.85112101, + "num_input_tokens_seen": 230912528, + "router_z_loss_mlp": 0.41503906, + "step": 2770, + "time_per_iteration": 3.204782724380493 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104834, + "balance_loss_mlp": 1.00680697, + "epoch": 0.5330896498653328, + "flos": 474851539200.0, + "grad_norm": 0.03462661973501109, + "language_loss": 0.80093729, + "learning_rate": 0.0004707317633831707, + "loss": 0.81142068, + "num_input_tokens_seen": 230979424, + "router_z_loss_mlp": 0.41552734, + "step": 2771, + "time_per_iteration": 2.566772699356079 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049499, + "balance_loss_mlp": 1.00789511, + "epoch": 0.5332820315505964, + "flos": 502634284032.0, + "grad_norm": 0.03484250248812788, + "language_loss": 0.78787035, + "learning_rate": 0.00047042076098559673, + "loss": 0.79836535, + "num_input_tokens_seen": 231046416, + "router_z_loss_mlp": 0.41625977, + "step": 2772, + "time_per_iteration": 2.5929906368255615 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046414, + "balance_loss_mlp": 1.00454724, + "epoch": 0.53347441323586, + "flos": 926033168640.0, + "grad_norm": 0.038112679556298976, + "language_loss": 0.74248701, + "learning_rate": 0.00047010977007170174, + "loss": 0.75295115, + "num_input_tokens_seen": 231136064, + "router_z_loss_mlp": 0.41894531, + "step": 2773, + "time_per_iteration": 3.221947193145752 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051169, + "balance_loss_mlp": 1.00956452, + "epoch": 0.5336667949211235, + "flos": 575540538624.0, + "grad_norm": 0.03388488907034337, + "language_loss": 0.83005095, + "learning_rate": 0.00046979879076222334, + "loss": 0.8405627, + "num_input_tokens_seen": 231203616, + "router_z_loss_mlp": 0.41625977, + "step": 2774, + "time_per_iteration": 2.7014822959899902 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049211, + "balance_loss_mlp": 1.00767875, + "epoch": 0.533859176606387, + "flos": 1066392363264.0, + "grad_norm": 0.03095569704566717, + "language_loss": 0.85300922, + "learning_rate": 0.0004694878231778939, + "loss": 0.86350143, + "num_input_tokens_seen": 231287008, + "router_z_loss_mlp": 0.41552734, + "step": 2775, + "time_per_iteration": 3.368795156478882 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048588, + "balance_loss_mlp": 1.00700808, + "epoch": 0.5340515582916506, + "flos": 747907095552.0, + "grad_norm": 0.030429614039409136, + "language_loss": 0.84799051, + "learning_rate": 0.0004691768674394423, + "loss": 0.8584764, + "num_input_tokens_seen": 231365296, + "router_z_loss_mlp": 0.41601562, + "step": 2776, + "time_per_iteration": 2.958280324935913 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052456, + "balance_loss_mlp": 1.01230621, + "epoch": 0.5342439399769142, + "flos": 1448821379328.0, + "grad_norm": 0.012202915272427423, + "language_loss": 0.84484011, + "learning_rate": 0.0004688659236675918, + "loss": 0.85536468, + "num_input_tokens_seen": 231579040, + "router_z_loss_mlp": 0.40136719, + "step": 2777, + "time_per_iteration": 4.774897575378418 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049236, + "balance_loss_mlp": 1.00908661, + "epoch": 0.5344363216621778, + "flos": 1430699069952.0, + "grad_norm": 0.005918596107012712, + "language_loss": 0.76653534, + "learning_rate": 0.00046855499198306187, + "loss": 0.77702767, + "num_input_tokens_seen": 231812736, + "router_z_loss_mlp": 0.40136719, + "step": 2778, + "time_per_iteration": 4.978635549545288 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051694, + "balance_loss_mlp": 1.01039958, + "epoch": 0.5346287033474413, + "flos": 528676050432.0, + "grad_norm": 0.029867236989907914, + "language_loss": 0.79874206, + "learning_rate": 0.00046824407250656676, + "loss": 0.80925894, + "num_input_tokens_seen": 231883840, + "router_z_loss_mlp": 0.41308594, + "step": 2779, + "time_per_iteration": 2.610321044921875 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049124, + "balance_loss_mlp": 1.00790143, + "epoch": 0.5348210850327049, + "flos": 511756193280.0, + "grad_norm": 0.03028632537310572, + "language_loss": 0.83974576, + "learning_rate": 0.0004679331653588161, + "loss": 0.85023701, + "num_input_tokens_seen": 231955360, + "router_z_loss_mlp": 0.41235352, + "step": 2780, + "time_per_iteration": 2.641401529312134 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046669, + "balance_loss_mlp": 1.00530362, + "epoch": 0.5350134667179685, + "flos": 463626069504.0, + "grad_norm": 0.032724184133620285, + "language_loss": 0.86073065, + "learning_rate": 0.0004676222706605147, + "loss": 0.87119734, + "num_input_tokens_seen": 232027088, + "router_z_loss_mlp": 0.41381836, + "step": 2781, + "time_per_iteration": 2.6093719005584717 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046994, + "balance_loss_mlp": 1.005795, + "epoch": 0.535205848403232, + "flos": 710118829824.0, + "grad_norm": 0.033538440780340566, + "language_loss": 0.85521388, + "learning_rate": 0.0004673113885323626, + "loss": 0.86568379, + "num_input_tokens_seen": 232099472, + "router_z_loss_mlp": 0.41210938, + "step": 2782, + "time_per_iteration": 2.8278369903564453 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044719, + "balance_loss_mlp": 1.00337684, + "epoch": 0.5353982300884956, + "flos": 895793029632.0, + "grad_norm": 0.03115315889801346, + "language_loss": 0.79367262, + "learning_rate": 0.00046700051909505494, + "loss": 0.80411977, + "num_input_tokens_seen": 232182528, + "router_z_loss_mlp": 0.41357422, + "step": 2783, + "time_per_iteration": 3.181025743484497 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045329, + "balance_loss_mlp": 1.00410628, + "epoch": 0.5355906117737591, + "flos": 537025163520.0, + "grad_norm": 0.03272022966866855, + "language_loss": 0.84359205, + "learning_rate": 0.000466689662469282, + "loss": 0.85404533, + "num_input_tokens_seen": 232253344, + "router_z_loss_mlp": 0.41235352, + "step": 2784, + "time_per_iteration": 2.623128890991211 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045278, + "balance_loss_mlp": 1.00419891, + "epoch": 0.5357829934590227, + "flos": 870328673280.0, + "grad_norm": 0.0344669350963294, + "language_loss": 0.84610772, + "learning_rate": 0.00046637881877572917, + "loss": 0.85656047, + "num_input_tokens_seen": 232337232, + "router_z_loss_mlp": 0.41088867, + "step": 2785, + "time_per_iteration": 3.079174757003784 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010433, + "balance_loss_mlp": 1.00229168, + "epoch": 0.5359753751442863, + "flos": 554446608384.0, + "grad_norm": 0.028858393123854686, + "language_loss": 0.85135722, + "learning_rate": 0.0004660679881350764, + "loss": 0.86179018, + "num_input_tokens_seen": 232412864, + "router_z_loss_mlp": 0.41015625, + "step": 2786, + "time_per_iteration": 2.7473020553588867 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01041367, + "balance_loss_mlp": 1.00150299, + "epoch": 0.5361677568295499, + "flos": 1483759533312.0, + "grad_norm": 0.0067453290840893895, + "language_loss": 0.75608146, + "learning_rate": 0.0004657571706679988, + "loss": 0.76649511, + "num_input_tokens_seen": 232639888, + "router_z_loss_mlp": 0.3984375, + "step": 2787, + "time_per_iteration": 5.041473627090454 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043788, + "balance_loss_mlp": 1.0027802, + "epoch": 0.5363601385148133, + "flos": 807642767616.0, + "grad_norm": 0.03504389904677532, + "language_loss": 0.78613555, + "learning_rate": 0.0004654463664951667, + "loss": 0.79657346, + "num_input_tokens_seen": 232719248, + "router_z_loss_mlp": 0.41015625, + "step": 2788, + "time_per_iteration": 2.9798529148101807 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048738, + "balance_loss_mlp": 1.00775349, + "epoch": 0.5365525202000769, + "flos": 508879836672.0, + "grad_norm": 0.03320853792290129, + "language_loss": 0.8327626, + "learning_rate": 0.0004651355757372447, + "loss": 0.84325004, + "num_input_tokens_seen": 232788464, + "router_z_loss_mlp": 0.40991211, + "step": 2789, + "time_per_iteration": 2.643827438354492 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048286, + "balance_loss_mlp": 1.00720656, + "epoch": 0.5367449018853405, + "flos": 530015563008.0, + "grad_norm": 0.032066447391342436, + "language_loss": 0.8626231, + "learning_rate": 0.00046482479851489274, + "loss": 0.87310588, + "num_input_tokens_seen": 232859792, + "router_z_loss_mlp": 0.41088867, + "step": 2790, + "time_per_iteration": 2.7637765407562256 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046715, + "balance_loss_mlp": 1.0056597, + "epoch": 0.5369372835706041, + "flos": 651217191936.0, + "grad_norm": 0.038515792328953954, + "language_loss": 0.78515691, + "learning_rate": 0.00046451403494876525, + "loss": 0.79562402, + "num_input_tokens_seen": 232941472, + "router_z_loss_mlp": 0.41064453, + "step": 2791, + "time_per_iteration": 2.9090025424957275 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046127, + "balance_loss_mlp": 1.00504696, + "epoch": 0.5371296652558677, + "flos": 585628684800.0, + "grad_norm": 0.03231753899308558, + "language_loss": 0.84747189, + "learning_rate": 0.0004642032851595111, + "loss": 0.85793316, + "num_input_tokens_seen": 233017120, + "router_z_loss_mlp": 0.41088867, + "step": 2792, + "time_per_iteration": 2.775444507598877 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048329, + "balance_loss_mlp": 1.00717819, + "epoch": 0.5373220469411312, + "flos": 597084533760.0, + "grad_norm": 0.03483653357210067, + "language_loss": 0.85361469, + "learning_rate": 0.00046389254926777404, + "loss": 0.86409795, + "num_input_tokens_seen": 233095408, + "router_z_loss_mlp": 0.41162109, + "step": 2793, + "time_per_iteration": 2.8168118000030518 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045027, + "balance_loss_mlp": 1.00378096, + "epoch": 0.5375144286263948, + "flos": 1116279016704.0, + "grad_norm": 0.03171846878783484, + "language_loss": 0.78282589, + "learning_rate": 0.0004635818273941926, + "loss": 0.79327619, + "num_input_tokens_seen": 233191056, + "router_z_loss_mlp": 0.41259766, + "step": 2794, + "time_per_iteration": 3.5206284523010254 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044308, + "balance_loss_mlp": 1.00301409, + "epoch": 0.5377068103116583, + "flos": 596769583872.0, + "grad_norm": 0.0416500636560626, + "language_loss": 0.82705241, + "learning_rate": 0.0004632711196593997, + "loss": 0.83749551, + "num_input_tokens_seen": 233265536, + "router_z_loss_mlp": 0.41308594, + "step": 2795, + "time_per_iteration": 2.81925892829895 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010512, + "balance_loss_mlp": 1.0100255, + "epoch": 0.5378991919969219, + "flos": 885650448384.0, + "grad_norm": 0.03764518727969069, + "language_loss": 0.85939819, + "learning_rate": 0.00046296042618402297, + "loss": 0.86991024, + "num_input_tokens_seen": 233348224, + "router_z_loss_mlp": 0.41186523, + "step": 2796, + "time_per_iteration": 3.076819658279419 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047791, + "balance_loss_mlp": 1.00666356, + "epoch": 0.5380915736821854, + "flos": 711951181824.0, + "grad_norm": 0.02842771896049368, + "language_loss": 0.79539001, + "learning_rate": 0.0004626497470886839, + "loss": 0.80586791, + "num_input_tokens_seen": 233429344, + "router_z_loss_mlp": 0.41137695, + "step": 2797, + "time_per_iteration": 2.9846107959747314 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049616, + "balance_loss_mlp": 1.00844073, + "epoch": 0.538283955367449, + "flos": 558115203072.0, + "grad_norm": 0.029565541443496178, + "language_loss": 0.82388103, + "learning_rate": 0.00046233908249399897, + "loss": 0.83437717, + "num_input_tokens_seen": 233504944, + "router_z_loss_mlp": 0.41186523, + "step": 2798, + "time_per_iteration": 2.7782254219055176 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053826, + "balance_loss_mlp": 1.01255548, + "epoch": 0.5384763370527126, + "flos": 514482850560.0, + "grad_norm": 0.03320479864481119, + "language_loss": 0.78804994, + "learning_rate": 0.00046202843252057905, + "loss": 0.79858828, + "num_input_tokens_seen": 233573072, + "router_z_loss_mlp": 0.4128418, + "step": 2799, + "time_per_iteration": 2.60296368598938 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051127, + "balance_loss_mlp": 1.00985634, + "epoch": 0.5386687187379762, + "flos": 490720588800.0, + "grad_norm": 0.036707180351256564, + "language_loss": 0.84230787, + "learning_rate": 0.00046171779728902896, + "loss": 0.8528192, + "num_input_tokens_seen": 233640896, + "router_z_loss_mlp": 0.4128418, + "step": 2800, + "time_per_iteration": 2.5585505962371826 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046392, + "balance_loss_mlp": 1.00514555, + "epoch": 0.5388611004232398, + "flos": 483628363008.0, + "grad_norm": 0.04683117604826235, + "language_loss": 0.86678994, + "learning_rate": 0.000461407176919948, + "loss": 0.87725389, + "num_input_tokens_seen": 233703904, + "router_z_loss_mlp": 0.41259766, + "step": 2801, + "time_per_iteration": 2.5158677101135254 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045703, + "balance_loss_mlp": 1.00440919, + "epoch": 0.5390534821085032, + "flos": 562089999360.0, + "grad_norm": 0.033429611400543416, + "language_loss": 0.85806906, + "learning_rate": 0.00046109657153392997, + "loss": 0.8685261, + "num_input_tokens_seen": 233779248, + "router_z_loss_mlp": 0.41308594, + "step": 2802, + "time_per_iteration": 2.685462236404419 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047109, + "balance_loss_mlp": 1.00591016, + "epoch": 0.5392458637937668, + "flos": 489361634304.0, + "grad_norm": 0.036955437438287664, + "language_loss": 0.83497781, + "learning_rate": 0.0004607859812515622, + "loss": 0.84544891, + "num_input_tokens_seen": 233847520, + "router_z_loss_mlp": 0.41210938, + "step": 2803, + "time_per_iteration": 2.6187045574188232 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054182, + "balance_loss_mlp": 1.01300752, + "epoch": 0.5394382454790304, + "flos": 513050019072.0, + "grad_norm": 0.03744234433888121, + "language_loss": 0.88279247, + "learning_rate": 0.00046047540619342667, + "loss": 0.89333427, + "num_input_tokens_seen": 233911328, + "router_z_loss_mlp": 0.41186523, + "step": 2804, + "time_per_iteration": 2.5895795822143555 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046381, + "balance_loss_mlp": 1.00525355, + "epoch": 0.539630627164294, + "flos": 568689385728.0, + "grad_norm": 0.033797229327163864, + "language_loss": 0.80605161, + "learning_rate": 0.00046016484648009933, + "loss": 0.81651545, + "num_input_tokens_seen": 233987104, + "router_z_loss_mlp": 0.41137695, + "step": 2805, + "time_per_iteration": 2.691092014312744 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047278, + "balance_loss_mlp": 1.00612748, + "epoch": 0.5398230088495575, + "flos": 527503733760.0, + "grad_norm": 0.03721333567310717, + "language_loss": 0.8141259, + "learning_rate": 0.0004598543022321501, + "loss": 0.82459861, + "num_input_tokens_seen": 234057216, + "router_z_loss_mlp": 0.41162109, + "step": 2806, + "time_per_iteration": 2.6083474159240723 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044352, + "balance_loss_mlp": 1.00312901, + "epoch": 0.5400153905348211, + "flos": 539853888000.0, + "grad_norm": 0.03209862982455251, + "language_loss": 0.80560988, + "learning_rate": 0.0004595437735701433, + "loss": 0.81605339, + "num_input_tokens_seen": 234129984, + "router_z_loss_mlp": 0.41235352, + "step": 2807, + "time_per_iteration": 2.688770055770874 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104467, + "balance_loss_mlp": 1.00354242, + "epoch": 0.5402077722200846, + "flos": 514665597696.0, + "grad_norm": 0.03651112385557252, + "language_loss": 0.83778703, + "learning_rate": 0.00045923326061463623, + "loss": 0.84823376, + "num_input_tokens_seen": 234203920, + "router_z_loss_mlp": 0.41137695, + "step": 2808, + "time_per_iteration": 2.761165142059326 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046543, + "balance_loss_mlp": 1.00534451, + "epoch": 0.5404001539053482, + "flos": 677567105280.0, + "grad_norm": 0.031915220360544935, + "language_loss": 0.81941223, + "learning_rate": 0.00045892276348618113, + "loss": 0.82987767, + "num_input_tokens_seen": 234285440, + "router_z_loss_mlp": 0.41210938, + "step": 2809, + "time_per_iteration": 2.9716503620147705 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105954, + "balance_loss_mlp": 1.01948547, + "epoch": 0.5405925355906118, + "flos": 1558191938304.0, + "grad_norm": 0.009079850654737754, + "language_loss": 0.78260827, + "learning_rate": 0.0004586122823053235, + "loss": 0.79320371, + "num_input_tokens_seen": 234521424, + "router_z_loss_mlp": 0.40039062, + "step": 2810, + "time_per_iteration": 4.989593029022217 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051922, + "balance_loss_mlp": 1.01069915, + "epoch": 0.5407849172758753, + "flos": 648538166784.0, + "grad_norm": 0.030063831285765737, + "language_loss": 0.81372178, + "learning_rate": 0.000458301817192603, + "loss": 0.82424104, + "num_input_tokens_seen": 234601632, + "router_z_loss_mlp": 0.41235352, + "step": 2811, + "time_per_iteration": 2.855461359024048 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063999, + "balance_loss_mlp": 1.02404022, + "epoch": 0.5409772989611389, + "flos": 1410483893760.0, + "grad_norm": 0.010433444863556941, + "language_loss": 0.8084178, + "learning_rate": 0.00045799136826855263, + "loss": 0.81905782, + "num_input_tokens_seen": 234825776, + "router_z_loss_mlp": 0.39941406, + "step": 2812, + "time_per_iteration": 4.82320761680603 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048585, + "balance_loss_mlp": 1.00748193, + "epoch": 0.5411696806464025, + "flos": 555545048064.0, + "grad_norm": 0.0337189850887645, + "language_loss": 0.87703073, + "learning_rate": 0.00045768093565369983, + "loss": 0.88751662, + "num_input_tokens_seen": 234901504, + "router_z_loss_mlp": 0.41113281, + "step": 2813, + "time_per_iteration": 2.7693569660186768 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047899, + "balance_loss_mlp": 1.00660491, + "epoch": 0.5413620623316661, + "flos": 529205828352.0, + "grad_norm": 0.032417929995103685, + "language_loss": 0.82523155, + "learning_rate": 0.0004573705194685646, + "loss": 0.83571053, + "num_input_tokens_seen": 234970288, + "router_z_loss_mlp": 0.41308594, + "step": 2814, + "time_per_iteration": 2.6525402069091797 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047552, + "balance_loss_mlp": 1.00637758, + "epoch": 0.5415544440169295, + "flos": 599852020224.0, + "grad_norm": 0.03532378336462207, + "language_loss": 0.85743833, + "learning_rate": 0.00045706011983366157, + "loss": 0.86791384, + "num_input_tokens_seen": 235039984, + "router_z_loss_mlp": 0.41186523, + "step": 2815, + "time_per_iteration": 2.67850661277771 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049782, + "balance_loss_mlp": 1.0085113, + "epoch": 0.5417468257021931, + "flos": 471714667776.0, + "grad_norm": 0.039926593194372036, + "language_loss": 0.83561838, + "learning_rate": 0.00045674973686949847, + "loss": 0.84611619, + "num_input_tokens_seen": 235105232, + "router_z_loss_mlp": 0.4128418, + "step": 2816, + "time_per_iteration": 2.56265926361084 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049575, + "balance_loss_mlp": 1.00839996, + "epoch": 0.5419392073874567, + "flos": 682191243264.0, + "grad_norm": 0.04027281254885066, + "language_loss": 0.85790694, + "learning_rate": 0.0004564393706965766, + "loss": 0.86840272, + "num_input_tokens_seen": 235192560, + "router_z_loss_mlp": 0.41186523, + "step": 2817, + "time_per_iteration": 2.955655574798584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048253, + "balance_loss_mlp": 1.00700641, + "epoch": 0.5421315890727203, + "flos": 463337364480.0, + "grad_norm": 0.033241337033607515, + "language_loss": 0.82050943, + "learning_rate": 0.00045612902143539116, + "loss": 0.83099198, + "num_input_tokens_seen": 235258448, + "router_z_loss_mlp": 0.41259766, + "step": 2818, + "time_per_iteration": 2.546567440032959 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043229, + "balance_loss_mlp": 1.0021013, + "epoch": 0.5423239707579839, + "flos": 437890504704.0, + "grad_norm": 0.03727551718578137, + "language_loss": 0.82264733, + "learning_rate": 0.00045581868920642986, + "loss": 0.83307964, + "num_input_tokens_seen": 235322176, + "router_z_loss_mlp": 0.41137695, + "step": 2819, + "time_per_iteration": 2.4746038913726807 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043245, + "balance_loss_mlp": 1.00197434, + "epoch": 0.5425163524432474, + "flos": 459306187776.0, + "grad_norm": 0.035271404401503774, + "language_loss": 0.80009091, + "learning_rate": 0.00045550837413017457, + "loss": 0.81052339, + "num_input_tokens_seen": 235390960, + "router_z_loss_mlp": 0.4128418, + "step": 2820, + "time_per_iteration": 2.598879098892212 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044733, + "balance_loss_mlp": 1.00348616, + "epoch": 0.542708734128511, + "flos": 420410734080.0, + "grad_norm": 0.029285477013781286, + "language_loss": 0.8579312, + "learning_rate": 0.0004551980763271005, + "loss": 0.86837852, + "num_input_tokens_seen": 235460976, + "router_z_loss_mlp": 0.41259766, + "step": 2821, + "time_per_iteration": 2.650609254837036 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050376, + "balance_loss_mlp": 1.00917685, + "epoch": 0.5429011158137745, + "flos": 679709549568.0, + "grad_norm": 0.038877958454501954, + "language_loss": 0.84286433, + "learning_rate": 0.0004548877959176756, + "loss": 0.8533681, + "num_input_tokens_seen": 235540912, + "router_z_loss_mlp": 0.41210938, + "step": 2822, + "time_per_iteration": 2.831773042678833 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049844, + "balance_loss_mlp": 1.00857341, + "epoch": 0.5430934974990381, + "flos": 541968142080.0, + "grad_norm": 0.03541809911924704, + "language_loss": 0.8707608, + "learning_rate": 0.00045457753302236166, + "loss": 0.8812592, + "num_input_tokens_seen": 235608736, + "router_z_loss_mlp": 0.4128418, + "step": 2823, + "time_per_iteration": 2.609090805053711 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048491, + "balance_loss_mlp": 1.00726891, + "epoch": 0.5432858791843016, + "flos": 659644072704.0, + "grad_norm": 0.03671475643697152, + "language_loss": 0.87739956, + "learning_rate": 0.00045426728776161353, + "loss": 0.8878845, + "num_input_tokens_seen": 235678720, + "router_z_loss_mlp": 0.41235352, + "step": 2824, + "time_per_iteration": 2.802915334701538 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046992, + "balance_loss_mlp": 1.00574553, + "epoch": 0.5434782608695652, + "flos": 532967741952.0, + "grad_norm": 0.03427907044877429, + "language_loss": 0.82057846, + "learning_rate": 0.00045395706025587863, + "loss": 0.83104837, + "num_input_tokens_seen": 235748704, + "router_z_loss_mlp": 0.41259766, + "step": 2825, + "time_per_iteration": 2.6308939456939697 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043048, + "balance_loss_mlp": 1.00194418, + "epoch": 0.5436706425548288, + "flos": 609633964800.0, + "grad_norm": 0.034616126048734014, + "language_loss": 0.8290934, + "learning_rate": 0.00045364685062559843, + "loss": 0.83952391, + "num_input_tokens_seen": 235828224, + "router_z_loss_mlp": 0.41113281, + "step": 2826, + "time_per_iteration": 2.8231375217437744 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047689, + "balance_loss_mlp": 1.006657, + "epoch": 0.5438630242400924, + "flos": 706773933312.0, + "grad_norm": 0.03098010756730768, + "language_loss": 0.92170852, + "learning_rate": 0.0004533366589912067, + "loss": 0.93218541, + "num_input_tokens_seen": 235909392, + "router_z_loss_mlp": 0.41040039, + "step": 2827, + "time_per_iteration": 2.9529805183410645 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105042, + "balance_loss_mlp": 1.00912547, + "epoch": 0.544055405925356, + "flos": 857839513344.0, + "grad_norm": 0.036966152235284246, + "language_loss": 0.78087002, + "learning_rate": 0.0004530264854731306, + "loss": 0.79137421, + "num_input_tokens_seen": 235983888, + "router_z_loss_mlp": 0.41308594, + "step": 2828, + "time_per_iteration": 3.0584123134613037 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050357, + "balance_loss_mlp": 1.00913441, + "epoch": 0.5442477876106194, + "flos": 572968438272.0, + "grad_norm": 0.03388858680916364, + "language_loss": 0.84792554, + "learning_rate": 0.00045271633019179034, + "loss": 0.85842907, + "num_input_tokens_seen": 236063056, + "router_z_loss_mlp": 0.41235352, + "step": 2829, + "time_per_iteration": 2.827160596847534 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046647, + "balance_loss_mlp": 1.00532901, + "epoch": 0.544440169295883, + "flos": 626803643136.0, + "grad_norm": 0.02947280635893411, + "language_loss": 0.88373405, + "learning_rate": 0.0004524061932675986, + "loss": 0.89420056, + "num_input_tokens_seen": 236141104, + "router_z_loss_mlp": 0.41333008, + "step": 2830, + "time_per_iteration": 2.8206188678741455 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048863, + "balance_loss_mlp": 1.00768852, + "epoch": 0.5446325509811466, + "flos": 837641833728.0, + "grad_norm": 0.03760239902604625, + "language_loss": 0.87454915, + "learning_rate": 0.00045209607482096125, + "loss": 0.88503784, + "num_input_tokens_seen": 236220320, + "router_z_loss_mlp": 0.41186523, + "step": 2831, + "time_per_iteration": 3.0359649658203125 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047225, + "balance_loss_mlp": 1.00600255, + "epoch": 0.5448249326664102, + "flos": 484390465536.0, + "grad_norm": 0.03560900416786153, + "language_loss": 0.8480038, + "learning_rate": 0.0004517859749722772, + "loss": 0.85847604, + "num_input_tokens_seen": 236288208, + "router_z_loss_mlp": 0.41235352, + "step": 2832, + "time_per_iteration": 2.689295768737793 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050991, + "balance_loss_mlp": 1.00972044, + "epoch": 0.5450173143516738, + "flos": 562346623488.0, + "grad_norm": 0.03426430427633819, + "language_loss": 0.79531574, + "learning_rate": 0.0004514758938419376, + "loss": 0.80582559, + "num_input_tokens_seen": 236366864, + "router_z_loss_mlp": 0.4128418, + "step": 2833, + "time_per_iteration": 2.8727176189422607 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049419, + "balance_loss_mlp": 1.00965118, + "epoch": 0.5452096960369373, + "flos": 1473588761856.0, + "grad_norm": 0.014550980978032766, + "language_loss": 0.76920587, + "learning_rate": 0.0004511658315503268, + "loss": 0.77970004, + "num_input_tokens_seen": 236597120, + "router_z_loss_mlp": 0.39746094, + "step": 2834, + "time_per_iteration": 4.9399590492248535 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046862, + "balance_loss_mlp": 1.00556791, + "epoch": 0.5454020777222008, + "flos": 466018334976.0, + "grad_norm": 0.03248736316688099, + "language_loss": 0.84558713, + "learning_rate": 0.00045085578821782175, + "loss": 0.85605574, + "num_input_tokens_seen": 236664192, + "router_z_loss_mlp": 0.41308594, + "step": 2835, + "time_per_iteration": 2.5900182723999023 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01057518, + "balance_loss_mlp": 1.01784515, + "epoch": 0.5455944594074644, + "flos": 1472617667328.0, + "grad_norm": 0.013168056581512213, + "language_loss": 0.76134741, + "learning_rate": 0.0004505457639647917, + "loss": 0.77192259, + "num_input_tokens_seen": 236888784, + "router_z_loss_mlp": 0.39648438, + "step": 2836, + "time_per_iteration": 4.910645961761475 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052223, + "balance_loss_mlp": 1.01100063, + "epoch": 0.545786841092728, + "flos": 534305309184.0, + "grad_norm": 0.02738620901632673, + "language_loss": 0.81102663, + "learning_rate": 0.00045023575891159866, + "loss": 0.82154894, + "num_input_tokens_seen": 236962528, + "router_z_loss_mlp": 0.41235352, + "step": 2837, + "time_per_iteration": 2.7457492351531982 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046684, + "balance_loss_mlp": 1.00682068, + "epoch": 0.5459792227779915, + "flos": 1355428740096.0, + "grad_norm": 0.008010480990562174, + "language_loss": 0.74763811, + "learning_rate": 0.00044992577317859764, + "loss": 0.75810492, + "num_input_tokens_seen": 237179360, + "router_z_loss_mlp": 0.3984375, + "step": 2838, + "time_per_iteration": 4.94202995300293 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048729, + "balance_loss_mlp": 1.00748277, + "epoch": 0.5461716044632551, + "flos": 639073117440.0, + "grad_norm": 0.02877585305336934, + "language_loss": 0.78956163, + "learning_rate": 0.0004496158068861354, + "loss": 0.80004895, + "num_input_tokens_seen": 237256240, + "router_z_loss_mlp": 0.41259766, + "step": 2839, + "time_per_iteration": 2.808370590209961 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047651, + "balance_loss_mlp": 1.00642872, + "epoch": 0.5463639861485187, + "flos": 603926938368.0, + "grad_norm": 0.03433602558833516, + "language_loss": 0.81297666, + "learning_rate": 0.00044930586015455207, + "loss": 0.82345319, + "num_input_tokens_seen": 237334272, + "router_z_loss_mlp": 0.41235352, + "step": 2840, + "time_per_iteration": 2.782735824584961 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048126, + "balance_loss_mlp": 1.00695133, + "epoch": 0.5465563678337823, + "flos": 643753635840.0, + "grad_norm": 0.02662038136573285, + "language_loss": 0.89087546, + "learning_rate": 0.000448995933104179, + "loss": 0.9013567, + "num_input_tokens_seen": 237415408, + "router_z_loss_mlp": 0.41186523, + "step": 2841, + "time_per_iteration": 2.869476318359375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050304, + "balance_loss_mlp": 1.0090816, + "epoch": 0.5467487495190458, + "flos": 615365290752.0, + "grad_norm": 0.03719587304070891, + "language_loss": 0.80725658, + "learning_rate": 0.00044868602585534077, + "loss": 0.81775963, + "num_input_tokens_seen": 237493232, + "router_z_loss_mlp": 0.41235352, + "step": 2842, + "time_per_iteration": 2.843027353286743 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046698, + "balance_loss_mlp": 1.00552344, + "epoch": 0.5469411312043093, + "flos": 462128109312.0, + "grad_norm": 0.03959126806850753, + "language_loss": 0.89450765, + "learning_rate": 0.0004483761385283541, + "loss": 0.90497464, + "num_input_tokens_seen": 237556624, + "router_z_loss_mlp": 0.41186523, + "step": 2843, + "time_per_iteration": 2.5162315368652344 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044431, + "balance_loss_mlp": 1.00332797, + "epoch": 0.5471335128895729, + "flos": 562267888896.0, + "grad_norm": 0.03475490738980998, + "language_loss": 0.82207608, + "learning_rate": 0.0004480662712435281, + "loss": 0.83252037, + "num_input_tokens_seen": 237632048, + "router_z_loss_mlp": 0.41113281, + "step": 2844, + "time_per_iteration": 2.7367589473724365 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045579, + "balance_loss_mlp": 1.0045476, + "epoch": 0.5473258945748365, + "flos": 519686343936.0, + "grad_norm": 0.032685207895773144, + "language_loss": 0.8903448, + "learning_rate": 0.0004477564241211635, + "loss": 0.90080059, + "num_input_tokens_seen": 237699840, + "router_z_loss_mlp": 0.41040039, + "step": 2845, + "time_per_iteration": 2.6059961318969727 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047529, + "balance_loss_mlp": 1.00640178, + "epoch": 0.5475182762601001, + "flos": 434744884992.0, + "grad_norm": 0.035185291050346845, + "language_loss": 0.87463105, + "learning_rate": 0.0004474465972815541, + "loss": 0.88510644, + "num_input_tokens_seen": 237762560, + "router_z_loss_mlp": 0.41137695, + "step": 2846, + "time_per_iteration": 2.5159108638763428 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049143, + "balance_loss_mlp": 1.00808775, + "epoch": 0.5477106579453636, + "flos": 512574676224.0, + "grad_norm": 0.03033857724648134, + "language_loss": 0.88145, + "learning_rate": 0.000447136790844985, + "loss": 0.89194143, + "num_input_tokens_seen": 237837152, + "router_z_loss_mlp": 0.41064453, + "step": 2847, + "time_per_iteration": 2.7494916915893555 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049923, + "balance_loss_mlp": 1.00889075, + "epoch": 0.5479030396306271, + "flos": 677141339904.0, + "grad_norm": 0.030728657632270156, + "language_loss": 0.81529921, + "learning_rate": 0.00044682700493173385, + "loss": 0.82579845, + "num_input_tokens_seen": 237909488, + "router_z_loss_mlp": 0.41040039, + "step": 2848, + "time_per_iteration": 2.8558499813079834 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043616, + "balance_loss_mlp": 1.00260758, + "epoch": 0.5480954213158907, + "flos": 877579346688.0, + "grad_norm": 0.03576262257130289, + "language_loss": 0.80969125, + "learning_rate": 0.00044651723966207004, + "loss": 0.82012743, + "num_input_tokens_seen": 237991056, + "router_z_loss_mlp": 0.41015625, + "step": 2849, + "time_per_iteration": 3.1599223613739014 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048693, + "balance_loss_mlp": 1.00768459, + "epoch": 0.5482878030011543, + "flos": 623175877632.0, + "grad_norm": 0.0450385792128453, + "language_loss": 0.79220605, + "learning_rate": 0.00044620749515625536, + "loss": 0.80269301, + "num_input_tokens_seen": 238064576, + "router_z_loss_mlp": 0.41015625, + "step": 2850, + "time_per_iteration": 2.816164255142212 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044849, + "balance_loss_mlp": 1.00376952, + "epoch": 0.5484801846864179, + "flos": 498258021888.0, + "grad_norm": 0.033687612572946876, + "language_loss": 0.85353971, + "learning_rate": 0.00044589777153454334, + "loss": 0.86398828, + "num_input_tokens_seen": 238136464, + "router_z_loss_mlp": 0.41088867, + "step": 2851, + "time_per_iteration": 2.767086982727051 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042735, + "balance_loss_mlp": 1.00158429, + "epoch": 0.5486725663716814, + "flos": 443354512896.0, + "grad_norm": 0.032917884516517996, + "language_loss": 0.84102762, + "learning_rate": 0.00044558806891717895, + "loss": 0.85145497, + "num_input_tokens_seen": 238198912, + "router_z_loss_mlp": 0.41162109, + "step": 2852, + "time_per_iteration": 2.4791274070739746 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046594, + "balance_loss_mlp": 1.00560999, + "epoch": 0.548864948056945, + "flos": 656348753664.0, + "grad_norm": 0.02926310360240776, + "language_loss": 0.80048501, + "learning_rate": 0.0004452783874243998, + "loss": 0.81095093, + "num_input_tokens_seen": 238275184, + "router_z_loss_mlp": 0.40991211, + "step": 2853, + "time_per_iteration": 2.8510489463806152 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051891, + "balance_loss_mlp": 1.01100183, + "epoch": 0.5490573297422086, + "flos": 547141499904.0, + "grad_norm": 0.035598285504377866, + "language_loss": 0.85552013, + "learning_rate": 0.00044496872717643475, + "loss": 0.86603898, + "num_input_tokens_seen": 238348496, + "router_z_loss_mlp": 0.40893555, + "step": 2854, + "time_per_iteration": 2.6640069484710693 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107375, + "balance_loss_mlp": 1.03398132, + "epoch": 0.5492497114274721, + "flos": 1593763882752.0, + "grad_norm": 0.015003928091872471, + "language_loss": 0.77089292, + "learning_rate": 0.00044465908829350453, + "loss": 0.7816304, + "num_input_tokens_seen": 238578464, + "router_z_loss_mlp": 0.39746094, + "step": 2855, + "time_per_iteration": 4.924941778182983 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048775, + "balance_loss_mlp": 1.00791013, + "epoch": 0.5494420931127356, + "flos": 752270718720.0, + "grad_norm": 0.03382110809465603, + "language_loss": 0.82668245, + "learning_rate": 0.0004443494708958217, + "loss": 0.83717024, + "num_input_tokens_seen": 238660256, + "router_z_loss_mlp": 0.40869141, + "step": 2856, + "time_per_iteration": 2.9736838340759277 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049194, + "balance_loss_mlp": 1.00837672, + "epoch": 0.5496344747979992, + "flos": 627305230848.0, + "grad_norm": 0.02827813290363101, + "language_loss": 0.81289691, + "learning_rate": 0.0004440398751035906, + "loss": 0.82338881, + "num_input_tokens_seen": 238745856, + "router_z_loss_mlp": 0.40820312, + "step": 2857, + "time_per_iteration": 2.943936347961426 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053668, + "balance_loss_mlp": 1.01289868, + "epoch": 0.5498268564832628, + "flos": 524125789440.0, + "grad_norm": 0.04150845511788398, + "language_loss": 0.8407867, + "learning_rate": 0.00044373030103700645, + "loss": 0.85132337, + "num_input_tokens_seen": 238813888, + "router_z_loss_mlp": 0.40771484, + "step": 2858, + "time_per_iteration": 2.5977840423583984 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047985, + "balance_loss_mlp": 1.00719178, + "epoch": 0.5500192381685264, + "flos": 605778732288.0, + "grad_norm": 0.03313045470580536, + "language_loss": 0.80440414, + "learning_rate": 0.000443420748816257, + "loss": 0.81488407, + "num_input_tokens_seen": 238885440, + "router_z_loss_mlp": 0.40795898, + "step": 2859, + "time_per_iteration": 2.7645347118377686 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049935, + "balance_loss_mlp": 1.00914145, + "epoch": 0.55021161985379, + "flos": 521655756288.0, + "grad_norm": 0.037659665058523445, + "language_loss": 0.79047614, + "learning_rate": 0.0004431112185615208, + "loss": 0.8009755, + "num_input_tokens_seen": 238960944, + "router_z_loss_mlp": 0.40795898, + "step": 2860, + "time_per_iteration": 2.7862706184387207 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043913, + "balance_loss_mlp": 1.00302446, + "epoch": 0.5504040015390534, + "flos": 490655460096.0, + "grad_norm": 0.03348154415794888, + "language_loss": 0.8037793, + "learning_rate": 0.00044280171039296845, + "loss": 0.8142184, + "num_input_tokens_seen": 239030592, + "router_z_loss_mlp": 0.40893555, + "step": 2861, + "time_per_iteration": 2.6561086177825928 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052487, + "balance_loss_mlp": 1.01166964, + "epoch": 0.550596383224317, + "flos": 576862554624.0, + "grad_norm": 0.03513860333112342, + "language_loss": 0.88868964, + "learning_rate": 0.0004424922244307616, + "loss": 0.89921451, + "num_input_tokens_seen": 239097440, + "router_z_loss_mlp": 0.40820312, + "step": 2862, + "time_per_iteration": 2.7066099643707275 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053826, + "balance_loss_mlp": 1.01298499, + "epoch": 0.5507887649095806, + "flos": 643634072064.0, + "grad_norm": 0.03653258974946179, + "language_loss": 0.82663441, + "learning_rate": 0.00044218276079505315, + "loss": 0.83717263, + "num_input_tokens_seen": 239179872, + "router_z_loss_mlp": 0.40844727, + "step": 2863, + "time_per_iteration": 2.87058162689209 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049384, + "balance_loss_mlp": 1.00856698, + "epoch": 0.5509811465948442, + "flos": 532865674752.0, + "grad_norm": 0.034931125724459874, + "language_loss": 0.75083911, + "learning_rate": 0.0004418733196059876, + "loss": 0.76133299, + "num_input_tokens_seen": 239251264, + "router_z_loss_mlp": 0.40820312, + "step": 2864, + "time_per_iteration": 2.690927743911743 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048154, + "balance_loss_mlp": 1.00719357, + "epoch": 0.5511735282801077, + "flos": 655984226304.0, + "grad_norm": 0.03582782743987034, + "language_loss": 0.80482149, + "learning_rate": 0.0004415639009837008, + "loss": 0.81530309, + "num_input_tokens_seen": 239326688, + "router_z_loss_mlp": 0.40966797, + "step": 2865, + "time_per_iteration": 2.8515002727508545 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050322, + "balance_loss_mlp": 1.00948107, + "epoch": 0.5513659099653713, + "flos": 530610469632.0, + "grad_norm": 0.03216902856467023, + "language_loss": 0.82250589, + "learning_rate": 0.00044125450504831955, + "loss": 0.83300906, + "num_input_tokens_seen": 239401248, + "router_z_loss_mlp": 0.40844727, + "step": 2866, + "time_per_iteration": 2.743833303451538 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053087, + "balance_loss_mlp": 1.01229346, + "epoch": 0.5515582916506349, + "flos": 555974704128.0, + "grad_norm": 0.03636447949545943, + "language_loss": 0.827411, + "learning_rate": 0.0004409451319199622, + "loss": 0.83794183, + "num_input_tokens_seen": 239471600, + "router_z_loss_mlp": 0.40795898, + "step": 2867, + "time_per_iteration": 2.654466390609741 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045403, + "balance_loss_mlp": 1.00439477, + "epoch": 0.5517506733358984, + "flos": 736772999424.0, + "grad_norm": 0.03752588301556939, + "language_loss": 0.85160595, + "learning_rate": 0.0004406357817187381, + "loss": 0.86206001, + "num_input_tokens_seen": 239548592, + "router_z_loss_mlp": 0.41015625, + "step": 2868, + "time_per_iteration": 2.9610273838043213 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051499, + "balance_loss_mlp": 1.01065779, + "epoch": 0.551943055021162, + "flos": 1117190818560.0, + "grad_norm": 0.028811275091252902, + "language_loss": 0.81857193, + "learning_rate": 0.0004403264545647474, + "loss": 0.8290869, + "num_input_tokens_seen": 239644432, + "router_z_loss_mlp": 0.40844727, + "step": 2869, + "time_per_iteration": 3.511462450027466 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043012, + "balance_loss_mlp": 1.00195587, + "epoch": 0.5521354367064255, + "flos": 545502588672.0, + "grad_norm": 0.03184831617373855, + "language_loss": 0.85004073, + "learning_rate": 0.00044001715057808154, + "loss": 0.86047089, + "num_input_tokens_seen": 239723392, + "router_z_loss_mlp": 0.41064453, + "step": 2870, + "time_per_iteration": 2.744248390197754 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048495, + "balance_loss_mlp": 1.00746286, + "epoch": 0.5523278183916891, + "flos": 937872986880.0, + "grad_norm": 0.03348956391566461, + "language_loss": 0.81933939, + "learning_rate": 0.0004397078698788232, + "loss": 0.82982433, + "num_input_tokens_seen": 239806896, + "router_z_loss_mlp": 0.41040039, + "step": 2871, + "time_per_iteration": 3.193040132522583 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052254, + "balance_loss_mlp": 1.01277161, + "epoch": 0.5525202000769527, + "flos": 1469101684224.0, + "grad_norm": 0.00853782264427079, + "language_loss": 0.80442369, + "learning_rate": 0.0004393986125870456, + "loss": 0.81494617, + "num_input_tokens_seen": 240037824, + "router_z_loss_mlp": 0.39453125, + "step": 2872, + "time_per_iteration": 4.887877702713013 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050207, + "balance_loss_mlp": 1.00917542, + "epoch": 0.5527125817622163, + "flos": 490785717504.0, + "grad_norm": 0.036240955421061, + "language_loss": 0.78392744, + "learning_rate": 0.00043908937882281343, + "loss": 0.79442948, + "num_input_tokens_seen": 240107952, + "router_z_loss_mlp": 0.41040039, + "step": 2873, + "time_per_iteration": 2.5992209911346436 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045201, + "balance_loss_mlp": 1.00414526, + "epoch": 0.5529049634474797, + "flos": 636149128704.0, + "grad_norm": 0.03461125376652938, + "language_loss": 0.82969832, + "learning_rate": 0.0004387801687061814, + "loss": 0.84015036, + "num_input_tokens_seen": 240183824, + "router_z_loss_mlp": 0.41064453, + "step": 2874, + "time_per_iteration": 2.8166332244873047 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045216, + "balance_loss_mlp": 1.00408852, + "epoch": 0.5530973451327433, + "flos": 582435432960.0, + "grad_norm": 0.031639900781256135, + "language_loss": 0.81371784, + "learning_rate": 0.0004384709823571958, + "loss": 0.82416999, + "num_input_tokens_seen": 240259296, + "router_z_loss_mlp": 0.41137695, + "step": 2875, + "time_per_iteration": 2.7777786254882812 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045687, + "balance_loss_mlp": 1.00458348, + "epoch": 0.5532897268180069, + "flos": 1124330676480.0, + "grad_norm": 0.03430168550584483, + "language_loss": 0.83714402, + "learning_rate": 0.0004381618198958932, + "loss": 0.84760094, + "num_input_tokens_seen": 240346768, + "router_z_loss_mlp": 0.41113281, + "step": 2876, + "time_per_iteration": 3.517432451248169 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046562, + "balance_loss_mlp": 1.00536335, + "epoch": 0.5534821085032705, + "flos": 638513203968.0, + "grad_norm": 0.03082674119581989, + "language_loss": 0.83886576, + "learning_rate": 0.00043785268144230137, + "loss": 0.84933138, + "num_input_tokens_seen": 240429344, + "router_z_loss_mlp": 0.41210938, + "step": 2877, + "time_per_iteration": 2.9488272666931152 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048169, + "balance_loss_mlp": 1.0069226, + "epoch": 0.5536744901885341, + "flos": 572217029376.0, + "grad_norm": 0.037462471463683845, + "language_loss": 0.8303535, + "learning_rate": 0.00043754356711643837, + "loss": 0.84083521, + "num_input_tokens_seen": 240497008, + "router_z_loss_mlp": 0.41259766, + "step": 2878, + "time_per_iteration": 2.669304370880127 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045925, + "balance_loss_mlp": 1.00479829, + "epoch": 0.5538668718737976, + "flos": 596917337856.0, + "grad_norm": 0.03146432649645385, + "language_loss": 0.84558415, + "learning_rate": 0.0004372344770383132, + "loss": 0.8560434, + "num_input_tokens_seen": 240578432, + "router_z_loss_mlp": 0.41137695, + "step": 2879, + "time_per_iteration": 2.855231761932373 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050888, + "balance_loss_mlp": 1.0097847, + "epoch": 0.5540592535590612, + "flos": 533719150848.0, + "grad_norm": 0.0358528854453713, + "language_loss": 0.83432066, + "learning_rate": 0.00043692541132792507, + "loss": 0.84482956, + "num_input_tokens_seen": 240649136, + "router_z_loss_mlp": 0.41113281, + "step": 2880, + "time_per_iteration": 2.662008047103882 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051879, + "balance_loss_mlp": 1.01070428, + "epoch": 0.5542516352443247, + "flos": 413505146112.0, + "grad_norm": 0.035032849721931915, + "language_loss": 0.83894408, + "learning_rate": 0.00043661637010526384, + "loss": 0.84946287, + "num_input_tokens_seen": 240714240, + "router_z_loss_mlp": 0.41186523, + "step": 2881, + "time_per_iteration": 2.507699489593506 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104607, + "balance_loss_mlp": 1.00484717, + "epoch": 0.5544440169295883, + "flos": 548678343936.0, + "grad_norm": 0.03314086611141918, + "language_loss": 0.83246458, + "learning_rate": 0.00043630735349031025, + "loss": 0.84292531, + "num_input_tokens_seen": 240786928, + "router_z_loss_mlp": 0.41235352, + "step": 2882, + "time_per_iteration": 2.70409893989563 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047409, + "balance_loss_mlp": 1.00623393, + "epoch": 0.5546363986148518, + "flos": 623034926592.0, + "grad_norm": 0.03282028788454341, + "language_loss": 0.82495463, + "learning_rate": 0.00043599836160303495, + "loss": 0.83542871, + "num_input_tokens_seen": 240865328, + "router_z_loss_mlp": 0.41186523, + "step": 2883, + "time_per_iteration": 2.900757312774658 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046682, + "balance_loss_mlp": 1.00550687, + "epoch": 0.5548287803001154, + "flos": 706580492544.0, + "grad_norm": 0.029978122278870225, + "language_loss": 0.78110325, + "learning_rate": 0.0004356893945633995, + "loss": 0.79157007, + "num_input_tokens_seen": 240945680, + "router_z_loss_mlp": 0.41186523, + "step": 2884, + "time_per_iteration": 2.975062608718872 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046147, + "balance_loss_mlp": 1.00501966, + "epoch": 0.555021161985379, + "flos": 505184997120.0, + "grad_norm": 0.033025085572570244, + "language_loss": 0.82143605, + "learning_rate": 0.0004353804524913551, + "loss": 0.83189756, + "num_input_tokens_seen": 241010800, + "router_z_loss_mlp": 0.41137695, + "step": 2885, + "time_per_iteration": 2.6369645595550537 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046205, + "balance_loss_mlp": 1.00512528, + "epoch": 0.5552135436706426, + "flos": 617210281728.0, + "grad_norm": 0.0369840001422722, + "language_loss": 0.82350749, + "learning_rate": 0.0004350715355068441, + "loss": 0.83396947, + "num_input_tokens_seen": 241085328, + "router_z_loss_mlp": 0.41088867, + "step": 2886, + "time_per_iteration": 2.727186441421509 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044964, + "balance_loss_mlp": 1.00393176, + "epoch": 0.5554059253559062, + "flos": 464817828096.0, + "grad_norm": 0.043659618464352824, + "language_loss": 0.80073905, + "learning_rate": 0.00043476264372979847, + "loss": 0.8111887, + "num_input_tokens_seen": 241149600, + "router_z_loss_mlp": 0.41040039, + "step": 2887, + "time_per_iteration": 2.5368049144744873 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044535, + "balance_loss_mlp": 1.00357509, + "epoch": 0.5555983070411696, + "flos": 1564876885248.0, + "grad_norm": 0.03408551435207337, + "language_loss": 0.79322737, + "learning_rate": 0.0004344537772801408, + "loss": 0.80367273, + "num_input_tokens_seen": 241244832, + "router_z_loss_mlp": 0.40966797, + "step": 2888, + "time_per_iteration": 3.869920015335083 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01057491, + "balance_loss_mlp": 1.01791382, + "epoch": 0.5557906887264332, + "flos": 1471229544192.0, + "grad_norm": 0.014769088101488215, + "language_loss": 0.73422456, + "learning_rate": 0.0004341449362777836, + "loss": 0.74479944, + "num_input_tokens_seen": 241479728, + "router_z_loss_mlp": 0.39550781, + "step": 2889, + "time_per_iteration": 4.936699867248535 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047136, + "balance_loss_mlp": 1.00608003, + "epoch": 0.5559830704116968, + "flos": 530864181504.0, + "grad_norm": 0.0376436874687178, + "language_loss": 0.83696067, + "learning_rate": 0.0004338361208426298, + "loss": 0.84743202, + "num_input_tokens_seen": 241545616, + "router_z_loss_mlp": 0.41064453, + "step": 2890, + "time_per_iteration": 2.6094541549682617 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051706, + "balance_loss_mlp": 1.01069844, + "epoch": 0.5561754520969604, + "flos": 652519766016.0, + "grad_norm": 0.029226912064567154, + "language_loss": 0.81876659, + "learning_rate": 0.00043352733109457164, + "loss": 0.82928365, + "num_input_tokens_seen": 241629040, + "router_z_loss_mlp": 0.41015625, + "step": 2891, + "time_per_iteration": 2.8833718299865723 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050883, + "balance_loss_mlp": 1.00985098, + "epoch": 0.556367833782224, + "flos": 735620124672.0, + "grad_norm": 0.029092214279724596, + "language_loss": 0.84975475, + "learning_rate": 0.00043321856715349244, + "loss": 0.86026359, + "num_input_tokens_seen": 241706272, + "router_z_loss_mlp": 0.41040039, + "step": 2892, + "time_per_iteration": 2.9798240661621094 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046881, + "balance_loss_mlp": 1.00575387, + "epoch": 0.5565602154674875, + "flos": 673641886464.0, + "grad_norm": 0.03553967461394851, + "language_loss": 0.81101406, + "learning_rate": 0.00043290982913926466, + "loss": 0.8214829, + "num_input_tokens_seen": 241782304, + "router_z_loss_mlp": 0.41137695, + "step": 2893, + "time_per_iteration": 2.8139491081237793 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045687, + "balance_loss_mlp": 1.00463176, + "epoch": 0.556752597152751, + "flos": 587504778240.0, + "grad_norm": 0.036653967015968944, + "language_loss": 0.84921324, + "learning_rate": 0.0004326011171717514, + "loss": 0.85967016, + "num_input_tokens_seen": 241868576, + "router_z_loss_mlp": 0.41064453, + "step": 2894, + "time_per_iteration": 2.9087953567504883 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046707, + "balance_loss_mlp": 1.00555551, + "epoch": 0.5569449788380146, + "flos": 438691491072.0, + "grad_norm": 0.03515530628910635, + "language_loss": 0.81422639, + "learning_rate": 0.0004322924313708051, + "loss": 0.82469344, + "num_input_tokens_seen": 241933696, + "router_z_loss_mlp": 0.41162109, + "step": 2895, + "time_per_iteration": 2.529937505722046 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051552, + "balance_loss_mlp": 1.01054382, + "epoch": 0.5571373605232782, + "flos": 503248632576.0, + "grad_norm": 0.03724847922393753, + "language_loss": 0.84896851, + "learning_rate": 0.0004319837718562681, + "loss": 0.85948396, + "num_input_tokens_seen": 242003056, + "router_z_loss_mlp": 0.41015625, + "step": 2896, + "time_per_iteration": 2.6142115592956543 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047151, + "balance_loss_mlp": 1.00599957, + "epoch": 0.5573297422085417, + "flos": 578590894080.0, + "grad_norm": 0.04905398235042313, + "language_loss": 0.83417499, + "learning_rate": 0.0004316751387479726, + "loss": 0.84464645, + "num_input_tokens_seen": 242076368, + "router_z_loss_mlp": 0.41162109, + "step": 2897, + "time_per_iteration": 2.7738893032073975 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046687, + "balance_loss_mlp": 1.00555933, + "epoch": 0.5575221238938053, + "flos": 1346049251328.0, + "grad_norm": 0.03588075887117774, + "language_loss": 0.82779884, + "learning_rate": 0.0004313665321657409, + "loss": 0.83826572, + "num_input_tokens_seen": 242161600, + "router_z_loss_mlp": 0.41137695, + "step": 2898, + "time_per_iteration": 3.725510835647583 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047713, + "balance_loss_mlp": 1.00672877, + "epoch": 0.5577145055790689, + "flos": 603099707136.0, + "grad_norm": 0.03720848090960627, + "language_loss": 0.80283779, + "learning_rate": 0.00043105795222938436, + "loss": 0.81331486, + "num_input_tokens_seen": 242237904, + "router_z_loss_mlp": 0.40991211, + "step": 2899, + "time_per_iteration": 2.7282700538635254 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049139, + "balance_loss_mlp": 1.00829744, + "epoch": 0.5579068872643325, + "flos": 563691972096.0, + "grad_norm": 0.03568825250494595, + "language_loss": 0.79214776, + "learning_rate": 0.00043074939905870467, + "loss": 0.80263913, + "num_input_tokens_seen": 242306736, + "router_z_loss_mlp": 0.40844727, + "step": 2900, + "time_per_iteration": 2.696354389190674 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104484, + "balance_loss_mlp": 1.00399923, + "epoch": 0.558099268949596, + "flos": 545589104640.0, + "grad_norm": 0.04035642488371941, + "language_loss": 0.81151342, + "learning_rate": 0.0004304408727734927, + "loss": 0.82196188, + "num_input_tokens_seen": 242376000, + "router_z_loss_mlp": 0.40844727, + "step": 2901, + "time_per_iteration": 2.6394877433776855 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044507, + "balance_loss_mlp": 1.00366592, + "epoch": 0.5582916506348595, + "flos": 553853647104.0, + "grad_norm": 0.036813902208390564, + "language_loss": 0.89428526, + "learning_rate": 0.0004301323734935288, + "loss": 0.90473032, + "num_input_tokens_seen": 242447056, + "router_z_loss_mlp": 0.40844727, + "step": 2902, + "time_per_iteration": 2.659945249557495 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047193, + "balance_loss_mlp": 1.00635207, + "epoch": 0.5584840323201231, + "flos": 544425536256.0, + "grad_norm": 0.03290970227186249, + "language_loss": 0.87933898, + "learning_rate": 0.000429823901338583, + "loss": 0.88981086, + "num_input_tokens_seen": 242514400, + "router_z_loss_mlp": 0.40844727, + "step": 2903, + "time_per_iteration": 2.643388032913208 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045236, + "balance_loss_mlp": 1.00432324, + "epoch": 0.5586764140053867, + "flos": 817023246336.0, + "grad_norm": 0.03162840926526219, + "language_loss": 0.87249023, + "learning_rate": 0.00042951545642841513, + "loss": 0.88294262, + "num_input_tokens_seen": 242601616, + "router_z_loss_mlp": 0.40917969, + "step": 2904, + "time_per_iteration": 3.0901763439178467 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047509, + "balance_loss_mlp": 1.00642967, + "epoch": 0.5588687956906503, + "flos": 487416521472.0, + "grad_norm": 0.02951660315659268, + "language_loss": 0.87151515, + "learning_rate": 0.0004292070388827737, + "loss": 0.88199031, + "num_input_tokens_seen": 242669648, + "router_z_loss_mlp": 0.41088867, + "step": 2905, + "time_per_iteration": 2.6241614818573 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050702, + "balance_loss_mlp": 1.00967062, + "epoch": 0.5590611773759138, + "flos": 453069383424.0, + "grad_norm": 0.03428125950398782, + "language_loss": 0.81863332, + "learning_rate": 0.00042889864882139753, + "loss": 0.82914031, + "num_input_tokens_seen": 242737456, + "router_z_loss_mlp": 0.41040039, + "step": 2906, + "time_per_iteration": 2.6295247077941895 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051332, + "balance_loss_mlp": 1.01025224, + "epoch": 0.5592535590611774, + "flos": 521957100288.0, + "grad_norm": 0.03203389874594117, + "language_loss": 0.82458705, + "learning_rate": 0.0004285902863640139, + "loss": 0.83510035, + "num_input_tokens_seen": 242807008, + "router_z_loss_mlp": 0.41088867, + "step": 2907, + "time_per_iteration": 2.6310994625091553 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044647, + "balance_loss_mlp": 1.00366294, + "epoch": 0.5594459407464409, + "flos": 553601880576.0, + "grad_norm": 0.029509403523767207, + "language_loss": 0.86282808, + "learning_rate": 0.00042828195163033966, + "loss": 0.87327456, + "num_input_tokens_seen": 242877328, + "router_z_loss_mlp": 0.40991211, + "step": 2908, + "time_per_iteration": 2.720059871673584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104389, + "balance_loss_mlp": 1.00285828, + "epoch": 0.5596383224317045, + "flos": 485788303872.0, + "grad_norm": 0.032784621074408576, + "language_loss": 0.796462, + "learning_rate": 0.0004279736447400812, + "loss": 0.80690086, + "num_input_tokens_seen": 242943152, + "router_z_loss_mlp": 0.41040039, + "step": 2909, + "time_per_iteration": 2.562958240509033 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044175, + "balance_loss_mlp": 1.00323904, + "epoch": 0.5598307041169681, + "flos": 612380064000.0, + "grad_norm": 0.03125271468065307, + "language_loss": 0.78822809, + "learning_rate": 0.00042766536581293385, + "loss": 0.79866982, + "num_input_tokens_seen": 243014656, + "router_z_loss_mlp": 0.40942383, + "step": 2910, + "time_per_iteration": 2.742727041244507 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104403, + "balance_loss_mlp": 1.00297463, + "epoch": 0.5600230858022316, + "flos": 489917657088.0, + "grad_norm": 0.033084161668713065, + "language_loss": 0.80192208, + "learning_rate": 0.0004273571149685819, + "loss": 0.81236243, + "num_input_tokens_seen": 243089040, + "router_z_loss_mlp": 0.41064453, + "step": 2911, + "time_per_iteration": 2.7333109378814697 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01041877, + "balance_loss_mlp": 1.00091636, + "epoch": 0.5602154674874952, + "flos": 599982277632.0, + "grad_norm": 0.033670817346998394, + "language_loss": 0.84396589, + "learning_rate": 0.00042704889232669937, + "loss": 0.8543846, + "num_input_tokens_seen": 243162480, + "router_z_loss_mlp": 0.40966797, + "step": 2912, + "time_per_iteration": 2.7085225582122803 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044153, + "balance_loss_mlp": 1.00336003, + "epoch": 0.5604078491727588, + "flos": 587063461632.0, + "grad_norm": 0.043754524068974454, + "language_loss": 0.8611334, + "learning_rate": 0.0004267406980069484, + "loss": 0.87157494, + "num_input_tokens_seen": 243232880, + "router_z_loss_mlp": 0.40795898, + "step": 2913, + "time_per_iteration": 2.747812271118164 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043762, + "balance_loss_mlp": 1.00275385, + "epoch": 0.5606002308580224, + "flos": 542328778752.0, + "grad_norm": 0.02876490223829942, + "language_loss": 0.7993964, + "learning_rate": 0.0004264325321289808, + "loss": 0.80983406, + "num_input_tokens_seen": 243309168, + "router_z_loss_mlp": 0.41015625, + "step": 2914, + "time_per_iteration": 2.8028316497802734 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043869, + "balance_loss_mlp": 1.0028609, + "epoch": 0.5607926125432858, + "flos": 585079464960.0, + "grad_norm": 0.03419971609404561, + "language_loss": 0.86714381, + "learning_rate": 0.00042612439481243736, + "loss": 0.87758255, + "num_input_tokens_seen": 243382064, + "router_z_loss_mlp": 0.41015625, + "step": 2915, + "time_per_iteration": 2.7691102027893066 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045259, + "balance_loss_mlp": 1.00417948, + "epoch": 0.5609849942285494, + "flos": 628631137536.0, + "grad_norm": 0.0372312942186238, + "language_loss": 0.90099525, + "learning_rate": 0.00042581628617694735, + "loss": 0.91144788, + "num_input_tokens_seen": 243452064, + "router_z_loss_mlp": 0.41088867, + "step": 2916, + "time_per_iteration": 2.7420172691345215 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043547, + "balance_loss_mlp": 1.00261009, + "epoch": 0.561177375913813, + "flos": 589455727104.0, + "grad_norm": 0.03338895186153077, + "language_loss": 0.82208467, + "learning_rate": 0.0004255082063421296, + "loss": 0.83252013, + "num_input_tokens_seen": 243525600, + "router_z_loss_mlp": 0.40942383, + "step": 2917, + "time_per_iteration": 2.673243999481201 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042752, + "balance_loss_mlp": 1.0016005, + "epoch": 0.5613697575990766, + "flos": 528144327168.0, + "grad_norm": 0.03066260992789867, + "language_loss": 0.85543269, + "learning_rate": 0.00042520015542759065, + "loss": 0.86586022, + "num_input_tokens_seen": 243605536, + "router_z_loss_mlp": 0.41162109, + "step": 2918, + "time_per_iteration": 2.879850387573242 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104302, + "balance_loss_mlp": 1.00201178, + "epoch": 0.5615621392843402, + "flos": 643875144960.0, + "grad_norm": 0.028477148441929827, + "language_loss": 0.88382292, + "learning_rate": 0.00042489213355292687, + "loss": 0.89425319, + "num_input_tokens_seen": 243684208, + "router_z_loss_mlp": 0.41015625, + "step": 2919, + "time_per_iteration": 2.9279518127441406 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044234, + "balance_loss_mlp": 1.00315475, + "epoch": 0.5617545209696037, + "flos": 428657779968.0, + "grad_norm": 0.03756668389237789, + "language_loss": 0.81703657, + "learning_rate": 0.00042458414083772276, + "loss": 0.82747889, + "num_input_tokens_seen": 243749376, + "router_z_loss_mlp": 0.41088867, + "step": 2920, + "time_per_iteration": 2.5474023818969727 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044671, + "balance_loss_mlp": 1.00371051, + "epoch": 0.5619469026548672, + "flos": 569590493952.0, + "grad_norm": 0.029467937694277743, + "language_loss": 0.85509026, + "learning_rate": 0.000424276177401552, + "loss": 0.86553693, + "num_input_tokens_seen": 243828096, + "router_z_loss_mlp": 0.40966797, + "step": 2921, + "time_per_iteration": 2.797123670578003 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043566, + "balance_loss_mlp": 1.00260556, + "epoch": 0.5621392843401308, + "flos": 506244552960.0, + "grad_norm": 0.03575401527758356, + "language_loss": 0.86372185, + "learning_rate": 0.0004239682433639763, + "loss": 0.87415743, + "num_input_tokens_seen": 243896752, + "router_z_loss_mlp": 0.40966797, + "step": 2922, + "time_per_iteration": 2.6631922721862793 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043748, + "balance_loss_mlp": 1.00281191, + "epoch": 0.5623316660253944, + "flos": 518010494208.0, + "grad_norm": 0.03518251960287723, + "language_loss": 0.86062789, + "learning_rate": 0.0004236603388445467, + "loss": 0.87106532, + "num_input_tokens_seen": 243964592, + "router_z_loss_mlp": 0.40942383, + "step": 2923, + "time_per_iteration": 2.60380482673645 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044966, + "balance_loss_mlp": 1.00410116, + "epoch": 0.5625240477106579, + "flos": 607139632128.0, + "grad_norm": 0.03089029411800112, + "language_loss": 0.82301855, + "learning_rate": 0.00042335246396280166, + "loss": 0.8334682, + "num_input_tokens_seen": 244036656, + "router_z_loss_mlp": 0.40869141, + "step": 2924, + "time_per_iteration": 2.7605555057525635 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045584, + "balance_loss_mlp": 1.00462389, + "epoch": 0.5627164293959215, + "flos": 451341043968.0, + "grad_norm": 0.04701230911743114, + "language_loss": 0.91272092, + "learning_rate": 0.0004230446188382693, + "loss": 0.92317677, + "num_input_tokens_seen": 244102704, + "router_z_loss_mlp": 0.40966797, + "step": 2925, + "time_per_iteration": 2.5571765899658203 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042518, + "balance_loss_mlp": 1.00158191, + "epoch": 0.5629088110811851, + "flos": 743437514496.0, + "grad_norm": 0.0349005963329915, + "language_loss": 0.81125653, + "learning_rate": 0.0004227368035904654, + "loss": 0.82168174, + "num_input_tokens_seen": 244186640, + "router_z_loss_mlp": 0.40942383, + "step": 2926, + "time_per_iteration": 3.0334270000457764 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043071, + "balance_loss_mlp": 1.00211096, + "epoch": 0.5631011927664487, + "flos": 497980010496.0, + "grad_norm": 0.0467260030557379, + "language_loss": 0.83361161, + "learning_rate": 0.00042242901833889474, + "loss": 0.84404236, + "num_input_tokens_seen": 244257680, + "router_z_loss_mlp": 0.40966797, + "step": 2927, + "time_per_iteration": 2.6271822452545166 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042546, + "balance_loss_mlp": 1.00153816, + "epoch": 0.5632935744517122, + "flos": 887595561216.0, + "grad_norm": 0.03653524957968277, + "language_loss": 0.8629514, + "learning_rate": 0.0004221212632030501, + "loss": 0.87337685, + "num_input_tokens_seen": 244331248, + "router_z_loss_mlp": 0.41015625, + "step": 2928, + "time_per_iteration": 3.1174416542053223 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046315, + "balance_loss_mlp": 1.00542605, + "epoch": 0.5634859561369757, + "flos": 605902186752.0, + "grad_norm": 0.04110669316721802, + "language_loss": 0.80746865, + "learning_rate": 0.0004218135383024124, + "loss": 0.81793177, + "num_input_tokens_seen": 244403920, + "router_z_loss_mlp": 0.40893555, + "step": 2929, + "time_per_iteration": 2.705615758895874 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01041411, + "balance_loss_mlp": 1.00056946, + "epoch": 0.5636783378222393, + "flos": 454903680768.0, + "grad_norm": 0.0339470495466753, + "language_loss": 0.85614669, + "learning_rate": 0.0004215058437564511, + "loss": 0.86656082, + "num_input_tokens_seen": 244470464, + "router_z_loss_mlp": 0.40844727, + "step": 2930, + "time_per_iteration": 2.5682146549224854 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01040814, + "balance_loss_mlp": 1.00006831, + "epoch": 0.5638707195075029, + "flos": 519462767616.0, + "grad_norm": 0.03372410984042782, + "language_loss": 0.82691574, + "learning_rate": 0.00042119817968462397, + "loss": 0.83732378, + "num_input_tokens_seen": 244536864, + "router_z_loss_mlp": 0.4074707, + "step": 2931, + "time_per_iteration": 2.6308341026306152 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105222, + "balance_loss_mlp": 1.01135468, + "epoch": 0.5640631011927665, + "flos": 565845110016.0, + "grad_norm": 0.03794773284405352, + "language_loss": 0.87544155, + "learning_rate": 0.0004208905462063766, + "loss": 0.88596374, + "num_input_tokens_seen": 244603344, + "router_z_loss_mlp": 0.40869141, + "step": 2932, + "time_per_iteration": 2.6615707874298096 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049556, + "balance_loss_mlp": 1.00866711, + "epoch": 0.56425548287803, + "flos": 518038684416.0, + "grad_norm": 0.03232798556838129, + "language_loss": 0.84722394, + "learning_rate": 0.00042058294344114315, + "loss": 0.85771948, + "num_input_tokens_seen": 244671984, + "router_z_loss_mlp": 0.40893555, + "step": 2933, + "time_per_iteration": 2.6182868480682373 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049116, + "balance_loss_mlp": 1.0083226, + "epoch": 0.5644478645632935, + "flos": 855670824192.0, + "grad_norm": 0.03170317888214056, + "language_loss": 0.78432804, + "learning_rate": 0.0004202753715083456, + "loss": 0.79481918, + "num_input_tokens_seen": 244754000, + "router_z_loss_mlp": 0.40795898, + "step": 2934, + "time_per_iteration": 3.0613481998443604 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045, + "balance_loss_mlp": 1.00420666, + "epoch": 0.5646402462485571, + "flos": 554496185856.0, + "grad_norm": 0.03929055225526713, + "language_loss": 0.81611717, + "learning_rate": 0.0004199678305273936, + "loss": 0.82656717, + "num_input_tokens_seen": 244820896, + "router_z_loss_mlp": 0.40795898, + "step": 2935, + "time_per_iteration": 2.634765386581421 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046418, + "balance_loss_mlp": 1.00552905, + "epoch": 0.5648326279338207, + "flos": 687312111360.0, + "grad_norm": 0.02956036273454178, + "language_loss": 0.8172124, + "learning_rate": 0.0004196603206176854, + "loss": 0.82767659, + "num_input_tokens_seen": 244904464, + "router_z_loss_mlp": 0.40893555, + "step": 2936, + "time_per_iteration": 2.9358084201812744 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048743, + "balance_loss_mlp": 1.00783014, + "epoch": 0.5650250096190843, + "flos": 804683785728.0, + "grad_norm": 0.03257366451462874, + "language_loss": 0.84142041, + "learning_rate": 0.000419352841898607, + "loss": 0.85190785, + "num_input_tokens_seen": 244983760, + "router_z_loss_mlp": 0.40917969, + "step": 2937, + "time_per_iteration": 2.9652152061462402 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049809, + "balance_loss_mlp": 1.00891984, + "epoch": 0.5652173913043478, + "flos": 583145045760.0, + "grad_norm": 0.037245032295536384, + "language_loss": 0.7792089, + "learning_rate": 0.000419045394489532, + "loss": 0.78970701, + "num_input_tokens_seen": 245053184, + "router_z_loss_mlp": 0.40893555, + "step": 2938, + "time_per_iteration": 2.6814448833465576 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048464, + "balance_loss_mlp": 1.00752795, + "epoch": 0.5654097729896114, + "flos": 822168413952.0, + "grad_norm": 0.03166469527574581, + "language_loss": 0.76863134, + "learning_rate": 0.0004187379785098224, + "loss": 0.77911597, + "num_input_tokens_seen": 245137408, + "router_z_loss_mlp": 0.40942383, + "step": 2939, + "time_per_iteration": 3.1437690258026123 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049709, + "balance_loss_mlp": 1.00881994, + "epoch": 0.565602154674875, + "flos": 785482478592.0, + "grad_norm": 0.035451368889273006, + "language_loss": 0.84531581, + "learning_rate": 0.00041843059407882744, + "loss": 0.85581291, + "num_input_tokens_seen": 245215504, + "router_z_loss_mlp": 0.40893555, + "step": 2940, + "time_per_iteration": 2.9561386108398438 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046361, + "balance_loss_mlp": 1.00554383, + "epoch": 0.5657945363601385, + "flos": 550744965888.0, + "grad_norm": 0.033205673863039784, + "language_loss": 0.83385015, + "learning_rate": 0.0004181232413158842, + "loss": 0.84431374, + "num_input_tokens_seen": 245286032, + "router_z_loss_mlp": 0.40820312, + "step": 2941, + "time_per_iteration": 2.6476027965545654 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047072, + "balance_loss_mlp": 1.0061357, + "epoch": 0.5659869180454021, + "flos": 669332698368.0, + "grad_norm": 0.03636978251075169, + "language_loss": 0.83073509, + "learning_rate": 0.0004178159203403179, + "loss": 0.84120584, + "num_input_tokens_seen": 245359040, + "router_z_loss_mlp": 0.40942383, + "step": 2942, + "time_per_iteration": 2.835840940475464 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049418, + "balance_loss_mlp": 1.00862479, + "epoch": 0.5661792997306656, + "flos": 500949686016.0, + "grad_norm": 0.030415094414242012, + "language_loss": 0.8213833, + "learning_rate": 0.0004175086312714409, + "loss": 0.83187747, + "num_input_tokens_seen": 245426384, + "router_z_loss_mlp": 0.40795898, + "step": 2943, + "time_per_iteration": 2.6258370876312256 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104981, + "balance_loss_mlp": 1.00911188, + "epoch": 0.5663716814159292, + "flos": 602363849472.0, + "grad_norm": 0.030374801338140925, + "language_loss": 0.84196591, + "learning_rate": 0.00041720137422855366, + "loss": 0.85246402, + "num_input_tokens_seen": 245501216, + "router_z_loss_mlp": 0.40698242, + "step": 2944, + "time_per_iteration": 2.753483772277832 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050206, + "balance_loss_mlp": 1.00948393, + "epoch": 0.5665640631011928, + "flos": 542033270784.0, + "grad_norm": 0.0327328941542846, + "language_loss": 0.79511452, + "learning_rate": 0.00041689414933094383, + "loss": 0.80561656, + "num_input_tokens_seen": 245571600, + "router_z_loss_mlp": 0.40722656, + "step": 2945, + "time_per_iteration": 2.6251614093780518 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047738, + "balance_loss_mlp": 1.00701642, + "epoch": 0.5667564447864564, + "flos": 603062768640.0, + "grad_norm": 0.03650681858880775, + "language_loss": 0.81631696, + "learning_rate": 0.00041658695669788653, + "loss": 0.82679439, + "num_input_tokens_seen": 245645632, + "router_z_loss_mlp": 0.40722656, + "step": 2946, + "time_per_iteration": 2.7196879386901855 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045027, + "balance_loss_mlp": 1.00432932, + "epoch": 0.5669488264717198, + "flos": 660723070464.0, + "grad_norm": 0.039783949444703086, + "language_loss": 0.82089484, + "learning_rate": 0.00041627979644864453, + "loss": 0.83134508, + "num_input_tokens_seen": 245715776, + "router_z_loss_mlp": 0.40698242, + "step": 2947, + "time_per_iteration": 2.8414080142974854 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043086, + "balance_loss_mlp": 1.00243521, + "epoch": 0.5671412081569834, + "flos": 486383210496.0, + "grad_norm": 0.029571262892964766, + "language_loss": 0.81883216, + "learning_rate": 0.0004159726687024683, + "loss": 0.82926297, + "num_input_tokens_seen": 245785328, + "router_z_loss_mlp": 0.40649414, + "step": 2948, + "time_per_iteration": 2.6365981101989746 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043624, + "balance_loss_mlp": 1.0029496, + "epoch": 0.567333589842247, + "flos": 731061115392.0, + "grad_norm": 0.03568675680792695, + "language_loss": 0.79577011, + "learning_rate": 0.00041566557357859506, + "loss": 0.80620635, + "num_input_tokens_seen": 245858000, + "router_z_loss_mlp": 0.40673828, + "step": 2949, + "time_per_iteration": 2.8660199642181396 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046952, + "balance_loss_mlp": 1.00618231, + "epoch": 0.5675259715275106, + "flos": 970559826432.0, + "grad_norm": 0.03148848509964497, + "language_loss": 0.79963183, + "learning_rate": 0.0004153585111962502, + "loss": 0.81010127, + "num_input_tokens_seen": 245950640, + "router_z_loss_mlp": 0.40771484, + "step": 2950, + "time_per_iteration": 3.284973382949829 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049042, + "balance_loss_mlp": 1.00824845, + "epoch": 0.5677183532127742, + "flos": 566214494976.0, + "grad_norm": 0.035222224981726044, + "language_loss": 0.84893769, + "learning_rate": 0.0004150514816746453, + "loss": 0.85942811, + "num_input_tokens_seen": 246019568, + "router_z_loss_mlp": 0.40795898, + "step": 2951, + "time_per_iteration": 2.688965082168579 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053398, + "balance_loss_mlp": 1.0126282, + "epoch": 0.5679107348980377, + "flos": 552746459136.0, + "grad_norm": 0.03211470229094595, + "language_loss": 0.86231828, + "learning_rate": 0.0004147444851329802, + "loss": 0.87285221, + "num_input_tokens_seen": 246089520, + "router_z_loss_mlp": 0.40771484, + "step": 2952, + "time_per_iteration": 2.654975175857544 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050037, + "balance_loss_mlp": 1.00929093, + "epoch": 0.5681031165833013, + "flos": 820841540352.0, + "grad_norm": 0.031520082579240216, + "language_loss": 0.86395264, + "learning_rate": 0.00041443752169044126, + "loss": 0.87445295, + "num_input_tokens_seen": 246165920, + "router_z_loss_mlp": 0.4074707, + "step": 2953, + "time_per_iteration": 2.9978690147399902 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044738, + "balance_loss_mlp": 1.00384951, + "epoch": 0.5682954982685648, + "flos": 619146646272.0, + "grad_norm": 0.031195671435834585, + "language_loss": 0.85214126, + "learning_rate": 0.0004141305914662025, + "loss": 0.86258864, + "num_input_tokens_seen": 246238672, + "router_z_loss_mlp": 0.40893555, + "step": 2954, + "time_per_iteration": 2.7177786827087402 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052939, + "balance_loss_mlp": 1.01214588, + "epoch": 0.5684878799538284, + "flos": 649252637184.0, + "grad_norm": 0.03230481359903608, + "language_loss": 0.81020069, + "learning_rate": 0.0004138236945794246, + "loss": 0.82073009, + "num_input_tokens_seen": 246320208, + "router_z_loss_mlp": 0.40795898, + "step": 2955, + "time_per_iteration": 2.8862104415893555 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051516, + "balance_loss_mlp": 1.01065099, + "epoch": 0.5686802616390919, + "flos": 807354062592.0, + "grad_norm": 0.038353041221636526, + "language_loss": 0.84374332, + "learning_rate": 0.00041351683114925576, + "loss": 0.85425854, + "num_input_tokens_seen": 246406464, + "router_z_loss_mlp": 0.40869141, + "step": 2956, + "time_per_iteration": 3.0500295162200928 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052157, + "balance_loss_mlp": 1.01126814, + "epoch": 0.5688726433243555, + "flos": 548176756224.0, + "grad_norm": 0.03189027766628176, + "language_loss": 0.87115657, + "learning_rate": 0.0004132100012948308, + "loss": 0.8816781, + "num_input_tokens_seen": 246477456, + "router_z_loss_mlp": 0.40893555, + "step": 2957, + "time_per_iteration": 2.6317861080169678 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104687, + "balance_loss_mlp": 1.00593376, + "epoch": 0.5690650250096191, + "flos": 487546778880.0, + "grad_norm": 0.03605588885155363, + "language_loss": 0.84833193, + "learning_rate": 0.00041290320513527145, + "loss": 0.85880065, + "num_input_tokens_seen": 246541744, + "router_z_loss_mlp": 0.40942383, + "step": 2958, + "time_per_iteration": 2.567070960998535 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010482, + "balance_loss_mlp": 1.00733471, + "epoch": 0.5692574066948827, + "flos": 578555900928.0, + "grad_norm": 0.030752617047449367, + "language_loss": 0.85344827, + "learning_rate": 0.0004125964427896867, + "loss": 0.86393028, + "num_input_tokens_seen": 246611440, + "router_z_loss_mlp": 0.40869141, + "step": 2959, + "time_per_iteration": 2.672534704208374 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047545, + "balance_loss_mlp": 1.00663245, + "epoch": 0.5694497883801463, + "flos": 455220576000.0, + "grad_norm": 0.04229544295686443, + "language_loss": 0.79680836, + "learning_rate": 0.0004122897143771723, + "loss": 0.80728376, + "num_input_tokens_seen": 246676496, + "router_z_loss_mlp": 0.40917969, + "step": 2960, + "time_per_iteration": 2.545262575149536 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046303, + "balance_loss_mlp": 1.00534308, + "epoch": 0.5696421700654097, + "flos": 560583290880.0, + "grad_norm": 0.03127363894209499, + "language_loss": 0.82077289, + "learning_rate": 0.0004119830200168109, + "loss": 0.83123589, + "num_input_tokens_seen": 246746464, + "router_z_loss_mlp": 0.40966797, + "step": 2961, + "time_per_iteration": 2.663581609725952 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045878, + "balance_loss_mlp": 1.00510836, + "epoch": 0.5698345517506733, + "flos": 466502426112.0, + "grad_norm": 0.0350478630821908, + "language_loss": 0.89062726, + "learning_rate": 0.0004116763598276714, + "loss": 0.90108603, + "num_input_tokens_seen": 246811808, + "router_z_loss_mlp": 0.40771484, + "step": 2962, + "time_per_iteration": 2.521552801132202 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047207, + "balance_loss_mlp": 1.00641382, + "epoch": 0.5700269334359369, + "flos": 607192121856.0, + "grad_norm": 0.031424704719117534, + "language_loss": 0.81706619, + "learning_rate": 0.00041136973392881017, + "loss": 0.82753831, + "num_input_tokens_seen": 246890432, + "router_z_loss_mlp": 0.40795898, + "step": 2963, + "time_per_iteration": 2.91904878616333 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043706, + "balance_loss_mlp": 1.00296056, + "epoch": 0.5702193151212005, + "flos": 563857222656.0, + "grad_norm": 0.03326860309508315, + "language_loss": 0.82831907, + "learning_rate": 0.00041106314243926983, + "loss": 0.83875614, + "num_input_tokens_seen": 246959616, + "router_z_loss_mlp": 0.4074707, + "step": 2964, + "time_per_iteration": 2.7399420738220215 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044055, + "balance_loss_mlp": 1.00340486, + "epoch": 0.570411696806464, + "flos": 524310481920.0, + "grad_norm": 0.03332690132244082, + "language_loss": 0.8800739, + "learning_rate": 0.0004107565854780798, + "loss": 0.89051443, + "num_input_tokens_seen": 247030656, + "router_z_loss_mlp": 0.40649414, + "step": 2965, + "time_per_iteration": 2.6200034618377686 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046373, + "balance_loss_mlp": 1.00565064, + "epoch": 0.5706040784917276, + "flos": 719473063680.0, + "grad_norm": 0.03436086388372073, + "language_loss": 0.81524932, + "learning_rate": 0.000410450063164256, + "loss": 0.82571304, + "num_input_tokens_seen": 247105872, + "router_z_loss_mlp": 0.40722656, + "step": 2966, + "time_per_iteration": 2.8336212635040283 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048641, + "balance_loss_mlp": 1.00787103, + "epoch": 0.5707964601769911, + "flos": 477671515392.0, + "grad_norm": 0.03782244517116874, + "language_loss": 0.82540762, + "learning_rate": 0.00041014357561680115, + "loss": 0.83589399, + "num_input_tokens_seen": 247170448, + "router_z_loss_mlp": 0.40771484, + "step": 2967, + "time_per_iteration": 2.5143654346466064 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047818, + "balance_loss_mlp": 1.00714386, + "epoch": 0.5709888418622547, + "flos": 581217429504.0, + "grad_norm": 0.030421169355448613, + "language_loss": 0.86193347, + "learning_rate": 0.0004098371229547039, + "loss": 0.87241161, + "num_input_tokens_seen": 247240400, + "router_z_loss_mlp": 0.40673828, + "step": 2968, + "time_per_iteration": 2.6610617637634277 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01057983, + "balance_loss_mlp": 1.01869202, + "epoch": 0.5711812235475183, + "flos": 1583195536128.0, + "grad_norm": 0.0076189717983582966, + "language_loss": 0.80010808, + "learning_rate": 0.0004095307052969399, + "loss": 0.8106879, + "num_input_tokens_seen": 247469136, + "router_z_loss_mlp": 0.39257812, + "step": 2969, + "time_per_iteration": 4.76263952255249 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048674, + "balance_loss_mlp": 1.00790465, + "epoch": 0.5713736052327818, + "flos": 469498346496.0, + "grad_norm": 0.03484927048715074, + "language_loss": 0.80634308, + "learning_rate": 0.00040922432276247107, + "loss": 0.81682986, + "num_input_tokens_seen": 247537712, + "router_z_loss_mlp": 0.40771484, + "step": 2970, + "time_per_iteration": 2.5514628887176514 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046224, + "balance_loss_mlp": 1.0054065, + "epoch": 0.5715659869180454, + "flos": 538755448320.0, + "grad_norm": 0.029079861926461517, + "language_loss": 0.84918243, + "learning_rate": 0.0004089179754702457, + "loss": 0.85964465, + "num_input_tokens_seen": 247613872, + "router_z_loss_mlp": 0.40820312, + "step": 2971, + "time_per_iteration": 2.749539613723755 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044804, + "balance_loss_mlp": 1.00396252, + "epoch": 0.571758368603309, + "flos": 657251807232.0, + "grad_norm": 0.03418066993480882, + "language_loss": 0.80556142, + "learning_rate": 0.00040861166353919843, + "loss": 0.81600946, + "num_input_tokens_seen": 247686064, + "router_z_loss_mlp": 0.40844727, + "step": 2972, + "time_per_iteration": 2.814680814743042 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052102, + "balance_loss_mlp": 1.011356, + "epoch": 0.5719507502885726, + "flos": 669100373760.0, + "grad_norm": 0.031053974574008693, + "language_loss": 0.82602715, + "learning_rate": 0.00040830538708824983, + "loss": 0.83654815, + "num_input_tokens_seen": 247760384, + "router_z_loss_mlp": 0.4074707, + "step": 2973, + "time_per_iteration": 2.904085636138916 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050783, + "balance_loss_mlp": 1.01018071, + "epoch": 0.572143131973836, + "flos": 477280743168.0, + "grad_norm": 0.03419925971016847, + "language_loss": 0.82092619, + "learning_rate": 0.000407999146236307, + "loss": 0.83143401, + "num_input_tokens_seen": 247824768, + "router_z_loss_mlp": 0.40600586, + "step": 2974, + "time_per_iteration": 2.549262046813965 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051203, + "balance_loss_mlp": 1.01062381, + "epoch": 0.5723355136590996, + "flos": 540535310592.0, + "grad_norm": 0.03597856382327793, + "language_loss": 0.83747095, + "learning_rate": 0.0004076929411022634, + "loss": 0.847983, + "num_input_tokens_seen": 247894448, + "router_z_loss_mlp": 0.40576172, + "step": 2975, + "time_per_iteration": 2.602869987487793 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053058, + "balance_loss_mlp": 1.01235974, + "epoch": 0.5725278953443632, + "flos": 825650370816.0, + "grad_norm": 0.037415312483521146, + "language_loss": 0.8006742, + "learning_rate": 0.0004073867718049982, + "loss": 0.81120479, + "num_input_tokens_seen": 247976432, + "router_z_loss_mlp": 0.40698242, + "step": 2976, + "time_per_iteration": 3.139498472213745 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050411, + "balance_loss_mlp": 1.00966477, + "epoch": 0.5727202770296268, + "flos": 588570170112.0, + "grad_norm": 0.037681082671355684, + "language_loss": 0.83124882, + "learning_rate": 0.00040708063846337704, + "loss": 0.84175301, + "num_input_tokens_seen": 248048800, + "router_z_loss_mlp": 0.4074707, + "step": 2977, + "time_per_iteration": 2.7134242057800293 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050315, + "balance_loss_mlp": 1.00937819, + "epoch": 0.5729126587148904, + "flos": 447941712384.0, + "grad_norm": 0.03249864108633733, + "language_loss": 0.81268066, + "learning_rate": 0.00040677454119625143, + "loss": 0.82318383, + "num_input_tokens_seen": 248116496, + "router_z_loss_mlp": 0.40942383, + "step": 2978, + "time_per_iteration": 2.5775671005249023 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049596, + "balance_loss_mlp": 1.00870752, + "epoch": 0.5731050404001539, + "flos": 520467888384.0, + "grad_norm": 0.034012599703189976, + "language_loss": 0.83670664, + "learning_rate": 0.0004064684801224587, + "loss": 0.84720254, + "num_input_tokens_seen": 248184960, + "router_z_loss_mlp": 0.40893555, + "step": 2979, + "time_per_iteration": 2.6424074172973633 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047576, + "balance_loss_mlp": 1.00675905, + "epoch": 0.5732974220854175, + "flos": 505771155456.0, + "grad_norm": 0.032486782592384814, + "language_loss": 0.80872238, + "learning_rate": 0.00040616245536082224, + "loss": 0.81919813, + "num_input_tokens_seen": 248252208, + "router_z_loss_mlp": 0.40820312, + "step": 2980, + "time_per_iteration": 2.57401704788208 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050822, + "balance_loss_mlp": 1.01000464, + "epoch": 0.573489803770681, + "flos": 593678399232.0, + "grad_norm": 0.028956426653120197, + "language_loss": 0.82143462, + "learning_rate": 0.00040585646703015165, + "loss": 0.8319428, + "num_input_tokens_seen": 248333312, + "router_z_loss_mlp": 0.40820312, + "step": 2981, + "time_per_iteration": 2.828683614730835 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050941, + "balance_loss_mlp": 1.01010036, + "epoch": 0.5736821854559446, + "flos": 490870288128.0, + "grad_norm": 0.04412597729133787, + "language_loss": 0.78605878, + "learning_rate": 0.0004055505152492419, + "loss": 0.79656816, + "num_input_tokens_seen": 248403808, + "router_z_loss_mlp": 0.40844727, + "step": 2982, + "time_per_iteration": 2.640928268432617 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048392, + "balance_loss_mlp": 1.00747919, + "epoch": 0.5738745671412081, + "flos": 459202175232.0, + "grad_norm": 0.034256342510568284, + "language_loss": 0.74769032, + "learning_rate": 0.00040524460013687425, + "loss": 0.7581743, + "num_input_tokens_seen": 248477184, + "router_z_loss_mlp": 0.40917969, + "step": 2983, + "time_per_iteration": 2.7067794799804688 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105312, + "balance_loss_mlp": 1.0123024, + "epoch": 0.5740669488264717, + "flos": 581621807616.0, + "grad_norm": 0.029467935021435916, + "language_loss": 0.81554836, + "learning_rate": 0.0004049387218118155, + "loss": 0.82607955, + "num_input_tokens_seen": 248565552, + "router_z_loss_mlp": 0.40820312, + "step": 2984, + "time_per_iteration": 2.9581944942474365 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045523, + "balance_loss_mlp": 1.00468242, + "epoch": 0.5742593305117353, + "flos": 525574172160.0, + "grad_norm": 0.03631391131249333, + "language_loss": 0.85729742, + "learning_rate": 0.00040463288039281777, + "loss": 0.86775261, + "num_input_tokens_seen": 248635456, + "router_z_loss_mlp": 0.40844727, + "step": 2985, + "time_per_iteration": 2.7224113941192627 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01056683, + "balance_loss_mlp": 1.01729584, + "epoch": 0.5744517121969989, + "flos": 1557269442816.0, + "grad_norm": 0.010841110534864203, + "language_loss": 0.77876419, + "learning_rate": 0.0004043270759986194, + "loss": 0.78933102, + "num_input_tokens_seen": 248870160, + "router_z_loss_mlp": 0.39355469, + "step": 2986, + "time_per_iteration": 5.064981698989868 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051641, + "balance_loss_mlp": 1.01089525, + "epoch": 0.5746440938822625, + "flos": 753203907840.0, + "grad_norm": 0.045288596232844924, + "language_loss": 0.82885808, + "learning_rate": 0.0004040213087479444, + "loss": 0.83937448, + "num_input_tokens_seen": 248946960, + "router_z_loss_mlp": 0.4074707, + "step": 2987, + "time_per_iteration": 2.98020601272583 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043481, + "balance_loss_mlp": 1.00266409, + "epoch": 0.5748364755675259, + "flos": 502857860352.0, + "grad_norm": 0.036149920431262125, + "language_loss": 0.85748988, + "learning_rate": 0.0004037155787595018, + "loss": 0.86792469, + "num_input_tokens_seen": 249014128, + "router_z_loss_mlp": 0.40820312, + "step": 2988, + "time_per_iteration": 2.5745627880096436 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051033, + "balance_loss_mlp": 1.01026356, + "epoch": 0.5750288572527895, + "flos": 505198603008.0, + "grad_norm": 0.03371383384616788, + "language_loss": 0.81460357, + "learning_rate": 0.000403409886151987, + "loss": 0.82511389, + "num_input_tokens_seen": 249090016, + "router_z_loss_mlp": 0.40771484, + "step": 2989, + "time_per_iteration": 2.9434561729431152 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045067, + "balance_loss_mlp": 1.00558472, + "epoch": 0.5752212389380531, + "flos": 1544678215680.0, + "grad_norm": 0.006920775411585041, + "language_loss": 0.81999105, + "learning_rate": 0.0004031042310440799, + "loss": 0.83044171, + "num_input_tokens_seen": 249305552, + "router_z_loss_mlp": 0.39453125, + "step": 2990, + "time_per_iteration": 4.784885406494141 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104316, + "balance_loss_mlp": 1.00367737, + "epoch": 0.5754136206233167, + "flos": 1570674295296.0, + "grad_norm": 0.003743957088283973, + "language_loss": 0.781986, + "learning_rate": 0.00040279861355444656, + "loss": 0.79241765, + "num_input_tokens_seen": 249523408, + "router_z_loss_mlp": 0.39453125, + "step": 2991, + "time_per_iteration": 4.776461362838745 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049264, + "balance_loss_mlp": 1.00842321, + "epoch": 0.5756060023085803, + "flos": 799562917632.0, + "grad_norm": 0.03045005809397815, + "language_loss": 0.77561808, + "learning_rate": 0.00040249303380173807, + "loss": 0.78611076, + "num_input_tokens_seen": 249616624, + "router_z_loss_mlp": 0.40844727, + "step": 2992, + "time_per_iteration": 3.0843074321746826 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104555, + "balance_loss_mlp": 1.00451803, + "epoch": 0.5757983839938438, + "flos": 589034819328.0, + "grad_norm": 0.034529184723129894, + "language_loss": 0.79738832, + "learning_rate": 0.00040218749190459126, + "loss": 0.8078438, + "num_input_tokens_seen": 249689936, + "router_z_loss_mlp": 0.41040039, + "step": 2993, + "time_per_iteration": 2.7403366565704346 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045195, + "balance_loss_mlp": 1.00428283, + "epoch": 0.5759907656791073, + "flos": 517852046592.0, + "grad_norm": 0.035278528612120996, + "language_loss": 0.82955313, + "learning_rate": 0.00040188198798162775, + "loss": 0.84000504, + "num_input_tokens_seen": 249759984, + "router_z_loss_mlp": 0.40917969, + "step": 2994, + "time_per_iteration": 2.6673707962036133 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048075, + "balance_loss_mlp": 1.00718617, + "epoch": 0.5761831473643709, + "flos": 588290213376.0, + "grad_norm": 0.029287821677584636, + "language_loss": 0.85980493, + "learning_rate": 0.000401576522151455, + "loss": 0.87028569, + "num_input_tokens_seen": 249837888, + "router_z_loss_mlp": 0.40893555, + "step": 2995, + "time_per_iteration": 2.788686513900757 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049047, + "balance_loss_mlp": 1.00815868, + "epoch": 0.5763755290496345, + "flos": 545009749248.0, + "grad_norm": 0.03018415670660867, + "language_loss": 0.8281709, + "learning_rate": 0.0004012710945326651, + "loss": 0.83866143, + "num_input_tokens_seen": 249913584, + "router_z_loss_mlp": 0.40893555, + "step": 2996, + "time_per_iteration": 2.7784581184387207 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047721, + "balance_loss_mlp": 1.00685585, + "epoch": 0.576567910734898, + "flos": 627428685312.0, + "grad_norm": 0.030965553916741433, + "language_loss": 0.81781155, + "learning_rate": 0.0004009657052438355, + "loss": 0.82828873, + "num_input_tokens_seen": 249992144, + "router_z_loss_mlp": 0.40869141, + "step": 2997, + "time_per_iteration": 2.787832498550415 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046757, + "balance_loss_mlp": 1.00593948, + "epoch": 0.5767602924201616, + "flos": 539278423296.0, + "grad_norm": 0.0362963808148575, + "language_loss": 0.86264056, + "learning_rate": 0.00040066035440352904, + "loss": 0.87310815, + "num_input_tokens_seen": 250060736, + "router_z_loss_mlp": 0.40820312, + "step": 2998, + "time_per_iteration": 2.6896724700927734 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045353, + "balance_loss_mlp": 1.00558472, + "epoch": 0.5769526741054252, + "flos": 1563026046720.0, + "grad_norm": 0.005169215201186531, + "language_loss": 0.79293132, + "learning_rate": 0.0004003550421302934, + "loss": 0.8033849, + "num_input_tokens_seen": 250296864, + "router_z_loss_mlp": 0.39746094, + "step": 2999, + "time_per_iteration": 4.891216039657593 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043976, + "balance_loss_mlp": 1.00318265, + "epoch": 0.5771450557906888, + "flos": 469172702976.0, + "grad_norm": 0.037596514401195116, + "language_loss": 0.7668246, + "learning_rate": 0.00040004976854266145, + "loss": 0.77726436, + "num_input_tokens_seen": 250362528, + "router_z_loss_mlp": 0.40795898, + "step": 3000, + "time_per_iteration": 2.51895809173584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045702, + "balance_loss_mlp": 1.00478971, + "epoch": 0.5773374374759523, + "flos": 575633857536.0, + "grad_norm": 0.03248080927364981, + "language_loss": 0.81750363, + "learning_rate": 0.0003997445337591505, + "loss": 0.82796073, + "num_input_tokens_seen": 250432768, + "router_z_loss_mlp": 0.40917969, + "step": 3001, + "time_per_iteration": 2.692239999771118 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048668, + "balance_loss_mlp": 1.0079695, + "epoch": 0.5775298191612158, + "flos": 529505227008.0, + "grad_norm": 0.031913043384180086, + "language_loss": 0.74606609, + "learning_rate": 0.0003994393378982635, + "loss": 0.75655282, + "num_input_tokens_seen": 250501504, + "router_z_loss_mlp": 0.40698242, + "step": 3002, + "time_per_iteration": 2.665146589279175 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053272, + "balance_loss_mlp": 1.01369476, + "epoch": 0.5777222008464794, + "flos": 1306899095808.0, + "grad_norm": 0.010106387724362367, + "language_loss": 0.79538, + "learning_rate": 0.00039913418107848786, + "loss": 0.80591273, + "num_input_tokens_seen": 250733632, + "router_z_loss_mlp": 0.39550781, + "step": 3003, + "time_per_iteration": 4.803764581680298 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104489, + "balance_loss_mlp": 1.00409698, + "epoch": 0.577914582531743, + "flos": 604793053440.0, + "grad_norm": 0.0386937293491606, + "language_loss": 0.88557941, + "learning_rate": 0.0003988290634182961, + "loss": 0.89602828, + "num_input_tokens_seen": 250809152, + "router_z_loss_mlp": 0.40795898, + "step": 3004, + "time_per_iteration": 2.7506465911865234 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050043, + "balance_loss_mlp": 1.00943995, + "epoch": 0.5781069642170066, + "flos": 487833538560.0, + "grad_norm": 0.034765884683499934, + "language_loss": 0.81038988, + "learning_rate": 0.0003985239850361453, + "loss": 0.82089031, + "num_input_tokens_seen": 250879152, + "router_z_loss_mlp": 0.40600586, + "step": 3005, + "time_per_iteration": 2.5988621711730957 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047258, + "balance_loss_mlp": 1.00653589, + "epoch": 0.5782993459022701, + "flos": 507414924288.0, + "grad_norm": 0.036479253397917216, + "language_loss": 0.85073388, + "learning_rate": 0.0003982189460504777, + "loss": 0.86120641, + "num_input_tokens_seen": 250949904, + "router_z_loss_mlp": 0.40722656, + "step": 3006, + "time_per_iteration": 2.694517135620117 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104981, + "balance_loss_mlp": 1.00913548, + "epoch": 0.5784917275875336, + "flos": 603295093248.0, + "grad_norm": 0.03899121610040523, + "language_loss": 0.79739761, + "learning_rate": 0.00039791394657971935, + "loss": 0.80789566, + "num_input_tokens_seen": 251020976, + "router_z_loss_mlp": 0.40673828, + "step": 3007, + "time_per_iteration": 2.694913387298584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044533, + "balance_loss_mlp": 1.00376368, + "epoch": 0.5786841092727972, + "flos": 522588945408.0, + "grad_norm": 0.03653808704233678, + "language_loss": 0.84952617, + "learning_rate": 0.00039760898674228205, + "loss": 0.85997152, + "num_input_tokens_seen": 251093280, + "router_z_loss_mlp": 0.40771484, + "step": 3008, + "time_per_iteration": 2.6486122608184814 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045443, + "balance_loss_mlp": 1.00476897, + "epoch": 0.5788764909580608, + "flos": 768836742144.0, + "grad_norm": 0.02798603221606654, + "language_loss": 0.81355041, + "learning_rate": 0.0003973040666565613, + "loss": 0.82400489, + "num_input_tokens_seen": 251181376, + "router_z_loss_mlp": 0.40673828, + "step": 3009, + "time_per_iteration": 3.029721975326538 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046651, + "balance_loss_mlp": 1.00590491, + "epoch": 0.5790688726433244, + "flos": 600332220672.0, + "grad_norm": 0.03710521046969438, + "language_loss": 0.82796824, + "learning_rate": 0.000396999186440938, + "loss": 0.8384347, + "num_input_tokens_seen": 251256176, + "router_z_loss_mlp": 0.4074707, + "step": 3010, + "time_per_iteration": 2.866637945175171 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048008, + "balance_loss_mlp": 1.00711966, + "epoch": 0.5792612543285879, + "flos": 524106347520.0, + "grad_norm": 0.03822457095680595, + "language_loss": 0.85752803, + "learning_rate": 0.000396694346213777, + "loss": 0.86800808, + "num_input_tokens_seen": 251325344, + "router_z_loss_mlp": 0.40893555, + "step": 3011, + "time_per_iteration": 2.6125171184539795 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045119, + "balance_loss_mlp": 1.00430202, + "epoch": 0.5794536360138515, + "flos": 878080934400.0, + "grad_norm": 0.030461633114119882, + "language_loss": 0.8396455, + "learning_rate": 0.0003963895460934276, + "loss": 0.8500967, + "num_input_tokens_seen": 251406656, + "router_z_loss_mlp": 0.40820312, + "step": 3012, + "time_per_iteration": 3.1341123580932617 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047321, + "balance_loss_mlp": 1.00631309, + "epoch": 0.5796460176991151, + "flos": 402299118336.0, + "grad_norm": 0.04162907217084141, + "language_loss": 0.85323715, + "learning_rate": 0.00039608478619822376, + "loss": 0.86371034, + "num_input_tokens_seen": 251467760, + "router_z_loss_mlp": 0.41015625, + "step": 3013, + "time_per_iteration": 2.45570969581604 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045447, + "balance_loss_mlp": 1.00448704, + "epoch": 0.5798383993843786, + "flos": 619676424192.0, + "grad_norm": 0.02973237056850944, + "language_loss": 0.8328954, + "learning_rate": 0.00039578006664648394, + "loss": 0.84334981, + "num_input_tokens_seen": 251542272, + "router_z_loss_mlp": 0.40966797, + "step": 3014, + "time_per_iteration": 2.796370506286621 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044429, + "balance_loss_mlp": 1.00351644, + "epoch": 0.5800307810696421, + "flos": 845793615360.0, + "grad_norm": 0.037256106488294125, + "language_loss": 0.81995672, + "learning_rate": 0.0003954753875565105, + "loss": 0.83040106, + "num_input_tokens_seen": 251625584, + "router_z_loss_mlp": 0.40917969, + "step": 3015, + "time_per_iteration": 3.0796241760253906 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045336, + "balance_loss_mlp": 1.00442326, + "epoch": 0.5802231627549057, + "flos": 570365235456.0, + "grad_norm": 0.0302253929683373, + "language_loss": 0.82961631, + "learning_rate": 0.00039517074904659057, + "loss": 0.84006965, + "num_input_tokens_seen": 251696704, + "router_z_loss_mlp": 0.40917969, + "step": 3016, + "time_per_iteration": 2.6984057426452637 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105015, + "balance_loss_mlp": 1.00921345, + "epoch": 0.5804155444401693, + "flos": 661663062528.0, + "grad_norm": 0.033398230079863866, + "language_loss": 0.85268873, + "learning_rate": 0.00039486615123499535, + "loss": 0.86319029, + "num_input_tokens_seen": 251774784, + "router_z_loss_mlp": 0.40942383, + "step": 3017, + "time_per_iteration": 2.8348796367645264 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051377, + "balance_loss_mlp": 1.01022601, + "epoch": 0.5806079261254329, + "flos": 515058315264.0, + "grad_norm": 0.030637451118741787, + "language_loss": 0.85653043, + "learning_rate": 0.00039456159423997996, + "loss": 0.86704421, + "num_input_tokens_seen": 251844768, + "router_z_loss_mlp": 0.41162109, + "step": 3018, + "time_per_iteration": 2.6296215057373047 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048366, + "balance_loss_mlp": 1.00740576, + "epoch": 0.5808003078106965, + "flos": 529718109696.0, + "grad_norm": 0.03062870911456177, + "language_loss": 0.90210342, + "learning_rate": 0.00039425707817978406, + "loss": 0.91258705, + "num_input_tokens_seen": 251912736, + "router_z_loss_mlp": 0.40966797, + "step": 3019, + "time_per_iteration": 2.631979465484619 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048286, + "balance_loss_mlp": 1.00720644, + "epoch": 0.58099268949596, + "flos": 477997158912.0, + "grad_norm": 0.03679030272618613, + "language_loss": 0.84110886, + "learning_rate": 0.00039395260317263124, + "loss": 0.85159171, + "num_input_tokens_seen": 251979328, + "router_z_loss_mlp": 0.41088867, + "step": 3020, + "time_per_iteration": 2.584413528442383 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050357, + "balance_loss_mlp": 1.00930095, + "epoch": 0.5811850711812235, + "flos": 518688026112.0, + "grad_norm": 0.03473628129951431, + "language_loss": 0.85378569, + "learning_rate": 0.0003936481693367291, + "loss": 0.86428928, + "num_input_tokens_seen": 252050928, + "router_z_loss_mlp": 0.41064453, + "step": 3021, + "time_per_iteration": 2.6612508296966553 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049465, + "balance_loss_mlp": 1.00833774, + "epoch": 0.5813774528664871, + "flos": 617627298816.0, + "grad_norm": 0.037803518868136904, + "language_loss": 0.88371962, + "learning_rate": 0.0003933437767902697, + "loss": 0.89421427, + "num_input_tokens_seen": 252126496, + "router_z_loss_mlp": 0.41137695, + "step": 3022, + "time_per_iteration": 2.7910103797912598 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045958, + "balance_loss_mlp": 1.00499725, + "epoch": 0.5815698345517507, + "flos": 568604815104.0, + "grad_norm": 0.03314052138705104, + "language_loss": 0.78534555, + "learning_rate": 0.00039303942565142825, + "loss": 0.7958051, + "num_input_tokens_seen": 252203008, + "router_z_loss_mlp": 0.40966797, + "step": 3023, + "time_per_iteration": 2.7066261768341064 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046075, + "balance_loss_mlp": 1.00525796, + "epoch": 0.5817622162370142, + "flos": 564304375296.0, + "grad_norm": 0.034500169077956666, + "language_loss": 0.76946682, + "learning_rate": 0.0003927351160383644, + "loss": 0.77992761, + "num_input_tokens_seen": 252283440, + "router_z_loss_mlp": 0.40820312, + "step": 3024, + "time_per_iteration": 2.785215377807617 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044544, + "balance_loss_mlp": 1.00370252, + "epoch": 0.5819545979222778, + "flos": 460154806272.0, + "grad_norm": 0.03482271460519531, + "language_loss": 0.78468955, + "learning_rate": 0.000392430848069222, + "loss": 0.79513502, + "num_input_tokens_seen": 252351760, + "router_z_loss_mlp": 0.40844727, + "step": 3025, + "time_per_iteration": 2.5435454845428467 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104326, + "balance_loss_mlp": 1.00244236, + "epoch": 0.5821469796075414, + "flos": 542517361920.0, + "grad_norm": 0.03539348008973476, + "language_loss": 0.83090204, + "learning_rate": 0.00039212662186212795, + "loss": 0.8413347, + "num_input_tokens_seen": 252418480, + "router_z_loss_mlp": 0.40820312, + "step": 3026, + "time_per_iteration": 2.6203463077545166 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046114, + "balance_loss_mlp": 1.00534403, + "epoch": 0.582339361292805, + "flos": 553341365760.0, + "grad_norm": 0.030591419392928903, + "language_loss": 0.77452922, + "learning_rate": 0.0003918224375351934, + "loss": 0.78499031, + "num_input_tokens_seen": 252493712, + "router_z_loss_mlp": 0.40771484, + "step": 3027, + "time_per_iteration": 2.700643301010132 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047214, + "balance_loss_mlp": 1.00646877, + "epoch": 0.5825317429780685, + "flos": 497448287232.0, + "grad_norm": 0.03355698207676345, + "language_loss": 0.79253477, + "learning_rate": 0.0003915182952065135, + "loss": 0.80300689, + "num_input_tokens_seen": 252566096, + "router_z_loss_mlp": 0.4074707, + "step": 3028, + "time_per_iteration": 2.693223714828491 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043151, + "balance_loss_mlp": 1.00247645, + "epoch": 0.582724124663332, + "flos": 565255060992.0, + "grad_norm": 0.03374091506860629, + "language_loss": 0.88055015, + "learning_rate": 0.0003912141949941664, + "loss": 0.89098167, + "num_input_tokens_seen": 252639424, + "router_z_loss_mlp": 0.40673828, + "step": 3029, + "time_per_iteration": 2.674584150314331 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043287, + "balance_loss_mlp": 1.00249338, + "epoch": 0.5829165063485956, + "flos": 493112854272.0, + "grad_norm": 0.039605660090179254, + "language_loss": 0.83319384, + "learning_rate": 0.0003909101370162143, + "loss": 0.84362668, + "num_input_tokens_seen": 252706672, + "router_z_loss_mlp": 0.40795898, + "step": 3030, + "time_per_iteration": 2.592111587524414 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046574, + "balance_loss_mlp": 1.00718689, + "epoch": 0.5831088880338592, + "flos": 1531879941888.0, + "grad_norm": 0.006346134957791291, + "language_loss": 0.72433889, + "learning_rate": 0.00039060612139070326, + "loss": 0.73480463, + "num_input_tokens_seen": 252932464, + "router_z_loss_mlp": 0.39355469, + "step": 3031, + "time_per_iteration": 4.929339170455933 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047591, + "balance_loss_mlp": 1.00686908, + "epoch": 0.5833012697191228, + "flos": 619209829632.0, + "grad_norm": 0.03163493287885039, + "language_loss": 0.83241516, + "learning_rate": 0.0003903021482356622, + "loss": 0.8428911, + "num_input_tokens_seen": 253011920, + "router_z_loss_mlp": 0.40722656, + "step": 3032, + "time_per_iteration": 2.7828269004821777 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045762, + "balance_loss_mlp": 1.00508761, + "epoch": 0.5834936514043862, + "flos": 769294588416.0, + "grad_norm": 0.028764675594544035, + "language_loss": 0.83318806, + "learning_rate": 0.00038999821766910465, + "loss": 0.84364575, + "num_input_tokens_seen": 253091552, + "router_z_loss_mlp": 0.40673828, + "step": 3033, + "time_per_iteration": 2.976440906524658 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046889, + "balance_loss_mlp": 1.00616705, + "epoch": 0.5836860330896498, + "flos": 459316881408.0, + "grad_norm": 0.03570453873198092, + "language_loss": 0.86074644, + "learning_rate": 0.00038969432980902606, + "loss": 0.87121534, + "num_input_tokens_seen": 253158608, + "router_z_loss_mlp": 0.40722656, + "step": 3034, + "time_per_iteration": 2.5605523586273193 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049232, + "balance_loss_mlp": 1.00975037, + "epoch": 0.5838784147749134, + "flos": 1364198760960.0, + "grad_norm": 0.006741388763220325, + "language_loss": 0.79784501, + "learning_rate": 0.0003893904847734068, + "loss": 0.80833733, + "num_input_tokens_seen": 253381184, + "router_z_loss_mlp": 0.39453125, + "step": 3035, + "time_per_iteration": 4.870011329650879 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046432, + "balance_loss_mlp": 1.00566232, + "epoch": 0.584070796460177, + "flos": 568289865216.0, + "grad_norm": 0.0320953374409888, + "language_loss": 0.82746142, + "learning_rate": 0.00038908668268020953, + "loss": 0.83792579, + "num_input_tokens_seen": 253452880, + "router_z_loss_mlp": 0.40771484, + "step": 3036, + "time_per_iteration": 2.6482043266296387 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046786, + "balance_loss_mlp": 1.00582528, + "epoch": 0.5842631781454406, + "flos": 612666823680.0, + "grad_norm": 0.032158289179941596, + "language_loss": 0.85682309, + "learning_rate": 0.00038878292364738097, + "loss": 0.86729091, + "num_input_tokens_seen": 253530000, + "router_z_loss_mlp": 0.40966797, + "step": 3037, + "time_per_iteration": 2.7571158409118652 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104852, + "balance_loss_mlp": 1.00758314, + "epoch": 0.5844555598307041, + "flos": 464333736960.0, + "grad_norm": 0.037716829310632, + "language_loss": 0.87422657, + "learning_rate": 0.0003884792077928508, + "loss": 0.88471174, + "num_input_tokens_seen": 253593504, + "router_z_loss_mlp": 0.40942383, + "step": 3038, + "time_per_iteration": 2.5060815811157227 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046212, + "balance_loss_mlp": 1.00522828, + "epoch": 0.5846479415159677, + "flos": 411058445568.0, + "grad_norm": 0.036592459093467214, + "language_loss": 0.77285695, + "learning_rate": 0.0003881755352345322, + "loss": 0.78331912, + "num_input_tokens_seen": 253657904, + "router_z_loss_mlp": 0.40991211, + "step": 3039, + "time_per_iteration": 2.558833360671997 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049516, + "balance_loss_mlp": 1.0084126, + "epoch": 0.5848403232012312, + "flos": 492266181120.0, + "grad_norm": 0.028436591435814704, + "language_loss": 0.87703776, + "learning_rate": 0.0003878719060903207, + "loss": 0.88753295, + "num_input_tokens_seen": 253725280, + "router_z_loss_mlp": 0.41113281, + "step": 3040, + "time_per_iteration": 2.563680410385132 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048574, + "balance_loss_mlp": 1.0073278, + "epoch": 0.5850327048864948, + "flos": 585509121024.0, + "grad_norm": 0.03942000109029475, + "language_loss": 0.8397156, + "learning_rate": 0.0003875683204780961, + "loss": 0.85020131, + "num_input_tokens_seen": 253795040, + "router_z_loss_mlp": 0.41259766, + "step": 3041, + "time_per_iteration": 2.707235336303711 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046188, + "balance_loss_mlp": 1.00506115, + "epoch": 0.5852250865717584, + "flos": 652719042816.0, + "grad_norm": 0.03661913957485838, + "language_loss": 0.85946143, + "learning_rate": 0.00038726477851572043, + "loss": 0.86992323, + "num_input_tokens_seen": 253866384, + "router_z_loss_mlp": 0.41137695, + "step": 3042, + "time_per_iteration": 2.7779452800750732 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048593, + "balance_loss_mlp": 1.00753701, + "epoch": 0.5854174682570219, + "flos": 535620522240.0, + "grad_norm": 0.03519010087747146, + "language_loss": 0.80754662, + "learning_rate": 0.0003869612803210395, + "loss": 0.81803256, + "num_input_tokens_seen": 253935712, + "router_z_loss_mlp": 0.41064453, + "step": 3043, + "time_per_iteration": 2.64778733253479 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051479, + "balance_loss_mlp": 1.01044726, + "epoch": 0.5856098499422855, + "flos": 510759820800.0, + "grad_norm": 0.03494290194274924, + "language_loss": 0.83645654, + "learning_rate": 0.0003866578260118817, + "loss": 0.84697139, + "num_input_tokens_seen": 254003152, + "router_z_loss_mlp": 0.41040039, + "step": 3044, + "time_per_iteration": 2.596379041671753 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049295, + "balance_loss_mlp": 1.00828719, + "epoch": 0.5858022316275491, + "flos": 594993612288.0, + "grad_norm": 0.03849486234726574, + "language_loss": 0.83826196, + "learning_rate": 0.0003863544157060581, + "loss": 0.84875488, + "num_input_tokens_seen": 254072816, + "router_z_loss_mlp": 0.41015625, + "step": 3045, + "time_per_iteration": 2.6666998863220215 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049733, + "balance_loss_mlp": 1.0086298, + "epoch": 0.5859946133128127, + "flos": 560318885376.0, + "grad_norm": 0.02876341489298987, + "language_loss": 0.82639688, + "learning_rate": 0.0003860510495213634, + "loss": 0.83689421, + "num_input_tokens_seen": 254152800, + "router_z_loss_mlp": 0.41113281, + "step": 3046, + "time_per_iteration": 2.865504264831543 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049402, + "balance_loss_mlp": 1.00827503, + "epoch": 0.5861869949980761, + "flos": 554756700672.0, + "grad_norm": 0.0396946944562825, + "language_loss": 0.78689963, + "learning_rate": 0.0003857477275755746, + "loss": 0.79739368, + "num_input_tokens_seen": 254224384, + "router_z_loss_mlp": 0.41137695, + "step": 3047, + "time_per_iteration": 2.624819278717041 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049982, + "balance_loss_mlp": 1.00887823, + "epoch": 0.5863793766833397, + "flos": 720055331328.0, + "grad_norm": 0.02972376125592825, + "language_loss": 0.84339547, + "learning_rate": 0.00038544444998645167, + "loss": 0.85389531, + "num_input_tokens_seen": 254310960, + "router_z_loss_mlp": 0.41113281, + "step": 3048, + "time_per_iteration": 2.990790367126465 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048538, + "balance_loss_mlp": 1.00750625, + "epoch": 0.5865717583686033, + "flos": 473286504960.0, + "grad_norm": 0.034605288898392046, + "language_loss": 0.82032233, + "learning_rate": 0.00038514121687173767, + "loss": 0.83080769, + "num_input_tokens_seen": 254378336, + "router_z_loss_mlp": 0.41040039, + "step": 3049, + "time_per_iteration": 2.596529960632324 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049884, + "balance_loss_mlp": 1.0088284, + "epoch": 0.5867641400538669, + "flos": 814847754240.0, + "grad_norm": 0.03903750410866887, + "language_loss": 0.82380903, + "learning_rate": 0.00038483802834915807, + "loss": 0.83430791, + "num_input_tokens_seen": 254454352, + "router_z_loss_mlp": 0.41064453, + "step": 3050, + "time_per_iteration": 2.9996161460876465 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045889, + "balance_loss_mlp": 1.00480914, + "epoch": 0.5869565217391305, + "flos": 487518588672.0, + "grad_norm": 0.0350404565928551, + "language_loss": 0.79904723, + "learning_rate": 0.00038453488453642074, + "loss": 0.80950606, + "num_input_tokens_seen": 254526352, + "router_z_loss_mlp": 0.41088867, + "step": 3051, + "time_per_iteration": 2.7099759578704834 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047395, + "balance_loss_mlp": 1.00626779, + "epoch": 0.587148903424394, + "flos": 570512989440.0, + "grad_norm": 0.03324549798167153, + "language_loss": 0.8786602, + "learning_rate": 0.00038423178555121697, + "loss": 0.88913417, + "num_input_tokens_seen": 254598720, + "router_z_loss_mlp": 0.41137695, + "step": 3052, + "time_per_iteration": 2.684868097305298 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044815, + "balance_loss_mlp": 1.00359285, + "epoch": 0.5873412851096576, + "flos": 748695442944.0, + "grad_norm": 0.0344494509074348, + "language_loss": 0.86014688, + "learning_rate": 0.00038392873151121994, + "loss": 0.87059504, + "num_input_tokens_seen": 254683664, + "router_z_loss_mlp": 0.41235352, + "step": 3053, + "time_per_iteration": 3.073838949203491 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042019, + "balance_loss_mlp": 1.00079656, + "epoch": 0.5875336667949211, + "flos": 529188331776.0, + "grad_norm": 0.03507235034672983, + "language_loss": 0.83636832, + "learning_rate": 0.0003836257225340859, + "loss": 0.84678853, + "num_input_tokens_seen": 254754688, + "router_z_loss_mlp": 0.41235352, + "step": 3054, + "time_per_iteration": 2.6333680152893066 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104176, + "balance_loss_mlp": 1.000633, + "epoch": 0.5877260484801847, + "flos": 825641622528.0, + "grad_norm": 0.032727897026981576, + "language_loss": 0.82534069, + "learning_rate": 0.00038332275873745336, + "loss": 0.83575833, + "num_input_tokens_seen": 254838976, + "router_z_loss_mlp": 0.41137695, + "step": 3055, + "time_per_iteration": 3.051757335662842 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044126, + "balance_loss_mlp": 1.00292683, + "epoch": 0.5879184301654482, + "flos": 592694665728.0, + "grad_norm": 0.030899230424817493, + "language_loss": 0.83323562, + "learning_rate": 0.0003830198402389431, + "loss": 0.84367692, + "num_input_tokens_seen": 254912912, + "router_z_loss_mlp": 0.41210938, + "step": 3056, + "time_per_iteration": 2.6873278617858887 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043037, + "balance_loss_mlp": 1.00317383, + "epoch": 0.5881108118507118, + "flos": 1549226531328.0, + "grad_norm": 0.008859615514711313, + "language_loss": 0.77348936, + "learning_rate": 0.0003827169671561585, + "loss": 0.78391969, + "num_input_tokens_seen": 255151488, + "router_z_loss_mlp": 0.3984375, + "step": 3057, + "time_per_iteration": 5.044417142868042 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045675, + "balance_loss_mlp": 1.00461972, + "epoch": 0.5883031935359754, + "flos": 490599079680.0, + "grad_norm": 0.03687508634060279, + "language_loss": 0.83287209, + "learning_rate": 0.0003824141396066855, + "loss": 0.84332883, + "num_input_tokens_seen": 255218896, + "router_z_loss_mlp": 0.41064453, + "step": 3058, + "time_per_iteration": 2.57017183303833 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045783, + "balance_loss_mlp": 1.00458455, + "epoch": 0.588495575221239, + "flos": 583981025280.0, + "grad_norm": 0.03543871049956236, + "language_loss": 0.83470112, + "learning_rate": 0.000382111357708092, + "loss": 0.84515893, + "num_input_tokens_seen": 255287408, + "router_z_loss_mlp": 0.41210938, + "step": 3059, + "time_per_iteration": 2.710636615753174 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046053, + "balance_loss_mlp": 1.00492609, + "epoch": 0.5886879569065026, + "flos": 662240472576.0, + "grad_norm": 0.03467029745908185, + "language_loss": 0.84034348, + "learning_rate": 0.00038180862157792864, + "loss": 0.85080403, + "num_input_tokens_seen": 255358432, + "router_z_loss_mlp": 0.41137695, + "step": 3060, + "time_per_iteration": 2.765730619430542 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045299, + "balance_loss_mlp": 1.00429142, + "epoch": 0.588880338591766, + "flos": 563720162304.0, + "grad_norm": 0.034528332603885874, + "language_loss": 0.82661986, + "learning_rate": 0.0003815059313337279, + "loss": 0.83707285, + "num_input_tokens_seen": 255425744, + "router_z_loss_mlp": 0.41015625, + "step": 3061, + "time_per_iteration": 2.6512649059295654 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044327, + "balance_loss_mlp": 1.00339055, + "epoch": 0.5890727202770296, + "flos": 555853195008.0, + "grad_norm": 0.028645191608940447, + "language_loss": 0.78527474, + "learning_rate": 0.00038120328709300436, + "loss": 0.79571807, + "num_input_tokens_seen": 255505808, + "router_z_loss_mlp": 0.40942383, + "step": 3062, + "time_per_iteration": 2.839588165283203 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044224, + "balance_loss_mlp": 1.00321651, + "epoch": 0.5892651019622932, + "flos": 656702587392.0, + "grad_norm": 0.03868775593308096, + "language_loss": 0.83858323, + "learning_rate": 0.0003809006889732549, + "loss": 0.84902555, + "num_input_tokens_seen": 255580160, + "router_z_loss_mlp": 0.41015625, + "step": 3063, + "time_per_iteration": 2.80668306350708 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044985, + "balance_loss_mlp": 1.00395334, + "epoch": 0.5894574836475568, + "flos": 454132829952.0, + "grad_norm": 0.034675820144419535, + "language_loss": 0.8846643, + "learning_rate": 0.0003805981370919589, + "loss": 0.89511412, + "num_input_tokens_seen": 255644016, + "router_z_loss_mlp": 0.41040039, + "step": 3064, + "time_per_iteration": 2.4926044940948486 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046275, + "balance_loss_mlp": 1.00509965, + "epoch": 0.5896498653328203, + "flos": 520112109312.0, + "grad_norm": 0.03109338069781882, + "language_loss": 0.843858, + "learning_rate": 0.0003802956315665771, + "loss": 0.85432076, + "num_input_tokens_seen": 255718192, + "router_z_loss_mlp": 0.41186523, + "step": 3065, + "time_per_iteration": 2.6821701526641846 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046507, + "balance_loss_mlp": 1.00530875, + "epoch": 0.5898422470180839, + "flos": 550084930560.0, + "grad_norm": 0.039548358411626815, + "language_loss": 0.82298601, + "learning_rate": 0.0003799931725145529, + "loss": 0.83345109, + "num_input_tokens_seen": 255787696, + "router_z_loss_mlp": 0.41210938, + "step": 3066, + "time_per_iteration": 2.6161272525787354 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046312, + "balance_loss_mlp": 1.00532758, + "epoch": 0.5900346287033474, + "flos": 525380731392.0, + "grad_norm": 0.034195441532662435, + "language_loss": 0.86171907, + "learning_rate": 0.00037969076005331083, + "loss": 0.87218219, + "num_input_tokens_seen": 255862992, + "router_z_loss_mlp": 0.40991211, + "step": 3067, + "time_per_iteration": 2.769503116607666 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046067, + "balance_loss_mlp": 1.00515461, + "epoch": 0.590227010388611, + "flos": 568215988224.0, + "grad_norm": 0.03443045458348014, + "language_loss": 0.88715112, + "learning_rate": 0.00037938839430025817, + "loss": 0.8976118, + "num_input_tokens_seen": 255931872, + "router_z_loss_mlp": 0.40917969, + "step": 3068, + "time_per_iteration": 2.626838207244873 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046372, + "balance_loss_mlp": 1.00557816, + "epoch": 0.5904193920738746, + "flos": 584456368128.0, + "grad_norm": 0.03106221395948033, + "language_loss": 0.86157519, + "learning_rate": 0.0003790860753727835, + "loss": 0.8720389, + "num_input_tokens_seen": 256004656, + "router_z_loss_mlp": 0.40795898, + "step": 3069, + "time_per_iteration": 2.825906991958618 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044904, + "balance_loss_mlp": 1.0041821, + "epoch": 0.5906117737591381, + "flos": 530797107456.0, + "grad_norm": 0.033655572520404166, + "language_loss": 0.83318973, + "learning_rate": 0.00037878380338825766, + "loss": 0.84363884, + "num_input_tokens_seen": 256076944, + "router_z_loss_mlp": 0.40722656, + "step": 3070, + "time_per_iteration": 2.6605753898620605 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043416, + "balance_loss_mlp": 1.00264668, + "epoch": 0.5908041554444017, + "flos": 685516697856.0, + "grad_norm": 0.032255816781200916, + "language_loss": 0.81519401, + "learning_rate": 0.00037848157846403287, + "loss": 0.82562816, + "num_input_tokens_seen": 256154768, + "router_z_loss_mlp": 0.40771484, + "step": 3071, + "time_per_iteration": 2.8913676738739014 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047866, + "balance_loss_mlp": 1.00712073, + "epoch": 0.5909965371296653, + "flos": 551133792768.0, + "grad_norm": 0.033304308768315895, + "language_loss": 0.83666503, + "learning_rate": 0.0003781794007174435, + "loss": 0.84714377, + "num_input_tokens_seen": 256230896, + "router_z_loss_mlp": 0.4074707, + "step": 3072, + "time_per_iteration": 2.7170376777648926 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044559, + "balance_loss_mlp": 1.00498199, + "epoch": 0.5911889188149289, + "flos": 1495645038336.0, + "grad_norm": 0.0062576164066865435, + "language_loss": 0.74074531, + "learning_rate": 0.0003778772702658051, + "loss": 0.7511909, + "num_input_tokens_seen": 256462336, + "router_z_loss_mlp": 0.39550781, + "step": 3073, + "time_per_iteration": 4.848031282424927 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053183, + "balance_loss_mlp": 1.01248538, + "epoch": 0.5913813005001923, + "flos": 488886291456.0, + "grad_norm": 0.03164327687157731, + "language_loss": 0.81542623, + "learning_rate": 0.0003775751872264152, + "loss": 0.82595801, + "num_input_tokens_seen": 256539376, + "router_z_loss_mlp": 0.40698242, + "step": 3074, + "time_per_iteration": 2.7835612297058105 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047821, + "balance_loss_mlp": 1.00721872, + "epoch": 0.5915736821854559, + "flos": 574522778880.0, + "grad_norm": 0.03137518576611995, + "language_loss": 0.87806273, + "learning_rate": 0.0003772731517165527, + "loss": 0.88854092, + "num_input_tokens_seen": 256617728, + "router_z_loss_mlp": 0.40600586, + "step": 3075, + "time_per_iteration": 2.7984819412231445 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045123, + "balance_loss_mlp": 1.00451982, + "epoch": 0.5917660638707195, + "flos": 790861916160.0, + "grad_norm": 0.03467745447845496, + "language_loss": 0.83953345, + "learning_rate": 0.0003769711638534784, + "loss": 0.84998471, + "num_input_tokens_seen": 256696032, + "router_z_loss_mlp": 0.40600586, + "step": 3076, + "time_per_iteration": 2.9498283863067627 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045943, + "balance_loss_mlp": 1.0053643, + "epoch": 0.5919584455559831, + "flos": 529756993536.0, + "grad_norm": 0.038274807826461636, + "language_loss": 0.7910676, + "learning_rate": 0.00037666922375443446, + "loss": 0.80152702, + "num_input_tokens_seen": 256767360, + "router_z_loss_mlp": 0.40576172, + "step": 3077, + "time_per_iteration": 2.595907211303711 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043771, + "balance_loss_mlp": 1.00312054, + "epoch": 0.5921508272412467, + "flos": 561753662208.0, + "grad_norm": 0.037448898185008676, + "language_loss": 0.82402956, + "learning_rate": 0.00037636733153664396, + "loss": 0.83446729, + "num_input_tokens_seen": 256844848, + "router_z_loss_mlp": 0.40649414, + "step": 3078, + "time_per_iteration": 2.8082337379455566 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050449, + "balance_loss_mlp": 1.00984669, + "epoch": 0.5923432089265102, + "flos": 564334510848.0, + "grad_norm": 0.04535413457726027, + "language_loss": 0.80388999, + "learning_rate": 0.0003760654873173124, + "loss": 0.81439447, + "num_input_tokens_seen": 256916688, + "router_z_loss_mlp": 0.40600586, + "step": 3079, + "time_per_iteration": 2.6586430072784424 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048929, + "balance_loss_mlp": 1.00832665, + "epoch": 0.5925355906117737, + "flos": 496751313408.0, + "grad_norm": 0.032303837876808815, + "language_loss": 0.82224989, + "learning_rate": 0.00037576369121362566, + "loss": 0.83273923, + "num_input_tokens_seen": 256985520, + "router_z_loss_mlp": 0.40600586, + "step": 3080, + "time_per_iteration": 2.5874335765838623 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049072, + "balance_loss_mlp": 1.00846922, + "epoch": 0.5927279722970373, + "flos": 567493736448.0, + "grad_norm": 0.03169427730059961, + "language_loss": 0.82085633, + "learning_rate": 0.0003754619433427516, + "loss": 0.83134699, + "num_input_tokens_seen": 257067552, + "router_z_loss_mlp": 0.40600586, + "step": 3081, + "time_per_iteration": 2.9037671089172363 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044705, + "balance_loss_mlp": 1.00400662, + "epoch": 0.5929203539823009, + "flos": 668160381696.0, + "grad_norm": 0.04430970694991959, + "language_loss": 0.78507918, + "learning_rate": 0.0003751602438218392, + "loss": 0.79552627, + "num_input_tokens_seen": 257138896, + "router_z_loss_mlp": 0.40698242, + "step": 3082, + "time_per_iteration": 2.77486252784729 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042632, + "balance_loss_mlp": 1.00195801, + "epoch": 0.5931127356675644, + "flos": 556786384128.0, + "grad_norm": 0.03446517582568327, + "language_loss": 0.84122735, + "learning_rate": 0.0003748585927680186, + "loss": 0.8516537, + "num_input_tokens_seen": 257210592, + "router_z_loss_mlp": 0.40673828, + "step": 3083, + "time_per_iteration": 2.6401243209838867 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047685, + "balance_loss_mlp": 1.00698733, + "epoch": 0.593305117352828, + "flos": 536243619072.0, + "grad_norm": 0.03379156982252967, + "language_loss": 0.83284605, + "learning_rate": 0.00037455699029840086, + "loss": 0.84332293, + "num_input_tokens_seen": 257276208, + "router_z_loss_mlp": 0.40698242, + "step": 3084, + "time_per_iteration": 2.6359477043151855 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047934, + "balance_loss_mlp": 1.00723624, + "epoch": 0.5934974990380916, + "flos": 595058740992.0, + "grad_norm": 0.03375272766067447, + "language_loss": 0.84866869, + "learning_rate": 0.0003742554365300787, + "loss": 0.85914803, + "num_input_tokens_seen": 257351920, + "router_z_loss_mlp": 0.40698242, + "step": 3085, + "time_per_iteration": 2.7629523277282715 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047831, + "balance_loss_mlp": 1.00727594, + "epoch": 0.5936898807233552, + "flos": 714015858432.0, + "grad_norm": 0.08464198739198994, + "language_loss": 0.79301089, + "learning_rate": 0.0003739539315801255, + "loss": 0.80348921, + "num_input_tokens_seen": 257430016, + "router_z_loss_mlp": 0.40551758, + "step": 3086, + "time_per_iteration": 2.9152019023895264 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105055, + "balance_loss_mlp": 1.01004303, + "epoch": 0.5938822624086187, + "flos": 392749498368.0, + "grad_norm": 0.03659508144201786, + "language_loss": 0.92428821, + "learning_rate": 0.000373652475565596, + "loss": 0.93479371, + "num_input_tokens_seen": 257492224, + "router_z_loss_mlp": 0.4050293, + "step": 3087, + "time_per_iteration": 2.4702134132385254 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050448, + "balance_loss_mlp": 1.00982189, + "epoch": 0.5940746440938822, + "flos": 481336219392.0, + "grad_norm": 0.034289442552625136, + "language_loss": 0.81692433, + "learning_rate": 0.00037335106860352587, + "loss": 0.82742882, + "num_input_tokens_seen": 257567824, + "router_z_loss_mlp": 0.40625, + "step": 3088, + "time_per_iteration": 2.675694704055786 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043994, + "balance_loss_mlp": 1.00322449, + "epoch": 0.5942670257791458, + "flos": 484307840256.0, + "grad_norm": 0.03351872550432346, + "language_loss": 0.8348605, + "learning_rate": 0.00037304971081093146, + "loss": 0.84530044, + "num_input_tokens_seen": 257635488, + "router_z_loss_mlp": 0.40771484, + "step": 3089, + "time_per_iteration": 2.5974292755126953 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042488, + "balance_loss_mlp": 1.00181389, + "epoch": 0.5944594074644094, + "flos": 549058422528.0, + "grad_norm": 0.03144984032595776, + "language_loss": 0.81257939, + "learning_rate": 0.00037274840230481024, + "loss": 0.82300425, + "num_input_tokens_seen": 257709552, + "router_z_loss_mlp": 0.40673828, + "step": 3090, + "time_per_iteration": 2.7465951442718506 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104328, + "balance_loss_mlp": 1.00262976, + "epoch": 0.594651789149673, + "flos": 450129843456.0, + "grad_norm": 0.0354227551067568, + "language_loss": 0.79578584, + "learning_rate": 0.00037244714320214077, + "loss": 0.80621862, + "num_input_tokens_seen": 257775520, + "router_z_loss_mlp": 0.40649414, + "step": 3091, + "time_per_iteration": 2.532076597213745 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045549, + "balance_loss_mlp": 1.00489831, + "epoch": 0.5948441708349365, + "flos": 597466557696.0, + "grad_norm": 0.033875543124705955, + "language_loss": 0.83456963, + "learning_rate": 0.000372145933619882, + "loss": 0.84502512, + "num_input_tokens_seen": 257858560, + "router_z_loss_mlp": 0.40649414, + "step": 3092, + "time_per_iteration": 2.888296127319336 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045702, + "balance_loss_mlp": 1.00502765, + "epoch": 0.5950365525202, + "flos": 549581397504.0, + "grad_norm": 0.03918584024885415, + "language_loss": 0.83476591, + "learning_rate": 0.000371844773674974, + "loss": 0.84522295, + "num_input_tokens_seen": 257928048, + "router_z_loss_mlp": 0.40673828, + "step": 3093, + "time_per_iteration": 2.641191244125366 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042191, + "balance_loss_mlp": 1.00146902, + "epoch": 0.5952289342054636, + "flos": 655964784384.0, + "grad_norm": 0.03345437818943746, + "language_loss": 0.82307684, + "learning_rate": 0.0003715436634843375, + "loss": 0.83349872, + "num_input_tokens_seen": 258003088, + "router_z_loss_mlp": 0.40722656, + "step": 3094, + "time_per_iteration": 2.8391387462615967 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042413, + "balance_loss_mlp": 1.00185752, + "epoch": 0.5954213158907272, + "flos": 604604470272.0, + "grad_norm": 0.028714859262846556, + "language_loss": 0.8123939, + "learning_rate": 0.00037124260316487355, + "loss": 0.82281804, + "num_input_tokens_seen": 258084880, + "router_z_loss_mlp": 0.40551758, + "step": 3095, + "time_per_iteration": 2.8300905227661133 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048221, + "balance_loss_mlp": 1.00742722, + "epoch": 0.5956136975759908, + "flos": 487268767488.0, + "grad_norm": 0.03390156256560374, + "language_loss": 0.89901024, + "learning_rate": 0.0003709415928334643, + "loss": 0.90949249, + "num_input_tokens_seen": 258152032, + "router_z_loss_mlp": 0.40795898, + "step": 3096, + "time_per_iteration": 2.594320297241211 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104465, + "balance_loss_mlp": 1.00376081, + "epoch": 0.5958060792612543, + "flos": 660041647872.0, + "grad_norm": 0.036547009459556086, + "language_loss": 0.8143428, + "learning_rate": 0.00037064063260697233, + "loss": 0.82478929, + "num_input_tokens_seen": 258228896, + "router_z_loss_mlp": 0.40893555, + "step": 3097, + "time_per_iteration": 2.853452205657959 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044795, + "balance_loss_mlp": 1.00397766, + "epoch": 0.5959984609465179, + "flos": 724996364544.0, + "grad_norm": 0.03336502037481855, + "language_loss": 0.78911316, + "learning_rate": 0.0003703397226022407, + "loss": 0.79956114, + "num_input_tokens_seen": 258311152, + "router_z_loss_mlp": 0.40820312, + "step": 3098, + "time_per_iteration": 3.0299534797668457 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050957, + "balance_loss_mlp": 1.01147461, + "epoch": 0.5961908426317815, + "flos": 1523221703424.0, + "grad_norm": 0.010872658804754508, + "language_loss": 0.75499874, + "learning_rate": 0.00037003886293609335, + "loss": 0.76550829, + "num_input_tokens_seen": 258540656, + "router_z_loss_mlp": 0.39453125, + "step": 3099, + "time_per_iteration": 4.950707674026489 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044673, + "balance_loss_mlp": 1.00387943, + "epoch": 0.596383224317045, + "flos": 533647219200.0, + "grad_norm": 0.033784299285581076, + "language_loss": 0.84084308, + "learning_rate": 0.0003697380537253339, + "loss": 0.85128987, + "num_input_tokens_seen": 258608960, + "router_z_loss_mlp": 0.40795898, + "step": 3100, + "time_per_iteration": 2.6651411056518555 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044743, + "balance_loss_mlp": 1.00394928, + "epoch": 0.5965756060023086, + "flos": 592367076864.0, + "grad_norm": 0.032025449945388196, + "language_loss": 0.82004619, + "learning_rate": 0.0003694372950867471, + "loss": 0.83049357, + "num_input_tokens_seen": 258684304, + "router_z_loss_mlp": 0.40795898, + "step": 3101, + "time_per_iteration": 2.7825992107391357 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044062, + "balance_loss_mlp": 1.00341129, + "epoch": 0.5967679876875721, + "flos": 863470717440.0, + "grad_norm": 0.0338522286072748, + "language_loss": 0.78029126, + "learning_rate": 0.0003691365871370976, + "loss": 0.79073191, + "num_input_tokens_seen": 258769472, + "router_z_loss_mlp": 0.40649414, + "step": 3102, + "time_per_iteration": 3.0174319744110107 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044179, + "balance_loss_mlp": 1.00340927, + "epoch": 0.5969603693728357, + "flos": 554878209792.0, + "grad_norm": 0.03201933469342105, + "language_loss": 0.85875535, + "learning_rate": 0.00036883592999313093, + "loss": 0.86919713, + "num_input_tokens_seen": 258841696, + "router_z_loss_mlp": 0.40771484, + "step": 3103, + "time_per_iteration": 2.683260679244995 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043823, + "balance_loss_mlp": 1.00314891, + "epoch": 0.5971527510580993, + "flos": 719937712896.0, + "grad_norm": 0.039464615758245, + "language_loss": 0.79932439, + "learning_rate": 0.0003685353237715722, + "loss": 0.80976272, + "num_input_tokens_seen": 258915616, + "router_z_loss_mlp": 0.40673828, + "step": 3104, + "time_per_iteration": 2.8593432903289795 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043868, + "balance_loss_mlp": 1.00312221, + "epoch": 0.5973451327433629, + "flos": 648863810304.0, + "grad_norm": 0.031062495288944163, + "language_loss": 0.82383978, + "learning_rate": 0.0003682347685891274, + "loss": 0.83427846, + "num_input_tokens_seen": 258994080, + "router_z_loss_mlp": 0.4074707, + "step": 3105, + "time_per_iteration": 2.840812921524048 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045744, + "balance_loss_mlp": 1.00504565, + "epoch": 0.5975375144286263, + "flos": 723090135552.0, + "grad_norm": 0.03430317325592521, + "language_loss": 0.81334996, + "learning_rate": 0.0003679342645624822, + "loss": 0.82380736, + "num_input_tokens_seen": 259075968, + "router_z_loss_mlp": 0.40698242, + "step": 3106, + "time_per_iteration": 2.961186408996582 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045947, + "balance_loss_mlp": 1.00520086, + "epoch": 0.5977298961138899, + "flos": 752344595712.0, + "grad_norm": 0.03201923744385334, + "language_loss": 0.82261443, + "learning_rate": 0.0003676338118083025, + "loss": 0.83307385, + "num_input_tokens_seen": 259162512, + "router_z_loss_mlp": 0.4074707, + "step": 3107, + "time_per_iteration": 2.9809908866882324 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105353, + "balance_loss_mlp": 1.01264107, + "epoch": 0.5979222777991535, + "flos": 531999559680.0, + "grad_norm": 0.03643788911431517, + "language_loss": 0.79681456, + "learning_rate": 0.0003673334104432347, + "loss": 0.8073498, + "num_input_tokens_seen": 259228752, + "router_z_loss_mlp": 0.40893555, + "step": 3108, + "time_per_iteration": 2.5879976749420166 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052396, + "balance_loss_mlp": 1.01157844, + "epoch": 0.5981146594844171, + "flos": 622915362816.0, + "grad_norm": 0.031178647905512342, + "language_loss": 0.84073299, + "learning_rate": 0.0003670330605839048, + "loss": 0.85125697, + "num_input_tokens_seen": 259303440, + "router_z_loss_mlp": 0.40820312, + "step": 3109, + "time_per_iteration": 2.843069314956665 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049611, + "balance_loss_mlp": 1.00877023, + "epoch": 0.5983070411696807, + "flos": 604710428160.0, + "grad_norm": 0.03611015998230635, + "language_loss": 0.77344596, + "learning_rate": 0.0003667327623469191, + "loss": 0.7839421, + "num_input_tokens_seen": 259378752, + "router_z_loss_mlp": 0.40844727, + "step": 3110, + "time_per_iteration": 2.7326698303222656 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045229, + "balance_loss_mlp": 1.00438774, + "epoch": 0.5984994228549442, + "flos": 634670610432.0, + "grad_norm": 0.03877534508876671, + "language_loss": 0.78326917, + "learning_rate": 0.00036643251584886333, + "loss": 0.79372144, + "num_input_tokens_seen": 259454336, + "router_z_loss_mlp": 0.40844727, + "step": 3111, + "time_per_iteration": 2.784482717514038 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105463, + "balance_loss_mlp": 1.01369393, + "epoch": 0.5986918045402078, + "flos": 526294478592.0, + "grad_norm": 0.03280596002015671, + "language_loss": 0.82781613, + "learning_rate": 0.00036613232120630393, + "loss": 0.83836246, + "num_input_tokens_seen": 259518960, + "router_z_loss_mlp": 0.40942383, + "step": 3112, + "time_per_iteration": 2.5862860679626465 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105133, + "balance_loss_mlp": 1.0103699, + "epoch": 0.5988841862254713, + "flos": 484140644352.0, + "grad_norm": 0.03859230842611924, + "language_loss": 0.80514455, + "learning_rate": 0.00036583217853578643, + "loss": 0.81565785, + "num_input_tokens_seen": 259584352, + "router_z_loss_mlp": 0.40966797, + "step": 3113, + "time_per_iteration": 2.565713405609131 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048995, + "balance_loss_mlp": 1.00805807, + "epoch": 0.5990765679107349, + "flos": 1142123451648.0, + "grad_norm": 0.034390898471739054, + "language_loss": 0.77730286, + "learning_rate": 0.000365532087953837, + "loss": 0.78779286, + "num_input_tokens_seen": 259693152, + "router_z_loss_mlp": 0.40942383, + "step": 3114, + "time_per_iteration": 3.646124839782715 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049853, + "balance_loss_mlp": 1.00889242, + "epoch": 0.5992689495959984, + "flos": 518019242496.0, + "grad_norm": 0.033850887819700186, + "language_loss": 0.89597213, + "learning_rate": 0.00036523204957696065, + "loss": 0.90647066, + "num_input_tokens_seen": 259762048, + "router_z_loss_mlp": 0.40966797, + "step": 3115, + "time_per_iteration": 2.594458818435669 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050487, + "balance_loss_mlp": 1.00952673, + "epoch": 0.599461331281262, + "flos": 745942540800.0, + "grad_norm": 0.044244117222237124, + "language_loss": 0.81526911, + "learning_rate": 0.00036493206352164324, + "loss": 0.82577395, + "num_input_tokens_seen": 259843184, + "router_z_loss_mlp": 0.40966797, + "step": 3116, + "time_per_iteration": 2.9088714122772217 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046863, + "balance_loss_mlp": 1.0058552, + "epoch": 0.5996537129665256, + "flos": 593484958464.0, + "grad_norm": 0.034019953192927346, + "language_loss": 0.85863578, + "learning_rate": 0.000364632129904349, + "loss": 0.8691045, + "num_input_tokens_seen": 259912720, + "router_z_loss_mlp": 0.41015625, + "step": 3117, + "time_per_iteration": 2.7059812545776367 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055187, + "balance_loss_mlp": 1.01415479, + "epoch": 0.5998460946517892, + "flos": 560116696320.0, + "grad_norm": 0.0363455836603733, + "language_loss": 0.78243721, + "learning_rate": 0.00036433224884152283, + "loss": 0.79298902, + "num_input_tokens_seen": 259985472, + "router_z_loss_mlp": 0.41040039, + "step": 3118, + "time_per_iteration": 2.7368576526641846 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049846, + "balance_loss_mlp": 1.00879073, + "epoch": 0.6000384763370528, + "flos": 485536537344.0, + "grad_norm": 0.037553840644260136, + "language_loss": 0.78583586, + "learning_rate": 0.00036403242044958875, + "loss": 0.79633433, + "num_input_tokens_seen": 260050336, + "router_z_loss_mlp": 0.41064453, + "step": 3119, + "time_per_iteration": 2.5575714111328125 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105366, + "balance_loss_mlp": 1.01267588, + "epoch": 0.6002308580223162, + "flos": 597878717184.0, + "grad_norm": 0.03820222884564333, + "language_loss": 0.91700655, + "learning_rate": 0.0003637326448449507, + "loss": 0.9275431, + "num_input_tokens_seen": 260120304, + "router_z_loss_mlp": 0.40991211, + "step": 3120, + "time_per_iteration": 2.742879629135132 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044338, + "balance_loss_mlp": 1.00335419, + "epoch": 0.6004232397075798, + "flos": 546220949760.0, + "grad_norm": 0.03312076086842182, + "language_loss": 0.86720824, + "learning_rate": 0.00036343292214399177, + "loss": 0.87765157, + "num_input_tokens_seen": 260198304, + "router_z_loss_mlp": 0.40991211, + "step": 3121, + "time_per_iteration": 2.827937364578247 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048688, + "balance_loss_mlp": 1.00777555, + "epoch": 0.6006156213928434, + "flos": 631151715072.0, + "grad_norm": 0.0990751082853954, + "language_loss": 0.77571696, + "learning_rate": 0.00036313325246307456, + "loss": 0.78620386, + "num_input_tokens_seen": 260277664, + "router_z_loss_mlp": 0.40917969, + "step": 3122, + "time_per_iteration": 2.844771146774292 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044272, + "balance_loss_mlp": 1.00347829, + "epoch": 0.600808003078107, + "flos": 583405560576.0, + "grad_norm": 0.0330511855915857, + "language_loss": 0.87869143, + "learning_rate": 0.0003628336359185411, + "loss": 0.88913417, + "num_input_tokens_seen": 260350096, + "router_z_loss_mlp": 0.40795898, + "step": 3123, + "time_per_iteration": 2.728536367416382 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048661, + "balance_loss_mlp": 1.00810611, + "epoch": 0.6010003847633705, + "flos": 636439779072.0, + "grad_norm": 0.035612142743683524, + "language_loss": 0.75946915, + "learning_rate": 0.000362534072626713, + "loss": 0.76995575, + "num_input_tokens_seen": 260421888, + "router_z_loss_mlp": 0.40551758, + "step": 3124, + "time_per_iteration": 2.7660484313964844 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049761, + "balance_loss_mlp": 1.00915837, + "epoch": 0.6011927664486341, + "flos": 720031031808.0, + "grad_norm": 0.034873879848328126, + "language_loss": 0.81774855, + "learning_rate": 0.00036223456270389093, + "loss": 0.82824624, + "num_input_tokens_seen": 260499616, + "router_z_loss_mlp": 0.40600586, + "step": 3125, + "time_per_iteration": 2.943265676498413 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050049, + "balance_loss_mlp": 1.00939894, + "epoch": 0.6013851481338977, + "flos": 500055380736.0, + "grad_norm": 0.03349756434082021, + "language_loss": 0.81548929, + "learning_rate": 0.00036193510626635517, + "loss": 0.82598984, + "num_input_tokens_seen": 260572048, + "router_z_loss_mlp": 0.40649414, + "step": 3126, + "time_per_iteration": 2.7160630226135254 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049829, + "balance_loss_mlp": 1.00929773, + "epoch": 0.6015775298191612, + "flos": 750876771072.0, + "grad_norm": 0.03275922867012815, + "language_loss": 0.81968188, + "learning_rate": 0.0003616357034303649, + "loss": 0.83018017, + "num_input_tokens_seen": 260644720, + "router_z_loss_mlp": 0.40527344, + "step": 3127, + "time_per_iteration": 2.9286913871765137 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047883, + "balance_loss_mlp": 1.00725627, + "epoch": 0.6017699115044248, + "flos": 594264557568.0, + "grad_norm": 0.02908266373706377, + "language_loss": 0.79201299, + "learning_rate": 0.0003613363543121584, + "loss": 0.80249178, + "num_input_tokens_seen": 260724864, + "router_z_loss_mlp": 0.40625, + "step": 3128, + "time_per_iteration": 2.917598009109497 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046286, + "balance_loss_mlp": 1.00568318, + "epoch": 0.6019622931896883, + "flos": 516202441728.0, + "grad_norm": 0.031364349484999776, + "language_loss": 0.85277975, + "learning_rate": 0.00036103705902795357, + "loss": 0.86324257, + "num_input_tokens_seen": 260800896, + "router_z_loss_mlp": 0.40600586, + "step": 3129, + "time_per_iteration": 2.7694129943847656 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051152, + "balance_loss_mlp": 1.01047814, + "epoch": 0.6021546748749519, + "flos": 491473943040.0, + "grad_norm": 0.0392414269589035, + "language_loss": 0.80161059, + "learning_rate": 0.0003607378176939471, + "loss": 0.81212205, + "num_input_tokens_seen": 260872736, + "router_z_loss_mlp": 0.40673828, + "step": 3130, + "time_per_iteration": 2.622267961502075 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055595, + "balance_loss_mlp": 1.01494503, + "epoch": 0.6023470565602155, + "flos": 542115896064.0, + "grad_norm": 0.037876950900112984, + "language_loss": 0.82781708, + "learning_rate": 0.00036043863042631465, + "loss": 0.83837301, + "num_input_tokens_seen": 260943264, + "router_z_loss_mlp": 0.40649414, + "step": 3131, + "time_per_iteration": 2.7120039463043213 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052284, + "balance_loss_mlp": 1.01163399, + "epoch": 0.6025394382454791, + "flos": 846464344320.0, + "grad_norm": 0.039947813860245845, + "language_loss": 0.76966566, + "learning_rate": 0.00036013949734121133, + "loss": 0.78018856, + "num_input_tokens_seen": 261030064, + "router_z_loss_mlp": 0.40649414, + "step": 3132, + "time_per_iteration": 3.127255916595459 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050529, + "balance_loss_mlp": 1.00990224, + "epoch": 0.6027318199307425, + "flos": 578258447616.0, + "grad_norm": 0.03419044123662342, + "language_loss": 0.8313787, + "learning_rate": 0.00035984041855477043, + "loss": 0.84188402, + "num_input_tokens_seen": 261106496, + "router_z_loss_mlp": 0.40625, + "step": 3133, + "time_per_iteration": 2.7259347438812256 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051548, + "balance_loss_mlp": 1.01216125, + "epoch": 0.6029242016160061, + "flos": 1474255600128.0, + "grad_norm": 0.0070819988580959, + "language_loss": 0.78709894, + "learning_rate": 0.00035954139418310495, + "loss": 0.79761446, + "num_input_tokens_seen": 261343248, + "router_z_loss_mlp": 0.39355469, + "step": 3134, + "time_per_iteration": 4.934648513793945 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052223, + "balance_loss_mlp": 1.01171601, + "epoch": 0.6031165833012697, + "flos": 481783372032.0, + "grad_norm": 0.03833547758664617, + "language_loss": 0.80612588, + "learning_rate": 0.00035924242434230637, + "loss": 0.81664813, + "num_input_tokens_seen": 261416704, + "router_z_loss_mlp": 0.4050293, + "step": 3135, + "time_per_iteration": 2.691655397415161 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105043, + "balance_loss_mlp": 1.00985157, + "epoch": 0.6033089649865333, + "flos": 500465594880.0, + "grad_norm": 0.04302606138210952, + "language_loss": 0.79556847, + "learning_rate": 0.00035894350914844516, + "loss": 0.80607277, + "num_input_tokens_seen": 261486688, + "router_z_loss_mlp": 0.40576172, + "step": 3136, + "time_per_iteration": 2.6602935791015625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048879, + "balance_loss_mlp": 1.00827622, + "epoch": 0.6035013466717969, + "flos": 557724430848.0, + "grad_norm": 0.03619946216792389, + "language_loss": 0.83608747, + "learning_rate": 0.0003586446487175703, + "loss": 0.84657621, + "num_input_tokens_seen": 261557344, + "router_z_loss_mlp": 0.40600586, + "step": 3137, + "time_per_iteration": 2.7028918266296387 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050046, + "balance_loss_mlp": 1.00944352, + "epoch": 0.6036937283570604, + "flos": 595996787712.0, + "grad_norm": 0.03316873106558702, + "language_loss": 0.8565768, + "learning_rate": 0.0003583458431657099, + "loss": 0.86707723, + "num_input_tokens_seen": 261626240, + "router_z_loss_mlp": 0.40600586, + "step": 3138, + "time_per_iteration": 2.730760097503662 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051204, + "balance_loss_mlp": 1.01048255, + "epoch": 0.603886110042324, + "flos": 542059515648.0, + "grad_norm": 0.041412274215224906, + "language_loss": 0.83086127, + "learning_rate": 0.00035804709260887056, + "loss": 0.84137332, + "num_input_tokens_seen": 261696368, + "router_z_loss_mlp": 0.40722656, + "step": 3139, + "time_per_iteration": 2.6989586353302 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049269, + "balance_loss_mlp": 1.00852323, + "epoch": 0.6040784917275875, + "flos": 519656208384.0, + "grad_norm": 0.031983597535220364, + "language_loss": 0.89732921, + "learning_rate": 0.0003577483971630373, + "loss": 0.90782189, + "num_input_tokens_seen": 261769104, + "router_z_loss_mlp": 0.4074707, + "step": 3140, + "time_per_iteration": 2.697202205657959 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049635, + "balance_loss_mlp": 1.00888968, + "epoch": 0.6042708734128511, + "flos": 662014950912.0, + "grad_norm": 0.02881540865080385, + "language_loss": 0.85653752, + "learning_rate": 0.00035744975694417414, + "loss": 0.86703384, + "num_input_tokens_seen": 261844880, + "router_z_loss_mlp": 0.4074707, + "step": 3141, + "time_per_iteration": 2.853609085083008 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049159, + "balance_loss_mlp": 1.00838912, + "epoch": 0.6044632550981146, + "flos": 573517658112.0, + "grad_norm": 0.037282810981105224, + "language_loss": 0.83199489, + "learning_rate": 0.00035715117206822344, + "loss": 0.8424865, + "num_input_tokens_seen": 261923280, + "router_z_loss_mlp": 0.40771484, + "step": 3142, + "time_per_iteration": 2.778184175491333 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104897, + "balance_loss_mlp": 1.00812936, + "epoch": 0.6046556367833782, + "flos": 547729603584.0, + "grad_norm": 0.035085942615977306, + "language_loss": 0.81379992, + "learning_rate": 0.0003568526426511065, + "loss": 0.82428956, + "num_input_tokens_seen": 261990832, + "router_z_loss_mlp": 0.40844727, + "step": 3143, + "time_per_iteration": 2.626789093017578 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047012, + "balance_loss_mlp": 1.00612307, + "epoch": 0.6048480184686418, + "flos": 778175424768.0, + "grad_norm": 0.035762108913210126, + "language_loss": 0.83504343, + "learning_rate": 0.000356554168808722, + "loss": 0.84551358, + "num_input_tokens_seen": 262063760, + "router_z_loss_mlp": 0.40893555, + "step": 3144, + "time_per_iteration": 2.987703323364258 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043207, + "balance_loss_mlp": 1.00229406, + "epoch": 0.6050404001539054, + "flos": 658376491776.0, + "grad_norm": 0.03425886740508031, + "language_loss": 0.85222483, + "learning_rate": 0.00035625575065694837, + "loss": 0.86265695, + "num_input_tokens_seen": 262137968, + "router_z_loss_mlp": 0.40917969, + "step": 3145, + "time_per_iteration": 2.8534908294677734 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044606, + "balance_loss_mlp": 1.00359786, + "epoch": 0.605232781839169, + "flos": 550082985216.0, + "grad_norm": 0.03070859084954421, + "language_loss": 0.78136766, + "learning_rate": 0.0003559573883116415, + "loss": 0.79181373, + "num_input_tokens_seen": 262211264, + "router_z_loss_mlp": 0.41015625, + "step": 3146, + "time_per_iteration": 2.701352119445801 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044167, + "balance_loss_mlp": 1.00323021, + "epoch": 0.6054251635244324, + "flos": 606642902016.0, + "grad_norm": 0.029138241099590467, + "language_loss": 0.8591851, + "learning_rate": 0.00035565908188863604, + "loss": 0.8696267, + "num_input_tokens_seen": 262289648, + "router_z_loss_mlp": 0.40942383, + "step": 3147, + "time_per_iteration": 2.919374465942383 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047433, + "balance_loss_mlp": 1.00640118, + "epoch": 0.605617545209696, + "flos": 614809267968.0, + "grad_norm": 0.029609984696998014, + "language_loss": 0.8021152, + "learning_rate": 0.00035536083150374464, + "loss": 0.81258953, + "num_input_tokens_seen": 262362704, + "router_z_loss_mlp": 0.41040039, + "step": 3148, + "time_per_iteration": 2.7596092224121094 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053307, + "balance_loss_mlp": 1.01382446, + "epoch": 0.6058099268949596, + "flos": 1501610634240.0, + "grad_norm": 0.006207951084567088, + "language_loss": 0.74747956, + "learning_rate": 0.00035506263727275893, + "loss": 0.75801259, + "num_input_tokens_seen": 262596864, + "router_z_loss_mlp": 0.39453125, + "step": 3149, + "time_per_iteration": 4.876317739486694 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051295, + "balance_loss_mlp": 1.01014411, + "epoch": 0.6060023085802232, + "flos": 671705521920.0, + "grad_norm": 0.034498143829504634, + "language_loss": 0.86414444, + "learning_rate": 0.0003547644993114475, + "loss": 0.87465739, + "num_input_tokens_seen": 262671088, + "router_z_loss_mlp": 0.41162109, + "step": 3150, + "time_per_iteration": 2.845522403717041 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052323, + "balance_loss_mlp": 1.01110053, + "epoch": 0.6061946902654868, + "flos": 607306828032.0, + "grad_norm": 0.035670233665724194, + "language_loss": 0.80287176, + "learning_rate": 0.00035446641773555806, + "loss": 0.81339502, + "num_input_tokens_seen": 262743888, + "router_z_loss_mlp": 0.41235352, + "step": 3151, + "time_per_iteration": 2.7565760612487793 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053632, + "balance_loss_mlp": 1.01236176, + "epoch": 0.6063870719507503, + "flos": 558953127936.0, + "grad_norm": 0.031088575801094406, + "language_loss": 0.8789348, + "learning_rate": 0.000354168392660816, + "loss": 0.88947117, + "num_input_tokens_seen": 262819616, + "router_z_loss_mlp": 0.4128418, + "step": 3152, + "time_per_iteration": 2.747297525405884 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049898, + "balance_loss_mlp": 1.00865126, + "epoch": 0.6065794536360138, + "flos": 558282398976.0, + "grad_norm": 0.032072657791302916, + "language_loss": 0.83342856, + "learning_rate": 0.0003538704242029252, + "loss": 0.84392756, + "num_input_tokens_seen": 262893984, + "router_z_loss_mlp": 0.41259766, + "step": 3153, + "time_per_iteration": 2.7606263160705566 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050833, + "balance_loss_mlp": 1.0096823, + "epoch": 0.6067718353212774, + "flos": 691382171904.0, + "grad_norm": 0.035512545115511426, + "language_loss": 0.78534603, + "learning_rate": 0.0003535725124775672, + "loss": 0.79585433, + "num_input_tokens_seen": 262969648, + "router_z_loss_mlp": 0.41162109, + "step": 3154, + "time_per_iteration": 2.832683801651001 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046236, + "balance_loss_mlp": 1.00510859, + "epoch": 0.606964217006541, + "flos": 522903895296.0, + "grad_norm": 0.031701324925560485, + "language_loss": 0.87189692, + "learning_rate": 0.00035327465760040126, + "loss": 0.88235927, + "num_input_tokens_seen": 263042048, + "router_z_loss_mlp": 0.41137695, + "step": 3155, + "time_per_iteration": 2.6946585178375244 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047466, + "balance_loss_mlp": 1.00643396, + "epoch": 0.6071565986918045, + "flos": 642713521920.0, + "grad_norm": 0.0351469249432502, + "language_loss": 0.85231131, + "learning_rate": 0.00035297685968706526, + "loss": 0.86278605, + "num_input_tokens_seen": 263108032, + "router_z_loss_mlp": 0.41040039, + "step": 3156, + "time_per_iteration": 2.7586491107940674 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045389, + "balance_loss_mlp": 1.00416672, + "epoch": 0.6073489803770681, + "flos": 561653540352.0, + "grad_norm": 0.03543028352480344, + "language_loss": 0.83488154, + "learning_rate": 0.00035267911885317454, + "loss": 0.84533542, + "num_input_tokens_seen": 263175184, + "router_z_loss_mlp": 0.41235352, + "step": 3157, + "time_per_iteration": 2.678812026977539 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051077, + "balance_loss_mlp": 1.00997388, + "epoch": 0.6075413620623317, + "flos": 587202467328.0, + "grad_norm": 0.03110064511501168, + "language_loss": 0.81796658, + "learning_rate": 0.0003523814352143222, + "loss": 0.82847732, + "num_input_tokens_seen": 263252768, + "router_z_loss_mlp": 0.41113281, + "step": 3158, + "time_per_iteration": 2.8277432918548584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052557, + "balance_loss_mlp": 1.01128709, + "epoch": 0.6077337437475953, + "flos": 631972143360.0, + "grad_norm": 0.03468149601951464, + "language_loss": 0.9173736, + "learning_rate": 0.00035208380888607937, + "loss": 0.92789918, + "num_input_tokens_seen": 263328720, + "router_z_loss_mlp": 0.4128418, + "step": 3159, + "time_per_iteration": 2.787712574005127 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052471, + "balance_loss_mlp": 1.01289368, + "epoch": 0.6079261254328588, + "flos": 1471626152448.0, + "grad_norm": 0.014144477200468554, + "language_loss": 0.79461986, + "learning_rate": 0.000351786239983995, + "loss": 0.80514455, + "num_input_tokens_seen": 263554656, + "router_z_loss_mlp": 0.39550781, + "step": 3160, + "time_per_iteration": 4.879680871963501 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053925, + "balance_loss_mlp": 1.01444244, + "epoch": 0.6081185071181223, + "flos": 1526205963264.0, + "grad_norm": 0.006801374803666016, + "language_loss": 0.7569223, + "learning_rate": 0.00035148872862359517, + "loss": 0.76746154, + "num_input_tokens_seen": 263791600, + "router_z_loss_mlp": 0.39453125, + "step": 3161, + "time_per_iteration": 5.0031373500823975 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051684, + "balance_loss_mlp": 1.0106045, + "epoch": 0.6083108888033859, + "flos": 557435725824.0, + "grad_norm": 0.030142563258654227, + "language_loss": 0.82224369, + "learning_rate": 0.00035119127492038446, + "loss": 0.83276057, + "num_input_tokens_seen": 263869744, + "router_z_loss_mlp": 0.41088867, + "step": 3162, + "time_per_iteration": 2.80432391166687 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053395, + "balance_loss_mlp": 1.01229131, + "epoch": 0.6085032704886495, + "flos": 842556622080.0, + "grad_norm": 0.03512464115253957, + "language_loss": 0.83202064, + "learning_rate": 0.00035089387898984436, + "loss": 0.84255463, + "num_input_tokens_seen": 263946624, + "router_z_loss_mlp": 0.41113281, + "step": 3163, + "time_per_iteration": 3.1297876834869385 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049274, + "balance_loss_mlp": 1.008147, + "epoch": 0.6086956521739131, + "flos": 685993986048.0, + "grad_norm": 0.03637672327155598, + "language_loss": 0.82543135, + "learning_rate": 0.0003505965409474343, + "loss": 0.83592415, + "num_input_tokens_seen": 264022064, + "router_z_loss_mlp": 0.41137695, + "step": 3164, + "time_per_iteration": 2.9028842449188232 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044903, + "balance_loss_mlp": 1.00382376, + "epoch": 0.6088880338591766, + "flos": 536866715904.0, + "grad_norm": 0.035078655431856474, + "language_loss": 0.86721897, + "learning_rate": 0.0003502992609085913, + "loss": 0.87766796, + "num_input_tokens_seen": 264089520, + "router_z_loss_mlp": 0.41088867, + "step": 3165, + "time_per_iteration": 2.752734422683716 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045246, + "balance_loss_mlp": 1.0041908, + "epoch": 0.6090804155444401, + "flos": 732882773760.0, + "grad_norm": 0.030998406489771316, + "language_loss": 0.82771933, + "learning_rate": 0.00035000203898872954, + "loss": 0.83817178, + "num_input_tokens_seen": 264173056, + "router_z_loss_mlp": 0.41064453, + "step": 3166, + "time_per_iteration": 2.9903385639190674 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104524, + "balance_loss_mlp": 1.00420833, + "epoch": 0.6092727972297037, + "flos": 700243566336.0, + "grad_norm": 0.03412494871544842, + "language_loss": 0.85219544, + "learning_rate": 0.0003497048753032406, + "loss": 0.86264783, + "num_input_tokens_seen": 264250912, + "router_z_loss_mlp": 0.41040039, + "step": 3167, + "time_per_iteration": 2.8939006328582764 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052068, + "balance_loss_mlp": 1.01117909, + "epoch": 0.6094651789149673, + "flos": 1053677681664.0, + "grad_norm": 0.032839303584214885, + "language_loss": 0.81472063, + "learning_rate": 0.000349407769967494, + "loss": 0.82524133, + "num_input_tokens_seen": 264342800, + "router_z_loss_mlp": 0.40893555, + "step": 3168, + "time_per_iteration": 3.384226083755493 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044648, + "balance_loss_mlp": 1.0035919, + "epoch": 0.6096575606002309, + "flos": 504095305728.0, + "grad_norm": 0.03315731648792901, + "language_loss": 0.85102254, + "learning_rate": 0.0003491107230968361, + "loss": 0.86146903, + "num_input_tokens_seen": 264413664, + "router_z_loss_mlp": 0.41064453, + "step": 3169, + "time_per_iteration": 2.6621110439300537 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104943, + "balance_loss_mlp": 1.00837409, + "epoch": 0.6098499422854944, + "flos": 586864184832.0, + "grad_norm": 0.02773637180026576, + "language_loss": 0.82196522, + "learning_rate": 0.00034881373480659085, + "loss": 0.83245957, + "num_input_tokens_seen": 264494944, + "router_z_loss_mlp": 0.41064453, + "step": 3170, + "time_per_iteration": 2.8139965534210205 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048985, + "balance_loss_mlp": 1.00795305, + "epoch": 0.610042323970758, + "flos": 470160327168.0, + "grad_norm": 0.03906179499333773, + "language_loss": 0.78314018, + "learning_rate": 0.0003485168052120594, + "loss": 0.79363, + "num_input_tokens_seen": 264561664, + "router_z_loss_mlp": 0.41040039, + "step": 3171, + "time_per_iteration": 2.5483758449554443 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052409, + "balance_loss_mlp": 1.01142442, + "epoch": 0.6102347056560216, + "flos": 515199266304.0, + "grad_norm": 0.03618411847150492, + "language_loss": 0.80390579, + "learning_rate": 0.00034821993442851973, + "loss": 0.81442988, + "num_input_tokens_seen": 264626256, + "router_z_loss_mlp": 0.40991211, + "step": 3172, + "time_per_iteration": 2.590830087661743 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049707, + "balance_loss_mlp": 1.00884163, + "epoch": 0.6104270873412851, + "flos": 469964941056.0, + "grad_norm": 0.03897584044245514, + "language_loss": 0.82572639, + "learning_rate": 0.00034792312257122735, + "loss": 0.83622348, + "num_input_tokens_seen": 264692768, + "router_z_loss_mlp": 0.40869141, + "step": 3173, + "time_per_iteration": 2.594754457473755 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049236, + "balance_loss_mlp": 1.00834739, + "epoch": 0.6106194690265486, + "flos": 550940352000.0, + "grad_norm": 0.03632239406226319, + "language_loss": 0.81349075, + "learning_rate": 0.00034762636975541506, + "loss": 0.82398319, + "num_input_tokens_seen": 264764816, + "router_z_loss_mlp": 0.40893555, + "step": 3174, + "time_per_iteration": 2.6291897296905518 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046645, + "balance_loss_mlp": 1.00563669, + "epoch": 0.6108118507118122, + "flos": 473881411584.0, + "grad_norm": 0.03249903592127121, + "language_loss": 0.81528097, + "learning_rate": 0.0003473296760962923, + "loss": 0.82574743, + "num_input_tokens_seen": 264837968, + "router_z_loss_mlp": 0.41015625, + "step": 3175, + "time_per_iteration": 2.6912500858306885 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052322, + "balance_loss_mlp": 1.01264954, + "epoch": 0.6110042323970758, + "flos": 1448182731264.0, + "grad_norm": 0.007043978800011362, + "language_loss": 0.78533739, + "learning_rate": 0.00034703304170904617, + "loss": 0.79586065, + "num_input_tokens_seen": 265058336, + "router_z_loss_mlp": 0.39648438, + "step": 3176, + "time_per_iteration": 4.679258108139038 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104727, + "balance_loss_mlp": 1.00635707, + "epoch": 0.6111966140823394, + "flos": 795542434560.0, + "grad_norm": 0.03450548999539666, + "language_loss": 0.81482762, + "learning_rate": 0.00034673646670883976, + "loss": 0.82530034, + "num_input_tokens_seen": 265135920, + "router_z_loss_mlp": 0.40917969, + "step": 3177, + "time_per_iteration": 2.9776415824890137 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104406, + "balance_loss_mlp": 1.0043869, + "epoch": 0.611388995767603, + "flos": 1561066349568.0, + "grad_norm": 0.006895739494838764, + "language_loss": 0.75715023, + "learning_rate": 0.0003464399512108141, + "loss": 0.76759082, + "num_input_tokens_seen": 265374464, + "router_z_loss_mlp": 0.39648438, + "step": 3178, + "time_per_iteration": 4.9859678745269775 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046014, + "balance_loss_mlp": 1.00512564, + "epoch": 0.6115813774528664, + "flos": 713486080512.0, + "grad_norm": 0.037712756689321836, + "language_loss": 0.81948996, + "learning_rate": 0.0003461434953300865, + "loss": 0.82995009, + "num_input_tokens_seen": 265450112, + "router_z_loss_mlp": 0.40893555, + "step": 3179, + "time_per_iteration": 2.9206619262695312 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046107, + "balance_loss_mlp": 1.0051471, + "epoch": 0.61177375913813, + "flos": 685690696704.0, + "grad_norm": 0.02737860550975636, + "language_loss": 0.81828141, + "learning_rate": 0.0003458470991817515, + "loss": 0.8287425, + "num_input_tokens_seen": 265534336, + "router_z_loss_mlp": 0.40966797, + "step": 3180, + "time_per_iteration": 3.0038623809814453 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046359, + "balance_loss_mlp": 1.00537503, + "epoch": 0.6119661408233936, + "flos": 512667995136.0, + "grad_norm": 0.03551722244255775, + "language_loss": 0.85187316, + "learning_rate": 0.0003455507628808802, + "loss": 0.86233675, + "num_input_tokens_seen": 265604480, + "router_z_loss_mlp": 0.40991211, + "step": 3181, + "time_per_iteration": 2.623522996902466 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048614, + "balance_loss_mlp": 1.0076772, + "epoch": 0.6121585225086572, + "flos": 557856633600.0, + "grad_norm": 0.04043393522454786, + "language_loss": 0.85139406, + "learning_rate": 0.00034525448654252076, + "loss": 0.86188018, + "num_input_tokens_seen": 265670848, + "router_z_loss_mlp": 0.40942383, + "step": 3182, + "time_per_iteration": 2.701493501663208 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053271, + "balance_loss_mlp": 1.0125016, + "epoch": 0.6123509041939207, + "flos": 562910427648.0, + "grad_norm": 0.044342295152579134, + "language_loss": 0.83549857, + "learning_rate": 0.0003449582702816976, + "loss": 0.84603125, + "num_input_tokens_seen": 265739584, + "router_z_loss_mlp": 0.40771484, + "step": 3183, + "time_per_iteration": 2.6956191062927246 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050719, + "balance_loss_mlp": 1.00980616, + "epoch": 0.6125432858791843, + "flos": 559131017472.0, + "grad_norm": 0.0337797622344846, + "language_loss": 0.833462, + "learning_rate": 0.0003446621142134122, + "loss": 0.84396923, + "num_input_tokens_seen": 265810368, + "router_z_loss_mlp": 0.40917969, + "step": 3184, + "time_per_iteration": 2.639289379119873 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049865, + "balance_loss_mlp": 1.0089761, + "epoch": 0.6127356675644479, + "flos": 415897411584.0, + "grad_norm": 0.038637283425345254, + "language_loss": 0.84757721, + "learning_rate": 0.0003443660184526424, + "loss": 0.85807586, + "num_input_tokens_seen": 265871616, + "router_z_loss_mlp": 0.40893555, + "step": 3185, + "time_per_iteration": 2.4257092475891113 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049417, + "balance_loss_mlp": 1.00855243, + "epoch": 0.6129280492497114, + "flos": 605034126336.0, + "grad_norm": 0.03183522344564459, + "language_loss": 0.86949629, + "learning_rate": 0.0003440699831143429, + "loss": 0.87999046, + "num_input_tokens_seen": 265946672, + "router_z_loss_mlp": 0.40869141, + "step": 3186, + "time_per_iteration": 2.775930404663086 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051794, + "balance_loss_mlp": 1.01092947, + "epoch": 0.613120430934975, + "flos": 520865463552.0, + "grad_norm": 0.03426856833524134, + "language_loss": 0.82819283, + "learning_rate": 0.0003437740083134449, + "loss": 0.83871073, + "num_input_tokens_seen": 266020640, + "router_z_loss_mlp": 0.40869141, + "step": 3187, + "time_per_iteration": 2.696072816848755 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049174, + "balance_loss_mlp": 1.00835705, + "epoch": 0.6133128126202385, + "flos": 512081836800.0, + "grad_norm": 0.03992475023697304, + "language_loss": 0.84158587, + "learning_rate": 0.00034347809416485574, + "loss": 0.8520776, + "num_input_tokens_seen": 266085776, + "router_z_loss_mlp": 0.40820312, + "step": 3188, + "time_per_iteration": 2.6222550868988037 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052376, + "balance_loss_mlp": 1.01158273, + "epoch": 0.6135051943055021, + "flos": 608757156096.0, + "grad_norm": 0.032577275408737616, + "language_loss": 0.82338852, + "learning_rate": 0.0003431822407834597, + "loss": 0.83391231, + "num_input_tokens_seen": 266157104, + "router_z_loss_mlp": 0.40795898, + "step": 3189, + "time_per_iteration": 2.818133592605591 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050392, + "balance_loss_mlp": 1.00959849, + "epoch": 0.6136975759907657, + "flos": 1162010072064.0, + "grad_norm": 0.04434341362834108, + "language_loss": 0.84634304, + "learning_rate": 0.00034288644828411706, + "loss": 0.85684693, + "num_input_tokens_seen": 266244144, + "router_z_loss_mlp": 0.40795898, + "step": 3190, + "time_per_iteration": 3.4801251888275146 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049956, + "balance_loss_mlp": 1.00911534, + "epoch": 0.6138899576760293, + "flos": 708173716992.0, + "grad_norm": 0.03680261410998276, + "language_loss": 0.76343262, + "learning_rate": 0.0003425907167816649, + "loss": 0.77393216, + "num_input_tokens_seen": 266319040, + "router_z_loss_mlp": 0.40844727, + "step": 3191, + "time_per_iteration": 2.859435558319092 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049983, + "balance_loss_mlp": 1.00914156, + "epoch": 0.6140823393612928, + "flos": 587619484416.0, + "grad_norm": 0.036153352426406216, + "language_loss": 0.85233247, + "learning_rate": 0.00034229504639091623, + "loss": 0.86283231, + "num_input_tokens_seen": 266390784, + "router_z_loss_mlp": 0.40844727, + "step": 3192, + "time_per_iteration": 2.7828218936920166 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047312, + "balance_loss_mlp": 1.00656581, + "epoch": 0.6142747210465563, + "flos": 805619887104.0, + "grad_norm": 0.035035162625632645, + "language_loss": 0.80565524, + "learning_rate": 0.0003419994372266606, + "loss": 0.81612837, + "num_input_tokens_seen": 266483216, + "router_z_loss_mlp": 0.4074707, + "step": 3193, + "time_per_iteration": 3.1529080867767334 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046771, + "balance_loss_mlp": 1.00593019, + "epoch": 0.6144671027318199, + "flos": 530545340928.0, + "grad_norm": 0.02881776150326524, + "language_loss": 0.82229221, + "learning_rate": 0.00034170388940366335, + "loss": 0.83275998, + "num_input_tokens_seen": 266557344, + "router_z_loss_mlp": 0.40844727, + "step": 3194, + "time_per_iteration": 2.733793258666992 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046878, + "balance_loss_mlp": 1.00598967, + "epoch": 0.6146594844170835, + "flos": 806913712896.0, + "grad_norm": 0.03443984664399312, + "language_loss": 0.8074832, + "learning_rate": 0.0003414084030366667, + "loss": 0.81795198, + "num_input_tokens_seen": 266639488, + "router_z_loss_mlp": 0.40893555, + "step": 3195, + "time_per_iteration": 3.1194753646850586 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049907, + "balance_loss_mlp": 1.00906587, + "epoch": 0.6148518661023471, + "flos": 502762596096.0, + "grad_norm": 0.03247725998101352, + "language_loss": 0.83429492, + "learning_rate": 0.0003411129782403883, + "loss": 0.84479403, + "num_input_tokens_seen": 266711168, + "router_z_loss_mlp": 0.40844727, + "step": 3196, + "time_per_iteration": 2.701995849609375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048743, + "balance_loss_mlp": 1.00785387, + "epoch": 0.6150442477876106, + "flos": 511699812864.0, + "grad_norm": 0.05177418573029483, + "language_loss": 0.85667449, + "learning_rate": 0.0003408176151295225, + "loss": 0.86716187, + "num_input_tokens_seen": 266777632, + "router_z_loss_mlp": 0.40893555, + "step": 3197, + "time_per_iteration": 2.6645357608795166 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046183, + "balance_loss_mlp": 1.0052464, + "epoch": 0.6152366294728742, + "flos": 527998518528.0, + "grad_norm": 0.03939493376677649, + "language_loss": 0.7823236, + "learning_rate": 0.00034052231381873944, + "loss": 0.79278541, + "num_input_tokens_seen": 266842880, + "router_z_loss_mlp": 0.40942383, + "step": 3198, + "time_per_iteration": 2.6415092945098877 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049656, + "balance_loss_mlp": 1.00881481, + "epoch": 0.6154290111581378, + "flos": 474282877440.0, + "grad_norm": 0.04031967856737408, + "language_loss": 0.85886127, + "learning_rate": 0.00034022707442268494, + "loss": 0.86935782, + "num_input_tokens_seen": 266909504, + "router_z_loss_mlp": 0.40844727, + "step": 3199, + "time_per_iteration": 2.5885183811187744 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050537, + "balance_loss_mlp": 1.00976777, + "epoch": 0.6156213928434013, + "flos": 551934779136.0, + "grad_norm": 0.028515598642512706, + "language_loss": 0.82251477, + "learning_rate": 0.0003399318970559813, + "loss": 0.83302015, + "num_input_tokens_seen": 266988880, + "router_z_loss_mlp": 0.40771484, + "step": 3200, + "time_per_iteration": 2.819209337234497 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050709, + "balance_loss_mlp": 1.00998724, + "epoch": 0.6158137745286649, + "flos": 752362092288.0, + "grad_norm": 0.030934752464501728, + "language_loss": 0.84934688, + "learning_rate": 0.00033963678183322656, + "loss": 0.85985398, + "num_input_tokens_seen": 267074512, + "router_z_loss_mlp": 0.40722656, + "step": 3201, + "time_per_iteration": 3.0306894779205322 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051067, + "balance_loss_mlp": 1.01027346, + "epoch": 0.6160061562139284, + "flos": 556905947904.0, + "grad_norm": 0.03121820045207164, + "language_loss": 0.83180207, + "learning_rate": 0.0003393417288689945, + "loss": 0.84231275, + "num_input_tokens_seen": 267147952, + "router_z_loss_mlp": 0.40795898, + "step": 3202, + "time_per_iteration": 2.748361587524414 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050686, + "balance_loss_mlp": 1.00989294, + "epoch": 0.616198537899192, + "flos": 743467650048.0, + "grad_norm": 0.04116101332214976, + "language_loss": 0.76590461, + "learning_rate": 0.00033904673827783504, + "loss": 0.77641141, + "num_input_tokens_seen": 267224368, + "router_z_loss_mlp": 0.40795898, + "step": 3203, + "time_per_iteration": 2.9209775924682617 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052852, + "balance_loss_mlp": 1.01193893, + "epoch": 0.6163909195844556, + "flos": 479775075840.0, + "grad_norm": 0.031654400686770015, + "language_loss": 0.82428539, + "learning_rate": 0.00033875181017427357, + "loss": 0.83481383, + "num_input_tokens_seen": 267292688, + "router_z_loss_mlp": 0.40917969, + "step": 3204, + "time_per_iteration": 2.6138155460357666 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104652, + "balance_loss_mlp": 1.00551248, + "epoch": 0.6165833012697192, + "flos": 532666397952.0, + "grad_norm": 0.03324868864618939, + "language_loss": 0.81742775, + "learning_rate": 0.00033845694467281133, + "loss": 0.82789296, + "num_input_tokens_seen": 267371888, + "router_z_loss_mlp": 0.41015625, + "step": 3205, + "time_per_iteration": 2.8665361404418945 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045816, + "balance_loss_mlp": 1.0049988, + "epoch": 0.6167756829549826, + "flos": 809295284736.0, + "grad_norm": 0.03418345099687322, + "language_loss": 0.83676243, + "learning_rate": 0.00033816214188792516, + "loss": 0.8472206, + "num_input_tokens_seen": 267458784, + "router_z_loss_mlp": 0.40820312, + "step": 3206, + "time_per_iteration": 3.176194190979004 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104581, + "balance_loss_mlp": 1.00504088, + "epoch": 0.6169680646402462, + "flos": 489910854144.0, + "grad_norm": 0.03420383958613512, + "language_loss": 0.8597641, + "learning_rate": 0.00033786740193406784, + "loss": 0.87022221, + "num_input_tokens_seen": 267528528, + "router_z_loss_mlp": 0.40771484, + "step": 3207, + "time_per_iteration": 2.60602068901062 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046077, + "balance_loss_mlp": 1.00528312, + "epoch": 0.6171604463255098, + "flos": 620204256768.0, + "grad_norm": 0.033645733240054064, + "language_loss": 0.81914175, + "learning_rate": 0.00033757272492566736, + "loss": 0.82960248, + "num_input_tokens_seen": 267611152, + "router_z_loss_mlp": 0.40795898, + "step": 3208, + "time_per_iteration": 2.929311990737915 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045887, + "balance_loss_mlp": 1.00502181, + "epoch": 0.6173528280107734, + "flos": 529895999232.0, + "grad_norm": 0.030436054236508022, + "language_loss": 0.87530887, + "learning_rate": 0.0003372781109771278, + "loss": 0.8857677, + "num_input_tokens_seen": 267681520, + "router_z_loss_mlp": 0.40869141, + "step": 3209, + "time_per_iteration": 2.725886821746826 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044721, + "balance_loss_mlp": 1.00390351, + "epoch": 0.617545209696037, + "flos": 597737766144.0, + "grad_norm": 0.031193081131094685, + "language_loss": 0.77093422, + "learning_rate": 0.0003369835602028281, + "loss": 0.78138143, + "num_input_tokens_seen": 267758768, + "router_z_loss_mlp": 0.40820312, + "step": 3210, + "time_per_iteration": 2.7928357124328613 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042714, + "balance_loss_mlp": 1.00196826, + "epoch": 0.6177375913813005, + "flos": 476106481152.0, + "grad_norm": 0.036241731553070825, + "language_loss": 0.80260098, + "learning_rate": 0.0003366890727171232, + "loss": 0.81302822, + "num_input_tokens_seen": 267831056, + "router_z_loss_mlp": 0.4074707, + "step": 3211, + "time_per_iteration": 2.688157558441162 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046468, + "balance_loss_mlp": 1.00565052, + "epoch": 0.617929973066564, + "flos": 530881678080.0, + "grad_norm": 0.03703049785450956, + "language_loss": 0.7920953, + "learning_rate": 0.00033639464863434313, + "loss": 0.80255997, + "num_input_tokens_seen": 267898416, + "router_z_loss_mlp": 0.40820312, + "step": 3212, + "time_per_iteration": 2.6376640796661377 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105003, + "balance_loss_mlp": 1.01045227, + "epoch": 0.6181223547518276, + "flos": 1422835026432.0, + "grad_norm": 0.010124003783497993, + "language_loss": 0.78442466, + "learning_rate": 0.00033610028806879363, + "loss": 0.79492497, + "num_input_tokens_seen": 268112864, + "router_z_loss_mlp": 0.39550781, + "step": 3213, + "time_per_iteration": 4.704723596572876 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047001, + "balance_loss_mlp": 1.00618351, + "epoch": 0.6183147364370912, + "flos": 741696536064.0, + "grad_norm": 0.03266398965494079, + "language_loss": 0.79975474, + "learning_rate": 0.00033580599113475543, + "loss": 0.81022477, + "num_input_tokens_seen": 268198368, + "router_z_loss_mlp": 0.40820312, + "step": 3214, + "time_per_iteration": 2.9861807823181152 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049402, + "balance_loss_mlp": 1.00875139, + "epoch": 0.6185071181223547, + "flos": 382483462656.0, + "grad_norm": 0.034946308334165094, + "language_loss": 0.86866862, + "learning_rate": 0.00033551175794648507, + "loss": 0.87916261, + "num_input_tokens_seen": 268260704, + "router_z_loss_mlp": 0.40649414, + "step": 3215, + "time_per_iteration": 2.462238311767578 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050777, + "balance_loss_mlp": 1.01005554, + "epoch": 0.6186994998076183, + "flos": 464305546752.0, + "grad_norm": 0.05487149837237803, + "language_loss": 0.82309055, + "learning_rate": 0.00033521758861821365, + "loss": 0.83359838, + "num_input_tokens_seen": 268328256, + "router_z_loss_mlp": 0.40722656, + "step": 3216, + "time_per_iteration": 2.6265599727630615 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048602, + "balance_loss_mlp": 1.00778484, + "epoch": 0.6188918814928819, + "flos": 486252953088.0, + "grad_norm": 0.035787768578127474, + "language_loss": 0.89356089, + "learning_rate": 0.0003349234832641479, + "loss": 0.90404689, + "num_input_tokens_seen": 268394016, + "router_z_loss_mlp": 0.40820312, + "step": 3217, + "time_per_iteration": 2.600252628326416 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105038, + "balance_loss_mlp": 1.00956285, + "epoch": 0.6190842631781455, + "flos": 658598122752.0, + "grad_norm": 0.04394177664040498, + "language_loss": 0.81214905, + "learning_rate": 0.00033462944199846975, + "loss": 0.82265282, + "num_input_tokens_seen": 268478512, + "router_z_loss_mlp": 0.40820312, + "step": 3218, + "time_per_iteration": 3.059032917022705 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048303, + "balance_loss_mlp": 1.00748599, + "epoch": 0.619276644863409, + "flos": 404467807488.0, + "grad_norm": 0.03662586595942604, + "language_loss": 0.87058449, + "learning_rate": 0.00033433546493533606, + "loss": 0.88106751, + "num_input_tokens_seen": 268540304, + "router_z_loss_mlp": 0.40820312, + "step": 3219, + "time_per_iteration": 2.464569091796875 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049492, + "balance_loss_mlp": 1.00876999, + "epoch": 0.6194690265486725, + "flos": 584241540096.0, + "grad_norm": 0.03704236392673744, + "language_loss": 0.8459326, + "learning_rate": 0.00033404155218887897, + "loss": 0.85642755, + "num_input_tokens_seen": 268611136, + "router_z_loss_mlp": 0.40722656, + "step": 3220, + "time_per_iteration": 2.717883825302124 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048251, + "balance_loss_mlp": 1.00745773, + "epoch": 0.6196614082339361, + "flos": 505385240832.0, + "grad_norm": 0.03422152158197648, + "language_loss": 0.87844843, + "learning_rate": 0.00033374770387320534, + "loss": 0.88893092, + "num_input_tokens_seen": 268684992, + "router_z_loss_mlp": 0.40795898, + "step": 3221, + "time_per_iteration": 2.7630932331085205 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050081, + "balance_loss_mlp": 1.00921607, + "epoch": 0.6198537899191997, + "flos": 576526217472.0, + "grad_norm": 0.03373583765668511, + "language_loss": 0.85412097, + "learning_rate": 0.00033345392010239737, + "loss": 0.86462182, + "num_input_tokens_seen": 268758096, + "router_z_loss_mlp": 0.40869141, + "step": 3222, + "time_per_iteration": 2.7410025596618652 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050416, + "balance_loss_mlp": 1.00952721, + "epoch": 0.6200461716044633, + "flos": 594303441408.0, + "grad_norm": 0.03547804945622036, + "language_loss": 0.82924426, + "learning_rate": 0.0003331602009905118, + "loss": 0.83974844, + "num_input_tokens_seen": 268834432, + "router_z_loss_mlp": 0.40893555, + "step": 3223, + "time_per_iteration": 2.8037710189819336 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051712, + "balance_loss_mlp": 1.01098979, + "epoch": 0.6202385532897268, + "flos": 667411885056.0, + "grad_norm": 0.03269956620721502, + "language_loss": 0.84572297, + "learning_rate": 0.00033286654665158085, + "loss": 0.85624015, + "num_input_tokens_seen": 268921168, + "router_z_loss_mlp": 0.40722656, + "step": 3224, + "time_per_iteration": 2.948554754257202 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050192, + "balance_loss_mlp": 1.00939882, + "epoch": 0.6204309349749904, + "flos": 485927309568.0, + "grad_norm": 0.03423910891288116, + "language_loss": 0.88386071, + "learning_rate": 0.0003325729571996109, + "loss": 0.89436263, + "num_input_tokens_seen": 268991440, + "router_z_loss_mlp": 0.40795898, + "step": 3225, + "time_per_iteration": 2.6549041271209717 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049912, + "balance_loss_mlp": 1.00914264, + "epoch": 0.6206233166602539, + "flos": 585218470656.0, + "grad_norm": 0.03260898019544377, + "language_loss": 0.84271944, + "learning_rate": 0.000332279432748584, + "loss": 0.85321862, + "num_input_tokens_seen": 269061024, + "router_z_loss_mlp": 0.40771484, + "step": 3226, + "time_per_iteration": 2.716174840927124 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048998, + "balance_loss_mlp": 1.00827563, + "epoch": 0.6208156983455175, + "flos": 477912588288.0, + "grad_norm": 0.031713525688758036, + "language_loss": 0.87778246, + "learning_rate": 0.00033198597341245576, + "loss": 0.88827246, + "num_input_tokens_seen": 269130560, + "router_z_loss_mlp": 0.40722656, + "step": 3227, + "time_per_iteration": 2.596343994140625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046541, + "balance_loss_mlp": 1.00591445, + "epoch": 0.6210080800307811, + "flos": 790469198592.0, + "grad_norm": 0.02931098854288103, + "language_loss": 0.82211602, + "learning_rate": 0.00033169257930515763, + "loss": 0.8325814, + "num_input_tokens_seen": 269213280, + "router_z_loss_mlp": 0.40625, + "step": 3228, + "time_per_iteration": 3.0495920181274414 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050916, + "balance_loss_mlp": 1.01036096, + "epoch": 0.6212004617160446, + "flos": 608917549056.0, + "grad_norm": 0.05193251609129224, + "language_loss": 0.83099496, + "learning_rate": 0.0003313992505405951, + "loss": 0.8415041, + "num_input_tokens_seen": 269286384, + "router_z_loss_mlp": 0.40551758, + "step": 3229, + "time_per_iteration": 2.7221577167510986 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049521, + "balance_loss_mlp": 1.00896585, + "epoch": 0.6213928434013082, + "flos": 587612681472.0, + "grad_norm": 0.04085502918766405, + "language_loss": 0.81571418, + "learning_rate": 0.0003311059872326487, + "loss": 0.82620943, + "num_input_tokens_seen": 269353296, + "router_z_loss_mlp": 0.40551758, + "step": 3230, + "time_per_iteration": 2.6938486099243164 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051014, + "balance_loss_mlp": 1.0103395, + "epoch": 0.6215852250865718, + "flos": 537109734144.0, + "grad_norm": 0.03319484231219387, + "language_loss": 0.79486078, + "learning_rate": 0.0003308127894951734, + "loss": 0.80537093, + "num_input_tokens_seen": 269422304, + "router_z_loss_mlp": 0.40673828, + "step": 3231, + "time_per_iteration": 2.6565897464752197 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047093, + "balance_loss_mlp": 1.00634694, + "epoch": 0.6217776067718354, + "flos": 619313842176.0, + "grad_norm": 0.044149605083951216, + "language_loss": 0.8665247, + "learning_rate": 0.00033051965744199834, + "loss": 0.87699568, + "num_input_tokens_seen": 269498784, + "router_z_loss_mlp": 0.4074707, + "step": 3232, + "time_per_iteration": 2.7405452728271484 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104641, + "balance_loss_mlp": 1.00575984, + "epoch": 0.6219699884570988, + "flos": 547100670720.0, + "grad_norm": 0.03240939524045973, + "language_loss": 0.90891719, + "learning_rate": 0.0003302265911869276, + "loss": 0.91938138, + "num_input_tokens_seen": 269581264, + "router_z_loss_mlp": 0.40649414, + "step": 3233, + "time_per_iteration": 2.9264018535614014 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048831, + "balance_loss_mlp": 1.00827634, + "epoch": 0.6221623701423624, + "flos": 482156647680.0, + "grad_norm": 0.04042837420673253, + "language_loss": 0.8472892, + "learning_rate": 0.0003299335908437397, + "loss": 0.85777748, + "num_input_tokens_seen": 269649408, + "router_z_loss_mlp": 0.40551758, + "step": 3234, + "time_per_iteration": 2.6122491359710693 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104817, + "balance_loss_mlp": 1.00751972, + "epoch": 0.622354751827626, + "flos": 380872741632.0, + "grad_norm": 0.045523891323386655, + "language_loss": 0.80743796, + "learning_rate": 0.0003296406565261873, + "loss": 0.81791961, + "num_input_tokens_seen": 269711648, + "router_z_loss_mlp": 0.40649414, + "step": 3235, + "time_per_iteration": 2.4912121295928955 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052853, + "balance_loss_mlp": 1.01241732, + "epoch": 0.6225471335128896, + "flos": 669072183552.0, + "grad_norm": 0.032252040846456206, + "language_loss": 0.85526693, + "learning_rate": 0.0003293477883479978, + "loss": 0.86579549, + "num_input_tokens_seen": 269787376, + "router_z_loss_mlp": 0.40429688, + "step": 3236, + "time_per_iteration": 2.8378734588623047 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049915, + "balance_loss_mlp": 1.00943148, + "epoch": 0.6227395151981532, + "flos": 772628791296.0, + "grad_norm": 0.03861340277154514, + "language_loss": 0.80045772, + "learning_rate": 0.0003290549864228727, + "loss": 0.81095684, + "num_input_tokens_seen": 269863008, + "router_z_loss_mlp": 0.40478516, + "step": 3237, + "time_per_iteration": 2.9996402263641357 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044656, + "balance_loss_mlp": 1.00410068, + "epoch": 0.6229318968834167, + "flos": 485358647808.0, + "grad_norm": 0.03163121059903129, + "language_loss": 0.87001842, + "learning_rate": 0.0003287622508644875, + "loss": 0.88046503, + "num_input_tokens_seen": 269939552, + "router_z_loss_mlp": 0.40551758, + "step": 3238, + "time_per_iteration": 2.8210766315460205 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051288, + "balance_loss_mlp": 1.01082802, + "epoch": 0.6231242785686802, + "flos": 463877836032.0, + "grad_norm": 0.03974001893419822, + "language_loss": 0.87119055, + "learning_rate": 0.0003284695817864923, + "loss": 0.88170344, + "num_input_tokens_seen": 270002752, + "router_z_loss_mlp": 0.40454102, + "step": 3239, + "time_per_iteration": 2.4931445121765137 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048406, + "balance_loss_mlp": 1.00773168, + "epoch": 0.6233166602539438, + "flos": 610211374848.0, + "grad_norm": 0.03997150810707431, + "language_loss": 0.84201944, + "learning_rate": 0.0003281769793025116, + "loss": 0.85250354, + "num_input_tokens_seen": 270075696, + "router_z_loss_mlp": 0.40673828, + "step": 3240, + "time_per_iteration": 2.71476674079895 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050596, + "balance_loss_mlp": 1.00999331, + "epoch": 0.6235090419392074, + "flos": 440115574272.0, + "grad_norm": 0.053967997241239116, + "language_loss": 0.9023276, + "learning_rate": 0.00032788444352614346, + "loss": 0.91283357, + "num_input_tokens_seen": 270139872, + "router_z_loss_mlp": 0.40600586, + "step": 3241, + "time_per_iteration": 2.5143325328826904 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048966, + "balance_loss_mlp": 1.00826836, + "epoch": 0.6237014236244709, + "flos": 505901412864.0, + "grad_norm": 0.03953535493242474, + "language_loss": 0.81586522, + "learning_rate": 0.0003275919745709606, + "loss": 0.82635486, + "num_input_tokens_seen": 270206752, + "router_z_loss_mlp": 0.40698242, + "step": 3242, + "time_per_iteration": 2.6041946411132812 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052265, + "balance_loss_mlp": 1.01171017, + "epoch": 0.6238938053097345, + "flos": 513996814080.0, + "grad_norm": 0.03348358487194809, + "language_loss": 0.82661837, + "learning_rate": 0.00032729957255050936, + "loss": 0.83714104, + "num_input_tokens_seen": 270275472, + "router_z_loss_mlp": 0.40551758, + "step": 3243, + "time_per_iteration": 2.6362357139587402 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053144, + "balance_loss_mlp": 1.01263702, + "epoch": 0.6240861869949981, + "flos": 738023083776.0, + "grad_norm": 0.04011709848771047, + "language_loss": 0.82433391, + "learning_rate": 0.0003270072375783102, + "loss": 0.83486533, + "num_input_tokens_seen": 270348336, + "router_z_loss_mlp": 0.4050293, + "step": 3244, + "time_per_iteration": 2.890136241912842 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048944, + "balance_loss_mlp": 1.00855565, + "epoch": 0.6242785686802617, + "flos": 495709254144.0, + "grad_norm": 0.03469894111823996, + "language_loss": 0.80177683, + "learning_rate": 0.00032671496976785774, + "loss": 0.81226623, + "num_input_tokens_seen": 270416496, + "router_z_loss_mlp": 0.40380859, + "step": 3245, + "time_per_iteration": 2.6587681770324707 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051304, + "balance_loss_mlp": 1.01091611, + "epoch": 0.6244709503655252, + "flos": 747234421248.0, + "grad_norm": 0.03291682412434118, + "language_loss": 0.76093823, + "learning_rate": 0.0003264227692326205, + "loss": 0.77145123, + "num_input_tokens_seen": 270501680, + "router_z_loss_mlp": 0.40380859, + "step": 3246, + "time_per_iteration": 3.0954296588897705 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050368, + "balance_loss_mlp": 1.00995624, + "epoch": 0.6246633320507887, + "flos": 493551258624.0, + "grad_norm": 0.036876384824843206, + "language_loss": 0.86561215, + "learning_rate": 0.00032613063608604055, + "loss": 0.8761158, + "num_input_tokens_seen": 270568656, + "router_z_loss_mlp": 0.40405273, + "step": 3247, + "time_per_iteration": 2.632049560546875 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043396, + "balance_loss_mlp": 1.00296032, + "epoch": 0.6248557137360523, + "flos": 518392518144.0, + "grad_norm": 0.03391504049871655, + "language_loss": 0.84063625, + "learning_rate": 0.0003258385704415343, + "loss": 0.85107023, + "num_input_tokens_seen": 270636160, + "router_z_loss_mlp": 0.40429688, + "step": 3248, + "time_per_iteration": 2.580336809158325 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043586, + "balance_loss_mlp": 1.00317442, + "epoch": 0.6250480954213159, + "flos": 520429004544.0, + "grad_norm": 0.028687824097281916, + "language_loss": 0.83734399, + "learning_rate": 0.0003255465724124915, + "loss": 0.84777981, + "num_input_tokens_seen": 270708816, + "router_z_loss_mlp": 0.40405273, + "step": 3249, + "time_per_iteration": 2.699963331222534 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046143, + "balance_loss_mlp": 1.00580287, + "epoch": 0.6252404771065795, + "flos": 517070502144.0, + "grad_norm": 0.03444404266219843, + "language_loss": 0.83187747, + "learning_rate": 0.00032525464211227587, + "loss": 0.84233886, + "num_input_tokens_seen": 270778016, + "router_z_loss_mlp": 0.40332031, + "step": 3250, + "time_per_iteration": 2.590261697769165 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045897, + "balance_loss_mlp": 1.0055331, + "epoch": 0.6254328587918431, + "flos": 577997932800.0, + "grad_norm": 0.03271100856558234, + "language_loss": 0.86164498, + "learning_rate": 0.0003249627796542249, + "loss": 0.87210405, + "num_input_tokens_seen": 270847072, + "router_z_loss_mlp": 0.40356445, + "step": 3251, + "time_per_iteration": 2.706554412841797 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046601, + "balance_loss_mlp": 1.006284, + "epoch": 0.6256252404771065, + "flos": 599105468928.0, + "grad_norm": 0.035746905542485746, + "language_loss": 0.84805512, + "learning_rate": 0.00032467098515164943, + "loss": 0.8585211, + "num_input_tokens_seen": 270926320, + "router_z_loss_mlp": 0.40307617, + "step": 3252, + "time_per_iteration": 2.870948076248169 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044339, + "balance_loss_mlp": 1.00411773, + "epoch": 0.6258176221623701, + "flos": 509361982464.0, + "grad_norm": 0.036795712439313615, + "language_loss": 0.84738171, + "learning_rate": 0.00032437925871783456, + "loss": 0.85782516, + "num_input_tokens_seen": 270997904, + "router_z_loss_mlp": 0.40209961, + "step": 3253, + "time_per_iteration": 2.6761369705200195 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104486, + "balance_loss_mlp": 1.00468659, + "epoch": 0.6260100038476337, + "flos": 640805347584.0, + "grad_norm": 0.03851108593477808, + "language_loss": 0.85338682, + "learning_rate": 0.00032408760046603803, + "loss": 0.86383539, + "num_input_tokens_seen": 271074256, + "router_z_loss_mlp": 0.40161133, + "step": 3254, + "time_per_iteration": 2.8586931228637695 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043618, + "balance_loss_mlp": 1.00344408, + "epoch": 0.6262023855328973, + "flos": 842452609536.0, + "grad_norm": 0.03391057824911436, + "language_loss": 0.78393734, + "learning_rate": 0.00032379601050949193, + "loss": 0.79437345, + "num_input_tokens_seen": 271155152, + "router_z_loss_mlp": 0.40161133, + "step": 3255, + "time_per_iteration": 3.0973715782165527 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046535, + "balance_loss_mlp": 1.00629032, + "epoch": 0.6263947672181608, + "flos": 523157607168.0, + "grad_norm": 0.03422589562212714, + "language_loss": 0.8863821, + "learning_rate": 0.0003235044889614013, + "loss": 0.89684743, + "num_input_tokens_seen": 271224784, + "router_z_loss_mlp": 0.40234375, + "step": 3256, + "time_per_iteration": 2.643688917160034 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046154, + "balance_loss_mlp": 1.00593293, + "epoch": 0.6265871489034244, + "flos": 608290561536.0, + "grad_norm": 0.06509285278700487, + "language_loss": 0.84065372, + "learning_rate": 0.0003232130359349451, + "loss": 0.85111523, + "num_input_tokens_seen": 271303584, + "router_z_loss_mlp": 0.40209961, + "step": 3257, + "time_per_iteration": 2.859252452850342 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047074, + "balance_loss_mlp": 1.00690067, + "epoch": 0.626779530588688, + "flos": 589594732800.0, + "grad_norm": 0.03191133097735202, + "language_loss": 0.82224607, + "learning_rate": 0.0003229216515432751, + "loss": 0.83271682, + "num_input_tokens_seen": 271379632, + "router_z_loss_mlp": 0.40161133, + "step": 3258, + "time_per_iteration": 2.7475619316101074 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046523, + "balance_loss_mlp": 1.00625372, + "epoch": 0.6269719122739515, + "flos": 439538164224.0, + "grad_norm": 0.04023600043450841, + "language_loss": 0.80242079, + "learning_rate": 0.0003226303358995174, + "loss": 0.81288606, + "num_input_tokens_seen": 271447808, + "router_z_loss_mlp": 0.40258789, + "step": 3259, + "time_per_iteration": 2.5837466716766357 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104682, + "balance_loss_mlp": 1.00647962, + "epoch": 0.6271642939592151, + "flos": 564015670272.0, + "grad_norm": 0.027274694738231114, + "language_loss": 0.88901317, + "learning_rate": 0.00032233908911677, + "loss": 0.89948136, + "num_input_tokens_seen": 271526768, + "router_z_loss_mlp": 0.40332031, + "step": 3260, + "time_per_iteration": 2.825246810913086 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044855, + "balance_loss_mlp": 1.00465786, + "epoch": 0.6273566756444786, + "flos": 515653221888.0, + "grad_norm": 0.03753718779185775, + "language_loss": 0.81557947, + "learning_rate": 0.0003220479113081053, + "loss": 0.82602805, + "num_input_tokens_seen": 271597840, + "router_z_loss_mlp": 0.40185547, + "step": 3261, + "time_per_iteration": 2.7426939010620117 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046001, + "balance_loss_mlp": 1.00566065, + "epoch": 0.6275490573297422, + "flos": 586588118784.0, + "grad_norm": 0.04387524863401932, + "language_loss": 0.79368806, + "learning_rate": 0.00032175680258656836, + "loss": 0.80414808, + "num_input_tokens_seen": 271668352, + "router_z_loss_mlp": 0.40332031, + "step": 3262, + "time_per_iteration": 2.704888343811035 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047409, + "balance_loss_mlp": 1.007092, + "epoch": 0.6277414390150058, + "flos": 560544407040.0, + "grad_norm": 0.03394703934758085, + "language_loss": 0.80846763, + "learning_rate": 0.00032146576306517794, + "loss": 0.81894171, + "num_input_tokens_seen": 271743936, + "router_z_loss_mlp": 0.40307617, + "step": 3263, + "time_per_iteration": 2.744232654571533 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045775, + "balance_loss_mlp": 1.00529134, + "epoch": 0.6279338207002694, + "flos": 613841085696.0, + "grad_norm": 0.03564897241316152, + "language_loss": 0.81241357, + "learning_rate": 0.0003211747928569255, + "loss": 0.82287127, + "num_input_tokens_seen": 271817008, + "router_z_loss_mlp": 0.40478516, + "step": 3264, + "time_per_iteration": 2.7210609912872314 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047934, + "balance_loss_mlp": 1.00754583, + "epoch": 0.6281262023855329, + "flos": 626933900544.0, + "grad_norm": 0.03587918693245657, + "language_loss": 0.81859601, + "learning_rate": 0.0003208838920747754, + "loss": 0.82907528, + "num_input_tokens_seen": 271896960, + "router_z_loss_mlp": 0.40380859, + "step": 3265, + "time_per_iteration": 2.828963041305542 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044178, + "balance_loss_mlp": 1.00379026, + "epoch": 0.6283185840707964, + "flos": 1125420367872.0, + "grad_norm": 0.03507856752255015, + "language_loss": 0.77222586, + "learning_rate": 0.0003205930608316656, + "loss": 0.78266764, + "num_input_tokens_seen": 271985008, + "router_z_loss_mlp": 0.40380859, + "step": 3266, + "time_per_iteration": 3.4536292552948 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104288, + "balance_loss_mlp": 1.00251615, + "epoch": 0.62851096575606, + "flos": 516332699136.0, + "grad_norm": 0.05679261767260983, + "language_loss": 0.85571408, + "learning_rate": 0.00032030229924050673, + "loss": 0.86614287, + "num_input_tokens_seen": 272056368, + "router_z_loss_mlp": 0.40356445, + "step": 3267, + "time_per_iteration": 2.669522285461426 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048611, + "balance_loss_mlp": 1.00815153, + "epoch": 0.6287033474413236, + "flos": 405062714112.0, + "grad_norm": 0.035560546659782886, + "language_loss": 0.80196536, + "learning_rate": 0.00032001160741418247, + "loss": 0.81245148, + "num_input_tokens_seen": 272123424, + "router_z_loss_mlp": 0.40454102, + "step": 3268, + "time_per_iteration": 2.6049489974975586 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044655, + "balance_loss_mlp": 1.00421953, + "epoch": 0.6288957291265872, + "flos": 526759127808.0, + "grad_norm": 0.05710921395997567, + "language_loss": 0.8274591, + "learning_rate": 0.0003197209854655494, + "loss": 0.83790565, + "num_input_tokens_seen": 272193008, + "router_z_loss_mlp": 0.40429688, + "step": 3269, + "time_per_iteration": 2.6551384925842285 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043687, + "balance_loss_mlp": 1.00313175, + "epoch": 0.6290881108118507, + "flos": 604958304000.0, + "grad_norm": 0.03774804220071916, + "language_loss": 0.75090307, + "learning_rate": 0.0003194304335074371, + "loss": 0.7613399, + "num_input_tokens_seen": 272275328, + "router_z_loss_mlp": 0.40551758, + "step": 3270, + "time_per_iteration": 2.851900577545166 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049729, + "balance_loss_mlp": 1.0093174, + "epoch": 0.6292804924971143, + "flos": 438598172160.0, + "grad_norm": 0.03683695296075174, + "language_loss": 0.89063656, + "learning_rate": 0.0003191399516526475, + "loss": 0.90113389, + "num_input_tokens_seen": 272339328, + "router_z_loss_mlp": 0.40405273, + "step": 3271, + "time_per_iteration": 2.5034451484680176 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045325, + "balance_loss_mlp": 1.00488937, + "epoch": 0.6294728741823779, + "flos": 607845354240.0, + "grad_norm": 0.03066213341534494, + "language_loss": 0.79802763, + "learning_rate": 0.0003188495400139559, + "loss": 0.80848092, + "num_input_tokens_seen": 272416336, + "router_z_loss_mlp": 0.40429688, + "step": 3272, + "time_per_iteration": 2.780644178390503 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045156, + "balance_loss_mlp": 1.00486362, + "epoch": 0.6296652558676414, + "flos": 702774837504.0, + "grad_norm": 0.038362375592622004, + "language_loss": 0.85288656, + "learning_rate": 0.00031855919870411013, + "loss": 0.86333817, + "num_input_tokens_seen": 272490368, + "router_z_loss_mlp": 0.40283203, + "step": 3273, + "time_per_iteration": 2.8482918739318848 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048015, + "balance_loss_mlp": 1.00769854, + "epoch": 0.6298576375529049, + "flos": 524944272384.0, + "grad_norm": 0.03395775035270535, + "language_loss": 0.85278755, + "learning_rate": 0.0003182689278358305, + "loss": 0.86326772, + "num_input_tokens_seen": 272562992, + "router_z_loss_mlp": 0.40307617, + "step": 3274, + "time_per_iteration": 2.7457242012023926 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046085, + "balance_loss_mlp": 1.00567281, + "epoch": 0.6300500192381685, + "flos": 476926909440.0, + "grad_norm": 0.036436552387549975, + "language_loss": 0.80145383, + "learning_rate": 0.0003179787275218105, + "loss": 0.81191462, + "num_input_tokens_seen": 272629456, + "router_z_loss_mlp": 0.40405273, + "step": 3275, + "time_per_iteration": 2.567723274230957 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044022, + "balance_loss_mlp": 1.00372946, + "epoch": 0.6302424009234321, + "flos": 521891971584.0, + "grad_norm": 0.03333768301867296, + "language_loss": 0.84862459, + "learning_rate": 0.0003176885978747155, + "loss": 0.85906482, + "num_input_tokens_seen": 272697440, + "router_z_loss_mlp": 0.40283203, + "step": 3276, + "time_per_iteration": 2.6513776779174805 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046077, + "balance_loss_mlp": 1.00587988, + "epoch": 0.6304347826086957, + "flos": 695858555904.0, + "grad_norm": 0.03467401587057451, + "language_loss": 0.83325267, + "learning_rate": 0.0003173985390071839, + "loss": 0.84371352, + "num_input_tokens_seen": 272774080, + "router_z_loss_mlp": 0.40185547, + "step": 3277, + "time_per_iteration": 2.876150131225586 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054981, + "balance_loss_mlp": 1.01578522, + "epoch": 0.6306271642939593, + "flos": 1470032928000.0, + "grad_norm": 0.010139969116537896, + "language_loss": 0.77900457, + "learning_rate": 0.00031710855103182675, + "loss": 0.78955436, + "num_input_tokens_seen": 272998512, + "router_z_loss_mlp": 0.39160156, + "step": 3278, + "time_per_iteration": 4.770167589187622 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045902, + "balance_loss_mlp": 1.00548971, + "epoch": 0.6308195459792227, + "flos": 602930565888.0, + "grad_norm": 0.03526553994141675, + "language_loss": 0.81487232, + "learning_rate": 0.00031681863406122704, + "loss": 0.82533133, + "num_input_tokens_seen": 273074672, + "router_z_loss_mlp": 0.40405273, + "step": 3279, + "time_per_iteration": 2.7587971687316895 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043917, + "balance_loss_mlp": 1.0036478, + "epoch": 0.6310119276644863, + "flos": 728237248512.0, + "grad_norm": 0.034493081934242914, + "language_loss": 0.85473228, + "learning_rate": 0.00031652878820794087, + "loss": 0.86517143, + "num_input_tokens_seen": 273157904, + "router_z_loss_mlp": 0.40258789, + "step": 3280, + "time_per_iteration": 2.9854700565338135 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045188, + "balance_loss_mlp": 1.00484729, + "epoch": 0.6312043093497499, + "flos": 520819776768.0, + "grad_norm": 0.037869406847462164, + "language_loss": 0.8647517, + "learning_rate": 0.00031623901358449627, + "loss": 0.87520361, + "num_input_tokens_seen": 273228160, + "router_z_loss_mlp": 0.40332031, + "step": 3281, + "time_per_iteration": 2.626267910003662 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044292, + "balance_loss_mlp": 1.00399899, + "epoch": 0.6313966910350135, + "flos": 532223136000.0, + "grad_norm": 0.03407480500665165, + "language_loss": 0.88792193, + "learning_rate": 0.0003159493103033936, + "loss": 0.89836484, + "num_input_tokens_seen": 273295872, + "router_z_loss_mlp": 0.40283203, + "step": 3282, + "time_per_iteration": 2.574249505996704 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046448, + "balance_loss_mlp": 1.00734711, + "epoch": 0.631589072720277, + "flos": 1382996656896.0, + "grad_norm": 0.01146599852639075, + "language_loss": 0.79919052, + "learning_rate": 0.00031565967847710564, + "loss": 0.80965501, + "num_input_tokens_seen": 273524320, + "router_z_loss_mlp": 0.390625, + "step": 3283, + "time_per_iteration": 4.8656487464904785 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047306, + "balance_loss_mlp": 1.00684595, + "epoch": 0.6317814544055406, + "flos": 625874344704.0, + "grad_norm": 0.030628800549983924, + "language_loss": 0.83010268, + "learning_rate": 0.0003153701182180776, + "loss": 0.84057581, + "num_input_tokens_seen": 273598544, + "router_z_loss_mlp": 0.40454102, + "step": 3284, + "time_per_iteration": 2.803232431411743 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047972, + "balance_loss_mlp": 1.00751245, + "epoch": 0.6319738360908042, + "flos": 499097892096.0, + "grad_norm": 0.036572578748274465, + "language_loss": 0.82564306, + "learning_rate": 0.00031508062963872655, + "loss": 0.83612275, + "num_input_tokens_seen": 273666000, + "router_z_loss_mlp": 0.40454102, + "step": 3285, + "time_per_iteration": 2.5559017658233643 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046554, + "balance_loss_mlp": 1.00602329, + "epoch": 0.6321662177760677, + "flos": 580909282560.0, + "grad_norm": 0.041327466784405305, + "language_loss": 0.80268341, + "learning_rate": 0.0003147912128514423, + "loss": 0.81314898, + "num_input_tokens_seen": 273742672, + "router_z_loss_mlp": 0.40527344, + "step": 3286, + "time_per_iteration": 2.7093169689178467 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044263, + "balance_loss_mlp": 1.00380301, + "epoch": 0.6323585994613313, + "flos": 602606867712.0, + "grad_norm": 0.0363944042801657, + "language_loss": 0.87847489, + "learning_rate": 0.0003145018679685859, + "loss": 0.88891751, + "num_input_tokens_seen": 273813984, + "router_z_loss_mlp": 0.40454102, + "step": 3287, + "time_per_iteration": 2.741680145263672 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047425, + "balance_loss_mlp": 1.00691795, + "epoch": 0.6325509811465948, + "flos": 529633539072.0, + "grad_norm": 0.02715728015284293, + "language_loss": 0.88303924, + "learning_rate": 0.00031421259510249134, + "loss": 0.89351344, + "num_input_tokens_seen": 273892848, + "router_z_loss_mlp": 0.4050293, + "step": 3288, + "time_per_iteration": 2.793593406677246 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050359, + "balance_loss_mlp": 1.00975657, + "epoch": 0.6327433628318584, + "flos": 575345152512.0, + "grad_norm": 0.03790719604682011, + "language_loss": 0.8176173, + "learning_rate": 0.00031392339436546414, + "loss": 0.82812083, + "num_input_tokens_seen": 273971696, + "router_z_loss_mlp": 0.40600586, + "step": 3289, + "time_per_iteration": 2.806328773498535 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105028, + "balance_loss_mlp": 1.00960624, + "epoch": 0.632935744517122, + "flos": 518112561408.0, + "grad_norm": 0.04130029787255878, + "language_loss": 0.84016752, + "learning_rate": 0.00031363426586978205, + "loss": 0.85067028, + "num_input_tokens_seen": 274048096, + "router_z_loss_mlp": 0.40673828, + "step": 3290, + "time_per_iteration": 2.815406322479248 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104727, + "balance_loss_mlp": 1.00676227, + "epoch": 0.6331281262023856, + "flos": 618597426432.0, + "grad_norm": 0.031083560389852355, + "language_loss": 0.85119176, + "learning_rate": 0.0003133452097276947, + "loss": 0.86166441, + "num_input_tokens_seen": 274122848, + "router_z_loss_mlp": 0.4050293, + "step": 3291, + "time_per_iteration": 2.7325408458709717 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104523, + "balance_loss_mlp": 1.00465119, + "epoch": 0.633320507887649, + "flos": 594116803584.0, + "grad_norm": 0.03244834687463976, + "language_loss": 0.84650022, + "learning_rate": 0.0003130562260514238, + "loss": 0.85695255, + "num_input_tokens_seen": 274198320, + "router_z_loss_mlp": 0.40576172, + "step": 3292, + "time_per_iteration": 2.7858352661132812 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046449, + "balance_loss_mlp": 1.00582266, + "epoch": 0.6335128895729126, + "flos": 583496934144.0, + "grad_norm": 0.03053589669397976, + "language_loss": 0.8217054, + "learning_rate": 0.0003127673149531626, + "loss": 0.83216989, + "num_input_tokens_seen": 274274944, + "router_z_loss_mlp": 0.40625, + "step": 3293, + "time_per_iteration": 2.755866050720215 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045667, + "balance_loss_mlp": 1.00506401, + "epoch": 0.6337052712581762, + "flos": 453974382336.0, + "grad_norm": 0.03437959175785583, + "language_loss": 0.83448106, + "learning_rate": 0.0003124784765450762, + "loss": 0.84493768, + "num_input_tokens_seen": 274342384, + "router_z_loss_mlp": 0.40600586, + "step": 3294, + "time_per_iteration": 2.555196762084961 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045908, + "balance_loss_mlp": 1.00535333, + "epoch": 0.6338976529434398, + "flos": 574515975936.0, + "grad_norm": 0.03647562664134654, + "language_loss": 0.810781, + "learning_rate": 0.0003121897109393017, + "loss": 0.82124007, + "num_input_tokens_seen": 274417568, + "router_z_loss_mlp": 0.40551758, + "step": 3295, + "time_per_iteration": 2.726447582244873 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044921, + "balance_loss_mlp": 1.00441372, + "epoch": 0.6340900346287034, + "flos": 509809135104.0, + "grad_norm": 0.0325303094953836, + "language_loss": 0.89509195, + "learning_rate": 0.0003119010182479481, + "loss": 0.90554118, + "num_input_tokens_seen": 274488960, + "router_z_loss_mlp": 0.4050293, + "step": 3296, + "time_per_iteration": 2.6128556728363037 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044733, + "balance_loss_mlp": 1.00422609, + "epoch": 0.6342824163139669, + "flos": 480715067904.0, + "grad_norm": 0.036682379732438104, + "language_loss": 0.8339026, + "learning_rate": 0.00031161239858309563, + "loss": 0.84434992, + "num_input_tokens_seen": 274556880, + "router_z_loss_mlp": 0.4050293, + "step": 3297, + "time_per_iteration": 2.571183443069458 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043714, + "balance_loss_mlp": 1.00323093, + "epoch": 0.6344747979992305, + "flos": 573111334656.0, + "grad_norm": 0.03822576874130642, + "language_loss": 0.83954668, + "learning_rate": 0.0003113238520567964, + "loss": 0.84998387, + "num_input_tokens_seen": 274624944, + "router_z_loss_mlp": 0.40478516, + "step": 3298, + "time_per_iteration": 2.677607297897339 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01041846, + "balance_loss_mlp": 1.00143397, + "epoch": 0.634667179684494, + "flos": 607046313216.0, + "grad_norm": 0.03748382415323519, + "language_loss": 0.818299, + "learning_rate": 0.00031103537878107403, + "loss": 0.82871747, + "num_input_tokens_seen": 274695152, + "router_z_loss_mlp": 0.40405273, + "step": 3299, + "time_per_iteration": 2.731858730316162 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01041231, + "balance_loss_mlp": 1.0007478, + "epoch": 0.6348595613697576, + "flos": 648129897984.0, + "grad_norm": 0.036818455755728355, + "language_loss": 0.80712759, + "learning_rate": 0.0003107469788679238, + "loss": 0.81753987, + "num_input_tokens_seen": 274767840, + "router_z_loss_mlp": 0.40478516, + "step": 3300, + "time_per_iteration": 2.811863660812378 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01041324, + "balance_loss_mlp": 1.00088787, + "epoch": 0.6350519430550212, + "flos": 640273624320.0, + "grad_norm": 0.03493243312285999, + "language_loss": 0.872877, + "learning_rate": 0.00031045865242931267, + "loss": 0.88329029, + "num_input_tokens_seen": 274839312, + "router_z_loss_mlp": 0.40429688, + "step": 3301, + "time_per_iteration": 2.7718210220336914 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042406, + "balance_loss_mlp": 1.00206506, + "epoch": 0.6352443247402847, + "flos": 687831195648.0, + "grad_norm": 0.031178821676135258, + "language_loss": 0.83354819, + "learning_rate": 0.00031017039957717877, + "loss": 0.84397227, + "num_input_tokens_seen": 274922704, + "router_z_loss_mlp": 0.40332031, + "step": 3302, + "time_per_iteration": 3.0323870182037354 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050725, + "balance_loss_mlp": 1.01028883, + "epoch": 0.6354367064255483, + "flos": 560526910464.0, + "grad_norm": 0.03426704048429257, + "language_loss": 0.89209497, + "learning_rate": 0.0003098822204234318, + "loss": 0.9026022, + "num_input_tokens_seen": 274992848, + "router_z_loss_mlp": 0.40429688, + "step": 3303, + "time_per_iteration": 2.688183069229126 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048749, + "balance_loss_mlp": 1.00831378, + "epoch": 0.6356290881108119, + "flos": 981062077440.0, + "grad_norm": 0.05617774198225317, + "language_loss": 0.88024724, + "learning_rate": 0.00030959411507995273, + "loss": 0.89073473, + "num_input_tokens_seen": 275071456, + "router_z_loss_mlp": 0.40429688, + "step": 3304, + "time_per_iteration": 3.2071332931518555 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050985, + "balance_loss_mlp": 1.01050138, + "epoch": 0.6358214697960755, + "flos": 529373024256.0, + "grad_norm": 0.04089277764533041, + "language_loss": 0.81679875, + "learning_rate": 0.00030930608365859407, + "loss": 0.82730865, + "num_input_tokens_seen": 275140512, + "router_z_loss_mlp": 0.40478516, + "step": 3305, + "time_per_iteration": 2.6791036128997803 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052235, + "balance_loss_mlp": 1.01184678, + "epoch": 0.6360138514813389, + "flos": 517869543168.0, + "grad_norm": 0.03251934179180288, + "language_loss": 0.88227487, + "learning_rate": 0.00030901812627117943, + "loss": 0.89279723, + "num_input_tokens_seen": 275210896, + "router_z_loss_mlp": 0.40380859, + "step": 3306, + "time_per_iteration": 2.643564462661743 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047687, + "balance_loss_mlp": 1.00720358, + "epoch": 0.6362062331666025, + "flos": 467470608384.0, + "grad_norm": 0.0425448547397637, + "language_loss": 0.85627687, + "learning_rate": 0.000308730243029504, + "loss": 0.8667537, + "num_input_tokens_seen": 275279888, + "router_z_loss_mlp": 0.40478516, + "step": 3307, + "time_per_iteration": 2.5909810066223145 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049053, + "balance_loss_mlp": 1.00854588, + "epoch": 0.6363986148518661, + "flos": 550773156096.0, + "grad_norm": 0.03484330169343757, + "language_loss": 0.80282146, + "learning_rate": 0.0003084424340453339, + "loss": 0.81331193, + "num_input_tokens_seen": 275357056, + "router_z_loss_mlp": 0.4050293, + "step": 3308, + "time_per_iteration": 2.84796142578125 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048791, + "balance_loss_mlp": 1.00830781, + "epoch": 0.6365909965371297, + "flos": 584158914816.0, + "grad_norm": 0.03632736574425893, + "language_loss": 0.82740968, + "learning_rate": 0.0003081546994304064, + "loss": 0.83789754, + "num_input_tokens_seen": 275428240, + "router_z_loss_mlp": 0.40478516, + "step": 3309, + "time_per_iteration": 2.7956221103668213 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105091, + "balance_loss_mlp": 1.01052189, + "epoch": 0.6367833782223933, + "flos": 532288264704.0, + "grad_norm": 0.03383722740926899, + "language_loss": 0.83152783, + "learning_rate": 0.0003078670392964298, + "loss": 0.8420369, + "num_input_tokens_seen": 275497568, + "router_z_loss_mlp": 0.40380859, + "step": 3310, + "time_per_iteration": 2.6194021701812744 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049103, + "balance_loss_mlp": 1.00883412, + "epoch": 0.6369757599076568, + "flos": 570588811776.0, + "grad_norm": 0.03520180951361345, + "language_loss": 0.83487624, + "learning_rate": 0.00030757945375508406, + "loss": 0.84536731, + "num_input_tokens_seen": 275569616, + "router_z_loss_mlp": 0.40258789, + "step": 3311, + "time_per_iteration": 2.636317729949951 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046751, + "balance_loss_mlp": 1.00614858, + "epoch": 0.6371681415929203, + "flos": 541054394880.0, + "grad_norm": 0.03810911352031966, + "language_loss": 0.81548536, + "learning_rate": 0.00030729194291801944, + "loss": 0.82595289, + "num_input_tokens_seen": 275641408, + "router_z_loss_mlp": 0.40600586, + "step": 3312, + "time_per_iteration": 2.6490793228149414 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045052, + "balance_loss_mlp": 1.00452065, + "epoch": 0.6373605232781839, + "flos": 484531416576.0, + "grad_norm": 0.03667535496624994, + "language_loss": 0.77428758, + "learning_rate": 0.00030700450689685787, + "loss": 0.78473806, + "num_input_tokens_seen": 275706608, + "router_z_loss_mlp": 0.40527344, + "step": 3313, + "time_per_iteration": 2.535402774810791 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046144, + "balance_loss_mlp": 1.00575566, + "epoch": 0.6375529049634475, + "flos": 579817645824.0, + "grad_norm": 0.03891693330572632, + "language_loss": 0.85701913, + "learning_rate": 0.00030671714580319186, + "loss": 0.86748058, + "num_input_tokens_seen": 275785952, + "router_z_loss_mlp": 0.40380859, + "step": 3314, + "time_per_iteration": 2.8058876991271973 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044916, + "balance_loss_mlp": 1.00433683, + "epoch": 0.637745286648711, + "flos": 683480211456.0, + "grad_norm": 0.11702238081113171, + "language_loss": 0.83888423, + "learning_rate": 0.0003064298597485846, + "loss": 0.84933341, + "num_input_tokens_seen": 275866240, + "router_z_loss_mlp": 0.40576172, + "step": 3315, + "time_per_iteration": 2.8778491020202637 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045329, + "balance_loss_mlp": 1.00489366, + "epoch": 0.6379376683339746, + "flos": 505649646336.0, + "grad_norm": 0.05211428291246213, + "language_loss": 0.84419525, + "learning_rate": 0.00030614264884457054, + "loss": 0.85464859, + "num_input_tokens_seen": 275936176, + "router_z_loss_mlp": 0.40429688, + "step": 3316, + "time_per_iteration": 2.624901533126831 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050301, + "balance_loss_mlp": 1.00977015, + "epoch": 0.6381300500192382, + "flos": 503025056256.0, + "grad_norm": 0.0426813784455398, + "language_loss": 0.77854991, + "learning_rate": 0.000305855513202655, + "loss": 0.7890529, + "num_input_tokens_seen": 276004608, + "router_z_loss_mlp": 0.40527344, + "step": 3317, + "time_per_iteration": 2.5690500736236572 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048316, + "balance_loss_mlp": 1.0077374, + "epoch": 0.6383224317045018, + "flos": 401367874560.0, + "grad_norm": 0.04267134147869369, + "language_loss": 0.78333461, + "learning_rate": 0.0003055684529343138, + "loss": 0.79381788, + "num_input_tokens_seen": 276066688, + "router_z_loss_mlp": 0.40576172, + "step": 3318, + "time_per_iteration": 2.4513895511627197 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054177, + "balance_loss_mlp": 1.01378846, + "epoch": 0.6385148133897653, + "flos": 500363527680.0, + "grad_norm": 0.0362987336754338, + "language_loss": 0.78882575, + "learning_rate": 0.00030528146815099374, + "loss": 0.79936755, + "num_input_tokens_seen": 276140000, + "router_z_loss_mlp": 0.40380859, + "step": 3319, + "time_per_iteration": 2.6613929271698 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01058923, + "balance_loss_mlp": 1.01851058, + "epoch": 0.6387071950750288, + "flos": 528695492352.0, + "grad_norm": 0.033070910188452485, + "language_loss": 0.72438365, + "learning_rate": 0.00030499455896411203, + "loss": 0.73497283, + "num_input_tokens_seen": 276209840, + "router_z_loss_mlp": 0.40405273, + "step": 3320, + "time_per_iteration": 2.641817092895508 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062973, + "balance_loss_mlp": 1.02330017, + "epoch": 0.6388995767602924, + "flos": 1459106856960.0, + "grad_norm": 0.013037560040261834, + "language_loss": 0.76300812, + "learning_rate": 0.0003047077254850568, + "loss": 0.77363789, + "num_input_tokens_seen": 276444784, + "router_z_loss_mlp": 0.39648438, + "step": 3321, + "time_per_iteration": 4.960562705993652 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048303, + "balance_loss_mlp": 1.00777197, + "epoch": 0.639091958445556, + "flos": 605171186688.0, + "grad_norm": 0.03633146914450565, + "language_loss": 0.77279496, + "learning_rate": 0.0003044209678251865, + "loss": 0.78327799, + "num_input_tokens_seen": 276522768, + "router_z_loss_mlp": 0.40527344, + "step": 3322, + "time_per_iteration": 2.875474691390991 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048137, + "balance_loss_mlp": 1.00762939, + "epoch": 0.6392843401308196, + "flos": 585665623296.0, + "grad_norm": 0.031694233880752425, + "language_loss": 0.85324746, + "learning_rate": 0.0003041342860958306, + "loss": 0.86372876, + "num_input_tokens_seen": 276597104, + "router_z_loss_mlp": 0.4050293, + "step": 3323, + "time_per_iteration": 2.7719669342041016 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049921, + "balance_loss_mlp": 1.00939035, + "epoch": 0.6394767218160831, + "flos": 515729044224.0, + "grad_norm": 0.03911936056883103, + "language_loss": 0.91999781, + "learning_rate": 0.00030384768040828857, + "loss": 0.93049705, + "num_input_tokens_seen": 276670256, + "router_z_loss_mlp": 0.40527344, + "step": 3324, + "time_per_iteration": 2.6998729705810547 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046675, + "balance_loss_mlp": 1.00607193, + "epoch": 0.6396691035013466, + "flos": 542777876736.0, + "grad_norm": 0.04757896669484628, + "language_loss": 0.86295962, + "learning_rate": 0.00030356115087383094, + "loss": 0.87342638, + "num_input_tokens_seen": 276737680, + "router_z_loss_mlp": 0.40600586, + "step": 3325, + "time_per_iteration": 2.701478958129883 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050797, + "balance_loss_mlp": 1.01033795, + "epoch": 0.6398614851866102, + "flos": 526554993408.0, + "grad_norm": 0.04173120766563636, + "language_loss": 0.85232729, + "learning_rate": 0.00030327469760369803, + "loss": 0.86283523, + "num_input_tokens_seen": 276803808, + "router_z_loss_mlp": 0.40454102, + "step": 3326, + "time_per_iteration": 2.5700113773345947 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048429, + "balance_loss_mlp": 1.0079217, + "epoch": 0.6400538668718738, + "flos": 624135311616.0, + "grad_norm": 0.07319214553535336, + "language_loss": 0.85706425, + "learning_rate": 0.0003029883207091009, + "loss": 0.86754858, + "num_input_tokens_seen": 276874752, + "router_z_loss_mlp": 0.4050293, + "step": 3327, + "time_per_iteration": 2.7076821327209473 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044376, + "balance_loss_mlp": 1.00391674, + "epoch": 0.6402462485571374, + "flos": 504455942400.0, + "grad_norm": 0.03613290239480707, + "language_loss": 0.78819323, + "learning_rate": 0.00030270202030122095, + "loss": 0.79863703, + "num_input_tokens_seen": 276947200, + "router_z_loss_mlp": 0.40454102, + "step": 3328, + "time_per_iteration": 2.7022666931152344 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043909, + "balance_loss_mlp": 1.00337768, + "epoch": 0.6404386302424009, + "flos": 820663650816.0, + "grad_norm": 0.036325579184177476, + "language_loss": 0.8635475, + "learning_rate": 0.00030241579649121, + "loss": 0.8739866, + "num_input_tokens_seen": 277025712, + "router_z_loss_mlp": 0.40527344, + "step": 3329, + "time_per_iteration": 2.985426902770996 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048537, + "balance_loss_mlp": 1.0080061, + "epoch": 0.6406310119276645, + "flos": 472793665536.0, + "grad_norm": 0.03267380509371782, + "language_loss": 0.80188096, + "learning_rate": 0.00030212964939018994, + "loss": 0.81236637, + "num_input_tokens_seen": 277091264, + "router_z_loss_mlp": 0.40527344, + "step": 3330, + "time_per_iteration": 2.550344228744507 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048063, + "balance_loss_mlp": 1.00753188, + "epoch": 0.6408233936129281, + "flos": 426489090816.0, + "grad_norm": 0.03827308355906826, + "language_loss": 0.86015689, + "learning_rate": 0.0003018435791092527, + "loss": 0.87063748, + "num_input_tokens_seen": 277154608, + "router_z_loss_mlp": 0.40527344, + "step": 3331, + "time_per_iteration": 2.4880104064941406 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042416, + "balance_loss_mlp": 1.00186062, + "epoch": 0.6410157752981916, + "flos": 550838284800.0, + "grad_norm": 0.0342671152523666, + "language_loss": 0.81525755, + "learning_rate": 0.00030155758575946083, + "loss": 0.82568169, + "num_input_tokens_seen": 277222176, + "router_z_loss_mlp": 0.40551758, + "step": 3332, + "time_per_iteration": 2.6726834774017334 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043186, + "balance_loss_mlp": 1.00267851, + "epoch": 0.6412081569834551, + "flos": 476861780736.0, + "grad_norm": 0.03538778522895548, + "language_loss": 0.84473503, + "learning_rate": 0.0003012716694518467, + "loss": 0.85516679, + "num_input_tokens_seen": 277289600, + "router_z_loss_mlp": 0.4050293, + "step": 3333, + "time_per_iteration": 2.5853443145751953 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042688, + "balance_loss_mlp": 1.00206196, + "epoch": 0.6414005386687187, + "flos": 542031325440.0, + "grad_norm": 0.03182184712742977, + "language_loss": 0.85642707, + "learning_rate": 0.000300985830297413, + "loss": 0.86685395, + "num_input_tokens_seen": 277362784, + "router_z_loss_mlp": 0.40625, + "step": 3334, + "time_per_iteration": 2.699078321456909 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042335, + "balance_loss_mlp": 1.00170887, + "epoch": 0.6415929203539823, + "flos": 1042957690368.0, + "grad_norm": 0.0341924045479309, + "language_loss": 0.88431525, + "learning_rate": 0.00030070006840713205, + "loss": 0.89473861, + "num_input_tokens_seen": 277449728, + "router_z_loss_mlp": 0.40625, + "step": 3335, + "time_per_iteration": 3.373852014541626 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046743, + "balance_loss_mlp": 1.0060693, + "epoch": 0.6417853020392459, + "flos": 649580226048.0, + "grad_norm": 0.035751052988779126, + "language_loss": 0.74186742, + "learning_rate": 0.000300414383891947, + "loss": 0.75233489, + "num_input_tokens_seen": 277527552, + "router_z_loss_mlp": 0.40673828, + "step": 3336, + "time_per_iteration": 2.86029314994812 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104222, + "balance_loss_mlp": 1.0014739, + "epoch": 0.6419776837245095, + "flos": 501944113152.0, + "grad_norm": 0.02988455094961003, + "language_loss": 0.89225817, + "learning_rate": 0.00030012877686276973, + "loss": 0.90268028, + "num_input_tokens_seen": 277603568, + "router_z_loss_mlp": 0.4074707, + "step": 3337, + "time_per_iteration": 2.72491455078125 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046417, + "balance_loss_mlp": 1.00569534, + "epoch": 0.642170065409773, + "flos": 621779984640.0, + "grad_norm": 0.03237702044621704, + "language_loss": 0.87225235, + "learning_rate": 0.0002998432474304832, + "loss": 0.88271654, + "num_input_tokens_seen": 277679696, + "router_z_loss_mlp": 0.40722656, + "step": 3338, + "time_per_iteration": 2.7576870918273926 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051331, + "balance_loss_mlp": 1.01165771, + "epoch": 0.6423624470950365, + "flos": 1426641648384.0, + "grad_norm": 0.016568770215616015, + "language_loss": 0.79237342, + "learning_rate": 0.0002995577957059395, + "loss": 0.80288672, + "num_input_tokens_seen": 277913056, + "router_z_loss_mlp": 0.39648438, + "step": 3339, + "time_per_iteration": 4.923727035522461 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01041967, + "balance_loss_mlp": 1.00143564, + "epoch": 0.6425548287803001, + "flos": 563440205568.0, + "grad_norm": 0.03881466361138169, + "language_loss": 0.890571, + "learning_rate": 0.00029927242179996107, + "loss": 0.90099066, + "num_input_tokens_seen": 277983168, + "router_z_loss_mlp": 0.40527344, + "step": 3340, + "time_per_iteration": 2.7034361362457275 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042658, + "balance_loss_mlp": 1.00212634, + "epoch": 0.6427472104655637, + "flos": 586614363648.0, + "grad_norm": 0.030378234734855056, + "language_loss": 0.83618605, + "learning_rate": 0.0002989871258233398, + "loss": 0.84661257, + "num_input_tokens_seen": 278057600, + "router_z_loss_mlp": 0.40527344, + "step": 3341, + "time_per_iteration": 2.7497901916503906 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042745, + "balance_loss_mlp": 1.00211823, + "epoch": 0.6429395921508272, + "flos": 405147284736.0, + "grad_norm": 0.03870957855804831, + "language_loss": 0.83240426, + "learning_rate": 0.0002987019078868373, + "loss": 0.84283173, + "num_input_tokens_seen": 278119232, + "router_z_loss_mlp": 0.40625, + "step": 3342, + "time_per_iteration": 2.425215005874634 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044277, + "balance_loss_mlp": 1.00362682, + "epoch": 0.6431319738360908, + "flos": 549833164032.0, + "grad_norm": 0.031726413731120486, + "language_loss": 0.82255763, + "learning_rate": 0.00029841676810118484, + "loss": 0.83300042, + "num_input_tokens_seen": 278187456, + "router_z_loss_mlp": 0.40649414, + "step": 3343, + "time_per_iteration": 2.693652629852295 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044358, + "balance_loss_mlp": 1.00368381, + "epoch": 0.6433243555213544, + "flos": 794706455040.0, + "grad_norm": 0.03684738873998065, + "language_loss": 0.87695611, + "learning_rate": 0.0002981317065770839, + "loss": 0.88739967, + "num_input_tokens_seen": 278262176, + "router_z_loss_mlp": 0.40673828, + "step": 3344, + "time_per_iteration": 3.0393459796905518 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104227, + "balance_loss_mlp": 1.00147617, + "epoch": 0.643516737206618, + "flos": 584113228032.0, + "grad_norm": 0.0395181937617663, + "language_loss": 0.81428736, + "learning_rate": 0.00029784672342520493, + "loss": 0.82471007, + "num_input_tokens_seen": 278328816, + "router_z_loss_mlp": 0.40795898, + "step": 3345, + "time_per_iteration": 2.6979730129241943 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045514, + "balance_loss_mlp": 1.00479162, + "epoch": 0.6437091188918815, + "flos": 519751472640.0, + "grad_norm": 0.07302138379312399, + "language_loss": 0.8401407, + "learning_rate": 0.00029756181875618834, + "loss": 0.85059583, + "num_input_tokens_seen": 278395824, + "router_z_loss_mlp": 0.40722656, + "step": 3346, + "time_per_iteration": 2.609215497970581 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046536, + "balance_loss_mlp": 1.00588584, + "epoch": 0.643901500577145, + "flos": 385787529984.0, + "grad_norm": 0.039174224295971255, + "language_loss": 0.83988988, + "learning_rate": 0.0002972769926806439, + "loss": 0.85035521, + "num_input_tokens_seen": 278457696, + "router_z_loss_mlp": 0.40649414, + "step": 3347, + "time_per_iteration": 2.4672152996063232 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044082, + "balance_loss_mlp": 1.00345576, + "epoch": 0.6440938822624086, + "flos": 484698612480.0, + "grad_norm": 0.03574243057214525, + "language_loss": 0.88977337, + "learning_rate": 0.0002969922453091508, + "loss": 0.9002142, + "num_input_tokens_seen": 278526992, + "router_z_loss_mlp": 0.40625, + "step": 3348, + "time_per_iteration": 2.615544557571411 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044846, + "balance_loss_mlp": 1.00414753, + "epoch": 0.6442862639476722, + "flos": 541638607872.0, + "grad_norm": 0.030177655617681567, + "language_loss": 0.85437477, + "learning_rate": 0.00029670757675225777, + "loss": 0.86482322, + "num_input_tokens_seen": 278601120, + "router_z_loss_mlp": 0.40698242, + "step": 3349, + "time_per_iteration": 2.7615771293640137 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047798, + "balance_loss_mlp": 1.0071243, + "epoch": 0.6444786456329358, + "flos": 527959634688.0, + "grad_norm": 0.036762953036999044, + "language_loss": 0.79762578, + "learning_rate": 0.0002964229871204831, + "loss": 0.8081038, + "num_input_tokens_seen": 278668208, + "router_z_loss_mlp": 0.40673828, + "step": 3350, + "time_per_iteration": 2.6479439735412598 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048493, + "balance_loss_mlp": 1.00781858, + "epoch": 0.6446710273181993, + "flos": 699162623232.0, + "grad_norm": 0.0356496056156774, + "language_loss": 0.84474576, + "learning_rate": 0.00029613847652431403, + "loss": 0.85523063, + "num_input_tokens_seen": 278742832, + "router_z_loss_mlp": 0.40673828, + "step": 3351, + "time_per_iteration": 2.852724313735962 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045888, + "balance_loss_mlp": 1.00514281, + "epoch": 0.6448634090034628, + "flos": 626300110080.0, + "grad_norm": 0.031569039076812924, + "language_loss": 0.79828554, + "learning_rate": 0.0002958540450742078, + "loss": 0.80874443, + "num_input_tokens_seen": 278829744, + "router_z_loss_mlp": 0.4074707, + "step": 3352, + "time_per_iteration": 2.943434238433838 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104848, + "balance_loss_mlp": 1.0077343, + "epoch": 0.6450557906887264, + "flos": 602166518016.0, + "grad_norm": 0.03244355782647549, + "language_loss": 0.7780689, + "learning_rate": 0.0002955696928805901, + "loss": 0.78855366, + "num_input_tokens_seen": 278908592, + "router_z_loss_mlp": 0.4074707, + "step": 3353, + "time_per_iteration": 2.9107890129089355 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046337, + "balance_loss_mlp": 1.0057348, + "epoch": 0.64524817237399, + "flos": 647385292032.0, + "grad_norm": 0.03305835241833302, + "language_loss": 0.86728162, + "learning_rate": 0.0002952854200538563, + "loss": 0.87774503, + "num_input_tokens_seen": 278986960, + "router_z_loss_mlp": 0.40600586, + "step": 3354, + "time_per_iteration": 2.8001787662506104 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104505, + "balance_loss_mlp": 1.00430393, + "epoch": 0.6454405540592536, + "flos": 474367448064.0, + "grad_norm": 0.03406107124883384, + "language_loss": 0.8233161, + "learning_rate": 0.000295001226704371, + "loss": 0.83376658, + "num_input_tokens_seen": 279054896, + "router_z_loss_mlp": 0.4074707, + "step": 3355, + "time_per_iteration": 2.6213538646698 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044402, + "balance_loss_mlp": 1.00372756, + "epoch": 0.6456329357445171, + "flos": 613020657408.0, + "grad_norm": 0.03542934708236725, + "language_loss": 0.82853353, + "learning_rate": 0.00029471711294246783, + "loss": 0.83897758, + "num_input_tokens_seen": 279126816, + "router_z_loss_mlp": 0.40673828, + "step": 3356, + "time_per_iteration": 2.790909767150879 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044152, + "balance_loss_mlp": 1.00362051, + "epoch": 0.6458253174297807, + "flos": 732932351232.0, + "grad_norm": 0.03702752169183614, + "language_loss": 0.82778573, + "learning_rate": 0.0002944330788784494, + "loss": 0.83822721, + "num_input_tokens_seen": 279197552, + "router_z_loss_mlp": 0.40527344, + "step": 3357, + "time_per_iteration": 2.8837075233459473 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044846, + "balance_loss_mlp": 1.00424361, + "epoch": 0.6460176991150443, + "flos": 571555048704.0, + "grad_norm": 0.04139380130769849, + "language_loss": 0.84656543, + "learning_rate": 0.00029414912462258786, + "loss": 0.85701388, + "num_input_tokens_seen": 279275440, + "router_z_loss_mlp": 0.40600586, + "step": 3358, + "time_per_iteration": 2.8205137252807617 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046066, + "balance_loss_mlp": 1.00543988, + "epoch": 0.6462100808003078, + "flos": 584243485440.0, + "grad_norm": 0.03729295118772339, + "language_loss": 0.81916165, + "learning_rate": 0.00029386525028512366, + "loss": 0.82962239, + "num_input_tokens_seen": 279349168, + "router_z_loss_mlp": 0.40625, + "step": 3359, + "time_per_iteration": 2.7342734336853027 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044545, + "balance_loss_mlp": 1.00391877, + "epoch": 0.6464024624855714, + "flos": 485011617024.0, + "grad_norm": 0.03542298422939795, + "language_loss": 0.87396795, + "learning_rate": 0.0002935814559762666, + "loss": 0.88441336, + "num_input_tokens_seen": 279427600, + "router_z_loss_mlp": 0.40625, + "step": 3360, + "time_per_iteration": 2.7663137912750244 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044184, + "balance_loss_mlp": 1.00362873, + "epoch": 0.6465948441708349, + "flos": 528843246336.0, + "grad_norm": 0.034215531166731795, + "language_loss": 0.80432177, + "learning_rate": 0.0002932977418061957, + "loss": 0.81476361, + "num_input_tokens_seen": 279496608, + "router_z_loss_mlp": 0.40551758, + "step": 3361, + "time_per_iteration": 2.680459976196289 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043549, + "balance_loss_mlp": 1.00299382, + "epoch": 0.6467872258560985, + "flos": 670626524160.0, + "grad_norm": 0.03987324070915456, + "language_loss": 0.81433517, + "learning_rate": 0.00029301410788505833, + "loss": 0.82477069, + "num_input_tokens_seen": 279568448, + "router_z_loss_mlp": 0.40551758, + "step": 3362, + "time_per_iteration": 2.772834539413452 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042359, + "balance_loss_mlp": 1.00178003, + "epoch": 0.6469796075413621, + "flos": 433040845056.0, + "grad_norm": 0.046274531894689615, + "language_loss": 0.81467456, + "learning_rate": 0.00029273055432297126, + "loss": 0.82509816, + "num_input_tokens_seen": 279631952, + "router_z_loss_mlp": 0.40576172, + "step": 3363, + "time_per_iteration": 2.49839448928833 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042492, + "balance_loss_mlp": 1.00188959, + "epoch": 0.6471719892266257, + "flos": 805102748160.0, + "grad_norm": 0.03834251982821679, + "language_loss": 0.81200004, + "learning_rate": 0.00029244708123001917, + "loss": 0.82242495, + "num_input_tokens_seen": 279706880, + "router_z_loss_mlp": 0.40600586, + "step": 3364, + "time_per_iteration": 2.968705177307129 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042662, + "balance_loss_mlp": 1.00215495, + "epoch": 0.6473643709118891, + "flos": 578349821184.0, + "grad_norm": 0.036932041933641975, + "language_loss": 0.84809864, + "learning_rate": 0.0002921636887162565, + "loss": 0.85852528, + "num_input_tokens_seen": 279778864, + "router_z_loss_mlp": 0.4050293, + "step": 3365, + "time_per_iteration": 2.7454428672790527 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044497, + "balance_loss_mlp": 1.00398982, + "epoch": 0.6475567525971527, + "flos": 762788520960.0, + "grad_norm": 0.046091211557592264, + "language_loss": 0.8445828, + "learning_rate": 0.00029188037689170595, + "loss": 0.85502779, + "num_input_tokens_seen": 279853328, + "router_z_loss_mlp": 0.4050293, + "step": 3366, + "time_per_iteration": 2.9878523349761963 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043297, + "balance_loss_mlp": 1.00274241, + "epoch": 0.6477491342824163, + "flos": 844501734912.0, + "grad_norm": 0.04252046587739173, + "language_loss": 0.84425056, + "learning_rate": 0.0002915971458663586, + "loss": 0.85468358, + "num_input_tokens_seen": 279928464, + "router_z_loss_mlp": 0.40551758, + "step": 3367, + "time_per_iteration": 3.052515745162964 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050585, + "balance_loss_mlp": 1.01003003, + "epoch": 0.6479415159676799, + "flos": 886382415360.0, + "grad_norm": 0.03864645902049365, + "language_loss": 0.82315862, + "learning_rate": 0.00029131399575017494, + "loss": 0.83366442, + "num_input_tokens_seen": 280015680, + "router_z_loss_mlp": 0.40551758, + "step": 3368, + "time_per_iteration": 3.1613588333129883 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050945, + "balance_loss_mlp": 1.01034212, + "epoch": 0.6481338976529435, + "flos": 616724245248.0, + "grad_norm": 0.06720988527624061, + "language_loss": 0.86632174, + "learning_rate": 0.0002910309266530836, + "loss": 0.87683117, + "num_input_tokens_seen": 280093904, + "router_z_loss_mlp": 0.40600586, + "step": 3369, + "time_per_iteration": 2.800647497177124 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051276, + "balance_loss_mlp": 1.01067364, + "epoch": 0.648326279338207, + "flos": 511020335616.0, + "grad_norm": 0.03423893349875194, + "language_loss": 0.85872662, + "learning_rate": 0.0002907479386849814, + "loss": 0.86923945, + "num_input_tokens_seen": 280161584, + "router_z_loss_mlp": 0.40600586, + "step": 3370, + "time_per_iteration": 2.6336069107055664 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105095, + "balance_loss_mlp": 1.0103476, + "epoch": 0.6485186610234706, + "flos": 703869386496.0, + "grad_norm": 0.03204560465373447, + "language_loss": 0.80689716, + "learning_rate": 0.0002904650319557339, + "loss": 0.81740665, + "num_input_tokens_seen": 280248016, + "router_z_loss_mlp": 0.40600586, + "step": 3371, + "time_per_iteration": 2.9737660884857178 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054287, + "balance_loss_mlp": 1.01349366, + "epoch": 0.6487110427087341, + "flos": 561746859264.0, + "grad_norm": 0.039912158099113866, + "language_loss": 0.81825972, + "learning_rate": 0.0002901822065751758, + "loss": 0.82880259, + "num_input_tokens_seen": 280319024, + "router_z_loss_mlp": 0.40795898, + "step": 3372, + "time_per_iteration": 2.678905487060547 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054182, + "balance_loss_mlp": 1.01341212, + "epoch": 0.6489034243939977, + "flos": 681302774016.0, + "grad_norm": 0.03214296467255679, + "language_loss": 0.86033392, + "learning_rate": 0.0002898994626531093, + "loss": 0.87087572, + "num_input_tokens_seen": 280393200, + "router_z_loss_mlp": 0.40771484, + "step": 3373, + "time_per_iteration": 2.9144790172576904 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047579, + "balance_loss_mlp": 1.00688112, + "epoch": 0.6490958060792612, + "flos": 475372568832.0, + "grad_norm": 0.03458153211721296, + "language_loss": 0.88523054, + "learning_rate": 0.00028961680029930526, + "loss": 0.8957063, + "num_input_tokens_seen": 280456944, + "router_z_loss_mlp": 0.40698242, + "step": 3374, + "time_per_iteration": 2.5657663345336914 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048477, + "balance_loss_mlp": 1.00794625, + "epoch": 0.6492881877645248, + "flos": 590003001600.0, + "grad_norm": 0.03430965952422358, + "language_loss": 0.77826953, + "learning_rate": 0.00028933421962350317, + "loss": 0.78875428, + "num_input_tokens_seen": 280534352, + "router_z_loss_mlp": 0.40527344, + "step": 3375, + "time_per_iteration": 2.782069683074951 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053456, + "balance_loss_mlp": 1.0128299, + "epoch": 0.6494805694497884, + "flos": 643588385280.0, + "grad_norm": 0.03575939394791191, + "language_loss": 0.84478199, + "learning_rate": 0.0002890517207354104, + "loss": 0.85531658, + "num_input_tokens_seen": 280608912, + "router_z_loss_mlp": 0.40625, + "step": 3376, + "time_per_iteration": 2.837724447250366 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047454, + "balance_loss_mlp": 1.00689936, + "epoch": 0.649672951135052, + "flos": 532837484544.0, + "grad_norm": 0.034227306744160566, + "language_loss": 0.82481575, + "learning_rate": 0.0002887693037447029, + "loss": 0.83529025, + "num_input_tokens_seen": 280678848, + "router_z_loss_mlp": 0.40551758, + "step": 3377, + "time_per_iteration": 2.579442262649536 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104723, + "balance_loss_mlp": 1.00662696, + "epoch": 0.6498653328203156, + "flos": 548446019328.0, + "grad_norm": 0.03719565127882316, + "language_loss": 0.82554042, + "learning_rate": 0.00028848696876102443, + "loss": 0.83601272, + "num_input_tokens_seen": 280750224, + "router_z_loss_mlp": 0.40600586, + "step": 3378, + "time_per_iteration": 2.6242425441741943 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047252, + "balance_loss_mlp": 1.00650632, + "epoch": 0.650057714505579, + "flos": 463161420288.0, + "grad_norm": 0.037917560954429594, + "language_loss": 0.8430717, + "learning_rate": 0.00028820471589398723, + "loss": 0.85354424, + "num_input_tokens_seen": 280817488, + "router_z_loss_mlp": 0.4074707, + "step": 3379, + "time_per_iteration": 2.5716495513916016 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046426, + "balance_loss_mlp": 1.00565684, + "epoch": 0.6502500961908426, + "flos": 511241966592.0, + "grad_norm": 0.04232947369873583, + "language_loss": 0.78428495, + "learning_rate": 0.00028792254525317196, + "loss": 0.79474926, + "num_input_tokens_seen": 280887440, + "router_z_loss_mlp": 0.40771484, + "step": 3380, + "time_per_iteration": 2.6657466888427734 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104445, + "balance_loss_mlp": 1.00377584, + "epoch": 0.6504424778761062, + "flos": 580911227904.0, + "grad_norm": 0.0355389042104645, + "language_loss": 0.8194313, + "learning_rate": 0.00028764045694812645, + "loss": 0.82987577, + "num_input_tokens_seen": 280959072, + "router_z_loss_mlp": 0.40673828, + "step": 3381, + "time_per_iteration": 2.75962233543396 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047874, + "balance_loss_mlp": 1.00727105, + "epoch": 0.6506348595613698, + "flos": 520467888384.0, + "grad_norm": 0.04062665752895993, + "language_loss": 0.76926279, + "learning_rate": 0.0002873584510883671, + "loss": 0.77974153, + "num_input_tokens_seen": 281025376, + "router_z_loss_mlp": 0.40600586, + "step": 3382, + "time_per_iteration": 2.5889906883239746 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049006, + "balance_loss_mlp": 1.00837946, + "epoch": 0.6508272412466333, + "flos": 511363475712.0, + "grad_norm": 0.029998580027972052, + "language_loss": 0.86699784, + "learning_rate": 0.0002870765277833788, + "loss": 0.8774879, + "num_input_tokens_seen": 281097616, + "router_z_loss_mlp": 0.40625, + "step": 3383, + "time_per_iteration": 2.6930124759674072 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049139, + "balance_loss_mlp": 1.00863218, + "epoch": 0.6510196229318969, + "flos": 626805588480.0, + "grad_norm": 0.03382855215234118, + "language_loss": 0.80910194, + "learning_rate": 0.00028679468714261347, + "loss": 0.81959337, + "num_input_tokens_seen": 281170192, + "router_z_loss_mlp": 0.4050293, + "step": 3384, + "time_per_iteration": 2.793992280960083 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048064, + "balance_loss_mlp": 1.00750864, + "epoch": 0.6512120046171604, + "flos": 475670022144.0, + "grad_norm": 0.034347459077756264, + "language_loss": 0.77632761, + "learning_rate": 0.0002865129292754918, + "loss": 0.78680825, + "num_input_tokens_seen": 281238832, + "router_z_loss_mlp": 0.40551758, + "step": 3385, + "time_per_iteration": 2.5745677947998047 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051635, + "balance_loss_mlp": 1.01115131, + "epoch": 0.651404386302424, + "flos": 553031273472.0, + "grad_norm": 0.0319561697529533, + "language_loss": 0.82687205, + "learning_rate": 0.00028623125429140105, + "loss": 0.8373884, + "num_input_tokens_seen": 281319472, + "router_z_loss_mlp": 0.40478516, + "step": 3386, + "time_per_iteration": 2.8197057247161865 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049254, + "balance_loss_mlp": 1.00874698, + "epoch": 0.6515967679876876, + "flos": 524375610624.0, + "grad_norm": 0.03843989341560043, + "language_loss": 0.87771493, + "learning_rate": 0.00028594966229969785, + "loss": 0.8882075, + "num_input_tokens_seen": 281391168, + "router_z_loss_mlp": 0.4050293, + "step": 3387, + "time_per_iteration": 2.6713032722473145 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049331, + "balance_loss_mlp": 1.00899053, + "epoch": 0.6517891496729511, + "flos": 575017563648.0, + "grad_norm": 0.03692798161206562, + "language_loss": 0.8182978, + "learning_rate": 0.00028566815340970577, + "loss": 0.82879114, + "num_input_tokens_seen": 281465664, + "router_z_loss_mlp": 0.40332031, + "step": 3388, + "time_per_iteration": 2.7321841716766357 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048844, + "balance_loss_mlp": 1.0084554, + "epoch": 0.6519815313582147, + "flos": 556990518528.0, + "grad_norm": 0.03423866481728588, + "language_loss": 0.81470537, + "learning_rate": 0.0002853867277307162, + "loss": 0.82519382, + "num_input_tokens_seen": 281532928, + "router_z_loss_mlp": 0.40380859, + "step": 3389, + "time_per_iteration": 2.7031924724578857 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049788, + "balance_loss_mlp": 1.00937581, + "epoch": 0.6521739130434783, + "flos": 481522857216.0, + "grad_norm": 0.03513339122298917, + "language_loss": 0.82942468, + "learning_rate": 0.00028510538537198824, + "loss": 0.83992255, + "num_input_tokens_seen": 281601680, + "router_z_loss_mlp": 0.40405273, + "step": 3390, + "time_per_iteration": 2.703963279724121 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050398, + "balance_loss_mlp": 1.00993848, + "epoch": 0.6523662947287419, + "flos": 667021112832.0, + "grad_norm": 0.03209400617836455, + "language_loss": 0.86939168, + "learning_rate": 0.00028482412644274867, + "loss": 0.87989569, + "num_input_tokens_seen": 281679488, + "router_z_loss_mlp": 0.40454102, + "step": 3391, + "time_per_iteration": 2.9381484985351562 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049716, + "balance_loss_mlp": 1.00920916, + "epoch": 0.6525586764140053, + "flos": 549702906624.0, + "grad_norm": 0.03739783573884853, + "language_loss": 0.75139832, + "learning_rate": 0.00028454295105219207, + "loss": 0.76189548, + "num_input_tokens_seen": 281751056, + "router_z_loss_mlp": 0.4050293, + "step": 3392, + "time_per_iteration": 2.658132314682007 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047552, + "balance_loss_mlp": 1.00706887, + "epoch": 0.6527510580992689, + "flos": 804391190016.0, + "grad_norm": 0.02478431190679109, + "language_loss": 0.79875654, + "learning_rate": 0.0002842618593094802, + "loss": 0.80923212, + "num_input_tokens_seen": 281841008, + "router_z_loss_mlp": 0.40478516, + "step": 3393, + "time_per_iteration": 3.1278936862945557 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046198, + "balance_loss_mlp": 1.00571501, + "epoch": 0.6529434397845325, + "flos": 672376250880.0, + "grad_norm": 0.04113995840272075, + "language_loss": 0.80790162, + "learning_rate": 0.00028398085132374243, + "loss": 0.81836367, + "num_input_tokens_seen": 281908016, + "router_z_loss_mlp": 0.40478516, + "step": 3394, + "time_per_iteration": 2.8653299808502197 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046221, + "balance_loss_mlp": 1.00571322, + "epoch": 0.6531358214697961, + "flos": 829876933632.0, + "grad_norm": 0.032703635981260123, + "language_loss": 0.85031712, + "learning_rate": 0.0002836999272040761, + "loss": 0.86077929, + "num_input_tokens_seen": 281989072, + "router_z_loss_mlp": 0.4050293, + "step": 3395, + "time_per_iteration": 3.131331205368042 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050486, + "balance_loss_mlp": 1.01005006, + "epoch": 0.6533282031550597, + "flos": 488393452032.0, + "grad_norm": 0.04317230929037854, + "language_loss": 0.84511197, + "learning_rate": 0.00028341908705954575, + "loss": 0.85561681, + "num_input_tokens_seen": 282053152, + "router_z_loss_mlp": 0.40429688, + "step": 3396, + "time_per_iteration": 2.5415916442871094 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048904, + "balance_loss_mlp": 1.0094223, + "epoch": 0.6535205848403232, + "flos": 1561105233408.0, + "grad_norm": 0.006364223174853702, + "language_loss": 0.81761813, + "learning_rate": 0.00028313833099918265, + "loss": 0.82810712, + "num_input_tokens_seen": 282283984, + "router_z_loss_mlp": 0.39453125, + "step": 3397, + "time_per_iteration": 4.924402236938477 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047528, + "balance_loss_mlp": 1.0069257, + "epoch": 0.6537129665255867, + "flos": 494704133376.0, + "grad_norm": 0.03394309019693363, + "language_loss": 0.78847253, + "learning_rate": 0.00028285765913198604, + "loss": 0.79894781, + "num_input_tokens_seen": 282353008, + "router_z_loss_mlp": 0.40600586, + "step": 3398, + "time_per_iteration": 2.595367193222046 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046629, + "balance_loss_mlp": 1.00595522, + "epoch": 0.6539053482108503, + "flos": 606143259648.0, + "grad_norm": 0.03316024353093433, + "language_loss": 0.82683516, + "learning_rate": 0.0002825770715669227, + "loss": 0.83730143, + "num_input_tokens_seen": 282427648, + "router_z_loss_mlp": 0.40673828, + "step": 3399, + "time_per_iteration": 2.7097129821777344 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048676, + "balance_loss_mlp": 1.00807345, + "epoch": 0.6540977298961139, + "flos": 578881544448.0, + "grad_norm": 0.0428136910892252, + "language_loss": 0.81872654, + "learning_rate": 0.00028229656841292634, + "loss": 0.82921332, + "num_input_tokens_seen": 282502128, + "router_z_loss_mlp": 0.40600586, + "step": 3400, + "time_per_iteration": 2.6833486557006836 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045032, + "balance_loss_mlp": 1.00442982, + "epoch": 0.6542901115813774, + "flos": 512770062336.0, + "grad_norm": 0.04250142071298369, + "language_loss": 0.76713872, + "learning_rate": 0.0002820161497788979, + "loss": 0.77758902, + "num_input_tokens_seen": 282569360, + "router_z_loss_mlp": 0.40600586, + "step": 3401, + "time_per_iteration": 2.626732349395752 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048894, + "balance_loss_mlp": 1.00838673, + "epoch": 0.654482493266641, + "flos": 626675331072.0, + "grad_norm": 0.03960445373110503, + "language_loss": 0.87829405, + "learning_rate": 0.00028173581577370545, + "loss": 0.88878298, + "num_input_tokens_seen": 282645472, + "router_z_loss_mlp": 0.4050293, + "step": 3402, + "time_per_iteration": 2.7741096019744873 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048023, + "balance_loss_mlp": 1.00753999, + "epoch": 0.6546748749519046, + "flos": 525063836160.0, + "grad_norm": 0.03167040591829995, + "language_loss": 0.79177642, + "learning_rate": 0.0002814555665061844, + "loss": 0.80225664, + "num_input_tokens_seen": 282717568, + "router_z_loss_mlp": 0.40478516, + "step": 3403, + "time_per_iteration": 2.664350986480713 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047128, + "balance_loss_mlp": 1.00664401, + "epoch": 0.6548672566371682, + "flos": 480274718208.0, + "grad_norm": 0.036729511728986385, + "language_loss": 0.78224975, + "learning_rate": 0.00028117540208513715, + "loss": 0.79272103, + "num_input_tokens_seen": 282791408, + "router_z_loss_mlp": 0.40478516, + "step": 3404, + "time_per_iteration": 2.6802027225494385 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043621, + "balance_loss_mlp": 1.00306582, + "epoch": 0.6550596383224317, + "flos": 617136404736.0, + "grad_norm": 0.034100585633273374, + "language_loss": 0.85354125, + "learning_rate": 0.00028089532261933313, + "loss": 0.86397743, + "num_input_tokens_seen": 282862992, + "router_z_loss_mlp": 0.40551758, + "step": 3405, + "time_per_iteration": 2.7186086177825928 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046281, + "balance_loss_mlp": 1.00567865, + "epoch": 0.6552520200076952, + "flos": 489808786944.0, + "grad_norm": 0.041360786835332355, + "language_loss": 0.86205178, + "learning_rate": 0.0002806153282175087, + "loss": 0.87251461, + "num_input_tokens_seen": 282930448, + "router_z_loss_mlp": 0.40600586, + "step": 3406, + "time_per_iteration": 2.5789847373962402 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046471, + "balance_loss_mlp": 1.00584447, + "epoch": 0.6554444016929588, + "flos": 688859649024.0, + "grad_norm": 0.034986799312927766, + "language_loss": 0.8358103, + "learning_rate": 0.0002803354189883679, + "loss": 0.84627509, + "num_input_tokens_seen": 283010864, + "router_z_loss_mlp": 0.40625, + "step": 3407, + "time_per_iteration": 2.837360382080078 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050644, + "balance_loss_mlp": 1.01023173, + "epoch": 0.6556367833782224, + "flos": 544171824384.0, + "grad_norm": 0.032399307772020214, + "language_loss": 0.86254793, + "learning_rate": 0.00028005559504058053, + "loss": 0.87305439, + "num_input_tokens_seen": 283082240, + "router_z_loss_mlp": 0.40405273, + "step": 3408, + "time_per_iteration": 2.7328412532806396 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047247, + "balance_loss_mlp": 1.00673985, + "epoch": 0.655829165063486, + "flos": 674731577856.0, + "grad_norm": 0.033393765710147245, + "language_loss": 0.77549541, + "learning_rate": 0.0002797758564827838, + "loss": 0.78596783, + "num_input_tokens_seen": 283156656, + "router_z_loss_mlp": 0.4050293, + "step": 3409, + "time_per_iteration": 2.8037917613983154 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048239, + "balance_loss_mlp": 1.00761223, + "epoch": 0.6560215467487496, + "flos": 532837484544.0, + "grad_norm": 0.037569861592142095, + "language_loss": 0.83625042, + "learning_rate": 0.0002794962034235824, + "loss": 0.84673285, + "num_input_tokens_seen": 283223584, + "router_z_loss_mlp": 0.40625, + "step": 3410, + "time_per_iteration": 2.660435676574707 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048327, + "balance_loss_mlp": 1.00789142, + "epoch": 0.656213928434013, + "flos": 592460395776.0, + "grad_norm": 0.035927702009128905, + "language_loss": 0.75148469, + "learning_rate": 0.00027921663597154695, + "loss": 0.76196802, + "num_input_tokens_seen": 283297680, + "router_z_loss_mlp": 0.40429688, + "step": 3411, + "time_per_iteration": 2.7516040802001953 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050596, + "balance_loss_mlp": 1.01015997, + "epoch": 0.6564063101192766, + "flos": 416678956032.0, + "grad_norm": 0.07901014031845595, + "language_loss": 0.81708795, + "learning_rate": 0.00027893715423521525, + "loss": 0.82759392, + "num_input_tokens_seen": 283359744, + "router_z_loss_mlp": 0.40429688, + "step": 3412, + "time_per_iteration": 2.4704418182373047 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045819, + "balance_loss_mlp": 1.00547826, + "epoch": 0.6565986918045402, + "flos": 454271835648.0, + "grad_norm": 0.03411050033810387, + "language_loss": 0.84291053, + "learning_rate": 0.00027865775832309163, + "loss": 0.85336864, + "num_input_tokens_seen": 283430688, + "router_z_loss_mlp": 0.40332031, + "step": 3413, + "time_per_iteration": 2.6385068893432617 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048788, + "balance_loss_mlp": 1.00854325, + "epoch": 0.6567910734898038, + "flos": 548799853056.0, + "grad_norm": 0.036374593364126635, + "language_loss": 0.86917508, + "learning_rate": 0.00027837844834364733, + "loss": 0.87966299, + "num_input_tokens_seen": 283498048, + "router_z_loss_mlp": 0.40234375, + "step": 3414, + "time_per_iteration": 2.6444642543792725 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048805, + "balance_loss_mlp": 1.00860763, + "epoch": 0.6569834551750673, + "flos": 656765770752.0, + "grad_norm": 0.03225713211671443, + "language_loss": 0.87055808, + "learning_rate": 0.00027809922440532, + "loss": 0.88104612, + "num_input_tokens_seen": 283573040, + "router_z_loss_mlp": 0.40185547, + "step": 3415, + "time_per_iteration": 2.847615957260132 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051486, + "balance_loss_mlp": 1.01114511, + "epoch": 0.6571758368603309, + "flos": 540811376640.0, + "grad_norm": 0.035988230545184526, + "language_loss": 0.81540048, + "learning_rate": 0.00027782008661651406, + "loss": 0.82591534, + "num_input_tokens_seen": 283651696, + "router_z_loss_mlp": 0.40332031, + "step": 3416, + "time_per_iteration": 2.767226457595825 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049379, + "balance_loss_mlp": 1.00906193, + "epoch": 0.6573682185455945, + "flos": 498379531008.0, + "grad_norm": 0.03451446989535273, + "language_loss": 0.87885237, + "learning_rate": 0.00027754103508560013, + "loss": 0.88934618, + "num_input_tokens_seen": 283721824, + "router_z_loss_mlp": 0.40307617, + "step": 3417, + "time_per_iteration": 2.6277449131011963 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045088, + "balance_loss_mlp": 1.00481939, + "epoch": 0.657560600230858, + "flos": 448353871872.0, + "grad_norm": 0.03502749433462501, + "language_loss": 0.8376503, + "learning_rate": 0.0002772620699209163, + "loss": 0.8481012, + "num_input_tokens_seen": 283786960, + "router_z_loss_mlp": 0.40258789, + "step": 3418, + "time_per_iteration": 2.603851318359375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01041901, + "balance_loss_mlp": 1.00168002, + "epoch": 0.6577529819161216, + "flos": 482920695552.0, + "grad_norm": 0.033924516533442195, + "language_loss": 0.80503142, + "learning_rate": 0.0002769831912307658, + "loss": 0.81545043, + "num_input_tokens_seen": 283853808, + "router_z_loss_mlp": 0.40209961, + "step": 3419, + "time_per_iteration": 2.567737340927124 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010435, + "balance_loss_mlp": 1.00313556, + "epoch": 0.6579453636013851, + "flos": 531860553984.0, + "grad_norm": 0.04823961507786352, + "language_loss": 0.80877286, + "learning_rate": 0.00027670439912341917, + "loss": 0.81920785, + "num_input_tokens_seen": 283920960, + "router_z_loss_mlp": 0.40356445, + "step": 3420, + "time_per_iteration": 2.639587163925171 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043978, + "balance_loss_mlp": 1.00354195, + "epoch": 0.6581377452866487, + "flos": 629243540736.0, + "grad_norm": 0.032258458979824364, + "language_loss": 0.84138131, + "learning_rate": 0.0002764256937071129, + "loss": 0.85182106, + "num_input_tokens_seen": 283992416, + "router_z_loss_mlp": 0.40429688, + "step": 3421, + "time_per_iteration": 2.793288469314575 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043983, + "balance_loss_mlp": 1.00347602, + "epoch": 0.6583301269719123, + "flos": 549674716416.0, + "grad_norm": 0.033092634832732, + "language_loss": 0.87840796, + "learning_rate": 0.00027614707509005036, + "loss": 0.88884783, + "num_input_tokens_seen": 284061760, + "router_z_loss_mlp": 0.4050293, + "step": 3422, + "time_per_iteration": 2.672691822052002 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044325, + "balance_loss_mlp": 1.0038892, + "epoch": 0.6585225086571759, + "flos": 428397265152.0, + "grad_norm": 0.041046610709459384, + "language_loss": 0.7990576, + "learning_rate": 0.0002758685433804008, + "loss": 0.80950087, + "num_input_tokens_seen": 284124848, + "router_z_loss_mlp": 0.40429688, + "step": 3423, + "time_per_iteration": 2.5028507709503174 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104497, + "balance_loss_mlp": 1.00448632, + "epoch": 0.6587148903424394, + "flos": 861050261760.0, + "grad_norm": 0.040364444047634805, + "language_loss": 0.7997486, + "learning_rate": 0.00027559009868630005, + "loss": 0.81019825, + "num_input_tokens_seen": 284206272, + "router_z_loss_mlp": 0.40478516, + "step": 3424, + "time_per_iteration": 3.1220815181732178 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047075, + "balance_loss_mlp": 1.00671124, + "epoch": 0.6589072720277029, + "flos": 807037167360.0, + "grad_norm": 0.05893519085395252, + "language_loss": 0.80930316, + "learning_rate": 0.0002753117411158491, + "loss": 0.81977397, + "num_input_tokens_seen": 284293696, + "router_z_loss_mlp": 0.40356445, + "step": 3425, + "time_per_iteration": 3.0889339447021484 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047077, + "balance_loss_mlp": 1.00676, + "epoch": 0.6590996537129665, + "flos": 549674716416.0, + "grad_norm": 0.03274381739097603, + "language_loss": 0.90609264, + "learning_rate": 0.0002750334707771168, + "loss": 0.91656339, + "num_input_tokens_seen": 284360192, + "router_z_loss_mlp": 0.40307617, + "step": 3426, + "time_per_iteration": 2.6541290283203125 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046695, + "balance_loss_mlp": 1.00647414, + "epoch": 0.6592920353982301, + "flos": 455109760512.0, + "grad_norm": 0.03777224687776173, + "language_loss": 0.81529361, + "learning_rate": 0.0002747552877781369, + "loss": 0.82576048, + "num_input_tokens_seen": 284423680, + "router_z_loss_mlp": 0.40209961, + "step": 3427, + "time_per_iteration": 2.5356411933898926 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048756, + "balance_loss_mlp": 1.00858271, + "epoch": 0.6594844170834937, + "flos": 568261675008.0, + "grad_norm": 0.03735814383850805, + "language_loss": 0.82849789, + "learning_rate": 0.0002744771922269097, + "loss": 0.83898544, + "num_input_tokens_seen": 284495712, + "router_z_loss_mlp": 0.40161133, + "step": 3428, + "time_per_iteration": 2.7781617641448975 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047717, + "balance_loss_mlp": 1.00761461, + "epoch": 0.6596767987687572, + "flos": 1189755878400.0, + "grad_norm": 0.035375644925624505, + "language_loss": 0.82642734, + "learning_rate": 0.0002741991842314015, + "loss": 0.83690447, + "num_input_tokens_seen": 284583440, + "router_z_loss_mlp": 0.40087891, + "step": 3429, + "time_per_iteration": 3.484401226043701 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050876, + "balance_loss_mlp": 1.01070201, + "epoch": 0.6598691804540208, + "flos": 504468581376.0, + "grad_norm": 0.033809257581419436, + "language_loss": 0.86197507, + "learning_rate": 0.0002739212638995445, + "loss": 0.87248385, + "num_input_tokens_seen": 284649168, + "router_z_loss_mlp": 0.40161133, + "step": 3430, + "time_per_iteration": 2.557008743286133 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104672, + "balance_loss_mlp": 1.00654662, + "epoch": 0.6600615621392844, + "flos": 532399080192.0, + "grad_norm": 0.03652926945024374, + "language_loss": 0.83438206, + "learning_rate": 0.00027364343133923696, + "loss": 0.84484929, + "num_input_tokens_seen": 284723136, + "router_z_loss_mlp": 0.40161133, + "step": 3431, + "time_per_iteration": 2.662047863006592 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010455, + "balance_loss_mlp": 1.00534999, + "epoch": 0.6602539438245479, + "flos": 566557635072.0, + "grad_norm": 0.03543857868011933, + "language_loss": 0.83350068, + "learning_rate": 0.0002733656866583431, + "loss": 0.84395564, + "num_input_tokens_seen": 284792752, + "router_z_loss_mlp": 0.40136719, + "step": 3432, + "time_per_iteration": 2.676973581314087 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045709, + "balance_loss_mlp": 1.00558269, + "epoch": 0.6604463255098114, + "flos": 858592867584.0, + "grad_norm": 0.037899677341019365, + "language_loss": 0.83285594, + "learning_rate": 0.0002730880299646927, + "loss": 0.8433131, + "num_input_tokens_seen": 284871008, + "router_z_loss_mlp": 0.40112305, + "step": 3433, + "time_per_iteration": 3.0207436084747314 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045936, + "balance_loss_mlp": 1.00585747, + "epoch": 0.660638707195075, + "flos": 675680318208.0, + "grad_norm": 0.03767896728200409, + "language_loss": 0.85914338, + "learning_rate": 0.0002728104613660821, + "loss": 0.8696028, + "num_input_tokens_seen": 284945184, + "router_z_loss_mlp": 0.40063477, + "step": 3434, + "time_per_iteration": 2.847806215286255 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043278, + "balance_loss_mlp": 1.0032711, + "epoch": 0.6608310888803386, + "flos": 890524407552.0, + "grad_norm": 0.03485230588781084, + "language_loss": 0.8359797, + "learning_rate": 0.0002725329809702729, + "loss": 0.84641242, + "num_input_tokens_seen": 285029296, + "router_z_loss_mlp": 0.39990234, + "step": 3435, + "time_per_iteration": 3.1851022243499756 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043009, + "balance_loss_mlp": 1.0028832, + "epoch": 0.6610234705656022, + "flos": 1138108804608.0, + "grad_norm": 0.04206643775716819, + "language_loss": 0.76903141, + "learning_rate": 0.0002722555888849921, + "loss": 0.7794615, + "num_input_tokens_seen": 285124720, + "router_z_loss_mlp": 0.40112305, + "step": 3436, + "time_per_iteration": 3.453571081161499 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044798, + "balance_loss_mlp": 1.00474417, + "epoch": 0.6612158522508658, + "flos": 468959820288.0, + "grad_norm": 0.03417683071505001, + "language_loss": 0.80971491, + "learning_rate": 0.00027197828521793334, + "loss": 0.82016289, + "num_input_tokens_seen": 285191360, + "router_z_loss_mlp": 0.40039062, + "step": 3437, + "time_per_iteration": 2.5737972259521484 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049055, + "balance_loss_mlp": 1.00892961, + "epoch": 0.6614082339361292, + "flos": 572774997504.0, + "grad_norm": 0.03444646564186984, + "language_loss": 0.85238397, + "learning_rate": 0.0002717010700767552, + "loss": 0.86287451, + "num_input_tokens_seen": 285262624, + "router_z_loss_mlp": 0.40112305, + "step": 3438, + "time_per_iteration": 2.6816329956054688 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047082, + "balance_loss_mlp": 1.00700414, + "epoch": 0.6616006156213928, + "flos": 499460474112.0, + "grad_norm": 0.039408018339583364, + "language_loss": 0.7639091, + "learning_rate": 0.00027142394356908226, + "loss": 0.77437991, + "num_input_tokens_seen": 285328512, + "router_z_loss_mlp": 0.40063477, + "step": 3439, + "time_per_iteration": 2.6397507190704346 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01056341, + "balance_loss_mlp": 1.01604831, + "epoch": 0.6617929973066564, + "flos": 603610043136.0, + "grad_norm": 0.03512262783038589, + "language_loss": 0.85516727, + "learning_rate": 0.00027114690580250456, + "loss": 0.8657307, + "num_input_tokens_seen": 285406128, + "router_z_loss_mlp": 0.40283203, + "step": 3440, + "time_per_iteration": 2.8226699829101562 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046554, + "balance_loss_mlp": 1.00607085, + "epoch": 0.66198537899192, + "flos": 523995532032.0, + "grad_norm": 0.03484935524221126, + "language_loss": 0.87502497, + "learning_rate": 0.0002708699568845776, + "loss": 0.88549048, + "num_input_tokens_seen": 285474704, + "router_z_loss_mlp": 0.40478516, + "step": 3441, + "time_per_iteration": 2.666151762008667 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054733, + "balance_loss_mlp": 1.01563263, + "epoch": 0.6621777606771835, + "flos": 1569612794112.0, + "grad_norm": 0.008720086595697616, + "language_loss": 0.79287779, + "learning_rate": 0.00027059309692282265, + "loss": 0.80342519, + "num_input_tokens_seen": 285698704, + "router_z_loss_mlp": 0.390625, + "step": 3442, + "time_per_iteration": 4.902445316314697 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043667, + "balance_loss_mlp": 1.00320721, + "epoch": 0.6623701423624471, + "flos": 527690371584.0, + "grad_norm": 0.04147844177514617, + "language_loss": 0.83753407, + "learning_rate": 0.0002703163260247261, + "loss": 0.84797072, + "num_input_tokens_seen": 285767936, + "router_z_loss_mlp": 0.40454102, + "step": 3443, + "time_per_iteration": 2.6544172763824463 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042135, + "balance_loss_mlp": 1.00157988, + "epoch": 0.6625625240477107, + "flos": 529216521984.0, + "grad_norm": 0.040243971726719965, + "language_loss": 0.82285839, + "learning_rate": 0.0002700396442977399, + "loss": 0.83327973, + "num_input_tokens_seen": 285839456, + "router_z_loss_mlp": 0.40551758, + "step": 3444, + "time_per_iteration": 2.659823179244995 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046923, + "balance_loss_mlp": 1.00648713, + "epoch": 0.6627549057329742, + "flos": 474196361472.0, + "grad_norm": 0.03873462944333031, + "language_loss": 0.84804982, + "learning_rate": 0.0002697630518492817, + "loss": 0.85851908, + "num_input_tokens_seen": 285905904, + "router_z_loss_mlp": 0.40429688, + "step": 3445, + "time_per_iteration": 2.6407060623168945 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042694, + "balance_loss_mlp": 1.00218678, + "epoch": 0.6629472874182378, + "flos": 529012387584.0, + "grad_norm": 0.03365832032426446, + "language_loss": 0.86288029, + "learning_rate": 0.0002694865487867343, + "loss": 0.87330723, + "num_input_tokens_seen": 285975520, + "router_z_loss_mlp": 0.4050293, + "step": 3446, + "time_per_iteration": 2.6234817504882812 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01040416, + "balance_loss_mlp": 0.99986076, + "epoch": 0.6631396691035013, + "flos": 614379611904.0, + "grad_norm": 0.029868994053189296, + "language_loss": 0.85050064, + "learning_rate": 0.0002692101352174453, + "loss": 0.86090481, + "num_input_tokens_seen": 286050320, + "router_z_loss_mlp": 0.40551758, + "step": 3447, + "time_per_iteration": 2.7610418796539307 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054226, + "balance_loss_mlp": 1.01357543, + "epoch": 0.6633320507887649, + "flos": 610434951168.0, + "grad_norm": 0.03566276224507284, + "language_loss": 0.85075617, + "learning_rate": 0.00026893381124872787, + "loss": 0.86129844, + "num_input_tokens_seen": 286120672, + "router_z_loss_mlp": 0.40649414, + "step": 3448, + "time_per_iteration": 2.7092947959899902 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104727, + "balance_loss_mlp": 1.0067625, + "epoch": 0.6635244324740285, + "flos": 751142143488.0, + "grad_norm": 0.03834758690665688, + "language_loss": 0.81510758, + "learning_rate": 0.00026865757698786097, + "loss": 0.82558024, + "num_input_tokens_seen": 286201152, + "router_z_loss_mlp": 0.4050293, + "step": 3449, + "time_per_iteration": 3.0252504348754883 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050032, + "balance_loss_mlp": 1.00952482, + "epoch": 0.6637168141592921, + "flos": 665748674304.0, + "grad_norm": 0.03495621172774381, + "language_loss": 0.82439375, + "learning_rate": 0.000268381432542088, + "loss": 0.83489406, + "num_input_tokens_seen": 286274512, + "router_z_loss_mlp": 0.4050293, + "step": 3450, + "time_per_iteration": 2.847905397415161 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046417, + "balance_loss_mlp": 1.00598156, + "epoch": 0.6639091958445555, + "flos": 607921176576.0, + "grad_norm": 0.03480028422588226, + "language_loss": 0.80330265, + "learning_rate": 0.00026810537801861807, + "loss": 0.8137669, + "num_input_tokens_seen": 286349808, + "router_z_loss_mlp": 0.40429688, + "step": 3451, + "time_per_iteration": 2.8109076023101807 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044982, + "balance_loss_mlp": 1.00442719, + "epoch": 0.6641015775298191, + "flos": 477680263680.0, + "grad_norm": 0.03370448580538907, + "language_loss": 0.81616271, + "learning_rate": 0.0002678294135246243, + "loss": 0.82661253, + "num_input_tokens_seen": 286422912, + "router_z_loss_mlp": 0.40551758, + "step": 3452, + "time_per_iteration": 2.77632999420166 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043861, + "balance_loss_mlp": 1.00337756, + "epoch": 0.6642939592150827, + "flos": 905596361472.0, + "grad_norm": 0.035596990972813804, + "language_loss": 0.87064171, + "learning_rate": 0.0002675535391672463, + "loss": 0.88108027, + "num_input_tokens_seen": 286501072, + "router_z_loss_mlp": 0.40478516, + "step": 3453, + "time_per_iteration": 3.1011788845062256 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046586, + "balance_loss_mlp": 1.00610256, + "epoch": 0.6644863409003463, + "flos": 582938966016.0, + "grad_norm": 0.03233314445792202, + "language_loss": 0.86734712, + "learning_rate": 0.0002672777550535877, + "loss": 0.87781298, + "num_input_tokens_seen": 286580480, + "router_z_loss_mlp": 0.40478516, + "step": 3454, + "time_per_iteration": 2.799320936203003 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047695, + "balance_loss_mlp": 1.00714028, + "epoch": 0.6646787225856099, + "flos": 479970461952.0, + "grad_norm": 0.04849178662998588, + "language_loss": 0.85994661, + "learning_rate": 0.00026700206129071747, + "loss": 0.87042361, + "num_input_tokens_seen": 286646208, + "router_z_loss_mlp": 0.40551758, + "step": 3455, + "time_per_iteration": 2.5544278621673584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044362, + "balance_loss_mlp": 1.00371206, + "epoch": 0.6648711042708734, + "flos": 450828762624.0, + "grad_norm": 0.04059200209413719, + "language_loss": 0.89189559, + "learning_rate": 0.00026672645798566925, + "loss": 0.90233922, + "num_input_tokens_seen": 286710624, + "router_z_loss_mlp": 0.40649414, + "step": 3456, + "time_per_iteration": 2.501304864883423 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047273, + "balance_loss_mlp": 1.00669408, + "epoch": 0.665063485956137, + "flos": 860597273088.0, + "grad_norm": 0.0398485152985426, + "language_loss": 0.7998091, + "learning_rate": 0.00026645094524544225, + "loss": 0.81028181, + "num_input_tokens_seen": 286799472, + "router_z_loss_mlp": 0.40576172, + "step": 3457, + "time_per_iteration": 3.276411294937134 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043384, + "balance_loss_mlp": 1.00266171, + "epoch": 0.6652558676414005, + "flos": 605472530688.0, + "grad_norm": 0.027841742129180558, + "language_loss": 0.75740635, + "learning_rate": 0.00026617552317699945, + "loss": 0.76784027, + "num_input_tokens_seen": 286874752, + "router_z_loss_mlp": 0.40722656, + "step": 3458, + "time_per_iteration": 2.801248550415039 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046474, + "balance_loss_mlp": 1.00591886, + "epoch": 0.6654482493266641, + "flos": 511411107840.0, + "grad_norm": 0.036000642082667296, + "language_loss": 0.87457603, + "learning_rate": 0.0002659001918872693, + "loss": 0.88504076, + "num_input_tokens_seen": 286943312, + "router_z_loss_mlp": 0.40551758, + "step": 3459, + "time_per_iteration": 2.6388814449310303 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050084, + "balance_loss_mlp": 1.00948107, + "epoch": 0.6656406310119277, + "flos": 566661647616.0, + "grad_norm": 0.03405161677383315, + "language_loss": 0.81573474, + "learning_rate": 0.0002656249514831449, + "loss": 0.82623559, + "num_input_tokens_seen": 287010000, + "router_z_loss_mlp": 0.40600586, + "step": 3460, + "time_per_iteration": 2.6583993434906006 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052206, + "balance_loss_mlp": 1.01155555, + "epoch": 0.6658330126971912, + "flos": 1026060187392.0, + "grad_norm": 0.03356522396560915, + "language_loss": 0.87476516, + "learning_rate": 0.00026534980207148416, + "loss": 0.88528717, + "num_input_tokens_seen": 287101456, + "router_z_loss_mlp": 0.40649414, + "step": 3461, + "time_per_iteration": 3.4255144596099854 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050463, + "balance_loss_mlp": 1.00962222, + "epoch": 0.6660253943824548, + "flos": 818234446848.0, + "grad_norm": 0.03543783293435262, + "language_loss": 0.74157602, + "learning_rate": 0.0002650747437591097, + "loss": 0.75208062, + "num_input_tokens_seen": 287182848, + "router_z_loss_mlp": 0.40844727, + "step": 3462, + "time_per_iteration": 2.99372935295105 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048088, + "balance_loss_mlp": 1.00879669, + "epoch": 0.6662177760677184, + "flos": 1499533318656.0, + "grad_norm": 0.007196146037648728, + "language_loss": 0.8187958, + "learning_rate": 0.00026479977665280806, + "loss": 0.82927668, + "num_input_tokens_seen": 287417920, + "router_z_loss_mlp": 0.39257812, + "step": 3463, + "time_per_iteration": 5.021228075027466 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047386, + "balance_loss_mlp": 1.00668836, + "epoch": 0.666410157752982, + "flos": 501108133632.0, + "grad_norm": 0.0343393236578971, + "language_loss": 0.8738476, + "learning_rate": 0.00026452490085933155, + "loss": 0.88432145, + "num_input_tokens_seen": 287483776, + "router_z_loss_mlp": 0.40698242, + "step": 3464, + "time_per_iteration": 2.5860917568206787 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048288, + "balance_loss_mlp": 1.00747108, + "epoch": 0.6666025394382454, + "flos": 482139151104.0, + "grad_norm": 0.04334646456147875, + "language_loss": 0.90236807, + "learning_rate": 0.00026425011648539614, + "loss": 0.91285098, + "num_input_tokens_seen": 287548176, + "router_z_loss_mlp": 0.40820312, + "step": 3465, + "time_per_iteration": 2.5441110134124756 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049485, + "balance_loss_mlp": 1.00864422, + "epoch": 0.666794921123509, + "flos": 547692665088.0, + "grad_norm": 0.03397954120439615, + "language_loss": 0.83244991, + "learning_rate": 0.00026397542363768267, + "loss": 0.84294474, + "num_input_tokens_seen": 287618496, + "router_z_loss_mlp": 0.40844727, + "step": 3466, + "time_per_iteration": 2.74092698097229 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051821, + "balance_loss_mlp": 1.01097989, + "epoch": 0.6669873028087726, + "flos": 472943364864.0, + "grad_norm": 0.036434069598551014, + "language_loss": 0.82217574, + "learning_rate": 0.0002637008224228362, + "loss": 0.83269393, + "num_input_tokens_seen": 287684032, + "router_z_loss_mlp": 0.40844727, + "step": 3467, + "time_per_iteration": 2.5710275173187256 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048674, + "balance_loss_mlp": 1.00785649, + "epoch": 0.6671796844940362, + "flos": 548500454400.0, + "grad_norm": 0.030766968440674072, + "language_loss": 0.8512944, + "learning_rate": 0.00026342631294746653, + "loss": 0.86178112, + "num_input_tokens_seen": 287757680, + "router_z_loss_mlp": 0.40820312, + "step": 3468, + "time_per_iteration": 2.7195847034454346 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045875, + "balance_loss_mlp": 1.00493824, + "epoch": 0.6673720661792998, + "flos": 1072123689216.0, + "grad_norm": 0.03165025767658557, + "language_loss": 0.81300414, + "learning_rate": 0.0002631518953181476, + "loss": 0.8234629, + "num_input_tokens_seen": 287848992, + "router_z_loss_mlp": 0.40942383, + "step": 3469, + "time_per_iteration": 3.4572696685791016 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045563, + "balance_loss_mlp": 1.00588989, + "epoch": 0.6675644478645633, + "flos": 1527113874432.0, + "grad_norm": 0.008139756237930116, + "language_loss": 0.76325285, + "learning_rate": 0.000262877569641418, + "loss": 0.77370852, + "num_input_tokens_seen": 288085680, + "router_z_loss_mlp": 0.39648438, + "step": 3470, + "time_per_iteration": 4.91265869140625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044307, + "balance_loss_mlp": 1.00341797, + "epoch": 0.6677568295498268, + "flos": 580844153856.0, + "grad_norm": 0.03268114077515944, + "language_loss": 0.80885828, + "learning_rate": 0.00026260333602377985, + "loss": 0.81930137, + "num_input_tokens_seen": 288161568, + "router_z_loss_mlp": 0.40893555, + "step": 3471, + "time_per_iteration": 2.7573940753936768 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043519, + "balance_loss_mlp": 1.00277328, + "epoch": 0.6679492112350904, + "flos": 384791157504.0, + "grad_norm": 0.03558012533984873, + "language_loss": 0.87711406, + "learning_rate": 0.0002623291945717007, + "loss": 0.88754922, + "num_input_tokens_seen": 288224032, + "router_z_loss_mlp": 0.4074707, + "step": 3472, + "time_per_iteration": 2.442338466644287 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044342, + "balance_loss_mlp": 1.00364411, + "epoch": 0.668141592920354, + "flos": 1152616954368.0, + "grad_norm": 0.0328139503917561, + "language_loss": 0.84606934, + "learning_rate": 0.00026205514539161175, + "loss": 0.85651278, + "num_input_tokens_seen": 288312912, + "router_z_loss_mlp": 0.40698242, + "step": 3473, + "time_per_iteration": 3.503469705581665 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044285, + "balance_loss_mlp": 1.00353932, + "epoch": 0.6683339746056175, + "flos": 562292188416.0, + "grad_norm": 0.030626159125144124, + "language_loss": 0.84382141, + "learning_rate": 0.00026178118858990773, + "loss": 0.85426426, + "num_input_tokens_seen": 288394224, + "router_z_loss_mlp": 0.4074707, + "step": 3474, + "time_per_iteration": 2.8285627365112305 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043984, + "balance_loss_mlp": 1.00330997, + "epoch": 0.6685263562908811, + "flos": 515329523712.0, + "grad_norm": 0.030456650520625777, + "language_loss": 0.8459208, + "learning_rate": 0.0002615073242729483, + "loss": 0.85636061, + "num_input_tokens_seen": 288462976, + "router_z_loss_mlp": 0.40673828, + "step": 3475, + "time_per_iteration": 2.637474775314331 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047916, + "balance_loss_mlp": 1.00726593, + "epoch": 0.6687187379761447, + "flos": 631002015744.0, + "grad_norm": 0.030827527571606016, + "language_loss": 0.85137111, + "learning_rate": 0.0002612335525470573, + "loss": 0.86185026, + "num_input_tokens_seen": 288542032, + "router_z_loss_mlp": 0.40649414, + "step": 3476, + "time_per_iteration": 2.823110342025757 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048175, + "balance_loss_mlp": 1.00759649, + "epoch": 0.6689111196614083, + "flos": 536688826368.0, + "grad_norm": 0.0342797401257031, + "language_loss": 0.78870076, + "learning_rate": 0.0002609598735185221, + "loss": 0.79918253, + "num_input_tokens_seen": 288610704, + "router_z_loss_mlp": 0.40576172, + "step": 3477, + "time_per_iteration": 2.6825544834136963 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048062, + "balance_loss_mlp": 1.00736415, + "epoch": 0.6691035013466718, + "flos": 604161208320.0, + "grad_norm": 0.031585406138604756, + "language_loss": 0.83722425, + "learning_rate": 0.00026068628729359445, + "loss": 0.84770489, + "num_input_tokens_seen": 288686080, + "router_z_loss_mlp": 0.40698242, + "step": 3478, + "time_per_iteration": 2.77055287361145 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049609, + "balance_loss_mlp": 1.00893426, + "epoch": 0.6692958830319353, + "flos": 634128193536.0, + "grad_norm": 0.03192222919752024, + "language_loss": 0.76639205, + "learning_rate": 0.00026041279397848996, + "loss": 0.77688813, + "num_input_tokens_seen": 288764944, + "router_z_loss_mlp": 0.40673828, + "step": 3479, + "time_per_iteration": 2.8836774826049805 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046574, + "balance_loss_mlp": 1.00585258, + "epoch": 0.6694882647171989, + "flos": 646749556224.0, + "grad_norm": 0.03482378260676791, + "language_loss": 0.83261842, + "learning_rate": 0.00026013939367938797, + "loss": 0.84308422, + "num_input_tokens_seen": 288847856, + "router_z_loss_mlp": 0.40722656, + "step": 3480, + "time_per_iteration": 2.8915905952453613 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044065, + "balance_loss_mlp": 1.00339055, + "epoch": 0.6696806464024625, + "flos": 570762810624.0, + "grad_norm": 0.033098295415039676, + "language_loss": 0.81370211, + "learning_rate": 0.00025986608650243204, + "loss": 0.82414275, + "num_input_tokens_seen": 288929360, + "router_z_loss_mlp": 0.40673828, + "step": 3481, + "time_per_iteration": 2.785128116607666 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01041793, + "balance_loss_mlp": 1.00114262, + "epoch": 0.6698730280877261, + "flos": 623964225024.0, + "grad_norm": 0.029494842151893377, + "language_loss": 0.79968995, + "learning_rate": 0.0002595928725537293, + "loss": 0.81010795, + "num_input_tokens_seen": 289010160, + "router_z_loss_mlp": 0.40649414, + "step": 3482, + "time_per_iteration": 2.862269639968872 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044098, + "balance_loss_mlp": 1.00361502, + "epoch": 0.6700654097729896, + "flos": 503509147392.0, + "grad_norm": 0.04687738924835003, + "language_loss": 0.88447571, + "learning_rate": 0.0002593197519393509, + "loss": 0.89491665, + "num_input_tokens_seen": 289077392, + "router_z_loss_mlp": 0.40478516, + "step": 3483, + "time_per_iteration": 2.5955467224121094 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044048, + "balance_loss_mlp": 1.00356483, + "epoch": 0.6702577914582531, + "flos": 625119045120.0, + "grad_norm": 0.03040614525342857, + "language_loss": 0.79865301, + "learning_rate": 0.00025904672476533165, + "loss": 0.80909348, + "num_input_tokens_seen": 289157248, + "router_z_loss_mlp": 0.40478516, + "step": 3484, + "time_per_iteration": 2.83461594581604 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047488, + "balance_loss_mlp": 1.00695693, + "epoch": 0.6704501731435167, + "flos": 457213320960.0, + "grad_norm": 0.03431199864252877, + "language_loss": 0.83164477, + "learning_rate": 0.0002587737911376704, + "loss": 0.84211963, + "num_input_tokens_seen": 289224864, + "router_z_loss_mlp": 0.40527344, + "step": 3485, + "time_per_iteration": 2.6094586849212646 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043694, + "balance_loss_mlp": 1.00313866, + "epoch": 0.6706425548287803, + "flos": 544258340352.0, + "grad_norm": 0.0329892912769069, + "language_loss": 0.84059811, + "learning_rate": 0.00025850095116232885, + "loss": 0.851035, + "num_input_tokens_seen": 289293488, + "router_z_loss_mlp": 0.40551758, + "step": 3486, + "time_per_iteration": 2.686342477798462 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043806, + "balance_loss_mlp": 1.00332236, + "epoch": 0.6708349365140439, + "flos": 635180946432.0, + "grad_norm": 0.03091711657706004, + "language_loss": 0.78076321, + "learning_rate": 0.000258228204945233, + "loss": 0.79120129, + "num_input_tokens_seen": 289370560, + "router_z_loss_mlp": 0.40478516, + "step": 3487, + "time_per_iteration": 2.9295520782470703 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044942, + "balance_loss_mlp": 1.0044347, + "epoch": 0.6710273181993074, + "flos": 641903787264.0, + "grad_norm": 0.032938145156071165, + "language_loss": 0.85185027, + "learning_rate": 0.00025795555259227254, + "loss": 0.86229968, + "num_input_tokens_seen": 289440096, + "router_z_loss_mlp": 0.4050293, + "step": 3488, + "time_per_iteration": 2.79502534866333 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046195, + "balance_loss_mlp": 1.00561631, + "epoch": 0.671219699884571, + "flos": 555025963776.0, + "grad_norm": 0.02894865619678765, + "language_loss": 0.84055519, + "learning_rate": 0.00025768299420930046, + "loss": 0.85101712, + "num_input_tokens_seen": 289515808, + "router_z_loss_mlp": 0.40576172, + "step": 3489, + "time_per_iteration": 2.779972553253174 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046589, + "balance_loss_mlp": 1.00622523, + "epoch": 0.6714120815698346, + "flos": 732782651904.0, + "grad_norm": 0.0327604861643189, + "language_loss": 0.8377071, + "learning_rate": 0.0002574105299021332, + "loss": 0.84817296, + "num_input_tokens_seen": 289591344, + "router_z_loss_mlp": 0.40356445, + "step": 3490, + "time_per_iteration": 2.893480062484741 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044671, + "balance_loss_mlp": 1.00433028, + "epoch": 0.6716044632550981, + "flos": 689947395072.0, + "grad_norm": 0.03209886664090861, + "language_loss": 0.8471486, + "learning_rate": 0.00025713815977655084, + "loss": 0.85759532, + "num_input_tokens_seen": 289672032, + "router_z_loss_mlp": 0.40332031, + "step": 3491, + "time_per_iteration": 2.957084894180298 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044489, + "balance_loss_mlp": 1.0041728, + "epoch": 0.6717968449403616, + "flos": 461587637760.0, + "grad_norm": 0.0366727841184643, + "language_loss": 0.85291302, + "learning_rate": 0.0002568658839382969, + "loss": 0.8633579, + "num_input_tokens_seen": 289738304, + "router_z_loss_mlp": 0.40307617, + "step": 3492, + "time_per_iteration": 2.5661098957061768 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054708, + "balance_loss_mlp": 1.01429558, + "epoch": 0.6719892266256252, + "flos": 502597345536.0, + "grad_norm": 0.0394893912508571, + "language_loss": 0.8491143, + "learning_rate": 0.00025659370249307814, + "loss": 0.85966134, + "num_input_tokens_seen": 289804304, + "router_z_loss_mlp": 0.40405273, + "step": 3493, + "time_per_iteration": 2.6122422218322754 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051838, + "balance_loss_mlp": 1.01144993, + "epoch": 0.6721816083108888, + "flos": 684737098752.0, + "grad_norm": 0.033378667785843884, + "language_loss": 0.85795897, + "learning_rate": 0.00025632161554656473, + "loss": 0.86847734, + "num_input_tokens_seen": 289877696, + "router_z_loss_mlp": 0.40380859, + "step": 3494, + "time_per_iteration": 2.8829426765441895 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049339, + "balance_loss_mlp": 1.00897467, + "epoch": 0.6723739899961524, + "flos": 586896265728.0, + "grad_norm": 0.03541855963970859, + "language_loss": 0.8296122, + "learning_rate": 0.00025604962320439017, + "loss": 0.84010559, + "num_input_tokens_seen": 289947296, + "router_z_loss_mlp": 0.40356445, + "step": 3495, + "time_per_iteration": 2.7043375968933105 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104451, + "balance_loss_mlp": 1.00421739, + "epoch": 0.672566371681416, + "flos": 507740567808.0, + "grad_norm": 0.03528245901985063, + "language_loss": 0.82875669, + "learning_rate": 0.0002557777255721516, + "loss": 0.83920175, + "num_input_tokens_seen": 290020080, + "router_z_loss_mlp": 0.40283203, + "step": 3496, + "time_per_iteration": 2.719181776046753 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045131, + "balance_loss_mlp": 1.00479066, + "epoch": 0.6727587533666795, + "flos": 536736458496.0, + "grad_norm": 0.036828443855142154, + "language_loss": 0.81081581, + "learning_rate": 0.0002555059227554087, + "loss": 0.82126713, + "num_input_tokens_seen": 290094544, + "router_z_loss_mlp": 0.40332031, + "step": 3497, + "time_per_iteration": 2.7057156562805176 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045175, + "balance_loss_mlp": 1.0047394, + "epoch": 0.672951135051943, + "flos": 604037753856.0, + "grad_norm": 0.0344810885559189, + "language_loss": 0.78363037, + "learning_rate": 0.00025523421485968453, + "loss": 0.79408205, + "num_input_tokens_seen": 290173520, + "router_z_loss_mlp": 0.40429688, + "step": 3498, + "time_per_iteration": 2.8460867404937744 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043336, + "balance_loss_mlp": 1.00299513, + "epoch": 0.6731435167372066, + "flos": 812679065088.0, + "grad_norm": 0.0462085280228462, + "language_loss": 0.8591696, + "learning_rate": 0.00025496260199046585, + "loss": 0.86960292, + "num_input_tokens_seen": 290248240, + "router_z_loss_mlp": 0.40332031, + "step": 3499, + "time_per_iteration": 2.971506357192993 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043184, + "balance_loss_mlp": 1.0028913, + "epoch": 0.6733358984224702, + "flos": 612751394304.0, + "grad_norm": 0.03556230846218865, + "language_loss": 0.84967017, + "learning_rate": 0.000254691084253202, + "loss": 0.86010194, + "num_input_tokens_seen": 290326288, + "router_z_loss_mlp": 0.40283203, + "step": 3500, + "time_per_iteration": 2.8486316204071045 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043459, + "balance_loss_mlp": 1.00318992, + "epoch": 0.6735282801077337, + "flos": 559968942336.0, + "grad_norm": 0.24449978816738047, + "language_loss": 0.77738857, + "learning_rate": 0.00025441966175330567, + "loss": 0.7878232, + "num_input_tokens_seen": 290395984, + "router_z_loss_mlp": 0.40258789, + "step": 3501, + "time_per_iteration": 2.631596803665161 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104251, + "balance_loss_mlp": 1.0023365, + "epoch": 0.6737206617929973, + "flos": 673633138176.0, + "grad_norm": 0.03266233971307438, + "language_loss": 0.80253637, + "learning_rate": 0.00025414833459615183, + "loss": 0.81296146, + "num_input_tokens_seen": 290470224, + "router_z_loss_mlp": 0.40161133, + "step": 3502, + "time_per_iteration": 2.822633981704712 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044113, + "balance_loss_mlp": 1.00398731, + "epoch": 0.6739130434782609, + "flos": 634642420224.0, + "grad_norm": 0.03194426719542878, + "language_loss": 0.80720419, + "learning_rate": 0.0002538771028870796, + "loss": 0.81764531, + "num_input_tokens_seen": 290542864, + "router_z_loss_mlp": 0.40112305, + "step": 3503, + "time_per_iteration": 2.8278305530548096 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044086, + "balance_loss_mlp": 1.00376928, + "epoch": 0.6741054251635245, + "flos": 532546834176.0, + "grad_norm": 0.03505319293998398, + "language_loss": 0.82144141, + "learning_rate": 0.0002536059667313903, + "loss": 0.8318823, + "num_input_tokens_seen": 290617248, + "router_z_loss_mlp": 0.40307617, + "step": 3504, + "time_per_iteration": 2.772728443145752 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045901, + "balance_loss_mlp": 1.00548911, + "epoch": 0.674297806848788, + "flos": 543652740096.0, + "grad_norm": 0.033634031590092824, + "language_loss": 0.89796269, + "learning_rate": 0.0002533349262343483, + "loss": 0.90842175, + "num_input_tokens_seen": 290690112, + "router_z_loss_mlp": 0.40405273, + "step": 3505, + "time_per_iteration": 2.6931023597717285 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048869, + "balance_loss_mlp": 1.00845683, + "epoch": 0.6744901885340515, + "flos": 464455246080.0, + "grad_norm": 0.03724604036951252, + "language_loss": 0.82972419, + "learning_rate": 0.0002530639815011807, + "loss": 0.84021288, + "num_input_tokens_seen": 290756352, + "router_z_loss_mlp": 0.40405273, + "step": 3506, + "time_per_iteration": 2.5213606357574463 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104646, + "balance_loss_mlp": 1.00604796, + "epoch": 0.6746825702193151, + "flos": 633022950912.0, + "grad_norm": 0.0353973537861221, + "language_loss": 0.85602045, + "learning_rate": 0.0002527931326370781, + "loss": 0.866485, + "num_input_tokens_seen": 290829776, + "router_z_loss_mlp": 0.40405273, + "step": 3507, + "time_per_iteration": 2.7929484844207764 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046008, + "balance_loss_mlp": 1.00554848, + "epoch": 0.6748749519045787, + "flos": 672393747456.0, + "grad_norm": 0.038454630804936565, + "language_loss": 0.83645785, + "learning_rate": 0.00025252237974719276, + "loss": 0.84691793, + "num_input_tokens_seen": 290900736, + "router_z_loss_mlp": 0.40454102, + "step": 3508, + "time_per_iteration": 2.8264431953430176 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044153, + "balance_loss_mlp": 1.00374067, + "epoch": 0.6750673335898423, + "flos": 768493602048.0, + "grad_norm": 0.034781380834319586, + "language_loss": 0.81037247, + "learning_rate": 0.00025225172293664056, + "loss": 0.82081401, + "num_input_tokens_seen": 290981696, + "router_z_loss_mlp": 0.40405273, + "step": 3509, + "time_per_iteration": 2.988295078277588 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045361, + "balance_loss_mlp": 1.00597382, + "epoch": 0.6752597152751059, + "flos": 1515907846656.0, + "grad_norm": 0.0075717383905430985, + "language_loss": 0.76933134, + "learning_rate": 0.00025198116231049954, + "loss": 0.77978498, + "num_input_tokens_seen": 291217888, + "router_z_loss_mlp": 0.39355469, + "step": 3510, + "time_per_iteration": 4.925229787826538 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043762, + "balance_loss_mlp": 1.00339806, + "epoch": 0.6754520969603693, + "flos": 688534005504.0, + "grad_norm": 0.03671107253105254, + "language_loss": 0.85454929, + "learning_rate": 0.00025171069797381106, + "loss": 0.8649869, + "num_input_tokens_seen": 291287856, + "router_z_loss_mlp": 0.40356445, + "step": 3511, + "time_per_iteration": 2.8605566024780273 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042968, + "balance_loss_mlp": 1.00265193, + "epoch": 0.6756444786456329, + "flos": 501618469632.0, + "grad_norm": 0.03363257909810701, + "language_loss": 0.82468766, + "learning_rate": 0.00025144033003157864, + "loss": 0.83511734, + "num_input_tokens_seen": 291354912, + "router_z_loss_mlp": 0.40307617, + "step": 3512, + "time_per_iteration": 2.6560440063476562 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043999, + "balance_loss_mlp": 1.00382507, + "epoch": 0.6758368603308965, + "flos": 493660128768.0, + "grad_norm": 0.04010660433283205, + "language_loss": 0.79292786, + "learning_rate": 0.00025117005858876806, + "loss": 0.80336791, + "num_input_tokens_seen": 291426816, + "router_z_loss_mlp": 0.40161133, + "step": 3513, + "time_per_iteration": 2.6984188556671143 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044124, + "balance_loss_mlp": 1.00392663, + "epoch": 0.6760292420161601, + "flos": 557044953600.0, + "grad_norm": 0.035892201444293004, + "language_loss": 0.86103761, + "learning_rate": 0.000250899883750308, + "loss": 0.8714788, + "num_input_tokens_seen": 291497648, + "router_z_loss_mlp": 0.40185547, + "step": 3514, + "time_per_iteration": 2.7181315422058105 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046252, + "balance_loss_mlp": 1.00600672, + "epoch": 0.6762216237014236, + "flos": 608722162944.0, + "grad_norm": 0.033450458947787066, + "language_loss": 0.81925356, + "learning_rate": 0.00025062980562109006, + "loss": 0.82971609, + "num_input_tokens_seen": 291568080, + "router_z_loss_mlp": 0.40234375, + "step": 3515, + "time_per_iteration": 2.78231143951416 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043998, + "balance_loss_mlp": 1.00377643, + "epoch": 0.6764140053866872, + "flos": 534928406016.0, + "grad_norm": 0.037161832732059044, + "language_loss": 0.83539182, + "learning_rate": 0.0002503598243059677, + "loss": 0.84583181, + "num_input_tokens_seen": 291644896, + "router_z_loss_mlp": 0.40209961, + "step": 3516, + "time_per_iteration": 2.7860419750213623 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046044, + "balance_loss_mlp": 1.00584662, + "epoch": 0.6766063870719508, + "flos": 505862529024.0, + "grad_norm": 0.041409918101289474, + "language_loss": 0.80496907, + "learning_rate": 0.0002500899399097568, + "loss": 0.81542951, + "num_input_tokens_seen": 291716864, + "router_z_loss_mlp": 0.40185547, + "step": 3517, + "time_per_iteration": 2.6418778896331787 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01041487, + "balance_loss_mlp": 1.00131381, + "epoch": 0.6767987687572143, + "flos": 514194145536.0, + "grad_norm": 0.03808875517476391, + "language_loss": 0.86208284, + "learning_rate": 0.0002498201525372359, + "loss": 0.87249774, + "num_input_tokens_seen": 291786000, + "router_z_loss_mlp": 0.40161133, + "step": 3518, + "time_per_iteration": 2.569801092147827 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01041708, + "balance_loss_mlp": 1.00148714, + "epoch": 0.6769911504424779, + "flos": 526079650560.0, + "grad_norm": 0.03452143000851854, + "language_loss": 0.83818328, + "learning_rate": 0.00024955046229314584, + "loss": 0.84860039, + "num_input_tokens_seen": 291854768, + "router_z_loss_mlp": 0.40209961, + "step": 3519, + "time_per_iteration": 2.602756977081299 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043706, + "balance_loss_mlp": 1.00353205, + "epoch": 0.6771835321277414, + "flos": 450837510912.0, + "grad_norm": 0.03417107794198843, + "language_loss": 0.87895727, + "learning_rate": 0.00024928086928218947, + "loss": 0.8893944, + "num_input_tokens_seen": 291918096, + "router_z_loss_mlp": 0.40161133, + "step": 3520, + "time_per_iteration": 2.4941091537475586 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044656, + "balance_loss_mlp": 1.00443459, + "epoch": 0.677375913813005, + "flos": 710674852608.0, + "grad_norm": 0.03642632041664857, + "language_loss": 0.76859355, + "learning_rate": 0.00024901137360903216, + "loss": 0.7790401, + "num_input_tokens_seen": 291998752, + "router_z_loss_mlp": 0.40209961, + "step": 3521, + "time_per_iteration": 2.985905408859253 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044359, + "balance_loss_mlp": 1.00404227, + "epoch": 0.6775682954982686, + "flos": 429346005504.0, + "grad_norm": 0.039972484461639736, + "language_loss": 0.81834614, + "learning_rate": 0.00024874197537830115, + "loss": 0.82878971, + "num_input_tokens_seen": 292065056, + "router_z_loss_mlp": 0.40307617, + "step": 3522, + "time_per_iteration": 2.525432586669922 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045522, + "balance_loss_mlp": 1.0052768, + "epoch": 0.6777606771835322, + "flos": 438821748480.0, + "grad_norm": 0.0378942066794791, + "language_loss": 0.83926749, + "learning_rate": 0.00024847267469458684, + "loss": 0.84972268, + "num_input_tokens_seen": 292129248, + "router_z_loss_mlp": 0.40234375, + "step": 3523, + "time_per_iteration": 2.519306182861328 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045594, + "balance_loss_mlp": 1.0053252, + "epoch": 0.6779530588687956, + "flos": 776788280064.0, + "grad_norm": 0.03620909543605363, + "language_loss": 0.78424889, + "learning_rate": 0.00024820347166244034, + "loss": 0.79470479, + "num_input_tokens_seen": 292206080, + "router_z_loss_mlp": 0.40258789, + "step": 3524, + "time_per_iteration": 3.016852378845215 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045119, + "balance_loss_mlp": 1.00494587, + "epoch": 0.6781454405540592, + "flos": 572905254912.0, + "grad_norm": 0.03295614224458047, + "language_loss": 0.85541701, + "learning_rate": 0.0002479343663863755, + "loss": 0.86586821, + "num_input_tokens_seen": 292280192, + "router_z_loss_mlp": 0.40161133, + "step": 3525, + "time_per_iteration": 2.7807812690734863 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046011, + "balance_loss_mlp": 1.00586104, + "epoch": 0.6783378222393228, + "flos": 485983689984.0, + "grad_norm": 0.034679626335120894, + "language_loss": 0.77479804, + "learning_rate": 0.00024766535897086876, + "loss": 0.78525817, + "num_input_tokens_seen": 292347792, + "router_z_loss_mlp": 0.40136719, + "step": 3526, + "time_per_iteration": 2.599513530731201 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047237, + "balance_loss_mlp": 1.0070163, + "epoch": 0.6785302039245864, + "flos": 483832497408.0, + "grad_norm": 0.03442955801383442, + "language_loss": 0.797737, + "learning_rate": 0.0002473964495203578, + "loss": 0.80820936, + "num_input_tokens_seen": 292420032, + "router_z_loss_mlp": 0.40209961, + "step": 3527, + "time_per_iteration": 2.6847755908966064 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046675, + "balance_loss_mlp": 1.00640571, + "epoch": 0.67872258560985, + "flos": 525862877184.0, + "grad_norm": 0.03305823044562006, + "language_loss": 0.861408, + "learning_rate": 0.0002471276381392425, + "loss": 0.87187475, + "num_input_tokens_seen": 292497792, + "router_z_loss_mlp": 0.40258789, + "step": 3528, + "time_per_iteration": 4.207594156265259 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043282, + "balance_loss_mlp": 1.00437164, + "epoch": 0.6789149672951135, + "flos": 1555894937088.0, + "grad_norm": 0.004731891717640295, + "language_loss": 0.78188634, + "learning_rate": 0.0002468589249318848, + "loss": 0.79231918, + "num_input_tokens_seen": 292726704, + "router_z_loss_mlp": 0.38867188, + "step": 3529, + "time_per_iteration": 4.977165222167969 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046125, + "balance_loss_mlp": 1.00583255, + "epoch": 0.6791073489803771, + "flos": 742686105600.0, + "grad_norm": 0.033652850666290056, + "language_loss": 0.84582424, + "learning_rate": 0.00024659031000260826, + "loss": 0.85628551, + "num_input_tokens_seen": 292802320, + "router_z_loss_mlp": 0.40283203, + "step": 3530, + "time_per_iteration": 2.9048852920532227 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104749, + "balance_loss_mlp": 1.00703049, + "epoch": 0.6792997306656406, + "flos": 577448712960.0, + "grad_norm": 0.040150019342018534, + "language_loss": 0.81559235, + "learning_rate": 0.0002463217934556985, + "loss": 0.82606721, + "num_input_tokens_seen": 292870480, + "router_z_loss_mlp": 0.40454102, + "step": 3531, + "time_per_iteration": 2.6925132274627686 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046124, + "balance_loss_mlp": 1.00692749, + "epoch": 0.6794921123509042, + "flos": 1506546809856.0, + "grad_norm": 0.009705737357192788, + "language_loss": 0.7653209, + "learning_rate": 0.000246053375395403, + "loss": 0.77578211, + "num_input_tokens_seen": 293100752, + "router_z_loss_mlp": 0.39160156, + "step": 3532, + "time_per_iteration": 4.747551202774048 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01041824, + "balance_loss_mlp": 1.00138783, + "epoch": 0.6796844940361677, + "flos": 700141499136.0, + "grad_norm": 0.03694517286226913, + "language_loss": 0.84159917, + "learning_rate": 0.0002457850559259306, + "loss": 0.8520174, + "num_input_tokens_seen": 293178192, + "router_z_loss_mlp": 0.40429688, + "step": 3533, + "time_per_iteration": 2.8468008041381836 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043634, + "balance_loss_mlp": 1.00326967, + "epoch": 0.6798768757214313, + "flos": 553816708608.0, + "grad_norm": 0.03486714477103508, + "language_loss": 0.82139623, + "learning_rate": 0.00024551683515145275, + "loss": 0.83183265, + "num_input_tokens_seen": 293246368, + "router_z_loss_mlp": 0.40356445, + "step": 3534, + "time_per_iteration": 2.6637539863586426 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043023, + "balance_loss_mlp": 1.00261092, + "epoch": 0.6800692574066949, + "flos": 523976090112.0, + "grad_norm": 0.03293406934357783, + "language_loss": 0.87167442, + "learning_rate": 0.0002452487131761014, + "loss": 0.88210464, + "num_input_tokens_seen": 293320656, + "router_z_loss_mlp": 0.40405273, + "step": 3535, + "time_per_iteration": 2.719104051589966 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01041768, + "balance_loss_mlp": 1.00128436, + "epoch": 0.6802616390919585, + "flos": 575130324480.0, + "grad_norm": 0.03513185710250464, + "language_loss": 0.80471444, + "learning_rate": 0.00024498069010397093, + "loss": 0.81513214, + "num_input_tokens_seen": 293388592, + "router_z_loss_mlp": 0.40478516, + "step": 3536, + "time_per_iteration": 2.656780242919922 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042968, + "balance_loss_mlp": 1.00250804, + "epoch": 0.6804540207772221, + "flos": 489129309696.0, + "grad_norm": 0.03285150643596687, + "language_loss": 0.85294282, + "learning_rate": 0.00024471276603911697, + "loss": 0.86337245, + "num_input_tokens_seen": 293453936, + "router_z_loss_mlp": 0.40454102, + "step": 3537, + "time_per_iteration": 2.5711469650268555 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046239, + "balance_loss_mlp": 1.00566006, + "epoch": 0.6806464024624855, + "flos": 579745714176.0, + "grad_norm": 0.0319685563784025, + "language_loss": 0.79588819, + "learning_rate": 0.0002444449410855572, + "loss": 0.80635059, + "num_input_tokens_seen": 293527664, + "router_z_loss_mlp": 0.40576172, + "step": 3538, + "time_per_iteration": 2.7366206645965576 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048213, + "balance_loss_mlp": 1.00777721, + "epoch": 0.6808387841477491, + "flos": 554793639168.0, + "grad_norm": 0.028008178154431115, + "language_loss": 0.8488512, + "learning_rate": 0.00024417721534727033, + "loss": 0.85933334, + "num_input_tokens_seen": 293599344, + "router_z_loss_mlp": 0.40429688, + "step": 3539, + "time_per_iteration": 2.6501903533935547 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047596, + "balance_loss_mlp": 1.00716043, + "epoch": 0.6810311658330127, + "flos": 427754726400.0, + "grad_norm": 0.0434584868230971, + "language_loss": 0.83537716, + "learning_rate": 0.00024390958892819687, + "loss": 0.84585309, + "num_input_tokens_seen": 293663088, + "router_z_loss_mlp": 0.40429688, + "step": 3540, + "time_per_iteration": 2.5052664279937744 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046344, + "balance_loss_mlp": 1.00571704, + "epoch": 0.6812235475182763, + "flos": 573461277696.0, + "grad_norm": 0.03693481574756638, + "language_loss": 0.81626362, + "learning_rate": 0.0002436420619322381, + "loss": 0.82672703, + "num_input_tokens_seen": 293741296, + "router_z_loss_mlp": 0.40625, + "step": 3541, + "time_per_iteration": 2.832705497741699 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049574, + "balance_loss_mlp": 1.00901949, + "epoch": 0.6814159292035398, + "flos": 502994920704.0, + "grad_norm": 0.03366403266770877, + "language_loss": 0.83297849, + "learning_rate": 0.0002433746344632577, + "loss": 0.84347427, + "num_input_tokens_seen": 293815840, + "router_z_loss_mlp": 0.40551758, + "step": 3542, + "time_per_iteration": 2.672311782836914 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050212, + "balance_loss_mlp": 1.00972831, + "epoch": 0.6816083108888034, + "flos": 766956758016.0, + "grad_norm": 0.03487918397791305, + "language_loss": 0.80590951, + "learning_rate": 0.00024310730662508006, + "loss": 0.81641161, + "num_input_tokens_seen": 293896368, + "router_z_loss_mlp": 0.40478516, + "step": 3543, + "time_per_iteration": 3.086225986480713 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051043, + "balance_loss_mlp": 1.0106312, + "epoch": 0.681800692574067, + "flos": 480480797952.0, + "grad_norm": 0.03000398684674813, + "language_loss": 0.88137174, + "learning_rate": 0.0002428400785214911, + "loss": 0.89188218, + "num_input_tokens_seen": 293963344, + "router_z_loss_mlp": 0.40405273, + "step": 3544, + "time_per_iteration": 2.5797877311706543 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050788, + "balance_loss_mlp": 1.01030433, + "epoch": 0.6819930742593305, + "flos": 692834445312.0, + "grad_norm": 0.03498907792035314, + "language_loss": 0.83317804, + "learning_rate": 0.00024257295025623794, + "loss": 0.84368593, + "num_input_tokens_seen": 294035440, + "router_z_loss_mlp": 0.40478516, + "step": 3545, + "time_per_iteration": 2.817002534866333 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049009, + "balance_loss_mlp": 1.00852597, + "epoch": 0.6821854559445941, + "flos": 679355715840.0, + "grad_norm": 0.03355065924517062, + "language_loss": 0.81087142, + "learning_rate": 0.00024230592193302892, + "loss": 0.82136154, + "num_input_tokens_seen": 294116944, + "router_z_loss_mlp": 0.40478516, + "step": 3546, + "time_per_iteration": 2.9010307788848877 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045229, + "balance_loss_mlp": 1.00469804, + "epoch": 0.6823778376298576, + "flos": 463133230080.0, + "grad_norm": 0.04387981272485442, + "language_loss": 0.85039532, + "learning_rate": 0.00024203899365553372, + "loss": 0.86084759, + "num_input_tokens_seen": 294178976, + "router_z_loss_mlp": 0.40527344, + "step": 3547, + "time_per_iteration": 2.5003862380981445 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045105, + "balance_loss_mlp": 1.00543213, + "epoch": 0.6825702193151212, + "flos": 1478176939776.0, + "grad_norm": 0.005965966657319216, + "language_loss": 0.76734358, + "learning_rate": 0.00024177216552738302, + "loss": 0.7777946, + "num_input_tokens_seen": 294384960, + "router_z_loss_mlp": 0.39648438, + "step": 3548, + "time_per_iteration": 4.51382303237915 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043331, + "balance_loss_mlp": 1.00299025, + "epoch": 0.6827626010003848, + "flos": 724414096896.0, + "grad_norm": 0.03369751046554337, + "language_loss": 0.83353454, + "learning_rate": 0.00024150543765216848, + "loss": 0.84396785, + "num_input_tokens_seen": 294461408, + "router_z_loss_mlp": 0.40332031, + "step": 3549, + "time_per_iteration": 2.868882179260254 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043901, + "balance_loss_mlp": 1.00348902, + "epoch": 0.6829549826856484, + "flos": 559940752128.0, + "grad_norm": 0.03314347093854088, + "language_loss": 0.83934271, + "learning_rate": 0.00024123881013344352, + "loss": 0.84978169, + "num_input_tokens_seen": 294530624, + "router_z_loss_mlp": 0.40405273, + "step": 3550, + "time_per_iteration": 2.673149347305298 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043918, + "balance_loss_mlp": 1.00352979, + "epoch": 0.6831473643709118, + "flos": 626134859520.0, + "grad_norm": 0.03193969534774964, + "language_loss": 0.80188608, + "learning_rate": 0.00024097228307472202, + "loss": 0.81232524, + "num_input_tokens_seen": 294606784, + "router_z_loss_mlp": 0.40380859, + "step": 3551, + "time_per_iteration": 2.783318519592285 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044109, + "balance_loss_mlp": 1.00367296, + "epoch": 0.6833397460561754, + "flos": 715098746880.0, + "grad_norm": 0.03508880753124507, + "language_loss": 0.82590389, + "learning_rate": 0.00024070585657947846, + "loss": 0.83634502, + "num_input_tokens_seen": 294686960, + "router_z_loss_mlp": 0.40429688, + "step": 3552, + "time_per_iteration": 2.87227725982666 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044585, + "balance_loss_mlp": 1.00414932, + "epoch": 0.683532127741439, + "flos": 465727684608.0, + "grad_norm": 0.028861941577793874, + "language_loss": 0.86039191, + "learning_rate": 0.00024043953075114934, + "loss": 0.87083775, + "num_input_tokens_seen": 294759712, + "router_z_loss_mlp": 0.40429688, + "step": 3553, + "time_per_iteration": 2.685239315032959 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044014, + "balance_loss_mlp": 1.00353038, + "epoch": 0.6837245094267026, + "flos": 583340431872.0, + "grad_norm": 0.03309577009255294, + "language_loss": 0.89582229, + "learning_rate": 0.00024017330569313128, + "loss": 0.9062624, + "num_input_tokens_seen": 294830592, + "router_z_loss_mlp": 0.40478516, + "step": 3554, + "time_per_iteration": 2.738507032394409 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044038, + "balance_loss_mlp": 1.00345898, + "epoch": 0.6839168911119662, + "flos": 795524937984.0, + "grad_norm": 0.03513613894761906, + "language_loss": 0.75376379, + "learning_rate": 0.0002399071815087821, + "loss": 0.7642042, + "num_input_tokens_seen": 294907504, + "router_z_loss_mlp": 0.40576172, + "step": 3555, + "time_per_iteration": 3.038098096847534 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049451, + "balance_loss_mlp": 1.00908649, + "epoch": 0.6841092727972297, + "flos": 581115362304.0, + "grad_norm": 0.037584614918211315, + "language_loss": 0.84306592, + "learning_rate": 0.00023964115830142025, + "loss": 0.85356045, + "num_input_tokens_seen": 294977600, + "router_z_loss_mlp": 0.40356445, + "step": 3556, + "time_per_iteration": 2.6664743423461914 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047993, + "balance_loss_mlp": 1.0076046, + "epoch": 0.6843016544824932, + "flos": 384595771392.0, + "grad_norm": 0.04136622286730017, + "language_loss": 0.88220561, + "learning_rate": 0.00023937523617432522, + "loss": 0.89268553, + "num_input_tokens_seen": 295039408, + "router_z_loss_mlp": 0.40380859, + "step": 3557, + "time_per_iteration": 2.429532289505005 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104626, + "balance_loss_mlp": 1.00582457, + "epoch": 0.6844940361677568, + "flos": 1441289793792.0, + "grad_norm": 0.032795620592968935, + "language_loss": 0.87315959, + "learning_rate": 0.00023910941523073705, + "loss": 0.88362217, + "num_input_tokens_seen": 295142928, + "router_z_loss_mlp": 0.40429688, + "step": 3558, + "time_per_iteration": 3.8917641639709473 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104589, + "balance_loss_mlp": 1.00550175, + "epoch": 0.6846864178530204, + "flos": 521900719872.0, + "grad_norm": 0.03199772830475091, + "language_loss": 0.86959422, + "learning_rate": 0.0002388436955738566, + "loss": 0.8800531, + "num_input_tokens_seen": 295215504, + "router_z_loss_mlp": 0.40380859, + "step": 3559, + "time_per_iteration": 2.6707799434661865 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045488, + "balance_loss_mlp": 1.00514805, + "epoch": 0.6848787995382839, + "flos": 719230045440.0, + "grad_norm": 0.030101152384031323, + "language_loss": 0.81828642, + "learning_rate": 0.00023857807730684523, + "loss": 0.82874131, + "num_input_tokens_seen": 295291024, + "router_z_loss_mlp": 0.40332031, + "step": 3560, + "time_per_iteration": 2.8835229873657227 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044745, + "balance_loss_mlp": 1.00440454, + "epoch": 0.6850711812235475, + "flos": 512162516736.0, + "grad_norm": 0.03806744815323664, + "language_loss": 0.83236831, + "learning_rate": 0.00023831256053282547, + "loss": 0.84281576, + "num_input_tokens_seen": 295363248, + "router_z_loss_mlp": 0.40332031, + "step": 3561, + "time_per_iteration": 2.723851203918457 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104642, + "balance_loss_mlp": 1.0061748, + "epoch": 0.6852635629088111, + "flos": 669432820224.0, + "grad_norm": 0.034115256160246236, + "language_loss": 0.78766859, + "learning_rate": 0.00023804714535488003, + "loss": 0.79813278, + "num_input_tokens_seen": 295442032, + "router_z_loss_mlp": 0.40234375, + "step": 3562, + "time_per_iteration": 2.862870454788208 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048828, + "balance_loss_mlp": 1.00953674, + "epoch": 0.6854559445940747, + "flos": 1526367323136.0, + "grad_norm": 0.0075236953863810525, + "language_loss": 0.7980963, + "learning_rate": 0.0002377818318760519, + "loss": 0.80858457, + "num_input_tokens_seen": 295680560, + "router_z_loss_mlp": 0.39257812, + "step": 3563, + "time_per_iteration": 4.951240539550781 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045746, + "balance_loss_mlp": 1.00550103, + "epoch": 0.6856483262793382, + "flos": 455137950720.0, + "grad_norm": 0.03558245087854763, + "language_loss": 0.81134826, + "learning_rate": 0.00023751662019934488, + "loss": 0.82180572, + "num_input_tokens_seen": 295745712, + "router_z_loss_mlp": 0.40234375, + "step": 3564, + "time_per_iteration": 2.5381388664245605 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043955, + "balance_loss_mlp": 1.00378108, + "epoch": 0.6858407079646017, + "flos": 616689252096.0, + "grad_norm": 0.034154017668987145, + "language_loss": 0.79535556, + "learning_rate": 0.00023725151042772364, + "loss": 0.80579513, + "num_input_tokens_seen": 295815104, + "router_z_loss_mlp": 0.40161133, + "step": 3565, + "time_per_iteration": 2.8012731075286865 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044161, + "balance_loss_mlp": 1.00394011, + "epoch": 0.6860330896498653, + "flos": 467095387392.0, + "grad_norm": 0.03227163562068172, + "language_loss": 0.83989513, + "learning_rate": 0.00023698650266411276, + "loss": 0.85033673, + "num_input_tokens_seen": 295882928, + "router_z_loss_mlp": 0.40209961, + "step": 3566, + "time_per_iteration": 2.6114397048950195 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045444, + "balance_loss_mlp": 1.00527024, + "epoch": 0.6862254713351289, + "flos": 865839650304.0, + "grad_norm": 0.03269984364116833, + "language_loss": 0.83511543, + "learning_rate": 0.00023672159701139755, + "loss": 0.84556985, + "num_input_tokens_seen": 295970960, + "router_z_loss_mlp": 0.40161133, + "step": 3567, + "time_per_iteration": 3.2268896102905273 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045242, + "balance_loss_mlp": 1.00504482, + "epoch": 0.6864178530203925, + "flos": 448091411712.0, + "grad_norm": 0.03951724412418829, + "language_loss": 0.86782575, + "learning_rate": 0.00023645679357242296, + "loss": 0.87827814, + "num_input_tokens_seen": 296036128, + "router_z_loss_mlp": 0.40185547, + "step": 3568, + "time_per_iteration": 2.5142667293548584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045229, + "balance_loss_mlp": 1.00512683, + "epoch": 0.6866102347056561, + "flos": 425212761600.0, + "grad_norm": 0.04100777191651884, + "language_loss": 0.84717417, + "learning_rate": 0.00023619209244999534, + "loss": 0.85762644, + "num_input_tokens_seen": 296101440, + "router_z_loss_mlp": 0.40087891, + "step": 3569, + "time_per_iteration": 2.506850004196167 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043496, + "balance_loss_mlp": 1.00339389, + "epoch": 0.6868026163909196, + "flos": 473334137088.0, + "grad_norm": 0.0410478225777228, + "language_loss": 0.85724694, + "learning_rate": 0.0002359274937468806, + "loss": 0.86768192, + "num_input_tokens_seen": 296165504, + "router_z_loss_mlp": 0.40087891, + "step": 3570, + "time_per_iteration": 2.5271074771881104 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044524, + "balance_loss_mlp": 1.00446975, + "epoch": 0.6869949980761831, + "flos": 465206654976.0, + "grad_norm": 0.037625801670490476, + "language_loss": 0.78364801, + "learning_rate": 0.00023566299756580512, + "loss": 0.79409337, + "num_input_tokens_seen": 296236880, + "router_z_loss_mlp": 0.40039062, + "step": 3571, + "time_per_iteration": 2.641204595565796 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047048, + "balance_loss_mlp": 1.00682652, + "epoch": 0.6871873797614467, + "flos": 427131629568.0, + "grad_norm": 0.03563606510751839, + "language_loss": 0.78681505, + "learning_rate": 0.0002353986040094551, + "loss": 0.79728556, + "num_input_tokens_seen": 296299776, + "router_z_loss_mlp": 0.40209961, + "step": 3572, + "time_per_iteration": 2.508169412612915 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051965, + "balance_loss_mlp": 1.01188707, + "epoch": 0.6873797614467103, + "flos": 444555019776.0, + "grad_norm": 0.03726905033743987, + "language_loss": 0.79780114, + "learning_rate": 0.00023513431318047796, + "loss": 0.80832076, + "num_input_tokens_seen": 296365408, + "router_z_loss_mlp": 0.40063477, + "step": 3573, + "time_per_iteration": 2.524447441101074 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049651, + "balance_loss_mlp": 1.00952482, + "epoch": 0.6875721431319738, + "flos": 993915764736.0, + "grad_norm": 0.03636326410660492, + "language_loss": 0.77452493, + "learning_rate": 0.00023487012518147977, + "loss": 0.78502142, + "num_input_tokens_seen": 296445488, + "router_z_loss_mlp": 0.40112305, + "step": 3574, + "time_per_iteration": 3.220405340194702 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050375, + "balance_loss_mlp": 1.0102489, + "epoch": 0.6877645248172374, + "flos": 1287448957440.0, + "grad_norm": 0.03573540340682003, + "language_loss": 0.8513974, + "learning_rate": 0.00023460604011502772, + "loss": 0.86190116, + "num_input_tokens_seen": 296529936, + "router_z_loss_mlp": 0.40112305, + "step": 3575, + "time_per_iteration": 3.642275094985962 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050219, + "balance_loss_mlp": 1.01016474, + "epoch": 0.687956906502501, + "flos": 878230633728.0, + "grad_norm": 0.03712322767152043, + "language_loss": 0.86061072, + "learning_rate": 0.00023434205808364845, + "loss": 0.87111294, + "num_input_tokens_seen": 296607488, + "router_z_loss_mlp": 0.40039062, + "step": 3576, + "time_per_iteration": 3.093545436859131 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047023, + "balance_loss_mlp": 1.00680172, + "epoch": 0.6881492881877646, + "flos": 564471571200.0, + "grad_norm": 0.039318035109250464, + "language_loss": 0.86179203, + "learning_rate": 0.00023407817918982932, + "loss": 0.87226224, + "num_input_tokens_seen": 296678672, + "router_z_loss_mlp": 0.40209961, + "step": 3577, + "time_per_iteration": 2.755629777908325 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104676, + "balance_loss_mlp": 1.00656283, + "epoch": 0.6883416698730281, + "flos": 796510616832.0, + "grad_norm": 0.03470611198905491, + "language_loss": 0.79102242, + "learning_rate": 0.00023381440353601718, + "loss": 0.80149001, + "num_input_tokens_seen": 296758896, + "router_z_loss_mlp": 0.40185547, + "step": 3578, + "time_per_iteration": 2.990251302719116 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045579, + "balance_loss_mlp": 1.00540566, + "epoch": 0.6885340515582916, + "flos": 724880691456.0, + "grad_norm": 0.04272273427793483, + "language_loss": 0.86559987, + "learning_rate": 0.00023355073122461822, + "loss": 0.87605572, + "num_input_tokens_seen": 296830736, + "router_z_loss_mlp": 0.40161133, + "step": 3579, + "time_per_iteration": 2.9245500564575195 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045443, + "balance_loss_mlp": 1.00522161, + "epoch": 0.6887264332435552, + "flos": 1012522165248.0, + "grad_norm": 0.033292192645982856, + "language_loss": 0.83352244, + "learning_rate": 0.00023328716235799973, + "loss": 0.84397686, + "num_input_tokens_seen": 296911504, + "router_z_loss_mlp": 0.40209961, + "step": 3580, + "time_per_iteration": 3.2759361267089844 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049535, + "balance_loss_mlp": 1.00936127, + "epoch": 0.6889188149288188, + "flos": 586347045888.0, + "grad_norm": 0.03483646378728446, + "language_loss": 0.84317255, + "learning_rate": 0.00023302369703848803, + "loss": 0.85366791, + "num_input_tokens_seen": 296981488, + "router_z_loss_mlp": 0.40161133, + "step": 3581, + "time_per_iteration": 2.692676544189453 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047528, + "balance_loss_mlp": 1.00718749, + "epoch": 0.6891111966140824, + "flos": 637277703936.0, + "grad_norm": 0.03603221184194459, + "language_loss": 0.80829328, + "learning_rate": 0.00023276033536836937, + "loss": 0.81876856, + "num_input_tokens_seen": 297054896, + "router_z_loss_mlp": 0.40332031, + "step": 3582, + "time_per_iteration": 2.7863240242004395 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043146, + "balance_loss_mlp": 1.00297284, + "epoch": 0.6893035782993459, + "flos": 496312909056.0, + "grad_norm": 0.032647536159740746, + "language_loss": 0.85196984, + "learning_rate": 0.00023249707744988984, + "loss": 0.86240131, + "num_input_tokens_seen": 297128224, + "router_z_loss_mlp": 0.40161133, + "step": 3583, + "time_per_iteration": 2.6404688358306885 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104516, + "balance_loss_mlp": 1.00486732, + "epoch": 0.6894959599846094, + "flos": 459149685504.0, + "grad_norm": 0.038319027803205424, + "language_loss": 0.82998735, + "learning_rate": 0.00023223392338525529, + "loss": 0.84043896, + "num_input_tokens_seen": 297191312, + "router_z_loss_mlp": 0.40283203, + "step": 3584, + "time_per_iteration": 2.526021957397461 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050689, + "balance_loss_mlp": 1.01030123, + "epoch": 0.689688341669873, + "flos": 506057915136.0, + "grad_norm": 0.03433951849080314, + "language_loss": 0.79221714, + "learning_rate": 0.00023197087327663107, + "loss": 0.802724, + "num_input_tokens_seen": 297261904, + "router_z_loss_mlp": 0.40380859, + "step": 3585, + "time_per_iteration": 2.6632885932922363 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044933, + "balance_loss_mlp": 1.00449693, + "epoch": 0.6898807233551366, + "flos": 765219670272.0, + "grad_norm": 0.036720139480463, + "language_loss": 0.81855822, + "learning_rate": 0.00023170792722614243, + "loss": 0.82900751, + "num_input_tokens_seen": 297338352, + "router_z_loss_mlp": 0.40429688, + "step": 3586, + "time_per_iteration": 2.8943870067596436 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046135, + "balance_loss_mlp": 1.00577044, + "epoch": 0.6900731050404002, + "flos": 584573986560.0, + "grad_norm": 0.03037103532376505, + "language_loss": 0.84293818, + "learning_rate": 0.00023144508533587377, + "loss": 0.85339952, + "num_input_tokens_seen": 297416688, + "router_z_loss_mlp": 0.40356445, + "step": 3587, + "time_per_iteration": 2.826327085494995 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046611, + "balance_loss_mlp": 1.00622261, + "epoch": 0.6902654867256637, + "flos": 713206123776.0, + "grad_norm": 0.03728824809581911, + "language_loss": 0.79222, + "learning_rate": 0.0002311823477078698, + "loss": 0.8026861, + "num_input_tokens_seen": 297499968, + "router_z_loss_mlp": 0.40380859, + "step": 3588, + "time_per_iteration": 2.9109723567962646 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046452, + "balance_loss_mlp": 1.00611138, + "epoch": 0.6904578684109273, + "flos": 598304482560.0, + "grad_norm": 0.034163579129476235, + "language_loss": 0.85722661, + "learning_rate": 0.00023091971444413428, + "loss": 0.8676911, + "num_input_tokens_seen": 297574480, + "router_z_loss_mlp": 0.40332031, + "step": 3589, + "time_per_iteration": 4.1711201667785645 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044858, + "balance_loss_mlp": 1.00454128, + "epoch": 0.6906502500961909, + "flos": 586177904640.0, + "grad_norm": 0.030860818872724436, + "language_loss": 0.82910645, + "learning_rate": 0.00023065718564663012, + "loss": 0.83955508, + "num_input_tokens_seen": 297645360, + "router_z_loss_mlp": 0.40307617, + "step": 3590, + "time_per_iteration": 2.7104885578155518 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044712, + "balance_loss_mlp": 1.00551605, + "epoch": 0.6908426317814544, + "flos": 1591143183360.0, + "grad_norm": 0.007096149350185522, + "language_loss": 0.73911589, + "learning_rate": 0.00023039476141728011, + "loss": 0.74956298, + "num_input_tokens_seen": 297879472, + "router_z_loss_mlp": 0.39160156, + "step": 3591, + "time_per_iteration": 5.0011866092681885 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045736, + "balance_loss_mlp": 1.0053004, + "epoch": 0.6910350134667179, + "flos": 501805107456.0, + "grad_norm": 0.029643353067264133, + "language_loss": 0.81368697, + "learning_rate": 0.0002301324418579666, + "loss": 0.82414436, + "num_input_tokens_seen": 297950672, + "router_z_loss_mlp": 0.40429688, + "step": 3592, + "time_per_iteration": 2.710340738296509 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050949, + "balance_loss_mlp": 1.01184845, + "epoch": 0.6912273951519815, + "flos": 1412135443968.0, + "grad_norm": 0.014289812000501409, + "language_loss": 0.78688473, + "learning_rate": 0.00022987022707053107, + "loss": 0.79739422, + "num_input_tokens_seen": 298171728, + "router_z_loss_mlp": 0.390625, + "step": 3593, + "time_per_iteration": 4.750281810760498 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044393, + "balance_loss_mlp": 1.00402915, + "epoch": 0.6914197768372451, + "flos": 636557397504.0, + "grad_norm": 0.0367015241777211, + "language_loss": 0.8129431, + "learning_rate": 0.00022960811715677415, + "loss": 0.82338709, + "num_input_tokens_seen": 298250304, + "router_z_loss_mlp": 0.40356445, + "step": 3594, + "time_per_iteration": 2.846938133239746 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045248, + "balance_loss_mlp": 1.00485945, + "epoch": 0.6916121585225087, + "flos": 559202949120.0, + "grad_norm": 0.030135543775537642, + "language_loss": 0.82059658, + "learning_rate": 0.00022934611221845608, + "loss": 0.83104908, + "num_input_tokens_seen": 298328000, + "router_z_loss_mlp": 0.40380859, + "step": 3595, + "time_per_iteration": 2.8187928199768066 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049786, + "balance_loss_mlp": 1.00925434, + "epoch": 0.6918045402077723, + "flos": 530293574400.0, + "grad_norm": 0.0337393790819551, + "language_loss": 0.78598142, + "learning_rate": 0.00022908421235729609, + "loss": 0.79647928, + "num_input_tokens_seen": 298406832, + "router_z_loss_mlp": 0.40527344, + "step": 3596, + "time_per_iteration": 2.7116031646728516 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104948, + "balance_loss_mlp": 1.00894892, + "epoch": 0.6919969218930357, + "flos": 571426736640.0, + "grad_norm": 0.033365686577519565, + "language_loss": 0.8572033, + "learning_rate": 0.0002288224176749728, + "loss": 0.86769807, + "num_input_tokens_seen": 298477584, + "router_z_loss_mlp": 0.40527344, + "step": 3597, + "time_per_iteration": 2.6345982551574707 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053776, + "balance_loss_mlp": 1.01334012, + "epoch": 0.6921893035782993, + "flos": 684504774144.0, + "grad_norm": 0.03882210113784689, + "language_loss": 0.79009509, + "learning_rate": 0.00022856072827312385, + "loss": 0.80063289, + "num_input_tokens_seen": 298551872, + "router_z_loss_mlp": 0.40429688, + "step": 3598, + "time_per_iteration": 2.7988228797912598 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055733, + "balance_loss_mlp": 1.01532125, + "epoch": 0.6923816852635629, + "flos": 547794732288.0, + "grad_norm": 0.03734800797345761, + "language_loss": 0.77726078, + "learning_rate": 0.00022829914425334598, + "loss": 0.78781813, + "num_input_tokens_seen": 298619680, + "router_z_loss_mlp": 0.40405273, + "step": 3599, + "time_per_iteration": 2.628700017929077 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053695, + "balance_loss_mlp": 1.01318777, + "epoch": 0.6925740669488265, + "flos": 511057274112.0, + "grad_norm": 0.04268943868915618, + "language_loss": 0.81083095, + "learning_rate": 0.0002280376657171956, + "loss": 0.82136786, + "num_input_tokens_seen": 298690080, + "router_z_loss_mlp": 0.4050293, + "step": 3600, + "time_per_iteration": 2.6388540267944336 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047669, + "balance_loss_mlp": 1.00723314, + "epoch": 0.69276644863409, + "flos": 870914831616.0, + "grad_norm": 0.03151516530710953, + "language_loss": 0.76992857, + "learning_rate": 0.00022777629276618706, + "loss": 0.78040528, + "num_input_tokens_seen": 298777712, + "router_z_loss_mlp": 0.40429688, + "step": 3601, + "time_per_iteration": 3.086951732635498 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048126, + "balance_loss_mlp": 1.00776148, + "epoch": 0.6929588303193536, + "flos": 626918349312.0, + "grad_norm": 0.03515773513290382, + "language_loss": 0.77888995, + "learning_rate": 0.0002275150255017947, + "loss": 0.78937119, + "num_input_tokens_seen": 298854368, + "router_z_loss_mlp": 0.40356445, + "step": 3602, + "time_per_iteration": 2.8207640647888184 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049511, + "balance_loss_mlp": 1.01031494, + "epoch": 0.6931512120046172, + "flos": 1548807568896.0, + "grad_norm": 0.008023369975758985, + "language_loss": 0.75732672, + "learning_rate": 0.0002272538640254511, + "loss": 0.76782179, + "num_input_tokens_seen": 299091664, + "router_z_loss_mlp": 0.39160156, + "step": 3603, + "time_per_iteration": 5.031715631484985 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046665, + "balance_loss_mlp": 1.00765991, + "epoch": 0.6933435936898807, + "flos": 1451326405632.0, + "grad_norm": 0.006067349506475221, + "language_loss": 0.75127101, + "learning_rate": 0.0002269928084385487, + "loss": 0.7617377, + "num_input_tokens_seen": 299312656, + "router_z_loss_mlp": 0.38964844, + "step": 3604, + "time_per_iteration": 4.695592641830444 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045297, + "balance_loss_mlp": 1.00505233, + "epoch": 0.6935359753751443, + "flos": 541931203584.0, + "grad_norm": 0.03296159343177749, + "language_loss": 0.85254478, + "learning_rate": 0.0002267318588424379, + "loss": 0.86299777, + "num_input_tokens_seen": 299381136, + "router_z_loss_mlp": 0.40234375, + "step": 3605, + "time_per_iteration": 2.62107253074646 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043618, + "balance_loss_mlp": 1.00330186, + "epoch": 0.6937283570604078, + "flos": 720691067136.0, + "grad_norm": 0.03433808415235627, + "language_loss": 0.87899154, + "learning_rate": 0.00022647101533842845, + "loss": 0.88942766, + "num_input_tokens_seen": 299455216, + "router_z_loss_mlp": 0.40307617, + "step": 3606, + "time_per_iteration": 2.9330387115478516 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043345, + "balance_loss_mlp": 1.00302887, + "epoch": 0.6939207387456714, + "flos": 523194545664.0, + "grad_norm": 0.042523396404585766, + "language_loss": 0.76967436, + "learning_rate": 0.00022621027802778872, + "loss": 0.7801078, + "num_input_tokens_seen": 299524352, + "router_z_loss_mlp": 0.40307617, + "step": 3607, + "time_per_iteration": 2.6252737045288086 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044527, + "balance_loss_mlp": 1.00430596, + "epoch": 0.694113120430935, + "flos": 536402066688.0, + "grad_norm": 0.03600646931475283, + "language_loss": 0.7913326, + "learning_rate": 0.00022594964701174586, + "loss": 0.80177784, + "num_input_tokens_seen": 299594960, + "router_z_loss_mlp": 0.40209961, + "step": 3608, + "time_per_iteration": 2.674360513687134 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044111, + "balance_loss_mlp": 1.00391352, + "epoch": 0.6943055021161986, + "flos": 524395052544.0, + "grad_norm": 0.03489608183841533, + "language_loss": 0.85372239, + "learning_rate": 0.00022568912239148586, + "loss": 0.86416358, + "num_input_tokens_seen": 299662560, + "router_z_loss_mlp": 0.40185547, + "step": 3609, + "time_per_iteration": 2.610682964324951 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043745, + "balance_loss_mlp": 1.0034523, + "epoch": 0.694497883801462, + "flos": 485971051008.0, + "grad_norm": 0.03140889244124769, + "language_loss": 0.81940842, + "learning_rate": 0.00022542870426815344, + "loss": 0.82984591, + "num_input_tokens_seen": 299734896, + "router_z_loss_mlp": 0.40283203, + "step": 3610, + "time_per_iteration": 2.7095394134521484 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043278, + "balance_loss_mlp": 1.00303328, + "epoch": 0.6946902654867256, + "flos": 462425562624.0, + "grad_norm": 0.03725802111731568, + "language_loss": 0.86767513, + "learning_rate": 0.00022516839274285173, + "loss": 0.87810791, + "num_input_tokens_seen": 299799424, + "router_z_loss_mlp": 0.40234375, + "step": 3611, + "time_per_iteration": 2.5144243240356445 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042525, + "balance_loss_mlp": 1.00223184, + "epoch": 0.6948826471719892, + "flos": 513868502016.0, + "grad_norm": 0.03700002274884872, + "language_loss": 0.75493568, + "learning_rate": 0.00022490818791664265, + "loss": 0.76536095, + "num_input_tokens_seen": 299868272, + "router_z_loss_mlp": 0.40283203, + "step": 3612, + "time_per_iteration": 2.5950610637664795 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044496, + "balance_loss_mlp": 1.00429881, + "epoch": 0.6950750288572528, + "flos": 558256154112.0, + "grad_norm": 0.03078051424242557, + "language_loss": 0.86039829, + "learning_rate": 0.00022464808989054676, + "loss": 0.87084323, + "num_input_tokens_seen": 299939136, + "router_z_loss_mlp": 0.40185547, + "step": 3613, + "time_per_iteration": 2.6489851474761963 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104585, + "balance_loss_mlp": 1.00567627, + "epoch": 0.6952674105425164, + "flos": 543522482688.0, + "grad_norm": 0.037582150054456365, + "language_loss": 0.76400638, + "learning_rate": 0.00022438809876554284, + "loss": 0.77446485, + "num_input_tokens_seen": 300009472, + "router_z_loss_mlp": 0.40161133, + "step": 3614, + "time_per_iteration": 2.666472911834717 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046875, + "balance_loss_mlp": 1.00672579, + "epoch": 0.6954597922277799, + "flos": 547857915648.0, + "grad_norm": 0.03577219625118018, + "language_loss": 0.81085944, + "learning_rate": 0.00022412821464256873, + "loss": 0.82132822, + "num_input_tokens_seen": 300081008, + "router_z_loss_mlp": 0.40136719, + "step": 3615, + "time_per_iteration": 2.6799051761627197 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010449, + "balance_loss_mlp": 1.00479829, + "epoch": 0.6956521739130435, + "flos": 520541765376.0, + "grad_norm": 0.03709092288517812, + "language_loss": 0.82944018, + "learning_rate": 0.00022386843762252023, + "loss": 0.83988917, + "num_input_tokens_seen": 300149856, + "router_z_loss_mlp": 0.40087891, + "step": 3616, + "time_per_iteration": 2.600679397583008 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045093, + "balance_loss_mlp": 1.00496709, + "epoch": 0.695844555598307, + "flos": 467264528640.0, + "grad_norm": 0.03687910314272662, + "language_loss": 0.8069849, + "learning_rate": 0.00022360876780625193, + "loss": 0.81743586, + "num_input_tokens_seen": 300217344, + "router_z_loss_mlp": 0.40112305, + "step": 3617, + "time_per_iteration": 2.5893161296844482 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044723, + "balance_loss_mlp": 1.00462067, + "epoch": 0.6960369372835706, + "flos": 601932248064.0, + "grad_norm": 0.02883770808166936, + "language_loss": 0.80609798, + "learning_rate": 0.00022334920529457604, + "loss": 0.81654525, + "num_input_tokens_seen": 300305584, + "router_z_loss_mlp": 0.40087891, + "step": 3618, + "time_per_iteration": 2.8958587646484375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045417, + "balance_loss_mlp": 1.00538695, + "epoch": 0.6962293189688342, + "flos": 645466424064.0, + "grad_norm": 0.029378827731847603, + "language_loss": 0.88201439, + "learning_rate": 0.00022308975018826423, + "loss": 0.89246857, + "num_input_tokens_seen": 300386480, + "router_z_loss_mlp": 0.40014648, + "step": 3619, + "time_per_iteration": 2.849514961242676 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104494, + "balance_loss_mlp": 1.00476694, + "epoch": 0.6964217006540977, + "flos": 639958674432.0, + "grad_norm": 0.03836514772463411, + "language_loss": 0.84951282, + "learning_rate": 0.00022283040258804564, + "loss": 0.85996217, + "num_input_tokens_seen": 300461840, + "router_z_loss_mlp": 0.40161133, + "step": 3620, + "time_per_iteration": 2.755397319793701 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042358, + "balance_loss_mlp": 1.00220859, + "epoch": 0.6966140823393613, + "flos": 653387826432.0, + "grad_norm": 0.036503412775040926, + "language_loss": 0.84546065, + "learning_rate": 0.00022257116259460802, + "loss": 0.85588425, + "num_input_tokens_seen": 300540400, + "router_z_loss_mlp": 0.40136719, + "step": 3621, + "time_per_iteration": 2.8644983768463135 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043603, + "balance_loss_mlp": 1.00342941, + "epoch": 0.6968064640246249, + "flos": 705825192960.0, + "grad_norm": 0.030665085995137797, + "language_loss": 0.81856084, + "learning_rate": 0.00022231203030859725, + "loss": 0.82899684, + "num_input_tokens_seen": 300624240, + "router_z_loss_mlp": 0.40161133, + "step": 3622, + "time_per_iteration": 3.017775297164917 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048188, + "balance_loss_mlp": 1.00803852, + "epoch": 0.6969988457098885, + "flos": 493531816704.0, + "grad_norm": 0.04078314735210094, + "language_loss": 0.8408944, + "learning_rate": 0.00022205300583061737, + "loss": 0.85137624, + "num_input_tokens_seen": 300689728, + "router_z_loss_mlp": 0.40136719, + "step": 3623, + "time_per_iteration": 2.5776522159576416 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045162, + "balance_loss_mlp": 1.00615692, + "epoch": 0.6971912273951519, + "flos": 1355615377920.0, + "grad_norm": 0.00769674903149883, + "language_loss": 0.82838202, + "learning_rate": 0.00022179408926123063, + "loss": 0.83883369, + "num_input_tokens_seen": 300913152, + "router_z_loss_mlp": 0.38964844, + "step": 3624, + "time_per_iteration": 4.895683288574219 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046564, + "balance_loss_mlp": 1.00653315, + "epoch": 0.6973836090804155, + "flos": 603575049984.0, + "grad_norm": 0.03550964133883238, + "language_loss": 0.77939522, + "learning_rate": 0.00022153528070095735, + "loss": 0.7898609, + "num_input_tokens_seen": 300985824, + "router_z_loss_mlp": 0.40014648, + "step": 3625, + "time_per_iteration": 2.73093581199646 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046221, + "balance_loss_mlp": 1.00628626, + "epoch": 0.6975759907656791, + "flos": 525111468288.0, + "grad_norm": 0.03728439171184861, + "language_loss": 0.88488603, + "learning_rate": 0.00022127658025027568, + "loss": 0.89534825, + "num_input_tokens_seen": 301058048, + "router_z_loss_mlp": 0.39916992, + "step": 3626, + "time_per_iteration": 2.645886182785034 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047821, + "balance_loss_mlp": 1.00771928, + "epoch": 0.6977683724509427, + "flos": 481878636288.0, + "grad_norm": 0.032998889272974, + "language_loss": 0.85482383, + "learning_rate": 0.00022101798800962258, + "loss": 0.86530197, + "num_input_tokens_seen": 301127472, + "router_z_loss_mlp": 0.40087891, + "step": 3627, + "time_per_iteration": 2.6026127338409424 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048765, + "balance_loss_mlp": 1.00875843, + "epoch": 0.6979607541362063, + "flos": 523641698304.0, + "grad_norm": 0.041862603089362516, + "language_loss": 0.79471421, + "learning_rate": 0.00022075950407939227, + "loss": 0.80520177, + "num_input_tokens_seen": 301193920, + "router_z_loss_mlp": 0.39990234, + "step": 3628, + "time_per_iteration": 2.61621356010437 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045103, + "balance_loss_mlp": 1.00514364, + "epoch": 0.6981531358214698, + "flos": 549116748288.0, + "grad_norm": 0.03728815941445965, + "language_loss": 0.83285969, + "learning_rate": 0.0002205011285599367, + "loss": 0.84331071, + "num_input_tokens_seen": 301264256, + "router_z_loss_mlp": 0.39941406, + "step": 3629, + "time_per_iteration": 2.6081953048706055 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01041921, + "balance_loss_mlp": 1.00200999, + "epoch": 0.6983455175067333, + "flos": 701276877312.0, + "grad_norm": 0.05573052179945255, + "language_loss": 0.80735791, + "learning_rate": 0.00022024286155156658, + "loss": 0.81777716, + "num_input_tokens_seen": 301337696, + "router_z_loss_mlp": 0.39892578, + "step": 3630, + "time_per_iteration": 2.828234910964966 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01041029, + "balance_loss_mlp": 1.00106966, + "epoch": 0.6985378991919969, + "flos": 486120750336.0, + "grad_norm": 0.034934255505656486, + "language_loss": 0.86530191, + "learning_rate": 0.00021998470315454994, + "loss": 0.87571216, + "num_input_tokens_seen": 301407776, + "router_z_loss_mlp": 0.39941406, + "step": 3631, + "time_per_iteration": 2.689331293106079 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01040234, + "balance_loss_mlp": 1.00034702, + "epoch": 0.6987302808772605, + "flos": 559893120000.0, + "grad_norm": 0.03380510665243889, + "language_loss": 0.86876583, + "learning_rate": 0.00021972665346911275, + "loss": 0.87916821, + "num_input_tokens_seen": 301475120, + "router_z_loss_mlp": 0.39868164, + "step": 3632, + "time_per_iteration": 2.689023017883301 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01040303, + "balance_loss_mlp": 1.00032043, + "epoch": 0.698922662562524, + "flos": 484568355072.0, + "grad_norm": 0.03644538242957212, + "language_loss": 0.80445158, + "learning_rate": 0.00021946871259543877, + "loss": 0.81485462, + "num_input_tokens_seen": 301542416, + "router_z_loss_mlp": 0.3996582, + "step": 3633, + "time_per_iteration": 2.584099292755127 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01040446, + "balance_loss_mlp": 1.00048685, + "epoch": 0.6991150442477876, + "flos": 720206976000.0, + "grad_norm": 0.03286124329654603, + "language_loss": 0.83436686, + "learning_rate": 0.00021921088063366957, + "loss": 0.84477133, + "num_input_tokens_seen": 301620672, + "router_z_loss_mlp": 0.39941406, + "step": 3634, + "time_per_iteration": 2.9156620502471924 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043215, + "balance_loss_mlp": 1.00328004, + "epoch": 0.6993074259330512, + "flos": 490160675328.0, + "grad_norm": 0.03268452893811677, + "language_loss": 0.82517856, + "learning_rate": 0.00021895315768390435, + "loss": 0.83561075, + "num_input_tokens_seen": 301688016, + "router_z_loss_mlp": 0.39916992, + "step": 3635, + "time_per_iteration": 2.5866551399230957 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051152, + "balance_loss_mlp": 1.01126432, + "epoch": 0.6994998076183148, + "flos": 719469172992.0, + "grad_norm": 0.02932000302360117, + "language_loss": 0.88269186, + "learning_rate": 0.00021869554384619999, + "loss": 0.89320338, + "num_input_tokens_seen": 301771184, + "router_z_loss_mlp": 0.39868164, + "step": 3636, + "time_per_iteration": 2.971536159515381 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050583, + "balance_loss_mlp": 1.01057589, + "epoch": 0.6996921893035783, + "flos": 580164676608.0, + "grad_norm": 0.03639524799705141, + "language_loss": 0.81240088, + "learning_rate": 0.00021843803922057115, + "loss": 0.82290673, + "num_input_tokens_seen": 301844528, + "router_z_loss_mlp": 0.39990234, + "step": 3637, + "time_per_iteration": 2.725170612335205 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050538, + "balance_loss_mlp": 1.01060295, + "epoch": 0.6998845709888418, + "flos": 519675650304.0, + "grad_norm": 0.03468807829141317, + "language_loss": 0.82837808, + "learning_rate": 0.00021818064390698977, + "loss": 0.83888352, + "num_input_tokens_seen": 301914960, + "router_z_loss_mlp": 0.39916992, + "step": 3638, + "time_per_iteration": 2.633237838745117 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043144, + "balance_loss_mlp": 1.00311363, + "epoch": 0.7000769526741054, + "flos": 622096879872.0, + "grad_norm": 0.03453806338856074, + "language_loss": 0.87273943, + "learning_rate": 0.0002179233580053861, + "loss": 0.8831709, + "num_input_tokens_seen": 301986352, + "router_z_loss_mlp": 0.40014648, + "step": 3639, + "time_per_iteration": 2.7544472217559814 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043096, + "balance_loss_mlp": 1.0029943, + "epoch": 0.700269334359369, + "flos": 561056688384.0, + "grad_norm": 0.033530662596956085, + "language_loss": 0.85948008, + "learning_rate": 0.00021766618161564688, + "loss": 0.86991107, + "num_input_tokens_seen": 302060544, + "router_z_loss_mlp": 0.40087891, + "step": 3640, + "time_per_iteration": 2.7110095024108887 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01041375, + "balance_loss_mlp": 1.00132048, + "epoch": 0.7004617160446326, + "flos": 484362275328.0, + "grad_norm": 0.03557696097422109, + "language_loss": 0.87556666, + "learning_rate": 0.00021740911483761677, + "loss": 0.88598037, + "num_input_tokens_seen": 302127232, + "router_z_loss_mlp": 0.40039062, + "step": 3641, + "time_per_iteration": 2.5866122245788574 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042265, + "balance_loss_mlp": 1.00230658, + "epoch": 0.7006540977298961, + "flos": 698322753024.0, + "grad_norm": 0.029252813269696705, + "language_loss": 0.92278117, + "learning_rate": 0.00021715215777109837, + "loss": 0.93320382, + "num_input_tokens_seen": 302207056, + "router_z_loss_mlp": 0.39941406, + "step": 3642, + "time_per_iteration": 2.9658164978027344 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055988, + "balance_loss_mlp": 1.01600468, + "epoch": 0.7008464794151597, + "flos": 505771155456.0, + "grad_norm": 0.0370639666427534, + "language_loss": 0.84983593, + "learning_rate": 0.00021689531051585103, + "loss": 0.86039579, + "num_input_tokens_seen": 302275632, + "router_z_loss_mlp": 0.3996582, + "step": 3643, + "time_per_iteration": 2.605422019958496 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055864, + "balance_loss_mlp": 1.01583362, + "epoch": 0.7010388611004232, + "flos": 538273302528.0, + "grad_norm": 0.03585337078400258, + "language_loss": 0.8111937, + "learning_rate": 0.00021663857317159196, + "loss": 0.82175231, + "num_input_tokens_seen": 302343600, + "router_z_loss_mlp": 0.40014648, + "step": 3644, + "time_per_iteration": 2.601376533508301 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053049, + "balance_loss_mlp": 1.01304281, + "epoch": 0.7012312427856868, + "flos": 548315761920.0, + "grad_norm": 0.03435070912909032, + "language_loss": 0.82316148, + "learning_rate": 0.00021638194583799487, + "loss": 0.83369195, + "num_input_tokens_seen": 302414656, + "router_z_loss_mlp": 0.39990234, + "step": 3645, + "time_per_iteration": 2.686854839324951 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046541, + "balance_loss_mlp": 1.00636709, + "epoch": 0.7014236244709504, + "flos": 942974413056.0, + "grad_norm": 0.03710405842133189, + "language_loss": 0.83184248, + "learning_rate": 0.00021612542861469176, + "loss": 0.84230787, + "num_input_tokens_seen": 302495120, + "router_z_loss_mlp": 0.40161133, + "step": 3646, + "time_per_iteration": 3.2522597312927246 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01056268, + "balance_loss_mlp": 1.01623774, + "epoch": 0.7016160061562139, + "flos": 526209907968.0, + "grad_norm": 0.03458129081843451, + "language_loss": 0.82967472, + "learning_rate": 0.00021586902160127135, + "loss": 0.84023744, + "num_input_tokens_seen": 302563024, + "router_z_loss_mlp": 0.40014648, + "step": 3647, + "time_per_iteration": 2.592898368835449 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054208, + "balance_loss_mlp": 1.01410604, + "epoch": 0.7018083878414775, + "flos": 374245165056.0, + "grad_norm": 0.045887676858618894, + "language_loss": 0.74931926, + "learning_rate": 0.00021561272489727974, + "loss": 0.75986135, + "num_input_tokens_seen": 302624544, + "router_z_loss_mlp": 0.40087891, + "step": 3648, + "time_per_iteration": 2.46085786819458 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047959, + "balance_loss_mlp": 1.00785732, + "epoch": 0.7020007695267411, + "flos": 528834498048.0, + "grad_norm": 0.03554324535987718, + "language_loss": 0.81039417, + "learning_rate": 0.0002153565386022199, + "loss": 0.82087374, + "num_input_tokens_seen": 302697856, + "router_z_loss_mlp": 0.40087891, + "step": 3649, + "time_per_iteration": 2.695328712463379 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104917, + "balance_loss_mlp": 1.00897217, + "epoch": 0.7021931512120047, + "flos": 691373423616.0, + "grad_norm": 0.035617603587249046, + "language_loss": 0.82844687, + "learning_rate": 0.00021510046281555262, + "loss": 0.83893853, + "num_input_tokens_seen": 302771984, + "router_z_loss_mlp": 0.40185547, + "step": 3650, + "time_per_iteration": 2.8195676803588867 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047858, + "balance_loss_mlp": 1.00761259, + "epoch": 0.7023855328972681, + "flos": 640926856704.0, + "grad_norm": 0.042051655567710275, + "language_loss": 0.82163751, + "learning_rate": 0.0002148444976366949, + "loss": 0.83211613, + "num_input_tokens_seen": 302838832, + "router_z_loss_mlp": 0.40234375, + "step": 3651, + "time_per_iteration": 2.7910640239715576 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049797, + "balance_loss_mlp": 1.00969481, + "epoch": 0.7025779145825317, + "flos": 562007374080.0, + "grad_norm": 0.03669409965522196, + "language_loss": 0.8294403, + "learning_rate": 0.00021458864316502136, + "loss": 0.83993822, + "num_input_tokens_seen": 302909952, + "router_z_loss_mlp": 0.40087891, + "step": 3652, + "time_per_iteration": 2.7377076148986816 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050304, + "balance_loss_mlp": 1.01020181, + "epoch": 0.7027702962677953, + "flos": 448371368448.0, + "grad_norm": 0.037398832167444995, + "language_loss": 0.87441307, + "learning_rate": 0.0002143328994998634, + "loss": 0.88491613, + "num_input_tokens_seen": 302973056, + "router_z_loss_mlp": 0.40087891, + "step": 3653, + "time_per_iteration": 2.510070323944092 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048411, + "balance_loss_mlp": 1.00833249, + "epoch": 0.7029626779530589, + "flos": 623714403840.0, + "grad_norm": 0.0361167635185571, + "language_loss": 0.78985465, + "learning_rate": 0.00021407726674050982, + "loss": 0.80033875, + "num_input_tokens_seen": 303054656, + "router_z_loss_mlp": 0.40063477, + "step": 3654, + "time_per_iteration": 2.8577005863189697 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049092, + "balance_loss_mlp": 1.00903809, + "epoch": 0.7031550596383225, + "flos": 630734697984.0, + "grad_norm": 0.031984411751134825, + "language_loss": 0.87403131, + "learning_rate": 0.0002138217449862061, + "loss": 0.88452226, + "num_input_tokens_seen": 303124256, + "router_z_loss_mlp": 0.40039062, + "step": 3655, + "time_per_iteration": 2.731257915496826 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051587, + "balance_loss_mlp": 1.01160455, + "epoch": 0.703347441323586, + "flos": 531860553984.0, + "grad_norm": 0.032014026327257146, + "language_loss": 0.7905367, + "learning_rate": 0.00021356633433615403, + "loss": 0.80105257, + "num_input_tokens_seen": 303192720, + "router_z_loss_mlp": 0.3996582, + "step": 3656, + "time_per_iteration": 2.6462786197662354 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051956, + "balance_loss_mlp": 1.01192546, + "epoch": 0.7035398230088495, + "flos": 694916618496.0, + "grad_norm": 0.025544718758457735, + "language_loss": 0.83906752, + "learning_rate": 0.0002133110348895133, + "loss": 0.84958708, + "num_input_tokens_seen": 303275968, + "router_z_loss_mlp": 0.40014648, + "step": 3657, + "time_per_iteration": 2.968036413192749 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051549, + "balance_loss_mlp": 1.01158977, + "epoch": 0.7037322046941131, + "flos": 969667466496.0, + "grad_norm": 0.030163391429171182, + "language_loss": 0.85463339, + "learning_rate": 0.0002130558467453999, + "loss": 0.8651489, + "num_input_tokens_seen": 303367296, + "router_z_loss_mlp": 0.39941406, + "step": 3658, + "time_per_iteration": 3.3951528072357178 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047875, + "balance_loss_mlp": 1.00789237, + "epoch": 0.7039245863793767, + "flos": 503926164480.0, + "grad_norm": 0.029582354045105844, + "language_loss": 0.84755009, + "learning_rate": 0.0002128007700028865, + "loss": 0.85802877, + "num_input_tokens_seen": 303442768, + "router_z_loss_mlp": 0.3996582, + "step": 3659, + "time_per_iteration": 2.754249334335327 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044711, + "balance_loss_mlp": 1.00460947, + "epoch": 0.7041169680646402, + "flos": 466938885120.0, + "grad_norm": 0.03694565934757681, + "language_loss": 0.8474158, + "learning_rate": 0.00021254580476100276, + "loss": 0.85786295, + "num_input_tokens_seen": 303508304, + "router_z_loss_mlp": 0.40087891, + "step": 3660, + "time_per_iteration": 2.576219081878662 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043606, + "balance_loss_mlp": 1.00359917, + "epoch": 0.7043093497499038, + "flos": 633322349568.0, + "grad_norm": 0.037641747763634714, + "language_loss": 0.79470807, + "learning_rate": 0.00021229095111873497, + "loss": 0.80514407, + "num_input_tokens_seen": 303579312, + "router_z_loss_mlp": 0.39990234, + "step": 3661, + "time_per_iteration": 2.7775161266326904 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043368, + "balance_loss_mlp": 1.00333726, + "epoch": 0.7045017314351674, + "flos": 544096002048.0, + "grad_norm": 0.03023690962448049, + "language_loss": 0.86693418, + "learning_rate": 0.0002120362091750261, + "loss": 0.87736779, + "num_input_tokens_seen": 303658384, + "router_z_loss_mlp": 0.40014648, + "step": 3662, + "time_per_iteration": 2.815168857574463 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042656, + "balance_loss_mlp": 1.00267351, + "epoch": 0.704694113120431, + "flos": 429141871104.0, + "grad_norm": 0.036907150984541, + "language_loss": 0.87510955, + "learning_rate": 0.00021178157902877566, + "loss": 0.88553607, + "num_input_tokens_seen": 303721136, + "router_z_loss_mlp": 0.3996582, + "step": 3663, + "time_per_iteration": 2.458578109741211 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104365, + "balance_loss_mlp": 1.00373864, + "epoch": 0.7048864948056945, + "flos": 651713922048.0, + "grad_norm": 0.04106624653226338, + "language_loss": 0.87760627, + "learning_rate": 0.0002115270607788397, + "loss": 0.88804281, + "num_input_tokens_seen": 303792368, + "router_z_loss_mlp": 0.39892578, + "step": 3664, + "time_per_iteration": 2.756804943084717 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044342, + "balance_loss_mlp": 1.00445461, + "epoch": 0.705078876490958, + "flos": 413494452480.0, + "grad_norm": 0.03442797785772838, + "language_loss": 0.86509478, + "learning_rate": 0.00021127265452403133, + "loss": 0.87553817, + "num_input_tokens_seen": 303856336, + "router_z_loss_mlp": 0.39868164, + "step": 3665, + "time_per_iteration": 2.534076690673828 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043503, + "balance_loss_mlp": 1.0045929, + "epoch": 0.7052712581762216, + "flos": 1423150943232.0, + "grad_norm": 0.008458198264264957, + "language_loss": 0.84091628, + "learning_rate": 0.0002110183603631199, + "loss": 0.85135132, + "num_input_tokens_seen": 304089856, + "router_z_loss_mlp": 0.38867188, + "step": 3666, + "time_per_iteration": 4.859800815582275 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042605, + "balance_loss_mlp": 1.00266969, + "epoch": 0.7054636398614852, + "flos": 494070342912.0, + "grad_norm": 0.037128971718994215, + "language_loss": 0.833794, + "learning_rate": 0.00021076417839483065, + "loss": 0.84422016, + "num_input_tokens_seen": 304164752, + "router_z_loss_mlp": 0.39916992, + "step": 3667, + "time_per_iteration": 2.7798430919647217 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042765, + "balance_loss_mlp": 1.00283003, + "epoch": 0.7056560215467488, + "flos": 451377982464.0, + "grad_norm": 0.031014936324499143, + "language_loss": 0.85416818, + "learning_rate": 0.00021051010871784589, + "loss": 0.86459577, + "num_input_tokens_seen": 304229568, + "router_z_loss_mlp": 0.39916992, + "step": 3668, + "time_per_iteration": 2.560455560684204 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043031, + "balance_loss_mlp": 1.00304842, + "epoch": 0.7058484032320124, + "flos": 566818149888.0, + "grad_norm": 0.030353159640158514, + "language_loss": 0.79448986, + "learning_rate": 0.0002102561514308045, + "loss": 0.8049202, + "num_input_tokens_seen": 304299408, + "router_z_loss_mlp": 0.3996582, + "step": 3669, + "time_per_iteration": 2.7246358394622803 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042421, + "balance_loss_mlp": 1.00234294, + "epoch": 0.7060407849172758, + "flos": 568103227392.0, + "grad_norm": 0.03405380367536788, + "language_loss": 0.82700998, + "learning_rate": 0.00021000230663230135, + "loss": 0.83743417, + "num_input_tokens_seen": 304367936, + "router_z_loss_mlp": 0.40063477, + "step": 3670, + "time_per_iteration": 2.6809375286102295 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104297, + "balance_loss_mlp": 1.00293946, + "epoch": 0.7062331666025394, + "flos": 469713174528.0, + "grad_norm": 0.035705889445470915, + "language_loss": 0.83772206, + "learning_rate": 0.00020974857442088762, + "loss": 0.8481518, + "num_input_tokens_seen": 304438368, + "router_z_loss_mlp": 0.40014648, + "step": 3671, + "time_per_iteration": 2.6487808227539062 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043458, + "balance_loss_mlp": 1.00330818, + "epoch": 0.706425548287803, + "flos": 596417695488.0, + "grad_norm": 0.03583731061026118, + "language_loss": 0.89143217, + "learning_rate": 0.00020949495489507104, + "loss": 0.90186673, + "num_input_tokens_seen": 304508720, + "router_z_loss_mlp": 0.40136719, + "step": 3672, + "time_per_iteration": 2.704887628555298 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043397, + "balance_loss_mlp": 1.00331914, + "epoch": 0.7066179299730666, + "flos": 476814148608.0, + "grad_norm": 0.034102097435369114, + "language_loss": 0.84997833, + "learning_rate": 0.00020924144815331525, + "loss": 0.86041224, + "num_input_tokens_seen": 304576128, + "router_z_loss_mlp": 0.40063477, + "step": 3673, + "time_per_iteration": 2.5945112705230713 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042664, + "balance_loss_mlp": 1.0026809, + "epoch": 0.7068103116583301, + "flos": 507436311552.0, + "grad_norm": 0.033684521411270194, + "language_loss": 0.83985698, + "learning_rate": 0.00020898805429404044, + "loss": 0.85028362, + "num_input_tokens_seen": 304642416, + "router_z_loss_mlp": 0.3996582, + "step": 3674, + "time_per_iteration": 2.5818920135498047 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042668, + "balance_loss_mlp": 1.00266123, + "epoch": 0.7070026933435937, + "flos": 680575664640.0, + "grad_norm": 0.03512873001655734, + "language_loss": 0.78734016, + "learning_rate": 0.0002087347734156228, + "loss": 0.7977668, + "num_input_tokens_seen": 304719312, + "router_z_loss_mlp": 0.39990234, + "step": 3675, + "time_per_iteration": 2.8316643238067627 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044552, + "balance_loss_mlp": 1.00447345, + "epoch": 0.7071950750288573, + "flos": 473166941184.0, + "grad_norm": 0.03289895415072129, + "language_loss": 0.79907787, + "learning_rate": 0.00020848160561639452, + "loss": 0.8095234, + "num_input_tokens_seen": 304789296, + "router_z_loss_mlp": 0.40063477, + "step": 3676, + "time_per_iteration": 2.662691354751587 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043714, + "balance_loss_mlp": 1.00358856, + "epoch": 0.7073874567141208, + "flos": 474684343296.0, + "grad_norm": 0.031178438211795275, + "language_loss": 0.86372793, + "learning_rate": 0.0002082285509946445, + "loss": 0.87416512, + "num_input_tokens_seen": 304854320, + "router_z_loss_mlp": 0.40112305, + "step": 3677, + "time_per_iteration": 2.54286789894104 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043907, + "balance_loss_mlp": 1.0038054, + "epoch": 0.7075798383993844, + "flos": 547037487360.0, + "grad_norm": 0.033007214142821914, + "language_loss": 0.83766264, + "learning_rate": 0.00020797560964861683, + "loss": 0.84810174, + "num_input_tokens_seen": 304932784, + "router_z_loss_mlp": 0.40087891, + "step": 3678, + "time_per_iteration": 2.7636282444000244 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043429, + "balance_loss_mlp": 1.00335097, + "epoch": 0.7077722200846479, + "flos": 663391401984.0, + "grad_norm": 0.033779282823635445, + "language_loss": 0.81209165, + "learning_rate": 0.0002077227816765122, + "loss": 0.82252598, + "num_input_tokens_seen": 305018080, + "router_z_loss_mlp": 0.40063477, + "step": 3679, + "time_per_iteration": 3.0056393146514893 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047443, + "balance_loss_mlp": 1.00824738, + "epoch": 0.7079646017699115, + "flos": 1533303046656.0, + "grad_norm": 0.005266739458106997, + "language_loss": 0.76447725, + "learning_rate": 0.0002074700671764869, + "loss": 0.7749517, + "num_input_tokens_seen": 305241216, + "router_z_loss_mlp": 0.39160156, + "step": 3680, + "time_per_iteration": 4.76727819442749 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104493, + "balance_loss_mlp": 1.00492287, + "epoch": 0.7081569834551751, + "flos": 622646099712.0, + "grad_norm": 0.03129589389619307, + "language_loss": 0.78969026, + "learning_rate": 0.00020721746624665383, + "loss": 0.80013955, + "num_input_tokens_seen": 305311376, + "router_z_loss_mlp": 0.39990234, + "step": 3681, + "time_per_iteration": 2.72866153717041 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044317, + "balance_loss_mlp": 1.00419092, + "epoch": 0.7083493651404387, + "flos": 796035273984.0, + "grad_norm": 0.031303476473040825, + "language_loss": 0.80593359, + "learning_rate": 0.00020696497898508114, + "loss": 0.81637675, + "num_input_tokens_seen": 305392736, + "router_z_loss_mlp": 0.40112305, + "step": 3682, + "time_per_iteration": 3.041132926940918 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044378, + "balance_loss_mlp": 1.00425231, + "epoch": 0.7085417468257021, + "flos": 815162704128.0, + "grad_norm": 0.03799512363441117, + "language_loss": 0.78282857, + "learning_rate": 0.00020671260548979316, + "loss": 0.79327232, + "num_input_tokens_seen": 305470896, + "router_z_loss_mlp": 0.40112305, + "step": 3683, + "time_per_iteration": 2.980470895767212 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046933, + "balance_loss_mlp": 1.00675917, + "epoch": 0.7087341285109657, + "flos": 701797906944.0, + "grad_norm": 0.03765603647775186, + "language_loss": 0.85959506, + "learning_rate": 0.00020646034585876982, + "loss": 0.87006438, + "num_input_tokens_seen": 305547072, + "router_z_loss_mlp": 0.40161133, + "step": 3684, + "time_per_iteration": 2.83225417137146 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043586, + "balance_loss_mlp": 1.00350797, + "epoch": 0.7089265101962293, + "flos": 597735820800.0, + "grad_norm": 0.030001144776417084, + "language_loss": 0.8503226, + "learning_rate": 0.00020620820018994718, + "loss": 0.86075842, + "num_input_tokens_seen": 305624512, + "router_z_loss_mlp": 0.40063477, + "step": 3685, + "time_per_iteration": 2.808814287185669 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043428, + "balance_loss_mlp": 1.00334978, + "epoch": 0.7091188918814929, + "flos": 488167930368.0, + "grad_norm": 0.039691244265052834, + "language_loss": 0.82984829, + "learning_rate": 0.00020595616858121675, + "loss": 0.84028256, + "num_input_tokens_seen": 305695088, + "router_z_loss_mlp": 0.40063477, + "step": 3686, + "time_per_iteration": 2.696423292160034 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104324, + "balance_loss_mlp": 1.00316179, + "epoch": 0.7093112735667565, + "flos": 601256661504.0, + "grad_norm": 0.03416651463344776, + "language_loss": 0.81164849, + "learning_rate": 0.00020570425113042586, + "loss": 0.82208097, + "num_input_tokens_seen": 305763680, + "router_z_loss_mlp": 0.40063477, + "step": 3687, + "time_per_iteration": 2.735722303390503 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042849, + "balance_loss_mlp": 1.00281906, + "epoch": 0.70950365525202, + "flos": 506850153216.0, + "grad_norm": 0.03675476987666338, + "language_loss": 0.86545879, + "learning_rate": 0.0002054524479353776, + "loss": 0.87588727, + "num_input_tokens_seen": 305835008, + "router_z_loss_mlp": 0.40014648, + "step": 3688, + "time_per_iteration": 2.6537790298461914 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042758, + "balance_loss_mlp": 1.0026803, + "epoch": 0.7096960369372836, + "flos": 733425190656.0, + "grad_norm": 0.03699911632186226, + "language_loss": 0.81610233, + "learning_rate": 0.00020520075909383063, + "loss": 0.82652992, + "num_input_tokens_seen": 305909072, + "router_z_loss_mlp": 0.40063477, + "step": 3689, + "time_per_iteration": 2.8920962810516357 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045653, + "balance_loss_mlp": 1.00576556, + "epoch": 0.7098884186225471, + "flos": 973652956416.0, + "grad_norm": 0.0320857463001868, + "language_loss": 0.811288, + "learning_rate": 0.00020494918470349916, + "loss": 0.82174444, + "num_input_tokens_seen": 305994752, + "router_z_loss_mlp": 0.39868164, + "step": 3690, + "time_per_iteration": 3.3136045932769775 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045022, + "balance_loss_mlp": 1.00513482, + "epoch": 0.7100808003078107, + "flos": 505258874112.0, + "grad_norm": 0.03898509483209187, + "language_loss": 0.86111224, + "learning_rate": 0.00020469772486205297, + "loss": 0.87156248, + "num_input_tokens_seen": 306062960, + "router_z_loss_mlp": 0.39868164, + "step": 3691, + "time_per_iteration": 2.6186795234680176 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047063, + "balance_loss_mlp": 1.00715148, + "epoch": 0.7102731819930742, + "flos": 541390732032.0, + "grad_norm": 0.07359850513533242, + "language_loss": 0.81684911, + "learning_rate": 0.0002044463796671177, + "loss": 0.82731974, + "num_input_tokens_seen": 306134224, + "router_z_loss_mlp": 0.39892578, + "step": 3692, + "time_per_iteration": 2.7347307205200195 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047542, + "balance_loss_mlp": 1.00767887, + "epoch": 0.7104655636783378, + "flos": 621628339968.0, + "grad_norm": 0.03494472731168418, + "language_loss": 0.80876124, + "learning_rate": 0.00020419514921627408, + "loss": 0.8192367, + "num_input_tokens_seen": 306214512, + "router_z_loss_mlp": 0.3984375, + "step": 3693, + "time_per_iteration": 2.9353420734405518 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045573, + "balance_loss_mlp": 1.00568545, + "epoch": 0.7106579453636014, + "flos": 558377663232.0, + "grad_norm": 0.034076048259573104, + "language_loss": 0.77580255, + "learning_rate": 0.00020394403360705855, + "loss": 0.78625828, + "num_input_tokens_seen": 306283232, + "router_z_loss_mlp": 0.39868164, + "step": 3694, + "time_per_iteration": 2.7425014972686768 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01041714, + "balance_loss_mlp": 1.00187469, + "epoch": 0.710850327048865, + "flos": 514063888128.0, + "grad_norm": 0.03425732262265505, + "language_loss": 0.88495499, + "learning_rate": 0.00020369303293696228, + "loss": 0.89537215, + "num_input_tokens_seen": 306351536, + "router_z_loss_mlp": 0.39819336, + "step": 3695, + "time_per_iteration": 2.6524975299835205 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01041228, + "balance_loss_mlp": 1.00138831, + "epoch": 0.7110427087341286, + "flos": 424507039488.0, + "grad_norm": 0.03544655381873144, + "language_loss": 0.78715348, + "learning_rate": 0.00020344214730343304, + "loss": 0.79756576, + "num_input_tokens_seen": 306419040, + "router_z_loss_mlp": 0.39819336, + "step": 3696, + "time_per_iteration": 2.5949435234069824 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044579, + "balance_loss_mlp": 1.0046916, + "epoch": 0.711235090419392, + "flos": 578654077440.0, + "grad_norm": 0.028723552959570162, + "language_loss": 0.79433203, + "learning_rate": 0.00020319137680387296, + "loss": 0.80477786, + "num_input_tokens_seen": 306503248, + "router_z_loss_mlp": 0.39868164, + "step": 3697, + "time_per_iteration": 2.9308555126190186 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044656, + "balance_loss_mlp": 1.00476861, + "epoch": 0.7114274721046556, + "flos": 448985716992.0, + "grad_norm": 0.03974363326367457, + "language_loss": 0.81048799, + "learning_rate": 0.0002029407215356398, + "loss": 0.82093453, + "num_input_tokens_seen": 306566288, + "router_z_loss_mlp": 0.39868164, + "step": 3698, + "time_per_iteration": 2.51846981048584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047508, + "balance_loss_mlp": 1.00747764, + "epoch": 0.7116198537899192, + "flos": 623093252352.0, + "grad_norm": 0.03573092214562991, + "language_loss": 0.83794999, + "learning_rate": 0.00020269018159604663, + "loss": 0.84842503, + "num_input_tokens_seen": 306633344, + "router_z_loss_mlp": 0.40014648, + "step": 3699, + "time_per_iteration": 2.7074286937713623 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047537, + "balance_loss_mlp": 1.00760162, + "epoch": 0.7118122354751828, + "flos": 499720988928.0, + "grad_norm": 0.03677211843520988, + "language_loss": 0.82181633, + "learning_rate": 0.00020243975708236162, + "loss": 0.83229172, + "num_input_tokens_seen": 306701328, + "router_z_loss_mlp": 0.39916992, + "step": 3700, + "time_per_iteration": 2.564375877380371 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046517, + "balance_loss_mlp": 1.00660574, + "epoch": 0.7120046171604463, + "flos": 573845246976.0, + "grad_norm": 0.03454353277878698, + "language_loss": 0.86407083, + "learning_rate": 0.00020218944809180818, + "loss": 0.87453598, + "num_input_tokens_seen": 306773168, + "router_z_loss_mlp": 0.39892578, + "step": 3701, + "time_per_iteration": 2.7084884643554688 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046652, + "balance_loss_mlp": 1.00657344, + "epoch": 0.7121969988457099, + "flos": 573771369984.0, + "grad_norm": 0.03303682180607054, + "language_loss": 0.8533892, + "learning_rate": 0.00020193925472156493, + "loss": 0.86385572, + "num_input_tokens_seen": 306845312, + "router_z_loss_mlp": 0.40063477, + "step": 3702, + "time_per_iteration": 2.7079381942749023 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044945, + "balance_loss_mlp": 1.00603485, + "epoch": 0.7123893805309734, + "flos": 1526823224064.0, + "grad_norm": 0.008337798105396301, + "language_loss": 0.74289167, + "learning_rate": 0.00020168917706876537, + "loss": 0.75334108, + "num_input_tokens_seen": 307079216, + "router_z_loss_mlp": 0.38867188, + "step": 3703, + "time_per_iteration": 4.8932740688323975 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01057166, + "balance_loss_mlp": 1.01696837, + "epoch": 0.712581762216237, + "flos": 616414152960.0, + "grad_norm": 0.03156423949245577, + "language_loss": 0.84361899, + "learning_rate": 0.00020143921523049863, + "loss": 0.85419071, + "num_input_tokens_seen": 307163568, + "router_z_loss_mlp": 0.40185547, + "step": 3704, + "time_per_iteration": 2.9233312606811523 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052935, + "balance_loss_mlp": 1.01285696, + "epoch": 0.7127741439015006, + "flos": 598875089664.0, + "grad_norm": 0.03941549169831495, + "language_loss": 0.84401309, + "learning_rate": 0.00020118936930380837, + "loss": 0.85454243, + "num_input_tokens_seen": 307232800, + "router_z_loss_mlp": 0.40063477, + "step": 3705, + "time_per_iteration": 2.7015953063964844 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047968, + "balance_loss_mlp": 1.00774693, + "epoch": 0.7129665255867641, + "flos": 538440498432.0, + "grad_norm": 0.03692779593562928, + "language_loss": 0.81897098, + "learning_rate": 0.0002009396393856932, + "loss": 0.82945073, + "num_input_tokens_seen": 307307216, + "router_z_loss_mlp": 0.40209961, + "step": 3706, + "time_per_iteration": 2.649216890335083 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047729, + "balance_loss_mlp": 1.00746036, + "epoch": 0.7131589072720277, + "flos": 527521230336.0, + "grad_norm": 0.035672100544370096, + "language_loss": 0.82740968, + "learning_rate": 0.00020069002557310673, + "loss": 0.83788699, + "num_input_tokens_seen": 307377472, + "router_z_loss_mlp": 0.40258789, + "step": 3707, + "time_per_iteration": 2.670691967010498 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043206, + "balance_loss_mlp": 1.00281823, + "epoch": 0.7133512889572913, + "flos": 532097736192.0, + "grad_norm": 0.0323096227749812, + "language_loss": 0.77545685, + "learning_rate": 0.00020044052796295807, + "loss": 0.78588891, + "num_input_tokens_seen": 307456880, + "router_z_loss_mlp": 0.40380859, + "step": 3708, + "time_per_iteration": 2.791064500808716 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048487, + "balance_loss_mlp": 1.00821805, + "epoch": 0.7135436706425549, + "flos": 504551206656.0, + "grad_norm": 0.04325770643622515, + "language_loss": 0.82374418, + "learning_rate": 0.00020019114665211063, + "loss": 0.83422899, + "num_input_tokens_seen": 307524784, + "router_z_loss_mlp": 0.40258789, + "step": 3709, + "time_per_iteration": 2.6297860145568848 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046781, + "balance_loss_mlp": 1.00648808, + "epoch": 0.7137360523278183, + "flos": 516968434944.0, + "grad_norm": 0.035345949050593885, + "language_loss": 0.81970435, + "learning_rate": 0.00019994188173738276, + "loss": 0.83017212, + "num_input_tokens_seen": 307591408, + "router_z_loss_mlp": 0.40283203, + "step": 3710, + "time_per_iteration": 2.6330204010009766 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047924, + "balance_loss_mlp": 1.00755966, + "epoch": 0.7139284340130819, + "flos": 511537474560.0, + "grad_norm": 0.03739330083001905, + "language_loss": 0.81062478, + "learning_rate": 0.0001996927333155477, + "loss": 0.82110405, + "num_input_tokens_seen": 307662912, + "router_z_loss_mlp": 0.40356445, + "step": 3711, + "time_per_iteration": 2.74644136428833 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049541, + "balance_loss_mlp": 1.0092001, + "epoch": 0.7141208156983455, + "flos": 891800736768.0, + "grad_norm": 0.03143322017513776, + "language_loss": 0.85805249, + "learning_rate": 0.00019944370148333346, + "loss": 0.86854792, + "num_input_tokens_seen": 307752256, + "router_z_loss_mlp": 0.40332031, + "step": 3712, + "time_per_iteration": 3.1481471061706543 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049102, + "balance_loss_mlp": 1.00871384, + "epoch": 0.7143131973836091, + "flos": 536884212480.0, + "grad_norm": 0.034489718193939395, + "language_loss": 0.80643392, + "learning_rate": 0.00019919478633742278, + "loss": 0.81692493, + "num_input_tokens_seen": 307821504, + "router_z_loss_mlp": 0.40380859, + "step": 3713, + "time_per_iteration": 2.6485395431518555 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048925, + "balance_loss_mlp": 1.00848949, + "epoch": 0.7145055790688727, + "flos": 474627962880.0, + "grad_norm": 0.04039016318386717, + "language_loss": 0.85767764, + "learning_rate": 0.00019894598797445302, + "loss": 0.86816686, + "num_input_tokens_seen": 307886464, + "router_z_loss_mlp": 0.40429688, + "step": 3714, + "time_per_iteration": 2.5401811599731445 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050713, + "balance_loss_mlp": 1.01037288, + "epoch": 0.7146979607541362, + "flos": 571702802688.0, + "grad_norm": 0.03221862991626059, + "language_loss": 0.82471192, + "learning_rate": 0.00019869730649101615, + "loss": 0.83521909, + "num_input_tokens_seen": 307962736, + "router_z_loss_mlp": 0.40332031, + "step": 3715, + "time_per_iteration": 2.75704288482666 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105074, + "balance_loss_mlp": 1.0103991, + "epoch": 0.7148903424393998, + "flos": 841139341824.0, + "grad_norm": 0.03811132383920714, + "language_loss": 0.72900105, + "learning_rate": 0.00019844874198365943, + "loss": 0.73950851, + "num_input_tokens_seen": 308046592, + "router_z_loss_mlp": 0.40332031, + "step": 3716, + "time_per_iteration": 3.115915536880493 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049607, + "balance_loss_mlp": 1.00921834, + "epoch": 0.7150827241246633, + "flos": 542879943936.0, + "grad_norm": 0.037838986549668586, + "language_loss": 0.84377575, + "learning_rate": 0.00019820029454888362, + "loss": 0.85427183, + "num_input_tokens_seen": 308119920, + "router_z_loss_mlp": 0.40380859, + "step": 3717, + "time_per_iteration": 2.7640199661254883 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052212, + "balance_loss_mlp": 1.01282501, + "epoch": 0.7152751058099269, + "flos": 1587190741248.0, + "grad_norm": 0.009155096775058921, + "language_loss": 0.74521267, + "learning_rate": 0.00019795196428314455, + "loss": 0.7557348, + "num_input_tokens_seen": 308361024, + "router_z_loss_mlp": 0.39355469, + "step": 3718, + "time_per_iteration": 5.020099639892578 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043306, + "balance_loss_mlp": 1.00296581, + "epoch": 0.7154674874951905, + "flos": 518429456640.0, + "grad_norm": 0.0370915875215028, + "language_loss": 0.80511153, + "learning_rate": 0.0001977037512828529, + "loss": 0.81554461, + "num_input_tokens_seen": 308429808, + "router_z_loss_mlp": 0.40332031, + "step": 3719, + "time_per_iteration": 2.593027114868164 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043585, + "balance_loss_mlp": 1.00326824, + "epoch": 0.715659869180454, + "flos": 603640178688.0, + "grad_norm": 0.03300286270545162, + "language_loss": 0.86582744, + "learning_rate": 0.0001974556556443734, + "loss": 0.87626332, + "num_input_tokens_seen": 308501888, + "router_z_loss_mlp": 0.40307617, + "step": 3720, + "time_per_iteration": 2.725634813308716 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047164, + "balance_loss_mlp": 1.0068953, + "epoch": 0.7158522508657176, + "flos": 532770410496.0, + "grad_norm": 0.029643200911988788, + "language_loss": 0.89179665, + "learning_rate": 0.00019720767746402547, + "loss": 0.90226829, + "num_input_tokens_seen": 308576368, + "router_z_loss_mlp": 0.40258789, + "step": 3721, + "time_per_iteration": 2.727351188659668 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061105, + "balance_loss_mlp": 1.02069271, + "epoch": 0.7160446325509812, + "flos": 558646926336.0, + "grad_norm": 0.03644218382348141, + "language_loss": 0.80571723, + "learning_rate": 0.00019695981683808222, + "loss": 0.81632823, + "num_input_tokens_seen": 308651936, + "router_z_loss_mlp": 0.40405273, + "step": 3722, + "time_per_iteration": 2.7068886756896973 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01056867, + "balance_loss_mlp": 1.01662219, + "epoch": 0.7162370142362448, + "flos": 692283280128.0, + "grad_norm": 0.03246359808294338, + "language_loss": 0.85348076, + "learning_rate": 0.00019671207386277225, + "loss": 0.86404943, + "num_input_tokens_seen": 308737264, + "router_z_loss_mlp": 0.40234375, + "step": 3723, + "time_per_iteration": 2.9236690998077393 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046407, + "balance_loss_mlp": 1.00611401, + "epoch": 0.7164293959215082, + "flos": 795459809280.0, + "grad_norm": 0.035040971125857495, + "language_loss": 0.78785622, + "learning_rate": 0.0001964644486342777, + "loss": 0.79832029, + "num_input_tokens_seen": 308811776, + "router_z_loss_mlp": 0.40283203, + "step": 3724, + "time_per_iteration": 2.9631621837615967 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045406, + "balance_loss_mlp": 1.00506568, + "epoch": 0.7166217776067718, + "flos": 495205721088.0, + "grad_norm": 0.03180638125163834, + "language_loss": 0.86850977, + "learning_rate": 0.00019621694124873524, + "loss": 0.87896389, + "num_input_tokens_seen": 308886704, + "router_z_loss_mlp": 0.40332031, + "step": 3725, + "time_per_iteration": 2.6598877906799316 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049065, + "balance_loss_mlp": 1.00958252, + "epoch": 0.7168141592920354, + "flos": 1403964220416.0, + "grad_norm": 0.007874165171020433, + "language_loss": 0.76540077, + "learning_rate": 0.00019596955180223557, + "loss": 0.77589142, + "num_input_tokens_seen": 309113456, + "router_z_loss_mlp": 0.39453125, + "step": 3726, + "time_per_iteration": 4.864764451980591 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049401, + "balance_loss_mlp": 1.00922716, + "epoch": 0.717006540977299, + "flos": 794600497152.0, + "grad_norm": 0.03337333426789978, + "language_loss": 0.77893984, + "learning_rate": 0.00019572228039082428, + "loss": 0.78943384, + "num_input_tokens_seen": 309198768, + "router_z_loss_mlp": 0.40161133, + "step": 3727, + "time_per_iteration": 3.107271432876587 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050786, + "balance_loss_mlp": 1.01066017, + "epoch": 0.7171989226625626, + "flos": 555964010496.0, + "grad_norm": 0.028215345270395674, + "language_loss": 0.84187287, + "learning_rate": 0.0001954751271105002, + "loss": 0.85238069, + "num_input_tokens_seen": 309279680, + "router_z_loss_mlp": 0.40112305, + "step": 3728, + "time_per_iteration": 2.8074874877929688 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049671, + "balance_loss_mlp": 1.00940251, + "epoch": 0.717391304347826, + "flos": 557062450176.0, + "grad_norm": 0.03474148956732634, + "language_loss": 0.81498766, + "learning_rate": 0.00019522809205721687, + "loss": 0.8254844, + "num_input_tokens_seen": 309359152, + "router_z_loss_mlp": 0.40258789, + "step": 3729, + "time_per_iteration": 2.736825704574585 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048977, + "balance_loss_mlp": 1.00885069, + "epoch": 0.7175836860330896, + "flos": 539955955200.0, + "grad_norm": 0.033940302209900526, + "language_loss": 0.83540523, + "learning_rate": 0.0001949811753268816, + "loss": 0.84589505, + "num_input_tokens_seen": 309432800, + "router_z_loss_mlp": 0.40112305, + "step": 3730, + "time_per_iteration": 2.732431173324585 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047309, + "balance_loss_mlp": 1.00720656, + "epoch": 0.7177760677183532, + "flos": 516651539712.0, + "grad_norm": 0.04023515024908783, + "language_loss": 0.83238113, + "learning_rate": 0.00019473437701535634, + "loss": 0.8428542, + "num_input_tokens_seen": 309499456, + "router_z_loss_mlp": 0.40087891, + "step": 3731, + "time_per_iteration": 2.608720064163208 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044599, + "balance_loss_mlp": 1.00444937, + "epoch": 0.7179684494036168, + "flos": 675940833024.0, + "grad_norm": 0.03223034722468918, + "language_loss": 0.90125024, + "learning_rate": 0.00019448769721845677, + "loss": 0.9116962, + "num_input_tokens_seen": 309571056, + "router_z_loss_mlp": 0.40136719, + "step": 3732, + "time_per_iteration": 2.8010287284851074 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043647, + "balance_loss_mlp": 1.00342619, + "epoch": 0.7181608310888803, + "flos": 470876742912.0, + "grad_norm": 0.03459418465075036, + "language_loss": 0.86262, + "learning_rate": 0.00019424113603195203, + "loss": 0.87305647, + "num_input_tokens_seen": 309635040, + "router_z_loss_mlp": 0.40209961, + "step": 3733, + "time_per_iteration": 2.5431971549987793 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044053, + "balance_loss_mlp": 1.0037843, + "epoch": 0.7183532127741439, + "flos": 595185107712.0, + "grad_norm": 0.037144823365086815, + "language_loss": 0.8025893, + "learning_rate": 0.0001939946935515657, + "loss": 0.81302989, + "num_input_tokens_seen": 309713696, + "router_z_loss_mlp": 0.40258789, + "step": 3734, + "time_per_iteration": 2.8843894004821777 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045582, + "balance_loss_mlp": 1.00533688, + "epoch": 0.7185455944594075, + "flos": 499916375040.0, + "grad_norm": 0.03883855208122221, + "language_loss": 0.8098954, + "learning_rate": 0.0001937483698729755, + "loss": 0.82035124, + "num_input_tokens_seen": 309782864, + "router_z_loss_mlp": 0.40234375, + "step": 3735, + "time_per_iteration": 2.6381587982177734 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042706, + "balance_loss_mlp": 1.00243688, + "epoch": 0.718737976144671, + "flos": 816308775936.0, + "grad_norm": 0.032230667359085925, + "language_loss": 0.82948256, + "learning_rate": 0.0001935021650918128, + "loss": 0.83990961, + "num_input_tokens_seen": 309867056, + "router_z_loss_mlp": 0.40258789, + "step": 3736, + "time_per_iteration": 3.0015594959259033 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043327, + "balance_loss_mlp": 1.00332057, + "epoch": 0.7189303578299346, + "flos": 439240710912.0, + "grad_norm": 0.03694442625738843, + "language_loss": 0.87466842, + "learning_rate": 0.0001932560793036625, + "loss": 0.88510168, + "num_input_tokens_seen": 309929744, + "router_z_loss_mlp": 0.39990234, + "step": 3737, + "time_per_iteration": 2.522517204284668 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043371, + "balance_loss_mlp": 1.00341213, + "epoch": 0.7191227395151981, + "flos": 550447512576.0, + "grad_norm": 0.0396546540063306, + "language_loss": 0.86941743, + "learning_rate": 0.00019301011260406382, + "loss": 0.87985116, + "num_input_tokens_seen": 309998128, + "router_z_loss_mlp": 0.39941406, + "step": 3738, + "time_per_iteration": 2.6374080181121826 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046219, + "balance_loss_mlp": 1.00616467, + "epoch": 0.7193151212004617, + "flos": 628081917696.0, + "grad_norm": 0.032473190286521646, + "language_loss": 0.80187446, + "learning_rate": 0.00019276426508850936, + "loss": 0.81233668, + "num_input_tokens_seen": 310065472, + "router_z_loss_mlp": 0.40039062, + "step": 3739, + "time_per_iteration": 2.7331862449645996 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046281, + "balance_loss_mlp": 1.00620306, + "epoch": 0.7195075028857253, + "flos": 742441142016.0, + "grad_norm": 0.03365291152671841, + "language_loss": 0.80674922, + "learning_rate": 0.00019251853685244564, + "loss": 0.8172121, + "num_input_tokens_seen": 310152960, + "router_z_loss_mlp": 0.40063477, + "step": 3740, + "time_per_iteration": 3.040309429168701 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044377, + "balance_loss_mlp": 1.00410771, + "epoch": 0.7196998845709889, + "flos": 804291068160.0, + "grad_norm": 0.03612611127551407, + "language_loss": 0.81356812, + "learning_rate": 0.00019227292799127283, + "loss": 0.82401186, + "num_input_tokens_seen": 310234080, + "router_z_loss_mlp": 0.40258789, + "step": 3741, + "time_per_iteration": 3.0432052612304688 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044387, + "balance_loss_mlp": 1.00416589, + "epoch": 0.7198922662562524, + "flos": 926777774592.0, + "grad_norm": 0.036362359760093145, + "language_loss": 0.79752231, + "learning_rate": 0.00019202743860034454, + "loss": 0.80796617, + "num_input_tokens_seen": 310330208, + "router_z_loss_mlp": 0.40209961, + "step": 3742, + "time_per_iteration": 3.223635196685791 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049155, + "balance_loss_mlp": 1.0088625, + "epoch": 0.7200846479415159, + "flos": 581208681216.0, + "grad_norm": 0.0348094997574978, + "language_loss": 0.84359837, + "learning_rate": 0.00019178206877496873, + "loss": 0.85408992, + "num_input_tokens_seen": 310402960, + "router_z_loss_mlp": 0.40283203, + "step": 3743, + "time_per_iteration": 2.6937367916107178 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045555, + "balance_loss_mlp": 1.0053103, + "epoch": 0.7202770296267795, + "flos": 558840367104.0, + "grad_norm": 0.028995122197605715, + "language_loss": 0.85587943, + "learning_rate": 0.0001915368186104059, + "loss": 0.86633497, + "num_input_tokens_seen": 310479776, + "router_z_loss_mlp": 0.40234375, + "step": 3744, + "time_per_iteration": 2.737929582595825 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047401, + "balance_loss_mlp": 1.00722742, + "epoch": 0.7204694113120431, + "flos": 673772143872.0, + "grad_norm": 0.03601847406415609, + "language_loss": 0.81636101, + "learning_rate": 0.0001912916882018706, + "loss": 0.82683504, + "num_input_tokens_seen": 310555952, + "router_z_loss_mlp": 0.40161133, + "step": 3745, + "time_per_iteration": 2.8627820014953613 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010475, + "balance_loss_mlp": 1.00727844, + "epoch": 0.7206617929973067, + "flos": 800596228608.0, + "grad_norm": 0.04088395220221656, + "language_loss": 0.80132556, + "learning_rate": 0.00019104667764453125, + "loss": 0.8118006, + "num_input_tokens_seen": 310634784, + "router_z_loss_mlp": 0.40209961, + "step": 3746, + "time_per_iteration": 3.0283303260803223 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050376, + "balance_loss_mlp": 1.01020253, + "epoch": 0.7208541746825702, + "flos": 532939551744.0, + "grad_norm": 0.030159350032508997, + "language_loss": 0.80461586, + "learning_rate": 0.00019080178703350926, + "loss": 0.81511962, + "num_input_tokens_seen": 310703216, + "router_z_loss_mlp": 0.40161133, + "step": 3747, + "time_per_iteration": 2.6268179416656494 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049649, + "balance_loss_mlp": 1.00945175, + "epoch": 0.7210465563678338, + "flos": 536169742080.0, + "grad_norm": 0.034039887094515435, + "language_loss": 0.83305407, + "learning_rate": 0.00019055701646387952, + "loss": 0.84355056, + "num_input_tokens_seen": 310776816, + "router_z_loss_mlp": 0.40185547, + "step": 3748, + "time_per_iteration": 2.642871618270874 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050407, + "balance_loss_mlp": 1.0114975, + "epoch": 0.7212389380530974, + "flos": 1537249652736.0, + "grad_norm": 0.008513050614024542, + "language_loss": 0.80472684, + "learning_rate": 0.00019031236603067042, + "loss": 0.81523097, + "num_input_tokens_seen": 310987056, + "router_z_loss_mlp": 0.38867188, + "step": 3749, + "time_per_iteration": 4.767102003097534 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046475, + "balance_loss_mlp": 1.00627732, + "epoch": 0.7214313197383609, + "flos": 462453752832.0, + "grad_norm": 0.03442724668025846, + "language_loss": 0.86840045, + "learning_rate": 0.00019006783582886368, + "loss": 0.87886518, + "num_input_tokens_seen": 311051648, + "router_z_loss_mlp": 0.40185547, + "step": 3750, + "time_per_iteration": 2.5307884216308594 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044143, + "balance_loss_mlp": 1.00384998, + "epoch": 0.7216237014236244, + "flos": 1038913874688.0, + "grad_norm": 0.03633272884659257, + "language_loss": 0.83278096, + "learning_rate": 0.00018982342595339437, + "loss": 0.84322238, + "num_input_tokens_seen": 311146272, + "router_z_loss_mlp": 0.40283203, + "step": 3751, + "time_per_iteration": 3.5147032737731934 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044645, + "balance_loss_mlp": 1.00437641, + "epoch": 0.721816083108888, + "flos": 897451382784.0, + "grad_norm": 0.033868816355573705, + "language_loss": 0.82631296, + "learning_rate": 0.00018957913649915076, + "loss": 0.83675945, + "num_input_tokens_seen": 311223760, + "router_z_loss_mlp": 0.40258789, + "step": 3752, + "time_per_iteration": 3.1239399909973145 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044756, + "balance_loss_mlp": 1.00446343, + "epoch": 0.7220084647941516, + "flos": 524312427264.0, + "grad_norm": 0.03748349952969219, + "language_loss": 0.80553722, + "learning_rate": 0.00018933496756097428, + "loss": 0.81598485, + "num_input_tokens_seen": 311290336, + "router_z_loss_mlp": 0.40283203, + "step": 3753, + "time_per_iteration": 2.6250908374786377 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045195, + "balance_loss_mlp": 1.00487828, + "epoch": 0.7222008464794152, + "flos": 817472344320.0, + "grad_norm": 0.035953196977106826, + "language_loss": 0.82196552, + "learning_rate": 0.0001890909192336603, + "loss": 0.83241749, + "num_input_tokens_seen": 311366240, + "router_z_loss_mlp": 0.40307617, + "step": 3754, + "time_per_iteration": 3.015929698944092 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104444, + "balance_loss_mlp": 1.00417137, + "epoch": 0.7223932281646788, + "flos": 750373238016.0, + "grad_norm": 0.03340807501662783, + "language_loss": 0.70701879, + "learning_rate": 0.00018884699161195623, + "loss": 0.7174632, + "num_input_tokens_seen": 311445184, + "router_z_loss_mlp": 0.40258789, + "step": 3755, + "time_per_iteration": 2.934309959411621 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043188, + "balance_loss_mlp": 1.00279963, + "epoch": 0.7225856098499422, + "flos": 746989457664.0, + "grad_norm": 0.03539660333033103, + "language_loss": 0.77625644, + "learning_rate": 0.00018860318479056327, + "loss": 0.78668833, + "num_input_tokens_seen": 311527280, + "router_z_loss_mlp": 0.40380859, + "step": 3756, + "time_per_iteration": 3.092843770980835 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045659, + "balance_loss_mlp": 1.00541389, + "epoch": 0.7227779915352058, + "flos": 548435325696.0, + "grad_norm": 0.03162886339795087, + "language_loss": 0.84069121, + "learning_rate": 0.00018835949886413555, + "loss": 0.85114777, + "num_input_tokens_seen": 311601552, + "router_z_loss_mlp": 0.40234375, + "step": 3757, + "time_per_iteration": 2.697178602218628 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047884, + "balance_loss_mlp": 1.00756705, + "epoch": 0.7229703732204694, + "flos": 531506720256.0, + "grad_norm": 0.03673346832571833, + "language_loss": 0.78688115, + "learning_rate": 0.0001881159339272806, + "loss": 0.79735994, + "num_input_tokens_seen": 311670736, + "router_z_loss_mlp": 0.40307617, + "step": 3758, + "time_per_iteration": 2.672168731689453 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046216, + "balance_loss_mlp": 1.00597107, + "epoch": 0.723162754905733, + "flos": 529366221312.0, + "grad_norm": 0.03397833212706175, + "language_loss": 0.79266065, + "learning_rate": 0.00018787249007455858, + "loss": 0.80312276, + "num_input_tokens_seen": 311736800, + "router_z_loss_mlp": 0.40234375, + "step": 3759, + "time_per_iteration": 2.587975025177002 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046442, + "balance_loss_mlp": 1.00629199, + "epoch": 0.7233551365909965, + "flos": 656060048640.0, + "grad_norm": 0.03524788149604232, + "language_loss": 0.71597099, + "learning_rate": 0.00018762916740048302, + "loss": 0.72643542, + "num_input_tokens_seen": 311806064, + "router_z_loss_mlp": 0.40136719, + "step": 3760, + "time_per_iteration": 2.7926323413848877 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047118, + "balance_loss_mlp": 1.00701594, + "epoch": 0.7235475182762601, + "flos": 523444366848.0, + "grad_norm": 0.0316872797389574, + "language_loss": 0.86490506, + "learning_rate": 0.0001873859659995195, + "loss": 0.87537622, + "num_input_tokens_seen": 311881280, + "router_z_loss_mlp": 0.40087891, + "step": 3761, + "time_per_iteration": 2.7313694953918457 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047248, + "balance_loss_mlp": 1.00721729, + "epoch": 0.7237398999615237, + "flos": 610322190336.0, + "grad_norm": 0.03701947835091587, + "language_loss": 0.84027237, + "learning_rate": 0.0001871428859660878, + "loss": 0.85074484, + "num_input_tokens_seen": 311953696, + "router_z_loss_mlp": 0.40014648, + "step": 3762, + "time_per_iteration": 2.724437952041626 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047082, + "balance_loss_mlp": 1.00707483, + "epoch": 0.7239322816467872, + "flos": 660282720768.0, + "grad_norm": 0.032017946801170455, + "language_loss": 0.82444721, + "learning_rate": 0.00018689992739455975, + "loss": 0.83491802, + "num_input_tokens_seen": 312032752, + "router_z_loss_mlp": 0.39990234, + "step": 3763, + "time_per_iteration": 2.8985331058502197 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045695, + "balance_loss_mlp": 1.00566471, + "epoch": 0.7241246633320508, + "flos": 970941850368.0, + "grad_norm": 0.0325077929756691, + "language_loss": 0.8663789, + "learning_rate": 0.00018665709037926027, + "loss": 0.87683582, + "num_input_tokens_seen": 312120800, + "router_z_loss_mlp": 0.40014648, + "step": 3764, + "time_per_iteration": 3.3307945728302 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043215, + "balance_loss_mlp": 1.00323248, + "epoch": 0.7243170450173143, + "flos": 516000252672.0, + "grad_norm": 0.037062443743513, + "language_loss": 0.85301733, + "learning_rate": 0.00018641437501446694, + "loss": 0.86344957, + "num_input_tokens_seen": 312188416, + "router_z_loss_mlp": 0.3996582, + "step": 3765, + "time_per_iteration": 2.57521915435791 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01041755, + "balance_loss_mlp": 1.00170028, + "epoch": 0.7245094267025779, + "flos": 560806867200.0, + "grad_norm": 0.03616258332607596, + "language_loss": 0.82752323, + "learning_rate": 0.0001861717813944104, + "loss": 0.83794075, + "num_input_tokens_seen": 312257792, + "router_z_loss_mlp": 0.40039062, + "step": 3766, + "time_per_iteration": 2.6512858867645264 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042886, + "balance_loss_mlp": 1.00287974, + "epoch": 0.7247018083878415, + "flos": 613775956992.0, + "grad_norm": 0.03625673893536532, + "language_loss": 0.79743433, + "learning_rate": 0.00018592930961327365, + "loss": 0.80786318, + "num_input_tokens_seen": 312328544, + "router_z_loss_mlp": 0.39990234, + "step": 3767, + "time_per_iteration": 2.704402208328247 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045962, + "balance_loss_mlp": 1.00588417, + "epoch": 0.7248941900731051, + "flos": 635871117312.0, + "grad_norm": 0.03196657989519071, + "language_loss": 0.88960397, + "learning_rate": 0.00018568695976519273, + "loss": 0.90006363, + "num_input_tokens_seen": 312405888, + "router_z_loss_mlp": 0.40063477, + "step": 3768, + "time_per_iteration": 2.764528751373291 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046055, + "balance_loss_mlp": 1.0059768, + "epoch": 0.7250865717583687, + "flos": 425837803776.0, + "grad_norm": 0.0390622861553884, + "language_loss": 0.80584097, + "learning_rate": 0.00018544473194425593, + "loss": 0.81630147, + "num_input_tokens_seen": 312469552, + "router_z_loss_mlp": 0.40063477, + "step": 3769, + "time_per_iteration": 2.4841666221618652 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043424, + "balance_loss_mlp": 1.00329816, + "epoch": 0.7252789534436321, + "flos": 636398949888.0, + "grad_norm": 0.04244308423853245, + "language_loss": 0.79393184, + "learning_rate": 0.00018520262624450485, + "loss": 0.80436611, + "num_input_tokens_seen": 312548848, + "router_z_loss_mlp": 0.40112305, + "step": 3770, + "time_per_iteration": 2.8432021141052246 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046388, + "balance_loss_mlp": 1.00638103, + "epoch": 0.7254713351288957, + "flos": 618354408192.0, + "grad_norm": 0.03205335937009439, + "language_loss": 0.87801862, + "learning_rate": 0.00018496064275993324, + "loss": 0.88848257, + "num_input_tokens_seen": 312622016, + "router_z_loss_mlp": 0.39990234, + "step": 3771, + "time_per_iteration": 2.753740072250366 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046271, + "balance_loss_mlp": 1.00612164, + "epoch": 0.7256637168141593, + "flos": 768291412992.0, + "grad_norm": 0.038084131410306525, + "language_loss": 0.82372004, + "learning_rate": 0.00018471878158448686, + "loss": 0.83418274, + "num_input_tokens_seen": 312696960, + "router_z_loss_mlp": 0.40136719, + "step": 3772, + "time_per_iteration": 2.917302370071411 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048364, + "balance_loss_mlp": 1.0082382, + "epoch": 0.7258560984994229, + "flos": 496727980800.0, + "grad_norm": 0.02992069132066452, + "language_loss": 0.84553695, + "learning_rate": 0.00018447704281206512, + "loss": 0.85602057, + "num_input_tokens_seen": 312774352, + "router_z_loss_mlp": 0.40112305, + "step": 3773, + "time_per_iteration": 2.843857765197754 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048932, + "balance_loss_mlp": 1.00878251, + "epoch": 0.7260484801846864, + "flos": 531142192896.0, + "grad_norm": 0.03465658020099934, + "language_loss": 0.83523774, + "learning_rate": 0.0001842354265365191, + "loss": 0.84572709, + "num_input_tokens_seen": 312849600, + "router_z_loss_mlp": 0.40136719, + "step": 3774, + "time_per_iteration": 2.6899774074554443 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046049, + "balance_loss_mlp": 1.00592351, + "epoch": 0.72624086186995, + "flos": 626108614656.0, + "grad_norm": 0.036794080035960464, + "language_loss": 0.81133199, + "learning_rate": 0.0001839939328516526, + "loss": 0.82179248, + "num_input_tokens_seen": 312922688, + "router_z_loss_mlp": 0.40112305, + "step": 3775, + "time_per_iteration": 2.75508451461792 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104583, + "balance_loss_mlp": 1.0056808, + "epoch": 0.7264332435552135, + "flos": 717805962240.0, + "grad_norm": 0.03611168561837021, + "language_loss": 0.82141531, + "learning_rate": 0.0001837525618512218, + "loss": 0.83187354, + "num_input_tokens_seen": 312997728, + "router_z_loss_mlp": 0.40136719, + "step": 3776, + "time_per_iteration": 2.8697876930236816 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047141, + "balance_loss_mlp": 1.0069201, + "epoch": 0.7266256252404771, + "flos": 682242766080.0, + "grad_norm": 0.036803325831150785, + "language_loss": 0.83319986, + "learning_rate": 0.00018351131362893519, + "loss": 0.84367126, + "num_input_tokens_seen": 313067168, + "router_z_loss_mlp": 0.40209961, + "step": 3777, + "time_per_iteration": 2.7980828285217285 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046144, + "balance_loss_mlp": 1.00580347, + "epoch": 0.7268180069257407, + "flos": 519918668544.0, + "grad_norm": 0.038913474879357805, + "language_loss": 0.81077832, + "learning_rate": 0.00018327018827845364, + "loss": 0.82123971, + "num_input_tokens_seen": 313134688, + "router_z_loss_mlp": 0.40332031, + "step": 3778, + "time_per_iteration": 2.610944986343384 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045641, + "balance_loss_mlp": 1.00537205, + "epoch": 0.7270103886110042, + "flos": 513673115904.0, + "grad_norm": 0.03821848161600015, + "language_loss": 0.88036418, + "learning_rate": 0.00018302918589339036, + "loss": 0.89082056, + "num_input_tokens_seen": 313204816, + "router_z_loss_mlp": 0.40258789, + "step": 3779, + "time_per_iteration": 2.6776628494262695 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044448, + "balance_loss_mlp": 1.00413156, + "epoch": 0.7272027702962678, + "flos": 547692665088.0, + "grad_norm": 0.03543573147287282, + "language_loss": 0.90566671, + "learning_rate": 0.00018278830656731054, + "loss": 0.91611117, + "num_input_tokens_seen": 313274288, + "router_z_loss_mlp": 0.40307617, + "step": 3780, + "time_per_iteration": 2.6612467765808105 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043906, + "balance_loss_mlp": 1.003613, + "epoch": 0.7273951519815314, + "flos": 594155687424.0, + "grad_norm": 0.02879348395383923, + "language_loss": 0.86881804, + "learning_rate": 0.00018254755039373222, + "loss": 0.87925708, + "num_input_tokens_seen": 313344800, + "router_z_loss_mlp": 0.40283203, + "step": 3781, + "time_per_iteration": 2.724158763885498 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045168, + "balance_loss_mlp": 1.00482738, + "epoch": 0.727587533666795, + "flos": 607139632128.0, + "grad_norm": 0.03859798712496429, + "language_loss": 0.84525704, + "learning_rate": 0.0001823069174661252, + "loss": 0.85570872, + "num_input_tokens_seen": 313417840, + "router_z_loss_mlp": 0.40332031, + "step": 3782, + "time_per_iteration": 2.7668051719665527 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044246, + "balance_loss_mlp": 1.00395322, + "epoch": 0.7277799153520584, + "flos": 514026949632.0, + "grad_norm": 0.03650439895450689, + "language_loss": 0.78873003, + "learning_rate": 0.00018206640787791112, + "loss": 0.79917252, + "num_input_tokens_seen": 313485936, + "router_z_loss_mlp": 0.40283203, + "step": 3783, + "time_per_iteration": 2.649040699005127 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042952, + "balance_loss_mlp": 1.00268304, + "epoch": 0.727972297037322, + "flos": 538794332160.0, + "grad_norm": 0.03501392489574684, + "language_loss": 0.86669183, + "learning_rate": 0.00018182602172246416, + "loss": 0.87712133, + "num_input_tokens_seen": 313553136, + "router_z_loss_mlp": 0.40258789, + "step": 3784, + "time_per_iteration": 2.603267192840576 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045035, + "balance_loss_mlp": 1.00474274, + "epoch": 0.7281646787225856, + "flos": 536076423168.0, + "grad_norm": 0.037923852732183974, + "language_loss": 0.77186882, + "learning_rate": 0.00018158575909311075, + "loss": 0.78231919, + "num_input_tokens_seen": 313620128, + "router_z_loss_mlp": 0.40283203, + "step": 3785, + "time_per_iteration": 2.6864423751831055 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045231, + "balance_loss_mlp": 1.00489032, + "epoch": 0.7283570604078492, + "flos": 626210681856.0, + "grad_norm": 0.0363846490797151, + "language_loss": 0.80090117, + "learning_rate": 0.000181345620083129, + "loss": 0.81135345, + "num_input_tokens_seen": 313696432, + "router_z_loss_mlp": 0.40332031, + "step": 3786, + "time_per_iteration": 2.7992641925811768 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045844, + "balance_loss_mlp": 1.00548017, + "epoch": 0.7285494420931128, + "flos": 535255994880.0, + "grad_norm": 0.04682580138791378, + "language_loss": 0.86931181, + "learning_rate": 0.00018110560478574927, + "loss": 0.87977034, + "num_input_tokens_seen": 313768416, + "router_z_loss_mlp": 0.40356445, + "step": 3787, + "time_per_iteration": 2.680211305618286 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043183, + "balance_loss_mlp": 1.00277114, + "epoch": 0.7287418237783763, + "flos": 667741419264.0, + "grad_norm": 0.04795946543380901, + "language_loss": 0.80688787, + "learning_rate": 0.0001808657132941533, + "loss": 0.81731963, + "num_input_tokens_seen": 313839888, + "router_z_loss_mlp": 0.40405273, + "step": 3788, + "time_per_iteration": 2.793989658355713 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104313, + "balance_loss_mlp": 1.00286126, + "epoch": 0.7289342054636399, + "flos": 551639271168.0, + "grad_norm": 0.04788875018667363, + "language_loss": 0.83400464, + "learning_rate": 0.00018062594570147572, + "loss": 0.84443599, + "num_input_tokens_seen": 313908832, + "router_z_loss_mlp": 0.40258789, + "step": 3789, + "time_per_iteration": 2.5800626277923584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043681, + "balance_loss_mlp": 1.00331712, + "epoch": 0.7291265871489034, + "flos": 689139605760.0, + "grad_norm": 0.0306016583616733, + "language_loss": 0.86152685, + "learning_rate": 0.00018038630210080243, + "loss": 0.87196368, + "num_input_tokens_seen": 313982672, + "router_z_loss_mlp": 0.40356445, + "step": 3790, + "time_per_iteration": 2.791778326034546 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01041726, + "balance_loss_mlp": 1.00133801, + "epoch": 0.729318968834167, + "flos": 573771369984.0, + "grad_norm": 0.03320164846736232, + "language_loss": 0.8504535, + "learning_rate": 0.0001801467825851712, + "loss": 0.86087084, + "num_input_tokens_seen": 314057184, + "router_z_loss_mlp": 0.40380859, + "step": 3791, + "time_per_iteration": 2.7736573219299316 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043265, + "balance_loss_mlp": 1.00278151, + "epoch": 0.7295113505194305, + "flos": 587165528832.0, + "grad_norm": 0.039500127545913186, + "language_loss": 0.79190361, + "learning_rate": 0.00017990738724757172, + "loss": 0.80233628, + "num_input_tokens_seen": 314137344, + "router_z_loss_mlp": 0.40478516, + "step": 3792, + "time_per_iteration": 2.8482463359832764 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043868, + "balance_loss_mlp": 1.00345612, + "epoch": 0.7297037322046941, + "flos": 708442980096.0, + "grad_norm": 0.03263259511522569, + "language_loss": 0.82787073, + "learning_rate": 0.00017966811618094598, + "loss": 0.83830941, + "num_input_tokens_seen": 314214464, + "router_z_loss_mlp": 0.40405273, + "step": 3793, + "time_per_iteration": 2.889319658279419 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044424, + "balance_loss_mlp": 1.0039407, + "epoch": 0.7298961138899577, + "flos": 488308881408.0, + "grad_norm": 0.03689917900491825, + "language_loss": 0.85408473, + "learning_rate": 0.00017942896947818664, + "loss": 0.86452901, + "num_input_tokens_seen": 314280432, + "router_z_loss_mlp": 0.40478516, + "step": 3794, + "time_per_iteration": 2.550274133682251 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043034, + "balance_loss_mlp": 1.00383759, + "epoch": 0.7300884955752213, + "flos": 1368624600576.0, + "grad_norm": 0.005828351386569188, + "language_loss": 0.74825054, + "learning_rate": 0.000179189947232139, + "loss": 0.75868088, + "num_input_tokens_seen": 314497152, + "router_z_loss_mlp": 0.39160156, + "step": 3795, + "time_per_iteration": 4.89626932144165 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042163, + "balance_loss_mlp": 1.00179839, + "epoch": 0.7302808772604849, + "flos": 532837484544.0, + "grad_norm": 0.04171921070399138, + "language_loss": 0.85686743, + "learning_rate": 0.00017895104953559947, + "loss": 0.86728907, + "num_input_tokens_seen": 314565488, + "router_z_loss_mlp": 0.40356445, + "step": 3796, + "time_per_iteration": 2.57736873626709 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042422, + "balance_loss_mlp": 1.00203419, + "epoch": 0.7304732589457483, + "flos": 437063273472.0, + "grad_norm": 0.04046264333697194, + "language_loss": 0.90178061, + "learning_rate": 0.00017871227648131672, + "loss": 0.91220486, + "num_input_tokens_seen": 314627392, + "router_z_loss_mlp": 0.40380859, + "step": 3797, + "time_per_iteration": 2.474209785461426 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104528, + "balance_loss_mlp": 1.00489223, + "epoch": 0.7306656406310119, + "flos": 452604734208.0, + "grad_norm": 0.029697022991301388, + "language_loss": 0.82934296, + "learning_rate": 0.0001784736281619907, + "loss": 0.83979571, + "num_input_tokens_seen": 314695440, + "router_z_loss_mlp": 0.40380859, + "step": 3798, + "time_per_iteration": 2.5923726558685303 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044562, + "balance_loss_mlp": 1.00407827, + "epoch": 0.7308580223162755, + "flos": 513030577152.0, + "grad_norm": 0.032710497654363443, + "language_loss": 0.75410861, + "learning_rate": 0.00017823510467027232, + "loss": 0.7645542, + "num_input_tokens_seen": 314772592, + "router_z_loss_mlp": 0.40478516, + "step": 3799, + "time_per_iteration": 2.7622478008270264 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045673, + "balance_loss_mlp": 1.00521374, + "epoch": 0.7310504040015391, + "flos": 376283596800.0, + "grad_norm": 0.039904062723008, + "language_loss": 0.79136682, + "learning_rate": 0.00017799670609876516, + "loss": 0.80182356, + "num_input_tokens_seen": 314836192, + "router_z_loss_mlp": 0.40454102, + "step": 3800, + "time_per_iteration": 2.493797540664673 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042729, + "balance_loss_mlp": 1.00222194, + "epoch": 0.7312427856868026, + "flos": 550382383872.0, + "grad_norm": 0.0325229913216085, + "language_loss": 0.89329851, + "learning_rate": 0.00017775843254002366, + "loss": 0.90372574, + "num_input_tokens_seen": 314908400, + "router_z_loss_mlp": 0.4050293, + "step": 3801, + "time_per_iteration": 2.7277941703796387 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047641, + "balance_loss_mlp": 1.00727654, + "epoch": 0.7314351673720662, + "flos": 768678294528.0, + "grad_norm": 0.03330924575668911, + "language_loss": 0.84167385, + "learning_rate": 0.00017752028408655367, + "loss": 0.8521502, + "num_input_tokens_seen": 314995280, + "router_z_loss_mlp": 0.40356445, + "step": 3802, + "time_per_iteration": 3.040632486343384 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104856, + "balance_loss_mlp": 1.00824392, + "epoch": 0.7316275490573297, + "flos": 487705226496.0, + "grad_norm": 0.03826862590336393, + "language_loss": 0.8564449, + "learning_rate": 0.00017728226083081272, + "loss": 0.86693048, + "num_input_tokens_seen": 315063056, + "router_z_loss_mlp": 0.40307617, + "step": 3803, + "time_per_iteration": 2.5550501346588135 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048554, + "balance_loss_mlp": 1.00833249, + "epoch": 0.7318199307425933, + "flos": 474413134848.0, + "grad_norm": 0.03815942500131441, + "language_loss": 0.82039976, + "learning_rate": 0.00017704436286520965, + "loss": 0.83088529, + "num_input_tokens_seen": 315128896, + "router_z_loss_mlp": 0.40209961, + "step": 3804, + "time_per_iteration": 2.58294677734375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048179, + "balance_loss_mlp": 1.00793362, + "epoch": 0.7320123124278569, + "flos": 550512641280.0, + "grad_norm": 0.03634721787215332, + "language_loss": 0.8514055, + "learning_rate": 0.0001768065902821046, + "loss": 0.86188722, + "num_input_tokens_seen": 315198464, + "router_z_loss_mlp": 0.40234375, + "step": 3805, + "time_per_iteration": 2.6493990421295166 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046919, + "balance_loss_mlp": 1.00665021, + "epoch": 0.7322046941131204, + "flos": 571900134144.0, + "grad_norm": 0.03447588355898286, + "language_loss": 0.82488358, + "learning_rate": 0.00017656894317380907, + "loss": 0.83535278, + "num_input_tokens_seen": 315270240, + "router_z_loss_mlp": 0.40258789, + "step": 3806, + "time_per_iteration": 2.7446413040161133 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043594, + "balance_loss_mlp": 1.00468445, + "epoch": 0.732397075798384, + "flos": 1472503928064.0, + "grad_norm": 0.008037479366224719, + "language_loss": 0.76031268, + "learning_rate": 0.00017633142163258565, + "loss": 0.77074862, + "num_input_tokens_seen": 315502448, + "router_z_loss_mlp": 0.38867188, + "step": 3807, + "time_per_iteration": 5.015838623046875 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044453, + "balance_loss_mlp": 1.00413609, + "epoch": 0.7325894574836476, + "flos": 465831697152.0, + "grad_norm": 0.038585998350043275, + "language_loss": 0.84359336, + "learning_rate": 0.00017609402575064875, + "loss": 0.85403788, + "num_input_tokens_seen": 315569472, + "router_z_loss_mlp": 0.40307617, + "step": 3808, + "time_per_iteration": 2.5619466304779053 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044203, + "balance_loss_mlp": 1.00398183, + "epoch": 0.7327818391689112, + "flos": 496482050304.0, + "grad_norm": 0.03775450514575077, + "language_loss": 0.81649804, + "learning_rate": 0.00017585675562016367, + "loss": 0.82694006, + "num_input_tokens_seen": 315637632, + "router_z_loss_mlp": 0.40209961, + "step": 3809, + "time_per_iteration": 2.5793349742889404 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044159, + "balance_loss_mlp": 1.00524902, + "epoch": 0.7329742208541746, + "flos": 1436682162432.0, + "grad_norm": 0.007309956802170158, + "language_loss": 0.77212846, + "learning_rate": 0.0001756196113332465, + "loss": 0.78257012, + "num_input_tokens_seen": 315863648, + "router_z_loss_mlp": 0.38867188, + "step": 3810, + "time_per_iteration": 4.810467720031738 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043162, + "balance_loss_mlp": 1.00303602, + "epoch": 0.7331666025394382, + "flos": 497869195008.0, + "grad_norm": 0.0392578744691446, + "language_loss": 0.85801327, + "learning_rate": 0.00017538259298196474, + "loss": 0.86844486, + "num_input_tokens_seen": 315930752, + "router_z_loss_mlp": 0.40112305, + "step": 3811, + "time_per_iteration": 2.5858519077301025 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046746, + "balance_loss_mlp": 1.00657213, + "epoch": 0.7333589842247018, + "flos": 539639059968.0, + "grad_norm": 0.03309973691359967, + "language_loss": 0.82286286, + "learning_rate": 0.00017514570065833745, + "loss": 0.83333039, + "num_input_tokens_seen": 316006400, + "router_z_loss_mlp": 0.40161133, + "step": 3812, + "time_per_iteration": 2.693704843521118 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045338, + "balance_loss_mlp": 1.00525999, + "epoch": 0.7335513659099654, + "flos": 492042604800.0, + "grad_norm": 0.03925978819405336, + "language_loss": 0.81363267, + "learning_rate": 0.00017490893445433426, + "loss": 0.82408601, + "num_input_tokens_seen": 316075824, + "router_z_loss_mlp": 0.40063477, + "step": 3813, + "time_per_iteration": 2.608065128326416 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044158, + "balance_loss_mlp": 1.00384104, + "epoch": 0.733743747595229, + "flos": 563253567744.0, + "grad_norm": 0.033972583106890976, + "language_loss": 0.82267326, + "learning_rate": 0.00017467229446187587, + "loss": 0.83311474, + "num_input_tokens_seen": 316148336, + "router_z_loss_mlp": 0.40307617, + "step": 3814, + "time_per_iteration": 2.6955394744873047 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043196, + "balance_loss_mlp": 1.00290346, + "epoch": 0.7339361292804925, + "flos": 539649753600.0, + "grad_norm": 0.03487524168244714, + "language_loss": 0.81803584, + "learning_rate": 0.00017443578077283424, + "loss": 0.82846785, + "num_input_tokens_seen": 316220960, + "router_z_loss_mlp": 0.40283203, + "step": 3815, + "time_per_iteration": 2.6844675540924072 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047596, + "balance_loss_mlp": 1.00742269, + "epoch": 0.734128510965756, + "flos": 549561955584.0, + "grad_norm": 0.03210943726156845, + "language_loss": 0.85443103, + "learning_rate": 0.0001741993934790319, + "loss": 0.86490697, + "num_input_tokens_seen": 316295824, + "router_z_loss_mlp": 0.40161133, + "step": 3816, + "time_per_iteration": 2.754804849624634 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104727, + "balance_loss_mlp": 1.0069536, + "epoch": 0.7343208926510196, + "flos": 541202148864.0, + "grad_norm": 0.03979674876858525, + "language_loss": 0.84579813, + "learning_rate": 0.00017396313267224273, + "loss": 0.85627079, + "num_input_tokens_seen": 316368064, + "router_z_loss_mlp": 0.40307617, + "step": 3817, + "time_per_iteration": 2.7152209281921387 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046673, + "balance_loss_mlp": 1.00638032, + "epoch": 0.7345132743362832, + "flos": 572171342592.0, + "grad_norm": 0.03405657916649516, + "language_loss": 0.88968074, + "learning_rate": 0.0001737269984441912, + "loss": 0.9001475, + "num_input_tokens_seen": 316437440, + "router_z_loss_mlp": 0.40283203, + "step": 3818, + "time_per_iteration": 2.63198184967041 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049333, + "balance_loss_mlp": 1.00906372, + "epoch": 0.7347056560215467, + "flos": 546481464576.0, + "grad_norm": 0.04751068267806247, + "language_loss": 0.85475308, + "learning_rate": 0.00017349099088655263, + "loss": 0.86524642, + "num_input_tokens_seen": 316511936, + "router_z_loss_mlp": 0.40258789, + "step": 3819, + "time_per_iteration": 2.796168804168701 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046287, + "balance_loss_mlp": 1.0060178, + "epoch": 0.7348980377068103, + "flos": 597077730816.0, + "grad_norm": 0.03129969376285051, + "language_loss": 0.81227374, + "learning_rate": 0.00017325511009095375, + "loss": 0.82273662, + "num_input_tokens_seen": 316584304, + "router_z_loss_mlp": 0.40258789, + "step": 3820, + "time_per_iteration": 2.7165210247039795 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046487, + "balance_loss_mlp": 1.00621831, + "epoch": 0.7350904193920739, + "flos": 539612815104.0, + "grad_norm": 0.03503609859827407, + "language_loss": 0.84185189, + "learning_rate": 0.00017301935614897113, + "loss": 0.8523168, + "num_input_tokens_seen": 316659024, + "router_z_loss_mlp": 0.40258789, + "step": 3821, + "time_per_iteration": 2.7012970447540283 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046737, + "balance_loss_mlp": 1.00656378, + "epoch": 0.7352828010773375, + "flos": 514061942784.0, + "grad_norm": 0.02996543941139594, + "language_loss": 0.8232463, + "learning_rate": 0.00017278372915213274, + "loss": 0.83371365, + "num_input_tokens_seen": 316732544, + "router_z_loss_mlp": 0.40161133, + "step": 3822, + "time_per_iteration": 2.646228313446045 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105101, + "balance_loss_mlp": 1.01171875, + "epoch": 0.735475182762601, + "flos": 1557258749184.0, + "grad_norm": 0.004879497460224864, + "language_loss": 0.79893845, + "learning_rate": 0.00017254822919191693, + "loss": 0.80944854, + "num_input_tokens_seen": 316967104, + "router_z_loss_mlp": 0.39257812, + "step": 3823, + "time_per_iteration": 5.001528024673462 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046562, + "balance_loss_mlp": 1.00636435, + "epoch": 0.7356675644478645, + "flos": 682612151040.0, + "grad_norm": 0.0358517187113506, + "language_loss": 0.8115629, + "learning_rate": 0.00017231285635975314, + "loss": 0.82202852, + "num_input_tokens_seen": 317048304, + "router_z_loss_mlp": 0.40185547, + "step": 3824, + "time_per_iteration": 2.916127920150757 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046328, + "balance_loss_mlp": 1.00615466, + "epoch": 0.7358599461331281, + "flos": 516232577280.0, + "grad_norm": 0.05204398731861849, + "language_loss": 0.83695984, + "learning_rate": 0.00017207761074702115, + "loss": 0.8474232, + "num_input_tokens_seen": 317115968, + "router_z_loss_mlp": 0.40161133, + "step": 3825, + "time_per_iteration": 2.62750506401062 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104673, + "balance_loss_mlp": 1.00662804, + "epoch": 0.7360523278183917, + "flos": 444917601792.0, + "grad_norm": 0.03194798623104488, + "language_loss": 0.84528393, + "learning_rate": 0.0001718424924450514, + "loss": 0.85575122, + "num_input_tokens_seen": 317185680, + "router_z_loss_mlp": 0.40087891, + "step": 3826, + "time_per_iteration": 2.61261248588562 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046993, + "balance_loss_mlp": 1.00684357, + "epoch": 0.7362447095036553, + "flos": 604551980544.0, + "grad_norm": 0.028984397633237662, + "language_loss": 0.86482602, + "learning_rate": 0.00017160750154512482, + "loss": 0.875296, + "num_input_tokens_seen": 317258800, + "router_z_loss_mlp": 0.40136719, + "step": 3827, + "time_per_iteration": 2.6998865604400635 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043184, + "balance_loss_mlp": 1.00305784, + "epoch": 0.7364370911889189, + "flos": 554251222272.0, + "grad_norm": 0.040234447169501614, + "language_loss": 0.8371399, + "learning_rate": 0.0001713726381384731, + "loss": 0.84757173, + "num_input_tokens_seen": 317334608, + "router_z_loss_mlp": 0.40112305, + "step": 3828, + "time_per_iteration": 2.746196746826172 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01041809, + "balance_loss_mlp": 1.00163531, + "epoch": 0.7366294728741823, + "flos": 449990837760.0, + "grad_norm": 0.03659096604544618, + "language_loss": 0.81686258, + "learning_rate": 0.00017113790231627812, + "loss": 0.82728064, + "num_input_tokens_seen": 317397504, + "router_z_loss_mlp": 0.40161133, + "step": 3829, + "time_per_iteration": 2.5232386589050293 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043747, + "balance_loss_mlp": 1.00445557, + "epoch": 0.7368218545594459, + "flos": 1538705816832.0, + "grad_norm": 0.007725694552394297, + "language_loss": 0.79258227, + "learning_rate": 0.0001709032941696726, + "loss": 0.80301964, + "num_input_tokens_seen": 317611472, + "router_z_loss_mlp": 0.39257812, + "step": 3830, + "time_per_iteration": 4.843308448791504 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044253, + "balance_loss_mlp": 1.00405526, + "epoch": 0.7370142362447095, + "flos": 516473650176.0, + "grad_norm": 0.03681023024701871, + "language_loss": 0.82271254, + "learning_rate": 0.00017066881378973936, + "loss": 0.83315504, + "num_input_tokens_seen": 317681328, + "router_z_loss_mlp": 0.40185547, + "step": 3831, + "time_per_iteration": 2.684302806854248 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045812, + "balance_loss_mlp": 1.00578201, + "epoch": 0.7372066179299731, + "flos": 501905229312.0, + "grad_norm": 0.03287634093560934, + "language_loss": 0.83259964, + "learning_rate": 0.00017043446126751189, + "loss": 0.84305775, + "num_input_tokens_seen": 317752336, + "router_z_loss_mlp": 0.40014648, + "step": 3832, + "time_per_iteration": 2.710259199142456 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044175, + "balance_loss_mlp": 1.00409663, + "epoch": 0.7373989996152366, + "flos": 559167955968.0, + "grad_norm": 0.03638251388363948, + "language_loss": 0.76960367, + "learning_rate": 0.00017020023669397376, + "loss": 0.78004539, + "num_input_tokens_seen": 317824112, + "router_z_loss_mlp": 0.40063477, + "step": 3833, + "time_per_iteration": 2.735877752304077 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050598, + "balance_loss_mlp": 1.01054347, + "epoch": 0.7375913813005002, + "flos": 507781396992.0, + "grad_norm": 0.059668100448601574, + "language_loss": 0.82237148, + "learning_rate": 0.0001699661401600589, + "loss": 0.8328774, + "num_input_tokens_seen": 317889120, + "router_z_loss_mlp": 0.40039062, + "step": 3834, + "time_per_iteration": 2.579663038253784 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047411, + "balance_loss_mlp": 1.007357, + "epoch": 0.7377837629857638, + "flos": 487156006656.0, + "grad_norm": 0.03637906521459096, + "language_loss": 0.78828633, + "learning_rate": 0.00016973217175665205, + "loss": 0.79876041, + "num_input_tokens_seen": 317953792, + "router_z_loss_mlp": 0.40039062, + "step": 3835, + "time_per_iteration": 2.6623384952545166 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046513, + "balance_loss_mlp": 1.00731659, + "epoch": 0.7379761446710273, + "flos": 1417880375808.0, + "grad_norm": 0.007661340220520439, + "language_loss": 0.8116616, + "learning_rate": 0.00016949833157458755, + "loss": 0.82212675, + "num_input_tokens_seen": 318184848, + "router_z_loss_mlp": 0.39160156, + "step": 3836, + "time_per_iteration": 4.928514003753662 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046331, + "balance_loss_mlp": 1.00634801, + "epoch": 0.7381685263562909, + "flos": 630910642176.0, + "grad_norm": 0.035800200298820535, + "language_loss": 0.84820634, + "learning_rate": 0.00016926461970465047, + "loss": 0.85866964, + "num_input_tokens_seen": 318259296, + "router_z_loss_mlp": 0.3996582, + "step": 3837, + "time_per_iteration": 2.762173891067505 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043931, + "balance_loss_mlp": 1.00382948, + "epoch": 0.7383609080415544, + "flos": 740652531456.0, + "grad_norm": 0.029602535209274302, + "language_loss": 0.84896356, + "learning_rate": 0.00016903103623757516, + "loss": 0.85940289, + "num_input_tokens_seen": 318344704, + "router_z_loss_mlp": 0.40087891, + "step": 3838, + "time_per_iteration": 3.0506296157836914 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045591, + "balance_loss_mlp": 1.00541735, + "epoch": 0.738553289726818, + "flos": 551257247232.0, + "grad_norm": 0.038121042805401205, + "language_loss": 0.807634, + "learning_rate": 0.00016879758126404738, + "loss": 0.8180899, + "num_input_tokens_seen": 318416128, + "router_z_loss_mlp": 0.40161133, + "step": 3839, + "time_per_iteration": 2.715830087661743 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104385, + "balance_loss_mlp": 1.00372398, + "epoch": 0.7387456714120816, + "flos": 911776785408.0, + "grad_norm": 0.03920302310428291, + "language_loss": 0.80385631, + "learning_rate": 0.00016856425487470216, + "loss": 0.81429482, + "num_input_tokens_seen": 318498128, + "router_z_loss_mlp": 0.40112305, + "step": 3840, + "time_per_iteration": 3.1212151050567627 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044355, + "balance_loss_mlp": 1.00422895, + "epoch": 0.7389380530973452, + "flos": 854197163520.0, + "grad_norm": 0.035349098992081385, + "language_loss": 0.79466581, + "learning_rate": 0.00016833105716012486, + "loss": 0.80510932, + "num_input_tokens_seen": 318578048, + "router_z_loss_mlp": 0.40112305, + "step": 3841, + "time_per_iteration": 3.1690988540649414 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044188, + "balance_loss_mlp": 1.0040617, + "epoch": 0.7391304347826086, + "flos": 818421084672.0, + "grad_norm": 0.0368177293104177, + "language_loss": 0.85204184, + "learning_rate": 0.00016809798821085088, + "loss": 0.86248374, + "num_input_tokens_seen": 318654784, + "router_z_loss_mlp": 0.40112305, + "step": 3842, + "time_per_iteration": 3.033186435699463 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104542, + "balance_loss_mlp": 1.00536537, + "epoch": 0.7393228164678722, + "flos": 573938565888.0, + "grad_norm": 0.03389595177699646, + "language_loss": 0.89421487, + "learning_rate": 0.00016786504811736565, + "loss": 0.90466905, + "num_input_tokens_seen": 318727680, + "router_z_loss_mlp": 0.40039062, + "step": 3843, + "time_per_iteration": 2.723698616027832 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104513, + "balance_loss_mlp": 1.00500441, + "epoch": 0.7395151981531358, + "flos": 686576253696.0, + "grad_norm": 0.0300135100261375, + "language_loss": 0.83072603, + "learning_rate": 0.00016763223697010442, + "loss": 0.84117734, + "num_input_tokens_seen": 318807568, + "router_z_loss_mlp": 0.40112305, + "step": 3844, + "time_per_iteration": 2.975797414779663 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043993, + "balance_loss_mlp": 1.00389111, + "epoch": 0.7397075798383994, + "flos": 557455167744.0, + "grad_norm": 0.04240767697887406, + "language_loss": 0.84802914, + "learning_rate": 0.00016739955485945256, + "loss": 0.85846901, + "num_input_tokens_seen": 318881792, + "router_z_loss_mlp": 0.40087891, + "step": 3845, + "time_per_iteration": 2.720717191696167 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044705, + "balance_loss_mlp": 1.00448394, + "epoch": 0.739899961523663, + "flos": 547822922496.0, + "grad_norm": 0.04053063595065812, + "language_loss": 0.86230588, + "learning_rate": 0.00016716700187574513, + "loss": 0.87275296, + "num_input_tokens_seen": 318951552, + "router_z_loss_mlp": 0.40209961, + "step": 3846, + "time_per_iteration": 2.703578472137451 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045075, + "balance_loss_mlp": 1.00492537, + "epoch": 0.7400923432089265, + "flos": 610304693760.0, + "grad_norm": 0.03543720475620032, + "language_loss": 0.84347486, + "learning_rate": 0.0001669345781092675, + "loss": 0.85392559, + "num_input_tokens_seen": 319022304, + "router_z_loss_mlp": 0.40136719, + "step": 3847, + "time_per_iteration": 2.7703194618225098 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044797, + "balance_loss_mlp": 1.00455177, + "epoch": 0.7402847248941901, + "flos": 592180439040.0, + "grad_norm": 0.0397830502127856, + "language_loss": 0.87809312, + "learning_rate": 0.0001667022836502546, + "loss": 0.8885411, + "num_input_tokens_seen": 319093200, + "router_z_loss_mlp": 0.40234375, + "step": 3848, + "time_per_iteration": 2.760023355484009 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046535, + "balance_loss_mlp": 1.00629044, + "epoch": 0.7404771065794536, + "flos": 478305305856.0, + "grad_norm": 0.03878201132992699, + "language_loss": 0.83579338, + "learning_rate": 0.00016647011858889077, + "loss": 0.84625876, + "num_input_tokens_seen": 319159712, + "router_z_loss_mlp": 0.40234375, + "step": 3849, + "time_per_iteration": 2.566498041152954 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044055, + "balance_loss_mlp": 1.00385714, + "epoch": 0.7406694882647172, + "flos": 497467729152.0, + "grad_norm": 0.04044358723064945, + "language_loss": 0.86492926, + "learning_rate": 0.00016623808301531056, + "loss": 0.87536979, + "num_input_tokens_seen": 319230544, + "router_z_loss_mlp": 0.40185547, + "step": 3850, + "time_per_iteration": 2.659444808959961 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043918, + "balance_loss_mlp": 1.00367308, + "epoch": 0.7408618699499807, + "flos": 563327444736.0, + "grad_norm": 0.04103255616090965, + "language_loss": 0.79822052, + "learning_rate": 0.00016600617701959842, + "loss": 0.80865979, + "num_input_tokens_seen": 319305440, + "router_z_loss_mlp": 0.40234375, + "step": 3851, + "time_per_iteration": 2.7590160369873047 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044136, + "balance_loss_mlp": 1.0050354, + "epoch": 0.7410542516352443, + "flos": 1391472136704.0, + "grad_norm": 0.004180276378427017, + "language_loss": 0.78843814, + "learning_rate": 0.00016577440069178811, + "loss": 0.7988795, + "num_input_tokens_seen": 319534384, + "router_z_loss_mlp": 0.390625, + "step": 3852, + "time_per_iteration": 4.960458040237427 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047168, + "balance_loss_mlp": 1.00699425, + "epoch": 0.7412466333205079, + "flos": 671212682496.0, + "grad_norm": 0.032734679500117485, + "language_loss": 0.81693292, + "learning_rate": 0.00016554275412186315, + "loss": 0.82740462, + "num_input_tokens_seen": 319610960, + "router_z_loss_mlp": 0.40161133, + "step": 3853, + "time_per_iteration": 2.8345468044281006 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045922, + "balance_loss_mlp": 1.00579631, + "epoch": 0.7414390150057715, + "flos": 490319122944.0, + "grad_norm": 0.03898197484032271, + "language_loss": 0.81142187, + "learning_rate": 0.0001653112373997568, + "loss": 0.82188106, + "num_input_tokens_seen": 319683872, + "router_z_loss_mlp": 0.40112305, + "step": 3854, + "time_per_iteration": 2.6750757694244385 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048137, + "balance_loss_mlp": 1.00786769, + "epoch": 0.7416313966910351, + "flos": 600494558976.0, + "grad_norm": 0.046812555930759385, + "language_loss": 0.75529599, + "learning_rate": 0.0001650798506153517, + "loss": 0.76577735, + "num_input_tokens_seen": 319750032, + "router_z_loss_mlp": 0.40258789, + "step": 3855, + "time_per_iteration": 2.7398931980133057 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044745, + "balance_loss_mlp": 1.00440431, + "epoch": 0.7418237783762985, + "flos": 543587611392.0, + "grad_norm": 0.04165043457756402, + "language_loss": 0.84612322, + "learning_rate": 0.00016484859385848023, + "loss": 0.85657072, + "num_input_tokens_seen": 319818864, + "router_z_loss_mlp": 0.40332031, + "step": 3856, + "time_per_iteration": 2.6185436248779297 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047216, + "balance_loss_mlp": 1.00692356, + "epoch": 0.7420161600615621, + "flos": 545224577280.0, + "grad_norm": 0.03738954086230496, + "language_loss": 0.77780879, + "learning_rate": 0.0001646174672189243, + "loss": 0.78828102, + "num_input_tokens_seen": 319888816, + "router_z_loss_mlp": 0.40283203, + "step": 3857, + "time_per_iteration": 2.689188241958618 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046862, + "balance_loss_mlp": 1.00661707, + "epoch": 0.7422085417468257, + "flos": 528211401216.0, + "grad_norm": 0.03526154422012509, + "language_loss": 0.80570501, + "learning_rate": 0.00016438647078641488, + "loss": 0.81617367, + "num_input_tokens_seen": 319956176, + "router_z_loss_mlp": 0.40234375, + "step": 3858, + "time_per_iteration": 2.5922017097473145 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042793, + "balance_loss_mlp": 1.00247657, + "epoch": 0.7424009234320893, + "flos": 509761502976.0, + "grad_norm": 0.033547873778652565, + "language_loss": 0.83657616, + "learning_rate": 0.00016415560465063344, + "loss": 0.84700406, + "num_input_tokens_seen": 320028560, + "router_z_loss_mlp": 0.40307617, + "step": 3859, + "time_per_iteration": 2.7559196949005127 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042486, + "balance_loss_mlp": 1.00216925, + "epoch": 0.7425933051173528, + "flos": 513607987200.0, + "grad_norm": 0.0418042544684692, + "language_loss": 0.79894865, + "learning_rate": 0.0001639248689012095, + "loss": 0.8093735, + "num_input_tokens_seen": 320096112, + "router_z_loss_mlp": 0.40307617, + "step": 3860, + "time_per_iteration": 2.5863146781921387 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042581, + "balance_loss_mlp": 1.00235939, + "epoch": 0.7427856868026164, + "flos": 459378119424.0, + "grad_norm": 0.03937431006783476, + "language_loss": 0.88026142, + "learning_rate": 0.00016369426362772271, + "loss": 0.89068723, + "num_input_tokens_seen": 320168992, + "router_z_loss_mlp": 0.40209961, + "step": 3861, + "time_per_iteration": 2.761857271194458 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046937, + "balance_loss_mlp": 1.00681162, + "epoch": 0.74297806848788, + "flos": 606188946432.0, + "grad_norm": 0.03201159100602054, + "language_loss": 0.80801797, + "learning_rate": 0.00016346378891970233, + "loss": 0.81848741, + "num_input_tokens_seen": 320247264, + "router_z_loss_mlp": 0.40112305, + "step": 3862, + "time_per_iteration": 2.8071773052215576 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047986, + "balance_loss_mlp": 1.00797915, + "epoch": 0.7431704501731435, + "flos": 893071229952.0, + "grad_norm": 0.0336740145247338, + "language_loss": 0.81989479, + "learning_rate": 0.00016323344486662633, + "loss": 0.8303746, + "num_input_tokens_seen": 320338992, + "router_z_loss_mlp": 0.39990234, + "step": 3863, + "time_per_iteration": 3.324979066848755 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048133, + "balance_loss_mlp": 1.0081737, + "epoch": 0.7433628318584071, + "flos": 593352755712.0, + "grad_norm": 0.03174757765296807, + "language_loss": 0.78870291, + "learning_rate": 0.00016300323155792247, + "loss": 0.7991842, + "num_input_tokens_seen": 320422096, + "router_z_loss_mlp": 0.39941406, + "step": 3864, + "time_per_iteration": 2.9272854328155518 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052134, + "balance_loss_mlp": 1.01215136, + "epoch": 0.7435552135436706, + "flos": 478190599680.0, + "grad_norm": 0.033980491156459056, + "language_loss": 0.89128578, + "learning_rate": 0.00016277314908296687, + "loss": 0.90180707, + "num_input_tokens_seen": 320492640, + "router_z_loss_mlp": 0.3996582, + "step": 3865, + "time_per_iteration": 2.6214301586151123 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050851, + "balance_loss_mlp": 1.01086855, + "epoch": 0.7437475952289342, + "flos": 674432179200.0, + "grad_norm": 0.04325039484001494, + "language_loss": 0.76593798, + "learning_rate": 0.00016254319753108604, + "loss": 0.77644652, + "num_input_tokens_seen": 320565264, + "router_z_loss_mlp": 0.3996582, + "step": 3866, + "time_per_iteration": 2.899153232574463 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047117, + "balance_loss_mlp": 1.00706291, + "epoch": 0.7439399769141978, + "flos": 771771424512.0, + "grad_norm": 0.03836259627327615, + "language_loss": 0.77282906, + "learning_rate": 0.00016231337699155492, + "loss": 0.78330016, + "num_input_tokens_seen": 320647584, + "router_z_loss_mlp": 0.40039062, + "step": 3867, + "time_per_iteration": 3.037646532058716 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046954, + "balance_loss_mlp": 1.00680459, + "epoch": 0.7441323585994614, + "flos": 649039754496.0, + "grad_norm": 0.035166098424979836, + "language_loss": 0.78786439, + "learning_rate": 0.0001620836875535977, + "loss": 0.79833388, + "num_input_tokens_seen": 320722752, + "router_z_loss_mlp": 0.40136719, + "step": 3868, + "time_per_iteration": 2.850342273712158 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044105, + "balance_loss_mlp": 1.00385988, + "epoch": 0.7443247402847248, + "flos": 566501254656.0, + "grad_norm": 0.03170658148117992, + "language_loss": 0.81203747, + "learning_rate": 0.00016185412930638766, + "loss": 0.82247853, + "num_input_tokens_seen": 320802496, + "router_z_loss_mlp": 0.40234375, + "step": 3869, + "time_per_iteration": 2.845094680786133 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042983, + "balance_loss_mlp": 1.00283337, + "epoch": 0.7445171219699884, + "flos": 579680585472.0, + "grad_norm": 0.03566273998402668, + "language_loss": 0.8328712, + "learning_rate": 0.00016162470233904765, + "loss": 0.843301, + "num_input_tokens_seen": 320872496, + "router_z_loss_mlp": 0.40136719, + "step": 3870, + "time_per_iteration": 2.720104217529297 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043223, + "balance_loss_mlp": 1.00304985, + "epoch": 0.744709503655252, + "flos": 620030257920.0, + "grad_norm": 0.03479057330030947, + "language_loss": 0.82728422, + "learning_rate": 0.00016139540674064856, + "loss": 0.83771646, + "num_input_tokens_seen": 320944992, + "router_z_loss_mlp": 0.40161133, + "step": 3871, + "time_per_iteration": 2.7673120498657227 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042095, + "balance_loss_mlp": 1.00208843, + "epoch": 0.7449018853405156, + "flos": 529681171200.0, + "grad_norm": 0.03196452770059439, + "language_loss": 0.78282529, + "learning_rate": 0.00016116624260021113, + "loss": 0.79324627, + "num_input_tokens_seen": 321020208, + "router_z_loss_mlp": 0.39990234, + "step": 3872, + "time_per_iteration": 2.7602975368499756 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042056, + "balance_loss_mlp": 1.00197768, + "epoch": 0.7450942670257792, + "flos": 434223855360.0, + "grad_norm": 0.03942691463996184, + "language_loss": 0.84282726, + "learning_rate": 0.0001609372100067046, + "loss": 0.85324788, + "num_input_tokens_seen": 321085984, + "router_z_loss_mlp": 0.40063477, + "step": 3873, + "time_per_iteration": 2.557443618774414 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043349, + "balance_loss_mlp": 1.00324678, + "epoch": 0.7452866487110427, + "flos": 698166250752.0, + "grad_norm": 0.03979606180562333, + "language_loss": 0.85209823, + "learning_rate": 0.0001607083090490475, + "loss": 0.86253166, + "num_input_tokens_seen": 321163200, + "router_z_loss_mlp": 0.40087891, + "step": 3874, + "time_per_iteration": 2.9215829372406006 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042214, + "balance_loss_mlp": 1.00213587, + "epoch": 0.7454790303963063, + "flos": 513280398336.0, + "grad_norm": 0.038948732221191794, + "language_loss": 0.80756831, + "learning_rate": 0.00016047953981610714, + "loss": 0.81799042, + "num_input_tokens_seen": 321237328, + "router_z_loss_mlp": 0.40063477, + "step": 3875, + "time_per_iteration": 2.7751615047454834 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042316, + "balance_loss_mlp": 1.00331116, + "epoch": 0.7456714120815698, + "flos": 1328876637696.0, + "grad_norm": 0.007608844356592571, + "language_loss": 0.7972964, + "learning_rate": 0.00016025090239669916, + "loss": 0.80771959, + "num_input_tokens_seen": 321456192, + "router_z_loss_mlp": 0.38964844, + "step": 3876, + "time_per_iteration": 4.963236331939697 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104273, + "balance_loss_mlp": 1.00258029, + "epoch": 0.7458637937668334, + "flos": 722972517120.0, + "grad_norm": 0.03405336651276997, + "language_loss": 0.81492639, + "learning_rate": 0.0001600223968795889, + "loss": 0.82535368, + "num_input_tokens_seen": 321530560, + "router_z_loss_mlp": 0.40136719, + "step": 3877, + "time_per_iteration": 2.910365581512451 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01040028, + "balance_loss_mlp": 1.00102234, + "epoch": 0.746056175452097, + "flos": 1504869014784.0, + "grad_norm": 0.004565558570820898, + "language_loss": 0.75696075, + "learning_rate": 0.00015979402335349004, + "loss": 0.76736104, + "num_input_tokens_seen": 321760928, + "router_z_loss_mlp": 0.38964844, + "step": 3878, + "time_per_iteration": 4.932594060897827 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042019, + "balance_loss_mlp": 1.00196517, + "epoch": 0.7462485571373605, + "flos": 521295119616.0, + "grad_norm": 0.03746689938213739, + "language_loss": 0.82366681, + "learning_rate": 0.00015956578190706483, + "loss": 0.83408701, + "num_input_tokens_seen": 321833248, + "router_z_loss_mlp": 0.40039062, + "step": 3879, + "time_per_iteration": 2.6971168518066406 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043092, + "balance_loss_mlp": 1.00282276, + "epoch": 0.7464409388226241, + "flos": 482167341312.0, + "grad_norm": 0.03527801182694915, + "language_loss": 0.76289219, + "learning_rate": 0.00015933767262892468, + "loss": 0.77332312, + "num_input_tokens_seen": 321905904, + "router_z_loss_mlp": 0.40258789, + "step": 3880, + "time_per_iteration": 2.739508628845215 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043029, + "balance_loss_mlp": 1.00290275, + "epoch": 0.7466333205078877, + "flos": 487742164992.0, + "grad_norm": 0.04213099092543845, + "language_loss": 0.82585847, + "learning_rate": 0.00015910969560762927, + "loss": 0.83628881, + "num_input_tokens_seen": 321971920, + "router_z_loss_mlp": 0.40112305, + "step": 3881, + "time_per_iteration": 2.562812089920044 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01041177, + "balance_loss_mlp": 1.00102758, + "epoch": 0.7468257021931513, + "flos": 612408254208.0, + "grad_norm": 0.03436500005268551, + "language_loss": 0.83349586, + "learning_rate": 0.00015888185093168727, + "loss": 0.84390759, + "num_input_tokens_seen": 322041904, + "router_z_loss_mlp": 0.40136719, + "step": 3882, + "time_per_iteration": 2.775710105895996 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044594, + "balance_loss_mlp": 1.00434875, + "epoch": 0.7470180838784147, + "flos": 534485144064.0, + "grad_norm": 0.033392076126709996, + "language_loss": 0.81580567, + "learning_rate": 0.00015865413868955581, + "loss": 0.82625163, + "num_input_tokens_seen": 322110816, + "router_z_loss_mlp": 0.40234375, + "step": 3883, + "time_per_iteration": 2.641209125518799 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042889, + "balance_loss_mlp": 1.00276339, + "epoch": 0.7472104655636783, + "flos": 740673918720.0, + "grad_norm": 0.03165690169757385, + "language_loss": 0.83215499, + "learning_rate": 0.00015842655896964054, + "loss": 0.84258389, + "num_input_tokens_seen": 322192704, + "router_z_loss_mlp": 0.40112305, + "step": 3884, + "time_per_iteration": 3.0401206016540527 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042132, + "balance_loss_mlp": 1.00191045, + "epoch": 0.7474028472489419, + "flos": 641502321408.0, + "grad_norm": 0.03740320780985832, + "language_loss": 0.74281669, + "learning_rate": 0.00015819911186029567, + "loss": 0.75323802, + "num_input_tokens_seen": 322263888, + "router_z_loss_mlp": 0.40209961, + "step": 3885, + "time_per_iteration": 2.7730581760406494 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104359, + "balance_loss_mlp": 1.00332129, + "epoch": 0.7475952289342055, + "flos": 591326962944.0, + "grad_norm": 0.03361665798046632, + "language_loss": 0.8701033, + "learning_rate": 0.00015797179744982443, + "loss": 0.88053918, + "num_input_tokens_seen": 322331936, + "router_z_loss_mlp": 0.40258789, + "step": 3886, + "time_per_iteration": 2.708472967147827 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043352, + "balance_loss_mlp": 1.00303507, + "epoch": 0.7477876106194691, + "flos": 489220683264.0, + "grad_norm": 0.029904604338816032, + "language_loss": 0.79095513, + "learning_rate": 0.00015774461582647765, + "loss": 0.80138862, + "num_input_tokens_seen": 322402032, + "router_z_loss_mlp": 0.40307617, + "step": 3887, + "time_per_iteration": 2.619105100631714 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044379, + "balance_loss_mlp": 1.00406253, + "epoch": 0.7479799923047326, + "flos": 555790011648.0, + "grad_norm": 0.036783241933874694, + "language_loss": 0.81563759, + "learning_rate": 0.00015751756707845505, + "loss": 0.82608134, + "num_input_tokens_seen": 322472512, + "router_z_loss_mlp": 0.40307617, + "step": 3888, + "time_per_iteration": 2.639768123626709 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01041973, + "balance_loss_mlp": 1.00170422, + "epoch": 0.7481723739899961, + "flos": 768791055360.0, + "grad_norm": 0.03246382733666718, + "language_loss": 0.88938636, + "learning_rate": 0.00015729065129390502, + "loss": 0.89980614, + "num_input_tokens_seen": 322555104, + "router_z_loss_mlp": 0.40258789, + "step": 3889, + "time_per_iteration": 3.0039196014404297 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01041937, + "balance_loss_mlp": 1.00162077, + "epoch": 0.7483647556752597, + "flos": 497161527552.0, + "grad_norm": 0.037416161983298064, + "language_loss": 0.82518947, + "learning_rate": 0.0001570638685609241, + "loss": 0.83560884, + "num_input_tokens_seen": 322621904, + "router_z_loss_mlp": 0.40307617, + "step": 3890, + "time_per_iteration": 2.6009106636047363 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042747, + "balance_loss_mlp": 1.00238311, + "epoch": 0.7485571373605233, + "flos": 473826976512.0, + "grad_norm": 0.0374886975546847, + "language_loss": 0.80841064, + "learning_rate": 0.00015683721896755693, + "loss": 0.81883812, + "num_input_tokens_seen": 322688928, + "router_z_loss_mlp": 0.40356445, + "step": 3891, + "time_per_iteration": 2.5633225440979004 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050194, + "balance_loss_mlp": 1.0109024, + "epoch": 0.7487495190457868, + "flos": 1557901287936.0, + "grad_norm": 0.009107033640044568, + "language_loss": 0.82210493, + "learning_rate": 0.00015661070260179682, + "loss": 0.83260679, + "num_input_tokens_seen": 322928464, + "router_z_loss_mlp": 0.39257812, + "step": 3892, + "time_per_iteration": 4.94974160194397 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046825, + "balance_loss_mlp": 1.00665128, + "epoch": 0.7489419007310504, + "flos": 582967156224.0, + "grad_norm": 0.04143959916189291, + "language_loss": 0.85828441, + "learning_rate": 0.00015638431955158528, + "loss": 0.8687526, + "num_input_tokens_seen": 323002672, + "router_z_loss_mlp": 0.40161133, + "step": 3893, + "time_per_iteration": 2.6816978454589844 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047314, + "balance_loss_mlp": 1.0072118, + "epoch": 0.749134282416314, + "flos": 568698134016.0, + "grad_norm": 0.030135437984765083, + "language_loss": 0.81634343, + "learning_rate": 0.00015615806990481186, + "loss": 0.82681662, + "num_input_tokens_seen": 323076480, + "router_z_loss_mlp": 0.40087891, + "step": 3894, + "time_per_iteration": 2.7294962406158447 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046959, + "balance_loss_mlp": 1.0068568, + "epoch": 0.7493266641015776, + "flos": 534166303488.0, + "grad_norm": 0.0348465154646137, + "language_loss": 0.84720361, + "learning_rate": 0.00015593195374931452, + "loss": 0.85767317, + "num_input_tokens_seen": 323151840, + "router_z_loss_mlp": 0.40087891, + "step": 3895, + "time_per_iteration": 2.7430076599121094 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047, + "balance_loss_mlp": 1.00685048, + "epoch": 0.7495190457868411, + "flos": 524718750720.0, + "grad_norm": 0.040656951694221274, + "language_loss": 0.80276871, + "learning_rate": 0.00015570597117287922, + "loss": 0.81323874, + "num_input_tokens_seen": 323223376, + "router_z_loss_mlp": 0.40136719, + "step": 3896, + "time_per_iteration": 2.6507298946380615 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01041154, + "balance_loss_mlp": 1.00107622, + "epoch": 0.7497114274721046, + "flos": 515190518016.0, + "grad_norm": 0.03462966662761621, + "language_loss": 0.78418148, + "learning_rate": 0.0001554801222632406, + "loss": 0.79459298, + "num_input_tokens_seen": 323290288, + "router_z_loss_mlp": 0.40063477, + "step": 3897, + "time_per_iteration": 2.595093250274658 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042035, + "balance_loss_mlp": 1.00186181, + "epoch": 0.7499038091573682, + "flos": 495997959168.0, + "grad_norm": 0.03336183376647943, + "language_loss": 0.85394609, + "learning_rate": 0.00015525440710808052, + "loss": 0.86436647, + "num_input_tokens_seen": 323359568, + "router_z_loss_mlp": 0.40161133, + "step": 3898, + "time_per_iteration": 2.643571376800537 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043313, + "balance_loss_mlp": 1.00302041, + "epoch": 0.7500961908426318, + "flos": 738989320704.0, + "grad_norm": 0.03519199778666105, + "language_loss": 0.78480381, + "learning_rate": 0.00015502882579502953, + "loss": 0.79523695, + "num_input_tokens_seen": 323436688, + "router_z_loss_mlp": 0.40283203, + "step": 3899, + "time_per_iteration": 2.97965669631958 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043279, + "balance_loss_mlp": 1.00303352, + "epoch": 0.7502885725278954, + "flos": 534537633792.0, + "grad_norm": 0.03091865582012727, + "language_loss": 0.85061979, + "learning_rate": 0.00015480337841166592, + "loss": 0.86105257, + "num_input_tokens_seen": 323510032, + "router_z_loss_mlp": 0.40234375, + "step": 3900, + "time_per_iteration": 2.7444653511047363 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043426, + "balance_loss_mlp": 1.00322855, + "epoch": 0.7504809542131589, + "flos": 590559024384.0, + "grad_norm": 0.034641340110691664, + "language_loss": 0.83055896, + "learning_rate": 0.00015457806504551647, + "loss": 0.84099317, + "num_input_tokens_seen": 323588896, + "router_z_loss_mlp": 0.40185547, + "step": 3901, + "time_per_iteration": 2.847846269607544 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047237, + "balance_loss_mlp": 1.0071348, + "epoch": 0.7506733358984224, + "flos": 512583424512.0, + "grad_norm": 0.03350221131006084, + "language_loss": 0.78925437, + "learning_rate": 0.0001543528857840554, + "loss": 0.79972672, + "num_input_tokens_seen": 323661280, + "router_z_loss_mlp": 0.40087891, + "step": 3902, + "time_per_iteration": 2.6609957218170166 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047342, + "balance_loss_mlp": 1.00728762, + "epoch": 0.750865717583686, + "flos": 540383665920.0, + "grad_norm": 0.03644816467758723, + "language_loss": 0.80910051, + "learning_rate": 0.000154127840714705, + "loss": 0.81957394, + "num_input_tokens_seen": 323739200, + "router_z_loss_mlp": 0.40039062, + "step": 3903, + "time_per_iteration": 2.778198003768921 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048232, + "balance_loss_mlp": 1.00810659, + "epoch": 0.7510580992689496, + "flos": 477541257984.0, + "grad_norm": 0.040090358516612946, + "language_loss": 0.8254571, + "learning_rate": 0.00015390292992483557, + "loss": 0.83593941, + "num_input_tokens_seen": 323802816, + "router_z_loss_mlp": 0.40112305, + "step": 3904, + "time_per_iteration": 2.5382485389709473 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047868, + "balance_loss_mlp": 1.0078609, + "epoch": 0.7512504809542132, + "flos": 580201615104.0, + "grad_norm": 0.03358602757025677, + "language_loss": 0.84426451, + "learning_rate": 0.00015367815350176523, + "loss": 0.85474312, + "num_input_tokens_seen": 323879488, + "router_z_loss_mlp": 0.39990234, + "step": 3905, + "time_per_iteration": 2.741651773452759 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104803, + "balance_loss_mlp": 1.00804722, + "epoch": 0.7514428626394767, + "flos": 419564060928.0, + "grad_norm": 0.03247714739847641, + "language_loss": 0.83377486, + "learning_rate": 0.00015345351153275987, + "loss": 0.84425521, + "num_input_tokens_seen": 323944512, + "router_z_loss_mlp": 0.3996582, + "step": 3906, + "time_per_iteration": 2.5285587310791016 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01041564, + "balance_loss_mlp": 1.0013901, + "epoch": 0.7516352443247403, + "flos": 642255675648.0, + "grad_norm": 0.03199624670716249, + "language_loss": 0.81475991, + "learning_rate": 0.00015322900410503332, + "loss": 0.82517552, + "num_input_tokens_seen": 324020688, + "router_z_loss_mlp": 0.40161133, + "step": 3907, + "time_per_iteration": 2.814133405685425 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043835, + "balance_loss_mlp": 1.00366187, + "epoch": 0.7518276260100039, + "flos": 582192414720.0, + "grad_norm": 0.03412627966929826, + "language_loss": 0.77873939, + "learning_rate": 0.00015300463130574703, + "loss": 0.78917778, + "num_input_tokens_seen": 324098080, + "router_z_loss_mlp": 0.40161133, + "step": 3908, + "time_per_iteration": 2.909247875213623 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042204, + "balance_loss_mlp": 1.00210238, + "epoch": 0.7520200076952674, + "flos": 688616630784.0, + "grad_norm": 0.028908861637072923, + "language_loss": 0.82461572, + "learning_rate": 0.00015278039322201033, + "loss": 0.83503771, + "num_input_tokens_seen": 324183968, + "router_z_loss_mlp": 0.40087891, + "step": 3909, + "time_per_iteration": 2.9831857681274414 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044247, + "balance_loss_mlp": 1.00419319, + "epoch": 0.7522123893805309, + "flos": 487416521472.0, + "grad_norm": 0.03727501857461446, + "language_loss": 0.8023864, + "learning_rate": 0.00015255628994088004, + "loss": 0.8128289, + "num_input_tokens_seen": 324249568, + "router_z_loss_mlp": 0.40039062, + "step": 3910, + "time_per_iteration": 2.5681653022766113 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104353, + "balance_loss_mlp": 1.00352335, + "epoch": 0.7524047710657945, + "flos": 820592686080.0, + "grad_norm": 0.03692479601662457, + "language_loss": 0.75641394, + "learning_rate": 0.00015233232154936082, + "loss": 0.76684928, + "num_input_tokens_seen": 324345312, + "router_z_loss_mlp": 0.39990234, + "step": 3911, + "time_per_iteration": 3.284299612045288 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046246, + "balance_loss_mlp": 1.00616753, + "epoch": 0.7525971527510581, + "flos": 700782092544.0, + "grad_norm": 0.03573003611692562, + "language_loss": 0.76908588, + "learning_rate": 0.0001521084881344048, + "loss": 0.77954835, + "num_input_tokens_seen": 324419056, + "router_z_loss_mlp": 0.40063477, + "step": 3912, + "time_per_iteration": 2.8574602603912354 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01041534, + "balance_loss_mlp": 1.00155079, + "epoch": 0.7527895344363217, + "flos": 634950567168.0, + "grad_norm": 0.03264325310237669, + "language_loss": 0.8679074, + "learning_rate": 0.00015188478978291208, + "loss": 0.87832272, + "num_input_tokens_seen": 324490848, + "router_z_loss_mlp": 0.3996582, + "step": 3913, + "time_per_iteration": 2.7522592544555664 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01041792, + "balance_loss_mlp": 1.00173748, + "epoch": 0.7529819161215853, + "flos": 563933044992.0, + "grad_norm": 0.03193556827495635, + "language_loss": 0.86971831, + "learning_rate": 0.00015166122658173014, + "loss": 0.88013625, + "num_input_tokens_seen": 324565648, + "router_z_loss_mlp": 0.40039062, + "step": 3914, + "time_per_iteration": 2.8044931888580322 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042313, + "balance_loss_mlp": 1.00230658, + "epoch": 0.7531742978068487, + "flos": 691957636608.0, + "grad_norm": 0.032939092122736, + "language_loss": 0.89373708, + "learning_rate": 0.00015143779861765332, + "loss": 0.90416014, + "num_input_tokens_seen": 324642832, + "router_z_loss_mlp": 0.39990234, + "step": 3915, + "time_per_iteration": 2.895873546600342 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042622, + "balance_loss_mlp": 1.00266302, + "epoch": 0.7533666794921123, + "flos": 682307894784.0, + "grad_norm": 0.030283450917942635, + "language_loss": 0.81763279, + "learning_rate": 0.00015121450597742458, + "loss": 0.82805902, + "num_input_tokens_seen": 324718336, + "router_z_loss_mlp": 0.39941406, + "step": 3916, + "time_per_iteration": 2.8187012672424316 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042765, + "balance_loss_mlp": 1.00275862, + "epoch": 0.7535590611773759, + "flos": 624814788864.0, + "grad_norm": 0.03530069245734392, + "language_loss": 0.79033458, + "learning_rate": 0.00015099134874773369, + "loss": 0.80076224, + "num_input_tokens_seen": 324787744, + "router_z_loss_mlp": 0.39990234, + "step": 3917, + "time_per_iteration": 2.729224443435669 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042185, + "balance_loss_mlp": 1.0022496, + "epoch": 0.7537514428626395, + "flos": 520494133248.0, + "grad_norm": 0.030735782054698856, + "language_loss": 0.80733752, + "learning_rate": 0.00015076832701521793, + "loss": 0.81775939, + "num_input_tokens_seen": 324863280, + "router_z_loss_mlp": 0.39916992, + "step": 3918, + "time_per_iteration": 2.7341344356536865 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104636, + "balance_loss_mlp": 1.00632989, + "epoch": 0.753943824547903, + "flos": 725035248384.0, + "grad_norm": 0.03833991263993651, + "language_loss": 0.82337809, + "learning_rate": 0.000150545440866462, + "loss": 0.83384174, + "num_input_tokens_seen": 324949600, + "router_z_loss_mlp": 0.40014648, + "step": 3919, + "time_per_iteration": 2.988217353820801 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045499, + "balance_loss_mlp": 1.00544465, + "epoch": 0.7541362062331666, + "flos": 438467914752.0, + "grad_norm": 0.03907672700659196, + "language_loss": 0.78807712, + "learning_rate": 0.000150322690387998, + "loss": 0.79853213, + "num_input_tokens_seen": 325013808, + "router_z_loss_mlp": 0.40039062, + "step": 3920, + "time_per_iteration": 2.503204107284546 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048491, + "balance_loss_mlp": 1.00841296, + "epoch": 0.7543285879184302, + "flos": 566344752384.0, + "grad_norm": 0.03511209658305934, + "language_loss": 0.7581147, + "learning_rate": 0.00015010007566630535, + "loss": 0.76859963, + "num_input_tokens_seen": 325084832, + "router_z_loss_mlp": 0.40063477, + "step": 3921, + "time_per_iteration": 2.785719633102417 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046349, + "balance_loss_mlp": 1.00624704, + "epoch": 0.7545209696036937, + "flos": 522059167488.0, + "grad_norm": 0.043005780548435554, + "language_loss": 0.81968284, + "learning_rate": 0.00014987759678781077, + "loss": 0.83014631, + "num_input_tokens_seen": 325155120, + "router_z_loss_mlp": 0.40087891, + "step": 3922, + "time_per_iteration": 2.611830711364746 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045351, + "balance_loss_mlp": 1.00524938, + "epoch": 0.7547133512889573, + "flos": 617210281728.0, + "grad_norm": 0.034097045182419745, + "language_loss": 0.82924581, + "learning_rate": 0.00014965525383888795, + "loss": 0.83969939, + "num_input_tokens_seen": 325235632, + "router_z_loss_mlp": 0.40087891, + "step": 3923, + "time_per_iteration": 2.7791478633880615 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104536, + "balance_loss_mlp": 1.00532925, + "epoch": 0.7549057329742208, + "flos": 752142406656.0, + "grad_norm": 0.03232128162967594, + "language_loss": 0.72664821, + "learning_rate": 0.00014943304690585851, + "loss": 0.73710179, + "num_input_tokens_seen": 325309696, + "router_z_loss_mlp": 0.40014648, + "step": 3924, + "time_per_iteration": 2.8950600624084473 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047725, + "balance_loss_mlp": 1.00767088, + "epoch": 0.7550981146594844, + "flos": 515451032832.0, + "grad_norm": 0.03846404623424841, + "language_loss": 0.79993105, + "learning_rate": 0.0001492109760749908, + "loss": 0.81040823, + "num_input_tokens_seen": 325375744, + "router_z_loss_mlp": 0.40039062, + "step": 3925, + "time_per_iteration": 2.582379102706909 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047733, + "balance_loss_mlp": 1.00763071, + "epoch": 0.755290496344748, + "flos": 523027349760.0, + "grad_norm": 0.03160852953683284, + "language_loss": 0.80470473, + "learning_rate": 0.00014898904143250002, + "loss": 0.81518203, + "num_input_tokens_seen": 325448384, + "router_z_loss_mlp": 0.40087891, + "step": 3926, + "time_per_iteration": 2.642066240310669 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047142, + "balance_loss_mlp": 1.00804138, + "epoch": 0.7554828780300116, + "flos": 1417706376960.0, + "grad_norm": 0.005903328707274883, + "language_loss": 0.75755203, + "learning_rate": 0.00014876724306454886, + "loss": 0.76802349, + "num_input_tokens_seen": 325678672, + "router_z_loss_mlp": 0.390625, + "step": 3927, + "time_per_iteration": 4.909573793411255 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045265, + "balance_loss_mlp": 1.00513911, + "epoch": 0.7556752597152752, + "flos": 557986891008.0, + "grad_norm": 0.0318859682760306, + "language_loss": 0.80794632, + "learning_rate": 0.0001485455810572474, + "loss": 0.81839895, + "num_input_tokens_seen": 325746656, + "router_z_loss_mlp": 0.40112305, + "step": 3928, + "time_per_iteration": 2.635267734527588 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044887, + "balance_loss_mlp": 1.00466609, + "epoch": 0.7558676414005386, + "flos": 564742779648.0, + "grad_norm": 0.029085057110465686, + "language_loss": 0.84313619, + "learning_rate": 0.00014832405549665236, + "loss": 0.853585, + "num_input_tokens_seen": 325820304, + "router_z_loss_mlp": 0.40209961, + "step": 3929, + "time_per_iteration": 2.7366552352905273 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104529, + "balance_loss_mlp": 1.00514007, + "epoch": 0.7560600230858022, + "flos": 562535206656.0, + "grad_norm": 0.03398651483995001, + "language_loss": 0.79036754, + "learning_rate": 0.00014810266646876746, + "loss": 0.80082047, + "num_input_tokens_seen": 325895584, + "router_z_loss_mlp": 0.40136719, + "step": 3930, + "time_per_iteration": 2.748523712158203 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045678, + "balance_loss_mlp": 1.00550461, + "epoch": 0.7562524047710658, + "flos": 720958384896.0, + "grad_norm": 0.03398387115243252, + "language_loss": 0.78128892, + "learning_rate": 0.00014788141405954364, + "loss": 0.79174572, + "num_input_tokens_seen": 325976752, + "router_z_loss_mlp": 0.40161133, + "step": 3931, + "time_per_iteration": 3.0010688304901123 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046674, + "balance_loss_mlp": 1.0064286, + "epoch": 0.7564447864563294, + "flos": 544397346048.0, + "grad_norm": 0.04087931734394053, + "language_loss": 0.85259515, + "learning_rate": 0.00014766029835487865, + "loss": 0.8630619, + "num_input_tokens_seen": 326047152, + "router_z_loss_mlp": 0.40234375, + "step": 3932, + "time_per_iteration": 2.7051644325256348 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045407, + "balance_loss_mlp": 1.00528109, + "epoch": 0.7566371681415929, + "flos": 727095067392.0, + "grad_norm": 0.040524003150174424, + "language_loss": 0.80254388, + "learning_rate": 0.0001474393194406173, + "loss": 0.812998, + "num_input_tokens_seen": 326119056, + "router_z_loss_mlp": 0.40112305, + "step": 3933, + "time_per_iteration": 2.88698410987854 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045593, + "balance_loss_mlp": 1.00546694, + "epoch": 0.7568295498268565, + "flos": 577807404288.0, + "grad_norm": 0.03205492443871288, + "language_loss": 0.80140668, + "learning_rate": 0.00014721847740255112, + "loss": 0.81186259, + "num_input_tokens_seen": 326196736, + "router_z_loss_mlp": 0.40112305, + "step": 3934, + "time_per_iteration": 2.8201425075531006 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042671, + "balance_loss_mlp": 1.00357056, + "epoch": 0.75702193151212, + "flos": 1523218791168.0, + "grad_norm": 0.006266777740012466, + "language_loss": 0.73911923, + "learning_rate": 0.00014699777232641853, + "loss": 0.74954593, + "num_input_tokens_seen": 326404752, + "router_z_loss_mlp": 0.390625, + "step": 3935, + "time_per_iteration": 4.622663736343384 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01041056, + "balance_loss_mlp": 1.00100183, + "epoch": 0.7572143131973836, + "flos": 526489864704.0, + "grad_norm": 0.04266541401518767, + "language_loss": 0.78904128, + "learning_rate": 0.00014677720429790526, + "loss": 0.79945183, + "num_input_tokens_seen": 326472832, + "router_z_loss_mlp": 0.40039062, + "step": 3936, + "time_per_iteration": 2.5691311359405518 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104174, + "balance_loss_mlp": 1.00159049, + "epoch": 0.7574066948826472, + "flos": 551823963648.0, + "grad_norm": 0.029232134856981343, + "language_loss": 0.85000217, + "learning_rate": 0.0001465567734026429, + "loss": 0.86041951, + "num_input_tokens_seen": 326546976, + "router_z_loss_mlp": 0.40136719, + "step": 3937, + "time_per_iteration": 2.6958813667297363 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045039, + "balance_loss_mlp": 1.00488961, + "epoch": 0.7575990765679107, + "flos": 396769981440.0, + "grad_norm": 0.04157992306337891, + "language_loss": 0.82874024, + "learning_rate": 0.00014633647972621034, + "loss": 0.83919066, + "num_input_tokens_seen": 326609296, + "router_z_loss_mlp": 0.40136719, + "step": 3938, + "time_per_iteration": 2.443800449371338 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045648, + "balance_loss_mlp": 1.00556958, + "epoch": 0.7577914582531743, + "flos": 586186652928.0, + "grad_norm": 0.031504909373110845, + "language_loss": 0.86987495, + "learning_rate": 0.00014611632335413354, + "loss": 0.8803314, + "num_input_tokens_seen": 326687168, + "router_z_loss_mlp": 0.40063477, + "step": 3939, + "time_per_iteration": 2.7657620906829834 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043556, + "balance_loss_mlp": 1.00357366, + "epoch": 0.7579838399384379, + "flos": 822485309184.0, + "grad_norm": 0.033895333971604005, + "language_loss": 0.83048445, + "learning_rate": 0.00014589630437188456, + "loss": 0.84091997, + "num_input_tokens_seen": 326777760, + "router_z_loss_mlp": 0.3996582, + "step": 3940, + "time_per_iteration": 3.1827540397644043 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045396, + "balance_loss_mlp": 1.00527036, + "epoch": 0.7581762216237015, + "flos": 444806786304.0, + "grad_norm": 0.03886523682666057, + "language_loss": 0.78962266, + "learning_rate": 0.00014567642286488253, + "loss": 0.8000766, + "num_input_tokens_seen": 326843952, + "router_z_loss_mlp": 0.40112305, + "step": 3941, + "time_per_iteration": 2.5701324939727783 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045219, + "balance_loss_mlp": 1.00506926, + "epoch": 0.7583686033089649, + "flos": 541939951872.0, + "grad_norm": 0.03861315862447661, + "language_loss": 0.79739159, + "learning_rate": 0.00014545667891849258, + "loss": 0.8078438, + "num_input_tokens_seen": 326911296, + "router_z_loss_mlp": 0.40136719, + "step": 3942, + "time_per_iteration": 2.6185083389282227 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046879, + "balance_loss_mlp": 1.00675285, + "epoch": 0.7585609849942285, + "flos": 523613508096.0, + "grad_norm": 0.03344324045472487, + "language_loss": 0.82940769, + "learning_rate": 0.00014523707261802733, + "loss": 0.83987653, + "num_input_tokens_seen": 326977776, + "router_z_loss_mlp": 0.40112305, + "step": 3943, + "time_per_iteration": 2.615499973297119 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045818, + "balance_loss_mlp": 1.00564396, + "epoch": 0.7587533666794921, + "flos": 542908134144.0, + "grad_norm": 0.03989389594451329, + "language_loss": 0.81696534, + "learning_rate": 0.00014501760404874527, + "loss": 0.82742351, + "num_input_tokens_seen": 327050240, + "router_z_loss_mlp": 0.40161133, + "step": 3944, + "time_per_iteration": 2.7015254497528076 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047263, + "balance_loss_mlp": 1.00713706, + "epoch": 0.7589457483647557, + "flos": 607521656064.0, + "grad_norm": 0.037013243760391015, + "language_loss": 0.86645532, + "learning_rate": 0.00014479827329585176, + "loss": 0.87692797, + "num_input_tokens_seen": 327119952, + "router_z_loss_mlp": 0.40112305, + "step": 3945, + "time_per_iteration": 2.707260847091675 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048064, + "balance_loss_mlp": 1.008057, + "epoch": 0.7591381300500193, + "flos": 556252715520.0, + "grad_norm": 0.030362278965781222, + "language_loss": 0.85217047, + "learning_rate": 0.00014457908044449846, + "loss": 0.86265111, + "num_input_tokens_seen": 327192640, + "router_z_loss_mlp": 0.39990234, + "step": 3946, + "time_per_iteration": 2.7425830364227295 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048648, + "balance_loss_mlp": 1.00868881, + "epoch": 0.7593305117352828, + "flos": 530814604032.0, + "grad_norm": 0.0320699776647955, + "language_loss": 0.83156931, + "learning_rate": 0.00014436002557978371, + "loss": 0.8420558, + "num_input_tokens_seen": 327271008, + "router_z_loss_mlp": 0.39941406, + "step": 3947, + "time_per_iteration": 2.852153778076172 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052135, + "balance_loss_mlp": 1.01313019, + "epoch": 0.7595228934205464, + "flos": 1505925658368.0, + "grad_norm": 0.007143494000939788, + "language_loss": 0.76643145, + "learning_rate": 0.00014414110878675201, + "loss": 0.77695286, + "num_input_tokens_seen": 327505392, + "router_z_loss_mlp": 0.38964844, + "step": 3948, + "time_per_iteration": 4.8901238441467285 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043417, + "balance_loss_mlp": 1.00338674, + "epoch": 0.7597152751058099, + "flos": 456468715008.0, + "grad_norm": 0.03356126441084979, + "language_loss": 0.80132592, + "learning_rate": 0.0001439223301503945, + "loss": 0.81176007, + "num_input_tokens_seen": 327569392, + "router_z_loss_mlp": 0.40014648, + "step": 3949, + "time_per_iteration": 2.5245442390441895 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042713, + "balance_loss_mlp": 1.0028255, + "epoch": 0.7599076567910735, + "flos": 686799830016.0, + "grad_norm": 0.04215278284699455, + "language_loss": 0.76435691, + "learning_rate": 0.00014370368975564834, + "loss": 0.77478409, + "num_input_tokens_seen": 327648304, + "router_z_loss_mlp": 0.39868164, + "step": 3950, + "time_per_iteration": 3.002926826477051 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042825, + "balance_loss_mlp": 1.0027709, + "epoch": 0.760100038476337, + "flos": 533495574528.0, + "grad_norm": 0.03911832457042585, + "language_loss": 0.84080267, + "learning_rate": 0.00014348518768739766, + "loss": 0.85123098, + "num_input_tokens_seen": 327725600, + "router_z_loss_mlp": 0.40039062, + "step": 3951, + "time_per_iteration": 2.7287793159484863 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046905, + "balance_loss_mlp": 1.00780487, + "epoch": 0.7602924201616006, + "flos": 1474919526144.0, + "grad_norm": 0.009800306556812065, + "language_loss": 0.7672804, + "learning_rate": 0.00014326682403047243, + "loss": 0.77774942, + "num_input_tokens_seen": 327954048, + "router_z_loss_mlp": 0.390625, + "step": 3952, + "time_per_iteration": 4.851192951202393 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01041682, + "balance_loss_mlp": 1.00179482, + "epoch": 0.7604848018468642, + "flos": 776041728768.0, + "grad_norm": 0.043428396505350506, + "language_loss": 0.86555135, + "learning_rate": 0.00014304859886964867, + "loss": 0.87596822, + "num_input_tokens_seen": 328034656, + "router_z_loss_mlp": 0.39868164, + "step": 3953, + "time_per_iteration": 3.0201337337493896 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044206, + "balance_loss_mlp": 1.00415182, + "epoch": 0.7606771835321278, + "flos": 559261274880.0, + "grad_norm": 0.03249370950181494, + "language_loss": 0.8406316, + "learning_rate": 0.00014283051228964878, + "loss": 0.85107362, + "num_input_tokens_seen": 328107264, + "router_z_loss_mlp": 0.40039062, + "step": 3954, + "time_per_iteration": 2.6745314598083496 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046758, + "balance_loss_mlp": 1.00687051, + "epoch": 0.7608695652173914, + "flos": 526433484288.0, + "grad_norm": 0.03436460979792566, + "language_loss": 0.83105361, + "learning_rate": 0.00014261256437514197, + "loss": 0.84152114, + "num_input_tokens_seen": 328177168, + "router_z_loss_mlp": 0.39868164, + "step": 3955, + "time_per_iteration": 2.6607260704040527 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046091, + "balance_loss_mlp": 1.0060848, + "epoch": 0.7610619469026548, + "flos": 616168222464.0, + "grad_norm": 0.03814764574124358, + "language_loss": 0.82773203, + "learning_rate": 0.0001423947552107428, + "loss": 0.83819294, + "num_input_tokens_seen": 328245360, + "router_z_loss_mlp": 0.39990234, + "step": 3956, + "time_per_iteration": 2.731502056121826 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01040395, + "balance_loss_mlp": 1.00053155, + "epoch": 0.7612543285879184, + "flos": 864818978304.0, + "grad_norm": 0.03440554152429829, + "language_loss": 0.77563798, + "learning_rate": 0.00014217708488101243, + "loss": 0.78604192, + "num_input_tokens_seen": 328326560, + "router_z_loss_mlp": 0.3984375, + "step": 3957, + "time_per_iteration": 3.0592825412750244 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01040698, + "balance_loss_mlp": 1.00076258, + "epoch": 0.761446710273182, + "flos": 554728510464.0, + "grad_norm": 0.045631291273616384, + "language_loss": 0.77730322, + "learning_rate": 0.0001419595534704579, + "loss": 0.78771019, + "num_input_tokens_seen": 328395760, + "router_z_loss_mlp": 0.39916992, + "step": 3958, + "time_per_iteration": 2.693791389465332 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01041415, + "balance_loss_mlp": 1.00143242, + "epoch": 0.7616390919584456, + "flos": 468326029824.0, + "grad_norm": 0.03770259597334161, + "language_loss": 0.81622386, + "learning_rate": 0.00014174216106353237, + "loss": 0.82663804, + "num_input_tokens_seen": 328464560, + "router_z_loss_mlp": 0.3996582, + "step": 3959, + "time_per_iteration": 2.6240234375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043866, + "balance_loss_mlp": 1.00385952, + "epoch": 0.7618314736437091, + "flos": 499432283904.0, + "grad_norm": 0.036732960604225574, + "language_loss": 0.76590341, + "learning_rate": 0.00014152490774463512, + "loss": 0.77634203, + "num_input_tokens_seen": 328532640, + "router_z_loss_mlp": 0.39990234, + "step": 3960, + "time_per_iteration": 2.6385769844055176 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042369, + "balance_loss_mlp": 1.00236273, + "epoch": 0.7620238553289727, + "flos": 435452552448.0, + "grad_norm": 0.04258907673967457, + "language_loss": 0.87829125, + "learning_rate": 0.00014130779359811135, + "loss": 0.88871497, + "num_input_tokens_seen": 328595392, + "router_z_loss_mlp": 0.39990234, + "step": 3961, + "time_per_iteration": 2.530336380004883 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046467, + "balance_loss_mlp": 1.00657988, + "epoch": 0.7622162370142362, + "flos": 665542594560.0, + "grad_norm": 0.03171084912805384, + "language_loss": 0.86222768, + "learning_rate": 0.0001410908187082521, + "loss": 0.87269235, + "num_input_tokens_seen": 328676368, + "router_z_loss_mlp": 0.39868164, + "step": 3962, + "time_per_iteration": 2.8736419677734375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047801, + "balance_loss_mlp": 1.0077945, + "epoch": 0.7624086186994998, + "flos": 559028950272.0, + "grad_norm": 0.03864138857233312, + "language_loss": 0.84107929, + "learning_rate": 0.0001408739831592949, + "loss": 0.85155731, + "num_input_tokens_seen": 328745136, + "router_z_loss_mlp": 0.39990234, + "step": 3963, + "time_per_iteration": 2.639000415802002 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048183, + "balance_loss_mlp": 1.00829566, + "epoch": 0.7626010003847634, + "flos": 630287545344.0, + "grad_norm": 0.04234358402280358, + "language_loss": 0.77802932, + "learning_rate": 0.0001406572870354224, + "loss": 0.78851116, + "num_input_tokens_seen": 328820384, + "router_z_loss_mlp": 0.39868164, + "step": 3964, + "time_per_iteration": 2.855811834335327 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045894, + "balance_loss_mlp": 1.00591099, + "epoch": 0.7627933820700269, + "flos": 438849938688.0, + "grad_norm": 0.03234706292902695, + "language_loss": 0.87125206, + "learning_rate": 0.00014044073042076337, + "loss": 0.88171101, + "num_input_tokens_seen": 328884976, + "router_z_loss_mlp": 0.3996582, + "step": 3965, + "time_per_iteration": 2.5181050300598145 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045874, + "balance_loss_mlp": 1.00586748, + "epoch": 0.7629857637552905, + "flos": 533794973184.0, + "grad_norm": 0.028534394430764273, + "language_loss": 0.89329129, + "learning_rate": 0.00014022431339939302, + "loss": 0.90375006, + "num_input_tokens_seen": 328957792, + "router_z_loss_mlp": 0.39990234, + "step": 3966, + "time_per_iteration": 2.671855926513672 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046871, + "balance_loss_mlp": 1.00679314, + "epoch": 0.7631781454405541, + "flos": 681237645312.0, + "grad_norm": 0.04110089752084587, + "language_loss": 0.78748721, + "learning_rate": 0.00014000803605533163, + "loss": 0.79795587, + "num_input_tokens_seen": 329034960, + "router_z_loss_mlp": 0.40063477, + "step": 3967, + "time_per_iteration": 2.8315372467041016 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104216, + "balance_loss_mlp": 1.00203407, + "epoch": 0.7633705271258177, + "flos": 508489064448.0, + "grad_norm": 0.04146307364785201, + "language_loss": 0.8433795, + "learning_rate": 0.00013979189847254553, + "loss": 0.85380107, + "num_input_tokens_seen": 329100848, + "router_z_loss_mlp": 0.40112305, + "step": 3968, + "time_per_iteration": 2.601447582244873 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044622, + "balance_loss_mlp": 1.00454402, + "epoch": 0.7635629088110811, + "flos": 620039006208.0, + "grad_norm": 0.03458604771119312, + "language_loss": 0.81047332, + "learning_rate": 0.00013957590073494674, + "loss": 0.82091957, + "num_input_tokens_seen": 329181120, + "router_z_loss_mlp": 0.40063477, + "step": 3969, + "time_per_iteration": 2.8777170181274414 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044405, + "balance_loss_mlp": 1.00430274, + "epoch": 0.7637552904963447, + "flos": 639567902208.0, + "grad_norm": 0.03961564196889536, + "language_loss": 0.79463089, + "learning_rate": 0.0001393600429263931, + "loss": 0.80507493, + "num_input_tokens_seen": 329249888, + "router_z_loss_mlp": 0.40087891, + "step": 3970, + "time_per_iteration": 2.7422754764556885 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042656, + "balance_loss_mlp": 1.0035553, + "epoch": 0.7639476721816083, + "flos": 1566686860032.0, + "grad_norm": 0.00740169880788124, + "language_loss": 0.74744886, + "learning_rate": 0.00013914432513068792, + "loss": 0.75787538, + "num_input_tokens_seen": 329483824, + "router_z_loss_mlp": 0.390625, + "step": 3971, + "time_per_iteration": 4.935492038726807 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01040469, + "balance_loss_mlp": 1.00043809, + "epoch": 0.7641400538668719, + "flos": 497020576512.0, + "grad_norm": 0.032719762183458435, + "language_loss": 0.81907034, + "learning_rate": 0.0001389287474315804, + "loss": 0.82947505, + "num_input_tokens_seen": 329553536, + "router_z_loss_mlp": 0.40014648, + "step": 3972, + "time_per_iteration": 2.630120038986206 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046444, + "balance_loss_mlp": 1.00638986, + "epoch": 0.7643324355521355, + "flos": 579515334912.0, + "grad_norm": 0.03140885431358122, + "language_loss": 0.80818957, + "learning_rate": 0.00013871330991276505, + "loss": 0.81865394, + "num_input_tokens_seen": 329621856, + "router_z_loss_mlp": 0.40039062, + "step": 3973, + "time_per_iteration": 2.685450553894043 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042624, + "balance_loss_mlp": 1.00257015, + "epoch": 0.764524817237399, + "flos": 786233887488.0, + "grad_norm": 0.035934794543156075, + "language_loss": 0.81384689, + "learning_rate": 0.00013849801265788247, + "loss": 0.82427323, + "num_input_tokens_seen": 329708192, + "router_z_loss_mlp": 0.40039062, + "step": 3974, + "time_per_iteration": 3.039971113204956 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042366, + "balance_loss_mlp": 1.00235903, + "epoch": 0.7647171989226625, + "flos": 527299599360.0, + "grad_norm": 0.03568861441891304, + "language_loss": 0.83377182, + "learning_rate": 0.00013828285575051818, + "loss": 0.84419549, + "num_input_tokens_seen": 329774704, + "router_z_loss_mlp": 0.39990234, + "step": 3975, + "time_per_iteration": 2.6113204956054688 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042461, + "balance_loss_mlp": 1.00243056, + "epoch": 0.7649095806079261, + "flos": 556029139200.0, + "grad_norm": 0.03438397238975277, + "language_loss": 0.84555364, + "learning_rate": 0.0001380678392742035, + "loss": 0.85597825, + "num_input_tokens_seen": 329846432, + "router_z_loss_mlp": 0.40014648, + "step": 3976, + "time_per_iteration": 2.702728509902954 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042138, + "balance_loss_mlp": 1.0021317, + "epoch": 0.7651019622931897, + "flos": 650389960704.0, + "grad_norm": 0.02964586673443437, + "language_loss": 0.84697402, + "learning_rate": 0.00013785296331241526, + "loss": 0.85739541, + "num_input_tokens_seen": 329926336, + "router_z_loss_mlp": 0.39990234, + "step": 3977, + "time_per_iteration": 2.8500404357910156 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042805, + "balance_loss_mlp": 1.00282192, + "epoch": 0.7652943439784533, + "flos": 1048113551616.0, + "grad_norm": 0.03693742198159439, + "language_loss": 0.8784855, + "learning_rate": 0.00013763822794857583, + "loss": 0.88891351, + "num_input_tokens_seen": 330009536, + "router_z_loss_mlp": 0.3996582, + "step": 3978, + "time_per_iteration": 3.2964861392974854 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042764, + "balance_loss_mlp": 1.00266194, + "epoch": 0.7654867256637168, + "flos": 505415376384.0, + "grad_norm": 0.03301663266188199, + "language_loss": 0.9032107, + "learning_rate": 0.00013742363326605278, + "loss": 0.91363835, + "num_input_tokens_seen": 330083264, + "router_z_loss_mlp": 0.40087891, + "step": 3979, + "time_per_iteration": 2.7543904781341553 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042974, + "balance_loss_mlp": 1.00289607, + "epoch": 0.7656791073489804, + "flos": 575864236800.0, + "grad_norm": 0.031055895405363115, + "language_loss": 0.78887016, + "learning_rate": 0.00013720917934815935, + "loss": 0.79929984, + "num_input_tokens_seen": 330157120, + "router_z_loss_mlp": 0.40063477, + "step": 3980, + "time_per_iteration": 2.757488489151001 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043263, + "balance_loss_mlp": 1.0031848, + "epoch": 0.765871489034244, + "flos": 493792331520.0, + "grad_norm": 0.04115022529331337, + "language_loss": 0.83214378, + "learning_rate": 0.00013699486627815344, + "loss": 0.84257638, + "num_input_tokens_seen": 330224560, + "router_z_loss_mlp": 0.40063477, + "step": 3981, + "time_per_iteration": 2.6013007164001465 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043462, + "balance_loss_mlp": 1.00347948, + "epoch": 0.7660638707195075, + "flos": 487051994112.0, + "grad_norm": 0.036811021847235705, + "language_loss": 0.83011079, + "learning_rate": 0.00013678069413923928, + "loss": 0.84054542, + "num_input_tokens_seen": 330292000, + "router_z_loss_mlp": 0.3996582, + "step": 3982, + "time_per_iteration": 2.647836208343506 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042258, + "balance_loss_mlp": 1.00225163, + "epoch": 0.766256252404771, + "flos": 445243245312.0, + "grad_norm": 0.03517202501681349, + "language_loss": 0.8304435, + "learning_rate": 0.00013656666301456555, + "loss": 0.84086609, + "num_input_tokens_seen": 330357472, + "router_z_loss_mlp": 0.39990234, + "step": 3983, + "time_per_iteration": 2.5181782245635986 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045473, + "balance_loss_mlp": 1.00541902, + "epoch": 0.7664486340900346, + "flos": 486214069248.0, + "grad_norm": 0.03304538519441237, + "language_loss": 0.84839791, + "learning_rate": 0.0001363527729872267, + "loss": 0.85885262, + "num_input_tokens_seen": 330427792, + "router_z_loss_mlp": 0.40039062, + "step": 3984, + "time_per_iteration": 2.7154600620269775 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104531, + "balance_loss_mlp": 1.00527942, + "epoch": 0.7666410157752982, + "flos": 647385292032.0, + "grad_norm": 0.036051539426371945, + "language_loss": 0.77239299, + "learning_rate": 0.00013613902414026207, + "loss": 0.78284609, + "num_input_tokens_seen": 330500320, + "router_z_loss_mlp": 0.40014648, + "step": 3985, + "time_per_iteration": 2.793349027633667 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042297, + "balance_loss_mlp": 1.00238621, + "epoch": 0.7668333974605618, + "flos": 775661650176.0, + "grad_norm": 0.03427802042896287, + "language_loss": 0.82765865, + "learning_rate": 0.00013592541655665642, + "loss": 0.83808166, + "num_input_tokens_seen": 330581696, + "router_z_loss_mlp": 0.39892578, + "step": 3986, + "time_per_iteration": 2.9631149768829346 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042605, + "balance_loss_mlp": 1.00257468, + "epoch": 0.7670257791458254, + "flos": 614513760000.0, + "grad_norm": 0.03630429655058752, + "language_loss": 0.85794669, + "learning_rate": 0.00013571195031933947, + "loss": 0.86837274, + "num_input_tokens_seen": 330648000, + "router_z_loss_mlp": 0.40014648, + "step": 3987, + "time_per_iteration": 2.684053659439087 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01040779, + "balance_loss_mlp": 1.0018692, + "epoch": 0.7672181608310888, + "flos": 1488365207808.0, + "grad_norm": 0.004720848952888087, + "language_loss": 0.80481339, + "learning_rate": 0.00013549862551118626, + "loss": 0.81522119, + "num_input_tokens_seen": 330873872, + "router_z_loss_mlp": 0.38867188, + "step": 3988, + "time_per_iteration": 4.726950168609619 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042981, + "balance_loss_mlp": 1.0030216, + "epoch": 0.7674105425163524, + "flos": 611867782656.0, + "grad_norm": 0.03766281507369906, + "language_loss": 0.85887635, + "learning_rate": 0.00013528544221501655, + "loss": 0.86930621, + "num_input_tokens_seen": 330945760, + "router_z_loss_mlp": 0.39941406, + "step": 3989, + "time_per_iteration": 2.710402011871338 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043109, + "balance_loss_mlp": 1.00315046, + "epoch": 0.767602924201616, + "flos": 846605295360.0, + "grad_norm": 0.0329376529812033, + "language_loss": 0.82137692, + "learning_rate": 0.00013507240051359586, + "loss": 0.83180797, + "num_input_tokens_seen": 331025584, + "router_z_loss_mlp": 0.39941406, + "step": 3990, + "time_per_iteration": 3.0520286560058594 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043386, + "balance_loss_mlp": 1.00342703, + "epoch": 0.7677953058868796, + "flos": 528146272512.0, + "grad_norm": 0.038347091036525886, + "language_loss": 0.8687346, + "learning_rate": 0.00013485950048963425, + "loss": 0.87916845, + "num_input_tokens_seen": 331093008, + "router_z_loss_mlp": 0.39941406, + "step": 3991, + "time_per_iteration": 2.597275495529175 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105357, + "balance_loss_mlp": 1.01365852, + "epoch": 0.7679876875721431, + "flos": 925112618496.0, + "grad_norm": 0.036512387474733066, + "language_loss": 0.83205199, + "learning_rate": 0.00013464674222578643, + "loss": 0.84258771, + "num_input_tokens_seen": 331177120, + "router_z_loss_mlp": 0.39892578, + "step": 3992, + "time_per_iteration": 3.1764492988586426 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053875, + "balance_loss_mlp": 1.01389194, + "epoch": 0.7681800692574067, + "flos": 459019428096.0, + "grad_norm": 0.03635515300980307, + "language_loss": 0.83761203, + "learning_rate": 0.00013443412580465292, + "loss": 0.84815073, + "num_input_tokens_seen": 331245424, + "router_z_loss_mlp": 0.3996582, + "step": 3993, + "time_per_iteration": 2.583146810531616 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053421, + "balance_loss_mlp": 1.01348555, + "epoch": 0.7683724509426703, + "flos": 659733500928.0, + "grad_norm": 0.040381204925205964, + "language_loss": 0.84726322, + "learning_rate": 0.00013422165130877857, + "loss": 0.85779738, + "num_input_tokens_seen": 331327504, + "router_z_loss_mlp": 0.39916992, + "step": 3994, + "time_per_iteration": 2.8877995014190674 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044809, + "balance_loss_mlp": 1.00473106, + "epoch": 0.7685648326279338, + "flos": 556339231488.0, + "grad_norm": 0.052990639724004036, + "language_loss": 0.80869007, + "learning_rate": 0.00013400931882065327, + "loss": 0.81913817, + "num_input_tokens_seen": 331398464, + "router_z_loss_mlp": 0.40063477, + "step": 3995, + "time_per_iteration": 2.693859815597534 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043655, + "balance_loss_mlp": 1.00367188, + "epoch": 0.7687572143131974, + "flos": 688744942848.0, + "grad_norm": 0.032666888186809864, + "language_loss": 0.81219018, + "learning_rate": 0.0001337971284227118, + "loss": 0.82262671, + "num_input_tokens_seen": 331484592, + "router_z_loss_mlp": 0.3996582, + "step": 3996, + "time_per_iteration": 3.0207791328430176 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01041985, + "balance_loss_mlp": 1.00307465, + "epoch": 0.7689495959984609, + "flos": 1492668559872.0, + "grad_norm": 0.00690868544016345, + "language_loss": 0.76118422, + "learning_rate": 0.00013358508019733388, + "loss": 0.77160406, + "num_input_tokens_seen": 331721360, + "router_z_loss_mlp": 0.38867188, + "step": 3997, + "time_per_iteration": 4.991567134857178 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044617, + "balance_loss_mlp": 1.00463462, + "epoch": 0.7691419776837245, + "flos": 571500613632.0, + "grad_norm": 0.032008326579370035, + "language_loss": 0.80634248, + "learning_rate": 0.0001333731742268438, + "loss": 0.81678867, + "num_input_tokens_seen": 331794240, + "router_z_loss_mlp": 0.3996582, + "step": 3998, + "time_per_iteration": 2.698580026626587 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01040606, + "balance_loss_mlp": 1.00064719, + "epoch": 0.7693343593689881, + "flos": 521191107072.0, + "grad_norm": 0.03337650423069263, + "language_loss": 0.85920131, + "learning_rate": 0.0001331614105935109, + "loss": 0.86960733, + "num_input_tokens_seen": 331866496, + "router_z_loss_mlp": 0.39941406, + "step": 3999, + "time_per_iteration": 2.693692684173584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044961, + "balance_loss_mlp": 1.00495398, + "epoch": 0.7695267410542517, + "flos": 661552247040.0, + "grad_norm": 0.031590911772699855, + "language_loss": 0.84561241, + "learning_rate": 0.00013294978937954883, + "loss": 0.85606205, + "num_input_tokens_seen": 331936592, + "router_z_loss_mlp": 0.39990234, + "step": 4000, + "time_per_iteration": 2.7991349697113037 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045492, + "balance_loss_mlp": 1.00548565, + "epoch": 0.7697191227395151, + "flos": 547859860992.0, + "grad_norm": 0.04547292617376322, + "language_loss": 0.8583228, + "learning_rate": 0.00013273831066711655, + "loss": 0.86877775, + "num_input_tokens_seen": 332003536, + "router_z_loss_mlp": 0.39990234, + "step": 4001, + "time_per_iteration": 2.640451192855835 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010435, + "balance_loss_mlp": 1.00354123, + "epoch": 0.7699115044247787, + "flos": 541696933632.0, + "grad_norm": 0.030960933943813315, + "language_loss": 0.80473912, + "learning_rate": 0.00013252697453831747, + "loss": 0.8151741, + "num_input_tokens_seen": 332075248, + "router_z_loss_mlp": 0.39941406, + "step": 4002, + "time_per_iteration": 2.709754467010498 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044178, + "balance_loss_mlp": 1.00417161, + "epoch": 0.7701038861100423, + "flos": 564143982336.0, + "grad_norm": 0.03227531523104023, + "language_loss": 0.82851601, + "learning_rate": 0.00013231578107519916, + "loss": 0.83895779, + "num_input_tokens_seen": 332158944, + "router_z_loss_mlp": 0.39990234, + "step": 4003, + "time_per_iteration": 2.914151191711426 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043647, + "balance_loss_mlp": 1.0037117, + "epoch": 0.7702962677953059, + "flos": 482734057728.0, + "grad_norm": 0.0383418204368582, + "language_loss": 0.83275282, + "learning_rate": 0.00013210473035975422, + "loss": 0.84318936, + "num_input_tokens_seen": 332226368, + "router_z_loss_mlp": 0.39916992, + "step": 4004, + "time_per_iteration": 2.605908155441284 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043327, + "balance_loss_mlp": 1.0033915, + "epoch": 0.7704886494805695, + "flos": 771806417664.0, + "grad_norm": 0.03621639997578191, + "language_loss": 0.85901195, + "learning_rate": 0.0001318938224739201, + "loss": 0.8694452, + "num_input_tokens_seen": 332314784, + "router_z_loss_mlp": 0.39916992, + "step": 4005, + "time_per_iteration": 3.059812545776367 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044187, + "balance_loss_mlp": 1.00441921, + "epoch": 0.770681031165833, + "flos": 602318162688.0, + "grad_norm": 0.030887976595528478, + "language_loss": 0.84163052, + "learning_rate": 0.00013168305749957843, + "loss": 0.85207236, + "num_input_tokens_seen": 332387952, + "router_z_loss_mlp": 0.39746094, + "step": 4006, + "time_per_iteration": 2.730853796005249 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01040448, + "balance_loss_mlp": 1.00063193, + "epoch": 0.7708734128510966, + "flos": 497096398848.0, + "grad_norm": 0.03317085046195358, + "language_loss": 0.83013129, + "learning_rate": 0.00013147243551855532, + "loss": 0.84053576, + "num_input_tokens_seen": 332456352, + "router_z_loss_mlp": 0.39794922, + "step": 4007, + "time_per_iteration": 2.6102261543273926 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01040123, + "balance_loss_mlp": 1.00025964, + "epoch": 0.7710657945363601, + "flos": 568455115776.0, + "grad_norm": 0.02959339881613439, + "language_loss": 0.81033671, + "learning_rate": 0.00013126195661262148, + "loss": 0.82073796, + "num_input_tokens_seen": 332534288, + "router_z_loss_mlp": 0.3984375, + "step": 4008, + "time_per_iteration": 2.8038330078125 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042071, + "balance_loss_mlp": 1.00230277, + "epoch": 0.7712581762216237, + "flos": 605750542080.0, + "grad_norm": 0.030762375032726955, + "language_loss": 0.8689748, + "learning_rate": 0.00013105162086349216, + "loss": 0.87939554, + "num_input_tokens_seen": 332615440, + "router_z_loss_mlp": 0.39746094, + "step": 4009, + "time_per_iteration": 2.8229057788848877 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045827, + "balance_loss_mlp": 1.00593925, + "epoch": 0.7714505579068872, + "flos": 531997614336.0, + "grad_norm": 0.03203683238249966, + "language_loss": 0.86152643, + "learning_rate": 0.00013084142835282687, + "loss": 0.87198472, + "num_input_tokens_seen": 332687360, + "router_z_loss_mlp": 0.39868164, + "step": 4010, + "time_per_iteration": 2.6913058757781982 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104528, + "balance_loss_mlp": 1.00637054, + "epoch": 0.7716429395921508, + "flos": 1425382815744.0, + "grad_norm": 0.007782218935032237, + "language_loss": 0.79884362, + "learning_rate": 0.00013063137916222956, + "loss": 0.80929649, + "num_input_tokens_seen": 332919936, + "router_z_loss_mlp": 0.38867188, + "step": 4011, + "time_per_iteration": 4.785134315490723 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043373, + "balance_loss_mlp": 1.00353301, + "epoch": 0.7718353212774144, + "flos": 579587266560.0, + "grad_norm": 0.03553512598849003, + "language_loss": 0.89913195, + "learning_rate": 0.0001304214733732485, + "loss": 0.90956569, + "num_input_tokens_seen": 332990096, + "router_z_loss_mlp": 0.39819336, + "step": 4012, + "time_per_iteration": 4.228041648864746 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104725, + "balance_loss_mlp": 1.00733876, + "epoch": 0.772027702962678, + "flos": 511773689856.0, + "grad_norm": 0.036769707264373286, + "language_loss": 0.83085632, + "learning_rate": 0.00013021171106737672, + "loss": 0.8413288, + "num_input_tokens_seen": 333063616, + "router_z_loss_mlp": 0.39892578, + "step": 4013, + "time_per_iteration": 2.6609246730804443 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043913, + "balance_loss_mlp": 1.00402582, + "epoch": 0.7722200846479416, + "flos": 526748434176.0, + "grad_norm": 0.0322565513109964, + "language_loss": 0.80160201, + "learning_rate": 0.00013000209232605071, + "loss": 0.81204116, + "num_input_tokens_seen": 333136368, + "router_z_loss_mlp": 0.39868164, + "step": 4014, + "time_per_iteration": 2.6655430793762207 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042333, + "balance_loss_mlp": 1.00244582, + "epoch": 0.772412466333205, + "flos": 480602307072.0, + "grad_norm": 0.033386370052076744, + "language_loss": 0.80578887, + "learning_rate": 0.0001297926172306519, + "loss": 0.81621224, + "num_input_tokens_seen": 333207136, + "router_z_loss_mlp": 0.39868164, + "step": 4015, + "time_per_iteration": 2.6234195232391357 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042218, + "balance_loss_mlp": 1.00240195, + "epoch": 0.7726048480184686, + "flos": 907314007296.0, + "grad_norm": 0.032763935043036714, + "language_loss": 0.79579479, + "learning_rate": 0.0001295832858625055, + "loss": 0.8062169, + "num_input_tokens_seen": 333291920, + "router_z_loss_mlp": 0.39794922, + "step": 4016, + "time_per_iteration": 3.251309394836426 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042332, + "balance_loss_mlp": 1.0024209, + "epoch": 0.7727972297037322, + "flos": 632567049984.0, + "grad_norm": 0.031482852227098596, + "language_loss": 0.70049077, + "learning_rate": 0.00012937409830288154, + "loss": 0.71091413, + "num_input_tokens_seen": 333369824, + "router_z_loss_mlp": 0.39892578, + "step": 4017, + "time_per_iteration": 2.821953296661377 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043688, + "balance_loss_mlp": 1.00377691, + "epoch": 0.7729896113889958, + "flos": 415673835264.0, + "grad_norm": 0.04152117908534408, + "language_loss": 0.85515797, + "learning_rate": 0.00012916505463299362, + "loss": 0.86559486, + "num_input_tokens_seen": 333434192, + "router_z_loss_mlp": 0.39892578, + "step": 4018, + "time_per_iteration": 2.5182814598083496 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043716, + "balance_loss_mlp": 1.00375712, + "epoch": 0.7731819930742593, + "flos": 670105494528.0, + "grad_norm": 0.03808310048663825, + "language_loss": 0.78672588, + "learning_rate": 0.00012895615493399972, + "loss": 0.79716301, + "num_input_tokens_seen": 333509696, + "router_z_loss_mlp": 0.39941406, + "step": 4019, + "time_per_iteration": 2.8195626735687256 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042435, + "balance_loss_mlp": 1.00249946, + "epoch": 0.7733743747595229, + "flos": 490859594496.0, + "grad_norm": 0.04130359033653859, + "language_loss": 0.83203042, + "learning_rate": 0.00012874739928700192, + "loss": 0.84245479, + "num_input_tokens_seen": 333575184, + "router_z_loss_mlp": 0.39916992, + "step": 4020, + "time_per_iteration": 2.561997652053833 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042542, + "balance_loss_mlp": 1.00260687, + "epoch": 0.7735667564447865, + "flos": 660888321024.0, + "grad_norm": 0.03933419760406932, + "language_loss": 0.8045736, + "learning_rate": 0.00012853878777304624, + "loss": 0.81499898, + "num_input_tokens_seen": 333651568, + "router_z_loss_mlp": 0.39916992, + "step": 4021, + "time_per_iteration": 2.866426706314087 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104584, + "balance_loss_mlp": 1.00597656, + "epoch": 0.77375913813005, + "flos": 534491947008.0, + "grad_norm": 0.03165766491354683, + "language_loss": 0.84674478, + "learning_rate": 0.000128330320473123, + "loss": 0.85720319, + "num_input_tokens_seen": 333726400, + "router_z_loss_mlp": 0.3984375, + "step": 4022, + "time_per_iteration": 2.689208984375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053924, + "balance_loss_mlp": 1.01501465, + "epoch": 0.7739515198153136, + "flos": 1523382096384.0, + "grad_norm": 0.014118198615631215, + "language_loss": 0.783319, + "learning_rate": 0.00012812199746816628, + "loss": 0.79385823, + "num_input_tokens_seen": 333960224, + "router_z_loss_mlp": 0.38867188, + "step": 4023, + "time_per_iteration": 4.873432874679565 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046542, + "balance_loss_mlp": 1.00660741, + "epoch": 0.7741439015005771, + "flos": 641252500224.0, + "grad_norm": 0.03719222986938954, + "language_loss": 0.8204658, + "learning_rate": 0.0001279138188390543, + "loss": 0.83093119, + "num_input_tokens_seen": 334033904, + "router_z_loss_mlp": 0.39916992, + "step": 4024, + "time_per_iteration": 2.7891459465026855 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042489, + "balance_loss_mlp": 1.00267303, + "epoch": 0.7743362831858407, + "flos": 667025003520.0, + "grad_norm": 0.033177934187398395, + "language_loss": 0.86806941, + "learning_rate": 0.00012770578466660915, + "loss": 0.87849432, + "num_input_tokens_seen": 334107904, + "router_z_loss_mlp": 0.39794922, + "step": 4025, + "time_per_iteration": 2.8528504371643066 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01041267, + "balance_loss_mlp": 1.00135612, + "epoch": 0.7745286648711043, + "flos": 563994283008.0, + "grad_norm": 0.03246135787328845, + "language_loss": 0.82025433, + "learning_rate": 0.0001274978950315968, + "loss": 0.83066702, + "num_input_tokens_seen": 334184048, + "router_z_loss_mlp": 0.39892578, + "step": 4026, + "time_per_iteration": 2.8042469024658203 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104306, + "balance_loss_mlp": 1.00322056, + "epoch": 0.7747210465563679, + "flos": 517962862080.0, + "grad_norm": 0.03718030635862113, + "language_loss": 0.83308971, + "learning_rate": 0.00012729015001472716, + "loss": 0.84352028, + "num_input_tokens_seen": 334257152, + "router_z_loss_mlp": 0.39819336, + "step": 4027, + "time_per_iteration": 2.6950860023498535 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042837, + "balance_loss_mlp": 1.00294983, + "epoch": 0.7749134282416313, + "flos": 635369529600.0, + "grad_norm": 0.032368305886577194, + "language_loss": 0.81846035, + "learning_rate": 0.00012708254969665418, + "loss": 0.82888865, + "num_input_tokens_seen": 334331312, + "router_z_loss_mlp": 0.39868164, + "step": 4028, + "time_per_iteration": 2.7516164779663086 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043156, + "balance_loss_mlp": 1.00326848, + "epoch": 0.7751058099268949, + "flos": 496351792896.0, + "grad_norm": 0.03964938582220019, + "language_loss": 0.83793879, + "learning_rate": 0.00012687509415797526, + "loss": 0.84837031, + "num_input_tokens_seen": 334397344, + "router_z_loss_mlp": 0.39868164, + "step": 4029, + "time_per_iteration": 2.5878894329071045 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104315, + "balance_loss_mlp": 1.003286, + "epoch": 0.7752981916121585, + "flos": 511363475712.0, + "grad_norm": 0.03526859549006244, + "language_loss": 0.8169387, + "learning_rate": 0.00012666778347923208, + "loss": 0.82737017, + "num_input_tokens_seen": 334467872, + "router_z_loss_mlp": 0.3984375, + "step": 4030, + "time_per_iteration": 2.717336893081665 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104185, + "balance_loss_mlp": 1.00208211, + "epoch": 0.7754905732974221, + "flos": 498566168832.0, + "grad_norm": 0.03835604282300423, + "language_loss": 0.84299457, + "learning_rate": 0.0001264606177409092, + "loss": 0.85341311, + "num_input_tokens_seen": 334539088, + "router_z_loss_mlp": 0.39746094, + "step": 4031, + "time_per_iteration": 2.6836674213409424 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01041813, + "balance_loss_mlp": 1.00206888, + "epoch": 0.7756829549826857, + "flos": 481783372032.0, + "grad_norm": 0.032423363351723834, + "language_loss": 0.86526835, + "learning_rate": 0.00012625359702343609, + "loss": 0.87568641, + "num_input_tokens_seen": 334612576, + "router_z_loss_mlp": 0.3972168, + "step": 4032, + "time_per_iteration": 2.74953031539917 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042791, + "balance_loss_mlp": 1.00316608, + "epoch": 0.7758753366679492, + "flos": 553686451200.0, + "grad_norm": 0.036449679892663574, + "language_loss": 0.85421842, + "learning_rate": 0.00012604672140718504, + "loss": 0.86464632, + "num_input_tokens_seen": 334677824, + "router_z_loss_mlp": 0.39599609, + "step": 4033, + "time_per_iteration": 2.6570351123809814 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042981, + "balance_loss_mlp": 1.0033319, + "epoch": 0.7760677183532128, + "flos": 705065035776.0, + "grad_norm": 0.035522641284568106, + "language_loss": 0.78343493, + "learning_rate": 0.00012583999097247233, + "loss": 0.79386473, + "num_input_tokens_seen": 334751456, + "router_z_loss_mlp": 0.39624023, + "step": 4034, + "time_per_iteration": 2.828260660171509 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042433, + "balance_loss_mlp": 1.00273633, + "epoch": 0.7762601000384763, + "flos": 524479623168.0, + "grad_norm": 0.037193057814734455, + "language_loss": 0.80287337, + "learning_rate": 0.0001256334057995578, + "loss": 0.81329775, + "num_input_tokens_seen": 334823008, + "router_z_loss_mlp": 0.39672852, + "step": 4035, + "time_per_iteration": 2.694047689437866 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042576, + "balance_loss_mlp": 1.00292659, + "epoch": 0.7764524817237399, + "flos": 558618736128.0, + "grad_norm": 0.03306447256493109, + "language_loss": 0.85536653, + "learning_rate": 0.000125426965968645, + "loss": 0.86579227, + "num_input_tokens_seen": 334896048, + "router_z_loss_mlp": 0.39624023, + "step": 4036, + "time_per_iteration": 2.668351173400879 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042205, + "balance_loss_mlp": 1.00257981, + "epoch": 0.7766448634090035, + "flos": 580817908992.0, + "grad_norm": 0.03814563398191554, + "language_loss": 0.83102942, + "learning_rate": 0.00012522067155988092, + "loss": 0.84145141, + "num_input_tokens_seen": 334964416, + "router_z_loss_mlp": 0.39599609, + "step": 4037, + "time_per_iteration": 2.738696336746216 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01041497, + "balance_loss_mlp": 1.00182426, + "epoch": 0.776837245094267, + "flos": 636819857664.0, + "grad_norm": 0.03633176837238025, + "language_loss": 0.75620854, + "learning_rate": 0.00012501452265335617, + "loss": 0.76662356, + "num_input_tokens_seen": 335043360, + "router_z_loss_mlp": 0.39648438, + "step": 4038, + "time_per_iteration": 2.8050642013549805 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01041765, + "balance_loss_mlp": 1.00211596, + "epoch": 0.7770296267795306, + "flos": 615814388736.0, + "grad_norm": 0.05953534229047703, + "language_loss": 0.82882428, + "learning_rate": 0.0001248085193291047, + "loss": 0.83924192, + "num_input_tokens_seen": 335113216, + "router_z_loss_mlp": 0.39624023, + "step": 4039, + "time_per_iteration": 2.7656314373016357 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104134, + "balance_loss_mlp": 1.00166762, + "epoch": 0.7772220084647942, + "flos": 880297255680.0, + "grad_norm": 0.03559940349726857, + "language_loss": 0.82936066, + "learning_rate": 0.00012460266166710443, + "loss": 0.83977401, + "num_input_tokens_seen": 335195824, + "router_z_loss_mlp": 0.39648438, + "step": 4040, + "time_per_iteration": 3.203681468963623 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01041686, + "balance_loss_mlp": 1.00201309, + "epoch": 0.7774143901500578, + "flos": 841039219968.0, + "grad_norm": 0.03667780998396207, + "language_loss": 0.78218615, + "learning_rate": 0.00012439694974727633, + "loss": 0.79260302, + "num_input_tokens_seen": 335269712, + "router_z_loss_mlp": 0.39648438, + "step": 4041, + "time_per_iteration": 3.1048929691314697 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042796, + "balance_loss_mlp": 1.00305152, + "epoch": 0.7776067718353212, + "flos": 569229857280.0, + "grad_norm": 0.03323606563363869, + "language_loss": 0.80526865, + "learning_rate": 0.00012419138364948458, + "loss": 0.8156966, + "num_input_tokens_seen": 335343408, + "router_z_loss_mlp": 0.3972168, + "step": 4042, + "time_per_iteration": 2.759185791015625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104225, + "balance_loss_mlp": 1.00255311, + "epoch": 0.7777991535205848, + "flos": 747210121728.0, + "grad_norm": 0.04016086024334982, + "language_loss": 0.83042264, + "learning_rate": 0.00012398596345353702, + "loss": 0.84084511, + "num_input_tokens_seen": 335415360, + "router_z_loss_mlp": 0.39672852, + "step": 4043, + "time_per_iteration": 2.8885416984558105 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055285, + "balance_loss_mlp": 1.01556456, + "epoch": 0.7779915352058484, + "flos": 539183159040.0, + "grad_norm": 0.05452361141280675, + "language_loss": 0.8397001, + "learning_rate": 0.0001237806892391851, + "loss": 0.85025299, + "num_input_tokens_seen": 335491568, + "router_z_loss_mlp": 0.39697266, + "step": 4044, + "time_per_iteration": 2.6936380863189697 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051551, + "balance_loss_mlp": 1.01173472, + "epoch": 0.778183916891112, + "flos": 635955687936.0, + "grad_norm": 0.03830611533598255, + "language_loss": 0.81336337, + "learning_rate": 0.0001235755610861233, + "loss": 0.82387888, + "num_input_tokens_seen": 335567200, + "router_z_loss_mlp": 0.39794922, + "step": 4045, + "time_per_iteration": 2.8001327514648438 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051224, + "balance_loss_mlp": 1.01140773, + "epoch": 0.7783762985763756, + "flos": 589790118912.0, + "grad_norm": 0.03835840399941748, + "language_loss": 0.85962141, + "learning_rate": 0.0001233705790739893, + "loss": 0.87013364, + "num_input_tokens_seen": 335640512, + "router_z_loss_mlp": 0.39794922, + "step": 4046, + "time_per_iteration": 2.7678940296173096 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046919, + "balance_loss_mlp": 1.00712717, + "epoch": 0.7785686802616391, + "flos": 932241782784.0, + "grad_norm": 0.03816222734497192, + "language_loss": 0.75308621, + "learning_rate": 0.0001231657432823643, + "loss": 0.76355535, + "num_input_tokens_seen": 335726016, + "router_z_loss_mlp": 0.39770508, + "step": 4047, + "time_per_iteration": 3.2008919715881348 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104796, + "balance_loss_mlp": 1.00816751, + "epoch": 0.7787610619469026, + "flos": 498956941056.0, + "grad_norm": 0.03863312039595469, + "language_loss": 0.79339081, + "learning_rate": 0.0001229610537907725, + "loss": 0.80387038, + "num_input_tokens_seen": 335794864, + "router_z_loss_mlp": 0.39770508, + "step": 4048, + "time_per_iteration": 2.6606078147888184 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047514, + "balance_loss_mlp": 1.00776947, + "epoch": 0.7789534436321662, + "flos": 516651539712.0, + "grad_norm": 0.03926321418096956, + "language_loss": 0.91044021, + "learning_rate": 0.00012275651067868143, + "loss": 0.92091531, + "num_input_tokens_seen": 335860928, + "router_z_loss_mlp": 0.3972168, + "step": 4049, + "time_per_iteration": 2.5831098556518555 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048801, + "balance_loss_mlp": 1.00903308, + "epoch": 0.7791458253174298, + "flos": 990062477568.0, + "grad_norm": 0.03241253923352413, + "language_loss": 0.80757916, + "learning_rate": 0.00012255211402550182, + "loss": 0.81806719, + "num_input_tokens_seen": 335945728, + "router_z_loss_mlp": 0.39746094, + "step": 4050, + "time_per_iteration": 3.227099657058716 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045853, + "balance_loss_mlp": 1.00596547, + "epoch": 0.7793382070026933, + "flos": 630185478144.0, + "grad_norm": 0.040685190405043196, + "language_loss": 0.77082014, + "learning_rate": 0.00012234786391058727, + "loss": 0.78127873, + "num_input_tokens_seen": 336014848, + "router_z_loss_mlp": 0.39868164, + "step": 4051, + "time_per_iteration": 2.803809881210327 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044261, + "balance_loss_mlp": 1.00439727, + "epoch": 0.7795305886879569, + "flos": 532763607552.0, + "grad_norm": 0.14552341545352887, + "language_loss": 0.85931647, + "learning_rate": 0.0001221437604132352, + "loss": 0.86975908, + "num_input_tokens_seen": 336080096, + "router_z_loss_mlp": 0.3984375, + "step": 4052, + "time_per_iteration": 2.6521799564361572 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044071, + "balance_loss_mlp": 1.00425482, + "epoch": 0.7797229703732205, + "flos": 613142166528.0, + "grad_norm": 0.03707443730916314, + "language_loss": 0.81622672, + "learning_rate": 0.0001219398036126852, + "loss": 0.82666743, + "num_input_tokens_seen": 336154640, + "router_z_loss_mlp": 0.39794922, + "step": 4053, + "time_per_iteration": 2.7498905658721924 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043695, + "balance_loss_mlp": 1.00385571, + "epoch": 0.7799153520584841, + "flos": 873796045824.0, + "grad_norm": 0.03756906141902607, + "language_loss": 0.78616834, + "learning_rate": 0.00012173599358812027, + "loss": 0.79660529, + "num_input_tokens_seen": 336244160, + "router_z_loss_mlp": 0.39819336, + "step": 4054, + "time_per_iteration": 3.2531538009643555 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010442, + "balance_loss_mlp": 1.00424063, + "epoch": 0.7801077337437476, + "flos": 584745073152.0, + "grad_norm": 0.034551857273689666, + "language_loss": 0.83048439, + "learning_rate": 0.0001215323304186668, + "loss": 0.84092641, + "num_input_tokens_seen": 336317936, + "router_z_loss_mlp": 0.39941406, + "step": 4055, + "time_per_iteration": 2.802626371383667 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104311, + "balance_loss_mlp": 1.00329399, + "epoch": 0.7803001154290111, + "flos": 602281224192.0, + "grad_norm": 0.03735081367855325, + "language_loss": 0.87971795, + "learning_rate": 0.00012132881418339364, + "loss": 0.890149, + "num_input_tokens_seen": 336389504, + "router_z_loss_mlp": 0.39794922, + "step": 4056, + "time_per_iteration": 2.779559850692749 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043118, + "balance_loss_mlp": 1.00430298, + "epoch": 0.7804924971142747, + "flos": 1482928411392.0, + "grad_norm": 0.004870984594471592, + "language_loss": 0.77517563, + "learning_rate": 0.00012112544496131306, + "loss": 0.7856068, + "num_input_tokens_seen": 336615536, + "router_z_loss_mlp": 0.38769531, + "step": 4057, + "time_per_iteration": 4.857725620269775 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043294, + "balance_loss_mlp": 1.00350153, + "epoch": 0.7806848787995383, + "flos": 631516242432.0, + "grad_norm": 0.03794339503679321, + "language_loss": 0.77468872, + "learning_rate": 0.00012092222283137944, + "loss": 0.78512168, + "num_input_tokens_seen": 336686400, + "router_z_loss_mlp": 0.39770508, + "step": 4058, + "time_per_iteration": 2.742105722427368 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045322, + "balance_loss_mlp": 1.00650787, + "epoch": 0.7808772604848019, + "flos": 1420747984128.0, + "grad_norm": 0.008365987604131462, + "language_loss": 0.7890631, + "learning_rate": 0.00012071914787249111, + "loss": 0.79951632, + "num_input_tokens_seen": 336912704, + "router_z_loss_mlp": 0.38769531, + "step": 4059, + "time_per_iteration": 4.828707695007324 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043702, + "balance_loss_mlp": 1.00390983, + "epoch": 0.7810696421700654, + "flos": 733104404736.0, + "grad_norm": 0.03231348100854236, + "language_loss": 0.83930951, + "learning_rate": 0.00012051622016348856, + "loss": 0.84974658, + "num_input_tokens_seen": 336997040, + "router_z_loss_mlp": 0.39770508, + "step": 4060, + "time_per_iteration": 2.9990715980529785 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043011, + "balance_loss_mlp": 1.00324297, + "epoch": 0.781262023855329, + "flos": 425837803776.0, + "grad_norm": 0.036166261595935334, + "language_loss": 0.84719038, + "learning_rate": 0.00012031343978315539, + "loss": 0.85762048, + "num_input_tokens_seen": 337059760, + "router_z_loss_mlp": 0.39746094, + "step": 4061, + "time_per_iteration": 2.4627366065979004 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042268, + "balance_loss_mlp": 1.00247562, + "epoch": 0.7814544055405925, + "flos": 502074370560.0, + "grad_norm": 0.0342232602285917, + "language_loss": 0.83237028, + "learning_rate": 0.00012011080681021774, + "loss": 0.84279293, + "num_input_tokens_seen": 337128528, + "router_z_loss_mlp": 0.39770508, + "step": 4062, + "time_per_iteration": 2.689143657684326 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104349, + "balance_loss_mlp": 1.00372207, + "epoch": 0.7816467872258561, + "flos": 463393744896.0, + "grad_norm": 0.03454181235361348, + "language_loss": 0.86497313, + "learning_rate": 0.00011990832132334512, + "loss": 0.87540805, + "num_input_tokens_seen": 337194112, + "router_z_loss_mlp": 0.39746094, + "step": 4063, + "time_per_iteration": 2.554494619369507 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042941, + "balance_loss_mlp": 1.00317287, + "epoch": 0.7818391689111197, + "flos": 742108695552.0, + "grad_norm": 0.04030756572766353, + "language_loss": 0.82932305, + "learning_rate": 0.00011970598340114897, + "loss": 0.8397525, + "num_input_tokens_seen": 337270416, + "router_z_loss_mlp": 0.39746094, + "step": 4064, + "time_per_iteration": 2.970621109008789 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042161, + "balance_loss_mlp": 1.00241697, + "epoch": 0.7820315505963832, + "flos": 548806656000.0, + "grad_norm": 0.039516882872222964, + "language_loss": 0.84180045, + "learning_rate": 0.00011950379312218396, + "loss": 0.85222203, + "num_input_tokens_seen": 337343024, + "router_z_loss_mlp": 0.3972168, + "step": 4065, + "time_per_iteration": 2.7288360595703125 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104416, + "balance_loss_mlp": 1.00446284, + "epoch": 0.7822239322816468, + "flos": 730260129024.0, + "grad_norm": 0.03113922880228995, + "language_loss": 0.86965168, + "learning_rate": 0.00011930175056494719, + "loss": 0.88009328, + "num_input_tokens_seen": 337417232, + "router_z_loss_mlp": 0.39672852, + "step": 4066, + "time_per_iteration": 2.9733567237854004 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043531, + "balance_loss_mlp": 1.00383461, + "epoch": 0.7824163139669104, + "flos": 452986758144.0, + "grad_norm": 0.03027995654836667, + "language_loss": 0.76300895, + "learning_rate": 0.00011909985580787885, + "loss": 0.77344429, + "num_input_tokens_seen": 337488224, + "router_z_loss_mlp": 0.39672852, + "step": 4067, + "time_per_iteration": 2.63247013092041 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042284, + "balance_loss_mlp": 1.0023967, + "epoch": 0.782608695652174, + "flos": 541621111296.0, + "grad_norm": 0.030067199560216216, + "language_loss": 0.81511915, + "learning_rate": 0.00011889810892936137, + "loss": 0.82554203, + "num_input_tokens_seen": 337564928, + "router_z_loss_mlp": 0.39868164, + "step": 4068, + "time_per_iteration": 2.725503444671631 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042501, + "balance_loss_mlp": 1.00256538, + "epoch": 0.7828010773374374, + "flos": 501429886464.0, + "grad_norm": 0.036639479010935665, + "language_loss": 0.77685481, + "learning_rate": 0.00011869651000771959, + "loss": 0.78727984, + "num_input_tokens_seen": 337641632, + "router_z_loss_mlp": 0.39916992, + "step": 4069, + "time_per_iteration": 2.831753730773926 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042656, + "balance_loss_mlp": 1.00274503, + "epoch": 0.782993459022701, + "flos": 601918642176.0, + "grad_norm": 0.036456028329252196, + "language_loss": 0.83725941, + "learning_rate": 0.00011849505912122117, + "loss": 0.84768599, + "num_input_tokens_seen": 337711968, + "router_z_loss_mlp": 0.39892578, + "step": 4070, + "time_per_iteration": 2.7105395793914795 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042326, + "balance_loss_mlp": 1.00246227, + "epoch": 0.7831858407079646, + "flos": 811476612864.0, + "grad_norm": 0.03866218742365993, + "language_loss": 0.78222632, + "learning_rate": 0.00011829375634807654, + "loss": 0.79264963, + "num_input_tokens_seen": 337795792, + "router_z_loss_mlp": 0.3984375, + "step": 4071, + "time_per_iteration": 3.082258939743042 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043026, + "balance_loss_mlp": 1.00321043, + "epoch": 0.7833782223932282, + "flos": 808014097920.0, + "grad_norm": 0.03240130540030076, + "language_loss": 0.81343973, + "learning_rate": 0.00011809260176643821, + "loss": 0.82386994, + "num_input_tokens_seen": 337875584, + "router_z_loss_mlp": 0.39794922, + "step": 4072, + "time_per_iteration": 3.0537989139556885 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042829, + "balance_loss_mlp": 1.00296557, + "epoch": 0.7835706040784918, + "flos": 521900719872.0, + "grad_norm": 0.03900176982337939, + "language_loss": 0.84087825, + "learning_rate": 0.00011789159545440131, + "loss": 0.85130656, + "num_input_tokens_seen": 337942304, + "router_z_loss_mlp": 0.3984375, + "step": 4073, + "time_per_iteration": 2.628188133239746 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042917, + "balance_loss_mlp": 1.00314867, + "epoch": 0.7837629857637552, + "flos": 506744195328.0, + "grad_norm": 0.031003851704209363, + "language_loss": 0.82853079, + "learning_rate": 0.00011769073749000348, + "loss": 0.83895999, + "num_input_tokens_seen": 338020864, + "router_z_loss_mlp": 0.39746094, + "step": 4074, + "time_per_iteration": 2.7814579010009766 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043359, + "balance_loss_mlp": 1.00359035, + "epoch": 0.7839553674490188, + "flos": 517135630848.0, + "grad_norm": 0.03896088374638199, + "language_loss": 0.76594853, + "learning_rate": 0.0001174900279512246, + "loss": 0.77638209, + "num_input_tokens_seen": 338089584, + "router_z_loss_mlp": 0.39746094, + "step": 4075, + "time_per_iteration": 2.5712804794311523 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043376, + "balance_loss_mlp": 1.00363171, + "epoch": 0.7841477491342824, + "flos": 507651139584.0, + "grad_norm": 0.03246431097284687, + "language_loss": 0.82211149, + "learning_rate": 0.00011728946691598707, + "loss": 0.83254528, + "num_input_tokens_seen": 338159568, + "router_z_loss_mlp": 0.3972168, + "step": 4076, + "time_per_iteration": 2.604954242706299 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043762, + "balance_loss_mlp": 1.00401795, + "epoch": 0.784340130819546, + "flos": 720905895168.0, + "grad_norm": 0.038070904406741414, + "language_loss": 0.76823437, + "learning_rate": 0.00011708905446215561, + "loss": 0.77867198, + "num_input_tokens_seen": 338233952, + "router_z_loss_mlp": 0.3972168, + "step": 4077, + "time_per_iteration": 2.8703718185424805 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043664, + "balance_loss_mlp": 1.00389552, + "epoch": 0.7845325125048095, + "flos": 515514216192.0, + "grad_norm": 0.030616823376727165, + "language_loss": 0.80449855, + "learning_rate": 0.00011688879066753711, + "loss": 0.81493515, + "num_input_tokens_seen": 338309568, + "router_z_loss_mlp": 0.39746094, + "step": 4078, + "time_per_iteration": 2.693617582321167 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042875, + "balance_loss_mlp": 1.00313067, + "epoch": 0.7847248941900731, + "flos": 467051645952.0, + "grad_norm": 0.040474516708916684, + "language_loss": 0.87913537, + "learning_rate": 0.00011668867560988122, + "loss": 0.88956416, + "num_input_tokens_seen": 338375920, + "router_z_loss_mlp": 0.3972168, + "step": 4079, + "time_per_iteration": 2.5590639114379883 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104555, + "balance_loss_mlp": 1.00582922, + "epoch": 0.7849172758753367, + "flos": 504084612096.0, + "grad_norm": 0.03640725809465974, + "language_loss": 0.84891224, + "learning_rate": 0.00011648870936687916, + "loss": 0.85936773, + "num_input_tokens_seen": 338452208, + "router_z_loss_mlp": 0.39697266, + "step": 4080, + "time_per_iteration": 2.7692296504974365 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046484, + "balance_loss_mlp": 1.00678754, + "epoch": 0.7851096575606002, + "flos": 533032870656.0, + "grad_norm": 0.04308382250768319, + "language_loss": 0.79184526, + "learning_rate": 0.00011628889201616461, + "loss": 0.80231011, + "num_input_tokens_seen": 338522864, + "router_z_loss_mlp": 0.39672852, + "step": 4081, + "time_per_iteration": 2.6643264293670654 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043861, + "balance_loss_mlp": 1.00411689, + "epoch": 0.7853020392458638, + "flos": 571044712704.0, + "grad_norm": 0.03315243630239655, + "language_loss": 0.82372963, + "learning_rate": 0.00011608922363531393, + "loss": 0.8341682, + "num_input_tokens_seen": 338591024, + "router_z_loss_mlp": 0.3972168, + "step": 4082, + "time_per_iteration": 2.6805782318115234 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043731, + "balance_loss_mlp": 1.00403428, + "epoch": 0.7854944209311273, + "flos": 833992680960.0, + "grad_norm": 0.03684416800395475, + "language_loss": 0.83803403, + "learning_rate": 0.00011588970430184504, + "loss": 0.84847128, + "num_input_tokens_seen": 338669616, + "router_z_loss_mlp": 0.39672852, + "step": 4083, + "time_per_iteration": 3.0843493938446045 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043694, + "balance_loss_mlp": 1.00404453, + "epoch": 0.7856868026163909, + "flos": 561011001600.0, + "grad_norm": 0.030260484959858683, + "language_loss": 0.82344627, + "learning_rate": 0.00011569033409321822, + "loss": 0.83388317, + "num_input_tokens_seen": 338740416, + "router_z_loss_mlp": 0.39624023, + "step": 4084, + "time_per_iteration": 2.692643165588379 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044727, + "balance_loss_mlp": 1.0050776, + "epoch": 0.7858791843016545, + "flos": 546268581888.0, + "grad_norm": 0.039325334071154384, + "language_loss": 0.73417258, + "learning_rate": 0.00011549111308683591, + "loss": 0.74461985, + "num_input_tokens_seen": 338807664, + "router_z_loss_mlp": 0.39624023, + "step": 4085, + "time_per_iteration": 2.6917884349823 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044945, + "balance_loss_mlp": 1.00529623, + "epoch": 0.7860715659869181, + "flos": 381840923904.0, + "grad_norm": 0.042614016338838545, + "language_loss": 0.8128258, + "learning_rate": 0.00011529204136004251, + "loss": 0.82327527, + "num_input_tokens_seen": 338869472, + "router_z_loss_mlp": 0.39624023, + "step": 4086, + "time_per_iteration": 2.4572253227233887 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044585, + "balance_loss_mlp": 1.0049361, + "epoch": 0.7862639476721817, + "flos": 568513441536.0, + "grad_norm": 0.03346159984299651, + "language_loss": 0.84931922, + "learning_rate": 0.00011509311899012459, + "loss": 0.85976499, + "num_input_tokens_seen": 338941312, + "router_z_loss_mlp": 0.39624023, + "step": 4087, + "time_per_iteration": 2.763591766357422 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043283, + "balance_loss_mlp": 1.00353885, + "epoch": 0.7864563293574451, + "flos": 546323016960.0, + "grad_norm": 0.03949651761127577, + "language_loss": 0.78551108, + "learning_rate": 0.00011489434605431053, + "loss": 0.79594392, + "num_input_tokens_seen": 339010208, + "router_z_loss_mlp": 0.3972168, + "step": 4088, + "time_per_iteration": 2.6439645290374756 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042959, + "balance_loss_mlp": 1.00321484, + "epoch": 0.7866487110427087, + "flos": 564649460736.0, + "grad_norm": 0.036592949661453156, + "language_loss": 0.81577885, + "learning_rate": 0.0001146957226297708, + "loss": 0.82620847, + "num_input_tokens_seen": 339081232, + "router_z_loss_mlp": 0.3972168, + "step": 4089, + "time_per_iteration": 2.679487705230713 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042934, + "balance_loss_mlp": 1.00321376, + "epoch": 0.7868410927279723, + "flos": 729559264512.0, + "grad_norm": 0.030545920555930417, + "language_loss": 0.76902366, + "learning_rate": 0.00011449724879361827, + "loss": 0.77945304, + "num_input_tokens_seen": 339161040, + "router_z_loss_mlp": 0.39697266, + "step": 4090, + "time_per_iteration": 2.9623334407806396 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043691, + "balance_loss_mlp": 1.00404155, + "epoch": 0.7870334744132359, + "flos": 522447994368.0, + "grad_norm": 0.042680254244296036, + "language_loss": 0.74582481, + "learning_rate": 0.00011429892462290687, + "loss": 0.75626171, + "num_input_tokens_seen": 339233984, + "router_z_loss_mlp": 0.39624023, + "step": 4091, + "time_per_iteration": 2.718287229537964 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104412, + "balance_loss_mlp": 1.00435197, + "epoch": 0.7872258560984994, + "flos": 452363661312.0, + "grad_norm": 0.033106880677710115, + "language_loss": 0.83571684, + "learning_rate": 0.00011410075019463295, + "loss": 0.84615809, + "num_input_tokens_seen": 339303168, + "router_z_loss_mlp": 0.39746094, + "step": 4092, + "time_per_iteration": 2.627462148666382 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043831, + "balance_loss_mlp": 1.00413382, + "epoch": 0.787418237783763, + "flos": 516250073856.0, + "grad_norm": 0.03274569080250533, + "language_loss": 0.80842328, + "learning_rate": 0.00011390272558573461, + "loss": 0.8188616, + "num_input_tokens_seen": 339374512, + "router_z_loss_mlp": 0.39672852, + "step": 4093, + "time_per_iteration": 2.678356409072876 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044441, + "balance_loss_mlp": 1.00474429, + "epoch": 0.7876106194690266, + "flos": 486057566976.0, + "grad_norm": 0.03217400572636969, + "language_loss": 0.80303454, + "learning_rate": 0.00011370485087309202, + "loss": 0.81347895, + "num_input_tokens_seen": 339442720, + "router_z_loss_mlp": 0.39672852, + "step": 4094, + "time_per_iteration": 2.6190710067749023 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044167, + "balance_loss_mlp": 1.00449383, + "epoch": 0.7878030011542901, + "flos": 543930751488.0, + "grad_norm": 0.036296400111331464, + "language_loss": 0.79175836, + "learning_rate": 0.00011350712613352688, + "loss": 0.80220002, + "num_input_tokens_seen": 339508800, + "router_z_loss_mlp": 0.39648438, + "step": 4095, + "time_per_iteration": 2.705301284790039 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044306, + "balance_loss_mlp": 1.00463307, + "epoch": 0.7879953828395537, + "flos": 517749979392.0, + "grad_norm": 0.042475497231540135, + "language_loss": 0.79742056, + "learning_rate": 0.00011330955144380283, + "loss": 0.80786359, + "num_input_tokens_seen": 339578048, + "router_z_loss_mlp": 0.39648438, + "step": 4096, + "time_per_iteration": 2.592628240585327 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043089, + "balance_loss_mlp": 1.00336826, + "epoch": 0.7881877645248172, + "flos": 583377370368.0, + "grad_norm": 0.033751498450810845, + "language_loss": 0.86674351, + "learning_rate": 0.00011311212688062483, + "loss": 0.87717438, + "num_input_tokens_seen": 339650176, + "router_z_loss_mlp": 0.39697266, + "step": 4097, + "time_per_iteration": 2.8006155490875244 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043341, + "balance_loss_mlp": 1.0035969, + "epoch": 0.7883801462100808, + "flos": 590328645120.0, + "grad_norm": 0.0369008039403456, + "language_loss": 0.78409964, + "learning_rate": 0.0001129148525206402, + "loss": 0.79453301, + "num_input_tokens_seen": 339727312, + "router_z_loss_mlp": 0.3972168, + "step": 4098, + "time_per_iteration": 2.824293375015259 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043459, + "balance_loss_mlp": 1.00373876, + "epoch": 0.7885725278953444, + "flos": 482742806016.0, + "grad_norm": 0.04185353422422626, + "language_loss": 0.86687458, + "learning_rate": 0.00011271772844043759, + "loss": 0.87730914, + "num_input_tokens_seen": 339801344, + "router_z_loss_mlp": 0.39697266, + "step": 4099, + "time_per_iteration": 2.6993777751922607 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043421, + "balance_loss_mlp": 1.00372386, + "epoch": 0.788764909580608, + "flos": 758099254272.0, + "grad_norm": 0.0413483333130522, + "language_loss": 0.76537859, + "learning_rate": 0.00011252075471654727, + "loss": 0.77581275, + "num_input_tokens_seen": 339877840, + "router_z_loss_mlp": 0.39672852, + "step": 4100, + "time_per_iteration": 2.9176177978515625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043485, + "balance_loss_mlp": 1.00374079, + "epoch": 0.7889572912658714, + "flos": 703880080128.0, + "grad_norm": 0.0322415537049841, + "language_loss": 0.7816056, + "learning_rate": 0.00011232393142544133, + "loss": 0.79204047, + "num_input_tokens_seen": 339959568, + "router_z_loss_mlp": 0.3972168, + "step": 4101, + "time_per_iteration": 2.9494380950927734 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044086, + "balance_loss_mlp": 1.00436497, + "epoch": 0.789149672951135, + "flos": 737841303552.0, + "grad_norm": 0.03312890995407851, + "language_loss": 0.83342379, + "learning_rate": 0.00011212725864353323, + "loss": 0.84386468, + "num_input_tokens_seen": 340043600, + "router_z_loss_mlp": 0.39697266, + "step": 4102, + "time_per_iteration": 3.066310405731201 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104599, + "balance_loss_mlp": 1.00727081, + "epoch": 0.7893420546363986, + "flos": 1484490533376.0, + "grad_norm": 0.0037033448465983686, + "language_loss": 0.76335925, + "learning_rate": 0.00011193073644717822, + "loss": 0.77381915, + "num_input_tokens_seen": 340270608, + "router_z_loss_mlp": 0.38671875, + "step": 4103, + "time_per_iteration": 4.842837810516357 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043191, + "balance_loss_mlp": 1.00349379, + "epoch": 0.7895344363216622, + "flos": 510080343552.0, + "grad_norm": 0.04492862133379161, + "language_loss": 0.75946063, + "learning_rate": 0.00011173436491267291, + "loss": 0.76989251, + "num_input_tokens_seen": 340338784, + "router_z_loss_mlp": 0.39672852, + "step": 4104, + "time_per_iteration": 2.6089494228363037 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043204, + "balance_loss_mlp": 1.00348377, + "epoch": 0.7897268180069258, + "flos": 543038391552.0, + "grad_norm": 0.035594569075133434, + "language_loss": 0.82524866, + "learning_rate": 0.0001115381441162554, + "loss": 0.83568072, + "num_input_tokens_seen": 340407744, + "router_z_loss_mlp": 0.39697266, + "step": 4105, + "time_per_iteration": 2.610574245452881 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043671, + "balance_loss_mlp": 1.00495148, + "epoch": 0.7899191996921893, + "flos": 1415752515840.0, + "grad_norm": 0.004244579927016686, + "language_loss": 0.73583722, + "learning_rate": 0.00011134207413410557, + "loss": 0.74627399, + "num_input_tokens_seen": 340635824, + "router_z_loss_mlp": 0.38671875, + "step": 4106, + "time_per_iteration": 4.910478830337524 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042244, + "balance_loss_mlp": 1.00254726, + "epoch": 0.7901115813774529, + "flos": 624022550784.0, + "grad_norm": 0.03217840063053149, + "language_loss": 0.855353, + "learning_rate": 0.00011114615504234465, + "loss": 0.86577547, + "num_input_tokens_seen": 340710928, + "router_z_loss_mlp": 0.39672852, + "step": 4107, + "time_per_iteration": 2.746295690536499 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044215, + "balance_loss_mlp": 1.0045898, + "epoch": 0.7903039630627164, + "flos": 646805936640.0, + "grad_norm": 0.033942053342870586, + "language_loss": 0.81416857, + "learning_rate": 0.00011095038691703468, + "loss": 0.82461071, + "num_input_tokens_seen": 340786128, + "router_z_loss_mlp": 0.39599609, + "step": 4108, + "time_per_iteration": 2.8708901405334473 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104442, + "balance_loss_mlp": 1.00479472, + "epoch": 0.79049634474798, + "flos": 595612818432.0, + "grad_norm": 0.037550083801842486, + "language_loss": 0.83416122, + "learning_rate": 0.00011075476983417998, + "loss": 0.84460539, + "num_input_tokens_seen": 340861616, + "router_z_loss_mlp": 0.39599609, + "step": 4109, + "time_per_iteration": 2.8592021465301514 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043584, + "balance_loss_mlp": 1.00393546, + "epoch": 0.7906887264332435, + "flos": 717332564736.0, + "grad_norm": 0.03806568849711228, + "language_loss": 0.7824564, + "learning_rate": 0.00011055930386972579, + "loss": 0.79289222, + "num_input_tokens_seen": 340934480, + "router_z_loss_mlp": 0.39624023, + "step": 4110, + "time_per_iteration": 2.860257625579834 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01041295, + "balance_loss_mlp": 1.00162232, + "epoch": 0.7908811081185071, + "flos": 791261436672.0, + "grad_norm": 0.034643176312320036, + "language_loss": 0.78703582, + "learning_rate": 0.00011036398909955863, + "loss": 0.79744881, + "num_input_tokens_seen": 341014912, + "router_z_loss_mlp": 0.39648438, + "step": 4111, + "time_per_iteration": 2.9770195484161377 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043101, + "balance_loss_mlp": 1.0034523, + "epoch": 0.7910734898037707, + "flos": 643076103936.0, + "grad_norm": 0.033380496511460814, + "language_loss": 0.8228001, + "learning_rate": 0.00011016882559950648, + "loss": 0.83323109, + "num_input_tokens_seen": 341090608, + "router_z_loss_mlp": 0.39624023, + "step": 4112, + "time_per_iteration": 2.8614118099212646 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043994, + "balance_loss_mlp": 1.00446427, + "epoch": 0.7912658714890343, + "flos": 670561395456.0, + "grad_norm": 0.037601887407010925, + "language_loss": 0.80818218, + "learning_rate": 0.00010997381344533853, + "loss": 0.81862211, + "num_input_tokens_seen": 341160992, + "router_z_loss_mlp": 0.39501953, + "step": 4113, + "time_per_iteration": 2.806915521621704 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045873, + "balance_loss_mlp": 1.00639081, + "epoch": 0.7914582531742979, + "flos": 558887999232.0, + "grad_norm": 0.03473923170116899, + "language_loss": 0.81077361, + "learning_rate": 0.00010977895271276517, + "loss": 0.82123232, + "num_input_tokens_seen": 341232032, + "router_z_loss_mlp": 0.39453125, + "step": 4114, + "time_per_iteration": 2.710866928100586 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046255, + "balance_loss_mlp": 1.00667739, + "epoch": 0.7916506348595613, + "flos": 571192466688.0, + "grad_norm": 0.03381455786010569, + "language_loss": 0.80545115, + "learning_rate": 0.00010958424347743807, + "loss": 0.81591368, + "num_input_tokens_seen": 341303888, + "router_z_loss_mlp": 0.39550781, + "step": 4115, + "time_per_iteration": 2.720463991165161 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044883, + "balance_loss_mlp": 1.00528204, + "epoch": 0.7918430165448249, + "flos": 719647062528.0, + "grad_norm": 0.03312205517130564, + "language_loss": 0.8089326, + "learning_rate": 0.00010938968581494991, + "loss": 0.81938136, + "num_input_tokens_seen": 341385616, + "router_z_loss_mlp": 0.39575195, + "step": 4116, + "time_per_iteration": 2.9487526416778564 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044455, + "balance_loss_mlp": 1.004758, + "epoch": 0.7920353982300885, + "flos": 554737258752.0, + "grad_norm": 0.04353090133720626, + "language_loss": 0.79680514, + "learning_rate": 0.000109195279800835, + "loss": 0.80724961, + "num_input_tokens_seen": 341460976, + "router_z_loss_mlp": 0.39672852, + "step": 4117, + "time_per_iteration": 2.7193853855133057 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046231, + "balance_loss_mlp": 1.0065577, + "epoch": 0.7922277799153521, + "flos": 811541741568.0, + "grad_norm": 0.051618169063903374, + "language_loss": 0.76734924, + "learning_rate": 0.00010900102551056834, + "loss": 0.77781159, + "num_input_tokens_seen": 341537328, + "router_z_loss_mlp": 0.39648438, + "step": 4118, + "time_per_iteration": 3.0203771591186523 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046142, + "balance_loss_mlp": 1.00644493, + "epoch": 0.7924201616006156, + "flos": 422245031424.0, + "grad_norm": 0.03727479456025455, + "language_loss": 0.84903586, + "learning_rate": 0.00010880692301956601, + "loss": 0.85949719, + "num_input_tokens_seen": 341600272, + "router_z_loss_mlp": 0.39672852, + "step": 4119, + "time_per_iteration": 2.5143675804138184 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104384, + "balance_loss_mlp": 1.00416684, + "epoch": 0.7926125432858792, + "flos": 619105817088.0, + "grad_norm": 0.030768589003691713, + "language_loss": 0.86626256, + "learning_rate": 0.00010861297240318518, + "loss": 0.876701, + "num_input_tokens_seen": 341682096, + "router_z_loss_mlp": 0.39648438, + "step": 4120, + "time_per_iteration": 2.870023250579834 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045785, + "balance_loss_mlp": 1.00611162, + "epoch": 0.7928049249711427, + "flos": 603611988480.0, + "grad_norm": 0.0348759372841926, + "language_loss": 0.8754127, + "learning_rate": 0.00010841917373672444, + "loss": 0.88587052, + "num_input_tokens_seen": 341754912, + "router_z_loss_mlp": 0.39648438, + "step": 4121, + "time_per_iteration": 2.7993838787078857 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045406, + "balance_loss_mlp": 1.00568569, + "epoch": 0.7929973066564063, + "flos": 657232365312.0, + "grad_norm": 0.04825872036668382, + "language_loss": 0.79469776, + "learning_rate": 0.00010822552709542293, + "loss": 0.80515188, + "num_input_tokens_seen": 341831152, + "router_z_loss_mlp": 0.39697266, + "step": 4122, + "time_per_iteration": 2.8277747631073 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104501, + "balance_loss_mlp": 1.00526559, + "epoch": 0.7931896883416699, + "flos": 537435377664.0, + "grad_norm": 0.033652478318624945, + "language_loss": 0.86540711, + "learning_rate": 0.0001080320325544612, + "loss": 0.87585717, + "num_input_tokens_seen": 341903552, + "router_z_loss_mlp": 0.3972168, + "step": 4123, + "time_per_iteration": 2.7195277214050293 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043876, + "balance_loss_mlp": 1.00394118, + "epoch": 0.7933820700269334, + "flos": 499069701888.0, + "grad_norm": 0.034451341323961555, + "language_loss": 0.83510745, + "learning_rate": 0.00010783869018895997, + "loss": 0.84554619, + "num_input_tokens_seen": 341972256, + "router_z_loss_mlp": 0.39916992, + "step": 4124, + "time_per_iteration": 2.577709197998047 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044871, + "balance_loss_mlp": 1.00495958, + "epoch": 0.793574451712197, + "flos": 538496878848.0, + "grad_norm": 0.03367415266088285, + "language_loss": 0.84549522, + "learning_rate": 0.00010764550007398189, + "loss": 0.85594392, + "num_input_tokens_seen": 342040496, + "router_z_loss_mlp": 0.39892578, + "step": 4125, + "time_per_iteration": 4.054261207580566 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045425, + "balance_loss_mlp": 1.00556159, + "epoch": 0.7937668333974606, + "flos": 489259567104.0, + "grad_norm": 0.03475053715190497, + "language_loss": 0.82054108, + "learning_rate": 0.00010745246228452982, + "loss": 0.83099532, + "num_input_tokens_seen": 342108512, + "router_z_loss_mlp": 0.3984375, + "step": 4126, + "time_per_iteration": 2.5979418754577637 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045362, + "balance_loss_mlp": 1.0054512, + "epoch": 0.7939592150827242, + "flos": 528480664320.0, + "grad_norm": 0.03444144820805524, + "language_loss": 0.8203451, + "learning_rate": 0.00010725957689554771, + "loss": 0.83079869, + "num_input_tokens_seen": 342183568, + "router_z_loss_mlp": 0.39892578, + "step": 4127, + "time_per_iteration": 2.7990803718566895 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043644, + "balance_loss_mlp": 1.0037564, + "epoch": 0.7941515967679876, + "flos": 542804121600.0, + "grad_norm": 0.027974353873713647, + "language_loss": 0.84939337, + "learning_rate": 0.00010706684398192013, + "loss": 0.85982978, + "num_input_tokens_seen": 342259920, + "router_z_loss_mlp": 0.39868164, + "step": 4128, + "time_per_iteration": 2.6992971897125244 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043335, + "balance_loss_mlp": 1.00342357, + "epoch": 0.7943439784532512, + "flos": 519524005632.0, + "grad_norm": 0.0378035902598828, + "language_loss": 0.82137024, + "learning_rate": 0.00010687426361847313, + "loss": 0.83180356, + "num_input_tokens_seen": 342330192, + "router_z_loss_mlp": 0.39892578, + "step": 4129, + "time_per_iteration": 2.7055931091308594 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043645, + "balance_loss_mlp": 1.00368559, + "epoch": 0.7945363601385148, + "flos": 510060901632.0, + "grad_norm": 0.033194408400906726, + "language_loss": 0.86515343, + "learning_rate": 0.00010668183587997254, + "loss": 0.87558991, + "num_input_tokens_seen": 342398944, + "router_z_loss_mlp": 0.39941406, + "step": 4130, + "time_per_iteration": 2.6280934810638428 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043077, + "balance_loss_mlp": 1.00318933, + "epoch": 0.7947287418237784, + "flos": 652402147584.0, + "grad_norm": 0.029896706291295146, + "language_loss": 0.77920771, + "learning_rate": 0.0001064895608411256, + "loss": 0.78963846, + "num_input_tokens_seen": 342474000, + "router_z_loss_mlp": 0.39868164, + "step": 4131, + "time_per_iteration": 2.8259942531585693 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042645, + "balance_loss_mlp": 1.00282872, + "epoch": 0.794921123509042, + "flos": 697374012672.0, + "grad_norm": 0.04755906636232369, + "language_loss": 0.80848777, + "learning_rate": 0.00010629743857657998, + "loss": 0.81891429, + "num_input_tokens_seen": 342549184, + "router_z_loss_mlp": 0.39794922, + "step": 4132, + "time_per_iteration": 2.8961074352264404 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047832, + "balance_loss_mlp": 1.00901794, + "epoch": 0.7951135051943055, + "flos": 1406079441408.0, + "grad_norm": 0.006864430064478978, + "language_loss": 0.70598668, + "learning_rate": 0.0001061054691609244, + "loss": 0.716465, + "num_input_tokens_seen": 342767376, + "router_z_loss_mlp": 0.38769531, + "step": 4133, + "time_per_iteration": 4.614002704620361 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045085, + "balance_loss_mlp": 1.00524557, + "epoch": 0.795305886879569, + "flos": 811450368000.0, + "grad_norm": 0.03507425862831722, + "language_loss": 0.82587457, + "learning_rate": 0.00010591365266868802, + "loss": 0.83632547, + "num_input_tokens_seen": 342845024, + "router_z_loss_mlp": 0.39819336, + "step": 4134, + "time_per_iteration": 2.9641194343566895 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044941, + "balance_loss_mlp": 1.0061264, + "epoch": 0.7954982685648326, + "flos": 1429216660992.0, + "grad_norm": 0.005948416138120475, + "language_loss": 0.75511783, + "learning_rate": 0.00010572198917434018, + "loss": 0.76556724, + "num_input_tokens_seen": 343072496, + "router_z_loss_mlp": 0.38769531, + "step": 4135, + "time_per_iteration": 4.960731029510498 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045363, + "balance_loss_mlp": 1.00557125, + "epoch": 0.7956906502500962, + "flos": 390748005120.0, + "grad_norm": 0.05367501121915611, + "language_loss": 0.80196106, + "learning_rate": 0.00010553047875229166, + "loss": 0.81241471, + "num_input_tokens_seen": 343136928, + "router_z_loss_mlp": 0.39770508, + "step": 4136, + "time_per_iteration": 2.5680596828460693 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045778, + "balance_loss_mlp": 1.00596261, + "epoch": 0.7958830319353598, + "flos": 516586411008.0, + "grad_norm": 0.03268572059370949, + "language_loss": 0.83743113, + "learning_rate": 0.00010533912147689328, + "loss": 0.84788889, + "num_input_tokens_seen": 343207440, + "router_z_loss_mlp": 0.39794922, + "step": 4137, + "time_per_iteration": 2.6882131099700928 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044712, + "balance_loss_mlp": 1.00492001, + "epoch": 0.7960754136206233, + "flos": 494927709696.0, + "grad_norm": 0.03240268195496617, + "language_loss": 0.82921439, + "learning_rate": 0.00010514791742243656, + "loss": 0.83966154, + "num_input_tokens_seen": 343273744, + "router_z_loss_mlp": 0.39770508, + "step": 4138, + "time_per_iteration": 2.5695807933807373 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044621, + "balance_loss_mlp": 1.0049001, + "epoch": 0.7962677953058869, + "flos": 657006843648.0, + "grad_norm": 0.03902501447603489, + "language_loss": 0.83096194, + "learning_rate": 0.00010495686666315341, + "loss": 0.84140819, + "num_input_tokens_seen": 343357648, + "router_z_loss_mlp": 0.39697266, + "step": 4139, + "time_per_iteration": 2.8975212574005127 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044322, + "balance_loss_mlp": 1.00443506, + "epoch": 0.7964601769911505, + "flos": 543420415488.0, + "grad_norm": 0.04091295752087844, + "language_loss": 0.777354, + "learning_rate": 0.00010476596927321635, + "loss": 0.78779727, + "num_input_tokens_seen": 343425344, + "router_z_loss_mlp": 0.39868164, + "step": 4140, + "time_per_iteration": 2.654552459716797 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047667, + "balance_loss_mlp": 1.00785065, + "epoch": 0.796652558676414, + "flos": 538827379968.0, + "grad_norm": 0.03162317226196635, + "language_loss": 0.80818027, + "learning_rate": 0.00010457522532673835, + "loss": 0.81865692, + "num_input_tokens_seen": 343504960, + "router_z_loss_mlp": 0.39794922, + "step": 4141, + "time_per_iteration": 2.842707633972168 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044598, + "balance_loss_mlp": 1.00480628, + "epoch": 0.7968449403616775, + "flos": 476052046080.0, + "grad_norm": 0.03609806445163449, + "language_loss": 0.83603644, + "learning_rate": 0.00010438463489777272, + "loss": 0.8464824, + "num_input_tokens_seen": 343570832, + "router_z_loss_mlp": 0.39770508, + "step": 4142, + "time_per_iteration": 2.5717051029205322 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042671, + "balance_loss_mlp": 1.00287914, + "epoch": 0.7970373220469411, + "flos": 568726324224.0, + "grad_norm": 0.03529843245430609, + "language_loss": 0.7784009, + "learning_rate": 0.00010419419806031316, + "loss": 0.78882766, + "num_input_tokens_seen": 343639808, + "router_z_loss_mlp": 0.39770508, + "step": 4143, + "time_per_iteration": 2.6530473232269287 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044491, + "balance_loss_mlp": 1.00467539, + "epoch": 0.7972297037322047, + "flos": 557351155200.0, + "grad_norm": 0.03335474096113663, + "language_loss": 0.84457743, + "learning_rate": 0.00010400391488829403, + "loss": 0.85502243, + "num_input_tokens_seen": 343715232, + "router_z_loss_mlp": 0.39794922, + "step": 4144, + "time_per_iteration": 2.832122564315796 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044556, + "balance_loss_mlp": 1.00471592, + "epoch": 0.7974220854174683, + "flos": 577307761920.0, + "grad_norm": 0.030245112607884015, + "language_loss": 0.87015516, + "learning_rate": 0.00010381378545558984, + "loss": 0.88060075, + "num_input_tokens_seen": 343787168, + "router_z_loss_mlp": 0.39819336, + "step": 4145, + "time_per_iteration": 2.6970877647399902 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104421, + "balance_loss_mlp": 1.00434661, + "epoch": 0.7976144671027319, + "flos": 484056073728.0, + "grad_norm": 0.03356319241102144, + "language_loss": 0.8495326, + "learning_rate": 0.00010362380983601505, + "loss": 0.85997462, + "num_input_tokens_seen": 343853600, + "router_z_loss_mlp": 0.3984375, + "step": 4146, + "time_per_iteration": 2.5355587005615234 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044045, + "balance_loss_mlp": 1.00420487, + "epoch": 0.7978068487879953, + "flos": 1079654319360.0, + "grad_norm": 0.028459484935127146, + "language_loss": 0.79190552, + "learning_rate": 0.00010343398810332477, + "loss": 0.80234593, + "num_input_tokens_seen": 343942816, + "router_z_loss_mlp": 0.39819336, + "step": 4147, + "time_per_iteration": 3.4484007358551025 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043591, + "balance_loss_mlp": 1.00370336, + "epoch": 0.7979992304732589, + "flos": 735016469760.0, + "grad_norm": 0.038421904097834796, + "language_loss": 0.84714222, + "learning_rate": 0.00010324432033121467, + "loss": 0.8575781, + "num_input_tokens_seen": 344021232, + "router_z_loss_mlp": 0.39868164, + "step": 4148, + "time_per_iteration": 2.8759710788726807 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044627, + "balance_loss_mlp": 1.00476301, + "epoch": 0.7981916121585225, + "flos": 416750887680.0, + "grad_norm": 0.03692074531599656, + "language_loss": 0.84042895, + "learning_rate": 0.00010305480659332005, + "loss": 0.85087514, + "num_input_tokens_seen": 344089616, + "router_z_loss_mlp": 0.3984375, + "step": 4149, + "time_per_iteration": 2.6903555393218994 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044845, + "balance_loss_mlp": 1.00493348, + "epoch": 0.7983839938437861, + "flos": 466213721088.0, + "grad_norm": 0.03398705424173267, + "language_loss": 0.84049666, + "learning_rate": 0.00010286544696321682, + "loss": 0.85094512, + "num_input_tokens_seen": 344154992, + "router_z_loss_mlp": 0.39892578, + "step": 4150, + "time_per_iteration": 2.5223419666290283 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047155, + "balance_loss_mlp": 1.00731564, + "epoch": 0.7985763755290496, + "flos": 511623990528.0, + "grad_norm": 0.03850329476813429, + "language_loss": 0.80184937, + "learning_rate": 0.00010267624151442073, + "loss": 0.81232083, + "num_input_tokens_seen": 344225232, + "router_z_loss_mlp": 0.39819336, + "step": 4151, + "time_per_iteration": 2.6372790336608887 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045244, + "balance_loss_mlp": 1.00545216, + "epoch": 0.7987687572143132, + "flos": 1012279147008.0, + "grad_norm": 0.036156953147693155, + "language_loss": 0.81612265, + "learning_rate": 0.000102487190320388, + "loss": 0.8265751, + "num_input_tokens_seen": 344309120, + "router_z_loss_mlp": 0.39770508, + "step": 4152, + "time_per_iteration": 3.3100497722625732 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046289, + "balance_loss_mlp": 1.00644886, + "epoch": 0.7989611388995768, + "flos": 1022749317120.0, + "grad_norm": 0.0483734968534093, + "language_loss": 0.80480343, + "learning_rate": 0.00010229829345451475, + "loss": 0.81526625, + "num_input_tokens_seen": 344394112, + "router_z_loss_mlp": 0.39819336, + "step": 4153, + "time_per_iteration": 3.305338144302368 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048343, + "balance_loss_mlp": 1.00855112, + "epoch": 0.7991535205848403, + "flos": 1103038447872.0, + "grad_norm": 0.03770888532142324, + "language_loss": 0.80308628, + "learning_rate": 0.00010210955099013724, + "loss": 0.81356978, + "num_input_tokens_seen": 344476512, + "router_z_loss_mlp": 0.39770508, + "step": 4154, + "time_per_iteration": 3.409900188446045 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047808, + "balance_loss_mlp": 1.00789726, + "epoch": 0.7993459022701039, + "flos": 836280933888.0, + "grad_norm": 0.04128229855953485, + "language_loss": 0.77654159, + "learning_rate": 0.00010192096300053167, + "loss": 0.78701961, + "num_input_tokens_seen": 344561088, + "router_z_loss_mlp": 0.39892578, + "step": 4155, + "time_per_iteration": 3.1075351238250732 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043815, + "balance_loss_mlp": 1.00387979, + "epoch": 0.7995382839553674, + "flos": 523770010368.0, + "grad_norm": 0.043215230874116634, + "language_loss": 0.85791343, + "learning_rate": 0.00010173252955891477, + "loss": 0.86835158, + "num_input_tokens_seen": 344639424, + "router_z_loss_mlp": 0.39916992, + "step": 4156, + "time_per_iteration": 2.741454839706421 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104347, + "balance_loss_mlp": 1.00358212, + "epoch": 0.799730665640631, + "flos": 538859460864.0, + "grad_norm": 0.0402681416401722, + "language_loss": 0.73719215, + "learning_rate": 0.00010154425073844253, + "loss": 0.74762684, + "num_input_tokens_seen": 344710048, + "router_z_loss_mlp": 0.39868164, + "step": 4157, + "time_per_iteration": 2.709291458129883 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043712, + "balance_loss_mlp": 1.00384808, + "epoch": 0.7999230473258946, + "flos": 506068608768.0, + "grad_norm": 0.03223966585630621, + "language_loss": 0.82729542, + "learning_rate": 0.00010135612661221138, + "loss": 0.83773249, + "num_input_tokens_seen": 344776832, + "router_z_loss_mlp": 0.3984375, + "step": 4158, + "time_per_iteration": 2.557003974914551 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043464, + "balance_loss_mlp": 1.00357628, + "epoch": 0.8001154290111582, + "flos": 1028977373184.0, + "grad_norm": 0.03912877230354993, + "language_loss": 0.82057023, + "learning_rate": 0.00010116815725325751, + "loss": 0.83100486, + "num_input_tokens_seen": 344864928, + "router_z_loss_mlp": 0.39868164, + "step": 4159, + "time_per_iteration": 3.304746389389038 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043882, + "balance_loss_mlp": 1.00401807, + "epoch": 0.8003078106964217, + "flos": 752270718720.0, + "grad_norm": 0.03707561964119669, + "language_loss": 0.81281012, + "learning_rate": 0.00010098034273455725, + "loss": 0.82324892, + "num_input_tokens_seen": 344944048, + "router_z_loss_mlp": 0.3984375, + "step": 4160, + "time_per_iteration": 2.93477463722229 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043012, + "balance_loss_mlp": 1.00317216, + "epoch": 0.8005001923816852, + "flos": 489526884864.0, + "grad_norm": 0.03420748582066261, + "language_loss": 0.80276787, + "learning_rate": 0.00010079268312902662, + "loss": 0.81319797, + "num_input_tokens_seen": 345015392, + "router_z_loss_mlp": 0.39819336, + "step": 4161, + "time_per_iteration": 2.6929373741149902 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044581, + "balance_loss_mlp": 1.00474119, + "epoch": 0.8006925740669488, + "flos": 514313709312.0, + "grad_norm": 0.033458859540608864, + "language_loss": 0.82609326, + "learning_rate": 0.0001006051785095215, + "loss": 0.83653903, + "num_input_tokens_seen": 345086640, + "router_z_loss_mlp": 0.39819336, + "step": 4162, + "time_per_iteration": 2.734436511993408 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046338, + "balance_loss_mlp": 1.00642645, + "epoch": 0.8008849557522124, + "flos": 579680585472.0, + "grad_norm": 0.03667202832039182, + "language_loss": 0.79988742, + "learning_rate": 0.0001004178289488376, + "loss": 0.81035084, + "num_input_tokens_seen": 345159616, + "router_z_loss_mlp": 0.39892578, + "step": 4163, + "time_per_iteration": 2.767613410949707 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046056, + "balance_loss_mlp": 1.00626385, + "epoch": 0.801077337437476, + "flos": 479681756928.0, + "grad_norm": 0.03506615543600683, + "language_loss": 0.84141004, + "learning_rate": 0.0001002306345197106, + "loss": 0.8518706, + "num_input_tokens_seen": 345225536, + "router_z_loss_mlp": 0.39770508, + "step": 4164, + "time_per_iteration": 2.631165027618408 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104532, + "balance_loss_mlp": 1.00552833, + "epoch": 0.8012697191227395, + "flos": 677968571136.0, + "grad_norm": 0.045614046534047034, + "language_loss": 0.80445755, + "learning_rate": 0.00010004359529481571, + "loss": 0.81491077, + "num_input_tokens_seen": 345302960, + "router_z_loss_mlp": 0.39770508, + "step": 4165, + "time_per_iteration": 2.901402473449707 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045546, + "balance_loss_mlp": 1.00572991, + "epoch": 0.8014621008080031, + "flos": 1297172576256.0, + "grad_norm": 0.042609498676076864, + "language_loss": 0.83314872, + "learning_rate": 9.985671134676804e-05, + "loss": 0.84360421, + "num_input_tokens_seen": 345397792, + "router_z_loss_mlp": 0.39794922, + "step": 4166, + "time_per_iteration": 3.69667911529541 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046315, + "balance_loss_mlp": 1.00647485, + "epoch": 0.8016544824932667, + "flos": 512826442752.0, + "grad_norm": 0.041586651320783194, + "language_loss": 0.83886582, + "learning_rate": 9.966998274812234e-05, + "loss": 0.84932899, + "num_input_tokens_seen": 345465440, + "router_z_loss_mlp": 0.39819336, + "step": 4167, + "time_per_iteration": 2.5852413177490234 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043819, + "balance_loss_mlp": 1.0039798, + "epoch": 0.8018468641785302, + "flos": 536718961920.0, + "grad_norm": 0.04260031626477269, + "language_loss": 0.82111335, + "learning_rate": 9.948340957137308e-05, + "loss": 0.83155155, + "num_input_tokens_seen": 345533072, + "router_z_loss_mlp": 0.39819336, + "step": 4168, + "time_per_iteration": 2.6272878646850586 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044233, + "balance_loss_mlp": 1.00434577, + "epoch": 0.8020392458637937, + "flos": 1025058957312.0, + "grad_norm": 0.03898276528172633, + "language_loss": 0.80097771, + "learning_rate": 9.929699188895447e-05, + "loss": 0.81142002, + "num_input_tokens_seen": 345622208, + "router_z_loss_mlp": 0.39868164, + "step": 4169, + "time_per_iteration": 3.2593564987182617 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01041039, + "balance_loss_mlp": 1.00231934, + "epoch": 0.8022316275490573, + "flos": 1565073226752.0, + "grad_norm": 0.005727989546887444, + "language_loss": 0.78054404, + "learning_rate": 9.911072977324009e-05, + "loss": 0.79095441, + "num_input_tokens_seen": 345852544, + "router_z_loss_mlp": 0.38671875, + "step": 4170, + "time_per_iteration": 4.9659693241119385 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047428, + "balance_loss_mlp": 1.0074693, + "epoch": 0.8024240092343209, + "flos": 421602492672.0, + "grad_norm": 0.0363857175356543, + "language_loss": 0.83789802, + "learning_rate": 9.89246232965435e-05, + "loss": 0.84837228, + "num_input_tokens_seen": 345917328, + "router_z_loss_mlp": 0.39941406, + "step": 4171, + "time_per_iteration": 2.524991989135742 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047654, + "balance_loss_mlp": 1.0077666, + "epoch": 0.8026163909195845, + "flos": 765163289856.0, + "grad_norm": 0.03859277730807378, + "language_loss": 0.79318523, + "learning_rate": 9.873867253111762e-05, + "loss": 0.8036617, + "num_input_tokens_seen": 345995936, + "router_z_loss_mlp": 0.39868164, + "step": 4172, + "time_per_iteration": 2.919250726699829 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047932, + "balance_loss_mlp": 1.00911713, + "epoch": 0.8028087726048481, + "flos": 1522144651008.0, + "grad_norm": 0.012840205003212634, + "language_loss": 0.80264562, + "learning_rate": 9.855287754915503e-05, + "loss": 0.81312495, + "num_input_tokens_seen": 346232720, + "router_z_loss_mlp": 0.38769531, + "step": 4173, + "time_per_iteration": 4.94536828994751 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046693, + "balance_loss_mlp": 1.00673401, + "epoch": 0.8030011542901115, + "flos": 518830922496.0, + "grad_norm": 0.03954362492691932, + "language_loss": 0.89050967, + "learning_rate": 9.836723842278733e-05, + "loss": 0.90097666, + "num_input_tokens_seen": 346298208, + "router_z_loss_mlp": 0.39941406, + "step": 4174, + "time_per_iteration": 2.5619349479675293 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046949, + "balance_loss_mlp": 1.00696588, + "epoch": 0.8031935359753751, + "flos": 546659354112.0, + "grad_norm": 0.036526547211400404, + "language_loss": 0.78600073, + "learning_rate": 9.818175522408646e-05, + "loss": 0.79647022, + "num_input_tokens_seen": 346370080, + "router_z_loss_mlp": 0.3996582, + "step": 4175, + "time_per_iteration": 2.650787115097046 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046695, + "balance_loss_mlp": 1.00668824, + "epoch": 0.8033859176606387, + "flos": 604736673024.0, + "grad_norm": 0.03137628778903353, + "language_loss": 0.85134256, + "learning_rate": 9.79964280250632e-05, + "loss": 0.86180949, + "num_input_tokens_seen": 346442432, + "router_z_loss_mlp": 0.39990234, + "step": 4176, + "time_per_iteration": 2.7565901279449463 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046784, + "balance_loss_mlp": 1.00675344, + "epoch": 0.8035782993459023, + "flos": 566985345792.0, + "grad_norm": 0.038450106349373375, + "language_loss": 0.82200831, + "learning_rate": 9.781125689766795e-05, + "loss": 0.83247614, + "num_input_tokens_seen": 346513088, + "router_z_loss_mlp": 0.40014648, + "step": 4177, + "time_per_iteration": 2.7695512771606445 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046088, + "balance_loss_mlp": 1.00593793, + "epoch": 0.8037706810311658, + "flos": 539473809408.0, + "grad_norm": 0.047429405417763595, + "language_loss": 0.85172522, + "learning_rate": 9.762624191379054e-05, + "loss": 0.86218613, + "num_input_tokens_seen": 346581376, + "router_z_loss_mlp": 0.40136719, + "step": 4178, + "time_per_iteration": 2.6202852725982666 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046139, + "balance_loss_mlp": 1.00601351, + "epoch": 0.8039630627164294, + "flos": 516195638784.0, + "grad_norm": 0.036004712534776565, + "language_loss": 0.79951143, + "learning_rate": 9.744138314526014e-05, + "loss": 0.80997288, + "num_input_tokens_seen": 346653328, + "router_z_loss_mlp": 0.40112305, + "step": 4179, + "time_per_iteration": 2.6379129886627197 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046989, + "balance_loss_mlp": 1.00769806, + "epoch": 0.804155444401693, + "flos": 1481939820288.0, + "grad_norm": 0.005350241122210075, + "language_loss": 0.74733561, + "learning_rate": 9.725668066384535e-05, + "loss": 0.75780553, + "num_input_tokens_seen": 346873264, + "router_z_loss_mlp": 0.39257812, + "step": 4180, + "time_per_iteration": 4.866838693618774 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045581, + "balance_loss_mlp": 1.00545502, + "epoch": 0.8043478260869565, + "flos": 522189424896.0, + "grad_norm": 0.036924486045392176, + "language_loss": 0.77422369, + "learning_rate": 9.707213454125396e-05, + "loss": 0.78467953, + "num_input_tokens_seen": 346946272, + "router_z_loss_mlp": 0.40112305, + "step": 4181, + "time_per_iteration": 2.67852783203125 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045883, + "balance_loss_mlp": 1.00568521, + "epoch": 0.8045402077722201, + "flos": 546564089856.0, + "grad_norm": 0.03189319138496175, + "language_loss": 0.80777282, + "learning_rate": 9.688774484913298e-05, + "loss": 0.81823158, + "num_input_tokens_seen": 347024048, + "router_z_loss_mlp": 0.40185547, + "step": 4182, + "time_per_iteration": 2.8362486362457275 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047051, + "balance_loss_mlp": 1.00687742, + "epoch": 0.8047325894574836, + "flos": 679707604224.0, + "grad_norm": 0.03606539068582512, + "language_loss": 0.74659956, + "learning_rate": 9.670351165906921e-05, + "loss": 0.75707006, + "num_input_tokens_seen": 347108736, + "router_z_loss_mlp": 0.40161133, + "step": 4183, + "time_per_iteration": 2.9409847259521484 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049374, + "balance_loss_mlp": 1.0092963, + "epoch": 0.8049249711427472, + "flos": 588329097216.0, + "grad_norm": 0.03633262888197943, + "language_loss": 0.79051793, + "learning_rate": 9.65194350425882e-05, + "loss": 0.80101168, + "num_input_tokens_seen": 347184192, + "router_z_loss_mlp": 0.40063477, + "step": 4184, + "time_per_iteration": 2.737316131591797 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105025, + "balance_loss_mlp": 1.01019537, + "epoch": 0.8051173528280108, + "flos": 815681788416.0, + "grad_norm": 0.04554718693460932, + "language_loss": 0.78688985, + "learning_rate": 9.633551507115452e-05, + "loss": 0.79739237, + "num_input_tokens_seen": 347282336, + "router_z_loss_mlp": 0.40039062, + "step": 4185, + "time_per_iteration": 3.1375975608825684 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049726, + "balance_loss_mlp": 1.00974321, + "epoch": 0.8053097345132744, + "flos": 726956061696.0, + "grad_norm": 0.034010858312542885, + "language_loss": 0.7827903, + "learning_rate": 9.615175181617259e-05, + "loss": 0.79328752, + "num_input_tokens_seen": 347364800, + "router_z_loss_mlp": 0.3996582, + "step": 4186, + "time_per_iteration": 2.971076011657715 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050316, + "balance_loss_mlp": 1.01026201, + "epoch": 0.805502116198538, + "flos": 749431300608.0, + "grad_norm": 0.03715615744266424, + "language_loss": 0.8164562, + "learning_rate": 9.596814534898552e-05, + "loss": 0.82695937, + "num_input_tokens_seen": 347443328, + "router_z_loss_mlp": 0.40039062, + "step": 4187, + "time_per_iteration": 2.964796304702759 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104701, + "balance_loss_mlp": 1.00690746, + "epoch": 0.8056944978838014, + "flos": 641482879488.0, + "grad_norm": 0.04269261124509272, + "language_loss": 0.88006115, + "learning_rate": 9.578469574087561e-05, + "loss": 0.8905313, + "num_input_tokens_seen": 347522064, + "router_z_loss_mlp": 0.40087891, + "step": 4188, + "time_per_iteration": 2.8772683143615723 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045441, + "balance_loss_mlp": 1.00536251, + "epoch": 0.805886879569065, + "flos": 645785264640.0, + "grad_norm": 0.037205213078360604, + "language_loss": 0.78592306, + "learning_rate": 9.560140306306436e-05, + "loss": 0.79637742, + "num_input_tokens_seen": 347597200, + "router_z_loss_mlp": 0.40063477, + "step": 4189, + "time_per_iteration": 2.9584858417510986 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048365, + "balance_loss_mlp": 1.00831056, + "epoch": 0.8060792612543286, + "flos": 662444606976.0, + "grad_norm": 0.03414135318032402, + "language_loss": 0.82343107, + "learning_rate": 9.541826738671233e-05, + "loss": 0.8339147, + "num_input_tokens_seen": 347676928, + "router_z_loss_mlp": 0.40039062, + "step": 4190, + "time_per_iteration": 2.850748300552368 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048736, + "balance_loss_mlp": 1.00861061, + "epoch": 0.8062716429395922, + "flos": 456012814080.0, + "grad_norm": 0.039964771774069895, + "language_loss": 0.83061063, + "learning_rate": 9.523528878291904e-05, + "loss": 0.84109795, + "num_input_tokens_seen": 347741552, + "router_z_loss_mlp": 0.40112305, + "step": 4191, + "time_per_iteration": 2.526196002960205 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045876, + "balance_loss_mlp": 1.00572634, + "epoch": 0.8064640246248557, + "flos": 527429856768.0, + "grad_norm": 0.04221200524238329, + "language_loss": 0.85863805, + "learning_rate": 9.50524673227231e-05, + "loss": 0.86909676, + "num_input_tokens_seen": 347807008, + "router_z_loss_mlp": 0.40136719, + "step": 4192, + "time_per_iteration": 2.6026053428649902 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044221, + "balance_loss_mlp": 1.00407135, + "epoch": 0.8066564063101193, + "flos": 866677575168.0, + "grad_norm": 0.03011720621266792, + "language_loss": 0.82612681, + "learning_rate": 9.486980307710208e-05, + "loss": 0.83656895, + "num_input_tokens_seen": 347895728, + "router_z_loss_mlp": 0.40136719, + "step": 4193, + "time_per_iteration": 3.2554736137390137 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044791, + "balance_loss_mlp": 1.00473714, + "epoch": 0.8068487879953828, + "flos": 531643780608.0, + "grad_norm": 0.04047187421116328, + "language_loss": 0.8236109, + "learning_rate": 9.468729611697246e-05, + "loss": 0.83405876, + "num_input_tokens_seen": 347970368, + "router_z_loss_mlp": 0.40039062, + "step": 4194, + "time_per_iteration": 2.708320379257202 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044539, + "balance_loss_mlp": 1.00450861, + "epoch": 0.8070411696806464, + "flos": 567247805952.0, + "grad_norm": 0.032246940295438974, + "language_loss": 0.82059777, + "learning_rate": 9.450494651319003e-05, + "loss": 0.83104318, + "num_input_tokens_seen": 348039040, + "router_z_loss_mlp": 0.40014648, + "step": 4195, + "time_per_iteration": 2.6272635459899902 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044359, + "balance_loss_mlp": 1.00428092, + "epoch": 0.80723355136591, + "flos": 988254425088.0, + "grad_norm": 0.037033582993084305, + "language_loss": 0.79562807, + "learning_rate": 9.432275433654885e-05, + "loss": 0.80607164, + "num_input_tokens_seen": 348126064, + "router_z_loss_mlp": 0.40063477, + "step": 4196, + "time_per_iteration": 3.2852365970611572 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044577, + "balance_loss_mlp": 1.0044992, + "epoch": 0.8074259330511735, + "flos": 568083785472.0, + "grad_norm": 0.04553692157756435, + "language_loss": 0.83215851, + "learning_rate": 9.414071965778221e-05, + "loss": 0.84260428, + "num_input_tokens_seen": 348205888, + "router_z_loss_mlp": 0.40063477, + "step": 4197, + "time_per_iteration": 2.825437545776367 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104441, + "balance_loss_mlp": 1.00442731, + "epoch": 0.8076183147364371, + "flos": 495752995584.0, + "grad_norm": 0.033139849122030246, + "language_loss": 0.80485344, + "learning_rate": 9.395884254756242e-05, + "loss": 0.8152976, + "num_input_tokens_seen": 348278608, + "router_z_loss_mlp": 0.3996582, + "step": 4198, + "time_per_iteration": 2.748180389404297 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044625, + "balance_loss_mlp": 1.00461829, + "epoch": 0.8078106964217007, + "flos": 420868580352.0, + "grad_norm": 0.042710700295185595, + "language_loss": 0.8034749, + "learning_rate": 9.377712307650044e-05, + "loss": 0.81392121, + "num_input_tokens_seen": 348341312, + "router_z_loss_mlp": 0.39990234, + "step": 4199, + "time_per_iteration": 2.4973928928375244 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044825, + "balance_loss_mlp": 1.00481844, + "epoch": 0.8080030781069643, + "flos": 528565234944.0, + "grad_norm": 0.036109062708885115, + "language_loss": 0.83559549, + "learning_rate": 9.359556131514602e-05, + "loss": 0.84604371, + "num_input_tokens_seen": 348409184, + "router_z_loss_mlp": 0.39990234, + "step": 4200, + "time_per_iteration": 2.6258418560028076 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043115, + "balance_loss_mlp": 1.00298929, + "epoch": 0.8081954597922277, + "flos": 545152645632.0, + "grad_norm": 0.040903251277153094, + "language_loss": 0.81791621, + "learning_rate": 9.341415733398733e-05, + "loss": 0.82834733, + "num_input_tokens_seen": 348480832, + "router_z_loss_mlp": 0.40112305, + "step": 4201, + "time_per_iteration": 2.6441001892089844 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043567, + "balance_loss_mlp": 1.00339341, + "epoch": 0.8083878414774913, + "flos": 642134166528.0, + "grad_norm": 0.11239758637794657, + "language_loss": 0.76130128, + "learning_rate": 9.323291120345207e-05, + "loss": 0.77173698, + "num_input_tokens_seen": 348559232, + "router_z_loss_mlp": 0.40161133, + "step": 4202, + "time_per_iteration": 2.8298070430755615 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044967, + "balance_loss_mlp": 1.00474536, + "epoch": 0.8085802231627549, + "flos": 706906136064.0, + "grad_norm": 0.03893641453034792, + "language_loss": 0.73079675, + "learning_rate": 9.305182299390614e-05, + "loss": 0.74124646, + "num_input_tokens_seen": 348638960, + "router_z_loss_mlp": 0.40209961, + "step": 4203, + "time_per_iteration": 2.8911709785461426 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043357, + "balance_loss_mlp": 1.00325489, + "epoch": 0.8087726048480185, + "flos": 420662500608.0, + "grad_norm": 0.03792090932692915, + "language_loss": 0.89067852, + "learning_rate": 9.287089277565409e-05, + "loss": 0.90111208, + "num_input_tokens_seen": 348704816, + "router_z_loss_mlp": 0.40087891, + "step": 4204, + "time_per_iteration": 2.5835013389587402 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043379, + "balance_loss_mlp": 1.00327671, + "epoch": 0.8089649865332821, + "flos": 509863570176.0, + "grad_norm": 0.028595163425198668, + "language_loss": 0.87236726, + "learning_rate": 9.269012061893922e-05, + "loss": 0.88280106, + "num_input_tokens_seen": 348783504, + "router_z_loss_mlp": 0.40087891, + "step": 4205, + "time_per_iteration": 2.761137008666992 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044131, + "balance_loss_mlp": 1.00391018, + "epoch": 0.8091573682185456, + "flos": 458262183168.0, + "grad_norm": 0.031965150246737496, + "language_loss": 0.85574394, + "learning_rate": 9.250950659394386e-05, + "loss": 0.86618531, + "num_input_tokens_seen": 348858272, + "router_z_loss_mlp": 0.40209961, + "step": 4206, + "time_per_iteration": 2.7414729595184326 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044993, + "balance_loss_mlp": 1.00491476, + "epoch": 0.8093497499038091, + "flos": 526375158528.0, + "grad_norm": 0.03398258533246476, + "language_loss": 0.77492428, + "learning_rate": 9.232905077078824e-05, + "loss": 0.78537422, + "num_input_tokens_seen": 348934432, + "router_z_loss_mlp": 0.40063477, + "step": 4207, + "time_per_iteration": 2.759403705596924 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044343, + "balance_loss_mlp": 1.00426447, + "epoch": 0.8095421315890727, + "flos": 490581583104.0, + "grad_norm": 0.040026247360877884, + "language_loss": 0.77545118, + "learning_rate": 9.214875321953164e-05, + "loss": 0.78589457, + "num_input_tokens_seen": 349003856, + "router_z_loss_mlp": 0.40063477, + "step": 4208, + "time_per_iteration": 2.59245228767395 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044626, + "balance_loss_mlp": 1.00459588, + "epoch": 0.8097345132743363, + "flos": 626284558848.0, + "grad_norm": 0.03435821442590694, + "language_loss": 0.81210512, + "learning_rate": 9.196861401017164e-05, + "loss": 0.82255137, + "num_input_tokens_seen": 349080544, + "router_z_loss_mlp": 0.40014648, + "step": 4209, + "time_per_iteration": 2.7602193355560303 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104591, + "balance_loss_mlp": 1.00590312, + "epoch": 0.8099268949595998, + "flos": 616873944576.0, + "grad_norm": 0.03716472832486093, + "language_loss": 0.79843062, + "learning_rate": 9.178863321264475e-05, + "loss": 0.80888975, + "num_input_tokens_seen": 349159072, + "router_z_loss_mlp": 0.39990234, + "step": 4210, + "time_per_iteration": 2.8025641441345215 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043825, + "balance_loss_mlp": 1.00381863, + "epoch": 0.8101192766448634, + "flos": 480684932352.0, + "grad_norm": 0.0329328024402014, + "language_loss": 0.80138117, + "learning_rate": 9.160881089682566e-05, + "loss": 0.81181943, + "num_input_tokens_seen": 349230176, + "router_z_loss_mlp": 0.39990234, + "step": 4211, + "time_per_iteration": 2.6329565048217773 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044763, + "balance_loss_mlp": 1.00482833, + "epoch": 0.810311658330127, + "flos": 518327389440.0, + "grad_norm": 0.03337868417274248, + "language_loss": 0.86965179, + "learning_rate": 9.142914713252725e-05, + "loss": 0.88009942, + "num_input_tokens_seen": 349299760, + "router_z_loss_mlp": 0.39916992, + "step": 4212, + "time_per_iteration": 2.6201486587524414 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042658, + "balance_loss_mlp": 1.00274634, + "epoch": 0.8105040400153906, + "flos": 576988921344.0, + "grad_norm": 0.02936598323523461, + "language_loss": 0.84615433, + "learning_rate": 9.124964198950159e-05, + "loss": 0.85658085, + "num_input_tokens_seen": 349379712, + "router_z_loss_mlp": 0.39892578, + "step": 4213, + "time_per_iteration": 2.7945985794067383 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043338, + "balance_loss_mlp": 1.00345039, + "epoch": 0.8106964217006541, + "flos": 640189053696.0, + "grad_norm": 0.03756812888703321, + "language_loss": 0.8566975, + "learning_rate": 9.107029553743862e-05, + "loss": 0.86713088, + "num_input_tokens_seen": 349460320, + "router_z_loss_mlp": 0.39868164, + "step": 4214, + "time_per_iteration": 2.809328317642212 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044171, + "balance_loss_mlp": 1.00428391, + "epoch": 0.8108888033859176, + "flos": 580585584384.0, + "grad_norm": 0.0376800294588735, + "language_loss": 0.81953692, + "learning_rate": 9.089110784596672e-05, + "loss": 0.82997859, + "num_input_tokens_seen": 349527648, + "router_z_loss_mlp": 0.39868164, + "step": 4215, + "time_per_iteration": 2.652230978012085 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042383, + "balance_loss_mlp": 1.00242364, + "epoch": 0.8110811850711812, + "flos": 561091681536.0, + "grad_norm": 0.03446014791580575, + "language_loss": 0.83807087, + "learning_rate": 9.071207898465284e-05, + "loss": 0.84849465, + "num_input_tokens_seen": 349606912, + "router_z_loss_mlp": 0.39941406, + "step": 4216, + "time_per_iteration": 2.785763740539551 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047291, + "balance_loss_mlp": 1.00790405, + "epoch": 0.8112735667564448, + "flos": 1521069543936.0, + "grad_norm": 0.007281864777553036, + "language_loss": 0.77260417, + "learning_rate": 9.053320902300205e-05, + "loss": 0.78307706, + "num_input_tokens_seen": 349827040, + "router_z_loss_mlp": 0.39355469, + "step": 4217, + "time_per_iteration": 4.774747848510742 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043843, + "balance_loss_mlp": 1.00388443, + "epoch": 0.8114659484417084, + "flos": 617516483328.0, + "grad_norm": 0.03805694470042781, + "language_loss": 0.86072737, + "learning_rate": 9.035449803045792e-05, + "loss": 0.87116575, + "num_input_tokens_seen": 349900080, + "router_z_loss_mlp": 0.39941406, + "step": 4218, + "time_per_iteration": 2.7768678665161133 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104246, + "balance_loss_mlp": 1.00259662, + "epoch": 0.8116583301269719, + "flos": 651262878720.0, + "grad_norm": 0.030415189633352945, + "language_loss": 0.79453695, + "learning_rate": 9.017594607640211e-05, + "loss": 0.80496156, + "num_input_tokens_seen": 349983568, + "router_z_loss_mlp": 0.3984375, + "step": 4219, + "time_per_iteration": 2.9257187843322754 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042621, + "balance_loss_mlp": 1.00273395, + "epoch": 0.8118507118122354, + "flos": 554196787200.0, + "grad_norm": 0.03896243269868023, + "language_loss": 0.80358791, + "learning_rate": 8.999755323015463e-05, + "loss": 0.81401414, + "num_input_tokens_seen": 350054928, + "router_z_loss_mlp": 0.39868164, + "step": 4220, + "time_per_iteration": 2.715939521789551 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044268, + "balance_loss_mlp": 1.00440466, + "epoch": 0.812043093497499, + "flos": 545178890496.0, + "grad_norm": 0.04144683362187673, + "language_loss": 0.87743199, + "learning_rate": 8.981931956097384e-05, + "loss": 0.88787466, + "num_input_tokens_seen": 350127872, + "router_z_loss_mlp": 0.3984375, + "step": 4221, + "time_per_iteration": 2.6466968059539795 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104378, + "balance_loss_mlp": 1.0038445, + "epoch": 0.8122354751827626, + "flos": 584575931904.0, + "grad_norm": 0.0323625870343774, + "language_loss": 0.84112966, + "learning_rate": 8.964124513805628e-05, + "loss": 0.85156739, + "num_input_tokens_seen": 350206592, + "router_z_loss_mlp": 0.39916992, + "step": 4222, + "time_per_iteration": 2.7797539234161377 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046749, + "balance_loss_mlp": 1.00736237, + "epoch": 0.8124278568680262, + "flos": 1533862960128.0, + "grad_norm": 0.005674804601180374, + "language_loss": 0.78250074, + "learning_rate": 8.94633300305363e-05, + "loss": 0.79296821, + "num_input_tokens_seen": 350436048, + "router_z_loss_mlp": 0.39355469, + "step": 4223, + "time_per_iteration": 4.989461660385132 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104717, + "balance_loss_mlp": 1.00716376, + "epoch": 0.8126202385532897, + "flos": 433767954432.0, + "grad_norm": 0.03852069724209621, + "language_loss": 0.80558193, + "learning_rate": 8.928557430748668e-05, + "loss": 0.81605363, + "num_input_tokens_seen": 350501376, + "router_z_loss_mlp": 0.39990234, + "step": 4224, + "time_per_iteration": 2.5844249725341797 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048344, + "balance_loss_mlp": 1.00914764, + "epoch": 0.8128126202385533, + "flos": 1551149289984.0, + "grad_norm": 0.0078100354933002825, + "language_loss": 0.76495624, + "learning_rate": 8.910797803791854e-05, + "loss": 0.77543974, + "num_input_tokens_seen": 350735232, + "router_z_loss_mlp": 0.39160156, + "step": 4225, + "time_per_iteration": 4.842838287353516 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044264, + "balance_loss_mlp": 1.00437629, + "epoch": 0.8130050019238169, + "flos": 529338031104.0, + "grad_norm": 0.033874870691741325, + "language_loss": 0.89490134, + "learning_rate": 8.893054129078077e-05, + "loss": 0.90534395, + "num_input_tokens_seen": 350805088, + "router_z_loss_mlp": 0.39868164, + "step": 4226, + "time_per_iteration": 2.643038749694824 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046361, + "balance_loss_mlp": 1.00647342, + "epoch": 0.8131973836090804, + "flos": 544228204800.0, + "grad_norm": 0.040352466131287415, + "language_loss": 0.80754006, + "learning_rate": 8.875326413496037e-05, + "loss": 0.81800371, + "num_input_tokens_seen": 350876896, + "router_z_loss_mlp": 0.39868164, + "step": 4227, + "time_per_iteration": 2.749776601791382 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046115, + "balance_loss_mlp": 1.00613213, + "epoch": 0.8133897652943439, + "flos": 577578970368.0, + "grad_norm": 0.03996024422287757, + "language_loss": 0.82968926, + "learning_rate": 8.857614663928249e-05, + "loss": 0.84015042, + "num_input_tokens_seen": 350948400, + "router_z_loss_mlp": 0.3996582, + "step": 4228, + "time_per_iteration": 2.7195777893066406 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045285, + "balance_loss_mlp": 1.00532568, + "epoch": 0.8135821469796075, + "flos": 580351314432.0, + "grad_norm": 0.039368139599927306, + "language_loss": 0.79510874, + "learning_rate": 8.839918887251025e-05, + "loss": 0.8055616, + "num_input_tokens_seen": 351023328, + "router_z_loss_mlp": 0.39941406, + "step": 4229, + "time_per_iteration": 2.764267921447754 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010458, + "balance_loss_mlp": 1.00579298, + "epoch": 0.8137745286648711, + "flos": 651644902656.0, + "grad_norm": 0.033713073313620584, + "language_loss": 0.84232569, + "learning_rate": 8.822239090334472e-05, + "loss": 0.85278368, + "num_input_tokens_seen": 351108672, + "router_z_loss_mlp": 0.39990234, + "step": 4230, + "time_per_iteration": 2.923346757888794 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045567, + "balance_loss_mlp": 1.00553608, + "epoch": 0.8139669103501347, + "flos": 703128671232.0, + "grad_norm": 0.036115570851931435, + "language_loss": 0.76082253, + "learning_rate": 8.804575280042493e-05, + "loss": 0.77127826, + "num_input_tokens_seen": 351185056, + "router_z_loss_mlp": 0.40014648, + "step": 4231, + "time_per_iteration": 2.955892562866211 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044741, + "balance_loss_mlp": 1.00483012, + "epoch": 0.8141592920353983, + "flos": 651388278528.0, + "grad_norm": 0.04294462477319246, + "language_loss": 0.83589506, + "learning_rate": 8.786927463232774e-05, + "loss": 0.84634244, + "num_input_tokens_seen": 351255856, + "router_z_loss_mlp": 0.39892578, + "step": 4232, + "time_per_iteration": 2.7943365573883057 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044397, + "balance_loss_mlp": 1.0044142, + "epoch": 0.8143516737206618, + "flos": 537845591808.0, + "grad_norm": 0.03939422640128119, + "language_loss": 0.816208, + "learning_rate": 8.769295646756853e-05, + "loss": 0.82665199, + "num_input_tokens_seen": 351322336, + "router_z_loss_mlp": 0.3996582, + "step": 4233, + "time_per_iteration": 2.6279783248901367 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104504, + "balance_loss_mlp": 1.00510478, + "epoch": 0.8145440554059253, + "flos": 509363927808.0, + "grad_norm": 0.0369441276850866, + "language_loss": 0.83043873, + "learning_rate": 8.751679837459963e-05, + "loss": 0.84088916, + "num_input_tokens_seen": 351387440, + "router_z_loss_mlp": 0.39916992, + "step": 4234, + "time_per_iteration": 2.5817222595214844 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045045, + "balance_loss_mlp": 1.00513363, + "epoch": 0.8147364370911889, + "flos": 636288134400.0, + "grad_norm": 0.034229096047118546, + "language_loss": 0.8689273, + "learning_rate": 8.734080042181181e-05, + "loss": 0.87937772, + "num_input_tokens_seen": 351464192, + "router_z_loss_mlp": 0.39892578, + "step": 4235, + "time_per_iteration": 2.8325142860412598 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045233, + "balance_loss_mlp": 1.0052501, + "epoch": 0.8149288187764525, + "flos": 423706053120.0, + "grad_norm": 0.037533460123593716, + "language_loss": 0.78556967, + "learning_rate": 8.716496267753343e-05, + "loss": 0.79602206, + "num_input_tokens_seen": 351528016, + "router_z_loss_mlp": 0.3996582, + "step": 4236, + "time_per_iteration": 2.4753267765045166 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045064, + "balance_loss_mlp": 1.0051285, + "epoch": 0.8151212004617161, + "flos": 598621377792.0, + "grad_norm": 0.03507677024776033, + "language_loss": 0.82292378, + "learning_rate": 8.698928521003097e-05, + "loss": 0.83337444, + "num_input_tokens_seen": 351601648, + "router_z_loss_mlp": 0.39916992, + "step": 4237, + "time_per_iteration": 2.8309948444366455 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042789, + "balance_loss_mlp": 1.00378418, + "epoch": 0.8153135821469796, + "flos": 1482415163136.0, + "grad_norm": 0.004372930675089117, + "language_loss": 0.77852845, + "learning_rate": 8.681376808750835e-05, + "loss": 0.78895634, + "num_input_tokens_seen": 351826720, + "router_z_loss_mlp": 0.38964844, + "step": 4238, + "time_per_iteration": 5.034019231796265 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043236, + "balance_loss_mlp": 1.00327706, + "epoch": 0.8155059638322432, + "flos": 438012013824.0, + "grad_norm": 0.05133658449911632, + "language_loss": 0.83284819, + "learning_rate": 8.663841137810741e-05, + "loss": 0.84328049, + "num_input_tokens_seen": 351891760, + "router_z_loss_mlp": 0.39941406, + "step": 4239, + "time_per_iteration": 2.4995291233062744 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043057, + "balance_loss_mlp": 1.0030508, + "epoch": 0.8156983455175068, + "flos": 795820445952.0, + "grad_norm": 0.03774301364361203, + "language_loss": 0.85754836, + "learning_rate": 8.646321514990763e-05, + "loss": 0.86797893, + "num_input_tokens_seen": 351977504, + "router_z_loss_mlp": 0.39990234, + "step": 4240, + "time_per_iteration": 3.0334153175354004 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044039, + "balance_loss_mlp": 1.00405586, + "epoch": 0.8158907272027703, + "flos": 687194492928.0, + "grad_norm": 0.0362603344870212, + "language_loss": 0.82029748, + "learning_rate": 8.628817947092616e-05, + "loss": 0.83073783, + "num_input_tokens_seen": 352050176, + "router_z_loss_mlp": 0.3996582, + "step": 4241, + "time_per_iteration": 2.873093843460083 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010462, + "balance_loss_mlp": 1.00631261, + "epoch": 0.8160831088880338, + "flos": 488030870016.0, + "grad_norm": 0.04708907661610768, + "language_loss": 0.84995806, + "learning_rate": 8.611330440911797e-05, + "loss": 0.86041999, + "num_input_tokens_seen": 352116848, + "router_z_loss_mlp": 0.39868164, + "step": 4242, + "time_per_iteration": 2.61018967628479 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043886, + "balance_loss_mlp": 1.00390291, + "epoch": 0.8162754905732974, + "flos": 465822948864.0, + "grad_norm": 0.0364863486615585, + "language_loss": 0.80885643, + "learning_rate": 8.593859003237558e-05, + "loss": 0.81929529, + "num_input_tokens_seen": 352185056, + "router_z_loss_mlp": 0.3996582, + "step": 4243, + "time_per_iteration": 2.594784736633301 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043095, + "balance_loss_mlp": 1.00408936, + "epoch": 0.816467872258561, + "flos": 1242145601280.0, + "grad_norm": 0.0055455657933980934, + "language_loss": 0.75285125, + "learning_rate": 8.576403640852904e-05, + "loss": 0.76328218, + "num_input_tokens_seen": 352397648, + "router_z_loss_mlp": 0.38964844, + "step": 4244, + "time_per_iteration": 4.718213081359863 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104301, + "balance_loss_mlp": 1.00290811, + "epoch": 0.8166602539438246, + "flos": 688403748096.0, + "grad_norm": 0.03188594993660783, + "language_loss": 0.8693856, + "learning_rate": 8.558964360534615e-05, + "loss": 0.8798157, + "num_input_tokens_seen": 352478272, + "router_z_loss_mlp": 0.40087891, + "step": 4245, + "time_per_iteration": 2.9532947540283203 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043255, + "balance_loss_mlp": 1.00424957, + "epoch": 0.8168526356290882, + "flos": 1493919611136.0, + "grad_norm": 0.0050365971652065996, + "language_loss": 0.72974741, + "learning_rate": 8.541541169053219e-05, + "loss": 0.74017996, + "num_input_tokens_seen": 352707104, + "router_z_loss_mlp": 0.38964844, + "step": 4246, + "time_per_iteration": 4.9726879596710205 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044717, + "balance_loss_mlp": 1.00471079, + "epoch": 0.8170450173143516, + "flos": 579300506880.0, + "grad_norm": 0.03921262861389307, + "language_loss": 0.84971178, + "learning_rate": 8.524134073172984e-05, + "loss": 0.86015892, + "num_input_tokens_seen": 352779248, + "router_z_loss_mlp": 0.39990234, + "step": 4247, + "time_per_iteration": 2.737804651260376 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044991, + "balance_loss_mlp": 1.00498414, + "epoch": 0.8172373989996152, + "flos": 572438660352.0, + "grad_norm": 0.034223737538548314, + "language_loss": 0.85163987, + "learning_rate": 8.506743079651974e-05, + "loss": 0.86208975, + "num_input_tokens_seen": 352856784, + "router_z_loss_mlp": 0.39990234, + "step": 4248, + "time_per_iteration": 2.743518352508545 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045023, + "balance_loss_mlp": 1.00501621, + "epoch": 0.8174297806848788, + "flos": 529859060736.0, + "grad_norm": 0.04399953494353778, + "language_loss": 0.8105247, + "learning_rate": 8.489368195241948e-05, + "loss": 0.82097489, + "num_input_tokens_seen": 352926496, + "router_z_loss_mlp": 0.39990234, + "step": 4249, + "time_per_iteration": 2.6323330402374268 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044479, + "balance_loss_mlp": 1.00451958, + "epoch": 0.8176221623701424, + "flos": 570269971200.0, + "grad_norm": 0.038692902180605414, + "language_loss": 0.79434025, + "learning_rate": 8.47200942668846e-05, + "loss": 0.80478501, + "num_input_tokens_seen": 353005312, + "router_z_loss_mlp": 0.39941406, + "step": 4250, + "time_per_iteration": 2.815521240234375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045712, + "balance_loss_mlp": 1.00572968, + "epoch": 0.8178145440554059, + "flos": 657707708160.0, + "grad_norm": 0.043415035680912505, + "language_loss": 0.80942428, + "learning_rate": 8.454666780730735e-05, + "loss": 0.81988138, + "num_input_tokens_seen": 353085120, + "router_z_loss_mlp": 0.3996582, + "step": 4251, + "time_per_iteration": 2.8530821800231934 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045042, + "balance_loss_mlp": 1.00503504, + "epoch": 0.8180069257406695, + "flos": 547056929280.0, + "grad_norm": 0.03736687564854558, + "language_loss": 0.88220131, + "learning_rate": 8.437340264101828e-05, + "loss": 0.8926518, + "num_input_tokens_seen": 353160992, + "router_z_loss_mlp": 0.39990234, + "step": 4252, + "time_per_iteration": 2.708724021911621 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044126, + "balance_loss_mlp": 1.004143, + "epoch": 0.818199307425933, + "flos": 620412281856.0, + "grad_norm": 0.03556485769365952, + "language_loss": 0.85256195, + "learning_rate": 8.420029883528474e-05, + "loss": 0.86300319, + "num_input_tokens_seen": 353233328, + "router_z_loss_mlp": 0.3996582, + "step": 4253, + "time_per_iteration": 2.7210686206817627 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044408, + "balance_loss_mlp": 1.00437737, + "epoch": 0.8183916891111966, + "flos": 648935741952.0, + "grad_norm": 0.0363052045214381, + "language_loss": 0.77293622, + "learning_rate": 8.402735645731157e-05, + "loss": 0.78338039, + "num_input_tokens_seen": 353310592, + "router_z_loss_mlp": 0.40014648, + "step": 4254, + "time_per_iteration": 2.8884494304656982 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044364, + "balance_loss_mlp": 1.00440454, + "epoch": 0.8185840707964602, + "flos": 500103012864.0, + "grad_norm": 0.03824906610181014, + "language_loss": 0.78647155, + "learning_rate": 8.385457557424098e-05, + "loss": 0.79691517, + "num_input_tokens_seen": 353376544, + "router_z_loss_mlp": 0.39941406, + "step": 4255, + "time_per_iteration": 2.5975873470306396 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045347, + "balance_loss_mlp": 1.00541151, + "epoch": 0.8187764524817237, + "flos": 787612283904.0, + "grad_norm": 0.03247097866724817, + "language_loss": 0.8011173, + "learning_rate": 8.368195625315251e-05, + "loss": 0.81157076, + "num_input_tokens_seen": 353461200, + "router_z_loss_mlp": 0.39916992, + "step": 4256, + "time_per_iteration": 3.1064188480377197 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044034, + "balance_loss_mlp": 1.00409889, + "epoch": 0.8189688341669873, + "flos": 551787025152.0, + "grad_norm": 0.03028491007701996, + "language_loss": 0.81150901, + "learning_rate": 8.350949856106283e-05, + "loss": 0.82194936, + "num_input_tokens_seen": 353538608, + "router_z_loss_mlp": 0.39916992, + "step": 4257, + "time_per_iteration": 2.832043409347534 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047611, + "balance_loss_mlp": 1.00860596, + "epoch": 0.8191612158522509, + "flos": 1354883410944.0, + "grad_norm": 0.005783385057534148, + "language_loss": 0.71149343, + "learning_rate": 8.333720256492599e-05, + "loss": 0.72196954, + "num_input_tokens_seen": 353766960, + "router_z_loss_mlp": 0.38964844, + "step": 4258, + "time_per_iteration": 4.853669881820679 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045252, + "balance_loss_mlp": 1.00529289, + "epoch": 0.8193535975375145, + "flos": 545300399616.0, + "grad_norm": 0.04084284494867611, + "language_loss": 0.84286118, + "learning_rate": 8.316506833163318e-05, + "loss": 0.85331368, + "num_input_tokens_seen": 353833552, + "router_z_loss_mlp": 0.39941406, + "step": 4259, + "time_per_iteration": 2.628735065460205 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045378, + "balance_loss_mlp": 1.0054425, + "epoch": 0.8195459792227779, + "flos": 867228740352.0, + "grad_norm": 0.029318190080420886, + "language_loss": 0.8587026, + "learning_rate": 8.299309592801297e-05, + "loss": 0.86915636, + "num_input_tokens_seen": 353915520, + "router_z_loss_mlp": 0.39916992, + "step": 4260, + "time_per_iteration": 3.0968639850616455 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043987, + "balance_loss_mlp": 1.00400376, + "epoch": 0.8197383609080415, + "flos": 570410922240.0, + "grad_norm": 0.03757495975364379, + "language_loss": 0.82012129, + "learning_rate": 8.282128542083101e-05, + "loss": 0.83056116, + "num_input_tokens_seen": 353992048, + "router_z_loss_mlp": 0.3996582, + "step": 4261, + "time_per_iteration": 2.7136785984039307 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044401, + "balance_loss_mlp": 1.0044421, + "epoch": 0.8199307425933051, + "flos": 531886798848.0, + "grad_norm": 0.03681727702304477, + "language_loss": 0.85360086, + "learning_rate": 8.264963687678978e-05, + "loss": 0.8640449, + "num_input_tokens_seen": 354064848, + "router_z_loss_mlp": 0.39941406, + "step": 4262, + "time_per_iteration": 2.646632432937622 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045024, + "balance_loss_mlp": 1.00508881, + "epoch": 0.8201231242785687, + "flos": 568231539456.0, + "grad_norm": 0.034428735556058375, + "language_loss": 0.85892022, + "learning_rate": 8.247815036252921e-05, + "loss": 0.86937046, + "num_input_tokens_seen": 354138848, + "router_z_loss_mlp": 0.39916992, + "step": 4263, + "time_per_iteration": 2.7189278602600098 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044973, + "balance_loss_mlp": 1.00503826, + "epoch": 0.8203155059638323, + "flos": 1232385055488.0, + "grad_norm": 0.043350247910763196, + "language_loss": 0.83505571, + "learning_rate": 8.230682594462652e-05, + "loss": 0.84550548, + "num_input_tokens_seen": 354227696, + "router_z_loss_mlp": 0.39916992, + "step": 4264, + "time_per_iteration": 3.5374419689178467 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044594, + "balance_loss_mlp": 1.00456357, + "epoch": 0.8205078876490958, + "flos": 575280023808.0, + "grad_norm": 0.03192227347796584, + "language_loss": 0.80383801, + "learning_rate": 8.213566368959558e-05, + "loss": 0.81428391, + "num_input_tokens_seen": 354298400, + "router_z_loss_mlp": 0.40014648, + "step": 4265, + "time_per_iteration": 2.681304931640625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044169, + "balance_loss_mlp": 1.00418675, + "epoch": 0.8207002693343594, + "flos": 932986388736.0, + "grad_norm": 0.03621668430838832, + "language_loss": 0.79055989, + "learning_rate": 8.196466366388744e-05, + "loss": 0.80100161, + "num_input_tokens_seen": 354385024, + "router_z_loss_mlp": 0.3996582, + "step": 4266, + "time_per_iteration": 3.221090316772461 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045676, + "balance_loss_mlp": 1.00578809, + "epoch": 0.8208926510196229, + "flos": 550660395264.0, + "grad_norm": 0.04514159408391212, + "language_loss": 0.81006944, + "learning_rate": 8.179382593389029e-05, + "loss": 0.82052624, + "num_input_tokens_seen": 354456384, + "router_z_loss_mlp": 0.39868164, + "step": 4267, + "time_per_iteration": 2.64736270904541 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045126, + "balance_loss_mlp": 1.0050478, + "epoch": 0.8210850327048865, + "flos": 649413030144.0, + "grad_norm": 0.03228047800877003, + "language_loss": 0.82577145, + "learning_rate": 8.162315056592918e-05, + "loss": 0.83622265, + "num_input_tokens_seen": 354531296, + "router_z_loss_mlp": 0.40063477, + "step": 4268, + "time_per_iteration": 2.82000994682312 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046586, + "balance_loss_mlp": 1.00669813, + "epoch": 0.82127741439015, + "flos": 602698241280.0, + "grad_norm": 0.03546193069409799, + "language_loss": 0.81945211, + "learning_rate": 8.145263762626615e-05, + "loss": 0.82991797, + "num_input_tokens_seen": 354605680, + "router_z_loss_mlp": 0.39868164, + "step": 4269, + "time_per_iteration": 2.735588550567627 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046582, + "balance_loss_mlp": 1.00664651, + "epoch": 0.8214697960754136, + "flos": 475854714624.0, + "grad_norm": 0.033877079052907766, + "language_loss": 0.84021544, + "learning_rate": 8.128228718110015e-05, + "loss": 0.85068125, + "num_input_tokens_seen": 354678160, + "router_z_loss_mlp": 0.39916992, + "step": 4270, + "time_per_iteration": 2.667363166809082 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045275, + "balance_loss_mlp": 1.00529253, + "epoch": 0.8216621777606772, + "flos": 905094773760.0, + "grad_norm": 0.03887576075130339, + "language_loss": 0.85440713, + "learning_rate": 8.11120992965671e-05, + "loss": 0.86485988, + "num_input_tokens_seen": 354751024, + "router_z_loss_mlp": 0.3996582, + "step": 4271, + "time_per_iteration": 3.068880558013916 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044467, + "balance_loss_mlp": 1.00446057, + "epoch": 0.8218545594459408, + "flos": 515496719616.0, + "grad_norm": 0.035519082612497935, + "language_loss": 0.82364231, + "learning_rate": 8.094207403873998e-05, + "loss": 0.83408695, + "num_input_tokens_seen": 354819408, + "router_z_loss_mlp": 0.39990234, + "step": 4272, + "time_per_iteration": 2.6724655628204346 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044288, + "balance_loss_mlp": 1.00425744, + "epoch": 0.8220469411312044, + "flos": 495559554816.0, + "grad_norm": 0.0323215818844035, + "language_loss": 0.86376536, + "learning_rate": 8.077221147362829e-05, + "loss": 0.87420821, + "num_input_tokens_seen": 354887376, + "router_z_loss_mlp": 0.40014648, + "step": 4273, + "time_per_iteration": 2.6066205501556396 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044867, + "balance_loss_mlp": 1.00486076, + "epoch": 0.8222393228164678, + "flos": 387276741888.0, + "grad_norm": 0.04119506633036295, + "language_loss": 0.90363312, + "learning_rate": 8.060251166717835e-05, + "loss": 0.91408181, + "num_input_tokens_seen": 354948288, + "router_z_loss_mlp": 0.39990234, + "step": 4274, + "time_per_iteration": 2.4332804679870605 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104479, + "balance_loss_mlp": 1.00485444, + "epoch": 0.8224317045017314, + "flos": 537630763776.0, + "grad_norm": 0.03442772169242134, + "language_loss": 0.87371385, + "learning_rate": 8.043297468527383e-05, + "loss": 0.88416171, + "num_input_tokens_seen": 355016912, + "router_z_loss_mlp": 0.39916992, + "step": 4275, + "time_per_iteration": 2.6473186016082764 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043826, + "balance_loss_mlp": 1.00384283, + "epoch": 0.822624086186995, + "flos": 555948459264.0, + "grad_norm": 0.03742835179571848, + "language_loss": 0.8278271, + "learning_rate": 8.02636005937346e-05, + "loss": 0.8382653, + "num_input_tokens_seen": 355085936, + "router_z_loss_mlp": 0.3996582, + "step": 4276, + "time_per_iteration": 2.646347999572754 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044148, + "balance_loss_mlp": 1.00418925, + "epoch": 0.8228164678722586, + "flos": 540718057728.0, + "grad_norm": 0.032194648588505737, + "language_loss": 0.80336571, + "learning_rate": 8.009438945831771e-05, + "loss": 0.81380719, + "num_input_tokens_seen": 355161984, + "router_z_loss_mlp": 0.39941406, + "step": 4277, + "time_per_iteration": 2.7481751441955566 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045066, + "balance_loss_mlp": 1.00517821, + "epoch": 0.8230088495575221, + "flos": 474263435520.0, + "grad_norm": 0.04222743851278786, + "language_loss": 0.79908466, + "learning_rate": 7.992534134471641e-05, + "loss": 0.80953538, + "num_input_tokens_seen": 355234544, + "router_z_loss_mlp": 0.39868164, + "step": 4278, + "time_per_iteration": 2.649730920791626 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045006, + "balance_loss_mlp": 1.00504684, + "epoch": 0.8232012312427857, + "flos": 592751046144.0, + "grad_norm": 0.03975207751077369, + "language_loss": 0.83853042, + "learning_rate": 7.975645631856127e-05, + "loss": 0.84898043, + "num_input_tokens_seen": 355302896, + "router_z_loss_mlp": 0.39941406, + "step": 4279, + "time_per_iteration": 2.6517245769500732 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104457, + "balance_loss_mlp": 1.00475395, + "epoch": 0.8233936129280492, + "flos": 573788866560.0, + "grad_norm": 0.034088399185727584, + "language_loss": 0.75156295, + "learning_rate": 7.958773444541916e-05, + "loss": 0.76200867, + "num_input_tokens_seen": 355377040, + "router_z_loss_mlp": 0.39794922, + "step": 4280, + "time_per_iteration": 2.790695905685425 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044815, + "balance_loss_mlp": 1.0049988, + "epoch": 0.8235859946133128, + "flos": 732750571008.0, + "grad_norm": 0.030832979934739466, + "language_loss": 0.78604949, + "learning_rate": 7.941917579079383e-05, + "loss": 0.79649758, + "num_input_tokens_seen": 355461616, + "router_z_loss_mlp": 0.39794922, + "step": 4281, + "time_per_iteration": 2.999879837036133 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043865, + "balance_loss_mlp": 1.0039773, + "epoch": 0.8237783762985764, + "flos": 571398546432.0, + "grad_norm": 0.03880920292514566, + "language_loss": 0.82002759, + "learning_rate": 7.92507804201253e-05, + "loss": 0.83046621, + "num_input_tokens_seen": 355532480, + "router_z_loss_mlp": 0.39868164, + "step": 4282, + "time_per_iteration": 2.677208423614502 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104578, + "balance_loss_mlp": 1.0067749, + "epoch": 0.8239707579838399, + "flos": 1469427327744.0, + "grad_norm": 0.006396202661854135, + "language_loss": 0.75297678, + "learning_rate": 7.908254839879092e-05, + "loss": 0.76343453, + "num_input_tokens_seen": 355768752, + "router_z_loss_mlp": 0.38964844, + "step": 4283, + "time_per_iteration": 4.968418121337891 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010434, + "balance_loss_mlp": 1.00370288, + "epoch": 0.8241631396691035, + "flos": 468297839616.0, + "grad_norm": 0.046097230790764596, + "language_loss": 0.8120932, + "learning_rate": 7.89144797921037e-05, + "loss": 0.82252717, + "num_input_tokens_seen": 355838800, + "router_z_loss_mlp": 0.39672852, + "step": 4284, + "time_per_iteration": 2.6612024307250977 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044239, + "balance_loss_mlp": 1.0054245, + "epoch": 0.8243555213543671, + "flos": 1542552301056.0, + "grad_norm": 0.004349611814925143, + "language_loss": 0.77934271, + "learning_rate": 7.874657466531388e-05, + "loss": 0.78978509, + "num_input_tokens_seen": 356069280, + "router_z_loss_mlp": 0.38769531, + "step": 4285, + "time_per_iteration": 4.981449842453003 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045614, + "balance_loss_mlp": 1.00584543, + "epoch": 0.8245479030396307, + "flos": 798863998464.0, + "grad_norm": 0.03989950535073509, + "language_loss": 0.83088112, + "learning_rate": 7.85788330836078e-05, + "loss": 0.8413372, + "num_input_tokens_seen": 356164528, + "router_z_loss_mlp": 0.39746094, + "step": 4286, + "time_per_iteration": 3.1188526153564453 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045534, + "balance_loss_mlp": 1.00571775, + "epoch": 0.8247402847248941, + "flos": 647400843264.0, + "grad_norm": 0.03590906041018328, + "language_loss": 0.76881772, + "learning_rate": 7.841125511210878e-05, + "loss": 0.77927309, + "num_input_tokens_seen": 356243600, + "router_z_loss_mlp": 0.39794922, + "step": 4287, + "time_per_iteration": 2.9174938201904297 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044954, + "balance_loss_mlp": 1.00511408, + "epoch": 0.8249326664101577, + "flos": 605620284672.0, + "grad_norm": 0.03219362212927226, + "language_loss": 0.80230033, + "learning_rate": 7.824384081587637e-05, + "loss": 0.81274986, + "num_input_tokens_seen": 356320320, + "router_z_loss_mlp": 0.39819336, + "step": 4288, + "time_per_iteration": 2.795452833175659 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044781, + "balance_loss_mlp": 1.00496483, + "epoch": 0.8251250480954213, + "flos": 825828260352.0, + "grad_norm": 0.041963910969405445, + "language_loss": 0.86787474, + "learning_rate": 7.807659025990637e-05, + "loss": 0.87832254, + "num_input_tokens_seen": 356406928, + "router_z_loss_mlp": 0.39794922, + "step": 4289, + "time_per_iteration": 3.134443759918213 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042434, + "balance_loss_mlp": 1.00264227, + "epoch": 0.8253174297806849, + "flos": 758676664320.0, + "grad_norm": 0.039060289997601944, + "language_loss": 0.78858769, + "learning_rate": 7.790950350913112e-05, + "loss": 0.79901201, + "num_input_tokens_seen": 356481456, + "router_z_loss_mlp": 0.39770508, + "step": 4290, + "time_per_iteration": 2.944941520690918 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042771, + "balance_loss_mlp": 1.00305033, + "epoch": 0.8255098114659485, + "flos": 795994444800.0, + "grad_norm": 0.037141819973277965, + "language_loss": 0.87861943, + "learning_rate": 7.774258062841971e-05, + "loss": 0.88904715, + "num_input_tokens_seen": 356568736, + "router_z_loss_mlp": 0.39697266, + "step": 4291, + "time_per_iteration": 3.1870129108428955 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043682, + "balance_loss_mlp": 1.0038898, + "epoch": 0.825702193151212, + "flos": 711681918720.0, + "grad_norm": 0.035866698346178935, + "language_loss": 0.7753849, + "learning_rate": 7.757582168257731e-05, + "loss": 0.7858218, + "num_input_tokens_seen": 356643328, + "router_z_loss_mlp": 0.39770508, + "step": 4292, + "time_per_iteration": 2.85308575630188 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043056, + "balance_loss_mlp": 1.00321639, + "epoch": 0.8258945748364755, + "flos": 684670024704.0, + "grad_norm": 0.03268721539583558, + "language_loss": 0.81239599, + "learning_rate": 7.740922673634537e-05, + "loss": 0.82282656, + "num_input_tokens_seen": 356723824, + "router_z_loss_mlp": 0.39819336, + "step": 4293, + "time_per_iteration": 2.9332666397094727 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043306, + "balance_loss_mlp": 1.00339425, + "epoch": 0.8260869565217391, + "flos": 595681837824.0, + "grad_norm": 0.03866521927101234, + "language_loss": 0.79496479, + "learning_rate": 7.724279585440186e-05, + "loss": 0.80539787, + "num_input_tokens_seen": 356796512, + "router_z_loss_mlp": 0.39892578, + "step": 4294, + "time_per_iteration": 2.710196018218994 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044652, + "balance_loss_mlp": 1.00466919, + "epoch": 0.8262793382070027, + "flos": 652653914112.0, + "grad_norm": 0.035640366708924454, + "language_loss": 0.85982841, + "learning_rate": 7.707652910136098e-05, + "loss": 0.8702749, + "num_input_tokens_seen": 356868624, + "router_z_loss_mlp": 0.3996582, + "step": 4295, + "time_per_iteration": 2.7833869457244873 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046294, + "balance_loss_mlp": 1.00628734, + "epoch": 0.8264717198922663, + "flos": 539957900544.0, + "grad_norm": 0.03542923648415416, + "language_loss": 0.84949446, + "learning_rate": 7.691042654177315e-05, + "loss": 0.85995746, + "num_input_tokens_seen": 356934368, + "router_z_loss_mlp": 0.39990234, + "step": 4296, + "time_per_iteration": 2.6374382972717285 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044455, + "balance_loss_mlp": 1.00437677, + "epoch": 0.8266641015775298, + "flos": 539994839040.0, + "grad_norm": 0.04217853595107177, + "language_loss": 0.76282918, + "learning_rate": 7.674448824012514e-05, + "loss": 0.77327377, + "num_input_tokens_seen": 357005536, + "router_z_loss_mlp": 0.40063477, + "step": 4297, + "time_per_iteration": 2.7391257286071777 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046487, + "balance_loss_mlp": 1.0064801, + "epoch": 0.8268564832627934, + "flos": 586503548160.0, + "grad_norm": 0.03264457137254003, + "language_loss": 0.84539366, + "learning_rate": 7.657871426083979e-05, + "loss": 0.8558585, + "num_input_tokens_seen": 357082160, + "router_z_loss_mlp": 0.39990234, + "step": 4298, + "time_per_iteration": 2.7992489337921143 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045276, + "balance_loss_mlp": 1.00526941, + "epoch": 0.827048864948057, + "flos": 431571075072.0, + "grad_norm": 0.03940875322434759, + "language_loss": 0.84735167, + "learning_rate": 7.641310466827667e-05, + "loss": 0.85780442, + "num_input_tokens_seen": 357146928, + "router_z_loss_mlp": 0.39990234, + "step": 4299, + "time_per_iteration": 2.4731128215789795 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045385, + "balance_loss_mlp": 1.00542581, + "epoch": 0.8272412466333205, + "flos": 1390502032128.0, + "grad_norm": 0.03504583315652351, + "language_loss": 0.8553803, + "learning_rate": 7.624765952673069e-05, + "loss": 0.86583406, + "num_input_tokens_seen": 357236768, + "router_z_loss_mlp": 0.39941406, + "step": 4300, + "time_per_iteration": 3.7725141048431396 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044083, + "balance_loss_mlp": 1.00410068, + "epoch": 0.827433628318584, + "flos": 539350354944.0, + "grad_norm": 0.03755336156042174, + "language_loss": 0.83188915, + "learning_rate": 7.608237890043335e-05, + "loss": 0.84232998, + "num_input_tokens_seen": 357307568, + "router_z_loss_mlp": 0.3996582, + "step": 4301, + "time_per_iteration": 2.6834564208984375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043264, + "balance_loss_mlp": 1.00330544, + "epoch": 0.8276260100038476, + "flos": 732064290816.0, + "grad_norm": 0.03864569373591978, + "language_loss": 0.78056109, + "learning_rate": 7.59172628535526e-05, + "loss": 0.79099381, + "num_input_tokens_seen": 357387712, + "router_z_loss_mlp": 0.39941406, + "step": 4302, + "time_per_iteration": 2.933929920196533 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043683, + "balance_loss_mlp": 1.00372398, + "epoch": 0.8278183916891112, + "flos": 872662612992.0, + "grad_norm": 0.033669788377383415, + "language_loss": 0.82804894, + "learning_rate": 7.575231145019196e-05, + "loss": 0.83848584, + "num_input_tokens_seen": 357473360, + "router_z_loss_mlp": 0.39941406, + "step": 4303, + "time_per_iteration": 3.2120118141174316 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045387, + "balance_loss_mlp": 1.00554717, + "epoch": 0.8280107733743748, + "flos": 595699334400.0, + "grad_norm": 0.033210409698881456, + "language_loss": 0.78002685, + "learning_rate": 7.558752475439134e-05, + "loss": 0.79048073, + "num_input_tokens_seen": 357548432, + "router_z_loss_mlp": 0.39819336, + "step": 4304, + "time_per_iteration": 2.777714490890503 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104443, + "balance_loss_mlp": 1.00468552, + "epoch": 0.8282031550596384, + "flos": 770028500736.0, + "grad_norm": 0.03499833227551203, + "language_loss": 0.84353423, + "learning_rate": 7.542290283012653e-05, + "loss": 0.85397851, + "num_input_tokens_seen": 357625968, + "router_z_loss_mlp": 0.3972168, + "step": 4305, + "time_per_iteration": 3.0219714641571045 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045284, + "balance_loss_mlp": 1.00551581, + "epoch": 0.8283955367449019, + "flos": 697447889664.0, + "grad_norm": 0.03481801368106837, + "language_loss": 0.78346533, + "learning_rate": 7.525844574130947e-05, + "loss": 0.79391819, + "num_input_tokens_seen": 357705824, + "router_z_loss_mlp": 0.39746094, + "step": 4306, + "time_per_iteration": 2.9294331073760986 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045461, + "balance_loss_mlp": 1.00566852, + "epoch": 0.8285879184301654, + "flos": 661939128576.0, + "grad_norm": 0.04153040782978192, + "language_loss": 0.83166927, + "learning_rate": 7.509415355178806e-05, + "loss": 0.84212393, + "num_input_tokens_seen": 357787040, + "router_z_loss_mlp": 0.39770508, + "step": 4307, + "time_per_iteration": 2.9238927364349365 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010458, + "balance_loss_mlp": 1.0061034, + "epoch": 0.828780300115429, + "flos": 559773556224.0, + "grad_norm": 0.04048281455959126, + "language_loss": 0.78020167, + "learning_rate": 7.493002632534618e-05, + "loss": 0.79065967, + "num_input_tokens_seen": 357856960, + "router_z_loss_mlp": 0.39672852, + "step": 4308, + "time_per_iteration": 2.6687874794006348 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045287, + "balance_loss_mlp": 1.00547111, + "epoch": 0.8289726818006926, + "flos": 832373211648.0, + "grad_norm": 0.03463570750325123, + "language_loss": 0.8228085, + "learning_rate": 7.476606412570352e-05, + "loss": 0.83326137, + "num_input_tokens_seen": 357937760, + "router_z_loss_mlp": 0.39794922, + "step": 4309, + "time_per_iteration": 3.030407667160034 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045141, + "balance_loss_mlp": 1.00534868, + "epoch": 0.8291650634859561, + "flos": 733555448064.0, + "grad_norm": 0.0357732652689289, + "language_loss": 0.81524992, + "learning_rate": 7.460226701651624e-05, + "loss": 0.82570136, + "num_input_tokens_seen": 358012480, + "router_z_loss_mlp": 0.39770508, + "step": 4310, + "time_per_iteration": 2.9470043182373047 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104542, + "balance_loss_mlp": 1.00555694, + "epoch": 0.8293574451712197, + "flos": 862470454272.0, + "grad_norm": 0.03315775834141588, + "language_loss": 0.81509542, + "learning_rate": 7.443863506137566e-05, + "loss": 0.8255496, + "num_input_tokens_seen": 358100720, + "router_z_loss_mlp": 0.3984375, + "step": 4311, + "time_per_iteration": 3.262594223022461 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045102, + "balance_loss_mlp": 1.0053333, + "epoch": 0.8295498268564833, + "flos": 496291521792.0, + "grad_norm": 0.030294085038389356, + "language_loss": 0.82037485, + "learning_rate": 7.427516832380948e-05, + "loss": 0.83082587, + "num_input_tokens_seen": 358180496, + "router_z_loss_mlp": 0.39746094, + "step": 4312, + "time_per_iteration": 2.9078259468078613 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045472, + "balance_loss_mlp": 1.00570416, + "epoch": 0.8297422085417469, + "flos": 555655863552.0, + "grad_norm": 0.029980290167267002, + "language_loss": 0.78229713, + "learning_rate": 7.4111866867281e-05, + "loss": 0.79275185, + "num_input_tokens_seen": 358261104, + "router_z_loss_mlp": 0.39746094, + "step": 4313, + "time_per_iteration": 2.8011112213134766 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044992, + "balance_loss_mlp": 1.00529504, + "epoch": 0.8299345902270104, + "flos": 1249489605120.0, + "grad_norm": 0.0352855921199785, + "language_loss": 0.7777639, + "learning_rate": 7.39487307551896e-05, + "loss": 0.78821379, + "num_input_tokens_seen": 358356368, + "router_z_loss_mlp": 0.39672852, + "step": 4314, + "time_per_iteration": 3.6537117958068848 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045045, + "balance_loss_mlp": 1.0053004, + "epoch": 0.8301269719122739, + "flos": 586410229248.0, + "grad_norm": 0.035731998635991455, + "language_loss": 0.83257413, + "learning_rate": 7.378576005087034e-05, + "loss": 0.84302461, + "num_input_tokens_seen": 358429104, + "router_z_loss_mlp": 0.3972168, + "step": 4315, + "time_per_iteration": 2.726372003555298 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043971, + "balance_loss_mlp": 1.00410736, + "epoch": 0.8303193535975375, + "flos": 510777317376.0, + "grad_norm": 0.03881103976705545, + "language_loss": 0.85426903, + "learning_rate": 7.362295481759412e-05, + "loss": 0.86470878, + "num_input_tokens_seen": 358501344, + "router_z_loss_mlp": 0.3984375, + "step": 4316, + "time_per_iteration": 2.6592206954956055 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010447, + "balance_loss_mlp": 1.00502753, + "epoch": 0.8305117352828011, + "flos": 581766649344.0, + "grad_norm": 0.036149535286214125, + "language_loss": 0.84061778, + "learning_rate": 7.346031511856722e-05, + "loss": 0.8510648, + "num_input_tokens_seen": 358575584, + "router_z_loss_mlp": 0.39648438, + "step": 4317, + "time_per_iteration": 2.6798949241638184 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044887, + "balance_loss_mlp": 1.00519013, + "epoch": 0.8307041169680647, + "flos": 482649487104.0, + "grad_norm": 0.03503303248285494, + "language_loss": 0.79233706, + "learning_rate": 7.329784101693232e-05, + "loss": 0.80278593, + "num_input_tokens_seen": 358644304, + "router_z_loss_mlp": 0.39672852, + "step": 4318, + "time_per_iteration": 2.6182241439819336 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045375, + "balance_loss_mlp": 1.00567794, + "epoch": 0.8308964986533282, + "flos": 625754780928.0, + "grad_norm": 0.03813638637537738, + "language_loss": 0.83587325, + "learning_rate": 7.313553257576727e-05, + "loss": 0.84632701, + "num_input_tokens_seen": 358712384, + "router_z_loss_mlp": 0.39672852, + "step": 4319, + "time_per_iteration": 2.722752571105957 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104435, + "balance_loss_mlp": 1.0046773, + "epoch": 0.8310888803385917, + "flos": 828706562304.0, + "grad_norm": 0.03928829475188373, + "language_loss": 0.79520625, + "learning_rate": 7.297338985808589e-05, + "loss": 0.80564976, + "num_input_tokens_seen": 358789264, + "router_z_loss_mlp": 0.39648438, + "step": 4320, + "time_per_iteration": 3.013678789138794 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045641, + "balance_loss_mlp": 1.00594461, + "epoch": 0.8312812620238553, + "flos": 584947262208.0, + "grad_norm": 0.03143191383809492, + "language_loss": 0.82371467, + "learning_rate": 7.281141292683746e-05, + "loss": 0.83417112, + "num_input_tokens_seen": 358868976, + "router_z_loss_mlp": 0.39672852, + "step": 4321, + "time_per_iteration": 2.7931981086730957 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044244, + "balance_loss_mlp": 1.00459492, + "epoch": 0.8314736437091189, + "flos": 1117370653440.0, + "grad_norm": 0.03967418669212243, + "language_loss": 0.75441504, + "learning_rate": 7.26496018449071e-05, + "loss": 0.76485747, + "num_input_tokens_seen": 358953600, + "router_z_loss_mlp": 0.39624023, + "step": 4322, + "time_per_iteration": 3.406388759613037 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044382, + "balance_loss_mlp": 1.00473273, + "epoch": 0.8316660253943825, + "flos": 518559714048.0, + "grad_norm": 0.037215858057632896, + "language_loss": 0.82538068, + "learning_rate": 7.248795667511543e-05, + "loss": 0.83582449, + "num_input_tokens_seen": 359028768, + "router_z_loss_mlp": 0.39624023, + "step": 4323, + "time_per_iteration": 2.790639877319336 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044045, + "balance_loss_mlp": 1.00444412, + "epoch": 0.831858407079646, + "flos": 796697254656.0, + "grad_norm": 0.03541243399585954, + "language_loss": 0.78485489, + "learning_rate": 7.232647748021864e-05, + "loss": 0.79529536, + "num_input_tokens_seen": 359116208, + "router_z_loss_mlp": 0.39575195, + "step": 4324, + "time_per_iteration": 2.975816249847412 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043202, + "balance_loss_mlp": 1.00360107, + "epoch": 0.8320507887649096, + "flos": 551042419200.0, + "grad_norm": 0.03671170113151978, + "language_loss": 0.83109081, + "learning_rate": 7.216516432290843e-05, + "loss": 0.84152287, + "num_input_tokens_seen": 359189552, + "router_z_loss_mlp": 0.39575195, + "step": 4325, + "time_per_iteration": 2.658466339111328 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043112, + "balance_loss_mlp": 1.00355887, + "epoch": 0.8322431704501732, + "flos": 480352485888.0, + "grad_norm": 0.03962331915706713, + "language_loss": 0.82560384, + "learning_rate": 7.20040172658123e-05, + "loss": 0.83603495, + "num_input_tokens_seen": 359253008, + "router_z_loss_mlp": 0.39526367, + "step": 4326, + "time_per_iteration": 2.514432907104492 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043175, + "balance_loss_mlp": 1.0036217, + "epoch": 0.8324355521354367, + "flos": 573547793664.0, + "grad_norm": 0.031905797463172826, + "language_loss": 0.85702962, + "learning_rate": 7.184303637149308e-05, + "loss": 0.86746132, + "num_input_tokens_seen": 359326368, + "router_z_loss_mlp": 0.39526367, + "step": 4327, + "time_per_iteration": 2.6735494136810303 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043577, + "balance_loss_mlp": 1.00399959, + "epoch": 0.8326279338207002, + "flos": 504440391168.0, + "grad_norm": 0.03284253573925844, + "language_loss": 0.82986456, + "learning_rate": 7.168222170244888e-05, + "loss": 0.84030032, + "num_input_tokens_seen": 359394192, + "router_z_loss_mlp": 0.39550781, + "step": 4328, + "time_per_iteration": 2.5880331993103027 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044698, + "balance_loss_mlp": 1.00509644, + "epoch": 0.8328203155059638, + "flos": 606951048960.0, + "grad_norm": 0.032340535352269904, + "language_loss": 0.81377709, + "learning_rate": 7.152157332111364e-05, + "loss": 0.82422405, + "num_input_tokens_seen": 359476016, + "router_z_loss_mlp": 0.39575195, + "step": 4329, + "time_per_iteration": 2.9070374965667725 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044592, + "balance_loss_mlp": 1.0050385, + "epoch": 0.8330126971912274, + "flos": 699123739392.0, + "grad_norm": 0.03346059950715292, + "language_loss": 0.86209023, + "learning_rate": 7.136109128985663e-05, + "loss": 0.87253612, + "num_input_tokens_seen": 359554048, + "router_z_loss_mlp": 0.39526367, + "step": 4330, + "time_per_iteration": 2.884406328201294 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044072, + "balance_loss_mlp": 1.00442338, + "epoch": 0.833205078876491, + "flos": 495021028608.0, + "grad_norm": 0.03393420593048976, + "language_loss": 0.87050742, + "learning_rate": 7.120077567098249e-05, + "loss": 0.88094813, + "num_input_tokens_seen": 359621440, + "router_z_loss_mlp": 0.39624023, + "step": 4331, + "time_per_iteration": 2.5215518474578857 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043983, + "balance_loss_mlp": 1.00428641, + "epoch": 0.8333974605617546, + "flos": 483795558912.0, + "grad_norm": 0.03242489973707072, + "language_loss": 0.83629441, + "learning_rate": 7.104062652673115e-05, + "loss": 0.84673423, + "num_input_tokens_seen": 359690320, + "router_z_loss_mlp": 0.39672852, + "step": 4332, + "time_per_iteration": 2.589259147644043 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042521, + "balance_loss_mlp": 1.00282395, + "epoch": 0.833589842247018, + "flos": 688041166080.0, + "grad_norm": 0.05811234258631201, + "language_loss": 0.83496767, + "learning_rate": 7.088064391927818e-05, + "loss": 0.84539282, + "num_input_tokens_seen": 359759888, + "router_z_loss_mlp": 0.39672852, + "step": 4333, + "time_per_iteration": 2.803917646408081 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055734, + "balance_loss_mlp": 1.01606107, + "epoch": 0.8337822239322816, + "flos": 883193054208.0, + "grad_norm": 0.035373002810175205, + "language_loss": 0.83053595, + "learning_rate": 7.072082791073419e-05, + "loss": 0.8410933, + "num_input_tokens_seen": 359836544, + "router_z_loss_mlp": 0.39648438, + "step": 4334, + "time_per_iteration": 3.102529525756836 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105553, + "balance_loss_mlp": 1.01578557, + "epoch": 0.8339746056175452, + "flos": 498157900032.0, + "grad_norm": 0.03915786312504082, + "language_loss": 0.83033496, + "learning_rate": 7.056117856314531e-05, + "loss": 0.84089029, + "num_input_tokens_seen": 359903024, + "router_z_loss_mlp": 0.3972168, + "step": 4335, + "time_per_iteration": 2.581801652908325 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054483, + "balance_loss_mlp": 1.01480997, + "epoch": 0.8341669873028088, + "flos": 511504426752.0, + "grad_norm": 0.03653377774815775, + "language_loss": 0.86941135, + "learning_rate": 7.040169593849289e-05, + "loss": 0.87995613, + "num_input_tokens_seen": 359971200, + "router_z_loss_mlp": 0.39648438, + "step": 4336, + "time_per_iteration": 2.5874557495117188 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053038, + "balance_loss_mlp": 1.0132935, + "epoch": 0.8343593689880723, + "flos": 693542112768.0, + "grad_norm": 0.042823891239838895, + "language_loss": 0.84834361, + "learning_rate": 7.024238009869366e-05, + "loss": 0.85887402, + "num_input_tokens_seen": 360042560, + "router_z_loss_mlp": 0.3972168, + "step": 4337, + "time_per_iteration": 2.831866979598999 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049647, + "balance_loss_mlp": 1.00987899, + "epoch": 0.8345517506733359, + "flos": 553517309952.0, + "grad_norm": 0.03514600102944956, + "language_loss": 0.78723717, + "learning_rate": 7.008323110559956e-05, + "loss": 0.79773366, + "num_input_tokens_seen": 360118048, + "router_z_loss_mlp": 0.39746094, + "step": 4338, + "time_per_iteration": 2.805140495300293 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049491, + "balance_loss_mlp": 1.00981832, + "epoch": 0.8347441323585995, + "flos": 593268185088.0, + "grad_norm": 0.03562490658718948, + "language_loss": 0.76787317, + "learning_rate": 6.992424902099754e-05, + "loss": 0.77836812, + "num_input_tokens_seen": 360192528, + "router_z_loss_mlp": 0.39648438, + "step": 4339, + "time_per_iteration": 2.823744535446167 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050135, + "balance_loss_mlp": 1.01043832, + "epoch": 0.834936514043863, + "flos": 616092400128.0, + "grad_norm": 0.03219895087392271, + "language_loss": 0.85226107, + "learning_rate": 6.976543390660983e-05, + "loss": 0.86276239, + "num_input_tokens_seen": 360266880, + "router_z_loss_mlp": 0.39672852, + "step": 4340, + "time_per_iteration": 2.7585527896881104 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049678, + "balance_loss_mlp": 1.00998175, + "epoch": 0.8351288957291266, + "flos": 468864556032.0, + "grad_norm": 0.03659606336677384, + "language_loss": 0.80055946, + "learning_rate": 6.960678582409424e-05, + "loss": 0.81105626, + "num_input_tokens_seen": 360336336, + "router_z_loss_mlp": 0.39672852, + "step": 4341, + "time_per_iteration": 2.6036171913146973 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044054, + "balance_loss_mlp": 1.00428581, + "epoch": 0.8353212774143901, + "flos": 510349606656.0, + "grad_norm": 0.03169243784279604, + "language_loss": 0.79527783, + "learning_rate": 6.944830483504328e-05, + "loss": 0.8057183, + "num_input_tokens_seen": 360409776, + "router_z_loss_mlp": 0.39746094, + "step": 4342, + "time_per_iteration": 2.655421018600464 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043856, + "balance_loss_mlp": 1.00411177, + "epoch": 0.8355136590996537, + "flos": 689018096640.0, + "grad_norm": 0.03436088909183797, + "language_loss": 0.81165028, + "learning_rate": 6.928999100098483e-05, + "loss": 0.82208884, + "num_input_tokens_seen": 360486800, + "router_z_loss_mlp": 0.3972168, + "step": 4343, + "time_per_iteration": 2.826841115951538 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044821, + "balance_loss_mlp": 1.00512445, + "epoch": 0.8357060407849173, + "flos": 985976865792.0, + "grad_norm": 0.04054182242673009, + "language_loss": 0.84078169, + "learning_rate": 6.913184438338138e-05, + "loss": 0.85122991, + "num_input_tokens_seen": 360568624, + "router_z_loss_mlp": 0.39672852, + "step": 4344, + "time_per_iteration": 3.2107176780700684 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044039, + "balance_loss_mlp": 1.00431824, + "epoch": 0.8358984224701809, + "flos": 844508537856.0, + "grad_norm": 0.030813015519650187, + "language_loss": 0.85689914, + "learning_rate": 6.89738650436313e-05, + "loss": 0.86733955, + "num_input_tokens_seen": 360652384, + "router_z_loss_mlp": 0.39697266, + "step": 4345, + "time_per_iteration": 3.163668155670166 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043059, + "balance_loss_mlp": 1.00331438, + "epoch": 0.8360908041554445, + "flos": 627419937024.0, + "grad_norm": 0.03291313388233015, + "language_loss": 0.82514834, + "learning_rate": 6.881605304306748e-05, + "loss": 0.83557892, + "num_input_tokens_seen": 360723200, + "router_z_loss_mlp": 0.3972168, + "step": 4346, + "time_per_iteration": 2.7415146827697754 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043733, + "balance_loss_mlp": 1.00408411, + "epoch": 0.8362831858407079, + "flos": 577223191296.0, + "grad_norm": 0.0302998467328884, + "language_loss": 0.85529792, + "learning_rate": 6.865840844295796e-05, + "loss": 0.86573529, + "num_input_tokens_seen": 360798240, + "router_z_loss_mlp": 0.39624023, + "step": 4347, + "time_per_iteration": 2.7346980571746826 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043903, + "balance_loss_mlp": 1.00418234, + "epoch": 0.8364755675259715, + "flos": 835184439552.0, + "grad_norm": 0.03946015330884712, + "language_loss": 0.81571031, + "learning_rate": 6.850093130450569e-05, + "loss": 0.82614934, + "num_input_tokens_seen": 360873552, + "router_z_loss_mlp": 0.39697266, + "step": 4348, + "time_per_iteration": 3.08042573928833 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104361, + "balance_loss_mlp": 1.00388896, + "epoch": 0.8366679492112351, + "flos": 583564008192.0, + "grad_norm": 0.038362808412147696, + "language_loss": 0.86538804, + "learning_rate": 6.834362168884912e-05, + "loss": 0.87582415, + "num_input_tokens_seen": 360940800, + "router_z_loss_mlp": 0.39697266, + "step": 4349, + "time_per_iteration": 2.7079648971557617 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043699, + "balance_loss_mlp": 1.00400245, + "epoch": 0.8368603308964987, + "flos": 612881651712.0, + "grad_norm": 0.03780805961056685, + "language_loss": 0.87590146, + "learning_rate": 6.818647965706076e-05, + "loss": 0.88633847, + "num_input_tokens_seen": 361014368, + "router_z_loss_mlp": 0.39672852, + "step": 4350, + "time_per_iteration": 2.783841609954834 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042866, + "balance_loss_mlp": 1.00321651, + "epoch": 0.8370527125817622, + "flos": 508265488128.0, + "grad_norm": 0.031770468246229984, + "language_loss": 0.85780954, + "learning_rate": 6.802950527014884e-05, + "loss": 0.86823821, + "num_input_tokens_seen": 361087184, + "router_z_loss_mlp": 0.39624023, + "step": 4351, + "time_per_iteration": 2.735769748687744 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042672, + "balance_loss_mlp": 1.00302303, + "epoch": 0.8372450942670258, + "flos": 772283705856.0, + "grad_norm": 0.03277928493486849, + "language_loss": 0.82987893, + "learning_rate": 6.787269858905603e-05, + "loss": 0.84030557, + "num_input_tokens_seen": 361160720, + "router_z_loss_mlp": 0.39624023, + "step": 4352, + "time_per_iteration": 2.9203648567199707 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043841, + "balance_loss_mlp": 1.00428724, + "epoch": 0.8374374759522893, + "flos": 580362008064.0, + "grad_norm": 0.034034807171666244, + "language_loss": 0.85397196, + "learning_rate": 6.771605967466033e-05, + "loss": 0.8644104, + "num_input_tokens_seen": 361234432, + "router_z_loss_mlp": 0.39526367, + "step": 4353, + "time_per_iteration": 2.720546007156372 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01041294, + "balance_loss_mlp": 1.00159764, + "epoch": 0.8376298576375529, + "flos": 789529206528.0, + "grad_norm": 0.04449117180273345, + "language_loss": 0.82807922, + "learning_rate": 6.755958858777434e-05, + "loss": 0.83849216, + "num_input_tokens_seen": 361309376, + "router_z_loss_mlp": 0.39672852, + "step": 4354, + "time_per_iteration": 3.007485866546631 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042862, + "balance_loss_mlp": 1.00309372, + "epoch": 0.8378222393228165, + "flos": 578723096832.0, + "grad_norm": 0.032911278141950814, + "language_loss": 0.81327355, + "learning_rate": 6.74032853891452e-05, + "loss": 0.82370222, + "num_input_tokens_seen": 361386768, + "router_z_loss_mlp": 0.39746094, + "step": 4355, + "time_per_iteration": 2.750502824783325 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042418, + "balance_loss_mlp": 1.00272167, + "epoch": 0.83801462100808, + "flos": 481859194368.0, + "grad_norm": 0.03244725858098954, + "language_loss": 0.82614964, + "learning_rate": 6.724715013945548e-05, + "loss": 0.83657384, + "num_input_tokens_seen": 361456704, + "router_z_loss_mlp": 0.39672852, + "step": 4356, + "time_per_iteration": 2.648829936981201 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042528, + "balance_loss_mlp": 1.00283134, + "epoch": 0.8382070026933436, + "flos": 551997962496.0, + "grad_norm": 0.03119989334307816, + "language_loss": 0.8965174, + "learning_rate": 6.709118289932226e-05, + "loss": 0.90694273, + "num_input_tokens_seen": 361533648, + "router_z_loss_mlp": 0.39672852, + "step": 4357, + "time_per_iteration": 2.776762008666992 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104287, + "balance_loss_mlp": 1.00310183, + "epoch": 0.8383993843786072, + "flos": 626226233088.0, + "grad_norm": 0.0387641353530007, + "language_loss": 0.82408631, + "learning_rate": 6.693538372929725e-05, + "loss": 0.83451504, + "num_input_tokens_seen": 361614256, + "router_z_loss_mlp": 0.39746094, + "step": 4358, + "time_per_iteration": 2.8670356273651123 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044667, + "balance_loss_mlp": 1.00497019, + "epoch": 0.8385917660638708, + "flos": 492135923712.0, + "grad_norm": 0.03605120183669825, + "language_loss": 0.86969417, + "learning_rate": 6.677975268986719e-05, + "loss": 0.88014084, + "num_input_tokens_seen": 361679008, + "router_z_loss_mlp": 0.39672852, + "step": 4359, + "time_per_iteration": 2.556107759475708 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044534, + "balance_loss_mlp": 1.00478995, + "epoch": 0.8387841477491342, + "flos": 467870128896.0, + "grad_norm": 0.036141032042788915, + "language_loss": 0.875561, + "learning_rate": 6.662428984145336e-05, + "loss": 0.88600636, + "num_input_tokens_seen": 361747600, + "router_z_loss_mlp": 0.3972168, + "step": 4360, + "time_per_iteration": 2.5896832942962646 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046318, + "balance_loss_mlp": 1.00740814, + "epoch": 0.8389765294343978, + "flos": 1567600607232.0, + "grad_norm": 0.006654770635082277, + "language_loss": 0.71780187, + "learning_rate": 6.646899524441175e-05, + "loss": 0.72826505, + "num_input_tokens_seen": 361983104, + "router_z_loss_mlp": 0.38867188, + "step": 4361, + "time_per_iteration": 5.005736351013184 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044889, + "balance_loss_mlp": 1.0051924, + "epoch": 0.8391689111196614, + "flos": 603412711680.0, + "grad_norm": 0.02981143994477007, + "language_loss": 0.83398944, + "learning_rate": 6.631386895903308e-05, + "loss": 0.84443831, + "num_input_tokens_seen": 362065824, + "router_z_loss_mlp": 0.39672852, + "step": 4362, + "time_per_iteration": 2.860398530960083 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046999, + "balance_loss_mlp": 1.0072782, + "epoch": 0.839361292804925, + "flos": 443968861440.0, + "grad_norm": 0.03966524931271206, + "language_loss": 0.80562806, + "learning_rate": 6.615891104554261e-05, + "loss": 0.81609803, + "num_input_tokens_seen": 362128240, + "router_z_loss_mlp": 0.39697266, + "step": 4363, + "time_per_iteration": 2.480076789855957 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046837, + "balance_loss_mlp": 1.0070926, + "epoch": 0.8395536744901886, + "flos": 595299813888.0, + "grad_norm": 0.04057022644943622, + "language_loss": 0.83120066, + "learning_rate": 6.600412156410057e-05, + "loss": 0.84166896, + "num_input_tokens_seen": 362198256, + "router_z_loss_mlp": 0.3972168, + "step": 4364, + "time_per_iteration": 2.7753520011901855 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047198, + "balance_loss_mlp": 1.00743032, + "epoch": 0.8397460561754521, + "flos": 891336087552.0, + "grad_norm": 0.03526735316823496, + "language_loss": 0.85782105, + "learning_rate": 6.58495005748016e-05, + "loss": 0.86829305, + "num_input_tokens_seen": 362279792, + "router_z_loss_mlp": 0.39746094, + "step": 4365, + "time_per_iteration": 3.16624116897583 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045668, + "balance_loss_mlp": 1.00580478, + "epoch": 0.8399384378607156, + "flos": 554561314560.0, + "grad_norm": 0.03367711275726433, + "language_loss": 0.89177513, + "learning_rate": 6.569504813767463e-05, + "loss": 0.90223181, + "num_input_tokens_seen": 362351712, + "router_z_loss_mlp": 0.3984375, + "step": 4366, + "time_per_iteration": 2.6069655418395996 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044733, + "balance_loss_mlp": 1.00489306, + "epoch": 0.8401308195459792, + "flos": 519964355328.0, + "grad_norm": 0.03275930619956067, + "language_loss": 0.83950633, + "learning_rate": 6.554076431268341e-05, + "loss": 0.84995365, + "num_input_tokens_seen": 362423424, + "router_z_loss_mlp": 0.39819336, + "step": 4367, + "time_per_iteration": 2.6613383293151855 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045197, + "balance_loss_mlp": 1.00540471, + "epoch": 0.8403232012312428, + "flos": 686296296960.0, + "grad_norm": 0.036430309250403296, + "language_loss": 0.8146503, + "learning_rate": 6.538664915972648e-05, + "loss": 0.82510233, + "num_input_tokens_seen": 362514704, + "router_z_loss_mlp": 0.39770508, + "step": 4368, + "time_per_iteration": 2.995043992996216 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045982, + "balance_loss_mlp": 1.00618947, + "epoch": 0.8405155829165063, + "flos": 578670607104.0, + "grad_norm": 0.03783265596862743, + "language_loss": 0.78067476, + "learning_rate": 6.523270273863652e-05, + "loss": 0.79113454, + "num_input_tokens_seen": 362581296, + "router_z_loss_mlp": 0.39770508, + "step": 4369, + "time_per_iteration": 2.6606693267822266 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046734, + "balance_loss_mlp": 1.00694239, + "epoch": 0.8407079646017699, + "flos": 457567154688.0, + "grad_norm": 0.04104173911837978, + "language_loss": 0.88549638, + "learning_rate": 6.507892510918079e-05, + "loss": 0.89596373, + "num_input_tokens_seen": 362648304, + "router_z_loss_mlp": 0.39770508, + "step": 4370, + "time_per_iteration": 2.5601558685302734 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047037, + "balance_loss_mlp": 1.00726855, + "epoch": 0.8409003462870335, + "flos": 536000600832.0, + "grad_norm": 0.03952335471853486, + "language_loss": 0.82247508, + "learning_rate": 6.492531633106114e-05, + "loss": 0.83294547, + "num_input_tokens_seen": 362721264, + "router_z_loss_mlp": 0.39746094, + "step": 4371, + "time_per_iteration": 2.771491527557373 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053168, + "balance_loss_mlp": 1.01339984, + "epoch": 0.8410927279722971, + "flos": 557900375040.0, + "grad_norm": 0.03882537955263822, + "language_loss": 0.7860809, + "learning_rate": 6.477187646391374e-05, + "loss": 0.79661262, + "num_input_tokens_seen": 362795312, + "router_z_loss_mlp": 0.39746094, + "step": 4372, + "time_per_iteration": 2.7046260833740234 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059166, + "balance_loss_mlp": 1.02044678, + "epoch": 0.8412851096575606, + "flos": 1552929152256.0, + "grad_norm": 0.010019431510238475, + "language_loss": 0.77679121, + "learning_rate": 6.461860556730925e-05, + "loss": 0.78738284, + "num_input_tokens_seen": 363026272, + "router_z_loss_mlp": 0.38671875, + "step": 4373, + "time_per_iteration": 4.874579668045044 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01056957, + "balance_loss_mlp": 1.01718879, + "epoch": 0.8414774913428241, + "flos": 553109041152.0, + "grad_norm": 0.03716925944029823, + "language_loss": 0.79357052, + "learning_rate": 6.446550370075271e-05, + "loss": 0.80414009, + "num_input_tokens_seen": 363098384, + "router_z_loss_mlp": 0.39746094, + "step": 4374, + "time_per_iteration": 2.6877481937408447 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050893, + "balance_loss_mlp": 1.01098144, + "epoch": 0.8416698730280877, + "flos": 574070768640.0, + "grad_norm": 0.03440342527742133, + "language_loss": 0.78264368, + "learning_rate": 6.431257092368336e-05, + "loss": 0.79315263, + "num_input_tokens_seen": 363170960, + "router_z_loss_mlp": 0.39892578, + "step": 4375, + "time_per_iteration": 2.6643028259277344 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050239, + "balance_loss_mlp": 1.01030397, + "epoch": 0.8418622547133513, + "flos": 760044367104.0, + "grad_norm": 0.03936876955125197, + "language_loss": 0.805475, + "learning_rate": 6.415980729547543e-05, + "loss": 0.81597739, + "num_input_tokens_seen": 363242000, + "router_z_loss_mlp": 0.39916992, + "step": 4376, + "time_per_iteration": 2.8879754543304443 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01057887, + "balance_loss_mlp": 1.01795137, + "epoch": 0.8420546363986149, + "flos": 1075922541312.0, + "grad_norm": 0.04195291281382577, + "language_loss": 0.73367876, + "learning_rate": 6.40072128754366e-05, + "loss": 0.74425763, + "num_input_tokens_seen": 363340288, + "router_z_loss_mlp": 0.39916992, + "step": 4377, + "time_per_iteration": 3.3982491493225098 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01056552, + "balance_loss_mlp": 1.01659334, + "epoch": 0.8422470180838784, + "flos": 527017697280.0, + "grad_norm": 0.03613137527395126, + "language_loss": 0.83433902, + "learning_rate": 6.385478772280933e-05, + "loss": 0.84490454, + "num_input_tokens_seen": 363416208, + "router_z_loss_mlp": 0.39941406, + "step": 4378, + "time_per_iteration": 2.7835891246795654 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053667, + "balance_loss_mlp": 1.01370764, + "epoch": 0.842439399769142, + "flos": 601964328960.0, + "grad_norm": 0.037358019375431845, + "language_loss": 0.82734925, + "learning_rate": 6.370253189677038e-05, + "loss": 0.83788586, + "num_input_tokens_seen": 363492864, + "router_z_loss_mlp": 0.39941406, + "step": 4379, + "time_per_iteration": 2.748546600341797 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01058941, + "balance_loss_mlp": 1.01895809, + "epoch": 0.8426317814544055, + "flos": 553376358912.0, + "grad_norm": 0.03714135398668163, + "language_loss": 0.87016404, + "learning_rate": 6.355044545643073e-05, + "loss": 0.88075352, + "num_input_tokens_seen": 363572000, + "router_z_loss_mlp": 0.3996582, + "step": 4380, + "time_per_iteration": 2.800340414047241 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059186, + "balance_loss_mlp": 1.01922655, + "epoch": 0.8428241631396691, + "flos": 680045886720.0, + "grad_norm": 0.03742137043660712, + "language_loss": 0.78399694, + "learning_rate": 6.33985284608356e-05, + "loss": 0.7945888, + "num_input_tokens_seen": 363646480, + "router_z_loss_mlp": 0.39941406, + "step": 4381, + "time_per_iteration": 2.8040478229522705 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054594, + "balance_loss_mlp": 1.01473093, + "epoch": 0.8430165448249327, + "flos": 755199565056.0, + "grad_norm": 0.027180883037501744, + "language_loss": 0.80385518, + "learning_rate": 6.324678096896435e-05, + "loss": 0.81440109, + "num_input_tokens_seen": 363737552, + "router_z_loss_mlp": 0.3984375, + "step": 4382, + "time_per_iteration": 3.0603692531585693 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052676, + "balance_loss_mlp": 1.01266921, + "epoch": 0.8432089265101962, + "flos": 700437007104.0, + "grad_norm": 0.036252263967316525, + "language_loss": 0.81660181, + "learning_rate": 6.30952030397306e-05, + "loss": 0.82712859, + "num_input_tokens_seen": 363816016, + "router_z_loss_mlp": 0.39990234, + "step": 4383, + "time_per_iteration": 2.8923966884613037 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052373, + "balance_loss_mlp": 1.01243746, + "epoch": 0.8434013081954598, + "flos": 486791479296.0, + "grad_norm": 0.043909328594933086, + "language_loss": 0.85683775, + "learning_rate": 6.294379473198208e-05, + "loss": 0.86736149, + "num_input_tokens_seen": 363888192, + "router_z_loss_mlp": 0.39916992, + "step": 4384, + "time_per_iteration": 2.7198166847229004 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053711, + "balance_loss_mlp": 1.01380002, + "epoch": 0.8435936898807234, + "flos": 521631456768.0, + "grad_norm": 0.03666745946070383, + "language_loss": 0.85839081, + "learning_rate": 6.279255610450068e-05, + "loss": 0.86892796, + "num_input_tokens_seen": 363953904, + "router_z_loss_mlp": 0.39892578, + "step": 4385, + "time_per_iteration": 2.6209969520568848 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052098, + "balance_loss_mlp": 1.01199555, + "epoch": 0.843786071565987, + "flos": 787314830592.0, + "grad_norm": 0.035531415028739466, + "language_loss": 0.81066084, + "learning_rate": 6.264148721600254e-05, + "loss": 0.82118183, + "num_input_tokens_seen": 364031552, + "router_z_loss_mlp": 0.40087891, + "step": 4386, + "time_per_iteration": 3.0166499614715576 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059635, + "balance_loss_mlp": 1.02062988, + "epoch": 0.8439784532512504, + "flos": 1449516407808.0, + "grad_norm": 0.007889310542636748, + "language_loss": 0.75836509, + "learning_rate": 6.24905881251378e-05, + "loss": 0.76896149, + "num_input_tokens_seen": 364256480, + "router_z_loss_mlp": 0.38964844, + "step": 4387, + "time_per_iteration": 4.9381585121154785 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052807, + "balance_loss_mlp": 1.01287234, + "epoch": 0.844170834936514, + "flos": 709969130496.0, + "grad_norm": 0.0430882842289394, + "language_loss": 0.82951272, + "learning_rate": 6.23398588904906e-05, + "loss": 0.8400408, + "num_input_tokens_seen": 364329696, + "router_z_loss_mlp": 0.39916992, + "step": 4388, + "time_per_iteration": 2.8580751419067383 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049455, + "balance_loss_mlp": 1.00940061, + "epoch": 0.8443632166217776, + "flos": 484409907456.0, + "grad_norm": 0.03543864011776059, + "language_loss": 0.80285496, + "learning_rate": 6.218929957057922e-05, + "loss": 0.81334955, + "num_input_tokens_seen": 364400944, + "router_z_loss_mlp": 0.40039062, + "step": 4389, + "time_per_iteration": 2.6879217624664307 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048845, + "balance_loss_mlp": 1.00886238, + "epoch": 0.8445555983070412, + "flos": 679924377600.0, + "grad_norm": 0.03469492088540733, + "language_loss": 0.80821919, + "learning_rate": 6.2038910223856e-05, + "loss": 0.81870764, + "num_input_tokens_seen": 364475744, + "router_z_loss_mlp": 0.3996582, + "step": 4390, + "time_per_iteration": 2.8265607357025146 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048945, + "balance_loss_mlp": 1.00898623, + "epoch": 0.8447479799923048, + "flos": 742860104448.0, + "grad_norm": 0.03377233556180916, + "language_loss": 0.74732709, + "learning_rate": 6.18886909087073e-05, + "loss": 0.75781655, + "num_input_tokens_seen": 364557248, + "router_z_loss_mlp": 0.39941406, + "step": 4391, + "time_per_iteration": 3.059424638748169 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048643, + "balance_loss_mlp": 1.00866032, + "epoch": 0.8449403616775683, + "flos": 954951291648.0, + "grad_norm": 0.03403484020500879, + "language_loss": 0.8089571, + "learning_rate": 6.173864168345344e-05, + "loss": 0.81944358, + "num_input_tokens_seen": 364647856, + "router_z_loss_mlp": 0.3996582, + "step": 4392, + "time_per_iteration": 3.283132791519165 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048397, + "balance_loss_mlp": 1.0083667, + "epoch": 0.8451327433628318, + "flos": 658608816384.0, + "grad_norm": 0.037595076995326535, + "language_loss": 0.72948486, + "learning_rate": 6.158876260634871e-05, + "loss": 0.73996878, + "num_input_tokens_seen": 364728848, + "router_z_loss_mlp": 0.40014648, + "step": 4393, + "time_per_iteration": 2.8895535469055176 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049246, + "balance_loss_mlp": 1.00933516, + "epoch": 0.8453251250480954, + "flos": 447049352448.0, + "grad_norm": 0.037490105164885316, + "language_loss": 0.83802319, + "learning_rate": 6.143905373558112e-05, + "loss": 0.84851563, + "num_input_tokens_seen": 364794032, + "router_z_loss_mlp": 0.39892578, + "step": 4394, + "time_per_iteration": 2.600051164627075 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044816, + "balance_loss_mlp": 1.00488043, + "epoch": 0.845517506733359, + "flos": 543874371072.0, + "grad_norm": 0.04600609812131957, + "language_loss": 0.72015631, + "learning_rate": 6.128951512927305e-05, + "loss": 0.73060441, + "num_input_tokens_seen": 364868624, + "router_z_loss_mlp": 0.39916992, + "step": 4395, + "time_per_iteration": 2.669736623764038 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104476, + "balance_loss_mlp": 1.0048008, + "epoch": 0.8457098884186226, + "flos": 503507202048.0, + "grad_norm": 0.03225054285185889, + "language_loss": 0.84712255, + "learning_rate": 6.114014684548046e-05, + "loss": 0.85757017, + "num_input_tokens_seen": 364938208, + "router_z_loss_mlp": 0.39941406, + "step": 4396, + "time_per_iteration": 2.6642940044403076 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043537, + "balance_loss_mlp": 1.00355434, + "epoch": 0.8459022701038861, + "flos": 449895573504.0, + "grad_norm": 0.03459696576725407, + "language_loss": 0.80533981, + "learning_rate": 6.099094894219326e-05, + "loss": 0.81577528, + "num_input_tokens_seen": 365009440, + "router_z_loss_mlp": 0.3996582, + "step": 4397, + "time_per_iteration": 2.6888678073883057 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043514, + "balance_loss_mlp": 1.00350761, + "epoch": 0.8460946517891497, + "flos": 744472770816.0, + "grad_norm": 0.03316086560733144, + "language_loss": 0.75788116, + "learning_rate": 6.0841921477335194e-05, + "loss": 0.76831627, + "num_input_tokens_seen": 365085904, + "router_z_loss_mlp": 0.39990234, + "step": 4398, + "time_per_iteration": 2.957204580307007 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043897, + "balance_loss_mlp": 1.00393856, + "epoch": 0.8462870334744133, + "flos": 554327044608.0, + "grad_norm": 0.035598785998024324, + "language_loss": 0.80453002, + "learning_rate": 6.069306450876389e-05, + "loss": 0.81496894, + "num_input_tokens_seen": 365163600, + "router_z_loss_mlp": 0.39941406, + "step": 4399, + "time_per_iteration": 2.7827699184417725 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047981, + "balance_loss_mlp": 1.00907135, + "epoch": 0.8464794151596768, + "flos": 1568271336192.0, + "grad_norm": 0.006109463060419775, + "language_loss": 0.81708568, + "learning_rate": 6.054437809427071e-05, + "loss": 0.82756555, + "num_input_tokens_seen": 365384528, + "router_z_loss_mlp": 0.38867188, + "step": 4400, + "time_per_iteration": 4.881330966949463 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047433, + "balance_loss_mlp": 1.00759327, + "epoch": 0.8466717968449403, + "flos": 551265995520.0, + "grad_norm": 0.03379372899576281, + "language_loss": 0.80214202, + "learning_rate": 6.039586229158084e-05, + "loss": 0.81261623, + "num_input_tokens_seen": 365453760, + "router_z_loss_mlp": 0.39819336, + "step": 4401, + "time_per_iteration": 2.7047319412231445 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047774, + "balance_loss_mlp": 1.00788701, + "epoch": 0.8468641785302039, + "flos": 553096402176.0, + "grad_norm": 0.036798324054331616, + "language_loss": 0.849747, + "learning_rate": 6.024751715835314e-05, + "loss": 0.86022472, + "num_input_tokens_seen": 365532416, + "router_z_loss_mlp": 0.39868164, + "step": 4402, + "time_per_iteration": 2.815459966659546 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049428, + "balance_loss_mlp": 1.00966001, + "epoch": 0.8470565602154675, + "flos": 573825805056.0, + "grad_norm": 0.037110384516023706, + "language_loss": 0.87787378, + "learning_rate": 6.009934275218049e-05, + "loss": 0.88836807, + "num_input_tokens_seen": 365603776, + "router_z_loss_mlp": 0.39746094, + "step": 4403, + "time_per_iteration": 2.737457275390625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045007, + "balance_loss_mlp": 1.00511968, + "epoch": 0.8472489419007311, + "flos": 473781289728.0, + "grad_norm": 0.0392549286911581, + "language_loss": 0.84440565, + "learning_rate": 5.995133913058936e-05, + "loss": 0.85485572, + "num_input_tokens_seen": 365670432, + "router_z_loss_mlp": 0.39868164, + "step": 4404, + "time_per_iteration": 2.568206787109375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104604, + "balance_loss_mlp": 1.00615251, + "epoch": 0.8474413235859947, + "flos": 799378225152.0, + "grad_norm": 0.035625221282266174, + "language_loss": 0.80429268, + "learning_rate": 5.980350635103954e-05, + "loss": 0.81475306, + "num_input_tokens_seen": 365741584, + "router_z_loss_mlp": 0.39868164, + "step": 4405, + "time_per_iteration": 2.9909889698028564 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046099, + "balance_loss_mlp": 1.00616419, + "epoch": 0.8476337052712581, + "flos": 503378889984.0, + "grad_norm": 0.054053938180127596, + "language_loss": 0.80838627, + "learning_rate": 5.9655844470924866e-05, + "loss": 0.81884724, + "num_input_tokens_seen": 365805344, + "router_z_loss_mlp": 0.39916992, + "step": 4406, + "time_per_iteration": 2.5926382541656494 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046099, + "balance_loss_mlp": 1.00623512, + "epoch": 0.8478260869565217, + "flos": 933518112000.0, + "grad_norm": 0.029539714928688187, + "language_loss": 0.83627093, + "learning_rate": 5.9508353547573e-05, + "loss": 0.8467319, + "num_input_tokens_seen": 365890976, + "router_z_loss_mlp": 0.3984375, + "step": 4407, + "time_per_iteration": 3.2023520469665527 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043476, + "balance_loss_mlp": 1.00358844, + "epoch": 0.8480184686417853, + "flos": 710053701120.0, + "grad_norm": 0.03753136985020684, + "language_loss": 0.81381404, + "learning_rate": 5.9361033638244855e-05, + "loss": 0.82424879, + "num_input_tokens_seen": 365968912, + "router_z_loss_mlp": 0.39868164, + "step": 4408, + "time_per_iteration": 2.9116780757904053 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044002, + "balance_loss_mlp": 1.0040431, + "epoch": 0.8482108503270489, + "flos": 615599560704.0, + "grad_norm": 0.029096278628502316, + "language_loss": 0.82911217, + "learning_rate": 5.9213884800135066e-05, + "loss": 0.83955222, + "num_input_tokens_seen": 366047680, + "router_z_loss_mlp": 0.39941406, + "step": 4409, + "time_per_iteration": 2.8034651279449463 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044369, + "balance_loss_mlp": 1.00445783, + "epoch": 0.8484032320123124, + "flos": 532073436672.0, + "grad_norm": 0.03490278429304404, + "language_loss": 0.82839775, + "learning_rate": 5.906690709037194e-05, + "loss": 0.83884144, + "num_input_tokens_seen": 366118720, + "router_z_loss_mlp": 0.39892578, + "step": 4410, + "time_per_iteration": 2.6078169345855713 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046001, + "balance_loss_mlp": 1.00699615, + "epoch": 0.848595613697576, + "flos": 1546174230528.0, + "grad_norm": 0.008492225160972254, + "language_loss": 0.76296914, + "learning_rate": 5.892010056601726e-05, + "loss": 0.77342916, + "num_input_tokens_seen": 366346928, + "router_z_loss_mlp": 0.38964844, + "step": 4411, + "time_per_iteration": 4.888483762741089 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047746, + "balance_loss_mlp": 1.00788236, + "epoch": 0.8487879953828396, + "flos": 678619858176.0, + "grad_norm": 0.034815423917737294, + "language_loss": 0.74149477, + "learning_rate": 5.877346528406635e-05, + "loss": 0.75197226, + "num_input_tokens_seen": 366422848, + "router_z_loss_mlp": 0.3984375, + "step": 4412, + "time_per_iteration": 2.861584186553955 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049124, + "balance_loss_mlp": 1.00928438, + "epoch": 0.8489803770681031, + "flos": 504672715776.0, + "grad_norm": 0.03763166001316411, + "language_loss": 0.80223823, + "learning_rate": 5.8627001301448105e-05, + "loss": 0.81272948, + "num_input_tokens_seen": 366492016, + "router_z_loss_mlp": 0.39819336, + "step": 4413, + "time_per_iteration": 2.607466459274292 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049111, + "balance_loss_mlp": 1.00919974, + "epoch": 0.8491727587533667, + "flos": 564350062080.0, + "grad_norm": 0.038784570505553576, + "language_loss": 0.77884841, + "learning_rate": 5.84807086750247e-05, + "loss": 0.78933948, + "num_input_tokens_seen": 366566400, + "router_z_loss_mlp": 0.39892578, + "step": 4414, + "time_per_iteration": 2.721888303756714 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044515, + "balance_loss_mlp": 1.0045321, + "epoch": 0.8493651404386302, + "flos": 460749712896.0, + "grad_norm": 0.04126295336032692, + "language_loss": 0.78762388, + "learning_rate": 5.833458746159243e-05, + "loss": 0.79806906, + "num_input_tokens_seen": 366634016, + "router_z_loss_mlp": 0.3996582, + "step": 4415, + "time_per_iteration": 2.544360399246216 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043733, + "balance_loss_mlp": 1.00367904, + "epoch": 0.8495575221238938, + "flos": 462145605888.0, + "grad_norm": 0.04113715838668957, + "language_loss": 0.81992638, + "learning_rate": 5.818863771788013e-05, + "loss": 0.83036369, + "num_input_tokens_seen": 366704384, + "router_z_loss_mlp": 0.40039062, + "step": 4416, + "time_per_iteration": 2.621957302093506 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044228, + "balance_loss_mlp": 1.00426936, + "epoch": 0.8497499038091574, + "flos": 872154222336.0, + "grad_norm": 0.035688143834842465, + "language_loss": 0.81738758, + "learning_rate": 5.8042859500550604e-05, + "loss": 0.82782984, + "num_input_tokens_seen": 366785456, + "router_z_loss_mlp": 0.39941406, + "step": 4417, + "time_per_iteration": 3.095893383026123 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044868, + "balance_loss_mlp": 1.00490916, + "epoch": 0.849942285494421, + "flos": 780975959040.0, + "grad_norm": 0.036021778410362866, + "language_loss": 0.7830838, + "learning_rate": 5.789725286620018e-05, + "loss": 0.79353249, + "num_input_tokens_seen": 366862848, + "router_z_loss_mlp": 0.39941406, + "step": 4418, + "time_per_iteration": 2.994999885559082 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104352, + "balance_loss_mlp": 1.00348961, + "epoch": 0.8501346671796844, + "flos": 514908615936.0, + "grad_norm": 0.03606911304712284, + "language_loss": 0.85473448, + "learning_rate": 5.775181787135819e-05, + "loss": 0.86516964, + "num_input_tokens_seen": 366934800, + "router_z_loss_mlp": 0.40014648, + "step": 4419, + "time_per_iteration": 2.667847156524658 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043246, + "balance_loss_mlp": 1.00323963, + "epoch": 0.850327048864948, + "flos": 622635406080.0, + "grad_norm": 0.03298723952459495, + "language_loss": 0.84261787, + "learning_rate": 5.76065545724877e-05, + "loss": 0.85305035, + "num_input_tokens_seen": 367015152, + "router_z_loss_mlp": 0.39990234, + "step": 4420, + "time_per_iteration": 2.828456401824951 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043242, + "balance_loss_mlp": 1.0032357, + "epoch": 0.8505194305502116, + "flos": 775550834688.0, + "grad_norm": 0.038283262853593514, + "language_loss": 0.80220652, + "learning_rate": 5.746146302598454e-05, + "loss": 0.812639, + "num_input_tokens_seen": 367092192, + "router_z_loss_mlp": 0.39990234, + "step": 4421, + "time_per_iteration": 3.011596202850342 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045026, + "balance_loss_mlp": 1.00501943, + "epoch": 0.8507118122354752, + "flos": 466213721088.0, + "grad_norm": 0.03411940614930077, + "language_loss": 0.8696543, + "learning_rate": 5.731654328817859e-05, + "loss": 0.88010454, + "num_input_tokens_seen": 367159744, + "router_z_loss_mlp": 0.39990234, + "step": 4422, + "time_per_iteration": 2.5618016719818115 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046503, + "balance_loss_mlp": 1.00654387, + "epoch": 0.8509041939207388, + "flos": 535470822912.0, + "grad_norm": 0.03503428991987823, + "language_loss": 0.85501492, + "learning_rate": 5.717179541533257e-05, + "loss": 0.86547995, + "num_input_tokens_seen": 367226384, + "router_z_loss_mlp": 0.39941406, + "step": 4423, + "time_per_iteration": 2.668095111846924 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046424, + "balance_loss_mlp": 1.00636971, + "epoch": 0.8510965756060023, + "flos": 584829643776.0, + "grad_norm": 0.037852967205166614, + "language_loss": 0.8484906, + "learning_rate": 5.702721946364264e-05, + "loss": 0.85895479, + "num_input_tokens_seen": 367294768, + "router_z_loss_mlp": 0.40039062, + "step": 4424, + "time_per_iteration": 2.7089426517486572 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046459, + "balance_loss_mlp": 1.0064522, + "epoch": 0.8512889572912659, + "flos": 602018764032.0, + "grad_norm": 0.0370000397469383, + "language_loss": 0.77723837, + "learning_rate": 5.688281548923796e-05, + "loss": 0.78770298, + "num_input_tokens_seen": 367372368, + "router_z_loss_mlp": 0.39990234, + "step": 4425, + "time_per_iteration": 2.759779930114746 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045288, + "balance_loss_mlp": 1.00530505, + "epoch": 0.8514813389765294, + "flos": 656066851584.0, + "grad_norm": 0.0349446752760593, + "language_loss": 0.79018658, + "learning_rate": 5.673858354818151e-05, + "loss": 0.80063945, + "num_input_tokens_seen": 367452656, + "router_z_loss_mlp": 0.3996582, + "step": 4426, + "time_per_iteration": 2.8464009761810303 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045118, + "balance_loss_mlp": 1.00515938, + "epoch": 0.851673720661793, + "flos": 430659273216.0, + "grad_norm": 0.040120645560250315, + "language_loss": 0.78890347, + "learning_rate": 5.6594523696468726e-05, + "loss": 0.79935461, + "num_input_tokens_seen": 367517808, + "router_z_loss_mlp": 0.39941406, + "step": 4427, + "time_per_iteration": 2.5440762042999268 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045694, + "balance_loss_mlp": 1.00573528, + "epoch": 0.8518661023470565, + "flos": 642759208704.0, + "grad_norm": 0.04399407608255305, + "language_loss": 0.80127829, + "learning_rate": 5.645063599002875e-05, + "loss": 0.81173521, + "num_input_tokens_seen": 367591728, + "router_z_loss_mlp": 0.39941406, + "step": 4428, + "time_per_iteration": 2.8217527866363525 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045635, + "balance_loss_mlp": 1.00565207, + "epoch": 0.8520584840323201, + "flos": 563199132672.0, + "grad_norm": 0.036053827771286885, + "language_loss": 0.80072153, + "learning_rate": 5.630692048472363e-05, + "loss": 0.81117785, + "num_input_tokens_seen": 367664496, + "router_z_loss_mlp": 0.3996582, + "step": 4429, + "time_per_iteration": 2.688634157180786 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045506, + "balance_loss_mlp": 1.00557125, + "epoch": 0.8522508657175837, + "flos": 528081143808.0, + "grad_norm": 0.038575296001451785, + "language_loss": 0.79170716, + "learning_rate": 5.61633772363489e-05, + "loss": 0.80216217, + "num_input_tokens_seen": 367735584, + "router_z_loss_mlp": 0.39916992, + "step": 4430, + "time_per_iteration": 2.594003915786743 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045227, + "balance_loss_mlp": 1.00538754, + "epoch": 0.8524432474028473, + "flos": 500103012864.0, + "grad_norm": 0.03514704462668056, + "language_loss": 0.81136119, + "learning_rate": 5.602000630063298e-05, + "loss": 0.82181346, + "num_input_tokens_seen": 367801136, + "router_z_loss_mlp": 0.39819336, + "step": 4431, + "time_per_iteration": 2.6524744033813477 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045858, + "balance_loss_mlp": 1.00585186, + "epoch": 0.8526356290881109, + "flos": 422216841216.0, + "grad_norm": 0.043916999307345196, + "language_loss": 0.80055523, + "learning_rate": 5.587680773323706e-05, + "loss": 0.81101382, + "num_input_tokens_seen": 367865312, + "router_z_loss_mlp": 0.39990234, + "step": 4432, + "time_per_iteration": 2.482304334640503 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045542, + "balance_loss_mlp": 1.00560737, + "epoch": 0.8528280107733743, + "flos": 508330616832.0, + "grad_norm": 0.03493847122451932, + "language_loss": 0.81211418, + "learning_rate": 5.5733781589756115e-05, + "loss": 0.82256961, + "num_input_tokens_seen": 367931104, + "router_z_loss_mlp": 0.39916992, + "step": 4433, + "time_per_iteration": 2.595567464828491 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045928, + "balance_loss_mlp": 1.00606406, + "epoch": 0.8530203924586379, + "flos": 446817027840.0, + "grad_norm": 0.037902023727573, + "language_loss": 0.83218634, + "learning_rate": 5.5590927925717684e-05, + "loss": 0.84264565, + "num_input_tokens_seen": 367995520, + "router_z_loss_mlp": 0.3984375, + "step": 4434, + "time_per_iteration": 2.583287477493286 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045846, + "balance_loss_mlp": 1.00593507, + "epoch": 0.8532127741439015, + "flos": 658990840320.0, + "grad_norm": 0.0366688600257907, + "language_loss": 0.84032941, + "learning_rate": 5.54482467965825e-05, + "loss": 0.85078788, + "num_input_tokens_seen": 368073664, + "router_z_loss_mlp": 0.39892578, + "step": 4435, + "time_per_iteration": 2.818974494934082 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045811, + "balance_loss_mlp": 1.00604296, + "epoch": 0.8534051558291651, + "flos": 537099040512.0, + "grad_norm": 0.030704738666311435, + "language_loss": 0.8344785, + "learning_rate": 5.5305738257744264e-05, + "loss": 0.84493661, + "num_input_tokens_seen": 368147536, + "router_z_loss_mlp": 0.39746094, + "step": 4436, + "time_per_iteration": 2.724040985107422 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045453, + "balance_loss_mlp": 1.00566089, + "epoch": 0.8535975375144286, + "flos": 534037991424.0, + "grad_norm": 0.044664222013351275, + "language_loss": 0.79138994, + "learning_rate": 5.5163402364529655e-05, + "loss": 0.80184448, + "num_input_tokens_seen": 368218672, + "router_z_loss_mlp": 0.39770508, + "step": 4437, + "time_per_iteration": 2.639385223388672 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044153, + "balance_loss_mlp": 1.00428987, + "epoch": 0.8537899191996922, + "flos": 575269330176.0, + "grad_norm": 0.04474802449890559, + "language_loss": 0.82764935, + "learning_rate": 5.502123917219848e-05, + "loss": 0.8380909, + "num_input_tokens_seen": 368287056, + "router_z_loss_mlp": 0.3984375, + "step": 4438, + "time_per_iteration": 2.7381479740142822 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044071, + "balance_loss_mlp": 1.00425565, + "epoch": 0.8539823008849557, + "flos": 466007641344.0, + "grad_norm": 0.03412606398220342, + "language_loss": 0.83686745, + "learning_rate": 5.48792487359433e-05, + "loss": 0.84730822, + "num_input_tokens_seen": 368358400, + "router_z_loss_mlp": 0.39794922, + "step": 4439, + "time_per_iteration": 2.7366132736206055 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044057, + "balance_loss_mlp": 1.00421679, + "epoch": 0.8541746825702193, + "flos": 555807508224.0, + "grad_norm": 0.03647803217669747, + "language_loss": 0.82074428, + "learning_rate": 5.4737431110889745e-05, + "loss": 0.83118486, + "num_input_tokens_seen": 368427168, + "router_z_loss_mlp": 0.39819336, + "step": 4440, + "time_per_iteration": 2.6492371559143066 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044386, + "balance_loss_mlp": 1.00459361, + "epoch": 0.8543670642554829, + "flos": 547558516992.0, + "grad_norm": 0.03717660212080705, + "language_loss": 0.78091979, + "learning_rate": 5.4595786352096165e-05, + "loss": 0.79136366, + "num_input_tokens_seen": 368503584, + "router_z_loss_mlp": 0.39770508, + "step": 4441, + "time_per_iteration": 2.7436399459838867 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044282, + "balance_loss_mlp": 1.00444269, + "epoch": 0.8545594459407464, + "flos": 513076263936.0, + "grad_norm": 0.033320650973310266, + "language_loss": 0.82524663, + "learning_rate": 5.4454314514554236e-05, + "loss": 0.83568943, + "num_input_tokens_seen": 368576976, + "router_z_loss_mlp": 0.39819336, + "step": 4442, + "time_per_iteration": 2.6414926052093506 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044386, + "balance_loss_mlp": 1.00454664, + "epoch": 0.85475182762601, + "flos": 422086583808.0, + "grad_norm": 0.03678224160115087, + "language_loss": 0.82031214, + "learning_rate": 5.431301565318786e-05, + "loss": 0.83075607, + "num_input_tokens_seen": 368641664, + "router_z_loss_mlp": 0.39819336, + "step": 4443, + "time_per_iteration": 2.5231664180755615 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043416, + "balance_loss_mlp": 1.00352824, + "epoch": 0.8549442093112736, + "flos": 390292104192.0, + "grad_norm": 0.043585312806385154, + "language_loss": 0.78223205, + "learning_rate": 5.41718898228542e-05, + "loss": 0.7926662, + "num_input_tokens_seen": 368705616, + "router_z_loss_mlp": 0.39868164, + "step": 4444, + "time_per_iteration": 2.4840567111968994 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043554, + "balance_loss_mlp": 1.00369012, + "epoch": 0.8551365909965372, + "flos": 607155183360.0, + "grad_norm": 0.037333626651253705, + "language_loss": 0.79457009, + "learning_rate": 5.403093707834334e-05, + "loss": 0.80500567, + "num_input_tokens_seen": 368779664, + "router_z_loss_mlp": 0.3984375, + "step": 4445, + "time_per_iteration": 2.793098211288452 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043537, + "balance_loss_mlp": 1.00355375, + "epoch": 0.8553289726818007, + "flos": 505156806912.0, + "grad_norm": 0.036563885282109194, + "language_loss": 0.79314804, + "learning_rate": 5.3890157474377865e-05, + "loss": 0.80358338, + "num_input_tokens_seen": 368846656, + "router_z_loss_mlp": 0.3996582, + "step": 4446, + "time_per_iteration": 2.5899364948272705 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046395, + "balance_loss_mlp": 1.00650752, + "epoch": 0.8555213543670642, + "flos": 558106454784.0, + "grad_norm": 0.03639099586734523, + "language_loss": 0.76389134, + "learning_rate": 5.374955106561324e-05, + "loss": 0.77435529, + "num_input_tokens_seen": 368923712, + "router_z_loss_mlp": 0.39868164, + "step": 4447, + "time_per_iteration": 2.7353360652923584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045537, + "balance_loss_mlp": 1.00555396, + "epoch": 0.8557137360523278, + "flos": 549153686784.0, + "grad_norm": 0.03500470093076036, + "language_loss": 0.75183821, + "learning_rate": 5.360911790663775e-05, + "loss": 0.76229358, + "num_input_tokens_seen": 368994496, + "router_z_loss_mlp": 0.3996582, + "step": 4448, + "time_per_iteration": 2.6300766468048096 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104387, + "balance_loss_mlp": 1.00393462, + "epoch": 0.8559061177375914, + "flos": 729504829440.0, + "grad_norm": 0.036315489674909586, + "language_loss": 0.79067165, + "learning_rate": 5.346885805197238e-05, + "loss": 0.80111039, + "num_input_tokens_seen": 369077088, + "router_z_loss_mlp": 0.39916992, + "step": 4449, + "time_per_iteration": 2.997072219848633 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043823, + "balance_loss_mlp": 1.00391161, + "epoch": 0.856098499422855, + "flos": 536977531392.0, + "grad_norm": 0.03929943737892077, + "language_loss": 0.83804214, + "learning_rate": 5.332877155607085e-05, + "loss": 0.8484804, + "num_input_tokens_seen": 369147680, + "router_z_loss_mlp": 0.39892578, + "step": 4450, + "time_per_iteration": 2.7184524536132812 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044891, + "balance_loss_mlp": 1.00493169, + "epoch": 0.8562908811081185, + "flos": 574776490752.0, + "grad_norm": 0.03344530380286612, + "language_loss": 0.83789825, + "learning_rate": 5.3188858473319504e-05, + "loss": 0.84834719, + "num_input_tokens_seen": 369224320, + "router_z_loss_mlp": 0.39941406, + "step": 4451, + "time_per_iteration": 2.730924367904663 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043477, + "balance_loss_mlp": 1.00354207, + "epoch": 0.856483262793382, + "flos": 783216579840.0, + "grad_norm": 0.03711590961781359, + "language_loss": 0.80795747, + "learning_rate": 5.3049118858037426e-05, + "loss": 0.81839228, + "num_input_tokens_seen": 369315744, + "router_z_loss_mlp": 0.39916992, + "step": 4452, + "time_per_iteration": 3.097334146499634 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104226, + "balance_loss_mlp": 1.0023483, + "epoch": 0.8566756444786456, + "flos": 456757420032.0, + "grad_norm": 0.03247549077414915, + "language_loss": 0.85096687, + "learning_rate": 5.290955276447651e-05, + "loss": 0.8613894, + "num_input_tokens_seen": 369382800, + "router_z_loss_mlp": 0.39892578, + "step": 4453, + "time_per_iteration": 2.5756120681762695 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042269, + "balance_loss_mlp": 1.00226271, + "epoch": 0.8568680261639092, + "flos": 450316481280.0, + "grad_norm": 0.03777805349232298, + "language_loss": 0.84773082, + "learning_rate": 5.277016024682091e-05, + "loss": 0.85815352, + "num_input_tokens_seen": 369447312, + "router_z_loss_mlp": 0.39990234, + "step": 4454, + "time_per_iteration": 2.5053937435150146 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042181, + "balance_loss_mlp": 1.00222218, + "epoch": 0.8570604078491728, + "flos": 480938644224.0, + "grad_norm": 0.046064792472179546, + "language_loss": 0.82946479, + "learning_rate": 5.2630941359187665e-05, + "loss": 0.83988655, + "num_input_tokens_seen": 369512800, + "router_z_loss_mlp": 0.39941406, + "step": 4455, + "time_per_iteration": 2.5223333835601807 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042026, + "balance_loss_mlp": 1.00209057, + "epoch": 0.8572527895344363, + "flos": 506934723840.0, + "grad_norm": 0.035455469317855655, + "language_loss": 0.85363388, + "learning_rate": 5.249189615562627e-05, + "loss": 0.86405408, + "num_input_tokens_seen": 369580720, + "router_z_loss_mlp": 0.39916992, + "step": 4456, + "time_per_iteration": 2.575731039047241 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042121, + "balance_loss_mlp": 1.0021379, + "epoch": 0.8574451712196999, + "flos": 788476453632.0, + "grad_norm": 0.04614360974080898, + "language_loss": 0.8365714, + "learning_rate": 5.235302469011905e-05, + "loss": 0.84699261, + "num_input_tokens_seen": 369672544, + "router_z_loss_mlp": 0.3996582, + "step": 4457, + "time_per_iteration": 3.0536177158355713 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042775, + "balance_loss_mlp": 1.00283957, + "epoch": 0.8576375529049635, + "flos": 510347661312.0, + "grad_norm": 0.03326548366354344, + "language_loss": 0.7548418, + "learning_rate": 5.2214327016580575e-05, + "loss": 0.76526952, + "num_input_tokens_seen": 369745776, + "router_z_loss_mlp": 0.39916992, + "step": 4458, + "time_per_iteration": 2.69887375831604 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043285, + "balance_loss_mlp": 1.00447083, + "epoch": 0.857829934590227, + "flos": 1463891387904.0, + "grad_norm": 0.004297416027187635, + "language_loss": 0.84767288, + "learning_rate": 5.207580318885802e-05, + "loss": 0.85810578, + "num_input_tokens_seen": 369975200, + "router_z_loss_mlp": 0.38769531, + "step": 4459, + "time_per_iteration": 4.931604623794556 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010427, + "balance_loss_mlp": 1.00283611, + "epoch": 0.8580223162754905, + "flos": 480259166976.0, + "grad_norm": 0.03161384875669954, + "language_loss": 0.89644599, + "learning_rate": 5.193745326073118e-05, + "loss": 0.90687293, + "num_input_tokens_seen": 370043296, + "router_z_loss_mlp": 0.3984375, + "step": 4460, + "time_per_iteration": 2.6447529792785645 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042107, + "balance_loss_mlp": 1.00214815, + "epoch": 0.8582146979607541, + "flos": 707457301248.0, + "grad_norm": 0.03987085543559285, + "language_loss": 0.79637587, + "learning_rate": 5.179927728591227e-05, + "loss": 0.80679691, + "num_input_tokens_seen": 370111152, + "router_z_loss_mlp": 0.39941406, + "step": 4461, + "time_per_iteration": 2.836300849914551 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043313, + "balance_loss_mlp": 1.00325847, + "epoch": 0.8584070796460177, + "flos": 766494054144.0, + "grad_norm": 0.03752152487734852, + "language_loss": 0.8312273, + "learning_rate": 5.1661275318045874e-05, + "loss": 0.84166038, + "num_input_tokens_seen": 370190272, + "router_z_loss_mlp": 0.40039062, + "step": 4462, + "time_per_iteration": 3.0077900886535645 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104119, + "balance_loss_mlp": 1.00120723, + "epoch": 0.8585994613312813, + "flos": 588010256640.0, + "grad_norm": 0.035369500112361965, + "language_loss": 0.86055285, + "learning_rate": 5.152344741070919e-05, + "loss": 0.87096477, + "num_input_tokens_seen": 370267056, + "router_z_loss_mlp": 0.3996582, + "step": 4463, + "time_per_iteration": 2.777745008468628 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01041881, + "balance_loss_mlp": 1.0019455, + "epoch": 0.8587918430165449, + "flos": 609510510336.0, + "grad_norm": 0.03358316134119744, + "language_loss": 0.79521871, + "learning_rate": 5.138579361741169e-05, + "loss": 0.80563754, + "num_input_tokens_seen": 370344176, + "router_z_loss_mlp": 0.39916992, + "step": 4464, + "time_per_iteration": 2.8097100257873535 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042564, + "balance_loss_mlp": 1.00265265, + "epoch": 0.8589842247018084, + "flos": 590070075648.0, + "grad_norm": 0.037330813544588, + "language_loss": 0.81354475, + "learning_rate": 5.124831399159535e-05, + "loss": 0.82397044, + "num_input_tokens_seen": 370414224, + "router_z_loss_mlp": 0.39892578, + "step": 4465, + "time_per_iteration": 2.6861515045166016 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043586, + "balance_loss_mlp": 1.00357902, + "epoch": 0.8591766063870719, + "flos": 544964062464.0, + "grad_norm": 0.047961179507573816, + "language_loss": 0.79156327, + "learning_rate": 5.1111008586634475e-05, + "loss": 0.80199909, + "num_input_tokens_seen": 370484736, + "router_z_loss_mlp": 0.39990234, + "step": 4466, + "time_per_iteration": 2.645440101623535 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043306, + "balance_loss_mlp": 1.00329959, + "epoch": 0.8593689880723355, + "flos": 494786758656.0, + "grad_norm": 0.038918895611797386, + "language_loss": 0.81233793, + "learning_rate": 5.0973877455835816e-05, + "loss": 0.82277095, + "num_input_tokens_seen": 370556512, + "router_z_loss_mlp": 0.39990234, + "step": 4467, + "time_per_iteration": 2.674589157104492 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104568, + "balance_loss_mlp": 1.00550628, + "epoch": 0.8595613697575991, + "flos": 534941044992.0, + "grad_norm": 0.04226975076032596, + "language_loss": 0.84279299, + "learning_rate": 5.083692065243822e-05, + "loss": 0.85324979, + "num_input_tokens_seen": 370622880, + "router_z_loss_mlp": 0.40161133, + "step": 4468, + "time_per_iteration": 2.6663825511932373 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104559, + "balance_loss_mlp": 1.0055114, + "epoch": 0.8597537514428626, + "flos": 618755874048.0, + "grad_norm": 0.03926324289956361, + "language_loss": 0.76639593, + "learning_rate": 5.070013822961328e-05, + "loss": 0.77685177, + "num_input_tokens_seen": 370691632, + "router_z_loss_mlp": 0.40063477, + "step": 4469, + "time_per_iteration": 2.7248737812042236 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046057, + "balance_loss_mlp": 1.00585926, + "epoch": 0.8599461331281262, + "flos": 609857541120.0, + "grad_norm": 0.03895417100249646, + "language_loss": 0.84123969, + "learning_rate": 5.056353024046462e-05, + "loss": 0.85170031, + "num_input_tokens_seen": 370764848, + "router_z_loss_mlp": 0.40185547, + "step": 4470, + "time_per_iteration": 2.754119396209717 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045902, + "balance_loss_mlp": 1.00572836, + "epoch": 0.8601385148133898, + "flos": 552344993280.0, + "grad_norm": 0.037895908280551775, + "language_loss": 0.83561713, + "learning_rate": 5.042709673802786e-05, + "loss": 0.84607613, + "num_input_tokens_seen": 370832496, + "router_z_loss_mlp": 0.40161133, + "step": 4471, + "time_per_iteration": 2.6509101390838623 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045785, + "balance_loss_mlp": 1.00565875, + "epoch": 0.8603308964986534, + "flos": 582379052544.0, + "grad_norm": 0.03060970457898105, + "language_loss": 0.81397867, + "learning_rate": 5.0290837775271494e-05, + "loss": 0.82443655, + "num_input_tokens_seen": 370917104, + "router_z_loss_mlp": 0.40112305, + "step": 4472, + "time_per_iteration": 2.8442461490631104 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045958, + "balance_loss_mlp": 1.00585616, + "epoch": 0.8605232781839169, + "flos": 630148539648.0, + "grad_norm": 0.0379081889526199, + "language_loss": 0.75537038, + "learning_rate": 5.0154753405095846e-05, + "loss": 0.76582998, + "num_input_tokens_seen": 370984512, + "router_z_loss_mlp": 0.40087891, + "step": 4473, + "time_per_iteration": 2.7697927951812744 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042775, + "balance_loss_mlp": 1.00264955, + "epoch": 0.8607156598691804, + "flos": 469090077696.0, + "grad_norm": 0.03841128602003186, + "language_loss": 0.77358502, + "learning_rate": 5.0018843680333604e-05, + "loss": 0.78401279, + "num_input_tokens_seen": 371049664, + "router_z_loss_mlp": 0.40112305, + "step": 4474, + "time_per_iteration": 2.5194132328033447 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044527, + "balance_loss_mlp": 1.00430596, + "epoch": 0.860908041554444, + "flos": 489407321088.0, + "grad_norm": 0.035006828141935564, + "language_loss": 0.82981098, + "learning_rate": 4.988310865374945e-05, + "loss": 0.84025621, + "num_input_tokens_seen": 371120704, + "router_z_loss_mlp": 0.40209961, + "step": 4475, + "time_per_iteration": 2.652743339538574 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043859, + "balance_loss_mlp": 1.00361395, + "epoch": 0.8611004232397076, + "flos": 593170008576.0, + "grad_norm": 0.039966577780763526, + "language_loss": 0.80588216, + "learning_rate": 4.974754837804057e-05, + "loss": 0.81632078, + "num_input_tokens_seen": 371189376, + "router_z_loss_mlp": 0.40234375, + "step": 4476, + "time_per_iteration": 2.666529655456543 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043525, + "balance_loss_mlp": 1.00332773, + "epoch": 0.8612928049249712, + "flos": 775622766336.0, + "grad_norm": 0.03341468834325145, + "language_loss": 0.86407369, + "learning_rate": 4.9612162905836036e-05, + "loss": 0.87450892, + "num_input_tokens_seen": 371275184, + "router_z_loss_mlp": 0.40185547, + "step": 4477, + "time_per_iteration": 3.053422212600708 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043463, + "balance_loss_mlp": 1.00321817, + "epoch": 0.8614851866102347, + "flos": 538607694336.0, + "grad_norm": 0.051459370752897714, + "language_loss": 0.82765687, + "learning_rate": 4.947695228969718e-05, + "loss": 0.83809155, + "num_input_tokens_seen": 371347920, + "router_z_loss_mlp": 0.40234375, + "step": 4478, + "time_per_iteration": 2.6455395221710205 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104286, + "balance_loss_mlp": 1.00275767, + "epoch": 0.8616775682954982, + "flos": 566996039424.0, + "grad_norm": 0.03431517223967173, + "language_loss": 0.79459572, + "learning_rate": 4.934191658211729e-05, + "loss": 0.80502427, + "num_input_tokens_seen": 371419728, + "router_z_loss_mlp": 0.40087891, + "step": 4479, + "time_per_iteration": 2.7187790870666504 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01041966, + "balance_loss_mlp": 1.0017215, + "epoch": 0.8618699499807618, + "flos": 482558113536.0, + "grad_norm": 0.0381793142401585, + "language_loss": 0.82083333, + "learning_rate": 4.92070558355221e-05, + "loss": 0.83125293, + "num_input_tokens_seen": 371488768, + "router_z_loss_mlp": 0.40234375, + "step": 4480, + "time_per_iteration": 2.6569716930389404 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042011, + "balance_loss_mlp": 1.00176573, + "epoch": 0.8620623316660254, + "flos": 650680611072.0, + "grad_norm": 0.043394655514730936, + "language_loss": 0.74778575, + "learning_rate": 4.9072370102269226e-05, + "loss": 0.75820589, + "num_input_tokens_seen": 371560144, + "router_z_loss_mlp": 0.40234375, + "step": 4481, + "time_per_iteration": 2.7786409854888916 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042981, + "balance_loss_mlp": 1.00283086, + "epoch": 0.862254713351289, + "flos": 753082398720.0, + "grad_norm": 0.0457468668200295, + "language_loss": 0.8622297, + "learning_rate": 4.893785943464801e-05, + "loss": 0.87265956, + "num_input_tokens_seen": 371635920, + "router_z_loss_mlp": 0.40136719, + "step": 4482, + "time_per_iteration": 3.0409467220306396 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043593, + "balance_loss_mlp": 1.00346696, + "epoch": 0.8624470950365525, + "flos": 843136944384.0, + "grad_norm": 0.07263887982948322, + "language_loss": 0.7833854, + "learning_rate": 4.880352388488024e-05, + "loss": 0.79382133, + "num_input_tokens_seen": 371727664, + "router_z_loss_mlp": 0.40112305, + "step": 4483, + "time_per_iteration": 3.2473502159118652 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043317, + "balance_loss_mlp": 1.00316727, + "epoch": 0.8626394767218161, + "flos": 756089012736.0, + "grad_norm": 0.03708175872595014, + "language_loss": 0.83609211, + "learning_rate": 4.866936350511969e-05, + "loss": 0.84652531, + "num_input_tokens_seen": 371800832, + "router_z_loss_mlp": 0.40136719, + "step": 4484, + "time_per_iteration": 2.9040815830230713 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044262, + "balance_loss_mlp": 1.00415993, + "epoch": 0.8628318584070797, + "flos": 704858956032.0, + "grad_norm": 0.03939072014015938, + "language_loss": 0.8295635, + "learning_rate": 4.853537834745203e-05, + "loss": 0.84000611, + "num_input_tokens_seen": 371871472, + "router_z_loss_mlp": 0.40087891, + "step": 4485, + "time_per_iteration": 2.871338367462158 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043684, + "balance_loss_mlp": 1.00367761, + "epoch": 0.8630242400923432, + "flos": 472198758912.0, + "grad_norm": 0.03510006759017659, + "language_loss": 0.77971268, + "learning_rate": 4.840156846389487e-05, + "loss": 0.79014951, + "num_input_tokens_seen": 371936512, + "router_z_loss_mlp": 0.39990234, + "step": 4486, + "time_per_iteration": 2.571122169494629 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043761, + "balance_loss_mlp": 1.00370646, + "epoch": 0.8632166217776067, + "flos": 965963878656.0, + "grad_norm": 0.04035538480745229, + "language_loss": 0.77694219, + "learning_rate": 4.826793390639783e-05, + "loss": 0.78737986, + "num_input_tokens_seen": 372018032, + "router_z_loss_mlp": 0.40039062, + "step": 4487, + "time_per_iteration": 3.206270456314087 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044627, + "balance_loss_mlp": 1.00469244, + "epoch": 0.8634090034628703, + "flos": 769240153344.0, + "grad_norm": 0.03791528570619557, + "language_loss": 0.79186964, + "learning_rate": 4.813447472684246e-05, + "loss": 0.80231589, + "num_input_tokens_seen": 372092176, + "router_z_loss_mlp": 0.39916992, + "step": 4488, + "time_per_iteration": 2.956378936767578 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045738, + "balance_loss_mlp": 1.00585032, + "epoch": 0.8636013851481339, + "flos": 521720884992.0, + "grad_norm": 0.03486742602328962, + "language_loss": 0.83548576, + "learning_rate": 4.800119097704214e-05, + "loss": 0.84594309, + "num_input_tokens_seen": 372166880, + "router_z_loss_mlp": 0.39868164, + "step": 4489, + "time_per_iteration": 2.7456369400024414 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044161, + "balance_loss_mlp": 1.00417769, + "epoch": 0.8637937668333975, + "flos": 633294159360.0, + "grad_norm": 0.03761878159838688, + "language_loss": 0.81188381, + "learning_rate": 4.7868082708742324e-05, + "loss": 0.82232535, + "num_input_tokens_seen": 372234608, + "router_z_loss_mlp": 0.3996582, + "step": 4490, + "time_per_iteration": 2.7456276416778564 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104453, + "balance_loss_mlp": 1.00466633, + "epoch": 0.8639861485186611, + "flos": 857522618112.0, + "grad_norm": 0.032138271837822946, + "language_loss": 0.76775849, + "learning_rate": 4.773514997362e-05, + "loss": 0.77820385, + "num_input_tokens_seen": 372314704, + "router_z_loss_mlp": 0.3984375, + "step": 4491, + "time_per_iteration": 3.0699753761291504 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049712, + "balance_loss_mlp": 1.00980031, + "epoch": 0.8641785302039245, + "flos": 482241218304.0, + "grad_norm": 0.04135190383528018, + "language_loss": 0.78167886, + "learning_rate": 4.7602392823284605e-05, + "loss": 0.79217601, + "num_input_tokens_seen": 372374848, + "router_z_loss_mlp": 0.39892578, + "step": 4492, + "time_per_iteration": 2.5359408855438232 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049852, + "balance_loss_mlp": 1.00989342, + "epoch": 0.8643709118891881, + "flos": 505649646336.0, + "grad_norm": 0.037459806999943016, + "language_loss": 0.80912906, + "learning_rate": 4.746981130927675e-05, + "loss": 0.81962758, + "num_input_tokens_seen": 372442432, + "router_z_loss_mlp": 0.39941406, + "step": 4493, + "time_per_iteration": 2.5917551517486572 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049635, + "balance_loss_mlp": 1.00972354, + "epoch": 0.8645632935744517, + "flos": 553552303104.0, + "grad_norm": 0.03733364468795558, + "language_loss": 0.82919705, + "learning_rate": 4.733740548306908e-05, + "loss": 0.83969343, + "num_input_tokens_seen": 372520048, + "router_z_loss_mlp": 0.39892578, + "step": 4494, + "time_per_iteration": 2.78363299369812 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043104, + "balance_loss_mlp": 1.00309765, + "epoch": 0.8647556752597153, + "flos": 525736510464.0, + "grad_norm": 0.0378697936627377, + "language_loss": 0.84654057, + "learning_rate": 4.7205175396066336e-05, + "loss": 0.85697162, + "num_input_tokens_seen": 372587968, + "router_z_loss_mlp": 0.39990234, + "step": 4495, + "time_per_iteration": 2.5666584968566895 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042802, + "balance_loss_mlp": 1.00284255, + "epoch": 0.8649480569449788, + "flos": 789238556160.0, + "grad_norm": 0.04501425991105197, + "language_loss": 0.83136499, + "learning_rate": 4.707312109960471e-05, + "loss": 0.841793, + "num_input_tokens_seen": 372672544, + "router_z_loss_mlp": 0.39941406, + "step": 4496, + "time_per_iteration": 3.103167772293091 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043233, + "balance_loss_mlp": 1.00332153, + "epoch": 0.8651404386302424, + "flos": 765200228352.0, + "grad_norm": 0.038819843203582616, + "language_loss": 0.77151841, + "learning_rate": 4.694124264495225e-05, + "loss": 0.78195071, + "num_input_tokens_seen": 372751296, + "router_z_loss_mlp": 0.39892578, + "step": 4497, + "time_per_iteration": 3.0549564361572266 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104286, + "balance_loss_mlp": 1.002949, + "epoch": 0.865332820315506, + "flos": 540989266176.0, + "grad_norm": 0.06154975091588949, + "language_loss": 0.82805288, + "learning_rate": 4.680954008330851e-05, + "loss": 0.83848143, + "num_input_tokens_seen": 372825264, + "router_z_loss_mlp": 0.39892578, + "step": 4498, + "time_per_iteration": 2.718996286392212 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043068, + "balance_loss_mlp": 1.00396729, + "epoch": 0.8655252020007695, + "flos": 1479679757568.0, + "grad_norm": 0.0038323013026693355, + "language_loss": 0.79174447, + "learning_rate": 4.667801346580519e-05, + "loss": 0.80217516, + "num_input_tokens_seen": 373052000, + "router_z_loss_mlp": 0.390625, + "step": 4499, + "time_per_iteration": 4.773655652999878 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01041418, + "balance_loss_mlp": 1.00148308, + "epoch": 0.8657175836860331, + "flos": 518473198080.0, + "grad_norm": 0.03250129983959287, + "language_loss": 0.82903546, + "learning_rate": 4.6546662843505396e-05, + "loss": 0.83944964, + "num_input_tokens_seen": 373124128, + "router_z_loss_mlp": 0.39916992, + "step": 4500, + "time_per_iteration": 2.7259538173675537 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01041689, + "balance_loss_mlp": 1.00168264, + "epoch": 0.8659099653712966, + "flos": 591633164544.0, + "grad_norm": 0.06794339939961025, + "language_loss": 0.80461693, + "learning_rate": 4.641548826740394e-05, + "loss": 0.81503385, + "num_input_tokens_seen": 373195472, + "router_z_loss_mlp": 0.39990234, + "step": 4501, + "time_per_iteration": 2.7145915031433105 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042956, + "balance_loss_mlp": 1.00302076, + "epoch": 0.8661023470565602, + "flos": 591576784128.0, + "grad_norm": 0.031149828188743837, + "language_loss": 0.88302559, + "learning_rate": 4.628448978842731e-05, + "loss": 0.89345515, + "num_input_tokens_seen": 373273504, + "router_z_loss_mlp": 0.39916992, + "step": 4502, + "time_per_iteration": 2.8617639541625977 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045801, + "balance_loss_mlp": 1.00586581, + "epoch": 0.8662947287418238, + "flos": 568737017856.0, + "grad_norm": 0.03468810910304343, + "language_loss": 0.80155271, + "learning_rate": 4.61536674574336e-05, + "loss": 0.81201071, + "num_input_tokens_seen": 373346032, + "router_z_loss_mlp": 0.39916992, + "step": 4503, + "time_per_iteration": 2.7440474033355713 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046514, + "balance_loss_mlp": 1.00653136, + "epoch": 0.8664871104270874, + "flos": 517003428096.0, + "grad_norm": 0.031791485322302415, + "language_loss": 0.82510114, + "learning_rate": 4.6023021325212636e-05, + "loss": 0.83556628, + "num_input_tokens_seen": 373419968, + "router_z_loss_mlp": 0.3996582, + "step": 4504, + "time_per_iteration": 2.792924642562866 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043534, + "balance_loss_mlp": 1.00352764, + "epoch": 0.866679492112351, + "flos": 558430152960.0, + "grad_norm": 0.03528793241244482, + "language_loss": 0.7860105, + "learning_rate": 4.589255144248561e-05, + "loss": 0.79644579, + "num_input_tokens_seen": 373502448, + "router_z_loss_mlp": 0.39990234, + "step": 4505, + "time_per_iteration": 2.8408970832824707 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043023, + "balance_loss_mlp": 1.00308836, + "epoch": 0.8668718737976144, + "flos": 723662688000.0, + "grad_norm": 0.04842825321665466, + "language_loss": 0.82218444, + "learning_rate": 4.57622578599054e-05, + "loss": 0.83261466, + "num_input_tokens_seen": 373581184, + "router_z_loss_mlp": 0.39916992, + "step": 4506, + "time_per_iteration": 2.9215447902679443 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043183, + "balance_loss_mlp": 1.00320053, + "epoch": 0.867064255482878, + "flos": 601834071552.0, + "grad_norm": 0.0362899031078603, + "language_loss": 0.84903908, + "learning_rate": 4.5632140628056705e-05, + "loss": 0.8594709, + "num_input_tokens_seen": 373652272, + "router_z_loss_mlp": 0.3996582, + "step": 4507, + "time_per_iteration": 2.7223806381225586 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042871, + "balance_loss_mlp": 1.00286424, + "epoch": 0.8672566371681416, + "flos": 804933606912.0, + "grad_norm": 0.038978815297564966, + "language_loss": 0.76347792, + "learning_rate": 4.550219979745529e-05, + "loss": 0.77390665, + "num_input_tokens_seen": 373734896, + "router_z_loss_mlp": 0.39990234, + "step": 4508, + "time_per_iteration": 3.089169979095459 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042127, + "balance_loss_mlp": 1.00223958, + "epoch": 0.8674490188534052, + "flos": 628555315200.0, + "grad_norm": 0.03358541173243661, + "language_loss": 0.84039915, + "learning_rate": 4.5372435418548905e-05, + "loss": 0.85082042, + "num_input_tokens_seen": 373806960, + "router_z_loss_mlp": 0.39868164, + "step": 4509, + "time_per_iteration": 2.7640254497528076 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042143, + "balance_loss_mlp": 1.0022794, + "epoch": 0.8676414005386687, + "flos": 729205430784.0, + "grad_norm": 0.03339457562318333, + "language_loss": 0.86446714, + "learning_rate": 4.524284754171615e-05, + "loss": 0.87488854, + "num_input_tokens_seen": 373888352, + "router_z_loss_mlp": 0.3984375, + "step": 4510, + "time_per_iteration": 2.9625110626220703 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042812, + "balance_loss_mlp": 1.0029006, + "epoch": 0.8678337822239323, + "flos": 541163265024.0, + "grad_norm": 0.03671096945461231, + "language_loss": 0.81317061, + "learning_rate": 4.5113436217267765e-05, + "loss": 0.82359874, + "num_input_tokens_seen": 373962112, + "router_z_loss_mlp": 0.39892578, + "step": 4511, + "time_per_iteration": 2.7604947090148926 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042756, + "balance_loss_mlp": 1.00279653, + "epoch": 0.8680261639091958, + "flos": 508526002944.0, + "grad_norm": 0.03890609352738196, + "language_loss": 0.7946341, + "learning_rate": 4.4984201495445744e-05, + "loss": 0.8050617, + "num_input_tokens_seen": 374028256, + "router_z_loss_mlp": 0.39941406, + "step": 4512, + "time_per_iteration": 2.556802272796631 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044096, + "balance_loss_mlp": 1.00420845, + "epoch": 0.8682185455944594, + "flos": 488150433792.0, + "grad_norm": 0.035041700942467814, + "language_loss": 0.81525004, + "learning_rate": 4.4855143426423275e-05, + "loss": 0.82569093, + "num_input_tokens_seen": 374100080, + "router_z_loss_mlp": 0.39868164, + "step": 4513, + "time_per_iteration": 2.6287481784820557 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043961, + "balance_loss_mlp": 1.00412118, + "epoch": 0.868410927279723, + "flos": 604803747072.0, + "grad_norm": 0.03776631207974779, + "language_loss": 0.81658906, + "learning_rate": 4.472626206030528e-05, + "loss": 0.82702863, + "num_input_tokens_seen": 374174368, + "router_z_loss_mlp": 0.39819336, + "step": 4514, + "time_per_iteration": 2.7213940620422363 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01041695, + "balance_loss_mlp": 1.00180769, + "epoch": 0.8686033089649865, + "flos": 1120722352896.0, + "grad_norm": 0.03739432356503297, + "language_loss": 0.85135609, + "learning_rate": 4.4597557447127846e-05, + "loss": 0.86177301, + "num_input_tokens_seen": 374257328, + "router_z_loss_mlp": 0.39868164, + "step": 4515, + "time_per_iteration": 3.3755922317504883 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01041832, + "balance_loss_mlp": 1.0019449, + "epoch": 0.8687956906502501, + "flos": 569099599872.0, + "grad_norm": 0.03900935559178445, + "language_loss": 0.84242064, + "learning_rate": 4.446902963685862e-05, + "loss": 0.85283899, + "num_input_tokens_seen": 374327936, + "router_z_loss_mlp": 0.39868164, + "step": 4516, + "time_per_iteration": 2.6541643142700195 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01041671, + "balance_loss_mlp": 1.00171173, + "epoch": 0.8689880723355137, + "flos": 545411215104.0, + "grad_norm": 0.037327297055917835, + "language_loss": 0.84681803, + "learning_rate": 4.4340678679396454e-05, + "loss": 0.85723472, + "num_input_tokens_seen": 374400496, + "router_z_loss_mlp": 0.39941406, + "step": 4517, + "time_per_iteration": 2.646094799041748 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01041033, + "balance_loss_mlp": 1.00119364, + "epoch": 0.8691804540207773, + "flos": 458385637632.0, + "grad_norm": 0.037344798435976774, + "language_loss": 0.86974192, + "learning_rate": 4.4212504624571495e-05, + "loss": 0.88015229, + "num_input_tokens_seen": 374470528, + "router_z_loss_mlp": 0.39819336, + "step": 4518, + "time_per_iteration": 2.580050468444824 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01040987, + "balance_loss_mlp": 1.00109982, + "epoch": 0.8693728357060407, + "flos": 593000867328.0, + "grad_norm": 0.04024352256498578, + "language_loss": 0.80694568, + "learning_rate": 4.40845075221456e-05, + "loss": 0.81735557, + "num_input_tokens_seen": 374542656, + "router_z_loss_mlp": 0.39868164, + "step": 4519, + "time_per_iteration": 2.6886699199676514 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104283, + "balance_loss_mlp": 1.00299013, + "epoch": 0.8695652173913043, + "flos": 681524404992.0, + "grad_norm": 0.038580823232518636, + "language_loss": 0.80304897, + "learning_rate": 4.395668742181164e-05, + "loss": 0.81347722, + "num_input_tokens_seen": 374617232, + "router_z_loss_mlp": 0.39819336, + "step": 4520, + "time_per_iteration": 2.8930678367614746 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042873, + "balance_loss_mlp": 1.00298536, + "epoch": 0.8697575990765679, + "flos": 493336430592.0, + "grad_norm": 0.04547224074564954, + "language_loss": 0.78913867, + "learning_rate": 4.38290443731934e-05, + "loss": 0.79956746, + "num_input_tokens_seen": 374681888, + "router_z_loss_mlp": 0.39868164, + "step": 4521, + "time_per_iteration": 2.5533957481384277 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042067, + "balance_loss_mlp": 1.00215554, + "epoch": 0.8699499807618315, + "flos": 527987824896.0, + "grad_norm": 0.032020821735607795, + "language_loss": 0.82212275, + "learning_rate": 4.370157842584671e-05, + "loss": 0.83254337, + "num_input_tokens_seen": 374750464, + "router_z_loss_mlp": 0.39892578, + "step": 4522, + "time_per_iteration": 2.6833107471466064 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042365, + "balance_loss_mlp": 1.00240612, + "epoch": 0.8701423624470951, + "flos": 815794549248.0, + "grad_norm": 0.04802472888774931, + "language_loss": 0.80982125, + "learning_rate": 4.357428962925808e-05, + "loss": 0.82024491, + "num_input_tokens_seen": 374836064, + "router_z_loss_mlp": 0.39941406, + "step": 4523, + "time_per_iteration": 3.1088523864746094 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042816, + "balance_loss_mlp": 1.00285709, + "epoch": 0.8703347441323586, + "flos": 557874130176.0, + "grad_norm": 0.034879545908552134, + "language_loss": 0.89101827, + "learning_rate": 4.344717803284542e-05, + "loss": 0.90144646, + "num_input_tokens_seen": 374903392, + "router_z_loss_mlp": 0.39941406, + "step": 4524, + "time_per_iteration": 2.664231538772583 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042699, + "balance_loss_mlp": 1.00281131, + "epoch": 0.8705271258176221, + "flos": 586614363648.0, + "grad_norm": 0.03362644335681585, + "language_loss": 0.84724236, + "learning_rate": 4.3320243685957825e-05, + "loss": 0.85766935, + "num_input_tokens_seen": 374985904, + "router_z_loss_mlp": 0.39868164, + "step": 4525, + "time_per_iteration": 2.838411808013916 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044069, + "balance_loss_mlp": 1.0041815, + "epoch": 0.8707195075028857, + "flos": 670503069696.0, + "grad_norm": 0.033221924916940926, + "language_loss": 0.85867798, + "learning_rate": 4.3193486637875536e-05, + "loss": 0.86911869, + "num_input_tokens_seen": 375062992, + "router_z_loss_mlp": 0.39868164, + "step": 4526, + "time_per_iteration": 2.8975462913513184 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044019, + "balance_loss_mlp": 1.00410795, + "epoch": 0.8709118891881493, + "flos": 521471063808.0, + "grad_norm": 0.03631625832210608, + "language_loss": 0.84302342, + "learning_rate": 4.306690693781007e-05, + "loss": 0.85346365, + "num_input_tokens_seen": 375139296, + "router_z_loss_mlp": 0.39892578, + "step": 4527, + "time_per_iteration": 2.7671144008636475 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044859, + "balance_loss_mlp": 1.00504327, + "epoch": 0.8711042708734128, + "flos": 554272609536.0, + "grad_norm": 0.0374848177192315, + "language_loss": 0.82055509, + "learning_rate": 4.294050463490401e-05, + "loss": 0.83100373, + "num_input_tokens_seen": 375206576, + "router_z_loss_mlp": 0.39794922, + "step": 4528, + "time_per_iteration": 2.6653261184692383 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044793, + "balance_loss_mlp": 1.00497687, + "epoch": 0.8712966525586764, + "flos": 503237938944.0, + "grad_norm": 0.039647791126880064, + "language_loss": 0.82525837, + "learning_rate": 4.281427977823094e-05, + "loss": 0.83570629, + "num_input_tokens_seen": 375279008, + "router_z_loss_mlp": 0.39794922, + "step": 4529, + "time_per_iteration": 2.712507486343384 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044425, + "balance_loss_mlp": 1.00460875, + "epoch": 0.87148903424394, + "flos": 805528513536.0, + "grad_norm": 0.034217964706317425, + "language_loss": 0.74154443, + "learning_rate": 4.268823241679593e-05, + "loss": 0.75198865, + "num_input_tokens_seen": 375368512, + "router_z_loss_mlp": 0.39794922, + "step": 4530, + "time_per_iteration": 3.035536050796509 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043593, + "balance_loss_mlp": 1.00382435, + "epoch": 0.8716814159292036, + "flos": 774841221888.0, + "grad_norm": 0.03641439178250716, + "language_loss": 0.86728388, + "learning_rate": 4.256236259953489e-05, + "loss": 0.87771976, + "num_input_tokens_seen": 375450528, + "router_z_loss_mlp": 0.39746094, + "step": 4531, + "time_per_iteration": 2.9879214763641357 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043781, + "balance_loss_mlp": 1.00408423, + "epoch": 0.8718737976144671, + "flos": 487798545408.0, + "grad_norm": 0.04080829111532849, + "language_loss": 0.85192716, + "learning_rate": 4.243667037531468e-05, + "loss": 0.86236501, + "num_input_tokens_seen": 375518256, + "router_z_loss_mlp": 0.39672852, + "step": 4532, + "time_per_iteration": 2.563370943069458 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042257, + "balance_loss_mlp": 1.00256038, + "epoch": 0.8720661792997306, + "flos": 585220416000.0, + "grad_norm": 0.041574006136382645, + "language_loss": 0.79068941, + "learning_rate": 4.2311155792933264e-05, + "loss": 0.80111194, + "num_input_tokens_seen": 375588112, + "router_z_loss_mlp": 0.39672852, + "step": 4533, + "time_per_iteration": 2.7681236267089844 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042343, + "balance_loss_mlp": 1.00343323, + "epoch": 0.8722585609849942, + "flos": 1499002573824.0, + "grad_norm": 0.003968417299446405, + "language_loss": 0.80966806, + "learning_rate": 4.2185818901119946e-05, + "loss": 0.82009149, + "num_input_tokens_seen": 375814496, + "router_z_loss_mlp": 0.38867188, + "step": 4534, + "time_per_iteration": 4.783138751983643 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104316, + "balance_loss_mlp": 1.00343895, + "epoch": 0.8724509426702578, + "flos": 597310055424.0, + "grad_norm": 0.03373273111678506, + "language_loss": 0.87846303, + "learning_rate": 4.206065974853479e-05, + "loss": 0.88889456, + "num_input_tokens_seen": 375885440, + "router_z_loss_mlp": 0.39697266, + "step": 4535, + "time_per_iteration": 2.7677438259124756 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104255, + "balance_loss_mlp": 1.00268626, + "epoch": 0.8726433243555214, + "flos": 444546271488.0, + "grad_norm": 0.05771786559784756, + "language_loss": 0.81709069, + "learning_rate": 4.193567838376888e-05, + "loss": 0.8275162, + "num_input_tokens_seen": 375952640, + "router_z_loss_mlp": 0.3984375, + "step": 4536, + "time_per_iteration": 2.616766929626465 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042442, + "balance_loss_mlp": 1.00272167, + "epoch": 0.8728357060407849, + "flos": 554235671040.0, + "grad_norm": 0.037464761752317666, + "language_loss": 0.82218051, + "learning_rate": 4.181087485534402e-05, + "loss": 0.83260494, + "num_input_tokens_seen": 376021648, + "router_z_loss_mlp": 0.39697266, + "step": 4537, + "time_per_iteration": 2.6978237628936768 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042626, + "balance_loss_mlp": 1.00276279, + "epoch": 0.8730280877260485, + "flos": 629019964416.0, + "grad_norm": 0.03588843210953962, + "language_loss": 0.78862292, + "learning_rate": 4.16862492117136e-05, + "loss": 0.79904926, + "num_input_tokens_seen": 376102304, + "router_z_loss_mlp": 0.3984375, + "step": 4538, + "time_per_iteration": 2.8252694606781006 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042261, + "balance_loss_mlp": 1.00244486, + "epoch": 0.873220469411312, + "flos": 536502188544.0, + "grad_norm": 0.03884922184031476, + "language_loss": 0.80336553, + "learning_rate": 4.156180150126143e-05, + "loss": 0.81378818, + "num_input_tokens_seen": 376177072, + "router_z_loss_mlp": 0.39794922, + "step": 4539, + "time_per_iteration": 2.75636625289917 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043096, + "balance_loss_mlp": 1.00337529, + "epoch": 0.8734128510965756, + "flos": 563001801216.0, + "grad_norm": 0.03313154117432453, + "language_loss": 0.84412879, + "learning_rate": 4.143753177230242e-05, + "loss": 0.85455978, + "num_input_tokens_seen": 376251376, + "router_z_loss_mlp": 0.39697266, + "step": 4540, + "time_per_iteration": 2.7357964515686035 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042748, + "balance_loss_mlp": 1.00307477, + "epoch": 0.8736052327818392, + "flos": 687804950784.0, + "grad_norm": 0.03579433558259156, + "language_loss": 0.79879081, + "learning_rate": 4.131344007308224e-05, + "loss": 0.80921829, + "num_input_tokens_seen": 376337104, + "router_z_loss_mlp": 0.39648438, + "step": 4541, + "time_per_iteration": 3.0047316551208496 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042403, + "balance_loss_mlp": 1.00275385, + "epoch": 0.8737976144671027, + "flos": 532833593856.0, + "grad_norm": 0.03592025812495919, + "language_loss": 0.8180542, + "learning_rate": 4.1189526451777816e-05, + "loss": 0.82847822, + "num_input_tokens_seen": 376415456, + "router_z_loss_mlp": 0.39624023, + "step": 4542, + "time_per_iteration": 2.797070026397705 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043195, + "balance_loss_mlp": 1.00356984, + "epoch": 0.8739899961523663, + "flos": 576730351872.0, + "grad_norm": 0.0332051158629443, + "language_loss": 0.82107216, + "learning_rate": 4.106579095649649e-05, + "loss": 0.83150411, + "num_input_tokens_seen": 376494880, + "router_z_loss_mlp": 0.39599609, + "step": 4543, + "time_per_iteration": 2.84247088432312 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043126, + "balance_loss_mlp": 1.0034529, + "epoch": 0.8741823778376299, + "flos": 732632952576.0, + "grad_norm": 0.04335551998495939, + "language_loss": 0.76707387, + "learning_rate": 4.094223363527666e-05, + "loss": 0.7775051, + "num_input_tokens_seen": 376571760, + "router_z_loss_mlp": 0.39648438, + "step": 4544, + "time_per_iteration": 2.920760154724121 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042177, + "balance_loss_mlp": 1.00245607, + "epoch": 0.8743747595228935, + "flos": 568222791168.0, + "grad_norm": 0.03891736625399162, + "language_loss": 0.84551966, + "learning_rate": 4.081885453608747e-05, + "loss": 0.85594141, + "num_input_tokens_seen": 376644464, + "router_z_loss_mlp": 0.39697266, + "step": 4545, + "time_per_iteration": 2.758371114730835 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01041918, + "balance_loss_mlp": 1.0021975, + "epoch": 0.8745671412081569, + "flos": 494395986432.0, + "grad_norm": 0.03573114845896075, + "language_loss": 0.82429254, + "learning_rate": 4.0695653706829095e-05, + "loss": 0.83471167, + "num_input_tokens_seen": 376709584, + "router_z_loss_mlp": 0.39697266, + "step": 4546, + "time_per_iteration": 2.593362808227539 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052857, + "balance_loss_mlp": 1.01320839, + "epoch": 0.8747595228934205, + "flos": 525167848704.0, + "grad_norm": 0.03540911918032387, + "language_loss": 0.83888662, + "learning_rate": 4.057263119533233e-05, + "loss": 0.84941518, + "num_input_tokens_seen": 376779472, + "router_z_loss_mlp": 0.39624023, + "step": 4547, + "time_per_iteration": 2.723700761795044 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050964, + "balance_loss_mlp": 1.01117158, + "epoch": 0.8749519045786841, + "flos": 745753957632.0, + "grad_norm": 0.043747965680807695, + "language_loss": 0.80443823, + "learning_rate": 4.044978704935853e-05, + "loss": 0.81494784, + "num_input_tokens_seen": 376863408, + "router_z_loss_mlp": 0.39770508, + "step": 4548, + "time_per_iteration": 3.02632474899292 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105381, + "balance_loss_mlp": 1.01404202, + "epoch": 0.8751442862639477, + "flos": 595384384512.0, + "grad_norm": 0.035676199240782205, + "language_loss": 0.80288255, + "learning_rate": 4.032712131660027e-05, + "loss": 0.81342065, + "num_input_tokens_seen": 376942080, + "router_z_loss_mlp": 0.39746094, + "step": 4549, + "time_per_iteration": 2.870236873626709 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052162, + "balance_loss_mlp": 1.01229811, + "epoch": 0.8753366679492113, + "flos": 497515361280.0, + "grad_norm": 0.03757184698075675, + "language_loss": 0.79065835, + "learning_rate": 4.020463404468055e-05, + "loss": 0.80118001, + "num_input_tokens_seen": 377015696, + "router_z_loss_mlp": 0.3984375, + "step": 4550, + "time_per_iteration": 2.7937049865722656 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047134, + "balance_loss_mlp": 1.0073179, + "epoch": 0.8755290496344748, + "flos": 490850846208.0, + "grad_norm": 0.03757399856308613, + "language_loss": 0.82482672, + "learning_rate": 4.0082325281153074e-05, + "loss": 0.83529806, + "num_input_tokens_seen": 377081424, + "router_z_loss_mlp": 0.39794922, + "step": 4551, + "time_per_iteration": 2.5715842247009277 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045971, + "balance_loss_mlp": 1.00596392, + "epoch": 0.8757214313197383, + "flos": 593072798976.0, + "grad_norm": 0.038733451202642565, + "language_loss": 0.8219583, + "learning_rate": 3.9960195073502345e-05, + "loss": 0.83241796, + "num_input_tokens_seen": 377159360, + "router_z_loss_mlp": 0.39990234, + "step": 4552, + "time_per_iteration": 2.8051164150238037 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043153, + "balance_loss_mlp": 1.00321829, + "epoch": 0.8759138130050019, + "flos": 978400548864.0, + "grad_norm": 0.039917821896877995, + "language_loss": 0.78999895, + "learning_rate": 3.9838243469143555e-05, + "loss": 0.80043048, + "num_input_tokens_seen": 377240704, + "router_z_loss_mlp": 0.39916992, + "step": 4553, + "time_per_iteration": 3.2010762691497803 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042774, + "balance_loss_mlp": 1.00286233, + "epoch": 0.8761061946902655, + "flos": 804206497536.0, + "grad_norm": 0.030968637089522598, + "language_loss": 0.78100884, + "learning_rate": 3.971647051542243e-05, + "loss": 0.79143655, + "num_input_tokens_seen": 377324176, + "router_z_loss_mlp": 0.39892578, + "step": 4554, + "time_per_iteration": 3.0976600646972656 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010449, + "balance_loss_mlp": 1.0050838, + "epoch": 0.8762985763755291, + "flos": 699848903424.0, + "grad_norm": 0.03651037459653682, + "language_loss": 0.75311875, + "learning_rate": 3.95948762596155e-05, + "loss": 0.76356781, + "num_input_tokens_seen": 377403440, + "router_z_loss_mlp": 0.39794922, + "step": 4555, + "time_per_iteration": 2.9630236625671387 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044972, + "balance_loss_mlp": 1.00518036, + "epoch": 0.8764909580607926, + "flos": 630928138752.0, + "grad_norm": 0.0411717836105296, + "language_loss": 0.80529356, + "learning_rate": 3.9473460748929765e-05, + "loss": 0.81574327, + "num_input_tokens_seen": 377483440, + "router_z_loss_mlp": 0.39770508, + "step": 4556, + "time_per_iteration": 2.9351165294647217 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044616, + "balance_loss_mlp": 1.00482428, + "epoch": 0.8766833397460562, + "flos": 482538671616.0, + "grad_norm": 0.03468954168211097, + "language_loss": 0.80526578, + "learning_rate": 3.935222403050304e-05, + "loss": 0.81571198, + "num_input_tokens_seen": 377554688, + "router_z_loss_mlp": 0.39770508, + "step": 4557, + "time_per_iteration": 2.673027276992798 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047126, + "balance_loss_mlp": 1.00738144, + "epoch": 0.8768757214313198, + "flos": 408618547968.0, + "grad_norm": 0.04259075794478707, + "language_loss": 0.78827333, + "learning_rate": 3.923116615140354e-05, + "loss": 0.79874456, + "num_input_tokens_seen": 377617616, + "router_z_loss_mlp": 0.3972168, + "step": 4558, + "time_per_iteration": 2.4583702087402344 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047121, + "balance_loss_mlp": 1.00742388, + "epoch": 0.8770681031165833, + "flos": 583657327104.0, + "grad_norm": 0.03927894736445276, + "language_loss": 0.82466614, + "learning_rate": 3.9110287158630076e-05, + "loss": 0.83513731, + "num_input_tokens_seen": 377685888, + "router_z_loss_mlp": 0.39672852, + "step": 4559, + "time_per_iteration": 2.6960582733154297 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045889, + "balance_loss_mlp": 1.00597811, + "epoch": 0.8772604848018468, + "flos": 509689571328.0, + "grad_norm": 0.052770592517426745, + "language_loss": 0.8140527, + "learning_rate": 3.8989587099111875e-05, + "loss": 0.82451165, + "num_input_tokens_seen": 377755744, + "router_z_loss_mlp": 0.39892578, + "step": 4560, + "time_per_iteration": 2.642340898513794 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046128, + "balance_loss_mlp": 1.00635993, + "epoch": 0.8774528664871104, + "flos": 409716987648.0, + "grad_norm": 0.05406487435943987, + "language_loss": 0.85231709, + "learning_rate": 3.886906601970913e-05, + "loss": 0.86277837, + "num_input_tokens_seen": 377818880, + "router_z_loss_mlp": 0.39746094, + "step": 4561, + "time_per_iteration": 2.5048179626464844 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049445, + "balance_loss_mlp": 1.00965309, + "epoch": 0.877645248172374, + "flos": 501870236160.0, + "grad_norm": 0.032221115056965136, + "language_loss": 0.83807063, + "learning_rate": 3.8748723967212184e-05, + "loss": 0.8485651, + "num_input_tokens_seen": 377893280, + "router_z_loss_mlp": 0.39770508, + "step": 4562, + "time_per_iteration": 2.7065954208374023 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104761, + "balance_loss_mlp": 1.00777078, + "epoch": 0.8778376298576376, + "flos": 634299280128.0, + "grad_norm": 0.05217941639464927, + "language_loss": 0.78634655, + "learning_rate": 3.862856098834189e-05, + "loss": 0.79682273, + "num_input_tokens_seen": 377972912, + "router_z_loss_mlp": 0.39819336, + "step": 4563, + "time_per_iteration": 2.9042325019836426 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046783, + "balance_loss_mlp": 1.00696707, + "epoch": 0.8780300115429012, + "flos": 535115043840.0, + "grad_norm": 0.03350020734303954, + "language_loss": 0.80624408, + "learning_rate": 3.850857712974976e-05, + "loss": 0.8167119, + "num_input_tokens_seen": 378054000, + "router_z_loss_mlp": 0.39794922, + "step": 4564, + "time_per_iteration": 2.8995606899261475 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046172, + "balance_loss_mlp": 1.00638008, + "epoch": 0.8782223932281646, + "flos": 512667995136.0, + "grad_norm": 0.03721004489901225, + "language_loss": 0.77783936, + "learning_rate": 3.838877243801758e-05, + "loss": 0.78830111, + "num_input_tokens_seen": 378120336, + "router_z_loss_mlp": 0.39770508, + "step": 4565, + "time_per_iteration": 2.6816246509552 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045267, + "balance_loss_mlp": 1.0053556, + "epoch": 0.8784147749134282, + "flos": 782246452224.0, + "grad_norm": 0.03741234984768557, + "language_loss": 0.70484185, + "learning_rate": 3.826914695965766e-05, + "loss": 0.71529448, + "num_input_tokens_seen": 378216672, + "router_z_loss_mlp": 0.39892578, + "step": 4566, + "time_per_iteration": 3.1608734130859375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045831, + "balance_loss_mlp": 1.00596738, + "epoch": 0.8786071565986918, + "flos": 562072502784.0, + "grad_norm": 0.04196008602434955, + "language_loss": 0.76287764, + "learning_rate": 3.814970074111279e-05, + "loss": 0.77333593, + "num_input_tokens_seen": 378287536, + "router_z_loss_mlp": 0.3984375, + "step": 4567, + "time_per_iteration": 2.685582160949707 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045936, + "balance_loss_mlp": 1.00612044, + "epoch": 0.8787995382839554, + "flos": 604652102400.0, + "grad_norm": 0.03250790005833066, + "language_loss": 0.7786507, + "learning_rate": 3.8030433828755926e-05, + "loss": 0.78911006, + "num_input_tokens_seen": 378362128, + "router_z_loss_mlp": 0.39794922, + "step": 4568, + "time_per_iteration": 2.825204372406006 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045572, + "balance_loss_mlp": 1.00573194, + "epoch": 0.8789919199692189, + "flos": 561290958336.0, + "grad_norm": 0.030274761831549164, + "language_loss": 0.85748357, + "learning_rate": 3.7911346268890924e-05, + "loss": 0.86793929, + "num_input_tokens_seen": 378435696, + "router_z_loss_mlp": 0.39819336, + "step": 4569, + "time_per_iteration": 2.700979471206665 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044534, + "balance_loss_mlp": 1.00483751, + "epoch": 0.8791843016544825, + "flos": 540153286656.0, + "grad_norm": 0.04159016368425986, + "language_loss": 0.83041501, + "learning_rate": 3.7792438107751405e-05, + "loss": 0.84086037, + "num_input_tokens_seen": 378505664, + "router_z_loss_mlp": 0.39672852, + "step": 4570, + "time_per_iteration": 2.6693015098571777 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045599, + "balance_loss_mlp": 1.00590241, + "epoch": 0.8793766833397461, + "flos": 1010405965824.0, + "grad_norm": 0.03564202822554104, + "language_loss": 0.79841989, + "learning_rate": 3.767370939150167e-05, + "loss": 0.80887592, + "num_input_tokens_seen": 378598016, + "router_z_loss_mlp": 0.39672852, + "step": 4571, + "time_per_iteration": 3.366255044937134 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045423, + "balance_loss_mlp": 1.00577366, + "epoch": 0.8795690650250096, + "flos": 679913683968.0, + "grad_norm": 0.033369767003960105, + "language_loss": 0.8118791, + "learning_rate": 3.755516016623628e-05, + "loss": 0.82233334, + "num_input_tokens_seen": 378676176, + "router_z_loss_mlp": 0.39624023, + "step": 4572, + "time_per_iteration": 2.8579459190368652 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048747, + "balance_loss_mlp": 1.00914609, + "epoch": 0.8797614467102732, + "flos": 454356406272.0, + "grad_norm": 0.03796242692369374, + "language_loss": 0.88904488, + "learning_rate": 3.7436790477980157e-05, + "loss": 0.89953238, + "num_input_tokens_seen": 378737952, + "router_z_loss_mlp": 0.39575195, + "step": 4573, + "time_per_iteration": 2.5628530979156494 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043011, + "balance_loss_mlp": 1.00333774, + "epoch": 0.8799538283955367, + "flos": 551973662976.0, + "grad_norm": 0.034330484826967635, + "language_loss": 0.85011613, + "learning_rate": 3.7318600372688526e-05, + "loss": 0.86054623, + "num_input_tokens_seen": 378806704, + "router_z_loss_mlp": 0.39648438, + "step": 4574, + "time_per_iteration": 2.699843645095825 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043294, + "balance_loss_mlp": 1.00357342, + "epoch": 0.8801462100808003, + "flos": 808860771072.0, + "grad_norm": 0.033758040666438734, + "language_loss": 0.84705424, + "learning_rate": 3.720058989624681e-05, + "loss": 0.8574872, + "num_input_tokens_seen": 378887616, + "router_z_loss_mlp": 0.39697266, + "step": 4575, + "time_per_iteration": 3.105905294418335 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042413, + "balance_loss_mlp": 1.00271654, + "epoch": 0.8803385917660639, + "flos": 770012949504.0, + "grad_norm": 0.03384904792749063, + "language_loss": 0.84867811, + "learning_rate": 3.708275909447079e-05, + "loss": 0.85910225, + "num_input_tokens_seen": 378964656, + "router_z_loss_mlp": 0.39672852, + "step": 4576, + "time_per_iteration": 2.9150800704956055 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042278, + "balance_loss_mlp": 1.00258136, + "epoch": 0.8805309734513275, + "flos": 568420122624.0, + "grad_norm": 0.03302719749229089, + "language_loss": 0.81569564, + "learning_rate": 3.696510801310632e-05, + "loss": 0.82611847, + "num_input_tokens_seen": 379036752, + "router_z_loss_mlp": 0.39672852, + "step": 4577, + "time_per_iteration": 2.7670326232910156 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104256, + "balance_loss_mlp": 1.00286317, + "epoch": 0.880723355136591, + "flos": 680977130496.0, + "grad_norm": 0.03426362906907379, + "language_loss": 0.81833982, + "learning_rate": 3.6847636697829755e-05, + "loss": 0.82876545, + "num_input_tokens_seen": 379106480, + "router_z_loss_mlp": 0.39672852, + "step": 4578, + "time_per_iteration": 2.8875744342803955 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042717, + "balance_loss_mlp": 1.0029968, + "epoch": 0.8809157368218545, + "flos": 566761769472.0, + "grad_norm": 0.03161907542086704, + "language_loss": 0.79385138, + "learning_rate": 3.673034519424734e-05, + "loss": 0.80427855, + "num_input_tokens_seen": 379182544, + "router_z_loss_mlp": 0.39697266, + "step": 4579, + "time_per_iteration": 2.7719390392303467 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043228, + "balance_loss_mlp": 1.00348318, + "epoch": 0.8811081185071181, + "flos": 516427963392.0, + "grad_norm": 0.031290606513753615, + "language_loss": 0.76370418, + "learning_rate": 3.661323354789586e-05, + "loss": 0.77413642, + "num_input_tokens_seen": 379255856, + "router_z_loss_mlp": 0.3972168, + "step": 4580, + "time_per_iteration": 2.720790147781372 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01041771, + "balance_loss_mlp": 1.00212181, + "epoch": 0.8813005001923817, + "flos": 595449513216.0, + "grad_norm": 0.038529988459892694, + "language_loss": 0.81993824, + "learning_rate": 3.649630180424191e-05, + "loss": 0.830356, + "num_input_tokens_seen": 379322704, + "router_z_loss_mlp": 0.39624023, + "step": 4581, + "time_per_iteration": 2.7015504837036133 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042754, + "balance_loss_mlp": 1.00310469, + "epoch": 0.8814928818776453, + "flos": 668186626560.0, + "grad_norm": 0.0360319379044657, + "language_loss": 0.79632461, + "learning_rate": 3.637955000868254e-05, + "loss": 0.8067522, + "num_input_tokens_seen": 379395008, + "router_z_loss_mlp": 0.39624023, + "step": 4582, + "time_per_iteration": 2.83176589012146 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042337, + "balance_loss_mlp": 1.00271213, + "epoch": 0.8816852635629088, + "flos": 610276503552.0, + "grad_norm": 0.03398118072297745, + "language_loss": 0.86405253, + "learning_rate": 3.626297820654467e-05, + "loss": 0.8744759, + "num_input_tokens_seen": 379465824, + "router_z_loss_mlp": 0.39599609, + "step": 4583, + "time_per_iteration": 2.741544485092163 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042297, + "balance_loss_mlp": 1.00260055, + "epoch": 0.8818776452481724, + "flos": 481375103232.0, + "grad_norm": 0.04470719780683464, + "language_loss": 0.82497907, + "learning_rate": 3.614658644308572e-05, + "loss": 0.83540201, + "num_input_tokens_seen": 379534960, + "router_z_loss_mlp": 0.39672852, + "step": 4584, + "time_per_iteration": 2.652020215988159 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042011, + "balance_loss_mlp": 1.00231421, + "epoch": 0.882070026933436, + "flos": 1047034553856.0, + "grad_norm": 0.037631390647256145, + "language_loss": 0.73706174, + "learning_rate": 3.60303747634928e-05, + "loss": 0.74748188, + "num_input_tokens_seen": 379617456, + "router_z_loss_mlp": 0.39672852, + "step": 4585, + "time_per_iteration": 3.3303396701812744 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045991, + "balance_loss_mlp": 1.00622249, + "epoch": 0.8822624086186995, + "flos": 475435752192.0, + "grad_norm": 0.034149923660639965, + "language_loss": 0.80189967, + "learning_rate": 3.591434321288345e-05, + "loss": 0.81235957, + "num_input_tokens_seen": 379687792, + "router_z_loss_mlp": 0.39746094, + "step": 4586, + "time_per_iteration": 2.7292559146881104 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045973, + "balance_loss_mlp": 1.00634825, + "epoch": 0.882454790303963, + "flos": 655222123776.0, + "grad_norm": 0.04063008203109671, + "language_loss": 0.82156307, + "learning_rate": 3.579849183630485e-05, + "loss": 0.83202279, + "num_input_tokens_seen": 379761120, + "router_z_loss_mlp": 0.39599609, + "step": 4587, + "time_per_iteration": 2.8225555419921875 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045855, + "balance_loss_mlp": 1.00613487, + "epoch": 0.8826471719892266, + "flos": 471304453632.0, + "grad_norm": 0.03549940663075914, + "language_loss": 0.78996181, + "learning_rate": 3.568282067873468e-05, + "loss": 0.80042034, + "num_input_tokens_seen": 379829008, + "router_z_loss_mlp": 0.39697266, + "step": 4588, + "time_per_iteration": 2.6043946743011475 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046048, + "balance_loss_mlp": 1.00637496, + "epoch": 0.8828395536744902, + "flos": 469767609600.0, + "grad_norm": 0.035767632266805204, + "language_loss": 0.84442842, + "learning_rate": 3.556732978508048e-05, + "loss": 0.8548888, + "num_input_tokens_seen": 379899584, + "router_z_loss_mlp": 0.39648438, + "step": 4589, + "time_per_iteration": 2.695120334625244 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045935, + "balance_loss_mlp": 1.00631011, + "epoch": 0.8830319353597538, + "flos": 722718805248.0, + "grad_norm": 0.03454304562764615, + "language_loss": 0.81774867, + "learning_rate": 3.545201920017971e-05, + "loss": 0.82820797, + "num_input_tokens_seen": 379979440, + "router_z_loss_mlp": 0.39599609, + "step": 4590, + "time_per_iteration": 2.9288313388824463 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043285, + "balance_loss_mlp": 1.00356412, + "epoch": 0.8832243170450174, + "flos": 444192437760.0, + "grad_norm": 0.03744601205071845, + "language_loss": 0.82025963, + "learning_rate": 3.5336888968799996e-05, + "loss": 0.83069241, + "num_input_tokens_seen": 380046944, + "router_z_loss_mlp": 0.39697266, + "step": 4591, + "time_per_iteration": 2.6343138217926025 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042563, + "balance_loss_mlp": 1.00284278, + "epoch": 0.8834166987302808, + "flos": 567747448320.0, + "grad_norm": 0.039220305101438216, + "language_loss": 0.82777518, + "learning_rate": 3.5221939135638756e-05, + "loss": 0.83820081, + "num_input_tokens_seen": 380118048, + "router_z_loss_mlp": 0.39697266, + "step": 4592, + "time_per_iteration": 2.7458250522613525 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042574, + "balance_loss_mlp": 1.00282991, + "epoch": 0.8836090804155444, + "flos": 610498134528.0, + "grad_norm": 0.03429970098026536, + "language_loss": 0.82727444, + "learning_rate": 3.510716974532352e-05, + "loss": 0.83770013, + "num_input_tokens_seen": 380192416, + "router_z_loss_mlp": 0.3972168, + "step": 4593, + "time_per_iteration": 2.82002592086792 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042679, + "balance_loss_mlp": 1.00288653, + "epoch": 0.883801462100808, + "flos": 558117148416.0, + "grad_norm": 0.03473522225274468, + "language_loss": 0.80918574, + "learning_rate": 3.4992580842411745e-05, + "loss": 0.81961256, + "num_input_tokens_seen": 380264432, + "router_z_loss_mlp": 0.39770508, + "step": 4594, + "time_per_iteration": 2.668245315551758 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042836, + "balance_loss_mlp": 1.00302017, + "epoch": 0.8839938437860716, + "flos": 517200759552.0, + "grad_norm": 0.041162800652707915, + "language_loss": 0.77860659, + "learning_rate": 3.487817247139064e-05, + "loss": 0.78903496, + "num_input_tokens_seen": 380334192, + "router_z_loss_mlp": 0.39794922, + "step": 4595, + "time_per_iteration": 2.6603221893310547 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104516, + "balance_loss_mlp": 1.00541556, + "epoch": 0.8841862254713351, + "flos": 714940299264.0, + "grad_norm": 0.038114639972197946, + "language_loss": 0.79199928, + "learning_rate": 3.47639446766777e-05, + "loss": 0.8024509, + "num_input_tokens_seen": 380407504, + "router_z_loss_mlp": 0.3972168, + "step": 4596, + "time_per_iteration": 2.84773588180542 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045345, + "balance_loss_mlp": 1.00562418, + "epoch": 0.8843786071565987, + "flos": 835379825664.0, + "grad_norm": 0.03386878842029177, + "language_loss": 0.83102214, + "learning_rate": 3.4649897502620095e-05, + "loss": 0.84147561, + "num_input_tokens_seen": 380486272, + "router_z_loss_mlp": 0.39697266, + "step": 4597, + "time_per_iteration": 3.067197322845459 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048234, + "balance_loss_mlp": 1.00858498, + "epoch": 0.8845709888418622, + "flos": 658179160320.0, + "grad_norm": 0.030983419501464857, + "language_loss": 0.83426988, + "learning_rate": 3.453603099349462e-05, + "loss": 0.84475219, + "num_input_tokens_seen": 380568480, + "router_z_loss_mlp": 0.39624023, + "step": 4598, + "time_per_iteration": 2.8990516662597656 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043971, + "balance_loss_mlp": 1.00432217, + "epoch": 0.8847633705271258, + "flos": 524484480768.0, + "grad_norm": 0.03200381379307614, + "language_loss": 0.81129134, + "learning_rate": 3.442234519350823e-05, + "loss": 0.82173103, + "num_input_tokens_seen": 380643088, + "router_z_loss_mlp": 0.39624023, + "step": 4599, + "time_per_iteration": 2.739694118499756 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043927, + "balance_loss_mlp": 1.00425434, + "epoch": 0.8849557522123894, + "flos": 549637777920.0, + "grad_norm": 0.03709178353655612, + "language_loss": 0.84963202, + "learning_rate": 3.430884014679786e-05, + "loss": 0.86007124, + "num_input_tokens_seen": 380714512, + "router_z_loss_mlp": 0.39648438, + "step": 4600, + "time_per_iteration": 2.6870527267456055 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044969, + "balance_loss_mlp": 1.00534427, + "epoch": 0.8851481338976529, + "flos": 623584146432.0, + "grad_norm": 0.03445259220220357, + "language_loss": 0.83782369, + "learning_rate": 3.4195515897429974e-05, + "loss": 0.84827334, + "num_input_tokens_seen": 380789168, + "router_z_loss_mlp": 0.39599609, + "step": 4601, + "time_per_iteration": 2.778261423110962 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043724, + "balance_loss_mlp": 1.00400329, + "epoch": 0.8853405155829165, + "flos": 445308374016.0, + "grad_norm": 0.0338712411878003, + "language_loss": 0.81439084, + "learning_rate": 3.408237248940088e-05, + "loss": 0.82482803, + "num_input_tokens_seen": 380856992, + "router_z_loss_mlp": 0.39697266, + "step": 4602, + "time_per_iteration": 2.5989644527435303 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043327, + "balance_loss_mlp": 1.00365448, + "epoch": 0.8855328972681801, + "flos": 731749340928.0, + "grad_norm": 0.035815900220076725, + "language_loss": 0.78796673, + "learning_rate": 3.396940996663683e-05, + "loss": 0.79839998, + "num_input_tokens_seen": 380930480, + "router_z_loss_mlp": 0.39648438, + "step": 4603, + "time_per_iteration": 2.893944025039673 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042283, + "balance_loss_mlp": 1.00261056, + "epoch": 0.8857252789534437, + "flos": 488356513536.0, + "grad_norm": 0.035459193509833585, + "language_loss": 0.7936362, + "learning_rate": 3.385662837299375e-05, + "loss": 0.80405909, + "num_input_tokens_seen": 380994192, + "router_z_loss_mlp": 0.39648438, + "step": 4604, + "time_per_iteration": 2.5552213191986084 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104218, + "balance_loss_mlp": 1.00250709, + "epoch": 0.8859176606387072, + "flos": 509622497280.0, + "grad_norm": 0.043634730763989146, + "language_loss": 0.82432818, + "learning_rate": 3.374402775225727e-05, + "loss": 0.83474994, + "num_input_tokens_seen": 381066848, + "router_z_loss_mlp": 0.39648438, + "step": 4605, + "time_per_iteration": 2.6891160011291504 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01041456, + "balance_loss_mlp": 1.00173521, + "epoch": 0.8861100423239707, + "flos": 517665408768.0, + "grad_norm": 0.034049805393931584, + "language_loss": 0.86205089, + "learning_rate": 3.3631608148142925e-05, + "loss": 0.87246549, + "num_input_tokens_seen": 381138816, + "router_z_loss_mlp": 0.39697266, + "step": 4606, + "time_per_iteration": 2.65995192527771 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01041919, + "balance_loss_mlp": 1.00222278, + "epoch": 0.8863024240092343, + "flos": 628110107904.0, + "grad_norm": 0.03497175764411169, + "language_loss": 0.79680067, + "learning_rate": 3.3519369604295746e-05, + "loss": 0.80721992, + "num_input_tokens_seen": 381208448, + "router_z_loss_mlp": 0.39672852, + "step": 4607, + "time_per_iteration": 2.763823986053467 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01041821, + "balance_loss_mlp": 1.00214839, + "epoch": 0.8864948056944979, + "flos": 768298215936.0, + "grad_norm": 0.03212118258381081, + "language_loss": 0.83579981, + "learning_rate": 3.340731216429083e-05, + "loss": 0.84621805, + "num_input_tokens_seen": 381289712, + "router_z_loss_mlp": 0.39648438, + "step": 4608, + "time_per_iteration": 2.987773895263672 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052631, + "balance_loss_mlp": 1.01381683, + "epoch": 0.8866871873797615, + "flos": 1505668055808.0, + "grad_norm": 0.013420842661803037, + "language_loss": 0.78830957, + "learning_rate": 3.329543587163253e-05, + "loss": 0.79883587, + "num_input_tokens_seen": 381520848, + "router_z_loss_mlp": 0.38769531, + "step": 4609, + "time_per_iteration": 4.8158485889434814 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051288, + "balance_loss_mlp": 1.01142478, + "epoch": 0.886879569065025, + "flos": 812928886272.0, + "grad_norm": 0.035689455717370554, + "language_loss": 0.8246606, + "learning_rate": 3.3183740769755e-05, + "loss": 0.83517349, + "num_input_tokens_seen": 381603008, + "router_z_loss_mlp": 0.3984375, + "step": 4610, + "time_per_iteration": 3.1036856174468994 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052158, + "balance_loss_mlp": 1.01334381, + "epoch": 0.8870719507502886, + "flos": 1586226449664.0, + "grad_norm": 0.009608568330461998, + "language_loss": 0.7691083, + "learning_rate": 3.307222690202238e-05, + "loss": 0.77962995, + "num_input_tokens_seen": 381844336, + "router_z_loss_mlp": 0.38769531, + "step": 4611, + "time_per_iteration": 4.96184229850769 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104835, + "balance_loss_mlp": 1.00853372, + "epoch": 0.8872643324355521, + "flos": 635165395200.0, + "grad_norm": 0.035989932069672784, + "language_loss": 0.75603622, + "learning_rate": 3.296089431172811e-05, + "loss": 0.76651973, + "num_input_tokens_seen": 381918576, + "router_z_loss_mlp": 0.39794922, + "step": 4612, + "time_per_iteration": 2.736562967300415 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104857, + "balance_loss_mlp": 1.00877833, + "epoch": 0.8874567141208157, + "flos": 536784090624.0, + "grad_norm": 0.038218282698344784, + "language_loss": 0.83649206, + "learning_rate": 3.284974304209532e-05, + "loss": 0.84697771, + "num_input_tokens_seen": 381987296, + "router_z_loss_mlp": 0.39770508, + "step": 4613, + "time_per_iteration": 2.6362390518188477 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048764, + "balance_loss_mlp": 1.00890064, + "epoch": 0.8876490958060793, + "flos": 1568719478784.0, + "grad_norm": 0.032946530708496874, + "language_loss": 0.79828751, + "learning_rate": 3.27387731362766e-05, + "loss": 0.80877519, + "num_input_tokens_seen": 382091744, + "router_z_loss_mlp": 0.3984375, + "step": 4614, + "time_per_iteration": 3.8524417877197266 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048994, + "balance_loss_mlp": 1.00917864, + "epoch": 0.8878414774913428, + "flos": 637798733568.0, + "grad_norm": 0.034427393755430906, + "language_loss": 0.85503703, + "learning_rate": 3.2627984637354444e-05, + "loss": 0.86552697, + "num_input_tokens_seen": 382169600, + "router_z_loss_mlp": 0.39794922, + "step": 4615, + "time_per_iteration": 2.8285470008850098 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048838, + "balance_loss_mlp": 1.0090456, + "epoch": 0.8880338591766064, + "flos": 497422042368.0, + "grad_norm": 0.04825958458994101, + "language_loss": 0.81953943, + "learning_rate": 3.251737758834084e-05, + "loss": 0.83002782, + "num_input_tokens_seen": 382238336, + "router_z_loss_mlp": 0.39770508, + "step": 4616, + "time_per_iteration": 2.6075775623321533 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104902, + "balance_loss_mlp": 1.00918043, + "epoch": 0.88822624086187, + "flos": 543913254912.0, + "grad_norm": 0.03727530825330057, + "language_loss": 0.80842733, + "learning_rate": 3.2406952032177086e-05, + "loss": 0.81891757, + "num_input_tokens_seen": 382308560, + "router_z_loss_mlp": 0.39819336, + "step": 4617, + "time_per_iteration": 2.636955976486206 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048789, + "balance_loss_mlp": 1.00897348, + "epoch": 0.8884186225471336, + "flos": 552876716544.0, + "grad_norm": 0.04548062549247532, + "language_loss": 0.84271151, + "learning_rate": 3.229670801173418e-05, + "loss": 0.85319942, + "num_input_tokens_seen": 382377504, + "router_z_loss_mlp": 0.39794922, + "step": 4618, + "time_per_iteration": 2.6750118732452393 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050083, + "balance_loss_mlp": 1.01126862, + "epoch": 0.888611004232397, + "flos": 1568662108416.0, + "grad_norm": 0.005691053324610859, + "language_loss": 0.78512192, + "learning_rate": 3.218664556981288e-05, + "loss": 0.79562283, + "num_input_tokens_seen": 382615728, + "router_z_loss_mlp": 0.38769531, + "step": 4619, + "time_per_iteration": 5.008893013000488 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049586, + "balance_loss_mlp": 1.0098182, + "epoch": 0.8888033859176606, + "flos": 768437221632.0, + "grad_norm": 0.03094273540620185, + "language_loss": 0.82804537, + "learning_rate": 3.207676474914301e-05, + "loss": 0.83854127, + "num_input_tokens_seen": 382695552, + "router_z_loss_mlp": 0.39746094, + "step": 4620, + "time_per_iteration": 3.0561673641204834 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010472, + "balance_loss_mlp": 1.00743198, + "epoch": 0.8889957676029242, + "flos": 935649862656.0, + "grad_norm": 0.04954559323619426, + "language_loss": 0.84685308, + "learning_rate": 3.1967065592384105e-05, + "loss": 0.85732502, + "num_input_tokens_seen": 382775824, + "router_z_loss_mlp": 0.39746094, + "step": 4621, + "time_per_iteration": 3.136061906814575 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046932, + "balance_loss_mlp": 1.00704467, + "epoch": 0.8891881492881878, + "flos": 590793294336.0, + "grad_norm": 0.03559696116277189, + "language_loss": 0.82163578, + "learning_rate": 3.1857548142125104e-05, + "loss": 0.8321051, + "num_input_tokens_seen": 382854464, + "router_z_loss_mlp": 0.39868164, + "step": 4622, + "time_per_iteration": 2.76167893409729 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046684, + "balance_loss_mlp": 1.00677288, + "epoch": 0.8893805309734514, + "flos": 541844687616.0, + "grad_norm": 0.03943600500029392, + "language_loss": 0.82868516, + "learning_rate": 3.174821244088466e-05, + "loss": 0.83915204, + "num_input_tokens_seen": 382925088, + "router_z_loss_mlp": 0.39892578, + "step": 4623, + "time_per_iteration": 2.6835789680480957 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046671, + "balance_loss_mlp": 1.00680768, + "epoch": 0.8895729126587149, + "flos": 561169449216.0, + "grad_norm": 0.037543227247628215, + "language_loss": 0.82209378, + "learning_rate": 3.163905853111054e-05, + "loss": 0.83256048, + "num_input_tokens_seen": 382998640, + "router_z_loss_mlp": 0.3984375, + "step": 4624, + "time_per_iteration": 2.6866161823272705 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048683, + "balance_loss_mlp": 1.00870013, + "epoch": 0.8897652943439784, + "flos": 611281624320.0, + "grad_norm": 0.03433233415002547, + "language_loss": 0.81767857, + "learning_rate": 3.153008645517996e-05, + "loss": 0.82816535, + "num_input_tokens_seen": 383076000, + "router_z_loss_mlp": 0.3996582, + "step": 4625, + "time_per_iteration": 2.724318027496338 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048524, + "balance_loss_mlp": 1.00863683, + "epoch": 0.889957676029242, + "flos": 919425033984.0, + "grad_norm": 0.04012941431513975, + "language_loss": 0.77691996, + "learning_rate": 3.142129625539969e-05, + "loss": 0.78740519, + "num_input_tokens_seen": 383166640, + "router_z_loss_mlp": 0.39868164, + "step": 4626, + "time_per_iteration": 3.186610698699951 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104633, + "balance_loss_mlp": 1.00651431, + "epoch": 0.8901500577145056, + "flos": 489687277824.0, + "grad_norm": 0.03850246921489707, + "language_loss": 0.81163925, + "learning_rate": 3.131268797400588e-05, + "loss": 0.82210255, + "num_input_tokens_seen": 383232928, + "router_z_loss_mlp": 0.39794922, + "step": 4627, + "time_per_iteration": 2.555154800415039 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046203, + "balance_loss_mlp": 1.0062921, + "epoch": 0.8903424393997691, + "flos": 734914402560.0, + "grad_norm": 0.03734341343229868, + "language_loss": 0.81121147, + "learning_rate": 3.120426165316398e-05, + "loss": 0.82167351, + "num_input_tokens_seen": 383314352, + "router_z_loss_mlp": 0.39892578, + "step": 4628, + "time_per_iteration": 2.997708797454834 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046433, + "balance_loss_mlp": 1.00654614, + "epoch": 0.8905348210850327, + "flos": 520884905472.0, + "grad_norm": 0.0340998125038295, + "language_loss": 0.82300949, + "learning_rate": 3.109601733496881e-05, + "loss": 0.83347386, + "num_input_tokens_seen": 383384848, + "router_z_loss_mlp": 0.39868164, + "step": 4629, + "time_per_iteration": 2.6674559116363525 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046772, + "balance_loss_mlp": 1.00690854, + "epoch": 0.8907272027702963, + "flos": 580199669760.0, + "grad_norm": 0.03316770819427237, + "language_loss": 0.80315387, + "learning_rate": 3.098795506144458e-05, + "loss": 0.81362164, + "num_input_tokens_seen": 383463360, + "router_z_loss_mlp": 0.3984375, + "step": 4630, + "time_per_iteration": 2.819411039352417 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047037, + "balance_loss_mlp": 1.0072211, + "epoch": 0.8909195844555599, + "flos": 895115497728.0, + "grad_norm": 0.035411813999275225, + "language_loss": 0.79863322, + "learning_rate": 3.088007487454475e-05, + "loss": 0.80910361, + "num_input_tokens_seen": 383542080, + "router_z_loss_mlp": 0.39794922, + "step": 4631, + "time_per_iteration": 3.109464406967163 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047287, + "balance_loss_mlp": 1.00747073, + "epoch": 0.8911119661408234, + "flos": 550949100288.0, + "grad_norm": 0.03590303860632242, + "language_loss": 0.84888053, + "learning_rate": 3.077237681615208e-05, + "loss": 0.85935342, + "num_input_tokens_seen": 383613056, + "router_z_loss_mlp": 0.39794922, + "step": 4632, + "time_per_iteration": 2.631802558898926 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104734, + "balance_loss_mlp": 1.00747597, + "epoch": 0.8913043478260869, + "flos": 482165395968.0, + "grad_norm": 0.04494839545328405, + "language_loss": 0.84571874, + "learning_rate": 3.066486092807874e-05, + "loss": 0.85619211, + "num_input_tokens_seen": 383683280, + "router_z_loss_mlp": 0.3984375, + "step": 4633, + "time_per_iteration": 2.6622323989868164 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049593, + "balance_loss_mlp": 1.0098244, + "epoch": 0.8914967295113505, + "flos": 485645407488.0, + "grad_norm": 0.03485029378747491, + "language_loss": 0.85605252, + "learning_rate": 3.055752725206601e-05, + "loss": 0.86654842, + "num_input_tokens_seen": 383754624, + "router_z_loss_mlp": 0.39746094, + "step": 4634, + "time_per_iteration": 2.649674892425537 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049227, + "balance_loss_mlp": 1.00950658, + "epoch": 0.8916891111966141, + "flos": 446593451520.0, + "grad_norm": 0.03644338611466098, + "language_loss": 0.81789589, + "learning_rate": 3.0450375829784714e-05, + "loss": 0.82838821, + "num_input_tokens_seen": 383821984, + "router_z_loss_mlp": 0.39697266, + "step": 4635, + "time_per_iteration": 2.5484728813171387 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048488, + "balance_loss_mlp": 1.00864804, + "epoch": 0.8918814928818777, + "flos": 565079116800.0, + "grad_norm": 0.03346108733120089, + "language_loss": 0.788185, + "learning_rate": 3.034340670283453e-05, + "loss": 0.79866982, + "num_input_tokens_seen": 383890880, + "router_z_loss_mlp": 0.39819336, + "step": 4636, + "time_per_iteration": 2.728586435317993 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048649, + "balance_loss_mlp": 1.00878584, + "epoch": 0.8920738745671412, + "flos": 577029750528.0, + "grad_norm": 0.03483769902924038, + "language_loss": 0.81798345, + "learning_rate": 3.0236619912744513e-05, + "loss": 0.82846999, + "num_input_tokens_seen": 383962480, + "router_z_loss_mlp": 0.3984375, + "step": 4637, + "time_per_iteration": 2.662823438644409 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104713, + "balance_loss_mlp": 1.00724292, + "epoch": 0.8922662562524047, + "flos": 621315335424.0, + "grad_norm": 0.0330933366062548, + "language_loss": 0.84552616, + "learning_rate": 3.0130015500973163e-05, + "loss": 0.85599744, + "num_input_tokens_seen": 384033616, + "router_z_loss_mlp": 0.39868164, + "step": 4638, + "time_per_iteration": 2.7474730014801025 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044649, + "balance_loss_mlp": 1.00476134, + "epoch": 0.8924586379376683, + "flos": 584808256512.0, + "grad_norm": 0.03843989347022111, + "language_loss": 0.79864812, + "learning_rate": 3.0023593508907877e-05, + "loss": 0.80909455, + "num_input_tokens_seen": 384108848, + "router_z_loss_mlp": 0.39868164, + "step": 4639, + "time_per_iteration": 2.7433924674987793 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044951, + "balance_loss_mlp": 1.00508726, + "epoch": 0.8926510196229319, + "flos": 526201159680.0, + "grad_norm": 0.04548508479610925, + "language_loss": 0.82137775, + "learning_rate": 2.991735397786538e-05, + "loss": 0.83182728, + "num_input_tokens_seen": 384185728, + "router_z_loss_mlp": 0.3984375, + "step": 4640, + "time_per_iteration": 2.745567798614502 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045023, + "balance_loss_mlp": 1.00511181, + "epoch": 0.8928434013081955, + "flos": 487640097792.0, + "grad_norm": 0.04163390678432403, + "language_loss": 0.81369799, + "learning_rate": 2.981129694909146e-05, + "loss": 0.8241483, + "num_input_tokens_seen": 384251552, + "router_z_loss_mlp": 0.39892578, + "step": 4641, + "time_per_iteration": 2.5342392921447754 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049755, + "balance_loss_mlp": 1.01094055, + "epoch": 0.893035782993459, + "flos": 1451201984256.0, + "grad_norm": 0.009146057104072083, + "language_loss": 0.80330861, + "learning_rate": 2.970542246376118e-05, + "loss": 0.81380612, + "num_input_tokens_seen": 384472176, + "router_z_loss_mlp": 0.38769531, + "step": 4642, + "time_per_iteration": 4.708850860595703 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045829, + "balance_loss_mlp": 1.00601351, + "epoch": 0.8932281646787226, + "flos": 612445192704.0, + "grad_norm": 0.03907380581076919, + "language_loss": 0.81196648, + "learning_rate": 2.95997305629786e-05, + "loss": 0.82242477, + "num_input_tokens_seen": 384544224, + "router_z_loss_mlp": 0.39794922, + "step": 4643, + "time_per_iteration": 2.782482385635376 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045296, + "balance_loss_mlp": 1.00548053, + "epoch": 0.8934205463639862, + "flos": 566828843520.0, + "grad_norm": 0.03557987256456931, + "language_loss": 0.84996665, + "learning_rate": 2.9494221287776957e-05, + "loss": 0.86041963, + "num_input_tokens_seen": 384611728, + "router_z_loss_mlp": 0.39794922, + "step": 4644, + "time_per_iteration": 2.632826089859009 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042235, + "balance_loss_mlp": 1.00230014, + "epoch": 0.8936129280492497, + "flos": 489435511296.0, + "grad_norm": 0.041094605368718444, + "language_loss": 0.78782856, + "learning_rate": 2.9388894679118484e-05, + "loss": 0.79825091, + "num_input_tokens_seen": 384678048, + "router_z_loss_mlp": 0.39916992, + "step": 4645, + "time_per_iteration": 2.564196825027466 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042291, + "balance_loss_mlp": 1.00242722, + "epoch": 0.8938053097345132, + "flos": 888075761664.0, + "grad_norm": 0.03418718196027921, + "language_loss": 0.8146354, + "learning_rate": 2.9283750777894912e-05, + "loss": 0.82505834, + "num_input_tokens_seen": 384766768, + "router_z_loss_mlp": 0.3984375, + "step": 4646, + "time_per_iteration": 3.25028920173645 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042378, + "balance_loss_mlp": 1.00253797, + "epoch": 0.8939976914197768, + "flos": 594433698816.0, + "grad_norm": 0.036470108367040274, + "language_loss": 0.84610659, + "learning_rate": 2.9178789624926427e-05, + "loss": 0.85653043, + "num_input_tokens_seen": 384842352, + "router_z_loss_mlp": 0.39819336, + "step": 4647, + "time_per_iteration": 2.7234268188476562 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046732, + "balance_loss_mlp": 1.00689232, + "epoch": 0.8941900731050404, + "flos": 524310481920.0, + "grad_norm": 0.04108443023943927, + "language_loss": 0.81452197, + "learning_rate": 2.9074011260962706e-05, + "loss": 0.82498932, + "num_input_tokens_seen": 384912048, + "router_z_loss_mlp": 0.39819336, + "step": 4648, + "time_per_iteration": 2.629329204559326 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046685, + "balance_loss_mlp": 1.00691688, + "epoch": 0.894382454790304, + "flos": 801928938240.0, + "grad_norm": 0.03703495030211325, + "language_loss": 0.81576788, + "learning_rate": 2.8969415726682158e-05, + "loss": 0.82623482, + "num_input_tokens_seen": 384986560, + "router_z_loss_mlp": 0.39746094, + "step": 4649, + "time_per_iteration": 2.995352268218994 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046385, + "balance_loss_mlp": 1.00668859, + "epoch": 0.8945748364755676, + "flos": 480061835520.0, + "grad_norm": 0.03242427770731825, + "language_loss": 0.85382026, + "learning_rate": 2.8865003062692517e-05, + "loss": 0.86428416, + "num_input_tokens_seen": 385057376, + "router_z_loss_mlp": 0.39672852, + "step": 4650, + "time_per_iteration": 2.583432674407959 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042358, + "balance_loss_mlp": 1.00242329, + "epoch": 0.894767218160831, + "flos": 509854821888.0, + "grad_norm": 0.037913827599498295, + "language_loss": 0.83491743, + "learning_rate": 2.876077330953042e-05, + "loss": 0.84534097, + "num_input_tokens_seen": 385130880, + "router_z_loss_mlp": 0.39916992, + "step": 4651, + "time_per_iteration": 2.6782608032226562 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042301, + "balance_loss_mlp": 1.00243759, + "epoch": 0.8949595998460946, + "flos": 687064235520.0, + "grad_norm": 0.036736331203873075, + "language_loss": 0.82394856, + "learning_rate": 2.8656726507661378e-05, + "loss": 0.83437157, + "num_input_tokens_seen": 385205808, + "router_z_loss_mlp": 0.3984375, + "step": 4652, + "time_per_iteration": 2.8787331581115723 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042316, + "balance_loss_mlp": 1.00242865, + "epoch": 0.8951519815313582, + "flos": 801295147776.0, + "grad_norm": 0.038252435154000265, + "language_loss": 0.7746408, + "learning_rate": 2.855286269747981e-05, + "loss": 0.78506398, + "num_input_tokens_seen": 385283616, + "router_z_loss_mlp": 0.39868164, + "step": 4653, + "time_per_iteration": 3.003697156906128 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042051, + "balance_loss_mlp": 1.00218761, + "epoch": 0.8953443632166218, + "flos": 667936805376.0, + "grad_norm": 0.038098195849768056, + "language_loss": 0.86601067, + "learning_rate": 2.8449181919309398e-05, + "loss": 0.87643117, + "num_input_tokens_seen": 385357488, + "router_z_loss_mlp": 0.3984375, + "step": 4654, + "time_per_iteration": 2.7735435962677 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043081, + "balance_loss_mlp": 1.00328851, + "epoch": 0.8955367449018854, + "flos": 646211030016.0, + "grad_norm": 0.03546786074364338, + "language_loss": 0.8345741, + "learning_rate": 2.8345684213402556e-05, + "loss": 0.84500492, + "num_input_tokens_seen": 385431280, + "router_z_loss_mlp": 0.39770508, + "step": 4655, + "time_per_iteration": 2.831127166748047 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104265, + "balance_loss_mlp": 1.00278592, + "epoch": 0.8957291265871489, + "flos": 810163345152.0, + "grad_norm": 0.0374122133806325, + "language_loss": 0.77918243, + "learning_rate": 2.8242369619940644e-05, + "loss": 0.78960896, + "num_input_tokens_seen": 385509840, + "router_z_loss_mlp": 0.3984375, + "step": 4656, + "time_per_iteration": 3.0841567516326904 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043156, + "balance_loss_mlp": 1.00331628, + "epoch": 0.8959215082724125, + "flos": 519964355328.0, + "grad_norm": 0.0357969961625874, + "language_loss": 0.77545249, + "learning_rate": 2.813923817903391e-05, + "loss": 0.78588402, + "num_input_tokens_seen": 385580384, + "router_z_loss_mlp": 0.39819336, + "step": 4657, + "time_per_iteration": 2.6084063053131104 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043013, + "balance_loss_mlp": 1.00317335, + "epoch": 0.896113889957676, + "flos": 477912588288.0, + "grad_norm": 0.03521425072666777, + "language_loss": 0.77677613, + "learning_rate": 2.8036289930721603e-05, + "loss": 0.78720629, + "num_input_tokens_seen": 385649184, + "router_z_loss_mlp": 0.39819336, + "step": 4658, + "time_per_iteration": 2.5991733074188232 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043145, + "balance_loss_mlp": 1.00330484, + "epoch": 0.8963062716429396, + "flos": 519174062592.0, + "grad_norm": 0.03519101773550154, + "language_loss": 0.83259702, + "learning_rate": 2.7933524914971697e-05, + "loss": 0.84302849, + "num_input_tokens_seen": 385717072, + "router_z_loss_mlp": 0.39819336, + "step": 4659, + "time_per_iteration": 2.615996837615967 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042122, + "balance_loss_mlp": 1.00221038, + "epoch": 0.8964986533282031, + "flos": 509502933504.0, + "grad_norm": 0.038458776381427445, + "language_loss": 0.82031912, + "learning_rate": 2.7830943171681113e-05, + "loss": 0.83074033, + "num_input_tokens_seen": 385788880, + "router_z_loss_mlp": 0.39892578, + "step": 4660, + "time_per_iteration": 2.6992716789245605 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042516, + "balance_loss_mlp": 1.00260508, + "epoch": 0.8966910350134667, + "flos": 537109734144.0, + "grad_norm": 0.043189430842781595, + "language_loss": 0.81613052, + "learning_rate": 2.77285447406756e-05, + "loss": 0.82655567, + "num_input_tokens_seen": 385854240, + "router_z_loss_mlp": 0.39892578, + "step": 4661, + "time_per_iteration": 2.6292388439178467 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042305, + "balance_loss_mlp": 1.00244117, + "epoch": 0.8968834166987303, + "flos": 724498667520.0, + "grad_norm": 0.03837822461358079, + "language_loss": 0.84645671, + "learning_rate": 2.7626329661709914e-05, + "loss": 0.85687977, + "num_input_tokens_seen": 385926080, + "router_z_loss_mlp": 0.3984375, + "step": 4662, + "time_per_iteration": 2.8729970455169678 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042136, + "balance_loss_mlp": 1.00231993, + "epoch": 0.8970757983839939, + "flos": 682948488192.0, + "grad_norm": 0.03094905267903294, + "language_loss": 0.84279275, + "learning_rate": 2.7524297974467372e-05, + "loss": 0.85321409, + "num_input_tokens_seen": 386005696, + "router_z_loss_mlp": 0.39794922, + "step": 4663, + "time_per_iteration": 2.906511068344116 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104132, + "balance_loss_mlp": 1.00143254, + "epoch": 0.8972681800692575, + "flos": 614157980928.0, + "grad_norm": 0.04620542900915945, + "language_loss": 0.75984728, + "learning_rate": 2.742244971856006e-05, + "loss": 0.77026045, + "num_input_tokens_seen": 386073248, + "router_z_loss_mlp": 0.39868164, + "step": 4664, + "time_per_iteration": 2.7156736850738525 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01040999, + "balance_loss_mlp": 1.00118363, + "epoch": 0.8974605617545209, + "flos": 573500161536.0, + "grad_norm": 0.03609164505863249, + "language_loss": 0.83621204, + "learning_rate": 2.732078493352913e-05, + "loss": 0.84662199, + "num_input_tokens_seen": 386148528, + "router_z_loss_mlp": 0.39794922, + "step": 4665, + "time_per_iteration": 2.752067804336548 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01041893, + "balance_loss_mlp": 1.0020771, + "epoch": 0.8976529434397845, + "flos": 521508002304.0, + "grad_norm": 0.0665418568893367, + "language_loss": 0.87346292, + "learning_rate": 2.721930365884434e-05, + "loss": 0.88388181, + "num_input_tokens_seen": 386218528, + "router_z_loss_mlp": 0.39794922, + "step": 4666, + "time_per_iteration": 2.628237247467041 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01041533, + "balance_loss_mlp": 1.00166953, + "epoch": 0.8978453251250481, + "flos": 472283329536.0, + "grad_norm": 0.0312359439189668, + "language_loss": 0.83060151, + "learning_rate": 2.7118005933904176e-05, + "loss": 0.84101683, + "num_input_tokens_seen": 386284704, + "router_z_loss_mlp": 0.3984375, + "step": 4667, + "time_per_iteration": 2.632169008255005 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01041569, + "balance_loss_mlp": 1.00170541, + "epoch": 0.8980377068103117, + "flos": 592822977792.0, + "grad_norm": 0.030629591551058093, + "language_loss": 0.82659423, + "learning_rate": 2.7016891798035904e-05, + "loss": 0.83700991, + "num_input_tokens_seen": 386356128, + "router_z_loss_mlp": 0.3984375, + "step": 4668, + "time_per_iteration": 2.767726421356201 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045376, + "balance_loss_mlp": 1.0056082, + "epoch": 0.8982300884955752, + "flos": 768951448320.0, + "grad_norm": 0.036651937719319225, + "language_loss": 0.83371913, + "learning_rate": 2.691596129049556e-05, + "loss": 0.84417284, + "num_input_tokens_seen": 386434048, + "router_z_loss_mlp": 0.39746094, + "step": 4669, + "time_per_iteration": 2.9341423511505127 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045744, + "balance_loss_mlp": 1.00602329, + "epoch": 0.8984224701808388, + "flos": 846126061824.0, + "grad_norm": 0.036907430461080686, + "language_loss": 0.77928305, + "learning_rate": 2.681521445046775e-05, + "loss": 0.7897405, + "num_input_tokens_seen": 386532384, + "router_z_loss_mlp": 0.39697266, + "step": 4670, + "time_per_iteration": 3.222792625427246 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045725, + "balance_loss_mlp": 1.00593269, + "epoch": 0.8986148518661023, + "flos": 759100484352.0, + "grad_norm": 0.03395344580727902, + "language_loss": 0.76753604, + "learning_rate": 2.6714651317065963e-05, + "loss": 0.77799332, + "num_input_tokens_seen": 386627120, + "router_z_loss_mlp": 0.39770508, + "step": 4671, + "time_per_iteration": 3.153036594390869 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045308, + "balance_loss_mlp": 1.00551569, + "epoch": 0.8988072335513659, + "flos": 564147873024.0, + "grad_norm": 0.0448307164275886, + "language_loss": 0.76783341, + "learning_rate": 2.6614271929332133e-05, + "loss": 0.77828646, + "num_input_tokens_seen": 386700192, + "router_z_loss_mlp": 0.39770508, + "step": 4672, + "time_per_iteration": 2.7044925689697266 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046752, + "balance_loss_mlp": 1.00707936, + "epoch": 0.8989996152366295, + "flos": 493662074112.0, + "grad_norm": 0.04081416561791378, + "language_loss": 0.87663758, + "learning_rate": 2.6514076326237147e-05, + "loss": 0.88710511, + "num_input_tokens_seen": 386764256, + "router_z_loss_mlp": 0.39648438, + "step": 4673, + "time_per_iteration": 2.5325169563293457 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046908, + "balance_loss_mlp": 1.00721145, + "epoch": 0.899191996921893, + "flos": 543624549888.0, + "grad_norm": 0.04404130619875364, + "language_loss": 0.76413238, + "learning_rate": 2.6414064546680438e-05, + "loss": 0.77460146, + "num_input_tokens_seen": 386835792, + "router_z_loss_mlp": 0.39672852, + "step": 4674, + "time_per_iteration": 2.7047698497772217 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046859, + "balance_loss_mlp": 1.00709105, + "epoch": 0.8993843786071566, + "flos": 472309574400.0, + "grad_norm": 0.03723607547134794, + "language_loss": 0.79863477, + "learning_rate": 2.631423662948984e-05, + "loss": 0.80910337, + "num_input_tokens_seen": 386904368, + "router_z_loss_mlp": 0.39746094, + "step": 4675, + "time_per_iteration": 2.5789737701416016 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047065, + "balance_loss_mlp": 1.00727308, + "epoch": 0.8995767602924202, + "flos": 527818683648.0, + "grad_norm": 0.03740066278427069, + "language_loss": 0.82893097, + "learning_rate": 2.621459261342196e-05, + "loss": 0.8394016, + "num_input_tokens_seen": 386977872, + "router_z_loss_mlp": 0.39770508, + "step": 4676, + "time_per_iteration": 2.7864742279052734 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042225, + "balance_loss_mlp": 1.00233746, + "epoch": 0.8997691419776838, + "flos": 558712055040.0, + "grad_norm": 0.03256175332090757, + "language_loss": 0.85054183, + "learning_rate": 2.6115132537162245e-05, + "loss": 0.86096412, + "num_input_tokens_seen": 387052080, + "router_z_loss_mlp": 0.39868164, + "step": 4677, + "time_per_iteration": 2.677170753479004 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01041992, + "balance_loss_mlp": 1.00210464, + "epoch": 0.8999615236629472, + "flos": 640254182400.0, + "grad_norm": 0.03334384672588689, + "language_loss": 0.8101427, + "learning_rate": 2.601585643932436e-05, + "loss": 0.8205626, + "num_input_tokens_seen": 387129712, + "router_z_loss_mlp": 0.39868164, + "step": 4678, + "time_per_iteration": 2.851458787918091 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01041386, + "balance_loss_mlp": 1.00238037, + "epoch": 0.9001539053482108, + "flos": 1434591240960.0, + "grad_norm": 0.005247251501394147, + "language_loss": 0.85784018, + "learning_rate": 2.5916764358450862e-05, + "loss": 0.86825407, + "num_input_tokens_seen": 387356560, + "router_z_loss_mlp": 0.38964844, + "step": 4679, + "time_per_iteration": 4.843084812164307 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042195, + "balance_loss_mlp": 1.00230718, + "epoch": 0.9003462870334744, + "flos": 568036153344.0, + "grad_norm": 0.037250893678606165, + "language_loss": 0.80231905, + "learning_rate": 2.5817856333012425e-05, + "loss": 0.81274104, + "num_input_tokens_seen": 387438640, + "router_z_loss_mlp": 0.39868164, + "step": 4680, + "time_per_iteration": 2.8753278255462646 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042225, + "balance_loss_mlp": 1.00228965, + "epoch": 0.900538668718738, + "flos": 539706134016.0, + "grad_norm": 0.03919596922604473, + "language_loss": 0.78784555, + "learning_rate": 2.5719132401408883e-05, + "loss": 0.79826784, + "num_input_tokens_seen": 387507088, + "router_z_loss_mlp": 0.39916992, + "step": 4681, + "time_per_iteration": 2.638622283935547 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01041942, + "balance_loss_mlp": 1.00200713, + "epoch": 0.9007310504040016, + "flos": 489352886016.0, + "grad_norm": 0.03766496369551758, + "language_loss": 0.86354792, + "learning_rate": 2.5620592601968028e-05, + "loss": 0.87396729, + "num_input_tokens_seen": 387574160, + "router_z_loss_mlp": 0.39916992, + "step": 4682, + "time_per_iteration": 2.5297749042510986 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042236, + "balance_loss_mlp": 1.00232458, + "epoch": 0.9009234320892651, + "flos": 654141180672.0, + "grad_norm": 0.0372575142760268, + "language_loss": 0.79257679, + "learning_rate": 2.5522236972946532e-05, + "loss": 0.80299914, + "num_input_tokens_seen": 387652528, + "router_z_loss_mlp": 0.39892578, + "step": 4683, + "time_per_iteration": 2.843735694885254 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01041805, + "balance_loss_mlp": 1.00201344, + "epoch": 0.9011158137745287, + "flos": 546639912192.0, + "grad_norm": 0.03287971107611015, + "language_loss": 0.85687071, + "learning_rate": 2.5424065552529295e-05, + "loss": 0.86728871, + "num_input_tokens_seen": 387723520, + "router_z_loss_mlp": 0.39770508, + "step": 4684, + "time_per_iteration": 2.6854100227355957 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01041901, + "balance_loss_mlp": 1.00210893, + "epoch": 0.9013081954597922, + "flos": 560787425280.0, + "grad_norm": 0.03642552885134523, + "language_loss": 0.83117259, + "learning_rate": 2.532607837883011e-05, + "loss": 0.8415916, + "num_input_tokens_seen": 387793664, + "router_z_loss_mlp": 0.39770508, + "step": 4685, + "time_per_iteration": 2.6633992195129395 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104193, + "balance_loss_mlp": 1.0021615, + "epoch": 0.9015005771450558, + "flos": 729943233792.0, + "grad_norm": 0.03348030905856602, + "language_loss": 0.81757379, + "learning_rate": 2.5228275489890706e-05, + "loss": 0.82799315, + "num_input_tokens_seen": 387871008, + "router_z_loss_mlp": 0.39746094, + "step": 4686, + "time_per_iteration": 2.9701757431030273 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01041608, + "balance_loss_mlp": 1.00181627, + "epoch": 0.9016929588303193, + "flos": 518492640000.0, + "grad_norm": 0.03548126030039495, + "language_loss": 0.81336671, + "learning_rate": 2.5130656923681605e-05, + "loss": 0.8237828, + "num_input_tokens_seen": 387950832, + "router_z_loss_mlp": 0.39770508, + "step": 4687, + "time_per_iteration": 2.8197410106658936 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044246, + "balance_loss_mlp": 1.00445437, + "epoch": 0.9018853405155829, + "flos": 623555956224.0, + "grad_norm": 0.03200626373887687, + "language_loss": 0.86246824, + "learning_rate": 2.503322271810171e-05, + "loss": 0.87291074, + "num_input_tokens_seen": 388029792, + "router_z_loss_mlp": 0.39770508, + "step": 4688, + "time_per_iteration": 2.8695383071899414 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042269, + "balance_loss_mlp": 1.0023576, + "epoch": 0.9020777222008465, + "flos": 524338672128.0, + "grad_norm": 0.03434760886532221, + "language_loss": 0.78137249, + "learning_rate": 2.4935972910978378e-05, + "loss": 0.79179519, + "num_input_tokens_seen": 388095872, + "router_z_loss_mlp": 0.39892578, + "step": 4689, + "time_per_iteration": 2.6241252422332764 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01041799, + "balance_loss_mlp": 1.00188804, + "epoch": 0.9022701038861101, + "flos": 634894186752.0, + "grad_norm": 0.02955589406897841, + "language_loss": 0.82295549, + "learning_rate": 2.4838907540067346e-05, + "loss": 0.83337349, + "num_input_tokens_seen": 388171632, + "router_z_loss_mlp": 0.39892578, + "step": 4690, + "time_per_iteration": 2.8230092525482178 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104175, + "balance_loss_mlp": 1.00188613, + "epoch": 0.9024624855713737, + "flos": 514333151232.0, + "grad_norm": 0.032923690811660945, + "language_loss": 0.84795076, + "learning_rate": 2.474202664305253e-05, + "loss": 0.85836828, + "num_input_tokens_seen": 388242240, + "router_z_loss_mlp": 0.3984375, + "step": 4691, + "time_per_iteration": 2.6084606647491455 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044208, + "balance_loss_mlp": 1.00432038, + "epoch": 0.9026548672566371, + "flos": 478451114496.0, + "grad_norm": 0.03359361438657751, + "language_loss": 0.8643924, + "learning_rate": 2.464533025754673e-05, + "loss": 0.87483442, + "num_input_tokens_seen": 388310960, + "router_z_loss_mlp": 0.39868164, + "step": 4692, + "time_per_iteration": 2.624701738357544 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044579, + "balance_loss_mlp": 1.00469148, + "epoch": 0.9028472489419007, + "flos": 663171716352.0, + "grad_norm": 0.0375528831642029, + "language_loss": 0.74257243, + "learning_rate": 2.454881842109058e-05, + "loss": 0.75301814, + "num_input_tokens_seen": 388387280, + "router_z_loss_mlp": 0.39868164, + "step": 4693, + "time_per_iteration": 2.8537118434906006 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044968, + "balance_loss_mlp": 1.0050807, + "epoch": 0.9030396306271643, + "flos": 535620522240.0, + "grad_norm": 0.03845768429563665, + "language_loss": 0.82445383, + "learning_rate": 2.4452491171153445e-05, + "loss": 0.83490348, + "num_input_tokens_seen": 388456992, + "router_z_loss_mlp": 0.39868164, + "step": 4694, + "time_per_iteration": 2.632303237915039 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044581, + "balance_loss_mlp": 1.00474119, + "epoch": 0.9032320123124279, + "flos": 802384839168.0, + "grad_norm": 0.03517692965609126, + "language_loss": 0.82984042, + "learning_rate": 2.43563485451328e-05, + "loss": 0.84028614, + "num_input_tokens_seen": 388534896, + "router_z_loss_mlp": 0.39819336, + "step": 4695, + "time_per_iteration": 2.9748177528381348 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044714, + "balance_loss_mlp": 1.00484991, + "epoch": 0.9034243939976914, + "flos": 555025963776.0, + "grad_norm": 0.03909379859761359, + "language_loss": 0.77136493, + "learning_rate": 2.426039058035451e-05, + "loss": 0.78181207, + "num_input_tokens_seen": 388606640, + "router_z_loss_mlp": 0.3984375, + "step": 4696, + "time_per_iteration": 2.6545913219451904 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044716, + "balance_loss_mlp": 1.00485229, + "epoch": 0.903616775682955, + "flos": 504896292096.0, + "grad_norm": 0.03656588091717966, + "language_loss": 0.83326173, + "learning_rate": 2.4164617314072823e-05, + "loss": 0.84370893, + "num_input_tokens_seen": 388675920, + "router_z_loss_mlp": 0.3984375, + "step": 4697, + "time_per_iteration": 2.6118416786193848 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046982, + "balance_loss_mlp": 1.00716567, + "epoch": 0.9038091573682185, + "flos": 437256714240.0, + "grad_norm": 0.03454643781931151, + "language_loss": 0.79546142, + "learning_rate": 2.406902878347017e-05, + "loss": 0.80593121, + "num_input_tokens_seen": 388743968, + "router_z_loss_mlp": 0.39794922, + "step": 4698, + "time_per_iteration": 2.627512216567993 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044356, + "balance_loss_mlp": 1.00444448, + "epoch": 0.9040015390534821, + "flos": 533990359296.0, + "grad_norm": 0.04424640859343309, + "language_loss": 0.82207114, + "learning_rate": 2.3973625025657253e-05, + "loss": 0.8325147, + "num_input_tokens_seen": 388810784, + "router_z_loss_mlp": 0.39892578, + "step": 4699, + "time_per_iteration": 2.639582872390747 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042857, + "balance_loss_mlp": 1.00299335, + "epoch": 0.9041939207387457, + "flos": 565431005184.0, + "grad_norm": 0.053462163472176805, + "language_loss": 0.80744809, + "learning_rate": 2.3878406077673275e-05, + "loss": 0.8178767, + "num_input_tokens_seen": 388885072, + "router_z_loss_mlp": 0.3984375, + "step": 4700, + "time_per_iteration": 2.7810487747192383 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042912, + "balance_loss_mlp": 1.00300074, + "epoch": 0.9043863024240092, + "flos": 516521282304.0, + "grad_norm": 0.05013592196212244, + "language_loss": 0.78212988, + "learning_rate": 2.3783371976485447e-05, + "loss": 0.79255903, + "num_input_tokens_seen": 388951184, + "router_z_loss_mlp": 0.39892578, + "step": 4701, + "time_per_iteration": 2.604733943939209 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042545, + "balance_loss_mlp": 1.00373077, + "epoch": 0.9045786841092728, + "flos": 1280785397760.0, + "grad_norm": 0.004379437455403695, + "language_loss": 0.72929788, + "learning_rate": 2.368852275898914e-05, + "loss": 0.73972332, + "num_input_tokens_seen": 389170752, + "router_z_loss_mlp": 0.38769531, + "step": 4702, + "time_per_iteration": 5.015188455581665 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042071, + "balance_loss_mlp": 1.00230312, + "epoch": 0.9047710657945364, + "flos": 586933204224.0, + "grad_norm": 0.15726784027551832, + "language_loss": 0.83084238, + "learning_rate": 2.3593858462008178e-05, + "loss": 0.84126312, + "num_input_tokens_seen": 389239600, + "router_z_loss_mlp": 0.39746094, + "step": 4703, + "time_per_iteration": 2.6818783283233643 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01041715, + "balance_loss_mlp": 1.001899, + "epoch": 0.9049634474798, + "flos": 573072450816.0, + "grad_norm": 0.03926404752899239, + "language_loss": 0.79924166, + "learning_rate": 2.3499379122294495e-05, + "loss": 0.80965883, + "num_input_tokens_seen": 389316032, + "router_z_loss_mlp": 0.39794922, + "step": 4704, + "time_per_iteration": 2.7599122524261475 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01041497, + "balance_loss_mlp": 1.00168145, + "epoch": 0.9051558291650635, + "flos": 573688744704.0, + "grad_norm": 0.041200102084881625, + "language_loss": 0.7489146, + "learning_rate": 2.3405084776528307e-05, + "loss": 0.75932956, + "num_input_tokens_seen": 389383504, + "router_z_loss_mlp": 0.39794922, + "step": 4705, + "time_per_iteration": 2.6949710845947266 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01041663, + "balance_loss_mlp": 1.00184762, + "epoch": 0.905348210850327, + "flos": 541577369856.0, + "grad_norm": 0.03988349489111128, + "language_loss": 0.7962532, + "learning_rate": 2.331097546131783e-05, + "loss": 0.80666983, + "num_input_tokens_seen": 389454592, + "router_z_loss_mlp": 0.39794922, + "step": 4706, + "time_per_iteration": 2.6960105895996094 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01041736, + "balance_loss_mlp": 1.00196815, + "epoch": 0.9055405925355906, + "flos": 517396145664.0, + "grad_norm": 0.03894790437085165, + "language_loss": 0.81917119, + "learning_rate": 2.321705121319956e-05, + "loss": 0.82958859, + "num_input_tokens_seen": 389519696, + "router_z_loss_mlp": 0.39746094, + "step": 4707, + "time_per_iteration": 2.579784393310547 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01041807, + "balance_loss_mlp": 1.00206232, + "epoch": 0.9057329742208542, + "flos": 916223033856.0, + "grad_norm": 0.03029944141682911, + "language_loss": 0.85198569, + "learning_rate": 2.3123312068638104e-05, + "loss": 0.86240375, + "num_input_tokens_seen": 389603568, + "router_z_loss_mlp": 0.3972168, + "step": 4708, + "time_per_iteration": 3.178318500518799 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043282, + "balance_loss_mlp": 1.00348961, + "epoch": 0.9059253559061178, + "flos": 906777426432.0, + "grad_norm": 0.03975505233074101, + "language_loss": 0.83321095, + "learning_rate": 2.3029758064026295e-05, + "loss": 0.84364378, + "num_input_tokens_seen": 389687504, + "router_z_loss_mlp": 0.39770508, + "step": 4709, + "time_per_iteration": 3.1223511695861816 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043135, + "balance_loss_mlp": 1.00327146, + "epoch": 0.9061177375913813, + "flos": 665803109376.0, + "grad_norm": 0.042986952434846105, + "language_loss": 0.78179657, + "learning_rate": 2.2936389235684918e-05, + "loss": 0.79222792, + "num_input_tokens_seen": 389764880, + "router_z_loss_mlp": 0.3984375, + "step": 4710, + "time_per_iteration": 2.904844284057617 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043591, + "balance_loss_mlp": 1.00377488, + "epoch": 0.9063101192766448, + "flos": 566779266048.0, + "grad_norm": 0.03589709877789938, + "language_loss": 0.82981563, + "learning_rate": 2.2843205619862972e-05, + "loss": 0.84025156, + "num_input_tokens_seen": 389838304, + "router_z_loss_mlp": 0.39794922, + "step": 4711, + "time_per_iteration": 2.7522215843200684 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043222, + "balance_loss_mlp": 1.00335848, + "epoch": 0.9065025009619084, + "flos": 728631911424.0, + "grad_norm": 0.04760632658729641, + "language_loss": 0.79660898, + "learning_rate": 2.2750207252737742e-05, + "loss": 0.80704117, + "num_input_tokens_seen": 389908592, + "router_z_loss_mlp": 0.3984375, + "step": 4712, + "time_per_iteration": 2.874788761138916 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043296, + "balance_loss_mlp": 1.00345671, + "epoch": 0.906694882647172, + "flos": 532548779520.0, + "grad_norm": 0.046151346888196505, + "language_loss": 0.80804384, + "learning_rate": 2.265739417041418e-05, + "loss": 0.8184768, + "num_input_tokens_seen": 389979040, + "router_z_loss_mlp": 0.39819336, + "step": 4713, + "time_per_iteration": 2.648693084716797 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043542, + "balance_loss_mlp": 1.00375044, + "epoch": 0.9068872643324356, + "flos": 430696211712.0, + "grad_norm": 0.0360775633436318, + "language_loss": 0.85277104, + "learning_rate": 2.2564766408925574e-05, + "loss": 0.86320645, + "num_input_tokens_seen": 390046080, + "router_z_loss_mlp": 0.39770508, + "step": 4714, + "time_per_iteration": 2.6000893115997314 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01041595, + "balance_loss_mlp": 1.00177884, + "epoch": 0.9070796460176991, + "flos": 589455727104.0, + "grad_norm": 0.044653943193246386, + "language_loss": 0.80620706, + "learning_rate": 2.2472324004233214e-05, + "loss": 0.81662303, + "num_input_tokens_seen": 390122176, + "router_z_loss_mlp": 0.39794922, + "step": 4715, + "time_per_iteration": 2.7455286979675293 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01041192, + "balance_loss_mlp": 1.00135255, + "epoch": 0.9072720277029627, + "flos": 572655433728.0, + "grad_norm": 0.03585300733233402, + "language_loss": 0.75799972, + "learning_rate": 2.2380066992226446e-05, + "loss": 0.76841164, + "num_input_tokens_seen": 390195216, + "router_z_loss_mlp": 0.39819336, + "step": 4716, + "time_per_iteration": 2.742478847503662 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01041496, + "balance_loss_mlp": 1.00165629, + "epoch": 0.9074644093882263, + "flos": 556860261120.0, + "grad_norm": 0.03498331217535132, + "language_loss": 0.8895576, + "learning_rate": 2.2287995408722617e-05, + "loss": 0.89997262, + "num_input_tokens_seen": 390263216, + "router_z_loss_mlp": 0.39819336, + "step": 4717, + "time_per_iteration": 2.6485562324523926 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042277, + "balance_loss_mlp": 1.00248528, + "epoch": 0.9076567910734898, + "flos": 642173050368.0, + "grad_norm": 0.04530512053024163, + "language_loss": 0.83037788, + "learning_rate": 2.2196109289467083e-05, + "loss": 0.84080064, + "num_input_tokens_seen": 390337360, + "router_z_loss_mlp": 0.39770508, + "step": 4718, + "time_per_iteration": 2.8499996662139893 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042708, + "balance_loss_mlp": 1.00291562, + "epoch": 0.9078491727587533, + "flos": 735457786368.0, + "grad_norm": 0.033395309611424465, + "language_loss": 0.82320893, + "learning_rate": 2.2104408670133193e-05, + "loss": 0.83363599, + "num_input_tokens_seen": 390427728, + "router_z_loss_mlp": 0.39770508, + "step": 4719, + "time_per_iteration": 3.0724966526031494 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043108, + "balance_loss_mlp": 1.00336421, + "epoch": 0.9080415544440169, + "flos": 656021164800.0, + "grad_norm": 0.031212172268033682, + "language_loss": 0.87156701, + "learning_rate": 2.2012893586322245e-05, + "loss": 0.88199806, + "num_input_tokens_seen": 390504736, + "router_z_loss_mlp": 0.3972168, + "step": 4720, + "time_per_iteration": 2.8246238231658936 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043305, + "balance_loss_mlp": 1.0035367, + "epoch": 0.9082339361292805, + "flos": 598603881216.0, + "grad_norm": 0.03520689760481103, + "language_loss": 0.79804933, + "learning_rate": 2.1921564073563604e-05, + "loss": 0.80848241, + "num_input_tokens_seen": 390582048, + "router_z_loss_mlp": 0.39746094, + "step": 4721, + "time_per_iteration": 2.742318630218506 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104355, + "balance_loss_mlp": 1.00371027, + "epoch": 0.9084263178145441, + "flos": 505426070016.0, + "grad_norm": 0.036483558406201176, + "language_loss": 0.84666395, + "learning_rate": 2.183042016731457e-05, + "loss": 0.85709947, + "num_input_tokens_seen": 390652976, + "router_z_loss_mlp": 0.39819336, + "step": 4722, + "time_per_iteration": 2.6103546619415283 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043538, + "balance_loss_mlp": 1.00362682, + "epoch": 0.9086186994998077, + "flos": 551107547904.0, + "grad_norm": 0.03608698449655091, + "language_loss": 0.80968702, + "learning_rate": 2.1739461902960223e-05, + "loss": 0.82012242, + "num_input_tokens_seen": 390726832, + "router_z_loss_mlp": 0.39892578, + "step": 4723, + "time_per_iteration": 2.713469982147217 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01041748, + "balance_loss_mlp": 1.00195599, + "epoch": 0.9088110811850711, + "flos": 1135909979904.0, + "grad_norm": 0.07605585117528456, + "language_loss": 0.75542474, + "learning_rate": 2.1648689315813763e-05, + "loss": 0.76584214, + "num_input_tokens_seen": 390824480, + "router_z_loss_mlp": 0.39770508, + "step": 4724, + "time_per_iteration": 3.526309013366699 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01041464, + "balance_loss_mlp": 1.00164771, + "epoch": 0.9090034628703347, + "flos": 558060768000.0, + "grad_norm": 0.03248840709412771, + "language_loss": 0.77556646, + "learning_rate": 2.155810244111628e-05, + "loss": 0.78598112, + "num_input_tokens_seen": 390897552, + "router_z_loss_mlp": 0.39794922, + "step": 4725, + "time_per_iteration": 2.6785218715667725 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01041813, + "balance_loss_mlp": 1.0020206, + "epoch": 0.9091958445555983, + "flos": 545066129664.0, + "grad_norm": 0.03903153521101798, + "language_loss": 0.84798872, + "learning_rate": 2.146770131403658e-05, + "loss": 0.85840684, + "num_input_tokens_seen": 390969008, + "router_z_loss_mlp": 0.39770508, + "step": 4726, + "time_per_iteration": 2.6782310009002686 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01041721, + "balance_loss_mlp": 1.00192916, + "epoch": 0.9093882262408619, + "flos": 527141151744.0, + "grad_norm": 0.05792704436205886, + "language_loss": 0.8138569, + "learning_rate": 2.1377485969671594e-05, + "loss": 0.82427418, + "num_input_tokens_seen": 391038880, + "router_z_loss_mlp": 0.39770508, + "step": 4727, + "time_per_iteration": 2.6294679641723633 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043028, + "balance_loss_mlp": 1.0033077, + "epoch": 0.9095806079261254, + "flos": 549572649216.0, + "grad_norm": 0.038657617554072196, + "language_loss": 0.82212007, + "learning_rate": 2.1287456443046084e-05, + "loss": 0.83255029, + "num_input_tokens_seen": 391106720, + "router_z_loss_mlp": 0.39697266, + "step": 4728, + "time_per_iteration": 2.63301944732666 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043715, + "balance_loss_mlp": 1.00389957, + "epoch": 0.909772989611389, + "flos": 573641112576.0, + "grad_norm": 0.03527373598535026, + "language_loss": 0.84725654, + "learning_rate": 2.1197612769112528e-05, + "loss": 0.85769367, + "num_input_tokens_seen": 391178128, + "router_z_loss_mlp": 0.39794922, + "step": 4729, + "time_per_iteration": 2.725679636001587 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043027, + "balance_loss_mlp": 1.00318694, + "epoch": 0.9099653712966526, + "flos": 562882237440.0, + "grad_norm": 0.04198417845099068, + "language_loss": 0.80126184, + "learning_rate": 2.1107954982751254e-05, + "loss": 0.81169212, + "num_input_tokens_seen": 391248848, + "router_z_loss_mlp": 0.39819336, + "step": 4730, + "time_per_iteration": 2.7127773761749268 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044266, + "balance_loss_mlp": 1.00447404, + "epoch": 0.9101577529819161, + "flos": 1095499069440.0, + "grad_norm": 0.03820196979176914, + "language_loss": 0.80539393, + "learning_rate": 2.101848311877069e-05, + "loss": 0.81583661, + "num_input_tokens_seen": 391328000, + "router_z_loss_mlp": 0.39770508, + "step": 4731, + "time_per_iteration": 3.3886983394622803 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042072, + "balance_loss_mlp": 1.00230372, + "epoch": 0.9103501346671797, + "flos": 446361126912.0, + "grad_norm": 0.04074901158585317, + "language_loss": 0.82276326, + "learning_rate": 2.092919721190678e-05, + "loss": 0.833184, + "num_input_tokens_seen": 391391616, + "router_z_loss_mlp": 0.39746094, + "step": 4732, + "time_per_iteration": 2.528346300125122 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042157, + "balance_loss_mlp": 1.00234151, + "epoch": 0.9105425163524432, + "flos": 501813855744.0, + "grad_norm": 0.04168871505997614, + "language_loss": 0.78099614, + "learning_rate": 2.0840097296823346e-05, + "loss": 0.79141772, + "num_input_tokens_seen": 391461312, + "router_z_loss_mlp": 0.39794922, + "step": 4733, + "time_per_iteration": 2.612539768218994 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042251, + "balance_loss_mlp": 1.00245869, + "epoch": 0.9107348980377068, + "flos": 658776012288.0, + "grad_norm": 0.03125846585912685, + "language_loss": 0.84275806, + "learning_rate": 2.0751183408112162e-05, + "loss": 0.85318053, + "num_input_tokens_seen": 391542192, + "router_z_loss_mlp": 0.39770508, + "step": 4734, + "time_per_iteration": 2.8593521118164062 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01041871, + "balance_loss_mlp": 1.0021503, + "epoch": 0.9109272797229704, + "flos": 554719762176.0, + "grad_norm": 0.03419361098029299, + "language_loss": 0.85598445, + "learning_rate": 2.066245558029256e-05, + "loss": 0.8664031, + "num_input_tokens_seen": 391609968, + "router_z_loss_mlp": 0.39697266, + "step": 4735, + "time_per_iteration": 2.6386280059814453 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01041636, + "balance_loss_mlp": 1.00186801, + "epoch": 0.911119661408234, + "flos": 520011987456.0, + "grad_norm": 0.03913303108798507, + "language_loss": 0.84620136, + "learning_rate": 2.057391384781182e-05, + "loss": 0.85661769, + "num_input_tokens_seen": 391681264, + "router_z_loss_mlp": 0.39746094, + "step": 4736, + "time_per_iteration": 2.6297035217285156 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01041906, + "balance_loss_mlp": 1.00218534, + "epoch": 0.9113120430934974, + "flos": 555436177920.0, + "grad_norm": 0.03828883818672991, + "language_loss": 0.83354127, + "learning_rate": 2.0485558245044834e-05, + "loss": 0.84396034, + "num_input_tokens_seen": 391751392, + "router_z_loss_mlp": 0.39697266, + "step": 4737, + "time_per_iteration": 2.6358323097229004 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043517, + "balance_loss_mlp": 1.00372517, + "epoch": 0.911504424778761, + "flos": 502957982208.0, + "grad_norm": 0.0363931699668563, + "language_loss": 0.81623733, + "learning_rate": 2.0397388806294216e-05, + "loss": 0.82667255, + "num_input_tokens_seen": 391823952, + "router_z_loss_mlp": 0.39770508, + "step": 4738, + "time_per_iteration": 2.6258926391601562 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043564, + "balance_loss_mlp": 1.00374854, + "epoch": 0.9116968064640246, + "flos": 612212868096.0, + "grad_norm": 0.03297484419299765, + "language_loss": 0.82335055, + "learning_rate": 2.0309405565790527e-05, + "loss": 0.83378625, + "num_input_tokens_seen": 391895264, + "router_z_loss_mlp": 0.39794922, + "step": 4739, + "time_per_iteration": 2.6935737133026123 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044368, + "balance_loss_mlp": 1.00457621, + "epoch": 0.9118891881492882, + "flos": 574095068160.0, + "grad_norm": 0.041988619549609606, + "language_loss": 0.82920527, + "learning_rate": 2.0221608557691895e-05, + "loss": 0.8396489, + "num_input_tokens_seen": 391973040, + "router_z_loss_mlp": 0.39770508, + "step": 4740, + "time_per_iteration": 2.8001527786254883 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043479, + "balance_loss_mlp": 1.00356817, + "epoch": 0.9120815698345518, + "flos": 637173691392.0, + "grad_norm": 0.03597469929718847, + "language_loss": 0.78218091, + "learning_rate": 2.0133997816083992e-05, + "loss": 0.79261565, + "num_input_tokens_seen": 392048160, + "router_z_loss_mlp": 0.39892578, + "step": 4741, + "time_per_iteration": 2.8082351684570312 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104157, + "balance_loss_mlp": 1.00177824, + "epoch": 0.9122739515198153, + "flos": 703556381952.0, + "grad_norm": 0.051345638115528766, + "language_loss": 0.86597633, + "learning_rate": 2.0046573374980447e-05, + "loss": 0.87639201, + "num_input_tokens_seen": 392128960, + "router_z_loss_mlp": 0.39770508, + "step": 4742, + "time_per_iteration": 2.846757650375366 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042847, + "balance_loss_mlp": 1.0030551, + "epoch": 0.9124663332050789, + "flos": 525717068544.0, + "grad_norm": 0.04127672147297515, + "language_loss": 0.88021904, + "learning_rate": 1.995933526832239e-05, + "loss": 0.89064753, + "num_input_tokens_seen": 392195008, + "router_z_loss_mlp": 0.39770508, + "step": 4743, + "time_per_iteration": 2.5983972549438477 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042402, + "balance_loss_mlp": 1.00258601, + "epoch": 0.9126587148903424, + "flos": 564371449344.0, + "grad_norm": 0.03669827673453058, + "language_loss": 0.83077073, + "learning_rate": 1.9872283529978662e-05, + "loss": 0.84119469, + "num_input_tokens_seen": 392265168, + "router_z_loss_mlp": 0.39794922, + "step": 4744, + "time_per_iteration": 2.638869524002075 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042499, + "balance_loss_mlp": 1.00275481, + "epoch": 0.912851096575606, + "flos": 506934723840.0, + "grad_norm": 0.040984614191802604, + "language_loss": 0.80266577, + "learning_rate": 1.978541819374574e-05, + "loss": 0.8130908, + "num_input_tokens_seen": 392329456, + "router_z_loss_mlp": 0.3972168, + "step": 4745, + "time_per_iteration": 2.6040964126586914 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042012, + "balance_loss_mlp": 1.0022434, + "epoch": 0.9130434782608695, + "flos": 551769528576.0, + "grad_norm": 0.035175280815842924, + "language_loss": 0.83013141, + "learning_rate": 1.9698739293347755e-05, + "loss": 0.8405515, + "num_input_tokens_seen": 392397792, + "router_z_loss_mlp": 0.39746094, + "step": 4746, + "time_per_iteration": 2.621307373046875 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042031, + "balance_loss_mlp": 1.00223851, + "epoch": 0.9132358599461331, + "flos": 469936750848.0, + "grad_norm": 0.035669438034904535, + "language_loss": 0.83020693, + "learning_rate": 1.9612246862436456e-05, + "loss": 0.84062719, + "num_input_tokens_seen": 392462928, + "router_z_loss_mlp": 0.39770508, + "step": 4747, + "time_per_iteration": 2.5283937454223633 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01041728, + "balance_loss_mlp": 1.00196004, + "epoch": 0.9134282416313967, + "flos": 507101919744.0, + "grad_norm": 0.03785886131144422, + "language_loss": 0.80289733, + "learning_rate": 1.9525940934591148e-05, + "loss": 0.81331468, + "num_input_tokens_seen": 392531840, + "router_z_loss_mlp": 0.39746094, + "step": 4748, + "time_per_iteration": 2.63830304145813 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043601, + "balance_loss_mlp": 1.00378549, + "epoch": 0.9136206233166603, + "flos": 605939125248.0, + "grad_norm": 0.03504382906391139, + "language_loss": 0.84374118, + "learning_rate": 1.9439821543318748e-05, + "loss": 0.85417724, + "num_input_tokens_seen": 392602464, + "router_z_loss_mlp": 0.39794922, + "step": 4749, + "time_per_iteration": 2.765106439590454 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043164, + "balance_loss_mlp": 1.00327647, + "epoch": 0.9138130050019239, + "flos": 562825857024.0, + "grad_norm": 0.037154786087076466, + "language_loss": 0.83264536, + "learning_rate": 1.9353888722053793e-05, + "loss": 0.843077, + "num_input_tokens_seen": 392669872, + "router_z_loss_mlp": 0.39868164, + "step": 4750, + "time_per_iteration": 2.6922707557678223 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043297, + "balance_loss_mlp": 1.00345743, + "epoch": 0.9140053866871873, + "flos": 691345233408.0, + "grad_norm": 0.032511709695782105, + "language_loss": 0.90623546, + "learning_rate": 1.9268142504158426e-05, + "loss": 0.91666842, + "num_input_tokens_seen": 392744256, + "router_z_loss_mlp": 0.39819336, + "step": 4751, + "time_per_iteration": 2.818824291229248 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043297, + "balance_loss_mlp": 1.00350499, + "epoch": 0.9141977683724509, + "flos": 552130165248.0, + "grad_norm": 0.03267689377344242, + "language_loss": 0.8422156, + "learning_rate": 1.9182582922922186e-05, + "loss": 0.85264862, + "num_input_tokens_seen": 392816832, + "router_z_loss_mlp": 0.39770508, + "step": 4752, + "time_per_iteration": 2.717310905456543 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104181, + "balance_loss_mlp": 1.00199425, + "epoch": 0.9143901500577145, + "flos": 541121468928.0, + "grad_norm": 0.04120763635430611, + "language_loss": 0.75722265, + "learning_rate": 1.9097210011562228e-05, + "loss": 0.76764077, + "num_input_tokens_seen": 392886304, + "router_z_loss_mlp": 0.39794922, + "step": 4753, + "time_per_iteration": 2.705686092376709 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01041376, + "balance_loss_mlp": 1.00158441, + "epoch": 0.9145825317429781, + "flos": 529793932032.0, + "grad_norm": 0.03780448267955887, + "language_loss": 0.81331134, + "learning_rate": 1.9012023803223366e-05, + "loss": 0.8237251, + "num_input_tokens_seen": 392955872, + "router_z_loss_mlp": 0.39770508, + "step": 4754, + "time_per_iteration": 2.6305735111236572 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01041884, + "balance_loss_mlp": 1.0020684, + "epoch": 0.9147749134282416, + "flos": 515813614848.0, + "grad_norm": 0.034250485968984036, + "language_loss": 0.79676282, + "learning_rate": 1.892702433097776e-05, + "loss": 0.80718166, + "num_input_tokens_seen": 393025776, + "router_z_loss_mlp": 0.39794922, + "step": 4755, + "time_per_iteration": 2.627593755722046 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042217, + "balance_loss_mlp": 1.00247312, + "epoch": 0.9149672951135052, + "flos": 515514216192.0, + "grad_norm": 0.037789408744826655, + "language_loss": 0.86363244, + "learning_rate": 1.8842211627825233e-05, + "loss": 0.87405461, + "num_input_tokens_seen": 393095936, + "router_z_loss_mlp": 0.3972168, + "step": 4756, + "time_per_iteration": 2.6330106258392334 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042561, + "balance_loss_mlp": 1.0027926, + "epoch": 0.9151596767987688, + "flos": 578228312064.0, + "grad_norm": 0.03566379548068454, + "language_loss": 0.81810522, + "learning_rate": 1.8757585726692727e-05, + "loss": 0.82853079, + "num_input_tokens_seen": 393166816, + "router_z_loss_mlp": 0.39746094, + "step": 4757, + "time_per_iteration": 2.7305617332458496 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044663, + "balance_loss_mlp": 1.00487077, + "epoch": 0.9153520584840323, + "flos": 620477410560.0, + "grad_norm": 0.033671157071198325, + "language_loss": 0.82926512, + "learning_rate": 1.8673146660435182e-05, + "loss": 0.83971179, + "num_input_tokens_seen": 393242176, + "router_z_loss_mlp": 0.39770508, + "step": 4758, + "time_per_iteration": 2.732607126235962 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043403, + "balance_loss_mlp": 1.00351596, + "epoch": 0.9155444401692959, + "flos": 469862873856.0, + "grad_norm": 0.038586865658807416, + "language_loss": 0.83053923, + "learning_rate": 1.8588894461834704e-05, + "loss": 0.84097326, + "num_input_tokens_seen": 393311792, + "router_z_loss_mlp": 0.39868164, + "step": 4759, + "time_per_iteration": 2.598182439804077 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01040733, + "balance_loss_mlp": 1.00182343, + "epoch": 0.9157368218545594, + "flos": 1413842396160.0, + "grad_norm": 0.004422108568448796, + "language_loss": 0.7481907, + "learning_rate": 1.8504829163600855e-05, + "loss": 0.75859803, + "num_input_tokens_seen": 393535648, + "router_z_loss_mlp": 0.38867188, + "step": 4760, + "time_per_iteration": 4.8417699337005615 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01040726, + "balance_loss_mlp": 1.0018158, + "epoch": 0.915929203539823, + "flos": 1525327209216.0, + "grad_norm": 0.004306437529183597, + "language_loss": 0.79576051, + "learning_rate": 1.8420950798370584e-05, + "loss": 0.80616784, + "num_input_tokens_seen": 393767040, + "router_z_loss_mlp": 0.38867188, + "step": 4761, + "time_per_iteration": 4.890992641448975 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01041962, + "balance_loss_mlp": 1.00209796, + "epoch": 0.9161215852250866, + "flos": 536847273984.0, + "grad_norm": 0.03478123106753543, + "language_loss": 0.80950373, + "learning_rate": 1.8337259398708616e-05, + "loss": 0.81992334, + "num_input_tokens_seen": 393841232, + "router_z_loss_mlp": 0.3984375, + "step": 4762, + "time_per_iteration": 2.7062151432037354 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042488, + "balance_loss_mlp": 1.00271964, + "epoch": 0.9163139669103502, + "flos": 591726483456.0, + "grad_norm": 0.03725306331667297, + "language_loss": 0.80738699, + "learning_rate": 1.8253754997106632e-05, + "loss": 0.81781185, + "num_input_tokens_seen": 393910512, + "router_z_loss_mlp": 0.39746094, + "step": 4763, + "time_per_iteration": 2.6649303436279297 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042635, + "balance_loss_mlp": 1.00289094, + "epoch": 0.9165063485956138, + "flos": 823372811520.0, + "grad_norm": 0.03263551709218471, + "language_loss": 0.85072815, + "learning_rate": 1.817043762598397e-05, + "loss": 0.86115456, + "num_input_tokens_seen": 393988624, + "router_z_loss_mlp": 0.3972168, + "step": 4764, + "time_per_iteration": 3.0888671875 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042796, + "balance_loss_mlp": 1.00302815, + "epoch": 0.9166987302808772, + "flos": 526246846464.0, + "grad_norm": 0.034405436737363966, + "language_loss": 0.82834673, + "learning_rate": 1.8087307317687264e-05, + "loss": 0.83877468, + "num_input_tokens_seen": 394059184, + "router_z_loss_mlp": 0.39746094, + "step": 4765, + "time_per_iteration": 2.612149953842163 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01041715, + "balance_loss_mlp": 1.00197029, + "epoch": 0.9168911119661408, + "flos": 656346808320.0, + "grad_norm": 0.03211426888442645, + "language_loss": 0.8478266, + "learning_rate": 1.800436410449058e-05, + "loss": 0.85824376, + "num_input_tokens_seen": 394142160, + "router_z_loss_mlp": 0.3972168, + "step": 4766, + "time_per_iteration": 2.9523656368255615 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043286, + "balance_loss_mlp": 1.00349426, + "epoch": 0.9170834936514044, + "flos": 492722082048.0, + "grad_norm": 0.03421441707502224, + "language_loss": 0.85402191, + "learning_rate": 1.7921608018595436e-05, + "loss": 0.86445475, + "num_input_tokens_seen": 394207056, + "router_z_loss_mlp": 0.39770508, + "step": 4767, + "time_per_iteration": 2.5405280590057373 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043107, + "balance_loss_mlp": 1.00329077, + "epoch": 0.917275875336668, + "flos": 629180357376.0, + "grad_norm": 0.0411136231829451, + "language_loss": 0.81023633, + "learning_rate": 1.7839039092130415e-05, + "loss": 0.82066739, + "num_input_tokens_seen": 394275456, + "router_z_loss_mlp": 0.39794922, + "step": 4768, + "time_per_iteration": 2.7763261795043945 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042786, + "balance_loss_mlp": 1.00387573, + "epoch": 0.9174682570219315, + "flos": 1521215352576.0, + "grad_norm": 0.005980393358773289, + "language_loss": 0.78180236, + "learning_rate": 1.7756657357151762e-05, + "loss": 0.79223019, + "num_input_tokens_seen": 394503808, + "router_z_loss_mlp": 0.38867188, + "step": 4769, + "time_per_iteration": 4.983697175979614 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01041776, + "balance_loss_mlp": 1.00200808, + "epoch": 0.917660638707195, + "flos": 561113068800.0, + "grad_norm": 0.03356610756304218, + "language_loss": 0.85551798, + "learning_rate": 1.7674462845642835e-05, + "loss": 0.8659358, + "num_input_tokens_seen": 394573776, + "router_z_loss_mlp": 0.39746094, + "step": 4770, + "time_per_iteration": 2.656285285949707 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104163, + "balance_loss_mlp": 1.00181425, + "epoch": 0.9178530203924586, + "flos": 448175982336.0, + "grad_norm": 0.03590542573213615, + "language_loss": 0.84399128, + "learning_rate": 1.7592455589514387e-05, + "loss": 0.85440755, + "num_input_tokens_seen": 394637600, + "router_z_loss_mlp": 0.39794922, + "step": 4771, + "time_per_iteration": 2.4805028438568115 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01041706, + "balance_loss_mlp": 1.00193799, + "epoch": 0.9180454020777222, + "flos": 466975823616.0, + "grad_norm": 0.03408749549297663, + "language_loss": 0.81331682, + "learning_rate": 1.7510635620604453e-05, + "loss": 0.82373387, + "num_input_tokens_seen": 394707344, + "router_z_loss_mlp": 0.39746094, + "step": 4772, + "time_per_iteration": 2.5695180892944336 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01041733, + "balance_loss_mlp": 1.00198889, + "epoch": 0.9182377837629858, + "flos": 597485999616.0, + "grad_norm": 0.03332640277361682, + "language_loss": 0.87420475, + "learning_rate": 1.74290029706784e-05, + "loss": 0.8846221, + "num_input_tokens_seen": 394786368, + "router_z_loss_mlp": 0.3972168, + "step": 4773, + "time_per_iteration": 2.815886974334717 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01041932, + "balance_loss_mlp": 1.00216413, + "epoch": 0.9184301654482493, + "flos": 998362013184.0, + "grad_norm": 0.03239829310404249, + "language_loss": 0.83262658, + "learning_rate": 1.734755767142876e-05, + "loss": 0.84304595, + "num_input_tokens_seen": 394876976, + "router_z_loss_mlp": 0.39746094, + "step": 4774, + "time_per_iteration": 3.356502056121826 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044271, + "balance_loss_mlp": 1.00445557, + "epoch": 0.9186225471335129, + "flos": 509902454016.0, + "grad_norm": 0.03056870137677778, + "language_loss": 0.8524617, + "learning_rate": 1.7266299754475467e-05, + "loss": 0.86290443, + "num_input_tokens_seen": 394949024, + "router_z_loss_mlp": 0.39794922, + "step": 4775, + "time_per_iteration": 2.62768292427063 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044303, + "balance_loss_mlp": 1.00448751, + "epoch": 0.9188149288187765, + "flos": 942078162432.0, + "grad_norm": 0.03846714084207721, + "language_loss": 0.79003048, + "learning_rate": 1.718522925136551e-05, + "loss": 0.80047351, + "num_input_tokens_seen": 395044352, + "router_z_loss_mlp": 0.39794922, + "step": 4776, + "time_per_iteration": 3.272127389907837 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044292, + "balance_loss_mlp": 1.0045718, + "epoch": 0.91900731050404, + "flos": 584764515072.0, + "grad_norm": 0.03467448263421975, + "language_loss": 0.84587908, + "learning_rate": 1.7104346193573484e-05, + "loss": 0.85632205, + "num_input_tokens_seen": 395113824, + "router_z_loss_mlp": 0.39697266, + "step": 4777, + "time_per_iteration": 2.6851980686187744 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01041484, + "balance_loss_mlp": 1.00169146, + "epoch": 0.9191996921893035, + "flos": 582307120896.0, + "grad_norm": 0.04359090414483508, + "language_loss": 0.80117047, + "learning_rate": 1.7023650612500828e-05, + "loss": 0.81158531, + "num_input_tokens_seen": 395184496, + "router_z_loss_mlp": 0.39770508, + "step": 4778, + "time_per_iteration": 2.7560482025146484 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01041705, + "balance_loss_mlp": 1.00188911, + "epoch": 0.9193920738745671, + "flos": 910417830912.0, + "grad_norm": 0.03787808784410227, + "language_loss": 0.8024419, + "learning_rate": 1.6943142539476374e-05, + "loss": 0.81285894, + "num_input_tokens_seen": 395263760, + "router_z_loss_mlp": 0.39794922, + "step": 4779, + "time_per_iteration": 3.092148780822754 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01041912, + "balance_loss_mlp": 1.00309753, + "epoch": 0.9195844555598307, + "flos": 1561647650304.0, + "grad_norm": 0.0046095855092722115, + "language_loss": 0.79795396, + "learning_rate": 1.686282200575606e-05, + "loss": 0.80837303, + "num_input_tokens_seen": 395482384, + "router_z_loss_mlp": 0.38769531, + "step": 4780, + "time_per_iteration": 4.6737401485443115 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01041463, + "balance_loss_mlp": 1.00159907, + "epoch": 0.9197768372450943, + "flos": 475018735104.0, + "grad_norm": 0.03757173835384317, + "language_loss": 0.79206824, + "learning_rate": 1.678268904252317e-05, + "loss": 0.80248284, + "num_input_tokens_seen": 395550384, + "router_z_loss_mlp": 0.3984375, + "step": 4781, + "time_per_iteration": 2.5297844409942627 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01041634, + "balance_loss_mlp": 1.0018177, + "epoch": 0.9199692189303579, + "flos": 858597725184.0, + "grad_norm": 0.0351171984147798, + "language_loss": 0.84401453, + "learning_rate": 1.6702743680888088e-05, + "loss": 0.85443085, + "num_input_tokens_seen": 395632320, + "router_z_loss_mlp": 0.39794922, + "step": 4782, + "time_per_iteration": 3.2000656127929688 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010422, + "balance_loss_mlp": 1.00243175, + "epoch": 0.9201616006156214, + "flos": 505380383232.0, + "grad_norm": 0.035994154198359725, + "language_loss": 0.77748179, + "learning_rate": 1.6622985951888327e-05, + "loss": 0.78790379, + "num_input_tokens_seen": 395703856, + "router_z_loss_mlp": 0.39746094, + "step": 4783, + "time_per_iteration": 2.633715867996216 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042278, + "balance_loss_mlp": 1.00243866, + "epoch": 0.9203539823008849, + "flos": 549896347392.0, + "grad_norm": 0.03695579451272927, + "language_loss": 0.85761321, + "learning_rate": 1.6543415886488554e-05, + "loss": 0.86803603, + "num_input_tokens_seen": 395779456, + "router_z_loss_mlp": 0.39819336, + "step": 4784, + "time_per_iteration": 2.7589988708496094 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043999, + "balance_loss_mlp": 1.00415969, + "epoch": 0.9205463639861485, + "flos": 541073836800.0, + "grad_norm": 0.034257304816951106, + "language_loss": 0.82876825, + "learning_rate": 1.6464033515580624e-05, + "loss": 0.83920825, + "num_input_tokens_seen": 395849584, + "router_z_loss_mlp": 0.39819336, + "step": 4785, + "time_per_iteration": 2.6440327167510986 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044489, + "balance_loss_mlp": 1.00467277, + "epoch": 0.9207387456714121, + "flos": 801162945024.0, + "grad_norm": 0.037754818387900006, + "language_loss": 0.78699261, + "learning_rate": 1.6384838869983488e-05, + "loss": 0.79743749, + "num_input_tokens_seen": 395943712, + "router_z_loss_mlp": 0.39794922, + "step": 4786, + "time_per_iteration": 3.0379316806793213 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044318, + "balance_loss_mlp": 1.00454926, + "epoch": 0.9209311273566756, + "flos": 503817294336.0, + "grad_norm": 0.034755260351108076, + "language_loss": 0.79367381, + "learning_rate": 1.630583198044333e-05, + "loss": 0.80411696, + "num_input_tokens_seen": 396013168, + "router_z_loss_mlp": 0.39746094, + "step": 4787, + "time_per_iteration": 2.6695258617401123 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043489, + "balance_loss_mlp": 1.0037446, + "epoch": 0.9211235090419392, + "flos": 570384677376.0, + "grad_norm": 0.04450902774793768, + "language_loss": 0.8309685, + "learning_rate": 1.6227012877633173e-05, + "loss": 0.84140337, + "num_input_tokens_seen": 396082032, + "router_z_loss_mlp": 0.3972168, + "step": 4788, + "time_per_iteration": 2.716212034225464 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01041703, + "balance_loss_mlp": 1.00195909, + "epoch": 0.9213158907272028, + "flos": 807931472640.0, + "grad_norm": 0.03942301040285612, + "language_loss": 0.82974708, + "learning_rate": 1.6148381592153538e-05, + "loss": 0.84016412, + "num_input_tokens_seen": 396157984, + "router_z_loss_mlp": 0.3972168, + "step": 4789, + "time_per_iteration": 2.975245237350464 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01041759, + "balance_loss_mlp": 1.00194263, + "epoch": 0.9215082724124664, + "flos": 491651832576.0, + "grad_norm": 0.035709044039591936, + "language_loss": 0.76630366, + "learning_rate": 1.6069938154531618e-05, + "loss": 0.77672124, + "num_input_tokens_seen": 396223840, + "router_z_loss_mlp": 0.39794922, + "step": 4790, + "time_per_iteration": 2.5468502044677734 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01041103, + "balance_loss_mlp": 1.00228882, + "epoch": 0.9217006540977299, + "flos": 1517896700928.0, + "grad_norm": 0.004454200941114214, + "language_loss": 0.77070266, + "learning_rate": 1.599168259522188e-05, + "loss": 0.78111368, + "num_input_tokens_seen": 396458288, + "router_z_loss_mlp": 0.38769531, + "step": 4791, + "time_per_iteration": 4.974085092544556 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01041589, + "balance_loss_mlp": 1.00179708, + "epoch": 0.9218930357829934, + "flos": 745087119360.0, + "grad_norm": 0.03326717695831736, + "language_loss": 0.76704144, + "learning_rate": 1.5913614944605804e-05, + "loss": 0.77745736, + "num_input_tokens_seen": 396536208, + "router_z_loss_mlp": 0.39770508, + "step": 4792, + "time_per_iteration": 2.9275741577148438 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042243, + "balance_loss_mlp": 1.0024513, + "epoch": 0.922085417468257, + "flos": 453974382336.0, + "grad_norm": 0.038882563606044994, + "language_loss": 0.81037605, + "learning_rate": 1.5835735232992032e-05, + "loss": 0.82079852, + "num_input_tokens_seen": 396599984, + "router_z_loss_mlp": 0.39770508, + "step": 4793, + "time_per_iteration": 2.5030882358551025 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042554, + "balance_loss_mlp": 1.0027858, + "epoch": 0.9222777991535206, + "flos": 501238391040.0, + "grad_norm": 0.03875743218845831, + "language_loss": 0.853522, + "learning_rate": 1.575804349061616e-05, + "loss": 0.86394763, + "num_input_tokens_seen": 396664592, + "router_z_loss_mlp": 0.39746094, + "step": 4794, + "time_per_iteration": 2.606114625930786 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044749, + "balance_loss_mlp": 1.00490916, + "epoch": 0.9224701808387842, + "flos": 528984197376.0, + "grad_norm": 0.03858977789908891, + "language_loss": 0.79467082, + "learning_rate": 1.5680539747640722e-05, + "loss": 0.80511832, + "num_input_tokens_seen": 396729472, + "router_z_loss_mlp": 0.39819336, + "step": 4795, + "time_per_iteration": 2.5835952758789062 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044181, + "balance_loss_mlp": 1.00443649, + "epoch": 0.9226625625240477, + "flos": 876118324992.0, + "grad_norm": 0.033822189441903114, + "language_loss": 0.75610065, + "learning_rate": 1.5603224034155315e-05, + "loss": 0.76654249, + "num_input_tokens_seen": 396810384, + "router_z_loss_mlp": 0.3972168, + "step": 4796, + "time_per_iteration": 3.1282436847686768 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044248, + "balance_loss_mlp": 1.00440812, + "epoch": 0.9228549442093112, + "flos": 503760913920.0, + "grad_norm": 0.038137259870802354, + "language_loss": 0.88317525, + "learning_rate": 1.5526096380176657e-05, + "loss": 0.89361775, + "num_input_tokens_seen": 396875472, + "router_z_loss_mlp": 0.39819336, + "step": 4797, + "time_per_iteration": 2.562058210372925 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042279, + "balance_loss_mlp": 1.00248694, + "epoch": 0.9230473258945748, + "flos": 601126404096.0, + "grad_norm": 0.032937276215074016, + "language_loss": 0.85114181, + "learning_rate": 1.544915681564829e-05, + "loss": 0.86156458, + "num_input_tokens_seen": 396949888, + "router_z_loss_mlp": 0.39770508, + "step": 4798, + "time_per_iteration": 2.779236078262329 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104159, + "balance_loss_mlp": 1.00175047, + "epoch": 0.9232397075798384, + "flos": 823876344576.0, + "grad_norm": 0.0419903361052834, + "language_loss": 0.7935499, + "learning_rate": 1.5372405370440822e-05, + "loss": 0.80396575, + "num_input_tokens_seen": 397027504, + "router_z_loss_mlp": 0.39819336, + "step": 4799, + "time_per_iteration": 3.08843994140625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042074, + "balance_loss_mlp": 1.00230587, + "epoch": 0.923432089265102, + "flos": 708275784192.0, + "grad_norm": 0.036067131408147955, + "language_loss": 0.84829307, + "learning_rate": 1.5295842074351805e-05, + "loss": 0.85871375, + "num_input_tokens_seen": 397101600, + "router_z_loss_mlp": 0.39746094, + "step": 4800, + "time_per_iteration": 2.8723208904266357 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01041866, + "balance_loss_mlp": 1.00212193, + "epoch": 0.9236244709503655, + "flos": 703091732736.0, + "grad_norm": 0.03994271275686309, + "language_loss": 0.77297044, + "learning_rate": 1.5219466957105798e-05, + "loss": 0.78338909, + "num_input_tokens_seen": 397170880, + "router_z_loss_mlp": 0.3972168, + "step": 4801, + "time_per_iteration": 2.8293721675872803 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01041551, + "balance_loss_mlp": 1.00178313, + "epoch": 0.9238168526356291, + "flos": 516082877952.0, + "grad_norm": 0.03341169039005822, + "language_loss": 0.84229976, + "learning_rate": 1.5143280048354136e-05, + "loss": 0.85271525, + "num_input_tokens_seen": 397242272, + "router_z_loss_mlp": 0.39746094, + "step": 4802, + "time_per_iteration": 2.647810935974121 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01041765, + "balance_loss_mlp": 1.00194943, + "epoch": 0.9240092343208927, + "flos": 492965100288.0, + "grad_norm": 0.04174387467477932, + "language_loss": 0.817182, + "learning_rate": 1.5067281377675213e-05, + "loss": 0.82759964, + "num_input_tokens_seen": 397308032, + "router_z_loss_mlp": 0.39794922, + "step": 4803, + "time_per_iteration": 2.5767948627471924 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01041765, + "balance_loss_mlp": 1.00197279, + "epoch": 0.9242016160061562, + "flos": 648436099584.0, + "grad_norm": 0.03908108029217818, + "language_loss": 0.74151349, + "learning_rate": 1.4991470974574484e-05, + "loss": 0.75193107, + "num_input_tokens_seen": 397390944, + "router_z_loss_mlp": 0.39770508, + "step": 4804, + "time_per_iteration": 2.885768413543701 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043548, + "balance_loss_mlp": 1.00375605, + "epoch": 0.9243939976914197, + "flos": 730779213312.0, + "grad_norm": 0.03876536348903563, + "language_loss": 0.79842925, + "learning_rate": 1.4915848868484016e-05, + "loss": 0.80886477, + "num_input_tokens_seen": 397468128, + "router_z_loss_mlp": 0.39770508, + "step": 4805, + "time_per_iteration": 2.995547294616699 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044249, + "balance_loss_mlp": 1.00448072, + "epoch": 0.9245863793766833, + "flos": 453210334464.0, + "grad_norm": 0.03391430386616441, + "language_loss": 0.90989828, + "learning_rate": 1.4840415088763048e-05, + "loss": 0.92034078, + "num_input_tokens_seen": 397538976, + "router_z_loss_mlp": 0.39746094, + "step": 4806, + "time_per_iteration": 2.590815305709839 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044582, + "balance_loss_mlp": 1.00471807, + "epoch": 0.9247787610619469, + "flos": 756367024128.0, + "grad_norm": 0.0377014259439888, + "language_loss": 0.77555621, + "learning_rate": 1.476516966469732e-05, + "loss": 0.78600192, + "num_input_tokens_seen": 397612944, + "router_z_loss_mlp": 0.3984375, + "step": 4807, + "time_per_iteration": 2.9227209091186523 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042204, + "balance_loss_mlp": 1.002388, + "epoch": 0.9249711427472105, + "flos": 563084426496.0, + "grad_norm": 0.032247453393700005, + "language_loss": 0.85188043, + "learning_rate": 1.4690112625499908e-05, + "loss": 0.86230248, + "num_input_tokens_seen": 397690848, + "router_z_loss_mlp": 0.39794922, + "step": 4808, + "time_per_iteration": 2.7663607597351074 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01041805, + "balance_loss_mlp": 1.00201321, + "epoch": 0.9251635244324741, + "flos": 527781745152.0, + "grad_norm": 0.03881132010618585, + "language_loss": 0.85695136, + "learning_rate": 1.4615244000310501e-05, + "loss": 0.86736941, + "num_input_tokens_seen": 397761008, + "router_z_loss_mlp": 0.39770508, + "step": 4809, + "time_per_iteration": 2.697505235671997 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01041771, + "balance_loss_mlp": 1.00197923, + "epoch": 0.9253559061177375, + "flos": 612480185856.0, + "grad_norm": 0.04670072109786849, + "language_loss": 0.79465836, + "learning_rate": 1.4540563818195685e-05, + "loss": 0.80507606, + "num_input_tokens_seen": 397840640, + "router_z_loss_mlp": 0.39770508, + "step": 4810, + "time_per_iteration": 2.8427956104278564 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104182, + "balance_loss_mlp": 1.00300598, + "epoch": 0.9255482878030011, + "flos": 1554464050944.0, + "grad_norm": 0.004376211492526392, + "language_loss": 0.76925391, + "learning_rate": 1.446607210814882e-05, + "loss": 0.77967215, + "num_input_tokens_seen": 398060096, + "router_z_loss_mlp": 0.38769531, + "step": 4811, + "time_per_iteration": 4.711735486984253 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104182, + "balance_loss_mlp": 1.00202775, + "epoch": 0.9257406694882647, + "flos": 767803431168.0, + "grad_norm": 0.03861878751273161, + "language_loss": 0.81642079, + "learning_rate": 1.4391768899090219e-05, + "loss": 0.82683897, + "num_input_tokens_seen": 398143680, + "router_z_loss_mlp": 0.39770508, + "step": 4812, + "time_per_iteration": 3.057129383087158 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042174, + "balance_loss_mlp": 1.00242972, + "epoch": 0.9259330511735283, + "flos": 498967634688.0, + "grad_norm": 0.038176642495154074, + "language_loss": 0.83979654, + "learning_rate": 1.431765421986686e-05, + "loss": 0.85021836, + "num_input_tokens_seen": 398207056, + "router_z_loss_mlp": 0.3972168, + "step": 4813, + "time_per_iteration": 2.546762704849243 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01041888, + "balance_loss_mlp": 1.00211966, + "epoch": 0.9261254328587919, + "flos": 628016788992.0, + "grad_norm": 0.12158463805701603, + "language_loss": 0.79614502, + "learning_rate": 1.424372809925273e-05, + "loss": 0.80656391, + "num_input_tokens_seen": 398277472, + "router_z_loss_mlp": 0.39746094, + "step": 4814, + "time_per_iteration": 2.792081594467163 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043933, + "balance_loss_mlp": 1.00411725, + "epoch": 0.9263178145440554, + "flos": 598493065728.0, + "grad_norm": 0.0375165100308534, + "language_loss": 0.86120522, + "learning_rate": 1.416999056594831e-05, + "loss": 0.8716445, + "num_input_tokens_seen": 398346544, + "router_z_loss_mlp": 0.39794922, + "step": 4815, + "time_per_iteration": 2.6949462890625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043148, + "balance_loss_mlp": 1.00328422, + "epoch": 0.926510196229319, + "flos": 389417240832.0, + "grad_norm": 0.035459065578210734, + "language_loss": 0.84041262, + "learning_rate": 1.4096441648581259e-05, + "loss": 0.85084414, + "num_input_tokens_seen": 398409344, + "router_z_loss_mlp": 0.3984375, + "step": 4816, + "time_per_iteration": 2.4716691970825195 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042309, + "balance_loss_mlp": 1.00254118, + "epoch": 0.9267025779145825, + "flos": 546863488512.0, + "grad_norm": 0.038061097062299015, + "language_loss": 0.84638411, + "learning_rate": 1.4023081375705737e-05, + "loss": 0.85680723, + "num_input_tokens_seen": 398478816, + "router_z_loss_mlp": 0.39746094, + "step": 4817, + "time_per_iteration": 2.611921787261963 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01041865, + "balance_loss_mlp": 1.00207257, + "epoch": 0.9268949595998461, + "flos": 500791238400.0, + "grad_norm": 0.035400343706182905, + "language_loss": 0.82393169, + "learning_rate": 1.3949909775802682e-05, + "loss": 0.83435035, + "num_input_tokens_seen": 398550384, + "router_z_loss_mlp": 0.39770508, + "step": 4818, + "time_per_iteration": 2.6435391902923584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01041883, + "balance_loss_mlp": 1.00209129, + "epoch": 0.9270873412851096, + "flos": 433739764224.0, + "grad_norm": 0.03314348914664278, + "language_loss": 0.82907271, + "learning_rate": 1.3876926877279817e-05, + "loss": 0.83949155, + "num_input_tokens_seen": 398620832, + "router_z_loss_mlp": 0.39770508, + "step": 4819, + "time_per_iteration": 2.6455531120300293 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043568, + "balance_loss_mlp": 1.00380015, + "epoch": 0.9272797229703732, + "flos": 467803054848.0, + "grad_norm": 0.03754391760651958, + "language_loss": 0.86858791, + "learning_rate": 1.380413270847164e-05, + "loss": 0.87902355, + "num_input_tokens_seen": 398689776, + "router_z_loss_mlp": 0.39746094, + "step": 4820, + "time_per_iteration": 2.6332101821899414 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043167, + "balance_loss_mlp": 1.00337493, + "epoch": 0.9274721046556368, + "flos": 706250958336.0, + "grad_norm": 0.03606074012081537, + "language_loss": 0.79443467, + "learning_rate": 1.373152729763938e-05, + "loss": 0.80486631, + "num_input_tokens_seen": 398775072, + "router_z_loss_mlp": 0.39770508, + "step": 4821, + "time_per_iteration": 3.026251792907715 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043518, + "balance_loss_mlp": 1.00460815, + "epoch": 0.9276644863409004, + "flos": 1405345529088.0, + "grad_norm": 0.00577391953495314, + "language_loss": 0.82380462, + "learning_rate": 1.3659110672970931e-05, + "loss": 0.83423984, + "num_input_tokens_seen": 399002016, + "router_z_loss_mlp": 0.38867188, + "step": 4822, + "time_per_iteration": 4.8578009605407715 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01041785, + "balance_loss_mlp": 1.00189805, + "epoch": 0.927856868026164, + "flos": 743138115840.0, + "grad_norm": 0.034922429944732755, + "language_loss": 0.80315673, + "learning_rate": 1.3586882862580917e-05, + "loss": 0.81357461, + "num_input_tokens_seen": 399085808, + "router_z_loss_mlp": 0.39868164, + "step": 4823, + "time_per_iteration": 3.027656316757202 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01041798, + "balance_loss_mlp": 1.00193429, + "epoch": 0.9280492497114274, + "flos": 413123122176.0, + "grad_norm": 0.038779336552669824, + "language_loss": 0.743617, + "learning_rate": 1.3514843894510686e-05, + "loss": 0.754035, + "num_input_tokens_seen": 399146768, + "router_z_loss_mlp": 0.3984375, + "step": 4824, + "time_per_iteration": 2.49405837059021 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01041752, + "balance_loss_mlp": 1.00195956, + "epoch": 0.928241631396691, + "flos": 647665248768.0, + "grad_norm": 0.03926006290923404, + "language_loss": 0.84507549, + "learning_rate": 1.3442993796728254e-05, + "loss": 0.85549301, + "num_input_tokens_seen": 399220192, + "router_z_loss_mlp": 0.39770508, + "step": 4825, + "time_per_iteration": 2.8209800720214844 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01041811, + "balance_loss_mlp": 1.00201905, + "epoch": 0.9284340130819546, + "flos": 698129312256.0, + "grad_norm": 0.035194877667594326, + "language_loss": 0.81097031, + "learning_rate": 1.3371332597128249e-05, + "loss": 0.82138836, + "num_input_tokens_seen": 399300064, + "router_z_loss_mlp": 0.39770508, + "step": 4826, + "time_per_iteration": 2.9241714477539062 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01041901, + "balance_loss_mlp": 1.00213277, + "epoch": 0.9286263947672182, + "flos": 760544009472.0, + "grad_norm": 0.03422053226269929, + "language_loss": 0.83855939, + "learning_rate": 1.3299860323532032e-05, + "loss": 0.8489784, + "num_input_tokens_seen": 399383200, + "router_z_loss_mlp": 0.39746094, + "step": 4827, + "time_per_iteration": 3.002506732940674 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044201, + "balance_loss_mlp": 1.00445616, + "epoch": 0.9288187764524817, + "flos": 674141528832.0, + "grad_norm": 0.034644572167621394, + "language_loss": 0.80734801, + "learning_rate": 1.3228577003687681e-05, + "loss": 0.81779003, + "num_input_tokens_seen": 399466400, + "router_z_loss_mlp": 0.3972168, + "step": 4828, + "time_per_iteration": 2.9348583221435547 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044093, + "balance_loss_mlp": 1.00432444, + "epoch": 0.9290111581377453, + "flos": 501470715648.0, + "grad_norm": 0.034302029930516, + "language_loss": 0.84273684, + "learning_rate": 1.3157482665269727e-05, + "loss": 0.85317779, + "num_input_tokens_seen": 399533504, + "router_z_loss_mlp": 0.39746094, + "step": 4829, + "time_per_iteration": 2.578808069229126 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01041584, + "balance_loss_mlp": 1.00276947, + "epoch": 0.9292035398230089, + "flos": 1567060135680.0, + "grad_norm": 0.004229410701512915, + "language_loss": 0.72122061, + "learning_rate": 1.3086577335879424e-05, + "loss": 0.73163652, + "num_input_tokens_seen": 399769872, + "router_z_loss_mlp": 0.38769531, + "step": 4830, + "time_per_iteration": 4.9977943897247314 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01041542, + "balance_loss_mlp": 1.00263214, + "epoch": 0.9293959215082724, + "flos": 1522066883328.0, + "grad_norm": 0.0031584024576732017, + "language_loss": 0.79511833, + "learning_rate": 1.3015861043044753e-05, + "loss": 0.80553377, + "num_input_tokens_seen": 399997760, + "router_z_loss_mlp": 0.38867188, + "step": 4831, + "time_per_iteration": 4.880908966064453 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01041782, + "balance_loss_mlp": 1.0020138, + "epoch": 0.929588303193536, + "flos": 558898692864.0, + "grad_norm": 0.04711683463428304, + "language_loss": 0.84607166, + "learning_rate": 1.2945333814220195e-05, + "loss": 0.85648948, + "num_input_tokens_seen": 400063872, + "router_z_loss_mlp": 0.39746094, + "step": 4832, + "time_per_iteration": 2.6638543605804443 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104215, + "balance_loss_mlp": 1.00223875, + "epoch": 0.9297806848787995, + "flos": 479551499520.0, + "grad_norm": 0.04844473198255446, + "language_loss": 0.80835903, + "learning_rate": 1.2874995676786905e-05, + "loss": 0.81878054, + "num_input_tokens_seen": 400126064, + "router_z_loss_mlp": 0.39892578, + "step": 4833, + "time_per_iteration": 2.5647528171539307 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042684, + "balance_loss_mlp": 1.00291598, + "epoch": 0.9299730665640631, + "flos": 565654581504.0, + "grad_norm": 0.04014985005184455, + "language_loss": 0.79991281, + "learning_rate": 1.2804846658052372e-05, + "loss": 0.81033969, + "num_input_tokens_seen": 400201776, + "router_z_loss_mlp": 0.39746094, + "step": 4834, + "time_per_iteration": 2.8138298988342285 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042801, + "balance_loss_mlp": 1.0030086, + "epoch": 0.9301654482493267, + "flos": 561343448064.0, + "grad_norm": 0.0350160959737894, + "language_loss": 0.82770115, + "learning_rate": 1.2734886785251032e-05, + "loss": 0.83812916, + "num_input_tokens_seen": 400279504, + "router_z_loss_mlp": 0.39770508, + "step": 4835, + "time_per_iteration": 2.7824862003326416 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104327, + "balance_loss_mlp": 1.00445557, + "epoch": 0.9303578299345903, + "flos": 1523490966528.0, + "grad_norm": 0.0038776105750953234, + "language_loss": 0.76852441, + "learning_rate": 1.2665116085543715e-05, + "loss": 0.77895713, + "num_input_tokens_seen": 400514800, + "router_z_loss_mlp": 0.38769531, + "step": 4836, + "time_per_iteration": 4.964673757553101 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042068, + "balance_loss_mlp": 1.00239527, + "epoch": 0.9305502116198537, + "flos": 531860553984.0, + "grad_norm": 0.04009771865734981, + "language_loss": 0.83090073, + "learning_rate": 1.2595534586017698e-05, + "loss": 0.84132147, + "num_input_tokens_seen": 400582640, + "router_z_loss_mlp": 0.39648438, + "step": 4837, + "time_per_iteration": 2.6505465507507324 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104178, + "balance_loss_mlp": 1.00201166, + "epoch": 0.9307425933051173, + "flos": 475856659968.0, + "grad_norm": 0.04414875410178277, + "language_loss": 0.82107651, + "learning_rate": 1.2526142313686983e-05, + "loss": 0.83149433, + "num_input_tokens_seen": 400646912, + "router_z_loss_mlp": 0.39746094, + "step": 4838, + "time_per_iteration": 2.5214364528656006 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01041893, + "balance_loss_mlp": 1.00214899, + "epoch": 0.9309349749903809, + "flos": 586065143808.0, + "grad_norm": 0.03586401591983268, + "language_loss": 0.87220377, + "learning_rate": 1.245693929549213e-05, + "loss": 0.88262272, + "num_input_tokens_seen": 400722128, + "router_z_loss_mlp": 0.3972168, + "step": 4839, + "time_per_iteration": 2.7310404777526855 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01041852, + "balance_loss_mlp": 1.0021075, + "epoch": 0.9311273566756445, + "flos": 863143128576.0, + "grad_norm": 0.04744165409603807, + "language_loss": 0.77044845, + "learning_rate": 1.2387925558299984e-05, + "loss": 0.78086698, + "num_input_tokens_seen": 400801440, + "router_z_loss_mlp": 0.3972168, + "step": 4840, + "time_per_iteration": 3.0941414833068848 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042216, + "balance_loss_mlp": 1.00242364, + "epoch": 0.9313197383609081, + "flos": 549162435072.0, + "grad_norm": 0.035993751818789214, + "language_loss": 0.82954288, + "learning_rate": 1.231910112890411e-05, + "loss": 0.83996511, + "num_input_tokens_seen": 400873008, + "router_z_loss_mlp": 0.39770508, + "step": 4841, + "time_per_iteration": 2.6443285942077637 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01041987, + "balance_loss_mlp": 1.00221896, + "epoch": 0.9315121200461716, + "flos": 469704426240.0, + "grad_norm": 0.05808949887370269, + "language_loss": 0.81514281, + "learning_rate": 1.2250466034024522e-05, + "loss": 0.82556272, + "num_input_tokens_seen": 400935328, + "router_z_loss_mlp": 0.39746094, + "step": 4842, + "time_per_iteration": 2.527540922164917 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01041988, + "balance_loss_mlp": 1.00222027, + "epoch": 0.9317045017314352, + "flos": 418558940160.0, + "grad_norm": 0.037955527047526506, + "language_loss": 0.7833854, + "learning_rate": 1.2182020300307684e-05, + "loss": 0.79380524, + "num_input_tokens_seen": 401000720, + "router_z_loss_mlp": 0.39746094, + "step": 4843, + "time_per_iteration": 2.509636640548706 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042112, + "balance_loss_mlp": 1.0023675, + "epoch": 0.9318968834166987, + "flos": 541621111296.0, + "grad_norm": 0.03533166990442565, + "language_loss": 0.77478361, + "learning_rate": 1.2113763954326729e-05, + "loss": 0.78520471, + "num_input_tokens_seen": 401079664, + "router_z_loss_mlp": 0.3972168, + "step": 4844, + "time_per_iteration": 2.7669665813446045 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042379, + "balance_loss_mlp": 1.00256312, + "epoch": 0.9320892651019623, + "flos": 522347872512.0, + "grad_norm": 0.04071468473445459, + "language_loss": 0.81091493, + "learning_rate": 1.2045697022581015e-05, + "loss": 0.82133877, + "num_input_tokens_seen": 401146160, + "router_z_loss_mlp": 0.39794922, + "step": 4845, + "time_per_iteration": 2.5971779823303223 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01041765, + "balance_loss_mlp": 1.0020926, + "epoch": 0.9322816467872258, + "flos": 583253915904.0, + "grad_norm": 0.03228832372989255, + "language_loss": 0.80980742, + "learning_rate": 1.1977819531496348e-05, + "loss": 0.82022512, + "num_input_tokens_seen": 401223264, + "router_z_loss_mlp": 0.39648438, + "step": 4846, + "time_per_iteration": 2.7529783248901367 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042078, + "balance_loss_mlp": 1.00233305, + "epoch": 0.9324740284724894, + "flos": 485803855104.0, + "grad_norm": 0.045024241247814824, + "language_loss": 0.82244754, + "learning_rate": 1.191013150742537e-05, + "loss": 0.83286834, + "num_input_tokens_seen": 401296368, + "router_z_loss_mlp": 0.3972168, + "step": 4847, + "time_per_iteration": 2.714202880859375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010423, + "balance_loss_mlp": 1.00253224, + "epoch": 0.932666410157753, + "flos": 734024954880.0, + "grad_norm": 0.0379301042606838, + "language_loss": 0.83348429, + "learning_rate": 1.1842632976646672e-05, + "loss": 0.8439073, + "num_input_tokens_seen": 401383936, + "router_z_loss_mlp": 0.39746094, + "step": 4848, + "time_per_iteration": 3.028153419494629 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042085, + "balance_loss_mlp": 1.00234032, + "epoch": 0.9328587918430166, + "flos": 967181882112.0, + "grad_norm": 0.038825479032873554, + "language_loss": 0.79343307, + "learning_rate": 1.1775323965365681e-05, + "loss": 0.80385393, + "num_input_tokens_seen": 401468784, + "router_z_loss_mlp": 0.3972168, + "step": 4849, + "time_per_iteration": 3.2317538261413574 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01041875, + "balance_loss_mlp": 1.00210679, + "epoch": 0.9330511735282802, + "flos": 615684131328.0, + "grad_norm": 0.04256976645826645, + "language_loss": 0.80641079, + "learning_rate": 1.1708204499713936e-05, + "loss": 0.81682956, + "num_input_tokens_seen": 401539712, + "router_z_loss_mlp": 0.39746094, + "step": 4850, + "time_per_iteration": 2.6996281147003174 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01041781, + "balance_loss_mlp": 1.00201249, + "epoch": 0.9332435552135436, + "flos": 560218763520.0, + "grad_norm": 0.03532928511603242, + "language_loss": 0.86379266, + "learning_rate": 1.1641274605749653e-05, + "loss": 0.87421048, + "num_input_tokens_seen": 401610432, + "router_z_loss_mlp": 0.39746094, + "step": 4851, + "time_per_iteration": 2.7163102626800537 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044057, + "balance_loss_mlp": 1.00426483, + "epoch": 0.9334359368988072, + "flos": 516558220800.0, + "grad_norm": 0.04157190776063736, + "language_loss": 0.82472408, + "learning_rate": 1.1574534309457208e-05, + "loss": 0.83516461, + "num_input_tokens_seen": 401677344, + "router_z_loss_mlp": 0.39770508, + "step": 4852, + "time_per_iteration": 2.5901753902435303 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104426, + "balance_loss_mlp": 1.0044682, + "epoch": 0.9336283185840708, + "flos": 540941634048.0, + "grad_norm": 0.030952473235186646, + "language_loss": 0.83399379, + "learning_rate": 1.1507983636747488e-05, + "loss": 0.84443641, + "num_input_tokens_seen": 401756864, + "router_z_loss_mlp": 0.39770508, + "step": 4853, + "time_per_iteration": 2.754560947418213 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045902, + "balance_loss_mlp": 1.00708771, + "epoch": 0.9338207002693344, + "flos": 1566124034304.0, + "grad_norm": 0.007144453314513685, + "language_loss": 0.78455019, + "learning_rate": 1.1441622613457824e-05, + "loss": 0.7950092, + "num_input_tokens_seen": 401983664, + "router_z_loss_mlp": 0.38769531, + "step": 4854, + "time_per_iteration": 4.874703407287598 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01041622, + "balance_loss_mlp": 1.0018059, + "epoch": 0.9340130819545979, + "flos": 646508483328.0, + "grad_norm": 0.052007470771972146, + "language_loss": 0.81776142, + "learning_rate": 1.1375451265351833e-05, + "loss": 0.82817757, + "num_input_tokens_seen": 402065744, + "router_z_loss_mlp": 0.39794922, + "step": 4855, + "time_per_iteration": 2.941030263900757 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104171, + "balance_loss_mlp": 1.00182235, + "epoch": 0.9342054636398615, + "flos": 504512322816.0, + "grad_norm": 0.03777189969793054, + "language_loss": 0.77505141, + "learning_rate": 1.1309469618119516e-05, + "loss": 0.78546846, + "num_input_tokens_seen": 402137728, + "router_z_loss_mlp": 0.39868164, + "step": 4856, + "time_per_iteration": 2.7442283630371094 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01041841, + "balance_loss_mlp": 1.00202465, + "epoch": 0.934397845325125, + "flos": 594236367360.0, + "grad_norm": 0.030117925829334365, + "language_loss": 0.84709656, + "learning_rate": 1.1243677697377109e-05, + "loss": 0.85751504, + "num_input_tokens_seen": 402220160, + "router_z_loss_mlp": 0.39794922, + "step": 4857, + "time_per_iteration": 2.87604022026062 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01041577, + "balance_loss_mlp": 1.00176144, + "epoch": 0.9345902270103886, + "flos": 500884557312.0, + "grad_norm": 0.03501485016862442, + "language_loss": 0.80603421, + "learning_rate": 1.1178075528667453e-05, + "loss": 0.81644994, + "num_input_tokens_seen": 402285168, + "router_z_loss_mlp": 0.39794922, + "step": 4858, + "time_per_iteration": 2.5634093284606934 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01040668, + "balance_loss_mlp": 1.00185394, + "epoch": 0.9347826086956522, + "flos": 1523407374336.0, + "grad_norm": 0.004505904699897048, + "language_loss": 0.7598772, + "learning_rate": 1.1112663137459566e-05, + "loss": 0.77028388, + "num_input_tokens_seen": 402504912, + "router_z_loss_mlp": 0.38769531, + "step": 4859, + "time_per_iteration": 4.670670509338379 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01041982, + "balance_loss_mlp": 1.0022136, + "epoch": 0.9349749903809157, + "flos": 505665197568.0, + "grad_norm": 0.03170584508353696, + "language_loss": 0.81696236, + "learning_rate": 1.1047440549148636e-05, + "loss": 0.82738221, + "num_input_tokens_seen": 402582032, + "router_z_loss_mlp": 0.39746094, + "step": 4860, + "time_per_iteration": 2.8271596431732178 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01041903, + "balance_loss_mlp": 1.00215864, + "epoch": 0.9351673720661793, + "flos": 569965714944.0, + "grad_norm": 0.04706495104714966, + "language_loss": 0.79096204, + "learning_rate": 1.0982407789056514e-05, + "loss": 0.80138111, + "num_input_tokens_seen": 402650144, + "router_z_loss_mlp": 0.3972168, + "step": 4861, + "time_per_iteration": 2.639396905899048 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01041923, + "balance_loss_mlp": 1.00215459, + "epoch": 0.9353597537514429, + "flos": 545662981632.0, + "grad_norm": 0.03639072385040861, + "language_loss": 0.86677241, + "learning_rate": 1.0917564882430952e-05, + "loss": 0.87719166, + "num_input_tokens_seen": 402720368, + "router_z_loss_mlp": 0.39746094, + "step": 4862, + "time_per_iteration": 2.611056089401245 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01041985, + "balance_loss_mlp": 1.0022645, + "epoch": 0.9355521354367065, + "flos": 520020735744.0, + "grad_norm": 0.033869901697529216, + "language_loss": 0.85065103, + "learning_rate": 1.0852911854446368e-05, + "loss": 0.86107087, + "num_input_tokens_seen": 402795568, + "router_z_loss_mlp": 0.39697266, + "step": 4863, + "time_per_iteration": 2.727292537689209 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044723, + "balance_loss_mlp": 1.00495505, + "epoch": 0.93574451712197, + "flos": 447235990272.0, + "grad_norm": 0.04078440918447503, + "language_loss": 0.79057157, + "learning_rate": 1.0788448730203237e-05, + "loss": 0.80101883, + "num_input_tokens_seen": 402858784, + "router_z_loss_mlp": 0.39746094, + "step": 4864, + "time_per_iteration": 2.485029458999634 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044092, + "balance_loss_mlp": 1.0043242, + "epoch": 0.9359368988072335, + "flos": 481496612352.0, + "grad_norm": 0.04601803612471293, + "language_loss": 0.77498388, + "learning_rate": 1.072417553472832e-05, + "loss": 0.78542477, + "num_input_tokens_seen": 402924144, + "router_z_loss_mlp": 0.39746094, + "step": 4865, + "time_per_iteration": 2.5250749588012695 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042073, + "balance_loss_mlp": 1.00230527, + "epoch": 0.9361292804924971, + "flos": 498092771328.0, + "grad_norm": 0.03842244559328987, + "language_loss": 0.85445625, + "learning_rate": 1.0660092292974766e-05, + "loss": 0.86487693, + "num_input_tokens_seen": 402987488, + "router_z_loss_mlp": 0.39746094, + "step": 4866, + "time_per_iteration": 2.5852344036102295 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104214, + "balance_loss_mlp": 1.0023483, + "epoch": 0.9363216621777607, + "flos": 619294400256.0, + "grad_norm": 0.036496484155024655, + "language_loss": 0.84628427, + "learning_rate": 1.059619902982184e-05, + "loss": 0.85670567, + "num_input_tokens_seen": 403058224, + "router_z_loss_mlp": 0.39770508, + "step": 4867, + "time_per_iteration": 2.7684035301208496 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042965, + "balance_loss_mlp": 1.00415039, + "epoch": 0.9365140438630243, + "flos": 1418982706176.0, + "grad_norm": 0.003430568488681332, + "language_loss": 0.79203337, + "learning_rate": 1.053249577007509e-05, + "loss": 0.80246305, + "num_input_tokens_seen": 403289072, + "router_z_loss_mlp": 0.38769531, + "step": 4868, + "time_per_iteration": 4.8999762535095215 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01041811, + "balance_loss_mlp": 1.00201857, + "epoch": 0.9367064255482878, + "flos": 591650661120.0, + "grad_norm": 0.03595906257463508, + "language_loss": 0.8194294, + "learning_rate": 1.0468982538466287e-05, + "loss": 0.82984746, + "num_input_tokens_seen": 403361728, + "router_z_loss_mlp": 0.39770508, + "step": 4869, + "time_per_iteration": 2.717907667160034 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042024, + "balance_loss_mlp": 1.00230348, + "epoch": 0.9368988072335513, + "flos": 527653433088.0, + "grad_norm": 0.03657382574202116, + "language_loss": 0.82250357, + "learning_rate": 1.0405659359653597e-05, + "loss": 0.83292383, + "num_input_tokens_seen": 403431536, + "router_z_loss_mlp": 0.39697266, + "step": 4870, + "time_per_iteration": 2.7170798778533936 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104236, + "balance_loss_mlp": 1.00259185, + "epoch": 0.9370911889188149, + "flos": 744509709312.0, + "grad_norm": 0.03512967184309728, + "language_loss": 0.79642439, + "learning_rate": 1.034252625822113e-05, + "loss": 0.80684793, + "num_input_tokens_seen": 403504768, + "router_z_loss_mlp": 0.39746094, + "step": 4871, + "time_per_iteration": 2.9354794025421143 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044725, + "balance_loss_mlp": 1.00498092, + "epoch": 0.9372835706040785, + "flos": 547078316544.0, + "grad_norm": 0.03351822012492755, + "language_loss": 0.79116702, + "learning_rate": 1.0279583258679448e-05, + "loss": 0.80161428, + "num_input_tokens_seen": 403575584, + "router_z_loss_mlp": 0.3972168, + "step": 4872, + "time_per_iteration": 2.702842950820923 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042221, + "balance_loss_mlp": 1.00245237, + "epoch": 0.9374759522893421, + "flos": 492700694784.0, + "grad_norm": 0.04331682149592437, + "language_loss": 0.8190614, + "learning_rate": 1.0216830385465003e-05, + "loss": 0.82948351, + "num_input_tokens_seen": 403648720, + "router_z_loss_mlp": 0.39746094, + "step": 4873, + "time_per_iteration": 2.685476541519165 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104224, + "balance_loss_mlp": 1.0024718, + "epoch": 0.9376683339746056, + "flos": 579532831488.0, + "grad_norm": 0.04000808735517868, + "language_loss": 0.82723129, + "learning_rate": 1.0154267662940809e-05, + "loss": 0.83765376, + "num_input_tokens_seen": 403721392, + "router_z_loss_mlp": 0.39746094, + "step": 4874, + "time_per_iteration": 2.670814275741577 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01041882, + "balance_loss_mlp": 1.0021373, + "epoch": 0.9378607156598692, + "flos": 507297305856.0, + "grad_norm": 0.04201396587358986, + "language_loss": 0.80951202, + "learning_rate": 1.0091895115395766e-05, + "loss": 0.81993079, + "num_input_tokens_seen": 403792112, + "router_z_loss_mlp": 0.3972168, + "step": 4875, + "time_per_iteration": 2.5980563163757324 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042, + "balance_loss_mlp": 1.0022316, + "epoch": 0.9380530973451328, + "flos": 521071543296.0, + "grad_norm": 0.03892483876210915, + "language_loss": 0.78012693, + "learning_rate": 1.0029712767045062e-05, + "loss": 0.79054689, + "num_input_tokens_seen": 403860928, + "router_z_loss_mlp": 0.39746094, + "step": 4876, + "time_per_iteration": 2.693706750869751 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01041893, + "balance_loss_mlp": 1.00212526, + "epoch": 0.9382454790303963, + "flos": 558870502656.0, + "grad_norm": 0.03481280964401, + "language_loss": 0.85469687, + "learning_rate": 9.967720642029999e-06, + "loss": 0.86511576, + "num_input_tokens_seen": 403928240, + "router_z_loss_mlp": 0.39746094, + "step": 4877, + "time_per_iteration": 2.716329336166382 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01041871, + "balance_loss_mlp": 1.00217426, + "epoch": 0.9384378607156598, + "flos": 696787854336.0, + "grad_norm": 0.03644452111006662, + "language_loss": 0.82310647, + "learning_rate": 9.905918764418153e-06, + "loss": 0.83352518, + "num_input_tokens_seen": 404004320, + "router_z_loss_mlp": 0.39672852, + "step": 4878, + "time_per_iteration": 2.9522945880889893 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01041828, + "balance_loss_mlp": 1.0020355, + "epoch": 0.9386302424009234, + "flos": 555835698432.0, + "grad_norm": 0.038143529458428554, + "language_loss": 0.81298959, + "learning_rate": 9.844307158203058e-06, + "loss": 0.82340783, + "num_input_tokens_seen": 404077040, + "router_z_loss_mlp": 0.39770508, + "step": 4879, + "time_per_iteration": 2.6760354042053223 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01041733, + "balance_loss_mlp": 1.00196457, + "epoch": 0.938822624086187, + "flos": 568066288896.0, + "grad_norm": 0.042500450188158845, + "language_loss": 0.80407965, + "learning_rate": 9.782885847304469e-06, + "loss": 0.81449699, + "num_input_tokens_seen": 404145248, + "router_z_loss_mlp": 0.39746094, + "step": 4880, + "time_per_iteration": 2.660008430480957 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01041615, + "balance_loss_mlp": 1.00182331, + "epoch": 0.9390150057714506, + "flos": 418548246528.0, + "grad_norm": 0.042624887174036764, + "language_loss": 0.80474532, + "learning_rate": 9.721654855568196e-06, + "loss": 0.81516147, + "num_input_tokens_seen": 404212000, + "router_z_loss_mlp": 0.39770508, + "step": 4881, + "time_per_iteration": 2.586818218231201 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01041709, + "balance_loss_mlp": 1.00186956, + "epoch": 0.9392073874567142, + "flos": 1556084510208.0, + "grad_norm": 0.037573723473785056, + "language_loss": 0.76786673, + "learning_rate": 9.660614206766394e-06, + "loss": 0.77828383, + "num_input_tokens_seen": 404305408, + "router_z_loss_mlp": 0.39819336, + "step": 4882, + "time_per_iteration": 3.7182722091674805 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043531, + "balance_loss_mlp": 1.00369167, + "epoch": 0.9393997691419776, + "flos": 653732911872.0, + "grad_norm": 0.03712425994747192, + "language_loss": 0.78558093, + "learning_rate": 9.59976392459705e-06, + "loss": 0.79601628, + "num_input_tokens_seen": 404383248, + "router_z_loss_mlp": 0.39819336, + "step": 4883, + "time_per_iteration": 2.7739627361297607 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044121, + "balance_loss_mlp": 1.00521088, + "epoch": 0.9395921508272412, + "flos": 1556565666048.0, + "grad_norm": 0.005772326140838005, + "language_loss": 0.78170681, + "learning_rate": 9.539104032684209e-06, + "loss": 0.79214799, + "num_input_tokens_seen": 404615264, + "router_z_loss_mlp": 0.38867188, + "step": 4884, + "time_per_iteration": 4.832986116409302 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104324, + "balance_loss_mlp": 1.00337636, + "epoch": 0.9397845325125048, + "flos": 499198013952.0, + "grad_norm": 0.03489865319805655, + "language_loss": 0.79100162, + "learning_rate": 9.478634554578314e-06, + "loss": 0.80143404, + "num_input_tokens_seen": 404684656, + "router_z_loss_mlp": 0.3984375, + "step": 4885, + "time_per_iteration": 2.6249029636383057 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043291, + "balance_loss_mlp": 1.00345111, + "epoch": 0.9399769141977684, + "flos": 499590731520.0, + "grad_norm": 0.038272147047931145, + "language_loss": 0.84108281, + "learning_rate": 9.418355513755638e-06, + "loss": 0.85151565, + "num_input_tokens_seen": 404752096, + "router_z_loss_mlp": 0.39819336, + "step": 4886, + "time_per_iteration": 2.602886199951172 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01041187, + "balance_loss_mlp": 1.00218201, + "epoch": 0.9401692958830319, + "flos": 1405677975552.0, + "grad_norm": 0.003077659566733789, + "language_loss": 0.79332191, + "learning_rate": 9.358266933618575e-06, + "loss": 0.80373377, + "num_input_tokens_seen": 404980944, + "router_z_loss_mlp": 0.38964844, + "step": 4887, + "time_per_iteration": 4.847235202789307 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042464, + "balance_loss_mlp": 1.00264823, + "epoch": 0.9403616775682955, + "flos": 541212842496.0, + "grad_norm": 0.02984190168493181, + "language_loss": 0.85387403, + "learning_rate": 9.298368837495575e-06, + "loss": 0.8642987, + "num_input_tokens_seen": 405056688, + "router_z_loss_mlp": 0.39794922, + "step": 4888, + "time_per_iteration": 2.7261717319488525 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042332, + "balance_loss_mlp": 1.00351715, + "epoch": 0.9405540592535591, + "flos": 1324940725248.0, + "grad_norm": 0.0037373643896522297, + "language_loss": 0.75169432, + "learning_rate": 9.238661248641089e-06, + "loss": 0.76211762, + "num_input_tokens_seen": 405284656, + "router_z_loss_mlp": 0.38769531, + "step": 4889, + "time_per_iteration": 4.894392490386963 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042718, + "balance_loss_mlp": 1.0029496, + "epoch": 0.9407464409388226, + "flos": 573428229888.0, + "grad_norm": 0.03814373852093679, + "language_loss": 0.8372674, + "learning_rate": 9.179144190235799e-06, + "loss": 0.84769458, + "num_input_tokens_seen": 405351584, + "router_z_loss_mlp": 0.39746094, + "step": 4890, + "time_per_iteration": 2.6488852500915527 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042352, + "balance_loss_mlp": 1.0025363, + "epoch": 0.9409388226240862, + "flos": 512349154560.0, + "grad_norm": 0.03178395215267038, + "language_loss": 0.77375114, + "learning_rate": 9.119817685386112e-06, + "loss": 0.78417468, + "num_input_tokens_seen": 405425712, + "router_z_loss_mlp": 0.39794922, + "step": 4891, + "time_per_iteration": 2.7707180976867676 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043049, + "balance_loss_mlp": 1.00423431, + "epoch": 0.9411312043093497, + "flos": 1573279443456.0, + "grad_norm": 0.004890729003859763, + "language_loss": 0.80241883, + "learning_rate": 9.06068175712471e-06, + "loss": 0.81284934, + "num_input_tokens_seen": 405655760, + "router_z_loss_mlp": 0.38769531, + "step": 4892, + "time_per_iteration": 4.890718936920166 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01041736, + "balance_loss_mlp": 1.0018965, + "epoch": 0.9413235859946133, + "flos": 570560621568.0, + "grad_norm": 0.03944178143363227, + "language_loss": 0.78793794, + "learning_rate": 9.001736428410234e-06, + "loss": 0.79835528, + "num_input_tokens_seen": 405731664, + "router_z_loss_mlp": 0.39819336, + "step": 4893, + "time_per_iteration": 2.7482073307037354 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01041524, + "balance_loss_mlp": 1.00170851, + "epoch": 0.9415159676798769, + "flos": 783266157312.0, + "grad_norm": 0.036973698375778005, + "language_loss": 0.80884314, + "learning_rate": 8.942981722127263e-06, + "loss": 0.81925839, + "num_input_tokens_seen": 405808128, + "router_z_loss_mlp": 0.39794922, + "step": 4894, + "time_per_iteration": 3.00976300239563 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01041503, + "balance_loss_mlp": 1.00163937, + "epoch": 0.9417083493651405, + "flos": 850873654272.0, + "grad_norm": 0.031255435914645295, + "language_loss": 0.8029865, + "learning_rate": 8.884417661086331e-06, + "loss": 0.81340152, + "num_input_tokens_seen": 405892448, + "router_z_loss_mlp": 0.3984375, + "step": 4895, + "time_per_iteration": 3.159879684448242 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01041632, + "balance_loss_mlp": 1.00181615, + "epoch": 0.941900731050404, + "flos": 530452022016.0, + "grad_norm": 0.0360566247234317, + "language_loss": 0.86200356, + "learning_rate": 8.826044268024025e-06, + "loss": 0.87241995, + "num_input_tokens_seen": 405966736, + "router_z_loss_mlp": 0.39794922, + "step": 4896, + "time_per_iteration": 2.744678497314453 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01041183, + "balance_loss_mlp": 1.00124776, + "epoch": 0.9420931127356675, + "flos": 558171583488.0, + "grad_norm": 0.0335549347969353, + "language_loss": 0.80863327, + "learning_rate": 8.767861565602997e-06, + "loss": 0.81904507, + "num_input_tokens_seen": 406043264, + "router_z_loss_mlp": 0.39916992, + "step": 4897, + "time_per_iteration": 2.7431249618530273 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01041244, + "balance_loss_mlp": 1.00145209, + "epoch": 0.9422854944209311, + "flos": 653787346944.0, + "grad_norm": 0.03717820296963101, + "language_loss": 0.86876309, + "learning_rate": 8.709869576411733e-06, + "loss": 0.87917554, + "num_input_tokens_seen": 406119552, + "router_z_loss_mlp": 0.39770508, + "step": 4898, + "time_per_iteration": 2.840428590774536 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010414, + "balance_loss_mlp": 1.00148833, + "epoch": 0.9424778761061947, + "flos": 554765448960.0, + "grad_norm": 0.034123182933214626, + "language_loss": 0.84605157, + "learning_rate": 8.65206832296478e-06, + "loss": 0.85646552, + "num_input_tokens_seen": 406192464, + "router_z_loss_mlp": 0.39892578, + "step": 4899, + "time_per_iteration": 2.699169874191284 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042181, + "balance_loss_mlp": 1.00226939, + "epoch": 0.9426702577914583, + "flos": 589651113216.0, + "grad_norm": 0.03867302654620552, + "language_loss": 0.80447572, + "learning_rate": 8.594457827702406e-06, + "loss": 0.81489754, + "num_input_tokens_seen": 406262640, + "router_z_loss_mlp": 0.39892578, + "step": 4900, + "time_per_iteration": 2.6918928623199463 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042373, + "balance_loss_mlp": 1.00243771, + "epoch": 0.9428626394767218, + "flos": 617813936640.0, + "grad_norm": 0.04034073488325009, + "language_loss": 0.79256618, + "learning_rate": 8.537038112991114e-06, + "loss": 0.80298996, + "num_input_tokens_seen": 406341328, + "router_z_loss_mlp": 0.39916992, + "step": 4901, + "time_per_iteration": 2.7796003818511963 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042102, + "balance_loss_mlp": 1.00216651, + "epoch": 0.9430550211619854, + "flos": 611542139136.0, + "grad_norm": 0.03752806991208156, + "language_loss": 0.82370108, + "learning_rate": 8.479809201123178e-06, + "loss": 0.83412206, + "num_input_tokens_seen": 406418864, + "router_z_loss_mlp": 0.39916992, + "step": 4902, + "time_per_iteration": 2.7410826683044434 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104199, + "balance_loss_mlp": 1.00207889, + "epoch": 0.943247402847249, + "flos": 567052419840.0, + "grad_norm": 0.040940659305077086, + "language_loss": 0.78541058, + "learning_rate": 8.422771114316885e-06, + "loss": 0.79583043, + "num_input_tokens_seen": 406492320, + "router_z_loss_mlp": 0.39892578, + "step": 4903, + "time_per_iteration": 2.7239608764648438 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042419, + "balance_loss_mlp": 1.00243592, + "epoch": 0.9434397845325125, + "flos": 528089892096.0, + "grad_norm": 0.04019064701674444, + "language_loss": 0.82004517, + "learning_rate": 8.365923874716297e-06, + "loss": 0.83046937, + "num_input_tokens_seen": 406560448, + "router_z_loss_mlp": 0.3996582, + "step": 4904, + "time_per_iteration": 2.6455512046813965 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042194, + "balance_loss_mlp": 1.00237763, + "epoch": 0.943632166217776, + "flos": 594592146432.0, + "grad_norm": 0.03750564487525279, + "language_loss": 0.83164895, + "learning_rate": 8.309267504391593e-06, + "loss": 0.84207094, + "num_input_tokens_seen": 406631376, + "router_z_loss_mlp": 0.39794922, + "step": 4905, + "time_per_iteration": 2.753347873687744 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104299, + "balance_loss_mlp": 1.00305521, + "epoch": 0.9438245479030396, + "flos": 573982307328.0, + "grad_norm": 0.028646757212572906, + "language_loss": 0.85765415, + "learning_rate": 8.252802025338623e-06, + "loss": 0.86808407, + "num_input_tokens_seen": 406713728, + "router_z_loss_mlp": 0.39916992, + "step": 4906, + "time_per_iteration": 2.8059747219085693 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042995, + "balance_loss_mlp": 1.00305963, + "epoch": 0.9440169295883032, + "flos": 489222628608.0, + "grad_norm": 0.03890561239710249, + "language_loss": 0.82264918, + "learning_rate": 8.196527459479242e-06, + "loss": 0.8330791, + "num_input_tokens_seen": 406779168, + "router_z_loss_mlp": 0.39916992, + "step": 4907, + "time_per_iteration": 2.5627827644348145 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01041352, + "balance_loss_mlp": 1.00141716, + "epoch": 0.9442093112735668, + "flos": 733123846656.0, + "grad_norm": 0.03542647022861663, + "language_loss": 0.74123418, + "learning_rate": 8.140443828661137e-06, + "loss": 0.75164777, + "num_input_tokens_seen": 406860816, + "router_z_loss_mlp": 0.39916992, + "step": 4908, + "time_per_iteration": 2.999734401702881 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01041412, + "balance_loss_mlp": 1.00152421, + "epoch": 0.9444016929588304, + "flos": 572106213888.0, + "grad_norm": 0.04040136580220783, + "language_loss": 0.82575059, + "learning_rate": 8.084551154658004e-06, + "loss": 0.83616471, + "num_input_tokens_seen": 406929888, + "router_z_loss_mlp": 0.39868164, + "step": 4909, + "time_per_iteration": 2.6992504596710205 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01041075, + "balance_loss_mlp": 1.00123525, + "epoch": 0.9445940746440938, + "flos": 510312668160.0, + "grad_norm": 0.03923748527569452, + "language_loss": 0.86716592, + "learning_rate": 8.028849459169318e-06, + "loss": 0.87757671, + "num_input_tokens_seen": 406998224, + "router_z_loss_mlp": 0.39819336, + "step": 4910, + "time_per_iteration": 2.5987043380737305 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01041384, + "balance_loss_mlp": 1.00156808, + "epoch": 0.9447864563293574, + "flos": 625798522368.0, + "grad_norm": 0.03481160306135877, + "language_loss": 0.81500655, + "learning_rate": 7.97333876382028e-06, + "loss": 0.82542038, + "num_input_tokens_seen": 407075088, + "router_z_loss_mlp": 0.39794922, + "step": 4911, + "time_per_iteration": 2.808239459991455 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042566, + "balance_loss_mlp": 1.00277388, + "epoch": 0.944978838014621, + "flos": 506309681664.0, + "grad_norm": 0.037707161835398115, + "language_loss": 0.81599504, + "learning_rate": 7.918019090162098e-06, + "loss": 0.82642066, + "num_input_tokens_seen": 407147792, + "router_z_loss_mlp": 0.39770508, + "step": 4912, + "time_per_iteration": 2.7564315795898438 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042313, + "balance_loss_mlp": 1.00340271, + "epoch": 0.9451712196998846, + "flos": 1487554494720.0, + "grad_norm": 0.004835932125538659, + "language_loss": 0.78287339, + "learning_rate": 7.862890459671812e-06, + "loss": 0.79329652, + "num_input_tokens_seen": 407387216, + "router_z_loss_mlp": 0.38867188, + "step": 4913, + "time_per_iteration": 5.008509397506714 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042822, + "balance_loss_mlp": 1.00293481, + "epoch": 0.9453636013851482, + "flos": 522152486400.0, + "grad_norm": 0.04169579849524581, + "language_loss": 0.90761364, + "learning_rate": 7.80795289375219e-06, + "loss": 0.91804183, + "num_input_tokens_seen": 407457664, + "router_z_loss_mlp": 0.39868164, + "step": 4914, + "time_per_iteration": 2.7010884284973145 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01040192, + "balance_loss_mlp": 1.00128174, + "epoch": 0.9455559830704117, + "flos": 1500286672896.0, + "grad_norm": 0.004154634284500281, + "language_loss": 0.8356235, + "learning_rate": 7.75320641373195e-06, + "loss": 0.84602541, + "num_input_tokens_seen": 407700256, + "router_z_loss_mlp": 0.38867188, + "step": 4915, + "time_per_iteration": 4.9480881690979 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01041334, + "balance_loss_mlp": 1.00144696, + "epoch": 0.9457483647556753, + "flos": 499152327168.0, + "grad_norm": 0.033787532501163176, + "language_loss": 0.82344007, + "learning_rate": 7.698651040865534e-06, + "loss": 0.83385336, + "num_input_tokens_seen": 407770080, + "router_z_loss_mlp": 0.39868164, + "step": 4916, + "time_per_iteration": 2.621060609817505 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01041385, + "balance_loss_mlp": 1.00159323, + "epoch": 0.9459407464409388, + "flos": 1021119154176.0, + "grad_norm": 0.03122939977346768, + "language_loss": 0.82703984, + "learning_rate": 7.644286796333222e-06, + "loss": 0.8374536, + "num_input_tokens_seen": 407854640, + "router_z_loss_mlp": 0.39770508, + "step": 4917, + "time_per_iteration": 3.369508981704712 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042915, + "balance_loss_mlp": 1.0030514, + "epoch": 0.9461331281262024, + "flos": 514621856256.0, + "grad_norm": 0.03918316931271036, + "language_loss": 0.81608689, + "learning_rate": 7.590113701241075e-06, + "loss": 0.82651609, + "num_input_tokens_seen": 407922704, + "router_z_loss_mlp": 0.3984375, + "step": 4918, + "time_per_iteration": 2.605464458465576 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043651, + "balance_loss_mlp": 1.00376344, + "epoch": 0.9463255098114659, + "flos": 529049326080.0, + "grad_norm": 0.043499008433838054, + "language_loss": 0.78687984, + "learning_rate": 7.536131776620936e-06, + "loss": 0.79731631, + "num_input_tokens_seen": 407991136, + "router_z_loss_mlp": 0.39868164, + "step": 4919, + "time_per_iteration": 2.587440013885498 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043689, + "balance_loss_mlp": 1.00377798, + "epoch": 0.9465178914967295, + "flos": 507028042752.0, + "grad_norm": 0.04214834797927713, + "language_loss": 0.84009337, + "learning_rate": 7.482341043430485e-06, + "loss": 0.85053033, + "num_input_tokens_seen": 408056576, + "router_z_loss_mlp": 0.39892578, + "step": 4920, + "time_per_iteration": 2.6052064895629883 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042211, + "balance_loss_mlp": 1.00237119, + "epoch": 0.9467102731819931, + "flos": 661539608064.0, + "grad_norm": 0.03799419957281172, + "language_loss": 0.86398727, + "learning_rate": 7.428741522553184e-06, + "loss": 0.87440938, + "num_input_tokens_seen": 408136960, + "router_z_loss_mlp": 0.39819336, + "step": 4921, + "time_per_iteration": 2.878465175628662 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01041634, + "balance_loss_mlp": 1.00174677, + "epoch": 0.9469026548672567, + "flos": 676505604096.0, + "grad_norm": 0.033503677703785116, + "language_loss": 0.89720869, + "learning_rate": 7.375333234798054e-06, + "loss": 0.90762508, + "num_input_tokens_seen": 408218304, + "router_z_loss_mlp": 0.39868164, + "step": 4922, + "time_per_iteration": 2.9082603454589844 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01041594, + "balance_loss_mlp": 1.00173008, + "epoch": 0.9470950365525203, + "flos": 515021376768.0, + "grad_norm": 0.04023426252004341, + "language_loss": 0.80039066, + "learning_rate": 7.32211620090012e-06, + "loss": 0.81080657, + "num_input_tokens_seen": 408287936, + "router_z_loss_mlp": 0.3984375, + "step": 4923, + "time_per_iteration": 2.59505033493042 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042749, + "balance_loss_mlp": 1.00290966, + "epoch": 0.9472874182377837, + "flos": 551227111680.0, + "grad_norm": 0.033951434514690154, + "language_loss": 0.81468022, + "learning_rate": 7.269090441520132e-06, + "loss": 0.82510769, + "num_input_tokens_seen": 408365568, + "router_z_loss_mlp": 0.39819336, + "step": 4924, + "time_per_iteration": 2.75545334815979 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104291, + "balance_loss_mlp": 1.00309443, + "epoch": 0.9474797999230473, + "flos": 543811187712.0, + "grad_norm": 0.034750930372707566, + "language_loss": 0.80470061, + "learning_rate": 7.216255977244457e-06, + "loss": 0.81512976, + "num_input_tokens_seen": 408431248, + "router_z_loss_mlp": 0.39794922, + "step": 4925, + "time_per_iteration": 2.624394655227661 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104295, + "balance_loss_mlp": 1.00308681, + "epoch": 0.9476721816083109, + "flos": 846064823808.0, + "grad_norm": 0.033713578773609525, + "language_loss": 0.86285806, + "learning_rate": 7.163612828585242e-06, + "loss": 0.87328756, + "num_input_tokens_seen": 408514112, + "router_z_loss_mlp": 0.3984375, + "step": 4926, + "time_per_iteration": 3.0954294204711914 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043013, + "balance_loss_mlp": 1.00312579, + "epoch": 0.9478645632935745, + "flos": 639148939776.0, + "grad_norm": 0.03739131726209916, + "language_loss": 0.79913974, + "learning_rate": 7.1111610159803605e-06, + "loss": 0.80956984, + "num_input_tokens_seen": 408585968, + "router_z_loss_mlp": 0.39868164, + "step": 4927, + "time_per_iteration": 2.753369092941284 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042404, + "balance_loss_mlp": 1.00256443, + "epoch": 0.948056944978838, + "flos": 658042099968.0, + "grad_norm": 0.03682204953186861, + "language_loss": 0.76448333, + "learning_rate": 7.058900559793469e-06, + "loss": 0.77490735, + "num_input_tokens_seen": 408665456, + "router_z_loss_mlp": 0.39819336, + "step": 4928, + "time_per_iteration": 2.833594560623169 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104122, + "balance_loss_mlp": 1.00130916, + "epoch": 0.9482493266641016, + "flos": 441837110784.0, + "grad_norm": 0.036391648532747914, + "language_loss": 0.83968282, + "learning_rate": 7.00683148031378e-06, + "loss": 0.85009503, + "num_input_tokens_seen": 408730192, + "router_z_loss_mlp": 0.39892578, + "step": 4929, + "time_per_iteration": 2.5252318382263184 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01041328, + "balance_loss_mlp": 1.00144029, + "epoch": 0.9484417083493651, + "flos": 547122057984.0, + "grad_norm": 0.0372848187794391, + "language_loss": 0.7867955, + "learning_rate": 6.9549537977564024e-06, + "loss": 0.79720879, + "num_input_tokens_seen": 408807616, + "router_z_loss_mlp": 0.39868164, + "step": 4930, + "time_per_iteration": 2.7968673706054688 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01041385, + "balance_loss_mlp": 1.00149775, + "epoch": 0.9486340900346287, + "flos": 539695440384.0, + "grad_norm": 0.03536088760450684, + "language_loss": 0.80219245, + "learning_rate": 6.903267532262003e-06, + "loss": 0.81260628, + "num_input_tokens_seen": 408883552, + "router_z_loss_mlp": 0.39868164, + "step": 4931, + "time_per_iteration": 2.677079439163208 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01041401, + "balance_loss_mlp": 1.00156105, + "epoch": 0.9488264717198923, + "flos": 682902801408.0, + "grad_norm": 0.03566286985886406, + "language_loss": 0.86654496, + "learning_rate": 6.851772703896975e-06, + "loss": 0.87695897, + "num_input_tokens_seen": 408956400, + "router_z_loss_mlp": 0.39819336, + "step": 4932, + "time_per_iteration": 2.814249038696289 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042864, + "balance_loss_mlp": 1.00309622, + "epoch": 0.9490188534051558, + "flos": 463560940800.0, + "grad_norm": 0.0386601386085668, + "language_loss": 0.88342351, + "learning_rate": 6.8004693326533805e-06, + "loss": 0.89385211, + "num_input_tokens_seen": 409019904, + "router_z_loss_mlp": 0.39746094, + "step": 4933, + "time_per_iteration": 2.5076887607574463 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044538, + "balance_loss_mlp": 1.004722, + "epoch": 0.9492112350904194, + "flos": 544219456512.0, + "grad_norm": 0.036078560908697196, + "language_loss": 0.83274114, + "learning_rate": 6.7493574384489e-06, + "loss": 0.8431865, + "num_input_tokens_seen": 409094288, + "router_z_loss_mlp": 0.39794922, + "step": 4934, + "time_per_iteration": 2.66317081451416 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044702, + "balance_loss_mlp": 1.00500488, + "epoch": 0.949403616775683, + "flos": 551459436288.0, + "grad_norm": 0.03441156495312572, + "language_loss": 0.84393692, + "learning_rate": 6.698437041126992e-06, + "loss": 0.85438401, + "num_input_tokens_seen": 409169120, + "router_z_loss_mlp": 0.39672852, + "step": 4935, + "time_per_iteration": 2.702319860458374 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044845, + "balance_loss_mlp": 1.00510049, + "epoch": 0.9495959984609466, + "flos": 599498186496.0, + "grad_norm": 0.03383803796869729, + "language_loss": 0.83237123, + "learning_rate": 6.647708160456678e-06, + "loss": 0.84281969, + "num_input_tokens_seen": 409243200, + "router_z_loss_mlp": 0.3972168, + "step": 4936, + "time_per_iteration": 2.692322254180908 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044565, + "balance_loss_mlp": 1.00479722, + "epoch": 0.94978838014621, + "flos": 609531897600.0, + "grad_norm": 0.03709057763326247, + "language_loss": 0.82544994, + "learning_rate": 6.597170816132702e-06, + "loss": 0.83589554, + "num_input_tokens_seen": 409319264, + "router_z_loss_mlp": 0.39746094, + "step": 4937, + "time_per_iteration": 2.805114984512329 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042473, + "balance_loss_mlp": 1.00272846, + "epoch": 0.9499807618314736, + "flos": 541866074880.0, + "grad_norm": 0.03343237583673612, + "language_loss": 0.87039685, + "learning_rate": 6.546825027775427e-06, + "loss": 0.88082153, + "num_input_tokens_seen": 409389840, + "router_z_loss_mlp": 0.3972168, + "step": 4938, + "time_per_iteration": 2.681006908416748 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043038, + "balance_loss_mlp": 1.00326955, + "epoch": 0.9501731435167372, + "flos": 595710028032.0, + "grad_norm": 0.03316869101198482, + "language_loss": 0.83294916, + "learning_rate": 6.496670814930717e-06, + "loss": 0.8433795, + "num_input_tokens_seen": 409458752, + "router_z_loss_mlp": 0.39746094, + "step": 4939, + "time_per_iteration": 2.7134695053100586 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042981, + "balance_loss_mlp": 1.00321257, + "epoch": 0.9503655252020008, + "flos": 455072822016.0, + "grad_norm": 0.03736501711507977, + "language_loss": 0.80317879, + "learning_rate": 6.446708197070161e-06, + "loss": 0.81360853, + "num_input_tokens_seen": 409525008, + "router_z_loss_mlp": 0.39746094, + "step": 4940, + "time_per_iteration": 2.5654499530792236 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043295, + "balance_loss_mlp": 1.00352716, + "epoch": 0.9505579068872644, + "flos": 669128563968.0, + "grad_norm": 0.035510729926933764, + "language_loss": 0.84777826, + "learning_rate": 6.396937193591079e-06, + "loss": 0.85821128, + "num_input_tokens_seen": 409603376, + "router_z_loss_mlp": 0.39746094, + "step": 4941, + "time_per_iteration": 2.814131736755371 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044763, + "balance_loss_mlp": 1.00497139, + "epoch": 0.9507502885725279, + "flos": 403080662784.0, + "grad_norm": 0.037080639816230825, + "language_loss": 0.81930745, + "learning_rate": 6.347357823816235e-06, + "loss": 0.82975513, + "num_input_tokens_seen": 409667168, + "router_z_loss_mlp": 0.39770508, + "step": 4942, + "time_per_iteration": 2.4868767261505127 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044222, + "balance_loss_mlp": 1.00450122, + "epoch": 0.9509426702577914, + "flos": 701737635840.0, + "grad_norm": 0.03513225958551105, + "language_loss": 0.7980032, + "learning_rate": 6.297970106994011e-06, + "loss": 0.80844545, + "num_input_tokens_seen": 409746832, + "router_z_loss_mlp": 0.39697266, + "step": 4943, + "time_per_iteration": 2.9845831394195557 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044358, + "balance_loss_mlp": 1.00459003, + "epoch": 0.951135051943055, + "flos": 502401959424.0, + "grad_norm": 0.03803809561051826, + "language_loss": 0.8287642, + "learning_rate": 6.2487740622985126e-06, + "loss": 0.83920777, + "num_input_tokens_seen": 409813792, + "router_z_loss_mlp": 0.39746094, + "step": 4944, + "time_per_iteration": 2.585265874862671 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042732, + "balance_loss_mlp": 1.00296342, + "epoch": 0.9513274336283186, + "flos": 615866878464.0, + "grad_norm": 0.03642431100768812, + "language_loss": 0.81723523, + "learning_rate": 6.1997697088292395e-06, + "loss": 0.82766253, + "num_input_tokens_seen": 409898848, + "router_z_loss_mlp": 0.39746094, + "step": 4945, + "time_per_iteration": 2.9102370738983154 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042477, + "balance_loss_mlp": 1.00266135, + "epoch": 0.9515198153135821, + "flos": 520598145792.0, + "grad_norm": 0.04093977912249511, + "language_loss": 0.82534963, + "learning_rate": 6.150957065611363e-06, + "loss": 0.83577436, + "num_input_tokens_seen": 409966368, + "router_z_loss_mlp": 0.39794922, + "step": 4946, + "time_per_iteration": 2.569200038909912 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042697, + "balance_loss_mlp": 1.00292897, + "epoch": 0.9517121969988457, + "flos": 666285255168.0, + "grad_norm": 0.03445383496459535, + "language_loss": 0.77101904, + "learning_rate": 6.102336151595667e-06, + "loss": 0.78144598, + "num_input_tokens_seen": 410048496, + "router_z_loss_mlp": 0.39746094, + "step": 4947, + "time_per_iteration": 2.9556493759155273 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043089, + "balance_loss_mlp": 1.00329661, + "epoch": 0.9519045786841093, + "flos": 677616682752.0, + "grad_norm": 0.03959688141622468, + "language_loss": 0.76750845, + "learning_rate": 6.053906985658553e-06, + "loss": 0.77793932, + "num_input_tokens_seen": 410121840, + "router_z_loss_mlp": 0.39770508, + "step": 4948, + "time_per_iteration": 2.846489429473877 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043021, + "balance_loss_mlp": 1.00308585, + "epoch": 0.9520969603693729, + "flos": 654141180672.0, + "grad_norm": 0.03311313171494322, + "language_loss": 0.80785477, + "learning_rate": 6.005669586601814e-06, + "loss": 0.81828499, + "num_input_tokens_seen": 410199152, + "router_z_loss_mlp": 0.39916992, + "step": 4949, + "time_per_iteration": 2.872042655944824 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01041327, + "balance_loss_mlp": 1.00148737, + "epoch": 0.9522893420546364, + "flos": 744683708160.0, + "grad_norm": 0.029795197161734573, + "language_loss": 0.83586603, + "learning_rate": 5.957623973152748e-06, + "loss": 0.84627938, + "num_input_tokens_seen": 410285392, + "router_z_loss_mlp": 0.39819336, + "step": 4950, + "time_per_iteration": 3.068004846572876 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042972, + "balance_loss_mlp": 1.00318003, + "epoch": 0.9524817237398999, + "flos": 763031539200.0, + "grad_norm": 0.039165915603714734, + "language_loss": 0.81529355, + "learning_rate": 5.909770163964545e-06, + "loss": 0.82572323, + "num_input_tokens_seen": 410359872, + "router_z_loss_mlp": 0.39770508, + "step": 4951, + "time_per_iteration": 2.958136796951294 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01041546, + "balance_loss_mlp": 1.00177801, + "epoch": 0.9526741054251635, + "flos": 530147765760.0, + "grad_norm": 0.03797586697634241, + "language_loss": 0.82436419, + "learning_rate": 5.8621081776155105e-06, + "loss": 0.83477962, + "num_input_tokens_seen": 410425728, + "router_z_loss_mlp": 0.39746094, + "step": 4952, + "time_per_iteration": 2.5887794494628906 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01041734, + "balance_loss_mlp": 1.00191772, + "epoch": 0.9528664871104271, + "flos": 489426763008.0, + "grad_norm": 0.03867209497771876, + "language_loss": 0.81692654, + "learning_rate": 5.814638032609787e-06, + "loss": 0.82734388, + "num_input_tokens_seen": 410496080, + "router_z_loss_mlp": 0.39794922, + "step": 4953, + "time_per_iteration": 2.60361909866333 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043073, + "balance_loss_mlp": 1.00328088, + "epoch": 0.9530588687956907, + "flos": 518872718592.0, + "grad_norm": 0.035355195593526345, + "language_loss": 0.85669702, + "learning_rate": 5.76735974737691e-06, + "loss": 0.86712778, + "num_input_tokens_seen": 410576448, + "router_z_loss_mlp": 0.39770508, + "step": 4954, + "time_per_iteration": 2.7473466396331787 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104281, + "balance_loss_mlp": 1.0029465, + "epoch": 0.9532512504809542, + "flos": 676414230528.0, + "grad_norm": 0.03834436064105519, + "language_loss": 0.81176341, + "learning_rate": 5.720273340271864e-06, + "loss": 0.82219148, + "num_input_tokens_seen": 410655792, + "router_z_loss_mlp": 0.3984375, + "step": 4955, + "time_per_iteration": 2.8323822021484375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042747, + "balance_loss_mlp": 1.00295496, + "epoch": 0.9534436321662177, + "flos": 490542699264.0, + "grad_norm": 0.03487032730513436, + "language_loss": 0.84557939, + "learning_rate": 5.673378829575249e-06, + "loss": 0.8560068, + "num_input_tokens_seen": 410725440, + "router_z_loss_mlp": 0.39770508, + "step": 4956, + "time_per_iteration": 2.5702896118164062 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104252, + "balance_loss_mlp": 1.00275135, + "epoch": 0.9536360138514813, + "flos": 497589238272.0, + "grad_norm": 0.036979832963634794, + "language_loss": 0.82516146, + "learning_rate": 5.626676233493167e-06, + "loss": 0.83558667, + "num_input_tokens_seen": 410797552, + "router_z_loss_mlp": 0.39746094, + "step": 4957, + "time_per_iteration": 2.656902313232422 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044501, + "balance_loss_mlp": 1.00475693, + "epoch": 0.9538283955367449, + "flos": 802858236672.0, + "grad_norm": 0.03444373723979776, + "language_loss": 0.84639931, + "learning_rate": 5.580165570157114e-06, + "loss": 0.85684431, + "num_input_tokens_seen": 410876736, + "router_z_loss_mlp": 0.3972168, + "step": 4958, + "time_per_iteration": 3.0350341796875 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044191, + "balance_loss_mlp": 1.00439882, + "epoch": 0.9540207772220085, + "flos": 557798307840.0, + "grad_norm": 0.031107879429296895, + "language_loss": 0.80315, + "learning_rate": 5.533846857624203e-06, + "loss": 0.81359196, + "num_input_tokens_seen": 410955632, + "router_z_loss_mlp": 0.39770508, + "step": 4959, + "time_per_iteration": 2.779682159423828 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044594, + "balance_loss_mlp": 1.00489748, + "epoch": 0.954213158907272, + "flos": 685759716096.0, + "grad_norm": 0.03430720369577018, + "language_loss": 0.81983697, + "learning_rate": 5.487720113876882e-06, + "loss": 0.83028287, + "num_input_tokens_seen": 411038480, + "router_z_loss_mlp": 0.39672852, + "step": 4960, + "time_per_iteration": 2.9145681858062744 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042604, + "balance_loss_mlp": 1.00288343, + "epoch": 0.9544055405925356, + "flos": 536847273984.0, + "grad_norm": 0.0384807458715525, + "language_loss": 0.83184588, + "learning_rate": 5.441785356823214e-06, + "loss": 0.84227192, + "num_input_tokens_seen": 411109744, + "router_z_loss_mlp": 0.39697266, + "step": 4961, + "time_per_iteration": 2.727431535720825 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01041086, + "balance_loss_mlp": 1.0013653, + "epoch": 0.9545979222777992, + "flos": 826924754688.0, + "grad_norm": 0.03962394126421547, + "language_loss": 0.80869973, + "learning_rate": 5.3960426042965476e-06, + "loss": 0.81911057, + "num_input_tokens_seen": 411202192, + "router_z_loss_mlp": 0.39697266, + "step": 4962, + "time_per_iteration": 3.1185572147369385 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01041174, + "balance_loss_mlp": 1.00138175, + "epoch": 0.9547903039630627, + "flos": 763157905920.0, + "grad_norm": 0.04021618792762701, + "language_loss": 0.77994674, + "learning_rate": 5.3504918740558405e-06, + "loss": 0.79035848, + "num_input_tokens_seen": 411289248, + "router_z_loss_mlp": 0.39770508, + "step": 4963, + "time_per_iteration": 3.1309633255004883 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01041709, + "balance_loss_mlp": 1.00191748, + "epoch": 0.9549826856483262, + "flos": 516334644480.0, + "grad_norm": 0.03883845801630645, + "language_loss": 0.83332193, + "learning_rate": 5.3051331837855045e-06, + "loss": 0.84373903, + "num_input_tokens_seen": 411355232, + "router_z_loss_mlp": 0.39770508, + "step": 4964, + "time_per_iteration": 2.6028783321380615 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043315, + "balance_loss_mlp": 1.00349903, + "epoch": 0.9551750673335898, + "flos": 644267862528.0, + "grad_norm": 0.03455471990169676, + "language_loss": 0.82968116, + "learning_rate": 5.259966551095341e-06, + "loss": 0.84011436, + "num_input_tokens_seen": 411432288, + "router_z_loss_mlp": 0.39794922, + "step": 4965, + "time_per_iteration": 2.80012583732605 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042897, + "balance_loss_mlp": 1.00312924, + "epoch": 0.9553674490188534, + "flos": 473175689472.0, + "grad_norm": 0.03525799023609817, + "language_loss": 0.83457267, + "learning_rate": 5.214991993520546e-06, + "loss": 0.84500164, + "num_input_tokens_seen": 411499376, + "router_z_loss_mlp": 0.39746094, + "step": 4966, + "time_per_iteration": 2.626706838607788 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043096, + "balance_loss_mlp": 1.00325596, + "epoch": 0.955559830704117, + "flos": 529338031104.0, + "grad_norm": 0.04404774248069698, + "language_loss": 0.82227528, + "learning_rate": 5.170209528521763e-06, + "loss": 0.83270633, + "num_input_tokens_seen": 411564976, + "router_z_loss_mlp": 0.39819336, + "step": 4967, + "time_per_iteration": 2.599682569503784 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043061, + "balance_loss_mlp": 1.00329304, + "epoch": 0.9557522123893806, + "flos": 549218815488.0, + "grad_norm": 0.0375296365771858, + "language_loss": 0.846102, + "learning_rate": 5.125619173485196e-06, + "loss": 0.85653257, + "num_input_tokens_seen": 411636464, + "router_z_loss_mlp": 0.39746094, + "step": 4968, + "time_per_iteration": 2.690603017807007 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043595, + "balance_loss_mlp": 1.00377893, + "epoch": 0.955944594074644, + "flos": 510525550848.0, + "grad_norm": 0.03274819771004998, + "language_loss": 0.82464266, + "learning_rate": 5.08122094572222e-06, + "loss": 0.8350786, + "num_input_tokens_seen": 411710672, + "router_z_loss_mlp": 0.39794922, + "step": 4969, + "time_per_iteration": 2.687436103820801 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01041349, + "balance_loss_mlp": 1.00153351, + "epoch": 0.9561369757599076, + "flos": 528711043584.0, + "grad_norm": 0.03580128099593846, + "language_loss": 0.80410147, + "learning_rate": 5.037014862469824e-06, + "loss": 0.81451499, + "num_input_tokens_seen": 411785616, + "router_z_loss_mlp": 0.39794922, + "step": 4970, + "time_per_iteration": 2.7537877559661865 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01041204, + "balance_loss_mlp": 1.00138855, + "epoch": 0.9563293574451712, + "flos": 499208707584.0, + "grad_norm": 0.04916035322851724, + "language_loss": 0.80648708, + "learning_rate": 4.993000940890391e-06, + "loss": 0.81689912, + "num_input_tokens_seen": 411854832, + "router_z_loss_mlp": 0.39794922, + "step": 4971, + "time_per_iteration": 2.6075868606567383 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042965, + "balance_loss_mlp": 1.00415039, + "epoch": 0.9565217391304348, + "flos": 1411746617088.0, + "grad_norm": 0.004977408547260208, + "language_loss": 0.81773561, + "learning_rate": 4.949179198071585e-06, + "loss": 0.82816529, + "num_input_tokens_seen": 412081856, + "router_z_loss_mlp": 0.38769531, + "step": 4972, + "time_per_iteration": 4.884822845458984 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104272, + "balance_loss_mlp": 1.00297523, + "epoch": 0.9567141208156984, + "flos": 504885598464.0, + "grad_norm": 0.03188993073908652, + "language_loss": 0.78494072, + "learning_rate": 4.905549651026464e-06, + "loss": 0.79536796, + "num_input_tokens_seen": 412155600, + "router_z_loss_mlp": 0.3972168, + "step": 4973, + "time_per_iteration": 2.730933666229248 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042769, + "balance_loss_mlp": 1.0030967, + "epoch": 0.9569065025009619, + "flos": 434130536448.0, + "grad_norm": 0.04150904011170753, + "language_loss": 0.80358505, + "learning_rate": 4.86211231669359e-06, + "loss": 0.81401271, + "num_input_tokens_seen": 412219584, + "router_z_loss_mlp": 0.39648438, + "step": 4974, + "time_per_iteration": 2.4550540447235107 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042805, + "balance_loss_mlp": 1.0030849, + "epoch": 0.9570988841862255, + "flos": 591155876352.0, + "grad_norm": 0.03814012594949819, + "language_loss": 0.78727484, + "learning_rate": 4.818867211936806e-06, + "loss": 0.79770291, + "num_input_tokens_seen": 412295088, + "router_z_loss_mlp": 0.39697266, + "step": 4975, + "time_per_iteration": 2.7876100540161133 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042629, + "balance_loss_mlp": 1.00290823, + "epoch": 0.957291265871489, + "flos": 768643301376.0, + "grad_norm": 0.0533544119882121, + "language_loss": 0.78933519, + "learning_rate": 4.7758143535454045e-06, + "loss": 0.79976147, + "num_input_tokens_seen": 412376992, + "router_z_loss_mlp": 0.39697266, + "step": 4976, + "time_per_iteration": 2.9786951541900635 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045576, + "balance_loss_mlp": 1.00595045, + "epoch": 0.9574836475567526, + "flos": 640247379456.0, + "grad_norm": 0.044191723378042724, + "language_loss": 0.8518002, + "learning_rate": 4.732953758233849e-06, + "loss": 0.86225587, + "num_input_tokens_seen": 412450064, + "router_z_loss_mlp": 0.39599609, + "step": 4977, + "time_per_iteration": 2.795616388320923 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046776, + "balance_loss_mlp": 1.00805664, + "epoch": 0.9576760292420161, + "flos": 1579401541632.0, + "grad_norm": 0.008795466714776974, + "language_loss": 0.78607261, + "learning_rate": 4.690285442642272e-06, + "loss": 0.79654026, + "num_input_tokens_seen": 412676896, + "router_z_loss_mlp": 0.38671875, + "step": 4978, + "time_per_iteration": 4.909578084945679 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043149, + "balance_loss_mlp": 1.00342846, + "epoch": 0.9578684109272797, + "flos": 497374410240.0, + "grad_norm": 0.03268780519478309, + "language_loss": 0.87571311, + "learning_rate": 4.6478094233358695e-06, + "loss": 0.88614452, + "num_input_tokens_seen": 412746848, + "router_z_loss_mlp": 0.39697266, + "step": 4979, + "time_per_iteration": 2.5885064601898193 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104325, + "balance_loss_mlp": 1.00352943, + "epoch": 0.9580607926125433, + "flos": 430854659328.0, + "grad_norm": 0.03948216870458314, + "language_loss": 0.85868418, + "learning_rate": 4.605525716805337e-06, + "loss": 0.86911666, + "num_input_tokens_seen": 412810144, + "router_z_loss_mlp": 0.39697266, + "step": 4980, + "time_per_iteration": 2.479827404022217 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042844, + "balance_loss_mlp": 1.00324261, + "epoch": 0.9582531742978069, + "flos": 1129132704000.0, + "grad_norm": 0.032880007377631804, + "language_loss": 0.80475271, + "learning_rate": 4.563434339466599e-06, + "loss": 0.81518114, + "num_input_tokens_seen": 412904768, + "router_z_loss_mlp": 0.39575195, + "step": 4981, + "time_per_iteration": 3.5456929206848145 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043005, + "balance_loss_mlp": 1.00335562, + "epoch": 0.9584455559830705, + "flos": 525556675584.0, + "grad_norm": 0.034452441041768166, + "language_loss": 0.79316235, + "learning_rate": 4.521535307661085e-06, + "loss": 0.80359232, + "num_input_tokens_seen": 412974592, + "router_z_loss_mlp": 0.39624023, + "step": 4982, + "time_per_iteration": 2.6716814041137695 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104178, + "balance_loss_mlp": 1.00203562, + "epoch": 0.9586379376683339, + "flos": 635450209536.0, + "grad_norm": 0.03376520307989189, + "language_loss": 0.81836033, + "learning_rate": 4.479828637655392e-06, + "loss": 0.82877809, + "num_input_tokens_seen": 413052848, + "router_z_loss_mlp": 0.3972168, + "step": 4983, + "time_per_iteration": 2.842564582824707 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01041445, + "balance_loss_mlp": 1.00170052, + "epoch": 0.9588303193535975, + "flos": 416985157632.0, + "grad_norm": 0.036976444182102955, + "language_loss": 0.84129387, + "learning_rate": 4.438314345641459e-06, + "loss": 0.85170835, + "num_input_tokens_seen": 413118000, + "router_z_loss_mlp": 0.3972168, + "step": 4984, + "time_per_iteration": 2.50268816947937 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01041695, + "balance_loss_mlp": 1.00199842, + "epoch": 0.9590227010388611, + "flos": 482660180736.0, + "grad_norm": 0.049425598193085174, + "language_loss": 0.78694046, + "learning_rate": 4.3969924477365585e-06, + "loss": 0.79735744, + "num_input_tokens_seen": 413185616, + "router_z_loss_mlp": 0.39672852, + "step": 4985, + "time_per_iteration": 4.018237113952637 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104298, + "balance_loss_mlp": 1.00330746, + "epoch": 0.9592150827241247, + "flos": 685851089664.0, + "grad_norm": 0.054295758120399606, + "language_loss": 0.80803186, + "learning_rate": 4.355862959983359e-06, + "loss": 0.81846166, + "num_input_tokens_seen": 413265616, + "router_z_loss_mlp": 0.39648438, + "step": 4986, + "time_per_iteration": 2.946019411087036 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042716, + "balance_loss_mlp": 1.00309062, + "epoch": 0.9594074644093882, + "flos": 575631912192.0, + "grad_norm": 0.0368102094197737, + "language_loss": 0.7151469, + "learning_rate": 4.314925898349642e-06, + "loss": 0.72557408, + "num_input_tokens_seen": 413341248, + "router_z_loss_mlp": 0.39599609, + "step": 4987, + "time_per_iteration": 2.718128204345703 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043003, + "balance_loss_mlp": 1.00328279, + "epoch": 0.9595998460946518, + "flos": 547988173056.0, + "grad_norm": 0.038441266597079346, + "language_loss": 0.78553158, + "learning_rate": 4.2741812787286395e-06, + "loss": 0.79596162, + "num_input_tokens_seen": 413416080, + "router_z_loss_mlp": 0.39697266, + "step": 4988, + "time_per_iteration": 2.810852289199829 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043081, + "balance_loss_mlp": 1.00336051, + "epoch": 0.9597922277799154, + "flos": 475027483392.0, + "grad_norm": 0.038398139954487105, + "language_loss": 0.78839409, + "learning_rate": 4.233629116938809e-06, + "loss": 0.79882497, + "num_input_tokens_seen": 413482336, + "router_z_loss_mlp": 0.39697266, + "step": 4989, + "time_per_iteration": 2.525266408920288 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042711, + "balance_loss_mlp": 1.0030148, + "epoch": 0.9599846094651789, + "flos": 515720295936.0, + "grad_norm": 0.040458239212947125, + "language_loss": 0.86110353, + "learning_rate": 4.193269428723889e-06, + "loss": 0.87153065, + "num_input_tokens_seen": 413553248, + "router_z_loss_mlp": 0.39672852, + "step": 4990, + "time_per_iteration": 2.6019325256347656 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042466, + "balance_loss_mlp": 1.00281739, + "epoch": 0.9601769911504425, + "flos": 596163983616.0, + "grad_norm": 0.04715634947564762, + "language_loss": 0.78668261, + "learning_rate": 4.1531022297529035e-06, + "loss": 0.79710728, + "num_input_tokens_seen": 413625776, + "router_z_loss_mlp": 0.39624023, + "step": 4991, + "time_per_iteration": 2.7938621044158936 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042765, + "balance_loss_mlp": 1.00311601, + "epoch": 0.960369372835706, + "flos": 494042152704.0, + "grad_norm": 0.03203772527177207, + "language_loss": 0.79823196, + "learning_rate": 4.1131275356201536e-06, + "loss": 0.80865961, + "num_input_tokens_seen": 413693056, + "router_z_loss_mlp": 0.39624023, + "step": 4992, + "time_per_iteration": 2.5746684074401855 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042977, + "balance_loss_mlp": 1.00328052, + "epoch": 0.9605617545209696, + "flos": 580407694848.0, + "grad_norm": 0.03324631717517119, + "language_loss": 0.83355463, + "learning_rate": 4.073345361845171e-06, + "loss": 0.84398437, + "num_input_tokens_seen": 413765616, + "router_z_loss_mlp": 0.39672852, + "step": 4993, + "time_per_iteration": 2.7186391353607178 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043099, + "balance_loss_mlp": 1.00347352, + "epoch": 0.9607541362062332, + "flos": 929300297472.0, + "grad_norm": 0.028939559097249826, + "language_loss": 0.86792874, + "learning_rate": 4.033755723872767e-06, + "loss": 0.87835968, + "num_input_tokens_seen": 413850976, + "router_z_loss_mlp": 0.39599609, + "step": 4994, + "time_per_iteration": 3.3311779499053955 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042994, + "balance_loss_mlp": 1.00327384, + "epoch": 0.9609465178914968, + "flos": 574281705984.0, + "grad_norm": 0.03357798587309998, + "language_loss": 0.76040745, + "learning_rate": 3.994358637073036e-06, + "loss": 0.77083737, + "num_input_tokens_seen": 413931648, + "router_z_loss_mlp": 0.39697266, + "step": 4995, + "time_per_iteration": 2.8095812797546387 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043229, + "balance_loss_mlp": 1.00360358, + "epoch": 0.9611388995767602, + "flos": 531914989056.0, + "grad_norm": 0.03322291680520947, + "language_loss": 0.857813, + "learning_rate": 3.955154116741244e-06, + "loss": 0.86824536, + "num_input_tokens_seen": 414003216, + "router_z_loss_mlp": 0.39599609, + "step": 4996, + "time_per_iteration": 2.630094528198242 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043041, + "balance_loss_mlp": 1.00336814, + "epoch": 0.9613312812620238, + "flos": 647404733952.0, + "grad_norm": 0.036018524695575205, + "language_loss": 0.82486397, + "learning_rate": 3.916142178097881e-06, + "loss": 0.83529437, + "num_input_tokens_seen": 414077072, + "router_z_loss_mlp": 0.39648438, + "step": 4997, + "time_per_iteration": 2.761890411376953 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043069, + "balance_loss_mlp": 1.00339627, + "epoch": 0.9615236629472874, + "flos": 497179024128.0, + "grad_norm": 0.034560793449925374, + "language_loss": 0.78197134, + "learning_rate": 3.877322836288888e-06, + "loss": 0.79240203, + "num_input_tokens_seen": 414157600, + "router_z_loss_mlp": 0.39648438, + "step": 4998, + "time_per_iteration": 2.913933038711548 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042737, + "balance_loss_mlp": 1.00315988, + "epoch": 0.961716044632551, + "flos": 514007507712.0, + "grad_norm": 0.05213638036394556, + "language_loss": 0.76171172, + "learning_rate": 3.838696106385153e-06, + "loss": 0.77213907, + "num_input_tokens_seen": 414224880, + "router_z_loss_mlp": 0.39550781, + "step": 4999, + "time_per_iteration": 2.6721343994140625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043033, + "balance_loss_mlp": 1.00338387, + "epoch": 0.9619084263178146, + "flos": 502085064192.0, + "grad_norm": 0.04055775067790823, + "language_loss": 0.81044245, + "learning_rate": 3.800262003382904e-06, + "loss": 0.82087278, + "num_input_tokens_seen": 414291728, + "router_z_loss_mlp": 0.39624023, + "step": 5000, + "time_per_iteration": 2.5917038917541504 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042733, + "balance_loss_mlp": 1.00306058, + "epoch": 0.9621008080030781, + "flos": 596806522368.0, + "grad_norm": 0.04160425197136875, + "language_loss": 0.75552607, + "learning_rate": 3.7620205422035923e-06, + "loss": 0.76595342, + "num_input_tokens_seen": 414369568, + "router_z_loss_mlp": 0.39648438, + "step": 5001, + "time_per_iteration": 2.768864631652832 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043058, + "balance_loss_mlp": 1.00340927, + "epoch": 0.9622931896883417, + "flos": 503248632576.0, + "grad_norm": 0.040216779291667094, + "language_loss": 0.82677174, + "learning_rate": 3.723971737693899e-06, + "loss": 0.83720231, + "num_input_tokens_seen": 414441424, + "router_z_loss_mlp": 0.39624023, + "step": 5002, + "time_per_iteration": 2.6624748706817627 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042864, + "balance_loss_mlp": 1.00319076, + "epoch": 0.9624855713736052, + "flos": 608450954496.0, + "grad_norm": 0.036912255623922086, + "language_loss": 0.80918676, + "learning_rate": 3.6861156046256728e-06, + "loss": 0.81961536, + "num_input_tokens_seen": 414512960, + "router_z_loss_mlp": 0.39648438, + "step": 5003, + "time_per_iteration": 2.767632484436035 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042834, + "balance_loss_mlp": 1.00313723, + "epoch": 0.9626779530588688, + "flos": 511736751360.0, + "grad_norm": 0.04265801947296761, + "language_loss": 0.85481077, + "learning_rate": 3.648452157695936e-06, + "loss": 0.86523914, + "num_input_tokens_seen": 414577392, + "router_z_loss_mlp": 0.39672852, + "step": 5004, + "time_per_iteration": 2.6027941703796387 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01041632, + "balance_loss_mlp": 1.00191116, + "epoch": 0.9628703347441323, + "flos": 628498934784.0, + "grad_norm": 0.03505167867823401, + "language_loss": 0.82969689, + "learning_rate": 3.610981411526937e-06, + "loss": 0.84011322, + "num_input_tokens_seen": 414655152, + "router_z_loss_mlp": 0.39697266, + "step": 5005, + "time_per_iteration": 2.840470790863037 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104162, + "balance_loss_mlp": 1.00182772, + "epoch": 0.9630627164293959, + "flos": 631898266368.0, + "grad_norm": 0.03609620968879647, + "language_loss": 0.7782557, + "learning_rate": 3.573703380666149e-06, + "loss": 0.78867197, + "num_input_tokens_seen": 414730432, + "router_z_loss_mlp": 0.39770508, + "step": 5006, + "time_per_iteration": 2.7770707607269287 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01041132, + "balance_loss_mlp": 1.00138748, + "epoch": 0.9632550981146595, + "flos": 571730992896.0, + "grad_norm": 0.03395739214114779, + "language_loss": 0.78756148, + "learning_rate": 3.5366180795861622e-06, + "loss": 0.7979728, + "num_input_tokens_seen": 414810688, + "router_z_loss_mlp": 0.3972168, + "step": 5007, + "time_per_iteration": 2.8784968852996826 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104121, + "balance_loss_mlp": 1.0014658, + "epoch": 0.9634474797999231, + "flos": 467160516096.0, + "grad_norm": 0.03779584270298865, + "language_loss": 0.81459928, + "learning_rate": 3.4997255226847937e-06, + "loss": 0.82501137, + "num_input_tokens_seen": 414880544, + "router_z_loss_mlp": 0.3972168, + "step": 5008, + "time_per_iteration": 2.6344192028045654 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042324, + "balance_loss_mlp": 1.00269866, + "epoch": 0.9636398614851867, + "flos": 527625242880.0, + "grad_norm": 0.03925966363653875, + "language_loss": 0.85927451, + "learning_rate": 3.463025724284974e-06, + "loss": 0.86969769, + "num_input_tokens_seen": 414949920, + "router_z_loss_mlp": 0.39599609, + "step": 5009, + "time_per_iteration": 2.644716501235962 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042559, + "balance_loss_mlp": 1.00288606, + "epoch": 0.9638322431704501, + "flos": 565943286528.0, + "grad_norm": 0.03833079566797296, + "language_loss": 0.75550568, + "learning_rate": 3.4265186986348618e-06, + "loss": 0.76593125, + "num_input_tokens_seen": 415024288, + "router_z_loss_mlp": 0.39648438, + "step": 5010, + "time_per_iteration": 2.770062208175659 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104587, + "balance_loss_mlp": 1.0061729, + "epoch": 0.9640246248557137, + "flos": 478741764864.0, + "grad_norm": 0.03662615642017614, + "language_loss": 0.85119247, + "learning_rate": 3.3902044599076754e-06, + "loss": 0.86165118, + "num_input_tokens_seen": 415092032, + "router_z_loss_mlp": 0.39672852, + "step": 5011, + "time_per_iteration": 2.604835271835327 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045238, + "balance_loss_mlp": 1.0055176, + "epoch": 0.9642170065409773, + "flos": 540339924480.0, + "grad_norm": 0.03893487686049037, + "language_loss": 0.8917706, + "learning_rate": 3.354083022201859e-06, + "loss": 0.90222299, + "num_input_tokens_seen": 415158544, + "router_z_loss_mlp": 0.39697266, + "step": 5012, + "time_per_iteration": 2.6692276000976562 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045169, + "balance_loss_mlp": 1.00542474, + "epoch": 0.9644093882262409, + "flos": 524777076480.0, + "grad_norm": 0.038874654820981284, + "language_loss": 0.84158158, + "learning_rate": 3.3181543995410843e-06, + "loss": 0.85203332, + "num_input_tokens_seen": 415225088, + "router_z_loss_mlp": 0.3972168, + "step": 5013, + "time_per_iteration": 2.580549955368042 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045431, + "balance_loss_mlp": 1.00571084, + "epoch": 0.9646017699115044, + "flos": 575382091008.0, + "grad_norm": 0.03437411704258225, + "language_loss": 0.78946483, + "learning_rate": 3.2824186058740268e-06, + "loss": 0.79991913, + "num_input_tokens_seen": 415300224, + "router_z_loss_mlp": 0.39697266, + "step": 5014, + "time_per_iteration": 2.7400918006896973 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045204, + "balance_loss_mlp": 1.0054599, + "epoch": 0.964794151596768, + "flos": 637957181184.0, + "grad_norm": 0.03987907749747842, + "language_loss": 0.84913594, + "learning_rate": 3.246875655074588e-06, + "loss": 0.85958803, + "num_input_tokens_seen": 415368784, + "router_z_loss_mlp": 0.3972168, + "step": 5015, + "time_per_iteration": 2.76391339302063 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042625, + "balance_loss_mlp": 1.00285721, + "epoch": 0.9649865332820315, + "flos": 618560487936.0, + "grad_norm": 0.036457765801838675, + "language_loss": 0.86692178, + "learning_rate": 3.211525560941675e-06, + "loss": 0.87734801, + "num_input_tokens_seen": 415440752, + "router_z_loss_mlp": 0.39746094, + "step": 5016, + "time_per_iteration": 2.762315511703491 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042596, + "balance_loss_mlp": 1.00289977, + "epoch": 0.9651789149672951, + "flos": 517327126272.0, + "grad_norm": 0.03263683085754147, + "language_loss": 0.81227726, + "learning_rate": 3.1763683371994754e-06, + "loss": 0.82270324, + "num_input_tokens_seen": 415516128, + "router_z_loss_mlp": 0.39672852, + "step": 5017, + "time_per_iteration": 2.766784191131592 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042397, + "balance_loss_mlp": 1.00274801, + "epoch": 0.9653712966525587, + "flos": 493922588928.0, + "grad_norm": 0.037218470225731236, + "language_loss": 0.80549169, + "learning_rate": 3.1414039974972385e-06, + "loss": 0.8159157, + "num_input_tokens_seen": 415583744, + "router_z_loss_mlp": 0.39624023, + "step": 5018, + "time_per_iteration": 2.5550506114959717 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043054, + "balance_loss_mlp": 1.00335717, + "epoch": 0.9655636783378222, + "flos": 537657008640.0, + "grad_norm": 0.02984947509813429, + "language_loss": 0.82836092, + "learning_rate": 3.106632555409328e-06, + "loss": 0.83879143, + "num_input_tokens_seen": 415659856, + "router_z_loss_mlp": 0.39672852, + "step": 5019, + "time_per_iteration": 2.7989487648010254 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043137, + "balance_loss_mlp": 1.00334537, + "epoch": 0.9657560600230858, + "flos": 459959420160.0, + "grad_norm": 0.03464590555865733, + "language_loss": 0.82756871, + "learning_rate": 3.072054024435167e-06, + "loss": 0.83800006, + "num_input_tokens_seen": 415731792, + "router_z_loss_mlp": 0.39770508, + "step": 5020, + "time_per_iteration": 2.6502068042755127 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042924, + "balance_loss_mlp": 1.00320303, + "epoch": 0.9659484417083494, + "flos": 687389879040.0, + "grad_norm": 0.04341700481017599, + "language_loss": 0.83695757, + "learning_rate": 3.0376684179994064e-06, + "loss": 0.84738678, + "num_input_tokens_seen": 415809536, + "router_z_loss_mlp": 0.39697266, + "step": 5021, + "time_per_iteration": 2.9042539596557617 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104174, + "balance_loss_mlp": 1.00302124, + "epoch": 0.966140823393613, + "flos": 1505459063808.0, + "grad_norm": 0.003981124048801385, + "language_loss": 0.80694246, + "learning_rate": 3.0034757494516453e-06, + "loss": 0.81735986, + "num_input_tokens_seen": 416027600, + "router_z_loss_mlp": 0.38671875, + "step": 5022, + "time_per_iteration": 4.69134259223938 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043105, + "balance_loss_mlp": 1.00345588, + "epoch": 0.9663332050788765, + "flos": 465859887360.0, + "grad_norm": 0.041583580268376795, + "language_loss": 0.8178041, + "learning_rate": 2.9694760320667093e-06, + "loss": 0.82823515, + "num_input_tokens_seen": 416096128, + "router_z_loss_mlp": 0.39624023, + "step": 5023, + "time_per_iteration": 2.612053394317627 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042898, + "balance_loss_mlp": 1.00322497, + "epoch": 0.96652558676414, + "flos": 501878984448.0, + "grad_norm": 0.037833370314126924, + "language_loss": 0.85891867, + "learning_rate": 2.9356692790444283e-06, + "loss": 0.86934769, + "num_input_tokens_seen": 416164256, + "router_z_loss_mlp": 0.39648438, + "step": 5024, + "time_per_iteration": 2.6159794330596924 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042513, + "balance_loss_mlp": 1.0028168, + "epoch": 0.9667179684494036, + "flos": 425744484864.0, + "grad_norm": 0.0421263527174388, + "language_loss": 0.83179516, + "learning_rate": 2.9020555035097484e-06, + "loss": 0.84222031, + "num_input_tokens_seen": 416227296, + "router_z_loss_mlp": 0.39672852, + "step": 5025, + "time_per_iteration": 2.525249481201172 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045691, + "balance_loss_mlp": 1.00604248, + "epoch": 0.9669103501346672, + "flos": 518010494208.0, + "grad_norm": 0.03417691570925141, + "language_loss": 0.86086416, + "learning_rate": 2.8686347185127305e-06, + "loss": 0.87132108, + "num_input_tokens_seen": 416297184, + "router_z_loss_mlp": 0.39624023, + "step": 5026, + "time_per_iteration": 2.687157392501831 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045777, + "balance_loss_mlp": 1.00610387, + "epoch": 0.9671027318199308, + "flos": 457176382464.0, + "grad_norm": 0.04850651262651699, + "language_loss": 0.76400167, + "learning_rate": 2.8354069370284396e-06, + "loss": 0.77445948, + "num_input_tokens_seen": 416363056, + "router_z_loss_mlp": 0.39648438, + "step": 5027, + "time_per_iteration": 2.6646482944488525 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01041674, + "balance_loss_mlp": 1.0019294, + "epoch": 0.9672951135051943, + "flos": 526062153984.0, + "grad_norm": 0.03508971060006407, + "language_loss": 0.80259544, + "learning_rate": 2.802372171957057e-06, + "loss": 0.81301212, + "num_input_tokens_seen": 416430688, + "router_z_loss_mlp": 0.3972168, + "step": 5028, + "time_per_iteration": 2.6135594844818115 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042172, + "balance_loss_mlp": 1.00237989, + "epoch": 0.9674874951904578, + "flos": 575102134272.0, + "grad_norm": 0.03464370697231736, + "language_loss": 0.80052054, + "learning_rate": 2.7695304361237682e-06, + "loss": 0.81094229, + "num_input_tokens_seen": 416505248, + "router_z_loss_mlp": 0.39770508, + "step": 5029, + "time_per_iteration": 2.7764198780059814 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01041843, + "balance_loss_mlp": 1.00205135, + "epoch": 0.9676798768757214, + "flos": 630424605696.0, + "grad_norm": 0.03130352473880452, + "language_loss": 0.80490315, + "learning_rate": 2.7368817422789848e-06, + "loss": 0.81532162, + "num_input_tokens_seen": 416592640, + "router_z_loss_mlp": 0.39770508, + "step": 5030, + "time_per_iteration": 2.929133653640747 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01041645, + "balance_loss_mlp": 1.00283051, + "epoch": 0.967872258560985, + "flos": 1467117687552.0, + "grad_norm": 0.00274412494684576, + "language_loss": 0.75563359, + "learning_rate": 2.7044261030979566e-06, + "loss": 0.76605004, + "num_input_tokens_seen": 416808560, + "router_z_loss_mlp": 0.38769531, + "step": 5031, + "time_per_iteration": 4.69275164604187 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043093, + "balance_loss_mlp": 1.00342023, + "epoch": 0.9680646402462486, + "flos": 566568328704.0, + "grad_norm": 0.040289351431982104, + "language_loss": 0.79295713, + "learning_rate": 2.672163531181049e-06, + "loss": 0.80338806, + "num_input_tokens_seen": 416878208, + "router_z_loss_mlp": 0.39648438, + "step": 5032, + "time_per_iteration": 2.678314685821533 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043705, + "balance_loss_mlp": 1.00508118, + "epoch": 0.9682570219315121, + "flos": 1437650344704.0, + "grad_norm": 0.004353892250927695, + "language_loss": 0.78074801, + "learning_rate": 2.6400940390537976e-06, + "loss": 0.79118514, + "num_input_tokens_seen": 417105968, + "router_z_loss_mlp": 0.38574219, + "step": 5033, + "time_per_iteration": 4.812420606613159 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01041585, + "balance_loss_mlp": 1.00186443, + "epoch": 0.9684494036167757, + "flos": 585704507136.0, + "grad_norm": 0.03929500214727362, + "language_loss": 0.82216263, + "learning_rate": 2.608217639166688e-06, + "loss": 0.83257854, + "num_input_tokens_seen": 417175168, + "router_z_loss_mlp": 0.39697266, + "step": 5034, + "time_per_iteration": 2.7576022148132324 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01041805, + "balance_loss_mlp": 1.00206041, + "epoch": 0.9686417853020393, + "flos": 560190573312.0, + "grad_norm": 0.03269322965722924, + "language_loss": 0.84307742, + "learning_rate": 2.5765343438950982e-06, + "loss": 0.85349548, + "num_input_tokens_seen": 417247760, + "router_z_loss_mlp": 0.3972168, + "step": 5035, + "time_per_iteration": 2.6869993209838867 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042898, + "balance_loss_mlp": 1.00327253, + "epoch": 0.9688341669873028, + "flos": 786264023040.0, + "grad_norm": 0.03680667246769885, + "language_loss": 0.83578247, + "learning_rate": 2.545044165539745e-06, + "loss": 0.84621143, + "num_input_tokens_seen": 417324080, + "router_z_loss_mlp": 0.39599609, + "step": 5036, + "time_per_iteration": 3.0497498512268066 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042825, + "balance_loss_mlp": 1.00312877, + "epoch": 0.9690265486725663, + "flos": 396769981440.0, + "grad_norm": 0.0383741827440454, + "language_loss": 0.80366373, + "learning_rate": 2.513747116326126e-06, + "loss": 0.81409198, + "num_input_tokens_seen": 417386416, + "router_z_loss_mlp": 0.39672852, + "step": 5037, + "time_per_iteration": 2.4929020404815674 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042444, + "balance_loss_mlp": 1.00284219, + "epoch": 0.9692189303578299, + "flos": 477417803520.0, + "grad_norm": 0.04163535065667303, + "language_loss": 0.77594548, + "learning_rate": 2.4826432084048002e-06, + "loss": 0.78636992, + "num_input_tokens_seen": 417459648, + "router_z_loss_mlp": 0.39575195, + "step": 5038, + "time_per_iteration": 2.7530930042266846 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042484, + "balance_loss_mlp": 1.00285935, + "epoch": 0.9694113120430935, + "flos": 598688451840.0, + "grad_norm": 0.04027422764598806, + "language_loss": 0.79350811, + "learning_rate": 2.451732453851385e-06, + "loss": 0.80393296, + "num_input_tokens_seen": 417530512, + "router_z_loss_mlp": 0.39599609, + "step": 5039, + "time_per_iteration": 2.7393152713775635 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042777, + "balance_loss_mlp": 1.00305641, + "epoch": 0.9696036937283571, + "flos": 501898426368.0, + "grad_norm": 0.03255232311351302, + "language_loss": 0.83089191, + "learning_rate": 2.4210148646665598e-06, + "loss": 0.84131968, + "num_input_tokens_seen": 417597600, + "router_z_loss_mlp": 0.39697266, + "step": 5040, + "time_per_iteration": 2.6828835010528564 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043086, + "balance_loss_mlp": 1.003389, + "epoch": 0.9697960754136207, + "flos": 433190544384.0, + "grad_norm": 0.04018403496708726, + "language_loss": 0.8756386, + "learning_rate": 2.3904904527758952e-06, + "loss": 0.88606954, + "num_input_tokens_seen": 417659616, + "router_z_loss_mlp": 0.39672852, + "step": 5041, + "time_per_iteration": 2.4938342571258545 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043026, + "balance_loss_mlp": 1.00332952, + "epoch": 0.9699884570988841, + "flos": 569675064576.0, + "grad_norm": 0.0340373321768144, + "language_loss": 0.85916209, + "learning_rate": 2.3601592300300235e-06, + "loss": 0.86959231, + "num_input_tokens_seen": 417730896, + "router_z_loss_mlp": 0.39672852, + "step": 5042, + "time_per_iteration": 2.7059319019317627 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01041961, + "balance_loss_mlp": 1.00219274, + "epoch": 0.9701808387841477, + "flos": 517237698048.0, + "grad_norm": 0.03863133055938838, + "language_loss": 0.81873441, + "learning_rate": 2.33002120820458e-06, + "loss": 0.82915401, + "num_input_tokens_seen": 417803296, + "router_z_loss_mlp": 0.39746094, + "step": 5043, + "time_per_iteration": 4.20774507522583 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042879, + "balance_loss_mlp": 1.00325406, + "epoch": 0.9703732204694113, + "flos": 492498505728.0, + "grad_norm": 0.04293843443964225, + "language_loss": 0.76847333, + "learning_rate": 2.300076399000206e-06, + "loss": 0.77890217, + "num_input_tokens_seen": 417870208, + "router_z_loss_mlp": 0.39599609, + "step": 5044, + "time_per_iteration": 2.586209297180176 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043006, + "balance_loss_mlp": 1.00338101, + "epoch": 0.9705656021546749, + "flos": 627280931328.0, + "grad_norm": 0.03978912283300949, + "language_loss": 0.80898952, + "learning_rate": 2.2703248140424348e-06, + "loss": 0.81941956, + "num_input_tokens_seen": 417944464, + "router_z_loss_mlp": 0.39599609, + "step": 5045, + "time_per_iteration": 2.7928311824798584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042537, + "balance_loss_mlp": 1.00291181, + "epoch": 0.9707579838399384, + "flos": 472394145024.0, + "grad_norm": 0.03662115624180283, + "language_loss": 0.82899636, + "learning_rate": 2.2407664648819715e-06, + "loss": 0.83942175, + "num_input_tokens_seen": 418010480, + "router_z_loss_mlp": 0.39599609, + "step": 5046, + "time_per_iteration": 2.601562261581421 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042586, + "balance_loss_mlp": 1.00296044, + "epoch": 0.970950365525202, + "flos": 493139099136.0, + "grad_norm": 0.043534433084671205, + "language_loss": 0.80746675, + "learning_rate": 2.2114013629942475e-06, + "loss": 0.81789255, + "num_input_tokens_seen": 418083952, + "router_z_loss_mlp": 0.39599609, + "step": 5047, + "time_per_iteration": 2.7135002613067627 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042707, + "balance_loss_mlp": 1.00305772, + "epoch": 0.9711427472104656, + "flos": 558377663232.0, + "grad_norm": 0.038890765592642804, + "language_loss": 0.80750018, + "learning_rate": 2.1822295197799213e-06, + "loss": 0.81792724, + "num_input_tokens_seen": 418156672, + "router_z_loss_mlp": 0.39624023, + "step": 5048, + "time_per_iteration": 2.7204275131225586 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042925, + "balance_loss_mlp": 1.00332332, + "epoch": 0.9713351288957291, + "flos": 627101096448.0, + "grad_norm": 0.03085379211030374, + "language_loss": 0.8402819, + "learning_rate": 2.153250946564489e-06, + "loss": 0.85071111, + "num_input_tokens_seen": 418242160, + "router_z_loss_mlp": 0.39575195, + "step": 5049, + "time_per_iteration": 2.926865816116333 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01041862, + "balance_loss_mlp": 1.00214159, + "epoch": 0.9715275105809927, + "flos": 500083570944.0, + "grad_norm": 0.03585393995796312, + "language_loss": 0.81363153, + "learning_rate": 2.1244656545983397e-06, + "loss": 0.82405019, + "num_input_tokens_seen": 418316960, + "router_z_loss_mlp": 0.39697266, + "step": 5050, + "time_per_iteration": 2.7249972820281982 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01041706, + "balance_loss_mlp": 1.00198603, + "epoch": 0.9717198922662562, + "flos": 478481250048.0, + "grad_norm": 0.040701458255183315, + "language_loss": 0.77633119, + "learning_rate": 2.0958736550570345e-06, + "loss": 0.78674829, + "num_input_tokens_seen": 418383888, + "router_z_loss_mlp": 0.39697266, + "step": 5051, + "time_per_iteration": 2.5472941398620605 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01041661, + "balance_loss_mlp": 1.00196385, + "epoch": 0.9719122739515198, + "flos": 554550620928.0, + "grad_norm": 0.035991264794850895, + "language_loss": 0.78920466, + "learning_rate": 2.067474959040916e-06, + "loss": 0.79962128, + "num_input_tokens_seen": 418453776, + "router_z_loss_mlp": 0.39672852, + "step": 5052, + "time_per_iteration": 2.683638572692871 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042764, + "balance_loss_mlp": 1.00313902, + "epoch": 0.9721046556367834, + "flos": 566930910720.0, + "grad_norm": 0.03574138021311205, + "language_loss": 0.80556166, + "learning_rate": 2.0392695775753312e-06, + "loss": 0.81598926, + "num_input_tokens_seen": 418521984, + "router_z_loss_mlp": 0.39599609, + "step": 5053, + "time_per_iteration": 2.6971213817596436 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042967, + "balance_loss_mlp": 1.00329447, + "epoch": 0.972297037322047, + "flos": 561401773824.0, + "grad_norm": 0.04211125372208404, + "language_loss": 0.78557342, + "learning_rate": 2.0112575216105766e-06, + "loss": 0.7960031, + "num_input_tokens_seen": 418598768, + "router_z_loss_mlp": 0.39648438, + "step": 5054, + "time_per_iteration": 2.739396810531616 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042582, + "balance_loss_mlp": 1.00290966, + "epoch": 0.9724894190073105, + "flos": 513503974656.0, + "grad_norm": 0.03993140177401839, + "language_loss": 0.79701972, + "learning_rate": 1.9834388020218974e-06, + "loss": 0.80744553, + "num_input_tokens_seen": 418670064, + "router_z_loss_mlp": 0.39648438, + "step": 5055, + "time_per_iteration": 2.6858069896698 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042637, + "balance_loss_mlp": 1.00298762, + "epoch": 0.972681800692574, + "flos": 615039647232.0, + "grad_norm": 0.04326090214420337, + "language_loss": 0.80950338, + "learning_rate": 1.9558134296094875e-06, + "loss": 0.81992978, + "num_input_tokens_seen": 418745216, + "router_z_loss_mlp": 0.39624023, + "step": 5056, + "time_per_iteration": 2.7890920639038086 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042752, + "balance_loss_mlp": 1.00307882, + "epoch": 0.9728741823778376, + "flos": 835314696960.0, + "grad_norm": 0.03650842968056953, + "language_loss": 0.84358609, + "learning_rate": 1.92838141509849e-06, + "loss": 0.85401356, + "num_input_tokens_seen": 418824224, + "router_z_loss_mlp": 0.39648438, + "step": 5057, + "time_per_iteration": 3.0628442764282227 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042465, + "balance_loss_mlp": 1.00272107, + "epoch": 0.9730665640631012, + "flos": 572588359680.0, + "grad_norm": 0.03702571878539965, + "language_loss": 0.84975529, + "learning_rate": 1.9011427691389415e-06, + "loss": 0.86018002, + "num_input_tokens_seen": 418899712, + "router_z_loss_mlp": 0.3972168, + "step": 5058, + "time_per_iteration": 2.737433671951294 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042986, + "balance_loss_mlp": 1.00340819, + "epoch": 0.9732589457483648, + "flos": 507520882176.0, + "grad_norm": 0.04068245546589233, + "language_loss": 0.77703661, + "learning_rate": 1.8740975023057715e-06, + "loss": 0.78746647, + "num_input_tokens_seen": 418964912, + "router_z_loss_mlp": 0.39550781, + "step": 5059, + "time_per_iteration": 2.584334373474121 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043043, + "balance_loss_mlp": 1.00346506, + "epoch": 0.9734513274336283, + "flos": 928483759872.0, + "grad_norm": 0.03701632354193653, + "language_loss": 0.80842561, + "learning_rate": 1.84724562509897e-06, + "loss": 0.818856, + "num_input_tokens_seen": 419040032, + "router_z_loss_mlp": 0.39550781, + "step": 5060, + "time_per_iteration": 3.1218395233154297 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042819, + "balance_loss_mlp": 1.00314617, + "epoch": 0.9736437091188919, + "flos": 492926216448.0, + "grad_norm": 0.03106926650886392, + "language_loss": 0.78421533, + "learning_rate": 1.8205871479433089e-06, + "loss": 0.79464358, + "num_input_tokens_seen": 419112672, + "router_z_loss_mlp": 0.39648438, + "step": 5061, + "time_per_iteration": 2.724682092666626 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042942, + "balance_loss_mlp": 1.003245, + "epoch": 0.9738360908041555, + "flos": 614455434240.0, + "grad_norm": 0.03875354346160162, + "language_loss": 0.8378309, + "learning_rate": 1.7941220811885096e-06, + "loss": 0.84826028, + "num_input_tokens_seen": 419183408, + "router_z_loss_mlp": 0.39672852, + "step": 5062, + "time_per_iteration": 2.7275002002716064 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044144, + "balance_loss_mlp": 1.0054245, + "epoch": 0.974028472489419, + "flos": 1552733766144.0, + "grad_norm": 0.004368152650976507, + "language_loss": 0.75992095, + "learning_rate": 1.7678504351092972e-06, + "loss": 0.77036238, + "num_input_tokens_seen": 419415472, + "router_z_loss_mlp": 0.38671875, + "step": 5063, + "time_per_iteration": 4.944604873657227 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045021, + "balance_loss_mlp": 1.00630188, + "epoch": 0.9742208541746825, + "flos": 1414180678656.0, + "grad_norm": 0.007059570521795609, + "language_loss": 0.79677713, + "learning_rate": 1.7417722199051245e-06, + "loss": 0.80722737, + "num_input_tokens_seen": 419651840, + "router_z_loss_mlp": 0.38671875, + "step": 5064, + "time_per_iteration": 4.950110197067261 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01041149, + "balance_loss_mlp": 1.00147641, + "epoch": 0.9744132358599461, + "flos": 676099280640.0, + "grad_norm": 0.03034439726817451, + "language_loss": 0.77150154, + "learning_rate": 1.7158874457005592e-06, + "loss": 0.78191304, + "num_input_tokens_seen": 419729424, + "router_z_loss_mlp": 0.39648438, + "step": 5065, + "time_per_iteration": 2.856956958770752 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01041453, + "balance_loss_mlp": 1.00168526, + "epoch": 0.9746056175452097, + "flos": 599598308352.0, + "grad_norm": 0.034912593112501174, + "language_loss": 0.78129935, + "learning_rate": 1.690196122544896e-06, + "loss": 0.79171389, + "num_input_tokens_seen": 419803616, + "router_z_loss_mlp": 0.39746094, + "step": 5066, + "time_per_iteration": 2.803445816040039 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01041293, + "balance_loss_mlp": 1.00152469, + "epoch": 0.9747979992304733, + "flos": 733534060800.0, + "grad_norm": 0.03597891818923648, + "language_loss": 0.82944268, + "learning_rate": 1.6646982604123784e-06, + "loss": 0.83985561, + "num_input_tokens_seen": 419883536, + "router_z_loss_mlp": 0.39746094, + "step": 5067, + "time_per_iteration": 2.9895918369293213 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01041402, + "balance_loss_mlp": 1.00168097, + "epoch": 0.9749903809157369, + "flos": 617620495872.0, + "grad_norm": 0.04145388586739779, + "language_loss": 0.76952046, + "learning_rate": 1.6393938692022548e-06, + "loss": 0.77993447, + "num_input_tokens_seen": 419956816, + "router_z_loss_mlp": 0.39697266, + "step": 5068, + "time_per_iteration": 2.743577480316162 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01041465, + "balance_loss_mlp": 1.00169683, + "epoch": 0.9751827626010003, + "flos": 469350592512.0, + "grad_norm": 0.035842707054568955, + "language_loss": 0.84308249, + "learning_rate": 1.6142829587384443e-06, + "loss": 0.85349715, + "num_input_tokens_seen": 420022096, + "router_z_loss_mlp": 0.39746094, + "step": 5069, + "time_per_iteration": 2.551919460296631 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042838, + "balance_loss_mlp": 1.00311708, + "epoch": 0.9753751442862639, + "flos": 600408043008.0, + "grad_norm": 0.03945480801973415, + "language_loss": 0.85697186, + "learning_rate": 1.5893655387698713e-06, + "loss": 0.86740017, + "num_input_tokens_seen": 420097008, + "router_z_loss_mlp": 0.39697266, + "step": 5070, + "time_per_iteration": 2.8267908096313477 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104272, + "balance_loss_mlp": 1.00299966, + "epoch": 0.9755675259715275, + "flos": 652092055296.0, + "grad_norm": 0.03287905354200328, + "language_loss": 0.82673955, + "learning_rate": 1.5646416189704637e-06, + "loss": 0.83716673, + "num_input_tokens_seen": 420174960, + "router_z_loss_mlp": 0.39697266, + "step": 5071, + "time_per_iteration": 2.898956537246704 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042984, + "balance_loss_mlp": 1.00323999, + "epoch": 0.9757599076567911, + "flos": 564725283072.0, + "grad_norm": 0.039048917165716984, + "language_loss": 0.79786921, + "learning_rate": 1.5401112089387659e-06, + "loss": 0.80829906, + "num_input_tokens_seen": 420245248, + "router_z_loss_mlp": 0.3972168, + "step": 5072, + "time_per_iteration": 2.687793016433716 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042691, + "balance_loss_mlp": 1.00301838, + "epoch": 0.9759522893420547, + "flos": 505649646336.0, + "grad_norm": 0.036489369587936984, + "language_loss": 0.80441165, + "learning_rate": 1.5157743181983819e-06, + "loss": 0.81483853, + "num_input_tokens_seen": 420310688, + "router_z_loss_mlp": 0.39648438, + "step": 5073, + "time_per_iteration": 2.646900177001953 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045537, + "balance_loss_mlp": 1.00591159, + "epoch": 0.9761446710273182, + "flos": 584838392064.0, + "grad_norm": 0.039814826111222036, + "language_loss": 0.82315922, + "learning_rate": 1.4916309561976982e-06, + "loss": 0.83361453, + "num_input_tokens_seen": 420379008, + "router_z_loss_mlp": 0.39599609, + "step": 5074, + "time_per_iteration": 2.69627046585083 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045692, + "balance_loss_mlp": 1.00604343, + "epoch": 0.9763370527125818, + "flos": 483172462080.0, + "grad_norm": 0.04765850391618214, + "language_loss": 0.82408017, + "learning_rate": 1.4676811323099947e-06, + "loss": 0.83453709, + "num_input_tokens_seen": 420445504, + "router_z_loss_mlp": 0.39624023, + "step": 5075, + "time_per_iteration": 2.619927406311035 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045897, + "balance_loss_mlp": 1.00617659, + "epoch": 0.9765294343978453, + "flos": 620114828544.0, + "grad_norm": 0.04096277036826416, + "language_loss": 0.7898621, + "learning_rate": 1.4439248558335561e-06, + "loss": 0.80032104, + "num_input_tokens_seen": 420520528, + "router_z_loss_mlp": 0.39697266, + "step": 5076, + "time_per_iteration": 2.72031569480896 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042664, + "balance_loss_mlp": 1.002967, + "epoch": 0.9767218160831089, + "flos": 527588304384.0, + "grad_norm": 0.038563950916879496, + "language_loss": 0.85749316, + "learning_rate": 1.4203621359911712e-06, + "loss": 0.86791986, + "num_input_tokens_seen": 420586224, + "router_z_loss_mlp": 0.39672852, + "step": 5077, + "time_per_iteration": 2.5702054500579834 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042493, + "balance_loss_mlp": 1.0028677, + "epoch": 0.9769141977683724, + "flos": 526246846464.0, + "grad_norm": 0.0452921679578959, + "language_loss": 0.84605044, + "learning_rate": 1.3969929819308557e-06, + "loss": 0.85647535, + "num_input_tokens_seen": 420655456, + "router_z_loss_mlp": 0.39599609, + "step": 5078, + "time_per_iteration": 2.7002179622650146 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042603, + "balance_loss_mlp": 1.00295401, + "epoch": 0.977106579453636, + "flos": 458644207104.0, + "grad_norm": 0.036677586674253186, + "language_loss": 0.81371784, + "learning_rate": 1.3738174027252416e-06, + "loss": 0.82414383, + "num_input_tokens_seen": 420733216, + "router_z_loss_mlp": 0.39624023, + "step": 5079, + "time_per_iteration": 2.796189308166504 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104218, + "balance_loss_mlp": 1.00250697, + "epoch": 0.9772989611388996, + "flos": 533134937856.0, + "grad_norm": 0.03711615810210664, + "language_loss": 0.81923473, + "learning_rate": 1.3508354073719642e-06, + "loss": 0.82965648, + "num_input_tokens_seen": 420803376, + "router_z_loss_mlp": 0.39648438, + "step": 5080, + "time_per_iteration": 2.614269495010376 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043161, + "balance_loss_mlp": 1.0034405, + "epoch": 0.9774913428241632, + "flos": 756755851008.0, + "grad_norm": 0.03628191500870131, + "language_loss": 0.86773902, + "learning_rate": 1.3280470047933313e-06, + "loss": 0.87817061, + "num_input_tokens_seen": 420886256, + "router_z_loss_mlp": 0.39697266, + "step": 5081, + "time_per_iteration": 3.000998020172119 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043983, + "balance_loss_mlp": 1.00526428, + "epoch": 0.9776837245094268, + "flos": 1557670908672.0, + "grad_norm": 0.004261390789563241, + "language_loss": 0.78895497, + "learning_rate": 1.3054522038366544e-06, + "loss": 0.79939473, + "num_input_tokens_seen": 421123728, + "router_z_loss_mlp": 0.38671875, + "step": 5082, + "time_per_iteration": 4.966464281082153 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042762, + "balance_loss_mlp": 1.00311267, + "epoch": 0.9778761061946902, + "flos": 593634657792.0, + "grad_norm": 0.06033698096536924, + "language_loss": 0.84689307, + "learning_rate": 1.2830510132739725e-06, + "loss": 0.85732073, + "num_input_tokens_seen": 421192576, + "router_z_loss_mlp": 0.39624023, + "step": 5083, + "time_per_iteration": 2.6818041801452637 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01041783, + "balance_loss_mlp": 1.00203848, + "epoch": 0.9780684878799538, + "flos": 415832282880.0, + "grad_norm": 0.03521694229943953, + "language_loss": 0.82329738, + "learning_rate": 1.2608434418022175e-06, + "loss": 0.8337152, + "num_input_tokens_seen": 421256272, + "router_z_loss_mlp": 0.3972168, + "step": 5084, + "time_per_iteration": 2.487070322036743 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01041655, + "balance_loss_mlp": 1.00188649, + "epoch": 0.9782608695652174, + "flos": 569544807168.0, + "grad_norm": 0.0361861830744165, + "language_loss": 0.85351056, + "learning_rate": 1.2388294980431036e-06, + "loss": 0.86392707, + "num_input_tokens_seen": 421332880, + "router_z_loss_mlp": 0.39746094, + "step": 5085, + "time_per_iteration": 2.7171080112457275 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01041786, + "balance_loss_mlp": 1.00204134, + "epoch": 0.978453251250481, + "flos": 691762250496.0, + "grad_norm": 0.03997358667222654, + "language_loss": 0.8329246, + "learning_rate": 1.217009190543239e-06, + "loss": 0.84334242, + "num_input_tokens_seen": 421406160, + "router_z_loss_mlp": 0.3972168, + "step": 5086, + "time_per_iteration": 2.908737897872925 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01041417, + "balance_loss_mlp": 1.00167274, + "epoch": 0.9786456329357445, + "flos": 503572330752.0, + "grad_norm": 0.03505329178306916, + "language_loss": 0.77599311, + "learning_rate": 1.1953825277740694e-06, + "loss": 0.78640735, + "num_input_tokens_seen": 421476208, + "router_z_loss_mlp": 0.3972168, + "step": 5087, + "time_per_iteration": 2.654874324798584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042745, + "balance_loss_mlp": 1.00302482, + "epoch": 0.9788380146210081, + "flos": 864606095616.0, + "grad_norm": 0.04385354036304431, + "language_loss": 0.8105306, + "learning_rate": 1.1739495181317117e-06, + "loss": 0.82095802, + "num_input_tokens_seen": 421549232, + "router_z_loss_mlp": 0.39697266, + "step": 5088, + "time_per_iteration": 3.037149667739868 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042665, + "balance_loss_mlp": 1.0030396, + "epoch": 0.9790303963062716, + "flos": 513746992896.0, + "grad_norm": 0.03583893943421543, + "language_loss": 0.84371638, + "learning_rate": 1.1527101699371767e-06, + "loss": 0.85414302, + "num_input_tokens_seen": 421617056, + "router_z_loss_mlp": 0.39599609, + "step": 5089, + "time_per_iteration": 2.5842764377593994 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043218, + "balance_loss_mlp": 1.00349736, + "epoch": 0.9792227779915352, + "flos": 495411800832.0, + "grad_norm": 0.04233077980787355, + "language_loss": 0.86789143, + "learning_rate": 1.1316644914364237e-06, + "loss": 0.87832361, + "num_input_tokens_seen": 421683424, + "router_z_loss_mlp": 0.39697266, + "step": 5090, + "time_per_iteration": 2.5548324584960938 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010429, + "balance_loss_mlp": 1.00322676, + "epoch": 0.9794151596767988, + "flos": 609484265472.0, + "grad_norm": 0.03604199213932351, + "language_loss": 0.81851107, + "learning_rate": 1.1108124908000838e-06, + "loss": 0.82894003, + "num_input_tokens_seen": 421761200, + "router_z_loss_mlp": 0.39648438, + "step": 5091, + "time_per_iteration": 2.7624361515045166 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042753, + "balance_loss_mlp": 1.00305605, + "epoch": 0.9796075413620623, + "flos": 479197665792.0, + "grad_norm": 0.03750770403802879, + "language_loss": 0.8721531, + "learning_rate": 1.09015417612357e-06, + "loss": 0.88258064, + "num_input_tokens_seen": 421829600, + "router_z_loss_mlp": 0.39672852, + "step": 5092, + "time_per_iteration": 2.596569299697876 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042386, + "balance_loss_mlp": 1.00278485, + "epoch": 0.9797999230473259, + "flos": 593363449344.0, + "grad_norm": 0.03750271931428907, + "language_loss": 0.84715217, + "learning_rate": 1.0696895554271335e-06, + "loss": 0.85757607, + "num_input_tokens_seen": 421904928, + "router_z_loss_mlp": 0.39575195, + "step": 5093, + "time_per_iteration": 2.798842191696167 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042858, + "balance_loss_mlp": 1.00313711, + "epoch": 0.9799923047325895, + "flos": 557564037888.0, + "grad_norm": 0.03732665936574346, + "language_loss": 0.82240361, + "learning_rate": 1.049418636655919e-06, + "loss": 0.83283222, + "num_input_tokens_seen": 421989616, + "router_z_loss_mlp": 0.39697266, + "step": 5094, + "time_per_iteration": 2.9326531887054443 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01041809, + "balance_loss_mlp": 1.00196946, + "epoch": 0.9801846864178531, + "flos": 580629325824.0, + "grad_norm": 0.03472184065520672, + "language_loss": 0.84838355, + "learning_rate": 1.0293414276797974e-06, + "loss": 0.85880166, + "num_input_tokens_seen": 422067088, + "router_z_loss_mlp": 0.39819336, + "step": 5095, + "time_per_iteration": 2.7500767707824707 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042666, + "balance_loss_mlp": 1.00296915, + "epoch": 0.9803770681031165, + "flos": 516211190016.0, + "grad_norm": 0.03395616690027392, + "language_loss": 0.80524683, + "learning_rate": 1.0094579362933677e-06, + "loss": 0.81567359, + "num_input_tokens_seen": 422141136, + "router_z_loss_mlp": 0.39672852, + "step": 5096, + "time_per_iteration": 2.670788288116455 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104258, + "balance_loss_mlp": 1.00285935, + "epoch": 0.9805694497883801, + "flos": 568120723968.0, + "grad_norm": 0.035035104097038514, + "language_loss": 0.78688991, + "learning_rate": 9.897681702160654e-07, + "loss": 0.79731572, + "num_input_tokens_seen": 422216400, + "router_z_loss_mlp": 0.39697266, + "step": 5097, + "time_per_iteration": 2.7245545387268066 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042876, + "balance_loss_mlp": 1.00313163, + "epoch": 0.9807618314736437, + "flos": 480333043968.0, + "grad_norm": 0.14407464636347864, + "language_loss": 0.74170625, + "learning_rate": 9.702721370922208e-07, + "loss": 0.75213504, + "num_input_tokens_seen": 422287664, + "router_z_loss_mlp": 0.3972168, + "step": 5098, + "time_per_iteration": 2.662116527557373 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043148, + "balance_loss_mlp": 1.00342762, + "epoch": 0.9809542131589073, + "flos": 546342458880.0, + "grad_norm": 0.04001499781359293, + "language_loss": 0.80586725, + "learning_rate": 9.509698444908344e-07, + "loss": 0.81629872, + "num_input_tokens_seen": 422357552, + "router_z_loss_mlp": 0.39697266, + "step": 5099, + "time_per_iteration": 2.621173143386841 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042571, + "balance_loss_mlp": 1.00299406, + "epoch": 0.9811465948441709, + "flos": 521863781376.0, + "grad_norm": 0.03872092892168167, + "language_loss": 0.80255032, + "learning_rate": 9.318612999057452e-07, + "loss": 0.81297612, + "num_input_tokens_seen": 422425872, + "router_z_loss_mlp": 0.39550781, + "step": 5100, + "time_per_iteration": 2.626246452331543 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042715, + "balance_loss_mlp": 1.00308967, + "epoch": 0.9813389765294344, + "flos": 542321975808.0, + "grad_norm": 0.03966927263008885, + "language_loss": 0.80838525, + "learning_rate": 9.129465107554635e-07, + "loss": 0.81881237, + "num_input_tokens_seen": 422495760, + "router_z_loss_mlp": 0.39599609, + "step": 5101, + "time_per_iteration": 2.6338257789611816 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042695, + "balance_loss_mlp": 1.00304604, + "epoch": 0.981531358214698, + "flos": 568465809408.0, + "grad_norm": 0.036339927688089545, + "language_loss": 0.84584022, + "learning_rate": 8.942254843834485e-07, + "loss": 0.85626721, + "num_input_tokens_seen": 422568112, + "router_z_loss_mlp": 0.39624023, + "step": 5102, + "time_per_iteration": 2.7215864658355713 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042808, + "balance_loss_mlp": 1.00325489, + "epoch": 0.9817237398999615, + "flos": 578414949888.0, + "grad_norm": 0.03608910956574523, + "language_loss": 0.81383669, + "learning_rate": 8.756982280578307e-07, + "loss": 0.82426471, + "num_input_tokens_seen": 422641280, + "router_z_loss_mlp": 0.39526367, + "step": 5103, + "time_per_iteration": 2.6962757110595703 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043131, + "balance_loss_mlp": 1.00336289, + "epoch": 0.9819161215852251, + "flos": 702855517440.0, + "grad_norm": 0.03526602315736388, + "language_loss": 0.8230114, + "learning_rate": 8.573647489714676e-07, + "loss": 0.83344281, + "num_input_tokens_seen": 422720416, + "router_z_loss_mlp": 0.39746094, + "step": 5104, + "time_per_iteration": 2.952726125717163 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045292, + "balance_loss_mlp": 1.00552344, + "epoch": 0.9821085032704886, + "flos": 625453436928.0, + "grad_norm": 0.03571049923585047, + "language_loss": 0.84285426, + "learning_rate": 8.392250542421653e-07, + "loss": 0.85330725, + "num_input_tokens_seen": 422800384, + "router_z_loss_mlp": 0.39746094, + "step": 5105, + "time_per_iteration": 2.919321060180664 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045434, + "balance_loss_mlp": 1.00573695, + "epoch": 0.9823008849557522, + "flos": 500493785088.0, + "grad_norm": 0.042150017218887774, + "language_loss": 0.81642014, + "learning_rate": 8.212791509122353e-07, + "loss": 0.82687449, + "num_input_tokens_seen": 422870768, + "router_z_loss_mlp": 0.39672852, + "step": 5106, + "time_per_iteration": 2.6857118606567383 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01041236, + "balance_loss_mlp": 1.00149202, + "epoch": 0.9824932666410158, + "flos": 524905388544.0, + "grad_norm": 0.04484543637804275, + "language_loss": 0.73315752, + "learning_rate": 8.035270459489929e-07, + "loss": 0.74356979, + "num_input_tokens_seen": 422942864, + "router_z_loss_mlp": 0.3972168, + "step": 5107, + "time_per_iteration": 2.685387372970581 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01041466, + "balance_loss_mlp": 1.00172162, + "epoch": 0.9826856483262794, + "flos": 503676343296.0, + "grad_norm": 0.035210892499995095, + "language_loss": 0.82785177, + "learning_rate": 7.859687462443698e-07, + "loss": 0.83826649, + "num_input_tokens_seen": 423013600, + "router_z_loss_mlp": 0.3972168, + "step": 5108, + "time_per_iteration": 2.648824453353882 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01041998, + "balance_loss_mlp": 1.00225329, + "epoch": 0.982878030011543, + "flos": 563214683904.0, + "grad_norm": 0.032324614510234884, + "language_loss": 0.84624445, + "learning_rate": 7.686042586151354e-07, + "loss": 0.85666442, + "num_input_tokens_seen": 423093680, + "router_z_loss_mlp": 0.3972168, + "step": 5109, + "time_per_iteration": 2.8252689838409424 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01041628, + "balance_loss_mlp": 1.00188339, + "epoch": 0.9830704116968064, + "flos": 538214976768.0, + "grad_norm": 0.032173031001019446, + "language_loss": 0.83402407, + "learning_rate": 7.514335898027857e-07, + "loss": 0.84444034, + "num_input_tokens_seen": 423168608, + "router_z_loss_mlp": 0.3972168, + "step": 5110, + "time_per_iteration": 2.7532317638397217 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042923, + "balance_loss_mlp": 1.00322616, + "epoch": 0.98326279338207, + "flos": 459903039744.0, + "grad_norm": 0.03572969803594758, + "language_loss": 0.84500074, + "learning_rate": 7.344567464735441e-07, + "loss": 0.85542995, + "num_input_tokens_seen": 423233552, + "router_z_loss_mlp": 0.39672852, + "step": 5111, + "time_per_iteration": 2.524393081665039 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042896, + "balance_loss_mlp": 1.00327051, + "epoch": 0.9834551750673336, + "flos": 642190546944.0, + "grad_norm": 0.032775358320962594, + "language_loss": 0.80079061, + "learning_rate": 7.17673735218416e-07, + "loss": 0.81121951, + "num_input_tokens_seen": 423307440, + "router_z_loss_mlp": 0.39599609, + "step": 5112, + "time_per_iteration": 2.821751594543457 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042849, + "balance_loss_mlp": 1.00320005, + "epoch": 0.9836475567525972, + "flos": 1073549717760.0, + "grad_norm": 0.03499581392563108, + "language_loss": 0.79561722, + "learning_rate": 7.010845625530782e-07, + "loss": 0.80604577, + "num_input_tokens_seen": 423394880, + "router_z_loss_mlp": 0.39624023, + "step": 5113, + "time_per_iteration": 3.4073426723480225 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042885, + "balance_loss_mlp": 1.00318861, + "epoch": 0.9838399384378607, + "flos": 566279623680.0, + "grad_norm": 0.04234860016230719, + "language_loss": 0.7609002, + "learning_rate": 6.846892349181566e-07, + "loss": 0.77132899, + "num_input_tokens_seen": 423461792, + "router_z_loss_mlp": 0.39672852, + "step": 5114, + "time_per_iteration": 2.6536595821380615 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104306, + "balance_loss_mlp": 1.00336301, + "epoch": 0.9840323201231242, + "flos": 774181186560.0, + "grad_norm": 0.05548408473987726, + "language_loss": 0.80118275, + "learning_rate": 6.684877586787819e-07, + "loss": 0.81161332, + "num_input_tokens_seen": 423539952, + "router_z_loss_mlp": 0.39672852, + "step": 5115, + "time_per_iteration": 2.966550350189209 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042594, + "balance_loss_mlp": 1.00294471, + "epoch": 0.9842247018083878, + "flos": 473249566464.0, + "grad_norm": 0.03527003893098496, + "language_loss": 0.85987931, + "learning_rate": 6.524801401249225e-07, + "loss": 0.87030524, + "num_input_tokens_seen": 423607184, + "router_z_loss_mlp": 0.39624023, + "step": 5116, + "time_per_iteration": 2.5599422454833984 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042514, + "balance_loss_mlp": 1.0028646, + "epoch": 0.9844170834936514, + "flos": 526311975168.0, + "grad_norm": 0.03473210805484262, + "language_loss": 0.85255396, + "learning_rate": 6.366663854713295e-07, + "loss": 0.86297911, + "num_input_tokens_seen": 423676528, + "router_z_loss_mlp": 0.39624023, + "step": 5117, + "time_per_iteration": 2.6238479614257812 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043762, + "balance_loss_mlp": 1.00504303, + "epoch": 0.984609465178915, + "flos": 1570626663168.0, + "grad_norm": 0.005544622435587091, + "language_loss": 0.77162516, + "learning_rate": 6.210465008574251e-07, + "loss": 0.78206277, + "num_input_tokens_seen": 423905856, + "router_z_loss_mlp": 0.38671875, + "step": 5118, + "time_per_iteration": 4.924822568893433 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104242, + "balance_loss_mlp": 1.0027473, + "epoch": 0.9848018468641785, + "flos": 520569955584.0, + "grad_norm": 0.04372972135981774, + "language_loss": 0.82049799, + "learning_rate": 6.056204923473584e-07, + "loss": 0.83092213, + "num_input_tokens_seen": 423972496, + "router_z_loss_mlp": 0.39648438, + "step": 5119, + "time_per_iteration": 2.580916404724121 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042523, + "balance_loss_mlp": 1.00287449, + "epoch": 0.9849942285494421, + "flos": 493987717632.0, + "grad_norm": 0.03481962027898756, + "language_loss": 0.83367181, + "learning_rate": 5.903883659301167e-07, + "loss": 0.84409702, + "num_input_tokens_seen": 424039968, + "router_z_loss_mlp": 0.39624023, + "step": 5120, + "time_per_iteration": 2.5750389099121094 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042951, + "balance_loss_mlp": 1.00320685, + "epoch": 0.9851866102347057, + "flos": 547050126336.0, + "grad_norm": 0.03798440780178793, + "language_loss": 0.8116461, + "learning_rate": 5.753501275193029e-07, + "loss": 0.82207561, + "num_input_tokens_seen": 424108096, + "router_z_loss_mlp": 0.3972168, + "step": 5121, + "time_per_iteration": 2.6916379928588867 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042599, + "balance_loss_mlp": 1.00290275, + "epoch": 0.9853789919199692, + "flos": 477215614464.0, + "grad_norm": 0.04012512347412675, + "language_loss": 0.80473411, + "learning_rate": 5.605057829531912e-07, + "loss": 0.81516004, + "num_input_tokens_seen": 424172256, + "router_z_loss_mlp": 0.39672852, + "step": 5122, + "time_per_iteration": 2.5235373973846436 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01041717, + "balance_loss_mlp": 1.00197303, + "epoch": 0.9855713736052328, + "flos": 1034309178624.0, + "grad_norm": 0.035879462257235414, + "language_loss": 0.7627933, + "learning_rate": 5.458553379950049e-07, + "loss": 0.77321047, + "num_input_tokens_seen": 424261088, + "router_z_loss_mlp": 0.3972168, + "step": 5123, + "time_per_iteration": 3.4110076427459717 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01041801, + "balance_loss_mlp": 1.00210428, + "epoch": 0.9857637552904963, + "flos": 496080584448.0, + "grad_norm": 0.03414998206245118, + "language_loss": 0.82715416, + "learning_rate": 5.31398798332472e-07, + "loss": 0.83757216, + "num_input_tokens_seen": 424329168, + "router_z_loss_mlp": 0.39672852, + "step": 5124, + "time_per_iteration": 2.608877658843994 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042334, + "balance_loss_mlp": 1.0025183, + "epoch": 0.9859561369757599, + "flos": 593382891264.0, + "grad_norm": 0.040829141908968115, + "language_loss": 0.84081221, + "learning_rate": 5.17136169578103e-07, + "loss": 0.85123551, + "num_input_tokens_seen": 424399392, + "router_z_loss_mlp": 0.39794922, + "step": 5125, + "time_per_iteration": 2.68752121925354 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042843, + "balance_loss_mlp": 1.00319362, + "epoch": 0.9861485186610235, + "flos": 487983237888.0, + "grad_norm": 0.03435380291834103, + "language_loss": 0.7900275, + "learning_rate": 5.030674572691907e-07, + "loss": 0.80045593, + "num_input_tokens_seen": 424470080, + "router_z_loss_mlp": 0.39624023, + "step": 5126, + "time_per_iteration": 2.6282498836517334 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042977, + "balance_loss_mlp": 1.00335145, + "epoch": 0.9863409003462871, + "flos": 519834097920.0, + "grad_norm": 0.03004792454778602, + "language_loss": 0.83327055, + "learning_rate": 4.891926668676994e-07, + "loss": 0.84370041, + "num_input_tokens_seen": 424541824, + "router_z_loss_mlp": 0.39599609, + "step": 5127, + "time_per_iteration": 2.6406099796295166 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044044, + "balance_loss_mlp": 1.00532532, + "epoch": 0.9865332820315506, + "flos": 1489297418496.0, + "grad_norm": 0.004350637395800546, + "language_loss": 0.79182732, + "learning_rate": 4.755118037602646e-07, + "loss": 0.80226779, + "num_input_tokens_seen": 424773408, + "router_z_loss_mlp": 0.38671875, + "step": 5128, + "time_per_iteration": 4.868277549743652 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104271, + "balance_loss_mlp": 1.00301361, + "epoch": 0.9867256637168141, + "flos": 583218922752.0, + "grad_norm": 0.03562084735893716, + "language_loss": 0.79497892, + "learning_rate": 4.620248732582488e-07, + "loss": 0.80540597, + "num_input_tokens_seen": 424840608, + "router_z_loss_mlp": 0.39672852, + "step": 5129, + "time_per_iteration": 2.710846185684204 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042514, + "balance_loss_mlp": 1.00284064, + "epoch": 0.9869180454020777, + "flos": 960927581184.0, + "grad_norm": 0.03494818497849394, + "language_loss": 0.86453211, + "learning_rate": 4.487318805977969e-07, + "loss": 0.87495726, + "num_input_tokens_seen": 424926128, + "router_z_loss_mlp": 0.39648438, + "step": 5130, + "time_per_iteration": 3.2606029510498047 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01041493, + "balance_loss_mlp": 1.00177276, + "epoch": 0.9871104270873413, + "flos": 772114564608.0, + "grad_norm": 0.03632052954178154, + "language_loss": 0.82985795, + "learning_rate": 4.3563283093966954e-07, + "loss": 0.8402729, + "num_input_tokens_seen": 425005744, + "router_z_loss_mlp": 0.39697266, + "step": 5131, + "time_per_iteration": 2.999943494796753 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104149, + "balance_loss_mlp": 1.00172222, + "epoch": 0.9873028087726049, + "flos": 447366247680.0, + "grad_norm": 0.04349858393955328, + "language_loss": 0.78963101, + "learning_rate": 4.2272772936940986e-07, + "loss": 0.80004597, + "num_input_tokens_seen": 425068112, + "router_z_loss_mlp": 0.39746094, + "step": 5132, + "time_per_iteration": 2.523855209350586 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01041372, + "balance_loss_mlp": 1.00167501, + "epoch": 0.9874951904578684, + "flos": 508628070144.0, + "grad_norm": 0.03437271489823481, + "language_loss": 0.86646765, + "learning_rate": 4.1001658089717676e-07, + "loss": 0.87688142, + "num_input_tokens_seen": 425137408, + "router_z_loss_mlp": 0.39672852, + "step": 5133, + "time_per_iteration": 2.6064534187316895 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01041756, + "balance_loss_mlp": 1.00203538, + "epoch": 0.987687572143132, + "flos": 718038286848.0, + "grad_norm": 0.0348410351371582, + "language_loss": 0.82776976, + "learning_rate": 3.9749939045791164e-07, + "loss": 0.83818728, + "num_input_tokens_seen": 425213504, + "router_z_loss_mlp": 0.39697266, + "step": 5134, + "time_per_iteration": 2.9822537899017334 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044029, + "balance_loss_mlp": 1.00531006, + "epoch": 0.9878799538283956, + "flos": 1541960306688.0, + "grad_norm": 0.004341111973416102, + "language_loss": 0.79817951, + "learning_rate": 3.851761629111716e-07, + "loss": 0.8086198, + "num_input_tokens_seen": 425451296, + "router_z_loss_mlp": 0.38671875, + "step": 5135, + "time_per_iteration": 4.959632396697998 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104268, + "balance_loss_mlp": 1.00305438, + "epoch": 0.9880723355136591, + "flos": 722738247168.0, + "grad_norm": 0.034114831969587756, + "language_loss": 0.81975973, + "learning_rate": 3.730469030412964e-07, + "loss": 0.83018649, + "num_input_tokens_seen": 425527536, + "router_z_loss_mlp": 0.39599609, + "step": 5136, + "time_per_iteration": 2.9661388397216797 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104288, + "balance_loss_mlp": 1.00325525, + "epoch": 0.9882647171989226, + "flos": 558414601728.0, + "grad_norm": 0.03182489959005223, + "language_loss": 0.84676516, + "learning_rate": 3.611116155572969e-07, + "loss": 0.85719395, + "num_input_tokens_seen": 425596608, + "router_z_loss_mlp": 0.39599609, + "step": 5137, + "time_per_iteration": 2.6703665256500244 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01041844, + "balance_loss_mlp": 1.00207615, + "epoch": 0.9884570988841862, + "flos": 563941793280.0, + "grad_norm": 0.037188554451168025, + "language_loss": 0.80630195, + "learning_rate": 3.493703050927999e-07, + "loss": 0.81672037, + "num_input_tokens_seen": 425667280, + "router_z_loss_mlp": 0.39746094, + "step": 5138, + "time_per_iteration": 2.707731008529663 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01041706, + "balance_loss_mlp": 1.00196183, + "epoch": 0.9886494805694498, + "flos": 432669514752.0, + "grad_norm": 0.038119930885520625, + "language_loss": 0.86471844, + "learning_rate": 3.378229762062146e-07, + "loss": 0.87513542, + "num_input_tokens_seen": 425730736, + "router_z_loss_mlp": 0.3972168, + "step": 5139, + "time_per_iteration": 2.5015242099761963 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042765, + "balance_loss_mlp": 1.00309241, + "epoch": 0.9888418622547134, + "flos": 593241940224.0, + "grad_norm": 0.031962548376230554, + "language_loss": 0.90608424, + "learning_rate": 3.264696333806771e-07, + "loss": 0.91651189, + "num_input_tokens_seen": 425807616, + "router_z_loss_mlp": 0.39648438, + "step": 5140, + "time_per_iteration": 2.7830655574798584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042363, + "balance_loss_mlp": 1.0027616, + "epoch": 0.989034243939977, + "flos": 1136867468544.0, + "grad_norm": 0.03159325853883531, + "language_loss": 0.80423748, + "learning_rate": 3.1531028102388394e-07, + "loss": 0.81466115, + "num_input_tokens_seen": 425900880, + "router_z_loss_mlp": 0.39575195, + "step": 5141, + "time_per_iteration": 3.5227456092834473 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042941, + "balance_loss_mlp": 1.00324464, + "epoch": 0.9892266256252404, + "flos": 567731897088.0, + "grad_norm": 0.03916610087942592, + "language_loss": 0.82117474, + "learning_rate": 3.0434492346825824e-07, + "loss": 0.83160412, + "num_input_tokens_seen": 425973632, + "router_z_loss_mlp": 0.39672852, + "step": 5142, + "time_per_iteration": 2.6800377368927 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01041551, + "balance_loss_mlp": 1.00185382, + "epoch": 0.989419007310504, + "flos": 641871706368.0, + "grad_norm": 0.03635913094447108, + "language_loss": 0.84309208, + "learning_rate": 2.9357356497095033e-07, + "loss": 0.85350764, + "num_input_tokens_seen": 426057088, + "router_z_loss_mlp": 0.39672852, + "step": 5143, + "time_per_iteration": 2.9835867881774902 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042368, + "balance_loss_mlp": 1.00276697, + "epoch": 0.9896113889957676, + "flos": 456449273088.0, + "grad_norm": 0.04157361104521907, + "language_loss": 0.81831914, + "learning_rate": 2.829962097138372e-07, + "loss": 0.82874286, + "num_input_tokens_seen": 426124336, + "router_z_loss_mlp": 0.39575195, + "step": 5144, + "time_per_iteration": 2.60195255279541 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042366, + "balance_loss_mlp": 1.00274062, + "epoch": 0.9898037706810312, + "flos": 568420122624.0, + "grad_norm": 0.03633944048268901, + "language_loss": 0.80669045, + "learning_rate": 2.726128618033008e-07, + "loss": 0.81711411, + "num_input_tokens_seen": 426191888, + "router_z_loss_mlp": 0.39599609, + "step": 5145, + "time_per_iteration": 2.6788082122802734 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043671, + "balance_loss_mlp": 1.00495148, + "epoch": 0.9899961523662947, + "flos": 1553450181888.0, + "grad_norm": 0.005562100769056325, + "language_loss": 0.78146422, + "learning_rate": 2.624235252706164e-07, + "loss": 0.79190093, + "num_input_tokens_seen": 426425840, + "router_z_loss_mlp": 0.38671875, + "step": 5146, + "time_per_iteration": 4.948495149612427 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042707, + "balance_loss_mlp": 1.00301051, + "epoch": 0.9901885340515583, + "flos": 611948462592.0, + "grad_norm": 0.034622699594423934, + "language_loss": 0.85446566, + "learning_rate": 2.524282040715642e-07, + "loss": 0.86489272, + "num_input_tokens_seen": 426506080, + "router_z_loss_mlp": 0.39672852, + "step": 5147, + "time_per_iteration": 2.8891594409942627 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104262, + "balance_loss_mlp": 1.00294733, + "epoch": 0.9903809157368219, + "flos": 518494585344.0, + "grad_norm": 0.033958592600161895, + "language_loss": 0.83530235, + "learning_rate": 2.426269020866512e-07, + "loss": 0.84572852, + "num_input_tokens_seen": 426573936, + "router_z_loss_mlp": 0.39648438, + "step": 5148, + "time_per_iteration": 2.590367317199707 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042805, + "balance_loss_mlp": 1.00310874, + "epoch": 0.9905732974220854, + "flos": 1102198577664.0, + "grad_norm": 0.034821676713935955, + "language_loss": 0.80957055, + "learning_rate": 2.3301962312122226e-07, + "loss": 0.81999862, + "num_input_tokens_seen": 426657472, + "router_z_loss_mlp": 0.39672852, + "step": 5149, + "time_per_iteration": 3.433225631713867 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042394, + "balance_loss_mlp": 1.00269747, + "epoch": 0.990765679107349, + "flos": 859493975808.0, + "grad_norm": 0.0403314088319205, + "language_loss": 0.84558642, + "learning_rate": 2.2360637090496073e-07, + "loss": 0.85601032, + "num_input_tokens_seen": 426740560, + "router_z_loss_mlp": 0.39672852, + "step": 5150, + "time_per_iteration": 3.1001992225646973 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104298, + "balance_loss_mlp": 1.00335491, + "epoch": 0.9909580607926125, + "flos": 492274929408.0, + "grad_norm": 0.038309376830719104, + "language_loss": 0.80395019, + "learning_rate": 2.143871490925542e-07, + "loss": 0.81438005, + "num_input_tokens_seen": 426809296, + "router_z_loss_mlp": 0.39599609, + "step": 5151, + "time_per_iteration": 2.5904712677001953 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104282, + "balance_loss_mlp": 1.00319481, + "epoch": 0.9911504424778761, + "flos": 586160408064.0, + "grad_norm": 0.03518242264067306, + "language_loss": 0.79798514, + "learning_rate": 2.0536196126319519e-07, + "loss": 0.80841333, + "num_input_tokens_seen": 426881056, + "router_z_loss_mlp": 0.39599609, + "step": 5152, + "time_per_iteration": 2.7454349994659424 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042561, + "balance_loss_mlp": 1.0028646, + "epoch": 0.9913428241631397, + "flos": 571101093120.0, + "grad_norm": 0.034987965927020324, + "language_loss": 0.81628764, + "learning_rate": 1.9653081092074753e-07, + "loss": 0.82671332, + "num_input_tokens_seen": 426949664, + "router_z_loss_mlp": 0.39672852, + "step": 5153, + "time_per_iteration": 2.6824519634246826 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045599, + "balance_loss_mlp": 1.00590229, + "epoch": 0.9915352058484033, + "flos": 490711840512.0, + "grad_norm": 0.034700171025390344, + "language_loss": 0.87125576, + "learning_rate": 1.8789370149374652e-07, + "loss": 0.88171172, + "num_input_tokens_seen": 427018816, + "router_z_loss_mlp": 0.39672852, + "step": 5154, + "time_per_iteration": 2.6056690216064453 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044254, + "balance_loss_mlp": 1.00443792, + "epoch": 0.9917275875336667, + "flos": 745410817536.0, + "grad_norm": 0.033269276709246544, + "language_loss": 0.83338165, + "learning_rate": 1.7945063633545423e-07, + "loss": 0.84382415, + "num_input_tokens_seen": 427097984, + "router_z_loss_mlp": 0.39794922, + "step": 5155, + "time_per_iteration": 2.9471802711486816 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01041928, + "balance_loss_mlp": 1.00215936, + "epoch": 0.9919199692189303, + "flos": 509325043968.0, + "grad_norm": 0.03681649434928698, + "language_loss": 0.80316067, + "learning_rate": 1.7120161872380412e-07, + "loss": 0.81357992, + "num_input_tokens_seen": 427169280, + "router_z_loss_mlp": 0.39746094, + "step": 5156, + "time_per_iteration": 2.658385753631592 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01041368, + "balance_loss_mlp": 1.00167131, + "epoch": 0.9921123509041939, + "flos": 545011694592.0, + "grad_norm": 0.03652948817105682, + "language_loss": 0.84750915, + "learning_rate": 1.6314665186123457e-07, + "loss": 0.85792279, + "num_input_tokens_seen": 427237312, + "router_z_loss_mlp": 0.39672852, + "step": 5157, + "time_per_iteration": 2.651167631149292 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01041364, + "balance_loss_mlp": 1.00166762, + "epoch": 0.9923047325894575, + "flos": 672758274816.0, + "grad_norm": 0.03668848147519562, + "language_loss": 0.77897781, + "learning_rate": 1.5528573887507724e-07, + "loss": 0.78939146, + "num_input_tokens_seen": 427305008, + "router_z_loss_mlp": 0.39672852, + "step": 5158, + "time_per_iteration": 2.772944927215576 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01041378, + "balance_loss_mlp": 1.00161004, + "epoch": 0.9924971142747211, + "flos": 467625165312.0, + "grad_norm": 0.03636401763672204, + "language_loss": 0.80984938, + "learning_rate": 1.4761888281711322e-07, + "loss": 0.82026315, + "num_input_tokens_seen": 427377008, + "router_z_loss_mlp": 0.39746094, + "step": 5159, + "time_per_iteration": 2.720477342605591 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104148, + "balance_loss_mlp": 1.0017122, + "epoch": 0.9926894959599846, + "flos": 492563634432.0, + "grad_norm": 0.033750262156114055, + "language_loss": 0.83312631, + "learning_rate": 1.4014608666390594e-07, + "loss": 0.84354109, + "num_input_tokens_seen": 427444528, + "router_z_loss_mlp": 0.39746094, + "step": 5160, + "time_per_iteration": 2.632023811340332 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01041373, + "balance_loss_mlp": 1.00162876, + "epoch": 0.9928818776452482, + "flos": 493373369088.0, + "grad_norm": 0.03981516905713452, + "language_loss": 0.82561183, + "learning_rate": 1.328673533166902e-07, + "loss": 0.83602554, + "num_input_tokens_seen": 427509808, + "router_z_loss_mlp": 0.3972168, + "step": 5161, + "time_per_iteration": 2.6184706687927246 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042668, + "balance_loss_mlp": 1.00297153, + "epoch": 0.9930742593305117, + "flos": 547467143424.0, + "grad_norm": 0.03499010450358828, + "language_loss": 0.8439256, + "learning_rate": 1.2578268560131666e-07, + "loss": 0.85435224, + "num_input_tokens_seen": 427587936, + "router_z_loss_mlp": 0.39672852, + "step": 5162, + "time_per_iteration": 2.8474743366241455 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042527, + "balance_loss_mlp": 1.00280654, + "epoch": 0.9932666410157753, + "flos": 586616308992.0, + "grad_norm": 0.03668605125261659, + "language_loss": 0.86434215, + "learning_rate": 1.1889208626825188e-07, + "loss": 0.87476742, + "num_input_tokens_seen": 427662224, + "router_z_loss_mlp": 0.39697266, + "step": 5163, + "time_per_iteration": 2.7860381603240967 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043001, + "balance_loss_mlp": 1.00330424, + "epoch": 0.9934590227010388, + "flos": 538106106624.0, + "grad_norm": 0.03728677713501654, + "language_loss": 0.84115875, + "learning_rate": 1.1219555799268921e-07, + "loss": 0.85158879, + "num_input_tokens_seen": 427730544, + "router_z_loss_mlp": 0.39672852, + "step": 5164, + "time_per_iteration": 2.6766583919525146 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042718, + "balance_loss_mlp": 1.00302172, + "epoch": 0.9936514043863024, + "flos": 519061301760.0, + "grad_norm": 0.033375171347095695, + "language_loss": 0.87032169, + "learning_rate": 1.0569310337443794e-07, + "loss": 0.88074887, + "num_input_tokens_seen": 427799760, + "router_z_loss_mlp": 0.39672852, + "step": 5165, + "time_per_iteration": 2.60634708404541 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042827, + "balance_loss_mlp": 1.00305867, + "epoch": 0.993843786071566, + "flos": 745996975872.0, + "grad_norm": 0.03526910603932911, + "language_loss": 0.80838788, + "learning_rate": 9.938472493803419e-08, + "loss": 0.81881613, + "num_input_tokens_seen": 427881936, + "router_z_loss_mlp": 0.39746094, + "step": 5166, + "time_per_iteration": 3.050039529800415 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045055, + "balance_loss_mlp": 1.00533426, + "epoch": 0.9940361677568296, + "flos": 527008948992.0, + "grad_norm": 0.04774689231823901, + "language_loss": 0.82173401, + "learning_rate": 9.327042513251893e-08, + "loss": 0.83218455, + "num_input_tokens_seen": 427951648, + "router_z_loss_mlp": 0.39697266, + "step": 5167, + "time_per_iteration": 2.7380599975585938 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045639, + "balance_loss_mlp": 1.00587058, + "epoch": 0.9942285494420932, + "flos": 556747500288.0, + "grad_norm": 0.03707878471645631, + "language_loss": 0.80540991, + "learning_rate": 8.735020633177104e-08, + "loss": 0.81586635, + "num_input_tokens_seen": 428031184, + "router_z_loss_mlp": 0.39746094, + "step": 5168, + "time_per_iteration": 2.748521327972412 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045556, + "balance_loss_mlp": 1.00590706, + "epoch": 0.9944209311273566, + "flos": 587100400128.0, + "grad_norm": 0.03550078016567863, + "language_loss": 0.82207251, + "learning_rate": 8.162407083411872e-08, + "loss": 0.832528, + "num_input_tokens_seen": 428107296, + "router_z_loss_mlp": 0.39624023, + "step": 5169, + "time_per_iteration": 2.7222771644592285 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045413, + "balance_loss_mlp": 1.00576413, + "epoch": 0.9946133128126202, + "flos": 736857570048.0, + "grad_norm": 0.03421051451845836, + "language_loss": 0.82663047, + "learning_rate": 7.609202086272804e-08, + "loss": 0.83708465, + "num_input_tokens_seen": 428187904, + "router_z_loss_mlp": 0.39624023, + "step": 5170, + "time_per_iteration": 2.9847142696380615 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042863, + "balance_loss_mlp": 1.00316656, + "epoch": 0.9948056944978838, + "flos": 647181157632.0, + "grad_norm": 0.037725989212747564, + "language_loss": 0.82797301, + "learning_rate": 7.075405856526995e-08, + "loss": 0.83840156, + "num_input_tokens_seen": 428255856, + "router_z_loss_mlp": 0.39672852, + "step": 5171, + "time_per_iteration": 2.7907352447509766 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042753, + "balance_loss_mlp": 1.00308061, + "epoch": 0.9949980761831474, + "flos": 446797585920.0, + "grad_norm": 0.035457906754411656, + "language_loss": 0.86522031, + "learning_rate": 6.561018601414226e-08, + "loss": 0.87564778, + "num_input_tokens_seen": 428321872, + "router_z_loss_mlp": 0.39648438, + "step": 5172, + "time_per_iteration": 2.5566751956939697 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104277, + "balance_loss_mlp": 1.00309753, + "epoch": 0.995190457868411, + "flos": 436559740416.0, + "grad_norm": 0.03425126759604689, + "language_loss": 0.86168361, + "learning_rate": 6.066040520641414e-08, + "loss": 0.87211132, + "num_input_tokens_seen": 428389232, + "router_z_loss_mlp": 0.39648438, + "step": 5173, + "time_per_iteration": 2.54295015335083 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042429, + "balance_loss_mlp": 1.00275624, + "epoch": 0.9953828395536745, + "flos": 515190518016.0, + "grad_norm": 0.035385628041685095, + "language_loss": 0.82161599, + "learning_rate": 5.590471806377062e-08, + "loss": 0.83204019, + "num_input_tokens_seen": 428456128, + "router_z_loss_mlp": 0.39648438, + "step": 5174, + "time_per_iteration": 2.5888471603393555 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042429, + "balance_loss_mlp": 1.00278032, + "epoch": 0.995575221238938, + "flos": 480808386816.0, + "grad_norm": 0.036646252224127956, + "language_loss": 0.82425147, + "learning_rate": 5.134312643245709e-08, + "loss": 0.83467579, + "num_input_tokens_seen": 428523504, + "router_z_loss_mlp": 0.39624023, + "step": 5175, + "time_per_iteration": 2.5513620376586914 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042527, + "balance_loss_mlp": 1.00285447, + "epoch": 0.9957676029242016, + "flos": 588932752128.0, + "grad_norm": 0.04319438463230705, + "language_loss": 0.77101338, + "learning_rate": 4.6975632083445793e-08, + "loss": 0.78143859, + "num_input_tokens_seen": 428596880, + "router_z_loss_mlp": 0.39648438, + "step": 5176, + "time_per_iteration": 2.717487335205078 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042501, + "balance_loss_mlp": 1.00285208, + "epoch": 0.9959599846094652, + "flos": 427355205888.0, + "grad_norm": 0.03951052655432241, + "language_loss": 0.80484152, + "learning_rate": 4.280223671243588e-08, + "loss": 0.81526649, + "num_input_tokens_seen": 428659472, + "router_z_loss_mlp": 0.39624023, + "step": 5177, + "time_per_iteration": 2.4777727127075195 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042781, + "balance_loss_mlp": 1.00313175, + "epoch": 0.9961523662947287, + "flos": 612851516160.0, + "grad_norm": 0.035244745134973283, + "language_loss": 0.81199628, + "learning_rate": 3.8822941939575804e-08, + "loss": 0.82242405, + "num_input_tokens_seen": 428736704, + "router_z_loss_mlp": 0.39624023, + "step": 5178, + "time_per_iteration": 2.823143482208252 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104305, + "balance_loss_mlp": 1.00330544, + "epoch": 0.9963447479799923, + "flos": 551843405568.0, + "grad_norm": 0.0468254833507274, + "language_loss": 0.74475646, + "learning_rate": 3.5037749309851927e-08, + "loss": 0.75518698, + "num_input_tokens_seen": 428808560, + "router_z_loss_mlp": 0.3972168, + "step": 5179, + "time_per_iteration": 2.6599185466766357 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042788, + "balance_loss_mlp": 1.00304377, + "epoch": 0.9965371296652559, + "flos": 627011668224.0, + "grad_norm": 0.037984770973077585, + "language_loss": 0.89231646, + "learning_rate": 3.1446660292755446e-08, + "loss": 0.90274435, + "num_input_tokens_seen": 428880688, + "router_z_loss_mlp": 0.3972168, + "step": 5180, + "time_per_iteration": 2.7122786045074463 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042634, + "balance_loss_mlp": 1.00296164, + "epoch": 0.9967295113505195, + "flos": 640792708608.0, + "grad_norm": 0.03477053437828652, + "language_loss": 0.82540727, + "learning_rate": 2.8049676282504433e-08, + "loss": 0.83583367, + "num_input_tokens_seen": 428960096, + "router_z_loss_mlp": 0.39648438, + "step": 5181, + "time_per_iteration": 2.8635036945343018 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042827, + "balance_loss_mlp": 1.00305843, + "epoch": 0.996921893035783, + "flos": 608544273408.0, + "grad_norm": 0.03998145351976275, + "language_loss": 0.7724216, + "learning_rate": 2.484679859793282e-08, + "loss": 0.78284985, + "num_input_tokens_seen": 429031296, + "router_z_loss_mlp": 0.39746094, + "step": 5182, + "time_per_iteration": 2.7515056133270264 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042557, + "balance_loss_mlp": 1.00281239, + "epoch": 0.9971142747210465, + "flos": 645346860288.0, + "grad_norm": 0.04022267608220078, + "language_loss": 0.82593203, + "learning_rate": 2.183802848243488e-08, + "loss": 0.83635759, + "num_input_tokens_seen": 429103312, + "router_z_loss_mlp": 0.3972168, + "step": 5183, + "time_per_iteration": 2.8260281085968018 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042938, + "balance_loss_mlp": 1.00321734, + "epoch": 0.9973066564063101, + "flos": 1042462905600.0, + "grad_norm": 0.03259879634989159, + "language_loss": 0.81111103, + "learning_rate": 1.9023367104187285e-08, + "loss": 0.82154036, + "num_input_tokens_seen": 429194896, + "router_z_loss_mlp": 0.39697266, + "step": 5184, + "time_per_iteration": 3.3434250354766846 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042639, + "balance_loss_mlp": 1.00291812, + "epoch": 0.9974990380915737, + "flos": 666343580928.0, + "grad_norm": 0.03634944798873604, + "language_loss": 0.83530021, + "learning_rate": 1.640281555587153e-08, + "loss": 0.84572655, + "num_input_tokens_seen": 429267664, + "router_z_loss_mlp": 0.39697266, + "step": 5185, + "time_per_iteration": 2.8487842082977295 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042827, + "balance_loss_mlp": 1.00315452, + "epoch": 0.9976914197768373, + "flos": 719379744768.0, + "grad_norm": 0.04487021918188512, + "language_loss": 0.77715981, + "learning_rate": 1.3976374855007024e-08, + "loss": 0.78758812, + "num_input_tokens_seen": 429343472, + "router_z_loss_mlp": 0.39648438, + "step": 5186, + "time_per_iteration": 2.857905149459839 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042699, + "balance_loss_mlp": 1.00297856, + "epoch": 0.9978838014621008, + "flos": 519332510208.0, + "grad_norm": 0.037846988845411315, + "language_loss": 0.79708529, + "learning_rate": 1.1744045943451464e-08, + "loss": 0.80751228, + "num_input_tokens_seen": 429411472, + "router_z_loss_mlp": 0.39697266, + "step": 5187, + "time_per_iteration": 2.624683380126953 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042646, + "balance_loss_mlp": 1.0029496, + "epoch": 0.9980761831473643, + "flos": 604606415616.0, + "grad_norm": 0.03166904331475621, + "language_loss": 0.84750795, + "learning_rate": 9.70582968801148e-09, + "loss": 0.85793436, + "num_input_tokens_seen": 429486704, + "router_z_loss_mlp": 0.39672852, + "step": 5188, + "time_per_iteration": 2.844538688659668 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042838, + "balance_loss_mlp": 1.00309408, + "epoch": 0.9982685648326279, + "flos": 454458473472.0, + "grad_norm": 0.03824806082394345, + "language_loss": 0.89841872, + "learning_rate": 7.861726879943021e-09, + "loss": 0.90884709, + "num_input_tokens_seen": 429554736, + "router_z_loss_mlp": 0.3972168, + "step": 5189, + "time_per_iteration": 2.5720129013061523 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042712, + "balance_loss_mlp": 1.00303888, + "epoch": 0.9984609465178915, + "flos": 482462849280.0, + "grad_norm": 0.04089473715635275, + "language_loss": 0.79091811, + "learning_rate": 6.211738235173403e-09, + "loss": 0.80134523, + "num_input_tokens_seen": 429623216, + "router_z_loss_mlp": 0.39648438, + "step": 5190, + "time_per_iteration": 2.641157865524292 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042929, + "balance_loss_mlp": 1.0031848, + "epoch": 0.9986533282031551, + "flos": 478012710144.0, + "grad_norm": 0.034483206840534596, + "language_loss": 0.845792, + "learning_rate": 4.755864394301312e-09, + "loss": 0.85622132, + "num_input_tokens_seen": 429695808, + "router_z_loss_mlp": 0.3972168, + "step": 5191, + "time_per_iteration": 2.6454806327819824 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042762, + "balance_loss_mlp": 1.00308967, + "epoch": 0.9988457098884186, + "flos": 643158729216.0, + "grad_norm": 0.03683984679007205, + "language_loss": 0.87209117, + "learning_rate": 3.494105922541291e-09, + "loss": 0.88251877, + "num_input_tokens_seen": 429774464, + "router_z_loss_mlp": 0.39648438, + "step": 5192, + "time_per_iteration": 2.785687208175659 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042681, + "balance_loss_mlp": 1.00307941, + "epoch": 0.9990380915736822, + "flos": 397188943872.0, + "grad_norm": 0.04095019958337408, + "language_loss": 0.88459158, + "learning_rate": 2.4264633097237365e-09, + "loss": 0.89501834, + "num_input_tokens_seen": 429835872, + "router_z_loss_mlp": 0.39575195, + "step": 5193, + "time_per_iteration": 2.438514232635498 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042769, + "balance_loss_mlp": 1.00314367, + "epoch": 0.9992304732589458, + "flos": 577297068288.0, + "grad_norm": 0.032947603634797866, + "language_loss": 0.85047054, + "learning_rate": 1.552936970405927e-09, + "loss": 0.86089826, + "num_input_tokens_seen": 429911440, + "router_z_loss_mlp": 0.39599609, + "step": 5194, + "time_per_iteration": 2.741009473800659 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042727, + "balance_loss_mlp": 1.00305462, + "epoch": 0.9994228549442093, + "flos": 545391773184.0, + "grad_norm": 0.038581373193003414, + "language_loss": 0.75944704, + "learning_rate": 8.735272437054853e-10, + "loss": 0.76987433, + "num_input_tokens_seen": 429982512, + "router_z_loss_mlp": 0.39648438, + "step": 5195, + "time_per_iteration": 2.6541080474853516 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042882, + "balance_loss_mlp": 1.00323331, + "epoch": 0.9996152366294728, + "flos": 1473470188032.0, + "grad_norm": 0.03760249549203128, + "language_loss": 0.80863667, + "learning_rate": 3.882343933003796e-10, + "loss": 0.81906557, + "num_input_tokens_seen": 430070944, + "router_z_loss_mlp": 0.39624023, + "step": 5196, + "time_per_iteration": 3.698018789291382 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042585, + "balance_loss_mlp": 1.00312674, + "epoch": 0.9998076183147364, + "flos": 620086638336.0, + "grad_norm": 0.0677194003040628, + "language_loss": 0.70429897, + "learning_rate": 9.70586077619906e-11, + "loss": 0.7147249, + "num_input_tokens_seen": 430164864, + "router_z_loss_mlp": 0.39428711, + "step": 5197, + "time_per_iteration": 4.017072439193726 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044197, + "balance_loss_mlp": 1.0051676, + "epoch": 1.0, + "flos": 1293863628288.0, + "grad_norm": 0.017503250680704283, + "language_loss": 0.84201628, + "learning_rate": 0.0, + "loss": 0.85245824, + "num_input_tokens_seen": 430340944, + "router_z_loss_mlp": 0.38989258, + "step": 5198, + "time_per_iteration": 5.782602071762085 + }, + { + "epoch": 1.0, + "num_input_tokens_seen": 430340944, + "step": 5198, + "total_flos": 1.1743165989388288e+16, + "train_loss": 0.8873597545640292, + "train_runtime": 15653.9837, + "train_samples_per_second": 42.5, + "train_steps_per_second": 0.332 + } + ], + "logging_steps": 1.0, + "max_steps": 5198, + "num_input_tokens_seen": 430340944, + "num_train_epochs": 1, + "save_steps": 1040, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 1.1743165989388288e+16, + "train_batch_size": 16, + "trial_name": null, + "trial_params": null +} diff --git a/sft_pretrain/Full_xmoe/training_args.bin b/sft_pretrain/Full_xmoe/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..e62437ed6fbf4cf3ea22fcfae3749bb9df2d0109 --- /dev/null +++ b/sft_pretrain/Full_xmoe/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4144fbe7f1cf435dbbf0ef9621414cb3e97a5ff4a560571b878000caf2931b07 +size 7992